From 642f16949d66d6ce38abded858fae7676894acfe Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Tue, 31 Mar 2026 22:42:28 +0800 Subject: [PATCH 001/204] [Bugfix] Update Whisper model loading to support multi-GPU configurations and optimize CUDA memory management (#2354) Signed-off-by: wangyu <410167048@qq.com> --- .buildkite/test-merge.yml | 1 - tests/conftest.py | 26 +++++++++++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index fc1f7a6796..7bee193191 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -259,7 +259,6 @@ steps: depends_on: upload-merge-pipeline commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model" - pytest -s -v tests/e2e/online_serving/test_mimo_audio.py -m "advanced_model" --run-level "advanced_model" diff --git a/tests/conftest.py b/tests/conftest.py index adc048e847..fb88869542 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -48,6 +48,7 @@ from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniSamplingParams from vllm_omni.outputs import OmniRequestOutput +from vllm_omni.platforms import current_omni_platform logger = init_logger(__name__) @@ -1065,7 +1066,7 @@ def convert_audio_to_text(audio_data): Convert base64 encoded audio data to text using speech recognition. """ audio_data = base64.b64decode(audio_data) - output_path = f"./test_{int(time.time())}.wav" + output_path = f"./test_{uuid.uuid4().hex}.wav" with open(output_path, "wb") as audio_file: audio_file.write(audio_data) @@ -1089,8 +1090,24 @@ def _merge_base64_audio_to_segment(base64_list: list[str]): def _whisper_transcribe_in_current_process(output_path: str) -> str: import whisper - # Keep Whisper on CPU to avoid consuming GPU memory in tests. - model = whisper.load_model("small", device="cpu") + # Multi-GPU: use last visible device to avoid colliding with default device 0; single device uses 0. + device_index = None + if current_omni_platform.is_available(): + n = current_omni_platform.get_device_count() + if n == 1: + device_index = 0 + elif n > 1: + device_index = n - 1 + + if device_index is not None: + torch_device = current_omni_platform.get_torch_device(device_index) + current_omni_platform.set_device(torch_device) + device = str(torch_device) + use_accelerator = True + else: + use_accelerator = False + device = "cpu" + model = whisper.load_model("small", device=device) try: text = model.transcribe( output_path, @@ -1101,6 +1118,9 @@ def _whisper_transcribe_in_current_process(output_path: str) -> str: finally: del model gc.collect() + if use_accelerator: + current_omni_platform.synchronize() + current_omni_platform.empty_cache() return text or "" From f8d0bf538904eaaa3139826e01a371b5da7e24e1 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Tue, 31 Mar 2026 09:49:48 -0700 Subject: [PATCH 002/204] [release] Add nightly wheel release index (#2345) Signed-off-by: khluu Co-authored-by: Claude Opus 4.6 (1M context) --- .buildkite/nightly-release-pipeline.yaml | 20 ++ .../generate-and-upload-nightly-index.sh | 87 ++++++++ .buildkite/scripts/generate-nightly-index.py | 193 ++++++++++++++++++ .buildkite/scripts/upload-nightly-wheels.sh | 33 +++ 4 files changed, 333 insertions(+) create mode 100644 .buildkite/nightly-release-pipeline.yaml create mode 100755 .buildkite/scripts/generate-and-upload-nightly-index.sh create mode 100755 .buildkite/scripts/generate-nightly-index.py create mode 100755 .buildkite/scripts/upload-nightly-wheels.sh diff --git a/.buildkite/nightly-release-pipeline.yaml b/.buildkite/nightly-release-pipeline.yaml new file mode 100644 index 0000000000..25c52ba3b4 --- /dev/null +++ b/.buildkite/nightly-release-pipeline.yaml @@ -0,0 +1,20 @@ +steps: + - label: "Build and upload wheel" + key: "build-wheel" + agents: + queue: cpu_queue_release + commands: + - "curl -LsSf https://astral.sh/uv/install.sh | sh" + - 'export PATH="$HOME/.local/bin:$PATH"' + - "uv venv --python=3.12 && source .venv/bin/activate" + - "uv pip install --upgrade build" + - "python3 -m build" + - "bash .buildkite/scripts/upload-nightly-wheels.sh" + + - label: "Generate and upload wheel indices" + depends_on: "build-wheel" + allow_dependency_failure: true + agents: + queue: small_cpu_queue_release + commands: + - "bash .buildkite/scripts/generate-and-upload-nightly-index.sh" diff --git a/.buildkite/scripts/generate-and-upload-nightly-index.sh b/.buildkite/scripts/generate-and-upload-nightly-index.sh new file mode 100755 index 0000000000..6624af3230 --- /dev/null +++ b/.buildkite/scripts/generate-and-upload-nightly-index.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash + +set -ex + +# Generate and upload wheel indices for all vllm-omni wheels in the commit directory. +# This script should run once after all wheels have been built and uploaded. +# All paths are under the omni/ prefix in the vllm-wheels S3 bucket. + +# ======== setup ======== + +BUCKET="vllm-wheels" +INDICES_OUTPUT_DIR="indices" +PYTHON="${PYTHON_PROG:-python3}" +SUBPATH="omni/$BUILDKITE_COMMIT" +S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" + +# detect if python3.12+ is available +has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)") +if [[ "$has_new_python" -eq 0 ]]; then + # use new python from docker + docker pull python:3-slim + PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3" +fi + +echo "Using python interpreter: $PYTHON" +echo "Python version: $($PYTHON --version)" + +# ======== generate and upload indices ======== + +# list all wheels in the commit directory +echo "Existing wheels on S3:" +aws s3 ls "$S3_COMMIT_PREFIX" || echo "(no objects found)" +obj_json="objects.json" +aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json" +mkdir -p "$INDICES_OUTPUT_DIR" + +# HACK: we do not need regex module here, but it is required by pre-commit hook +# To avoid any external dependency, we simply replace it back to the stdlib re module +sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py + +# Generate indices -- the version is just the commit hash (not omni/{commit}) +# because relative paths are computed between the index and wheel directories, +# both of which live under the omni/ prefix in S3. 
+$PYTHON .buildkite/scripts/generate-nightly-index.py \ + --version "$BUILDKITE_COMMIT" \ + --current-objects "$obj_json" \ + --output-dir "$INDICES_OUTPUT_DIR" \ + --comment "commit $BUILDKITE_COMMIT" + +# copy indices to /omni/{commit}/ unconditionally +echo "Uploading indices to $S3_COMMIT_PREFIX" +aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX" + +# copy to /omni/nightly/ when NIGHTLY=1 +if [[ "${NIGHTLY:-}" == "1" ]]; then + echo "Uploading indices to overwrite /omni/nightly/" + aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/omni/nightly/" +fi + +# detect version from any wheel in the commit directory +first_wheel_key=$($PYTHON -c "import json; obj=json.load(open('$obj_json')); print(next((c['Key'] for c in obj.get('Contents', []) if c['Key'].endswith('.whl')), ''))") +if [[ -z "$first_wheel_key" ]]; then + echo "Error: No wheels found in $S3_COMMIT_PREFIX" + exit 1 +fi +first_wheel=$(basename "$first_wheel_key") +aws s3 cp "s3://$BUCKET/${first_wheel_key}" "/tmp/${first_wheel}" +version=$(unzip -p "/tmp/${first_wheel}" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) +rm -f "/tmp/${first_wheel}" +echo "Version in wheel: $version" +pure_version="${version%%+*}" +echo "Pure version (without variant): $pure_version" + +# re-generate and copy to /omni/{version}/ only if it does not have "dev" in the version +if [[ "$version" != *"dev"* ]]; then + echo "Re-generating indices for /omni/$pure_version/" + rm -rf "${INDICES_OUTPUT_DIR:?}" + mkdir -p "$INDICES_OUTPUT_DIR" + # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path + $PYTHON .buildkite/scripts/generate-nightly-index.py \ + --version "$pure_version" \ + --wheel-dir "$BUILDKITE_COMMIT" \ + --current-objects "$obj_json" \ + --output-dir "$INDICES_OUTPUT_DIR" \ + --comment "version $pure_version" + aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/omni/$pure_version/" +fi diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py new file mode 100755 index 0000000000..c616c446b0 --- /dev/null +++ b/.buildkite/scripts/generate-nightly-index.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import json +import sys +from dataclasses import asdict, dataclass +from datetime import datetime +from pathlib import Path +from typing import Any +from urllib.parse import quote + +import regex as re + + +def normalize_package_name(name: str) -> str: + """Normalize package name per PEP 503.""" + return re.sub(r"[-_.]+", "-", name).lower() + + +if not sys.version_info >= (3, 12): + raise RuntimeError("This script requires Python 3.12 or higher.") + +INDEX_HTML_TEMPLATE = """ + + + + +{items} + + +""" + + +@dataclass +class WheelFileInfo: + package_name: str + version: str + build_tag: str | None + python_tag: str + abi_tag: str + platform_tag: str + filename: str + + +def parse_from_filename(file: str) -> WheelFileInfo: + """ + Parse wheel filename per PEP 427: + {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl + """ + wheel_file_re = re.compile( + r"^(?P.+)-(?P[^-]+?)(-(?P[^-]+))?-(?P[^-]+)-(?P[^-]+)-(?P[^-]+)\.whl$" + ) + match = wheel_file_re.match(file) + if not match: + raise ValueError(f"Invalid wheel file name: {file}") + + return WheelFileInfo( + package_name=match.group("package_name"), + version=match.group("version"), + 
build_tag=match.group("build_tag"), + python_tag=match.group("python_tag"), + abi_tag=match.group("abi_tag"), + platform_tag=match.group("platform_tag"), + filename=file, + ) + + +def generate_project_list(package_names: list[str], comment: str = "") -> str: + """Generate top-level PEP 503 project list HTML.""" + href_tags = [] + for name in sorted(package_names): + href_tags.append(f' <a href="{name}/">{name}/</a><br/>')
+ return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment) + + +def generate_package_index( + wheel_files: list[WheelFileInfo], + wheel_base_dir: Path, + index_base_dir: Path, + comment: str = "", +) -> tuple[str, str]: + """Generate package index HTML and metadata JSON linking to wheel files.""" + href_tags = [] + metadata = [] + for file in sorted(wheel_files, key=lambda x: x.filename): + relative_path = wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename + # handle '+' in URL; avoid double-encoding '/' and '%2B' (AWS S3 behavior) + file_path_quoted = quote(relative_path.as_posix(), safe=":%/") + href_tags.append(f' <a href="{file_path_quoted}">{file.filename}</a><br/>
') + file_meta = asdict(file) + file_meta["path"] = file_path_quoted + metadata.append(file_meta) + index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags), comment=comment) + metadata_str = json.dumps(metadata, indent=2) + return index_str, metadata_str + + +def generate_index( + whl_files: list[str], + wheel_base_dir: Path, + index_base_dir: Path, + comment: str = "", +): + """ + Generate PEP 503 index for all wheel files. + + Output structure: + index_base_dir/ + index.html # project list linking to vllm-omni/ + vllm-omni/ + index.html # package index linking to wheel files + metadata.json # machine-readable metadata + """ + parsed_files = [parse_from_filename(f) for f in whl_files] + + if not parsed_files: + print("No wheel files found, skipping index generation.") + return + + comment_str = f" ({comment})" if comment else "" + comment_tmpl = f"Generated on {datetime.now().isoformat()}{comment_str}" + + # Group by normalized package name + packages: dict[str, list[WheelFileInfo]] = {} + for file in parsed_files: + name = normalize_package_name(file.package_name) + packages.setdefault(name, []).append(file) + + print(f"Found packages: {list(packages.keys())}") + + # Generate per-package index + for package, files in packages.items(): + package_dir = index_base_dir / package + package_dir.mkdir(parents=True, exist_ok=True) + index_str, metadata_str = generate_package_index(files, wheel_base_dir, package_dir, comment) + with open(package_dir / "index.html", "w") as f: + f.write(index_str) + with open(package_dir / "metadata.json", "w") as f: + f.write(metadata_str) + + # Generate top-level project list + project_list_str = generate_project_list(sorted(packages.keys()), comment_tmpl) + with open(index_base_dir / "index.html", "w") as f: + f.write(project_list_str) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate PEP 503 wheel index from S3 object listing.") + parser.add_argument("--version", type=str, required=True, help="Version string (e.g., commit hash)") + parser.add_argument("--current-objects", type=str, required=True, help="Path to JSON from S3 list-objects-v2") + parser.add_argument("--output-dir", type=str, required=True, help="Directory to write index files") + parser.add_argument("--wheel-dir", type=str, default=None, help="Wheel directory (defaults to --version)") + parser.add_argument("--comment", type=str, default="", help="Comment for generated HTML") + + args = parser.parse_args() + + version = args.version + if "\\" in version or "/" in version: + raise ValueError("Version string must not contain slashes or backslashes.") + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + with open(args.current_objects) as f: + current_objects: dict[str, list[dict[str, Any]]] = json.load(f) + + wheel_files = [ + item["Key"].split("/")[-1] for item in current_objects.get("Contents", []) if item["Key"].endswith(".whl") + ] + + print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") + + # For release versions, filter to only matching non-dev wheels + PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$") + if PY_VERSION_RE.match(version): + wheel_files = [f for f in wheel_files if version in f and "dev" not in f] + print(f"Non-nightly version detected, wheel files used: {wheel_files}") + else: + print("Nightly version detected, keeping all wheel files.") + + wheel_dir = (args.wheel_dir or version).strip().rstrip("/") + wheel_base_dir = Path(output_dir).parent / 
wheel_dir + index_base_dir = Path(output_dir) + + generate_index( + whl_files=wheel_files, + wheel_base_dir=wheel_base_dir, + index_base_dir=index_base_dir, + comment=args.comment.strip(), + ) + print(f"Successfully generated index in {output_dir}") diff --git a/.buildkite/scripts/upload-nightly-wheels.sh b/.buildkite/scripts/upload-nightly-wheels.sh new file mode 100755 index 0000000000..d50da1deda --- /dev/null +++ b/.buildkite/scripts/upload-nightly-wheels.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +set -ex + +# Upload a single wheel to S3 under the omni/ prefix. +# Index generation is handled separately by generate-and-upload-nightly-index.sh. + +BUCKET="vllm-wheels" +SUBPATH="omni/$BUILDKITE_COMMIT" +S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/" + +# ========= collect & upload the wheel ========== + +# python3 -m build outputs to dist/ by default +wheel_files=(dist/*.whl) + +# Check that exactly one wheel is found +if [[ ${#wheel_files[@]} -ne 1 ]]; then + echo "Error: Expected exactly one wheel file in dist/, but found ${#wheel_files[@]}" + exit 1 +fi +wheel="${wheel_files[0]}" + +echo "Uploading wheel: $wheel" + +# Extract the version from the wheel +version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) +echo "Version in wheel: $version" + +# Upload wheel to S3 +aws s3 cp "$wheel" "$S3_COMMIT_PREFIX" + +echo "Wheel uploaded to $S3_COMMIT_PREFIX. Index generation is handled by a separate step." From 369f301a1baf482c5c9c26f292403f7fbee879a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Wed, 1 Apr 2026 09:19:13 +0800 Subject: [PATCH 003/204] [BugFix] Add BAGEL single-stage diffusion config and fix multiple `` bug (#2381) Signed-off-by: princepride --- examples/offline_inference/bagel/end2end.py | 2 +- .../bagel/openai_chat_client.py | 2 +- .../stage_configs/bagel_single_stage.yaml | 32 +++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) create mode 100644 vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index 6562f32ae6..922a1af236 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -101,7 +101,7 @@ def main(): if not prompts: # Default prompt for text2img test if none provided - prompts = ["<|im_start|>A cute cat<|im_end|>"] + prompts = ["A cute cat"] print(f"[Info] No prompts provided, using default: {prompts}") omni_outputs = [] diff --git a/examples/online_serving/bagel/openai_chat_client.py b/examples/online_serving/bagel/openai_chat_client.py index fd5f4cac5d..cc9ec32db9 100755 --- a/examples/online_serving/bagel/openai_chat_client.py +++ b/examples/online_serving/bagel/openai_chat_client.py @@ -125,7 +125,7 @@ def generate_image( def main(): parser = argparse.ArgumentParser(description="Bagel multimodal chat client") - parser.add_argument("--prompt", "-p", default="<|im_start|>A cute cat<|im_end|>", help="Text prompt") + parser.add_argument("--prompt", "-p", default="A cute cat", help="Text prompt") parser.add_argument("--output", "-o", default="bagel_output.png", help="Output file (for image results)") parser.add_argument("--server", "-s", default="http://localhost:8091", help="Server URL") diff --git a/vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml b/vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml new file mode 100644 index 0000000000..2c1d84af49 --- /dev/null +++ 
b/vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml @@ -0,0 +1,32 @@ +# Stage 0: Thinker (multimodal understanding + text generation) + +stage_args: + + - stage_id: 0 + stage_type: diffusion + runtime: + devices: "0" + engine_args: + model_stage: dit + max_num_seqs: 1 + gpu_memory_utilization: 0.45 + enforce_eager: true + trust_remote_code: true + engine_output_type: image + distributed_executor_backend: "mp" + enable_prefix_caching: false + max_num_batched_tokens: 32768 + tensor_parallel_size: 1 + + final_output: true + final_output_type: image + is_comprehension: false + default_sampling_params: + seed: 52 + +# Runtime edges +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 From dd0b6fd4e827a9c20802b564144265027b61eeae Mon Sep 17 00:00:00 2001 From: Lancer Date: Wed, 1 Apr 2026 09:25:34 +0800 Subject: [PATCH 004/204] [Bugfix] Fix layer-wise offload incompatibility with HSDP (#2021) Signed-off-by: Lancer --- .../offloader/test_layerwise_backend.py | 129 ++++++++++++++++++ .../diffusion/offloader/layerwise_backend.py | 74 ++++++++-- 2 files changed, 189 insertions(+), 14 deletions(-) create mode 100644 tests/diffusion/offloader/test_layerwise_backend.py diff --git a/tests/diffusion/offloader/test_layerwise_backend.py b/tests/diffusion/offloader/test_layerwise_backend.py new file mode 100644 index 0000000000..7df3c1bb1a --- /dev/null +++ b/tests/diffusion/offloader/test_layerwise_backend.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Unit tests for LayerwiseOffloadHook.""" + +import gc +import os +import socket +from contextlib import contextmanager + +import pytest +import torch +import torch.distributed as dist +from torch import nn +from torch.distributed.tensor import DeviceMesh, DTensor, Replicate + +import vllm_omni.diffusion.offloader.layerwise_backend as layerwise_backend_module +from vllm_omni.diffusion.offloader.layerwise_backend import LayerwiseOffloadHook +from vllm_omni.platforms import current_omni_platform + +pytestmark = [pytest.mark.diffusion, pytest.mark.cpu, pytest.mark.core_model] + + +class DummyStream: + def wait_stream(self, _stream) -> None: + return None + + def wait_event(self, _event) -> None: + return None + + +class DummyEvent: + def record(self, _stream) -> None: + return None + + +@contextmanager +def dummy_stream(_stream): + yield None + + +def _find_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("127.0.0.1", 0)) + return int(s.getsockname()[1]) + + +def _set_dist_env(*, rank: int, world_size: int, master_port: int) -> None: + os.environ["RANK"] = str(rank) + os.environ["LOCAL_RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(master_port) + + +def _cleanup_distributed() -> None: + if dist.is_initialized(): + dist.destroy_process_group() + + for key in ["MASTER_ADDR", "MASTER_PORT", "RANK", "WORLD_SIZE", "LOCAL_RANK"]: + os.environ.pop(key, None) + + gc.collect() + if current_omni_platform.is_available(): + current_omni_platform.empty_cache() + current_omni_platform.synchronize() + + +@pytest.fixture(scope="module") +def dist_group(): + master_port = _find_free_port() + _set_dist_env(rank=0, world_size=1, master_port=master_port) + + dist.init_process_group("gloo", rank=0, world_size=1) + try: + yield + finally: + _cleanup_distributed() + + +@pytest.fixture +def patched_offload_runtime(mocker): + 
mocker.patch.object(layerwise_backend_module.current_omni_platform, "Stream", DummyStream) + mocker.patch.object(layerwise_backend_module.current_omni_platform, "Event", DummyEvent) + mocker.patch.object(layerwise_backend_module.current_omni_platform, "current_stream", lambda: DummyStream()) + mocker.patch.object(layerwise_backend_module.current_omni_platform, "stream", dummy_stream) + + +class TinyBlock(nn.Module): + def __init__(self, values: torch.Tensor): + super().__init__() + mesh = DeviceMesh("cpu", [0]) + dtensor = DTensor.from_local(values, mesh, [Replicate()]) + self.weight = nn.Parameter(dtensor) + + +def _make_values(start: float) -> torch.Tensor: + return torch.arange(start, start + 4, dtype=torch.float32) + + +class TestLayerwiseOffloadHook: + def test_dtensor_wrapper_is_preserved_across_prefetch_and_offload(self, dist_group, patched_offload_runtime): + current_block = TinyBlock(_make_values(1.0)) + next_block = TinyBlock(_make_values(10.0)) + + hook = LayerwiseOffloadHook( + next_block=next_block, + device=torch.device("cpu"), + stream=DummyStream(), + pin_memory=False, + ) + + hook.initialize_hook(current_block) + + assert isinstance(next_block.weight, DTensor) + assert next_block.weight.to_local().is_meta + assert next_block.weight.to_local().shape == torch.Size([4]) + assert hook.dtype_metadata[next_block.weight.dtype][0]["shape"] == torch.Size([4]) + + hook.prefetch_layer(non_blocking=False) + assert isinstance(next_block.weight, DTensor) + assert torch.equal(next_block.weight.to_local(), _make_values(10.0)) + assert next_block.weight.to_local().shape == torch.Size([4]) + + hook.offload_layer() + assert isinstance(current_block.weight, DTensor) + assert current_block.weight.to_local().is_meta + assert current_block.weight.to_local().shape == torch.Size([4]) + assert not hook.is_materialized diff --git a/vllm_omni/diffusion/offloader/layerwise_backend.py b/vllm_omni/diffusion/offloader/layerwise_backend.py index 5b66ae5ee2..20af5b5d82 100644 --- a/vllm_omni/diffusion/offloader/layerwise_backend.py +++ b/vllm_omni/diffusion/offloader/layerwise_backend.py @@ -6,6 +6,7 @@ import torch from torch import nn +from torch.distributed.tensor import DTensor from vllm.logger import init_logger from vllm_omni.diffusion.hooks import HookRegistry, ModelHook @@ -58,6 +59,31 @@ def __init__( self.dtype_cpu_flattened_weights: dict[torch.dtype, torch.Tensor] = {} self.dtype_metadata: dict[torch.dtype, list[dict[str, Any]]] = {} + @staticmethod + def _is_dtensor(t: torch.Tensor) -> bool: + return isinstance(t, DTensor) + + @staticmethod + def _set_tensor_storage(target: torch.Tensor, value: torch.Tensor) -> None: + if LayerwiseOffloadHook._is_dtensor(target): + target._local_tensor = value + else: + target.data = value + + @staticmethod + def _make_offload_placeholder(tensor: torch.Tensor) -> torch.Tensor: + if LayerwiseOffloadHook._is_dtensor(tensor): + local_shape = tuple(tensor.to_local().shape) + return torch.empty(local_shape, device="meta", dtype=tensor.dtype) + return torch.empty((0,), device=tensor.device, dtype=tensor.dtype) + + @staticmethod + def _is_materialized_tensor(t: torch.Tensor) -> bool: + if LayerwiseOffloadHook._is_dtensor(t): + local_t = t.to_local() + return not local_t.is_meta + return not t.is_meta and t.data.numel() > 0 + def initialize_hook(self, module: nn.Module) -> nn.Module: # This all happen during the hook instance being registered to hook registry; # the input module is kept intact @@ -71,7 +97,10 @@ def initialize_hook(self, module: nn.Module) -> 
nn.Module: # Pre-allocate gpu tensors in a flattened way self.dtype_cpu_flattened_weights, self.dtype_metadata = LayerwiseOffloadHook._to_cpu( - self.next_block_parameters, self.next_block_buffers, self.device, self.pin_memory + self.next_block_parameters, + self.next_block_buffers, + self.device, + self.pin_memory, ) return module @@ -106,13 +135,17 @@ def _to_cpu( for dtype, name2weights in dtype_grouped_weights.items(): # total # of parameters + buffers - total_numel = sum(t.numel() for _, t in name2weights.items()) + weights_with_local = [] + for name, t in name2weights.items(): + local_t = t.to_local() if hasattr(t, "to_local") else t + weights_with_local.append((name, t, local_t)) + total_numel = sum(local.numel() for _, _, local in weights_with_local) cpu_tensor = torch.empty(total_numel, dtype=dtype, device="cpu", pin_memory=pin_memory) current_offset = 0 - for name, param_or_buf in name2weights.items(): - numel = param_or_buf.numel() - cpu_tensor[current_offset : current_offset + numel].copy_(param_or_buf.flatten()) + for name, original_tensor, local_tensor in weights_with_local: + numel = local_tensor.numel() + cpu_tensor[current_offset : current_offset + numel].copy_(local_tensor.flatten()) if dtype not in dtype_metadata: dtype_metadata[dtype] = [] dtype_metadata[dtype].append( @@ -120,11 +153,13 @@ def _to_cpu( "name": name, "offset": current_offset, "numel": numel, - "shape": param_or_buf.shape, + "shape": local_tensor.shape, } ) - param_or_buf.data = torch.empty((), device=device, dtype=dtype) + LayerwiseOffloadHook._set_tensor_storage( + original_tensor, LayerwiseOffloadHook._make_offload_placeholder(original_tensor) + ) current_offset += numel dtype_cpu_flattened_weights[dtype] = cpu_tensor @@ -135,7 +170,7 @@ def _to_cpu( def is_materialized(self) -> bool: """Check whether this block's parameters hold real data on device.""" for param in self.block_parameters.values(): - return param.data.dim() > 0 + return LayerwiseOffloadHook._is_materialized_tensor(param) return True @@ -172,8 +207,9 @@ def prefetch_layer(self, non_blocking: bool = True) -> None: layer_params[target_name] if target_name in layer_params else layer_bufs[target_name] ) - target_param_or_buf.data = gpu_weight[metadata["offset"] : metadata["offset"] + metadata["numel"]].view( - metadata["shape"] + LayerwiseOffloadHook._set_tensor_storage( + target_param_or_buf, + gpu_weight[metadata["offset"] : metadata["offset"] + metadata["numel"]].view(metadata["shape"]), ) self._prefetch_done = evt @@ -191,9 +227,9 @@ def offload_layer(self) -> None: # free GPU residency for _, param in self.block_parameters.items(): - param.data = torch.empty((), device=self.device, dtype=param.dtype) + LayerwiseOffloadHook._set_tensor_storage(param, LayerwiseOffloadHook._make_offload_placeholder(param)) for _, buf in self.block_buffers.items(): - buf.data = torch.empty((), device=self.device, dtype=buf.dtype) + LayerwiseOffloadHook._set_tensor_storage(buf, LayerwiseOffloadHook._make_offload_placeholder(buf)) def pre_forward(self, module: nn.Module, *args: Any, **kwargs: Any) -> tuple[tuple, dict]: # if the previous hook was skipped and the weights are not on device, @@ -311,7 +347,11 @@ def enable(self, pipeline: nn.Module) -> None: # during the last layer compute of the previous request. 
last_block, first_block = blocks[-1], blocks[0] last_hook = apply_block_hook( - last_block, first_block, self.device, self.copy_stream, self.config.pin_cpu_memory + last_block, + first_block, + self.device, + self.copy_stream, + self.config.pin_cpu_memory, ) last_hook.prefetch_layer(non_blocking=False) @@ -319,7 +359,13 @@ def enable(self, pipeline: nn.Module) -> None: # Register hook for each of blocks for i, block in enumerate(blocks[:-1]): next_block = blocks[(i + 1) % num_blocks] - hook = apply_block_hook(block, next_block, self.device, self.copy_stream, self.config.pin_cpu_memory) + hook = apply_block_hook( + block, + next_block, + self.device, + self.copy_stream, + self.config.pin_cpu_memory, + ) block_hooks.append(hook) # NOTE(yuanheng-zhao): We make each hook gets a backward reference to the hook From 7274e15840bd750c964776ac0a6761c13499e6eb Mon Sep 17 00:00:00 2001 From: fattysand <44150064+Fattysand@users.noreply.github.com> Date: Wed, 1 Apr 2026 09:40:12 +0800 Subject: [PATCH 005/204] [BugFix] qwen3_tts chunk boundary handling logic in initial chunk (IC) (#2378) Signed-off-by: Fattysand --- .../test_qwen3_tts_async_chunk.py | 51 +++++++++++++++---- .../stage_input_processors/qwen3_tts.py | 4 +- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/tests/model_executor/stage_input_processors/test_qwen3_tts_async_chunk.py b/tests/model_executor/stage_input_processors/test_qwen3_tts_async_chunk.py index edf46eb9cc..95ee229298 100644 --- a/tests/model_executor/stage_input_processors/test_qwen3_tts_async_chunk.py +++ b/tests/model_executor/stage_input_processors/test_qwen3_tts_async_chunk.py @@ -97,21 +97,50 @@ def test_flush_on_finish(): _CASES = [ + # ── IC boundary rule ────────────────────────────────────────────── + # IC phase: length <= chunk_size (uses <=, consistent with fish_speech) + # IC emits fill the entire first chunk_size worth of frames, so the + # normal phase always starts at a clean chunk boundary. + # initial_coverage = (chunk_size // initial_chunk_size) * initial_chunk_size + # # Dynamic IC=16, cs=25, initial_coverage=16 - ((25, 25, 0), 24, False, None), # IC phase: 24%16!=0 -> hold - ((25, 25, 0), 25, False, None), # transition: adjusted=9, hold (no replay) - ((25, 25, 0), 41, False, (16, 41)), # first normal emit, lc=16 + # IC does NOT evenly divide cs, so initial_coverage < cs. + # IC emits at 16; frames 17-25 remain in IC phase but 25%16!=0 -> hold. + # Normal phase: adjusted = length - 16, emit when adjusted % 25 == 0. + ((25, 25, 0), 24, False, None), # IC: 24<=25, 24%16!=0 -> hold + ((25, 25, 0), 25, False, None), # IC: 25<=25, 25%16!=0 -> hold + ((25, 25, 0), 41, False, (16, 41)), # normal: adjusted=25, 25%25==0 -> emit, lc=16 + # # Per-request IC=10, cs=25, initial_coverage=20 - ((25, 25, 10), 9, False, None), # IC: hold - ((25, 25, 10), 10, False, (0, 10)), # IC: emit at boundary - ((25, 25, 10), 25, False, None), # transition: hold (no replay) - ((25, 25, 10), 45, False, (20, 45)), # first normal emit, lc=20 + # IC does NOT evenly divide cs; IC emits at 10, 20. + # Frames 21-25 are still IC phase but 21..25 % 10 != 0 -> hold. 
+ ((25, 25, 10), 9, False, None), # IC: 9%10!=0 -> hold + ((25, 25, 10), 10, False, (0, 10)), # IC: 10%10==0 -> emit, lc=0 + ((25, 25, 10), 25, False, None), # IC: 25<=25, 25%10!=0 -> hold + ((25, 25, 10), 45, False, (20, 45)), # normal: adjusted=25, 25%25==0 -> emit, lc=20 ((25, 25, 10), 5, True, (0, 5)), # finished flushes IC tail ((25, 25, 10), 33, True, (20, 33)), # finished flushes normal tail - # IC=8, cs=16: IC divides chunk_size evenly (edge case) - ((16, 25, 8), 8, False, (0, 8)), # IC: emit - ((16, 25, 8), 16, False, None), # transition: hold (no replay) - ((16, 25, 8), 24, False, (8, 24)), # first normal emit, lc=8 + # + # IC=8, cs=16: IC evenly divides chunk_size (edge case) + # initial_coverage = (16//8)*8 = 16 == chunk_size. + # IC fills the entire first chunk: emits at 8 and 16. + # Normal phase starts at frame 17; first normal emit at 16+16=32. + ((16, 25, 8), 8, False, (0, 8)), # IC: 8%8==0 -> emit, lc=0 + ((16, 25, 8), 16, False, (8, 16)), # IC: 16<=16, 16%8==0 -> emit, lc=8 + ((16, 25, 8), 24, False, None), # normal: adjusted=8, 8%16!=0 -> hold + ((16, 25, 8), 32, False, (16, 32)), # normal: adjusted=16, 16%16==0 -> first emit, lc=16 + # + # IC=5, cs=25: IC evenly divides chunk_size + # initial_coverage = (25//5)*5 = 25 == chunk_size. + # IC fills the entire first chunk: emits at 5, 10, 15, 20, 25. + # Normal phase starts at frame 26; first normal emit at 25+25=50. + # Emit intervals: 5,5,5,5,5,25,25,... — smooth transition, no gap. + ((25, 25, 5), 5, False, (0, 5)), # IC: 5%5==0 -> emit, lc=0 + ((25, 25, 5), 12, False, None), # IC: 12%5!=0 -> hold + ((25, 25, 5), 25, False, (20, 25)), # IC: 25<=25, 25%5==0 -> emit, lc=20 + ((25, 25, 5), 30, False, None), # normal: adjusted=5, 5%25!=0 -> hold + ((25, 25, 5), 50, False, (25, 50)), # normal: adjusted=25, 25%25==0 -> first emit, lc=25 + # # Per-request override: IC=15 at n_frames=10 -> 10%15!=0 -> hold ((25, 25, 15), 10, False, None), ] diff --git a/vllm_omni/model_executor/stage_input_processors/qwen3_tts.py b/vllm_omni/model_executor/stage_input_processors/qwen3_tts.py index 21815b09b3..ade0169321 100644 --- a/vllm_omni/model_executor/stage_input_processors/qwen3_tts.py +++ b/vllm_omni/model_executor/stage_input_processors/qwen3_tts.py @@ -212,7 +212,7 @@ def talker2code2wav_async_chunk( } return None - in_initial_phase = initial_chunk_size > 0 and initial_chunk_size < chunk_size and length < chunk_size + in_initial_phase = initial_chunk_size > 0 and initial_chunk_size < chunk_size and length <= chunk_size if in_initial_phase: # IC phase: emit every initial_chunk_size frames with growing left context. @@ -225,7 +225,7 @@ def talker2code2wav_async_chunk( # Normal phase: offset so the first normal emit picks up after IC phase. # IC is stateless (may change with load); any mismatch is absorbed by left_context. 
initial_coverage = ( - ((chunk_size - 1) // initial_chunk_size) * initial_chunk_size if 0 < initial_chunk_size < chunk_size else 0 + (chunk_size // initial_chunk_size) * initial_chunk_size if 0 < initial_chunk_size < chunk_size else 0 ) adjusted = length - initial_coverage if not finished and adjusted % chunk_size != 0: From 7b965a7b6c8f73090f28364ab029037f08c890b2 Mon Sep 17 00:00:00 2001 From: Samit <285365963@qq.com> Date: Wed, 1 Apr 2026 10:47:33 +0800 Subject: [PATCH 006/204] [Feat][Benchmark] Add synchronous video generation endpoint POST /v1/videos/sync for benchmark test (#2049) Signed-off-by: samithuang <285365963@qq.com> --- .../examples/online_serving/image_to_video.md | 35 +++- .../examples/online_serving/text_to_video.md | 33 +++- .../online_serving/image_to_video/README.md | 35 +++- .../online_serving/text_to_video/README.md | 33 +++- .../openai_api/test_video_server.py | 153 ++++++++++++++++++ vllm_omni/entrypoints/openai/api_server.py | 144 ++++++++++------- vllm_omni/entrypoints/openai/serving_video.py | 50 +++++- 7 files changed, 421 insertions(+), 62 deletions(-) diff --git a/docs/user_guide/examples/online_serving/image_to_video.md b/docs/user_guide/examples/online_serving/image_to_video.md index 912450acf1..00b67d74e2 100644 --- a/docs/user_guide/examples/online_serving/image_to_video.md +++ b/docs/user_guide/examples/online_serving/image_to_video.md @@ -37,12 +37,45 @@ artifact, poll the job status and then download the completed file from the content endpoint. The main endpoints are: -- `POST /v1/videos`: create a video generation job +- `POST /v1/videos`: create a video generation job (async) +- `POST /v1/videos/sync`: generate a video and return raw bytes (sync, for benchmarks) - `GET /v1/videos/{video_id}`: retrieve the current job status and metadata - `GET /v1/videos`: list stored video jobs - `GET /v1/videos/{video_id}/content`: download the generated video file - `DELETE /v1/videos/{video_id}`: delete the job and any stored output +## Sync API (Benchmark / Testing) + +`POST /v1/videos/sync` is a synchronous alternative that blocks until generation +completes and returns the raw video bytes (`video/mp4`) directly in the response +body. It is designed for benchmark and testing scenarios where one-shot +request/response latency measurement is needed. + +The sync endpoint accepts the same form parameters as `POST /v1/videos`. It does +not create any stored job record — the response is purely the generated video +file. Metadata is returned via response headers: + +- `X-Request-Id`: unique identifier for this generation request +- `X-Model`: model name used for generation +- `X-Inference-Time-S`: wall-clock inference time in seconds + +```bash +curl -X POST http://localhost:8091/v1/videos/sync \ + -F "prompt=A bear playing with yarn, smooth motion" \ + -F "input_reference=@/path/to/input.png" \ + -F "size=832x480" \ + -F "num_frames=33" \ + -F "fps=16" \ + -F "negative_prompt=low quality, blurry, static" \ + -F "num_inference_steps=40" \ + -F "guidance_scale=1.0" \ + -F "guidance_scale_2=1.0" \ + -F "boundary_ratio=0.875" \ + -F "flow_shift=12.0" \ + -F "seed=42" \ + -o sync_i2v_output.mp4 +``` + ## Storage Generated video files are stored on local disk by the async video API. 
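As a companion to the curl example above, here is a minimal Python sketch of the same synchronous image-to-video call. It is illustrative only and not part of this patch: the server URL, local file names, and timeout are assumptions, while the form fields and `X-*` response headers follow the endpoint description in the doc.

```python
# Minimal sketch: POST /v1/videos/sync with a reference image, via the requests library.
# Server URL, file paths, and timeout are assumptions; form fields and headers
# mirror the documented sync endpoint.
import requests

url = "http://localhost:8091/v1/videos/sync"  # assumed local server
form = {
    "prompt": "A bear playing with yarn, smooth motion",
    "size": "832x480",
    "num_frames": "33",
    "fps": "16",
    "num_inference_steps": "40",
    "seed": "42",
}

with open("input.png", "rb") as f:  # hypothetical local reference image
    resp = requests.post(
        url,
        data=form,
        files={"input_reference": ("input.png", f, "image/png")},
        timeout=600,  # the sync endpoint blocks until the video is ready
    )
resp.raise_for_status()

# The body is raw MP4 bytes; generation metadata comes back in headers.
with open("sync_i2v_output.mp4", "wb") as out:
    out.write(resp.content)
print("request id:", resp.headers.get("X-Request-Id"))
print("inference time (s):", resp.headers.get("X-Inference-Time-S"))
```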
diff --git a/docs/user_guide/examples/online_serving/text_to_video.md b/docs/user_guide/examples/online_serving/text_to_video.md index ea7a94ed95..d58296fcc7 100644 --- a/docs/user_guide/examples/online_serving/text_to_video.md +++ b/docs/user_guide/examples/online_serving/text_to_video.md @@ -37,12 +37,43 @@ artifact, poll the job status and then download the completed file from the content endpoint. The main endpoints are: -- `POST /v1/videos`: create a video generation job +- `POST /v1/videos`: create a video generation job (async) +- `POST /v1/videos/sync`: generate a video and return raw bytes (sync, for benchmarks) - `GET /v1/videos/{video_id}`: retrieve the current job status and metadata - `GET /v1/videos`: list stored video jobs - `GET /v1/videos/{video_id}/content`: download the generated video file - `DELETE /v1/videos/{video_id}`: delete the job and any stored output +## Sync API (Benchmark / Testing) + +`POST /v1/videos/sync` is a synchronous alternative that blocks until generation +completes and returns the raw video bytes (`video/mp4`) directly in the response +body. It is designed for benchmark and testing scenarios where one-shot +request/response latency measurement is needed. + +The sync endpoint accepts the same form parameters as `POST /v1/videos`. It does +not create any stored job record — the response is purely the generated video +file. Metadata is returned via response headers: + +- `X-Request-Id`: unique identifier for this generation request +- `X-Model`: model name used for generation +- `X-Inference-Time-S`: wall-clock inference time in seconds + +```bash +curl -X POST http://localhost:8091/v1/videos/sync \ + -F "prompt=Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \ + -F "size=832x480" \ + -F "num_frames=33" \ + -F "fps=16" \ + -F "num_inference_steps=40" \ + -F "guidance_scale=4.0" \ + -F "guidance_scale_2=4.0" \ + -F "boundary_ratio=0.875" \ + -F "flow_shift=5.0" \ + -F "seed=42" \ + -o sync_t2v_output.mp4 +``` + ## Storage Generated video files are stored on local disk by the async video API. diff --git a/examples/online_serving/image_to_video/README.md b/examples/online_serving/image_to_video/README.md index fea99efa60..49283bd9a0 100644 --- a/examples/online_serving/image_to_video/README.md +++ b/examples/online_serving/image_to_video/README.md @@ -34,12 +34,45 @@ artifact, poll the job status and then download the completed file from the content endpoint. The main endpoints are: -- `POST /v1/videos`: create a video generation job +- `POST /v1/videos`: create a video generation job (async) +- `POST /v1/videos/sync`: generate a video and return raw bytes (sync, for benchmarks) - `GET /v1/videos/{video_id}`: retrieve the current job status and metadata - `GET /v1/videos`: list stored video jobs - `GET /v1/videos/{video_id}/content`: download the generated video file - `DELETE /v1/videos/{video_id}`: delete the job and any stored output +## Sync API (Benchmark / Testing) + +`POST /v1/videos/sync` is a synchronous alternative that blocks until generation +completes and returns the raw video bytes (`video/mp4`) directly in the response +body. It is designed for benchmark and testing scenarios where one-shot +request/response latency measurement is needed. + +The sync endpoint accepts the same form parameters as `POST /v1/videos`. It does +not create any stored job record — the response is purely the generated video +file. 
Metadata is returned via response headers: + +- `X-Request-Id`: unique identifier for this generation request +- `X-Model`: model name used for generation +- `X-Inference-Time-S`: wall-clock inference time in seconds + +```bash +curl -X POST http://localhost:8091/v1/videos/sync \ + -F "prompt=A bear playing with yarn, smooth motion" \ + -F "input_reference=@/path/to/input.png" \ + -F "size=832x480" \ + -F "num_frames=33" \ + -F "fps=16" \ + -F "negative_prompt=low quality, blurry, static" \ + -F "num_inference_steps=40" \ + -F "guidance_scale=1.0" \ + -F "guidance_scale_2=1.0" \ + -F "boundary_ratio=0.875" \ + -F "flow_shift=12.0" \ + -F "seed=42" \ + -o sync_i2v_output.mp4 +``` + ## Storage Generated video files are stored on local disk by the async video API. diff --git a/examples/online_serving/text_to_video/README.md b/examples/online_serving/text_to_video/README.md index 92afb1fc9b..44e676671f 100644 --- a/examples/online_serving/text_to_video/README.md +++ b/examples/online_serving/text_to_video/README.md @@ -34,12 +34,43 @@ artifact, poll the job status and then download the completed file from the content endpoint. The main endpoints are: -- `POST /v1/videos`: create a video generation job +- `POST /v1/videos`: create a video generation job (async) +- `POST /v1/videos/sync`: generate a video and return raw bytes (sync, for benchmarks) - `GET /v1/videos/{video_id}`: retrieve the current job status and metadata - `GET /v1/videos`: list stored video jobs - `GET /v1/videos/{video_id}/content`: download the generated video file - `DELETE /v1/videos/{video_id}`: delete the job and any stored output +## Sync API (Benchmark / Testing) + +`POST /v1/videos/sync` is a synchronous alternative that blocks until generation +completes and returns the raw video bytes (`video/mp4`) directly in the response +body. It is designed for benchmark and testing scenarios where one-shot +request/response latency measurement is needed. + +The sync endpoint accepts the same form parameters as `POST /v1/videos`. It does +not create any stored job record — the response is purely the generated video +file. Metadata is returned via response headers: + +- `X-Request-Id`: unique identifier for this generation request +- `X-Model`: model name used for generation +- `X-Inference-Time-S`: wall-clock inference time in seconds + +```bash +curl -X POST http://localhost:8091/v1/videos/sync \ + -F "prompt=Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \ + -F "size=832x480" \ + -F "num_frames=33" \ + -F "fps=16" \ + -F "num_inference_steps=40" \ + -F "guidance_scale=4.0" \ + -F "guidance_scale_2=4.0" \ + -F "boundary_ratio=0.875" \ + -F "flow_shift=5.0" \ + -F "seed=42" \ + -o sync_t2v_output.mp4 +``` + ## Storage Generated video files are stored on local disk by the async video API. 
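Because the sync endpoint is intended for benchmarking, it can be useful to compare client-side wall-clock latency against the server-reported `X-Inference-Time-S` header. The sketch below is a rough, assumption-based harness (server URL, prompt, and run count are placeholders), not shipped tooling.

```python
# Rough latency harness for POST /v1/videos/sync (text-to-video form).
# Server URL, prompt, and run count are illustrative assumptions.
import time

import requests

URL = "http://localhost:8091/v1/videos/sync"  # assumed local server
FORM = {
    "prompt": "Two anthropomorphic cats in boxing gear fight on a spotlighted stage.",
    "size": "832x480",
    "num_frames": "33",
    "fps": "16",
    "seed": "42",
}

for i in range(3):  # small sample size; increase for a real benchmark
    start = time.perf_counter()
    resp = requests.post(URL, data=FORM, timeout=600)  # generation may take minutes
    wall_s = time.perf_counter() - start
    resp.raise_for_status()

    server_s = float(resp.headers["X-Inference-Time-S"])  # server-side generation time
    overhead_s = wall_s - server_s  # network, muxing, and HTTP overhead
    print(
        f"run {i}: wall={wall_s:.2f}s server={server_s:.2f}s "
        f"overhead={overhead_s:.2f}s bytes={len(resp.content)}"
    )
```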
diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py index f8d68d95a2..7200b38abb 100644 --- a/tests/entrypoints/openai_api/test_video_server.py +++ b/tests/entrypoints/openai_api/test_video_server.py @@ -735,3 +735,156 @@ def test_extra_params_merged_with_existing_extra_args(test_client, mocker: Mocke assert captured.extra_args["flow_shift"] == 0.5 assert captured.extra_args["use_zero_init"] is True assert captured.extra_args["zero_steps"] == 2 + + +# --------------------------------------------------------------------------- +# Sync endpoint tests (POST /v1/videos/sync) +# --------------------------------------------------------------------------- + + +def _mock_encode_video_bytes(mocker: MockerFixture, return_value: bytes = b"fake-video-bytes"): + """Mock the raw-bytes encoder used by the sync video path.""" + return mocker.patch( + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=return_value, + ) + + +def test_sync_t2v_returns_video_bytes(test_client, mocker: MockerFixture): + """Sync endpoint should block until generation finishes and return raw + video bytes with metadata headers.""" + _mock_encode_video_bytes(mocker, b"fake-video-bytes") + response = test_client.post( + "/v1/videos/sync", + data={ + "prompt": "A cat running across the street.", + "size": "640x360", + "seconds": "2", + "fps": "12", + }, + ) + + assert response.status_code == 200 + assert response.headers["content-type"] == "video/mp4" + assert response.content == b"fake-video-bytes" + assert response.headers["x-request-id"].startswith("video_sync-") + assert response.headers["x-model"] == "Wan-AI/Wan2.2-T2V-A14B-Diffusers" + assert float(response.headers["x-inference-time-s"]) >= 0 + + +def test_sync_i2v_returns_video_bytes(test_client, mocker: MockerFixture): + """Sync I2V endpoint should accept an uploaded reference image and return + raw video bytes.""" + image_bytes = _make_test_image_bytes((48, 32)) + _mock_encode_video_bytes(mocker, b"i2v-video-data") + response = test_client.post( + "/v1/videos/sync", + data={"prompt": "A bear playing with yarn."}, + files={"input_reference": ("input.png", image_bytes, "image/png")}, + ) + + assert response.status_code == 200 + assert response.content == b"i2v-video-data" + assert response.headers["content-type"] == "video/mp4" + + +def test_sync_i2v_with_image_reference(test_client, mocker: MockerFixture): + """Sync I2V endpoint should accept a JSON image_reference field.""" + _mock_encode_video_bytes(mocker, b"ref-video") + response = test_client.post( + "/v1/videos/sync", + data={ + "prompt": "A fox running through snow.", + "image_reference": json.dumps({"image_url": _make_test_image_data_url((40, 24))}), + }, + ) + + assert response.status_code == 200 + assert response.content == b"ref-video" + + +def test_sync_missing_handler_returns_503(): + app = FastAPI() + app.include_router(router) + app.state.openai_serving_video = None + client = TestClient(app) + + response = client.post( + "/v1/videos/sync", + data={"prompt": "no handler"}, + ) + assert response.status_code == 503 + assert "not initialized" in response.json()["detail"].lower() + + +def test_sync_missing_prompt_returns_422(test_client): + response = test_client.post( + "/v1/videos/sync", + data={"size": "320x240"}, + ) + assert response.status_code == 422 + + +def test_sync_rejects_both_references(test_client): + response = test_client.post( + "/v1/videos/sync", + data={ + "prompt": "bad refs", + "image_reference": 
'{"image_url": "https://example.com/cat.png"}', + }, + files={"input_reference": ("input.png", _make_test_image_bytes(), "image/png")}, + ) + assert response.status_code == 400 + assert "either input_reference or image_reference" in response.json()["detail"].lower() + + +def test_sync_generation_error_returns_500(test_client, mocker: MockerFixture): + """If the underlying generation raises, the sync endpoint should return 500.""" + mocker.patch.object( + OmniOpenAIServingVideo, + "generate_video_bytes", + side_effect=RuntimeError("GPU exploded"), + ) + response = test_client.post( + "/v1/videos/sync", + data={"prompt": "will fail"}, + ) + assert response.status_code == 500 + assert "GPU exploded" in response.json()["detail"] + + +def test_sync_does_not_create_store_entry(test_client, mocker: MockerFixture): + """The sync endpoint should NOT leave any record in VIDEO_STORE — it is + stateless by design.""" + _mock_encode_video_bytes(mocker) + response = test_client.post( + "/v1/videos/sync", + data={"prompt": "stateless test"}, + ) + assert response.status_code == 200 + loop = asyncio.new_event_loop() + try: + stored = loop.run_until_complete(api_server.VIDEO_STORE.list_values()) + finally: + loop.close() + assert len(stored) == 0 + + +def test_sync_sampling_params_pass_through(test_client, mocker: MockerFixture): + """Sampling parameters should propagate to the engine through the sync path.""" + _mock_encode_video_bytes(mocker) + response = test_client.post( + "/v1/videos/sync", + data={ + "prompt": "param pass", + "num_inference_steps": "30", + "guidance_scale": "6.5", + "seed": "42", + }, + ) + assert response.status_code == 200 + engine = test_client.app.state.openai_serving_video._engine_client + captured = engine.captured_sampling_params_list[0] + assert captured.num_inference_steps == 30 + assert captured.guidance_scale == 6.5 + assert captured.seed == 42 diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index ba47c0b316..d832b2726c 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -20,7 +20,7 @@ import httpx import vllm.envs as envs from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, Request, UploadFile, WebSocket -from fastapi.responses import FileResponse, JSONResponse, StreamingResponse +from fastapi.responses import FileResponse, JSONResponse, Response, StreamingResponse from PIL import Image from pydantic import BaseModel, Field from starlette.datastructures import State @@ -1956,16 +1956,10 @@ async def _run_video_generation_job( raise -@router.post( - "/v1/videos", - responses={ - HTTPStatus.OK.value: {"model": VideoResponse}, - HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, - HTTPStatus.SERVICE_UNAVAILABLE.value: {"model": ErrorResponse}, - HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, - }, -) -async def create_video( +VIDEO_SYNC_TIMEOUT_S = 600.0 + + +async def _parse_video_form( raw_request: Request, prompt: str = Form(...), input_reference: UploadFile | None = File(default=None), @@ -1988,48 +1982,15 @@ async def create_video( negative_prompt: str | None = Form(default=None), lora: str | None = Form(default=None), extra_params: str | None = Form(default=None), -) -> VideoResponse: - """Create an asynchronous video generation job. - - This OpenAI-style endpoint accepts multipart form-data, validates the - request payload, persists a queued job record, and starts generation in the - background. 
The response contains metadata for polling job status rather - than the generated video bytes. +) -> tuple[VideoGenerationRequest, "OmniOpenAIServingVideo", str, ReferenceImage | None]: + """FastAPI dependency that parses video form data, validates inputs, + resolves the handler, and decodes any reference image. - Args: - raw_request: Raw FastAPI request for accessing app state. - prompt: Text prompt describing the requested video. - input_reference: Optional uploaded reference image file. - image_reference: Optional JSON-encoded reference image descriptor. - model: Optional model name supplied by the client. - seconds: Optional target duration string accepted by the video API. - size: Optional output size string such as ``1280x720``. - user: Optional user identifier forwarded in the stored request. - width: Optional explicit output width override. - height: Optional explicit output height override. - num_frames: Optional explicit frame count override. - fps: Optional explicit frame rate override. - num_inference_steps: Optional inference step override. - guidance_scale: Optional primary guidance scale override. - guidance_scale_2: Optional secondary guidance scale override. - boundary_ratio: Optional boundary ratio override. - flow_shift: Optional flow shift override. - true_cfg_scale: Optional true CFG scale override. - seed: Optional random seed override. - negative_prompt: Optional negative prompt. - lora: Optional JSON-encoded per-request LoRA configuration. - extra_params: Optional model-specific parameters passed directly to the model's extra_args. - - Returns: - A queued ``VideoResponse`` that includes the generated job identifier - and initial metadata for later retrieval. - - Raises: - HTTPException: If the request is invalid, the video handler is - unavailable, or job initialization fails. + Used by both ``POST /v1/videos`` (async) and ``POST /v1/videos/sync``. 
""" input_reference_bytes = await input_reference.read() if input_reference is not None else None parsed_image_reference = _parse_form_json(image_reference) + if parsed_image_reference is not None and input_reference_bytes is not None: raise HTTPException( status_code=HTTPStatus.BAD_REQUEST.value, @@ -2058,7 +2019,6 @@ async def create_video( "lora": _parse_form_json(lora, expected_type=dict), "extra_params": _parse_form_json(extra_params, expected_type=dict), } - request_data = {k: v for k, v in request_data.items() if v is not None} request = VideoGenerationRequest(**request_data) @@ -2082,25 +2042,101 @@ async def create_video( except HTTPException: raise except Exception as e: - logger.exception("Video generation failed: %s", e) + logger.exception("Video generation setup failed: %s", e) raise HTTPException( status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, - detail=f"Video generation failed: {str(e)}", + detail=f"Video generation setup failed: {str(e)}", ) - ref = video_response_from_request(effective_model_name, request) try: image_data = await decode_input_reference(request.image_reference, input_reference_bytes) except InvalidInputReferenceError as exc: raise HTTPException(400, detail=str(exc) or "Invalid input reference.") from exc - reference_image = ReferenceImage(data=image_data) if image_data is not None else image_data + reference_image = ReferenceImage(data=image_data) if image_data is not None else None + return request, handler, effective_model_name, reference_image + + +@router.post( + "/v1/videos", + responses={ + HTTPStatus.OK.value: {"model": VideoResponse}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.SERVICE_UNAVAILABLE.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +async def create_video( + ctx: tuple[VideoGenerationRequest, OmniOpenAIServingVideo, str, ReferenceImage | None] = Depends(_parse_video_form), +) -> VideoResponse: + """Create an asynchronous video generation job. + + Accepts multipart form-data (see ``_parse_video_form`` for parameters), + persists a queued job record, and starts generation in the background. + """ + request, handler, effective_model_name, reference_image = ctx + ref = video_response_from_request(effective_model_name, request) await VIDEO_STORE.upsert(ref.id, ref) task = asyncio.create_task(_run_video_generation_job(handler, request, ref.id, reference_image)) await VIDEO_TASKS.upsert(ref.id, task) return ref +@router.post( + "/v1/videos/sync", + responses={ + HTTPStatus.OK.value: {"content": {"video/mp4": {}}}, + HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse}, + HTTPStatus.SERVICE_UNAVAILABLE.value: {"model": ErrorResponse}, + HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, + }, +) +async def create_video_sync( + ctx: tuple[VideoGenerationRequest, OmniOpenAIServingVideo, str, ReferenceImage | None] = Depends(_parse_video_form), +) -> Response: + """Synchronous video generation endpoint. + + Accepts the same form parameters as ``POST /v1/videos`` but blocks until + generation completes and returns raw video bytes (``video/mp4``) directly. + Designed for benchmark and testing scenarios. + + Metadata is returned via response headers ``X-Request-Id``, + ``X-Model``, and ``X-Inference-Time-S``. 
+ """ + request, handler, effective_model_name, reference_image = ctx + request_id = f"video_sync-{random_uuid()}" + started_at = time.perf_counter() + try: + video_bytes = await asyncio.wait_for( + handler.generate_video_bytes(request, request_id, reference_image=reference_image), + timeout=VIDEO_SYNC_TIMEOUT_S, + ) + except asyncio.TimeoutError: + raise HTTPException( + status_code=HTTPStatus.GATEWAY_TIMEOUT.value, + detail=f"Video generation timed out after {VIDEO_SYNC_TIMEOUT_S}s.", + ) + except HTTPException: + raise + except Exception as exc: + logger.exception("Sync video generation failed for request_id=%s", request_id) + raise HTTPException( + status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=f"Video generation failed: {str(exc)}", + ) from exc + inference_time_s = time.perf_counter() - started_at + + return Response( + content=video_bytes, + media_type="video/mp4", + headers={ + "X-Request-Id": request_id, + "X-Model": effective_model_name, + "X-Inference-Time-S": f"{inference_time_s:.3f}", + }, + ) + + @router.get("/v1/videos", response_model=VideoListResponse) async def list_videos( after: str | None = None, diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py index 5ccc7c3d27..2987c81fba 100644 --- a/vllm_omni/entrypoints/openai/serving_video.py +++ b/vllm_omni/entrypoints/openai/serving_video.py @@ -20,7 +20,7 @@ VideoGenerationResponse, ) from vllm_omni.entrypoints.openai.utils import get_stage_type, parse_lora_request -from vllm_omni.entrypoints.openai.video_api_utils import encode_video_base64 +from vllm_omni.entrypoints.openai.video_api_utils import _encode_video_bytes, encode_video_base64 from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniSamplingParams, OmniTextPrompt logger = init_logger(__name__) @@ -71,13 +71,18 @@ def for_diffusion( stage_configs=stage_configs, ) - async def generate_videos( + async def _run_and_extract( self, request: VideoGenerationRequest, reference_id: str, *, reference_image: ReferenceImage | None = None, - ) -> VideoGenerationResponse: + ) -> tuple[list[Any], list[Any | None], int, int]: + """Run the generation pipeline and extract video/audio outputs. + + Returns: + Tuple of (videos, audios, audio_sample_rate, output_fps). 
+ """ prompt: OmniTextPrompt = OmniTextPrompt(prompt=request.prompt) if request.negative_prompt is not None: prompt["negative_prompt"] = request.negative_prompt @@ -144,12 +149,23 @@ async def generate_videos( ) result = await self._run_generation(prompt, gen_params, reference_id) - _t_encode_start = time.perf_counter() videos = self._extract_video_outputs(result) audios = self._extract_audio_outputs(result, expected_count=len(videos)) audio_sample_rate = self._resolve_audio_sample_rate(result) output_fps = vp.fps or 24 + return videos, audios, audio_sample_rate, output_fps + async def generate_videos( + self, + request: VideoGenerationRequest, + reference_id: str, + *, + reference_image: ReferenceImage | None = None, + ) -> VideoGenerationResponse: + videos, audios, audio_sample_rate, output_fps = await self._run_and_extract( + request, reference_id, reference_image=reference_image + ) + _t_encode_start = time.perf_counter() video_data = [ VideoData( b64_json=( @@ -169,6 +185,32 @@ async def generate_videos( logger.info("Video response encoding (MP4+base64): %.2f ms", _t_encode_ms) return VideoGenerationResponse(created=int(time.time()), data=video_data) + async def generate_video_bytes( + self, + request: VideoGenerationRequest, + reference_id: str, + *, + reference_image: ReferenceImage | None = None, + ) -> bytes: + """Generate a video and return raw MP4 bytes, bypassing base64 encoding.""" + videos, audios, audio_sample_rate, output_fps = await self._run_and_extract( + request, reference_id, reference_image=reference_image + ) + if len(videos) > 1: + logger.warning( + "Video request %s generated %d outputs; returning only the first.", reference_id, len(videos) + ) + audio = audios[0] + _t_encode_start = time.perf_counter() + video_bytes = _encode_video_bytes( + videos[0], + fps=output_fps, + **({"audio": audio, "audio_sample_rate": audio_sample_rate} if audio is not None else {}), + ) + _t_encode_ms = (time.perf_counter() - _t_encode_start) * 1000 + logger.info("Video response encoding (MP4 bytes): %.2f ms", _t_encode_ms) + return video_bytes + @staticmethod def _apply_lora(lora_body: Any, gen_params: OmniDiffusionSamplingParams) -> None: try: From 183775ec36928930cca26eb74c8a6fb248eca251 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Wed, 1 Apr 2026 12:19:47 +0800 Subject: [PATCH 007/204] [Docs] Update WeChat QR code for community support (#2402) Signed-off-by: david6666666 Co-authored-by: david6666666 --- docs/assets/WeChat.jpg | Bin 101300 -> 100428 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/docs/assets/WeChat.jpg b/docs/assets/WeChat.jpg index 15c25513c4883319dc0db33b265f4af6f49c5e83..28956a12099dfaed166e958f24218ed201858cbd 100644 GIT binary patch literal 100428 zcmd?Rhddg%&ce-*&-@go!Lo6p@C%YJt`C#8PzEbdn839J2O$4Wp8C~ zAw2J|uKRcYe$VUq51#Awy6=i}e8=Z=9Pf2}{nbdPw{BwEL_tBZRY~!r1_cEbEB+@( zvjIQ3)pLUxf2dv6Rl0WO3_PD8Pdl7eC#{)RvBf0Y0I`(2a&`MV4y!~gu9co|+8 zigyx!uKf%DmvQENE%9#~jnncJuWFfp;UDX4NV|jX zj^=s63E`tAb?Wq5T$}i50z@{=>1Ui=?=SMWG{9fosNe|G_A`R(j~-!gp$yYCyqQs& zc;n{ojN>P{l{J$zVn2?#tyZSUp4;_a_;dTC^R@OuH23e{=aQ8v z_IDQ&7WT>Rbl;1gb4x|^`fbw}k(Ff|YkNEiGNt(xP`gz1p=aBszNQ z0i9IO@q(qG{%ZD(>!_(u+_6%P7bzjm!8Sv$q@>+=M>W_@{i`NXe;81H-c?%_)%`}(TF)O2*_{#XR{ zdeXg^?RjJ?dh?yKuvLccuJt$!ODt~jg9TyRw~cmza15om6N9y1qP`p zy2i$~9lG$&_-U9faY`k(H&alEy!%pD$KX6SQYD>mmzJ$$cDUKhb(NBW>F5oLXh}Q6 zr(w+PIaXA4b#=*UU+yixIrBvHYo3$MfW)OdM|x{(>*PzT2A|W7DuTYgzH(wf0&hwF zbWS(@Q8>F)e(n2%H4gsxUw`}{Pinc#NkVuwlW>Pyz1>6lA3yu 
zfq_Auf`a#0O#Imxl7^<{c{^_3qkk5M(@XIQxgM|dvH~7PMCkZWj~A|n78M;Yv0~OH zkF|dYJ8%Q+(9JhQ)^ zjMJ$6{75)yDKtAklq7KO_SBcr29!!x)ep*=l@@e++Xrl$OlWZZK5D)xwH#0-3iIs4|#o2Yp{ z9;ydLRU)G zbyzvi{FFOvQ1Cg{JR>8+xX6<-&t?-14g1E88&$0tR7_0x7P@?B9kbJ%rU&|pyr@*I zmCv4Kv_0PV`~PV$8Pmeh@o0Rx&>?`)6R<)M2>RD&f`tHy;`C856 zQ%|LxO1KgX+!t<(OG!=F{_Lj=9VURw*5Ha&qz>K4w3n%Og=%UcPBe<=(DChM{51Y3JW6SQbjV{}B%( zceu@8DqjB8k?*2vjnx<2C0S)vc>er(r6AU0pGF)hc^62L0tVM4ckf!Z&%Ju}>TBCy z8W|t=t=G*m-PzaIrx?9KUd!+Et&N9{vQ3pZl=rI|3^}4QwZ)Qqjq9=Yq zyy>g=`pr>gyWF}m+?MP!PV8I1fW`G3S8wPjod&and%Jwhm3;cTS8v`hxUVc+PpUu8 z(2`+9{r39{T4`k!m9lOAJAC`+|DmtDp0%)!mG;HL$G^u_;^x#|KBcQa<3)|yvNG|Q zs+~CEq7~mE24+Wg6vD3JySKXcpm)?A`_B(j|IZIfx}V*amF|6haZ$&^&5Tw)-5Z@I z*l^2U(Jdn*BaQ-MCUsnag_kZ~e0DZQCa>4g#l`2r1KRQa8=1G-nI!E;DqSRGWo2)4 z1&VFkwh0nnwq-Xc%7i*#AQoM%dY&3mX?A(H|au;WO!_1 zF`lp)7gv(n$;f!GJ&n%!u(UK;S65eU_7s+?)Og;@th4%s?$I;joiVuvE-o%%ogS;v zCat6xnKP1hLnos(AFv*)j8ln>8sLf;8M!IsG-W*WPC{HfIB>6+Pz|x{jQc7>8YHrA zeDfY@{}5?HvbG)hexzq~^vtD%mF16%v2uiN@ee`Cg0QzCPo43Afed zd7@59ChQTu7VPc)PrSrN^@|q|E?BK2)tybX^z~c1s;!f-~?-ic1V zl?&?Xl$#k38&B!ulzbjN+7tGjhLYlDTMoUlva;#OAO(d*!2-|smKM`Mo8?72zo4La zJ&Tr16RPE<`Rg6;-d_JwfgW_zkjl~;`Q|}tjW%;t*yg-^<7d9 zV@3EG<+B0j54pJ|e)ODCgIuRQ{!CJrR#a$?8vI#ae(tt3_t~Z;+k#En^>?tTl;F+p zf^yfDez&~2x>?$7@nJFveb&1+^tFzL#yVf<6 z;xX9>VZkDcg?n+av3&~}ZSQRcPX^YQc3mmuN>)Tkq(tkW^6>BwJaQyNk|p2p@@0xJ zo-^J$X?m0uR+dwJN=ceTZw-F>tFurxRX5`wNr$ofKHEYbJ$h7}X;S|T+wiqv*`{yQ zYzHfIsn>1T(DTfPXCJEV6kUv-t7Z=zi@Ko#Kalnq0NI zKSXb}(Zsk;ZPd=M2;573<;s=Da%+404Ox0;X4~7aYy?F`o3w?^%uZ^o1C=%b3@B?2q z_upPlB>B_}+1nKE@1|5xcyp|+VKYuy5WAxC^JioGn!7jrDv!se{QCIuV;N~{zPzz< z?2U1jjz;CE;9xrRU%Tk7TeegJfW(q6WW0Fs!q3Z#(k@>jh!d5)j;Xq?F0@(F@WzeA z-lk1rZi}`Z%TwPoj5HF?^q2Z?&s!>LNYacWT^QUYp#S*;?MO>jiZZ#a?V|qd28y2k z{<2K-=2w?`+o^GI#YYp+VWw(LR{ku!$S;U;$O^cBf5#yKt+%gV3x4a6(lM5rpxL~6 zyOhYxCh>{jrokAGTC<9ko^6BZ^BM{1;=Cj>ot6a!`}60|nKCY3y0pG5BP%OBadzf6 zt7XA=f%Hosc+#ud=M3Dq6FqF0PY1EdzM@yTk?%4)R2FNVl$7M1 zO;sO%#wb+N);6g$ksKX`ZL z0W)p<>s59M4f(={qeUk+mYtyCc;4qI$j$RUJ6&|4dqpX`PuJr8yoHt+-*bac_Fd0? zd$9i1tfixpRZKNK+GKx4Fb@X~3xslV%Z`+}IN$;-)yi-lPeMJ<<4S;zXi`xB%h z)U$vs?^C{;V>wmhowg;h3GF!S}wr{}&te7=3hH@1qP`*2Y)S9I5ESCl4ae-&4D zDxA%7%c=cF|5icXC__3b)M&Nb-QQQ^WGDSn!dJQ27*kqWT1MiQhJ=LR(w3E#X^5Q; zApn+m(92iC6;^$b2JRt}=4Dt6eV4|8sgIi-+`M_S{xH^|SI~t6ZP@yb&A2LalIXSwSyMerrtNAr!!Xp92{fBOO!ow#y^()5ZdjEq54{rHXUESSm;1M^pYT!D#K5w#ouVkgF?E3ZV_ zLGK)JEXU!)B(>U1S2wqA>_xgAJ5Gj&rKbxpOFIUekc`Ujd!=2-F6k+|XGVO?ZV78H z-@CXjN+d1*ojZ2zk~c9)YDl8FcJ11RUAuO@V5{cqzE6Gnq)Z88$~P1{f+?vRQc~>L zxwH88X2#3Qi?fzpMU>xqhV)ifmakwdcMT3wH#IfAC<_b-sLOpx_aY}}_vg=_Px{{` zs&PBcpVG$0M&0m~f`Yu7TJZi13W}-e>F)Hx6>seOA=(pt``OtkfN0m_N0PdFdI7t{ zK-}czUc8@8y;JfouM0(~NS+iwsH4grC)L$+(II;y&+ z<~BZs>CQHmP7v@fiG7W?Yoe*6;&_H1IHw#PkhtscMH)r{{dM8W+{)5A*ZE3g=bIggtFkyd ze9E6&to5Ba6(E$Qy?un_&QrOzzZgnOOVw&x3*1~Lrl*a_A&#I}?EL%zrur%>Dp8Vl zrdeOcf4rrxtFKQoBmmQ65wrN&_tdnkMOe2C6+zgd=(le4=jj8`k8Jcwm8 ziA2(i`Cv1+jVx~E%^#GKa$pOaOkT^N$M0_H#MIQ-01@++qpx4Te)jm>{w%$%!_Dal z9Wk}hQrRtoM4QiXTN2e~M&G{C{EDS9&pMb)R&{Gvc&QL*Yi-_+#npxStTmnc;lqJ} zTz=r;H-~9@Y~}^cB8L! 
z7!O}$NAqhiA8N@u;2#teA!*JJsLBnlCTiCTFj+a@>5(qd^y<^+WXjJA{^(OWnHE(%%^3Nh6!JZTc<9eIz$B_%Z~9s?g#)z-dZ7zB{yVrTc1BmuWn z6}T;B4rE@dVKTgVQ+Xv+t_3dwhUKKzS6O>vU<_aRl#@c z*rkV(eBG~f#3$5#`%=-ayt?>a9yq1^!=+0f?70EiA{%}chU%s906z7!i&9hH<@DN(M9iA*-|Q@iRw6l!cW?s_4@PTZ^;Cs$o<3PcNzIxYBP1kr zGD*|XG3SOv#=cgYR)=s#$2<_X8M_S!#W#&_zh3> zR|f|N<5ORreju--(==UJU0r=;eylyO7tA*2)2UeAvrm0bKVY2t zJw2xJe%mCJHel)sm-oPuL+NspacN)bbL@t9UHo7lD5s#v=%*OT=(qDOH_FySHEG{q z`<#`gEwUFwbj*iVtyjxe^yj-4@AWBeT-JC;aK6lm%K?W{(=!1dxJTOPFXXK3Bk)Hl z!7VA0Syj7>cV1Z3H+Y|Y_A@ZVScuE2!l!*Y9{+^{iZOl*v-B4|%;LwX)2%Jo<}9@K z0x>9_pJ1ZhL%@kTUc;x8MYGx094Gh>jdap*xV`jU4!B9pifUi7`I_D4;RAz$j_Pp{F*$*eGw9G5>-M$y9 z;|~wctHoy0#>at|KE;kxZ?*p27kfCZP<}Y}q5J;3D_q&)0`nGed6DuzZCWX2JeE({ z-D6mcBMu+~9B=E#yR?J(ZO`OM_1$`ap^+l4f@@C_fG5I`tj!KmN)c~Uu9(N zhO}<9Ou{TggK7Oc5y2~0jg8mcV-#p|{M^~e27Xj5)r3Mu(44%;dTui_GlDvd_Y)E( zw)BpV0qDVI?3}zZM9rGU*^ubFO^6 zqn>rQS6s-l>$vswz?r1_c#Vq}RZ%VF0LDyX($Wqs%r+Zjl@m&n(`Z(6;_wdWd8p{g zrI60pA)dsIofsP%>jTh8RpEYY-uh0x`AM%h13f*5-KT~ZFJJmYRyhNuEbz369A%rdUJW~9?W%pA3~wGFfSZ;Urqefg5kzW*Dc=yrA@xX!V)dg{PF5;D?dj<$s;__BWCg0U z&h68SGgej^<-GT-!-In~S_qSyxZbi`n;}SmGWl(%NM7*c0jc`N?i8=)w~-zciC|2^~gCN-FIO1ZBCK zHxFu)?`+*G59*oXw(|4a%V!!%ruRQuF+baL_!4v~&aNI3EF&Q>!P-hWYDB~4WHu_J~I+Uru8?tc|d)5{_x00?Z` zq5d+?*a?}Jwp59lHYjG2`xIkZ=aPNQWp0|ApGv7u*}(0Zlar$*rPgDHj`gCHeshJ@ zi^dP=ieFz{l2SLU3ViwUy%3Wl|+=xa0Pk4AZIPPt_f6k@rucxA-N`Ch~C+GP2 z^N*Tzp;!_cO|tlg4I3s}EegH#(weO9#hug8kiTSsm+s%cKUngrX@7seAiCtxJ3x!` zW^C-fMZfA3l7<`5$@RDPNQ*WSHWfQRYuT6smB6!R0^|t}S_!$;?88 z3Mnz#ntj%&Rf~|6nI%f3APu@H+S|WVAhvO7b@e@m)TpT4JR-?k^HRdc5@u#>)rVqz zKj?KAo#@`aP5Sq}fPfbTc0JYg5r;0sl2*eTOtZS)ZDkf$<7`4*U0>EMPNJu7G_!xAW7wjTMO!6dP956!V^J~6R*Co*KkVu?d~nSM(7 z^P{c2jvtmj(#i@}nLt)&4twxHoR%i11?|o!-P{*IMbpc0=*8&*8?ncXj)GwH9(y zb?q3KPPbYcmWQ@f&qgp+GgUpO^*c4xe0qw#Y4JezsHf=P1y1uTEYiU<>jy{A;ss$h z*{)!hrI|F3>imvHh0o5_8L}Mhp^H9yE`|QpDHLP!@}^U2YU!FgA9oku$$agxrbm~) z_@7SCFnymoc~8?hJzWpq-~#m$`>g~3prD(_j~^8@pBozub8&b5b-|@?+i+m1Z?j{i zCquy9U>S6FTa}G5s!%Cf6D*c5jLkNQIR;WQY}fDQpi&|A~{SP}LdCciIYhJua-fy?2k|h9`t6Lkw{xL6YYKq7` z0~LS^*33|5lJWKH-~o@-vTG_U{lM%lnxz6;)8S-?bV2?4P+25zWz0w)JC;o>Okr9XZ=k___q-?(whmWIcgYHD<7OwV6sgK{}d zbjgK_fBW_=7BHeg;?a{QVW;lx@<*wHl1`6)Gek~JORGY4iyD|IV>F;UaNxiY8GdLb zNO;u1QCMh(*RGYDwa$Rao%7$$$atF2NP43lKMu|Aa8@xRaVp$r#|3K0AnF!e-bYPJ zemLzR(T1ZC4+27o%R#V4!Yqp;5mH@$>82+SIO#&)gX^9+e}2!+)^~xXE4X=4U?RP# zrB{IJuGf;iy1FhWFR-w%bi+4(+TQo&KUskK#Wr9H8eclU2M-bgW&spDBO@8@UM=oB ze0UqqMB$i+t?ki|H%n)6LY|Ne3Rf2InXUk#$-#q2mhMPBFZ>%QuzNJSGd`eqXlRqV zy1HJ>z5DmqQFwyNtCh99zCx9go2%p028@T=xDB@Qvw-@v3u3>gss)k;cJd1f(!m_M z(pw&=HT{4^YS+N$M<0(Sl-*-IfqK9!VeMT1q3`-n}X!tBeM& zlm!I^+N|?4Grk5M%R6@+)IGj40k&>qfmq@4++G-{$FE)E(M|yR(UeVRv{W(Uqu*nJ(>zVpXg?;dJz(Koy{W?D^juR2I>N!a-^w0johj-1` zQ4*qvQc7&>y3aK=r~G&B*|QPSbXL%IHn#L;HdA0(0?e4QT!;Gof?0~{_1@CTxqIJ# zN1t9xJj9NdN>-lxtG*C4W~~K7QerCpfByVQoIa_2>h$T45O|bDED}i3`U~M+^Bpb^7OBG03l^2XvRM8`>-B_l zs@V^~w$7|6^~&Q17ngB^UYj)^PIhXAPaY;#4GWrLFuP zPA^ovyNxArcywSuXOu-RUiFbYP=n z*8aqK+0N3E9arTW)9}j&GW($SH7uJD)L%g0fyt6@(FeU>?w=|v>EcMrIVs^FRKyay6HK}{G`p|GY; zaq7VLOQ)*6UZ1G`GAb+GV|68O`iO`K_uFeVh7nw?gD zKk!gjjK|71M|dV?YHDglQaY$LtmEw8_IEQ8A|-VDjlQf$yNLF?LUVj^mT`%LA&g-W z00weDgXlKDL)@-HabFb6n6Ta~;D%f~?c`wo6As76g*9h@lDcsiz(O^+^<6i30wSN4?@mWjJe_QUq#~|$( zzU%4a@hq=lf&Vsf;=2iX0G2efb*h+tuh(A)pj(D6dm94GJqDFqge9n>qXQ#%tRte>8b&o<4^`6F|(2`}NWV^W2j#s;c#%0KQe{xHnQ%=875QUPhehV}%% zQI1xK=CS4WaFXD@c`(>szEdiKdhhp`pAB*8VwGOHttwjcM{j`5Qn1BEMdkNp$w@swMO!n^< zGNu{VMLGBye`YTrg&kcVa<}F=36EdBc=6)pHSK}& z_1m}G8wTB=@wYp4Hagw}rY6EW9U1%f?L)sCr!6zBDJ@+`*a`RDo-$&riL#O%TS&umfqe+C`zrRI&c>A6`e0o99 
z3tkNHI6&*-X0A|gyaSTY!jdLZ3v#IhCo`!Rt;1l(T`e*;At4NVR&#zBmn;;ao3xDD zr^l*OXbIiVX=!e(T_rjUh++W+4~DfEl*zR$j~$`(zIR)C*Q85`zesev9HW92!eN@F zS8*sp2YlxTBu%?~^P4v-QKKIXXsM|kFZeU1GBkP|3P@GW6S1uF0DpfD#E+hhQ9Thg z--o@e+RA$pjQmCg6`_6%M1+LMf%z*D3rR0XLltMdPj-1`yVF#kf8y*$qCA||5tlDL zCcxQrId}>jXZk$=Z~wRiptB!*)Y?;!uB$aSp`8%EF9|U?uNCipfx=dHZ^6P z<1o(Jn1}oNHCg*W`5sEb#j&;)YRm(kOP79Yg+i2=nD{JUw#6*nf3DsANch8to{&WL zXF!ESluq%X?iaAk6AB6&cI?>UhjRqIF8=n79CI$r&w8~&n3KbMsTfb zWaQ5$A8Oyp~GJ{x7QDvQd>PCe93yf!yCciF;1K>1+CqNgqrdd87q%;zs$@X5-` z+Mh8r)D=j(ef#zwvv53AzFAv4I52W=OMZ&QfZ*lHp`*tBjhE$Af0xPsg#$ZC8$lE1 z?%xGbhz$u3Ur+Iv(BU@2AcK~*GWkUM*blH{E`xRGiJCQgiyY8 z>CB{lqrmj^^z+oz`^j!7N^kxw(j-SGoQ)ygLDA4w2v;OMEWLH0zaOfvg!byYH*dV1 zXNODzHTCshP3{~yS0)UGap6L%sm-UVDoVryvQ}Dt{Lo&q`w$w+;D}UL-~z>;`L~pg zW^U`A-4|wklb<7v^)}M*^}zoAo+)47bqGWp52uDB`e$KUua|)JfU8d=YY{|H)X|A( z(ghaJJAClfSdW&3GKs{4{7r*-D*z@uHaMu!_Z~bLY@{|cGRi8i`uv%zxVX5%>e`p5 zmqyF{0fM{jtQN4xGs~TRPjLMD@s`&v6;vn;rx>7ffh!*znSqY(c)I}OA-$7RQ&Uj` z4?{!c5e-so6hIchtoao;uD^B<4ANi#qM?;h#A!mq!mwez1@=H=8P^3k;pW@F|FXVIslKxECIWGd z%Mda#PU$080g! z{138RpZWQXi;qv~;bKqGiT4jJ&mDkYF7GF_fffD1d+LagP@UiQojbYrZ>z8}hN;9v zP|bB^rZA_%nEXu^XMTAPYqj%wFIF}-YO6(KSBmt|NKiQIi~|IjmcCe?nd=t^(~gGV)7jw^HtZ$p(T>X5!0amN}y>k zM?c;4$DJ2FLUf_N#Sk+6FJbPR%$CcNSKCl*WAi@Qd_{~E>aSfp(UmPn# z3bLr5GZHWcF$ca@FQPC6t118N%-X}u_uNfm`}F<0%)@EE#Uk?Z{yz%);dNZqD6u~W zBZPYgEyweRUkfK6Q@tb5A05T3LPcri-K9ubcm>O70&h6nxe0XuN?>WM@3J3`3zFY6 zos>AzOA_f?&`LQ}F-5~I<;ZY`OKJWS42l67PCnvxCZf$DX zMUIYksxSd~{{G{KiS)qeXpK}5z)YrTW5B$1Zd0t%_SbLT^yHI2e2{{bAq@Q*IzVxe z=W2W=#Cs+WDRFU6SbgK-hYugVY;K-5sf+sWSb<7I>EPfHSb>dQ)Y0+O;UI#2f?{HW z`OKj)8@4hLT+dOU8T2>Npisu=J_U}WxV-$%yfySMPt?U@tF7O^?}k6enrY21?7n?LK|u`}d3kw? zkXsXHPt~>LSaA~f#g5I=!GV`Z>D7~EmS;2#&CGy@;F7>8Nni7rOPiY`C4;iEL>4+% zR}w@L030JBifB|g!}?Hyg7a5(fnU{N#Ux^SvS4+^<(uh+ix)qexhM#4-79+k&s?ho zJQDv1-hHu|nGgR!S%~#mU3pP>oyaV@$dDWPKH}}#3z>U1o>NyZ9vRjKhVhnJcs@Ju z#KK!IZ3B@)pZeq>hKxl-M1)@N2$BizOQT{Go=m-4_6V)>_V&&!=R0t~2X$JrjFRGe zmKmc@^6R&7y$Lj;oiF>l%ZLAZ)KEZIR~OUp^hhTZS|0?pn#55`iD(7Y@tS25)JH*3 z+B7Q&yI1B$erQ<%GZ1Nk#&@HDhj*Z$Ca)~lN{xHQ#2A<>`y2lRBKQ6I^9JH>vS-NR z^c!edKT9Wkky+Y;pn2kAKN7iw%cs}dmglq`dF7<0&RvJ}iUM{XNUkArX2H?XXR=sy z^PNQq#9>AT0VkApB17pg+OiW`i)OXyN(?&mwd)77&0BR6YW?2M^;K<;LRviR`#xIQ zr<$LOsQ9Rd*Rr|3-!Dr{9RE~o+Ql(!_OFc~zX+R}@DpGv4E*lgN5kPC$RrZ5$n*f+ zR_MdqPr^W@g6@ADDV}{O?;|a(Xb|q3_|2N&4swey?r4#Tsg&mRJt=o` z!#)G;2Vqk@5e`{QlD7NPKj_(wd=EK)_=_!Ew{va~S}TsmK)vtj?L86w008h2QWo{( zTkW~0P$Lp;K!*1r%E(bqMpTc4Y+sU%wXH1|j@{@namY$A-x6(1>tZ+lK)4kE}BWaW#be zf!R=I^K9N^W^%E4{5cXvu8ooXGEA}h!`R5^1cIiOTnXS;+aacQNJs#*5W#0961EYM zN_HS2hQR}YPSyG`R%g0g#oHv@GcH-#7$p)VC1vUN?@}oD2X`8Op5^)n|*IQ`{O4vx!7RA-%R!eDI=1 z)7!zx&3)>tzz<#?9-+zs+4=EKaA7E+4@Z--Q{MEcjqA;N8;rE3X#1i7-CiRH~ z9Pa6E>(i>z{t-!ho%V56f4P~bvm^U1Xo&?fON5%-ZDh1=GpVu~)Q@b4 zU-=#rEc0zzpmm=j5%XsfyT!Z~^T$2hZ?723-s!${CPi+lW22d~BNrhfITL0l@wGp{XK>c1xA?QZ#}8kFw-N9C{b%|MpugPE(H(L@)Q@s( zBLDXp+SuCqt|i(wgdTTv6tAZZImPCPm?A4HD;+cQLHiqAM#vgaLevu!D(Q@k<*0~` zV0IX@+eqLa`|pKzUtpIY;&d6yN+ZGBcE7mzZoTYVHMY@4)`!b#Yqy~YUqQ6}U*%9AI1 z3F}b&_g5Q8Zg-I%*nwtQ`X?)@Bk@OB+4=|pgKhYVs|!Mk`*4O{;65ji9>uWMqV4Ni z>jdsh(!t>a74xK95>rC&)H zASppy2%`_D!~`lw#J(9!VDU~ZjTdf5RIbY7AJ&8xVnU)1AvHv+PBFF*!ax=`ynekx zsuN`7ngpRs=S@S&;u5pyFpj9tFsg7I?Cb89Lmd4hS4>@<2K?V$9TKyUZt)36s`#+e z0op?yNVva!7zI$Z{sf*ko;SU1oD4>w`gbIVNN2hJw(O8dyYwLf2}QgSfqPq2K?jn{ z{1`W2uBvswg9p55Q$5l3a2g)_irJoLv5Z?JTQ|E2`IZB8`Xaf+B$;2k(ac;lahQp82VM zZ7Fl;r*RkN`tbE!!op!VqY$Mqo)BW0>)Vol^1?VLkI8*~RJvqp;% z6y_2Yjc_7mTXgb3kQyv?E{fMks>D@5tFX4-hYn`=SQFChk4%&8+9V7tA;B2-`Lk-g z#Liv2LSfmcH=Dq*{k&}=rmDJ{6Kf%EOl9+~gY3v?Tz#yGMBpz>0@_6vEYDp)D(3T1 
z!Zz+18zWsJV(UVk*Z%dqhscQIl^kW0U)dWw`9qYOmshyvbeu*AR-NXcv4zE#KbLcH zv4~KcBr*CI0#Oaeuybb!V9WhQx4lOU*G-$(3*$Nv_Yi#1|32`7v&a1~>3Cd?wW|^v z%M}_DQati8>RdO?w`Iniu<4&+(x_SBo@o_Y@h;hIx`-PbOGlQSMDm}m0^z@Zg=ZB5 zU>GY@5D}=bx@>CtNt}}+TzNf$2wKxjyLbQW6T5ZEYq+AKUr{5Owc;x-|4- zb&svZ6%`3B>1H7RC!k_!GoL2vcNfkq*kdhrLv#c?}2J8RV3qpOo2JQt3vJNg|jthtv3>}Cy# z>V1fWY9s{GaNylMaA38@_d>3@%jV9GYpmT98y6P>EAw71Es@T`RE}BAFZgzOhkM!! zJqnw|5pnFdV@$$8%@jrj)rYH)(nshixCr|oz zks&o#=gnn>L`L%Fm}8FhYN+PTo2gx7FoIGpUN>Ib53MQZ1Omn3Ju&?0$afr&BqA7O zRPv#<@6O9%Zd6)lF`$!>_s4y>bt&CqG(;*jTTp&DetcL!-frWXC-V>HHx8v{TmS5t zCqcaMTARKos;1U-w2Zr;YtbLv%TB*UD_;VQi^G?TeP9o*G))ifa2kA&Y-Z{cG<^RV z6hVm6hkfOB2izOyYAZqCV1~bML6AXl^IxxJC&v{)F*>XU} zY5m&z`t69Dm$P%8tcEQOg&Pxsh$;mZ?)^I}+{2Q9g8B8$Rh7Vs4cnM$08O%Ls~Q`3 zqO*ULB4J4sAt(TW!T9FQEktNHpA6@M!1@>i#3KEg@8W>GrFqK^Fyh_Dv8#7z*=k)L zfd=>#78aHp$uGHLwzasVq-UWMoA)bja0kI{*FoIMi#$O?>AB6Fg@xmXgx%1N$=O+M zisK?qSS7fpO3|YGd@&p59rfsudZ7am{#%nJFBZ5yvK?*;lnlCim-@#Chx8^g$lZy5 z4OOS%wCv1s3F|(Z$*HLqL1?w7tp`5uQNLi`Z2zAu0Cpop{XLkEup^YWDhv-{H)86# zF5xWOkh2)fP9k1lLRw5kOo?D#&e6ic!kke4nwbtjAH;OHdIgiXCA*TUYAORSa4VRz zZ?1vpjjx`r^X(3)Dz}3>dT5Gm$`fxb_CAp90?F2zwjb|cfuK?<&Utc}nE62LhiR}^ zd>bSSY7AZK#S}VD?qfdY{BTGFJ`^p2$*<-Xh;HV_?b|En!{&eoX{XA# zEz&HO>R{BB@3mF0Lb4N54_7+!obC(%16gFlNNe_k|sB zdvdEpXf6H07jy9OC6$XaSI8mJifybnJdIND8regpUWz{s<$s8@16*&doy3M%qrRX2 zDrsvRcx}ZJKQ*jXWE3DU45qqXK6XI!eSPzXQ28GJLG(RAWKffxpu~3}ScRkW^Y`~6 z#D$t+Xk4&?lI9kGBjf?)wNt@~d=e6P8}qWd!5`gR%1|!6Aei*tFMW$yxFNG`#1uM4 zR=+M@G&Jlvkbc@x9fD9drbJ-4Ff(rfz={pXa$k1FCI zeN!K{p{J)$ca3ITzhQ&Mpa6AMRaJvS_M5D%xPonGJO2RAPoNlU)<|KNmI#fc;80|y z5hJ*K>lR5QT@5oxjbpr-&)ZkoZ(nwsUBooK5~8L~?PZ22(W&(er(j`NSz123^#qfB zFb+4@lcApYMMtyPk>DyBBa*IXw3S8j;cg}-jfywf*}Om%s;z_l{iodK=_%MDZK+Pv z*RQV3%Nm-RDz}S5A+Vn4D(1HW#q-P0KW4{_2tpm={b>^U3jj-MjeOHHHnbN6%4D6mqS-+-&-3O_XgKn+ z&pdtl^s|k8?nm1$tKSn>^r>kC>@yJuj!5#CS8S#5Bx;S43?@^(hoKOp(jUN}NHb}s zjOCP-)se6{RIAO&*%=u}!dy#daQ8O9n!lH=sh!g3y;KA*>Zr9dD$=?^SS(Z0Btyd& zU28M*fnJ)F>nWy zV#+|YoI9}tir?p!>xPE=_9whEZxyQE_-og+nKa~!@vD^{RQoU(p+cOpNvm(uQhz_P zlFy`XH~HTAnw#-i>7e{l-n)1HS+$Kk36YW85AZ%8@2{>l8J^CqRl3#c)pD?cUv${b zE2>rHh`zxMm+;qx>w>QrKHMvD^NALJ__O{Sl~ykPw)cJ&y%c4+-78KnoV53KN1|xv zR%zdo6)oYm5JdeJvt{OaC7Le8RAqE5!ii&wr+Uz0t3P7==SqG^)N9#{HGb{Ad_m%M zNAB%eg@aXd_o>lx5j$M=CuSjLM(V}T)p)F-=+S^!Oy-dBd{erH6$*C;}&8jqR_faF5wnVBB$Vg{hk<(xIG*G zm!wbclm6Loh&b0Jhf5#-P_Mgfzl+;e=zYH;4SiaNwb_$`?Qc8ejr`xg4RP`45ho!c ztu`K6mDWMA7#_yFl+8@L%OL(s1(jiFZvH_7gP>NGSR;A&F|rQyk@En0rm-;`cuctJG#(9sc=~j05<<^?sK!hgkYyCHq8uv%0t2tsiOSOu z+zACCk8nA!M1<)gKfe~y{b2q?%z_6rhbiM$XF-h=38hs@PR>h#45yVCIOxa#PCTW> z-#9JKA0EIgWq;p+j7hgJ{5p^f1im(jRH3Y@+TaIEm4sY_W5uImF81N(nuyCE5f<)c zA#w|bwk&C|b)5!#M@Cc$pR(_zP6|5imT|H^CZqUUc`_jOg+k<;sO1wB3okcT5#+S~zyJY>4vx}3{FRlbkJPeV6#a2G)V$(Ocbs@LZSbqM!HMbpUdZ1?e z2QJn2e3wXI7R2v|Y`LcqU{;&=diLzux$>U3=7u6)J-(q)I`;d{Y(7o%wD529r^zV$ zLs)EU|0?+p3G|cTyp&f78=xDoPiy+!jc@N)oXEk$7%WAAet%;R_&UH^B#|b8~V$3kzk@S&@X>I5|0q4>oPw zgeM++5$Bx5BL0XoN*WLm6T6EEGND?r0#rEfe|_-AS}qF%{PRR8=_1Tt-V1!N$D!_;^9Ye#-Y$)k6y)8ot)F zs9Dn%_`^D*)2=~JIvjhq3JTR#J%64m?KHKH5V1wl!6yF>S|ZlD0otcl1{_{0Fu8`s zi@-HxS&yguBnaEjzzdNpbQsrPi*1Mia{Qgfq-$af0gDX7a|+bj-+Q77Vo4D9Gc5yy zkSYMTK|5JlUy0O0!%@P@Ni;(Ih+JA$mT_>qg#F0w*>-2~AWqGARSu$|+b~q)Iw}Fv z+2y;rx>jP{L=9kCUIn3{<8J2y!EM8!ZXsrP5Rlq2XZu+gfPeaNGx(G z4dd;n+1z4h4?96HHwGteup4T7FN`Pfkk=hjP<{=#M@&ez3m|2q8Tg^FaXmIzA6V*( zS0T6{Ki=CUG?)yx;WDmGZ4UPh533+u_fn(@A+E%1D~jV#5bfIZe(l_`BN!9M8qJl( z#a;vo(lc5no+E^3jpPm>8NdS!_Lp+QYgMa&2A?=La2QqRWx@15`0L0`X^%zl@t|2C zTOovL;UTf<9}W0oexC;d1;4$UKTW54;)I+0Y$kh7^$ZSHNF^ZX6GA+5D5)OH#}8TH 
zak96Mj}nqHVdPRo0!z%O7JKf@09dM8=K@c!iSQ7R`tUfTYoTGfXqvbz-R5dl8`Y2{G<$Cu0;>>{P<<>F_4Q%Qn8e~tK#pTgk8FBz>RwbM8HEgDW!~P+ z&z5iA4hsu=rc105Htz~}p z%^xLPyctlDif38HEuS^~YEHz;_HNnm?0u4yRgZ6%WMkRcb>C884XHnLPa5}aN{nwc z8xF(G-1MscQ}_W9Lp)t(bHK}#RQ=v#{@RyZFCn~eIK{P;Cws2y5L?DCm!(pT=9YWkiwjD3 zH8Jwt)^*b!5Ar?o{O9-d?VdUsWy8i_~R4r z121p=``b`XiILS|hScn_#<=D5n_nDX|4H5{pUB_#gWb1}W`6a5u!p>2wx!!Cnkstt z{VyO1l!oH9M3~|9*k6G1Kfh^DQCTtoQV8yS;`x8Rhn4oH@PEIjAx?~vFc@39Cp~d6 z^eU>+J07!6ALW*l_y?XZNH>x#q&-IEhtmlQ0!t;c{Bvh$z5rP>N2CF-&Yn~pv&^wOV z{$h-*J5}Hz4xF|_4gT}iiiL!myCysk!^lWPHdhaGY{kQVcEdFAx!G3*$$m{VkT5Xl z}}RhC!o@I)CE- zJf;XBXKga^9l~*87@1d&7R^FXpB}lmp}BlWbg}qPnGCgoknV@X;rz1WDR2pEC6y=- zD5?1#i$7mJ@KcTpMb--hp99Y?i0ycAL02~zAEkuU0yh^|u%{=*kU5brAoxR*E*{~~ zfJd=RA@?wplT}y!mfyG^?AIfbq4?c;+)jLNr!hKJ)x8jxWhiCgWn zBb2jXTq@HVKIj{3nFPH&$0K|K0*qPZ0zEWWR<9Nyu2BpxC23$5CNcb6?WI5PYm*Ed zV0kvmIG|bO;K>8pWQ=;rqc|rSoJ+)HW)bq~Ndu$Ez8;0cV5QLW9w`E%A)0nzwa8(d zYxnhamZL2qglN&2hnd`f#aJmZF-GF~JpT_}?*WeW-~JC@MbRW0(pNN4$_kAuBa}+^ z$V^5>q_WChMY0;UGP7qkl~F_?dxfmB8ze;GdA)t_`~Lrr<9VLrxPQOnzGZx_>w1s# ze63R?)5BvGahu`iI*T2S{CmAa*=s7{w+O*x-!gPe$Qm@=EyrSC7$Y9&q zz0p7ZpY;KN=%XW@ejx=W1gIXtNK(2sTUi|Yw~jhf1n6Xu-C>w*>)YvtC81wEab7F8 zs-4)Bwe4WXIrrkqp2sXk;0h`ge@(5)Ps*xfVLm5(pNmV@u*FN+=>CN}GH+L|bdTK} zfA*eOsl?B>Q3rgwkJo*z+c-QeIyLcJXsTxx>bjnymqNGKxO33sKT@}TS?=jYInxh3 z>PqLWzrQD+svY4}!e(D-p*g;Go8gd&eC|gpm1;&6{Z^v%^4h1 zG$0$fNQ4fA{WyDmEpXzy=swQBh~S{ViEiDbD?j~fB!HZ-NHHeb6^4d}(I8f`=h5`6 zKnTOB=`h6dL13%Or3eKQ9RD0Ix%W>Z>Iz-6#_}(j`Knc18~CpIu$cX?GiIKg@j@v7 z5{9?0l5~ZO;2)%Fq*^(tscoJTa$_dZbF{>bF3mbm&B5PX7G8lW0NvmyU^;^xa z_v&554@e*N(MAQIhhPDvuD8xOPQcaPkr8l40rWg}`x@{&O?bS05@Ta`I}S(OE);Y6 z^Q{2%P;5Nn$g0wt8wwwmFOS5{JT#i7*a{{#25w#|M{VWTAFpXC8Te-oiu-B}Be ziVj{0c!|)10+O|1qj$vSTW65YIKo-9%?YQslWOgyV+^N?Orvo%b)o}_9__-{FZ*!N z!y%;2%d4oUc%6+>hJ{?SJvR1?y!@7kA1L5VufkERm>c|*Eo3uX7U$G>*SNdlzzTZX z4AAk^@E@cJ)msa*`t8FH!#rYY7k<f#xEp#XzBTs8i<(amg9889rIopn@>#&EXXZ6ro9$V+b zeQ#N9C(HLF1Ab&4d|z29vsFTc+y{+w;waOEA|U*397fyhlqU*ORyOy%tZcPV1k`X| z;u;9j%qTG8*s2vZS>{4_ICdtWNm^a9+VMbt$9oUW2bPsw(^<^pk~|UHT;|TF=g~%Y z+cV`n5d5N>Ja=B(?|_J+>8a`&6`p;)*Z!RSs3#jAKV-x`UiMjCAtA+@F;1%5ky-u0 zw()ylVZXfS;GoZonVC6xb#niQ2Yg8`hx9HjRo-b}54y_kNyZyw-3k!CBDLF*SWbO-GbpI;+=KE&dM27V;|| z{@lTkl2|S1Sr@?(=-S!E9RD^^SmK3}&XQWtJPm%Ca?nP1HKmd@`(x}MZ{xh>OdE5u zr6+P9uXy}iUJypbU>_~((`UhH|Frp}CPv#*pcWsx3MV3a-a?c4pP0o8E! z3rXoNDY6T0Uwdu6#LCo1InH%+u9y=r=@)lhBc*_`6&EcV;0%q8B2EYR&_iU}g4blkb>9NBo5+Lu& zON&H#?*kEKZ6u7Ag1dLeK2?Bp86tDRacE8S!EIXGvmU<2f*b9S*~iXKL*SstB2Xnn z1B=b^M=qg17T3>zYEA(cF6!huefqT40xr*3)B-u|l8n#M{bbueeoUl}FU{esc+{$j z_Ed?H_KG5e{^XEJ<;D+Xy!bUS@vv19bjj@tIFn@0E2=aBlKlR{@z=atKAmckwC6-UhAFKNbF|1%^KMOIW{*f-mDG4q6lX@RP9uJTQ1PH`jej4KqXw zxaGikL^$upEwyD4K?p2JtRnjSvZbG|Ci5!~~8z=xj_{ZmvvRA7bp-wElDV(5&RXMXh72mU~# z`36iaX=SbGK*5}N*7!<|ZqsWJWuP3jViM8*yn$|p{5O=LmweGx0QAUs>T)gu(Ib#R4{Y^D^3i~5tao;f|{|@rUVKAGA=jQ11Tkn zAl+Mvkau~aP)wMb1qMQ7E~!KWg81sB7KYt0BdzH!and<2L7||fs*u!50RdI zc;;+}$B*$oN}_U8ErI!*4o{KoFi>>;1vIoB9Ubq!pXDLfGRhH(lAR;Wzn;6KJx5xI z)?+=3fa}?+uC5(OXVs1&0h!Xcq*Y5m*sOp!sWEv9qLy+zQkXSR6S`iR}e<&ON_b&}JSLSU=iTwtxLFA`^^PU+Ft>~y@&@*o_MdHAh zFqu=cuY%J5ptR*_nFB+Dbjg%A3sLPqP#8c^2Rs`(v1=XA`{VzFEaX8;49!~>r(e=T zB!HIu(wjd-E9nZqvnwno3K68V1dXmfI=PK@1$0Rp(C7q-TTU zwb#({e^Bcsv8D*W`e;Ous3_<8SN%ezepQ%8_KjWuy;)*7`% zA?P?&r`sb9E2!2Zs2h?CU6#aA+1*Jl`PALrhYmYUX&i!J_uO3JgsIzzHR&51j8`27 z2sVLbeP~({94E5Np05?-y;$-Ou*C!2OLLtr0AJ$>|@c64x>Rh6_=;U9U`xdl*#2$kgga6|pG#L^r?d#&J&;_2Y! 
zPeC8|c$f`HW>3VuJQB$V9vdAM#h#OhmTUZ}Bo9)WPD4+SW@K!9j{q1V`RIMtuUls{ zqn2|;zB~J}Pg*2OpSw_;UYgm=$mj}u4qt+SN$}{=(2uHcdrd$ff#b>>5}|*fIkmb5 zgtri)g9vSGu`0YTf@T{aT}IbfGfL#axM|;$;3c-d$mg-K*M?|69KFCOy}K*a7jFQC zTT+1_ZvzJs{81HOnn1@#6+>_3r$tA(2_}YWtWOCG>LP zm?bDE&-~h2__E!QkyD$6KnX^`C^e%|y@ zD2r^4;&>|hqcTJ<5GDii}}Rde|FZ!}QPv1_^`(N#}CS|9MX8JMczzI|R@s`yYR zVNS}nee8`oHN(yLBLq_Z#8KR&{qh#CMwMPZJdy_2uYVVc0O$t6m{5Vjrx!LB01vez z5&Bp!;%E1fw79(FUed>kEwq8QyxK{Cr1)NE!-w@06k%{3P9Z^1YC5TSl;0@1&nS!E z^=1*X7rxDlK$?C8TCX=RRsp%?3Zkxia(c9x4rF}PDE0?C3dzcJRjm1LG`K5-$~@C( z9sv2Y;?B56IFR7r&^v3Cob`Uz%N{ln9%8oZ*tD#aaP z;`Lawr`TVjZLenP0#tU=!QNRQuF>S=I7E=r!5z>^pK`EA&3$>2oDhwXU*>O_0I-M^ zTL5fH-YjT(8nZU+6xBSpoLiFz^dfTfFAlp{8@*Tbqt_7T=_-Q!@IOv&FY_T<6=Q*n>10hW!%CyXoDPgj)2*wpAaIWW2km{PoCA z9QEpJO!=pMI!h*T0|>0%{KxMeld_H|?W8Dvm&eb{i!C#Lbgc|AS6cLXO&|QQ!#7m< z&`GYLJ9R%|gXEc2tpx@#ihcM6Kjnw{|L^a*$7H-=7rn@{3ZLYZyPe{08q~lW?gBfL zTvQmm7b@>?&@V_0ry9ik9Vd^&!eCvJLWj~Z<|Hp?&*G^hy5zC5Df8!#%lc?sb&jiS znC1;p2n&vJN-lP$w>dGL{v^fPY5#cH9-u-*&{TX8LX;Ssf!dk4jcDcMet%u-P62q< zYR@YyyqToweyX9c_|WvDKSW`W7=0M{(P`ZV$NPN2ZC`T>`Q03>wD>;2=(J3Cp4T)p@r?gV{= zgK1}y@@A|(vpzi!4aW2QeCL@`U{kODe1F+Z#dYqsf@ZZz@nNKKAiH3o4R>Tj+Y+>`yQ1CiE?J>3e26Wq!Pye>vjCg5DwA8iy6RDVZc=f9)wgp3{axOpuQ+tOIh@fDa|CYOQ%Sj4+s|k@Hjm#1iO~fMYN?NZKOhkmh9V1Ju-hR?x~b z0uIPchmuCvq#^pvG}2A=oqO@~>V@BN#NnNh=0a}vnCpWF59(|nE4jJ&*NzKka4OLIO{m(ZovtZl!DsTG9(%|$s0^Rn{r3s@3JaAE+_xxw&K_+}cg z)k;5^^0?M~a~(j!4Mj-*bRY?jajFmjTK3upyLpk|aV|F3kl8Rui#_3Y7aaXPKktr@ zcj^z5x(}>D1kjtB;ZHrsNtBha^6qcWmk$?Cp6F^qk%u(*D@_1UJt6D7^1$;XKMliw z%3_X`!`b4W|5p~Tpy*#Qp)qwGKDWo|gBnK=xAe$JO+bsbO0=+q?5X>9DzV(jN>j(^ zqD2*DKNJYg9;n>5$G+E)I*^vN8Zm>}_A0wUF|S6l^N?r+HqXql zK0HyF+5483%}6olPi5ba1XWL9bb?j+2IZ^wA(12ze`YP+#((td0!m9=Jj?^sNO&%4 zrPp7R@uqLdmsfcd6~#crQdSPfU`|=vm4HOdj2CGmbJ^g~m-Ft$Jnr3-#=<_vcJeNo zibD<ceo7S8HKTBkltF1)_B9z{FAre`2JOYISk~3`H_wT!ms~3K~2roj%Fjc$ivJb)y z@fm4sl>h?BvT4)nZ^>W?(YYtZBRkL)E2_T@Sx+tD$(;D>?U@(r(}fUrCU^F1zI&y-*{`~&)? 
zpbgwf&4<0WV0ypK2L8q}M^@Z+g5rRP%k*i82MrAj`h_9@SAqsUQm|Y~;~3D6gepyd z6n*1w!@BUvTHqCTQ;~9iu-&+)#AyeFR)u*uKQiK{@b;qw zSaltpKKGXIL-s*syoE*H_Qni%ggT2_f&ItiwVf*Kw?tqi@ zU>0ch`!ggMOH1_I$DGsFB_Q9D;s*x?cFWdORh>R(4EJ5^)aLEmljAwha|;Wroddkv zvne7Gi^iCW13gv@x4mSIDz2-O4m`ZPP6<{R-qIL}l#e`75tD`!f&PAee!+Fbz$Qya znLs0NoaS8qFe-|l+6!tZ+t%CPf9p5mJlEF7_K6)=;Tt)hszKRucvmO~eXLkB^WtdZOzFPJ@D4|GJI6I?oLkz<`C^_5@%+anQ|YUdXYCfA zRUGzTy^}}kDmQ~9;}&UDY<59^cYMB+b2m3>?r;1&*O$BGpUwR|ItO-n4J$i19S|$g zm`tqjQHUSbF;6M3>s0dNI+B>%Bu`-y}7c7HzQWyUF@1#I2 z+OgG?-;XYDScl-33^9BreKEU}vN-5JyVYrwOU)hGY4R}R;=bOyPnUlxJxgi1Ts6(S z5nq*P?eZ##H*#2cfaB3mdB%Zgw|m~v4#8{Q*ea1H6aRnz;*rS9q$;&Ka*#ZcY;8f{ zaOVFW>hqQ)tsJgdapFt>kaK4T1vSgW#6({YnbCv*ja)q|8=F-?-LI#kC^Qp!htfbP zU0dWj8{!5rF{Q!&LwPw3GH;6VOk0v-rb_|gL11h#eiVjn{P$Vsv`);m>i$Do1*$@& zHz=#nCr_+?43CcPzw=}%lXqx$9Jc?V!be*H**!)@i7Ph}89ejWsr*VzsnNVRnGQgcec_a=!qBTKv6B1Ny58X7uraj4kV z+{n!AK*|t=$*0)3ozPn(S)KZtw_zO~CpLrGCQlyCGxAj`(>aH9FAoJd&%W{oy5WkF z!fPEOM^xbh5UPTa!5{o2vx`0pah*|*9~W}?IWtoUqOxY} z?0p1TuzaYk4V@xb1Uk@cd(-BG4OSOcQdXmm89ObDDjb!7f9)g=t#vqpFDP&$qZ)ep zvZs5XGR33Bre)!>Ty55x^0qq?A7dYCwoB8$5Oo`alFW+0M>o+mQu9#IgiV2a%ZKzj zu?`r@bLcc{wPB8{0J#*{jJvfGVwpqZxV8IH{94~3yH3f#dp!siZZmt>qi;Suf7-#-@f_y*BtI|zO1WkKroI*Ja zw{#*oHtrB4g9_NJia0|PgPUxT9x^{SCs=jm>ec(uq&#YE1`r4HVP|b5@@p-iHK*o5 z#_9vjRIN4yBo!bxJ{mGJjvmj`K!LagrsMKot#PpVVK$G(Ek%ghJV?5k-A~Lofi5TZ z%`~8kdmtWVZ2;||NtZ;}fOW3Lnwp_sBLm-|6WE2g#^o=gdc6aS z>eoo@nYU`a?i^*fkxn_Sy}S<}EIr_hRO{m)DB)Re$TQQ37)Q>m0SpFmc0XaAP zb$}m`5DAkLvIF+C1#orE_pA6p=;m)FnE#m`r-MpAsfJC4IQmE#wGkO6TI-*sPBLlsN7hp@q@YN^KSMvM~HUyV7 zffXn?3Z>dVxZN~H^JRB&ajnL^Um3^^UhOv$0)Tvuwd~h?Wor>1iMJ5{_69fnf$l4y zy%9OwEZeKfNZV23^fKwCv1W|h|B&Ep>3ZA$g2KCw3s3xu78BapcbMz*k`dTHS1M)wR=az%ATx{^9B4{I~$1uTr49B*p$u}y2zM@WK3{y6llvmBxOZVH8OPVZq-{} zrlPF5hm|+RE5n%uz5h|nl~awx(p?9qL(r#Yi2EWxox2#8d7qj4ye8EqkB=&oUoz*%k}#_({7HQKct0h8O~I3-U>678i- zH02{7&-^Bt@fHELh4j>yAKbtQpqs;WYzeJ-~HWW3z5cJ~$z~I*A5riM3^5lkpphG(O;dfQ-Id(A(7MKA( zheyQR=q({VBlz3K4i%U~Y#Q~>la|~F;G1a~MK|t3EnFoFFX5O(wXZhsmKrq$NKhO$o z2;sY6>gu(j29-G3%9rZ`fC6KEE1g$FcXtXc#1%!n;=JLsRX56(m5> zL`32l?}2-&wb{v+15_jDsM)y4uIlO;6PDhQU7MMDu zb+{NO0Z?=xUoGR~H>9J-sEz~IKZ!9WLsM{uWN0cWap+u=5t^_?MtWgFs&GPp2`IG@ zT&8R~FrG}J(yX_R6t#0-F6z;xRe)e(6`;B13p@GT7dvM}4@3mlVcQoaBp=WLEKHEr z<|d+~EFc}*#_fkrMkO=91;SjEFw3h0gSjt2TKfd6r9~%#2)7{z;5AInEw0-e_~9SX zRW$)|KGs4Q{r3Y`@esA_UZSPH|NQxk!as5pC&^9uZ`%2|cLJa6{s}#%6ck$5(;=>d zqS!9M>d%b%dV2c4HpIi0<6YAYc06~Mz`{mCf`YH13As8-JWQI;WHww+$71it#<_Ig z94mKZVBpO+D6JU-9zTv8g^Wob@~LZK=X|Zrx^u z5MBr`3Qb#ic+;_$pCNv;;5@+KOFs1`k(0Q*s1o!>M&a2rr-+{i4Q>4*KXOCC8d!}Z;PM2r%V(V{Wr3u?q-^nH8#_A#+TGyS zn@+$Z(cBWvZ;9}8M1VTXxL|lrQvUwq$6xApp+%!1k_l~cp@Hf&UV?$iFYfQ+-XA@qQu@y6RnU1w+FhKU8T`k3 zdremF^c1JBEK9Za47|8txPC6dzKK8Z#)8xw7yTzCCuU1SVH$>5%cLxUoxc6dswv#~ zmo|KZdxM|gFU7l)QZx)(B>g-l_1A9`Q&XHjWj=S_UQ~AW^u8S;?GOIXcg!B#!2l5f zempkc@xa0WzK`pdPt*7x_2~b!2JYXr4)65c)r()^@iQG|dqryWkdIZ$4Jn0x>v}vu zOmpmip03xHG1EP~UgQ7cpG|8Dnm)(e5cSfK9Gza^`R{8GiTa}A<`p}o*808M&Y3lh$8#p}!ywrqr8@i%VBx zj*osNwk1~ffk7Nq9Yt2g-=lIb=3m z2VVw8sp@d=-AfDO!1c2ABO#C$|AUz-w+VFPjeb4wE$R8616=7Q4WZzH1LI)|a3^7! 
zg@+C|HY{X^H+U71H`P&SVRStmOVtQtQiLK9_>NL}Q-K0*r%8gTe$kO7G3#tkpt~cO zr#2P47-94J*V92-7-?&VAIu_47Ru@RZ=*T&;^i=P8B?)1;fb@q9sxm2KQ5X*=dP(( zqCd^e$@%Qhk1#ZH9spb-Gp;Pz3*e!^@!Zx#b(lHWmZW?~2c}A?{NKJ;`rN#1F@XS3 z@&2ps9DOJ^*PDlhVK-bOCAjjz3okaK&8``Rc&Q)5#NI#U#$bq7R>S@)c5ki1ZV9D} z=HFqP5Es{niFIL9VrZz=@_)z8`c^#zuii4@j0z-Tq5MA~zrr1L)#mGEZGJ0Lq7bArIE)JLD;e|Gw(6Groyyk%Ug9C zV1B~6msc(==o9sO?XWJ6Q~^u}OKgLHsR1!wMXT{R_fUamzvbd!ZJ+gkuakrRO-`4$ z-b{N{RaL{SA_P$0LB*c%8(ha~fUQ)Uwd>aH;^g!=3nSe>DH@p>!)Gq~VVx#?=BHoT zlBUHhCuiNO918pmZIK)mmo-sBs?u$Oi}Y-sP1xhdD{;iiO#)1GL*{`35j)|J7&T=* ze@?;uFLpX*J@_zWz}(xf2tK}VWMtn}%vsXrxYd_eE|VA+zR;%3DkP{qvwgh-AG2!inc3FfUb5KbX9PCrL0z3;(*cHSf)|%5)e$AxL4%bAqK)ZJ6TFPX=K&Dib(^1X&}%}>Ynj6lXaTwLub)2+XUH_DgP7qVqNwud z=~G{nGe-X3{EqaLA<=!L1=1fDjSHwKBoh@-DYUPbBII2{Vq!@1ejc7TXp{Xv1|V(^ z=(l>)Usf6b@4rFq3k@fuXvWA*eUmOZ6Jb^&d=VU)S0O3 z>iVK(+>-;qgZo*ig|aEqR4mZ9);cYB8&SW|8Pgp_aM6&_5lktSQ&4D2Mw@u; z>lYB9H8l~~8I7&wD1`zFOg?}A0nFv}L>DtK1C`Y7^ssakHYepbu>A9n?)TWwP!o*6 zd6S-JH+AJ8VoW&|RV+~AJ{7*obo&}=*|%`5UcLIQw=_zJuz(LR*()AwEn9|`QT4KV zc84QLTG2@aC~pf(|3|CL%@{jwf#b8=j#WV4Rk_xL8CW~p6C^HUK%?$f36gA_Fn=8^ z`n7FraK+T#@gELBB{+Zr>e6H&K5s|A1{V}^_f4tzNwiuo!9{puJy#NR1`$@E)k2Od0w16Pz}DHSi1W6Vy+y2) zOmggwwnbajO$BOXi}A!-h6i|gw}`af2HG*PEV8Hu&I4zcmBSdJf>$&Gx;QDHi2LBW zmZ7w8tlK@nMz;qj6fkGG*0`suMBN=N=76uGFbB#K`X-=<(^tE}TSG-Dsy> z;#d^(O~bpR1B-*pCyMAAB0es*zNw05FKq4SFW;zB}(1_zbKqUkBhr3+2g+O_vcdiB$W1xb*e)PdCYzrTM zgapWKRRlK9_^=!};f5 zNDM4JtMK$6yO$)Jl-|?2wSH<}R`K3nKH=jpQzU5^HU{?mF7}q7CQth?O{M>u=c3=Q ztmBY8G^KH$6tVyLt}T;1enJr~C|>0mJv})-x}CDh%At=-IK)U6`Ld=61EbsR(*Z&u2eK@kwA=!M%D1%O z>$(Fra@HAD(F$Dp>Ft>BS!jdAzG^mZL}C-PydW6PPTFsFztI|>Msm6D5 zWraIhPp`CgE>J!K#PF?7F9-TU*1<0PH&{y}3^oHL+ zvfivSzG={372^*yb#$!E?ib}75G-4_o+CrjDSa5dU_C}8fyKqeVKzXQCvjB_TBGQCj<%wfic9sVS=+(0 zQ)cU*irO*Vad!`$QX}#yAbK(8LdqD)>NNk$!2bj2U6hyyQxieVVl?jM8N^*ZSemPM zxik$j*8?_;OEYT4_BNP&S-h;(%L2w#8;QxZ$Rz(n&4bjmto1NFs=A4F0GgkpFQ}yw zodSd_mQ);@Z*k8Fj6*IkfG6bqkSTZ*MvV0vH(sd=0C@__uhw}*V$>y(Djx}wvzx>*fzCL}MaC1PS?25@a=JG{8rzj8Uiri;wS zq1vR>`cq&lr-$doxLOr2*I=X}hT48rnAH|QPZ+!?uK%L($8UXgC1m^PdRAfj7q`Bp zSUCY2IjCFsT!qcmrS$#<&?U|Jrk`PoC1X|+%!v#N@O9wFO+=H>4>^zAbPFPWM!E_z z>ug9>{hE&bj7o^gBaATyAc6-+v=Ca<%eQDJ@Fta`lMed{qJ;w9LJ!zCW&`7=g2w3G zDQKOtKlRg4^sWDo%)*!9a<8xAe}Lw@JkkFb0Br=t#VZUT!^etZLcQ=hU^+6gO>h)p zdE3dL2CEAwEXg=$x!FEkf+fGiFm9X%uw{zcX-3-LAiH-73+FbOx1K>;2U)lp@dsg^ z@GjF^d^=IfL}S_3c9o{_#qSmu--$U)6>1^a`V%lzRewiwwV(ps16)McjR*=_0Xu*9L`D?5;CXLV5! zWm~c6?Vr6%i`$4Uee@RCfHbX~!g*6^!u*mrpwPNrynj3Dr_Wqa^sRI{Ud{n_9NQ za$owD=s^obkxVOdUSf1~R1@Q^u1J8hBRGJT*e2Q-7m~NwdKieT+<6HgVW_K}u1KK$ z=s+?|gf`}MoB$Vfg%`)YuebMO5%FYVuD7?hKU6g|GU7!XW36N$TLa=KJ_dl#{N9lz zeO{3ar=uP{D&UZ!{A~5kv5kbP5zXOEaio3@Z zaZ6;|rCfMD`9@%m>m`H%L5?dFF<% z)$|Qrtgt1CQ5qq%k}Uj&ohvRlB?NkzKS76hJd z0l-*xL@yT3GCMrgB(f;vFIfEufz;;rI9Jw!1#D|cpW|)-z>#D$b-&Fh&q%AR9y9aU zc9SII$sgyYFKL?KUsm_2v&t{K2&xlW4z0xZ0K;7WvhK< zIr(Wi$)q)$dSCwUe;Ld4-G;*Jgdh8^Ewa~ylS|gPEA1xLP`T7X5CJkbCQa^%le+K6 z)e7CK?8*DzI1MyMwljQwi!w`PtGAlAZ1b*P4?2vvhfj&OGUHpW(Z!kauaZ={JfQd0VwTI)nSmW zDB*%yRWY@N=r!j$h$9GNKrT-QK6&B;8t`&mGYZ12q@*)Ab(GOsgS|7VyKQ2!4?$}! 
z4f$b=jDCH`xv@;e+`Rw}ttGb`sEBe9a2#A*l@0myt5k3fVv6iea(&_$C_~kWjQAnG zEBL8mtTJ~FucbZMO2~IIfV%ZL##~|N3gmLW@j_mS`_jS>7lA9aEH|yKX>m+Q>ku{e z=4CRK;tCxmiVSvgh7bZlx@(s&3;KRCIM^36<)J|`H#rWiws%bRMo|)(0t&4m{L=|W z+K446%u3(X5x^2kK>iaIYiId^+x|mZGcOr&jMUtK7ei69aIQ81Sq9>0{bckzECgJNr@_t39|*JmpV!c}lC0YJQDaxU{EY)ELi`#vu5A9o?i zgh+{@+5imeLve9f&JhX@Fqwe7dP56yc zdn1e`Os7>NAxn_PYmawEX#|?mt~5PpynhqtvQQVZjGAeVQ(Qs3HP1b?gmM{(LhxKa zJ3g|%n5hi>_$bUPjpgNc=3b{GNbH$iAx4?#z{=_Zks)}*2|OKcvXfYgd^-8Q#|{G) z!>f-@Ti$WNGioC-?sXDXR<^y=-`j?+Si^=;eTNn|O0BCO7$5h+dtHHQu12rka*~jw z&?AHy?G!N$tPA^7y)%3$?BO6`8AnI)J6I4x5{Gxa4%G*4#6g9iy?eR124IK{oB9N` zEH+RqYX7NLjf)oYh|Q~)s2SsiED_UTD!#r){lLIRjZ+;*MFT;p`H<0U<7xKy;{CO# za~lx$cB~hK^dpiV0vcY@g9Z-1D9^hTX3RD2kUaGK^+eI0(LP$4q= zGRjo%sAFVa<-fv2l=Yh>em9s1v+Uei$EJh0PYW2tvuzQ2T!GgY)wOKWAw`uf-jD`C z4B)c9eS3IDW*x7n4-mMzCr9e{(Xm5zR|&fOW|b&B+|UL#G-*Q=P@sTPaFQs`n~s6_ zhv4O|jv31F?2BKDd9ZM*par+6YJ;s&)MfD%K#yuvPlZMx8p>~!E}UN$aTa2$URQ)d znP*n=CqS=;-rgu3GgJWCA7p1x0D8}FcZVRPW`Hhh?MBgx%}yV)QQPtIW>KNu5K;Q| z70zo#wQKGi`<(s1;@I;RzlH8VBzGYs7VeU&wj5(cEz!4?mEJSI(C5?)?BBD82I_^g zcQ$JO$}fER@-w*nEiV0kb{mT8AJD<_%FET1PBFvCUp`U)fUb@ z=6BuAzOd;fyW~g!IG%ElENg)Gdge>1Z9>&SW>>=2WG<~E$f7cRphaRkzx8@+)u0Ui zt9`FmLZ7r~4St<}xbv!JMaki| zl9D1b90^DqM|bSURQfx>Ftxz1aihrC(dpf&6vS6e|4GebF*)S~IYTMYrqnJ= z%!&)vOr}4gT6XqMHI&_<`H$+5kC)ej5cr*|B`Fl5zQOA{qyA#w9_wc_i)Z1JVIYpj zBqrQ$=w{tw%o%AZ6pYq7p{ExaW6rXQyf}`QjsJcVi8AucKf$Q2B+o>v+=_06j^FuH z>P)6f>&__cO%B~H&aXmvb*`AKqm5bB8fV+mqnEs8EXsP1hNEn)PS6fpaQf01%sT6g zNt-kX0Zwn1@&D(2I!XWrgc}Wk3`fJW3NDqRSsa>-i0!EEJ43}ONrO?_13j6bB9&0Q zZ~y0P4GeBYvpCY+fS$hZ8|n@J0lUMjGtOC(#`WG2#e*49WstSBCl;ltA_-c;U2{(TU*-`xGQI!4KvMueZWpF znkCaX{GZMP*AoDk6}#d6RuGbo5dRiQa~&>te7XJA{cB4r-z z^uyc%!CZ4LP=6kXBhwhf%eDn z1K`rSpu&*$Lw+6aysobsEM-Or4p9|Fzk&975{{WcYurvZfo9Ye!Nl+y7Uq#(xtRV* z>>+Jv4C>k-q2ZzCk%k5*oN*hOu_5mGTdD0g`Xx`|HJ^jpY>vo(!v$Z*~A@L zNNJU8j$0rGU>>#ovaxYF6hfiaD1erMhn+!X@cV2);U(9Fzr+f1k1uD>=uw33lAHW$ z7k;_-{N`#UkVfTDNr&d=<>cH!+Sa?T^c(=cAt4LRpZ@uiX?k>|MrlB!j@IDdg$Y2o z8&O&S=>R6t5PND}#5j;9eF)9usNfR?fB>#-Uz#Xqco-O15hnr(`m^7aFMP@L6KJeo z=cWS%go*I|*E3YK*ZO$jwOs(Ab!3h#{Y1k4Z=8gnjZjWNIHFPLjEdwVajIwuM=(}^IFsXmvjx*z5|MIWp-TUwV zc*rIsy`wK43qKmFRN_~cqmqDt zr0r7c-vXf5Cz8cw{UEGRIt&>&RmKP=2isLm%i&dYbmfo}#L{V08zINFwmugRn)k+7 z2sx}AM~`Yg7FYd%8AU|;RvnlKs<#|b1c`JSOdMTC0&1lDs)`6+*-**~6fhA~B&Ht&7bmnVIW>-QN>HtV z@V7=4o5W_!5qOS?scCf@Mzy)YyY%Q_fu$v^N4dGVm;A|8%G16qyOQZ5jZR|YXw#R2 zto9(8qd5nmMeaj1zy@=jACUfe|9O!d8y02zM-J_e*l9CcKpgk6rvJ@JfPtK~^Dwha z^BBfQd7HLeeDw~OWyC(@#@qytqOF~_q1ERc>htL^YHI@&4a#$eg@l@$J|b~0$%vbN zCC)hYb2Z=}0~29UG8;XJxK!&4gcF+SE?tZL8e1QP`bg&xyJ1h%R_!l(^yQBN1NXi# z4;L_SkNGz4ccWxZ#yX04)XlC=I`+-}WkODC^vJX#92Fzv1ARfbkw($aZo7>a8DFyd z_^Wl^eRam>Yh0S!wpk%-ry9Sk-+Sp}O=D*AHP%k_G9;h+R`ruL#;?P1$2zFVIOs(r zRbMg@7yS;?62?qZas|E6+#Bc4s*J_@*&~DQ*SbL+XyX6>zRayJHWZ1aBk@>Z!T9ratozI`-;^3B@pbaY5QmRw~z$$CcQPb(R4JkSPJ-NE5x%%jG{>&o;z95$9V z$tSU7-m`VNuAq4y7kBay!>b>PSaX{6{Dsv{CnN6R)D zzbGnFPafy;leq)KdVgIrFdpAnzEvkbv&tZJQozp<(LZR{)e4PejKYo$Hc8&J1G;&TsvnvvH* zKZ$&GSrzOw1RGHws@1k4adc+jHx8VQO1|O@r#s1FQ!P1eGe!%Ap}pmdDa$8B*3(O^ zH)XR07K;cIm>X6D1Ef$IFllb2C7Bt6b;)cv5JL$n#Nk6iu>l}#B_>Id<%QlHZZLNU z3f|S%C;D949{m(mP|z}FtG-fXgD5luuRna)@T#~N^W;_|V}U{_d`xAKUDP8fkVpsffhNcX#l{Y3Naz((PKZWs~!&r}n!u8I8rs&n-sv>hZs6bF>t zqzB$!=pT4mcIOHXx!r?ql1yhKv=)A$8;-RdhzdbXy)sRwU?VV|KD2s}Wf&0`{~Ybe+$l!dSI!GB4ia_#f%$>{xphPYa{v27VvgWdG-TaQ zIRO-dH-a?Lb+bSa@^UMj-yo*Xi8liF&--<=uzYfg&yStF#x~|aCOQIzSP8|l@4yl5 zNV3~dV)`Ka`ZS!McZQo-f>Z8x{6hk~2Dl+(vwnHp;0>*o=%#BEJy(_T0l@WtS|>}! 
z%EXTUe9NFMEhTkw_$*&-*_kq)TkHqdSq@h;96R#sPpMbulsro$3cELS?2Z)lB6OfY zq`Uen(feSd(~!_iaAf6h@o*5>G@m|Jsv`eoi{6f4jbqmlk1=^UUA918O?LW0F<|cV7kms1#F6x5RduKo%xM^ksdJKggZo$v zS5jo^VKhgK%*?8T)G~ac<=8keP@V&IVA<57$HXsHZ_C2}Z`q{W%6Ihsk|K^7lFh81 zy&4D|zP%g=bTm97p83)&Yt$!ocj;~1u#;f^jNYd{L~IZVR}a7`(E$^xrluxKr}t}^ zs@=%P;qarF4xk_TxfLJ;TL1U5Ag{PIe`?L#c|+qpTlj}G*35Bz!IirTa5e=okDmF4 zH<8VvkD|5#@u31DA|knLQvLg~oYbtA;%)_MD&l(CZc7SgeZx1#8jQ4|NYs(@k9!6< zgmLrcisD{el4YMhU4Uu<`<)J+m+uQmwj@bX^^3DVf4U#(vAv^{h&4_AKzcigbX8^; z;L|Ns8ehVCl*U#rvLSQdiI2qI9IbIT6%8UuIZAIQ*C`s`{x(*GrM6^Tc;X--Rf-D^ z(qmUhmCPd8)x+=>zXm69GSBI*#ZO?%mOnn@k_W^S@OpzQ>YGiJU+1{T3fBm%YCnv-shu%?e5azb~{PN z2_OCv0bB};zdtj;d7!^|><{$L1L&$RPZI@^wUw3844GqDTUW=I$jrX9gecqR`!Z&~ zN#s0#-m>Z>W0otum*MM;Gd<_Nirw<#Okg>8P5D`6c!#0SkQtMX;eC-{&1YG=^|whf z2J#e_e~+Af@b`<2fkxeWq2oC@V?Tk07(kGFP1(`mp1W#lXq24!}Cn`0%4u&CmmpDUsS{ z@MOe+uz23VBIP^o+>gVDLh+Vx$ox@Cq2Ns3+{|MJNyJ8kh_U;Qe{#X8Ztzg~4}{#i zAi?G7MdA`^nbipl`TR*&*s*1CJ|9@*}Q0$WSJ3uB2lpl)7`3R&+LfJ+1HE~Fo=!jR@yCalad zZL30_Kwf$j<||6zCq3v^Pr&j`JBzvG^Z+6f$25467(QgfD+#m{#K_N$5nGs0m z=SWupGPSQDSUcSl){$~N`&y_Ipy_3xEENm&W4a8~fv#9QQ(MR3s98@`nW7Fe>%sG^ zpdey?JqUGu1YHQg0IoNKi+m+6Mt7jqKVyyM^tbtPoH@0!R2N3{-HhmpJb;Z&92%ie z0H?cSCA`Mp%v%&gjur$xG3bPELN=*4a~uDka0-(V)whwvO}}xYCuGed@D-6Oy!($S z!mFUM+PXC6iSjm-OJ>UQ7zn5=puP+vNL=IB2I&JCM~2EB|JnQaQy>(kvP+RgOotjN z){5f~|hO@|w7&EkQl;GDsXs8roTAziGE4lLGNo-05S!mS?B$yK{(Nhr*iBLj=TiA-)k z85*2>Eq1aB3R{H_i|@Ky=Cx^0(iG9>7Ed2S*`u9^(~79y^z^x;>}({l+JfxE6Db^7Z^ zp|O1APY>Q%um$*>B!(R!cf(;0Sedc|vYOt~b=5%XpyaR?F#C?K2NH{C(uIIN6=!7? zUk5kQ;_&tN0G9PD-I`t2Cc@zZ$E$_s{C#~lw=pJaG@atWFh;yL=W{#+)=NXDoQLAa z@*Qb#?S~%0FZ&Rcr=(%cyLTKktJmv)8-+_%TZjRlr6ri*Ho_cmLVn?1ab8pN44HV= z*$iCuEU4Mw^RSYnYUTIvqSkV+sN{wQBkjs_+XAG>1OSaTxhiHm;hN_Ud=0|z#wO$v zauYLjx(z~*tr2!c!^0x&E6!5{zE{n#=Oo23_;L5cI-1iP^LvtL%wPMtw!Y~OkP?pmy9c_YmioXJ82W$O7 z&U(f}) zSIhwAwet2eU;&Ky%n;77y>(d_@2c{!1IWt1Dv&}^K>C`8Xg}jQmEcL3c^M53 zd)m1}VZOO$?fxN7#Sd_tNInSa@M($J?mA{hhg&l-{%xKg4({UMI;2G%H7f?`rU_Yb zFcMHOFq&8KT7ohE=;*?0c#hkTR^@}*;3YFfpNeYf=uD|Q$?giNLCp8;?6%r~{mt-P z)=jJI^Bqz?@Mcn(`$CsR&Wr$OjbSA8|w01Ekk!Zh)cWHODwC2 zrv#dV)gb0o#&O7OKnNTmwc$R=yeOfWed&Ik-49+~UIRRjfYp$k-psT(w&1`8X35`` zfnBHf>wS4*S#p3?Mkpd<5eF1 zbM#;3Tp4-|v_KqGDl4#WQW1jqpu-(h(8F*g+rxl`1 z@20Y&9;>Bmt217xuzh6c@Wyy)Z{F{?Av#JXog69V^y3da`6;|L^y1z6<2$z$i61+9 zl(guy&9_mN<2FA5rwAG?cgV(}gI-56?|Y$<*$yix6Bv{gIJP{Xxi`VV{O;1=OD!mm zHiD-hr!2f4u7G7RWHb7F>TBE>EUQb0-Tu1B74x0p6;Ga9@o-3=zTXqtr-#{(42|4a z7#w>hlisPNsI4L4l)s%Jgx$r#abjYD^0$2yty)NPKG6P~5Ie!_Y%-Y}iX(!Njco%C z3?d?gAOnBMCvoxQ!`at93gx@sIvTlLqV|>DS*0Sn{r{orJHV;_+y75?kw`^lXG9s5 zsFRUZ$S71uW(z5aj5Lg7lr3au9-~l5MP`yx3HhcXBP$6VQRM%6`##U__xoSh_qnb| zg>%m5^LgL*{aSbT-jTnZc8(J|0zQrYLio`;tuqss;Vc4{uSNphMdajGK;!ZSy`SgA zRTqsAI=N;ql|;pM{IKyQ7i&xD$;qsgvnGt)6Fsa6`V!CWjf{y`8*!4HWKj#Y+K{cp z$4;EMmA7}8f(1)2j*OsxUP+&Okk?4HnOtsAUTWs);t*KKP? 
zm%PI^b_pic=#ZO!~GBSm2ysuH4*#LYm-?(ZSDw%Lx zGm;`W z)WM3j-ssyx^$)6juP^nz$y_UGmiq9(fk|wr?RlB7?FXzhlEfMG&h+;9W#3f{ zfwnumTHVRk?;Y(+V4mve;#-H7s)b(C-~7Zmqre`^K=;{hw6g=H@`Gl36`4bPS)RGNh8f21!5;e5^2Ry?nL8*bTq=@FAA@~ z#;mEi+4IH~uE!k(Dl84LOG5V4XLA#X15O*fcIBQvH7@pW+#7r0(p>5m%QlgE>2F2h^7&1NQXHY5ix)oGc zvF7R8z=(4D-%VfACrP4+!Ur)*4+#w{X9}zLpH|h)ym(f3ETC)VKH$8 z_+ENg6L#J3?{v+K!D2p^{4!mM=t$_Qke7@W3ft_O@|+*R{vnkN<8aByHW(wGW4>_r z9(kL=iVayA9j@-~;y_8W+#ugE;1>{}x#ON{LT06#H`qPF{NF?0zyD3ot7eD&t3Y`> zwmi8QW$R=357L*&Xeo#262+DK0va%L|;4*%?_a& zdoLhv?dihbBjdjjEm(s|wYhBx1qptPDDj4ak03Rw#nVnxp8fNu4j&o)?l|=2IMZTi zb@CkiJ6tG0wcYQkLG?=5F1;L7K!Rsi9`CeNng{IfXA~Bhkgyc}X8(W%XL<&hcGEH8 zI(ab4OV!o-9ie#!xg*Qx*z*>Ciy7Qi%oi2jy@A+V9W1+k!d7OhC!R|v3zyU@F%Xb- zlbSHyXkW-pg`kSaaVPYTD5#n7t^4N3%|z4bX&yk|t_*6M6|){JhvH?it~{$(GsDQ` zhdxssTh*}db8iX2Yx#@Qrz1bb8sO)Bc|2alhyj(TcX2b9w%s^F+W#-rzO0~#!jXwl z2aE2F%RPuWUygA<0Exz{ArnFV9dN}T198@Vsi~>Yzjtnd4F43OQV7gRWb0)DP|_}A zS9LsSh;!No&%0@P^Xg^HK2xuhSX>%}@Nr$kX_Qw2De+bGC&QLNO;pIsh$T%e8Q(u_ zI7}caGFC`;LBfE*=eX*Bb?yL*O^@SdTmhRxTZJrkeO#XT!YVC*a|^g%&h+zG2n8?y z_-E+X#Lu4rV7Y%hx}vG#c-+BZDW={4q<^eGc0~ht6eGdPfC4i^p^x_j81Dx>SAIx@ zdH(&O2{Pp%ee%UqAMBjPC-T?^Lky7Vw9B{tYL2f0N*e#${10vpvTeux?!`aTGJxvC z^k8oL>fh)Hw3ci2YVEA;z~zY_v&9`FB_x`H(#Zg}NC-FW#jWeOxHVv8bq^w3FOVdL zR*nw7O59u74Ij*Kvib;aLb5wWIvF$LZphYie{I4Nswcj92uxxLb&jCi(+}NgKmQ;z*3j)RPX#Fc8K=lhc8tP6Tpq zZiO=BQMbbF;AnI(&r}4D?w5APv8q`!=&Xjt?9MafWo}g?W$Ou`z?fcX&J|Aw;;Ol^m_#cRGhQ82gUd}erIix9ZX}+=;kMg2>Pk85 z_*!xo?cd=nbKC49)ba; z2_yS*{^Ex{)fxO)FB9!9Y47Z8aN>1}Oc{Nvd6m!3{NcIJXFswpTNdJur0qSB9=2HI zpj+(+EzKiV2JcEqiX9gv2*#HRmoI+8DY(A%j&CN;ATDvK7tX_C?DVj053V|bvIy#> zSh%*JuRES`qrbq^S&7l|v%{>Z&1A9_?$(#u2-O)#U+)fVP7(dLKR&tlcBo8hd( zOZ@XFv_ESf7!Dl?u!G*Wpb|stRxF2-Nk#|15<_-}1%hY#Vbis+02yXqh$!#9MC6b) z1u}6ZCb13+2#!PnPB&0-88UIA6p--TCcu=;JI}yVBZ_zw+p(s1@9wufSRiZaTq}}F zJ0J)sb^D8K0%I7iI;c3YH$e2pEoS6%z|8FV$lhX)P{fCtBfR=n2N*ANgI9BBkf$P` zyKf(l6k}lHDOKT(+Nui7G*x5>5{#bV;g>krNJ%Wx=oqOqj-`)g#u3#>%sWoW$m^(q zzs2nA-a4G%J?LHw$}d9A3et92&(PE1vq)KL@_r1!+Bm#6d2xD0ac(zlv4l1Hi|Jrr zUmq=dB6cyv17}N+v;THHw8S6O+jv-Ywx5THe=$l2eY;_ezJttKMGz+y7qUHr=G5#g zl+0uYvcA9d3vr352yG`MUTP!esXQJl)lZ_0&PM;p0V+2?bvd{wV^s?K~ zCBO$Xq2S~Dpb|%%>iy5M$jJyN4;!s+!KsY*}_ujyjh zL~Kdm__zcGIVlukP=O4`KR-YJc}svg&9oNwWa5aiJEHy~?bx4<*QQ%zYJ4_m_>sy* zq3q(Vf;RCQgn6OP{>wDjpl_w%&0ghVfbstg!ZR=hrFV8PHGpb>UTZo26hBFo@^_=1 znS6Ip1Qw)6Ztk2}yXK|RQ@ml5E7z<}<06y%=G|;d&;j#d{Wea&>*td(G)17W;j#0_ zuOkp6TGy%GQplb0sgc`)j6viI&N*`i+7cj+Torkx#(xH~LRP&NF*zfb6@S*GF=o_6 z7kim>)|{T>VE>wU1-S2TVI54Vfcx+c+apUTF zYDh;~#SaK`)iHufeu9)AY=a^!Wcp4F>>6wZ=;sL9){=rqI%d3}G*juFPQtY!4WU>f zu_Ml|ikl2mwP6)FaI&vp)s=W7X%zLlo;|jJt}(jF#HAqa4v8*)`q_`2Xox~Y+FDZd z1*irItUgE$w1r?IF#7u+Xgd?XdF*>AdaWw0Uwdtx{GIZ+>>GDOZC>&S#7i#Y_~Gsr zJwQauzrU%x>h0~-AkzKf!nh_>8(ruO%p1#*w+k>Y#2pW66Nxym$icLVCR8h;^Uj?+ z&tV2NyGZnd!9hV>_3sc@u7iLf^NY8U`ip&AZ(2W;9A|e*hQB+Cw7Ly%7(Zld30bNz@Q+2=S0%rBDd zVqQObd~bD-ehIUXxf5Qq(LPD+v*TE=c%!rh?!+b>5X?!{KiJtj>*sJE|P)qE$ z+&7B5_+Nk$!MeJWllxTpv3^CEqN}BDI;r*I>X#bdy*q&m90ThXV9S^H;<&xNCS;e= zWzLQO6}9sY73!^CkDdX!j$%qmDn0BKB3mt$0 zK^p185L|9@U$S3NLsL^1O%lf;p@4q$oo!*824_LQ*wKG& zz{iA~@>EP3B(-Mi)~Gz(Rx&d)Gnf~oJ~5m{CJ_Rg3`|$RSal7eMGb_8!O^UN+w|V} zeq<5W;C5CoJminm)w7Kz8Ey{)*5vIZrieirncoDdbfd913e%#V^pBMV}( zezDVm`?c}#62u7_l!O0FMkFP1pbs}ao}83aU00_$?yY!#ga88L8Bt_bK=zDuGKjO? 
za4pg>KL8N_0cIqx1j9UXaZnauv-+$p7vY9&u#)}!0neey^^9j zb2JvdA#@+!{XfEs~C zogY8PWr4%W`E_oW6<1Fsw+wm64qIQ+p1<=705@|geE=VEo7@^k3Kpp=V*KK2Gl4d2 zadIR1pd&t!M?#RS9}WELKqEq;4o4x4++-BR1jodIryD(xv{KWDGsK=gX?5&arXgWX zT9J|Hm`pTrwUsf41W%2o@)6l>6DcKzhG8|6+^bfZK3M>VzW(&nXS-qKTYll&rj?GR zw+Z$0s1kgaC<3cnX{&{Vj4VgEx$cFvrA8|4|^5D_GwPKF_R(XP2yO z*bRYQw@P!+%tS-3sICl^;JMp+?trCOD))cUKu-lH#q!Zk)QH0)^_i=z}3l>VLn zWnI%tmKi#!K~9!+LOb~8O>v^XEGz|qQ^VSj&CvC6oi|n|h(ZydV5a(6$P?oMANjB# zqu{vkannlFNc&_VBcs>G_xbni*;8_R$&XJZ2$C(V*^3r^7EG4;DBjRg#Zqs~B3Kf` z>|}(`Ujk3GpQoO_z8H2log8z(ni8>{B>EiZWq~{JK;}d%Yt&*)a3#|?*i02yzUSb|8F)Irt`*FRLIFly?NxZ&_j*-xwsRxZ(LP-ay;u z&}%SF{6v85xoNSV5AF|KYrZ!7_&635Sy)(pZDg4lU(c*z>*Unjg!G2;TbyxWUylkv zycuFMk+9NoG+iXlNOe}LNN9)N(nkk5#IR-lB1JGDB8UB?-u_4Qfrx&oi*U9ux!z7#sZ0mPIxeO3_ivn?&s+} zH?!rW5&FE=R-A*Y2kj+S`8~BX+|klmX8 zZsL2RU=2QnhA&}&Kx}w6i2-s5OJ-gsDy;wuYxXoJ-BddCsT20)+MsJm@=06=L9{RG zW3jXF%Ep*Kl2U%NiULgYSB0zkd}CNmTFO_ADv8Ph!0|=XIC9j8%@;c9Y7BzH2(!cf z6-v)j>mf)zIG{JJmgji*_;DS&_;kz>U|rPA-UAX45fx=X02oQY_!WO6a%gC%H~fM~ z>+X=TEU8~c#!OSrR5t35Y7b>DEhV$3k-s9F2W*_2jwm(;JTYOzSkVLV4T)RS?O5rC?ifG<)KS8BcA@^-tYqFDAvHQmUN zTW5<$%Olu+yH=nF0ZSQ6zX1MY<+lq@0g-_Hy#Kica^xZhSqNJ&o|k;SizXRMh^4M> z_)E;i02S|lHhg~$(EXu_=AzSRk2pu2PfgJ2$188@ODx*UuAIMc;j8H<N%zN@(1O zBT;TTI#+QfLJYlx=#SAoZhW5BrwP+217wBvZ!YOw13ScxR!_Wad34~ui?}DvM?S$o zs#J9UzJ1|A?0Ac49%#6dn3(8~6e}l^dY=oPWpwNmRu+BzDI;|Ey`lm<$5*k-WpoU; z2sRB_jdst?&6$n5s2D1Mtwq~WP=3FpWaYQSeWWRYC#|CkjvRtUbUgL)^jxKOp_xT? zP9gEm8_-3gPICth+l0H0`PDnzbU_Ho zDV2n8xCHx!)l~#`xLhR%`J)8{GO-Zfin}8y-r|u)JRX})od!{q3E@lI>>*&I5R&adwUJpaT(kyiqA15Cf>^+f5^_NQ>j^uZCjB`7a{*lug_ z>0^VH`srPM(RgIwBn!aP7dm;$pYPAAXkw9vM34631I$_wmC^57gXTjpWPl#A>b`+! zFL@NhjxCmLm{K9M2qfK5tjr2x0SC-+Al4a`gUD?$6wClmBf5Umag=dgJv}|CWL%x1+e=#!0fd%VHzx`h44a;m89T_C z>N`4aQLA87h=L~ZRx5y8abQ_ls%LOKl6+ZFt(Srs4s_hHjkE)2u`y1s0f&0od&eAKG zLLJ-giVoOL;N~7pgol}rg4o~c&?+WF6yBoHpBLaD&(j7pvOKwV>zCHcJ(Jy{am~8e zJ{Q)j-{vd%>!4C%WWUAOMdJz9z|+j$2klmn8)8w zw!v&~2S=nuJ}#Kw01Zng3g(|Ap0OMu8#aO6m4v~j6-wwYGlCvLu3FS_Ksxpq=3~(l zS2;7hDgcfQvU~~)M5QVDoLy8DF|vlAx6N^TPMjEG1HRx)0>%&i5DH5u-ms$QOzKxS z#xedoB}1f%;ps$WaxrKb{Yw|2<$QYisAI?zl!p$Iq?{FI0>2X~Lve@q?4&pb*^L^q$-0QC+rwBY$D5D7LCmAa@{p3^MYBaN=KuMsl~7R>07>8*3 zV&&3fs}cX9`95icq!P})RJ#+7!$bq;AmoPHlO8M3y4$>nkxII(y(c0hXqDWgL8aPO z`poA>gvqQPPIJjpzIkoc)TV}KuVcPUgCnQ;|zc&e%yV>nm&ZPYAYDs(v90XsP-2vLP~K>(8N`A=9VDj z7zphG%K;03IA#dQu(>&+p7|-<=fEg}of*|{ut31}ucE3hAi~c6fFO~{`QzIM)U%9h zprrnS;})mT3P?XGsN+8%_=h6WPybo>@@a^~9(qklV5}uF{GVf8zx)SA+w)lg?hPoI z2>F>RWO_!P9hr}$HU3iF@k3rje=NaAO7U_w2O9Ms4Vd|Qpz=jjzkx;=x+kKo z28&115pzvx#tGIs5&^q`xZpNlLqt13U;ZNub3p<(q{j@9hO=P-3%P7ymfc23jvD&B zYjW`$qO=^xvfAb-@qg)8|N9z#B=`gv6{(60kTMo6Y1D)6d=cCyZW5diyJ@C=ma^~j zPIA6Ey-l!_l^4|))2^5n8!9M7E zP#D*|e24Ma{7UQRPoI)u4p`a?AzK7-bQh=~0__G;LoD}=;@#E*EG+Fn*A{G1DBIBl zg(!%&6NT>}c$|b7MSW6C8?$XwPmRC%paYIou?-tE_l-h6Q?S@S;tfA=gRDaxgHD5m z6ZkAKHMQ=D;S4`#{)c>nwelQ9pEL}CH@1&hPE0Hy!tI=nM#@AK7&5&L8Ot;p{qGhaijs?dMMN1%ey$!@U>VN+2hq z3v?GieTe%^PD#IZcM~0*pf?VL;ypef1-Y?{k?Ux-iL-&Z|J;z8;v)9kfb-_ zGW@&?6>;Weg_@QYPP|*UyPJSG(@&SF_Pg!`e7iq(`*uQk;%3V$0AmH`(vT&T?IN2t zh3f5}V!p*ne(&eDHP+e0&s z2RtT0NP`WbO9+R(x0`#ZCS**M-3b#*C?LW{D{rnvXbf&qQ+gBPfXDi9rd6 zf^qvedH`P(R(e@lqi&jwldh!fhR*!-1 z9FE$dw#7jY%VtVjz%lBWn#QCRV>$W_x1l$_kHQcJ=?|`sZ6sI@_{?+6Nwg_!R=Bis z-MSa~tRK!Z@LFVgkD6hFzez7C&Hj!Uob_FMuhC zCMp6d2xFpuX`^E{eCGEm{)G~1IU$rs*Xr^Gia=1frpr44=JjykipAfHNY+SHN{Uqh zK9;>zf4=YPpu&#Vk46;TP8g^;?kwXbDmdF|hhe~mkti+qJJ$Y(R}?G5&x-;BRy?f$ zv0&2*Rl*`7!_=%19r-q^oVpW_uUNWlsTW-Zl5v-l9E2bID&D5@M#dXpLVDL$ilXFa zxHFVb5>ifKbWSIarb5P&wSE0DR0pCFbV>#$WZ7^vZA3WR2F2YOAH}7eJwr!K2 zXKyG(NiDOOwvN*b$vpw5b7)x$p^7_??=uj 
zY66q6L+yOu3{*HcehKx*fa}uTJ6ar#U3cB3W^-D5aZ{gq&ONB{5oYuFF_-Rt?=P?V z3r+1uDccZjC?&*HwehuEdb~ABe?UB<4t$WlG0Qmx$yfM~1_cEVgIWY}?FiTU&&$+G zuBNGesjdsy_*x)5#B`V&|0q+rhlhnN)im>W0QuM56L?b9xxUm0jdBI7Cmn&ykE{Rv zem@sRB9#8i07L)(`H5>^RcD9_th(E?OYj-{S0p;rdsgofT>P2G6d1!%>5Hp&*z>W# z=qnbn;V(35vH>~ zI93QW4`jAh6!I#y^> z3T@$gMV3YvAze8&D0%Ja!!vjuf}Im?gDQyz3})QeQFZC!B>FI8DkLuw5Q60_P6<#S zt`@O73w8O$O2PXX3ZEEy|S8_^(kY!y^%iVhyj*KPBQ&_=Kh5fD#q; zLr`5XRS~d23C2Dv6*9-wH0%$JK>R{rY#gSJr6YSowD#_e1|hqDFc)^8e$->$E=AD; zm~HoBSk^k4iuQ63+J$|?*iaIK-SsIJIXDK70nj1iSRhy5qo>=@2Y{pw0AK69z)mTG zzj{ky$ybc&e%L^{;u-z>QwSQ{{)FzssSIxr59LJ*iCZNQM|xk$y6J%;fScRrx@rG1+WQi1eUY;o%7P7Ylv$BXCA}ue`cp4p+sE$ zOmZ4{dIQw`_pF~CnvR`^GC)_qO=jgSK*Tga$XxLPvoM2n)5FE zfLewkHOpiW?O730)AGEB`}!Ec{3slYPulWmQESRebUEDr;vsymaGU<83h~M9K#|JS z=ouT6DPv}O%P=YifMfaj;^o4vU$Oei4{j3^m3$Jsbd^X^!bq8j z2>YtVmCJSe+LFu;q_&mD18o-3AbTsep%g(~U8o+hYxcidfcNhJzx1HTGkqNd^+V)4 zjzfVY3|by89>C^0TL6UcbotewmifYvhEHfL3q8CgmPp*|K&uC;Ga#?hviytyQ4;@{ zLBF#ckc#Qp07OA_PF1WuVCcT|OMnuSoP&9w7FK|t-i}Zi#bVPTAgk&ZFZ6O+(6=o@ za;1MC^u~JqgnM&w;+yQN5M(;oyiVpMbN%>mlxLE|5wFcg2G-!%+b2X^H2AT{4lBnzuLMMjvkb2fHbX%gZ4mEiac;oBzpkZ&s!W zz|pI`*MwL^i3VV6tcV-HZ7Y(A(EyrxWG=hrhu$p2x!a~FL2>*On&t`Q)VG>fdT-BH z|NOY2w$=}veWBUjjJI4MeLJYIOyC6A+x!l1+T8ul_#6M zU{OBY*9zU|XITXJQIuM*?Ryfyg&zCO?{W;Tz=H}(u|;K{Kn^h54l4G|$K?F$VJRpD zf+fDwBphcbblM@9ODPl^1V@mSHf78GYqKg5$FtweESG-PvMU=?XUzwRqt08Fi+4Fe zHYjbi&Q$s!O*O#jiFkOlUX~wzTL%eWOA6yDRE$%N9xYv7D@2|@?ug!NUQNR%44i+P z!xKspPgq%5iwlKU^Fw%@g>kauB1`zBN|Llfu`{6r}E4#fMglRO{5RSji+ z!a`ah^Cr6b6<4;pivbU-U`e1SdWLKE=~~a5yeu6$V`I12es*6sjC)ltdxr)Fl>p#(sm5uz{Q7EATFd5eL@K;VN^uKZj4V2 zcUz+RfVUK!AL;8e$#d3&7Zxdoc7^U(;*FeZF`d&y%uSw5vfSNqQGtb77gIAbkuW2+ zwi1>dmX*>&K0Z%w8Dgk1f9N9Ol#EAe*#$v3*rG^BJpB_0do&)r#mBaq@olJHmMDTE zOeMELO@*>>mY%V{lJ0R!^G%ngU%o zKRuiC<1xbFb!rjG2bwCv{ETvP#)TAnUc4BMo+TPT6n}4;e%bR&EpV6Iu54V9D@B!9 z0U#5*2EQCFv7gX-{8$)+42ejMNV%!*sQfr@*h^cWAOT|0F&b>J5OeERw#nYYL(ycH zorM$fcf3_5u2~dH0H#U{BMF0^m-Q%tTt53b7n@ZyowhZEdHDA$Zhq*A|5KYqcd*Qc z?M3ZZ_Wl`^R{-u0S|7G=L25*+jaU%Vh)MmX zEAw<-SXwe$-l5a{x5C0vJ&D6>{~%d)2Dg=AKoEDmTPK!KG5tOTzs>Ox3`@Pv)Lv?oFhPTi>INfla@&a#?cD#LL8XHioixfeE5T9!+<9JJV|uLNh1w|U86t%o5Hc|?pDj9UZxnIs!>GejHsMk`1!@1 zx5Z=Y#iK4&B=u~5S|Ig0PI*grU_;x~(XFZ`;fPPEvvnK`ZtJC;8Y`zYN^WNo@a(Bk zfU3GLUD;nt)0e)KsPk+&=6UR%5ShjcCy4c*`ERjov%jPP`q7h5b0L6FE~>K6msjOm zjnCrd&jKSdc5e6&daF_5q3lxsO7LYl7KhG0`yGJP3*~=t`z>y@dv&%ItF86HoZJNe8u8C zY>3R>nb}!~`7Y8^j^g;L2D@o~_T(wZC)nWnE+}7*RG!X%{#m{7sWc1IVqo@_CCS(GpeqOsUPES{+Rgb;6a37)UUr@^5A zJ9ZPD)#zD5Ni17tSBUqs{>Q|GZDq0uVbF1;mV9o(R0CPC_|=SMS0hrjB?G|caJ51a znD_eiHV#eDnR*yY#U995E#Bfj6U?@JfzqniG}PfN+nkF8Eiqya8u2+yGrl&5BsV|r z-e}-9CW8<%ZbdS8G8Le?1p2BBoXl9;x5>5d*>Z%o@ycd)ETNFnOwKB}vV{Rz<+S`F zp%xhFrINuGZ-oe9gdc%((R0|OC#rP=;4J}@O5?z2uIk#l9ZBM>6F+{WoiTxX)3nk*5JSgWKK5up5yL26{QTo4EM+a?dBDe@3x3@v`F!e3!g+ z%TDYT1ng}bi=$N%S5I^B(a_MT9axAO8-}S1apB<~Huwo=fo~%zIwO;o-+MxuAj1=u zmAy+mxWw&O4K=Te(bM_q^^kU�n^T$|Dg#*Jkgx-!E~FTm{`y@g1!PM6G&R7+md7 z!NXFzw;1b9192Sfk1a)b^G~;>lR_+O|EF#n7jyp< zS>4p6JNO6@e$}9zbKSCyQaN$3zpIc%@;nYW_4%+yG=22DFo0~OpU);nOS~fjiDR~8 zMC~v=j;5i|gCz+kT%8{?d#)RP5DPRT?T=Gl-T#1@+B-D^h_5RB{4C1lH5eiHkH>Ao zGkg_a9*F&f=@N=2t-(jgAS7#_P|baOdZ3r^e$UFxTmue)_JBl2Lb&p#;ybjWX_w2> zp_Fk!oK0Ry&>0}nwAA@~u>2vWfc+aAF|5dA4$Uip*UcB-jz6xOq9H9r7gVHJ_n?`LC%mO*&%Jtipy&L$W6*A8MZjKL#!a1*TOdLWCWZ{ zHjDmq*aUGlFqOEs(Apha0W5V`078-)7zSHY;I4{bFyj<1Di81?fqm*KxtqzO@#Vt^O>=<-_7{nq4{?6BA>#gO!lYqFA zNWZgpUo%t@i)Hy%3kioA{DZbyS?JPmD>enRczLJdKy%4s-bq~7 zU(2mV8ouM6F^qP#JbveH8oa+?H>j&)kK^XfEYp!6qQS>*=Ihpjql-!Itk1)DD?_Hr?vBIJ{H- z_WJ#pWf!a^9l+JGEZa!Cs^Uro0Rr%?0f+75dsCuzP|nj>Yh(iPGLs8PVsUsuZ{(YF 
zele>=X6*m@XLcj6H}9LD+g^&dW&EBOAp;2bK4_FjfZ_K-HW(aLoKa!kMYrnk~Lj- zS*UxO`Wb2?DNHvlqwKTHGc-9{Z(9=5}wS^x24k_B$wq(!;n zEsb;=@~j`sX~B>cj4+GRFiTlU@UH2~x3e{Ygqy!_Zfk2Zrb3c_?5G^<x6(eCF{ zN$DCPc?Is37zrc^+=h5Sk{2<3%+^*Nc6)0*B8WjlvwsvG;a2DYlT#kK4cC5nha+8Y z7$~Q2X$}VciIlz&4PwGo-G)$_{YFS`Ef}rdD|AULHwQF#$(cm5QYhI1QsOUJ&b>Tk zmrkIqavyn#Xb<;PFKpwh>5E=Rn)FX6v0k0+>va(=zh7-Pf{^#H8{O4G$kdK_&dn5) z+Jg}eq7H+D(|}iz{mYm4(K&O??!gjfL%n?8Lm&oNcw1I1{L3ors{qCdY5Z6+x`Fwirg?#SHdM^*!L5=5Nc zLWh>o7ifU-*m?Z^A}m6$992P+>pun02`uxH+w}{U%cUqabwY8Vu**LxCNPXZlL8oux!EcRj z{4UlqxUuDSeN^Ygx|OGO3WPRyZrZ5Ft3Z(m7P8_|CnxWbG8!~8 zaO>nlEd-&?>s{k$a!Kgn%3+vDYoH?uzE#LcIzco9ca9@8OKoQJcYJzeGnAGE`YV!m zb?10~9jZrATrxrhhK7Dv|JsQjZ{qwIusAe^I&+m8NVO4$5rJE#3o~XeR(g&$kmGtUaMg8aQgvtc zhzJ8I!W6cTKM`!^nSV;I9d-h*c*@RBqhLD{w8;81g(_H~0`p`~zKpNQKabE(5}Q*@ zradcINLxFhr1XcQuztdQ{Ih@%>c9FRZW%s&gaueL>yP8TkNyCs!vp9fTiyZj--3WD zzs1#T5V6%$(Um*|lJoZboGPy6Gf)QHgEX9`o;VM!3ULXI+Xrxv)a9QCdi4{U_0NN7 zAILT-^Y0(|Ea3xaSQJSuBewEMwF$7Q(LJ0YIzRjzpW zum=r(yK^Gj&0$ZV;jxZ8ic0*FAr6mf0RTdxu}!IztGLPecz8DGo(8158mMN`}iIXm?V~f0e?i(r26)PIix|uj( z6Y8O1F{AYTu+K8m++p>Q7La}T2t{%kZeznS8hXbWFho)gXhS)WE=MaRE30)+2Q8$0 ze9Pf`MA0B#UR_rb%IQP}2aQ4=KE5DmX=BC8fHb86p)7^_i7J&0=l#GJa?BQ*Ak<_I zcX5nzA(ni32kobj{jSo>xn~hMl^6ZEgovXNZoNUb3P$-o?&_JR9kIz6IPD=_Cn$T{ zsE-~!T1{=hN~W|Cw-^J6*FB+|)SY|)O;sJI@qtebw z@f`nkD4$~MqxrdAe=yGcviYrwOUIQoK6&^N!JTxPk)x`65>2z88-;@I_Lt$_^!;pi z$3YzLEBOSznz<1Y{vF~nBCP!-y3zqjlqD4GdypDlL<&Y-F$hN0D}vvtp}V0tcu+CO z__^ci=JeA57y3=bL`F^+!ez6ki1aVwO}~JWF%LT6-|AU_pd{QnEhF>3UBj{(uaKai z2r)uS^7b#K#9V$YE+V4tQq^qzW@RWc^=gV`kd>?pba7uq82H$z7WBx>gS5F|`B&r7 zPt*|qxI48%|VZy{z~+vir>C`aNBJJnR)?Pd|hyHs)^;Sg}*MIW^_vDKXd6re>G&Z%xj2 zl+87(2v+>b(C+Gu-1Q;p>;)e?ou#9m`-K~mBhwR}7jqw{7h2RmVpvf+qp&4k=rK70WtXo^!G96nuc!v1U5#y7$|%U4pK!%gfUL^Yd{dJWn6$ zf4FU{0?*F2^Ire`ceb-9Oj(^71b^#Ln>NYNepQ#`IdFZYD8u6GU`I5Pj-_No1wy0XLvItkY49||hl1QoLTCScq4 z4dYBc8rEk6X_8(lS$R#I==1YL|7vY*Jwn$ZNx{e$5Ad$Uq`w=GjhKs=5zH~n@cy@r z7Mc}n3@-rR-Hls_=I;G-XbG5;!@p01&upVUd-ja1df0TS196Y&L$W)5!+T$Ycw877r z{KiTk+trRK0cZ-=-MD$Pdw#sXpUE1CTB#w9*)Le-m}M0n0Ng?YYB;(#$ymVzFMUZO z^DF)5PaCi=dZ}a&1bot^2!Z8m%Re0Mt+FzpV~*@)!}}QjTV7N|Bo+HdL5T>laPsTm z4Wb6&w=z|oPn_Itq&M4l`^{~{lO4WxJ`roOeJvW1NRKEYgUXa_)Juc-9p^;p-Vigc zaXXXYn)j=2Fz+}_zV&JwUZ3NKUH9hUL3}Em!0>xh(}VzH;Ir0S*9C6i&Fk0Uo$r7z z$V1X?Y~*?jU4!ZNBCKVyO2%+u1M6?lVv;J^yKs*zLdaDfgAnBKAOx3uXnqPvnC`!| zDm68nND#taAr%ojFeH33xR6;Bh$dyjbpP1cP2d(!ECAG=!XQ#W+_jKO#wb*Qs55S_ zM&d(L{(0=B`-Z(F=T2jWZa_gry+c!81kq1@3eMB(0F6-kOG;e8Rn!xmgC`(7Dmu517-YAz1MGfUE|WCnIZ&l=h$ePhaDdnIO4TVTGtnp7>o>g~+iC&6D-` z)`@Nay)m|zOH4zpABb3o*sTV9*nI?nVtugz$YQK4*fa#|e;=92?9Y36G(vT+Yk%?Y z+)hpegmjuft_mV4_ArtC;xk0FhgC8TCR=ikl457I{X^VKrtw>6Lj zL^|vG*)wm_mH_Iq`cxL&!?k_%Z#(Ft;J7g&2T$@t^TqbGniQEQ@f z8c~#x$?$5I9L)YCSt8RODx;w@>@u(bB0Np=K>?PQAi<-cbru#6f2apU3(?c~1_v*F zThi|Pz`1t^C&Ekj4{u;4@!*(*xEv}TKW2aQ+h`vNO32FMmKnvu0<@t|rqHQDt9e@o zH}8YmgLA(sz_s=F_j5UHKl_;~Nny6H2^*PJf=&&$O@XvbRhf#j(Q}9R#Z`!oa)yOU~I+~Gmq9lx(>6%9RnJy0Q zCWIv5LFRc+I+!$U?D8|j?oz12qEA_)(gfu|Y=k}~LZJ+5d_FK8 z=T4I?Wa*GV_;HF7zicqsuIUedmacvD25eO*0?pRz^9#1Q*0NzVsiw|6!xxoZg^~3D zszRH;3w~qIQR6T)O4Z01)P6FC##c=3L`dK|>(e=+va+1+Zf-%{KdvvRt;4dQq+==Q z1hofPDMh&8GrbM$P0GRA2xR&F=Z{Gv@7lG&%dTmBnqDg!0qfHC-$(v&Ln!}dS=q}u zCZ?tVXeF~dhH;WqdDHcxIzd;TgJG~O1(?e<S&6MS1nIUH9LlR*LNItwwfl z5xt{6fsPOJ>PXpZE?+e5OY_cRsKVD{^db1GZdsMx|UTC+O z-nXK0jeP*&G}AyA9w)pR&20)Ev&u)Se9S*hs}VUh)A%OD#2qM~mwp?+w{U+?&k>FH z(Z)ygjO=_DF6c2AHooD^GwJ4UJUF9&u=}3lCFe7&{XzUwGAAxdmCw}wtaD7^*jX0m zs(I#5aObHIBN5We)Y#*vH;1 z=U)@C;kL_xfYQ|4?4zD|Fs&56^kg&@&26m@Xy?(~tnei>rTy{OJ81e29peaeq_xys zAG>!jV=1dg;McTQ^!g7x?RS>Wo@kVOrEO0Y{O~b%E1zqCismMnZ#|7R5jFq+U*oYa 
zEqZ2dVRm^f$5u1Rqot=bs?7h;Dj+`duj$d|;Z(iN(JZH_|DYnJQ#YHuV5jqyPP+>F zaAtVgzJ8+@D#8D8?QMU>_dHHUrlAu3=*|>)(Vf9Xz$U#0FvQZ2bD_z}0(3Ht5L3AC z`g4$`Q7&D&G(tzH%|C{P;xIY-`}!`Sdn4O7Eo176LFk*?N(u^0C|zjIy40G-O=Rc1iGB|m>(rPb^@d3+1(oQ?uKJkiVCrPfGDNZ4SAu1rZt zFuyh?w%FNJLxtz9Vvt33Hklne=ImV9wk8_4n`UKL-j?m#SDQ{jxJ||}-whLQVAuY% z#l%!=ZnayJ1L|D6QRAQAzx77oEfG>tQ3;+$gaBE}DNYxY0JyHf$jI0`0~h%gFczQ6 z){1Vmtz}YBRFt^uOEwl^+S)Yq@rf4?o;QO6dpkQn++~ppWte~p4(6{ZmW1|h4l)PN zQXqXA?TVO_b=D=8kv>I1K|!pg9O5t4i_`kXMmVMR3J3_;8A0P?1~H@hNvJ7KAsnXw z5dBTG2KkxA=vf&sg$ApT4O$rBjMXJKZDI#B807x*=g%ML|6&yscdq2-_C<}W_bMqW zss^1ulPvh62p&&pY2g{vB;naHzfBKFxIPXa8G#SJ> z58VI$8c+^}LJ?(XJQw?ynF7&R%*WR-zmPRG2Y;(;VNK07;IDe$b@hS73GdvQu(2D5 z?sXipw12i}p2i+JHW=Yqujc)FjNVBD^SGxwYC98<0r6G`1eXIIvzY88rr?Jh z9*!Nrs9ue_6zqm5P)2~Ux0K7EFs~urJj%iGr(|Ib^cPmiI4JmFI*1Gd4T~Hxoykrj zY}(rcs|C~5yeSCdb|HSzD$7D)&;aOW+_&!utL6sP7JSJ@3rb$tS>zSGsl4d1eV4=_ z;BB(i39R^U74}c}I`wkb|EbF7F;o4|UyAHp_bXxo;*lj+ zznM1xbUBoF9~>=qL^DJhqUlWo#L~)|(|};HrDbQk4S&ksKqKih(ZCz)2yQTV#{4&q zQccVgp6<1Eb;M<^MaBJe9feshR`4yMXK^3!40OGCv22Z;l_jlXh1%Tixy|zs5W-Ct zcNm}HE`l>ta=jPaJv^?XT{}GDfF2DS;4~jB0I3&(W*9v&q)gZmqIMt!h)%akqO3T2 z$7^Uk@AzQM7eL?hET61JMoryLF-<{r5}@nU+oWx;jjePOZ1zl@$nmtNVVT2P&kY?MvR(wlmXnVAv=V; zPJ^Jm20Wc-;18N7uq0J*UwB_)~i zInZ|iD8=s0+ZzdrdI@Jrlr8 z9mzh32az72V1!_dW_g*1#7>Ulg2vqbcMAimq=WrvWKR%O)F>MFs}tWY<+XLn1=4H9 zEs?Ho8PyoID2#xOnCxQ2g!~T#KOV|R#Dl5s_p%l)oCm(X$xkPcSFj1G4-Nc6LKLTo z-ZAI7Zyrlhf)PsS_V`6a-a79|q$Yh5j)>Fl!v4ftKjCk-1G^Ixu*I2(*&ylQ?0%N0 zH~JAs_$wLT99<53GDjxTg+CFzl5z!`9~!xidQ@yXF~o11xjo`Bj~+Rq$(VTYJE+PsLOsX00%UM(Ji4$fMG!`S!+EH*bUK({q1FUDZPS; zOp6~!vJu?gK*AAG84gC1ggJx;$GE> zB8OfWXg0gb^6WQ{q%FTLYXCuTI{ws%%ymquwE6*SXIeZWz0cU&=o};XEWkr{UrQWH zno<|Tl~A_RGcob_a%4gvSpvP1I^_y5Y3LcWYF2JlM1;CI#>69fJ~`sZ$Iu z1WDydgr-e7v!TJI1r3t?-DSLc0~%2-H|>I6IV8Dx!nVTc2B80_q@V|Y+%J`t?teVc zN5RdbF$6F5^iS2w%aHysW8o*~;GB+)kM{&-gDx^wb0R*ZO&M0ByEvaThKeZ^DGVLJ z+pSNZ#RoVE&BAK`;y8pVU}~3gz5y->w6n4UuZThgp@KOLI5v{MS!ybw^Czw#=X?@o zsHCJdrzqR3dG6dMfZ0Hl(DALp^B8Nuckp0p(c21iXC2tZ>-2iQS;8|*qylxKr|>#{ zpY6oDUmAyr2UAoxA8JeS>WSQcAT367WBF^di?iB1O)=Lk14i5c;O+PJWAU#1Mp=#j zhwI76V7K9x`PApJma|vpGgf}os8`>AKBJ{-dahx5rd0+tFLSN*#lv-yvv&;bx7@Yx zNbv57{5T_gasGP!%gV>oPfS(Z5&vZ4-Sf}CSpvg^={>Cr@x|7uJHxKm7qyP5@->BB&s|RW@2@;f25n7y+vfkN zpS|*JpX%urFl?%j{95@}vj+n+-M^LdI;r|1ytT;|3y;f(J!A2a_%nFd*BiZ-^7=Rq zT(<0WZlB?`vC>#SIt_&s=b+{rf6sgme%BSu&1duV^`kF5Z8m50hxE-XWA}Eq%N>bd zIBUl*g3*kF{r#$ou3{8xN5uKFN8=`u5usBm)hZDJ8;~)b{JdW<9GqG^>X77#pP2W z(*Ta~FdyGYG|5Qi)z5vPhcThs(5)tS51`r7h5gQqv|wBVFw}M;uPn(P&Lcmpl(0+h zL4It)^QSm$3am;uSN%r#wv@{^Qi>Wu0MrEwi$fXO7@TUoE~HCX%zZ&PgUk{ku%7N!=m9XWne%eQ={VS{!O_V_ zU3YPrr{Mm zI!&vfR!tg-nAlrlbjd*1k(Rfgh6r-h4_GiN5eMc)$RDPTDer(oYBT=$iZhAWc>eJ` zio`SmS41VkCD8$jnaz&}Y$>7K7@GkN;JDUOUjd!&5VpDUbw&>uD~gueg?d7b#NDvFApAhcO{;8w%v(2nTv$jFUILtv}nKXT+R!2j}nC$Ng-6%>54i9iu- z^a^H4qF_#og1}G((he;EiFU&&5oU50O*zRwi`Ky?Z=7E)*6a$(E{Y-qmOxO)T? zbTGh96dxbIZGe8yo;y%1RA&uJ|6imtyqw{GkWP;$I+sAr`!X}(S-vYR-k_2W&wFvKHr@ z5Bb!$amz<54`RYA&PJL6JnHVF)w%e&P8mIn71A1)Rls7!s(E%9+2~pWtUNYsM~~8> z?(zu}6cSno%)^(-S4K^ZW{fo)%n-9{_Aqu3)pO?*<1ZuD{qEhnc@yN#n9X6(nQLba z2dZ2gpy!rkNlOR2GOui?@@gJGNu{_H8F@OfOuZLOwHYNl;{@!UE^Z%uMTq z0728^pnx${QYAZ0m!b%41L7j>gozTfNZ9!Hi;G7?mXa%{O;JpStpo||a6mln@1b8| zq_yc|&%A(nrxR7Rui+9Yps_qtnvd}VS}#WZL|4d`17MJ_CUv#wHIof^@xUI+s6>%! 
GIT binary patch
literal 101300
[base85-encoded binary payload omitted -- the encoded lines were merged during extraction and the blob cannot be reconstructed]
z9^o=0kmCcuTN};an8n^k&tC!i(SB$sN%#xyDO9J=HPd4$&a8eoey~rGSSbNe<+}|n z!SxeIVuDdDlQHc-)NwxXBMbg{5};E;HjR~kr;?@I|>TSZirX! zhcbke{Khf3ZQV$`nixRRUI!wH!xt0)AVoubx*#tV3US3HxJbe$M-U9=zBHGloC>7+ z2IJ$?a73#2!@s2k8=P?rIE)S)55TKokJhsjhMp5B^31RIz)KGgyZjw5Bia&2=6kms z;c~w@RPQT%lwjwtuiVeD5F~ovavmT=n4ef)kEFae(nV{lJAtNJVOvbhA<9-be2C#4 zDFB#jC3kCNznmx0=Hdjs?W}|GMJpvB<6%R}yn+JQZTKzQ#1eJ);kQun-NFb^Rmq5m z2!c2E&5-Dt6HtPoCPR*g!54LC5D2gdL-^|}v?!1mo}&Wau#=wxwbtj}^P2!%-Nkf| z#2hTi)c~yC``1_D%{y?SDq6V0D2K)9$+!xn;3hy7v;=V`nMY)pt<56JBhm;cj%j1i z1IBoqC>-8#NQ(lvgQ5;jW)EoNhxF_L1Y+Ml6s(N3gQs$nNE2%Aa5B)-UD&AfflCuJ ztZZPM%j4@%bGeLJ6R?D&v8^4Vpa7~;g51>b<`;h8LEqzJ0Z@E?w)>rC9vaq)zlC`0 z>)x>a;YCipH&VhmV(@``o!Jkgf`rZ^@zERDQTT`@rCc4=#})mI!`sOeTVVc&d&^O2 zBe8o!hrHx}%YbXVdj1U=Q{VnCv(l&5@9U^NLva}0I{l2Bpy+f^+(8^7(v%Ksy&I6L z#M!-wHH7-suOewLQTrgam`=R~R(B>mNBh4A3XlOL#F(GG2oq#+U7dRW8!SwL_Q^J| z9$K#qRy|S?Cb`s3)%^fpm2H|kq(g;YpY)Xe_%ZkHD;*h7xKd|N z17WX-+~V^H9Km}$6Dwcz-s4sj7y%e(vk0=G1L&ZE(^qV_#n?6STwdHpc=@fOBKMW& z6#(`LxX$d-ijY7!`M?r&DM<+6eJsQKV2VOi%BmBhexc#pSoz8^*j3};`^@x5KZ415 z(pRsc>Jrk?(GhTdossb#)@jHpWMI|9z>o*Rr271T*e)>XUCrPi2BOLkg+C7wyWQyWMqtM^q{&)OKY~T6 zK3Nbu%xEFW1YC)5%FxO%1Oo*+CMFh6)GuPXJ1Kr~_fP&?3-APC>412P2o#G$q`cwS zkCv8hlw=bFxLqFRcO;+|z)-!&n(rZ>@!;kyB%sioPt@f1ojV@5$iLBx8eZei2d zj3Ak+=A5O@iH*F^1NIJhtW6T zo4>+ST-H$V>SW2vUc&~AB%1uBijbG;MQ*MFLvE`iGSlcRA_^fT8k*svc8A-WDEz*=aopUcm($aEqwH%68W>m?DWWV>dYUR z6Tse1{_`VgD#&w`E-CZV*h&B2zsBXzAI$!Aa_44|_xr6+F5dt5Z>xpVA7p3p7?cUD z*&&}|=^BR@dy=y>anfsL6`%GVJqyh3`MbgfrpMIJi;h@aB}pNdWCl*i(ow!&H~lDa z<3^GPDOL5hpX)oz>o zFRT4>YVmyRag8E1{o-HdA1IkRvVB+Wa%0m}&~$y7%-im#Vv*r1oxI5Y-Y3@WM>LCf zAO;!3c0)~}rvQjHOc~>y@TL0bnm{20u9$m&4#3^Lhzib40wHt>2}^V;^3ap5MI2^n zOcJPwtAhj?L!pb)&_kX&iQiqD^m;g;}ksRPeCS$_p4k6dMNc87(9wG}u4q1@fD zt0Wbx#r@LFn>pku!Ast7zYf{k3TPJ`8|#nlyQ8rgJyvWXH_;>$$^uYNTmtavI|#q( zXha6O_p3LswYDJadg7PD3NmUTaxsLRu9FL**fP0Dq;CKbD3E#{ zL-Ks@+@{^Jn&Pb{;aJh|Rs!^LxCYTSH{oB_&aa{%9HR>k>=JSAK-`6sg!76eHX0NJ zjmfY9?hY5RJFsF&z<|384%dKu-2(8oi(-9R1)%`R%u8pffXgQGo*X1P-X)#GnWOBqtziQZIyI{5@i-lgMn5_2Ai7Y(2mtzoWMJ zja39c1z`+{RS2xU0a{`aI}DO|0u6wo2-q`1wt9Vfe%gDp2!xNU7=4Mn*GmXA160U_ z0o6O^!n~Q#FRwx16}e$r+S*b;ZlB5qvX$nX`p2&uP6s-b5D!TDAcr=v+6(n9bnD{4 zIr2lx^ISb?6NY$}e!uxI;|#qNJiVC0PD1#BN^WOW*%=6C<2^(R?QJFhb^(ul(DS|u z4=3rLnSZ^zW7dyrTn+gqA-rKbF8_Wr4_&b*UZHSNvOl(}by*>Z5?JhAIIY*u5siaf8Ps-#KOCl{=b$cCR8}&tD5RCrvZt-FHT4zArIa5Q#2kT zc0i&D6Sa08QD7+lpP;;*b7}?{LhR9>nvey;1?eNknsylES@*72FN7g~Y9{t#AY#aN zd3t$8Oy~oU$ek^_--DN-gLX1x4E5$o99FlTML^f0Xk!?_66j|YI-(Vpl^J)tf zWVrzReL~7pxNs=~2MnO}Q&mC9)f2fG4|LZ7RiKy%!efu)^^F<9gOc)BS%ru#P%Q|U zLjblHy4;8}oMK{o?{3^vn$iusAen_UXbLJ{3*t=URHPjYL|^H_^?iqp)i?%_0hwba zEO}l6WrsIR8&_sXeAb5b>$eUKqJ-8Hyt5IdEW(Qr+J&?86)45ZSdwE49Dh6VwTPO9 zSc0O7maG$)5K!ragibuKszLiX$oPa6i3^!eR0cf)KVk^I2@L_?+Ed5oH+GogC?e2r z3ZCB4nD(TM%kPd=*qITOfQKLKJaWB1E_AC^`r3A6(A1Z|^*NZkV~aU2ZH!^(4zJ ziaQ?OHRhD@ddX__4qM{&c-6cQI;@hS`yJ=WVOVzGgYwi`K58zddu;D0<>K!cY*mw$ zZC85z!`0or^A*-RMZHnm_obO)?>45F2rl>4X{Po zH=<3#sH`z5=x;UwqPOZtkI@M^K&9DyPp}FaPY+R&%495zYb{HPb-E{-gvOV|IIKNQ z(FF|h6|^#~LeTnCLob^mtAt0US0q`J*Qm_@{4bg|0C>JoytWD{#fW4#Cuqu5A>H{N zDCgeMv7*1dd}K0cW(r}n`e9Y;0O)otEEAm+3FxqIwD~H27xOmnP1T{;t`^#oriWe0 zmm_#kN*SQIKjL>pVJ%n7@GZkD1a~Sz|t?Sr6lnd)>BUJ zdZIz-9sHIveBrLqvfz3Kuq_E+5 z8x8BP%TcyF{Z1oYB3kdp$2$adcXR%9r`*vn3B+htWBb&{ho4XRb?YkYSU%pGb@OeC zCH{hpcuwOo^uo4Dg<0Z_yk4K59HlaLDk%?`kD|Mw!`x1Od%(f81;2a^tA_kC5S+D0 zNNB!7x2xt>{`Ir&^wQLOL@qQR{lUS=5iCc3Wnd{+|NKSaxZEehVU6>;<|C}$$M zUW>XGyh+-}B5ur9>rvx;)CJ%D)@Qr_gatRYBvGa^@(u5aw53z~T0W;tCEnR{W>LxK zcxr%UYOV^6?b|GiBZDmQvCk^hbaqT1W9=FUHC1C^xKE1!MFd{M@XV}>%gRO+XiuKUYWrjolHjPY$Gcm#RpH^z1#1`U|0>Jv5(oQ+bD 
zRL{Dh3crJU5%u#Zj@8j zac1d$Dy8sAYph$t;2%*QxOZpZF@d>dP(VkW~u)h#DT0tQt5&P9+&OQS$z`a1&TjO0R8lb#utVt;4JiRGSo zQ=5l>Y$md0pk zfhQ?Go)YEO=jiV4ZZBYM45_B5xmJ9nZp1kR5#srZd>o$G-$R8tLE_H-`ms-^4p(0T ze!)=;NBGWZC)ey~bbMWyp%K223|PVeGOKFm(63}W1MXP4cm(TCQC3zpD-oE#0o*P& z?q!hD$jjf06LMV`$^DKLmp(USn{Vpt?d8@94kL4(ks6`dx5R{MpGb`wtmPFHZlD^g zG$K||5=^O62V@3$7$L&uFxEPNojbL_4c{tYnurBbWop5b!9#L*Rt9%QgCiER?(vDU z0DZL=Qw9Z@`2gN|;|Ab^kpN?mA^1Z)Wk9e}&4i)9{uAK-*URzH+`>XP+Jn@qEm_JZ zLi*QO%fIW*hSco$MiT98yLZTYYfp zuv$lRbMo`kVVwKb8;cNoJW1(-qz8o2bf6ys51K`^js3mF~Yd|Q_Wr^{`OmPjNqnfnJB1tuhMlzS2Y%mVII zp27uaaih|g*K#B`kw4l79-nfxL-yXaT z+2g~lLHO=}r>?woCOg<4JH&geD+3(99gxrAcSN?+>P?$o9I7=27W7_SMSQv=v({QgjGC4FaLvqi!1?BeXW{^Xugma@6Qr^{eIX$s{QYndaM!Zm8^TODn-(0SmWTTZb~KvA8SBs$ zC?teAZ5a;E3zB z-Ix+=(}FBei^Bxrtz0};XU_?@X}I}3G+R7a%R)R*T3Lt6>?Fx{H`}@GreEOUxK{IImAyDF z?^p%*!Gj-;wYvn1_M=9?*S&It2#o$^^KCAuo|z^J0=wBl|NN*;$P+8S^1jSh+@bD z(o?5SX_$!@l$1!r#n^fMqK-~voL%zuZ(9%+q$%fpi$_reaRd7sz+f1JWx|irSFvzj`@;B2oINO1dlS~upcFV5;Z-2!)pI5OQt(Q_n5{^Rq1~OkcLkeRsaL~%>eEN2Xpk6{=w1w+ z#bV!4Wrk&?x$J%C&VfcrYiAS?II08liy&bchXbp@x;j?;ep_2xr7My2LM^sEH5uY6 zVrq!)MMs6PUGny!FHmNJUhi;95nKAx-N;)|O#}|_-o6cAfIN++zcON9Bxav*Zlk;< zd=RY5#&uqCBi+JBX*@m9McixLM8jm0m zIV(5UWYiMLP8c8lB8-F%OifK)z8-~B0X2ZujD$u3+=EZ>zc%g|j?@K1;5i;n3_n`@ z`!{@{1wz)ln;6Y*Z&#?pUO#G+81%K9Zp&I3$HWoqsuz%_8bH1A`to_4qz_^5#C*5c zeU0qc^hi`KHqi3O0<1R-Q4C~PNU8%H`tp#rApwgxf|E-}D5X4*=CKwCxeVH>A|{E2 zUxJC6DQHV5)89_q8xVPSVKdI*D_^!yuEH3A+1VRK&);2j(G!E=Ha*J_!2nSGe~3!&IC0QiIr16dL)Su?ChonI_a$jOeF6og)d| z9Vjo-ZiwqWUX+WA+6|%6a??5rfL@+FfBt;s z_K^TXY{Db zhu;RZd%fQO?#RqFT+mFl1VUwPw>CKeOH9IHHZyTy4AtgcRAUD()Zslk0DLNTi(=k3 zY%_e3$avi__I6U)uX_(gG5m>`CG_Ij2MT1>F-R=`wW09X-Liv@jslQuPWbeDD&}Hh z|70p6Xe!QugDuA?wDPUlcW$_IMq=sUvTJXsCgHrsBYP%rYJY&(%S^0r6~zV+#VF%0 zrK$;a!rDpT$;5F+j2}mdnjJn3D~cd;`LP7XMW5lZ@W9hTaiS`q9d6KK6a%Mhrkuo) zN>iG$F~F?-S#<=E*SmmWcehYc@FRH5=fQ)laP~)Nz9|vEVRr2?3pJ@Y}j2;n4r zxIEOawJE6kt*5`jQ@U-KxnvZ(3W_w9zVUikcv|6Xj1wC9SwK{(;bm9Q=`kVgM#sPQ zDxecifdDz+Z;1a1jI*eKR~fFR!?U^I^B@RSK4K05M5SgFRxzdyusema2=k!-AiA&x z7qncEF)^gd@q$_}2HYSpYmfCfj<2tgx5YRH)iAMdh=qY?zm5B`qbd^UZPT17T8B7c z+{9v_Gy;-tD0|0Y{f*Pc^Ux;}z=sHxBP0xmSnzGoO5BBL{oCla)1WXVnnKqQ8XhFW zy~g;{M+RTjCP*lW;CsG{sllz&B=zY~KM)%-anQOSqB15F1SXj&{jtveqQu=w<_60$ zZ#hchqS5V)F7N^ThAh!U)SbPW4QCr}piFd^9B(kj+#^1@q}qFoK_weqjw=MM^Kj>aJwGS&w0 z-d_!IV=1oe$Dw{Gq>W>(#NNVDL!Yi+koegZv^gNnqkxKF@CbkWI9*oBEaAtPE!_xV zf9ra=USfV$(vNoVXNZ<8j=^3`jdV-nT$F|PMFA@nFD)+Q&+eN5ed39gWIbjNZ2*Su zzFEz%iVW9+xjlatAuntIFC%_x*i~k7@$lS62WK-@R)Rk#w{Etgy!;m2R#881f)~ap zcqaQs{Clwhztvb-7rx;qO~d)K!Pb%uLdix%Itk7q;l!!tu;Ab|=z+T0#vt0l4CV8g zq-N(ct<)4x|A}BBLJ!9TKYU1mPj8&-iqoW3_{xSN=H%_1V z`8sr*M<{8?tx@8_m()V>3#gBtTiR13U%cuP{KCwJEq+{vI`IDP0M7FP0R)Y~tJ1|? 
zF>^_{t+gYDnugWvN$F&8H17EDGdU_&JD*MA@dt!r2WVKQ19pE^u`aB8dRoU^Md^QD zHfv_r6~IAi-Q(W)IS*d4K3CMEp6^JZ-1#Z;qut+2#=ZR?YtB>(sQ!$$?&i;&lR`_#O{B><-7wv`R^s%hQ zWgSV@myFZL0)w;5f`iKFleMjty;FV`8P*@KRS!bRH$Ae6MMxJM3|fmDW3zDQMJ$pq z$8gMmv#*Biej_BE0tF)opBrOmdH{Qih99-qJ?n;J>kd{3fl|Q4UxL80 zbwM1HmyynDdb0&yl`k>fh)Kpu=xqzQ4Sm(yxxT2@1vLHdaqw1Nk3wL~KWGj(0~{u$ zKy#WdIB$eNB5&on$;PXHfQ$y%4ydyg)Fv>;*Uxb~G5irM);Jerw*0s7bf*i$Bm|C8 zG)&Y4=mJSy&TNT)Kr+2RuiyS*fwIFJ0hl+i9|+N+iOZWcI*cqaIim|hHhZG(2&|&e zCIzPUPI+bV!fWCS2fAG91$!7o>IW zT>RCAO?!`f02Hq*y9K;;wGu~goKQJCBB7`@>@03=$NBiazW$u~Ba2Kzkz&89Tsh6i z(1=r+M6_Y&HjkG+WdqLg7LY+>rZwP6h}qZI_)}S#O~VxUpnH8pijQ4C!W{hHcH_&J zWee1I5;%9~)~zM2Ily}{)$~@JP;#IL*H>Ly>vtBU;z0G4YDYq3$;690_v!~Bz2EIyyR7B7-9%*TW5@C=BiI9dw4ZdxOg< z5w^FA;`w1J3NktUi|vNaFqoH+;&5rV3Jk3nS^mO730odX3_QXcC1k!1_ldsh@Xigy zCP?wzQE$Dc>Ij1z`!)uKm4rTbxP+y!1M?WOBit5P%TN+BT;Uf#_tq7DwskU>MK*y0 z+_jgHks)1<7Y_ibv=Wz~kh_ESQMcf7R2Vo}VmMqJ85`RSiA_#8bZ^`g2BMO7Bh3uI zetneu3zsb8=g*%Foc^NY-T>xQ#s*+R2R6sb{_!(0f~c)Gp%hjW25}3i&u6C{vNsX+ zQ!{9*I6}L5bGrMOHH8NjFHBdULZfy1YvR59c+fWzEM*00{-;iq-p0N(M=yQuIZ!M| z5`}Yr81M5WF3yv?3|8~5vaFuV{5y-U9ucq@9vGm8zkIzWrRs$Xn>d1fO?Barzi(L; z{N}cgs)k1C+r?*02gQt@q?IPhU$@9f?rHYgUJ-*X>l@mtX1;U9Um^b>~AHy~Y-s#1KADpH-!NQ}tOj zJIa5A&Ci^gfNd>}F9;y0Pam>jx=@!z*d(ev_Q#HKFPnA?XrIj(9#H;wZ2|V8OLB6) zaY9#2OpHKW+DMYj=87+0wx#;MB8fzkS_nrig_fZ<8Uo_I1j^BXA9HAWUgUb}5g-&m zIKsQ?afD3$g@*lWG(nzfje=1|VRZ;J z5z-Az5<@CC`ji5@;Zi+9`S`u{|1=Cb)gQ6 zF5e7rt9n-Ky?g74V9Z7d-wRP-M25!1_(9TO@?|UKsv-!wfJ|v-xthhlkvjt$-J$7~ zuUt_CM|ADiuK@+ISX>dkJ$-#$2h-n{#0a9es4Oo(8*c$I{hvy~uU1W2&>no^IEmFj zY(=Ayg?}d$A|14TAoDqIEElN1p_*EdwlZMQ6@JPb#{GLLKISh0T}%GySTmXfF7<75 z@DoJ$kcl4lM7z#l-VVSqe;7&RFK#l#ql6`C@%CZg&pVRQwlNTTD!jcp5~&R+)jlsR z@qGTZ*1XzlU1v2k?qEylt;TBv44m(-@55F=oTEPrQA=@Y)WM^XKRiA?eg4-MXx_E} z4ga98j=J)teyxD8@Gc0et7Ec3vpXFdSN$39)i!9LJ5(2sQde3{9Ba{7_@e7{X#8F& zI?R3ZyO(8wqop$a^*MczcrRp?unPvZs$iQq8_!X4dPwWug9k?{9DyXVU!n1QGC;pW zSnsXFS?iYVyn0u|7#JAX=Q859AvjDghQ3%e)-wr1~Q3dh0x|yy~=Nvaof@a#3Ysb}IdUVG87)1PLDD1Lx~cWw+s%$&XgmhC$U8jwK41RU5w!&v8eMzb>;J?nR&?K; zFBmm_=XxHv%-Qx)3hDa9qnu1mf42%r+;H6KXW8?JzK|Q14%wJECgj{&L(mUk`64ql=P;#T!MCgHISHY{nnWa&d_tR|+AeiO(*P z@rOuD{`1scMAqWDt4q_J-DSC93wus1jJ%Y$3-+JCNktcTbV{B-dE@`z`|ojL55Kw# zA5dyhKbKr(H1(FjmG`ET4v8xT0=8GYGey_zm!;ENGg>7PpCQ1Tw=jER)(t^&M;2N@ zVZdROXM<`24PY?nCUyw9AHpgqfkU@0G_YeQy^1lHvu@D3bc~Hz-qZzMwBK*;MHz=MCa-!rBP#F- zybs2?bASGLz{qH?LEh}mo5nYQGm^|^)yX>|lvwm7q_jY`|8x)!_tnOfUH1OShKH=$ zdQ7tk2SahS|MFl=$bxTZh*2;u3*upJ)hhQg7sR2?x}p22fu19?W%Zgh>@k7~Op@1h zM%?TF7M&|;9g z$r0EN!TR(qR8?1RI`d#BYh<={ul&^X^p$Fo(T*0c@y7iGa;V?Ab4MVzs;rEv-JyG9 zZ3;9pabN>%G_ZSv0^w>d1eIdVSb(K|FftR&4*|t(lyS9y-2xh{*VEF1Tk)?HhU$rI zYd(#DiP(sVsTx1}0DwZXA|J`M=u5+d%kbqdj|aHdkNPK{Bqo^MYhd&{`!*50P+H`j z4ZGs5iUa8RjyU-~CyY#0?z zAL=(UN%Hb@*%;i})rcU=uKtl@u?Noe&#GHcIlP=%|0o(2;o5&AXD9`9ToTA2v)@oL z@b^Nhw)Fx+Cq{J~@uN+cYgRp_F@chp=&9~CWuf~=Hl~hW3ymiQP7yC`p$9LZfWL5o ziI60UEzpRRpy54~b`eMiq?2LX=9t7rhSMfmG>}o1uaHHXXz?sAj(GOBH$hgg{Yynf z#A{TH8?Xu#TjZ}C*-a=?#WH#+Is=ef+nb1tiS$^B(SV}1fD_?CjTk@#+q=0Ml{i`*!d~b{Y#`weVCHw8XvO~zDM<1ZP z3D0$}HwhtmwV3a?U%eW?MS+K6oBks>-${st(_X-#QhIuO>z1|2oN$lXHs=7-7DQVY zo2S1szOksl_l^=ph@v*;cjtV1xord*I4JACgrNOp4hM}}myMFz|fTdZWB;}{BUm8+QB~m8@`N~IZ;M%{26&mfYD5r3V-0n9&&3x1ZgOl z_8o$dgU^4(`3-OBwaZfZAKWTt=IiGdn!xnNiD8d!zEcCh(Y&+^42+DaT0Eia(2_Jr zqKw*JH3tm$6~rZ#!oVgz;9{|i;gBYxC*iVBpVq=NUUabe43u8qfwK2SVw4Wg1@^0u zAq@2Ne2j)Zj}H@+Q^E)inf9`ggwsd@oc@QWCb>F#>hN6r1|J9t8uZZ({d_Jqw$*51 zba|jmjTttRN6N$Y^sOJA&u3!`G9;YTVm!I^9dDX1g4vQuPtgS51(?ye8s$z2u(t!v zIY^En2KET^iJ6&Fpo;g5q5Yyq&_Q^n)66el9R3Z~SlRx#Dx+#slud4bI)Tgi?Gn5r 
zOhsW;?6E3Nn2b`-!Ktyr`(tzAj>P?`I$sA6=#)*H`?4zuK($c@o4yc~RFN2O60f4UB0ow4M2x zP>}oLV>>@Y%ZVH~Ptmf51_pEz$+FA|iHU~LN7^e%NJx;8Y?&#j(Vn4gx7Pr(P-zW? z!Y8}csd_kaZa`rHi4*pqJdT)91%SkckCY)bgJ;uyG!ppVm79Y-adB)LU?Qv@|NXld z%td%Z77Qb4G-rCZA}LX1Zht_|m5S|YoHs)iFrWH``1@f5J=+t#8gloP0&}hy8(Y8o zt(x!P?A#4KZPn`wPJ`^oM(M7;hj9F7wSQRp|ISSkC7fJY;)+HL%_SFDl>X|W+GlsQ zNkD%-`E)p;t$m?Cr+JOJL{IhOdH5XGKiVMF5r@RPKBHPDA|AQ_T&;9#Cu4 zPmZH~+WRqIruJ%2TY4cHp`asu7*qrL;q$MY7Xc@?qc0(okRgRZ14km>XWdXR5xs6^ z3#5uy@b~A%FWd*XLOw`Dgy~IPbOrH;FHXM=i;Ak4I$QdJ#AovHef@QmyTAM1y?bW; zl?YV$TwdM@8dx$yk)yAIS286&-h1wKUc3m%%!h?_Qb!0m_~IyER*eIjPGadePe1K5 zGxNvILxE9^Ehe<2pp&T>h!Im`0IHV3sN?pwxnxeRV(}s~+MI_X@uh1Ld7`goDVi^) zXV1`0@bd&%>K>Og&0ii~gLk-kVhfLY_aBXJ-$aSz|JU7>zeByZf34JM;i&Xe$U69O$~(_vA4tvr7b$Uv%CNOU)|nx{~Z|x&ae* zs;1rUlW8MbMzJ);Y;Kc?wVF$DhDBaoyZO-U)E*#q1&iQKI3D%sd<5VfGkpwhw|s@pEvC4m6}jwnNq8k7D!15rMtvsd3tNvo?{<#9lxo_AII z1PH$uYO3S4s1q3n)%^URuV&YnKB=tW z*FnkaCznI5PH!d()MAgZG_xk?2#tkYEwp`VnQAy{n`LCHxA^} zLy_IDVgg~|3Shg8cI(Y)8}#g(GO!-)8L#qU?|D!nVW5oDR;@wIANY-`T&I=6->p%|wv*SiM{I<%-RLaAC zsnb&EYCn{LQ+qgQaJb|RFIa63tLV|7g zz42V+Jr~g#_YSAw_j}>BxO1LjZbSBQqhd9G+Pjh0;;xIRxa6iyacGc@gduZ4dlZjj zllbp%h$=SUL8MP4$x2ucp&ngh3v4zn>v3BHMv|iIp%y;J=_JiLfTJ6jzDIYYiu=&{ zkxDK;p?buA7L;C_{J@(5W-=@!HZ~Y@_vuVq*t?LJ9Y@mujEra-$zT+V&m#u?RY9=Lp)lcnbDT*${8zv@JSTQ)A;Knue_zir8p@ z5&uHVhbQ;u3Tg5S$jQn&;^bx7lp#a~SXU;k6+D_Jcup3DibNR5)o4u^Lmq;aVsUe9 zm@Df29X&nOpNw0gw*%zfJZ^4Po*w)H z2*QZEGz~<>+|uOmVII_6U1c&Uf+#J_jE|p>n;^m&`{Ko*Qnqd7NXyB2fB$|Z$sr~> z8ds#bV+wD;tMasU)t8A09Ok_Bt|~DrP-^cwBMb_8mx1JmWM${a=reN|Q0c>-DQ79L z2UqA$Xy{glfx*G$c*XGp%N8$aE4>y=u)cZUxm<KRJ2% z)9UW({Vk`J)`>|;`O08{=?cP~TP7UEb-EQEj^}_tVe;EIGs{?THn82x3>Y#cLG_bf zSeaa#!9fQnwOZxvii&j|qmR+!OUuZds&aC7=Y=7)-nB{PoX4e0W>+peqn}SprzeFG zt~ph??@dA{5bblAASdcYuAGLk8A|U3ti2kz5Hp%|IqsMnJIEv zcO(+X8sUFZMX=(C_O>v$gR)vks2sGvnT^^F(d#I{7z_7}M3NxwVY11YJ?#h;-u zjhHG*+A10jsvVKbl16Iy7QwcWbWzW_5G??~QOB|N?>g{7nm%SbY&qLaUI}y}WJ%2$ z`$NdXR@a!zzzLS9mVwj*lWqj38CJnSjqQic&6fg#4Q?y_@;d-q8UhR2^iq=^KI8|! zA0bi-L*x$E^GDtaRRZe#DC@8IFm%h`>14ge^RfH8?w7V%0g6NR;UZKFP9##o?ltL9S9wO#(3s^=-WEs z=e}#-kF6s_h`N3v#TC=a-GLSt$Xuxd0PQV>1c$M07oGrV^N?tg#Rrg|#YHU0ti{*) zyl7c=L51ASY=fOe-fEf8gib3~IVQY*D-nec_!Qc|A9~fhAt6Qz$vHWzAhj^+_OyGQ z=x=RuG#!JYFS@aF7c9^lD|mjex{1c*)^z`|kM*?IVl{Zaf@x+=&GL!C)?`yXG8$x( zJfHxz57L`AzaYd|GaHms#fncj6G&%16p4gP!@+G&oC{nB!V-1hol#j0|3l3q*v((X zbOJlZq`zG5zwzz6?T}41l{$(bfmj2F`=kE1;5!-(O&75u@aK*Z#dSC5_9^#33%(9X z-zHR!@!`W|ln*JEh{yc{SrCOGb!A`>z)R)RV|1TNVNuC2~vg4SHaz=Y0FiK%}jd+6?U?VhnbOaxb)3m?=-~VHHH*a3?PV&|B z932*N3uT2Jx=FFsZk!3_v^&2wn4FE>YpiNNVRTiKdpyRpHRG+7{p{?akb#TZiAHY| z%h%|e`W(FfVnmPEY9Ng?qaDe)GfmqV6ZsrYa-PTYx*zJexN*TXV!3hTan|H<=}x;- zg~##->pY;^ziO9$VA=mPdH?N42iD*bb85l!aix9G^U+@KJt%yn6XHC5CG(`*8I#MG z*;X6AZJ>y3$z`pXiV`WEW0BV#)~aUjf>2N^;SAc(M;vAd3<}jb@7m{@Z~@KW6Rk;w z+P)kEQy<9tem}DM>VD#JXkGDtU$!C6GmnyYoHOiS2pNwa(mveqL7kjJqO6KQlY+gUYBhfR84lNs;{i1RK|$T`$nl0@J9Q6bwvjL! zR}nAz0R_b5pLIM?aUf^?mkCc4iaI29E|Q%;JTc)5PH%_{q*rX$E>o`n9w@Sw0yJrx zMwMN&ZpV3=TPw5;1@!2cm^j>lXY-K~v@}x5_E`+k_+fd(#o>H!?*(MBlPVKfCe*5| z3xR%qJn%;hWuWR^_x3LLE>%NFH2i&SbBrbWBRVi$C|j5%!RY9|_BZNy+0CS1o1JN! 
z#tE*8nzK4(zZLT0=7X;u;vx>iW_)1-S|oym#Ry_sD}l6&7|=SLwYk>_(&}l6!SP>~ zEb&70fYaw@#0;Z4UK-9PzkTR!lqwUEn`D(}$%bEW4nvHDp%5(=tl*m)QNCSH9a_`@49YKv8LXfbD=wVi zTnK2ANG4u>*};~jJmHV3qCtA-~qP9LaIO>!ia36>n2 z#J1Rns8${*-QfWL#L$?s;IxBj8m^LiaQ#=8pY{gEew+kz0ULu@54Jj^yA(3A_l)uP zZ#sE+mO5)|?L7)@0H+C`Gl17PNd2K#@WS*KZQEK`SEusrB^i<4!zy@=lLr9?zo1|O z9)!7TZegzjR+}DPi{0G4h0tP?eXvGB_*#SBYU-H6{IG}YtRp1VO0@g&127OS2c@8^ zjQ{e%xo2is3Q7?RvQAe;0w~Ef@sh**wPFqt9c{4*eu@z(BJbD&ba3!YoO}0VbC4d1 zz(7y5-C+RzG$w^3kHs%3oOrVS%jjrC9suOH3in?PuwNK?A__+OAO%1u)^|)n0Tm&` z%kv&mi21N$cMmcK1GhXOQF2KRv6s64)%&ibsJ9T*s%icOVWXhZQVp2L?hsHE`|t2| zvgYjg;S5f7q<%5vCVIC7P3`#UWH^8H+kY`-H^2=)sPlN~nd;y8tv6v$8j`Dbp&iVpZ?yPdv4A?DmDLq$s8 z?&(k2x)hL4Wl*3fWX3|I0Y)?H1qjjiCuXOF@X7e58_VETFF<>&O95?e_ImZe`QwI$ z0wjbj%&7qJW4mBwYM7m_y*4BGC#G3TDZ^WHux`3UU3?9-GTj8xJ3vn5L6rqGisz#N zibFy>U4wV=;tpJ^v@WDpo&-i1F17{ zwZJ^w?IC^~Yfm2&gJ{ho^BmgK7<|#tXIf>AXIXVW?$7FjjcHr;H>SpnSYOM>4+Gu5)`%NK- zK*Gd(z{T;mr%diY|LpOy`*;jv2pWR6>CiiO0Fpv#h2gMG8Pq+-7uJP_fw7Z{YaeHm zAy$1xlIrBN6{UZOn-(<4J%XZe4d`< zDY2?8iESCtta9QJ*fhvIRF&02Q`%=}c>9fmN{s&NtklPkO|xu_pa;RbVR!m6tV^PY z%W>sb2?<5L#g+%jzQb!g7rl|kf>i0cb46*rEl(Ooe3~+b%TvU_L}m)@O;Ky2PCjZ@$G@ ze>eC0!h@TgpE$z+nxi|=X)!QpG#^e+Vje}syNFEqu7Hg@PDAxh7-ztOy1iu?@>73p z6Z1K!2~nVAYoaFqLK^JHtVl}WyJu-q5W-_N(}eJ>4E_dA7r%e+_H5U{si>F!FILuX zQ?arO%m`7|#%n-r!P}U$#CO@pD_vAOcYH*mUGLds!N~`vaVMV&^&hc^hBKjPZAH7!ksK6WD#E4X>O(>VZ8uAs8dmF1}s`+ zLEgk#k}ZLZz{oQYw)(0vVPwD%xZ3^%Hk7&Rf&3+r#BCF?&Z^V{445n*Prt-cCIt4@IOFXoVvCikJyK0f&CTt|eHoyy!Y`mFDTuKK!sHhq zZgf<%8>~wzfa!o?HN!vdFR==SU%5|gV;Y1#M965jX5l=@H4W0)GE{IMV5{yGID9Ud zfQ*$5hdG}7F#p1&3`is5^^*k|+7Fs-PQ5cIS>XX9q$6Fk@P0Vnb Date: Wed, 1 Apr 2026 14:53:17 +0800 Subject: [PATCH 008/204] [CI] [skip ci]Nightly Report Optim (#2406) Signed-off-by: Alicia <115451386+congw729@users.noreply.github.com> --- tools/nightly/generate_nightly_perf_excel.py | 33 ++++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/tools/nightly/generate_nightly_perf_excel.py b/tools/nightly/generate_nightly_perf_excel.py index 9b9d128ca1..817f37f664 100644 --- a/tools/nightly/generate_nightly_perf_excel.py +++ b/tools/nightly/generate_nightly_perf_excel.py @@ -78,16 +78,22 @@ "output_throughput", "total_token_throughput", "mean_ttft_ms", + "median_ttft_ms", "p99_ttft_ms", "mean_tpot_ms", + "median_tpot_ms", "p99_tpot_ms", "mean_itl_ms", + "median_itl_ms", "p99_itl_ms", "mean_e2el_ms", + "median_e2el_ms", "p99_e2el_ms", "mean_audio_rtf", + "median_audio_rtf", "p99_audio_rtf", "mean_audio_duration_s", + "median_audio_duration_s", "p99_audio_duration_s", ) # Columns that get float coercion and number format in Excel. Excludes request_rate ("inf" str) @@ -143,16 +149,22 @@ def _load_summary_columns(script_dir: str) -> list[str]: "output_throughput", "total_token_throughput", "mean_ttft_ms", + "median_ttft_ms", "p99_ttft_ms", "mean_tpot_ms", + "median_tpot_ms", "p99_tpot_ms", "mean_itl_ms", + "median_itl_ms", "p99_itl_ms", "mean_e2el_ms", + "median_e2el_ms", "p99_e2el_ms", "mean_audio_rtf", + "median_audio_rtf", "p99_audio_rtf", "mean_audio_duration_s", + "median_audio_duration_s", "p99_audio_duration_s", "commit_sha", "build_id", @@ -447,14 +459,29 @@ def _apply_build_metadata_to_latest_only( build_id: str | None, build_url: str | None, ) -> None: - """Set commit_sha, build_id, build_url only on rows with the latest date. - Other rows get None so that build info is not duplicated for older benchmark data. + """Set commit_sha, build_id, build_url on rows from the latest calendar day. 
+ + Dates are expected like YYYYMMDD-HHMMSS (filename / benchmark convention). All rows + whose date starts with the same YYYYMMDD as the lexicographic max date receive + build metadata; older calendar days get None. + When max date is shorter than 8 chars, falls back to exact match. """ if not records: return max_date = max((r.get("date") or "") for r in records) + use_day_prefix = len(max_date) >= 8 + day_prefix = max_date[:8] if use_day_prefix else "" + for r in records: - if (r.get("date") or "") == max_date: + d = r.get("date") or "" + if use_day_prefix and d.startswith(day_prefix): + in_latest_day = True + elif not use_day_prefix and d == max_date: + in_latest_day = True + else: + in_latest_day = False + + if in_latest_day: r["commit_sha"] = commit_sha r["build_id"] = build_id r["build_url"] = build_url From c3376a466b67db11d5ac4abd9bc19f3f53eef145 Mon Sep 17 00:00:00 2001 From: Ding Zuhao Date: Wed, 1 Apr 2026 16:33:02 +0800 Subject: [PATCH 009/204] [Feature][HunyuanImage3.0] Add cfgP to HunyuanImage3.0 (#1751) Signed-off-by: Ding Zuhao --- .../hunyuan_image_3_transformer.py | 103 ++++++++++++++++-- .../pipeline_hunyuan_image_3.py | 5 +- 2 files changed, 96 insertions(+), 12 deletions(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py index d189137234..3d670809ba 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py +++ b/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py @@ -61,6 +61,9 @@ ) from vllm_omni.diffusion.attention.layer import Attention from vllm_omni.diffusion.distributed.parallel_state import ( + get_cfg_group, + get_classifier_free_guidance_rank, + get_classifier_free_guidance_world_size, get_pp_group, get_sequence_parallel_rank, get_sequence_parallel_world_size, @@ -2535,6 +2538,61 @@ def num_timesteps(self): def set_scheduler(self, new_scheduler): self.register_modules(scheduler=new_scheduler) + @staticmethod + def _split_model_kwargs_for_cfg_parallel(model_kwargs: dict[str, Any], batch_size: int, cfg_rank: int) -> None: + """Split batch-doubled model_kwargs in-place for CFG parallel. + + The tokenizer produces inputs with cfg_factor=2, so all batch-dim + tensors have shape [batch_size*2, ...]. This method slices them + so that rank 0 gets the conditioned half and rank 1 gets the + unconditioned half. 
+ """ + s = slice(cfg_rank * batch_size, (cfg_rank + 1) * batch_size) + + # Tensor fields with leading batch dimension + tensor_keys = [ + "position_ids", + "image_mask", + "gen_timestep_scatter_index", + "cond_vae_image_mask", + "cond_vit_image_mask", + "cond_timestep_scatter_index", + ] + for key in tensor_keys: + if key in model_kwargs and model_kwargs[key] is not None: + model_kwargs[key] = model_kwargs[key][s] + + # custom_pos_emb: tuple of (cos, sin) + if "custom_pos_emb" in model_kwargs and model_kwargs["custom_pos_emb"] is not None: + cos, sin = model_kwargs["custom_pos_emb"] + model_kwargs["custom_pos_emb"] = (cos[s], sin[s]) + + # cond_vae_images: tensor or list + if model_kwargs.get("cond_vae_images") is not None: + v = model_kwargs["cond_vae_images"] + if isinstance(v, torch.Tensor): + model_kwargs["cond_vae_images"] = v[s] + elif isinstance(v, list): + model_kwargs["cond_vae_images"] = v[s.start : s.stop] + + # cond_timestep: tensor or list + if model_kwargs.get("cond_timestep") is not None: + v = model_kwargs["cond_timestep"] + if isinstance(v, torch.Tensor): + model_kwargs["cond_timestep"] = v[s] + elif isinstance(v, list): + model_kwargs["cond_timestep"] = v[s.start : s.stop] + + # cond_vit_images: list of tensors + if model_kwargs.get("cond_vit_images") is not None: + model_kwargs["cond_vit_images"] = model_kwargs["cond_vit_images"][s.start : s.stop] + + # vit_kwargs: dict of lists + if model_kwargs.get("vit_kwargs") is not None: + model_kwargs["vit_kwargs"] = { + k: v[s.start : s.stop] if isinstance(v, list) else v[s] for k, v in model_kwargs["vit_kwargs"].items() + } + @torch.no_grad() def __call__( self, @@ -2621,7 +2679,8 @@ def __call__( self._guidance_scale = guidance_scale self._guidance_rescale = guidance_rescale - cfg_factor = 1 + self.do_classifier_free_guidance + # Detect CFG parallel configuration (only 2-branch layout is supported) + cfg_parallel_ready = self.do_classifier_free_guidance and get_classifier_free_guidance_world_size() == 2 # Define call parameters device = self._execution_device @@ -2649,13 +2708,33 @@ def __call__( # Prepare extra step kwargs. _scheduler_step_extra_kwargs = self.prepare_extra_func_kwargs(self.scheduler.step, {"generator": generator}) - # Prepare model kwargs + # Prepare model kwargs — attention mask is built from the full + # (cfg_factor=2) batch before any splitting so that each rank's + # slice is correct. input_ids = model_kwargs.pop("input_ids") attention_mask = self.model._prepare_attention_mask_for_generation( # noqa input_ids, self.model.generation_config, model_kwargs=model_kwargs, ) + + # Split inputs for CFG parallel: each rank processes only its branch. 
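+        # The two branches are recombined inside the denoise loop: each rank
+        # all_gathers both noise predictions and applies guidance locally.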
+ if cfg_parallel_ready: + cfg_group = get_cfg_group() + cfg_rank = get_classifier_free_guidance_rank() + + # Ensure all ranks start with the same latents + latents = latents.contiguous() + cfg_group.broadcast(latents, src=0) + + # Split batch-doubled tensors: rank 0 → conditioned, rank 1 → unconditioned + s = slice(cfg_rank * batch_size, (cfg_rank + 1) * batch_size) + input_ids = input_ids[s] + attention_mask = attention_mask[s] + self._split_model_kwargs_for_cfg_parallel(model_kwargs, batch_size, cfg_rank) + else: + cfg_factor = 1 + self.do_classifier_free_guidance + b, _, q_len1, seq_len = attention_mask.shape query_lens = [q_len1] * b seq_lens = [seq_len] * b @@ -2678,9 +2757,12 @@ def __call__( with self.progress_bar(total=num_inference_steps) as progress_bar: for i, t in enumerate(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([latents] * cfg_factor) - # latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + if cfg_parallel_ready: + # CFG parallel: each rank forwards its own branch (no batch doubling) + latent_model_input = latents + else: + # Sequential CFG: double the batch + latent_model_input = torch.cat([latents] * cfg_factor) t_expand = t.repeat(latent_model_input.shape[0]) @@ -2721,14 +2803,17 @@ def __call__( # TeaCache fast path: reuse previous prediction pred = tc_prev_pred - # perform guidance - if self.do_classifier_free_guidance: + # Perform guidance + if cfg_parallel_ready: + # CFG parallel: all_gather → all ranks combine locally (no broadcast needed) + gathered = cfg_group.all_gather(pred, separate_tensors=True) + pred = self.cfg_operator(gathered[0], gathered[1], self.guidance_scale, step=i) + elif self.do_classifier_free_guidance: pred_cond, pred_uncond = pred.chunk(2) pred = self.cfg_operator(pred_cond, pred_uncond, self.guidance_scale, step=i) - # compute the previous noisy sample x_t -> x_t-1 + # Scheduler step (all ranks compute locally in CFG parallel) latents = self.scheduler.step(pred, t, latents, **_scheduler_step_extra_kwargs, return_dict=False)[0] - if i != len(timesteps) - 1 and should_compute: model_kwargs = self.model._update_model_kwargs_for_generation( # noqa model_output, diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py b/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py index c19e8a65a8..ba24818dc9 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py +++ b/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py @@ -149,13 +149,12 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: "time_embed_2", "final_layer.model", ] - - device_str = f"{get_local_device()}" + device = get_local_device() named_modules = dict(self.named_modules()) for prefix in non_model_layer_prefixes: mod = named_modules.get(prefix) if mod: - mod.to(device_str) + mod.to(device) unexpected_keywords = [ "guidance_emb", From 08cb436d4e9271fe6272702fe10da768aad42df9 Mon Sep 17 00:00:00 2001 From: zdoba Date: Wed, 1 Apr 2026 16:41:25 +0800 Subject: [PATCH 010/204] Fix: ensure input tensor is contiguous in GroupCoordinator.all_gather (#2367) Signed-off-by: daixinning Co-authored-by: daixinning --- vllm_omni/diffusion/distributed/group_coordinator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/distributed/group_coordinator.py b/vllm_omni/diffusion/distributed/group_coordinator.py index b722f61c07..8ab38f2a65 100644 --- 
a/vllm_omni/diffusion/distributed/group_coordinator.py +++ b/vllm_omni/diffusion/distributed/group_coordinator.py @@ -213,7 +213,7 @@ def all_gather( input_size[0] *= world_size output_tensor = torch.empty(input_size, dtype=input_.dtype, device=input_.device) # All-gather. - torch.distributed.all_gather_into_tensor(output_tensor, input_, group=self.device_group) + torch.distributed.all_gather_into_tensor(output_tensor, input_.contiguous(), group=self.device_group) if dim != 0: input_size[0] //= world_size output_tensor = output_tensor.reshape( From d40840b1144bce12e5cc4d5ced8fb22820e8fd81 Mon Sep 17 00:00:00 2001 From: NATURE Date: Wed, 1 Apr 2026 16:50:45 +0800 Subject: [PATCH 011/204] [Perf] Bagel KV-ready early forwarding and time step consistency for /v1/chat/completions (#2398) Signed-off-by: natureofnature --- examples/offline_inference/bagel/end2end.py | 1 - .../offline_inference/test_bagel_img2img.py | 4 +- .../offline_inference/test_bagel_text2img.py | 4 +- .../test_quantization_fp8.py | 1 - vllm_omni/core/sched/omni_ar_scheduler.py | 30 ++++++- vllm_omni/engine/orchestrator.py | 85 +++++++++++++------ vllm_omni/entrypoints/openai/serving_chat.py | 14 ++- 7 files changed, 102 insertions(+), 37 deletions(-) diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index 922a1af236..efcdea2355 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -168,7 +168,6 @@ def main(): params_list = omni.default_sampling_params_list if args.modality in ("text2img", "img2img"): - params_list[0].max_tokens = 1 # type: ignore if len(params_list) > 1: diffusion_params = params_list[1] diffusion_params.num_inference_steps = args.steps # type: ignore diff --git a/tests/e2e/offline_inference/test_bagel_img2img.py b/tests/e2e/offline_inference/test_bagel_img2img.py index c7df4f91be..a0c3f6cc9f 100644 --- a/tests/e2e/offline_inference/test_bagel_img2img.py +++ b/tests/e2e/offline_inference/test_bagel_img2img.py @@ -79,19 +79,17 @@ def _find_free_port() -> int: return port -def _configure_sampling_params(omni: Omni, max_tokens: int = 1, num_inference_steps: int = 15) -> list: +def _configure_sampling_params(omni: Omni, num_inference_steps: int = 15) -> list: """Configure sampling parameters for Bagel img2img generation. Args: omni: The Omni instance to get default params from. - max_tokens: Maximum tokens for the first stage. num_inference_steps: Number of inference steps for the diffusion stage. Returns: Configured sampling params list. """ params_list = omni.default_sampling_params_list - params_list[0].max_tokens = max_tokens # type: ignore if len(params_list) > 1: params_list[1].num_inference_steps = num_inference_steps # type: ignore params_list[1].extra_args = { # type: ignore diff --git a/tests/e2e/offline_inference/test_bagel_text2img.py b/tests/e2e/offline_inference/test_bagel_text2img.py index 505e12438d..c74763a35a 100644 --- a/tests/e2e/offline_inference/test_bagel_text2img.py +++ b/tests/e2e/offline_inference/test_bagel_text2img.py @@ -80,19 +80,17 @@ def _find_free_port() -> int: return port -def _configure_sampling_params(omni: Omni, max_tokens: int = 1, num_inference_steps: int = 15) -> list: +def _configure_sampling_params(omni: Omni, num_inference_steps: int = 15) -> list: """Configure sampling parameters for Bagel text2img generation. Args: omni: The Omni instance to get default params from. - max_tokens: Maximum tokens for the first stage. 
num_inference_steps: Number of inference steps for the diffusion stage. Returns: Configured sampling params list. """ params_list = omni.default_sampling_params_list - params_list[0].max_tokens = max_tokens # type: ignore if len(params_list) > 1: params_list[1].num_inference_steps = num_inference_steps # type: ignore params_list[1].extra_args = { # type: ignore diff --git a/tests/e2e/offline_inference/test_quantization_fp8.py b/tests/e2e/offline_inference/test_quantization_fp8.py index 5943afa028..f71c53de74 100644 --- a/tests/e2e/offline_inference/test_quantization_fp8.py +++ b/tests/e2e/offline_inference/test_quantization_fp8.py @@ -120,7 +120,6 @@ def _generate_bagel_image( torch.cuda.reset_peak_memory_stats() params_list = omni.default_sampling_params_list - params_list[0].max_tokens = 1 # type: ignore if len(params_list) > 1: params_list[1].num_inference_steps = num_inference_steps # type: ignore params_list[1].extra_args = { # type: ignore diff --git a/vllm_omni/core/sched/omni_ar_scheduler.py b/vllm_omni/core/sched/omni_ar_scheduler.py index 71da4d5925..c4d8452225 100644 --- a/vllm_omni/core/sched/omni_ar_scheduler.py +++ b/vllm_omni/core/sched/omni_ar_scheduler.py @@ -95,8 +95,19 @@ def _process_kv_transfer_trigger(self, request: Request, new_token_ids: list[int return False criteria_type = self.kv_transfer_criteria.get("type") + if ( + self.kv_transfer_criteria.get("stop_after_transfer", True) + and request.request_id in self.transfer_triggered_requests + ): + # For split pipelines that only need the transferred KV + # snapshot, stop AR decode once KV extraction has completed. + # This frees stage-0 resources without requiring an + # orchestrator-side abort. + if request.request_id not in self.active_kv_transfers: + request.status = RequestStatus.FINISHED_STOPPED + return True + return False - # Universal duplicate check for once semantics if request.request_id in self.transfer_triggered_requests: return False @@ -456,6 +467,23 @@ def update_from_output( kv_extracted_ids = getattr(model_runner_output, "kv_extracted_req_ids", None) if kv_extracted_ids: for req_id in kv_extracted_ids: + # Emit a kv_ready signal so the orchestrator can forward + # the request to the DiT stage immediately after KV + # extraction, without waiting for AR decode to finish. + req = self.requests.get(req_id) + if req is not None and not req.is_finished(): + eco = engine_core_outputs.get(req.client_index) + if eco is None: + eco = EngineCoreOutputs() + engine_core_outputs[req.client_index] = eco + eco.outputs.append( + EngineCoreOutput( + request_id=req_id, + new_token_ids=[], + kv_transfer_params={"kv_ready": True}, + ) + ) + # Mark transfer as finished if req_id in self.active_kv_transfers: self.active_kv_transfers.remove(req_id) diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index e6373ec96e..8128c25c64 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -268,6 +268,9 @@ async def _orchestration_loop(self) -> None: continue idle = False + # Handle prefill-finished KV-ready signals before finished outputs. + await self._handle_kv_ready_raw_outputs(stage_id, raw_outputs) + # 2) Process raw outputs through the output processor request_outputs = await self._process_stage_outputs(stage_id, raw_outputs) @@ -313,25 +316,7 @@ async def _route_output( # CFG companion handling: companions don't produce user-visible output # and don't forward to the next stage directly. 
if finished and req_id in self._companion_ids: - parent_id = self._companion_to_parent.get(req_id) - if parent_id is not None: - self._companion_done.setdefault(parent_id, set()).add(req_id) - logger.debug( - "[Orchestrator] CFG companion %s done (parent=%s)", - req_id, - parent_id, - ) - # Check if parent is waiting and all companions are done - if parent_id in self._deferred_parents and self._all_companions_done(parent_id): - deferred = self._deferred_parents.pop(parent_id) - parent_state = self.request_states.get(parent_id) - if parent_state is not None: - await self._forward_to_next_stage( - parent_id, - deferred["stage_id"], - deferred["output"], - parent_state, - ) + await self._handle_cfg_companion_ready(req_id) self.request_states.pop(req_id, None) return @@ -358,17 +343,17 @@ async def _route_output( } ) - if finished and stage_id < req_state.final_stage_id and not self.async_chunk: - # If this parent has CFG companions, defer forwarding until all done + if ( + finished + and stage_id < req_state.final_stage_id + and not self.async_chunk + and not self._next_stage_already_submitted(stage_id, req_state) + ): if req_id in self._companion_map and not self._all_companions_done(req_id): self._deferred_parents[req_id] = { "stage_id": stage_id, "output": output, } - logger.debug( - "[Orchestrator] Parent %s deferred, waiting for CFG companions", - req_id, - ) else: await self._forward_to_next_stage(req_id, stage_id, output, req_state) @@ -393,6 +378,56 @@ def _all_companions_done(self, parent_id: str) -> bool: done_set = self._companion_done.get(parent_id, set()) return all(cid in done_set for cid in role_map.values()) + def _next_stage_already_submitted(self, stage_id: int, req_state: OrchestratorRequestState) -> bool: + return (stage_id + 1) in req_state.stage_submit_ts + + async def _handle_cfg_companion_ready(self, req_id: str) -> None: + """Mark a CFG companion as done; if all companions are done, flush deferred parent.""" + parent_id = self._companion_to_parent.get(req_id) + if parent_id is None: + return + done_set = self._companion_done.setdefault(parent_id, set()) + if req_id in done_set: + return + done_set.add(req_id) + if parent_id in self._deferred_parents and self._all_companions_done(parent_id): + deferred = self._deferred_parents.pop(parent_id) + parent_state = self.request_states.get(parent_id) + if parent_state is not None and not self._next_stage_already_submitted(deferred["stage_id"], parent_state): + await self._forward_to_next_stage( + parent_id, + deferred["stage_id"], + deferred["output"], + parent_state, + ) + + async def _handle_kv_ready_raw_outputs(self, stage_id: int, raw_outputs: EngineCoreOutputs) -> None: + """Forward split requests once stage-0 KV is ready, not only when decode fully finishes.""" + if self.async_chunk: + return + for raw_output in raw_outputs.outputs: + kv_params = getattr(raw_output, "kv_transfer_params", None) + if not (isinstance(kv_params, dict) and kv_params.get("kv_ready")): + continue + req_id = raw_output.request_id + req_state = self.request_states.get(req_id) + if req_state is None: + continue + if req_id in self._companion_ids: + await self._handle_cfg_companion_ready(req_id) + continue + if stage_id >= req_state.final_stage_id: + continue + if self._next_stage_already_submitted(stage_id, req_state): + continue + if req_id in self._companion_map and not self._all_companions_done(req_id): + self._deferred_parents[req_id] = { + "stage_id": stage_id, + "output": raw_output, + } + else: + await self._forward_to_next_stage(req_id, 
stage_id, raw_output, req_state) + def _build_stage_metrics( self, stage_id: int, diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 7354b573f6..527947be92 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -276,6 +276,7 @@ async def create_chat_completion( output_modalities if output_modalities is not None else self.engine_client.output_modalities ) + num_inference_steps = None # Omni multistage image generation: Stage-0 (AR) should receive a clean # text prompt (and optional conditioning image/size) so the model's own # processor can construct the correct inputs. @@ -309,6 +310,12 @@ async def create_chat_completion( extra_body = request.model_extra or {} height = extra_body.get("height") width = extra_body.get("width") + num_inference_steps = extra_body.get("num_inference_steps") + if num_inference_steps is not None: + try: + num_inference_steps = int(num_inference_steps) + except Exception: + num_inference_steps = None if "size" in extra_body: try: size_str = extra_body["size"] @@ -372,14 +379,15 @@ async def create_chat_completion( # Use standard OpenAI API parameters for comprehension stage sampling_params_list = self._build_sampling_params_list_from_request(request) - # Apply user-specified height/width to diffusion stage(s) for image generation - if _image_gen_height is not None or _image_gen_width is not None: + # Apply user-specified overrides to diffusion stage(s) for image generation + if _image_gen_height is not None or _image_gen_width is not None or num_inference_steps is not None: for idx, sp in enumerate(sampling_params_list): - # Diffusion stages typically have height/width attributes if hasattr(sp, "height") and _image_gen_height is not None: sp.height = _image_gen_height if hasattr(sp, "width") and _image_gen_width is not None: sp.width = _image_gen_width + if hasattr(sp, "num_inference_steps") and num_inference_steps is not None: + sp.num_inference_steps = num_inference_steps self._log_inputs( request_id, From 3fd4a4dc27db9709604ac923aa278fcb583c4956 Mon Sep 17 00:00:00 2001 From: Wu JIAZHEN <83007646+asukaqaq-s@users.noreply.github.com> Date: Wed, 1 Apr 2026 17:25:02 +0800 Subject: [PATCH 012/204] [Feat] Support step-boundary abort in diffusion (#1769) Signed-off-by: jader Signed-off-by: asukaqaq-s <1311722138@qq.com> Co-authored-by: jader --- docs/design/module/dit_module.md | 4 +- docs/user_guide/diffusion_features.md | 111 ++--- tests/diffusion/test_diffusion_scheduler.py | 379 +++++++++++++++++- .../diffusion/test_diffusion_step_pipeline.py | 257 +++++++++++- .../test_multiproc_engine_concurrency.py | 38 +- .../test_qwen_image_expansion.py | 5 + tests/entrypoints/test_async_omni_abort.py | 85 ++++ .../entrypoints/test_async_omni_diffusion.py | 98 +++++ vllm_omni/diffusion/data.py | 9 + vllm_omni/diffusion/diffusion_engine.py | 109 ++++- vllm_omni/diffusion/executor/abstract.py | 21 +- .../diffusion/executor/multiproc_executor.py | 71 +++- vllm_omni/diffusion/lora/manager.py | 9 + vllm_omni/diffusion/sched/__init__.py | 6 +- vllm_omni/diffusion/sched/base_scheduler.py | 89 +++- vllm_omni/diffusion/sched/interface.py | 8 +- .../diffusion/sched/request_scheduler.py | 81 +--- vllm_omni/diffusion/sched/step_scheduler.py | 129 ++++++ vllm_omni/diffusion/stage_diffusion_client.py | 16 + vllm_omni/diffusion/worker/__init__.py | 27 +- vllm_omni/engine/async_omni_engine.py | 1 + vllm_omni/entrypoints/async_omni_diffusion.py | 11 +- 
vllm_omni/entrypoints/cli/serve.py | 5 + 23 files changed, 1378 insertions(+), 191 deletions(-) create mode 100644 tests/entrypoints/test_async_omni_abort.py create mode 100644 vllm_omni/diffusion/sched/step_scheduler.py diff --git a/docs/design/module/dit_module.md b/docs/design/module/dit_module.md index e24a75238f..b0c7e9fc7f 100644 --- a/docs/design/module/dit_module.md +++ b/docs/design/module/dit_module.md @@ -192,7 +192,7 @@ class _BaseScheduler(SchedulerInterface): self._waiting = deque() self._running = [] self._finished_req_ids = set() - self._max_batch_size = 1 + self.max_num_running_reqs = 1 ``` **Design Features**: @@ -201,7 +201,7 @@ class _BaseScheduler(SchedulerInterface): - **Shared cleanup logic**: Request-id registration, finish handling, and state removal are centralized instead of duplicated in each policy. -- **Current constraint**: `_max_batch_size` remains `1` because the current engine path is still synchronous request-mode execution. +- **Current constraint**: `max_num_running_reqs` remains `1` because the current engine path is still synchronous request-mode execution. #### 2.4 Current `RequestScheduler` Policy diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 7e325c1edc..f0969b677f 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -15,6 +15,7 @@ vLLM-Omni supports various advanced features for diffusion models: - Acceleration: **cache methods**, **parallelism methods** - Memory optimization: **cpu offloading**, **quantization** - Extensions: **LoRA inference** +- Execution modes: **step execution** ## Supported Features @@ -64,6 +65,16 @@ Extension methods add specialized capabilities to diffusion models beyond standa | **[LoRA Inference](diffusion/lora.md)** | Enables inference with Low-Rank Adaptation (LoRA) adapters weights | Reinforcement learning extensions | +### Execution Modes + +Execution modes control how the diffusion pipeline processes denoise steps. + +| Method | Description | Best For | +|--------|-------------|----------| +| **[Step Execution](diffusion/step_execution.md)** | Per-step denoise execution with mid-request abort support | Request cancellation between denoise steps, fine-grained execution control | + +**Note:** Step execution is currently supported by QwenImagePipeline only. See [Supported Models](#supported-models) for details. 
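+
+As a minimal sketch of what step-boundary abort enables (the `engine` handle and
+`request` object below are illustrative; `DiffusionRequestAbortedError`, `engine.step()`
+and `engine.abort()` follow the usage exercised in `tests/diffusion/test_diffusion_scheduler.py`):
+
+```python
+from vllm_omni.diffusion.data import DiffusionRequestAbortedError
+
+try:
+    output = engine.step(request)  # blocks until the request finishes or is aborted
+except DiffusionRequestAbortedError:
+    ...  # the abort was observed between two denoise steps
+
+# From another thread (e.g. the serving layer), cancel by request id:
+engine.abort(request_id)
+```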
+ ### Quantization Methods | Method | Configuration | Description | Best For | @@ -87,28 +98,28 @@ The following tables show which models support each feature: ### ImageGen -| Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | -|-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:| -| **Bagel** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **FLUX.1-dev** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | -| **FLUX.2-klein** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | -| **FLUX.1-Kontext-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | -| **FLUX.2-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | -| **GLM-Image** | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **HunyuanImage3** | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | -| **LongCat-Image** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **LongCat-Image-Edit** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **MammothModa2(T2I)** | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| **Nextstep_1(T2I)** | ❓ | ❓ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **OmniGen2** | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| **Ovis-Image** | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | -| **Qwen-Image** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | -| **Qwen-Image-2512** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | -| **Qwen-Image-Edit** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | -| **Qwen-Image-Edit-2509** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | -| **Qwen-Image-Layered** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | -| **Stable-Diffusion3.5** | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **Z-Image** | ✅ | ✅ | ✅ | ❓ | ✅ (TP=2 only) | ❌ | ❌ | ✅ | ✅ | +| Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | +|-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| +| **Bagel** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **FLUX.1-dev** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | +| **FLUX.2-klein** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | +| **FLUX.1-Kontext-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | +| **FLUX.2-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | +| **GLM-Image** | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **HunyuanImage3** | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | +| **LongCat-Image** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **LongCat-Image-Edit** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **MammothModa2(T2I)** | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **Nextstep_1(T2I)** | ❓ | ❓ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **OmniGen2** | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **Ovis-Image** | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **Qwen-Image** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | +| **Qwen-Image-2512** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | +| **Qwen-Image-Edit** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | +| **Qwen-Image-Edit-2509** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | +| **Qwen-Image-Layered** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | +| **Stable-Diffusion3.5** | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **Z-Image** | ✅ | ✅ | ✅ | ❓ | ✅ (TP=2 only) | ❌ | ❌ | ✅ | ✅ | ❌ | > Notes: > 1. Nextstep_1(T2I) does not support cache acceleration methods such as TeaCache or Cache-DiT. 
@@ -116,19 +127,19 @@ The following tables show which models support each feature: ### VideoGen -| Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | -|-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:| -| **Wan2.2** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | -| **LTX-2** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **Helios** | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | -| **HunyuanVideo-1.5 T2V I2V** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| **DreamID-Omni** | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | +|-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| +| **Wan2.2** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| **LTX-2** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **Helios** | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | +| **HunyuanVideo-1.5 T2V I2V** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | +| **DreamID-Omni** | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ### AudioGen -| Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | -|-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:| -| **Stable-Audio-Open** | ❌ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ | +| Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | +|-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| +| **Stable-Audio-Open** | ❌ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ## Feature Compatibility @@ -139,21 +150,22 @@ The following tables show which models support each feature: - ❌: No support plan - ❓: Not verified yet and Not Recommended -| | ⚡TeaCache | ⚡Cache-DiT | 🔀Ulysses-SP | 🔀Ring-Attn | 🔀CFG-Parallel | 🔀Tensor Parallel | 🔀HSDP | 🔀Expert Parallel | 💾CPU Offloading (Layerwise) | 💾CPU Offloading (Module-wise) | 💾VAE Patch Parallel | 💾FP8 Quant | 🔧LoRA Inference | -|---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:| -| **⚡TeaCache** | | | | | | | | | | | | | | -| **⚡Cache-DiT** | ❌ | | | | | | | | | | | | | -| **🔀Ulysses-SP** | ✅ | ✅ | | | | | | | | | | | | -| **🔀Ring-Attn** | ✅ | ✅ | ✅ | | | | | | | | | | | -| **🔀CFG-Parallel** | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | -| **🔀Tensor Parallel** | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | -| **🔀HSDP** | ❓ | ❓ | ❓ | ❓ | ❓ | ❌ | | | | | | | | -| **🔀Expert Parallel** | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | | | | | | | -| **💾CPU Offloading (Layerwise)** | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | | | | | -| **💾CPU Offloading (Module-wise)** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❓ | ❓ | ❌ | | | | | -| **💾VAE Patch Parallel** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | | | -| **💾FP8 Quant** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❓ | ❓ | ✅ | ✅ | ✅ | | | -| **🔧LoRA Inference** | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ 
| ❓ | ❓ | ❓ | | +| | ⚡TeaCache | ⚡Cache-DiT | 🔀Ulysses-SP | 🔀Ring-Attn | 🔀CFG-Parallel | 🔀Tensor Parallel | 🔀HSDP | 🔀Expert Parallel | 💾CPU Offloading (Layerwise) | 💾CPU Offloading (Module-wise) | 💾VAE Patch Parallel | 💾FP8 Quant | 🔧LoRA Inference | 🔄Step Execution | +|---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:| +| **⚡TeaCache** | | | | | | | | | | | | | | | +| **⚡Cache-DiT** | ❌ | | | | | | | | | | | | | | +| **🔀Ulysses-SP** | ✅ | ✅ | | | | | | | | | | | | | +| **🔀Ring-Attn** | ✅ | ✅ | ✅ | | | | | | | | | | | | +| **🔀CFG-Parallel** | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | +| **🔀Tensor Parallel** | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | +| **🔀HSDP** | ❓ | ❓ | ❓ | ❓ | ❓ | ❌ | | | | | | | | | +| **🔀Expert Parallel** | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | | | | | | | | +| **💾CPU Offloading (Layerwise)** | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | | | | | | +| **💾CPU Offloading (Module-wise)** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❓ | ❓ | ❌ | | | | | | +| **💾VAE Patch Parallel** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | | | | +| **💾FP8 Quant** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❓ | ❓ | ✅ | ✅ | ✅ | | | | +| **🔧LoRA Inference** | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | | | +| **🔄Step Execution** | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❓ | ❓ | ✅ | ❓ | ✅ | ✅ | ❌ | | !!! info @@ -162,6 +174,7 @@ The following tables show which models support each feature: 3. CPU Offloading (Layerwise) and CPU Offloading (Module-wise) are not compatible. 4. CPU Offloading (Layerwise) supports single-card for now. 5. Using FP8-Quant as an example of qunatization methods. + 6. Step Execution is not compatible with cache backends (TeaCache, Cache-DiT) or LoRA. ## Learn More @@ -185,6 +198,10 @@ The following tables show which models support each feature: - **[LoRA Inference Guide](diffusion/lora.md)** - Low-Rank Adaptation for style customization and fine-tuning +**Execution Modes:** + +- **[Step Execution Guide](diffusion/step_execution.md)** - Per-step denoise execution with mid-request abort support + **Advanced Topics:** - **[Feature Compatibility](feature_compatibility.md)** - How to combine multiple features for maximum performance diff --git a/tests/diffusion/test_diffusion_scheduler.py b/tests/diffusion/test_diffusion_scheduler.py index 171a6278cd..4324ba1e63 100644 --- a/tests/diffusion/test_diffusion_scheduler.py +++ b/tests/diffusion/test_diffusion_scheduler.py @@ -1,12 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import queue import threading +from types import SimpleNamespace from unittest.mock import Mock, patch import pytest +import torch -from vllm_omni.diffusion.data import DiffusionOutput +from vllm_omni.diffusion.data import DiffusionOutput, DiffusionRequestAbortedError from vllm_omni.diffusion.diffusion_engine import DiffusionEngine from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.diffusion.sched import ( @@ -14,6 +17,7 @@ RequestScheduler, Scheduler, SchedulerInterface, + StepScheduler, ) from vllm_omni.diffusion.sched.interface import CachedRequestData, NewRequestData from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -29,9 +33,46 @@ def _make_request(req_id: str) -> OmniDiffusionRequest: ) -def _make_request_output(req_id: str, *, error: str | None = None) -> DiffusionOutput: - del req_id - return DiffusionOutput(output=None, error=error) +def _make_request_output(req_id: str, *, error: str | None = None, finished: bool = True): + return SimpleNamespace( + req_id=req_id, + 
step_index=None, + finished=finished, + result=DiffusionOutput(output=None, error=error), + ) + + +def _make_step_output( + req_id: str, + step_index: int, + *, + finished: bool = False, + error: str | None = None, +): + return SimpleNamespace( + req_id=req_id, + step_index=step_index, + finished=finished, + result=DiffusionOutput(output=None, error=error) if error is not None else None, + ) + + +def _make_step_request( + req_id: str, + *, + num_inference_steps: int = 4, + step_index: int | None = None, + sampling_params: OmniDiffusionSamplingParams | None = None, +) -> OmniDiffusionRequest: + return OmniDiffusionRequest( + prompts=[f"prompt_{req_id}"], + sampling_params=sampling_params + or OmniDiffusionSamplingParams( + num_inference_steps=num_inference_steps, + step_index=step_index, + ), + request_ids=[req_id], + ) def _new_ids(sched_output) -> list[str]: @@ -43,7 +84,7 @@ def _cached_ids(sched_output) -> list[str]: class _StubScheduler(SchedulerInterface): - def __init__(self, request: OmniDiffusionRequest, output: DiffusionOutput) -> None: + def __init__(self, request: OmniDiffusionRequest, output) -> None: self._request = request self._output = output self.initialized_with = None @@ -75,9 +116,10 @@ def schedule(self): is_empty=False, ) - def update_from_output(self, sched_output, output: DiffusionOutput) -> set[str]: + def update_from_output(self, sched_output, output) -> set[str]: del sched_output assert output is self._output + self._state.status = DiffusionRequestStatus.FINISHED_COMPLETED return {self._sched_req_id} def has_requests(self) -> bool: @@ -185,9 +227,14 @@ def test_abort_request_for_waiting_and_running(self) -> None: state_b = self.scheduler.get_request_state(req_id_b) assert state_b.status == DiffusionRequestStatus.FINISHED_ABORTED + first = self.scheduler.schedule() + assert first.finished_req_ids == {req_id_b} # A should still run normally. - output_a = self.scheduler.schedule() - assert _new_ids(output_a) == [req_id_a] + assert _new_ids(first) == [req_id_a] + + # B is already marked finished aborted, scheduling again should not pull it. + second = self.scheduler.schedule() + assert second.finished_req_ids == set() # Abort running request. 
self.scheduler.finish_requests(req_id_a, DiffusionRequestStatus.FINISHED_ABORTED) @@ -233,33 +280,33 @@ def test_add_req_and_wait_for_response_single_path(self) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.scheduler = RequestScheduler() engine.scheduler.initialize(Mock()) - engine.executor = Mock() - engine._rpc_lock = threading.Lock() + engine._rpc_lock = threading.RLock() + engine.abort_queue = queue.Queue() request = _make_request("engine") - expected = DiffusionOutput(output=None) - engine.executor.add_req.return_value = expected + runner_output = _make_request_output("engine") + engine.execute_fn = Mock(return_value=runner_output) output = engine.add_req_and_wait_for_response(request) - assert output is expected - engine.executor.add_req.assert_called_once_with(request) + assert output is runner_output.result + engine.execute_fn.assert_called_once() def test_supports_scheduler_interface_injection(self) -> None: request = _make_request("engine_iface") - expected = DiffusionOutput(output=None) - scheduler = _StubScheduler(request, expected) + runner_output = _make_request_output("engine_iface") + scheduler = _StubScheduler(request, runner_output) engine = DiffusionEngine.__new__(DiffusionEngine) engine.scheduler = scheduler - engine.executor = Mock() - engine.executor.add_req = Mock(return_value=expected) - engine._rpc_lock = threading.Lock() + engine._rpc_lock = threading.RLock() + engine.abort_queue = queue.Queue() + engine.execute_fn = Mock(return_value=runner_output) output = engine.add_req_and_wait_for_response(request) - assert output is expected - engine.executor.add_req.assert_called_once_with(request) + assert output is runner_output.result + engine.execute_fn.assert_called_once() def test_initializes_injected_scheduler(self) -> None: request = _make_request("init") @@ -289,6 +336,59 @@ def test_scheduler_alias_keeps_default_request_scheduler(self) -> None: assert req_id in finished assert scheduler.get_request_state(req_id).status == DiffusionRequestStatus.FINISHED_COMPLETED + def test_step_raises_aborted_error(self) -> None: + engine = DiffusionEngine.__new__(DiffusionEngine) + engine.pre_process_func = None + engine.add_req_and_wait_for_response = Mock( + return_value=DiffusionOutput(aborted=True, abort_message="Request req-abort aborted.") + ) + + with pytest.raises(DiffusionRequestAbortedError, match="Request req-abort aborted"): + engine.step(_make_request("req-abort")) + + def test_abort_queue_marks_request_finished_aborted(self) -> None: + engine = DiffusionEngine.__new__(DiffusionEngine) + engine.scheduler = RequestScheduler() + engine.scheduler.initialize(Mock()) + engine.abort_queue = queue.Queue() + + req_id = engine.scheduler.add_request(_make_request("req-abort")) + engine.abort("req-abort") + engine._process_aborts_queue() + + assert engine.scheduler.get_request_state(req_id).status == DiffusionRequestStatus.FINISHED_ABORTED + + def test_finalize_finished_request_returns_aborted_output(self) -> None: + engine = DiffusionEngine.__new__(DiffusionEngine) + engine.scheduler = RequestScheduler() + engine.scheduler.initialize(Mock()) + + req_id = engine.scheduler.add_request(_make_request("req-finalize")) + engine.scheduler.finish_requests(req_id, DiffusionRequestStatus.FINISHED_ABORTED) + + output = engine._finalize_finished_request(req_id) + + assert output.aborted is True + assert output.abort_message == "Request req-finalize aborted." 
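+
+    # The abort contract pinned down above: DiffusionEngine.step() surfaces an aborted
+    # request as DiffusionRequestAbortedError (see test_step_raises_aborted_error),
+    # while add_req_and_wait_for_response() returns a DiffusionOutput carrying
+    # aborted=True and an abort_message rather than an error string.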
+ + def test_initializes_step_scheduler_when_step_execution_enabled(self) -> None: + od_config = Mock(model_class_name="mock_model") + od_config.step_execution = True + fake_executor = Mock() + fake_executor_cls = Mock(return_value=fake_executor) + + with ( + patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_post_process_func", return_value=None), + patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_pre_process_func", return_value=None), + patch("vllm_omni.diffusion.diffusion_engine.DiffusionExecutor.get_class", return_value=fake_executor_cls), + patch.object(DiffusionEngine, "_dummy_run", return_value=None), + ): + engine = DiffusionEngine(od_config) + + assert isinstance(engine.scheduler, StepScheduler) + assert engine.execute_fn is fake_executor.execute_step + fake_executor_cls.assert_called_once_with(od_config) + def test_dummy_run_raises_on_output_error(self) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.od_config = Mock(model_class_name="mock_model") @@ -297,3 +397,240 @@ def test_dummy_run_raises_on_output_error(self) -> None: with pytest.raises(RuntimeError, match="Dummy run failed: boom"): engine._dummy_run() + + +class TestStepScheduler: + def setup_method(self) -> None: + self.scheduler: StepScheduler = StepScheduler() + self.scheduler.initialize(Mock()) + + def test_single_request_step_lifecycle(self) -> None: + request = _make_step_request("step", num_inference_steps=3) + req_id = self.scheduler.add_request(request) + + first = self.scheduler.schedule() + assert _new_ids(first) == [req_id] + assert _cached_ids(first) == [] + assert first.num_running_reqs == 1 + assert first.num_waiting_reqs == 0 + + finished = self.scheduler.update_from_output(first, _make_step_output(req_id, step_index=1)) + assert finished == set() + assert self.scheduler.get_request_state(req_id).status == DiffusionRequestStatus.RUNNING + assert request.sampling_params.step_index == 1 + assert self.scheduler.has_requests() is True + + second = self.scheduler.schedule() + assert _new_ids(second) == [] + assert _cached_ids(second) == [req_id] + assert second.num_running_reqs == 1 + assert second.num_waiting_reqs == 0 + + finished = self.scheduler.update_from_output(second, _make_step_output(req_id, step_index=2)) + assert finished == set() + assert request.sampling_params.step_index == 2 + + third = self.scheduler.schedule() + assert _new_ids(third) == [] + assert _cached_ids(third) == [req_id] + + finished = self.scheduler.update_from_output( + third, + _make_step_output(req_id, step_index=3, finished=True), + ) + assert finished == {req_id} + assert self.scheduler.get_request_state(req_id).status == DiffusionRequestStatus.FINISHED_COMPLETED + assert request.sampling_params.step_index == 3 + assert self.scheduler.has_requests() is False + + def test_fifo_single_request_scheduling(self) -> None: + req_id_a = self.scheduler.add_request(_make_step_request("a", num_inference_steps=2)) + req_id_b = self.scheduler.add_request(_make_step_request("b", num_inference_steps=2)) + + first = self.scheduler.schedule() + assert _new_ids(first) == [req_id_a] + assert _cached_ids(first) == [] + assert first.num_running_reqs == 1 + assert first.num_waiting_reqs == 1 + + finished = self.scheduler.update_from_output(first, _make_step_output(req_id_a, step_index=1)) + assert finished == set() + + second = self.scheduler.schedule() + assert _new_ids(second) == [] + assert _cached_ids(second) == [req_id_a] + assert second.num_running_reqs == 1 + assert second.num_waiting_reqs == 1 + + finished = 
self.scheduler.update_from_output( + second, + _make_step_output(req_id_a, step_index=2, finished=True), + ) + assert finished == {req_id_a} + + third = self.scheduler.schedule() + assert _new_ids(third) == [req_id_b] + assert _cached_ids(third) == [] + assert third.num_running_reqs == 1 + assert third.num_waiting_reqs == 0 + + def test_error_output_marks_finished_error(self) -> None: + req_id = self.scheduler.add_request(_make_step_request("err", num_inference_steps=3)) + + sched_output = self.scheduler.schedule() + assert _new_ids(sched_output) == [req_id] + finished = self.scheduler.update_from_output( + sched_output, + _make_step_output(req_id, step_index=1, finished=True, error="worker failed"), + ) + + assert finished == {req_id} + state = self.scheduler.get_request_state(req_id) + assert state.status == DiffusionRequestStatus.FINISHED_ERROR + assert state.error == "worker failed" + assert self.scheduler.has_requests() is False + + def test_missing_step_index_marks_finished_error(self) -> None: + req_id = self.scheduler.add_request(_make_step_request("missing", num_inference_steps=3)) + + sched_output = self.scheduler.schedule() + finished = self.scheduler.update_from_output( + sched_output, + SimpleNamespace( + req_id=req_id, + step_index=None, + finished=True, + result=None, + ), + ) + + assert finished == {req_id} + state = self.scheduler.get_request_state(req_id) + assert state.status == DiffusionRequestStatus.FINISHED_ERROR + assert state.error == "Missing step_index in RunnerOutput" + + def test_abort_request_for_waiting_and_running(self) -> None: + req_id_a = self.scheduler.add_request(_make_step_request("a", num_inference_steps=2)) + req_id_b = self.scheduler.add_request(_make_step_request("b", num_inference_steps=2)) + + self.scheduler.finish_requests(req_id_b, DiffusionRequestStatus.FINISHED_ABORTED) + assert self.scheduler.get_request_state(req_id_b).status == DiffusionRequestStatus.FINISHED_ABORTED + + running = self.scheduler.schedule() + assert _new_ids(running) == [req_id_a] + + self.scheduler.finish_requests(req_id_a, DiffusionRequestStatus.FINISHED_ABORTED) + assert self.scheduler.get_request_state(req_id_a).status == DiffusionRequestStatus.FINISHED_ABORTED + assert self.scheduler.has_requests() is False + + def test_has_requests_state_transition(self) -> None: + assert self.scheduler.has_requests() is False + + req_id = self.scheduler.add_request(_make_step_request("has", num_inference_steps=2)) + assert self.scheduler.has_requests() is True + + sched_output = self.scheduler.schedule() + assert self.scheduler.has_requests() is True + + finished = self.scheduler.update_from_output( + sched_output, + _make_step_output(req_id, step_index=2, finished=True), + ) + assert finished == {req_id} + assert self.scheduler.get_request_state(req_id).status == DiffusionRequestStatus.FINISHED_COMPLETED + assert self.scheduler.has_requests() is False + + def test_scheduled_request_aborted_before_update_is_returned_finished(self) -> None: + req_id = self.scheduler.add_request(_make_step_request("abort-late", num_inference_steps=2)) + + sched_output = self.scheduler.schedule() + self.scheduler.finish_requests(req_id, DiffusionRequestStatus.FINISHED_ABORTED) + + finished = self.scheduler.update_from_output( + sched_output, + _make_step_output(req_id, step_index=1), + ) + assert finished == {req_id} + assert self.scheduler.get_request_state(req_id).status == DiffusionRequestStatus.FINISHED_ABORTED + + def test_preempt_request_preserves_step_index(self) -> None: + request = 
_make_step_request("preempt", num_inference_steps=3) + req_id = self.scheduler.add_request(request) + + first = self.scheduler.schedule() + assert self.scheduler.update_from_output(first, _make_step_output(req_id, step_index=1)) == set() + assert request.sampling_params.step_index == 1 + + second = self.scheduler.schedule() + assert _cached_ids(second) == [req_id] + assert self.scheduler.preempt_request(req_id) is True + assert self.scheduler.get_request_state(req_id).status == DiffusionRequestStatus.PREEMPTED + assert request.sampling_params.step_index == 1 + + third = self.scheduler.schedule() + assert _cached_ids(third) == [req_id] + assert request.sampling_params.step_index == 1 + + @pytest.mark.parametrize( + ("sampling_params", "expected_steps"), + [ + ( + OmniDiffusionSamplingParams( + timesteps=torch.tensor([1.0, 0.5, 0.0]), + sigmas=[1.0, 0.5, 0.25, 0.0], + num_inference_steps=5, + ), + 3, + ), + ( + OmniDiffusionSamplingParams( + sigmas=[1.0, 0.5], + num_inference_steps=5, + ), + 2, + ), + ( + OmniDiffusionSamplingParams( + num_inference_steps=4, + ), + 4, + ), + ], + ) + def test_total_steps_priority(self, sampling_params: OmniDiffusionSamplingParams, expected_steps: int) -> None: + request = _make_step_request("priority", sampling_params=sampling_params) + req_id = self.scheduler.add_request(request) + + for _ in range(expected_steps - 1): + sched_output = self.scheduler.schedule() + assert sched_output.scheduled_req_ids == [req_id] + next_step = request.sampling_params.step_index + 1 + assert ( + self.scheduler.update_from_output( + sched_output, + _make_step_output(req_id, step_index=next_step), + ) + == set() + ) + + final_output = self.scheduler.schedule() + assert final_output.scheduled_req_ids == [req_id] + assert self.scheduler.update_from_output( + final_output, + _make_step_output(req_id, step_index=expected_steps, finished=True), + ) == {req_id} + assert self.scheduler.get_request_state(req_id).status == DiffusionRequestStatus.FINISHED_COMPLETED + + @pytest.mark.parametrize( + "sampling_params", + [ + OmniDiffusionSamplingParams(num_inference_steps=0), + OmniDiffusionSamplingParams(num_inference_steps=3, step_index=3), + OmniDiffusionSamplingParams(num_inference_steps=3, step_index=-1), + ], + ) + def test_rejects_invalid_initial_step_state(self, sampling_params: OmniDiffusionSamplingParams) -> None: + request = _make_step_request("invalid", sampling_params=sampling_params) + + with pytest.raises(ValueError): + self.scheduler.add_request(request) diff --git a/tests/diffusion/test_diffusion_step_pipeline.py b/tests/diffusion/test_diffusion_step_pipeline.py index ad08487fe9..68aba9ba3b 100644 --- a/tests/diffusion/test_diffusion_step_pipeline.py +++ b/tests/diffusion/test_diffusion_step_pipeline.py @@ -1,10 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for step-level diffusion runner and worker execution.""" +"""Tests for step-level diffusion execution across runner / worker / executor / engine.""" import os +import queue +import threading from contextlib import contextmanager from types import SimpleNamespace +from unittest.mock import Mock import pytest import torch @@ -12,6 +15,7 @@ import vllm_omni.diffusion.worker.diffusion_model_runner as model_runner_module from tests.utils import hardware_test from vllm_omni.diffusion.data import DiffusionOutput +from vllm_omni.diffusion.diffusion_engine import DiffusionEngine from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin 
from vllm_omni.diffusion.distributed.comm import RingComm, SeqAllToAll4D from vllm_omni.diffusion.distributed.parallel_state import ( @@ -20,10 +24,13 @@ init_distributed_environment, initialize_model_parallel, ) +from vllm_omni.diffusion.executor.multiproc_executor import MultiprocDiffusionExecutor from vllm_omni.diffusion.ipc import ( pack_diffusion_output_shm, unpack_diffusion_output_shm, ) +from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.diffusion.sched import StepScheduler from vllm_omni.diffusion.sched.interface import ( CachedRequestData, DiffusionSchedulerOutput, @@ -32,6 +39,8 @@ from vllm_omni.diffusion.worker.diffusion_model_runner import DiffusionModelRunner from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker from vllm_omni.diffusion.worker.utils import RunnerOutput +from vllm_omni.engine.async_omni_engine import AsyncOmniEngine +from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform pytestmark = [pytest.mark.core_model, pytest.mark.diffusion] @@ -86,6 +95,23 @@ def post_decode(self, state, **kwargs): return DiffusionOutput(output=torch.tensor([state.step_index], dtype=torch.float32)) +class _InterruptingStepPipeline(_StepPipeline): + interrupt = True + + def denoise_step(self, state, **kwargs): + del state, kwargs + self.denoise_calls += 1 + return None + + def step_scheduler(self, state, noise_pred, **kwargs): + del state, noise_pred, kwargs + raise AssertionError("step_scheduler should not run after interrupt") + + def post_decode(self, state, **kwargs): + del state, kwargs + raise AssertionError("post_decode should not run after interrupt") + + class _IdentityNoiseTransformer(torch.nn.Module): def forward(self, x: torch.Tensor, **kwargs): del kwargs @@ -188,6 +214,21 @@ def _make_step_request(num_inference_steps: int = 2): ) +def _assert_aborted_output(output: DiffusionOutput, request_id: str) -> None: + assert output.output is None + assert output.error is None + assert output.aborted is True + assert output.abort_message == f"Request {request_id} aborted." 
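+
+
+# Minimal sketch of the per-step driver loop that the TestEngine cases below exercise.
+# It is simplified relative to DiffusionEngine.add_req_and_wait_for_response: there is
+# no abort-queue processing, empty-schedule handling, or exception wrapping here.
+def _drive_request_to_completion(scheduler, execute_fn, sched_req_id):
+    while True:
+        sched_output = scheduler.schedule()  # one denoise step scheduled per cycle
+        runner_output = execute_fn(sched_output)  # RunnerOutput(req_id, step_index, finished, result)
+        finished = scheduler.update_from_output(sched_output, runner_output)
+        if sched_req_id in finished:
+            # Final DiffusionOutput for the request (None if no result was produced).
+            return runner_output.result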
+ + +def _make_engine_request(req_id: str = "req-1", num_inference_steps: int = 2) -> OmniDiffusionRequest: + return OmniDiffusionRequest( + prompts=[f"prompt-{req_id}"], + sampling_params=OmniDiffusionSamplingParams(num_inference_steps=num_inference_steps), + request_ids=[req_id], + ) + + def _make_runner(): runner = object.__new__(DiffusionModelRunner) runner.vllm_config = object() @@ -242,6 +283,18 @@ def _make_cached_scheduler_output(sched_req_id="req-1", step_id=1, finished_req_ ) +def _make_engine(scheduler, execute_fn=None) -> DiffusionEngine: + engine = object.__new__(DiffusionEngine) + engine.od_config = SimpleNamespace(model_class_name="QwenImagePipeline") + engine.pre_process_func = None + engine.post_process_func = None + engine.scheduler = scheduler + engine.execute_fn = execute_fn + engine._rpc_lock = threading.RLock() + engine.abort_queue = queue.Queue() + return engine + + def _expected_output_for_mode(mode: str) -> torch.Tensor: if mode == "cfg": return torch.tensor([[3.0]]) @@ -322,6 +375,52 @@ def test_completes_request_and_clears_state(self, monkeypatch): assert runner.pipeline.scheduler_calls == 2 assert runner.pipeline.decode_calls == 1 + def test_rejects_multi_request_step_batch(self): + runner = _make_runner() + req_1 = _make_step_request() + req_2 = _make_step_request() + req_2.request_ids = ["req-2"] + + scheduler_output = DiffusionSchedulerOutput( + step_id=0, + scheduled_new_reqs=[ + NewRequestData(sched_req_id="req-1", req=req_1), + NewRequestData(sched_req_id="req-2", req=req_2), + ], + scheduled_cached_reqs=CachedRequestData.make_empty(), + finished_req_ids=set(), + num_running_reqs=2, + num_waiting_reqs=0, + ) + + with pytest.raises(ValueError, match="batch_size=1"): + DiffusionModelRunner.execute_stepwise(runner, scheduler_output) + + def test_rejects_missing_cached_state(self): + runner = _make_runner() + + with pytest.raises(ValueError, match="Missing cached state"): + DiffusionModelRunner.execute_stepwise(runner, _make_cached_scheduler_output(sched_req_id="req-missing")) + + def test_interrupt_marks_request_finished_and_clears_state(self, monkeypatch): + runner = _make_runner() + runner.pipeline = _InterruptingStepPipeline() + req = _make_step_request() + monkeypatch.setattr(model_runner_module, "set_forward_context", _noop_forward_context) + + output = DiffusionModelRunner.execute_stepwise(runner, _make_scheduler_output(req, step_id=0)) + + assert output.req_id == "req-1" + assert output.step_index == 0 + assert output.finished is True + assert output.result is not None + assert output.result.error == "stepwise denoise interrupted" + assert "req-1" not in runner.state_cache + assert runner.pipeline.prepare_calls == 1 + assert runner.pipeline.denoise_calls == 1 + assert runner.pipeline.scheduler_calls == 0 + assert runner.pipeline.decode_calls == 0 + def test_load_model_rejects_unsupported_step_execution(self, monkeypatch): class _RequestOnlyPipeline: pass @@ -439,6 +538,153 @@ def test_rejects_lora_requests_in_step_mode(self): DiffusionWorker.execute_stepwise(worker, scheduler_output) +@pytest.mark.cpu +class TestExecutor: + """MultiprocDiffusionExecutor.execute_step""" + + def test_execute_step_passes_through_runner_output(self): + executor = object.__new__(MultiprocDiffusionExecutor) + executor._ensure_open = lambda: None + expected = RunnerOutput(req_id="req-step", step_index=1, finished=False, result=None) + executor.collective_rpc = Mock(return_value=expected) + + request = _make_engine_request("req-step", num_inference_steps=2) + scheduler_output 
= _make_scheduler_output(request, sched_req_id="req-step") + + output = MultiprocDiffusionExecutor.execute_step(executor, scheduler_output) + + assert output is expected + + +@pytest.mark.cpu +class TestEngine: + """Step-execution paths in DiffusionEngine.add_req_and_wait_for_response""" + + @pytest.mark.parametrize( + ("execute_fn", "expected_error"), + [ + ( + lambda _: RunnerOutput( + req_id="req-error", + step_index=1, + finished=True, + result=DiffusionOutput(error="boom"), + ), + "boom", + ), + ( + lambda _: (_ for _ in ()).throw(RuntimeError("gpu on fire")), + "gpu on fire", + ), + ], + ) + def test_step_engine_returns_error(self, execute_fn, expected_error): + scheduler = StepScheduler() + scheduler.initialize(Mock()) + engine = _make_engine(scheduler, execute_fn=execute_fn) + + output = engine.add_req_and_wait_for_response(_make_engine_request("req-error", num_inference_steps=2)) + + assert output.output is None + assert expected_error in output.error + + def test_step_execution_completes(self): + scheduler = StepScheduler() + scheduler.initialize(Mock()) + engine = _make_engine(scheduler) + request = _make_engine_request("req-step", num_inference_steps=2) + + call_count = {"n": 0} + + def execute_fn(_): + call_count["n"] += 1 + finished = call_count["n"] == 2 + return RunnerOutput( + req_id="req-step", + step_index=call_count["n"], + finished=finished, + result=(DiffusionOutput(output=torch.tensor([2.0])) if finished else None), + ) + + engine.execute_fn = execute_fn + + output = engine.add_req_and_wait_for_response(request) + + assert call_count["n"] == 2 + assert output.error is None + assert torch.equal(output.output, torch.tensor([2.0])) + + def test_step_abort_stops_rescheduling_after_first_step(self): + scheduler = StepScheduler() + scheduler.initialize(Mock()) + engine = _make_engine(scheduler) + request = _make_engine_request("req-stop", num_inference_steps=4) + + step = {"n": 0} + + def execute_fn(_): + step["n"] += 1 + engine.abort("req-stop") + return RunnerOutput( + req_id="req-stop", + step_index=1, + finished=False, + result=None, + ) + + engine.execute_fn = execute_fn + + output = engine.add_req_and_wait_for_response(request) + + assert step["n"] == 1 + _assert_aborted_output(output, "req-stop") + + def test_step_abort_after_reschedule_returns_aborted_output(self): + scheduler = StepScheduler() + scheduler.initialize(Mock()) + engine = _make_engine(scheduler) + request = _make_engine_request("req-mid", num_inference_steps=4) + + step = {"n": 0} + + def execute_fn(sched_output): + step["n"] += 1 + if step["n"] == 2: + assert sched_output == _make_cached_scheduler_output("req-mid", step_id=1) + engine.abort("req-mid") + return RunnerOutput( + req_id="req-mid", + step_index=step["n"], + finished=False, + result=None, + ) + + engine.execute_fn = execute_fn + + output = engine.add_req_and_wait_for_response(request) + + assert step["n"] == 2 + _assert_aborted_output(output, "req-mid") + + def test_finished_step_without_result_returns_error(self): + scheduler = StepScheduler() + scheduler.initialize(Mock()) + engine = _make_engine( + scheduler, + execute_fn=lambda _: RunnerOutput( + req_id="req-missing", + step_index=1, + finished=True, + result=None, + ), + ) + + output = engine.add_req_and_wait_for_response(_make_engine_request("req-missing", num_inference_steps=1)) + + assert output.output is None + assert output.error == "Diffusion execution finished without a final output." 
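+
+
+# Protocol assumed by the engine tests above: every executor call yields a RunnerOutput,
+# where an intermediate step reports only progress and the final step carries the
+# DiffusionOutput payload. Values below are illustrative:
+#
+#     RunnerOutput(req_id="req-1", step_index=1, finished=False, result=None)
+#     RunnerOutput(req_id="req-1", step_index=2, finished=True,
+#                  result=DiffusionOutput(output=torch.tensor([2.0])))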
+ + @pytest.mark.cpu class TestIPC: def test_pack_unpack_runner_output_shm(self): @@ -458,6 +704,15 @@ def test_pack_unpack_runner_output_shm(self): class TestSupportedPipelines: """Step-execution protocol checks for supported pipelines.""" + def test_default_stage_config_includes_step_execution(self): + stage_cfg = AsyncOmniEngine._create_default_diffusion_stage_cfg( + { + "step_execution": True, + } + )[0] + + assert stage_cfg["engine_args"]["step_execution"] is True + def test_qwen_image_supports_step_execution(self): from vllm_omni.diffusion.models.interface import SupportsStepExecution, supports_step_execution from vllm_omni.diffusion.models.qwen_image.pipeline_qwen_image import QwenImagePipeline diff --git a/tests/diffusion/test_multiproc_engine_concurrency.py b/tests/diffusion/test_multiproc_engine_concurrency.py index adb8dc338c..517f98ddaa 100644 --- a/tests/diffusion/test_multiproc_engine_concurrency.py +++ b/tests/diffusion/test_multiproc_engine_concurrency.py @@ -66,7 +66,9 @@ def _make_engine(num_gpus: int = 1): sched.initialize(Mock()) engine.scheduler = sched engine.executor = executor - engine._rpc_lock = threading.Lock() + engine._rpc_lock = threading.RLock() + engine.abort_queue = queue.Queue() + engine.execute_fn = executor.execute_request return engine, executor, req_q, res_q @@ -80,7 +82,7 @@ def _run(): req = req_q.get(timeout=10) method = req.get("method", "") args = req.get("args", ()) - if method == "generate" and args and hasattr(args[0], "request_ids"): + if method in {"generate", "execute_model"} and args and hasattr(args[0], "request_ids"): tag = f"result_for_{args[0].request_ids[0]}" elif args: tag = f"result_for_{args[0]}" @@ -116,11 +118,11 @@ def _controlled(item): return a_enqueued, b_complete -# ──────────────────── bug-reproduction: concurrent add_req ──────────────── +# ───────────────── concurrent request execution ───────────────── -class TestConcurrentAddReqBug: - """Two concurrent ``add_req_and_wait_for_response()`` calls swap results.""" +class TestConcurrentRequestExecution: + """Concurrent request execution should not swap results.""" def test_results_are_correctly_routed(self): engine, executor, req_q, res_q = _make_engine() @@ -151,11 +153,11 @@ def _b(): assert results["B"].error == "result_for_B" -# ──────────────── bug-reproduction: concurrent collective_rpc ───────────── +# ───────────────── concurrent collective RPC ───────────────── -class TestConcurrentCollectiveRpcBug: - """Two concurrent ``collective_rpc()`` calls swap results.""" +class TestConcurrentCollectiveRpc: + """Concurrent ``collective_rpc()`` calls should not swap results.""" def test_results_are_correctly_routed(self): engine, executor, req_q, res_q = _make_engine() @@ -192,11 +194,11 @@ def _b(): assert results["B"].error == "result_for_call_B" -# ──────── bug-reproduction: add_req vs collective_rpc concurrently ──────── +# ──────────── concurrent request execution and collective RPC ──────────── -class TestConcurrentAddReqVsCollectiveRpcBug: - """``add_req`` and ``collective_rpc`` running concurrently swap results.""" +class TestConcurrentRequestExecutionAndCollectiveRpc: + """Request execution and ``collective_rpc()`` should not swap results.""" def test_results_are_correctly_routed(self): engine, executor, req_q, res_q = _make_engine() @@ -205,7 +207,7 @@ def test_results_are_correctly_routed(self): results: dict[str, object] = {} - def _a(): # add_req path + def _a(): # request execution path results["A"] = engine.add_req_and_wait_for_response(_mock_request("A")) def 
_b(): # collective_rpc path @@ -230,10 +232,10 @@ def _b(): # collective_rpc path assert results["B"].error == "result_for_call_B" -# ─────────────── backward-compatibility (serial) tests ──────────────────── +# ─────────────────────── serial operation coverage ─────────────────────── -class TestSerialOperations: +class TestSerialEngineOperations: """Verify correct behaviour for single-threaded (serial) usage. These tests must pass both **before** and **after** any concurrency fix @@ -385,18 +387,18 @@ def _hanging_dequeue(timeout=None): executor._result_mq.dequeue = _hanging_dequeue - # Thread running add_req — acquires the lock, enqueues, then + # Thread running request execution — acquires the lock, enqueues, then # blocks on dequeue forever (worker hang). - def _stalled_add_req(): + def _stalled_request_execution(): try: engine.add_req_and_wait_for_response(_mock_request("stalled")) except Exception: pass - t = threading.Thread(target=_stalled_add_req, daemon=True) + t = threading.Thread(target=_stalled_request_execution, daemon=True) t.start() - # Wait until add_req is truly inside the lock and blocking. + # Wait until request execution is truly inside the lock and blocking. add_req_blocked.wait(5) # collective_rpc should time out at lock acquisition, not hang. diff --git a/tests/e2e/online_serving/test_qwen_image_expansion.py b/tests/e2e/online_serving/test_qwen_image_expansion.py index e5bcde417e..6d6d236016 100644 --- a/tests/e2e/online_serving/test_qwen_image_expansion.py +++ b/tests/e2e/online_serving/test_qwen_image_expansion.py @@ -28,6 +28,11 @@ def _get_diffusion_feature_cases(model: str): return [ + pytest.param( + OmniServerParams(model=model, server_args=["--step-execution"]), + id="step_execution", + marks=SINGLE_CARD_FEATURE_MARKS, + ), pytest.param( OmniServerParams(model=model, server_args=["--cache-backend", "tea_cache"]), id="cache_tea_cache", diff --git a/tests/entrypoints/test_async_omni_abort.py b/tests/entrypoints/test_async_omni_abort.py new file mode 100644 index 0000000000..71f3e99feb --- /dev/null +++ b/tests/entrypoints/test_async_omni_abort.py @@ -0,0 +1,85 @@ +import asyncio +from types import SimpleNamespace + +import pytest + +from vllm_omni.entrypoints.async_omni import AsyncOmni + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def test_generate_accepts_request_after_repeated_cancellations(): + async def run_test(): + submitted_request_ids = [] + aborted_request_batches = [] + + async def fake_add_request_async(*, request_id, prompt, sampling_params_list, final_stage_id): + del prompt, sampling_params_list, final_stage_id + submitted_request_ids.append(request_id) + + async def fake_abort_async(request_ids): + aborted_request_batches.append(list(request_ids)) + + async def fake_process_results(request_id, metrics, final_stage_id_for_e2e, req_start_ts, wall_start_ts): + del metrics, final_stage_id_for_e2e, req_start_ts, wall_start_ts + if request_id.startswith("cancel-"): + await asyncio.Future() + return + yield SimpleNamespace( + stage_id=0, + request_output=SimpleNamespace(outputs=[]), + finished=True, + ) + + async def collect_outputs(request_id): + outputs = [] + async for output in AsyncOmni.generate( + omni, + prompt={"prompt": "prompt"}, + request_id=request_id, + sampling_params_list=[SimpleNamespace()], + output_modalities=["image"], + ): + outputs.append(output) + return outputs + + omni = object.__new__(AsyncOmni) + omni._pause_cond = asyncio.Condition() + omni._paused = False + omni.engine = SimpleNamespace( + num_stages=1, + 
add_request_async=fake_add_request_async, + abort_async=fake_abort_async, + ) + omni.log_stats = False + omni.request_states = {} + omni._final_output_handler = lambda: None + omni.resolve_sampling_params_list = lambda params: params + omni._compute_final_stage_id = lambda output_modalities: 0 + omni._process_orchestrator_results = fake_process_results + omni._log_summary_and_cleanup = lambda request_id: omni.request_states.pop(request_id, None) + + assert len(await collect_outputs("baseline")) == 1 + + for idx in range(3): + task = asyncio.create_task(collect_outputs(f"cancel-{idx}")) + await asyncio.sleep(0) + task.cancel() + with pytest.raises(asyncio.CancelledError): + await task + + assert len(await collect_outputs("after-cancel")) == 1 + assert submitted_request_ids == [ + "baseline", + "cancel-0", + "cancel-1", + "cancel-2", + "after-cancel", + ] + assert aborted_request_batches == [ + ["cancel-0"], + ["cancel-1"], + ["cancel-2"], + ] + + asyncio.run(run_test()) diff --git a/tests/entrypoints/test_async_omni_diffusion.py b/tests/entrypoints/test_async_omni_diffusion.py index c0eae0992f..c8aaae4f94 100644 --- a/tests/entrypoints/test_async_omni_diffusion.py +++ b/tests/entrypoints/test_async_omni_diffusion.py @@ -1,9 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio +import threading +from concurrent.futures import ThreadPoolExecutor +from types import SimpleNamespace +from unittest.mock import Mock + import pytest +import vllm_omni.diffusion.stage_diffusion_client as stage_diffusion_client_module +from vllm_omni.diffusion.data import DiffusionRequestAbortedError +from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient from vllm_omni.entrypoints.async_omni_diffusion import AsyncOmniDiffusion +from vllm_omni.inputs.data import OmniDiffusionSamplingParams pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -13,3 +23,91 @@ def test_get_diffusion_od_config_returns_direct_config(): diffusion.od_config = object() assert diffusion.get_diffusion_od_config() is diffusion.od_config + + +def test_async_omni_diffusion_generate_aborts_engine_on_cancel(): + async def run_test(): + started = threading.Event() + release = threading.Event() + abort = Mock() + + def step(request): + del request + started.set() + release.wait(timeout=5) + return [SimpleNamespace(request_id="req-1")] + + diffusion = object.__new__(AsyncOmniDiffusion) + diffusion.engine = SimpleNamespace(step=step, abort=abort) + diffusion._executor = ThreadPoolExecutor(max_workers=1) + + task = asyncio.create_task( + diffusion.generate( + prompt="hello", + sampling_params=OmniDiffusionSamplingParams(), + request_id="req-1", + ) + ) + try: + assert await asyncio.to_thread(started.wait, 1) + task.cancel() + with pytest.raises(asyncio.CancelledError): + await task + finally: + release.set() + diffusion._executor.shutdown(wait=True) + + abort.assert_called_once_with("req-1") + + asyncio.run(run_test()) + + +def test_stage_diffusion_client_abort_requests_forwards_to_engine(): + async def run_test(): + aborted_request_ids: list[list[str]] = [] + + async def abort(request_ids): + aborted_request_ids.append(request_ids) + + client = object.__new__(StageDiffusionClient) + client._engine = SimpleNamespace(abort=abort) + client._tasks = {} + + task = asyncio.create_task(asyncio.sleep(60)) + client._tasks["req-1"] = task + + await client.abort_requests_async(["req-1", "req-2"]) + + with pytest.raises(asyncio.CancelledError): + await task + 
assert client._tasks == {} + assert aborted_request_ids == [["req-1", "req-2"]] + + asyncio.run(run_test()) + + +def test_stage_diffusion_client_run_treats_abort_as_normal_path(monkeypatch): + async def run_test(): + async def generate(prompt, sampling_params, request_id): + del prompt, sampling_params + raise DiffusionRequestAbortedError(f"Request {request_id} aborted.") + + info = Mock() + exception = Mock() + monkeypatch.setattr(stage_diffusion_client_module.logger, "info", info) + monkeypatch.setattr(stage_diffusion_client_module.logger, "exception", exception) + + client = object.__new__(StageDiffusionClient) + client.stage_id = 3 + client._engine = SimpleNamespace(generate=generate) + client._output_queue = asyncio.Queue() + client._tasks = {"req-1": object()} + + await client._run("req-1", "prompt", OmniDiffusionSamplingParams()) + + assert client._output_queue.empty() + assert client._tasks == {} + info.assert_called_once() + exception.assert_not_called() + + asyncio.run(run_test()) diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py index 488378b40f..12eb5ed3da 100644 --- a/vllm_omni/diffusion/data.py +++ b/vllm_omni/diffusion/data.py @@ -492,6 +492,9 @@ class OmniDiffusionConfig: # Step mode settings step_execution: bool = False + # Maximum number of sequences to generate in a batch + max_num_seqs: int = 1 + @property def is_moe(self) -> bool: num_experts = self.tf_model_config.get("num_experts", None) @@ -658,6 +661,8 @@ class DiffusionOutput: trajectory_latents: torch.Tensor | None = None trajectory_decoded: list[torch.Tensor] | None = None error: str | None = None + aborted: bool = False + abort_message: str | None = None post_process_func: Callable[..., Any] | None = None @@ -675,6 +680,10 @@ class DiffusionOutput: peak_memory_mb: float = 0.0 +class DiffusionRequestAbortedError(RuntimeError): + """Raised when a diffusion request ends via user-visible abort.""" + + class AttentionBackendEnum(enum.Enum): FA = enum.auto() SLIDING_TILE_ATTN = enum.auto() diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index ff0f753b40..308c8cef80 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + +import queue import threading import time from collections.abc import Iterable @@ -11,7 +14,11 @@ import torch from vllm.logger import init_logger -from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig +from vllm_omni.diffusion.data import ( + DiffusionOutput, + DiffusionRequestAbortedError, + OmniDiffusionConfig, +) from vllm_omni.diffusion.executor.abstract import DiffusionExecutor from vllm_omni.diffusion.registry import ( DiffusionModelRegistry, @@ -19,7 +26,9 @@ get_diffusion_pre_process_func, ) from vllm_omni.diffusion.request import OmniDiffusionRequest -from vllm_omni.diffusion.sched import RequestScheduler, SchedulerInterface +from vllm_omni.diffusion.sched import RequestScheduler, SchedulerInterface, StepScheduler +from vllm_omni.diffusion.sched.interface import DiffusionRequestStatus +from vllm_omni.diffusion.worker.utils import RunnerOutput from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniTextPrompt from vllm_omni.outputs import OmniRequestOutput @@ -72,9 +81,14 @@ def __init__( executor_class = DiffusionExecutor.get_class(od_config) self.executor = executor_class(od_config) - 
self.scheduler: SchedulerInterface = scheduler or RequestScheduler() + self.step_execution = bool(getattr(od_config, "step_execution", False)) + self.scheduler: SchedulerInterface = scheduler or ( + StepScheduler() if self.step_execution else RequestScheduler() + ) self.scheduler.initialize(od_config) - self._rpc_lock = threading.Lock() + self._rpc_lock = threading.RLock() + self.abort_queue: queue.Queue[str] = queue.Queue() + self.execute_fn = self.executor.execute_step if self.step_execution else self.executor.execute_request try: self._dummy_run() @@ -98,6 +112,8 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: output = self.add_req_and_wait_for_response(request) exec_total_time = time.perf_counter() - exec_start_time + if output.aborted: + raise DiffusionRequestAbortedError(output.abort_message or "Diffusion request aborted.") if output.error: raise Exception(f"{output.error}") logger.info("Generation completed successfully.") @@ -264,7 +280,7 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: def make_engine( config: OmniDiffusionConfig, scheduler: SchedulerInterface | None = None, - ) -> "DiffusionEngine": + ) -> DiffusionEngine: """Factory method to create a DiffusionEngine instance. Args: @@ -281,8 +297,11 @@ def add_req_and_wait_for_response(self, request: OmniDiffusionRequest) -> Diffus # keep scheduling and executing until the target request is finished while True: + self._process_aborts_queue() sched_output = self.scheduler.schedule() if sched_output.is_empty: + if target_sched_req_id in sched_output.finished_req_ids: + return self._finalize_finished_request(target_sched_req_id) if not self.scheduler.has_requests(): raise RuntimeError("Diffusion scheduler has no runnable requests.") continue @@ -292,21 +311,26 @@ def add_req_and_wait_for_response(self, request: OmniDiffusionRequest) -> Diffus # vllm_omni/diffusion/sched/base_scheduler.py), so we directly # take the single scheduled request here. sched_req_id = sched_output.scheduled_req_ids[0] - req = sched_output.scheduled_new_reqs[0].req try: - output = self.executor.add_req(req) + runner_output = self.execute_fn(sched_output) except Exception as exc: - logger.error( - "Execution failed for diffusion request %s", - sched_req_id, - exc_info=True, + logger.error("Execution failed for diffusion request %s", sched_req_id, exc_info=True) + runner_output = RunnerOutput( + req_id=sched_req_id, + step_index=None, + finished=True, + result=DiffusionOutput(error=str(exc)), ) - output = DiffusionOutput(error=str(exc)) - finished_req_ids = self.scheduler.update_from_output(sched_output, output) + self._process_aborts_queue() + + finished_req_ids = self.scheduler.update_from_output(sched_output, runner_output) if target_sched_req_id in finished_req_ids: - self.scheduler.pop_request_state(target_sched_req_id) - return output + return self._finalize_finished_request( + target_sched_req_id, + runner_output=runner_output, + missing_result_error="Diffusion execution finished without a final output.", + ) def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None: """Start or stop torch profiling on all diffusion workers. 
@@ -437,6 +461,55 @@ def close(self) -> None: self.executor.shutdown() def abort(self, request_id: str | Iterable[str]) -> None: - # TODO implement it - logger.warning("DiffusionEngine abort is not implemented yet") - pass + request_ids = [request_id] if isinstance(request_id, str) else list(request_id) + for req_id in request_ids: + self.abort_queue.put(req_id) + + def _process_aborts_queue(self) -> None: + if self.abort_queue.empty(): + return + + request_ids: list[str] = [] + while not self.abort_queue.empty(): + ids = self.abort_queue.get_nowait() + request_ids.extend((ids,) if isinstance(ids, str) else ids) + + self._abort_requests(request_ids) + + def _abort_requests(self, request_ids: str | Iterable[str]) -> None: + request_ids = [request_ids] if isinstance(request_ids, str) else list(request_ids) + + sched_req_ids: list[str] = [] + for request_id in dict.fromkeys(request_ids): + sched_req_id = self.scheduler.get_sched_req_id(request_id) + if sched_req_id is not None: + sched_req_ids.append(sched_req_id) + + for sched_req_id in dict.fromkeys(sched_req_ids): + if self.scheduler.get_request_state(sched_req_id) is not None: + self.scheduler.finish_requests(sched_req_id, DiffusionRequestStatus.FINISHED_ABORTED) + + def _finalize_finished_request( + self, + sched_req_id: str, + runner_output: RunnerOutput | None = None, + missing_result_error: str = "Diffusion scheduler finished target request without execution output.", + ) -> DiffusionOutput: + state = self.scheduler.get_request_state(sched_req_id) + popped_state = self.scheduler.pop_request_state(sched_req_id) + state = state or popped_state + + if state is None: + raise RuntimeError(f"Diffusion scheduler lost state for request {sched_req_id}.") + + if state.status == DiffusionRequestStatus.FINISHED_ABORTED: + request_id = state.req.request_ids[0] if state.req.request_ids else sched_req_id + return DiffusionOutput( + aborted=True, + abort_message=f"Request {request_id} aborted.", + ) + + if runner_output is not None and runner_output.result is not None: + return runner_output.result + + return DiffusionOutput(error=missing_result_error) diff --git a/vllm_omni/diffusion/executor/abstract.py b/vllm_omni/diffusion/executor/abstract.py index e41f41d119..564980f660 100644 --- a/vllm_omni/diffusion/executor/abstract.py +++ b/vllm_omni/diffusion/executor/abstract.py @@ -1,11 +1,17 @@ +from __future__ import annotations + from abc import ABC, abstractmethod -from typing import Any +from typing import TYPE_CHECKING, Any from vllm.utils.import_utils import resolve_obj_by_qualname from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig from vllm_omni.diffusion.request import OmniDiffusionRequest +if TYPE_CHECKING: + from vllm_omni.diffusion.sched.interface import DiffusionSchedulerOutput + from vllm_omni.diffusion.worker.utils import RunnerOutput + class DiffusionExecutor(ABC): """Abstract base class for Diffusion executors.""" @@ -13,7 +19,7 @@ class DiffusionExecutor(ABC): uses_multiproc: bool = False @staticmethod - def get_class(od_config: OmniDiffusionConfig) -> type["DiffusionExecutor"]: + def get_class(od_config: OmniDiffusionConfig) -> type[DiffusionExecutor]: executor_class: type[DiffusionExecutor] distributed_executor_backend = od_config.distributed_executor_backend @@ -63,6 +69,16 @@ def add_req(self, requests: OmniDiffusionRequest) -> DiffusionOutput: """Add requests to the execution queue.""" pass + @abstractmethod + def execute_request(self, scheduler_output: DiffusionSchedulerOutput) -> RunnerOutput: + """Execute 
request-mode work from a scheduler output.""" + pass + + @abstractmethod + def execute_step(self, scheduler_output: DiffusionSchedulerOutput) -> RunnerOutput: + """Execute step-mode work from a scheduler output.""" + pass + @abstractmethod def collective_rpc( self, @@ -71,6 +87,7 @@ def collective_rpc( args: tuple = (), kwargs: dict | None = None, unique_reply_rank: int | None = None, + exec_all_ranks: bool = False, ) -> Any: """Execute a method on workers.""" pass diff --git a/vllm_omni/diffusion/executor/multiproc_executor.py b/vllm_omni/diffusion/executor/multiproc_executor.py index 1756633ba6..e55a464fb4 100644 --- a/vllm_omni/diffusion/executor/multiproc_executor.py +++ b/vllm_omni/diffusion/executor/multiproc_executor.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import multiprocessing as mp import time import weakref from dataclasses import dataclass -from typing import Any +from typing import TYPE_CHECKING, Any import zmq from vllm.distributed.device_communicators.shm_broadcast import MessageQueue @@ -14,6 +16,10 @@ from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.diffusion.worker import WorkerProc +if TYPE_CHECKING: + from vllm_omni.diffusion.sched.interface import DiffusionSchedulerOutput + from vllm_omni.diffusion.worker.utils import RunnerOutput + logger = init_logger(__name__) @@ -190,6 +196,61 @@ def add_req(self, request: OmniDiffusionRequest) -> DiffusionOutput: logger.error(f"Generate call failed: {e}") raise + def execute_request(self, scheduler_output: DiffusionSchedulerOutput) -> RunnerOutput: + """Adapt request-mode scheduler output to worker execute_model RPC.""" + from vllm_omni.diffusion.worker.utils import RunnerOutput + + self._ensure_open() + if scheduler_output.num_scheduled_reqs != 1: + raise ValueError( + f"Request mode currently supports batch_size=1, " + f"but got {scheduler_output.num_scheduled_reqs} scheduled requests." + ) + + new_req = scheduler_output.scheduled_new_reqs[0] + result = self.collective_rpc( + "execute_model", + args=(new_req.req, self.od_config), + unique_reply_rank=0, + exec_all_ranks=True, + ) + if not isinstance(result, DiffusionOutput): + raise RuntimeError(f"Unexpected response type for execute_request: {type(result)!r}") + + return RunnerOutput( + req_id=new_req.sched_req_id, + step_index=None, + finished=True, + result=result, + ) + + def execute_step(self, scheduler_output: DiffusionSchedulerOutput) -> RunnerOutput: + """Forward step-mode scheduler output to worker execute_stepwise RPC.""" + from vllm_omni.diffusion.worker.utils import RunnerOutput + + self._ensure_open() + result = self.collective_rpc( + "execute_stepwise", + args=(scheduler_output,), + unique_reply_rank=0, + exec_all_ranks=True, + ) + + if isinstance(result, RunnerOutput): + return result + # TODO: Remove this fallback; DiffusionOutput cannot faithfully represent + # failed multi-request step batches. 
+ if isinstance(result, DiffusionOutput): + req_id = scheduler_output.scheduled_req_ids[0] if scheduler_output.scheduled_req_ids else "" + return RunnerOutput( + req_id=req_id, + step_index=None, + finished=True, + result=result, + ) + else: + raise RuntimeError(f"Unexpected response type for execute_step: {type(result)!r}") + def collective_rpc( self, method: str, @@ -197,6 +258,7 @@ def collective_rpc( args: tuple = (), kwargs: dict | None = None, unique_reply_rank: int | None = None, + exec_all_ranks: bool = False, ) -> Any: self._ensure_open() @@ -212,7 +274,7 @@ def collective_rpc( "args": args, "kwargs": kwargs, "output_rank": unique_reply_rank if unique_reply_rank is not None else 0, - "exec_all_ranks": unique_reply_rank is None, + "exec_all_ranks": unique_reply_rank is None or exec_all_ranks, } try: @@ -228,6 +290,11 @@ def collective_rpc( try: response = self._result_mq.dequeue(timeout=dequeue_timeout) + try: + unpack_diffusion_output_shm(response) + except Exception as e: + logger.warning("SHM unpack failed (data may already be inline): %s", e) + # Check if response indicates an error if isinstance(response, dict) and response.get("status") == "error": raise RuntimeError( diff --git a/vllm_omni/diffusion/lora/manager.py b/vllm_omni/diffusion/lora/manager.py index 1466a33584..5f75e26cb1 100644 --- a/vllm_omni/diffusion/lora/manager.py +++ b/vllm_omni/diffusion/lora/manager.py @@ -218,10 +218,16 @@ def set_active_adapter(self, lora_request: LoRARequest | None, lora_scale: float lora_scale: The external scale for the LoRA adapter. """ if lora_request is None: + if self._active_adapter_id is None: + logger.debug("No lora_request provided and adapters are already inactive") + return logger.debug("No lora_request provided, deactivating all LoRA adapters") self._deactivate_all_adapters() return elif math.isclose(0.0, lora_scale): + if self._active_adapter_id is None: + logger.debug("Received LoRA scale 0 with adapters already inactive") + return logger.warning("Received a request with LoRA scale 0; deactivating all LoRA adapters") self._deactivate_all_adapters() return @@ -605,6 +611,9 @@ def _activate_adapter(self, adapter_id: int, scale: float) -> None: self._update_adapter_scale(adapter_id, scale) def _deactivate_all_adapters(self) -> None: + if self._active_adapter_id is None: + logger.debug("All adapters already inactive") + return logger.info("Deactivating all adapters: %d layers", len(self._lora_modules)) for lora_layer in self._lora_modules.values(): lora_layer.reset_lora(0) diff --git a/vllm_omni/diffusion/sched/__init__.py b/vllm_omni/diffusion/sched/__init__.py index 650a1a1e6f..e026373384 100644 --- a/vllm_omni/diffusion/sched/__init__.py +++ b/vllm_omni/diffusion/sched/__init__.py @@ -10,16 +10,18 @@ SchedulerInterface, ) from vllm_omni.diffusion.sched.request_scheduler import RequestScheduler +from vllm_omni.diffusion.sched.step_scheduler import StepScheduler Scheduler = RequestScheduler __all__ = [ + "DiffusionRequestStatus", "CachedRequestData", "DiffusionRequestState", - "DiffusionRequestStatus", "DiffusionSchedulerOutput", "NewRequestData", + "SchedulerInterface", "RequestScheduler", + "StepScheduler", "Scheduler", - "SchedulerInterface", ] diff --git a/vllm_omni/diffusion/sched/base_scheduler.py b/vllm_omni/diffusion/sched/base_scheduler.py index a59fa50d1e..6a7ee3d3ef 100644 --- a/vllm_omni/diffusion/sched/base_scheduler.py +++ b/vllm_omni/diffusion/sched/base_scheduler.py @@ -5,13 +5,21 @@ from collections import deque +from vllm.logger import init_logger + from 
vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.diffusion.sched.interface import ( + CachedRequestData, DiffusionRequestState, DiffusionRequestStatus, + DiffusionSchedulerOutput, + NewRequestData, SchedulerInterface, ) +logger = init_logger(__name__) + class _BaseScheduler(SchedulerInterface): """Shared queue/state bookkeeping for diffusion schedulers.""" @@ -24,8 +32,6 @@ def __init__(self) -> None: self._waiting: deque[str] = deque() self._running: list[str] = [] self._finished_req_ids: set[str] = set() - # The current DiffusionEngine execution mode does not support real - # request batching well, so we keep this fixed at 1 for now. self._max_batch_size: int = 1 def initialize(self, od_config: OmniDiffusionConfig) -> None: @@ -36,8 +42,67 @@ def initialize(self, od_config: OmniDiffusionConfig) -> None: self._waiting.clear() self._running.clear() self._finished_req_ids.clear() + # The current DiffusionEngine execution mode does not support real + # request batching well, so we keep this fixed at 1 for now. + # TODO: Add support for multiple concurrent requests + self.max_num_running_reqs = 1 self._reset_scheduler_state() + def add_request(self, request: OmniDiffusionRequest) -> str: + sched_req_id = self._make_sched_req_id(request) + return self._add_request_with_sched_req_id(sched_req_id, request) + + def _add_request_with_sched_req_id(self, sched_req_id: str, request: OmniDiffusionRequest) -> str: + state = DiffusionRequestState(sched_req_id=sched_req_id, req=request) + self._request_states[sched_req_id] = state + self._register_request_ids(request.request_ids, sched_req_id) + self._waiting.append(sched_req_id) + logger.debug("%s add_request: %s (waiting=%d)", self.__class__.__name__, sched_req_id, len(self._waiting)) + return sched_req_id + + def schedule(self) -> DiffusionSchedulerOutput: + scheduled_new_reqs: list[NewRequestData] = [] + scheduled_cached_req_ids: list[str] = [] + + # First, schedule the RUNNING request(s) + for sched_req_id in self._running: + state = self._request_states.get(sched_req_id) + if state is not None: + scheduled_cached_req_ids.append(sched_req_id) + + # Second, schedule WAITING requests while capacity remains. 
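+        # Admission is FIFO; with max_num_running_reqs fixed at 1 for now, at most one
+        # request is in flight and the remainder stays queued in self._waiting.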
+ while self._waiting and len(self._running) < self.max_num_running_reqs: + sched_req_id = self._waiting[0] + state = self._request_states.get(sched_req_id) + if state is None: + self._waiting.popleft() + continue + if not self._can_schedule_waiting(state): + break + + self._waiting.popleft() + was_new_request = state.status == DiffusionRequestStatus.WAITING + state.status = DiffusionRequestStatus.RUNNING + self._running.append(sched_req_id) + if was_new_request: + scheduled_new_reqs.append(NewRequestData.from_state(state)) + else: + scheduled_cached_req_ids.append(sched_req_id) + + scheduler_output = DiffusionSchedulerOutput( + step_id=self._step_id, + scheduled_new_reqs=scheduled_new_reqs, + scheduled_cached_reqs=CachedRequestData(sched_req_ids=scheduled_cached_req_ids), + finished_req_ids=set(self._finished_req_ids), + num_running_reqs=len(self._running), + num_waiting_reqs=len(self._waiting), + ) + + # update after schedule + self._step_id += 1 + self._finished_req_ids.clear() + return scheduler_output + def has_requests(self) -> bool: return bool(self._waiting or self._running) @@ -121,12 +186,32 @@ def _finish_requests( self._finished_req_ids |= finished_req_ids return finished_req_ids + def _finalize_update_from_output( + self, + sched_output: DiffusionSchedulerOutput, + statuses: dict[str, DiffusionRequestStatus], + errors: dict[str, str | None] | None = None, + ) -> set[str]: + # A scheduled request may be aborted after schedule() but before + # update_from_output() processes the runner output. It is already + # marked finished at that point, but we still need to surface its id + # in this update so the engine can observe the terminal state. + finished_req_ids = { + sched_req_id for sched_req_id in sched_output.scheduled_req_ids if sched_req_id in self._finished_req_ids + } + finished_req_ids |= self._finish_requests(statuses, errors) + return finished_req_ids + def _reset_scheduler_state(self) -> None: """Reset subclass-owned state during initialize()/close().""" def _pop_extra_request_state(self, sched_req_id: str) -> None: """Remove subclass-owned per-request state before popping request state.""" + def _can_schedule_waiting(self, state: DiffusionRequestState) -> bool: + del state + return True + def _register_request_ids(self, request_ids: list[str], sched_req_id: str) -> None: for request_id in request_ids: existing = self._request_id_to_sched_req_id.get(request_id) diff --git a/vllm_omni/diffusion/sched/interface.py b/vllm_omni/diffusion/sched/interface.py index 427cad03d0..4db6f41355 100644 --- a/vllm_omni/diffusion/sched/interface.py +++ b/vllm_omni/diffusion/sched/interface.py @@ -8,12 +8,16 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from functools import cached_property +from typing import TYPE_CHECKING from vllm.logger import init_logger -from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig +from vllm_omni.diffusion.data import OmniDiffusionConfig from vllm_omni.diffusion.request import OmniDiffusionRequest +if TYPE_CHECKING: + from vllm_omni.diffusion.worker.utils import RunnerOutput + logger = init_logger(__name__) @@ -141,7 +145,7 @@ def schedule(self) -> DiffusionSchedulerOutput: """Run one scheduling cycle.""" @abstractmethod - def update_from_output(self, sched_output: DiffusionSchedulerOutput, output: DiffusionOutput) -> set[str]: + def update_from_output(self, sched_output: DiffusionSchedulerOutput, output: RunnerOutput) -> set[str]: """Update scheduler state from executor output.""" @abstractmethod diff --git 
a/vllm_omni/diffusion/sched/request_scheduler.py b/vllm_omni/diffusion/sched/request_scheduler.py index ed8316ee58..f641648e96 100644 --- a/vllm_omni/diffusion/sched/request_scheduler.py +++ b/vllm_omni/diffusion/sched/request_scheduler.py @@ -3,103 +3,48 @@ from __future__ import annotations -from vllm.logger import init_logger +from typing import TYPE_CHECKING -from vllm_omni.diffusion.data import DiffusionOutput from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.diffusion.sched.base_scheduler import _BaseScheduler from vllm_omni.diffusion.sched.interface import ( - CachedRequestData, - DiffusionRequestState, DiffusionRequestStatus, DiffusionSchedulerOutput, - NewRequestData, ) -logger = init_logger(__name__) +if TYPE_CHECKING: + from vllm_omni.diffusion.worker.utils import RunnerOutput class RequestScheduler(_BaseScheduler): """Diffusion scheduler with vLLM-style waiting/running queues.""" def add_request(self, request: OmniDiffusionRequest) -> str: - sched_req_id = self._make_sched_req_id(request) - state = DiffusionRequestState(sched_req_id=sched_req_id, req=request) - self._request_states[sched_req_id] = state - self._register_request_ids(request.request_ids, sched_req_id) - self._waiting.append(sched_req_id) - logger.debug("Scheduler add_request: %s (waiting=%d)", sched_req_id, len(self._waiting)) - return sched_req_id + return super().add_request(request) def schedule(self) -> DiffusionSchedulerOutput: - scheduled_new_reqs: list[NewRequestData] = [] - scheduled_cached_req_ids: list[str] = [] + return super().schedule() - # First, schedule the RUNNING request(s) - for sched_req_id in self._running: - state = self._request_states.get(sched_req_id) - if state is not None: - scheduled_cached_req_ids.append(sched_req_id) - - # Second, schedule WAITING requests while capacity remains. - while self._waiting and len(self._running) < self._max_batch_size: - sched_req_id = self._waiting.popleft() - state = self._request_states.get(sched_req_id) - if state is None: - continue - was_new_request = state.status == DiffusionRequestStatus.WAITING - state.status = DiffusionRequestStatus.RUNNING - self._running.append(sched_req_id) - if was_new_request: - scheduled_new_reqs.append(NewRequestData.from_state(state)) - else: - scheduled_cached_req_ids.append(sched_req_id) - - scheduler_output = DiffusionSchedulerOutput( - step_id=self._step_id, - scheduled_new_reqs=scheduled_new_reqs, - scheduled_cached_reqs=CachedRequestData(sched_req_ids=scheduled_cached_req_ids), - finished_req_ids=set(self._finished_req_ids), - num_running_reqs=len(self._running), - num_waiting_reqs=len(self._waiting), - ) - - self._step_id += 1 - self._finished_req_ids.clear() - return scheduler_output - - def update_from_output(self, sched_output: DiffusionSchedulerOutput, output: DiffusionOutput) -> set[str]: + def update_from_output(self, sched_output: DiffusionSchedulerOutput, output: RunnerOutput) -> set[str]: scheduled_req_ids = sched_output.scheduled_req_ids if not scheduled_req_ids: return set() - # A scheduled request may be aborted after schedule() but before - # update_from_output() processes the runner output. It is already - # marked finished at that point, but we still need to surface its id - # in this update so the engine can observe the terminal state. 
- finished_req_ids = { - sched_req_id for sched_req_id in scheduled_req_ids if sched_req_id in self._finished_req_ids - } terminal_statuses: dict[str, DiffusionRequestStatus] = {} terminal_errors: dict[str, str | None] = {} - # NOTE: request-mode currently assumes one executor call produces one - # DiffusionOutput for the single scheduled request in this cycle. + result = output.result for sched_req_id in scheduled_req_ids: state = self._request_states.get(sched_req_id) if state is None or state.is_finished(): continue - if output.error: + if result is None: terminal_statuses[sched_req_id] = DiffusionRequestStatus.FINISHED_ERROR - terminal_errors[sched_req_id] = output.error + terminal_errors[sched_req_id] = "No output result" + elif result.error: + terminal_statuses[sched_req_id] = DiffusionRequestStatus.FINISHED_ERROR + terminal_errors[sched_req_id] = result.error else: terminal_statuses[sched_req_id] = DiffusionRequestStatus.FINISHED_COMPLETED terminal_errors[sched_req_id] = None - finished_req_ids |= self._finish_requests(terminal_statuses, terminal_errors) - return finished_req_ids - - def abort_request(self, sched_req_id: str) -> bool: - if self.get_request_state(sched_req_id) is None: - return False - self.finish_requests(sched_req_id, DiffusionRequestStatus.FINISHED_ABORTED) - return True + return self._finalize_update_from_output(sched_output, terminal_statuses, terminal_errors) diff --git a/vllm_omni/diffusion/sched/step_scheduler.py b/vllm_omni/diffusion/sched/step_scheduler.py new file mode 100644 index 0000000000..4d995dcf40 --- /dev/null +++ b/vllm_omni/diffusion/sched/step_scheduler.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +from vllm.logger import init_logger + +from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.diffusion.sched.base_scheduler import _BaseScheduler +from vllm_omni.diffusion.sched.interface import ( + DiffusionRequestStatus, + DiffusionSchedulerOutput, +) + +if TYPE_CHECKING: + from vllm_omni.diffusion.worker.utils import RunnerOutput + +logger = init_logger(__name__) + + +@dataclass +class _StepProgress: + current_step: int + total_steps: int + + +class StepScheduler(_BaseScheduler): + """Placeholder scheduler that advances a request one denoise step per update.""" + + def __init__(self) -> None: + super().__init__() + self._request_progress: dict[str, _StepProgress] = {} + + def _reset_scheduler_state(self) -> None: + self._request_progress.clear() + + def add_request(self, request: OmniDiffusionRequest) -> str: + sched_req_id = self._make_sched_req_id(request) + total_steps = self._get_total_steps(request) + if total_steps <= 0: + raise ValueError(f"Diffusion request {sched_req_id} must have positive total_steps, got {total_steps}") + + current_step = request.sampling_params.step_index or 0 + if current_step < 0 or current_step >= total_steps: + raise ValueError( + f"Diffusion request {sched_req_id} has invalid initial step_index {current_step} " + f"for total_steps={total_steps}" + ) + + request.sampling_params.step_index = current_step + sched_req_id = self._add_request_with_sched_req_id(sched_req_id, request) + self._request_progress[sched_req_id] = _StepProgress(current_step=current_step, total_steps=total_steps) + logger.debug( + "StepScheduler add_request: %s (step=%d/%d, waiting=%d)", + sched_req_id, + current_step, + 
total_steps, + len(self._waiting), + ) + return sched_req_id + + def schedule(self) -> DiffusionSchedulerOutput: + return super().schedule() + + def update_from_output(self, sched_output: DiffusionSchedulerOutput, output: RunnerOutput) -> set[str]: + scheduled_req_ids = sched_output.scheduled_req_ids + if not scheduled_req_ids: + return set() + + terminal_statuses: dict[str, DiffusionRequestStatus] = {} + terminal_errors: dict[str, str | None] = {} + output_error = output.result.error if output.result is not None else None + for sched_req_id in scheduled_req_ids: + state = self._request_states.get(sched_req_id) + progress = self._request_progress.get(sched_req_id) + if state is None or progress is None or state.is_finished(): + continue + + if output_error is not None: + terminal_statuses[sched_req_id] = DiffusionRequestStatus.FINISHED_ERROR + terminal_errors[sched_req_id] = output_error + continue + + if output.step_index is None: + logger.warning( + "Received RunnerOutput with no step_index for request %s, treating as error", + sched_req_id, + ) + terminal_statuses[sched_req_id] = DiffusionRequestStatus.FINISHED_ERROR + terminal_errors[sched_req_id] = "Missing step_index in RunnerOutput" + continue + + # We assume that the decoding stage is executed immediately after the denoising stage completes. + progress.current_step = output.step_index + state.req.sampling_params.step_index = output.step_index + if output.finished: + terminal_statuses[sched_req_id] = DiffusionRequestStatus.FINISHED_COMPLETED + terminal_errors[sched_req_id] = None + else: + state.error = None + + return self._finalize_update_from_output(sched_output, terminal_statuses, terminal_errors) + + def _pop_extra_request_state(self, sched_req_id: str) -> None: + self._request_progress.pop(sched_req_id, None) + + def _get_total_steps(self, request: OmniDiffusionRequest) -> int: + sampling = request.sampling_params + + if sampling.timesteps is not None: + return self._sequence_length(sampling.timesteps) + if sampling.sigmas is not None: + return len(sampling.sigmas) + return int(sampling.num_inference_steps) + + @staticmethod + def _sequence_length(values: Any) -> int: + ndim = getattr(values, "ndim", None) + if ndim == 0: + return 1 + + shape = getattr(values, "shape", None) + if shape is not None: + return int(shape[0]) + + return len(values) diff --git a/vllm_omni/diffusion/stage_diffusion_client.py b/vllm_omni/diffusion/stage_diffusion_client.py index 5a6fb6371f..ddad2f9f3f 100644 --- a/vllm_omni/diffusion/stage_diffusion_client.py +++ b/vllm_omni/diffusion/stage_diffusion_client.py @@ -12,6 +12,7 @@ from vllm.logger import init_logger +from vllm_omni.diffusion.data import DiffusionRequestAbortedError from vllm_omni.engine.stage_init_utils import StageMetadata from vllm_omni.entrypoints.async_omni_diffusion import AsyncOmniDiffusion from vllm_omni.outputs import OmniRequestOutput @@ -74,6 +75,20 @@ async def _run( try: result = await self._engine.generate(prompt, sampling_params, request_id) await self._output_queue.put(result) + except asyncio.CancelledError: + logger.info( + "[StageDiffusionClient] Stage-%s req=%s cancelled", + self.stage_id, + request_id, + ) + raise + except DiffusionRequestAbortedError as e: + logger.info( + "[StageDiffusionClient] Stage-%s req=%s aborted: %s", + self.stage_id, + request_id, + e, + ) except Exception as e: logger.exception( "[StageDiffusionClient] Stage-%s req=%s failed: %s", @@ -138,6 +153,7 @@ async def abort_requests_async(self, request_ids: list[str]) -> None: task = 
self._tasks.pop(rid, None) if task: task.cancel() + await self._engine.abort(request_ids) async def collective_rpc_async( self, diff --git a/vllm_omni/diffusion/worker/__init__.py b/vllm_omni/diffusion/worker/__init__.py index 8af0283857..80a7addf3c 100644 --- a/vllm_omni/diffusion/worker/__init__.py +++ b/vllm_omni/diffusion/worker/__init__.py @@ -2,14 +2,31 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Worker classes for diffusion models.""" -from vllm_omni.diffusion.worker.diffusion_model_runner import DiffusionModelRunner -from vllm_omni.diffusion.worker.diffusion_worker import ( - DiffusionWorker, - WorkerProc, -) +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from vllm_omni.diffusion.worker.diffusion_model_runner import DiffusionModelRunner + from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker, WorkerProc __all__ = [ "DiffusionModelRunner", "DiffusionWorker", "WorkerProc", ] + + +def __getattr__(name: str) -> Any: + if name == "DiffusionModelRunner": + from vllm_omni.diffusion.worker.diffusion_model_runner import DiffusionModelRunner + + return DiffusionModelRunner + if name in {"DiffusionWorker", "WorkerProc"}: + from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker, WorkerProc + + return { + "DiffusionWorker": DiffusionWorker, + "WorkerProc": WorkerProc, + }[name] + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index a4d87c96e4..9de3dc867f 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -860,6 +860,7 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list: "max_num_seqs": 1, "parallel_config": parallel_config, "model_class_name": kwargs.get("model_class_name", None), + "step_execution": kwargs.get("step_execution", False), "vae_use_slicing": kwargs.get("vae_use_slicing", False), "vae_use_tiling": kwargs.get("vae_use_tiling", False), "cache_backend": cache_backend, diff --git a/vllm_omni/entrypoints/async_omni_diffusion.py b/vllm_omni/entrypoints/async_omni_diffusion.py index a7a02eded6..674c3509d2 100644 --- a/vllm_omni/entrypoints/async_omni_diffusion.py +++ b/vllm_omni/entrypoints/async_omni_diffusion.py @@ -18,7 +18,11 @@ from vllm.logger import init_logger from vllm.transformers_utils.config import get_hf_file_to_dict -from vllm_omni.diffusion.data import OmniDiffusionConfig, TransformerConfig +from vllm_omni.diffusion.data import ( + DiffusionRequestAbortedError, + OmniDiffusionConfig, + TransformerConfig, +) from vllm_omni.diffusion.diffusion_engine import DiffusionEngine from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniPromptType @@ -308,6 +312,11 @@ async def generate( request, ) result = result[0] + except asyncio.CancelledError: + self.engine.abort(request_id) + raise + except DiffusionRequestAbortedError: + raise except Exception as e: logger.error("Generation failed for request %s: %s", request_id, e) raise RuntimeError(f"Diffusion generation failed: {e}") from e diff --git a/vllm_omni/entrypoints/cli/serve.py b/vllm_omni/entrypoints/cli/serve.py index f924d64c39..4e1c8d3a94 100644 --- a/vllm_omni/entrypoints/cli/serve.py +++ b/vllm_omni/entrypoints/cli/serve.py @@ -267,6 +267,11 @@ def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgu action="store_true", 
help="Enable cache-dit summary logging after diffusion forward passes.", ) + omni_config_group.add_argument( + "--step-execution", + action="store_true", + help="Enable per-step diffusion execution so running requests can be aborted between denoise steps.", + ) # VAE memory optimization parameters omni_config_group.add_argument( From bf5bd0a4c00feed487f5f5e70810de84fe3d4604 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Wed, 1 Apr 2026 17:29:02 +0800 Subject: [PATCH 013/204] [BugFix]: Fix bagel single-stage img2img fallback to text2img bug (#2397) Signed-off-by: princepride --- vllm_omni/diffusion/models/bagel/pipeline_bagel.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py index c4155a9fc8..aa4f0a74f0 100644 --- a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py +++ b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py @@ -387,7 +387,12 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: else: image_input = ( - None if isinstance(first_prompt, str) else (first_prompt.get("multi_modal_data") or {}).get("image") + None + if isinstance(first_prompt, str) + else ( + (first_prompt.get("multi_modal_data") or {}).get("image") + or (first_prompt.get("multi_modal_data") or {}).get("img2img") + ) ) if image_input and not isinstance(image_input, list): image_input = [image_input] From 3def008b324f636940953862daedec54b9021a87 Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Wed, 1 Apr 2026 17:55:00 +0800 Subject: [PATCH 014/204] [Feat] Add MUSA platform support for Moore Threads GPUs (#2337) Signed-off-by: Xiaodong Ye --- pyproject.toml | 4 + requirements/musa.txt | 4 + setup.py | 19 ++- tests/utils.py | 46 ++++++- .../diffusion/attention/backends/abstract.py | 12 ++ .../diffusion/attention/backends/utils/fa.py | 3 + vllm_omni/diffusion/layers/custom_op.py | 6 + vllm_omni/platforms/__init__.py | 17 +++ vllm_omni/platforms/interface.py | 4 + vllm_omni/platforms/musa/__init__.py | 6 + vllm_omni/platforms/musa/platform.py | 123 ++++++++++++++++++ vllm_omni/platforms/musa/worker/__init__.py | 9 ++ .../platforms/musa/worker/musa_ar_worker.py | 103 +++++++++++++++ .../musa/worker/musa_generation_worker.py | 106 +++++++++++++++ vllm_omni/profiler/omni_torch_profiler.py | 3 +- 15 files changed, 454 insertions(+), 11 deletions(-) create mode 100644 requirements/musa.txt create mode 100644 vllm_omni/platforms/musa/__init__.py create mode 100644 vllm_omni/platforms/musa/platform.py create mode 100644 vllm_omni/platforms/musa/worker/__init__.py create mode 100644 vllm_omni/platforms/musa/worker/musa_ar_worker.py create mode 100644 vllm_omni/platforms/musa/worker/musa_generation_worker.py diff --git a/pyproject.toml b/pyproject.toml index 43e9506fd0..15e7c6305a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -177,20 +177,24 @@ markers = [ "rocm: Tests that run on AMD/ROCm (auto-added)", "xpu: Tests that run on XPU (auto-added)", "npu: Tests that run on NPU/Ascend (auto-added)", + "musa: Tests that run on MUSA/Moore Threads (auto-added)", # specified computation resources marks (auto-added) "H100: Tests that require H100 GPU", "L4: Tests that require L4 GPU", "MI325: Tests that require MI325 GPU (AMD/ROCm)", + "S5000: Tests that require S5000 GPU (Moore Threads/MUSA)", "A2: Tests that require A2 NPU", "A3: Tests that require A3 NPU", "distributed_cuda: Tests that require multi cards on CUDA platform", "distributed_rocm: Tests that require 
multi cards on ROCm platform", "distributed_xpu: Tests that require multi cards on XPU platform", "distributed_npu: Tests that require multi cards on NPU platform", + "distributed_musa: Tests that require multi cards on MUSA platform", "skipif_cuda: Skip if the num of CUDA cards is less than the required", "skipif_rocm: Skip if the num of ROCm cards is less than the required", "skipif_xpu: Skip if the num of XPU cards is less than the required", "skipif_npu: Skip if the num of NPU cards is less than the required", + "skipif_musa: Skip if the num of MUSA cards is less than the required", # more detailed markers "slow: Slow tests (may skip in quick CI)", "benchmark: Benchmark tests", diff --git a/requirements/musa.txt b/requirements/musa.txt new file mode 100644 index 0000000000..112f326046 --- /dev/null +++ b/requirements/musa.txt @@ -0,0 +1,4 @@ +-r common.txt +# MUSA platform dependencies +torchada>=0.1.46 +onnxruntime>=1.23.2 diff --git a/setup.py b/setup.py index 4ff4936b43..057212d67f 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ This setup.py implements platform-aware dependency routing so users can run `pip install vllm-omni` and automatically receive the correct platform-specific -dependencies (CUDA/ROCm/CPU/XPU/NPU) without requiring extras like `[cuda]`. +dependencies (CUDA/ROCm/CPU/XPU/NPU/MUSA) without requiring extras like `[cuda]`. """ import os @@ -46,16 +46,16 @@ def detect_target_device() -> str: Priority order: 1. VLLM_OMNI_TARGET_DEVICE environment variable (highest priority) - 2. Torch backend detection (cuda, rocm, npu, xpu) + 2. Torch backend detection (cuda, rocm, npu, xpu, musa) 3. CPU fallback (default) Returns: - str: Device name ('cuda', 'rocm', 'npu', 'xpu', or 'cpu') + str: Device name ('cuda', 'rocm', 'npu', 'xpu', 'musa', or 'cpu') """ # Priority 1: Explicit override via environment variable target_device = os.environ.get("VLLM_OMNI_TARGET_DEVICE") if target_device: - valid_devices = ["cuda", "rocm", "npu", "xpu", "cpu"] + valid_devices = ["cuda", "rocm", "npu", "xpu", "musa", "cpu"] if target_device.lower() in valid_devices: print(f"Using target device from VLLM_OMNI_TARGET_DEVICE: {target_device.lower()}") return target_device.lower() @@ -97,6 +97,15 @@ def detect_target_device() -> str: except Exception: pass + # Check for MUSA (Moore Threads) + if hasattr(torch, "musa"): + try: + if torch.musa.is_available(): + print("Detected MUSA backend from torch") + return "musa" + except Exception: + pass + print("No GPU backend detected in torch, defaulting to CPU") return "cpu" @@ -152,6 +161,8 @@ def get_vllm_omni_version() -> str: version += f"{sep}npu" elif device == "xpu": version += f"{sep}xpu" + elif device == "musa": + version += f"{sep}musa" elif device == "cpu": version += f"{sep}cpu" else: diff --git a/tests/utils.py b/tests/utils.py index 72fc6639ac..84edbbf3d1 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -430,13 +430,41 @@ def xpu_marks(*, res: str, num_cards: int): return marks + [test_distributed] +def musa_marks(*, res: str, num_cards: int): + """ + Get a collection of pytest marks to apply for `@musa_test`. + + Args: + res: Resource type, e.g., "S5000". + num_cards: Number of GPU cards required. + + Returns: + List of pytest marks to apply. + """ + test_platform_detail = pytest.mark.musa + + if res == "S5000": + test_resource = pytest.mark.S5000 + else: + raise ValueError(f"Invalid MUSA resource type: {res}. 
Supported: S5000") + + marks = [test_resource, test_platform_detail] + + if num_cards == 1: + return marks + else: + test_distributed = pytest.mark.distributed_musa(num_cards=num_cards) + # TODO: add MUSA support for `skipif_musa` marker + return marks + [test_distributed] + + def gpu_marks(*, res: str, num_cards: int): """ Get a collection of pytest marks to apply for `@gpu_test`. Platform is automatically determined based on resource type. Args: - res: Resource type, e.g., "L4", "H100" for CUDA, or "MI325" for ROCm, or "B60" for XPU. + res: Resource type, e.g., "L4", "H100" for CUDA, or "MI325" for ROCm, or "B60" for XPU, or "S5000" for MUSA. num_cards: Number of GPU cards required. Returns: @@ -449,7 +477,9 @@ def gpu_marks(*, res: str, num_cards: int): return [test_platform] + rocm_marks(res=res, num_cards=num_cards) if res == "B60": return [test_platform] + xpu_marks(res=res, num_cards=num_cards) - raise ValueError(f"Invalid resource type: {res}. Supported: L4, H100, MI325") + if res == "S5000": + return [test_platform] + musa_marks(res=res, num_cards=num_cards) + raise ValueError(f"Invalid resource type: {res}. Supported: L4, H100, MI325, B60, S5000") def npu_marks(*, res: str, num_cards: int): @@ -476,13 +506,13 @@ def npu_marks(*, res: str, num_cards: int): def hardware_marks(*, res: dict[str, str], num_cards: int | dict[str, int] = 1): """ Get a collection of pytest marks to apply for `@hardware_test`, - including CUDA, ROCm, XPU, and NPU, + including CUDA, ROCm, XPU, NPU, and MUSA, based on the specified platforms and resources. """ # Validate platforms # Don't validate platform details in this decorator for platform, _ in res.items(): - if platform not in ("cuda", "rocm", "xpu", "npu"): + if platform not in ("cuda", "rocm", "xpu", "npu", "musa"): raise ValueError(f"Unsupported platform: {platform}") # Normalize num_cards @@ -505,6 +535,8 @@ def hardware_marks(*, res: dict[str, str], num_cards: int | dict[str, int] = 1): cards = num_cards_dict[platform] if platform == "cuda" or platform == "rocm" or platform == "xpu": marks = gpu_marks(res=resource, num_cards=cards) + elif platform == "musa": + marks = musa_marks(res=resource, num_cards=cards) elif platform == "npu": marks = npu_marks(res=resource, num_cards=cards) else: @@ -522,15 +554,17 @@ def hardware_test(*, res: dict[str, str], num_cards: int | dict[str, int] = 1): res: Mapping from platform to resource type. Supported platforms/resources: - cuda: L4, H100 - rocm: MI325 + - xpu: B60 - npu: A2, A3 + - musa: S5000 num_cards: Number of cards required. Can be: - int: same card count for all platforms (default: 1) - dict: per-platform card count, e.g., {"cuda": 2, "rocm": 2} Example: @hardware_test( - res={"cuda": "L4", "rocm": "MI325", "npu": "A2"}, - num_cards={"cuda": 2, "rocm": 2, "npu": 2}, + res={"cuda": "L4", "rocm": "MI325", "npu": "A2", "musa": "S5000"}, + num_cards={"cuda": 2, "rocm": 2, "npu": 2, "musa": 2}, ) def test_multi_platform(): ... 
diff --git a/vllm_omni/diffusion/attention/backends/abstract.py b/vllm_omni/diffusion/attention/backends/abstract.py index d0a62bcd9c..472fde422d 100644 --- a/vllm_omni/diffusion/attention/backends/abstract.py +++ b/vllm_omni/diffusion/attention/backends/abstract.py @@ -99,6 +99,8 @@ def forward( return self.forward_npu(query, key, value, attn_metadata) elif current_omni_platform.is_xpu(): return self.forward_xpu(query, key, value, attn_metadata) + elif current_omni_platform.is_musa(): + return self.forward_musa(query, key, value, attn_metadata) else: raise NotImplementedError(f"No forward implementation for platform: {current_omni_platform}") @@ -138,3 +140,13 @@ def forward_hip( ) -> torch.Tensor: # By default, HIP ops are compatible with CUDA ops. return self.forward_cuda(query, key, value, attn_metadata) + + def forward_musa( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_metadata: T | None = None, + ) -> torch.Tensor: + # By default, MUSA ops are compatible with CUDA ops. + return self.forward_cuda(query, key, value, attn_metadata) diff --git a/vllm_omni/diffusion/attention/backends/utils/fa.py b/vllm_omni/diffusion/attention/backends/utils/fa.py index 1474598d79..1fd47790f0 100644 --- a/vllm_omni/diffusion/attention/backends/utils/fa.py +++ b/vllm_omni/diffusion/attention/backends/utils/fa.py @@ -35,6 +35,9 @@ from vllm.v1.attention.backends.fa_utils import flash_attn_varlen_func # noqa: F401 except (ImportError, ModuleNotFoundError): pass +elif current_omni_platform.is_musa(): + # XXX (MUSA): Add MUSA-specific Flash Attention when available + pass else: # CUDA: try FA3 -> FA2 fallback chain # Try FA3 from fa3-fwd PyPI package diff --git a/vllm_omni/diffusion/layers/custom_op.py b/vllm_omni/diffusion/layers/custom_op.py index 321bcbf8ad..27e3bce1f2 100644 --- a/vllm_omni/diffusion/layers/custom_op.py +++ b/vllm_omni/diffusion/layers/custom_op.py @@ -25,6 +25,8 @@ def dispatch_forward(self) -> Callable: return self.forward_npu elif current_omni_platform.is_xpu(): return self.forward_xpu + elif current_omni_platform.is_musa(): + return self.forward_musa else: return self.forward_native @@ -51,3 +53,7 @@ def forward_xpu(self, *args, **kwargs): def forward_hip(self, *args, **kwargs): # By default, we assume that HIP ops are compatible with CUDA ops. return self.forward_cuda(*args, **kwargs) + + def forward_musa(self, *args, **kwargs): + # By default, we assume that MUSA ops are compatible with CUDA ops. 
+ return self.forward_cuda(*args, **kwargs) diff --git a/vllm_omni/platforms/__init__.py b/vllm_omni/platforms/__init__.py index ae29b71ed9..64a7cdb16f 100644 --- a/vllm_omni/platforms/__init__.py +++ b/vllm_omni/platforms/__init__.py @@ -105,11 +105,28 @@ def xpu_omni_platform_plugin() -> str | None: return "vllm_omni.platforms.xpu.platform.XPUOmniPlatform" if is_xpu else None +def musa_omni_platform_plugin() -> str | None: + """Check if MUSA OmniPlatform should be activated.""" + is_musa = False + logger.debug("Checking if MUSA OmniPlatform is available.") + try: + import torchada + + if torchada.is_musa_platform(): + is_musa = True + logger.debug("Confirmed MUSA OmniPlatform is available.") + except Exception as e: + logger.debug("MUSA OmniPlatform is not available because: %s", str(e)) + + return "vllm_omni.platforms.musa.platform.MUSAOmniPlatform" if is_musa else None + + builtin_omni_platform_plugins = { "cuda": cuda_omni_platform_plugin, "rocm": rocm_omni_platform_plugin, "npu": npu_omni_platform_plugin, "xpu": xpu_omni_platform_plugin, + "musa": musa_omni_platform_plugin, } diff --git a/vllm_omni/platforms/interface.py b/vllm_omni/platforms/interface.py index 7739cec78b..4325851e5f 100644 --- a/vllm_omni/platforms/interface.py +++ b/vllm_omni/platforms/interface.py @@ -15,6 +15,7 @@ class OmniPlatformEnum(Enum): ROCM = "rocm" NPU = "npu" XPU = "xpu" + MUSA = "musa" UNSPECIFIED = "unspecified" @@ -41,6 +42,9 @@ def is_cuda(self) -> bool: def is_rocm(self) -> bool: return self._omni_enum == OmniPlatformEnum.ROCM + def is_musa(self) -> bool: + return self._omni_enum == OmniPlatformEnum.MUSA + @classmethod def get_omni_ar_worker_cls(cls) -> str: raise NotImplementedError diff --git a/vllm_omni/platforms/musa/__init__.py b/vllm_omni/platforms/musa/__init__.py new file mode 100644 index 0000000000..70ea7a9629 --- /dev/null +++ b/vllm_omni/platforms/musa/__init__.py @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm_omni.platforms.musa.platform import MUSAOmniPlatform + +__all__ = ["MUSAOmniPlatform"] diff --git a/vllm_omni/platforms/musa/platform.py b/vllm_omni/platforms/musa/platform.py new file mode 100644 index 0000000000..932ce62d27 --- /dev/null +++ b/vllm_omni/platforms/musa/platform.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Any + +import torch +from vllm.logger import init_logger +from vllm_musa.platform import MUSAPlatformBase + +from vllm_omni.diffusion.attention.backends.registry import DiffusionAttentionBackendEnum +from vllm_omni.platforms.interface import OmniPlatform, OmniPlatformEnum + +logger = init_logger(__name__) + + +class MUSAOmniPlatform(OmniPlatform, MUSAPlatformBase): + """MUSA/Moore Threads GPU implementation of OmniPlatform. + + Inherits all MUSA-specific implementations from vllm-musa's MUSAPlatformBase, + and adds Omni-specific interfaces from OmniPlatform. 
+ """ + + _omni_enum = OmniPlatformEnum.MUSA + + @classmethod + def get_omni_ar_worker_cls(cls) -> str: + return "vllm_omni.platforms.musa.worker.musa_ar_worker.MUSAARWorker" + + @classmethod + def get_omni_generation_worker_cls(cls) -> str: + return "vllm_omni.platforms.musa.worker.musa_generation_worker.MUSAGenerationWorker" + + @classmethod + def get_default_stage_config_path(cls) -> str: + return "vllm_omni/model_executor/stage_configs" + + @classmethod + def get_diffusion_model_impl_qualname(cls, op_name: str) -> str: + # MUSA uses default implementations for diffusion ops + if op_name == "hunyuan_fused_moe": + return "vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe.HunyuanFusedMoEDefault" + return super().get_diffusion_model_impl_qualname(op_name) + + @classmethod + def prepare_diffusion_op_runtime(cls, op_name: str, **kwargs: Any) -> None: + # MUSA uses default runtime preparation + return None + + @classmethod + def get_diffusion_attn_backend_cls( + cls, + selected_backend: str | None, + head_size: int, + ) -> str: + """Get the diffusion attention backend class path for MUSA platform. + + MUSA currently supports SDPA (Scaled Dot Product Attention) as the + primary backend. Flash Attention support may be added in future + when MUSA-specific implementations are available. + + Args: + selected_backend: User-selected backend name (e.g., "FLASH_ATTN", + "TORCH_SDPA"). If None, uses platform default. + head_size: Attention head size. + + Returns: + Fully qualified class path of the selected backend. + """ + if selected_backend is not None: + backend_upper = selected_backend.upper() + backend = DiffusionAttentionBackendEnum[backend_upper] + logger.info("Using diffusion attention backend '%s'", backend_upper) + return backend.get_path() + + # Default to SDPA for MUSA as it's the most compatible backend + logger.info("Defaulting to diffusion attention backend SDPA") + return DiffusionAttentionBackendEnum.TORCH_SDPA.get_path() + + @classmethod + def supports_torch_inductor(cls) -> bool: + """MUSA supports torch.compile with inductor backend.""" + return True + + @classmethod + def get_torch_device(cls, local_rank: int | None = None) -> torch.device: + """Get the torch device for MUSA platform. + + Args: + local_rank: Optional local rank for multi-GPU setups. + + Returns: + torch.device for MUSA GPU. + """ + if local_rank is None: + return torch.device("musa") + return torch.device("musa", local_rank) + + @classmethod + def get_device_count(cls) -> int: + """Get the number of available MUSA devices.""" + return torch.musa.device_count() + + @classmethod + def synchronize(cls) -> None: + """Synchronize all MUSA operations.""" + torch.musa.synchronize() + + @classmethod + def get_free_memory(cls, device: torch.device | None = None) -> int: + """Get the free memory on the MUSA device. + + Args: + device: Optional device to query. If None, uses current device. + + Returns: + Free memory in bytes. 
+ """ + free, _ = torch.musa.mem_get_info(device) + return free + + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + return torch.musa.get_device_name(device_id) diff --git a/vllm_omni/platforms/musa/worker/__init__.py b/vllm_omni/platforms/musa/worker/__init__.py new file mode 100644 index 0000000000..bd0054870e --- /dev/null +++ b/vllm_omni/platforms/musa/worker/__init__.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm_omni.platforms.musa.worker.musa_ar_worker import MUSAARWorker +from vllm_omni.platforms.musa.worker.musa_generation_worker import ( + MUSAGenerationWorker, +) + +__all__ = ["MUSAARWorker", "MUSAGenerationWorker"] diff --git a/vllm_omni/platforms/musa/worker/musa_ar_worker.py b/vllm_omni/platforms/musa/worker/musa_ar_worker.py new file mode 100644 index 0000000000..258e911df1 --- /dev/null +++ b/vllm_omni/platforms/musa/worker/musa_ar_worker.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""MUSA AR (Autoregressive) worker for vLLM-Omni. + +This worker handles autoregressive model stages (thinker/talker) on MUSA devices. +""" + +import gc +import os + +import torch +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.utils.mem_utils import MemorySnapshot, format_gib +from vllm.utils.torch_utils import set_random_seed +from vllm.v1.utils import report_usage_stats +from vllm.v1.worker.gpu_worker import init_worker_distributed_environment +from vllm.v1.worker.utils import request_memory +from vllm.v1.worker.workspace import init_workspace_manager + +from vllm_omni.worker.base import OmniGPUWorkerBase +from vllm_omni.worker.gpu_ar_model_runner import GPUARModelRunner +from vllm_omni.worker.mixins import OmniWorkerMixin + +logger = init_logger(__name__) + + +class MUSAARWorker(OmniWorkerMixin, OmniGPUWorkerBase): + """MUSA AR worker for thinker/talker stages in Omni model.""" + + def init_device(self): + """Initialize the MUSA device for this worker.""" + # This env var set by Ray causes exceptions with graph building. + os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) + parallel_config = self.parallel_config + if ( + parallel_config.distributed_executor_backend not in ("ray", "external_launcher") + and parallel_config.data_parallel_backend != "ray" + and parallel_config.nnodes_within_dp == 1 + ): + # Use local DP rank if available, otherwise use global DP rank. + dp_local_rank = self.parallel_config.data_parallel_rank_local + if dp_local_rank is None: + dp_local_rank = self.parallel_config.data_parallel_index + + tp_pp_world_size = self.parallel_config.pipeline_parallel_size * self.parallel_config.tensor_parallel_size + + # DP_LOCAL_RANK * TP_PP_WORLD_SIZE + TP_LOCAL_RANK + self.local_rank += dp_local_rank * tp_pp_world_size + assert self.local_rank < torch.musa.device_count(), ( + f"DP adjusted local rank {self.local_rank} is out of bounds. " + ) + visible_device_count = torch.musa.device_count() + assert self.parallel_config.local_world_size <= visible_device_count, ( + f"local_world_size ({self.parallel_config.local_world_size}) must " + f"be less than or equal to the number of visible devices " + f"({visible_device_count})." 
+ ) + + self.device = torch.device(f"musa:{self.local_rank}") + torch.musa.set_device(self.device) + + current_platform.check_if_supports_dtype(self.model_config.dtype) + + # Initialize the distributed environment BEFORE taking memory snapshot + # This ensures NCCL buffers are allocated before we measure available memory + init_worker_distributed_environment( + self.vllm_config, + self.rank, + self.distributed_init_method, + self.local_rank, + current_platform.dist_backend, + ) + + # Set random seed. + set_random_seed(self.model_config.seed) + + # Now take memory snapshot after distributed environment is initialized + gc.collect() + torch.musa.empty_cache() + + # Take current memory snapshot + self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device) + self.requested_memory = request_memory(init_snapshot, self.cache_config) + logger.debug("worker init memory snapshot: %r", self.init_snapshot) + logger.debug("worker requested memory: %sGiB", format_gib(self.requested_memory)) + + # Initialize workspace manager + num_ubatches = 2 if self.vllm_config.parallel_config.enable_dbo else 1 + init_workspace_manager(self.device, num_ubatches) + + if self.use_v2_model_runner: + # OMNI: v2 model runner does not yet include omni hooks. + logger.warning("OMNI MUSAARWorker forces v1 model runner for omni hooks.") + self.use_v2_model_runner = False + + # Construct the model runner + self.model_runner = GPUARModelRunner(self.vllm_config, self.device) + + if self.rank == 0: + # If usage stat is enabled, collect relevant info. + report_usage_stats(self.vllm_config) diff --git a/vllm_omni/platforms/musa/worker/musa_generation_worker.py b/vllm_omni/platforms/musa/worker/musa_generation_worker.py new file mode 100644 index 0000000000..f433f8897e --- /dev/null +++ b/vllm_omni/platforms/musa/worker/musa_generation_worker.py @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""MUSA generation worker for vLLM-Omni. + +This worker handles non-autoregressive generation stages (e.g., code2wav waveform +generation) on MUSA devices. +""" + +import gc +import os + +import torch +from vllm.logger import init_logger +from vllm.platforms import current_platform +from vllm.tracing import instrument +from vllm.utils.mem_utils import MemorySnapshot, format_gib +from vllm.utils.torch_utils import set_random_seed +from vllm.v1.utils import report_usage_stats +from vllm.v1.worker.gpu_worker import init_worker_distributed_environment +from vllm.v1.worker.utils import request_memory +from vllm.v1.worker.workspace import init_workspace_manager + +from vllm_omni.worker.base import OmniGPUWorkerBase +from vllm_omni.worker.gpu_generation_model_runner import GPUGenerationModelRunner +from vllm_omni.worker.mixins import OmniWorkerMixin + +logger = init_logger(__name__) + + +class MUSAGenerationWorker(OmniWorkerMixin, OmniGPUWorkerBase): + """MUSA generation worker for non-AR waveform generation stage.""" + + @instrument(span_name="Init device") + def init_device(self): + """Initialize the MUSA device for this worker.""" + # This env var set by Ray causes exceptions with graph building. + os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) + parallel_config = self.parallel_config + if ( + parallel_config.distributed_executor_backend not in ("ray", "external_launcher") + and parallel_config.data_parallel_backend != "ray" + and parallel_config.nnodes_within_dp == 1 + ): + # Use local DP rank if available, otherwise use global DP rank. 
+ dp_local_rank = self.parallel_config.data_parallel_rank_local + if dp_local_rank is None: + dp_local_rank = self.parallel_config.data_parallel_index + + tp_pp_world_size = self.parallel_config.pipeline_parallel_size * self.parallel_config.tensor_parallel_size + + # DP_LOCAL_RANK * TP_PP_WORLD_SIZE + TP_LOCAL_RANK + self.local_rank += dp_local_rank * tp_pp_world_size + assert self.local_rank < torch.musa.device_count(), ( + f"DP adjusted local rank {self.local_rank} is out of bounds. " + ) + visible_device_count = torch.musa.device_count() + assert self.parallel_config.local_world_size <= visible_device_count, ( + f"local_world_size ({self.parallel_config.local_world_size}) must " + f"be less than or equal to the number of visible devices " + f"({visible_device_count})." + ) + + self.device = torch.device(f"musa:{self.local_rank}") + torch.musa.set_device(self.device) + + current_platform.check_if_supports_dtype(self.model_config.dtype) + + # Initialize the distributed environment BEFORE taking memory snapshot + # This ensures NCCL buffers are allocated before we measure available memory + init_worker_distributed_environment( + self.vllm_config, + self.rank, + self.distributed_init_method, + self.local_rank, + current_platform.dist_backend, + ) + + # Set random seed. + set_random_seed(self.model_config.seed) + + # Now take memory snapshot after distributed environment is initialized + gc.collect() + torch.musa.empty_cache() + + # Take current memory snapshot + self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device) + self.requested_memory = request_memory(init_snapshot, self.cache_config) + logger.debug("worker init memory snapshot: %r", self.init_snapshot) + logger.debug("worker requested memory: %sGiB", format_gib(self.requested_memory)) + + # Initialize workspace manager + num_ubatches = 2 if self.vllm_config.parallel_config.enable_dbo else 1 + init_workspace_manager(self.device, num_ubatches) + + if self.use_v2_model_runner: + # OMNI: v2 model runner does not yet include omni hooks. + logger.warning("OMNI MUSAGenerationWorker forces v1 model runner for omni hooks.") + self.use_v2_model_runner = False + + # Construct the model runner + self.model_runner = GPUGenerationModelRunner(self.vllm_config, self.device) + + if self.rank == 0: + # If usage stat is enabled, collect relevant info. 
+ report_usage_stats(self.vllm_config) diff --git a/vllm_omni/profiler/omni_torch_profiler.py b/vllm_omni/profiler/omni_torch_profiler.py index 7d03ad328f..2257a21283 100644 --- a/vllm_omni/profiler/omni_torch_profiler.py +++ b/vllm_omni/profiler/omni_torch_profiler.py @@ -18,11 +18,12 @@ logger = init_logger(__name__) # NPU has its custom profiler -TorchProfilerActivity = Literal["CPU", "CUDA", "XPU", "NPU"] +TorchProfilerActivity = Literal["CPU", "CUDA", "XPU", "NPU", "MUSA"] TorchProfilerActivityMap = { "CPU": torch.profiler.ProfilerActivity.CPU, "CUDA": torch.profiler.ProfilerActivity.CUDA, "XPU": torch.profiler.ProfilerActivity.XPU, + "MUSA": torch.profiler.ProfilerActivity.CUDA, } From 6ef0e907af7c9468fa45783669eb10ee03ffe905 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Wed, 1 Apr 2026 04:35:48 -0700 Subject: [PATCH 015/204] Add new committers to governance page (#2419) --- docs/community/governance.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/community/governance.md b/docs/community/governance.md index a5296526fc..6af578e2d8 100644 --- a/docs/community/governance.md +++ b/docs/community/governance.md @@ -37,10 +37,12 @@ Committers have write access and merge rights. They typically have deep expertis - [@gcanlin](https://github.com/gcanlin): Hardware plugin and NPU integration - [@Isotr0py](https://github.com/Isotr0py): Diffusion and Quantization - [@linyueqian](https://github.com/linyueqian): TTS and Omni Support +- [@lishunyang12](https://github.com/lishunyang12): Quantization and Configuration - [@princepride](https://github.com/princepride): Diffusion and Omni Support - [@SamitHuang](https://github.com/SamitHuang): RL and Diffusion - [@tzhouam](https://github.com/tzhouam): Engine and New Model Support -- [@wtomin](https://github.com/wtomin): +- [@wtomin](https://github.com/wtomin): Diffusion and Parallelism +- [@ZeldaHuang](https://github.com/ZeldaHuang): Omni Support - [@ZJY0516](https://github.com/ZJY0516): Diffusion and CustomOp ## Meetings From 4e4bbc42a6f4d511ec6c3542bbb439afbe563892 Mon Sep 17 00:00:00 2001 From: TJian Date: Wed, 1 Apr 2026 19:37:34 +0800 Subject: [PATCH 016/204] [CI] Tune GPU resources for test (#2401) Signed-off-by: tjtanaa --- .buildkite/test-merge.yml | 2 +- .buildkite/test-ready.yml | 4 ++-- tests/e2e/online_serving/test_mimo_audio.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 7bee193191..5ee9363374 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -174,7 +174,7 @@ steps: pytest -s -v tests/engine/test_async_omni_engine_abort.py ' agents: - queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU + queue: "gpu_1_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: - docker#v5.2.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 89839a2d1e..91ea92a5ce 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -180,7 +180,7 @@ steps: pytest -s -v tests/engine/test_async_omni_engine_abort.py ' agents: - queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU + queue: "gpu_1_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: - docker#v5.2.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT @@ -271,7 +271,7 @@ steps: - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT resources: 
limits: - nvidia.com/gpu: 2 + nvidia.com/gpu: 1 volumeMounts: - name: devshm mountPath: /dev/shm diff --git a/tests/e2e/online_serving/test_mimo_audio.py b/tests/e2e/online_serving/test_mimo_audio.py index 639c46a65c..2fb63c1e42 100644 --- a/tests/e2e/online_serving/test_mimo_audio.py +++ b/tests/e2e/online_serving/test_mimo_audio.py @@ -95,7 +95,7 @@ def get_max_batch_size(size_type="few"): @pytest.mark.advanced_model @pytest.mark.core_model @pytest.mark.omni -@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=2) +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=1) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_audio_to_text_audio_001(omni_server, openai_client) -> None: """ @@ -128,7 +128,7 @@ def test_audio_to_text_audio_001(omni_server, openai_client) -> None: @pytest.mark.advanced_model @pytest.mark.omni -@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=2) +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=1) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_text_to_text_001(omni_server, openai_client) -> None: """ From 70a62651b9ea4780ebc655c67e9243c6d8a7e3d6 Mon Sep 17 00:00:00 2001 From: Lancer Date: Wed, 1 Apr 2026 20:01:09 +0800 Subject: [PATCH 017/204] [Feat] support HSDP for Qwen-image series, Z-Image, GLM-Image (#2029) Signed-off-by: Lancer --- docs/user_guide/diffusion_features.md | 14 +++++++------- .../test_qwen_image_edit_expansion.py | 12 ++++++++++++ .../online_serving/test_qwen_image_expansion.py | 12 ++++++++++++ .../test_qwen_image_layered_expansion.py | 12 ++++++++++++ tests/e2e/online_serving/test_zimage_expansion.py | 12 ++++++++++++ vllm_omni/diffusion/distributed/hsdp_utils.py | 9 +++++++++ .../models/glm_image/glm_image_transformer.py | 3 +++ .../hunyuan_video/hunyuan_video_15_transformer.py | 7 ++----- .../models/qwen_image/pipeline_qwen_image.py | 2 +- .../models/qwen_image/pipeline_qwen_image_edit.py | 2 +- .../qwen_image/pipeline_qwen_image_edit_plus.py | 2 +- .../qwen_image/pipeline_qwen_image_layered.py | 2 +- .../models/qwen_image/qwen_image_transformer.py | 3 +++ .../diffusion/models/z_image/pipeline_z_image.py | 2 +- .../models/z_image/z_image_transformer.py | 6 ++++++ 15 files changed, 83 insertions(+), 17 deletions(-) create mode 100644 vllm_omni/diffusion/distributed/hsdp_utils.py diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index f0969b677f..d633e7de8c 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -105,7 +105,7 @@ The following tables show which models support each feature: | **FLUX.2-klein** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | | **FLUX.1-Kontext-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **FLUX.2-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **GLM-Image** | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **GLM-Image** | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **HunyuanImage3** | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | | **LongCat-Image** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | **LongCat-Image-Edit** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | @@ -113,13 +113,13 @@ The following tables show which models support each feature: | **Nextstep_1(T2I)** | ❓ | ❓ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | **OmniGen2** | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | **Ovis-Image** | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| **Qwen-Image** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | -| **Qwen-Image-2512** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | -| 
**Qwen-Image-Edit** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | -| **Qwen-Image-Edit-2509** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | -| **Qwen-Image-Layered** | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | +| **Qwen-Image** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| **Qwen-Image-2512** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| **Qwen-Image-Edit** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| **Qwen-Image-Edit-2509** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| **Qwen-Image-Layered** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | **Stable-Diffusion3.5** | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | -| **Z-Image** | ✅ | ✅ | ✅ | ❓ | ✅ (TP=2 only) | ❌ | ❌ | ✅ | ✅ | ❌ | +| **Z-Image** | ✅ | ✅ | ✅ | ❓ | ✅ (TP=2 only) | ✅ | ❌ | ✅ | ✅ | ❌ | > Notes: > 1. Nextstep_1(T2I) does not support cache acceleration methods such as TeaCache or Cache-DiT. diff --git a/tests/e2e/online_serving/test_qwen_image_edit_expansion.py b/tests/e2e/online_serving/test_qwen_image_edit_expansion.py index 4501569aab..14e4c915b6 100644 --- a/tests/e2e/online_serving/test_qwen_image_edit_expansion.py +++ b/tests/e2e/online_serving/test_qwen_image_edit_expansion.py @@ -98,6 +98,18 @@ def _get_diffusion_feature_cases(model: str): id="parallel_004", marks=PARALLEL_FEATURE_MARKS, ), + pytest.param( + OmniServerParams( + model=model, + server_args=[ + "--use-hsdp", + "--hsdp-shard-size", + "2", + ], + ), + id="parallel_005", + marks=PARALLEL_FEATURE_MARKS, + ), ] diff --git a/tests/e2e/online_serving/test_qwen_image_expansion.py b/tests/e2e/online_serving/test_qwen_image_expansion.py index 6d6d236016..88e56cc3e1 100644 --- a/tests/e2e/online_serving/test_qwen_image_expansion.py +++ b/tests/e2e/online_serving/test_qwen_image_expansion.py @@ -107,6 +107,18 @@ def _get_diffusion_feature_cases(model: str): id="vae_patch_parallel_2", marks=PARALLEL_FEATURE_MARKS, ), + pytest.param( + OmniServerParams( + model=model, + server_args=[ + "--use-hsdp", + "--hsdp-shard-size", + "2", + ], + ), + id="parallel_hsdp", + marks=PARALLEL_FEATURE_MARKS, + ), ] diff --git a/tests/e2e/online_serving/test_qwen_image_layered_expansion.py b/tests/e2e/online_serving/test_qwen_image_layered_expansion.py index 39b8f36b30..fc73801c0e 100644 --- a/tests/e2e/online_serving/test_qwen_image_layered_expansion.py +++ b/tests/e2e/online_serving/test_qwen_image_layered_expansion.py @@ -62,6 +62,18 @@ id="cfg_parallel_001", marks=PARALLEL_FEATURE_MARKS, ), + pytest.param( + OmniServerParams( + model=MODEL, + server_args=[ + "--use-hsdp", + "--hsdp-shard-size", + "2", + ], + ), + id="parallel_hsdp", + marks=PARALLEL_FEATURE_MARKS, + ), ] diff --git a/tests/e2e/online_serving/test_zimage_expansion.py b/tests/e2e/online_serving/test_zimage_expansion.py index dfca76ca25..bed95545ac 100644 --- a/tests/e2e/online_serving/test_zimage_expansion.py +++ b/tests/e2e/online_serving/test_zimage_expansion.py @@ -60,6 +60,18 @@ def _get_diffusion_feature_cases(): id="parallel_teacache_fp8_ulysses2_ring2", marks=FOUR_CARD_MARKS, ), + pytest.param( + OmniServerParams( + model=MODEL, + server_args=[ + "--use-hsdp", + "--hsdp-shard-size", + "2", + ], + ), + id="parallel_hsdp", + marks=FOUR_CARD_MARKS, + ), ] diff --git a/vllm_omni/diffusion/distributed/hsdp_utils.py b/vllm_omni/diffusion/distributed/hsdp_utils.py new file mode 100644 index 0000000000..3e538d6fdd --- /dev/null +++ b/vllm_omni/diffusion/distributed/hsdp_utils.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Any + + +def 
is_transformer_block_module(name: str, module: Any) -> bool: + """Return True for numbered modules under `transformer_blocks`.""" + return "transformer_blocks" in name and name.split(".")[-1].isdigit() diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py index 8b129ce2a5..490e0198b9 100644 --- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py +++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py @@ -22,6 +22,7 @@ from vllm_omni.diffusion.attention.layer import Attention from vllm_omni.diffusion.cache.base import CachedTransformer from vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.distributed.hsdp_utils import is_transformer_block_module logger = init_logger(__name__) @@ -724,6 +725,8 @@ class GlmImageTransformer2DModel(CachedTransformer): _repeated_blocks = ["GlmImageTransformerBlock"] + _hsdp_shard_conditions = [is_transformer_block_module] + def __init__( self, od_config: OmniDiffusionConfig, diff --git a/vllm_omni/diffusion/models/hunyuan_video/hunyuan_video_15_transformer.py b/vllm_omni/diffusion/models/hunyuan_video/hunyuan_video_15_transformer.py index 2f7318cefc..263e39e018 100644 --- a/vllm_omni/diffusion/models/hunyuan_video/hunyuan_video_15_transformer.py +++ b/vllm_omni/diffusion/models/hunyuan_video/hunyuan_video_15_transformer.py @@ -23,6 +23,7 @@ from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata from vllm_omni.diffusion.attention.layer import Attention from vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.distributed.hsdp_utils import is_transformer_block_module from vllm_omni.diffusion.layers.rope import RotaryEmbedding from vllm_omni.diffusion.models.flux.flux_transformer import FeedForward @@ -544,11 +545,7 @@ class HunyuanVideo15Transformer3DModel(nn.Module): "add_kv_proj": ["add_q_proj", "add_k_proj", "add_v_proj"], } - @staticmethod - def _is_transformer_block(name: str, module) -> bool: - return "transformer_blocks" in name and name.split(".")[-1].isdigit() - - _hsdp_shard_conditions = [_is_transformer_block] + _hsdp_shard_conditions = [is_transformer_block_module] def __init__( self, diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py index 505bad3d52..5056b5342e 100644 --- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py +++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py @@ -273,7 +273,7 @@ def __init__( ) self.text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained( model, subfolder="text_encoder", local_files_only=local_files_only - ) + ).to(self.device) self.vae = DistributedAutoencoderKLQwenImage.from_pretrained( model, subfolder="vae", local_files_only=local_files_only ).to(self.device) diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py index f805a7e7cb..3d0cd2a6d4 100644 --- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py +++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py @@ -245,7 +245,7 @@ def __init__( ) self.text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained( model, subfolder="text_encoder", local_files_only=local_files_only - ) + ).to(self.device) self.vae = AutoencoderKLQwenImage.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to( self.device diff --git 
a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py index 8e2ba90a44..cb5a36579f 100644 --- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py +++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py @@ -198,7 +198,7 @@ def __init__( ) self.text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained( model, subfolder="text_encoder", local_files_only=local_files_only - ) + ).to(self.device) self.vae = AutoencoderKLQwenImage.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to( self.device diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py index ee2f471f5a..f1d28f0685 100644 --- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py +++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py @@ -219,7 +219,7 @@ def __init__( ) self.text_encoder = Qwen2_5_VLForConditionalGeneration.from_pretrained( model, subfolder="text_encoder", local_files_only=local_files_only - ) + ).to(self.device) self.vae = AutoencoderKLQwenImage.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to( self.device ) diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py index 3e9a0f0f38..c211567069 100644 --- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py +++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py @@ -37,6 +37,7 @@ from vllm_omni.diffusion.attention.layer import Attention from vllm_omni.diffusion.cache.base import CachedTransformer from vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.distributed.hsdp_utils import is_transformer_block_module from vllm_omni.diffusion.distributed.sp_plan import ( SequenceParallelInput, SequenceParallelOutput, @@ -887,6 +888,8 @@ class QwenImageTransformer2DModel(CachedTransformer): "add_kv_proj": ["add_q_proj", "add_k_proj", "add_v_proj"], } + _hsdp_shard_conditions = [is_transformer_block_module] + # Sequence Parallelism plan (following diffusers' _cp_plan pattern) # Similar to Z-Image's UnifiedPrepare, we use ImageRopePrepare to create # a module boundary where _sp_plan can shard hidden_states and vid_freqs together. 
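A minimal sketch of what the shared shard condition selects, assuming only the `is_transformer_block_module` helper introduced in `hsdp_utils.py` above; the transformers above list it in `_hsdp_shard_conditions` so that each numbered transformer block is marked for HSDP sharding:

    from vllm_omni.diffusion.distributed.hsdp_utils import is_transformer_block_module

    is_transformer_block_module("transformer_blocks.0", module=None)    # True
    is_transformer_block_module("transformer_blocks.31", module=None)   # True
    is_transformer_block_module("transformer_blocks", module=None)      # False: no block index
    is_transformer_block_module("time_text_embed.proj", module=None)    # False: not a transformer block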
diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index ac18d5773f..b9aceed2e5 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -170,7 +170,7 @@ def __init__( self.text_encoder = AutoModel.from_pretrained( model, subfolder="text_encoder", local_files_only=local_files_only - ) + ).to(self._execution_device) self.vae = DistributedAutoencoderKL.from_pretrained( model, subfolder="vae", local_files_only=local_files_only ).to(self._execution_device) diff --git a/vllm_omni/diffusion/models/z_image/z_image_transformer.py b/vllm_omni/diffusion/models/z_image/z_image_transformer.py index faeff3dce6..fd8b0e490f 100644 --- a/vllm_omni/diffusion/models/z_image/z_image_transformer.py +++ b/vllm_omni/diffusion/models/z_image/z_image_transformer.py @@ -580,6 +580,12 @@ class ZImageTransformer2DModel(CachedTransformer): _repeated_blocks = ["ZImageTransformerBlock"] + @staticmethod + def _is_transformer_block(name: str, module) -> bool: + return "layers" in name and name.split(".")[-1].isdigit() + + _hsdp_shard_conditions = [_is_transformer_block] + # Sequence Parallelism for Z-Image (following diffusers' _cp_plan pattern) # Similar to how Wan uses `rope` module's split_output to shard rotary embeddings, # Z-Image uses `unified_prepare` module's split_output to shard unified tensors. From bbae904f1d9347d8b9a47dc1628ed7c332bc2c29 Mon Sep 17 00:00:00 2001 From: NATURE Date: Wed, 1 Apr 2026 23:01:39 +0800 Subject: [PATCH 018/204] [Bugfix] Fix delayed decoding bug for Bagel AR/DIT workflow (L3 test_bagel_img2img error) (#2422) Signed-off-by: natureofnature --- vllm_omni/core/sched/omni_ar_scheduler.py | 53 ++++++++++++----------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/vllm_omni/core/sched/omni_ar_scheduler.py b/vllm_omni/core/sched/omni_ar_scheduler.py index c4d8452225..d49664161c 100644 --- a/vllm_omni/core/sched/omni_ar_scheduler.py +++ b/vllm_omni/core/sched/omni_ar_scheduler.py @@ -95,54 +95,54 @@ def _process_kv_transfer_trigger(self, request: Request, new_token_ids: list[int return False criteria_type = self.kv_transfer_criteria.get("type") - if ( - self.kv_transfer_criteria.get("stop_after_transfer", True) - and request.request_id in self.transfer_triggered_requests - ): - # For split pipelines that only need the transferred KV - # snapshot, stop AR decode once KV extraction has completed. - # This frees stage-0 resources without requiring an - # orchestrator-side abort. - if request.request_id not in self.active_kv_transfers: - request.status = RequestStatus.FINISHED_STOPPED - return True - return False + stop_decode_on_trigger = self.kv_transfer_criteria.get("stop_after_transfer", True) if request.request_id in self.transfer_triggered_requests: + # Already triggered. When stop_decode_on_trigger is True AND + # transfer was actually queued, the request was already stopped + # at trigger time (see below). Any request that reaches this + # point either has stop_decode_on_trigger=False (continue + # decoding) or was not actually queued (should not be stopped). 
return False if criteria_type == "prefill_finished": if request.num_computed_tokens >= request.num_prompt_tokens: - logger.debug(f"[Omni] Request {request.request_id} triggered prefill_finished transfer (Non-Stop)") self.transfer_triggered_requests.add(request.request_id) self._mark_request_for_kv_transfer(request.request_id, request.num_computed_tokens) + actually_queued = request.request_id in self.requests_needing_kv_transfer + + if stop_decode_on_trigger and actually_queued: + # Stop immediately so the request is NOT scheduled in + # the next step, freeing scheduling budget for companion + # requests whose chunked-prefill boundaries must be + # deterministic. waiting_for_transfer_free keeps blocks + # alive until the model runner finishes KV extraction. + self.waiting_for_transfer_free.add(request.request_id) + request.status = RequestStatus.FINISHED_STOPPED + return True - # Return False means "Do NOT stop the request" -> Continue Decoding return False elif criteria_type == "special_token": target_token_id = self.kv_transfer_criteria.get("token_id") if target_token_id is not None and target_token_id in new_token_ids: - logger.debug(f"[Omni] Request {request.request_id} triggered special_token criteria (Non-Stop)") - self.transfer_triggered_requests.add(request.request_id) - # Calculate precise snapshot length (trim to sentinel) - # Find the FIRST occurrence of the sentinel try: idx = new_token_ids.index(target_token_id) - # seq_len = tokens_before_this_step + idx + 1 (include sentinel) - # request.num_computed_tokens already includes ALL new_token_ids - # so we subtract (len(new_token_ids) - (idx + 1)) tokens_to_exclude = len(new_token_ids) - (idx + 1) snapshot_len = request.num_computed_tokens - tokens_to_exclude except ValueError: snapshot_len = request.num_computed_tokens - # Trigger Transfer self._mark_request_for_kv_transfer(request.request_id, snapshot_len) + actually_queued = request.request_id in self.requests_needing_kv_transfer + + if stop_decode_on_trigger and actually_queued: + self.waiting_for_transfer_free.add(request.request_id) + request.status = RequestStatus.FINISHED_STOPPED + return True - # Do NOT stop request return False return False @@ -532,9 +532,12 @@ def _free_request(self, request: Request, delay_free_blocks: bool = False) -> di # It triggered but hasn't finished yet. We MUST wait. logger.debug(f"[Omni] Request {request_id} finished but transfer is still ACTIVE. Waiting.") self.waiting_for_transfer_free.add(request_id) - # We do NOT mark for transfer again, just wait. - kv_xfer_params = None # No new transfer params + kv_xfer_params = None return kv_xfer_params + elif request_id in self.waiting_for_transfer_free: + # Stopped immediately by stop_decode_on_trigger; blocks are + # held until KV extraction completes in a future step. + return None else: logger.debug( f"[Omni] Request {request_id} finished and transfer no longer ACTIVE (extracted/acked). 
" From 9595be59ae79fe25cb08d21e8056bb4a1c99cf0c Mon Sep 17 00:00:00 2001 From: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com> Date: Thu, 2 Apr 2026 09:28:48 +0800 Subject: [PATCH 019/204] [skip ci][Doc] Update RFC template doc (#2141) Signed-off-by: Yuanheng Zhao --- .github/ISSUE_TEMPLATE/750-RFC.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml index ab16145bba..eb0ba40171 100644 --- a/.github/ISSUE_TEMPLATE/750-RFC.yml +++ b/.github/ISSUE_TEMPLATE/750-RFC.yml @@ -7,7 +7,7 @@ body: - type: markdown attributes: value: > - #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm-omni/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference. + #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm-omni/issues?q=in%3Atitle%20RFC%20sort%3Aupdated-desc) for reference. - type: textarea attributes: label: Motivation. @@ -21,7 +21,7 @@ body: description: > The proposed change of the RFC. value: | - Please provide the detailed design document of the RFC using the [template](https://docs.google.com/document/d/12YxSsVeD1jvL-InClkeAEnZyWFDndz_65JmXvsamuV4/edit?tab=t.0#heading=h.4ef4szrsgspp). + Please provide the detailed design document of the RFC using the [template](https://docs.google.com/document/d/1jcgR3cDaUQH3VczD4ZcKaJAoYWHjCmnYzHkCNyz-9fk/edit?usp=sharing). validations: required: true - type: textarea From 9c2a576301cac21f0e6ad7d47d0bc9a7298b85a6 Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Thu, 2 Apr 2026 09:29:22 +0800 Subject: [PATCH 020/204] [Test] Add voice or language test case for Qwen3-omni and Qwen-tts (#1844) Signed-off-by: yenuo26 <410167048@qq.com> Signed-off-by: wangyu <410167048@qq.com> Signed-off-by: wangyu <53896905+yenuo26@users.noreply.github.com> --- .buildkite/test-merge.yml | 25 ++-- .buildkite/test-nightly.yml | 17 +-- .buildkite/test-ready.yml | 3 +- tests/conftest.py | 110 ++++++++------ .../offline_inference/test_qwen2_5_omni.py | 4 +- .../e2e/offline_inference/test_qwen3_omni.py | 2 +- .../test_qwen3_omni_expansion.py | 134 ++++++++++++++++-- .../test_qwen3_tts_customvoice_expansion.py | 25 ++++ 8 files changed, 230 insertions(+), 90 deletions(-) diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 5ee9363374..a1ce0c495f 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -56,8 +56,8 @@ steps: timeout_in_minutes: 20 depends_on: upload-merge-pipeline commands: - - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py - - pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py + # Single pytest session for one combined summary at end of log. 
+ - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py tests/e2e/offline_inference/test_diffusion_layerwise_offload.py agents: queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU plugins: @@ -111,8 +111,7 @@ steps: timeout_in_minutes: 20 depends_on: upload-merge-pipeline commands: - - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py - - pytest -s -v tests/diffusion/distributed/test_ulysses_uaa_perf.py + - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py tests/diffusion/distributed/test_ulysses_uaa_perf.py agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: @@ -193,8 +192,7 @@ steps: commands: - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py - - pytest -s -v tests/e2e/online_serving/test_qwen2_5_omni.py -m "advanced_model" --run-level "advanced_model" + - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py tests/e2e/online_serving/test_qwen2_5_omni.py -m "advanced_model" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: @@ -216,7 +214,7 @@ steps: export VLLM_LOGGING_LEVEL=DEBUG export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline_inference/test_qwen3_tts_customvoice.py + pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py tests/e2e/offline_inference/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model" ' agents: queue: "gpu_1_queue" @@ -239,7 +237,7 @@ steps: export VLLM_LOGGING_LEVEL=DEBUG export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline_inference/test_qwen3_tts_base.py + pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py tests/e2e/offline_inference/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model" ' agents: queue: "gpu_1_queue" @@ -259,9 +257,8 @@ steps: depends_on: upload-merge-pipeline commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py - - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model" - - pytest -s -v tests/e2e/online_serving/test_mimo_audio.py -m "advanced_model" --run-level "advanced_model" + - export VLLM_TEST_CLEAN_GPU_MEMORY="1" + - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py tests/e2e/online_serving/test_mimo_audio.py -m "advanced_model" --run-level "advanced_model" agents: queue: "mithril-h100-pool" plugins: @@ -347,8 +344,7 @@ steps: export VLLM_TEST_CLEAN_GPU_MEMORY=1 export VLLM_IMAGE_FETCH_TIMEOUT=60 pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory" - pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model" - pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model" + pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model" 
' agents: queue: "mithril-h100-pool" @@ -392,8 +388,7 @@ steps: timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" - pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" + pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" ' agents: queue: "mithril-h100-pool" diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 5c6d6d35a6..9088c352b1 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -6,16 +6,10 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - | - set +e - pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model" - EXIT1=$$? - pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model" - EXIT2=$$? - pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" - EXIT3=$$? - pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" - EXIT4=$$? - exit $$((EXIT1 | EXIT2 | EXIT3 | EXIT4)) + pytest -s -v \ + tests/examples/ \ + tests/e2e/online_serving/test_*_expansion.py \ + -m "advanced_model and H100 and omni" --run-level "advanced_model" agents: queue: "mithril-h100-pool" plugins: @@ -57,8 +51,7 @@ steps: commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - pytest -s -v tests/examples/ -m "advanced_model and L4 and omni" --run-level "advanced_model" - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model" + - pytest -s -v tests/examples/ tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 91ea92a5ce..985b50fc72 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -328,8 +328,7 @@ steps: timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" - pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" + pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "core_model" --run-level "core_model" ' agents: queue: "mithril-h100-pool" diff --git a/tests/conftest.py b/tests/conftest.py index fb88869542..8e9a7bf928 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1146,18 +1146,6 @@ def convert_audio_bytes_to_text(raw_bytes: bytes) -> str: return text -def merge_base64_and_convert_to_text(base64_list): - """ - Merge a list of base64 encoded audio data and convert to text. 
- """ - merged_audio = _merge_base64_audio_to_segment(base64_list) - output_path = f"./test_{uuid.uuid4().hex}.wav" - merged_audio.export(output_path, format="wav") - print(f"audio data is saved: {output_path}") - text = convert_audio_file_to_text(output_path) - return text - - def modify_stage_config( yaml_path: str, updates: dict[str, Any] = None, @@ -1742,7 +1730,7 @@ def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str: label = str(top.get("label", "")).lower() conf = float(top.get("score", 0.0)) - if conf < 0.6: + if conf < 0.5: gender = "unknown" # Some models use non-English labels (e.g., Russian). Normalize to 'male'/'female'. elif ("female" in label) or ("жен" in label): @@ -1771,6 +1759,34 @@ def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str: return "unknown" +_PRESET_VOICE_GENDER_MAP: dict[str, str] = { + "serena": "female", + "uncle_fu": "male", + "chelsie": "female", + "clone": "female", + "ethan": "male", +} + + +def _assert_preset_voice_gender_from_audio( + audio_bytes: bytes | None, + voice_name: str | None, +) -> None: + """If ``voice_name`` matches a known preset, assert classifier gender matches (skip when unknown).""" + if not voice_name or not audio_bytes: + return + key = str(voice_name).lower() + expected_gender = _PRESET_VOICE_GENDER_MAP.get(key) + if expected_gender is None: + return + estimated_gender = _estimate_voice_gender_from_audio(audio_bytes) + print(f"Preset voice gender check: preset={key!r}, estimated={estimated_gender!r}, expected={expected_gender!r}") + if estimated_gender != "unknown": + assert estimated_gender == expected_gender, ( + f"{voice_name!r} is expected {expected_gender}, but estimated gender is {estimated_gender!r}" + ) + + # Threshold aligned with _compute_pcm_hnr_db docstring (clean clone vs distorted). _MIN_PCM_SPEECH_HNR_DB = 1.0 @@ -1837,6 +1853,12 @@ def assert_omni_response(response: OmniResponse, request_config: dict[str, Any], if "audio" in modalities: assert response.audio_content is not None, "No audio output is generated" print(f"audio content is: {response.audio_content}") + speaker = request_config.get("speaker") + if speaker: + _assert_preset_voice_gender_from_audio( + response.audio_bytes, + speaker, + ) if "text" in modalities: assert response.text_content is not None, "No text output is generated" @@ -1849,12 +1871,14 @@ def assert_omni_response(response: OmniResponse, request_config: dict[str, Any], keywords = keywords_dict.get(word_type) if "text" in modalities: if keywords: - assert any(keyword in response.text_content.lower() for keyword in keywords), ( + text_lower = response.text_content.lower() + assert any(str(kw).lower() in text_lower for kw in keywords), ( "The output does not contain any of the keywords." ) else: if keywords: - assert any(keyword in response.audio_content.lower() for keyword in keywords), ( + audio_lower = response.audio_content.lower() + assert any(str(kw).lower() in audio_lower for kw in keywords), ( "The output does not contain any of the keywords." ) @@ -1908,24 +1932,12 @@ def assert_audio_speech_response( f"Transcript doesn't match input: similarity={similarity:.2f}, transcript='{transcript}'" ) - # Voice gender consistency check: + # Voice gender consistency check (preset names in ``_PRESET_VOICE_GENDER_MAP``). # When the estimator returns 'unknown', we treat it as inconclusive and do NOT fail the test. 
- voice = (request_config.get("voice") or "").lower() - if voice and response.audio_bytes: - estimated_gender = _estimate_voice_gender_from_audio(response.audio_bytes) - voice_gender_map = { - # adjust this mapping to your actual voice names - "serena": "female", - "uncle_fu": "male", - "clone": "female", - } - expected_gender = voice_gender_map.get(voice) - if expected_gender is not None: - print(f"Estimated voice gender from audio: {estimated_gender} (voice='{voice}')") - if estimated_gender != "unknown": - assert estimated_gender == expected_gender, ( - f"Voice '{voice}' is expected {expected_gender}, but estimated gender is '{estimated_gender}'" - ) + _assert_preset_voice_gender_from_audio( + response.audio_bytes, + request_config.get("voice"), + ) def assert_diffusion_response(response: DiffusionResponse, request_config: dict[str, Any], run_level: str = None): @@ -2041,7 +2053,11 @@ def _process_stream_omni_response(self, chat_completion) -> OmniResponse: if audio_data or text_content: if audio_data: - audio_content = merge_base64_and_convert_to_text(audio_data) + merged_seg = _merge_base64_audio_to_segment(audio_data) + wav_buf = BytesIO() + merged_seg.export(wav_buf, format="wav") + result.audio_bytes = wav_buf.getvalue() + audio_content = convert_audio_bytes_to_text(result.audio_bytes) if audio_content and text_content: similarity = cosine_similarity_text(audio_content.lower(), text_content.lower()) @@ -2096,7 +2112,8 @@ def _process_non_stream_omni_response(self, chat_completion) -> OmniResponse: if audio_data or text_content: if audio_data: - audio_content = convert_audio_to_text(audio_data) + result.audio_bytes = base64.b64decode(audio_data) + audio_content = convert_audio_bytes_to_text(result.audio_bytes) if audio_content and text_content: similarity = cosine_similarity_text(audio_content.lower(), text_content.lower()) @@ -2265,8 +2282,9 @@ def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1 request_config: Request configuration dictionary containing parameters like model, messages, stream. Optional ``use_audio_in_video`` (bool): when true, sets ``extra_body["mm_processor_kwargs"] = {"use_audio_in_video": True}`` for Qwen-Omni video+audio - extraction (merged with any existing ``extra_body`` / ``mm_processor_kwargs``). - Optional ``extra_body`` (dict): passed through to ``chat.completions.create`` after merge. + extraction. + Optional top-level ``speaker`` (str): Qwen3-Omni preset TTS speaker name; sent as + ``extra_body["speaker"]`` to ``chat.completions.create``. request_num: Number of requests, defaults to 1 (single request) Returns: @@ -2278,9 +2296,8 @@ def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1 modalities = request_config.get("modalities", ["text", "audio"]) extra_body: dict[str, Any] = {} - raw_extra = request_config.get("extra_body") - if raw_extra: - extra_body.update(raw_extra) + if "speaker" in request_config: + extra_body["speaker"] = request_config["speaker"] if request_config.get("use_audio_in_video"): mm = dict(extra_body.get("mm_processor_kwargs") or {}) mm["use_audio_in_video"] = True @@ -2312,12 +2329,15 @@ def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1 # Send concurrent requests: run create + process in worker so e2e_latency includes full round-trip. 
def _one_omni_request(): start = time.perf_counter() - chat_completion = self.client.chat.completions.create( - model=request_config.get("model"), - messages=request_config.get("messages"), - modalities=modalities, - stream=stream, - ) + worker_kwargs: dict[str, Any] = { + "model": request_config.get("model"), + "messages": request_config.get("messages"), + "modalities": modalities, + "stream": stream, + } + if extra_body_arg is not None: + worker_kwargs["extra_body"] = extra_body_arg + chat_completion = self.client.chat.completions.create(**worker_kwargs) if stream: response = self._process_stream_omni_response(chat_completion) else: diff --git a/tests/e2e/offline_inference/test_qwen2_5_omni.py b/tests/e2e/offline_inference/test_qwen2_5_omni.py index 6af59c1f63..4c4315aab9 100644 --- a/tests/e2e/offline_inference/test_qwen2_5_omni.py +++ b/tests/e2e/offline_inference/test_qwen2_5_omni.py @@ -57,7 +57,7 @@ def get_question(prompt_type="mix"): return prompts.get(prompt_type, prompts["mix"]) -@pytest.mark.core_model +@pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325", "xpu": "B60"}, num_cards={"cuda": 4, "rocm": 2, "xpu": 3}) @pytest.mark.parametrize("omni_runner", test_params, indirect=True) @@ -88,7 +88,7 @@ def test_mix_to_audio(omni_runner, omni_runner_handler) -> None: omni_runner_handler.send_request(request_config) -@pytest.mark.core_model +@pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "L4", "rocm": "MI325", "xpu": "B60"}, num_cards={"cuda": 4, "rocm": 2, "xpu": 3}) @pytest.mark.parametrize("omni_runner", test_params, indirect=True) diff --git a/tests/e2e/offline_inference/test_qwen3_omni.py b/tests/e2e/offline_inference/test_qwen3_omni.py index 01be0486fc..cc0af437ec 100644 --- a/tests/e2e/offline_inference/test_qwen3_omni.py +++ b/tests/e2e/offline_inference/test_qwen3_omni.py @@ -56,7 +56,7 @@ def get_question(prompt_type="video"): return prompts.get(prompt_type, prompts["video"]) -@pytest.mark.core_model +@pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_runner", test_params, indirect=True) diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index 4055ad4267..0bcc86840b 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -26,7 +26,7 @@ model = "Qwen/Qwen3-Omni-30B-A3B-Instruct" AUDIO_KEY = ["test"] -IMAGE_KEY = ["square", "quadrate"] +IMAGE_KEY = ["square", "quadrate", "rectangle"] VIDEO_KEY = ["sphere", "globe", "circle", "round", "ball"] @@ -103,6 +103,7 @@ def get_prompt(prompt_type="text_only"): "text_audio": "What is in this audio? ", "text_audio_video": "First, what is in this audio? Then, what is in this video? ", "one_word": "What is the capital of UK? Answer in one word", + "text_chinese": "北京,中国的首都,是一座融合了长城等历史地点与现代建筑的国际化大都市,充满了独特的文化与活力。请重复这句话。", } return prompts.get(prompt_type, prompts["text_only"]) @@ -464,20 +465,10 @@ def test_audio_in_video_002(omni_server, openai_client) -> None: "messages": messages, "stream": True, "use_audio_in_video": True, - "key_words": {"video": VIDEO_KEY, "audio": AUDIO_KEY + ["beep", "electronic"]}, + "key_words": {"video": VIDEO_KEY}, } - # Retry when assert_omni_response fails on key_words (see tests/conftest.py). - _keyword_assert_msg = "The output does not contain any of the keywords." 
- _max_retries = 3 - for attempt in range(_max_retries): - try: - openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) - break - except AssertionError as e: - if _keyword_assert_msg not in str(e) or attempt == _max_retries - 1: - raise - print(f"Keyword assertion failed, retrying {attempt + 2}/{_max_retries}: {e!r}") + openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) @pytest.mark.advanced_model @@ -514,3 +505,120 @@ def test_one_word_prompt_001(omni_server, openai_client) -> None: if _similarity_assert_msg not in str(e) or attempt == _max_retries - 1: raise print(f"Similarity assertion failed, retrying {attempt + 2}/{_max_retries}: {e!r}") + + +@pytest.mark.advanced_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_speaker_001(omni_server, openai_client) -> None: + """ + Input Modal: text only (one-word answer constraint). + Output Modal: text, audio (default ``modalities``); ``key_words`` only assert on text. + Input Setting: stream=True + Datasets: single request + """ + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + content_text=get_prompt("text"), + ) + + request_config = { + "model": omni_server.model, + "messages": messages, + "stream": True, + "speaker": "Chelsie", + "key_words": {"text": ["beijing"]}, + } + + openai_client.send_omni_request(request_config) + + +@pytest.mark.advanced_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_speaker_002(omni_server, openai_client) -> None: + """ + Input Modal: text only (one-word answer constraint). + Output Modal: text, audio (default ``modalities``); ``key_words`` only assert on text. + Input Setting: stream=True + Datasets: single request + """ + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + content_text=get_prompt("text"), + ) + + request_config = { + "model": omni_server.model, + "messages": messages, + "stream": True, + "speaker": "Ethan", + "key_words": {"text": ["beijing"]}, + } + + # Retry only when assert_omni_response fails on preset voice gender (see tests/conftest.py). + _gender_assert_substr = "estimated gender" + _max_retries = 3 + for attempt in range(_max_retries): + try: + openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) + break + except AssertionError as e: + if _gender_assert_substr not in str(e) or attempt == _max_retries - 1: + raise + print(f"Gender assertion failed, retrying {attempt + 2}/{_max_retries}: {e!r}") + + +@pytest.mark.advanced_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_speaker_003(omni_server, openai_client) -> None: + """ + Input Modal: text only (one-word answer constraint). + Output Modal: text, audio (default ``modalities``); ``key_words`` only assert on text. 
+ Input Setting: stream=True + Datasets: single request + """ + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + content_text=get_prompt("text"), + ) + + request_config = { + "model": omni_server.model, + "messages": messages, + "stream": True, + "speaker": "CHELSIE", + "key_words": {"text": ["beijing"]}, + } + + openai_client.send_omni_request(request_config) + + +@pytest.mark.advanced_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_language_001(omni_server, openai_client) -> None: + """ + Input Modal: text only (one-word answer constraint). + Output Modal: text, audio (default ``modalities``); ``key_words`` only assert on text. + Input Setting: stream=True + Datasets: single request + """ + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + content_text=get_prompt("text_chinese"), + ) + + request_config = { + "model": omni_server.model, + "messages": messages, + "stream": True, + "key_words": {"text": ["北京"]}, + } + + openai_client.send_omni_request(request_config) diff --git a/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py b/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py index 9921e3a4a1..03a985896e 100644 --- a/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py @@ -120,6 +120,31 @@ def test_voice_002(omni_server, openai_client) -> None: openai_client.send_audio_speech_request(request_config) +@pytest.mark.advanced_model +@pytest.mark.omni +@hardware_test(res={"cuda": "L4"}, num_cards=1) +@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True) +def test_voice_003(omni_server, openai_client) -> None: + """ + Test text input processing and audio output via OpenAI API. + Deploy Setting: default yaml + Input Modal: text + Output Modal: audio + Input Setting: stream=False, language=chinese + Datasets: few requests + """ + request_config = { + "model": omni_server.model, + "input": get_prompt(), + "stream": False, + "response_format": "wav", + "task_type": "CustomVoice", + "voice": "SERENA", + } + + openai_client.send_audio_speech_request(request_config) + + @pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "L4"}, num_cards=1) From ebc9a8d875d72d62c831de53e22bc17059bc8941 Mon Sep 17 00:00:00 2001 From: Didan Deng <33117903+wtomin@users.noreply.github.com> Date: Thu, 2 Apr 2026 10:08:58 +0800 Subject: [PATCH 021/204] [skip ci][Doc] Small fix of Doc (#2400) Signed-off-by: Didan Deng <33117903+wtomin@users.noreply.github.com> --- docs/user_guide/diffusion/cache_acceleration/cache_dit.md | 2 +- docs/user_guide/diffusion_features.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user_guide/diffusion/cache_acceleration/cache_dit.md b/docs/user_guide/diffusion/cache_acceleration/cache_dit.md index dec52b9d6b..824e8c9305 100644 --- a/docs/user_guide/diffusion/cache_acceleration/cache_dit.md +++ b/docs/user_guide/diffusion/cache_acceleration/cache_dit.md @@ -164,7 +164,7 @@ cache_config={ **Performance Tips**: -- Default `Fn_compute_blocks=1` works well for most cases. Some models (e.g., [FLUX.2-klein](https://github.com/wtomin/vllm-omni/blob/main/vllm_omni/diffusion/cache/cache_dit_backend.py#L363)) use a larger value for `Fn_compute_blocks` for a balanced performance. +- Default `Fn_compute_blocks=1` works well for most cases. 
Some models (e.g., FLUX.2-klein) use a larger value for `Fn_compute_blocks` for a balanced performance. - Increase `residual_diff_threshold` (e.g., 0.12-0.15) for faster inference with slight quality trade-off, or decrease from default 0.24 for higher quality. - Default `max_warmup_steps=4` is optimized for few-step models. Increase to 6-8 for more steps if needed. diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index d633e7de8c..607d9af73c 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -106,7 +106,7 @@ The following tables show which models support each feature: | **FLUX.1-Kontext-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **FLUX.2-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **GLM-Image** | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **HunyuanImage3** | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | +| **HunyuanImage3** | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | | **LongCat-Image** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | **LongCat-Image-Edit** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | **MammothModa2(T2I)** | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | @@ -118,7 +118,7 @@ The following tables show which models support each feature: | **Qwen-Image-Edit** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | **Qwen-Image-Edit-2509** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | **Qwen-Image-Layered** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | -| **Stable-Diffusion3.5** | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **Stable-Diffusion3.5** | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | | **Z-Image** | ✅ | ✅ | ✅ | ❓ | ✅ (TP=2 only) | ✅ | ❌ | ✅ | ✅ | ❌ | > Notes: From d3daafbe4156f6eddbcf5520e5d209fb9cc4d268 Mon Sep 17 00:00:00 2001 From: Jason <72191212+JasonJ2021@users.noreply.github.com> Date: Thu, 2 Apr 2026 11:44:04 +0800 Subject: [PATCH 022/204] [Feat] Add benchmarks for Qwen3-TTS Base/VoiceDesign Model (#2411) Signed-off-by: Jiahui Sun --- benchmarks/qwen3-tts/README.md | 3 ++ benchmarks/qwen3-tts/run_benchmark.sh | 12 +++++- .../qwen3-tts/transformers/bench_tts_hf.py | 40 +++++++++++++----- .../qwen3-tts/vllm_omni/bench_tts_serve.py | 41 +++++++++++++++---- 4 files changed, 75 insertions(+), 21 deletions(-) diff --git a/benchmarks/qwen3-tts/README.md b/benchmarks/qwen3-tts/README.md index 73bc420f91..9c01f29aa9 100644 --- a/benchmarks/qwen3-tts/README.md +++ b/benchmarks/qwen3-tts/README.md @@ -32,6 +32,9 @@ bash run_benchmark.sh --hf-only # Use a different model (e.g. 
1.7B) MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice bash run_benchmark.sh --async-only +# Use a Voice Clone model +MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-Base TASK_TYPE=Base bash run_benchmark.sh --async-only + # Use bs16 config for higher throughput STAGE_CONFIG=vllm_omni/configs/qwen3_tts_bs16.yaml bash run_benchmark.sh --async-only diff --git a/benchmarks/qwen3-tts/run_benchmark.sh b/benchmarks/qwen3-tts/run_benchmark.sh index ef85d64d6d..283b6b844c 100755 --- a/benchmarks/qwen3-tts/run_benchmark.sh +++ b/benchmarks/qwen3-tts/run_benchmark.sh @@ -23,6 +23,9 @@ # # Use 1.7B model: # MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice bash run_benchmark.sh --async-only # +# # Use Voice Clone model +# MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-Base TASK_TYPE=Base bash run_benchmark.sh --async-only +# # # Use batch_size=4 config: # STAGE_CONFIG=vllm_omni/configs/qwen3_tts_bs4.yaml bash run_benchmark.sh --async-only # @@ -35,6 +38,7 @@ # GPU_MEM_TALKER - gpu_memory_utilization for talker stage (default: 0.3) # GPU_MEM_CODE2WAV - gpu_memory_utilization for code2wav stage (default: 0.2) # STAGE_CONFIG - Path to stage config YAML (default: configs/qwen3_tts_bs1.yaml) +# TASK_TYPE - Task type: CustomVoice, VoiceDesign, Base (default: CustomVoice) set -euo pipefail @@ -53,6 +57,7 @@ NUM_WARMUPS="${NUM_WARMUPS:-3}" STAGE_CONFIG="${STAGE_CONFIG:-vllm_omni/configs/qwen3_tts_bs1.yaml}" RESULT_DIR="${SCRIPT_DIR}/results" TIMESTAMP="$(date +%Y%m%d_%H%M%S)" +TASK_TYPE="${TASK_TYPE:-CustomVoice}" # Parse args RUN_ASYNC=true @@ -77,6 +82,7 @@ echo " Concurrency: ${CONCURRENCY}" echo " Port: ${PORT}" echo " Stage config: ${STAGE_CONFIG}" echo " Results: ${RESULT_DIR}" +echo " Task type: ${TASK_TYPE}" echo "============================================================" # Prepare stage config with correct GPU device and memory settings @@ -195,7 +201,8 @@ run_bench() { --max-concurrency ${conc_args} \ --num-warmups "${NUM_WARMUPS}" \ --config-name "${config_name}" \ - --result-dir "${RESULT_DIR}" + --result-dir "${RESULT_DIR}" \ + --task-type "${TASK_TYPE}" stop_server @@ -222,7 +229,8 @@ if [ "${RUN_HF}" = true ]; then --num-warmups "${NUM_WARMUPS}" \ --gpu-device "${GPU_DEVICE}" \ --config-name "hf_transformers" \ - --result-dir "${RESULT_DIR}" + --result-dir "${RESULT_DIR}" \ + --task-type "${TASK_TYPE}" # Allow GPU memory to settle sleep 5 diff --git a/benchmarks/qwen3-tts/transformers/bench_tts_hf.py b/benchmarks/qwen3-tts/transformers/bench_tts_hf.py index 0e0ef8e9e8..ed04ee264c 100644 --- a/benchmarks/qwen3-tts/transformers/bench_tts_hf.py +++ b/benchmarks/qwen3-tts/transformers/bench_tts_hf.py @@ -38,6 +38,10 @@ "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.", ] +REF_AUDIO = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav" +REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you." +INSTRUCT = "Speak in an incredulous tone, but with a hint of panic beginning to creep into your voice." 
+ @dataclass class BenchmarkResult: @@ -75,6 +79,29 @@ class BenchmarkResult: per_request: list = field(default_factory=list) +def generate_audio(model, prompt: str, args): + if args.task_type == "Base": + return model.generate_voice_clone( + text=prompt, + language=args.language, + ref_audio=REF_AUDIO, + ref_text=REF_TEXT, + ) + + if args.task_type == "VoiceDesign": + return model.generate_voice_design( + text=prompt, + language=args.language, + instruct=INSTRUCT, + ) + + return model.generate_custom_voice( + text=prompt, + language=args.language, + speaker=args.voice, + ) + + def run_benchmark(args): from qwen_tts import Qwen3TTSModel @@ -95,11 +122,7 @@ def run_benchmark(args): print(f"Warming up with {args.num_warmups} requests...") for i in range(args.num_warmups): p = PROMPTS[i % len(PROMPTS)] - wavs, sr = model.generate_custom_voice( - text=p, - language=args.language, - speaker=args.voice, - ) + wavs, sr = generate_audio(model, p, args) # Sync GPU torch.cuda.synchronize(device) print("Warmup done.") @@ -124,11 +147,7 @@ def run_benchmark(args): torch.cuda.synchronize(device) st = time.perf_counter() - wavs, sr = model.generate_custom_voice( - text=prompt, - language=args.language, - speaker=args.voice, - ) + wavs, sr = generate_audio(model, prompt, args) torch.cuda.synchronize(device) elapsed = time.perf_counter() - st @@ -268,6 +287,7 @@ def parse_args(): parser.add_argument("--gpu-device", type=int, default=0) parser.add_argument("--voice", type=str, default="Vivian") parser.add_argument("--language", type=str, default="English") + parser.add_argument("--task-type", type=str, default="CustomVoice", choices=["CustomVoice", "VoiceDesign", "Base"]) parser.add_argument( "--config-name", type=str, default="hf_transformers", help="Label for this config (used in filenames)" ) diff --git a/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py b/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py index 91e4ecbbb9..96b904b017 100644 --- a/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py +++ b/benchmarks/qwen3-tts/vllm_omni/bench_tts_serve.py @@ -37,6 +37,9 @@ "Could you please turn down the music a little bit, I'm trying to concentrate on my work.", "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.", ] +REF_AUDIO = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/clone_2.wav" +REF_TEXT = "Okay. Yeah. I resent you. I love you. I respect you. But you know what? You blew it! And thanks to you." +INSTRUCT = "Speak in an incredulous tone, but with a hint of panic beginning to creep into your voice." 
@dataclass @@ -93,22 +96,39 @@ def pcm_bytes_to_duration(num_bytes: int, sample_rate: int = 24000, sample_width return num_samples / sample_rate +def create_payload( + prompt: str, task_type: str = "CustomVoice", voice: str = "vivian", language: str = "English" +) -> dict: + payload = { + "input": prompt, + "language": language, + "stream": True, + "response_format": "pcm", + "task_type": task_type, + } + + if task_type == "Base": + payload["ref_audio"] = REF_AUDIO + payload["ref_text"] = REF_TEXT + elif task_type == "CustomVoice": + payload["voice"] = voice + elif task_type == "VoiceDesign": + payload["instructions"] = INSTRUCT + + return payload + + async def send_tts_request( session: aiohttp.ClientSession, api_url: str, prompt: str, + task_type: str = "CustomVoice", voice: str = "vivian", language: str = "English", pbar: tqdm | None = None, ) -> RequestResult: """Send a streaming TTS request and measure latency metrics.""" - payload = { - "input": prompt, - "voice": voice, - "language": language, - "stream": True, - "response_format": "pcm", - } + payload = create_payload(prompt, task_type, voice, language) result = RequestResult(prompt=prompt) st = time.perf_counter() @@ -153,6 +173,7 @@ async def run_benchmark( num_prompts: int, max_concurrency: int, num_warmups: int = 3, + task_type: str = "CustomVoice", voice: str = "vivian", language: str = "English", ) -> BenchmarkResult: @@ -175,7 +196,7 @@ async def run_benchmark( warmup_tasks = [] for i in range(num_warmups): prompt = PROMPTS[i % len(PROMPTS)] - warmup_tasks.append(send_tts_request(session, api_url, prompt, voice, language)) + warmup_tasks.append(send_tts_request(session, api_url, prompt, task_type, voice, language)) await asyncio.gather(*warmup_tasks) print(" Warmup done.") @@ -189,7 +210,7 @@ async def run_benchmark( async def limited_request(prompt): async with semaphore: - return await send_tts_request(session, api_url, prompt, voice, language, pbar) + return await send_tts_request(session, api_url, prompt, task_type, voice, language, pbar) start_time = time.perf_counter() tasks = [asyncio.create_task(limited_request(p)) for p in request_prompts] @@ -306,6 +327,7 @@ async def main(args): num_prompts=args.num_prompts, max_concurrency=concurrency, num_warmups=args.num_warmups, + task_type=args.task_type, voice=args.voice, language=args.language, ) @@ -334,6 +356,7 @@ def parse_args(): "--max-concurrency", type=int, nargs="+", default=[1, 4, 10], help="Concurrency levels to test" ) parser.add_argument("--num-warmups", type=int, default=3) + parser.add_argument("--task-type", type=str, default="CustomVoice", choices=["CustomVoice", "VoiceDesign", "Base"]) parser.add_argument("--voice", type=str, default="vivian") parser.add_argument("--language", type=str, default="English") parser.add_argument( From 900f6aa837f510210758393f49fb77b4c9b1bb32 Mon Sep 17 00:00:00 2001 From: Alicia <115451386+congw729@users.noreply.github.com> Date: Thu, 2 Apr 2026 11:45:00 +0800 Subject: [PATCH 023/204] [CI] [skip ci] Rename & reset timout mins for nightly L4 tests. 
(#2251) Signed-off-by: Alicia <115451386+congw729@users.noreply.github.com> --- .buildkite/test-nightly.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 9088c352b1..32bf219bc9 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -107,7 +107,7 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - - label: ":full_moon: Diffusion Model Wan22 completed Test with H100" + - label: ":full_moon: Diffusion Model (Wan2.2) Test with H100" timeout_in_minutes: 90 depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" @@ -148,7 +148,7 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - - label: ":full_moon: Diffusion Model Test with L4" + - label: ":full_moon: Diffusion Model Test" timeout_in_minutes: 60 depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" @@ -170,7 +170,7 @@ steps: - "/fsx/hf_cache:/fsx/hf_cache" - - label: ":full_moon: Documentation Example Code Test with H100" + - label: ":full_moon: Doc Example Code Test with H100" timeout_in_minutes: 60 depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" @@ -212,7 +212,7 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - - label: ":full_moon: Omni Model Perf Test & Test Case Statistics" + - label: ":full_moon: Omni Model Perf Test & Testcase Statistics with H100" key: nightly-omni-performance timeout_in_minutes: 180 depends_on: upload-nightly-pipeline @@ -390,9 +390,9 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - - label: ":full_moon: Qwen-Image Diffusion Perf Test with H100" + - label: ":full_moon: Diffusion Perf Test with H100" key: nightly-qwen-image-performance - timeout_in_minutes: 300 + timeout_in_minutes: 180 depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: From c1d2dcc3acc5b86ce22e0c6ae5ef196faa720d81 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 2 Apr 2026 13:57:53 +0800 Subject: [PATCH 024/204] [AutoRound] Add offline quantized `W4A16` model support (#1777) Signed-off-by: yiliu30 Co-authored-by: Hongsheng Liu --- .../diffusion/quantization/autoround.md | 91 +++++++ .../diffusion/quantization/overview.md | 4 + tests/diffusion/layers/__init__.py | 0 tests/diffusion/layers/test_adalayernorm.py | 237 ++++++++++++++++++ tests/diffusion/models/flux/__init__.py | 0 .../flux/test_flux_prefix_propagation.py | 134 ++++++++++ .../diffusion/quantization/test_inc_config.py | 147 +++++++++++ .../test_flux_autoround_w4a16.py | 127 ++++++++++ vllm_omni/diffusion/data.py | 47 +++- vllm_omni/diffusion/layers/adalayernorm.py | 104 ++++++++ .../model_loader/diffusers_loader.py | 55 +++- .../diffusion/models/flux/flux_transformer.py | 64 +++-- vllm_omni/entrypoints/async_omni_diffusion.py | 4 +- vllm_omni/quantization/factory.py | 16 ++ 14 files changed, 1011 insertions(+), 19 deletions(-) create mode 100644 docs/user_guide/diffusion/quantization/autoround.md create mode 100644 tests/diffusion/layers/__init__.py create mode 100644 tests/diffusion/layers/test_adalayernorm.py create mode 100644 tests/diffusion/models/flux/__init__.py create mode 100644 tests/diffusion/models/flux/test_flux_prefix_propagation.py create mode 100644 tests/diffusion/quantization/test_inc_config.py create mode 100644 
tests/e2e/offline_inference/test_flux_autoround_w4a16.py diff --git a/docs/user_guide/diffusion/quantization/autoround.md b/docs/user_guide/diffusion/quantization/autoround.md new file mode 100644 index 0000000000..48df176b03 --- /dev/null +++ b/docs/user_guide/diffusion/quantization/autoround.md @@ -0,0 +1,91 @@ +# AutoRound Quantization + +## Overview + +[AutoRound](https://github.com/intel/auto-round) is an advanced quantization toolkit designed for Large Language Models (LLMs), Vision-Language Models (VLMs), and diffusion models. It achieves high accuracy at ultra-low bit widths (2–4 bits) with minimal tuning by leveraging sign-gradient descent, while providing broad hardware compatibility with multi-datatype support. + +The quantization config is auto-detected from the checkpoint's `config.json` (`quantization_config.quant_method = "auto-round"`). No extra CLI flags are needed. + +### Supported Schemes + +| Scheme | Bits | Status | +|--------|------|--------| +| W4A16 | 4 | ✅ Supported | +| W8A16 | 8 | Planned | + +W4A16 is the first supported scheme. Additional schemes will be added in future releases. + +## Configuration + +1. **Python API**: point `model` at a pre-quantized checkpoint. The quantization is detected automatically. + +```python +from vllm_omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams + +omni = Omni(model="vllm-project-org/FLUX.1-dev-AutoRound-w4a16") + +outputs = omni.generate( + "A cat sitting on a windowsill", + OmniDiffusionSamplingParams(num_inference_steps=28), +) +outputs[0].save_images("output.png") +``` + +2. **CLI**: pass the quantized model path directly. + +```bash +python examples/offline_inference/text_to_image/text_to_image.py \ + --model vllm-project-org/FLUX.1-dev-AutoRound-w4a16 \ + --prompt "A cat sitting on a windowsill" \ + --num-inference-steps 28 \ + --output outputs/flux_w4a16.png +``` + +No `--quantization` flag is needed — the quantization method is read from the checkpoint. + +## How It Works + +The checkpoint's `config.json` contains: + +```json +{ + "quantization_config": { + "quant_method": "auto-round", + "bits": 4, + "group_size": 128, + "sym": true, + "packing_format": "auto_round:auto_gptq", + "block_name_to_quantize": "transformer_blocks,single_transformer_blocks" + } +} +``` + +At load time: + +1. `TransformerConfig.from_dict()` parses the `quantization_config` section and builds a vLLM `INCConfig` via `build_quant_config("auto-round", ...)`. +2. `OmniDiffusionConfig.set_tf_model_config()` propagates the detected config to the engine. +3. The appropriate compute kernel (e.g. GPTQ-Marlin for W4A16) is selected automatically based on the checkpoint's bit-width and packing format. + +## Supported Models + +| Model | HF Checkpoint | Scheme | Group Size | Backend | +|-------|--------------|--------|------------|---------| +| FLUX.1-dev | `vllm-project-org/FLUX.1-dev-AutoRound-w4a16` | W4A16 | 128 | GPTQ-Marlin | + +## Creating a Quantized Checkpoint + +Use [AutoRound](https://github.com/intel/auto-round) to quantize a BF16 model. The `--scheme` flag selects the quantization scheme: + +```bash +# W4A16 (4-bit weight, 16-bit activation) +auto-round \ + --model black-forest-labs/FLUX.1-dev \ + --scheme W4A16 \ + --batch_size 1 \ + --disable_opt_rtn \ + --dataset coco2014 \ + --iters 0 +``` + +The output directory can be used directly as the `model` argument. See the [AutoRound documentation](https://github.com/intel/auto-round) for all available schemes and options. 
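As a quick sanity check, the detected method can be read straight from the checkpoint's `config.json` described above (a minimal sketch assuming a locally downloaded checkpoint directory; the `read_quant_method` helper is illustrative, not a vLLM-Omni API):

```python
import json
from pathlib import Path


def read_quant_method(checkpoint_dir: str) -> str | None:
    """Return quantization_config.quant_method from a checkpoint's config.json."""
    cfg = json.loads(Path(checkpoint_dir, "config.json").read_text())
    return cfg.get("quantization_config", {}).get("quant_method")


# For an AutoRound W4A16 checkpoint this should print "auto-round".
print(read_quant_method("./FLUX.1-dev-AutoRound-w4a16"))
```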
diff --git a/docs/user_guide/diffusion/quantization/overview.md b/docs/user_guide/diffusion/quantization/overview.md index 0fc8b9bc2a..25d7fa5c75 100644 --- a/docs/user_guide/diffusion/quantization/overview.md +++ b/docs/user_guide/diffusion/quantization/overview.md @@ -11,6 +11,7 @@ vLLM-Omni provides a unified quantization framework that supports both diffusion | FP8 | [FP8](fp8.md) | FP8 W8A8, dynamic or static | Z-Image, Qwen-Image, Flux, Bagel | SM 89 (Ada) | | Int8 | [Int8](int8.md) | Int8 W8A8 | Z-Image, Qwen-Image | SM 89 (Ada) / Ascend NPU | | GGUF | [GGUF](gguf.md) | GGUF format, dequant+GEMM for N-D tensors | Z-Image, Flux | SM 60 | +| AutoRound | [AutoRound](autoround.md) | W4A16 (pre-quantized) | Flux | SM 80 (Ampere) | ### Multi-stage Omni Models (Pre-quantized Checkpoints) @@ -102,6 +103,9 @@ config = build_quant_config("fp8") # Dict with parameters config = build_quant_config({"method": "fp8", "activation_scheme": "static"}) +# AutoRound / INC (auto-detected from checkpoint, or explicit) +config = build_quant_config("auto-round", bits=4, group_size=128) + # Per-component dict config = build_quant_config({ "transformer": {"method": "fp8"}, diff --git a/tests/diffusion/layers/__init__.py b/tests/diffusion/layers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/diffusion/layers/test_adalayernorm.py b/tests/diffusion/layers/test_adalayernorm.py new file mode 100644 index 0000000000..5e41b7a26d --- /dev/null +++ b/tests/diffusion/layers/test_adalayernorm.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for shared AdaLayerNorm layers used by FLUX and other models.""" + +import os + +import pytest +import torch + +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] + + +@pytest.fixture(autouse=True) +def _init_distributed(): + """Initialize the minimal distributed environment required by + ReplicatedLinear (tensor-parallel group must exist).""" + from vllm.distributed.parallel_state import ( + cleanup_dist_env_and_memory, + init_distributed_environment, + initialize_model_parallel, + ) + + os.environ.setdefault("MASTER_ADDR", "localhost") + os.environ.setdefault("MASTER_PORT", "29501") + init_distributed_environment( + world_size=1, + rank=0, + local_rank=0, + distributed_init_method="env://", + ) + initialize_model_parallel() + yield + cleanup_dist_env_and_memory() + + +@pytest.fixture(autouse=True) +def _force_default_gemm(monkeypatch): + """Force CPU-compatible GEMM dispatch for tests using CPU tensors. + + vLLM's dispatch_unquantized_gemm() selects the backend by platform + (e.g. rocm_unquantized_gemm on AMD machines), not by tensor device. + CPU test tensors crash with NotImplementedError on ROCm. 
Monkeypatch + the dispatcher to always return the default (torch.nn.functional.linear) + implementation which works on any device.""" + from vllm.model_executor.layers.utils import default_unquantized_gemm + + monkeypatch.setattr( + "vllm.model_executor.layers.linear.dispatch_unquantized_gemm", + lambda: default_unquantized_gemm, + ) + + +def test_adalayernorm_import_from_shared_module(): + """Verify imports work from the shared adalayernorm module.""" + from vllm_omni.diffusion.layers.adalayernorm import ( # noqa: F401 + AdaLayerNormContinuous, + AdaLayerNormZero, + AdaLayerNormZeroSingle, + ) + + +def test_adalayernorm_zero_forward_shape(): + """AdaLayerNormZero produces correct output shapes (x, gate, shift, scale, gate).""" + from vllm_omni.diffusion.layers.adalayernorm import AdaLayerNormZero + + dim = 64 + batch = 2 + seq_len = 4 + norm = AdaLayerNormZero(dim) + + x = torch.randn(batch, seq_len, dim) + emb = torch.randn(batch, dim) + + out_x, gate_msa, shift_mlp, scale_mlp, gate_mlp = norm(x, emb) + + assert out_x.shape == (batch, seq_len, dim) + assert gate_msa.shape == (batch, dim) + assert shift_mlp.shape == (batch, dim) + assert scale_mlp.shape == (batch, dim) + assert gate_mlp.shape == (batch, dim) + + +def test_adalayernorm_zero_single_forward_shape(): + """AdaLayerNormZeroSingle produces (x, gate) with correct shapes.""" + from vllm_omni.diffusion.layers.adalayernorm import AdaLayerNormZeroSingle + + dim = 64 + batch = 2 + seq_len = 4 + norm = AdaLayerNormZeroSingle(dim) + + x = torch.randn(batch, seq_len, dim) + emb = torch.randn(batch, dim) + + out_x, gate = norm(x, emb) + + assert out_x.shape == (batch, seq_len, dim) + assert gate.shape == (batch, dim) + + +def test_adalayernorm_continuous_forward_shape(): + """AdaLayerNormContinuous produces correct output shape.""" + from vllm_omni.diffusion.layers.adalayernorm import AdaLayerNormContinuous + + dim = 64 + cond_dim = 64 + batch = 2 + seq_len = 4 + norm = AdaLayerNormContinuous(dim, cond_dim) + + x = torch.randn(batch, seq_len, dim) + conditioning = torch.randn(batch, cond_dim) + + out = norm(x, conditioning) + + assert out.shape == (batch, seq_len, dim) + + +def test_adalayernorm_zero_accepts_quant_config(): + """Constructor accepts quant_config=None and prefix='test' without error.""" + from vllm_omni.diffusion.layers.adalayernorm import ( + AdaLayerNormContinuous, + AdaLayerNormZero, + AdaLayerNormZeroSingle, + ) + + # Should not raise with quant_config=None and prefix + AdaLayerNormZero(64, quant_config=None, prefix="test.norm1") + AdaLayerNormZeroSingle(64, quant_config=None, prefix="test.norm") + AdaLayerNormContinuous(64, 64, quant_config=None, prefix="test.norm_out") + + +def test_adalayernorm_uses_replicated_linear(): + """Verify .linear is a ReplicatedLinear instance (not nn.Linear).""" + from vllm.model_executor.layers.linear import ReplicatedLinear + + from vllm_omni.diffusion.layers.adalayernorm import ( + AdaLayerNormContinuous, + AdaLayerNormZero, + AdaLayerNormZeroSingle, + ) + + norm_zero = AdaLayerNormZero(64) + assert isinstance(norm_zero.linear, ReplicatedLinear) + + norm_zero_single = AdaLayerNormZeroSingle(64) + assert isinstance(norm_zero_single.linear, ReplicatedLinear) + + norm_continuous = AdaLayerNormContinuous(64, 64) + assert isinstance(norm_continuous.linear, ReplicatedLinear) + + +# ── Numerical equivalence tests against diffusers originals ── + + +def _copy_weights(src_linear, dst_replicated_linear): + """Copy weights from nn.Linear to ReplicatedLinear for comparison.""" + 
dst_replicated_linear.weight.data.copy_(src_linear.weight.data) + if src_linear.bias is not None and dst_replicated_linear.bias is not None: + dst_replicated_linear.bias.data.copy_(src_linear.bias.data) + + +def test_adalayernorm_zero_matches_diffusers(): + """Verify AdaLayerNormZero produces identical output to diffusers original.""" + from diffusers.models.normalization import ( + AdaLayerNormZero as DiffusersAdaLayerNormZero, + ) + + from vllm_omni.diffusion.layers.adalayernorm import AdaLayerNormZero + + dim = 64 + torch.manual_seed(42) + ours = AdaLayerNormZero(dim) + ref = DiffusersAdaLayerNormZero(dim) + + # Copy weights: nn.Linear -> ReplicatedLinear + _copy_weights(ref.linear, ours.linear) + + x = torch.randn(2, 4, dim) + emb = torch.randn(2, dim) + + out_ours = ours(x, emb) + out_ref = ref(x, emb=emb) + + for o, r in zip(out_ours, out_ref): + torch.testing.assert_close(o, r, atol=1e-5, rtol=1e-5) + + +def test_adalayernorm_zero_single_matches_diffusers(): + """Verify AdaLayerNormZeroSingle produces identical output to diffusers original.""" + from diffusers.models.normalization import ( + AdaLayerNormZeroSingle as DiffusersAdaLayerNormZeroSingle, + ) + + from vllm_omni.diffusion.layers.adalayernorm import AdaLayerNormZeroSingle + + dim = 64 + torch.manual_seed(42) + ours = AdaLayerNormZeroSingle(dim) + ref = DiffusersAdaLayerNormZeroSingle(dim) + + _copy_weights(ref.linear, ours.linear) + + x = torch.randn(2, 4, dim) + emb = torch.randn(2, dim) + + out_ours = ours(x, emb) + out_ref = ref(x, emb=emb) + + for o, r in zip(out_ours, out_ref): + torch.testing.assert_close(o, r, atol=1e-5, rtol=1e-5) + + +def test_adalayernorm_continuous_matches_diffusers(): + """Verify AdaLayerNormContinuous produces identical output to diffusers original.""" + from diffusers.models.normalization import ( + AdaLayerNormContinuous as DiffusersAdaLayerNormContinuous, + ) + + from vllm_omni.diffusion.layers.adalayernorm import AdaLayerNormContinuous + + dim = 64 + cond_dim = 64 + torch.manual_seed(42) + # Match constructor args: diffusers defaults elementwise_affine=True, eps=1e-5 + ours = AdaLayerNormContinuous(dim, cond_dim, elementwise_affine=False, eps=1e-6) + ref = DiffusersAdaLayerNormContinuous(dim, cond_dim, elementwise_affine=False, eps=1e-6) + + _copy_weights(ref.linear, ours.linear) + + x = torch.randn(2, 4, dim) + cond = torch.randn(2, cond_dim) + + out_ours = ours(x, cond) + out_ref = ref(x, cond) + + torch.testing.assert_close(out_ours, out_ref, atol=1e-5, rtol=1e-5) diff --git a/tests/diffusion/models/flux/__init__.py b/tests/diffusion/models/flux/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/diffusion/models/flux/test_flux_prefix_propagation.py b/tests/diffusion/models/flux/test_flux_prefix_propagation.py new file mode 100644 index 0000000000..b51fc3384f --- /dev/null +++ b/tests/diffusion/models/flux/test_flux_prefix_propagation.py @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests that FLUX transformer blocks correctly propagate `quant_config` and +`prefix` through all sub-layers. + +The tests instantiate blocks with a known prefix and verify that all quantization- +aware sub-layers (AdaLayerNorm, FeedForward, Attention projections) receive the +prefix rooted at the block prefix. This is critical for quantized weight loading +to match checkpoint keys to the correct model parameters. 
+""" + +import os + +import pytest + +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] + +# Standard dimensions for a minimal FLUX block +_DIM = 64 +_HEADS = 2 +_HEAD_DIM = _DIM // _HEADS + + +@pytest.fixture(autouse=True) +def _init_distributed(): + """Initialize the minimal distributed environment required by + vLLM parallel linear layers (tensor-parallel group must exist).""" + from vllm.distributed.parallel_state import ( + cleanup_dist_env_and_memory, + init_distributed_environment, + initialize_model_parallel, + ) + + os.environ.setdefault("MASTER_ADDR", "localhost") + os.environ.setdefault("MASTER_PORT", "29502") + init_distributed_environment( + world_size=1, + rank=0, + local_rank=0, + distributed_init_method="env://", + ) + initialize_model_parallel() + yield + cleanup_dist_env_and_memory() + + +def _param_names(module) -> set[str]: + """Return the set of all parameter names in a module.""" + return {name for name, _ in module.named_parameters()} + + +def test_flux_transformer_block_passes_prefix(): + """FluxTransformerBlock propagates prefix to norm1, norm1_context, attn, ff, ff_context.""" + from vllm_omni.diffusion.models.flux.flux_transformer import FluxTransformerBlock + + prefix = "transformer_blocks.0" + block = FluxTransformerBlock( + dim=_DIM, + num_attention_heads=_HEADS, + attention_head_dim=_HEAD_DIM, + quant_config=None, + prefix=prefix, + ) + + params = _param_names(block) + + # norm1 and norm1_context (AdaLayerNormZero) should have linear weights + assert any(name.startswith("norm1.linear.") for name in params), ( + f"norm1.linear.* not found in params: {sorted(params)}" + ) + assert any(name.startswith("norm1_context.linear.") for name in params), ( + f"norm1_context.linear.* not found in params: {sorted(params)}" + ) + + # attn should have QKV projections + assert any(name.startswith("attn.to_qkv.") for name in params), ( + f"attn.to_qkv.* not found in params: {sorted(params)}" + ) + + # ff and ff_context should have net layers + assert any(name.startswith("ff.net.") for name in params), f"ff.net.* not found in params: {sorted(params)}" + assert any(name.startswith("ff_context.net.") for name in params), ( + f"ff_context.net.* not found in params: {sorted(params)}" + ) + + +def test_flux_single_transformer_block_passes_prefix(): + """FluxSingleTransformerBlock propagates prefix to norm, proj_mlp, attn.""" + from vllm_omni.diffusion.models.flux.flux_transformer import FluxSingleTransformerBlock + + prefix = "single_transformer_blocks.0" + block = FluxSingleTransformerBlock( + dim=_DIM, + num_attention_heads=_HEADS, + attention_head_dim=_HEAD_DIM, + quant_config=None, + prefix=prefix, + ) + + params = _param_names(block) + + # norm (AdaLayerNormZeroSingle) should have linear weights + assert any(name.startswith("norm.linear.") for name in params), ( + f"norm.linear.* not found in params: {sorted(params)}" + ) + + # proj_mlp (ReplicatedLinear) should have weight + assert any(name.startswith("proj_mlp.") for name in params), f"proj_mlp.* not found in params: {sorted(params)}" + + # attn should have QKV projection + assert any(name.startswith("attn.to_qkv.") for name in params), ( + f"attn.to_qkv.* not found in params: {sorted(params)}" + ) + + +def test_flux_feedforward_passes_prefix(): + """FeedForward propagates prefix to net.0 (GELU proj) and net.2 (output proj).""" + from vllm_omni.diffusion.models.flux.flux_transformer import FeedForward + + prefix = "transformer_blocks.0.ff" + ff = FeedForward( + dim=_DIM, + dim_out=_DIM, + 
quant_config=None, + prefix=prefix, + ) + + params = _param_names(ff) + + # net.0 is ColumnParallelApproxGELU which wraps a ColumnParallelLinear + assert any("net.0" in name for name in params), f"net.0 not found in params: {sorted(params)}" + + # net.2 is RowParallelLinear + assert any("net.2" in name for name in params), f"net.2 not found in params: {sorted(params)}" diff --git a/tests/diffusion/quantization/test_inc_config.py b/tests/diffusion/quantization/test_inc_config.py new file mode 100644 index 0000000000..a7aabf7f62 --- /dev/null +++ b/tests/diffusion/quantization/test_inc_config.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for INC/AutoRound quantization via the unified framework.""" + +import pytest + +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion] + + +def test_build_quant_config_autoround(): + """build_quant_config("auto-round", ...) should produce an INCConfig.""" + from vllm.model_executor.layers.quantization.inc import INCConfig + + from vllm_omni.quantization import build_quant_config + + config = build_quant_config( + "auto-round", + bits=4, + group_size=128, + sym=True, + packing_format="auto_round:auto_gptq", + ) + assert config is not None + assert isinstance(config, INCConfig) + assert config.weight_bits == 4 + assert config.group_size == 128 + + +def test_build_quant_config_inc(): + """build_quant_config("inc", ...) should also produce an INCConfig.""" + from vllm.model_executor.layers.quantization.inc import INCConfig + + from vllm_omni.quantization import build_quant_config + + config = build_quant_config("inc", bits=4, group_size=128) + assert isinstance(config, INCConfig) + assert config.weight_bits == 4 + + +def test_build_quant_config_autoround_dict(): + """Dict-style config with method=auto-round should work.""" + from vllm.model_executor.layers.quantization.inc import INCConfig + + from vllm_omni.quantization import build_quant_config + + config = build_quant_config( + { + "method": "auto-round", + "bits": 4, + "group_size": 128, + "sym": True, + "packing_format": "auto_round:auto_gptq", + } + ) + assert isinstance(config, INCConfig) + assert config.weight_bits == 4 + + +def test_build_quant_config_autoround_filters_metadata(): + """Checkpoint metadata keys (autoround_version, batch_size, iters) + should be silently filtered out instead of causing TypeError.""" + from vllm.model_executor.layers.quantization.inc import INCConfig + + from vllm_omni.quantization import build_quant_config + + config = build_quant_config( + "auto-round", + bits=4, + group_size=128, + sym=True, + packing_format="auto_round:auto_gptq", + block_name_to_quantize="transformer_blocks,single_transformer_blocks", + autoround_version="0.12.0", # metadata — must be filtered + batch_size=1, # metadata — must be filtered + iters=0, # metadata — must be filtered + ) + assert isinstance(config, INCConfig) + assert config.weight_bits == 4 + assert config.group_size == 128 + + +def test_build_quant_config_bits_to_weight_bits_mapping(): + """The 'bits' key from checkpoints should be mapped to 'weight_bits'.""" + from vllm.model_executor.layers.quantization.inc import INCConfig + + from vllm_omni.quantization import build_quant_config + + # If weight_bits is already provided, bits should be ignored + config = build_quant_config("auto-round", weight_bits=4, group_size=128) + assert isinstance(config, INCConfig) + assert config.weight_bits == 4 + + +def test_autoround_in_supported_methods(): 
+ """auto-round and inc should appear in SUPPORTED_QUANTIZATION_METHODS.""" + from vllm_omni.quantization import SUPPORTED_QUANTIZATION_METHODS + + assert "auto-round" in SUPPORTED_QUANTIZATION_METHODS + assert "inc" in SUPPORTED_QUANTIZATION_METHODS + + +def test_integration_autoround_via_omni_diffusion_config(): + """OmniDiffusionConfig with auto-round quantization dict should resolve.""" + from vllm.model_executor.layers.quantization.inc import INCConfig + + from vllm_omni.diffusion.data import OmniDiffusionConfig + + config = OmniDiffusionConfig( + model="test", + quantization_config={ + "method": "auto-round", + "bits": 4, + "group_size": 128, + "sym": True, + }, + ) + assert isinstance(config.quantization_config, INCConfig) + assert config.quantization_config.weight_bits == 4 + + +def test_integration_autodetect_from_transformer_config(): + """When TransformerConfig has quant_config, OmniDiffusionConfig should + auto-detect it even without explicit quantization_config.""" + from vllm.model_executor.layers.quantization.inc import INCConfig + + from vllm_omni.diffusion.data import OmniDiffusionConfig, TransformerConfig + + tf_config = TransformerConfig.from_dict( + { + "quantization_config": { + "quant_method": "auto-round", + "bits": 4, + "group_size": 128, + "sym": True, + "packing_format": "auto_round:auto_gptq", + "autoround_version": "0.12.0", + "batch_size": 1, + "iters": 0, + } + } + ) + assert tf_config.quant_method == "auto-round" + assert isinstance(tf_config.quant_config, INCConfig) + + od_config = OmniDiffusionConfig(model="test", tf_model_config=tf_config) + assert isinstance(od_config.quantization_config, INCConfig) + assert od_config.quantization_config.weight_bits == 4 diff --git a/tests/e2e/offline_inference/test_flux_autoround_w4a16.py b/tests/e2e/offline_inference/test_flux_autoround_w4a16.py new file mode 100644 index 0000000000..42aab7f26a --- /dev/null +++ b/tests/e2e/offline_inference/test_flux_autoround_w4a16.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""E2E tests for FLUX AutoRound W4A16 quantized inference. 
+ +These tests require: + - A CUDA GPU + - The quantized model checkpoint (vllm-project-org/FLUX.1-dev-AutoRound-w4a16) +""" + +import gc +import sys +from pathlib import Path + +import pytest +import torch +from vllm.distributed.parallel_state import cleanup_dist_env_and_memory + +from tests.utils import DeviceMemoryMonitor, hardware_test +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput +from vllm_omni.platforms import current_omni_platform + +# ruff: noqa: E402 +REPO_ROOT = Path(__file__).resolve().parents[2] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from vllm_omni import Omni + +QUANTIZED_MODEL = "vllm-project-org/FLUX.1-dev-AutoRound-w4a16" +BASELINE_MODEL = "black-forest-labs/FLUX.1-dev" + +# Allow overriding via environment for local testing +import os as _os + +QUANTIZED_MODEL = _os.environ.get("FLUX_AUTOROUND_MODEL", QUANTIZED_MODEL) +BASELINE_MODEL = _os.environ.get("FLUX_BASELINE_MODEL", BASELINE_MODEL) + +# Small resolution to keep GPU memory & time manageable +HEIGHT = 256 +WIDTH = 256 +NUM_STEPS = 2 # minimal for smoke-test + + +def _generate_image(model_name: str, **extra_kwargs) -> tuple[list, float]: + """Load a FLUX model, generate one image, return (images, peak_memory_mb).""" + gc.collect() + current_omni_platform.empty_cache() + device_index = current_omni_platform.current_device() + current_omni_platform.reset_peak_memory_stats() + monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) + monitor.start() + + m = Omni(model=model_name, enforce_eager=True, **extra_kwargs) + + current_omni_platform.reset_peak_memory_stats() + outputs = m.generate( + "a photo of a cat sitting on a laptop keyboard", + OmniDiffusionSamplingParams( + height=HEIGHT, + width=WIDTH, + num_inference_steps=NUM_STEPS, + guidance_scale=0.0, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + ), + ) + + peak = monitor.peak_used_mb + monitor.stop() + + first_output = outputs[0] + assert first_output.final_output_type == "image" + req_out = first_output.request_output + assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images") + images = req_out.images + + del m + gc.collect() + current_omni_platform.empty_cache() + + return images, peak + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "L4"}) +def test_flux_autoround_w4a16_generates_image(): + """Load the W4A16 quantized FLUX model and verify it produces a valid image.""" + images, _ = _generate_image(QUANTIZED_MODEL) + + assert len(images) >= 1, "Expected at least one generated image" + img = images[0] + assert img.width == WIDTH, f"Expected width {WIDTH}, got {img.width}" + assert img.height == HEIGHT, f"Expected height {HEIGHT}, got {img.height}" + + # Sanity: image should not be blank (all-zero) + import numpy as np + + arr = np.array(img) + assert arr.std() > 1.0, "Generated image appears blank (std ≈ 0)" + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "L4"}) +def test_flux_autoround_w4a16_memory_savings(): + """Compare peak GPU memory of quantized vs FP16 baseline. + + The W4A16 model should use meaningfully less memory than the + BF16/FP16 baseline since weights are 4-bit instead of 16-bit. 
+ """ + quant_images, quant_peak = _generate_image(QUANTIZED_MODEL) + cleanup_dist_env_and_memory() + _, baseline_peak = _generate_image(BASELINE_MODEL) + + print(f"Quantized (W4A16) peak memory: {quant_peak:.0f} MB") + print(f"Baseline (BF16) peak memory: {baseline_peak:.0f} MB") + print(f"Savings: {baseline_peak - quant_peak:.0f} MB") + + # W4A16 weights are 4x smaller than BF16/FP16. FLUX.1-dev transformer + # is ~12 GB in BF16, so we expect ~9 GB savings on weights alone. + # Use a conservative threshold to account for activations and overhead. + min_savings_mb = 2000 + assert quant_peak + min_savings_mb < baseline_peak, ( + f"Quantized model ({quant_peak:.0f} MB) should use at least " + f"{min_savings_mb} MB less than baseline ({baseline_peak:.0f} MB)" + ) diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py index 12eb5ed3da..3071fd9d56 100644 --- a/vllm_omni/diffusion/data.py +++ b/vllm_omni/diffusion/data.py @@ -193,12 +193,24 @@ class TransformerConfig: """Container for raw transformer configuration dictionaries.""" params: dict[str, Any] = field(default_factory=dict) + quant_method: str | None = None + quant_config: "QuantizationConfig | None" = None @classmethod def from_dict(cls, data: dict[str, Any]) -> "TransformerConfig": if not isinstance(data, dict): raise TypeError(f"Expected transformer config dict, got {type(data)!r}") - return cls(params=dict(data)) + params = dict(data) # copy to avoid mutating caller's dict + + quant_method: str | None = None + quant_config: QuantizationConfig | None = None + disk_qc = params.get("quantization_config") + if isinstance(disk_qc, dict) and "quant_method" in disk_qc: + quant_method = disk_qc["quant_method"] + kwargs = {k: v for k, v in disk_qc.items() if k != "quant_method"} + quant_config = build_quant_config(quant_method, **kwargs) + + return cls(params=params, quant_method=quant_method, quant_config=quant_config) def to_dict(self) -> dict[str, Any]: return dict(self.params) @@ -598,6 +610,17 @@ def __post_init__(self): # If it's neither dict nor DiffusionCacheConfig, convert to empty config self.cache_config = DiffusionCacheConfig() + # Auto-detect quantization from TransformerConfig if not explicitly set. + # This covers the case where tf_model_config is passed at construction + # time. For late (post-construction) assignment, callers should use + # set_tf_model_config() which propagates quant_config automatically. + if self.quantization_config is None and self.tf_model_config.quant_config is not None: + self.quantization_config = self.tf_model_config.quant_config + logger.info( + "Auto-detected quantization '%s' from model config", + self.tf_model_config.quant_method, + ) + # Resolve quantization_config: str/dict -> QuantizationConfig via build_quant_config. if self.quantization_config is not None: if isinstance(self.quantization_config, QuantizationConfig): @@ -617,6 +640,28 @@ def __post_init__(self): elif self.max_cpu_loras < 1: raise ValueError("max_cpu_loras must be >= 1 for diffusion LoRA") + def set_tf_model_config(self, tf_config: "TransformerConfig") -> None: + """Assign `tf_model_config` and propagate quantization if detected. + + In the normal startup flow `OmniDiffusionConfig` is created + *before* the transformer `config.json` is loaded from disk, so + `__post_init__` sees an empty `TransformerConfig`. Callers + that load the config later should use this method instead of bare + assignment so that an embedded `quant_config` is propagated to + `self.quantization_config` automatically. 
+ + Args: + tf_config: Transformer configuration, typically built via + `TransformerConfig.from_dict`. + """ + self.tf_model_config = tf_config + if self.quantization_config is None and tf_config.quant_config is not None: + self.quantization_config = tf_config.quant_config + logger.info( + "Auto-detected quantization '%s' from model config", + tf_config.quant_method, + ) + def update_multimodal_support(self) -> None: self.supports_multimodal_inputs = self.model_class_name in {"QwenImageEditPlusPipeline"} diff --git a/vllm_omni/diffusion/layers/adalayernorm.py b/vllm_omni/diffusion/layers/adalayernorm.py index c2389cc151..35f63e2fc9 100644 --- a/vllm_omni/diffusion/layers/adalayernorm.py +++ b/vllm_omni/diffusion/layers/adalayernorm.py @@ -1,11 +1,16 @@ from importlib.util import find_spec +from typing import TYPE_CHECKING import torch import torch.nn as nn from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ReplicatedLinear from vllm_omni.diffusion.layers.custom_op import CustomOp +if TYPE_CHECKING: + from vllm.model_executor.layers.quantization.base_config import QuantizationConfig + logger = init_logger(__name__) _HAS_MINDIESD = find_spec("mindiesd") is not None @@ -123,3 +128,102 @@ def forward_native( shift_result, scale_result, gate_result = self.preprocess(mod_params, index) return self.layernorm(x) * (1 + scale_result) + shift_result, gate_result + + +class AdaLayerNormZero(nn.Module): + def __init__( + self, + embedding_dim: int, + bias: bool = True, + quant_config: "QuantizationConfig | None" = None, + prefix: str = "", + ): + super().__init__() + self.emb = None + self.silu = nn.SiLU() + self.linear = ReplicatedLinear( + embedding_dim, + 6 * embedding_dim, + bias=bias, + return_bias=False, + quant_config=quant_config, + prefix=f"{prefix}.linear", + ) + self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6) + + def forward( + self, + x: torch.Tensor, + emb: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + emb = self.linear(self.silu(emb)) + if isinstance(emb, tuple): + emb = emb[0] + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1) + x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None] + return x, gate_msa, shift_mlp, scale_mlp, gate_mlp + + +class AdaLayerNormZeroSingle(nn.Module): + def __init__( + self, + embedding_dim: int, + bias: bool = True, + quant_config: "QuantizationConfig | None" = None, + prefix: str = "", + ): + super().__init__() + self.silu = nn.SiLU() + self.linear = ReplicatedLinear( + embedding_dim, + 3 * embedding_dim, + bias=bias, + return_bias=False, + quant_config=quant_config, + prefix=f"{prefix}.linear", + ) + self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6) + + def forward( + self, + x: torch.Tensor, + emb: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + emb = self.linear(self.silu(emb)) + if isinstance(emb, tuple): + emb = emb[0] + shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=1) + x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None] + return x, gate_msa + + +class AdaLayerNormContinuous(nn.Module): + def __init__( + self, + embedding_dim: int, + conditioning_embedding_dim: int, + elementwise_affine: bool = False, + eps: float = 1e-6, + bias: bool = True, + quant_config: "QuantizationConfig | None" = None, + prefix: str = "", + ): + super().__init__() + self.silu = nn.SiLU() + self.linear = ReplicatedLinear( + conditioning_embedding_dim, + embedding_dim 
* 2, + bias=bias, + return_bias=False, + quant_config=quant_config, + prefix=f"{prefix}.linear", + ) + self.norm = nn.LayerNorm(embedding_dim, eps=eps, elementwise_affine=elementwise_affine) + + def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor: + emb = self.linear(self.silu(conditioning_embedding).to(x.dtype)) + if isinstance(emb, tuple): + emb = emb[0] + scale, shift = torch.chunk(emb, 2, dim=1) + x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :] + return x diff --git a/vllm_omni/diffusion/model_loader/diffusers_loader.py b/vllm_omni/diffusion/model_loader/diffusers_loader.py index c48640e342..146afb26fb 100644 --- a/vllm_omni/diffusion/model_loader/diffusers_loader.py +++ b/vllm_omni/diffusion/model_loader/diffusers_loader.py @@ -7,7 +7,7 @@ import time from collections.abc import Generator, Iterable from pathlib import Path -from typing import cast +from typing import TYPE_CHECKING, cast import torch from huggingface_hub import hf_hub_download @@ -34,6 +34,9 @@ from vllm_omni.diffusion.model_loader.gguf_adapters import get_gguf_adapter from vllm_omni.diffusion.registry import initialize_model +if TYPE_CHECKING: + from vllm_omni.diffusion.data import OmniDiffusionConfig + logger = init_logger(__name__) @@ -332,12 +335,60 @@ def load_weights(self, model: nn.Module) -> None: weights_scale_not_loaded = {name for name in weights_not_loaded if name.endswith("weight_scale")} weights_not_loaded = weights_not_loaded - weights_scale_not_loaded if weights_not_loaded: - raise ValueError(f"Following weights were not initialized from checkpoint: {weights_not_loaded}") + self._check_unloaded_weights(weights_not_loaded) if weights_scale_not_loaded: logger.warning( f"Following weight_scale weights were not initialized from checkpoint: {weights_scale_not_loaded}" ) + @staticmethod + def _is_expected_quantized_weight(name: str) -> bool: + """Return True if *name* is a quantization-specific parameter. + + Quantization methods (GPTQ, AWQ, FP8, GGUF, Autoround, etc.) create extra + parameters that have no counterpart in an unquantized checkpoint. + These are expected to be absent and should not trigger a load error. + """ + # Weight suffixes that quantization methods register in the model but + # are not present in unquantized checkpoints. + _QUANTIZED_WEIGHT_SUFFIXES = ( + # GPTQ / AWQ / AutoRound – g_idx is optional (not all checkpoints include it) + ".g_idx", + # FP8 + ".weight_scale", + ".weight_scale_inv", + ".input_scale", + # GGUF + ".qweight_type", + # INT8 (weight_scale already covered above) + ) + return name.endswith(_QUANTIZED_WEIGHT_SUFFIXES) + + def _check_unloaded_weights( + self, + weights_not_loaded: set[str], + ) -> None: + """Validate unloaded weights, tolerating expected quantization artifacts. + + For quantized models, weights matching known quant-specific suffixes + are logged as a warning. Any *other* missing weight raises + ``ValueError`` regardless of quantization. 
+ """ + od_config = getattr(self, "od_config", None) + if od_config is None or od_config.quantization_config is None: + raise ValueError(f"Following weights were not initialized from checkpoint: {weights_not_loaded}") + + expected_missing = {w for w in weights_not_loaded if self._is_expected_quantized_weight(w)} + unexpected_missing = weights_not_loaded - expected_missing + + if expected_missing: + logger.warning( + "Following weights were not initialized from checkpoint (expected for quantized models): %s", + expected_missing, + ) + if unexpected_missing: + raise ValueError(f"Following weights were not initialized from checkpoint: {unexpected_missing}") + def _is_gguf_quantization(self, od_config: OmniDiffusionConfig) -> bool: quant_config = od_config.quantization_config if quant_config is None: diff --git a/vllm_omni/diffusion/models/flux/flux_transformer.py b/vllm_omni/diffusion/models/flux/flux_transformer.py index df3a267420..362fb4446f 100644 --- a/vllm_omni/diffusion/models/flux/flux_transformer.py +++ b/vllm_omni/diffusion/models/flux/flux_transformer.py @@ -12,7 +12,6 @@ get_1d_rotary_pos_embed, ) from diffusers.models.modeling_outputs import Transformer2DModelOutput -from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle from diffusers.utils import is_torch_npu_available from vllm.distributed import get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather from vllm.logger import init_logger @@ -32,6 +31,11 @@ from vllm_omni.diffusion.attention.layer import Attention from vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.layers.adalayernorm import ( + AdaLayerNormContinuous, + AdaLayerNormZero, + AdaLayerNormZeroSingle, +) from vllm_omni.diffusion.layers.rope import RotaryEmbedding, apply_rope_to_qk logger = init_logger(__name__) @@ -46,6 +50,7 @@ def __init__( approximate: str, bias: bool = True, quant_config: "QuantizationConfig | None" = None, + prefix: str = "", ): super().__init__() self.proj = ColumnParallelLinear( @@ -55,6 +60,7 @@ def __init__( gather_output=False, return_bias=False, quant_config=quant_config, + prefix=f"{prefix}.proj", ) self.approximate = approximate @@ -73,6 +79,7 @@ def __init__( inner_dim: int | None = None, bias: bool = True, quant_config: "QuantizationConfig | None" = None, + prefix: str = "", ) -> None: super().__init__() @@ -82,7 +89,9 @@ def __init__( dim_out = dim_out or dim layers: list[nn.Module] = [ - ColumnParallelApproxGELU(dim, inner_dim, approximate="tanh", bias=bias, quant_config=quant_config), + ColumnParallelApproxGELU( + dim, inner_dim, approximate="tanh", bias=bias, quant_config=quant_config, prefix=f"{prefix}.net.0" + ), nn.Identity(), # placeholder for weight loading RowParallelLinear( inner_dim, @@ -90,6 +99,7 @@ def __init__( input_is_parallel=True, return_bias=False, quant_config=quant_config, + prefix=f"{prefix}.net.2", ), ] @@ -117,6 +127,7 @@ def __init__( context_pre_only: bool | None = None, pre_only: bool = False, quant_config: "QuantizationConfig | None" = None, + prefix: str = "", ): super().__init__() @@ -141,6 +152,7 @@ def __init__( total_num_heads=self.heads, bias=bias, quant_config=quant_config, + prefix=f"{prefix}.to_qkv", ) if not self.pre_only: @@ -153,6 +165,7 @@ def __init__( input_is_parallel=True, return_bias=False, quant_config=quant_config, + prefix=f"{prefix}.to_out.0", ), nn.Dropout(dropout), ] @@ -168,8 +181,8 @@ def __init__( total_num_heads=self.heads, bias=added_proj_bias, quant_config=quant_config, + 
prefix=f"{prefix}.add_kv_proj", ) - self.to_add_out = RowParallelLinear( self.inner_dim, query_dim, @@ -177,6 +190,7 @@ def __init__( input_is_parallel=True, return_bias=False, quant_config=quant_config, + prefix=f"{prefix}.to_add_out", ) self.rope = RotaryEmbedding(is_neox_style=False) @@ -272,11 +286,11 @@ def __init__( qk_norm: str = "rms_norm", eps: float = 1e-6, quant_config: "QuantizationConfig | None" = None, + prefix: str = "", ): super().__init__() - - self.norm1 = AdaLayerNormZero(dim) - self.norm1_context = AdaLayerNormZero(dim) + self.norm1 = AdaLayerNormZero(dim, quant_config=quant_config, prefix=f"{prefix}.norm1") + self.norm1_context = AdaLayerNormZero(dim, quant_config=quant_config, prefix=f"{prefix}.norm1_context") self.attn = FluxAttention( query_dim=dim, @@ -288,13 +302,14 @@ def __init__( bias=True, eps=eps, quant_config=quant_config, + prefix=f"{prefix}.attn", ) self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6) - self.ff = FeedForward(dim=dim, dim_out=dim, quant_config=quant_config) + self.ff = FeedForward(dim=dim, dim_out=dim, quant_config=quant_config, prefix=f"{prefix}.ff") self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6) - self.ff_context = FeedForward(dim=dim, dim_out=dim, quant_config=quant_config) + self.ff_context = FeedForward(dim=dim, dim_out=dim, quant_config=quant_config, prefix=f"{prefix}.ff_context") def forward( self, @@ -361,17 +376,28 @@ def __init__( attention_head_dim: int, mlp_ratio: float = 4.0, quant_config: "QuantizationConfig | None" = None, + prefix: str = "", ): super().__init__() self.mlp_hidden_dim = int(dim * mlp_ratio) - self.norm = AdaLayerNormZeroSingle(dim) + self.norm = AdaLayerNormZeroSingle(dim, quant_config=quant_config, prefix=f"{prefix}.norm") self.proj_mlp = ReplicatedLinear( - dim, self.mlp_hidden_dim, bias=True, return_bias=False, quant_config=quant_config + dim, + self.mlp_hidden_dim, + bias=True, + return_bias=False, + quant_config=quant_config, + prefix=f"{prefix}.proj_mlp", ) self.act_mlp = nn.GELU(approximate="tanh") self.proj_out = ReplicatedLinear( - dim + self.mlp_hidden_dim, dim, bias=True, return_bias=False, quant_config=quant_config + dim + self.mlp_hidden_dim, + dim, + bias=True, + return_bias=False, + quant_config=quant_config, + prefix=f"{prefix}.proj_out", ) self.attn = FluxAttention( @@ -383,6 +409,7 @@ def __init__( eps=1e-6, pre_only=True, quant_config=quant_config, + prefix=f"{prefix}.attn", ) def forward( @@ -542,8 +569,9 @@ def __init__( num_attention_heads=num_attention_heads, attention_head_dim=attention_head_dim, quant_config=quant_config, + prefix=f"transformer_blocks.{i}", ) - for _ in range(num_layers) + for i in range(num_layers) ] ) @@ -554,12 +582,20 @@ def __init__( num_attention_heads=num_attention_heads, attention_head_dim=attention_head_dim, quant_config=quant_config, + prefix=f"single_transformer_blocks.{i}", ) - for _ in range(num_single_layers) + for i in range(num_single_layers) ] ) - self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6) + self.norm_out = AdaLayerNormContinuous( + self.inner_dim, + self.inner_dim, + elementwise_affine=False, + eps=1e-6, + quant_config=quant_config, + prefix="norm_out", + ) self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True) def forward( diff --git a/vllm_omni/entrypoints/async_omni_diffusion.py b/vllm_omni/entrypoints/async_omni_diffusion.py index 674c3509d2..558ef96cb9 100644 --- 
a/vllm_omni/entrypoints/async_omni_diffusion.py +++ b/vllm_omni/entrypoints/async_omni_diffusion.py @@ -113,7 +113,7 @@ def __init__( od_config.update_multimodal_support() tf_config_dict = get_hf_file_to_dict("transformer/config.json", od_config.model) - od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict) + od_config.set_tf_model_config(TransformerConfig.from_dict(tf_config_dict)) else: raise FileNotFoundError("model_index.json not found") except (AttributeError, OSError, ValueError, FileNotFoundError): @@ -121,7 +121,7 @@ def __init__( if cfg is None: raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}") - od_config.tf_model_config = TransformerConfig.from_dict(cfg) + od_config.set_tf_model_config(TransformerConfig.from_dict(cfg)) model_type = cfg.get("model_type") architectures = cfg.get("architectures") or [] # Bagel/NextStep models don't have a model_index.json, so we set the pipeline class name manually diff --git a/vllm_omni/quantization/factory.py b/vllm_omni/quantization/factory.py index a867f37a40..f85589d69b 100644 --- a/vllm_omni/quantization/factory.py +++ b/vllm_omni/quantization/factory.py @@ -41,9 +41,25 @@ def _build_int8(**kw: Any) -> QuantizationConfig: return DiffusionInt8Config(**kw) +def _build_inc(**kw: Any) -> QuantizationConfig: + """Lazy import for INC/AutoRound config with checkpoint kwarg normalization.""" + from vllm.model_executor.layers.quantization.inc import INCConfig + + # Map checkpoint key 'bits' to INCConfig's 'weight_bits' + if "bits" in kw and "weight_bits" not in kw: + kw["weight_bits"] = kw.pop("bits") + + # Filter to only valid INCConfig params + valid = set(inspect.signature(INCConfig.__init__).parameters) - {"self"} + filtered = {k: v for k, v in kw.items() if k in valid} + return INCConfig(**filtered) + + _OVERRIDES: dict[str, Callable[..., QuantizationConfig]] = { "gguf": _build_gguf, "int8": _build_int8, + "inc": _build_inc, + "auto-round": _build_inc, } SUPPORTED_QUANTIZATION_METHODS: list[str] = list(dict.fromkeys(QUANTIZATION_METHODS + list(_OVERRIDES.keys()))) From e2892ef6c1d461b65435339ca3ac83c17d8d3c0f Mon Sep 17 00:00:00 2001 From: Canlin Guo Date: Thu, 2 Apr 2026 14:28:44 +0800 Subject: [PATCH 025/204] [Perf] Optimize Wan2.2 rotary embedding (#2393) Signed-off-by: gcanlin --- .../diffusion/models/wan2_2/wan2_2_transformer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py index a4ae3118a7..20e2b9fea8 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py @@ -52,10 +52,14 @@ def apply_rotary_emb_wan( x1, x2 = hidden_states.unflatten(-1, (-1, 2)).unbind(-1) cos = freqs_cos[..., 0::2] sin = freqs_sin[..., 1::2] - out = torch.empty_like(hidden_states) - out[..., 0::2] = x1 * cos - x2 * sin - out[..., 1::2] = x1 * sin + x2 * cos - return out.type_as(hidden_states) + rotated = torch.stack( + ( + x1 * cos - x2 * sin, + x1 * sin + x2 * cos, + ), + dim=-1, + ) + return rotated.flatten(-2, -1).to(hidden_states.dtype) class DistributedRMSNorm(nn.Module): From 458f4023235f1d49ea10e47fb641a051b431e438 Mon Sep 17 00:00:00 2001 From: Binh Tang Date: Thu, 2 Apr 2026 00:42:50 -0700 Subject: [PATCH 026/204] Add VACE support for WAN 2.1 conditional video generation (#1885) Signed-off-by: Binh Tang Signed-off-by: Binh Tang Signed-off-by: Didan Deng 
<33117903+wtomin@users.noreply.github.com> Co-authored-by: Binh Tang Co-authored-by: Didan Deng <33117903+wtomin@users.noreply.github.com> --- docs/models/supported_models.md | 1 + docs/user_guide/diffusion_features.md | 1 + .../vace/vace_video_generation.md | 88 +++ .../vace/vace_video_generation.py | 209 ++++++ .../test_wan_2_1_vace_expansion.py | 161 +++++ vllm_omni/diffusion/models/wan2_2/__init__.py | 14 + .../models/wan2_2/pipeline_wan2_2.py | 8 +- .../models/wan2_2/pipeline_wan2_2_vace.py | 645 ++++++++++++++++++ .../models/wan2_2/wan2_2_vace_transformer.py | 254 +++++++ vllm_omni/diffusion/registry.py | 7 + 10 files changed, 1386 insertions(+), 2 deletions(-) create mode 100644 examples/offline_inference/vace/vace_video_generation.md create mode 100644 examples/offline_inference/vace/vace_video_generation.py create mode 100644 tests/e2e/online_serving/test_wan_2_1_vace_expansion.py create mode 100644 vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_vace.py create mode 100644 vllm_omni/diffusion/models/wan2_2/wan2_2_vace_transformer.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 0706a67864..68024e18b3 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -31,6 +31,7 @@ th { | `ZImagePipeline` | Z-Image | `Tongyi-MAI/Z-Image-Turbo` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `WanPipeline` | Wan2.1-T2V, Wan2.2-T2V, Wan2.2-TI2V | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers`, `Wan-AI/Wan2.1-T2V-14B-Diffusers`, `Wan-AI/Wan2.2-T2V-A14B-Diffusers`, `Wan-AI/Wan2.2-TI2V-5B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `WanImageToVideoPipeline` | Wan2.2-I2V | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | +| `Wan22VACEPipeline` | Wan2.1-VACE | `Wan-AI/Wan2.1-VACE-1.3B-diffusers`, `Wan-AI/Wan2.1-VACE-14B-diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `LTX2Pipeline` | LTX-2-T2V | `Lightricks/LTX-2` | ✅︎ | ✅︎ | | | | `LTX2ImageToVideoPipeline` | LTX-2-I2V | `Lightricks/LTX-2` | ✅︎ | ✅︎ | | | | `HeliosPipeline`, `HeliosPyramidPipeline` | Helios | `BestWishYsh/Helios-Base`, `BestWishYsh/Helios-Mid`, `BestWishYsh/Helios-Distilled` | ✅︎ | ✅︎ | ✅︎ | | diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 607d9af73c..9cd407d377 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -130,6 +130,7 @@ The following tables show which models support each feature: | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| | **Wan2.2** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| **Wan2.1-VACE** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | **LTX-2** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | **Helios** | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | | **HunyuanVideo-1.5 T2V I2V** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | diff --git a/examples/offline_inference/vace/vace_video_generation.md b/examples/offline_inference/vace/vace_video_generation.md new file mode 100644 index 0000000000..bbaf994528 --- /dev/null +++ b/examples/offline_inference/vace/vace_video_generation.md @@ -0,0 +1,88 @@ +# VACE Video Generation + +[VACE](https://github.com/ali-vilab/VACE) (Video All-in-one Creation Engine) supports multiple video tasks through a single model. 
+ +| Model | Architecture | Model Weights (bf16) | HuggingFace | +|-------|-------------|----------------------|-------------| +| Wan2.1-VACE (1.3B) | Wan2.1 | ~10 GB | [Wan-AI/Wan2.1-VACE-1.3B-diffusers](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B-diffusers) | +| Wan2.1-VACE (14B) | Wan2.1 | ~38 GB | [Wan-AI/Wan2.1-VACE-14B-diffusers](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers) | + +## Text-to-Video (T2V) + +```bash +python vace_video_generation.py \ + --mode t2v \ + --prompt "A sleek robot stands in a vast warehouse filled with boxes" \ + --height 480 --width 832 --num-frames 81 \ + --num-inference-steps 30 --guidance-scale 5.0 --flow-shift 5.0 \ + --output t2v_output.mp4 +``` + +## Image-to-Video (I2V) + +First frame is kept, remaining frames are generated: + +```bash +python vace_video_generation.py \ + --mode i2v \ + --image astronaut.jpg \ + --prompt "An astronaut emerging from a cracked egg on the moon" \ + --height 480 --width 832 --num-frames 81 \ + --output i2v_output.mp4 +``` + +## First-Last-Frame Interpolation (FLF2V) + +```bash +python vace_video_generation.py \ + --mode flf2v \ + --image first_frame.jpg --last-image last_frame.jpg \ + --prompt "A bird takes off from a branch and lands on another" \ + --height 512 --width 512 --num-frames 81 \ + --output flf2v_output.mp4 +``` + +## Inpainting + +Center vertical stripe is masked and regenerated: + +```bash +python vace_video_generation.py \ + --mode inpaint \ + --image scene.jpg \ + --prompt "Shrek walks out of a building" \ + --height 480 --width 832 --num-frames 81 \ + --output inpaint_output.mp4 +``` + +## Reference Image-guided (R2V) + +```bash +python vace_video_generation.py \ + --mode r2v \ + --image reference.jpg \ + --prompt "Camera slowly zooms out from the character" \ + --height 480 --width 832 --num-frames 81 \ + --output r2v_output.mp4 +``` + +## Key Arguments + +- `--mode`: VACE task mode (`t2v`, `i2v`, `flf2v`, `inpaint`, `r2v`). +- `--model`: Model ID (default: `Wan-AI/Wan2.1-VACE-1.3B-diffusers`). +- `--image`: Input image for I2V, inpainting, and R2V modes. +- `--last-image`: Last frame image for FLF2V mode. +- `--prompt`: Text description of desired video. +- `--height/--width`: Output resolution (default 480x832). Dimensions should be multiples of 16. +- `--num-frames`: Number of frames (default 81). +- `--guidance-scale`: CFG scale (default 5.0). +- `--flow-shift`: Scheduler flow shift (default 5.0). +- `--num-inference-steps`: Number of denoising steps (default 30). +- `--fps`: Frames per second for the saved MP4 (default 16). +- `--output`: Path to save the generated video. +- `--vae-use-tiling`: Enable VAE tiling for memory optimization. +- `--ulysses-degree`: Ulysses sequence parallelism degree for multi-GPU. +- `--cfg-parallel-size`: CFG parallel size for multi-GPU. +- `--tensor-parallel-size`: Tensor parallel size. + +> If you encounter OOM errors, try `--vae-use-tiling` or multi-GPU parallelism options. diff --git a/examples/offline_inference/vace/vace_video_generation.py b/examples/offline_inference/vace/vace_video_generation.py new file mode 100644 index 0000000000..6ca0d74c52 --- /dev/null +++ b/examples/offline_inference/vace/vace_video_generation.py @@ -0,0 +1,209 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""VACE video generation example. 
+ +VACE (Video All-in-one Creation Engine) supports multiple video tasks: + - T2V: Text-to-Video + - I2V: Image-to-Video (first frame conditioning) + - V2LF: Video-to-Last-Frame + - FLF2V: First-Last-Frame interpolation + - Inpainting: Masked region generation + - R2V: Reference image-guided generation + +Usage examples: + # T2V (text-to-video) + python vace_video_generation.py --mode t2v --prompt "A robot in a warehouse" + + # I2V (image-to-video, first frame kept) + python vace_video_generation.py --mode i2v --image input.jpg --prompt "..." + + # FLF2V (first-last frame interpolation) + python vace_video_generation.py --mode flf2v --image first.jpg --last-image last.jpg + + # R2V (reference image guided) + python vace_video_generation.py --mode r2v --image ref.jpg --prompt "..." +""" + +import argparse +import time +from pathlib import Path + +import numpy as np +import PIL.Image +import torch + +from vllm_omni.diffusion.data import DiffusionParallelConfig +from vllm_omni.entrypoints.omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.platforms import current_omni_platform + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="VACE video generation.") + parser.add_argument( + "--model", + default="Wan-AI/Wan2.1-VACE-14B-diffusers", + help="VACE model ID or local path.", + ) + parser.add_argument( + "--mode", + default="t2v", + choices=["t2v", "i2v", "v2lf", "flf2v", "inpaint", "r2v"], + help="Generation mode.", + ) + parser.add_argument("--prompt", default="A cat walking in a garden", help="Text prompt.") + parser.add_argument("--negative-prompt", default="", help="Negative prompt.") + parser.add_argument("--image", type=str, default=None, help="Input image path (for I2V, R2V, FLF2V, inpaint).") + parser.add_argument("--last-image", type=str, default=None, help="Last frame image path (for FLF2V).") + parser.add_argument("--video-dir", type=str, default=None, help="Directory of video frames (for inpaint).") + parser.add_argument("--seed", type=int, default=42, help="Random seed.") + parser.add_argument("--guidance-scale", type=float, default=5.0, help="CFG guidance scale.") + parser.add_argument("--height", type=int, default=480, help="Video height.") + parser.add_argument("--width", type=int, default=832, help="Video width.") + parser.add_argument("--num-frames", type=int, default=81, help="Number of frames.") + parser.add_argument("--num-inference-steps", type=int, default=30, help="Sampling steps.") + parser.add_argument("--flow-shift", type=float, default=5.0, help="Scheduler flow_shift.") + parser.add_argument("--output", type=str, default="vace_output.mp4", help="Output video path.") + parser.add_argument("--fps", type=int, default=16, help="Output video FPS.") + parser.add_argument("--vae-use-tiling", action="store_true", default=True, help="Enable VAE tiling.") + parser.add_argument("--enforce-eager", action="store_true", help="Disable torch.compile.") + parser.add_argument("--ulysses-degree", type=int, default=1, help="Ulysses SP degree.") + parser.add_argument("--ring-degree", type=int, default=1, help="Ring attention degree.") + parser.add_argument("--cfg-parallel-size", type=int, default=1, choices=[1, 2], help="CFG parallel size.") + return parser.parse_args() + + +def build_prompts(args): + """Build prompt dict with multi_modal_data based on mode.""" + h, w, nf = args.height, args.width, args.num_frames + + gray = PIL.Image.new("RGB", (w, h), (128, 128, 128)) + mask_black = PIL.Image.new("L", 
(w, h), 0) + mask_white = PIL.Image.new("L", (w, h), 255) + + prompt_data = { + "prompt": args.prompt, + "negative_prompt": args.negative_prompt, + } + + if args.mode == "t2v": + return prompt_data + + if args.mode == "r2v": + assert args.image, "--image required for R2V mode" + ref_img = PIL.Image.open(args.image).convert("RGB").resize((w, h)) + prompt_data["multi_modal_data"] = {"reference_images": [ref_img]} + return prompt_data + + if args.mode == "i2v": + assert args.image, "--image required for I2V mode" + img = PIL.Image.open(args.image).convert("RGB").resize((w, h)) + prompt_data["multi_modal_data"] = { + "video": [img] + [gray] * (nf - 1), + "mask": [mask_black] + [mask_white] * (nf - 1), + } + return prompt_data + + if args.mode == "v2lf": + assert args.image, "--image required for V2LF mode" + img = PIL.Image.open(args.image).convert("RGB").resize((w, h)) + prompt_data["multi_modal_data"] = { + "video": [gray] * (nf - 1) + [img], + "mask": [mask_white] * (nf - 1) + [mask_black], + } + return prompt_data + + if args.mode == "flf2v": + assert args.image and args.last_image, "--image and --last-image required for FLF2V" + first = PIL.Image.open(args.image).convert("RGB").resize((w, h)) + last = PIL.Image.open(args.last_image).convert("RGB").resize((w, h)) + prompt_data["multi_modal_data"] = { + "video": [first] + [gray] * (nf - 2) + [last], + "mask": [mask_black] + [mask_white] * (nf - 2) + [mask_black], + } + return prompt_data + + if args.mode == "inpaint": + assert args.image, "--image required for inpaint mode" + img = PIL.Image.open(args.image).convert("RGB").resize((w, h)) + d = 80 + frames, masks = [], [] + for _ in range(nf): + base = np.array(img).copy() + mask = PIL.Image.new("L", (w, h), 0) + stripe = PIL.Image.new("L", (2 * d, h), 255) + mask.paste(stripe, (w // 2 - d, 0)) + base[np.array(mask) > 128] = 128 + frames.append(PIL.Image.fromarray(base)) + masks.append(mask) + prompt_data["multi_modal_data"] = {"video": frames, "mask": masks} + return prompt_data + + raise ValueError(f"Unknown mode: {args.mode}") + + +def main(): + args = parse_args() + generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed) + + parallel_config = DiffusionParallelConfig( + ulysses_degree=args.ulysses_degree, + ring_degree=args.ring_degree, + cfg_parallel_size=args.cfg_parallel_size, + ) + + omni = Omni( + model=args.model, + vae_use_tiling=args.vae_use_tiling, + flow_shift=args.flow_shift, + enforce_eager=args.enforce_eager, + parallel_config=parallel_config, + ) + + prompt_data = build_prompts(args) + + print(f"\n{'=' * 60}") + print(f"VACE {args.mode.upper()} Generation") + print(f" Model: {args.model}") + print(f" Size: {args.width}x{args.height}, {args.num_frames} frames, {args.num_inference_steps} steps") + print(f"{'=' * 60}\n") + + start = time.perf_counter() + outputs = omni.generate( + prompt_data, + OmniDiffusionSamplingParams( + height=args.height, + width=args.width, + num_frames=args.num_frames, + num_inference_steps=args.num_inference_steps, + guidance_scale=args.guidance_scale, + generator=generator, + ), + ) + elapsed = time.perf_counter() - start + + video = outputs[0].images + if isinstance(video, list): + video = video[0] + if isinstance(video, torch.Tensor): + video = video.cpu().numpy() + if video.ndim == 5: + video = video[0] + print(f"Output shape: {video.shape}, Time: {elapsed:.1f}s") + + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + + from diffusers.utils import export_to_video + + if 
np.issubdtype(video.dtype, np.integer): + video = video.astype(np.float32) / 255.0 + export_to_video(list(video), str(output_path), fps=args.fps) + print(f"Saved to {output_path}") + + omni.close() + + +if __name__ == "__main__": + main() diff --git a/tests/e2e/online_serving/test_wan_2_1_vace_expansion.py b/tests/e2e/online_serving/test_wan_2_1_vace_expansion.py new file mode 100644 index 0000000000..0de70afe86 --- /dev/null +++ b/tests/e2e/online_serving/test_wan_2_1_vace_expansion.py @@ -0,0 +1,161 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Comprehensive e2e tests of diffusion features for Wan2.1-VACE in online serving mode. + +Wan2.1-VACE supports: Cache-DiT, Ulysses-SP, Ring, CFG-Parallel, TP, +VAE-Patch-Parallel, HSDP. TeaCache is NOT supported for this model, so +Cache-DiT is used in place of TeaCache for single-card and CFG tests. + +Uses the 1.3B variant for faster CI testing. + +Coverage: + Single GPU: + - Cache-DiT + layerwise CPU offload + Two GPUs: + - Cache-DiT + Ulysses-SP = 2 + - Cache-DiT + Ring = 2 + - Cache-DiT + CFG-Parallel = 2 + - Cache-DiT + TP = 2 + VAE-Patch-Parallel = 2 + - Cache-DiT + HSDP = 2 + VAE-Patch-Parallel = 2 +""" + +import pytest + +from tests.conftest import ( + OmniServer, + OmniServerParams, + OpenAIClientHandler, +) +from tests.utils import hardware_marks + +MODEL = "Wan-AI/Wan2.1-VACE-1.3B-diffusers" +PROMPT = "A cat walking slowly across a sunlit garden path" + +SINGLE_CARD_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}) +PARALLEL_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}, num_cards=2) + + +def _get_vace_feature_cases(): + return [ + # Single GPU: Cache-DiT + layerwise CPU offload + pytest.param( + OmniServerParams( + model=MODEL, + server_args=[ + "--cache-backend", + "cache_dit", + "--enable-layerwise-offload", + "--vae-use-tiling", + ], + ), + id="single_card_001", + marks=SINGLE_CARD_FEATURE_MARKS, + ), + # 2 GPUs: Cache-DiT + Ulysses-SP = 2 + pytest.param( + OmniServerParams( + model=MODEL, + server_args=[ + "--cache-backend", + "cache_dit", + "--ulysses-degree", + "2", + "--vae-use-tiling", + ], + ), + id="parallel_001", + marks=PARALLEL_FEATURE_MARKS, + ), + # 2 GPUs: Cache-DiT + Ring = 2 + pytest.param( + OmniServerParams( + model=MODEL, + server_args=[ + "--cache-backend", + "cache_dit", + "--ring", + "2", + "--vae-use-tiling", + ], + ), + id="parallel_002", + marks=PARALLEL_FEATURE_MARKS, + ), + # 2 GPUs: Cache-DiT + CFG-Parallel = 2 + pytest.param( + OmniServerParams( + model=MODEL, + server_args=[ + "--cache-backend", + "cache_dit", + "--cfg-parallel-size", + "2", + "--vae-use-tiling", + ], + ), + id="parallel_003", + marks=PARALLEL_FEATURE_MARKS, + ), + # 2 GPUs: Cache-DiT + TP = 2 + VAE-Patch-Parallel = 2 + pytest.param( + OmniServerParams( + model=MODEL, + server_args=[ + "--cache-backend", + "cache_dit", + "--tensor-parallel-size", + "2", + "--vae-patch-parallel-size", + "2", + "--vae-use-tiling", + ], + ), + id="parallel_004", + marks=PARALLEL_FEATURE_MARKS, + ), + # 2 GPUs: Cache-DiT + HSDP = 2 + VAE-Patch-Parallel = 2 + pytest.param( + OmniServerParams( + model=MODEL, + server_args=[ + "--cache-backend", + "cache_dit", + "--hsdp-shard-size", + "2", + "--vae-patch-parallel-size", + "2", + "--vae-use-tiling", + ], + ), + id="parallel_005", + marks=PARALLEL_FEATURE_MARKS, + ), + ] + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@pytest.mark.parametrize( + "omni_server", + _get_vace_feature_cases(), + indirect=True, +) +def 
test_wan_2_1_vace(omni_server: OmniServer, openai_client: OpenAIClientHandler): + """Test VACE T2V generation with all supported diffusion acceleration features.""" + openai_client.send_video_diffusion_request( + { + "model": MODEL, + "form_data": { + "prompt": PROMPT, + "height": 480, + "width": 320, + "num_frames": 5, + "fps": 8, + "num_inference_steps": 2, + "guidance_scale": 5.0, + "seed": 42, + }, + } + ) diff --git a/vllm_omni/diffusion/models/wan2_2/__init__.py b/vllm_omni/diffusion/models/wan2_2/__init__.py index c337f58a4a..d418001d95 100644 --- a/vllm_omni/diffusion/models/wan2_2/__init__.py +++ b/vllm_omni/diffusion/models/wan2_2/__init__.py @@ -1,3 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + from .pipeline_wan2_2 import ( Wan22Pipeline, create_transformer_from_config, @@ -16,7 +19,13 @@ get_wan22_ti2v_post_process_func, get_wan22_ti2v_pre_process_func, ) +from .pipeline_wan2_2_vace import ( + Wan22VACEPipeline, + get_wan22_vace_post_process_func, + get_wan22_vace_pre_process_func, +) from .wan2_2_transformer import WanTransformer3DModel +from .wan2_2_vace_transformer import VaceWanTransformerBlock, WanVACETransformer3DModel __all__ = [ "Wan22Pipeline", @@ -31,5 +40,10 @@ "Wan22TI2VPipeline", "get_wan22_ti2v_post_process_func", "get_wan22_ti2v_pre_process_func", + "Wan22VACEPipeline", + "get_wan22_vace_post_process_func", + "get_wan22_vace_pre_process_func", "WanTransformer3DModel", + "VaceWanTransformerBlock", + "WanVACETransformer3DModel", ] diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py index d7d8bad521..d2d2bb8602 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py @@ -278,13 +278,13 @@ def __init__( # Initialize transformers with correct config (weights loaded via load_weights) if load_transformer: transformer_config = load_transformer_config(model, "transformer", local_files_only) - self.transformer = create_transformer_from_config(transformer_config) + self.transformer = self._create_transformer(transformer_config) else: self.transformer = None if load_transformer_2: transformer_2_config = load_transformer_config(model, "transformer_2", local_files_only) - self.transformer_2 = create_transformer_from_config(transformer_2_config) + self.transformer_2 = self._create_transformer(transformer_2_config) else: self.transformer_2 = None @@ -316,6 +316,10 @@ def __init__( enable_diffusion_pipeline_profiler=self.od_config.enable_diffusion_pipeline_profiler ) + def _create_transformer(self, config: dict) -> WanTransformer3DModel: + """Create a transformer from a config dict. Subclasses may override.""" + return create_transformer_from_config(config) + @property def guidance_scale(self): return self._guidance_scale diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_vace.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_vace.py new file mode 100644 index 0000000000..ea52336311 --- /dev/null +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_vace.py @@ -0,0 +1,645 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +VACE (Video Creation and Editing) Pipeline for WAN models. + +VACE is an all-in-one model for video creation and editing. 
The mode is +determined by which inputs are provided (no explicit mode flag): + +- T2V: Text-to-Video (prompt only) +- R2V: Reference-to-Video (prompt + reference_images) +- V2V: Video-to-Video editing (prompt + video) +- MV2V: Masked Video-to-Video / inpainting (prompt + video + mask) +""" + +from __future__ import annotations + +from collections.abc import Iterable +from dataclasses import replace + +import PIL.Image +import torch +from vllm.logger import init_logger +from vllm.model_executor.models.utils import AutoWeightsLoader + +from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig +from vllm_omni.diffusion.models.interface import SupportImageInput +from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import ( + Wan22Pipeline, + retrieve_latents, +) +from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import ( + get_wan22_post_process_func as get_wan22_vace_post_process_func, # noqa: F401 +) +from vllm_omni.diffusion.models.wan2_2.wan2_2_vace_transformer import WanVACETransformer3DModel +from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.inputs.data import OmniTextPrompt +from vllm_omni.platforms import current_omni_platform + +logger = init_logger(__name__) + + +def create_vace_transformer_from_config(config: dict) -> WanVACETransformer3DModel: + """Create WanVACETransformer3DModel from config dict.""" + kwargs = {} + if "patch_size" in config: + kwargs["patch_size"] = tuple(config["patch_size"]) + if "num_attention_heads" in config: + kwargs["num_attention_heads"] = config["num_attention_heads"] + if "attention_head_dim" in config: + kwargs["attention_head_dim"] = config["attention_head_dim"] + if "in_channels" in config: + kwargs["in_channels"] = config["in_channels"] + if "out_channels" in config: + kwargs["out_channels"] = config["out_channels"] + if "text_dim" in config: + kwargs["text_dim"] = config["text_dim"] + if "freq_dim" in config: + kwargs["freq_dim"] = config["freq_dim"] + if "ffn_dim" in config: + kwargs["ffn_dim"] = config["ffn_dim"] + if "num_layers" in config: + kwargs["num_layers"] = config["num_layers"] + if "cross_attn_norm" in config: + kwargs["cross_attn_norm"] = config["cross_attn_norm"] + if "eps" in config: + kwargs["eps"] = config["eps"] + if "image_dim" in config: + kwargs["image_dim"] = config["image_dim"] + if "added_kv_proj_dim" in config: + kwargs["added_kv_proj_dim"] = config["added_kv_proj_dim"] + if "rope_max_seq_len" in config: + kwargs["rope_max_seq_len"] = config["rope_max_seq_len"] + if "pos_embed_seq_len" in config: + kwargs["pos_embed_seq_len"] = config["pos_embed_seq_len"] + if "vace_layers" in config: + kwargs["vace_layers"] = config["vace_layers"] + if "vace_in_channels" in config: + kwargs["vace_in_channels"] = config["vace_in_channels"] + + return WanVACETransformer3DModel(**kwargs) + + +def get_wan22_vace_pre_process_func(od_config: OmniDiffusionConfig): + """Pre-process function for VACE: handle reference images, source videos, and masks.""" + import numpy as np + + def pre_process_func(request: OmniDiffusionRequest) -> OmniDiffusionRequest: + for i, prompt in enumerate(request.prompts): + multi_modal_data = prompt.get("multi_modal_data", {}) if not isinstance(prompt, str) else None + if isinstance(prompt, str): + prompt = OmniTextPrompt(prompt=prompt) + if "additional_information" not in prompt: + prompt["additional_information"] = {} + + if not multi_modal_data: + request.prompts[i] = prompt + continue + + # Handle reference images for R2V + # "image" is the standard key from online 
serving (SupportImageInput convention) + # "reference_images" is the offline API key for backwards compatibility + ref_images = multi_modal_data.get("image") or multi_modal_data.get("reference_images") + if ref_images is not None: + if isinstance(ref_images, str): + ref_images = [PIL.Image.open(ref_images).convert("RGB")] + elif isinstance(ref_images, PIL.Image.Image): + ref_images = [ref_images] + elif isinstance(ref_images, list): + ref_images = [ + PIL.Image.open(img).convert("RGB") if isinstance(img, str) else img for img in ref_images + ] + + # Calculate dimensions from first reference image if not provided + if request.sampling_params.height is None or request.sampling_params.width is None: + first_img = ref_images[0] + max_area = 480 * 832 # VACE default is 480p + aspect_ratio = first_img.height / first_img.width + mod_value = 16 + height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value + width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value + + if request.sampling_params.height is None: + request.sampling_params.height = height + if request.sampling_params.width is None: + request.sampling_params.width = width + + prompt["additional_information"]["reference_images"] = ref_images + + # Handle source video for V2V / MV2V + source_video = multi_modal_data.get("video") + if source_video is not None: + if isinstance(source_video, list) and len(source_video) > 0: + if isinstance(source_video[0], str): + source_video = [PIL.Image.open(f).convert("RGB") for f in source_video] + prompt["additional_information"]["source_video"] = source_video + + # Handle mask for MV2V / inpainting + mask = multi_modal_data.get("mask") + if mask is not None: + if isinstance(mask, list) and len(mask) > 0: + if isinstance(mask[0], str): + mask = [PIL.Image.open(m).convert("L") for m in mask] + elif isinstance(mask, str): + mask = [PIL.Image.open(mask).convert("L")] + elif isinstance(mask, PIL.Image.Image): + mask = [mask] + prompt["additional_information"]["mask"] = mask + + request.prompts[i] = prompt + return request + + return pre_process_func + + +class Wan22VACEPipeline(Wan22Pipeline, SupportImageInput): + """VACE (Video Creation and Editing) Pipeline for Wan2.1. + + Extends Wan22Pipeline with VACE-specific context creation and weight loading. + All VACE modes (T2V, R2V, V2V, MV2V) are handled by varying the inputs. + """ + + def __init__( + self, + *, + od_config: OmniDiffusionConfig, + prefix: str = "", + ): + # VACE defaults to flow_shift=3.0 for 480p (base WAN T2V uses 5.0 for 720p) + if od_config.flow_shift is None: + od_config = replace(od_config, flow_shift=3.0) + + super().__init__(od_config=od_config, prefix=prefix) + + def _create_transformer(self, config: dict) -> WanVACETransformer3DModel: + """Build VACE transformer directly from config dict.""" + return create_vace_transformer_from_config(config) + + def check_inputs( + self, + prompt, + negative_prompt, + height, + width, + prompt_embeds=None, + negative_prompt_embeds=None, + video=None, + mask=None, + reference_images=None, + ): + super().check_inputs( + prompt=prompt, + negative_prompt=negative_prompt, + height=height, + width=width, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # VACE-specific: validate video/mask/reference_images consistency + if video is not None: + if mask is not None and len(video) != len(mask): + raise ValueError( + f"Length of `video` ({len(video)}) and `mask` ({len(mask)}) do not match. 
" + "Please make sure that they have the same length." + ) + if reference_images is not None: + is_pil_image = isinstance(reference_images, PIL.Image.Image) + is_list_of_pil_images = isinstance(reference_images, list) and all( + isinstance(img, PIL.Image.Image) for img in reference_images + ) + if not (is_pil_image or is_list_of_pil_images): + raise ValueError( + "`reference_images` has to be of type `PIL.Image.Image` or `list` of `PIL.Image.Image`, " + f"but is {type(reference_images)}" + ) + elif mask is not None: + raise ValueError("`mask` can only be passed if `video` is passed as well.") + + def preprocess_conditions( + self, + video: list | torch.Tensor | None, + mask: list | torch.Tensor | None, + reference_images: list[PIL.Image.Image] | None, + height: int, + width: int, + num_frames: int, + dtype: torch.dtype, + device: torch.device, + ) -> tuple[torch.Tensor, torch.Tensor, list[list[torch.Tensor]]]: + """Preprocess video, mask, and reference images for VACE conditioning. + + - If video is None, create zero tensor (T2V mode) + - If mask is None, create all-ones tensor (generate everything) + - Reference images are resized maintaining aspect ratio and center-padded + + Returns: + (video, mask, reference_images_processed) tensors ready for VAE encoding. + """ + from diffusers.video_processor import VideoProcessor + + video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial) + + if video is None: + video = torch.zeros(1, 3, num_frames, height, width, dtype=dtype, device=device) + image_size = (height, width) + else: + base = self.vae_scale_factor_spatial * self.transformer_config.patch_size[1] + if isinstance(video, list): + video_height, video_width = video_processor.get_default_height_width(video[0]) + # Downscale if video exceeds target area + if video_height * video_width > height * width: + scale = min(width / video_width, height / video_height) + video_height, video_width = int(video_height * scale), int(video_width * scale) + # Align to base + video_height = (video_height // base) * base + video_width = (video_width // base) * base + video = video_processor.preprocess_video(video, video_height, video_width) + image_size = (video.shape[-2], video.shape[-1]) + + if mask is None: + mask = torch.ones_like(video) + else: + if isinstance(mask, list): + mask = video_processor.preprocess_video(mask, image_size[0], image_size[1]) + mask = torch.clamp((mask + 1) / 2, min=0, max=1) + + video = video.to(dtype=dtype, device=device) + mask = mask.to(dtype=dtype, device=device) + + # Preprocess reference images: resize with aspect ratio, center-pad on white canvas + ref_images_processed: list[list[torch.Tensor]] = [] + if reference_images is not None and len(reference_images) > 0: + preprocessed = [] + for image in reference_images: + img_tensor = video_processor.preprocess(image, None, None) + img_h, img_w = img_tensor.shape[-2:] + scale = min(image_size[0] / img_h, image_size[1] / img_w) + new_h, new_w = int(img_h * scale), int(img_w * scale) + resized = torch.nn.functional.interpolate( + img_tensor, size=(new_h, new_w), mode="bilinear", align_corners=False + ).squeeze(0) + canvas = torch.ones(3, *image_size, device=device, dtype=dtype) + top = (image_size[0] - new_h) // 2 + left = (image_size[1] - new_w) // 2 + canvas[:, top : top + new_h, left : left + new_w] = resized + preprocessed.append(canvas) + ref_images_processed = [preprocessed] + else: + ref_images_processed = [[]] + + return video, mask, ref_images_processed + + def prepare_video_latents( + self, + 
video: torch.Tensor, + mask: torch.Tensor, + reference_images: list[list[torch.Tensor]], + generator: torch.Generator | None, + device: torch.device, + ) -> torch.Tensor: + """Encode video and reference images into VACE conditioning latents. + + - Encodes inactive (video * (1-mask)) and reactive (video * mask) regions + - Reference images are encoded and prepended as extra temporal frames + """ + vae_dtype = self.vae.dtype + + latents_mean = torch.tensor(self.vae.config.latents_mean, device=device, dtype=torch.float32).view( + 1, self.vae.config.z_dim, 1, 1, 1 + ) + latents_std = 1.0 / torch.tensor(self.vae.config.latents_std, device=device, dtype=torch.float32).view( + 1, self.vae.config.z_dim, 1, 1, 1 + ) + + # Binarize mask + mask = torch.where(mask > 0.5, 1.0, 0.0).to(dtype=vae_dtype) + + # Encode inactive and reactive regions separately + video = video.to(dtype=vae_dtype) + inactive = video * (1 - mask) + reactive = video * mask + + with torch.no_grad(): + inactive_latent = retrieve_latents(self.vae.encode(inactive), generator, sample_mode="argmax") + reactive_latent = retrieve_latents(self.vae.encode(reactive), generator, sample_mode="argmax") + + inactive_latent = ((inactive_latent.float() - latents_mean) * latents_std).to(vae_dtype) + reactive_latent = ((reactive_latent.float() - latents_mean) * latents_std).to(vae_dtype) + + # Concatenate inactive + reactive along channels -> [B, 2*z_dim, T, H, W] + latents = torch.cat([inactive_latent, reactive_latent], dim=1) + + # Prepend reference image latents along temporal dimension + latent_list = [] + for latent, ref_batch in zip(latents, reference_images): + for ref_image in ref_batch: + ref_image = ref_image.to(dtype=vae_dtype) + ref_image = ref_image[None, :, None, :, :] # [1, C, 1, H, W] + with torch.no_grad(): + ref_latent = retrieve_latents(self.vae.encode(ref_image), generator, sample_mode="argmax") + ref_latent = ((ref_latent.float() - latents_mean) * latents_std).to(vae_dtype) + ref_latent = ref_latent.squeeze(0) # [z_dim, 1, H, W] + # Double channels with zeros (inactive=ref, reactive=zeros) + ref_latent = torch.cat([ref_latent, torch.zeros_like(ref_latent)], dim=0) + # Prepend along temporal dimension + latent = torch.cat([ref_latent, latent], dim=1) + latent_list.append(latent) + + return torch.stack(latent_list) + + def prepare_masks( + self, + mask: torch.Tensor, + reference_images: list[list[torch.Tensor]], + ) -> torch.Tensor: + """Encode mask using spatial stride sampling and prepend reference padding. 
+ + - 8x8 spatial stride encoding -> 64 channels + - Zero-masks prepended for reference image frames + """ + patch_size = self.transformer_config.patch_size if hasattr(self.transformer_config, "patch_size") else (1, 2, 2) + if isinstance(self.transformer_config, dict): + patch_size = self.transformer_config.get("patch_size", (1, 2, 2)) + transformer_patch_size = patch_size[1] if isinstance(patch_size, list | tuple) else 2 + + mask_list = [] + for mask_, ref_batch in zip(mask, reference_images): + num_channels, num_frames, height, width = mask_.shape + new_num_frames = (num_frames + self.vae_scale_factor_temporal - 1) // self.vae_scale_factor_temporal + new_height = height // (self.vae_scale_factor_spatial * transformer_patch_size) * transformer_patch_size + new_width = width // (self.vae_scale_factor_spatial * transformer_patch_size) * transformer_patch_size + + m = mask_[0, :, :, :] # [T, H, W] + m = m.view(num_frames, new_height, self.vae_scale_factor_spatial, new_width, self.vae_scale_factor_spatial) + m = m.permute(2, 4, 0, 1, 3).flatten(0, 1) # [64, T, H', W'] + m = torch.nn.functional.interpolate( + m.unsqueeze(0), size=(new_num_frames, new_height, new_width), mode="nearest-exact" + ).squeeze(0) + + # Prepend zero-masks for reference image frames + num_ref = len(ref_batch) + if num_ref > 0: + mask_padding = torch.zeros_like(m[:, :num_ref, :, :]) + m = torch.cat([mask_padding, m], dim=1) + + mask_list.append(m) + + return torch.stack(mask_list) + + def forward( + self, + req: OmniDiffusionRequest, + prompt: str | None = None, + negative_prompt: str | None = None, + height: int = 480, + width: int = 832, + num_inference_steps: int = 50, + guidance_scale: float = 5.0, + frame_num: int = 81, + output_type: str | None = "np", + generator: torch.Generator | list[torch.Generator] | None = None, + prompt_embeds: torch.Tensor | None = None, + negative_prompt_embeds: torch.Tensor | None = None, + attention_kwargs: dict | None = None, + vace_context_scale: float | list[float] = 1.0, + **kwargs, + ) -> DiffusionOutput: + """Generate or edit video using VACE. + + The mode is determined by which inputs are provided in the request: + - T2V: prompt only (no video/mask/reference_images) + - R2V: prompt + reference_images (in multi_modal_data) + - V2V: prompt + video (in multi_modal_data) + - MV2V: prompt + video + mask (in multi_modal_data) + + Args: + req: Diffusion request containing prompt and optional multi-modal data. + prompt: Text prompt (overridden by req.prompts if provided). + negative_prompt: Negative prompt for CFG. + height: Output video height. + width: Output video width. + num_inference_steps: Number of denoising steps. + guidance_scale: CFG scale. + frame_num: Number of output frames. + output_type: Output format ("np", "pt", or "latent"). + generator: Random generator for reproducibility. + prompt_embeds: Pre-computed prompt embeddings. + negative_prompt_embeds: Pre-computed negative prompt embeddings. + attention_kwargs: Additional kwargs for attention layers. + vace_context_scale: VACE conditioning strength. + """ + # Get parameters from request or arguments + if len(req.prompts) > 1: + raise ValueError( + "This model only supports a single prompt, not a batched request. " + "Please pass in a single prompt object or string, or a single-item list." 
+ ) + + reference_images = None + source_video = None + source_mask = None + + if len(req.prompts) == 1: + first_prompt = req.prompts[0] + if isinstance(first_prompt, str): + prompt = first_prompt + else: + prompt = first_prompt.get("prompt") + negative_prompt = negative_prompt or first_prompt.get("negative_prompt") + prompt_embeds = prompt_embeds if prompt_embeds is not None else first_prompt.get("prompt_embeds") + negative_prompt_embeds = ( + negative_prompt_embeds + if negative_prompt_embeds is not None + else first_prompt.get("negative_prompt_embeds") + ) + + additional_info = first_prompt.get("additional_information", {}) + reference_images = additional_info.get("reference_images") + source_video = additional_info.get("source_video") + source_mask = additional_info.get("mask") + + if prompt is None and prompt_embeds is None: + raise ValueError("Prompt or prompt_embeds is required for VACE generation.") + + height = req.sampling_params.height or height + width = req.sampling_params.width or width + num_frames = req.sampling_params.num_frames or frame_num + num_inference_steps = req.sampling_params.num_inference_steps or num_inference_steps + generator = req.sampling_params.generator or generator + + if req.sampling_params.guidance_scale_provided: + guidance_scale = req.sampling_params.guidance_scale + + # Ensure dimensions are compatible with VAE and patch size + mod_value = self.vae_scale_factor_spatial * 2 # 8 * 2 = 16 + height = (height // mod_value) * mod_value + width = (width // mod_value) * mod_value + + if num_frames % self.vae_scale_factor_temporal != 1: + num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1 + num_frames = max(num_frames, 1) + + self.check_inputs( + prompt=prompt, + negative_prompt=negative_prompt, + height=height, + width=width, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + video=source_video, + mask=source_mask, + reference_images=reference_images, + ) + + device = self.device + self._guidance_scale = guidance_scale + dtype = self.transformer.dtype if self.transformer is not None else torch.bfloat16 + + if generator is None and req.sampling_params.seed is not None: + generator = torch.Generator(device=device).manual_seed(req.sampling_params.seed) + + # Encode prompts + if prompt_embeds is None: + if prompt is None: + raise ValueError("Either prompt or prompt_embeds must be provided.") + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt=prompt, + negative_prompt=negative_prompt, + do_classifier_free_guidance=guidance_scale > 1.0, + num_videos_per_prompt=req.sampling_params.num_outputs_per_prompt or 1, + max_sequence_length=req.sampling_params.max_sequence_length or 512, + device=device, + dtype=dtype, + ) + else: + prompt_embeds = prompt_embeds.to(device=device, dtype=dtype) + if negative_prompt_embeds is not None: + negative_prompt_embeds = negative_prompt_embeds.to(device=device, dtype=dtype) + elif guidance_scale > 1.0: + _, negative_prompt_embeds = self.encode_prompt( + prompt="", + negative_prompt=None, + do_classifier_free_guidance=True, + device=device, + dtype=dtype, + ) + + num_reference_images = 0 + if self.transformer.vace_patch_embedding is not None: + video, mask, ref_images_processed = self.preprocess_conditions( + video=source_video, + mask=source_mask, + reference_images=reference_images, + height=height, + width=width, + num_frames=num_frames, + dtype=dtype, + device=device, + ) + + conditioning_latents = self.prepare_video_latents(video, mask, 
ref_images_processed, generator, device) + mask_encoded = self.prepare_masks(mask, ref_images_processed) + + # Unified VACE context: [video_latents, mask] along channels -> [B, C, T, H, W] + vace_context = torch.cat([conditioning_latents, mask_encoded], dim=1) + + num_reference_images = len(ref_images_processed[0]) if ref_images_processed else 0 + else: + vace_context = None + + # Prepare noise latents (extra frames for reference images) + num_channels_latents = self.transformer_config.in_channels + noise_num_frames = num_frames + num_reference_images * self.vae_scale_factor_temporal + latents = self.prepare_latents( + batch_size=prompt_embeds.shape[0], + num_channels_latents=num_channels_latents, + height=height, + width=width, + num_frames=noise_num_frames, + dtype=torch.float32, + device=device, + generator=generator, + latents=req.sampling_params.latents, + ) + + # Set up scheduler + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + self._num_timesteps = len(timesteps) + + # Denoising loop + with self.progress_bar(total=len(timesteps)) as pbar: + for t in timesteps: + self._current_timestep = t + latent_model_input = latents.to(dtype) + timestep = t.expand(latents.shape[0]) + + do_true_cfg = guidance_scale > 1.0 and negative_prompt_embeds is not None + + positive_kwargs = { + "hidden_states": latent_model_input, + "timestep": timestep, + "encoder_hidden_states": prompt_embeds, + "attention_kwargs": attention_kwargs, + "vace_context": vace_context, + "vace_context_scale": vace_context_scale, + "return_dict": False, + } + negative_kwargs = ( + { + "hidden_states": latent_model_input, + "timestep": timestep, + "encoder_hidden_states": negative_prompt_embeds, + "attention_kwargs": attention_kwargs, + "vace_context": vace_context, + "vace_context_scale": vace_context_scale, + "return_dict": False, + } + if do_true_cfg + else None + ) + + noise_pred = self.predict_noise_maybe_with_cfg( + do_true_cfg=do_true_cfg, + true_cfg_scale=guidance_scale, + positive_kwargs=positive_kwargs, + negative_kwargs=negative_kwargs, + cfg_normalize=False, + ) + + latents = self.scheduler_step_maybe_with_cfg(noise_pred, t, latents, do_true_cfg) + pbar.update() + + self._current_timestep = None + + if current_omni_platform.is_available(): + current_omni_platform.empty_cache() + + # Trim reference frames from output before decoding + # (reference images were prepended as extra temporal frames) + if output_type != "latent" and num_reference_images > 0: + latents = latents[:, :, num_reference_images:] + + if output_type == "latent": + output = latents + else: + latents = latents.to(self.vae.dtype) + latents_mean = ( + torch.tensor(self.vae.config.latents_mean) + .view(1, self.vae.config.z_dim, 1, 1, 1) + .to(latents.device, latents.dtype) + ) + latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to( + latents.device, latents.dtype + ) + latents = latents / latents_std + latents_mean + output = self.vae.decode(latents, return_dict=False)[0] + + return DiffusionOutput(output=output) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load weights using AutoWeightsLoader for vLLM integration.""" + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_vace_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_vace_transformer.py new file mode 100644 index 0000000000..4f4217dabf --- /dev/null +++ 
b/vllm_omni/diffusion/models/wan2_2/wan2_2_vace_transformer.py @@ -0,0 +1,254 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""VACE variant of WanTransformer3DModel for conditional video generation.""" + +from __future__ import annotations + +from typing import Any + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from vllm_omni.diffusion.distributed.sp_plan import SequenceParallelInput +from vllm_omni.diffusion.distributed.sp_sharding import sp_shard +from vllm_omni.diffusion.forward_context import get_forward_context +from vllm_omni.diffusion.models.wan2_2.wan2_2_transformer import ( + Transformer2DModelOutput, + WanTransformer3DModel, + WanTransformerBlock, +) + + +class VaceWanTransformerBlock(WanTransformerBlock): + """VACE variant of WanTransformerBlock with proj_in/proj_out for skip connections.""" + + def __init__( + self, + dim: int, + ffn_dim: int, + num_heads: int, + eps: float = 1e-6, + added_kv_proj_dim: int | None = None, + cross_attn_norm: bool = False, + block_id: int = 0, + ): + super().__init__(dim, ffn_dim, num_heads, eps, added_kv_proj_dim, cross_attn_norm) + self.proj_in = nn.Linear(dim, dim) if block_id == 0 else None + self.proj_out = nn.Linear(dim, dim) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + control_hidden_states: torch.Tensor, + temb: torch.Tensor, + rotary_emb: tuple[torch.Tensor, torch.Tensor], + hidden_states_mask: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + if self.proj_in is not None: + control_hidden_states = self.proj_in(control_hidden_states) + control_hidden_states = control_hidden_states + hidden_states + + control_hidden_states = super().forward( + control_hidden_states, + encoder_hidden_states, + temb, + rotary_emb, + hidden_states_mask, + ) + + conditioning_states = self.proj_out(control_hidden_states) + return conditioning_states, control_hidden_states + + +class WanVACETransformer3DModel(WanTransformer3DModel): + """VACE-extended WAN Transformer with conditioning blocks for video editing.""" + + # TODO: `vace_blocks` are not layerwise-offloaded yet. The current offloader only + # supports a single block group (`blocks`); extend it to support both + # `vace_blocks` and `blocks`. 
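    # The `_sp_plan` override below merges the parent plan minus "blocks.0" with a
    # new shard point on the `_sp_shard_point` Identity: hidden_states are split
    # along dim=1 (with auto padding) exactly once, so both the VACE conditioning
    # blocks and the regular transformer blocks consume already-sharded sequences.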
+ + # Shard hidden_states before VACE blocks (replaces parent's blocks.0) + _sp_plan = { + **{k: v for k, v in WanTransformer3DModel._sp_plan.items() if k != "blocks.0"}, + "_sp_shard_point": { + 0: SequenceParallelInput(split_dim=1, expected_dims=3, split_output=True, auto_pad=True), + }, + } + + def __init__( + self, + *, + vace_layers: list[int] | None = None, + vace_in_channels: int | None = None, + **kwargs, + ): + super().__init__(**kwargs) + + self.vace_blocks = None + self.vace_patch_embedding = None + self.vace_layers = None + self.vace_layers_mapping = None + + # SP shard point: Identity module that _sp_plan hooks into to shard + # hidden_states before VACE processing (instead of at blocks.0) + self._sp_shard_point = nn.Identity() + + if vace_layers is not None: + inner_dim = self.config.num_attention_heads * self.config.attention_head_dim + self.vace_layers = list(vace_layers) + self.vace_layers_mapping = {layer_idx: vace_idx for vace_idx, layer_idx in enumerate(vace_layers)} + + vace_in_channels = vace_in_channels or self.config.in_channels + self.vace_patch_embedding = nn.Conv3d( + vace_in_channels, + inner_dim, + kernel_size=self.config.patch_size, + stride=self.config.patch_size, + ) + self.vace_blocks = nn.ModuleList( + [ + VaceWanTransformerBlock( + inner_dim, + self.config.ffn_dim, + self.config.num_attention_heads, + self.config.eps, + self.config.added_kv_proj_dim, + self.config.cross_attn_norm, + block_id=i, + ) + for i in range(len(vace_layers)) + ] + ) + + def embed_vace_context( + self, + vace_context: torch.Tensor, + seq_len: int, + sp_size: int = 1, + ) -> torch.Tensor: + """Compute VACE patch embeddings, aligned and sharded for SP. + + Args: + vace_context: Raw conditioning tensor [B, C, T, H, W]. + seq_len: Target full (padded) sequence length to align to. + sp_size: Sequence parallel world size. 
+ """ + vace_embeds = self.vace_patch_embedding(vace_context) + vace_embeds = vace_embeds.flatten(2).transpose(1, 2) + + # Align to target seq_len (may include SP padding) + if vace_embeds.size(1) < seq_len: + vace_embeds = F.pad(vace_embeds, (0, 0, 0, seq_len - vace_embeds.size(1))) + + if sp_size > 1: + vace_embeds = sp_shard(vace_embeds, dim=1) + return vace_embeds + + def forward( + self, + hidden_states: torch.Tensor, + timestep: torch.LongTensor, + encoder_hidden_states: torch.Tensor, + encoder_hidden_states_image: torch.Tensor | None = None, + return_dict: bool = True, + attention_kwargs: dict[str, Any] | None = None, + vace_context: torch.Tensor | None = None, + vace_context_scale: float | list[float] = 1.0, + ) -> torch.Tensor | Transformer2DModelOutput: + batch_size, _, num_frames, height, width = hidden_states.shape + p_t, p_h, p_w = self.config.patch_size + post_patch_num_frames = num_frames // p_t + post_patch_height = height // p_h + post_patch_width = width // p_w + + # Compute RoPE embeddings (sharded by _sp_plan via split_output=True) + rotary_emb = self.rope(hidden_states) + + # Patch embedding and flatten to sequence + hidden_states = self.patch_embedding(hidden_states) + hidden_states = hidden_states.flatten(2).transpose(1, 2) + + if timestep.ndim == 2: + ts_seq_len = timestep.shape[1] + timestep = timestep.flatten() + else: + ts_seq_len = None + + temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder( + timestep, encoder_hidden_states, encoder_hidden_states_image, timestep_seq_len=ts_seq_len + ) + timestep_proj = self.timestep_proj_prepare(timestep_proj, ts_seq_len) + + if encoder_hidden_states_image is not None: + encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1) + + # Shard hidden_states via _sp_plan hook (before VACE, not at blocks.0) + hidden_states = self._sp_shard_point(hidden_states) + + # SP state and attention mask for padding + hidden_states_mask = None + ctx = get_forward_context() + parallel_config = ctx.omni_diffusion_config.parallel_config + sp_size = parallel_config.sequence_parallel_size if parallel_config is not None else 1 + if ctx.sp_original_seq_len is not None and ctx.sp_padding_size > 0: + padded_seq_len = ctx.sp_original_seq_len + ctx.sp_padding_size + hidden_states_mask = torch.ones( + batch_size, + padded_seq_len, + dtype=torch.bool, + device=hidden_states.device, + ) + hidden_states_mask[:, ctx.sp_original_seq_len :] = False + + # VACE: embed context and run conditioning blocks + vace_hints = None + if vace_context is not None and self.vace_blocks is not None: + full_seq_len = hidden_states.shape[1] * sp_size + control_hidden_states = self.embed_vace_context(vace_context.to(hidden_states.dtype), full_seq_len, sp_size) + vace_hints = [] + for block in self.vace_blocks: + conditioning_states, control_hidden_states = block( + hidden_states, + encoder_hidden_states, + control_hidden_states, + timestep_proj, + rotary_emb, + hidden_states_mask, + ) + vace_hints.append(conditioning_states) + + # Normalize scale to per-layer list + if vace_hints is not None and isinstance(vace_context_scale, (int, float)): + vace_context_scale = [vace_context_scale] * len(vace_hints) + + # Transformer blocks with VACE hint application + for i, block in enumerate(self.blocks): + hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb, hidden_states_mask) + if vace_hints is not None and self.vace_layers_mapping is not None and i in 
self.vace_layers_mapping: + vace_idx = self.vace_layers_mapping[i] + hidden_states = hidden_states + vace_hints[vace_idx] * vace_context_scale[vace_idx] + + # Output norm, projection & unpatchify + shift, scale = self.output_scale_shift_prepare(temb) + shift = shift.to(hidden_states.device) + scale = scale.to(hidden_states.device) + if shift.ndim == 2: + shift = shift.unsqueeze(1) + scale = scale.unsqueeze(1) + + hidden_states = (self.norm_out(hidden_states.float()) * (1 + scale) + shift).type_as(hidden_states) + hidden_states = self.proj_out(hidden_states) + + hidden_states = hidden_states.reshape( + batch_size, post_patch_num_frames, post_patch_height, post_patch_width, p_t, p_h, p_w, -1 + ) + hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6) + output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3) + + if not return_dict: + return (output,) + + return Transformer2DModelOutput(sample=output) diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index 994dac04ad..dcd2272375 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -57,6 +57,11 @@ "pipeline_wan2_2", "Wan22Pipeline", ), + "WanVACEPipeline": ( + "wan2_2", + "pipeline_wan2_2_vace", + "Wan22VACEPipeline", + ), "LTX2Pipeline": ( "ltx2", "pipeline_ltx2", @@ -316,6 +321,7 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) - "ZImagePipeline": "get_post_process_func", "OvisImagePipeline": "get_ovis_image_post_process_func", "WanPipeline": "get_wan22_post_process_func", + "WanVACEPipeline": "get_wan22_vace_post_process_func", "LTX2Pipeline": "get_ltx2_post_process_func", "LTX2ImageToVideoPipeline": "get_ltx2_post_process_func", "StableAudioPipeline": "get_stable_audio_post_process_func", @@ -346,6 +352,7 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) - "LongCatImageEditPipeline": "get_longcat_image_edit_pre_process_func", "QwenImageLayeredPipeline": "get_qwen_image_layered_pre_process_func", "WanPipeline": "get_wan22_pre_process_func", + "WanVACEPipeline": "get_wan22_vace_pre_process_func", "WanImageToVideoPipeline": "get_wan22_i2v_pre_process_func", "OmniGen2Pipeline": "get_omnigen2_pre_process_func", "HeliosPipeline": "get_helios_pre_process_func", From ca02351a1ef8aa6397126c60154a80ee06ae3553 Mon Sep 17 00:00:00 2001 From: rein yang <73573651+R2-Y@users.noreply.github.com> Date: Thu, 2 Apr 2026 18:07:22 +0800 Subject: [PATCH 027/204] [skip ci][Bugfix] clean useless log (#2450) Signed-off-by: Rein Yang --- vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py index 5a22ce024a..ebe516e240 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py @@ -680,7 +680,6 @@ def talker_preprocess_prefill(self, input_ids: torch.Tensor, input_embeds: torch update_dict: dict[str, dict] = {} voice_type = info_dict.get("speaker") - logger.info("talker_preprocess_prefill speaker: %s", voice_type) if voice_type is not None and isinstance(voice_type, (list, tuple)) and len(voice_type) > 0: voice_type = voice_type[0] if not isinstance(voice_type, str) or not voice_type.strip(): From 50bb47a62930465574c64dafd891bb62b26f2dc1 Mon Sep 17 00:00:00 2001 From: zhumingjue138 Date: Thu, 2 Apr 2026 20:32:57 +0800 Subject: [PATCH 028/204] [Test] Skip 
tests/e2e/online_serving/test_zimage_expansion.py due to issue #2435 (#2454) --- tests/e2e/online_serving/test_zimage_expansion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/online_serving/test_zimage_expansion.py b/tests/e2e/online_serving/test_zimage_expansion.py index bed95545ac..bef12e55d1 100644 --- a/tests/e2e/online_serving/test_zimage_expansion.py +++ b/tests/e2e/online_serving/test_zimage_expansion.py @@ -70,7 +70,7 @@ def _get_diffusion_feature_cases(): ], ), id="parallel_hsdp", - marks=FOUR_CARD_MARKS, + marks=[*FOUR_CARD_MARKS, pytest.mark.skip(reason="issue #2435")], ), ] From 728cf6d023896a507df8cb1019fde13200fe28cc Mon Sep 17 00:00:00 2001 From: ChenWenjing <54166744+Shirley125@users.noreply.github.com> Date: Thu, 2 Apr 2026 22:51:41 +0800 Subject: [PATCH 029/204] [Feature] add session based audio streaming input (#2208) Signed-off-by: CHEN <116010019@link.cuhk.edu.cn> Co-authored-by: Hongsheng Liu --- examples/online_serving/qwen3_omni/README.md | 39 +++++ .../qwen3_omni/openai_realtime_client.py | 146 ++++++++++++++++++ tests/engine/test_async_omni_engine_input.py | 28 ++++ vllm_omni/engine/async_omni_engine.py | 68 +++++++- vllm_omni/engine/orchestrator.py | 30 ++++ vllm_omni/entrypoints/async_omni.py | 141 +++++++++++++++-- vllm_omni/entrypoints/openai/api_server.py | 20 +++ .../models/qwen3_omni/qwen3_omni.py | 63 +++++++- 8 files changed, 520 insertions(+), 15 deletions(-) create mode 100644 examples/online_serving/qwen3_omni/openai_realtime_client.py diff --git a/examples/online_serving/qwen3_omni/README.md b/examples/online_serving/qwen3_omni/README.md index 45482984b9..c3171e4366 100644 --- a/examples/online_serving/qwen3_omni/README.md +++ b/examples/online_serving/qwen3_omni/README.md @@ -36,6 +36,45 @@ cd examples/online_serving/qwen3_omni python examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py --model Qwen/Qwen3-Omni-30B-A3B-Instruct --query-type use_image --port 8091 --host "localhost" ``` +#### Realtime WebSocket client (`openai_realtime_client.py`) + +[`openai_realtime_client.py`](./openai_realtime_client.py) connects to **`ws://:/v1/realtime`**, uploads a local audio file as **PCM16 mono @ 16 kHz** chunks (OpenAI-style `input_audio_buffer.append` / `commit`), and prints **streaming transcription** (`transcription.delta` / `transcription.done`). + +**Dependencies:** + +```bash +pip install websockets librosa numpy +``` + +(ffmpeg may be required by `librosa` for some formats; see the FAQ below.) + +**From this directory** (`examples/online_serving/qwen3_omni`): + +```bash +python openai_realtime_client.py \ + --host localhost \ + --port 8091 \ + --model Qwen/Qwen3-Omni-30B-A3B-Instruct \ + --audio_path /path/to/your.wav +``` + +If `--audio_path` is omitted, the script uses a bundled default clip (`mary_had_lamb` via vLLM assets). + +**Arguments:** + +| Flag | Default | Description | +|------|---------|-------------| +| `--host` | `localhost` | API server host | +| `--port` | `8000` | API server port (match your `vllm serve` port, e.g. 
`8091`) | +| `--model` | `Qwen/Qwen3-Omni-30B-A3B-Instruct` | Must match the served model (also sent in `session.update`) | +| `--audio_path` | *(optional)* | Path to input audio; resampled to 16 kHz mono inside the client | + +Ensure the vLLM-Omni server is running with realtime support for this endpoint, for example: + +```bash +vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 +``` + The Python client supports the following command-line arguments: - `--query-type` (or `-q`): Query type (default: `use_video`). Options: `text`, `use_audio`, `use_image`, `use_video` diff --git a/examples/online_serving/qwen3_omni/openai_realtime_client.py b/examples/online_serving/qwen3_omni/openai_realtime_client.py new file mode 100644 index 0000000000..4fa043c481 --- /dev/null +++ b/examples/online_serving/qwen3_omni/openai_realtime_client.py @@ -0,0 +1,146 @@ +""" +This script demonstrates how to use the vLLM-Omni Realtime WebSocket API to perform +audio transcription by uploading an audio file. + +Before running this script, you must start the vLLM-Omni server with a realtime-capable +model, for example: + + vllm serve Qwen/Qwen3-Omni-30B-A3B-Instruct --omni + +Requirements: +- vllm with audio support +- websockets +- librosa +- numpy + +The script: +1. Connects to the Realtime WebSocket endpoint +2. Converts an audio file to PCM16 @ 16kHz +3. Sends audio chunks to the server +4. Receives and prints transcription as it streams +""" + +import argparse +import asyncio +import base64 +import json + +import librosa +import numpy as np +import websockets +from vllm.assets.audio import AudioAsset + + +def audio_to_pcm16_base64(audio_path: str) -> str: + """ + Load an audio file and convert it to base64-encoded PCM16 @ 16kHz. + """ + # Load audio and resample to 16kHz mono + audio, _ = librosa.load(audio_path, sr=16000, mono=True) + # Convert to PCM16 + pcm16 = (audio * 32767).astype(np.int16) + # Encode as base64 + return base64.b64encode(pcm16.tobytes()).decode("utf-8") + + +async def realtime_transcribe(audio_path: str, host: str, port: int, model: str): + """ + Connect to the Realtime API and transcribe an audio file. + """ + uri = f"ws://{host}:{port}/v1/realtime" + + async with websockets.connect(uri) as ws: + # Wait for session.created + response = json.loads(await ws.recv()) + if response["type"] == "session.created": + print(f"Session created: {response['id']}") + else: + print(f"Unexpected response: {response}") + return + + # Validate model + await ws.send(json.dumps({"type": "session.update", "model": model})) + + # Signal ready to start + await ws.send(json.dumps({"type": "input_audio_buffer.commit"})) + + # Convert audio file to base64 PCM16 + print(f"Loading audio from: {audio_path}") + audio_base64 = audio_to_pcm16_base64(audio_path) + + # Send audio in chunks (4KB of raw audio = ~8KB base64) + chunk_size = 4096 + audio_bytes = base64.b64decode(audio_base64) + total_chunks = (len(audio_bytes) + chunk_size - 1) // chunk_size + + print(f"Sending {total_chunks} audio chunks...") + for i in range(0, len(audio_bytes), chunk_size): + chunk = audio_bytes[i : i + chunk_size] + await ws.send( + json.dumps( + { + "type": "input_audio_buffer.append", + "audio": base64.b64encode(chunk).decode("utf-8"), + } + ) + ) + + # Signal all audio is sent + await ws.send(json.dumps({"type": "input_audio_buffer.commit", "final": True})) + print("Audio sent. 
Waiting for transcription...\n") + + # Receive transcription + print("Transcription: ", end="", flush=True) + while True: + response = json.loads(await ws.recv()) + if response["type"] == "transcription.delta": + print(response["delta"], end="", flush=True) + elif response["type"] == "transcription.done": + print(f"\n\nFinal transcription: {response['text']}") + if response.get("usage"): + print(f"Usage: {response['usage']}") + break + elif response["type"] == "error": + print(f"\nError: {response['error']}") + break + + +def main(args): + if args.audio_path: + audio_path = args.audio_path + else: + # Use default audio asset + audio_path = str(AudioAsset("mary_had_lamb").get_local_path()) + print(f"No audio path provided, using default: {audio_path}") + + asyncio.run(realtime_transcribe(audio_path, args.host, args.port, args.model)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Realtime WebSocket Transcription Client") + parser.add_argument( + "--model", + type=str, + default="Qwen/Qwen3-Omni-30B-A3B-Instruct", + help="Model that is served and should be pinged.", + ) + parser.add_argument( + "--audio_path", + type=str, + default=None, + help="Path to the audio file to transcribe.", + ) + parser.add_argument( + "--host", + type=str, + default="localhost", + help="vLLM-Omni server host (default: localhost)", + ) + parser.add_argument( + "--port", + type=int, + default=8000, + help="vLLM-Omni server port (default: 8000)", + ) + args = parser.parse_args() + main(args) diff --git a/tests/engine/test_async_omni_engine_input.py b/tests/engine/test_async_omni_engine_input.py index b2d2d9a9e5..ed6a7277b4 100644 --- a/tests/engine/test_async_omni_engine_input.py +++ b/tests/engine/test_async_omni_engine_input.py @@ -61,3 +61,31 @@ def test_build_add_request_message_preserves_additional_information(): assert request.additional_information.entries["text"].list_data == ["hello world"] assert request.additional_information.entries["speaker"].list_data == ["vivian"] output_processor.add_request.assert_called_once() + + +def test_build_add_request_message_with_resumable_streaming(): + engine = object.__new__(AsyncOmniEngine) + params = SamplingParams(max_tokens=8) + engine.default_sampling_params_list = [params] + engine.stage_metadata = [{"stage_type": "llm"}] + engine.supported_tasks = ("generate",) + + input_processor = Mock() + input_processor.process_inputs.return_value = _make_engine_core_request() + engine.input_processor = input_processor + + output_processor = Mock() + engine.output_processors = [output_processor] + + msg = engine._build_add_request_message( + request_id="req-stream", + prompt={"prompt_token_ids": [1, 2, 3]}, + sampling_params_list=[params], + final_stage_id=0, + resumable=True, + message_type="streaming_update", + ) + + assert msg["type"] == "streaming_update" + input_processor.process_inputs.assert_called_once() + assert input_processor.process_inputs.call_args.kwargs["resumable"] is True diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 9de3dc867f..71bf6e2379 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -635,9 +635,13 @@ def _build_add_request_message( self, request_id: str, prompt: EngineCoreRequest | PromptType, + prompt_text: str | None = None, sampling_params_list: Sequence[Any] | None = None, final_stage_id: int = 0, arrival_time: float | None = None, + *, + resumable: bool = False, + message_type: str = "add_request", ) -> dict[str, Any]: 
"""Build an add_request message after stage-0 preprocessing.""" effective_sampling_params_list = ( @@ -669,6 +673,7 @@ def _build_add_request_message( params=params, supported_tasks=self.supported_tasks, arrival_time=arrival_time, + resumable=resumable, ) # TODO (Peiqi): add this for Qwen3-TTS only. Other models don't have # additional_information field in the prompt. @@ -683,9 +688,10 @@ def _build_add_request_message( request.external_req_id = request_id # Register with stage 0's output processor. + output_prompt_text = prompt_text self.output_processors[0].add_request( request=request, - prompt=prompt, + prompt=output_prompt_text, parent_req=None, request_index=0, queue=None, @@ -693,7 +699,7 @@ def _build_add_request_message( prompt = request return { - "type": "add_request", + "type": message_type, "request_id": request_id, "prompt": prompt, "original_prompt": original_prompt, @@ -949,9 +955,12 @@ def add_request( self, request_id: str, prompt: EngineCoreRequest | PromptType, + prompt_text: str | None = None, sampling_params_list: Sequence[Any] | None = None, final_stage_id: int = 0, arrival_time: float | None = None, + *, + resumable: bool = False, ) -> None: """Process stage-0 input locally, then send to the Orchestrator. @@ -963,9 +972,11 @@ def add_request( msg = self._build_add_request_message( request_id=request_id, prompt=prompt, + prompt_text=prompt_text, sampling_params_list=sampling_params_list, final_stage_id=final_stage_id, arrival_time=arrival_time, + resumable=resumable, ) if self.request_queue is None: raise RuntimeError("request_queue is not initialized") @@ -984,17 +995,70 @@ async def add_request_async( self, request_id: str, prompt: EngineCoreRequest | PromptType, + prompt_text: str | None = None, sampling_params_list: Sequence[Any] | None = None, final_stage_id: int = 0, arrival_time: float | None = None, + *, + resumable: bool = False, ) -> None: """Async add_request API.""" self.add_request( request_id=request_id, prompt=prompt, + prompt_text=prompt_text, + sampling_params_list=sampling_params_list, + final_stage_id=final_stage_id, + arrival_time=arrival_time, + resumable=resumable, + ) + + def add_streaming_update( + self, + request_id: str, + prompt: EngineCoreRequest | PromptType, + prompt_text: str | None = None, + sampling_params_list: Sequence[Any] | None = None, + final_stage_id: int = 0, + arrival_time: float | None = None, + *, + resumable: bool = True, + ) -> None: + """Send an incremental streaming update for an existing request.""" + msg = self._build_add_request_message( + request_id=request_id, + prompt=prompt, + prompt_text=prompt_text, + sampling_params_list=sampling_params_list, + final_stage_id=final_stage_id, + arrival_time=arrival_time, + resumable=resumable, + message_type="streaming_update", + ) + if self.request_queue is None: + raise RuntimeError("request_queue is not initialized") + self.request_queue.sync_q.put_nowait(msg) + + async def add_streaming_update_async( + self, + request_id: str, + prompt: EngineCoreRequest | PromptType, + prompt_text: str | None = None, + sampling_params_list: Sequence[Any] | None = None, + final_stage_id: int = 0, + arrival_time: float | None = None, + *, + resumable: bool = True, + ) -> None: + """Async wrapper for add_streaming_update().""" + self.add_streaming_update( + request_id=request_id, + prompt=prompt, + prompt_text=prompt_text, sampling_params_list=sampling_params_list, final_stage_id=final_stage_id, arrival_time=arrival_time, + resumable=resumable, ) def try_get_output(self, timeout: float = 
0.001) -> dict[str, Any] | None: diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index 8128c25c64..4a85a2c6c9 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -200,6 +200,8 @@ async def _request_handler(self) -> None: if msg_type == "add_request": await self._handle_add_request(msg) + elif msg_type == "streaming_update": + await self._handle_streaming_update(msg) elif msg_type == "add_companion_request": await self._handle_add_companion(msg) elif msg_type == "abort": @@ -659,6 +661,34 @@ async def _handle_add_request(self, msg: dict[str, Any]) -> None: if self.async_chunk and stage_id == 0 and final_stage_id > 0: await self._prewarm_async_chunk_stages(request_id, request, req_state) + async def _handle_streaming_update(self, msg: dict[str, Any]) -> None: + """Handle a streaming_update message for an existing request.""" + stage_id = 0 + request_id = msg["request_id"] + request = msg["prompt"] + + req_state = self.request_states.get(request_id) + if req_state is None: + logger.warning( + "[Orchestrator] streaming_update for unknown req=%s, falling back to add_request", + request_id, + ) + fallback_msg = dict(msg) + fallback_msg["type"] = "add_request" + await self._handle_add_request(fallback_msg) + return + + if "sampling_params_list" in msg and msg["sampling_params_list"]: + req_state.sampling_params_list = msg["sampling_params_list"] + + req_state.stage_submit_ts[stage_id] = _time.time() + stage_client = self.stage_clients[stage_id] + if stage_client.stage_type == "diffusion": + params = req_state.sampling_params_list[stage_id] + await stage_client.add_request_async(request_id, request, params) + else: + await stage_client.add_request_async(request) + async def _prewarm_async_chunk_stages( self, request_id: str, diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py index 68c072c2b3..6c8022461b 100644 --- a/vllm_omni/entrypoints/async_omni.py +++ b/vllm_omni/entrypoints/async_omni.py @@ -12,12 +12,15 @@ from collections.abc import AsyncGenerator, Iterable, Sequence from typing import TYPE_CHECKING, Any -from vllm.engine.protocol import EngineClient +from vllm import TokensPrompt +from vllm.engine.protocol import EngineClient, StreamingInput from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import PoolingRequestOutput from vllm.plugins.io_processors import get_io_processor from vllm.pooling_params import PoolingParams +from vllm.renderers.inputs.preprocess import extract_prompt_components +from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.tasks import SupportedTask from vllm.v1.engine.exceptions import EngineDeadError @@ -147,7 +150,8 @@ def model_config(self): async def generate( self, - prompt: OmniPromptType | list[OmniPromptType], + prompt: OmniPromptType | AsyncGenerator[StreamingInput, None] | list[OmniPromptType], + sampling_params: Any = None, request_id: str = "", *, prompt_text: str | None = None, @@ -191,6 +195,7 @@ async def generate( logger.debug(f"[AsyncOmni] generate() called for request {request_id}") + input_stream_task: asyncio.Task | None = None try: # Start final output dispatcher on the first call to generate() self._final_output_handler() @@ -214,13 +219,22 @@ async def generate( req_state.metrics = metrics self.request_states[request_id] = req_state - # Add request to stage 0 (Orchestrator handles all stage transitions) - await self.engine.add_request_async( - request_id=request_id, - 
prompt=prompt, - sampling_params_list=sampling_params_list, - final_stage_id=final_stage_id_for_e2e, - ) + # Add request(s) to stage 0. For streaming inputs, submit + # chunks incrementally through streaming_update. + if isinstance(prompt, AsyncGenerator): + input_stream_task = await self._add_streaming_input_request( + request_id=request_id, + input_stream=prompt, + sampling_params_list=sampling_params_list, + final_stage_id=final_stage_id_for_e2e, + ) + else: + await self.engine.add_request_async( + request_id=request_id, + prompt=prompt, + sampling_params_list=sampling_params_list, + final_stage_id=final_stage_id_for_e2e, + ) submit_ts = time.time() req_state.metrics.stage_first_ts[0] = submit_ts req_start_ts[request_id] = submit_ts @@ -243,9 +257,118 @@ async def generate( self._log_summary_and_cleanup(request_id) except (asyncio.CancelledError, GeneratorExit): + if input_stream_task is not None and not input_stream_task.done(): + input_stream_task.cancel() await self.abort(request_id) logger.info(f"[AsyncOmni] Request {request_id} aborted.") raise + except Exception as e: + await self.abort(request_id) + logger.info(f"[AsyncOmni] Request {request_id} failed (input error): {e}") + raise + + async def _add_streaming_input_request( + self, + *, + request_id: str, + input_stream: AsyncGenerator[StreamingInput, None], + sampling_params_list: Sequence[OmniSamplingParams], + final_stage_id: int, + ) -> asyncio.Task: + """Submit a streaming input generator as incremental stage-0 updates.""" + if not sampling_params_list: + raise ValueError("sampling_params_list cannot be empty for streaming input") + # only check thinker's sampling params now + stage0_params = sampling_params_list[0] + self._validate_streaming_input_sampling_params(stage0_params) + + req_state = self.request_states[request_id] + + if not stage0_params.skip_clone: + stage0_params = stage0_params.clone() + stage0_params.skip_clone = True + stage0_params.output_kind = RequestOutputKind.DELTA + + has_submitted_first_chunk = False + + async def handle_inputs() -> None: + nonlocal has_submitted_first_chunk + cancelled = False + try: + async for chunk in input_stream: + chunk_params = getattr(chunk, "sampling_params", None) or stage0_params + self._validate_streaming_input_sampling_params(chunk_params) + chunk_sampling_params_list = list(sampling_params_list) + chunk_sampling_params_list[0] = chunk_params + chunk_prompt = chunk.prompt + prompt_text, _, _ = extract_prompt_components(self.model_config, chunk_prompt) + + if not has_submitted_first_chunk: + await self.engine.add_request_async( + request_id=request_id, + prompt=chunk_prompt, + prompt_text=prompt_text, + sampling_params_list=chunk_sampling_params_list, + final_stage_id=final_stage_id, + resumable=True, + ) + has_submitted_first_chunk = True + else: + await self.engine.add_streaming_update_async( + request_id=request_id, + prompt=chunk_prompt, + prompt_text=prompt_text, + sampling_params_list=chunk_sampling_params_list, + final_stage_id=final_stage_id, + resumable=True, + ) + except (asyncio.CancelledError, GeneratorExit): + cancelled = True + except Exception as error: + await req_state.queue.put({"request_id": request_id, "error": error}) + finally: + if not cancelled: + # Send empty final request to indicate that inputs have + # finished. Don't send if canceled (session was aborted). 
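                    # Earlier chunks were submitted with resumable=True, which keeps
                    # the request open for more streaming input; the final submission
                    # below uses resumable=False so the engine knows the stream is
                    # complete and can finish decoding.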
+ final_sampling_params_list = list(sampling_params_list) + final_sampling_params_list[0] = stage0_params + final_prompt = TokensPrompt(prompt_token_ids=[0]) + + if has_submitted_first_chunk: + await self.engine.add_streaming_update_async( + request_id=request_id, + prompt=final_prompt, + prompt_text=None, + sampling_params_list=final_sampling_params_list, + final_stage_id=final_stage_id, + resumable=False, + ) + else: + await self.engine.add_request_async( + request_id=request_id, + prompt=final_prompt, + prompt_text=None, + sampling_params_list=final_sampling_params_list, + final_stage_id=final_stage_id, + resumable=False, + ) + + input_stream_task = asyncio.create_task(handle_inputs()) + req_state.input_stream_task = input_stream_task + return input_stream_task + + @staticmethod + def _validate_streaming_input_sampling_params(params: OmniSamplingParams) -> None: + if ( + not isinstance(params, SamplingParams) + or params.n > 1 + or params.output_kind == RequestOutputKind.FINAL_ONLY + or params.stop + ): + raise ValueError( + "Input streaming is currently supported only for SamplingParams " + "with n == 1, output_kind != FINAL_ONLY, and without stop strings." + ) async def encode( self, diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index d832b2726c..0ffe33abde 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -52,6 +52,8 @@ from vllm.entrypoints.openai.models.protocol import BaseModelPath from vllm.entrypoints.openai.models.serving import OpenAIServingModels from vllm.entrypoints.openai.orca_metrics import metrics_header +from vllm.entrypoints.openai.realtime.connection import RealtimeConnection +from vllm.entrypoints.openai.realtime.serving import OpenAIServingRealtime from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses from vllm.entrypoints.openai.server_utils import get_uvicorn_log_config from vllm.entrypoints.openai.speech_to_text.serving import ( @@ -803,6 +805,11 @@ async def omni_init_app_state( state.openai_streaming_speech = OmniStreamingSpeechHandler( speech_service=state.openai_serving_speech, ) + state.openai_serving_realtime = OpenAIServingRealtime( + engine_client=engine_client, + models=state.openai_serving_models, + request_logger=request_logger, + ) state.openai_serving_video = OmniOpenAIServingVideo( engine_client, @@ -1161,6 +1168,19 @@ async def streaming_speech(websocket: WebSocket): await handler.handle_session(websocket) +@router.websocket("/v1/realtime") +async def realtime_websocket(websocket: WebSocket): + """WebSocket endpoint for OpenAI-style realtime interactions.""" + serving = getattr(websocket.app.state, "openai_serving_realtime", None) + if serving is None: + await websocket.accept() + await websocket.send_json({"type": "error", "error": "Realtime API is not available", "code": "unsupported"}) + await websocket.close() + return + connection = RealtimeConnection(websocket, serving) + await connection.handle_connection() + + # Health and Model endpoints for diffusion mode diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py index ebe516e240..04212ceeba 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py @@ -3,10 +3,12 @@ # Copyright 2025 The Qwen team. 
"""Inference-only Qwen3-Omni-Moe unified model (thinker + talker + code2wav).""" -from collections.abc import Iterable +import asyncio +from collections.abc import AsyncGenerator, Iterable from functools import cached_property from typing import Any +import numpy as np import torch import torch.nn as nn from transformers.models.qwen3_omni_moe.configuration_qwen3_omni_moe import ( @@ -15,10 +17,12 @@ Qwen3OmniMoeTalkerConfig, Qwen3OmniMoeThinkerConfig, ) -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig +from vllm.inputs.data import PromptType, TokensPrompt from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.models.interfaces import SupportsMRoPE, SupportsMultiModal, SupportsPP +from vllm.model_executor.models.interfaces import SupportsMRoPE, SupportsMultiModal, SupportsPP, SupportsRealtime +from vllm.model_executor.models.qwen3_asr_realtime import Qwen3ASRRealtimeBuffer from vllm.model_executor.models.qwen3_omni_moe_thinker import ( Qwen3OmniMoeConditionalGenerationMixin, ) @@ -26,6 +30,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFeatureSpec from vllm.sequence import IntermediateTensors +from vllm.tokenizers import cached_tokenizer_from_config +from vllm.transformers_utils.processor import cached_processor_from_config from vllm.v1.outputs import SamplerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.sampler import Sampler @@ -34,6 +40,7 @@ from vllm_omni.model_executor.models.output_templates import OmniOutput from vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker import ( Qwen3OmniMoeThinkerDummyInputsBuilder, + Qwen3OmniMoeThinkerForConditionalGeneration, Qwen3OmniMoeThinkerMultiModalProcessor, Qwen3OmniMoeThinkerProcessingInfo, ) @@ -70,7 +77,13 @@ dummy_inputs=Qwen3OmniMoeThinkerDummyInputsBuilder, ) class Qwen3OmniMoeForConditionalGeneration( - nn.Module, SupportsMultiModal, SupportsPP, Qwen3OmniMoeConditionalGenerationMixin, CustomProcessMixin, SupportsMRoPE + nn.Module, + SupportsMultiModal, + SupportsPP, + Qwen3OmniMoeConditionalGenerationMixin, + CustomProcessMixin, + SupportsMRoPE, + SupportsRealtime, ): """ Unified Qwen3 Omni MoE model combining thinker, talker, and code2wav. @@ -84,6 +97,8 @@ class Qwen3OmniMoeForConditionalGeneration( Set `model_stage` in vllm_config to one of: "thinker", "talker", "code2wav" """ + realtime_max_tokens = 64 + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.have_multimodal_outputs = True @@ -191,6 +206,46 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.thinker.make_empty_intermediate_tensors if self.model_stage == "thinker" else lambda: None ) + @classmethod + async def buffer_realtime_audio( + cls, + audio_stream: AsyncGenerator[np.ndarray, None], + input_stream: asyncio.Queue[list[int]], + model_config: ModelConfig, + ) -> AsyncGenerator[PromptType, None]: + processor = cached_processor_from_config(model_config) + feature_extractor = processor.feature_extractor + sampling_rate = feature_extractor.sampling_rate + tokenizer = cached_tokenizer_from_config(model_config) + + # Use a small segment size for low-latency streaming. 
+ segment_duration_s = 5.0 + buffer = Qwen3ASRRealtimeBuffer( + sampling_rate=sampling_rate, + segment_duration_s=segment_duration_s, + ) + + audio_placeholder = Qwen3OmniMoeThinkerForConditionalGeneration.get_placeholder_str("audio", 0) + prompt_template = f"<|im_start|>user\n{audio_placeholder}<|im_end|>\n<|im_start|>assistant\n" + + prompt_token_ids = tokenizer.encode(prompt_template) + + async for audio_chunk in audio_stream: + buffer.write_audio(audio_chunk) + + while (segment := buffer.read_audio()) is not None: + yield TokensPrompt( + prompt_token_ids=prompt_token_ids, + multi_modal_data={"audio": segment}, + ) + + remaining = buffer.flush() + if remaining is not None and len(remaining) > 0: + yield TokensPrompt( + prompt_token_ids=prompt_token_ids, + multi_modal_data={"audio": remaining}, + ) + # ==================== Device utilities ==================== @staticmethod From 6211413677ae96ca2af82efff9ca7130ba46bd16 Mon Sep 17 00:00:00 2001 From: vraiti Date: Thu, 2 Apr 2026 12:28:53 -0400 Subject: [PATCH 030/204] Update MRoPE config fallback logic (#2278) Signed-off-by: vraiti Co-authored-by: Canlin Guo --- vllm_omni/model_executor/layers/rotary_embedding/mrope.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm_omni/model_executor/layers/rotary_embedding/mrope.py b/vllm_omni/model_executor/layers/rotary_embedding/mrope.py index 463e555073..3d3a88d877 100644 --- a/vllm_omni/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm_omni/model_executor/layers/rotary_embedding/mrope.py @@ -337,12 +337,11 @@ def _omni_get_input_positions_tensor( """ thinker_config = hf_config.thinker_config - try: + if hasattr(thinker_config, "audio_token_index"): audio_token_id = thinker_config.audio_token_index image_token_id = thinker_config.image_token_index video_token_id = thinker_config.video_token_index - except Exception: - logger.info("Multimodal token idx changed!") + else: audio_token_id = thinker_config.audio_token_id image_token_id = thinker_config.image_token_id video_token_id = thinker_config.video_token_id From 6be5d05a7c356ac7b19a1422e16157f2972c0cad Mon Sep 17 00:00:00 2001 From: Canlin Guo Date: Fri, 3 Apr 2026 09:54:47 +0800 Subject: [PATCH 031/204] [Docs] Update docs to use vllm-ascend v0.18.0rc1 (#2453) Signed-off-by: gcanlin --- docker/Dockerfile.npu | 26 ++++++++-------- docker/Dockerfile.npu.a3 | 26 ++++++++-------- .../installation/npu/npu.inc.md | 30 +++++-------------- 3 files changed, 33 insertions(+), 49 deletions(-) diff --git a/docker/Dockerfile.npu b/docker/Dockerfile.npu index 47ea99fc79..2e961b89e6 100644 --- a/docker/Dockerfile.npu +++ b/docker/Dockerfile.npu @@ -1,20 +1,20 @@ ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend -ARG VLLM_ASCEND_TAG=v0.17.0rc1 +ARG VLLM_ASCEND_TAG=v0.18.0rc1 FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG} -WORKDIR /vllm-workspace/vllm -RUN git fetch origin --tags && git checkout v0.18.0 +# WORKDIR /vllm-workspace/vllm +# RUN git fetch origin --tags && git checkout v0.18.0 -WORKDIR /vllm-workspace/vllm-ascend -RUN git fetch origin releases/v0.18.0 && git checkout d781902ce9dbda8ab1e11bb0f2f0c1bc508fee7a -# Install vllm-ascend -# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH -RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ - source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ - source /usr/local/Ascend/nnal/atb/set_env.sh && \ - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ - python3 -m pip install 
-v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ - python3 -m pip cache purge +# WORKDIR /vllm-workspace/vllm-ascend +# RUN git fetch origin releases/v0.18.0 && git checkout d781902ce9dbda8ab1e11bb0f2f0c1bc508fee7a +# # Install vllm-ascend +# # Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH +# RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ +# source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ +# source /usr/local/Ascend/nnal/atb/set_env.sh && \ +# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ +# python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ +# python3 -m pip cache purge ARG APP_DIR=/vllm-workspace/vllm-omni WORKDIR ${APP_DIR} diff --git a/docker/Dockerfile.npu.a3 b/docker/Dockerfile.npu.a3 index e919382577..e3781fc18f 100644 --- a/docker/Dockerfile.npu.a3 +++ b/docker/Dockerfile.npu.a3 @@ -1,20 +1,20 @@ ARG VLLM_ASCEND_IMAGE=quay.io/ascend/vllm-ascend -ARG VLLM_ASCEND_TAG=v0.17.0rc1-a3 +ARG VLLM_ASCEND_TAG=v0.18.0rc1-a3 FROM ${VLLM_ASCEND_IMAGE}:${VLLM_ASCEND_TAG} -WORKDIR /vllm-workspace/vllm -RUN git fetch origin --tags && git checkout v0.18.0 +# WORKDIR /vllm-workspace/vllm +# RUN git fetch origin --tags && git checkout v0.18.0 -WORKDIR /vllm-workspace/vllm-ascend -RUN git fetch origin releases/v0.18.0 && git checkout d781902ce9dbda8ab1e11bb0f2f0c1bc508fee7a -# Install vllm-ascend -# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH -RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ - source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ - source /usr/local/Ascend/nnal/atb/set_env.sh && \ - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ - python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ - python3 -m pip cache purge +# WORKDIR /vllm-workspace/vllm-ascend +# RUN git fetch origin releases/v0.18.0 && git checkout d781902ce9dbda8ab1e11bb0f2f0c1bc508fee7a +# # Install vllm-ascend +# # Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH +# RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ +# source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ +# source /usr/local/Ascend/nnal/atb/set_env.sh && \ +# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ +# python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \ +# python3 -m pip cache purge ARG APP_DIR=/vllm-workspace/vllm-omni WORKDIR ${APP_DIR} diff --git a/docs/getting_started/installation/npu/npu.inc.md b/docs/getting_started/installation/npu/npu.inc.md index b718bd493f..c5b13dd73f 100644 --- a/docs/getting_started/installation/npu/npu.inc.md +++ b/docs/getting_started/installation/npu/npu.inc.md @@ -10,10 +10,10 @@ The recommended way to use vLLM-Omni on NPU is through the vllm-ascend pre-built ```bash # Update the vllm-ascend image # Atlas A2: -# export IMAGE=quay.io/ascend/vllm-ascend:v0.17.0rc1 +# export IMAGE=quay.io/ascend/vllm-ascend:v0.18.0rc1 # Atlas A3: -# export IMAGE=quay.io/ascend/vllm-ascend:v0.17.0rc1-a3 -export IMAGE=quay.io/ascend/vllm-ascend:v0.17.0rc1 +# export IMAGE=quay.io/ascend/vllm-ascend:v0.18.0rc1-a3 +export IMAGE=quay.io/ascend/vllm-ascend:v0.18.0rc1 docker 
run --rm \ --name vllm-omni-npu \ --shm-size=1g \ @@ -33,17 +33,6 @@ docker run --rm \ -p 8000:8000 \ -it $IMAGE bash -cd /vllm-workspace/vllm -git fetch origin --tags -git checkout v0.18.0 - -# Because vllm-ascend will release v0.18.0rc1 after vllm-omni 0.16.0, -# we have to pin vllm-ascend at the current commit. -cd /vllm-workspace/vllm-ascend -git pull origin main -git checkout d781902ce9dbda8ab1e11bb0f2f0c1bc508fee7a -pip install -v -e . - # Inside the container, install vLLM-Omni from source cd /vllm-workspace git clone -b v0.18.0 https://github.com/vllm-project/vllm-omni.git @@ -68,15 +57,10 @@ You can also build vLLM-Omni from the latest main branch if you want to use the ```bash # Pin vLLM version to 0.18.0 -cd /vllm-workspace/vllm -git fetch origin --tags -git checkout v0.18.0 - -# Because vllm-ascend has not yet entered continuous development and has not been officially released, we need to pin it to a specific commit. Please note that this commit may change over time. -cd /vllm-workspace/vllm-ascend -git pull origin main -git fetch origin --tags -git checkout d781902ce9dbda8ab1e11bb0f2f0c1bc508fee7a +git clone -b v0.18.0 https://github.com/vllm-project/vllm.git +VLLM_TARGET_DEVICE=empty pip install -v -e . + +git clone -b v0.18.0rc1 https://github.com/vllm-project/vllm-ascend.git pip install -v -e . # Install vLLM-Omni from the latest main branch From fa275fd1bea7e4d43fd7fa54ff9bb1f27c88f54c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Fri, 3 Apr 2026 11:47:57 +0800 Subject: [PATCH 032/204] [BAGEL] [Feature]: Add `thinking mode` in Bagel multi-stage serving (#2447) --- examples/offline_inference/bagel/end2end.py | 50 ++++- .../diffusion/models/bagel/pipeline_bagel.py | 7 + vllm_omni/engine/async_omni_engine.py | 7 +- .../model_executor/models/bagel/bagel.py | 208 ++++++++++++++---- .../stage_configs/bagel_think.yaml | 86 ++++++++ .../stage_input_processors/bagel.py | 109 +++++++++ vllm_omni/worker/gpu_ar_model_runner.py | 22 +- 7 files changed, 443 insertions(+), 46 deletions(-) create mode 100644 vllm_omni/model_executor/stage_configs/bagel_think.yaml diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index efcdea2355..2153a31ba7 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -2,6 +2,7 @@ import os from vllm_omni.inputs.data import OmniPromptType +from vllm_omni.model_executor.stage_input_processors.bagel import GEN_THINK_SYSTEM_PROMPT def parse_args(): @@ -65,6 +66,17 @@ def parse_args(): help="CFG parallel size: 1=batched (single GPU), 2=parallel with 2 branches (text CFG only), 3=parallel (3 GPUs).", ) parser.add_argument("--seed", type=int, default=None, help="Random seed for generation.") + parser.add_argument( + "--cfg-interval", + type=float, + nargs=2, + default=None, + help="CFG interval [start, end] (default: pipeline default)", + ) + parser.add_argument( + "--cfg-renorm-type", type=str, default=None, help="CFG renorm type: global, text_channel, channel" + ) + parser.add_argument("--cfg-renorm-min", type=float, default=None, help="CFG renorm min") parser.add_argument( "--enable-diffusion-pipeline-profiler", action="store_true", @@ -76,6 +88,12 @@ def parse_args(): default=None, help="Quantization method (e.g. 'fp8').", ) + parser.add_argument( + "--think", + action="store_true", + default=False, + help="Enable thinking mode: AR stage decodes ... 
planning tokens before image generation.", + ) args = parser.parse_args() return args @@ -110,8 +128,12 @@ def main(): from vllm_omni.entrypoints.omni import Omni omni_kwargs = {} - if args.stage_configs_path: - omni_kwargs["stage_configs_path"] = args.stage_configs_path + stage_configs_path = args.stage_configs_path + if args.think and stage_configs_path is None: + stage_configs_path = "vllm_omni/model_executor/stage_configs/bagel_think.yaml" + print(f"[Info] Think mode enabled, using stage config: {stage_configs_path}") + if stage_configs_path: + omni_kwargs["stage_configs_path"] = stage_configs_path omni_kwargs.update( { @@ -136,7 +158,8 @@ def main(): if not args.image_path or not os.path.exists(args.image_path): raise ValueError(f"img2img requires --image-path pointing to an existing file, got: {args.image_path}") loaded_image = Image.open(args.image_path).convert("RGB") - final_prompt_text = f"<|fim_middle|><|im_start|>{p}<|im_end|>" + think_prefix = f"<|im_start|>{GEN_THINK_SYSTEM_PROMPT}<|im_end|>" if args.think else "" + final_prompt_text = f"{think_prefix}<|fim_middle|><|im_start|>{p}<|im_end|>" prompt_dict = { "prompt": final_prompt_text, "multi_modal_data": {"img2img": loaded_image}, @@ -160,7 +183,8 @@ def main(): prompt_dict = {"prompt": final_prompt_text, "modalities": ["text"]} formatted_prompts.append(prompt_dict) else: - final_prompt_text = f"<|im_start|>{p}<|im_end|>" + think_prefix = f"<|im_start|>{GEN_THINK_SYSTEM_PROMPT}<|im_end|>" if args.think else "" + final_prompt_text = f"{think_prefix}<|im_start|>{p}<|im_end|>" prompt_dict = {"prompt": final_prompt_text, "modalities": ["image"]} if args.negative_prompt is not None: prompt_dict["negative_prompt"] = args.negative_prompt @@ -178,6 +202,12 @@ def main(): "cfg_text_scale": args.cfg_text_scale, "cfg_img_scale": args.cfg_img_scale, } + if args.cfg_interval is not None: + extra["cfg_interval"] = tuple(args.cfg_interval) + if args.cfg_renorm_type is not None: + extra["cfg_renorm_type"] = args.cfg_renorm_type + if args.cfg_renorm_min is not None: + extra["cfg_renorm_min"] = args.cfg_renorm_min if args.negative_prompt is not None: extra["negative_prompt"] = args.negative_prompt diffusion_params.extra_args = extra # type: ignore @@ -186,6 +216,17 @@ def main(): img_idx = 0 for req_output in omni_outputs: + if args.think: + text_output = getattr(req_output, "text", None) or getattr(req_output, "outputs", None) + if text_output: + if isinstance(text_output, list) and text_output: + for out in text_output: + txt = getattr(out, "text", str(out)) + if txt: + print(f"[Think] {txt}") + elif isinstance(text_output, str): + print(f"[Think] {text_output}") + images = getattr(req_output, "images", None) if not images: @@ -194,6 +235,7 @@ def main(): for j, img in enumerate(images): save_path = os.path.join(args.output, f"output_{img_idx}_{j}.png") img.save(save_path) + print(f"[Output] Saved image to {save_path}") img_idx += 1 print(omni_outputs) diff --git a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py index aa4f0a74f0..3e053cbda5 100644 --- a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py +++ b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py @@ -326,11 +326,18 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: cfg_text_scale = extra_args.get("cfg_text_scale", 4.0) cfg_img_scale = extra_args.get("cfg_img_scale", 1.5) + cfg_interval = extra_args.get("cfg_interval", (0.4, 1.0)) + cfg_renorm_type = extra_args.get("cfg_renorm_type", "global") + 
cfg_renorm_min = extra_args.get("cfg_renorm_min", 0.0) + gen_params = BagelGenParams( num_timesteps=int(req.sampling_params.num_inference_steps or 50), timestep_shift=3.0, cfg_text_scale=cfg_text_scale, cfg_img_scale=cfg_img_scale, + cfg_interval=cfg_interval, + cfg_renorm_type=cfg_renorm_type, + cfg_renorm_min=cfg_renorm_min, ) gen_context = { diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 71bf6e2379..c998870ce7 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -728,14 +728,15 @@ def _enqueue_cfg_companions( cid = f"{parent_id}{ep.request_id_suffix}" companion_prompt = ep.prompt - # Run through same input processing as the main prompt + companion_params, companion_spl = ep.apply_overrides(stage0_params, sampling_params_list) + if isinstance(companion_prompt, dict): _inject_global_id(companion_prompt, cid) request = self.input_processor.process_inputs( request_id=cid, prompt=companion_prompt, - params=stage0_params, + params=companion_params, supported_tasks=self.supported_tasks, ) request = _upgrade_to_omni_request(request, companion_prompt) @@ -756,7 +757,7 @@ def _enqueue_cfg_companions( "parent_id": parent_id, "role": ep.role, "prompt": request, - "sampling_params_list": sampling_params_list, + "sampling_params_list": companion_spl, } ) diff --git a/vllm_omni/model_executor/models/bagel/bagel.py b/vllm_omni/model_executor/models/bagel/bagel.py index e58b3501c4..e79f0212e2 100644 --- a/vllm_omni/model_executor/models/bagel/bagel.py +++ b/vllm_omni/model_executor/models/bagel/bagel.py @@ -429,6 +429,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._ropes_metadata: dict[str, dict[str, Any]] = {} self._cfg_companion_queue: deque[tuple[tuple[int, int, int, int], int]] = deque() + # Per-request position offset for decode after img2img prefill. + # Prefill rewrites positions (VAE→0, ViT→1, text→2..N) but the model + # runner assigns decode positions starting from prefill_len, not N+1. + # offset = rope - prefill_len (a negative number). + self._pending_decode_offsets: list[int] = [] + self._decode_position_offsets: dict[str, int] = {} + from transformers import AutoTokenizer tok_name = getattr(vllm_config.model_config, "tokenizer", None) or vllm_config.model_config.model @@ -438,6 +445,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): _tok.add_tokens([t]) self._start_of_image_id = int(_tok.convert_tokens_to_ids("<|vision_start|>")) self._end_of_image_id = int(_tok.convert_tokens_to_ids("<|vision_end|>")) + self._img2img_token_id = int(_tok.convert_tokens_to_ids("<|fim_middle|>")) self._vae_token_mask: torch.Tensor | None = None self.device = get_local_device() @@ -518,10 +526,64 @@ def _clear_warmup_state(self): self._ropes_metadata.clear() self._pending_img2img_info.clear() self._cfg_companion_queue.clear() + self._pending_decode_offsets.clear() + self._decode_position_offsets.clear() self._vae_token_mask = None - def get_kv_transfer_metadata(self, req_id: str) -> dict[str, Any] | None: - return self._ropes_metadata.pop(req_id, None) + def get_kv_transfer_metadata( + self, + req_id: str, + *, + num_computed_tokens: int | None = None, + ) -> dict[str, Any] | None: + meta = self._ropes_metadata.pop(req_id, None) + if meta is None: + return None + # In think-mode img2img the prefill rope doesn't account for decoded + # thinking tokens; correct it to num_computed_tokens + offset. + # Skip correction when num_computed_tokens is unavailable (None). 
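+        # The offset was recorded during prefill as (rewritten rope - prefill
+        # length) and is popped so each request consumes it exactly once.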
+ offset = self._decode_position_offsets.pop(req_id, 0) + if offset != 0 and "ropes" in meta and num_computed_tokens is not None: + meta["ropes"] = [num_computed_tokens + offset] + return meta + + def prepare_runner_inputs( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor | None, + inputs_embeds: torch.Tensor | None, + req_ids: list[str], + num_computed_tokens: list[int], + num_scheduled_tokens: list[int], + input_ids_buffer: torch.Tensor | None = None, + ) -> tuple[torch.Tensor | None, torch.Tensor | None]: + """Model-runner hook: adjust inputs before ``forward()``. + + Returns ``(input_ids, positions)`` — possibly modified. + + Two adjustments for BAGEL img2img: + + 1. **Restore input_ids** when ``inputs_embeds`` is present so that + ``_adjust_positions_for_img2img`` can locate the + ``<|fim_middle|>`` placeholder. + 2. **Decode position offset**: prefill rewrites positions to a + compact scheme (rope ≪ prefill_len). The runner assigns decode + positions from ``num_computed_tokens``, which is far too large; + apply the stored per-request offset. + """ + if inputs_embeds is not None and input_ids is None and input_ids_buffer is not None: + input_ids = input_ids_buffer + + if self._decode_position_offsets and positions is not None: + token_start = 0 + for i, rid in enumerate(req_ids): + sched = num_scheduled_tokens[i] + offset = self._decode_position_offsets.get(rid, 0) + if offset != 0 and num_computed_tokens[i] > 0: + positions[token_start : token_start + sched] += offset + token_start += sched + + return input_ids, positions def flush_pending_metadata(self, req_ids: list[str]) -> None: """Map pending metadata (batch order) to req_ids after forward().""" @@ -529,7 +591,14 @@ def flush_pending_metadata(self, req_ids: list[str]) -> None: self._ropes_pending = [] for i, meta in enumerate(pending): if i < len(req_ids): - self._ropes_metadata[req_ids[i]] = meta + if req_ids[i] not in self._ropes_metadata: + self._ropes_metadata[req_ids[i]] = meta + + pending_offsets = self._pending_decode_offsets + self._pending_decode_offsets = [] + for i, offset in enumerate(pending_offsets): + if i < len(req_ids) and offset != 0: + self._decode_position_offsets[req_ids[i]] = offset def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: mm_input_by_modality = {} @@ -643,7 +712,16 @@ def _process_img2img_input(self, multimodal_input): num_vit = vit_emb.shape[0] + 2 info = (num_vae, num_vit, int(H), int(W)) self._pending_img2img_info.append(info) - self._cfg_companion_queue.append((info, 2)) # cfg_text + cfg_img + # Only the gen (main) request should add a companion queue entry. + # Companion requests (cfg_text, cfg_img) also call this method with + # the same image, so guard by checking whether this exact info + # tuple is already enqueued. For batched img2img with multiple + # concurrent gen requests this correctly adds one entry per unique + # image; images with identical (num_vae, num_vit, H, W) that arrive + # in the same batch are indistinguishable here and will share one + # entry, but that is an uncommon edge case. 
+ if not any(entry[0] == info for entry in self._cfg_companion_queue): + self._cfg_companion_queue.append((info, 2)) # cfg_text + cfg_img return tuple(results) @@ -659,42 +737,65 @@ def forward( seq_len = inputs_embeds.shape[0] if inputs_embeds is not None else positions.shape[0] if self._pending_img2img_info: - positions = self._adjust_positions_for_img2img(positions) + positions = self._adjust_positions_for_img2img(positions, input_ids) use_mot = True elif self._cfg_companion_queue: - cached, remaining = self._cfg_companion_queue[0] - remaining -= 1 - num_vae, num_vit, img_H, img_W = cached - num_img2img = num_vae + 1 + num_vit # +1 separator - seq_len = inputs_embeds.shape[0] if inputs_embeds is not None else positions.shape[0] - - if inputs_embeds is not None and seq_len >= num_img2img: - self._pending_img2img_info = [cached] - positions = self._adjust_positions_for_img2img(positions) - use_mot = True + # Guard: if this looks like a pure decode step (small token count, + # no multimodal embeddings), the queue has stale entries from a + # previous prefill cycle — clear them instead of consuming. + if inputs_embeds is None and seq_len <= 2: + self._cfg_companion_queue.clear() else: - rope = int(positions[seq_len - 1].item()) + 1 - self._ropes_pending.append({"ropes": [rope]}) + cached, remaining = self._cfg_companion_queue[0] + remaining -= 1 + num_vae, num_vit, img_H, img_W = cached + num_img2img = num_vae + 1 + num_vit # +1 separator + seq_len = inputs_embeds.shape[0] if inputs_embeds is not None else positions.shape[0] - if remaining == 0: - self._cfg_companion_queue.popleft() - else: - self._cfg_companion_queue[0] = (cached, remaining) + if inputs_embeds is not None and seq_len >= num_img2img: + self._pending_img2img_info = [cached] + positions = self._adjust_positions_for_img2img(positions, input_ids) + use_mot = True + else: + rope = int(positions[seq_len - 1].item()) + 1 + self._ropes_pending.append({"ropes": [rope]}) + + if remaining == 0: + self._cfg_companion_queue.popleft() + else: + self._cfg_companion_queue[0] = (cached, remaining) if use_mot: return self._mot_forward(input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs) return super().forward(input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs) - def _adjust_positions_for_img2img(self, positions: torch.Tensor) -> torch.Tensor: - """Rewrite position IDs to match the single-stage DiT scheme: - VAE tokens -> position 0, separator -> position 0, - ViT tokens -> position 1, text -> 2, 3, ... + def _adjust_positions_for_img2img( + self, + positions: torch.Tensor, + input_ids: torch.Tensor | None = None, + ) -> torch.Tensor: + """Rewrite position IDs to match the original BAGEL position scheme: + + If there are ``pre_text_len`` text tokens before the img2img block:: + + pre_text → 0, 1, ..., M-1 + VAE → M (all share) + separator→ M + ViT → M+1 (all share) + post_text→ M+2, M+3, ... + + When no text precedes the img2img block (M=0), this reduces to the + simpler scheme: VAE→0, ViT→1, text→2, 3, ... Also computes ``self._vae_token_mask`` (bool tensor, True for actual VAE latent patches that should use gen-mode weights) and pushes per-request ropes + image_shape to the FIFO consumed by ``get_kv_transfer_metadata``. + + For img2img requests, also stores a decode position offset so that + subsequent autoregressive decode steps use positions that continue + from the rewritten scheme rather than from the original prefill length. 
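+
+        As an illustration (token counts chosen arbitrarily): with M=2
+        pre-text tokens, num_vae=4 image tokens (markers included) and
+        num_vit=3, the rewritten positions become::
+
+            0 1 | 2 2 2 2 | 2 | 3 3 3 | 4 5 ...
+            pre |   VAE   |sep|  ViT  | post_text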
""" info_list = self._pending_img2img_info self._pending_img2img_info = [] @@ -724,35 +825,66 @@ def _adjust_positions_for_img2img(self, positions: torch.Tensor) -> torch.Tensor num_img2img = num_vae + 1 + num_vit # +1 separator if req_len >= num_img2img: - new_positions[start : start + num_vae] = 0 - new_positions[start + num_vae] = 0 # separator - vit_start = start + num_vae + 1 - new_positions[vit_start : vit_start + num_vit] = 1 - num_text = req_len - num_img2img - if num_text > 0: - text_start = start + num_img2img - new_positions[text_start:end] = torch.arange( - 2, 2 + num_text, device=positions.device, dtype=positions.dtype + # Detect offset of img2img tokens within this request + # by searching for the img2img placeholder token ID. + pre_text_len = 0 + if input_ids is not None: + req_ids = input_ids[start:end] + mask = req_ids == self._img2img_token_id + indices = mask.nonzero(as_tuple=True)[0] + if indices.numel() > 0: + pre_text_len = int(indices[0].item()) + + img_start = start + pre_text_len + post_text_start = img_start + num_img2img + # pre_text_pos: position base for image tokens + pre_text_pos = pre_text_len + + # Pre-image text: sequential positions 0..pre_text_pos-1 + if pre_text_len > 0: + new_positions[start:img_start] = torch.arange( + 0, pre_text_pos, device=positions.device, dtype=positions.dtype + ) + + # VAE tokens: all share position pre_text_pos + new_positions[img_start : img_start + num_vae] = pre_text_pos + # Separator: position pre_text_pos + new_positions[img_start + num_vae] = pre_text_pos + # ViT tokens: all share position pre_text_pos+1 + vit_start = img_start + num_vae + 1 + new_positions[vit_start : vit_start + num_vit] = pre_text_pos + 1 + + # Post-image text: sequential positions pre_text_pos+2, pre_text_pos+3, ... + num_post_text = end - post_text_start + if num_post_text > 0: + new_positions[post_text_start:end] = torch.arange( + pre_text_pos + 2, + pre_text_pos + 2 + num_post_text, + device=positions.device, + dtype=positions.dtype, ) - # VAE gen-mode mask: only actual VAE patches (not markers) - vae_patches_start = start + 1 # skip start_marker - vae_patches_end = start + num_vae - 1 # before end_marker + # VAE gen-mode mask: only actual VAE latent patches (not markers) + vae_patches_start = img_start + 1 # skip start_marker + vae_patches_end = img_start + num_vae - 1 # before end_marker if vae_patches_end > vae_patches_start: vae_mask[vae_patches_start:vae_patches_end] = True - rope = 2 + num_text + rope = pre_text_pos + 2 + num_post_text self._ropes_pending.append( { "ropes": [rope], "image_shape": [img_H, img_W], } ) + decode_offset = rope - req_len + self._pending_decode_offsets.append(decode_offset) img2img_idx += 1 continue rope = int(new_positions[end - 1].item()) + 1 self._ropes_pending.append({"ropes": [rope]}) + self._pending_decode_offsets.append(0) self._vae_token_mask = vae_mask if vae_mask.any() else None return new_positions diff --git a/vllm_omni/model_executor/stage_configs/bagel_think.yaml b/vllm_omni/model_executor/stage_configs/bagel_think.yaml new file mode 100644 index 0000000000..c4cf32c707 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/bagel_think.yaml @@ -0,0 +1,86 @@ +# BAGEL Think Model: AR stage decodes thinking tokens before KV transfer to DiT. +# +# Differences from bagel.yaml: +# - No kv_transfer_criteria: AR stage decodes until EOS, then transfers full +# KV cache (including thinking tokens) via _free_request path. 
+# - prompt_expand_func: uses expand_cfg_prompts_think which sets max_tokens=1 +# on companion requests so they stop immediately after prefill. +# - max_tokens: 2048 for thinking text generation. + +stage_args: + - stage_id: 0 + stage_type: llm + prompt_expand_func: vllm_omni.model_executor.stage_input_processors.bagel.expand_cfg_prompts_think + runtime: + devices: "0" + engine_args: + model_stage: thinker + max_num_seqs: 3 + model_arch: OmniBagelForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.45 + enforce_eager: true + trust_remote_code: true + engine_output_type: text + distributed_executor_backend: "mp" + enable_prefix_caching: false + max_num_batched_tokens: 32768 + tensor_parallel_size: 1 + omni_kv_config: + need_send_cache: true + final_output: true + final_output_type: text + is_comprehension: true + default_sampling_params: + temperature: 0.3 + top_p: 0.9 + top_k: 1 + max_tokens: 2048 + seed: 52 + detokenize: True + repetition_penalty: 1.05 + + - stage_id: 1 + stage_type: diffusion + cfg_kv_collect_func: vllm_omni.model_executor.stage_input_processors.bagel.collect_cfg_kv_caches + runtime: + devices: "0" + engine_args: + model_stage: dit + max_num_seqs: 1 + gpu_memory_utilization: 0.45 + enforce_eager: true + trust_remote_code: true + engine_output_type: image + distributed_executor_backend: "mp" + enable_prefix_caching: false + max_num_batched_tokens: 32768 + tensor_parallel_size: 1 + omni_kv_config: + need_recv_cache: true + engine_input_source: [0] + + final_output: true + final_output_type: image + is_comprehension: false + default_sampling_params: + seed: 52 + +# Runtime edges +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 + + connectors: + shared_memory_connector: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + + edges: + - from: 0 + to: 1 + window_size: -1 diff --git a/vllm_omni/model_executor/stage_input_processors/bagel.py b/vllm_omni/model_executor/stage_input_processors/bagel.py index d7055ff518..6b88fcd4a1 100644 --- a/vllm_omni/model_executor/stage_input_processors/bagel.py +++ b/vllm_omni/model_executor/stage_input_processors/bagel.py @@ -30,6 +30,26 @@ class ExpandedPrompt: prompt: dict[str, Any] | str role: str request_id_suffix: str + sampling_params_override: dict[str, Any] | None = None + + def apply_overrides( + self, + base_params: Any, + base_spl: list[Any], + ) -> tuple[Any, list[Any]]: + """Return ``(params, sampling_params_list)`` with overrides applied. + + If this prompt has no overrides the originals are returned as-is. + """ + if not self.sampling_params_override: + return base_params, base_spl + patched = base_params.clone() + for k, v in self.sampling_params_override.items(): + setattr(patched, k, v) + spl = list(base_spl) + if spl: + spl[0] = patched + return patched, spl def expand_cfg_prompts( @@ -108,6 +128,95 @@ def expand_cfg_prompts( return [] +GEN_THINK_SYSTEM_PROMPT = ( + "You should first think about the planning process in the mind " + "and then generate the image. \n" + "The planning process is enclosed within tags, " + "i.e. planning process here image here" +) + + +def expand_cfg_prompts_think( + prompt: dict[str, Any] | str, + sampling_params: Any, +) -> list[ExpandedPrompt]: + """Expand prompts for Bagel CFG in thinking mode. + + Same as expand_cfg_prompts but companion requests get max_tokens=1 + so they stop immediately after prefill (no thinking decode). 
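+    The max_tokens override itself is applied later, via
+    ExpandedPrompt.apply_overrides, when the engine enqueues the CFG
+    companion requests.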
+ + In thinking mode the gen (main) request decodes thinking tokens until + EOS; companions should only contribute their prefill KV cache. + """ + if not isinstance(prompt, dict): + return [] + + modalities = prompt.get("modalities", []) + if "image" not in modalities and "img2img" not in modalities: + return [] + + neg_prompt = _get_negative_prompt(prompt, sampling_params) + companion_params = {"max_tokens": 1} + + if "image" in modalities: + neg_prompt_dict = { + "prompt": neg_prompt, + "modalities": prompt.get("modalities", []), + } + return [ + ExpandedPrompt( + prompt=neg_prompt_dict, + role="cfg_text", + request_id_suffix=CFG_TEXT_SUFFIX, + sampling_params_override=companion_params, + ), + ] + + if "img2img" in modalities: + IMG2IMG_PLACEHOLDER = "<|fim_middle|>" + + original_text = prompt.get("prompt", "") + # Extract system prompt prefix (everything before <|fim_middle|>) + # so cfg_text gets system_prompt + image (no user text), matching + # the original BAGEL code where cfg_text = deepcopy(gen after image). + parts = original_text.split(IMG2IMG_PLACEHOLDER, 1) + system_prefix = parts[0] if len(parts) > 1 else "" + + cfg_text_prompt = f"{system_prefix}{IMG2IMG_PLACEHOLDER}{neg_prompt}" + cfg_text_dict: dict[str, Any] = { + "prompt": cfg_text_prompt, + "modalities": ["img2img"], + } + mm_data = prompt.get("multi_modal_data") + if mm_data: + cfg_text_dict["multi_modal_data"] = mm_data + + cfg_img_text = original_text.replace(IMG2IMG_PLACEHOLDER, "") + cfg_img_dict: dict[str, Any] = { + "prompt": cfg_img_text, + "modalities": ["img2img"], + } + if mm_data: + cfg_img_dict["multi_modal_data"] = mm_data + + return [ + ExpandedPrompt( + prompt=cfg_text_dict, + role="cfg_text", + request_id_suffix=CFG_TEXT_SUFFIX, + sampling_params_override=companion_params, + ), + ExpandedPrompt( + prompt=cfg_img_dict, + role="cfg_img", + request_id_suffix=CFG_IMG_SUFFIX, + sampling_params_override=companion_params, + ), + ] + + return [] + + def collect_cfg_kv_caches( request_id: str, cfg_request_ids: dict[str, str], diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 697c39d242..155b75675f 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -108,7 +108,14 @@ def execute_model( if finished_reqs and hasattr(self.model, "get_kv_transfer_metadata"): for req_id, data in finished_reqs.items(): try: - model_meta = self.model.get_kv_transfer_metadata(req_id) + req_idx = self.input_batch.req_id_to_index.get(req_id) + num_computed = ( + int(self.input_batch.num_computed_tokens_cpu[req_idx]) if req_idx is not None else None + ) + model_meta = self.model.get_kv_transfer_metadata( + req_id, + num_computed_tokens=num_computed, + ) if model_meta: existing = data.get("custom_metadata") or {} existing.update(model_meta) @@ -266,6 +273,19 @@ def execute_model( ec_connector_output, ) = self._preprocess(scheduler_output, num_tokens_padded, intermediate_tensors) + # Let the model adjust inputs before forward (e.g. restore input_ids + # for multimodal position detection, fix decode position offsets). 
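+        # The hook is optional: models that do not define
+        # prepare_runner_inputs keep input_ids and positions untouched.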
+ if hasattr(self.model, "prepare_runner_inputs"): + input_ids, positions = self.model.prepare_runner_inputs( + input_ids=input_ids, + positions=positions, + inputs_embeds=inputs_embeds, + req_ids=req_ids[:num_reqs], + num_computed_tokens=[int(self.input_batch.num_computed_tokens_cpu[i]) for i in range(num_reqs)], + num_scheduled_tokens=[int(num_scheduled_tokens_np[i]) for i in range(num_reqs)], + input_ids_buffer=self.input_ids.gpu[:num_tokens_padded], + ) + # Set cudagraph mode to none if calc_kv_scales is true. # KV scales calculation involves dynamic operations that are incompatible # with CUDA graph capture. From 7fb86d51cd0ad6745734b367a28dc24370552f88 Mon Sep 17 00:00:00 2001 From: Sy03 <1370724210@qq.com> Date: Fri, 3 Apr 2026 14:56:33 +0800 Subject: [PATCH 033/204] [BugFix][FishSpeech] Fix structured voice clone prefill conditioning (#2446) --- .../models/test_fish_speech_regressions.py | 108 ++++++++++++++++++ .../models/fish_speech/dac_encoder.py | 90 ++++++++------- .../fish_speech/fish_speech_dac_decoder.py | 19 +-- .../models/fish_speech/fish_speech_slow_ar.py | 37 +++++- 4 files changed, 201 insertions(+), 53 deletions(-) create mode 100644 tests/model_executor/models/test_fish_speech_regressions.py diff --git a/tests/model_executor/models/test_fish_speech_regressions.py b/tests/model_executor/models/test_fish_speech_regressions.py new file mode 100644 index 0000000000..1f8c3cf71e --- /dev/null +++ b/tests/model_executor/models/test_fish_speech_regressions.py @@ -0,0 +1,108 @@ +import math + +import pytest +import torch + +from vllm_omni.model_executor.models.fish_speech import fish_speech_slow_ar as slow_ar_module +from vllm_omni.model_executor.models.fish_speech.fish_speech_dac_decoder import FishSpeechDACDecoder +from vllm_omni.model_executor.models.fish_speech.fish_speech_slow_ar import ( + FishSpeechSlowARForConditionalGeneration, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +class _FakeCodec: + def decode(self, codes_bqf: torch.Tensor, feature_lengths: torch.Tensor): + del codes_bqf, feature_lengths + wav = torch.arange(100, dtype=torch.float32).view(1, 1, 100) + audio_lengths = torch.tensor([100], dtype=torch.long) + return wav, audio_lengths + + +class _FakeTokenizer: + def __init__(self, mapping, unk_token_id=-1): + self._mapping = mapping + self.unk_token_id = unk_token_id + + def convert_tokens_to_ids(self, token: str) -> int: + return self._mapping.get(token, self.unk_token_id) + + +def test_dac_decoder_mixed_batch_empty_request_does_not_misalign_indices(): + decoder = object.__new__(FishSpeechDACDecoder) + torch.nn.Module.__init__(decoder) + decoder._codec = _FakeCodec() + decoder._num_codebooks = 10 + decoder._output_sample_rate = 44100 + decoder._hop_length = 512 + decoder._logged_codec_stats = False + decoder._ensure_codec_loaded = lambda: None + decoder._split_request_ids = lambda ids, seq_token_counts=None: [ + torch.empty((0,), dtype=torch.long), + torch.arange(20, dtype=torch.long), + ] + + out = decoder.forward( + input_ids=torch.arange(20, dtype=torch.long), + runtime_additional_information=[{}, {"left_context_size": 1}], + ) + + audios = out.multimodal_outputs["model_outputs"] + assert len(audios) == 2 + assert audios[0].numel() == 0 + # 2 total frames with 1 frame of left context => proportional trim removes half the samples. 
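+    # The fake codec always decodes to 100 samples, so cut = int(1 / 2 * 100)
+    # = 50 and the trimmed waveform keeps the remaining 50 samples.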
+ assert audios[1].shape[0] == 50 + + +def test_structured_voice_clone_prefill_adds_full_codebooks_with_decode_scale(monkeypatch): + model = object.__new__(FishSpeechSlowARForConditionalGeneration) + torch.nn.Module.__init__(model) + model._num_codebooks = 2 + model._codebook_size = 8 + model._semantic_begin_id = 100 + model.model_path = "unused" + + hidden_size = 3 + text_embed = torch.nn.Embedding(256, hidden_size) + codebook_embed = torch.nn.Embedding(model._num_codebooks * model._codebook_size, hidden_size) + with torch.no_grad(): + text_embed.weight.zero_() + text_embed.weight[20] = torch.tensor([1.0, 2.0, 3.0]) + text_embed.weight[21] = torch.tensor([4.0, 5.0, 6.0]) + codebook_embed.weight.zero_() + codebook_embed.weight[1] = torch.tensor([10.0, 0.0, 0.0]) + codebook_embed.weight[10] = torch.tensor([0.0, 20.0, 0.0]) + codebook_embed.weight[3] = torch.tensor([30.0, 0.0, 0.0]) + codebook_embed.weight[12] = torch.tensor([0.0, 40.0, 0.0]) + + model.embed_input_ids = lambda ids: text_embed(ids) + model.codebook_embeddings = codebook_embed + model._get_tokenizer = lambda: _FakeTokenizer({"<|audio_start|>": 10, "<|audio_end|>": 11}) + + monkeypatch.setattr(slow_ar_module.np, "load", lambda path: [0.0]) + monkeypatch.setattr(slow_ar_module.os, "remove", lambda path: None) + monkeypatch.setattr( + slow_ar_module, + "encode_reference_audio_codes", + lambda *args, **kwargs: torch.tensor([[1, 2], [3, 4]], dtype=torch.long), + ) + monkeypatch.setattr( + slow_ar_module, + "build_fish_voice_clone_prompt_ids", + lambda tokenizer, text, ref_text, semantic_token_ids: ([1, 10, 20, 21, 11, 2], None, None), + ) + + prefill = model._build_structured_voice_clone_prefill_embeds( + { + "ref_text": "ref", + "text": "target", + "ref_audio_path": "unused.npy", + "ref_audio_sr": 16000, + } + ) + + expected_0 = (torch.tensor([1.0, 2.0, 3.0]) + torch.tensor([10.0, 20.0, 0.0])) / math.sqrt(3.0) + expected_1 = (torch.tensor([4.0, 5.0, 6.0]) + torch.tensor([30.0, 40.0, 0.0])) / math.sqrt(3.0) + assert torch.allclose(prefill[2].to(dtype=torch.float32), expected_0, atol=2e-2, rtol=0) + assert torch.allclose(prefill[3].to(dtype=torch.float32), expected_1, atol=2e-2, rtol=0) diff --git a/vllm_omni/model_executor/models/fish_speech/dac_encoder.py b/vllm_omni/model_executor/models/fish_speech/dac_encoder.py index e89815ab43..397530ca34 100644 --- a/vllm_omni/model_executor/models/fish_speech/dac_encoder.py +++ b/vllm_omni/model_executor/models/fish_speech/dac_encoder.py @@ -66,42 +66,22 @@ def _load_dac_codec( def _get_resample_kernel( source_sr: int, target_sr: int, - device_type: str, - device_index: int | None, - dtype_name: str, + device: torch.device, + dtype: torch.dtype, ): import torchaudio - device = torch.device(device_type, device_index) if device_index is not None else torch.device(device_type) - dtype = getattr(torch, dtype_name) + # lru_cache requires hashable key parts; torch.device and torch.dtype are. return torchaudio.transforms.Resample(source_sr, target_sr).to(device=device, dtype=dtype) -@torch.no_grad() -def encode_reference_audio( - model_path: str, +def _prepare_reference_audio_tensor( wav_samples: list[float] | np.ndarray | torch.Tensor, sample_rate: int, *, - device: torch.device | str | None = None, -) -> list[int]: - """Encode reference audio into semantic token IDs for prompt conditioning. - - Args: - model_path: HuggingFace model path (for locating codec.pth). - wav_samples: Audio waveform samples (mono, float). - sample_rate: Sample rate of the input audio. 
- - Returns: - List of semantic token IDs (151678 + code_value for each frame). - """ - if device is None: - device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - else: - device = torch.device(device) - dtype = torch.float32 - codec = _load_dac_codec(model_path, device=device, dtype=dtype) - + device: torch.device, + dtype: torch.dtype, +) -> torch.Tensor: if isinstance(wav_samples, torch.Tensor): wav_tensor = wav_samples.detach() else: @@ -124,28 +104,52 @@ def encode_reference_audio( resampler = _get_resample_kernel( int(sample_rate), DAC_SAMPLE_RATE, - device.type, - device.index, - "float32", + device, + dtype, ) wav_tensor = resampler(wav_tensor.unsqueeze(0)).squeeze(0) + return wav_tensor - # Encode: [1, 1, T] -> codes [1, num_codebooks, num_frames] - wav_tensor = wav_tensor.unsqueeze(0).unsqueeze(0) - feature_lengths = torch.tensor([wav_tensor.shape[-1]], device=device, dtype=torch.long) - codes, feature_lengths_out = codec.encode(wav_tensor, feature_lengths) - # Extract semantic codebook (index 0) - shape [num_frames]. - semantic_codes = codes[0, 0, :].to(device="cpu", dtype=torch.long).tolist() +@torch.no_grad() +def encode_reference_audio_codes( + model_path: str, + wav_samples: list[float] | np.ndarray | torch.Tensor, + sample_rate: int, + *, + device: torch.device | str | None = None, +) -> torch.Tensor: + """Encode reference audio into DAC codebook indices. - # Convert to semantic token IDs: <|semantic:{i}|> = 151678 + i - SEMANTIC_TOKEN_OFFSET = 151678 - semantic_token_ids = [SEMANTIC_TOKEN_OFFSET + int(c) for c in semantic_codes] + Returns: + Tensor of shape [num_frames, num_codebooks] on the requested device + (dtype=torch.long). + """ + if device is None: + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + else: + device = torch.device(device) + dtype = torch.float32 + codec = _load_dac_codec(model_path, device=device, dtype=dtype) + wav_tensor = _prepare_reference_audio_tensor( + wav_samples, + sample_rate, + device=device, + dtype=dtype, + ) + + wav_tensor = wav_tensor.unsqueeze(0).unsqueeze(0) + feature_lengths = torch.tensor([wav_tensor.shape[-1]], device=device, dtype=torch.long) + codes, _ = codec.encode(wav_tensor, feature_lengths) + prepared_num_samples = int(wav_tensor.shape[-1]) + # [1, num_codebooks, num_frames] -> [num_frames, num_codebooks] + codes_fq = codes[0].transpose(0, 1).to(dtype=torch.long).contiguous() logger.info( - "Encoded reference audio: %d samples @ %dHz -> %d semantic tokens", - int(wav_tensor.shape[-1]), + "Encoded reference audio codes: %d samples @ %dHz -> frames=%d codebooks=%d", + prepared_num_samples, sample_rate, - len(semantic_token_ids), + int(codes_fq.shape[0]), + int(codes_fq.shape[1]), ) - return semantic_token_ids + return codes_fq diff --git a/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py b/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py index 3a8042eb2e..e121b03371 100644 --- a/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py +++ b/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py @@ -213,7 +213,9 @@ def forward( ids = input_ids.reshape(-1).to(dtype=torch.long) request_ids_list = self._split_request_ids(ids, kwargs.get("seq_token_counts")) - parsed_ctx_frames: list[int] = [] + num_req = len(request_ids_list) + parsed_ctx_frames = [0] * num_req + parsed_total_frames = [0] * num_req valid_codes_qf: list[torch.Tensor] = [] valid_indices: list[int] = [] left_context_size = [0] 
* len(request_ids_list) @@ -226,7 +228,6 @@ def forward( for i, req_ids in enumerate(request_ids_list): if req_ids.numel() < 1: - parsed_ctx_frames.append(0) continue ctx_frames = left_context_size[i] flat = req_ids @@ -238,15 +239,13 @@ def forward( n, q, ) - parsed_ctx_frames.append(0) continue frames = n // q codes_qf = flat.reshape(q, frames) - parsed_ctx_frames.append(ctx_frames) + parsed_ctx_frames[i] = ctx_frames + parsed_total_frames[i] = frames valid_codes_qf.append(codes_qf) valid_indices.append(i) - - num_req = len(request_ids_list) if not valid_codes_qf: return OmniOutput( text_hidden_states=None, @@ -297,11 +296,17 @@ def forward( for j, idx in enumerate(valid_indices): ctx_frames = parsed_ctx_frames[idx] + total_frames = parsed_total_frames[idx] audio_len = int(audio_lengths[j].item()) if audio_lengths.numel() > j else int(wav_batch.shape[-1]) wav = wav_batch[j, 0, :audio_len] # Trim context frames (left overlap for streaming). if ctx_frames > 0: - cut = ctx_frames * self._hop_length + # Decode length may deviate from (frames * hop_length) due to model + # internals (padding/rounding). Use proportional trimming to keep + # overlap removal aligned with the actual decoded length. + denom = max(int(total_frames), 1) + cut = int(ctx_frames / denom * wav.shape[0]) + cut = max(0, min(cut, int(wav.shape[0]))) if cut < wav.shape[0]: wav = wav[cut:] else: diff --git a/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py b/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py index b2e8a95445..4ad2a1fa63 100644 --- a/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py +++ b/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py @@ -35,7 +35,7 @@ from vllm_omni.model_executor.models.output_templates import OmniOutput from .configuration_fish_speech import FishSpeechConfig, FishSpeechFastARConfig, FishSpeechSlowARConfig -from .dac_encoder import _load_dac_codec, encode_reference_audio +from .dac_encoder import _load_dac_codec, encode_reference_audio_codes from .fish_speech_fast_ar import FishSpeechFastAR from .prompt_utils import build_fish_voice_clone_prompt_ids @@ -530,12 +530,13 @@ def _build_structured_voice_clone_prefill_embeds(self, info_dict: dict[str, Any] ref_audio_wav = np.load(ref_audio_path) os.remove(ref_audio_path) - semantic_token_ids = encode_reference_audio( + ref_codes_fq = encode_reference_audio_codes( self.model_path, ref_audio_wav, ref_audio_sr, device=self.codebook_embeddings.weight.device, ) + semantic_token_ids = (ref_codes_fq[:, 0] + self._semantic_begin_id).tolist() prompt_ids, _, _ = build_fish_voice_clone_prompt_ids( tokenizer, text, @@ -547,7 +548,37 @@ def _build_structured_voice_clone_prefill_embeds(self, info_dict: dict[str, Any] dtype=torch.long, device=self.codebook_embeddings.weight.device, ) - return self.embed_input_ids(prompt_ids.unsqueeze(0)).squeeze(0).to(dtype=torch.bfloat16) + embeds = self.embed_input_ids(prompt_ids.unsqueeze(0)).squeeze(0).to(dtype=torch.bfloat16) + + audio_start_id = tokenizer.convert_tokens_to_ids("<|audio_start|>") + audio_end_id = tokenizer.convert_tokens_to_ids("<|audio_end|>") + start_pos = (prompt_ids == int(audio_start_id)).nonzero(as_tuple=False) + end_pos = (prompt_ids == int(audio_end_id)).nonzero(as_tuple=False) + if start_pos.numel() == 0 or end_pos.numel() == 0: + return embeds + s = int(start_pos[0].item()) + 1 + e = int(end_pos[0].item()) + if e <= s: + return embeds + + frames_in_prompt = e - s + if ref_codes_fq.device != embeds.device: + ref_codes_fq = 
ref_codes_fq.to(device=embeds.device, dtype=torch.long) + frames = min(int(ref_codes_fq.shape[0]), int(frames_in_prompt)) + if frames <= 0: + return embeds + + q = min(int(ref_codes_fq.shape[1]), self._num_codebooks) + offsets = (torch.arange(q, device=embeds.device, dtype=torch.long) * self._codebook_size).unsqueeze(0) + ref_codes_slice = ref_codes_fq[:frames, :q] + if bool((ref_codes_slice < 0).any().item()): + logger.warning("Fish Speech structured clone saw negative DAC codes; clamping them to zero") + code_with_offset = ref_codes_slice.clamp(min=0) + offsets + codebook_sum = self.codebook_embeddings(code_with_offset).sum(dim=1).to(dtype=embeds.dtype) + + result = embeds.clone() + result[s : s + frames] = (result[s : s + frames] + codebook_sum) / math.sqrt(self._num_codebooks + 1) + return result.to(dtype=torch.bfloat16) # -------------------- GPU-side MTP fast-path -------------------- From 563f73b78a1be00f483f1d940bb5bf6276550984 Mon Sep 17 00:00:00 2001 From: chickeyton Date: Fri, 3 Apr 2026 16:05:19 +0800 Subject: [PATCH 034/204] Refactor StageDiffusionClient and StageEngineCoreClient (#2006) --- docs/api/README.md | 1 - docs/contributing/ci/CI_5levels.md | 1 - docs/contributing/ci/tests_style.md | 1 - docs/design/module/async_omni_architecture.md | 4 +- .../test_qwen_image_diffusion_batching.py | 2 +- .../test_async_omni_engine_stage_init.py | 1 + .../openai_api/test_image_server.py | 2 +- .../entrypoints/test_async_omni_diffusion.py | 113 ---- vllm_omni/diffusion/ipc.py | 15 +- vllm_omni/diffusion/stage_diffusion_client.py | 320 +++++++--- vllm_omni/diffusion/stage_diffusion_proc.py | 604 ++++++++++++++++++ vllm_omni/engine/async_omni_engine.py | 21 +- vllm_omni/engine/orchestrator.py | 2 +- vllm_omni/engine/stage_engine_core_client.py | 28 +- vllm_omni/engine/stage_engine_core_proc.py | 206 ++++++ vllm_omni/engine/stage_init_utils.py | 41 +- vllm_omni/entrypoints/__init__.py | 2 - vllm_omni/entrypoints/async_omni_diffusion.py | 473 -------------- vllm_omni/entrypoints/openai/serving_chat.py | 41 +- 19 files changed, 1108 insertions(+), 770 deletions(-) delete mode 100644 tests/entrypoints/test_async_omni_diffusion.py create mode 100644 vllm_omni/diffusion/stage_diffusion_proc.py create mode 100644 vllm_omni/engine/stage_engine_core_proc.py delete mode 100644 vllm_omni/entrypoints/async_omni_diffusion.py diff --git a/docs/api/README.md b/docs/api/README.md index 2266a52415..f65cbb525d 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -5,7 +5,6 @@ Main entry points for vLLM-Omni inference and serving. - [vllm_omni.entrypoints.async_omni.AsyncOmni][] -- [vllm_omni.entrypoints.async_omni_diffusion.AsyncOmniDiffusion][] - [vllm_omni.entrypoints.cfg_companion_tracker.CfgCompanionTracker][] - [vllm_omni.entrypoints.cli.benchmark.base.OmniBenchmarkSubcommandBase][] - [vllm_omni.entrypoints.cli.benchmark.main.OmniBenchmarkSubcommand][] diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index 81392b201d..967d0cc6d7 100644 --- a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -168,7 +168,6 @@ vllm_omni/ tests/ │ └── arg_utils.py │ └── test_arg_utils.py ⬜ │ ├── entrypoints/ → ├── entrypoints/ -│ ├── async_omni_diffusion.py │ ├── test_async_omni_diffusion_config.py ✅ │ ├── stage_utils.py │ ├── test_stage_utils.py ✅ │ ├── cli/ │ ├── cli/ (benchmarks/test_serve_cli.py covers CLI serve) │ │ └── ... 
│ │ └── test_*.py ⬜ diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md index 0b07c5ffe4..8b10cf4cc1 100644 --- a/docs/contributing/ci/tests_style.md +++ b/docs/contributing/ci/tests_style.md @@ -73,7 +73,6 @@ vllm_omni/ tests/ │ └── arg_utils.py │ └── test_arg_utils.py ⬜ │ ├── entrypoints/ → ├── entrypoints/ -│ ├── async_omni_diffusion.py │ ├── test_async_omni_diffusion_config.py ✅ │ ├── stage_utils.py │ ├── test_stage_utils.py ✅ │ ├── cli/ │ ├── cli/ (benchmarks/test_serve_cli.py covers CLI serve) │ │ └── ... │ │ └── test_*.py ⬜ diff --git a/docs/design/module/async_omni_architecture.md b/docs/design/module/async_omni_architecture.md index 59275c556f..92b13a3da0 100644 --- a/docs/design/module/async_omni_architecture.md +++ b/docs/design/module/async_omni_architecture.md @@ -69,7 +69,7 @@ [5] Orchestrator._orchestration_loop (loop) -> poll stage output - llm stage: await get_output_async() - - diffusion stage: get_diffusion_output_async() + - diffusion stage: get_diffusion_output_nowait() -> (llm stage) output_processors[i].process_outputs(...) -> _route_output(...) -> if finished and not final_stage and non-async-chunk: @@ -112,7 +112,7 @@ sequenceDiagram ORCH->>S0: add_request_async loop poll route forward - ORCH->>S0: get_output_async / get_diffusion_output_async + ORCH->>S0: get_output_async / get_diffusion_output_nowait ORCH->>ORCH: _route_output alt need forward to next stage ORCH->>SN: add_request_async diff --git a/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py b/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py index e5c7387260..d5f82f893e 100644 --- a/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py +++ b/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py @@ -509,7 +509,7 @@ def test_diffusion_batching_async_explicit_batch(model_name: str): all prompts in a single engine call and returns a single combined result. The list-prompt path routes through the orchestrator's - ``add_batch_request_async`` → ``AsyncOmniDiffusion.generate_batch`` + ``add_batch_request_async`` → ``AsyncOmni.generate_batch`` and yields ONE ``OmniRequestOutput`` with ALL images combined. 
""" diff --git a/tests/engine/test_async_omni_engine_stage_init.py b/tests/engine/test_async_omni_engine_stage_init.py index 28b44e9bd7..9f47fd449d 100644 --- a/tests/engine/test_async_omni_engine_stage_init.py +++ b/tests/engine/test_async_omni_engine_stage_init.py @@ -113,6 +113,7 @@ def __init__(self, vllm_config, renderer=None): executor_class=object, engine_manager=object(), coordinator=object(), + proc=None, addresses=types.SimpleNamespace( inputs=["inproc://input"], outputs=["inproc://output"], diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py index 7d2a67e730..d68143dae8 100644 --- a/tests/entrypoints/openai_api/test_image_server.py +++ b/tests/entrypoints/openai_api/test_image_server.py @@ -106,7 +106,7 @@ def test_encode_image_base64(): class MockGenerationResult: - """Mock result object from AsyncOmniDiffusion.generate()""" + """Mock result object from AsyncOmni.generate()""" def __init__(self, images): self.images = images diff --git a/tests/entrypoints/test_async_omni_diffusion.py b/tests/entrypoints/test_async_omni_diffusion.py deleted file mode 100644 index c8aaae4f94..0000000000 --- a/tests/entrypoints/test_async_omni_diffusion.py +++ /dev/null @@ -1,113 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import asyncio -import threading -from concurrent.futures import ThreadPoolExecutor -from types import SimpleNamespace -from unittest.mock import Mock - -import pytest - -import vllm_omni.diffusion.stage_diffusion_client as stage_diffusion_client_module -from vllm_omni.diffusion.data import DiffusionRequestAbortedError -from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient -from vllm_omni.entrypoints.async_omni_diffusion import AsyncOmniDiffusion -from vllm_omni.inputs.data import OmniDiffusionSamplingParams - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -def test_get_diffusion_od_config_returns_direct_config(): - diffusion = object.__new__(AsyncOmniDiffusion) - diffusion.od_config = object() - - assert diffusion.get_diffusion_od_config() is diffusion.od_config - - -def test_async_omni_diffusion_generate_aborts_engine_on_cancel(): - async def run_test(): - started = threading.Event() - release = threading.Event() - abort = Mock() - - def step(request): - del request - started.set() - release.wait(timeout=5) - return [SimpleNamespace(request_id="req-1")] - - diffusion = object.__new__(AsyncOmniDiffusion) - diffusion.engine = SimpleNamespace(step=step, abort=abort) - diffusion._executor = ThreadPoolExecutor(max_workers=1) - - task = asyncio.create_task( - diffusion.generate( - prompt="hello", - sampling_params=OmniDiffusionSamplingParams(), - request_id="req-1", - ) - ) - try: - assert await asyncio.to_thread(started.wait, 1) - task.cancel() - with pytest.raises(asyncio.CancelledError): - await task - finally: - release.set() - diffusion._executor.shutdown(wait=True) - - abort.assert_called_once_with("req-1") - - asyncio.run(run_test()) - - -def test_stage_diffusion_client_abort_requests_forwards_to_engine(): - async def run_test(): - aborted_request_ids: list[list[str]] = [] - - async def abort(request_ids): - aborted_request_ids.append(request_ids) - - client = object.__new__(StageDiffusionClient) - client._engine = SimpleNamespace(abort=abort) - client._tasks = {} - - task = asyncio.create_task(asyncio.sleep(60)) - client._tasks["req-1"] = task - - await client.abort_requests_async(["req-1", "req-2"]) - - 
with pytest.raises(asyncio.CancelledError): - await task - assert client._tasks == {} - assert aborted_request_ids == [["req-1", "req-2"]] - - asyncio.run(run_test()) - - -def test_stage_diffusion_client_run_treats_abort_as_normal_path(monkeypatch): - async def run_test(): - async def generate(prompt, sampling_params, request_id): - del prompt, sampling_params - raise DiffusionRequestAbortedError(f"Request {request_id} aborted.") - - info = Mock() - exception = Mock() - monkeypatch.setattr(stage_diffusion_client_module.logger, "info", info) - monkeypatch.setattr(stage_diffusion_client_module.logger, "exception", exception) - - client = object.__new__(StageDiffusionClient) - client.stage_id = 3 - client._engine = SimpleNamespace(generate=generate) - client._output_queue = asyncio.Queue() - client._tasks = {"req-1": object()} - - await client._run("req-1", "prompt", OmniDiffusionSamplingParams()) - - assert client._output_queue.empty() - assert client._tasks == {} - info.assert_called_once() - exception.assert_not_called() - - asyncio.run(run_test()) diff --git a/vllm_omni/diffusion/ipc.py b/vllm_omni/diffusion/ipc.py index d3d7b3aff3..9aafc1cf17 100644 --- a/vllm_omni/diffusion/ipc.py +++ b/vllm_omni/diffusion/ipc.py @@ -31,6 +31,12 @@ def _tensor_to_shm(tensor: torch.Tensor) -> dict[str, Any]: import numpy as np tensor = tensor.detach().cpu().contiguous() + original_dtype = tensor.dtype + # NumPy does not support bfloat16; promote to float32 for the SHM + # transfer and record the original dtype so _tensor_from_shm can + # convert back. The round-trip is lossless for bfloat16 values. + if original_dtype == torch.bfloat16: + tensor = tensor.to(torch.float32) arr = tensor.numpy() nbytes = arr.nbytes shm = shared_memory.SharedMemory(create=True, size=nbytes) @@ -40,7 +46,7 @@ def _tensor_to_shm(tensor: torch.Tensor) -> dict[str, Any]: "__tensor_shm__": True, "name": shm.name, "shape": list(tensor.shape), - "torch_dtype": str(tensor.dtype), + "torch_dtype": str(original_dtype), "numpy_dtype": str(arr.dtype), "nbytes": nbytes, } @@ -59,6 +65,13 @@ def _tensor_from_shm(handle: dict[str, Any]) -> torch.Tensor: np_dtype = np.dtype(handle["numpy_dtype"]) arr = np.ndarray(handle["shape"], dtype=np_dtype, buffer=shm.buf[: handle["nbytes"]]) tensor = torch.from_numpy(arr.copy()) + # Restore the original dtype if it differs from the numpy-compatible + # dtype used for the SHM transfer (e.g. bfloat16 → float32 → bfloat16). + torch_dtype_str = handle.get("torch_dtype", "") + if torch_dtype_str: + original_dtype = getattr(torch, torch_dtype_str.replace("torch.", ""), None) + if original_dtype is not None and tensor.dtype != original_dtype: + tensor = tensor.to(original_dtype) finally: shm.close() shm.unlink() diff --git a/vllm_omni/diffusion/stage_diffusion_client.py b/vllm_omni/diffusion/stage_diffusion_client.py index ddad2f9f3f..db13f99aab 100644 --- a/vllm_omni/diffusion/stage_diffusion_client.py +++ b/vllm_omni/diffusion/stage_diffusion_client.py @@ -1,20 +1,30 @@ """Stage Diffusion Client for vLLM-Omni multi-stage runtime. -Wraps AsyncOmniDiffusion to expose the same interface the Orchestrator -expects from any stage client. +Spawns StageDiffusionProc in a subprocess and communicates via ZMQ +(PUSH/PULL) to expose the same interface the Orchestrator expects +from any stage client. 
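+
+Requests and responses travel as msgpack-encoded dicts over the two
+sockets. As an illustrative sketch of the wire format (field values
+elided), a single generation request is pushed as::
+
+    {"type": "add_request", "request_id": ..., "prompt": ...,
+     "sampling_params": {...}}
+
+and the subprocess answers on the PULL side with ``{"type": "result", ...}``,
+``{"type": "rpc_result", ...}`` or ``{"type": "error", ...}`` messages.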
""" from __future__ import annotations import asyncio import time +import uuid +from dataclasses import fields, is_dataclass from typing import TYPE_CHECKING, Any +import zmq from vllm.logger import init_logger -from vllm_omni.diffusion.data import DiffusionRequestAbortedError -from vllm_omni.engine.stage_init_utils import StageMetadata -from vllm_omni.entrypoints.async_omni_diffusion import AsyncOmniDiffusion +from vllm_omni.diffusion.stage_diffusion_proc import ( + complete_diffusion_handshake, + spawn_diffusion_proc, +) +from vllm_omni.distributed.omni_connectors.utils.serialization import ( + OmniMsgpackDecoder, + OmniMsgpackEncoder, +) +from vllm_omni.engine.stage_init_utils import StageMetadata, terminate_alive_proc from vllm_omni.outputs import OmniRequestOutput if TYPE_CHECKING: @@ -25,11 +35,12 @@ class StageDiffusionClient: - """Wraps AsyncOmniDiffusion for use inside the Orchestrator. + """Communicates with StageDiffusionProc via ZMQ for use inside the Orchestrator. Exposes the same attributes and async methods the Orchestrator uses on StageEngineCoreClient, but routes execution through - DiffusionEngine instead of vLLM EngineCore. + a StageDiffusionProc subprocess instead of running the diffusion + engine in-process. """ stage_type: str = "diffusion" @@ -48,56 +59,137 @@ def __init__( self.custom_process_input_func = metadata.custom_process_input_func self.engine_input_source = metadata.engine_input_source - self._engine = AsyncOmniDiffusion(model=model, od_config=od_config, batch_size=batch_size) + # Spawn StageDiffusionProc subprocess and wait for READY. + proc, handshake_address, request_address, response_address = spawn_diffusion_proc(model, od_config) + complete_diffusion_handshake(proc, handshake_address) + self._proc = proc + + # ZMQ sockets (sync) for communicating with the subprocess. + self._zmq_ctx = zmq.Context() + self._request_socket = self._zmq_ctx.socket(zmq.PUSH) + self._request_socket.connect(request_address) + self._response_socket = self._zmq_ctx.socket(zmq.PULL) + self._response_socket.connect(response_address) + + self._encoder = OmniMsgpackEncoder() + self._decoder = OmniMsgpackDecoder() + + # Buffers for demultiplexing response messages. 
self._output_queue: asyncio.Queue[OmniRequestOutput] = asyncio.Queue() + self._rpc_results: dict[str, Any] = {} + self._pending_rpcs: set[str] = set() self._tasks: dict[str, asyncio.Task] = {} + self._shutting_down = False logger.info("[StageDiffusionClient] Stage-%s initialized (batch_size=%d)", self.stage_id, batch_size) - async def add_request_async( - self, - request_id: str, - prompt: OmniPromptType, - sampling_params: OmniDiffusionSamplingParams, - ) -> None: - task = asyncio.create_task( - self._run(request_id, prompt, sampling_params), - name=f"diffusion-{request_id}", - ) - self._tasks[request_id] = task + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _drain_responses(self) -> None: + """Non-blocking drain of all available responses from the subprocess.""" + while True: + try: + raw = self._response_socket.recv(zmq.NOBLOCK) + except zmq.Again: + break + + msg = self._decoder.decode(raw) + msg_type = msg.get("type") + + if msg_type == "result": + self._output_queue.put_nowait(msg["output"]) + elif msg_type == "rpc_result": + self._rpc_results[msg["rpc_id"]] = msg["result"] + elif msg_type == "error": + req_id = msg.get("request_id") + rpc_id = msg.get("rpc_id") + error_msg = msg.get("error") + logger.error( + "[StageDiffusionClient] Stage-%s subprocess error for %s: %s", + self.stage_id, + rpc_id or req_id, + error_msg, + ) + # Route RPC errors so collective_rpc_async can unblock. + if rpc_id is not None and rpc_id in self._pending_rpcs: + self._rpc_results[rpc_id] = { + "error": True, + "reason": error_msg, + } + + # Fields that are subprocess-local and cannot be serialized across + # process boundaries. They are recreated in the subprocess with + # their default values. + _NON_SERIALIZABLE_FIELDS = frozenset( + { + "generator", # torch.Generator — recreated from seed + "modules", # model components — loaded in subprocess + } + ) - async def _run( + @staticmethod + def _sampling_params_to_dict(sampling_params: Any) -> dict[str, Any]: + """Convert sampling params to a plain dict for serialization. + + Uses ``dataclasses.fields`` + ``getattr`` instead of ``asdict`` + to avoid deep-copying large tensors, and skips fields that + cannot cross process boundaries. + + When a ``torch.Generator`` is present but ``seed`` is not set, + the generator's initial seed is extracted so the subprocess can + recreate an equivalent generator via ``diffusion_model_runner``. + """ + if is_dataclass(sampling_params) and not isinstance(sampling_params, type): + result = { + f.name: getattr(sampling_params, f.name) + for f in fields(sampling_params) + if f.name not in StageDiffusionClient._NON_SERIALIZABLE_FIELDS + } + elif not isinstance(sampling_params, dict): + raise TypeError(f"sampling_params is not a dict but {sampling_params.__class__.__name__}") + else: + result = { + k: v for k, v in sampling_params.items() if k not in StageDiffusionClient._NON_SERIALIZABLE_FIELDS + } + + # Preserve the generator's seed across the process boundary so + # the subprocess can recreate deterministic random state. 
+ if result.get("seed") is None: + generator = ( + getattr(sampling_params, "generator", None) + if not isinstance(sampling_params, dict) + else sampling_params.get("generator") + ) + if generator is not None: + if isinstance(generator, list) and generator: + generator = generator[0] + if hasattr(generator, "initial_seed"): + result["seed"] = generator.initial_seed() + + return result + + # ------------------------------------------------------------------ + # Public API (matches the interface the Orchestrator expects) + # ------------------------------------------------------------------ + + async def add_request_async( self, request_id: str, prompt: OmniPromptType, sampling_params: OmniDiffusionSamplingParams, ) -> None: - try: - result = await self._engine.generate(prompt, sampling_params, request_id) - await self._output_queue.put(result) - except asyncio.CancelledError: - logger.info( - "[StageDiffusionClient] Stage-%s req=%s cancelled", - self.stage_id, - request_id, - ) - raise - except DiffusionRequestAbortedError as e: - logger.info( - "[StageDiffusionClient] Stage-%s req=%s aborted: %s", - self.stage_id, - request_id, - e, - ) - except Exception as e: - logger.exception( - "[StageDiffusionClient] Stage-%s req=%s failed: %s", - self.stage_id, - request_id, - e, + self._request_socket.send( + self._encoder.encode( + { + "type": "add_request", + "request_id": request_id, + "prompt": prompt, + "sampling_params": self._sampling_params_to_dict(sampling_params), + } ) - finally: - self._tasks.pop(request_id, None) + ) # TODO(Long): Temporary solution to boost performance of diffusion stages. # Remove this after scheduling algorithm is implemented @@ -126,12 +218,16 @@ async def _run_batch( sampling_params: OmniDiffusionSamplingParams, ) -> None: try: - result = await self._engine.generate_batch( - prompts, - sampling_params, - request_id, + self._request_socket.send( + self._encoder.encode( + { + "type": "add_batch_request", + "request_id": request_id, + "prompts": prompts, + "sampling_params": self._sampling_params_to_dict(sampling_params), + } + ) ) - await self._output_queue.put(result) except Exception as e: logger.exception( "[StageDiffusionClient] Stage-%s batch req=%s failed: %s", @@ -142,18 +238,24 @@ async def _run_batch( finally: self._tasks.pop(request_id, None) - def get_diffusion_output_async(self) -> OmniRequestOutput | None: + def get_diffusion_output_nowait(self) -> OmniRequestOutput | None: + self._drain_responses() try: return self._output_queue.get_nowait() except asyncio.QueueEmpty: + if not self._shutting_down and self._proc is not None and not self._proc.is_alive(): + raise RuntimeError(f"StageDiffusionProc died unexpectedly (exit code {self._proc.exitcode})") return None async def abort_requests_async(self, request_ids: list[str]) -> None: - for rid in request_ids: - task = self._tasks.pop(rid, None) - if task: - task.cancel() - await self._engine.abort(request_ids) + self._request_socket.send( + self._encoder.encode( + { + "type": "abort", + "request_ids": list(request_ids), + } + ) + ) async def collective_rpc_async( self, @@ -162,60 +264,66 @@ async def collective_rpc_async( args: tuple[Any, ...] = (), kwargs: dict[str, Any] | None = None, ) -> Any: - """Best-effort control RPC shim for diffusion stages. - - TODO(AsyncOmni): add dedicated wrappers on AsyncOmniDiffusion for the - remaining control APIs instead of reaching into its underlying engine. 
- """ - kwargs = kwargs or {} - - # Handle profile method: inject stage_id into profile_prefix for diffusion stages + """Forward control RPCs to the diffusion subprocess.""" + # Inject a default profile_prefix that includes stage_id when profiling. if method == "profile": - target = getattr(self._engine, method, None) - if target is None: - return { - "supported": False, - "todo": True, - "reason": f"AsyncOmniDiffusion.{method} is not implemented", - } - # Extract is_start and profile_prefix from args - is_start = args[0] if args else True - profile_prefix = args[1] if len(args) > 1 else None - # Generate profile_prefix with stage_id if starting and no prefix provided + args_list = list(args) + is_start = args_list[0] if args_list else True + profile_prefix = args_list[1] if len(args_list) > 1 else None if is_start and profile_prefix is None: profile_prefix = f"stage_{self.stage_id}_diffusion_{int(time.time())}" - result = target(is_start, profile_prefix) - if timeout is not None: - return await asyncio.wait_for(result, timeout=timeout) - return await result - - if method in {"add_lora", "remove_lora", "list_loras", "pin_lora"}: - target = getattr(self._engine, method, None) - if target is None: - return { - "supported": False, - "todo": True, - "reason": f"AsyncOmniDiffusion.{method} is not implemented", + if len(args_list) > 1: + args_list[1] = profile_prefix + else: + args_list.append(profile_prefix) + args = tuple(args_list) + + kwargs = kwargs or {} + rpc_id = uuid.uuid4().hex + self._pending_rpcs.add(rpc_id) + + self._request_socket.send( + self._encoder.encode( + { + "type": "collective_rpc", + "rpc_id": rpc_id, + "method": method, + "timeout": timeout, + "args": list(args), + "kwargs": kwargs, } - result = target(*args, **kwargs) - if timeout is not None: - return await asyncio.wait_for(result, timeout=timeout) - return await result - - # Fall back to collective RPC for other methods - loop = asyncio.get_running_loop() - return await loop.run_in_executor( - self._engine._executor, - self._engine.engine.collective_rpc, - method, - timeout, - args, - kwargs, - None, + ) ) + deadline = time.monotonic() + timeout if timeout else None + # Wait for the matching RPC response, buffering result messages. + try: + while True: + self._drain_responses() + if rpc_id in self._rpc_results: + return self._rpc_results.pop(rpc_id) + if self._proc is not None and not self._proc.is_alive(): + raise RuntimeError( + f"StageDiffusionProc died while waiting for " + f"collective_rpc '{method}' (exit code {self._proc.exitcode})" + ) + if deadline and time.monotonic() > deadline: + raise TimeoutError(f"collective_rpc_async '{method}' timed out after {timeout}s") + await asyncio.sleep(0.01) + finally: + self._pending_rpcs.discard(rpc_id) + def shutdown(self) -> None: - for task in self._tasks.values(): - task.cancel() - self._tasks.clear() - self._engine.close() + self._shutting_down = True + try: + self._request_socket.send(self._encoder.encode({"type": "shutdown"})) + except Exception: + pass + + if self._proc is not None and self._proc.is_alive(): + self._proc.join(timeout=10) + terminate_alive_proc(self._proc) + + self._request_socket.close(linger=0) + self._response_socket.close(linger=0) + self._zmq_ctx.term() diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py new file mode 100644 index 0000000000..8677da0371 --- /dev/null +++ b/vllm_omni/diffusion/stage_diffusion_proc.py @@ -0,0 +1,604 @@ +"""Subprocess entry point for the diffusion engine. 
+ +StageDiffusionProc runs DiffusionEngine in a child process, +communicating with StageDiffusionClient via ZMQ (PUSH/PULL). +""" + +from __future__ import annotations + +import asyncio +import signal +import time +from concurrent.futures import ThreadPoolExecutor +from multiprocessing.process import BaseProcess +from typing import TYPE_CHECKING, Any + +import msgspec +import zmq +import zmq.asyncio +from vllm.logger import init_logger +from vllm.transformers_utils.config import get_hf_file_to_dict +from vllm.utils.network_utils import get_open_zmq_ipc_path, zmq_socket_ctx +from vllm.utils.system_utils import get_mp_context +from vllm.v1.utils import shutdown + +from vllm_omni.diffusion.data import DiffusionRequestAbortedError, TransformerConfig +from vllm_omni.diffusion.diffusion_engine import DiffusionEngine +from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.distributed.omni_connectors.utils.serialization import ( + OmniMsgpackDecoder, + OmniMsgpackEncoder, +) +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput + +if TYPE_CHECKING: + from vllm_omni.diffusion.data import OmniDiffusionConfig + +logger = init_logger(__name__) + +_HANDSHAKE_POLL_TIMEOUT_S = 600 + + +class StageDiffusionProc: + """Subprocess entry point for diffusion inference. + + Manages DiffusionEngine lifecycle, async request processing, + and ZMQ-based communication with StageDiffusionClient. + """ + + def __init__(self, model: str, od_config: OmniDiffusionConfig) -> None: + self._model = model + self._od_config = od_config + self._engine: DiffusionEngine | None = None + self._executor: ThreadPoolExecutor | None = None + self._closed = False + + # ------------------------------------------------------------------ + # Initialization + # ------------------------------------------------------------------ + + def initialize(self) -> None: + """Enrich config, create DiffusionEngine and thread pool.""" + self._enrich_config() + self._engine = DiffusionEngine.make_engine(self._od_config) + self._executor = ThreadPoolExecutor(max_workers=1) + logger.info("StageDiffusionProc initialized with model: %s", self._model) + + def _enrich_config(self) -> None: + """Load model metadata from HuggingFace and populate od_config fields. + + Diffusers-style models expose ``model_index.json`` with ``_class_name``. + Non-diffusers models (e.g. Bagel, NextStep) only have ``config.json``, + so we fall back to reading that and mapping model_type manually. 
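+
+        For example (illustrative values), a diffusers-style repo ships
+        ``model_index.json`` containing ``{"_class_name": "QwenImagePipeline", ...}``,
+        whereas a Bagel checkpoint only provides ``config.json`` with
+        ``"model_type": "bagel"``.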
+ """ + od_config = self._od_config + + try: + config_dict = get_hf_file_to_dict("model_index.json", od_config.model) + if config_dict is not None: + if od_config.model_class_name is None: + od_config.model_class_name = config_dict.get("_class_name", None) + od_config.update_multimodal_support() + + tf_config_dict = get_hf_file_to_dict("transformer/config.json", od_config.model) + od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict) + else: + raise FileNotFoundError("model_index.json not found") + except (AttributeError, OSError, ValueError, FileNotFoundError): + cfg = get_hf_file_to_dict("config.json", od_config.model) + if cfg is None: + raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}") + + od_config.tf_model_config = TransformerConfig.from_dict(cfg) + model_type = cfg.get("model_type") + architectures = cfg.get("architectures") or [] + + if model_type == "bagel" or "BagelForConditionalGeneration" in architectures: + od_config.model_class_name = "BagelPipeline" + od_config.tf_model_config = TransformerConfig() + od_config.update_multimodal_support() + elif model_type == "nextstep": + if od_config.model_class_name is None: + od_config.model_class_name = "NextStep11Pipeline" + od_config.tf_model_config = TransformerConfig() + od_config.update_multimodal_support() + elif architectures and len(architectures) == 1: + od_config.model_class_name = architectures[0] + else: + raise + + # ------------------------------------------------------------------ + # Request processing + # ------------------------------------------------------------------ + + def _reconstruct_sampling_params(self, sampling_params_dict: dict) -> OmniDiffusionSamplingParams: + """Reconstruct OmniDiffusionSamplingParams from a dict, handling LoRA.""" + lora_req = sampling_params_dict.get("lora_request") + if lora_req is not None: + from vllm.lora.request import LoRARequest + + if not isinstance(lora_req, LoRARequest): + sampling_params_dict["lora_request"] = msgspec.convert(lora_req, LoRARequest) + + return OmniDiffusionSamplingParams(**sampling_params_dict) + + async def _process_request( + self, + request_id: str, + prompt: Any, + sampling_params_dict: dict, + ) -> OmniRequestOutput: + """Build a diffusion request and run DiffusionEngine.step().""" + sampling_params = self._reconstruct_sampling_params(sampling_params_dict) + + request = OmniDiffusionRequest( + prompts=[prompt], + sampling_params=sampling_params, + request_ids=[request_id], + ) + + loop = asyncio.get_running_loop() + results = await loop.run_in_executor(self._executor, self._engine.step, request) + result = results[0] + if not result.request_id: + result.request_id = request_id + return result + + async def _process_batch_request( + self, + request_id: str, + prompts: list[Any], + sampling_params_dict: dict, + ) -> OmniRequestOutput: + """Build a batched diffusion request and run DiffusionEngine.step(). + + All prompts are processed in a single step() call. The per-prompt + results are merged into one :class:`OmniRequestOutput` whose + ``images`` list contains every generated image, matching the + contract expected by the orchestrator and tests. 
+ """ + sampling_params = self._reconstruct_sampling_params(sampling_params_dict) + + request = OmniDiffusionRequest( + prompts=prompts, + sampling_params=sampling_params, + request_ids=[request_id] * len(prompts), + ) + + loop = asyncio.get_running_loop() + results = await loop.run_in_executor(self._executor, self._engine.step, request) + + # Merge per-prompt results into a single combined output. + all_images: list = [] + merged_mm: dict[str, Any] = {} + merged_metrics: dict[str, Any] = {} + merged_durations: dict[str, float] = {} + peak_mem = 0.0 + latents = None + final_output_type = "image" + + for r in results: + all_images.extend(r.images) + merged_mm.update(r._multimodal_output) + merged_metrics.update(r.metrics) + merged_durations.update(r.stage_durations) + peak_mem = max(peak_mem, r.peak_memory_mb) + if latents is None and r.latents is not None: + latents = r.latents + if r.final_output_type != "image": + final_output_type = r.final_output_type + + return OmniRequestOutput.from_diffusion( + request_id=request_id, + images=all_images, + prompt=prompts[0] if len(prompts) == 1 else None, + metrics=merged_metrics, + latents=latents, + multimodal_output=merged_mm or None, + final_output_type=final_output_type, + stage_durations=merged_durations, + peak_memory_mb=peak_mem, + ) + + # ------------------------------------------------------------------ + # Collective RPC dispatch + # ------------------------------------------------------------------ + + async def _handle_collective_rpc( + self, + method: str, + timeout: float | None, + args: tuple, + kwargs: dict, + ) -> Any: + """Dispatch collective RPC calls to DiffusionEngine. + + LoRA methods remap arguments and post-process results to match + the contract that ``AsyncOmni`` provides. + """ + loop = asyncio.get_running_loop() + + if method == "profile": + is_start = args[0] if args else True + profile_prefix = args[1] if len(args) > 1 else None + return await loop.run_in_executor( + self._executor, + self._engine.profile, + is_start, + profile_prefix, + ) + + if method == "add_lora": + # Reconstruct LoRARequest after IPC if needed. 
+ lora_request = args[0] if args else kwargs.get("lora_request") + if lora_request is not None: + from vllm.lora.request import LoRARequest + + if not isinstance(lora_request, LoRARequest): + lora_request = msgspec.convert(lora_request, LoRARequest) + results = await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + "add_lora", + timeout, + (), + {"lora_request": lora_request}, + None, + ) + return all(results) if isinstance(results, list) else results + + if method == "remove_lora": + results = await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + "remove_lora", + timeout, + args, + kwargs or {}, + None, + ) + return all(results) if isinstance(results, list) else results + + if method == "list_loras": + results = await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + "list_loras", + timeout, + (), + {}, + None, + ) + if not isinstance(results, list): + return results or [] + merged: set[int] = set() + for part in results: + merged.update(part or []) + return sorted(merged) + + if method == "pin_lora": + lora_id = args[0] if args else kwargs.get("adapter_id") + results = await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + "pin_lora", + timeout, + (), + {"adapter_id": lora_id}, + None, + ) + return all(results) if isinstance(results, list) else results + + # Fall back to DiffusionEngine.collective_rpc for all other methods + # (e.g. worker extension RPCs like "test_extension_name"). + return await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + method, + timeout, + args, + kwargs or {}, + None, + ) + + # ------------------------------------------------------------------ + # ZMQ event loop + # ------------------------------------------------------------------ + + async def run_loop( + self, + request_address: str, + response_address: str, + ) -> None: + """Async event loop handling ZMQ messages from StageDiffusionClient.""" + ctx = zmq.asyncio.Context() + + request_socket = ctx.socket(zmq.PULL) + request_socket.bind(request_address) + + response_socket = ctx.socket(zmq.PUSH) + response_socket.bind(response_address) + + encoder = OmniMsgpackEncoder() + decoder = OmniMsgpackDecoder() + + tasks: dict[str, asyncio.Task] = {} + + async def _dispatch_request(request_id: str, prompt: Any, sampling_params_dict: dict) -> None: + """Process a single diffusion request and send the response.""" + try: + result = await self._process_request(request_id, prompt, sampling_params_dict) + await response_socket.send(encoder.encode({"type": "result", "output": result})) + except DiffusionRequestAbortedError as e: + logger.info( + "request_id: %s aborted: %s", + request_id, + str(e), + ) + except Exception as e: + logger.exception("Diffusion request %s failed: %s", request_id, e) + await response_socket.send( + encoder.encode( + { + "type": "error", + "request_id": request_id, + "error": str(e), + } + ) + ) + finally: + tasks.pop(request_id, None) + + try: + while True: + raw = await request_socket.recv() + msg = decoder.decode(raw) + msg_type = msg.get("type") + + if msg_type == "add_request": + request_id = msg["request_id"] + task = asyncio.create_task( + _dispatch_request( + request_id, + msg["prompt"], + msg["sampling_params"], + ) + ) + tasks[request_id] = task + + elif msg_type == "add_batch_request": + request_id = msg["request_id"] + + async def _dispatch_batch(rid: str, prompts: list, sp_dict: dict) -> None: + try: + result = await self._process_batch_request(rid, prompts, 
sp_dict) + await response_socket.send(encoder.encode({"type": "result", "output": result})) + except DiffusionRequestAbortedError as e: + logger.info( + "request_id: %s aborted: %s", + rid, + str(e), + ) + except Exception as e: + logger.exception("Batch diffusion request %s failed: %s", rid, e) + await response_socket.send( + encoder.encode( + { + "type": "error", + "request_id": rid, + "error": str(e), + } + ) + ) + finally: + tasks.pop(rid, None) + + task = asyncio.create_task( + _dispatch_batch( + request_id, + msg["prompts"], + msg["sampling_params"], + ) + ) + tasks[request_id] = task + + elif msg_type == "abort": + for rid in msg.get("request_ids", []): + task = tasks.pop(rid, None) + if task: + task.cancel() + self._engine.abort(rid) + + elif msg_type == "collective_rpc": + rpc_id = msg["rpc_id"] + try: + result = await self._handle_collective_rpc( + msg["method"], + msg.get("timeout"), + tuple(msg.get("args", ())), + msg.get("kwargs", {}), + ) + await response_socket.send( + encoder.encode( + { + "type": "rpc_result", + "rpc_id": rpc_id, + "result": result, + } + ) + ) + except Exception as e: + logger.exception("Collective RPC %s failed: %s", msg["method"], e) + await response_socket.send( + encoder.encode( + { + "type": "error", + "rpc_id": rpc_id, + "error": str(e), + } + ) + ) + + elif msg_type == "shutdown": + break + + finally: + for task in tasks.values(): + task.cancel() + if tasks: + await asyncio.gather(*tasks.values(), return_exceptions=True) + + request_socket.close() + response_socket.close() + ctx.term() + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def close(self) -> None: + """Release engine and thread pool resources.""" + if self._closed: + return + self._closed = True + + if self._engine is not None: + try: + self._engine.close() + except Exception as e: + logger.warning("Error closing diffusion engine: %s", e) + + if self._executor is not None: + try: + self._executor.shutdown(wait=False) + except Exception as e: + logger.warning("Error shutting down executor: %s", e) + + # ------------------------------------------------------------------ + # Subprocess entry point + # ------------------------------------------------------------------ + + @classmethod + def run_diffusion_proc( + cls, + model: str, + od_config: OmniDiffusionConfig, + handshake_address: str, + request_address: str, + response_address: str, + ) -> None: + """Entry point for the diffusion subprocess.""" + shutdown_requested = False + + def signal_handler(signum: int, frame: Any) -> None: + nonlocal shutdown_requested + if not shutdown_requested: + shutdown_requested = True + raise SystemExit() + + signal.signal(signal.SIGTERM, signal_handler) + signal.signal(signal.SIGINT, signal_handler) + + proc = cls(model, od_config) + try: + proc.initialize() + + # Send READY via handshake socket + handshake_ctx = zmq.Context() + handshake_socket = handshake_ctx.socket(zmq.DEALER) + handshake_socket.connect(handshake_address) + handshake_socket.send(msgspec.msgpack.encode({"status": "READY"})) + handshake_socket.close() + handshake_ctx.term() + + # Run async event loop + asyncio.run(proc.run_loop(request_address, response_address)) + + except SystemExit: + logger.debug("StageDiffusionProc exiting.") + raise + except Exception: + logger.exception("StageDiffusionProc encountered a fatal error.") + raise + finally: + proc.close() + + +# -- Free functions for backward compatibility with 
StageDiffusionClient ------ + + +def spawn_diffusion_proc( + model: str, + od_config: OmniDiffusionConfig, +) -> tuple[BaseProcess, str, str, str]: + """Spawn a StageDiffusionProc subprocess. + + Returns ``(proc, handshake_address, request_address, response_address)``. + """ + handshake_address = get_open_zmq_ipc_path() + request_address = get_open_zmq_ipc_path() + response_address = get_open_zmq_ipc_path() + + ctx = get_mp_context() + proc = ctx.Process( + target=StageDiffusionProc.run_diffusion_proc, + name="StageDiffusionProc", + kwargs={ + "model": model, + "od_config": od_config, + "handshake_address": handshake_address, + "request_address": request_address, + "response_address": response_address, + }, + ) + proc.start() + # Wait for the process to become alive before returning. + deadline = time.monotonic() + 10 + while not proc.is_alive(): + if proc.exitcode is not None: + raise RuntimeError(f"StageDiffusionProc failed to start (exit code {proc.exitcode})") + if time.monotonic() > deadline: + raise TimeoutError("StageDiffusionProc did not become alive within 10s") + time.sleep(0.01) + return proc, handshake_address, request_address, response_address + + +def complete_diffusion_handshake( + proc: BaseProcess, + handshake_address: str, +) -> None: + """Wait for the diffusion subprocess to signal READY. + + On failure the process is terminated before re-raising. + """ + try: + _perform_diffusion_handshake(proc, handshake_address) + except Exception: + shutdown([proc]) + raise + + +def _perform_diffusion_handshake( + proc: BaseProcess, + handshake_address: str, +) -> None: + """Run the handshake with the diffusion subprocess.""" + with zmq_socket_ctx(handshake_address, zmq.ROUTER, bind=True) as handshake_socket: + poller = zmq.Poller() + poller.register(handshake_socket, zmq.POLLIN) + poller.register(proc.sentinel, zmq.POLLIN) + + timeout_ms = _HANDSHAKE_POLL_TIMEOUT_S * 1000 + while True: + events = dict(poller.poll(timeout=timeout_ms)) + if not events: + raise TimeoutError("Timed out waiting for READY from StageDiffusionProc") + if handshake_socket in events: + identity, raw = handshake_socket.recv_multipart() + msg = msgspec.msgpack.decode(raw) + if msg.get("status") == "READY": + return + raise RuntimeError(f"Expected READY, got: {msg}") + if proc.exitcode is not None: + raise RuntimeError(f"StageDiffusionProc died during handshake (exit code {proc.exitcode})") diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index c998870ce7..c987106fee 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -32,7 +32,6 @@ from vllm.tokenizers import cached_tokenizer_from_config from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.input_processor import InputProcessor -from vllm.v1.engine.utils import get_engine_zmq_addresses, launch_core_engines from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.distributed.omni_connectors.utils.initialization import ( @@ -45,6 +44,10 @@ from vllm_omni.engine.output_processor import MultimodalOutputProcessor from vllm_omni.engine.serialization import serialize_additional_information from vllm_omni.engine.stage_engine_core_client import StageEngineCoreClient +from vllm_omni.engine.stage_engine_core_proc import ( + complete_stage_handshake, + spawn_stage_core, +) from vllm_omni.engine.stage_init_utils import ( StartedLlmStage, acquire_device_locks, @@ -334,21 +337,17 @@ def _launch_llm_stage( engine_args_dict, stage_init_timeout, ) - addresses = 
get_engine_zmq_addresses(vllm_config) - launch_cm = launch_core_engines( + addresses, proc, handshake_address = spawn_stage_core( vllm_config=vllm_config, executor_class=executor_class, log_stats=False, - addresses=addresses, ) - engine_manager, coordinator, addresses = launch_cm.__enter__() started_stage = StartedLlmStage( stage_id=metadata.stage_id, metadata=metadata, vllm_config=vllm_config, executor_class=executor_class, - engine_manager=engine_manager, - coordinator=coordinator, + proc=proc, addresses=addresses, ) finally: @@ -358,7 +357,7 @@ def _launch_llm_stage( current_omni_platform.set_device_control_env_var(previous_visible_devices) logger.info("[AsyncOmniEngine] Stage %s engine launch started", metadata.stage_id) - launch_cm.__exit__(None, None, None) + complete_stage_handshake(proc, handshake_address, addresses, vllm_config) logger.info("[AsyncOmniEngine] Stage %s engine startup completed", metadata.stage_id) assert started_stage is not None return started_stage @@ -389,11 +388,9 @@ def _attach_llm_stage( executor_class=started.executor_class, metadata=started.metadata, client_addresses=client_addresses, - engine_manager=started.engine_manager, - coordinator=started.coordinator, + proc=started.proc, ) - started.engine_manager = None - started.coordinator = None + started.proc = None except Exception: close_started_llm_stage(started) raise diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index 4a85a2c6c9..8ea9a5096c 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -241,7 +241,7 @@ async def _orchestration_loop(self) -> None: # the output format in the future to simplify the processing logic in Orchestrator. stage_client = self.stage_clients[stage_id] if stage_client.stage_type == "diffusion": - output = stage_client.get_diffusion_output_async() + output = stage_client.get_diffusion_output_nowait() if output is not None: idle = False req_state = self.request_states.get(output.request_id) diff --git a/vllm_omni/engine/stage_engine_core_client.py b/vllm_omni/engine/stage_engine_core_client.py index 284cc2d31a..e08ce78011 100644 --- a/vllm_omni/engine/stage_engine_core_client.py +++ b/vllm_omni/engine/stage_engine_core_client.py @@ -25,13 +25,13 @@ class StageEngineCoreClient(AsyncMPClient): """Stage async client that inherits from vLLM's AsyncMPClient. - Fully reuses AsyncMPClient.__init__ for: + Fully reuses AsyncMPClient for: - ZMQ setup, sockets - - launch_core_engines() -> EngineCoreProc - outputs_queue, output_queue_task - - All utility methods (shutdown, get_output_async, abort_requests_async, etc.) + - All utility methods (get_output_async, abort_requests_async, etc.) - This is the async version of StageMPClient, designed for use with AsyncOmniEngine. + The subprocess is spawned externally via ``spawn_stage_core`` / + ``complete_stage_handshake`` from *stage_engine_core_proc.py*. """ def __init__( @@ -40,6 +40,7 @@ def __init__( executor_class: type, log_stats: bool = False, client_addresses: dict[str, str] | None = None, + proc: Any = None, client_count: int = 1, client_index: int = 0, *, @@ -53,6 +54,11 @@ def __init__( engine args building, device locking) is done by the Orchestrator via helpers in stage_init_utils.py. This constructor just stores metadata and calls super().__init__(). + + The subprocess is spawned externally via ``spawn_stage_core`` / + ``complete_stage_handshake`` (see *stage_engine_core_proc.py*). 
+ The resulting ``proc`` handle is passed in so this client can + manage the process lifecycle on shutdown. """ # -------- Stage metadata (public fields used at runtime) -------- if metadata is not None: @@ -69,6 +75,7 @@ def __init__( self.model_stage = metadata.model_stage self.engine_outputs: Any = None + self._proc = proc logger.info( "[StageEngineCoreClient] Stage-%s initializing EngineCore", @@ -83,10 +90,6 @@ def __init__( client_count=client_count, client_index=client_index, ) - if engine_manager is not None: - self.resources.engine_manager = engine_manager - if coordinator is not None: - self.resources.coordinator = coordinator except Exception: logger.exception( "[StageEngineCoreClient] Stage-%s EngineCore init failed", @@ -173,3 +176,12 @@ async def collective_rpc_async( args=args, kwargs=kwargs, ) + + def shutdown(self) -> None: + """Shutdown ZMQ connections and the subprocess.""" + super().shutdown() + if self._proc is not None and self._proc.is_alive(): + self._proc.terminate() + self._proc.join(timeout=5) + if self._proc.is_alive(): + self._proc.kill() diff --git a/vllm_omni/engine/stage_engine_core_proc.py b/vllm_omni/engine/stage_engine_core_proc.py new file mode 100644 index 0000000000..05d8f107c2 --- /dev/null +++ b/vllm_omni/engine/stage_engine_core_proc.py @@ -0,0 +1,206 @@ +""" +Stage Core Process for vLLM-Omni V1 architecture. + +StageEngineCoreProc inherits from vLLM's EngineCoreProc and runs the engine core +busy loop in a subprocess, communicating with StageEngineCoreClient via ZMQ. +""" + +from __future__ import annotations + +import signal +from multiprocessing.process import BaseProcess +from typing import TYPE_CHECKING, Any + +import msgspec +import zmq +from vllm.logger import init_logger +from vllm.transformers_utils.config import ( + maybe_register_config_serialize_by_value, +) +from vllm.utils.network_utils import get_open_zmq_ipc_path, zmq_socket_ctx +from vllm.utils.system_utils import ( + decorate_logs, + get_mp_context, + set_process_title, +) +from vllm.v1.engine.core import EngineCoreProc +from vllm.v1.engine.utils import ( + EngineHandshakeMetadata, + EngineZmqAddresses, + get_engine_zmq_addresses, +) +from vllm.v1.utils import shutdown + +if TYPE_CHECKING: + from vllm.config import VllmConfig + from vllm.v1.executor import Executor + +logger = init_logger(__name__) + +_HANDSHAKE_POLL_TIMEOUT_S = 600 + + +class StageEngineCoreProc(EngineCoreProc): + """Stage-specific engine core process for vLLM-Omni. + + Inherits from EngineCoreProc and provides its own ``run_stage_core`` + entry point for launching in a subprocess. Does **not** delegate to + ``EngineCoreProc.run_engine_core()``. + """ + + @staticmethod + def run_stage_core( + *args: Any, + dp_rank: int = 0, + local_dp_rank: int = 0, + **kwargs: Any, + ) -> None: + """Launch StageEngineCoreProc busy loop in background process.""" + shutdown_requested = False + maybe_register_config_serialize_by_value() + + def signal_handler(signum: int, frame: Any) -> None: + nonlocal shutdown_requested + if not shutdown_requested: + shutdown_requested = True + raise SystemExit() + + signal.signal(signal.SIGTERM, signal_handler) + signal.signal(signal.SIGINT, signal_handler) + + engine_core: StageEngineCoreProc | None = None + try: + vllm_config: VllmConfig = kwargs["vllm_config"] + parallel_config = vllm_config.parallel_config + + set_process_title(f"StageEngineCoreProc_DP{dp_rank}") + decorate_logs() + + # the current vllm-omni does not support data parallelism, + # so we set the data parallel size to 1. 
+ # [TODO] support data parallelism in the future. + # https://github.com/vllm-project/vllm-omni/issues/984 + parallel_config.data_parallel_size = 1 + parallel_config.data_parallel_size_local = 1 + parallel_config.data_parallel_rank = 0 + parallel_config.data_parallel_index = dp_rank + + engine_core = StageEngineCoreProc( + *args, + engine_index=dp_rank, + **kwargs, + ) + engine_core.run_busy_loop() + + except SystemExit: + logger.debug("StageEngineCoreProc exiting.") + raise + except Exception: + if engine_core is None: + logger.exception("StageEngineCoreProc failed to start.") + else: + logger.exception("StageEngineCoreProc encountered a fatal error.") + engine_core._send_engine_dead() + raise + finally: + if engine_core is not None: + engine_core.shutdown() + + +def spawn_stage_core( + vllm_config: VllmConfig, + executor_class: type[Executor], + log_stats: bool = False, +) -> tuple[EngineZmqAddresses, BaseProcess, str]: + """Spawn a *StageEngineCoreProc* subprocess without performing the handshake. + + Must be called while the correct device env vars are set (e.g. under + the stage-launch lock). Call ``complete_stage_handshake`` afterwards. + + Returns ``(addresses, process, handshake_address)``. + """ + addresses = get_engine_zmq_addresses(vllm_config) + handshake_address = get_open_zmq_ipc_path() + + ctx = get_mp_context() + proc = ctx.Process( + target=StageEngineCoreProc.run_stage_core, + name="StageEngineCoreProc", + kwargs={ + "vllm_config": vllm_config, + "local_client": True, + "handshake_address": handshake_address, + "executor_class": executor_class, + "log_stats": log_stats, + "dp_rank": 0, + "local_dp_rank": 0, + }, + ) + proc.start() + return addresses, proc, handshake_address + + +def complete_stage_handshake( + proc: BaseProcess, + handshake_address: str, + addresses: EngineZmqAddresses, + vllm_config: VllmConfig, +) -> None: + """Perform the HELLO/INIT/READY handshake with an already-spawned proc. + + On failure the process is terminated before re-raising. 
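+
+    The exchange is three-step: the subprocess sends HELLO, this side replies
+    with an ``EngineHandshakeMetadata`` payload carrying the ZMQ addresses,
+    and the subprocess answers READY (optionally reporting ``num_gpu_blocks``,
+    which is written back into ``vllm_config``).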
+ """ + try: + _perform_handshake(proc, handshake_address, addresses, vllm_config) + except Exception: + shutdown([proc]) + raise + + +def _perform_handshake( + proc: BaseProcess, + handshake_address: str, + addresses: EngineZmqAddresses, + vllm_config: VllmConfig, +) -> None: + """Run the HELLO / INIT / READY handshake with the subprocess.""" + with zmq_socket_ctx(handshake_address, zmq.ROUTER, bind=True) as handshake_socket: + poller = zmq.Poller() + poller.register(handshake_socket, zmq.POLLIN) + poller.register(proc.sentinel, zmq.POLLIN) + + identity, msg = _recv(poller, handshake_socket, proc, "HELLO") + if msg.get("status") != "HELLO": + raise RuntimeError(f"Expected HELLO, got: {msg}") + + init_payload = EngineHandshakeMetadata( + addresses=addresses, + parallel_config={}, + ) + handshake_socket.send_multipart([identity, msgspec.msgpack.encode(init_payload)]) + + identity, msg = _recv(poller, handshake_socket, proc, "READY") + if msg.get("status") != "READY": + raise RuntimeError(f"Expected READY, got: {msg}") + num_gpu_blocks = msg.get("num_gpu_blocks") + if num_gpu_blocks is not None: + vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks + + +def _recv( + poller: zmq.Poller, + handshake_socket: zmq.Socket, + proc: BaseProcess, + expected: str, +) -> tuple[bytes, dict]: + """Wait for one handshake message; raise if the process dies first.""" + timeout_ms = _HANDSHAKE_POLL_TIMEOUT_S * 1000 + while True: + events = dict(poller.poll(timeout=timeout_ms)) + if not events: + raise TimeoutError(f"Timed out waiting for {expected} from StageEngineCoreProc") + if handshake_socket in events: + identity, raw = handshake_socket.recv_multipart() + return identity, msgspec.msgpack.decode(raw) + if proc.exitcode is not None: + raise RuntimeError(f"StageEngineCoreProc died during {expected} (exit code {proc.exitcode})") diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 9c246ce6eb..6e81372061 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -75,6 +75,14 @@ def _resolve_model_tokenizer_paths(model: str, engine_args: dict[str, Any]) -> s return model +def terminate_alive_proc(proc, timeout=5): + if proc.is_alive(): + proc.terminate() + proc.join(timeout=timeout) + if proc.is_alive(): + proc.kill() + + def resolve_worker_cls(engine_args: dict[str, Any]) -> None: """Resolve worker_cls from worker_type for non-diffusion stages.""" worker_type = engine_args.get("worker_type", None) @@ -121,8 +129,7 @@ class StartedLlmStage: metadata: Any vllm_config: Any executor_class: type - engine_manager: Any - coordinator: Any + proc: Any addresses: Any @@ -446,7 +453,7 @@ def initialize_diffusion_stage( metadata: Extracted stage metadata. batch_size: Maximum number of requests to batch together in the diffusion engine. Passed through to ``StageDiffusionClient`` - and ultimately to ``AsyncOmniDiffusion``. + and ultimately to ``AsyncOmni``. 
""" from vllm_omni.diffusion.data import OmniDiffusionConfig from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient @@ -461,23 +468,17 @@ def initialize_diffusion_stage( def close_started_llm_stage(started: StartedLlmStage) -> None: - """Close managers owned by a launched stage that never attached.""" - resources = ( - ("engine manager", started.engine_manager), - ("coordinator", started.coordinator), - ) - for resource_name, resource in resources: - if resource is None: - continue - try: - resource.close() - except Exception as cleanup_error: - logger.warning( - "[stage_init] Failed to close launched %s for stage %s: %s", - resource_name, - started.stage_id, - cleanup_error, - ) + """Terminate the subprocess owned by a launched stage that never attached.""" + if started.proc is None: + return + try: + terminate_alive_proc(started.proc) + except Exception as cleanup_error: + logger.warning( + "[stage_init] Failed to terminate process for stage %s: %s", + started.stage_id, + cleanup_error, + ) def finalize_initialized_stages( diff --git a/vllm_omni/entrypoints/__init__.py b/vllm_omni/entrypoints/__init__.py index d0830df96d..7b09adf939 100644 --- a/vllm_omni/entrypoints/__init__.py +++ b/vllm_omni/entrypoints/__init__.py @@ -6,11 +6,9 @@ """ from vllm_omni.entrypoints.async_omni import AsyncOmni -from vllm_omni.entrypoints.async_omni_diffusion import AsyncOmniDiffusion from vllm_omni.entrypoints.omni import Omni __all__ = [ "AsyncOmni", - "AsyncOmniDiffusion", "Omni", ] diff --git a/vllm_omni/entrypoints/async_omni_diffusion.py b/vllm_omni/entrypoints/async_omni_diffusion.py deleted file mode 100644 index 558ef96cb9..0000000000 --- a/vllm_omni/entrypoints/async_omni_diffusion.py +++ /dev/null @@ -1,473 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -""" -Async entrypoint for vLLM-Omni diffusion model inference. - -Provides an asynchronous interface for running diffusion models, -enabling concurrent request handling and streaming generation. -""" - -import asyncio -import uuid -import weakref -from collections.abc import AsyncGenerator, Iterable -from concurrent.futures import ThreadPoolExecutor -from typing import Any - -from vllm.logger import init_logger -from vllm.transformers_utils.config import get_hf_file_to_dict - -from vllm_omni.diffusion.data import ( - DiffusionRequestAbortedError, - OmniDiffusionConfig, - TransformerConfig, -) -from vllm_omni.diffusion.diffusion_engine import DiffusionEngine -from vllm_omni.diffusion.request import OmniDiffusionRequest -from vllm_omni.inputs.data import OmniDiffusionSamplingParams, OmniPromptType -from vllm_omni.lora.request import LoRARequest -from vllm_omni.outputs import OmniRequestOutput - -logger = init_logger(__name__) - - -def _weak_close_async_omni_diffusion(engine: DiffusionEngine, executor: ThreadPoolExecutor) -> None: - """Best-effort diffusion cleanup for GC finalization.""" - try: - engine.close() - except Exception: - pass - try: - executor.shutdown(wait=False) - except Exception: - pass - - -class AsyncOmniDiffusion: - """Async entry point for vLLM-Omni diffusion model inference. - - This class provides an asynchronous interface for running diffusion models, - enabling concurrent request handling. It wraps the DiffusionEngine and - provides async methods for image generation. - - Args: - model: Model name or path to load - od_config: Optional OmniDiffusionConfig. 
If not provided, it will be - created from kwargs - **kwargs: Additional keyword arguments passed to OmniDiffusionConfig - - Example: - >>> async_diffusion = AsyncOmniDiffusion(model="Qwen/Qwen-Image") - >>> result = await async_diffusion.generate( - ... prompt="A beautiful sunset over the ocean", - ... request_id="req-1", - ... ) - >>> print(result.images) - """ - - def __init__( - self, - model: str, - od_config: OmniDiffusionConfig | None = None, - batch_size: int = 1, - **kwargs: Any, - ): - self.model = model - - # Set batch size (default 1 for backward compatibility) - self._batch_size = max(1, batch_size) - - # Capture stage info from kwargs before they might be filtered out - stage_id = kwargs.get("stage_id") - engine_input_source = kwargs.get("engine_input_source") - cfg_kv_collect_func = kwargs.pop("cfg_kv_collect_func", None) - - # Build config - if od_config is None: - od_config = OmniDiffusionConfig.from_kwargs(model=model, **kwargs) - elif isinstance(od_config, dict): - # If config is dict, check it too (priority to kwargs if both exist) - if stage_id is None: - stage_id = od_config.get("stage_id") - if engine_input_source is None: - engine_input_source = od_config.get("engine_input_source") - od_config = OmniDiffusionConfig.from_kwargs(**od_config) - - self.od_config = od_config - - # Inject stage info into omni_kv_config if present - if stage_id is not None: - self.od_config.omni_kv_config.setdefault("stage_id", stage_id) - if engine_input_source is not None: - self.od_config.omni_kv_config.setdefault("engine_input_source", engine_input_source) - - # Diffusers-style models expose `model_index.json` with `_class_name`. - # Non-diffusers models (e.g. Bagel, NextStep) only have `config.json`, - # so we fall back to reading that and mapping model_type manually. 
- try: - config_dict = get_hf_file_to_dict("model_index.json", od_config.model) - if config_dict is not None: - if od_config.model_class_name is None: - od_config.model_class_name = config_dict.get("_class_name", None) - od_config.update_multimodal_support() - - tf_config_dict = get_hf_file_to_dict("transformer/config.json", od_config.model) - od_config.set_tf_model_config(TransformerConfig.from_dict(tf_config_dict)) - else: - raise FileNotFoundError("model_index.json not found") - except (AttributeError, OSError, ValueError, FileNotFoundError): - cfg = get_hf_file_to_dict("config.json", od_config.model) - if cfg is None: - raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}") - - od_config.set_tf_model_config(TransformerConfig.from_dict(cfg)) - model_type = cfg.get("model_type") - architectures = cfg.get("architectures") or [] - # Bagel/NextStep models don't have a model_index.json, so we set the pipeline class name manually - if model_type == "bagel" or "BagelForConditionalGeneration" in architectures: - od_config.model_class_name = "BagelPipeline" - od_config.tf_model_config = TransformerConfig() - od_config.update_multimodal_support() - elif model_type == "nextstep": - if od_config.model_class_name is None: - od_config.model_class_name = "NextStep11Pipeline" - od_config.tf_model_config = TransformerConfig() - od_config.update_multimodal_support() - elif architectures and len(architectures) == 1: - od_config.model_class_name = architectures[0] - else: - raise - - if cfg_kv_collect_func is not None: - od_config.cfg_kv_collect_func = cfg_kv_collect_func - - # Initialize engine - self.engine: DiffusionEngine = DiffusionEngine.make_engine(od_config) - - # Thread pool for running sync engine in async context - self._executor = ThreadPoolExecutor(max_workers=1) - self._closed = False - self._weak_finalizer = weakref.finalize( - self, - _weak_close_async_omni_diffusion, - self.engine, - self._executor, - ) - - logger.info("AsyncOmniDiffusion initialized with model: %s, batch_size: %d", model, self._batch_size) - - # ------------------------------------------------------------------ - # batch_size property - # ------------------------------------------------------------------ - - @property - def batch_size(self) -> int: - """Return the configured batch size for request batching.""" - return self._batch_size - - @batch_size.setter - def batch_size(self, value: int) -> None: - if not isinstance(value, int) or value < 1: - raise ValueError("batch_size must be a positive integer") - self._batch_size = value - - # ------------------------------------------------------------------ - # Public batch generation API - # ------------------------------------------------------------------ - - async def generate_batch( - self, - prompts: list[OmniPromptType], - sampling_params: OmniDiffusionSamplingParams, - request_id: str | None = None, - lora_request: LoRARequest | None = None, - ) -> OmniRequestOutput: - """Generate images from multiple prompts in a single engine call. - - Batches the given prompts into **one** ``DiffusionEngine.step()`` - call and returns a single ``OmniRequestOutput`` containing all - generated images. Called by ``StageDiffusionClient._run_batch`` - when the orchestrator receives a list-prompt request. - - Args: - prompts: List of text prompts describing the desired images. - sampling_params: Shared sampling parameters for all prompts. - request_id: Optional unique identifier. Auto-generated when *None*. 
- lora_request: Optional LoRA adapter to apply. - - Returns: - A single ``OmniRequestOutput`` with all images combined. - """ - if request_id is None: - request_id = f"diff-batch-{uuid.uuid4().hex[:8]}" - return await self._generate_batch(prompts, sampling_params, request_id, lora_request) - - # ------------------------------------------------------------------ - # Internal batch generation - # ------------------------------------------------------------------ - - async def _generate_batch( - self, - prompts: list[OmniPromptType], - sampling_params: OmniDiffusionSamplingParams, - request_id: str, - lora_request: LoRARequest | None = None, - ) -> OmniRequestOutput: - """Generate images from multiple prompts in a single engine call.""" - if not prompts: - return OmniRequestOutput(request_id=request_id, images=[], final_output_type="image") - - if sampling_params.guidance_scale: - sampling_params.guidance_scale_provided = True - - if lora_request is not None: - sampling_params.lora_request = lora_request - - request = OmniDiffusionRequest( - prompts=prompts, - sampling_params=sampling_params, - request_ids=[f"{request_id}-{i}" for i in range(len(prompts))], - ) - - logger.debug("Starting batch generation for %d prompts, request_id=%s", len(prompts), request_id) - - loop = asyncio.get_event_loop() - try: - results = await loop.run_in_executor( - self._executor, - self.engine.step, - request, - ) - except Exception as e: - logger.error("Batch generation failed for request %s: %s", request_id, e) - raise RuntimeError(f"Diffusion batch generation failed: {e}") from e - - # Combine all per-prompt results into a single OmniRequestOutput - all_images = [] - for result in results: - all_images.extend(result.images) - - return OmniRequestOutput( - request_id=request_id, - images=all_images, - final_output_type="image", - finished=True, - ) - - def get_diffusion_od_config(self) -> OmniDiffusionConfig: - """Return the diffusion config used by this engine.""" - return self.od_config - - # ------------------------------------------------------------------ - # Public generate API - # ------------------------------------------------------------------ - - async def generate( - self, - prompt: OmniPromptType, - sampling_params: OmniDiffusionSamplingParams, - request_id: str | None = None, - lora_request: LoRARequest | None = None, - ) -> OmniRequestOutput: - """Generate images asynchronously from a single text prompt. - - For batched generation (multiple prompts in one engine call), use - :meth:`generate_batch` instead. This method always processes - exactly one prompt per call. 
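        Example (sketch; the prompt and the default-constructed sampling params are
        illustrative placeholders, not validated values):
            >>> params = OmniDiffusionSamplingParams()
            >>> result = await async_diffusion.generate(
            ...     prompt="A watercolor fox in falling snow",
            ...     sampling_params=params,
            ... )
            >>> print(len(result.images))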
- - Args: - prompt: Text prompt describing the desired image - sampling_params: Sampling parameters - request_id: Optional unique identifier for tracking the request - lora_request: Optional LoRA adapter to apply - - Returns: - OmniRequestOutput containing generated images - - Raises: - RuntimeError: If generation fails - """ - if request_id is None: - request_id = f"diff-{uuid.uuid4().hex[:16]}" - if sampling_params.guidance_scale: - sampling_params.guidance_scale_provided = True - - if lora_request is not None: - sampling_params.lora_request = lora_request - - request = OmniDiffusionRequest( - prompts=[prompt], - sampling_params=sampling_params, - request_ids=[request_id], - ) - - logger.debug("Starting generation for request %s", request_id) - - loop = asyncio.get_event_loop() - try: - result = await loop.run_in_executor( - self._executor, - self.engine.step, - request, - ) - result = result[0] - except asyncio.CancelledError: - self.engine.abort(request_id) - raise - except DiffusionRequestAbortedError: - raise - except Exception as e: - logger.error("Generation failed for request %s: %s", request_id, e) - raise RuntimeError(f"Diffusion generation failed: {e}") from e - - if not result.request_id: - result.request_id = request_id - return result - - async def generate_stream( - self, - prompt: str, - request_id: str | None = None, - **kwargs: Any, - ) -> AsyncGenerator[OmniRequestOutput, None]: - """Generate images with streaming progress updates. - - Currently, diffusion models don't support true streaming, so this - yields a single result after generation completes. Future implementations - may support step-by-step progress updates. - - Args: - prompt: Text prompt describing the desired image - request_id: Optional unique identifier for tracking the request - **kwargs: Additional generation parameters - - Yields: - OmniRequestOutput with generation progress/results - """ - result = await self.generate(prompt=prompt, request_id=request_id, **kwargs) - yield result - - def close(self) -> None: - """Close the engine and release resources. - - Should be called when done using the AsyncOmniDiffusion instance. 
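        Example (sketch of a typical lifecycle; the prompt and default-constructed
        sampling params are illustrative, not validated):
            >>> async_diffusion = AsyncOmniDiffusion(model="Qwen/Qwen-Image")
            >>> try:
            ...     result = await async_diffusion.generate(
            ...         prompt="a foggy harbor at dawn",
            ...         sampling_params=OmniDiffusionSamplingParams(),
            ...     )
            ... finally:
            ...     async_diffusion.close()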
- """ - if self._closed: - return - self._closed = True - - finalizer = getattr(self, "_weak_finalizer", None) - if finalizer is not None and finalizer.alive: - finalizer.detach() - - try: - self.engine.close() - except Exception as e: - logger.warning("Error closing diffusion engine: %s", e) - - try: - self._executor.shutdown(wait=False) - except Exception as e: - logger.warning("Error shutting down executor: %s", e) - - logger.info("AsyncOmniDiffusion closed") - - def shutdown(self) -> None: - """Alias for close() method.""" - self.close() - - async def abort(self, request_id: str | Iterable[str]) -> None: - """Abort a request.""" - self.engine.abort(request_id) - - @property - def is_running(self) -> bool: - """Check if the engine is running.""" - return not self._closed - - @property - def is_stopped(self) -> bool: - """Check if the engine is stopped.""" - return self._closed - - async def remove_lora(self, adapter_id: int) -> bool: - """Remove a LoRA""" - loop = asyncio.get_event_loop() - results = await loop.run_in_executor( - self._executor, - self.engine.collective_rpc, - "remove_lora", - None, - (adapter_id,), - {}, - None, - ) - return all(results) if isinstance(results, list) else results - - async def add_lora(self, lora_request: LoRARequest) -> bool: - """Add a LoRA adapter""" - loop = asyncio.get_event_loop() - results = await loop.run_in_executor( - self._executor, - self.engine.collective_rpc, - "add_lora", - None, - (), - {"lora_request": lora_request}, - None, - ) - return all(results) if isinstance(results, list) else results - - async def list_loras(self) -> list[int]: - """List all registered LoRA adapter IDs.""" - loop = asyncio.get_event_loop() - results = await loop.run_in_executor( - self._executor, - self.engine.collective_rpc, - "list_loras", - None, - (), - {}, - None, - ) - # collective_rpc returns list from workers; flatten unique ids - if not isinstance(results, list): - return results or [] - merged: set[int] = set() - for part in results: - merged.update(part or []) - return sorted(merged) - - async def pin_lora(self, lora_id: int) -> bool: - """Prevent an adapter from being evicted.""" - loop = asyncio.get_event_loop() - results = await loop.run_in_executor( - self._executor, - self.engine.collective_rpc, - "pin_lora", - None, - (), - {"adapter_id": lora_id}, - None, - ) - return all(results) if isinstance(results, list) else results - - async def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None: - """Start or stop profiling for the diffusion model. - - Args: - is_start: True to start profiling, False to stop. - profile_prefix: Optional prefix for trace filename (vLLM compat). - - Note: - Matches vLLM's worker.profile() signature for consistency. - Traces are saved automatically via on_trace_ready callback. 
- """ - loop = asyncio.get_event_loop() - await loop.run_in_executor( - self._executor, - self.engine.profile, - is_start, - profile_prefix, - ) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 527947be92..35f56516c7 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -6,7 +6,7 @@ from collections.abc import AsyncGenerator, AsyncIterator, Callable from datetime import datetime, timedelta, timezone from io import BytesIO -from typing import TYPE_CHECKING, Any, Final, Optional, cast +from typing import Any, Final, cast import jinja2 import torch @@ -89,9 +89,6 @@ from vllm_omni.lora.request import LoRARequest from vllm_omni.outputs import OmniRequestOutput -if TYPE_CHECKING: - from vllm_omni.entrypoints.async_omni_diffusion import AsyncOmniDiffusion - logger = init_logger(__name__) @@ -107,13 +104,13 @@ class OmniOpenAIServingChat(OpenAIServingChat, AudioMixin): # Diffusion mode attributes _diffusion_mode: bool = False - _diffusion_engine: Optional["AsyncOmniDiffusion"] = None + _diffusion_engine: AsyncOmni | None = None _diffusion_model_name: str = "" @classmethod def for_diffusion( cls, - diffusion_engine: "AsyncOmniDiffusion", + diffusion_engine: AsyncOmni, model_name: str, ) -> "OmniOpenAIServingChat": """Create a chat serving instance for diffusion models. @@ -2153,7 +2150,7 @@ async def _create_diffusion_chat_completion( if resolution is not None: gen_params.resolution = resolution - # Parse per-request LoRA (works for both AsyncOmniDiffusion and AsyncOmni). + # Parse per-request LoRA. if lora_body and isinstance(lora_body, dict): try: lora_req, lora_scale = parse_lora_request(lora_body) @@ -2187,26 +2184,16 @@ async def _create_diffusion_chat_completion( ) # Generate image - # Handle both AsyncOmniDiffusion (returns OmniRequestOutput) and AsyncOmni (returns AsyncGenerator) - if isinstance(self._diffusion_engine, AsyncOmni): - diffusion_engine = cast(AsyncOmni, self._diffusion_engine) - result = None - async for output in diffusion_engine.generate( - prompt=gen_prompt, - sampling_params_list=[gen_params], # Pass as single-stage params - request_id=request_id, - ): - result = output - if result is None: - return self._create_error_response("No output generated from AsyncOmni") - else: - # AsyncOmniDiffusion: direct call - diffusion_engine = cast(AsyncOmniDiffusion, self._diffusion_engine) - result = await diffusion_engine.generate( - prompt=gen_prompt, - sampling_params=gen_params, - request_id=request_id, - ) + diffusion_engine = cast(AsyncOmni, self._diffusion_engine) + result = None + async for output in diffusion_engine.generate( + prompt=gen_prompt, + sampling_params_list=[gen_params], # Pass as single-stage params + request_id=request_id, + ): + result = output + if result is None: + return self._create_error_response("No output generated from AsyncOmni") # Extract images from result # Handle nested OmniRequestOutput structure where images might be in request_output images = getattr(result.request_output, "images", []) From 6dc61c9a20a49e86aaf48ad4a03e7c8cb6b29e34 Mon Sep 17 00:00:00 2001 From: Canlin Guo Date: Fri, 3 Apr 2026 16:23:36 +0800 Subject: [PATCH 035/204] [Perf] Skip Wan2.2 cross attn Ulysses SP (#2459) Signed-off-by: gcanlin --- vllm_omni/diffusion/attention/layer.py | 4 ++++ vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py | 1 + 2 files changed, 5 insertions(+) diff --git a/vllm_omni/diffusion/attention/layer.py 
b/vllm_omni/diffusion/attention/layer.py index f83bb294d2..4fdf2ff161 100644 --- a/vllm_omni/diffusion/attention/layer.py +++ b/vllm_omni/diffusion/attention/layer.py @@ -36,6 +36,7 @@ def __init__( scatter_idx: int = 2, gather_idx: int = 1, use_sync: bool = False, + skip_sequence_parallel: bool = False, ): super().__init__() self.attn_backend = get_attn_backend(-1) @@ -62,6 +63,7 @@ def __init__( self.gather_idx = gather_idx self.use_sync = use_sync self.causal = causal + self.skip_sequence_parallel = skip_sequence_parallel self.use_ring = False self.ring_pg = None @@ -98,6 +100,8 @@ def _get_active_parallel_strategy(self): (e.g., in noise_refiner/context_refiner before unified_prepare in Z-Image). This avoids unnecessary SP communication for layers not covered by _sp_plan. """ + if self.skip_sequence_parallel: + return self._no_parallel_strategy if is_forward_context_available(): ctx = get_forward_context() if not ctx.sp_active: diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py index 20e2b9fea8..c4e3b40cdd 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py @@ -539,6 +539,7 @@ def __init__( num_kv_heads=self.num_heads, softmax_scale=1.0 / (head_dim**0.5), causal=False, + skip_sequence_parallel=True, ) def forward( From cd71567b0686968d378486c38844e1fa5fc92998 Mon Sep 17 00:00:00 2001 From: Jerry Song <46962917+Songrui625@users.noreply.github.com> Date: Fri, 3 Apr 2026 23:17:49 +0800 Subject: [PATCH 036/204] [Model] Add two stages inference for model LTX-2 distilled. (#2260) Signed-off-by: Songrui625 --- docs/models/supported_models.md | 2 + .../image_to_video/image_to_video.py | 6 +- .../text_to_video/text_to_video.py | 9 +- .../diffusion/cache/cache_dit_backend.py | 18 +- vllm_omni/diffusion/models/ltx2/__init__.py | 10 +- .../diffusion/models/ltx2/pipeline_ltx2.py | 552 +++++++++++++----- .../models/ltx2/pipeline_ltx2_image2video.py | 538 ++++++++++++----- .../ltx2/pipeline_ltx2_latent_upsample.py | 262 +++++++++ vllm_omni/diffusion/registry.py | 20 +- vllm_omni/diffusion/utils/tf_utils.py | 24 + 10 files changed, 1129 insertions(+), 312 deletions(-) create mode 100644 vllm_omni/diffusion/models/ltx2/pipeline_ltx2_latent_upsample.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 68024e18b3..d611c0311c 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -34,6 +34,8 @@ th { | `Wan22VACEPipeline` | Wan2.1-VACE | `Wan-AI/Wan2.1-VACE-1.3B-diffusers`, `Wan-AI/Wan2.1-VACE-14B-diffusers` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `LTX2Pipeline` | LTX-2-T2V | `Lightricks/LTX-2` | ✅︎ | ✅︎ | | | | `LTX2ImageToVideoPipeline` | LTX-2-I2V | `Lightricks/LTX-2` | ✅︎ | ✅︎ | | | +| `LTX2TwoStagesPipeline` | LTX-2-T2V | `rootonchair/LTX-2-19b-distilled` | ✅︎ | ✅︎ | | | +| `LTX2ImageToVideoTwoStagesPipeline` | LTX-2-I2V | `rootonchair/LTX-2-19b-distilled` | ✅︎ | ✅︎ | | | | `HeliosPipeline`, `HeliosPyramidPipeline` | Helios | `BestWishYsh/Helios-Base`, `BestWishYsh/Helios-Mid`, `BestWishYsh/Helios-Distilled` | ✅︎ | ✅︎ | ✅︎ | | | `OvisImagePipeline` | Ovis-Image | `OvisAI/Ovis-Image` | ✅︎ | ✅︎ | | ✅︎ | | `LongcatImagePipeline` | LongCat-Image | `meituan-longcat/LongCat-Image` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | diff --git a/examples/offline_inference/image_to_video/image_to_video.py b/examples/offline_inference/image_to_video/image_to_video.py index 04e0566919..c8c55c485a 100644 --- 
a/examples/offline_inference/image_to_video/image_to_video.py +++ b/examples/offline_inference/image_to_video/image_to_video.py @@ -490,10 +490,6 @@ def _ensure_frame_list(video_array): if frames_np.ndim == 4 and frames_np.shape[-1] == 4: frames_np = frames_np[..., :3] - frames_np = np.clip(frames_np, 0.0, 1.0) - frames_u8 = (frames_np * 255).round().clip(0, 255).astype("uint8") - video_tensor = torch.from_numpy(frames_u8) - audio_out = None if audio is not None: if isinstance(audio, list): @@ -507,7 +503,7 @@ def _ensure_frame_list(video_array): audio_out = audio_out.float().cpu() encode_video( - video_tensor, + frames_np, fps=fps, audio=audio_out, audio_sample_rate=args.audio_sample_rate if audio_out is not None else None, diff --git a/examples/offline_inference/text_to_video/text_to_video.py b/examples/offline_inference/text_to_video/text_to_video.py index a3aa818d2e..322911c993 100644 --- a/examples/offline_inference/text_to_video/text_to_video.py +++ b/examples/offline_inference/text_to_video/text_to_video.py @@ -56,8 +56,13 @@ def parse_args() -> argparse.Namespace: "Examples: Wan-AI/Wan2.2-T2V-A14B-Diffusers, " "hunyuanvideo-community/HunyuanVideo-1.5-480p_t2v", ) + parser.add_argument( + "--model-class-name", + default=None, + help="Override model class name (e.g., LTX2TwoStagesVideoPipeline).", + ) parser.add_argument("--prompt", default="A serene lakeside sunrise with mist over the water.", help="Text prompt.") - parser.add_argument("--negative-prompt", default="", help="Negative prompt (Wan2.2 only).") + parser.add_argument("--negative-prompt", default="", help="Negative prompt.") parser.add_argument("--seed", type=int, default=42, help="Random seed.") parser.add_argument("--guidance-scale", type=float, default=None, help="CFG scale. Default: model-specific.") parser.add_argument( @@ -185,6 +190,7 @@ def parse_args() -> argparse.Namespace: def main(): args = parse_args() + model_class_name = args.model_class_name preset = _detect_preset(args.model) for key, default_val in preset.items(): @@ -229,6 +235,7 @@ def main(): enable_cpu_offload=args.enable_cpu_offload, parallel_config=parallel_config, enforce_eager=args.enforce_eager, + model_class_name=model_class_name, cache_backend=args.cache_backend, cache_config=cache_config, enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler, diff --git a/vllm_omni/diffusion/cache/cache_dit_backend.py b/vllm_omni/diffusion/cache/cache_dit_backend.py index e5337be127..a5055a0688 100644 --- a/vllm_omni/diffusion/cache/cache_dit_backend.py +++ b/vllm_omni/diffusion/cache/cache_dit_backend.py @@ -24,6 +24,7 @@ from vllm_omni.diffusion.cache.base import CacheBackend from vllm_omni.diffusion.data import DiffusionCacheConfig, OmniDiffusionConfig +from vllm_omni.diffusion.utils.tf_utils import get_transformer_from_pipeline logger = init_logger(__name__) @@ -533,7 +534,7 @@ def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool def enable_cache_for_ltx2(pipeline: Any, cache_config: Any) -> Callable[[int], None]: """Enable cache-dit for LTX2 pipelines (audio-video transformer blocks).""" - transformer = pipeline.transformer + transformer = get_transformer_from_pipeline(pipeline) db_cache_config = _build_db_cache_config(cache_config) @@ -566,11 +567,12 @@ def enable_cache_for_ltx2(pipeline: Any, cache_config: Any) -> Callable[[int], N ) def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool = True) -> None: + transformer = get_transformer_from_pipeline(pipeline) if 
cache_config.scm_steps_mask_policy is None: - cache_dit.refresh_context(pipeline.transformer, num_inference_steps=num_inference_steps, verbose=verbose) + cache_dit.refresh_context(transformer, num_inference_steps=num_inference_steps, verbose=verbose) else: cache_dit.refresh_context( - pipeline.transformer, + transformer, cache_config=DBCacheConfig().reset( num_inference_steps=num_inference_steps, steps_computation_mask=cache_dit.steps_mask( @@ -613,8 +615,9 @@ def enable_cache_for_dit(pipeline: Any, cache_config: Any) -> Callable[[int], No ) # Enable cache-dit on the transformer + transformer = get_transformer_from_pipeline(pipeline) cache_dit.enable_cache( - pipeline.transformer, + transformer, cache_config=db_cache_config, calibrator_config=calibrator_config, ) @@ -626,11 +629,12 @@ def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool pipeline: The diffusion pipeline instance. num_inference_steps: New number of inference steps. """ + transformer = get_transformer_from_pipeline(pipeline) if cache_config.scm_steps_mask_policy is None: - cache_dit.refresh_context(pipeline.transformer, num_inference_steps=num_inference_steps, verbose=verbose) + cache_dit.refresh_context(transformer, num_inference_steps=num_inference_steps, verbose=verbose) else: cache_dit.refresh_context( - pipeline.transformer, + transformer, cache_config=DBCacheConfig().reset( num_inference_steps=num_inference_steps, steps_computation_mask=cache_dit.steps_mask( @@ -1211,6 +1215,8 @@ def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool "StableDiffusion3Pipeline": enable_cache_for_sd3, "LTX2Pipeline": enable_cache_for_ltx2, "LTX2ImageToVideoPipeline": enable_cache_for_ltx2, + "LTX2TwoStagesPipeline": enable_cache_for_ltx2, + "LTX2ImageToVideoTwoStagesPipeline": enable_cache_for_ltx2, "BagelPipeline": enable_cache_for_bagel, "GlmImagePipeline": enable_cache_for_glm_image, "Flux2Pipeline": enable_cache_for_flux2, diff --git a/vllm_omni/diffusion/models/ltx2/__init__.py b/vllm_omni/diffusion/models/ltx2/__init__.py index 0a92d4f24f..9f9d70f010 100644 --- a/vllm_omni/diffusion/models/ltx2/__init__.py +++ b/vllm_omni/diffusion/models/ltx2/__init__.py @@ -4,15 +4,23 @@ from vllm_omni.diffusion.models.ltx2.ltx2_transformer import LTX2VideoTransformer3DModel from vllm_omni.diffusion.models.ltx2.pipeline_ltx2 import ( LTX2Pipeline, + LTX2TwoStagesPipeline, create_transformer_from_config, get_ltx2_post_process_func, load_transformer_config, ) -from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_image2video import LTX2ImageToVideoPipeline +from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_image2video import ( + LTX2ImageToVideoPipeline, + LTX2ImageToVideoTwoStagesPipeline, +) +from vllm_omni.diffusion.models.ltx2.pipeline_ltx2_latent_upsample import LTX2LatentUpsamplePipeline __all__ = [ "LTX2Pipeline", "LTX2ImageToVideoPipeline", + "LTX2LatentUpsamplePipeline", + "LTX2TwoStagesPipeline", + "LTX2ImageToVideoTwoStagesPipeline", "get_ltx2_post_process_func", "load_transformer_config", "create_transformer_from_config", diff --git a/vllm_omni/diffusion/models/ltx2/pipeline_ltx2.py b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2.py index 34263e217e..efc342e932 100644 --- a/vllm_omni/diffusion/models/ltx2/pipeline_ltx2.py +++ b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2.py @@ -15,12 +15,14 @@ import torch from diffusers import AutoencoderKLLTX2Audio, AutoencoderKLLTX2Video, FlowMatchEulerDiscreteScheduler from diffusers.pipelines.ltx2 import LTX2TextConnectors +from 
diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES from diffusers.pipelines.ltx2.vocoder import LTX2Vocoder from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg, retrieve_timesteps from diffusers.utils.torch_utils import randn_tensor from diffusers.video_processor import VideoProcessor from torch import nn from transformers import AutoTokenizer, Gemma3ForConditionalGeneration +from vllm.logger import init_logger from vllm.model_executor.models.utils import AutoWeightsLoader from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig @@ -31,10 +33,16 @@ get_classifier_free_guidance_world_size, ) from vllm_omni.diffusion.distributed.utils import get_local_device +from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader +from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.lora.request import LoRARequest from .ltx2_transformer import LTX2VideoTransformer3DModel +from .pipeline_ltx2_latent_upsample import LTX2LatentUpsamplePipeline + +logger = init_logger(__name__) def load_transformer_config(model_path: str, subfolder: str = "transformer", local_files_only: bool = True) -> dict: @@ -114,7 +122,7 @@ def calculate_shift( return mu -class LTX2Pipeline(nn.Module, CFGParallelMixin): +class LTX2Pipeline(nn.Module, CFGParallelMixin, ProgressBarMixin): def __init__( self, *, @@ -145,12 +153,15 @@ def __init__( subfolder="tokenizer", local_files_only=local_files_only, ) - self.text_encoder = Gemma3ForConditionalGeneration.from_pretrained( - model, - subfolder="text_encoder", - torch_dtype=dtype, - local_files_only=local_files_only, - ).to(self.device) + # prefer mmap loading as default device is cuda, and the output of text encoder + # could be deterministic. 
+ with torch.device("cpu"): + self.text_encoder = Gemma3ForConditionalGeneration.from_pretrained( + model, + subfolder="text_encoder", + torch_dtype=dtype, + local_files_only=local_files_only, + ).to(self.device) self.connectors = LTX2TextConnectors.from_pretrained( model, subfolder="connectors", @@ -460,6 +471,22 @@ def _unpack_latents( latents = latents.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(2, 3) return latents + @staticmethod + def _normalize_latents( + latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0 + ) -> torch.Tensor: + # Normalize latents across the channel dimension [B, C, F, H, W] + latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents = (latents - latents_mean) * scaling_factor / latents_std + return latents + + @staticmethod + def _normalize_audio_latents(latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor): + latents_mean = latents_mean.to(latents.device, latents.dtype) + latents_std = latents_std.to(latents.device, latents.dtype) + return (latents - latents_mean) / latents_std + @staticmethod def _denormalize_latents( latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0 @@ -475,6 +502,14 @@ def _denormalize_audio_latents(latents: torch.Tensor, latents_mean: torch.Tensor latents_std = latents_std.to(latents.device, latents.dtype) return (latents * latents_std) + latents_mean + @staticmethod + def _create_noised_state( + latents: torch.Tensor, noise_scale: float | torch.Tensor, generator: torch.Generator | None = None + ): + noise = randn_tensor(latents.shape, generator=generator, device=latents.device, dtype=latents.dtype) + noised_latents = noise_scale * noise + (1 - noise_scale) * latents + return noised_latents + @staticmethod def _pack_audio_latents( latents: torch.Tensor, patch_size: int | None = None, patch_size_t: int | None = None @@ -514,12 +549,26 @@ def prepare_latents( height: int = 512, width: int = 768, num_frames: int = 121, + noise_scale: float = 0.0, dtype: torch.dtype | None = None, device: torch.device | None = None, generator: torch.Generator | None = None, latents: torch.Tensor | None = None, ) -> torch.Tensor: if latents is not None: + if latents.ndim == 5: + latents = self._normalize_latents( + latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor + ) + # latents are of shape [B, C, F, H, W], need to be packed + latents = self._pack_latents( + latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size + ) + if latents.ndim != 3: + raise ValueError( + f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is [batch_size, num_seq, num_features]." 
# noqa + ) + latents = self._create_noised_state(latents, noise_scale, generator) return latents.to(device=device, dtype=dtype) height = height // self.vae_spatial_compression_ratio @@ -543,37 +592,30 @@ def prepare_audio_latents( self, batch_size: int = 1, num_channels_latents: int = 8, + audio_latent_length: int = 1, # 1 is just a dummy value num_mel_bins: int = 64, - num_frames: int = 121, - frame_rate: float = 25.0, - sampling_rate: int = 16000, - hop_length: int = 160, + noise_scale: float = 0.0, dtype: torch.dtype | None = None, device: torch.device | None = None, generator: torch.Generator | None = None, latents: torch.Tensor | None = None, ) -> tuple[torch.Tensor, int]: - duration_s = num_frames / frame_rate - latents_per_second = float(sampling_rate) / float(hop_length) / float(self.audio_vae_temporal_compression_ratio) - latent_length = round(duration_s * latents_per_second) + if latents is not None: + if latents.ndim == 4: + # latents are of shape [B, C, L, M], need to be packed + latents = self._pack_audio_latents(latents) + if latents.ndim != 3: + raise ValueError( + f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is [batch_size, num_seq, num_features]." # noqa + ) + latents = self._normalize_audio_latents(latents, self.audio_vae.latents_mean, self.audio_vae.latents_std) + latents = self._create_noised_state(latents, noise_scale, generator) + return latents.to(device=device, dtype=dtype) + # TODO: confirm whether this logic is correct latent_mel_bins = num_mel_bins // self.audio_vae_mel_compression_ratio - sp_size = getattr(self.od_config.parallel_config, "sequence_parallel_size", 1) - if sp_size > 1: - pad_len = (sp_size - (latent_length % sp_size)) % sp_size - if pad_len > 0: - if latents is not None: - pad_shape = list(latents.shape) - pad_shape[2] = pad_len - padding = torch.zeros(pad_shape, dtype=latents.dtype, device=latents.device) - latents = torch.cat([latents, padding], dim=2) - latent_length += pad_len - - if latents is not None: - return latents.to(device=device, dtype=dtype), latent_length - - shape = (batch_size, num_channels_latents, latent_length, latent_mel_bins) + shape = (batch_size, num_channels_latents, audio_latent_length, latent_mel_bins) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( @@ -583,7 +625,7 @@ def prepare_audio_latents( latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) latents = self._pack_audio_latents(latents) - return latents, latent_length + return latents @property def guidance_scale(self): @@ -766,9 +808,11 @@ def forward( num_frames: int | None = None, frame_rate: float | None = None, num_inference_steps: int | None = None, + sigmas: list[float] | None = None, timesteps: list[int] | None = None, guidance_scale: float = 4.0, guidance_rescale: float = 0.0, + noise_scale: float = 0.0, num_videos_per_prompt: int | None = 1, generator: torch.Generator | list[torch.Generator] | None = None, latents: torch.Tensor | None = None, @@ -925,6 +969,21 @@ def forward( latent_num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1 latent_height = height // self.vae_spatial_compression_ratio latent_width = width // self.vae_spatial_compression_ratio + if latents is not None: + if latents.ndim == 5: + logger.info( + "Got latents of shape [batch_size, latent_dim, latent_frames, latent_height, latent_width], `latent_num_frames`, `latent_height`, `latent_width` will be inferred." 
# noqa + ) + _, _, latent_num_frames, latent_height, latent_width = latents.shape # [B, C, F, H, W] + elif latents.ndim == 3: + logger.warning( + f"You have supplied packed `latents` of shape {latents.shape}, so the latent dims cannot be" + f" inferred. Make sure the supplied `height`, `width`, and `num_frames` are correct." + ) + else: + raise ValueError( + f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is either [batch_size, seq_len, num_features] or [batch_size, latent_dim, latent_frames, latent_height, latent_width]." # noqa + ) video_sequence_length = latent_num_frames * latent_height * latent_width num_channels_latents = self.transformer.config.in_channels @@ -934,33 +993,66 @@ def forward( height, width, num_frames, + noise_scale, torch.float32, device, generator, latents, ) + duration_s = num_frames / frame_rate + audio_latents_per_second = ( + self.audio_sampling_rate / self.audio_hop_length / float(self.audio_vae_temporal_compression_ratio) + ) + audio_num_frames = round(duration_s * audio_latents_per_second) + if audio_latents is not None: + if audio_latents.ndim == 4: + logger.info( + "Got audio_latents of shape [batch_size, num_channels, audio_length, mel_bins], `audio_num_frames` will be inferred." # noqa + ) + _, _, audio_num_frames, _ = audio_latents.shape # [B, C, L, M] + elif audio_latents.ndim == 3: + logger.warning( + f"You have supplied packed `audio_latents` of shape {audio_latents.shape}, so the latent dims" + f" cannot be inferred. Make sure the supplied `num_frames` and `frame_rate` are correct." + ) + else: + raise ValueError( + f"Provided `audio_latents` tensor has shape {audio_latents.shape}, but the expected shape is either [batch_size, seq_len, num_features] or [batch_size, num_channels, audio_length, mel_bins]." 
# noqa + ) + num_mel_bins = self.audio_vae.config.mel_bins if getattr(self, "audio_vae", None) is not None else 64 latent_mel_bins = num_mel_bins // self.audio_vae_mel_compression_ratio num_channels_latents_audio = ( self.audio_vae.config.latent_channels if getattr(self, "audio_vae", None) is not None else 8 ) - audio_latents, audio_num_frames = self.prepare_audio_latents( + + # padding audio_latents if needed + sp_size = getattr(self.od_config.parallel_config, "sequence_parallel_size", 1) + if sp_size > 1: + pad_len = (sp_size - (audio_num_frames % sp_size)) % sp_size + if pad_len > 0: + if audio_latents is not None: + pad_shape = list(audio_latents.shape) + pad_shape[2] = pad_len + padding = torch.zeros(pad_shape, dtype=audio_latents.dtype, device=audio_latents.device) + audio_latents = torch.cat([audio_latents, padding], dim=2) + audio_num_frames += pad_len + + audio_latents = self.prepare_audio_latents( batch_size * num_videos_per_prompt, num_channels_latents=num_channels_latents_audio, + audio_latent_length=audio_num_frames, num_mel_bins=num_mel_bins, - num_frames=num_frames, - frame_rate=frame_rate, - sampling_rate=self.audio_sampling_rate, - hop_length=self.audio_hop_length, + noise_scale=noise_scale, dtype=torch.float32, device=device, generator=generator, latents=audio_latents, ) - sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas mu = calculate_shift( video_sequence_length, self.scheduler.config.get("base_image_seq_len", 1024), @@ -985,7 +1077,6 @@ def forward( sigmas=sigmas, mu=mu, ) - num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) self._num_timesteps = len(timesteps) video_coords = self.transformer.rope.prepare_video_coords( @@ -994,129 +1085,133 @@ def forward( audio_coords = self.transformer.audio_rope.prepare_audio_coords( audio_latents.shape[0], audio_num_frames, audio_latents.device ) - - for i, t in enumerate(timesteps): - if self.interrupt: - continue - - self._current_timestep = t - - if cfg_parallel_ready: - latent_model_input = latents.to(prompt_embeds.dtype) - audio_latent_model_input = audio_latents.to(prompt_embeds.dtype) - timestep = t.expand(latent_model_input.shape[0]) - - positive_kwargs = { - "hidden_states": latent_model_input, - "audio_hidden_states": audio_latent_model_input, - "encoder_hidden_states": connector_prompt_embeds, - "audio_encoder_hidden_states": connector_audio_prompt_embeds, - "timestep": timestep, - "encoder_attention_mask": connector_attention_mask, - "audio_encoder_attention_mask": connector_attention_mask, - "num_frames": latent_num_frames, - "height": latent_height, - "width": latent_width, - "fps": frame_rate, - "audio_num_frames": audio_num_frames, - "video_coords": video_coords, - "audio_coords": audio_coords, - "attention_kwargs": attention_kwargs, - "return_dict": False, - } - negative_kwargs = { - "hidden_states": latent_model_input, - "audio_hidden_states": audio_latent_model_input, - "encoder_hidden_states": negative_connector_prompt_embeds, - "audio_encoder_hidden_states": negative_connector_audio_prompt_embeds, - "timestep": timestep, - "encoder_attention_mask": negative_connector_attention_mask, - "audio_encoder_attention_mask": negative_connector_attention_mask, - "num_frames": latent_num_frames, - "height": latent_height, - "width": latent_width, - "fps": frame_rate, - "audio_num_frames": audio_num_frames, - "video_coords": video_coords, - "audio_coords": 
audio_coords, - "attention_kwargs": attention_kwargs, - "return_dict": False, - } - - noise_pred_video, noise_pred_audio = self.predict_noise_av_maybe_with_cfg( - do_true_cfg=True, - true_cfg_scale=guidance_scale, - positive_kwargs=positive_kwargs, - negative_kwargs=negative_kwargs, - guidance_rescale=guidance_rescale, - cfg_normalize=False, - ) - - latents, audio_latents = self._scheduler_step_video_audio_maybe_with_cfg( - noise_pred_video, - noise_pred_audio, - t, - latents, - audio_latents, - audio_scheduler, - do_true_cfg=True, - ) - else: - latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents - latent_model_input = latent_model_input.to(prompt_embeds.dtype) - audio_latent_model_input = ( - torch.cat([audio_latents] * 2) if self.do_classifier_free_guidance else audio_latents - ) - audio_latent_model_input = audio_latent_model_input.to(prompt_embeds.dtype) - - timestep = t.expand(latent_model_input.shape[0]) - - with self._transformer_cache_context("cond_uncond"): - noise_pred_video, noise_pred_audio = self.transformer( - hidden_states=latent_model_input, - audio_hidden_states=audio_latent_model_input, - encoder_hidden_states=connector_prompt_embeds, - audio_encoder_hidden_states=connector_audio_prompt_embeds, - timestep=timestep, - encoder_attention_mask=connector_attention_mask, - audio_encoder_attention_mask=connector_attention_mask, - num_frames=latent_num_frames, - height=latent_height, - width=latent_width, - fps=frame_rate, - audio_num_frames=audio_num_frames, - video_coords=video_coords, - audio_coords=audio_coords, - attention_kwargs=attention_kwargs, - return_dict=False, + # Duplicate the positional ids as well if using CFG + if self.do_classifier_free_guidance and not cfg_parallel_ready: + video_coords = video_coords.repeat((2,) + (1,) * (video_coords.ndim - 1)) # Repeat twice in batch dim + audio_coords = audio_coords.repeat((2,) + (1,) * (audio_coords.ndim - 1)) + + with self.progress_bar(total=len(timesteps)) as pbar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t + + if cfg_parallel_ready: + latent_model_input = latents.to(prompt_embeds.dtype) + audio_latent_model_input = audio_latents.to(prompt_embeds.dtype) + timestep = t.expand(latent_model_input.shape[0]) + + positive_kwargs = { + "hidden_states": latent_model_input, + "audio_hidden_states": audio_latent_model_input, + "encoder_hidden_states": connector_prompt_embeds, + "audio_encoder_hidden_states": connector_audio_prompt_embeds, + "timestep": timestep, + "encoder_attention_mask": connector_attention_mask, + "audio_encoder_attention_mask": connector_attention_mask, + "num_frames": latent_num_frames, + "height": latent_height, + "width": latent_width, + "fps": frame_rate, + "audio_num_frames": audio_num_frames, + "video_coords": video_coords, + "audio_coords": audio_coords, + "attention_kwargs": attention_kwargs, + "return_dict": False, + } + negative_kwargs = { + "hidden_states": latent_model_input, + "audio_hidden_states": audio_latent_model_input, + "encoder_hidden_states": negative_connector_prompt_embeds, + "audio_encoder_hidden_states": negative_connector_audio_prompt_embeds, + "timestep": timestep, + "encoder_attention_mask": negative_connector_attention_mask, + "audio_encoder_attention_mask": negative_connector_attention_mask, + "num_frames": latent_num_frames, + "height": latent_height, + "width": latent_width, + "fps": frame_rate, + "audio_num_frames": audio_num_frames, + "video_coords": video_coords, + 
"audio_coords": audio_coords, + "attention_kwargs": attention_kwargs, + "return_dict": False, + } + + noise_pred_video, noise_pred_audio = self.predict_noise_av_maybe_with_cfg( + do_true_cfg=True, + true_cfg_scale=guidance_scale, + positive_kwargs=positive_kwargs, + negative_kwargs=negative_kwargs, + guidance_rescale=guidance_rescale, + cfg_normalize=False, ) - noise_pred_video = noise_pred_video.float() - noise_pred_audio = noise_pred_audio.float() - if self.do_classifier_free_guidance: - noise_pred_video_uncond, noise_pred_video_text = noise_pred_video.chunk(2) - noise_pred_video = noise_pred_video_uncond + guidance_scale * ( - noise_pred_video_text - noise_pred_video_uncond + latents, audio_latents = self._scheduler_step_video_audio_maybe_with_cfg( + noise_pred_video, + noise_pred_audio, + t, + latents, + audio_latents, + audio_scheduler, + do_true_cfg=True, ) - - noise_pred_audio_uncond, noise_pred_audio_text = noise_pred_audio.chunk(2) - noise_pred_audio = noise_pred_audio_uncond + guidance_scale * ( - noise_pred_audio_text - noise_pred_audio_uncond + else: + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = latent_model_input.to(prompt_embeds.dtype) + audio_latent_model_input = ( + torch.cat([audio_latents] * 2) if self.do_classifier_free_guidance else audio_latents ) + audio_latent_model_input = audio_latent_model_input.to(prompt_embeds.dtype) + + timestep = t.expand(latent_model_input.shape[0]) + + with self._transformer_cache_context("cond_uncond"): + noise_pred_video, noise_pred_audio = self.transformer( + hidden_states=latent_model_input, + audio_hidden_states=audio_latent_model_input, + encoder_hidden_states=connector_prompt_embeds, + audio_encoder_hidden_states=connector_audio_prompt_embeds, + timestep=timestep, + encoder_attention_mask=connector_attention_mask, + audio_encoder_attention_mask=connector_attention_mask, + num_frames=latent_num_frames, + height=latent_height, + width=latent_width, + fps=frame_rate, + audio_num_frames=audio_num_frames, + video_coords=video_coords, + audio_coords=audio_coords, + attention_kwargs=attention_kwargs, + return_dict=False, + ) + noise_pred_video = noise_pred_video.float() + noise_pred_audio = noise_pred_audio.float() - if guidance_rescale > 0: - noise_pred_video = rescale_noise_cfg( - noise_pred_video, noise_pred_video_text, guidance_rescale=guidance_rescale + if self.do_classifier_free_guidance: + noise_pred_video_uncond, noise_pred_video_text = noise_pred_video.chunk(2) + noise_pred_video = noise_pred_video_uncond + guidance_scale * ( + noise_pred_video_text - noise_pred_video_uncond ) - noise_pred_audio = rescale_noise_cfg( - noise_pred_audio, noise_pred_audio_text, guidance_rescale=guidance_rescale + + noise_pred_audio_uncond, noise_pred_audio_text = noise_pred_audio.chunk(2) + noise_pred_audio = noise_pred_audio_uncond + guidance_scale * ( + noise_pred_audio_text - noise_pred_audio_uncond ) - latents = self.scheduler.step(noise_pred_video, t, latents, return_dict=False)[0] - audio_latents = audio_scheduler.step(noise_pred_audio, t, audio_latents, return_dict=False)[0] + if guidance_rescale > 0: + noise_pred_video = rescale_noise_cfg( + noise_pred_video, noise_pred_video_text, guidance_rescale=guidance_rescale + ) + noise_pred_audio = rescale_noise_cfg( + noise_pred_audio, noise_pred_audio_text, guidance_rescale=guidance_rescale + ) + + latents = self.scheduler.step(noise_pred_video, t, latents, return_dict=False)[0] + audio_latents = 
audio_scheduler.step(noise_pred_audio, t, audio_latents, return_dict=False)[0]
-            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                pass
+                pbar.update()
         latents = self._unpack_latents(
             latents,
@@ -1174,3 +1269,158 @@ def forward(
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
+
+
+class LTX2TwoStagesPipeline(nn.Module):
+    """LTX2TwoStagesPipeline performs two-stage text-to-video generation."""
+
+    def __init__(
+        self,
+        *,
+        od_config: OmniDiffusionConfig,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.device = get_local_device()
+        self.dtype = getattr(od_config, "dtype", torch.bfloat16)
+        self.model_path = od_config.model
+        self.distilled = False
+        # The user-provided model path may end with '/', in which case basename would not
+        # return the expected directory name, so normalize the path with normpath first.
+        if "distilled" in os.path.basename(os.path.normpath(self.model_path)):
+            self.distilled = True
+        else:
+            raise NotImplementedError(f"{self.model_path} is not supported for {self.__class__.__name__}.")
+
+        self.pipe = LTX2Pipeline(od_config=od_config, prefix=prefix)
+        self.upsample_pipe = LTX2LatentUpsamplePipeline(
+            vae=self.pipe.vae,
+            od_config=od_config,
+        )
+
+        self.lora_manager = DiffusionLoRAManager(
+            pipeline=self.pipe,
+            device=self.device,
+            dtype=self.dtype,
+            max_cached_adapters=od_config.max_cpu_loras,
+        )
+
+        self.weights_sources = [
+            DiffusersPipelineLoader.ComponentSource(
+                model_or_path=od_config.model,
+                subfolder="transformer",
+                revision=None,
+                prefix="pipe.transformer.",
+                fall_back_to_pt=True,
+            ),
+        ]
+
+    def forward(
+        self,
+        req: OmniDiffusionRequest,
+        prompt: str | list[str] | None = None,
+        negative_prompt: str | list[str] | None = None,
+        height: int | None = None,
+        width: int | None = None,
+        num_frames: int | None = None,
+        frame_rate: float | None = None,
+        num_inference_steps: int | None = None,
+        timesteps: list[int] | None = None,
+        guidance_scale: float = 4.0,
+        guidance_rescale: float = 0.0,
+        noise_scale: float = 0.0,
+        num_videos_per_prompt: int | None = 1,
+        generator: torch.Generator | list[torch.Generator] | None = None,
+        latents: torch.Tensor | None = None,
+        audio_latents: torch.Tensor | None = None,
+        prompt_embeds: torch.Tensor | None = None,
+        negative_prompt_embeds: torch.Tensor | None = None,
+        prompt_attention_mask: torch.Tensor | None = None,
+        negative_prompt_attention_mask: torch.Tensor | None = None,
+        decode_timestep: float | list[float] = 0.0,
+        decode_noise_scale: float | list[float] | None = None,
+        output_type: str = "np",
+        return_dict: bool = True,
+        attention_kwargs: dict[str, Any] | None = None,
+        max_sequence_length: int | None = None,
+    ):
+        video_latent, audio_latent = self.pipe(
+            req=req,
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            height=height,
+            width=width,
+            num_frames=num_frames,
+            frame_rate=frame_rate,
+            num_inference_steps=num_inference_steps,
+            sigmas=DISTILLED_SIGMA_VALUES if self.distilled else None,
+            timesteps=timesteps,
+            guidance_scale=guidance_scale,
+            guidance_rescale=guidance_rescale,
+            noise_scale=noise_scale,
+            num_videos_per_prompt=num_videos_per_prompt,
+            generator=generator,
+            latents=latents,
+            audio_latents=audio_latents,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            prompt_attention_mask=prompt_attention_mask,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+
decode_timestep=decode_timestep, + decode_noise_scale=decode_noise_scale, + output_type="latent", + return_dict=return_dict, + attention_kwargs=attention_kwargs, + max_sequence_length=max_sequence_length, + ).output + + upscaled_video_latent = self.upsample_pipe( + latents=video_latent, + output_type="latent", + return_dict=False, + )[0] + + if not self.distilled: + # Load Stage 2 distilled LoRA + lora_path = f"{self.model_path}/ltx-2-19b-distilled-lora-384.safetensors" + lora_request = LoRARequest( + lora_name="stage_2_distilled", + lora_int_id=1, + lora_path=lora_path, + ) + self.lora_manager.set_active_adapter(lora_request, lora_scale=1.0) + + # Change scheduler to use Stage 2 distilled sigmas as is + new_scheduler = FlowMatchEulerDiscreteScheduler.from_config( + self.pipe.scheduler.config, + use_dynamic_shifting=False, + shift_terminal=None, + ) + self.pipe.scheduler = new_scheduler + + # We only want to change num_inference_steps here, so no need + # to deep copy the whole request + stage_2_req = copy.copy(req) + stage_2_req.sampling_params = req.sampling_params.clone() + stage_2_req.sampling_params.num_inference_steps = 3 + + video, audio = self.pipe( + req=stage_2_req, + latents=upscaled_video_latent, + audio_latents=audio_latent, + prompt=prompt, + negative_prompt=negative_prompt, + noise_scale=STAGE_2_DISTILLED_SIGMA_VALUES[0], + sigmas=STAGE_2_DISTILLED_SIGMA_VALUES, + guidance_scale=1.0, + generator=generator, + output_type="np", + return_dict=False, + ).output + + return DiffusionOutput(output=(video, audio)) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) diff --git a/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_image2video.py b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_image2video.py index 5fa9cc797e..11091518b4 100644 --- a/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_image2video.py +++ b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_image2video.py @@ -4,19 +4,30 @@ from __future__ import annotations import copy +import os +from collections.abc import Iterable from typing import Any import numpy as np import PIL.Image import torch +import torch.nn as nn +from diffusers import FlowMatchEulerDiscreteScheduler +from diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg, retrieve_timesteps from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_latents from diffusers.utils.torch_utils import randn_tensor from diffusers.video_processor import VideoProcessor +from vllm.logger import init_logger +from vllm.model_executor.models.utils import AutoWeightsLoader from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig from vllm_omni.diffusion.distributed.parallel_state import get_cfg_group, get_classifier_free_guidance_rank +from vllm_omni.diffusion.distributed.utils import get_local_device +from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager +from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.lora.request import LoRARequest from .pipeline_ltx2 import ( LTX2Pipeline, @@ -26,6 +37,9 @@ from .pipeline_ltx2 import ( get_ltx2_post_process_func as _get_ltx2_post_process_func, ) +from .pipeline_ltx2_latent_upsample import LTX2LatentUpsamplePipeline + +logger = 
init_logger(__name__) def get_ltx2_post_process_func(od_config: OmniDiffusionConfig): @@ -61,6 +75,7 @@ def prepare_latents( height: int = 512, width: int = 768, num_frames: int = 121, + noise_scale: float = 0.0, dtype: torch.dtype | None = None, device: torch.device | None = None, generator: torch.Generator | list[torch.Generator] | None = None, @@ -74,11 +89,29 @@ def prepare_latents( mask_shape = (batch_size, 1, num_frames, height, width) if latents is not None: - conditioning_mask = latents.new_zeros(mask_shape) - conditioning_mask[:, :, 0] = 1.0 + if latents.ndim == 5: + # conditioning_mask needs to the same shape as latents in two stages generation. + batch_size, _, num_frames, height, width = latents.shape + mask_shape = (batch_size, 1, num_frames, height, width) + conditioning_mask = latents.new_zeros(mask_shape) + conditioning_mask[:, :, 0] = 1.0 + + latents = self._normalize_latents( + latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor + ) + latents = self._create_noised_state(latents, noise_scale * (1 - conditioning_mask), generator) + # latents are of shape [B, C, F, H, W], need to be packed + latents = self._pack_latents( + latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size + ) + else: + conditioning_mask = latents.new_zeros(mask_shape) + conditioning_mask[:, :, 0] = 1.0 + conditioning_mask = self._pack_latents( conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size ).squeeze(-1) + if latents.ndim != 3 or latents.shape[:2] != conditioning_mask.shape: raise ValueError( "Provided `latents` tensor has shape" @@ -234,9 +267,11 @@ def forward( num_frames: int | None = None, frame_rate: float | None = None, num_inference_steps: int | None = None, + sigmas: list[float] | None = None, timesteps: list[int] | None = None, guidance_scale: float = 4.0, guidance_rescale: float = 0.0, + noise_scale: float = 0.0, num_videos_per_prompt: int | None = 1, generator: torch.Generator | list[torch.Generator] | None = None, latents: torch.Tensor | None = None, @@ -421,6 +456,26 @@ def forward( additive_mask=True, ) + latent_num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1 + latent_height = height // self.vae_spatial_compression_ratio + latent_width = width // self.vae_spatial_compression_ratio + if latents is not None: + if latents.ndim == 5: + logger.info( + "Got latents of shape [batch_size, latent_dim, latent_frames, latent_height, latent_width], `latent_num_frames`, `latent_height`, `latent_width` will be inferred." # noqa + ) + _, _, latent_num_frames, latent_height, latent_width = latents.shape # [B, C, F, H, W] + elif latents.ndim == 3: + logger.warning( + f"You have supplied packed `latents` of shape {latents.shape}, so the latent dims cannot be" + f" inferred. Make sure the supplied `height`, `width`, and `num_frames` are correct." + ) + else: + raise ValueError( + f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is either [batch_size, seq_len, num_features] or [batch_size, latent_dim, latent_frames, latent_height, latent_width]." 
# noqa + ) + video_sequence_length = latent_num_frames * latent_height * latent_width + if latents is None: if isinstance(image, torch.Tensor): if image.ndim == 3: @@ -439,6 +494,7 @@ def forward( height, width, num_frames, + noise_scale, torch.float32, device, generator, @@ -447,32 +503,58 @@ def forward( if self.do_classifier_free_guidance and not cfg_parallel_ready: conditioning_mask = torch.cat([conditioning_mask, conditioning_mask]) + duration_s = num_frames / frame_rate + audio_latents_per_second = ( + self.audio_sampling_rate / self.audio_hop_length / float(self.audio_vae_temporal_compression_ratio) + ) + audio_num_frames = round(duration_s * audio_latents_per_second) + if audio_latents is not None: + if audio_latents.ndim == 4: + logger.info( + "Got audio_latents of shape [batch_size, num_channels, audio_length, mel_bins], `audio_num_frames` will be inferred." # noqa + ) + _, _, audio_num_frames, _ = audio_latents.shape # [B, C, L, M] + elif audio_latents.ndim == 3: + logger.warning( + f"You have supplied packed `audio_latents` of shape {audio_latents.shape}, so the latent dims" + f" cannot be inferred. Make sure the supplied `num_frames` and `frame_rate` are correct." + ) + else: + raise ValueError( + f"Provided `audio_latents` tensor has shape {audio_latents.shape}, but the expected shape is either [batch_size, seq_len, num_features] or [batch_size, num_channels, audio_length, mel_bins]." # noqa + ) + num_mel_bins = self.audio_vae.config.mel_bins if getattr(self, "audio_vae", None) is not None else 64 latent_mel_bins = num_mel_bins // self.audio_vae_mel_compression_ratio - num_channels_latents_audio = ( self.audio_vae.config.latent_channels if getattr(self, "audio_vae", None) is not None else 8 ) - audio_latents, audio_num_frames = self.prepare_audio_latents( + + # padding audio_latents if needed + sp_size = getattr(self.od_config.parallel_config, "sequence_parallel_size", 1) + if sp_size > 1: + pad_len = (sp_size - (audio_num_frames % sp_size)) % sp_size + if pad_len > 0: + if audio_latents is not None: + pad_shape = list(audio_latents.shape) + pad_shape[2] = pad_len + padding = torch.zeros(pad_shape, dtype=audio_latents.dtype, device=audio_latents.device) + audio_latents = torch.cat([audio_latents, padding], dim=2) + audio_num_frames += pad_len + + audio_latents = self.prepare_audio_latents( batch_size * num_videos_per_prompt, num_channels_latents=num_channels_latents_audio, + audio_latent_length=audio_num_frames, num_mel_bins=num_mel_bins, - num_frames=num_frames, - frame_rate=frame_rate, - sampling_rate=self.audio_sampling_rate, - hop_length=self.audio_hop_length, + noise_scale=noise_scale, dtype=torch.float32, device=device, generator=generator, latents=audio_latents, ) - latent_num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1 - latent_height = height // self.vae_spatial_compression_ratio - latent_width = width // self.vae_spatial_compression_ratio - video_sequence_length = latent_num_frames * latent_height * latent_width - - sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas mu = calculate_shift( video_sequence_length, self.scheduler.config.get("base_image_seq_len", 1024), @@ -497,7 +579,6 @@ def forward( sigmas=sigmas, mu=mu, ) - num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) self._num_timesteps = len(timesteps) video_coords = self.transformer.rope.prepare_video_coords( @@ -506,69 
+587,142 @@ def forward( audio_coords = self.transformer.audio_rope.prepare_audio_coords( audio_latents.shape[0], audio_num_frames, audio_latents.device ) + # Duplicate the positional ids as well if using CFG + if self.do_classifier_free_guidance and not cfg_parallel_ready: + video_coords = video_coords.repeat((2,) + (1,) * (video_coords.ndim - 1)) # Repeat twice in batch dim + audio_coords = audio_coords.repeat((2,) + (1,) * (audio_coords.ndim - 1)) + + with self.progress_bar(total=len(timesteps)) as pbar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t + + if cfg_parallel_ready: + latent_model_input = latents.to(prompt_embeds.dtype) + audio_latent_model_input = audio_latents.to(prompt_embeds.dtype) + + timestep = t.expand(latent_model_input.shape[0]) + video_timestep = timestep.unsqueeze(-1) * (1 - conditioning_mask) + + positive_kwargs = { + "hidden_states": latent_model_input, + "audio_hidden_states": audio_latent_model_input, + "encoder_hidden_states": connector_prompt_embeds, + "audio_encoder_hidden_states": connector_audio_prompt_embeds, + "timestep": video_timestep, + "audio_timestep": timestep, + "encoder_attention_mask": connector_attention_mask, + "audio_encoder_attention_mask": connector_attention_mask, + "num_frames": latent_num_frames, + "height": latent_height, + "width": latent_width, + "fps": frame_rate, + "audio_num_frames": audio_num_frames, + "video_coords": video_coords, + "audio_coords": audio_coords, + "attention_kwargs": attention_kwargs, + "return_dict": False, + } + negative_kwargs = { + "hidden_states": latent_model_input, + "audio_hidden_states": audio_latent_model_input, + "encoder_hidden_states": negative_connector_prompt_embeds, + "audio_encoder_hidden_states": negative_connector_audio_prompt_embeds, + "timestep": video_timestep, + "audio_timestep": timestep, + "encoder_attention_mask": negative_connector_attention_mask, + "audio_encoder_attention_mask": negative_connector_attention_mask, + "num_frames": latent_num_frames, + "height": latent_height, + "width": latent_width, + "fps": frame_rate, + "audio_num_frames": audio_num_frames, + "video_coords": video_coords, + "audio_coords": audio_coords, + "attention_kwargs": attention_kwargs, + "return_dict": False, + } + + noise_pred_video, noise_pred_audio = self.predict_noise_av_maybe_with_cfg( + do_true_cfg=True, + true_cfg_scale=guidance_scale, + positive_kwargs=positive_kwargs, + negative_kwargs=negative_kwargs, + guidance_rescale=guidance_rescale, + cfg_normalize=False, + ) - for i, t in enumerate(timesteps): - if self.interrupt: - continue - - self._current_timestep = t - - if cfg_parallel_ready: - latent_model_input = latents.to(prompt_embeds.dtype) - audio_latent_model_input = audio_latents.to(prompt_embeds.dtype) - - timestep = t.expand(latent_model_input.shape[0]) - video_timestep = timestep.unsqueeze(-1) * (1 - conditioning_mask) - - positive_kwargs = { - "hidden_states": latent_model_input, - "audio_hidden_states": audio_latent_model_input, - "encoder_hidden_states": connector_prompt_embeds, - "audio_encoder_hidden_states": connector_audio_prompt_embeds, - "timestep": video_timestep, - "audio_timestep": timestep, - "encoder_attention_mask": connector_attention_mask, - "audio_encoder_attention_mask": connector_attention_mask, - "num_frames": latent_num_frames, - "height": latent_height, - "width": latent_width, - "fps": frame_rate, - "audio_num_frames": audio_num_frames, - "video_coords": video_coords, - "audio_coords": audio_coords, - 
"attention_kwargs": attention_kwargs, - "return_dict": False, - } - negative_kwargs = { - "hidden_states": latent_model_input, - "audio_hidden_states": audio_latent_model_input, - "encoder_hidden_states": negative_connector_prompt_embeds, - "audio_encoder_hidden_states": negative_connector_audio_prompt_embeds, - "timestep": video_timestep, - "audio_timestep": timestep, - "encoder_attention_mask": negative_connector_attention_mask, - "audio_encoder_attention_mask": negative_connector_attention_mask, - "num_frames": latent_num_frames, - "height": latent_height, - "width": latent_width, - "fps": frame_rate, - "audio_num_frames": audio_num_frames, - "video_coords": video_coords, - "audio_coords": audio_coords, - "attention_kwargs": attention_kwargs, - "return_dict": False, - } - - noise_pred_video, noise_pred_audio = self.predict_noise_av_maybe_with_cfg( - do_true_cfg=True, - true_cfg_scale=guidance_scale, - positive_kwargs=positive_kwargs, - negative_kwargs=negative_kwargs, - guidance_rescale=guidance_rescale, - cfg_normalize=False, - ) + if get_classifier_free_guidance_rank() == 0: + latents = self._step_video_latents_i2v( + noise_pred_video, + latents, + t, + latent_num_frames, + latent_height, + latent_width, + ) + audio_latents = audio_scheduler.step(noise_pred_audio, t, audio_latents, return_dict=False)[0] + + cfg_group = get_cfg_group() + latents = latents.contiguous() + audio_latents = audio_latents.contiguous() + cfg_group.broadcast(latents, src=0) + cfg_group.broadcast(audio_latents, src=0) + else: + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = latent_model_input.to(prompt_embeds.dtype) + audio_latent_model_input = ( + torch.cat([audio_latents] * 2) if self.do_classifier_free_guidance else audio_latents + ) + audio_latent_model_input = audio_latent_model_input.to(prompt_embeds.dtype) + + timestep = t.expand(latent_model_input.shape[0]) + video_timestep = timestep.unsqueeze(-1) * (1 - conditioning_mask) + + with self._transformer_cache_context("cond_uncond"): + noise_pred_video, noise_pred_audio = self.transformer( + hidden_states=latent_model_input, + audio_hidden_states=audio_latent_model_input, + encoder_hidden_states=connector_prompt_embeds, + audio_encoder_hidden_states=connector_audio_prompt_embeds, + timestep=video_timestep, + audio_timestep=timestep, + encoder_attention_mask=connector_attention_mask, + audio_encoder_attention_mask=connector_attention_mask, + num_frames=latent_num_frames, + height=latent_height, + width=latent_width, + fps=frame_rate, + audio_num_frames=audio_num_frames, + video_coords=video_coords, + audio_coords=audio_coords, + attention_kwargs=attention_kwargs, + return_dict=False, + ) + noise_pred_video = noise_pred_video.float() + noise_pred_audio = noise_pred_audio.float() + + if self.do_classifier_free_guidance: + noise_pred_video_uncond, noise_pred_video_text = noise_pred_video.chunk(2) + noise_pred_video = noise_pred_video_uncond + guidance_scale * ( + noise_pred_video_text - noise_pred_video_uncond + ) + + noise_pred_audio_uncond, noise_pred_audio_text = noise_pred_audio.chunk(2) + noise_pred_audio = noise_pred_audio_uncond + guidance_scale * ( + noise_pred_audio_text - noise_pred_audio_uncond + ) + + if guidance_rescale > 0: + noise_pred_video = rescale_noise_cfg( + noise_pred_video, noise_pred_video_text, guidance_rescale=guidance_rescale + ) + noise_pred_audio = rescale_noise_cfg( + noise_pred_audio, noise_pred_audio_text, guidance_rescale=guidance_rescale + ) - if 
get_classifier_free_guidance_rank() == 0: latents = self._step_video_latents_i2v( noise_pred_video, latents, @@ -577,79 +731,10 @@ def forward( latent_height, latent_width, ) - audio_latents = audio_scheduler.step(noise_pred_audio, t, audio_latents, return_dict=False)[0] - - cfg_group = get_cfg_group() - latents = latents.contiguous() - audio_latents = audio_latents.contiguous() - cfg_group.broadcast(latents, src=0) - cfg_group.broadcast(audio_latents, src=0) - else: - latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents - latent_model_input = latent_model_input.to(prompt_embeds.dtype) - audio_latent_model_input = ( - torch.cat([audio_latents] * 2) if self.do_classifier_free_guidance else audio_latents - ) - audio_latent_model_input = audio_latent_model_input.to(prompt_embeds.dtype) - - timestep = t.expand(latent_model_input.shape[0]) - video_timestep = timestep.unsqueeze(-1) * (1 - conditioning_mask) - - with self._transformer_cache_context("cond_uncond"): - noise_pred_video, noise_pred_audio = self.transformer( - hidden_states=latent_model_input, - audio_hidden_states=audio_latent_model_input, - encoder_hidden_states=connector_prompt_embeds, - audio_encoder_hidden_states=connector_audio_prompt_embeds, - timestep=video_timestep, - audio_timestep=timestep, - encoder_attention_mask=connector_attention_mask, - audio_encoder_attention_mask=connector_attention_mask, - num_frames=latent_num_frames, - height=latent_height, - width=latent_width, - fps=frame_rate, - audio_num_frames=audio_num_frames, - video_coords=video_coords, - audio_coords=audio_coords, - attention_kwargs=attention_kwargs, - return_dict=False, - ) - noise_pred_video = noise_pred_video.float() - noise_pred_audio = noise_pred_audio.float() - if self.do_classifier_free_guidance: - noise_pred_video_uncond, noise_pred_video_text = noise_pred_video.chunk(2) - noise_pred_video = noise_pred_video_uncond + guidance_scale * ( - noise_pred_video_text - noise_pred_video_uncond - ) - - noise_pred_audio_uncond, noise_pred_audio_text = noise_pred_audio.chunk(2) - noise_pred_audio = noise_pred_audio_uncond + guidance_scale * ( - noise_pred_audio_text - noise_pred_audio_uncond - ) - - if guidance_rescale > 0: - noise_pred_video = rescale_noise_cfg( - noise_pred_video, noise_pred_video_text, guidance_rescale=guidance_rescale - ) - noise_pred_audio = rescale_noise_cfg( - noise_pred_audio, noise_pred_audio_text, guidance_rescale=guidance_rescale - ) - - latents = self._step_video_latents_i2v( - noise_pred_video, - latents, - t, - latent_num_frames, - latent_height, - latent_width, - ) - - audio_latents = audio_scheduler.step(noise_pred_audio, t, audio_latents, return_dict=False)[0] + audio_latents = audio_scheduler.step(noise_pred_audio, t, audio_latents, return_dict=False)[0] - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - pass + pbar.update() latents = self._unpack_latents( latents, @@ -703,3 +788,162 @@ def forward( return DiffusionOutput(output=(video, audio)) return DiffusionOutput(output=(video, audio)) + + +class LTX2ImageToVideoTwoStagesPipeline(nn.Module): + """LTXImageToVideoTwoStagesPipeline is for two stages image to video generation""" + + support_image_input = True + + def __init__( + self, + *, + od_config: OmniDiffusionConfig, + prefix: str = "", + ): + super().__init__() + + self.device = get_local_device() + self.dtype = getattr(od_config, "dtype", torch.bfloat16) + self.model_path = od_config.model + self.distilled = False + # User 
provided model path may contain '/' in the end and basename function + # will not return the expected directory name, so we need to remove it by normpath + if "distilled" in os.path.basename(os.path.normpath(self.model_path)): + self.distilled = True + else: + raise NotImplementedError(f"{self.model_path} is not supported for {self.__class__.__name__}.") + + self.pipe = LTX2ImageToVideoPipeline(od_config=od_config, prefix=prefix) + self.upsample_pipe = LTX2LatentUpsamplePipeline( + vae=self.pipe.vae, + od_config=od_config, + ) + + self.lora_manager = DiffusionLoRAManager( + pipeline=self.pipe, + device=self.device, + dtype=self.dtype, + max_cached_adapters=od_config.max_cpu_loras, + ) + + self.weights_sources = [ + DiffusersPipelineLoader.ComponentSource( + model_or_path=od_config.model, + subfolder="transformer", + revision=None, + prefix="pipe.transformer.", + fall_back_to_pt=True, + ), + ] + + @torch.no_grad() + def forward( + self, + req: OmniDiffusionRequest, + image: PIL.Image.Image | torch.Tensor | None = None, + prompt: str | list[str] | None = None, + negative_prompt: str | list[str] | None = None, + height: int | None = None, + width: int | None = None, + num_frames: int | None = None, + frame_rate: float | None = None, + num_inference_steps: int | None = None, + sigmas: list[float] | None = None, + timesteps: list[int] | None = None, + guidance_scale: float = 4.0, + guidance_rescale: float = 0.0, + noise_scale: float = 0.0, + num_videos_per_prompt: int | None = 1, + generator: torch.Generator | list[torch.Generator] | None = None, + latents: torch.Tensor | None = None, + audio_latents: torch.Tensor | None = None, + prompt_embeds: torch.Tensor | None = None, + negative_prompt_embeds: torch.Tensor | None = None, + prompt_attention_mask: torch.Tensor | None = None, + negative_prompt_attention_mask: torch.Tensor | None = None, + decode_timestep: float | list[float] = 0.0, + decode_noise_scale: float | list[float] | None = None, + output_type: str = "np", + return_dict: bool = True, + attention_kwargs: dict[str, Any] | None = None, + max_sequence_length: int | None = None, + ): + video_latent, audio_latent = self.pipe( + req=req, + image=image, + prompt=prompt, + negative_prompt=negative_prompt, + height=height, + width=width, + num_frames=num_frames, + frame_rate=frame_rate, + num_inference_steps=num_inference_steps, + sigmas=DISTILLED_SIGMA_VALUES if self.distilled else None, + timesteps=timesteps, + guidance_scale=guidance_scale, + guidance_rescale=guidance_rescale, + noise_scale=noise_scale, + num_videos_per_prompt=num_videos_per_prompt, + generator=generator, + latents=latents, + audio_latents=audio_latents, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + prompt_attention_mask=prompt_attention_mask, + negative_prompt_attention_mask=negative_prompt_attention_mask, + decode_timestep=decode_timestep, + decode_noise_scale=decode_noise_scale, + output_type="latent", + return_dict=return_dict, + attention_kwargs=attention_kwargs, + max_sequence_length=max_sequence_length, + ).output + + upscaled_video_latent = self.upsample_pipe( + latents=video_latent, + output_type="latent", + return_dict=False, + )[0] + + if not self.distilled: + # Load Stage 2 distilled LoRA + lora_path = f"{self.model_path}/ltx-2-19b-distilled-lora-384.safetensors" + lora_request = LoRARequest( + lora_name="stage_2_distilled", + lora_int_id=1, + lora_path=lora_path, + ) + self.lora_manager.set_active_adapter(lora_request, lora_scale=1.0) + + # Change scheduler to use Stage 2 
distilled sigmas as is + new_scheduler = FlowMatchEulerDiscreteScheduler.from_config( + self.pipe.scheduler.config, + use_dynamic_shifting=False, + shift_terminal=None, + ) + self.pipe.scheduler = new_scheduler + + stage_2_req = copy.copy(req) + stage_2_req.sampling_params = req.sampling_params.clone() + stage_2_req.sampling_params.num_inference_steps = 3 + + video, audio = self.pipe( + req=stage_2_req, + latents=upscaled_video_latent, + audio_latents=audio_latent, + prompt=prompt, + negative_prompt=negative_prompt, + noise_scale=STAGE_2_DISTILLED_SIGMA_VALUES[0], + sigmas=STAGE_2_DISTILLED_SIGMA_VALUES, + guidance_scale=1.0, + generator=generator, + output_type="np", + return_dict=False, + ).output + + return DiffusionOutput(output=(video, audio)) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) diff --git a/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_latent_upsample.py b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_latent_upsample.py new file mode 100644 index 0000000000..0c72a41d5e --- /dev/null +++ b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_latent_upsample.py @@ -0,0 +1,262 @@ +import os + +import torch +import torch.nn as nn +from diffusers import AutoencoderKLLTX2Video +from diffusers.image_processor import PipelineImageInput +from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_latents +from diffusers.utils.torch_utils import randn_tensor +from diffusers.video_processor import VideoProcessor +from vllm.logger import init_logger + +from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig +from vllm_omni.diffusion.distributed.utils import get_local_device + +logger = init_logger(__name__) + + +class LTX2LatentUpsamplePipeline(nn.Module): + def __init__( + self, + od_config: OmniDiffusionConfig, + vae: AutoencoderKLLTX2Video, + latent_upsampler: LTX2LatentUpsamplerModel = None, + ) -> None: + super().__init__() + + if vae is None: + raise ValueError("vae must be provided") + self.vae = vae + + self.device = get_local_device() + model = od_config.model + local_files_only = os.path.exists(model) + + if latent_upsampler is None: + # Use cpu context to create latent upsampler. 
The code k[:, None] @ k[None, :] in + # diffuser's BlurDownsample is not supported on GPU as k is type of torch.Int64 + with torch.device("cpu"): + latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained( + model, + subfolder="latent_upsampler", + torch_dtype=torch.bfloat16, + local_files_only=local_files_only, + ).to(self.device) + self.latent_upsampler = latent_upsampler + + self.vae_spatial_compression_ratio = ( + self.vae.spatial_compression_ratio if getattr(self, "vae", None) is not None else 32 + ) + self.vae_temporal_compression_ratio = ( + self.vae.temporal_compression_ratio if getattr(self, "vae", None) is not None else 8 + ) + self.video_processor = VideoProcessor(vae_scale_factor=self.vae_spatial_compression_ratio) + + def prepare_latents( + self, + video: torch.Tensor | None = None, + batch_size: int = 1, + num_frames: int = 121, + height: int = 512, + width: int = 768, + spatial_patch_size: int = 1, + temporal_patch_size: int = 1, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + generator: torch.Generator | None = None, + latents: torch.Tensor | None = None, + ) -> torch.Tensor: + if latents is not None: + if latents.ndim == 3: + # Convert token seq [B, S, D] to latent video [B, C, F, H, W] + latent_num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1 + latent_height = height // self.vae_spatial_compression_ratio + latent_width = width // self.vae_spatial_compression_ratio + latents = self._unpack_latents( + latents, latent_num_frames, latent_height, latent_width, spatial_patch_size, temporal_patch_size + ) + return latents.to(device=device, dtype=dtype) + + video = video.to(device=device, dtype=self.vae.dtype) + if isinstance(generator, list): + if len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + init_latents = [ + retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator[i]) for i in range(batch_size) + ] + else: + init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video] + + init_latents = torch.cat(init_latents, dim=0).to(dtype) + # NOTE: latent upsampler operates on the unnormalized latents, so don't normalize here + # init_latents = self._normalize_latents(init_latents, self.vae.latents_mean, self.vae.latents_std) + return init_latents + + def adain_filter_latent(self, latents: torch.Tensor, reference_latents: torch.Tensor, factor: float = 1.0): + result = latents.clone() + + for i in range(latents.size(0)): + for c in range(latents.size(1)): + r_sd, r_mean = torch.std_mean(reference_latents[i, c], dim=None) # index by original dim order + i_sd, i_mean = torch.std_mean(result[i, c], dim=None) + + result[i, c] = ((result[i, c] - i_mean) / i_sd) * r_sd + r_mean + + result = torch.lerp(latents, result, factor) + return result + + def tone_map_latents(self, latents: torch.Tensor, compression: float) -> torch.Tensor: + # Remap [0-1] to [0-0.75] and apply sigmoid compression in one shot + scale_factor = compression * 0.75 + abs_latents = torch.abs(latents) + + # Sigmoid compression: sigmoid shifts large values toward 0.2, small values stay ~1.0 + # When scale_factor=0, sigmoid term vanishes, when scale_factor=0.75, full effect + sigmoid_term = torch.sigmoid(4.0 * scale_factor * (abs_latents - 1.0)) + scales = 1.0 - 0.8 * scale_factor * sigmoid_term + + filtered = latents * scales + return filtered + + @staticmethod + # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._denormalize_latents + def _denormalize_latents( + latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0 + ) -> torch.Tensor: + # Denormalize latents across the channel dimension [B, C, F, H, W] + latents_mean = latents_mean.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents_std = latents_std.view(1, -1, 1, 1, 1).to(latents.device, latents.dtype) + latents = latents * latents_std / scaling_factor + latents_mean + return latents + + @staticmethod + # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._unpack_latents + def _unpack_latents( + latents: torch.Tensor, num_frames: int, height: int, width: int, patch_size: int = 1, patch_size_t: int = 1 + ) -> torch.Tensor: + # Packed latents of shape [B, S, D] (S is the effective video sequence length, D is the effective feature dimensions) # noqa + # are unpacked and reshaped into a video tensor of shape [B, C, F, H, W]. This is the inverse operation of + # what happens in the `_pack_latents` method. 
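+        # Shape walkthrough (assuming the default patch sizes of 1): [B, S, D] with
+        # S = F*H*W and D = C is reshaped to [B, F, H, W, C, 1, 1, 1], permuted to
+        # [B, C, F, 1, H, 1, W, 1], and the trailing singleton patch dims are then
+        # flattened back into W, H and F, giving [B, C, F, H, W].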
+ batch_size = latents.size(0) + latents = latents.reshape(batch_size, num_frames, height, width, -1, patch_size_t, patch_size, patch_size) + latents = latents.permute(0, 4, 1, 5, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(2, 3) + return latents + + def check_inputs(self, video, height, width, latents, tone_map_compression_ratio): + if height % self.vae_spatial_compression_ratio != 0 or width % self.vae_spatial_compression_ratio != 0: + raise ValueError(f"`height` and `width` have to be divisible by 32 but are {height} and {width}.") + + if video is not None and latents is not None: + raise ValueError("Only one of `video` or `latents` can be provided.") + if video is None and latents is None: + raise ValueError("One of `video` or `latents` has to be provided.") + + if not (0 <= tone_map_compression_ratio <= 1): + raise ValueError("`tone_map_compression_ratio` must be in the range [0, 1]") + + def forward( + self, + video: list[PipelineImageInput] | None = None, + height: int = 512, + width: int = 768, + num_frames: int = 121, + spatial_patch_size: int = 1, + temporal_patch_size: int = 1, + latents: torch.Tensor | None = None, + latents_normalized: bool = False, + decode_timestep: float | list[float] = 0.0, + decode_noise_scale: float | list[float] | None = None, + adain_factor: float = 0.0, + tone_map_compression_ratio: float = 0.0, + generator: torch.Generator | list[torch.Generator] | None = None, + output_type: str | None = "pil", + return_dict: bool = True, + ): + self.check_inputs( + video=video, + height=height, + width=width, + latents=latents, + tone_map_compression_ratio=tone_map_compression_ratio, + ) + + if video is not None: + # Batched video input is not yet tested/supported. TODO: take a look later + batch_size = 1 + else: + batch_size = latents.shape[0] + device = self.device + + if video is not None: + num_frames = len(video) + if num_frames % self.vae_temporal_compression_ratio != 1: + num_frames = num_frames // self.vae_temporal_compression_ratio * self.vae_temporal_compression_ratio + 1 + video = video[:num_frames] + logger.warning( + f"Video length expected to be of the form `k * {self.vae_temporal_compression_ratio} + 1` but is {len(video)}. Truncating to {num_frames} frames." 
# noqa + ) + video = self.video_processor.preprocess_video(video, height=height, width=width) + video = video.to(device=device, dtype=torch.float32) + + latents_supplied = latents is not None + latents = self.prepare_latents( + video=video, + batch_size=batch_size, + num_frames=num_frames, + height=height, + width=width, + spatial_patch_size=spatial_patch_size, + temporal_patch_size=temporal_patch_size, + dtype=torch.float32, + device=device, + generator=generator, + latents=latents, + ) + + if latents_supplied and latents_normalized: + latents = self._denormalize_latents( + latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor + ) + latents = latents.to(self.latent_upsampler.dtype) + latents_upsampled = self.latent_upsampler(latents) + + if adain_factor > 0.0: + latents = self.adain_filter_latent(latents_upsampled, latents, adain_factor) + else: + latents = latents_upsampled + + if tone_map_compression_ratio > 0.0: + latents = self.tone_map_latents(latents, tone_map_compression_ratio) + + if output_type == "latent": + video = latents + else: + if not self.vae.config.timestep_conditioning: + timestep = None + else: + noise = randn_tensor(latents.shape, generator=generator, device=device, dtype=latents.dtype) + if not isinstance(decode_timestep, list): + decode_timestep = [decode_timestep] * batch_size + if decode_noise_scale is None: + decode_noise_scale = decode_timestep + elif not isinstance(decode_noise_scale, list): + decode_noise_scale = [decode_noise_scale] * batch_size + + timestep = torch.tensor(decode_timestep, device=device, dtype=latents.dtype) + decode_noise_scale = torch.tensor(decode_noise_scale, device=device, dtype=latents.dtype)[ + :, None, None, None, None + ] + latents = (1 - decode_noise_scale) * latents + decode_noise_scale * noise + + video = self.vae.decode(latents, timestep, return_dict=False)[0] + video = self.video_processor.postprocess_video(video, output_type=output_type) + + if not return_dict: + return (video,) + + return DiffusionOutput(output=(video,)) diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index dcd2272375..db88057227 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -12,6 +12,7 @@ from vllm_omni.diffusion.distributed.sp_plan import SequenceParallelConfig, get_sp_plan_from_model from vllm_omni.diffusion.forward_context import get_forward_context from vllm_omni.diffusion.hooks.sequence_parallel import apply_sequence_parallel +from vllm_omni.diffusion.utils.tf_utils import find_module_with_attr logger = init_logger(__name__) @@ -72,6 +73,16 @@ "pipeline_ltx2_image2video", "LTX2ImageToVideoPipeline", ), + "LTX2TwoStagesPipeline": ( + "ltx2", + "pipeline_ltx2", + "LTX2TwoStagesPipeline", + ), + "LTX2ImageToVideoTwoStagesPipeline": ( + "ltx2", + "pipeline_ltx2_image2video", + "LTX2ImageToVideoTwoStagesPipeline", + ), "StableAudioPipeline": ( "stable_audio", "pipeline_stable_audio", @@ -266,7 +277,12 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) - for attr in transformer_attrs: if not hasattr(model, attr): - continue + # Some pipeline like LTX2TwoStagesPipeline have recursive + # modules that have the transformer + module = find_module_with_attr(model, attr) + if module is None: + continue + model = module transformer = getattr(model, attr) if transformer is None: @@ -323,7 +339,9 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) - "WanPipeline": "get_wan22_post_process_func", 
"WanVACEPipeline": "get_wan22_vace_post_process_func", "LTX2Pipeline": "get_ltx2_post_process_func", + "LTX2TwoStagesPipeline": "get_ltx2_post_process_func", "LTX2ImageToVideoPipeline": "get_ltx2_post_process_func", + "LTX2ImageToVideoTwoStagesPipeline": "get_ltx2_post_process_func", "StableAudioPipeline": "get_stable_audio_post_process_func", "WanImageToVideoPipeline": "get_wan22_i2v_post_process_func", "LongCatImagePipeline": "get_longcat_image_post_process_func", diff --git a/vllm_omni/diffusion/utils/tf_utils.py b/vllm_omni/diffusion/utils/tf_utils.py index 44a7880445..745410ec2f 100644 --- a/vllm_omni/diffusion/utils/tf_utils.py +++ b/vllm_omni/diffusion/utils/tf_utils.py @@ -52,3 +52,27 @@ def get_transformer_config_kwargs( pass return filtered_params + + +def find_module_with_attr(model, attr_name="transformer"): + """ + This function searches for a module in the model that has the specified attribute. + If the model itself has the attribute, it returns the model. + If none of the modules have the attribute, it returns None. + """ + if hasattr(model, attr_name): + return model + + for _, child in model.named_children(): + if hasattr(child, attr_name): + return child + + return None + + +def get_transformer_from_pipeline(pipeline: Any): + pipe = find_module_with_attr(pipeline, attr_name="transformer") + + if pipe is not None: + return pipe.transformer + return None From 515d15ef87141198f22db1a2f9494452d0348efe Mon Sep 17 00:00:00 2001 From: Lidang Jiang <119769478+Lidang-Jiang@users.noreply.github.com> Date: Fri, 3 Apr 2026 23:59:07 +0800 Subject: [PATCH 037/204] [Cleanup] Replace bare print() with logger and use specific exception types (#2228) Signed-off-by: Lidang Jiang Signed-off-by: Lidang-Jiang Co-authored-by: Claude Opus 4.6 --- .../diffusion/attention/backends/ring/ring_utils.py | 9 ++++++--- vllm_omni/diffusion/diffusion_engine.py | 2 +- vllm_omni/diffusion/models/dreamid_omni/fusion.py | 7 +++++-- .../models/hunyuan_image_3/hunyuan_image_3_tokenizer.py | 5 ++++- .../hunyuan_image_3/hunyuan_image_3_transformer.py | 2 +- vllm_omni/model_executor/models/cosyvoice3/utils.py | 2 +- .../models/qwen3_tts/tokenizer_25hz/vq/core_vq.py | 5 ++++- 7 files changed, 22 insertions(+), 10 deletions(-) diff --git a/vllm_omni/diffusion/attention/backends/ring/ring_utils.py b/vllm_omni/diffusion/attention/backends/ring/ring_utils.py index c256f62cbd..67f71562bf 100644 --- a/vllm_omni/diffusion/attention/backends/ring/ring_utils.py +++ b/vllm_omni/diffusion/attention/backends/ring/ring_utils.py @@ -5,6 +5,9 @@ import torch import torch.nn.functional as F +from vllm.logger import init_logger + +logger = init_logger(__name__) __all__ = ["update_out_and_lse", "flatten_varlen_lse", "unflatten_varlen_lse"] @@ -79,9 +82,9 @@ def _update_out_and_lse( out = out - F.sigmoid(block_lse - lse) * (out - block_out) lse = lse - F.logsigmoid(lse - block_lse) except RuntimeError as e: - print(f"ERROR in _update_out_and_lse: {e}") - print(f"out: {out.shape}, lse: {lse.shape}") - print(f"block_out: {block_out.shape}, block_lse: {block_lse.shape}") + logger.error("_update_out_and_lse failed: %s", e) + logger.error("out: %s, lse: %s", out.shape, lse.shape) + logger.error("block_out: %s, block_lse: %s", block_out.shape, block_lse.shape) # raise e raise e diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index 308c8cef80..05008d7e91 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -115,7 +115,7 @@ def step(self, 
request: OmniDiffusionRequest) -> list[OmniRequestOutput]: if output.aborted: raise DiffusionRequestAbortedError(output.abort_message or "Diffusion request aborted.") if output.error: - raise Exception(f"{output.error}") + raise RuntimeError(f"{output.error}") logger.info("Generation completed successfully.") if output.output is None: diff --git a/vllm_omni/diffusion/models/dreamid_omni/fusion.py b/vllm_omni/diffusion/models/dreamid_omni/fusion.py index 2a4e485fa6..a534f5a76f 100644 --- a/vllm_omni/diffusion/models/dreamid_omni/fusion.py +++ b/vllm_omni/diffusion/models/dreamid_omni/fusion.py @@ -1,5 +1,6 @@ import torch import torch.nn as nn +from vllm.logger import init_logger from vllm_omni.diffusion.attention.layer import Attention @@ -11,6 +12,8 @@ from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.models.dreamid_omni.wan2_2 import WanModel, rope_apply +logger = init_logger(__name__) + class FusionModel(nn.Module): def __init__(self, video_config=None, audio_config=None): @@ -22,14 +25,14 @@ def __init__(self, video_config=None, audio_config=None): else: has_video = False self.video_model = None - print("Warning: No video model is provided!") + logger.warning("No video model is provided!") if audio_config is not None: self.audio_model = WanModel(**audio_config) else: has_audio = False self.audio_model = None - print("Warning: No audio model is provided!") + logger.warning("No audio model is provided!") if has_video and has_audio: assert len(self.video_model.blocks) == len(self.audio_model.blocks) diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_tokenizer.py b/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_tokenizer.py index 360904b5e4..ce563f7115 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_tokenizer.py +++ b/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_tokenizer.py @@ -11,9 +11,12 @@ import torch.nn.functional as F from diffusers.utils.outputs import BaseOutput from transformers import AutoTokenizer +from vllm.logger import init_logger from .hunyuan_image_3_transformer import ImageInfo, JointImageInfo, default +logger = init_logger(__name__) + class TokenizerEncodeOutput(BaseOutput): tokens: torch.Tensor | None = None @@ -121,7 +124,7 @@ def encode_text( elif isinstance(uncond_enabled, bool): uncond_enabled = [uncond_enabled] * len(texts) if len(uncond_enabled) != len(texts): - print(uncond_enabled, texts) + logger.debug("uncond_enabled=%s, texts=%s", uncond_enabled, texts) assert len(uncond_enabled) == len(texts), ( f"Length of uncond_flags should be equal to the number of texts, " f"but got {len(uncond_enabled)} and {len(texts)}." 
diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py index 3d670809ba..bc81ca9c3e 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py +++ b/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py @@ -2036,7 +2036,7 @@ def contains_unexpected_keyword(name, keywords): for name, loaded_weight in weights: # print(f"Loading weight name: {name}, tp_rank: {tp_rank}", flush=True) if contains_unexpected_keyword(name, unexpected_keywords): - print(f"Skipping unexpected weight name: {name}") + logger.warning("Skipping unexpected weight name: %s", name) continue if "rotary_emb.inv_freq" in name: continue diff --git a/vllm_omni/model_executor/models/cosyvoice3/utils.py b/vllm_omni/model_executor/models/cosyvoice3/utils.py index e1310cd3b1..ca98e9aefb 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/utils.py +++ b/vllm_omni/model_executor/models/cosyvoice3/utils.py @@ -180,7 +180,7 @@ def log_mel_spectrogram( HOP_LENGTH = 160 if not torch.is_tensor(audio): - raise Exception(f"audio is not tensor {type(audio)}") + raise TypeError(f"audio is not tensor {type(audio)}") if device is not None: audio = audio.to(device) diff --git a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/core_vq.py b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/core_vq.py index 9c103a851e..5609abb394 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/core_vq.py +++ b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/core_vq.py @@ -40,6 +40,9 @@ import torch.nn.functional as F from einops import rearrange, repeat from torch import nn +from vllm.logger import init_logger + +logger = init_logger(__name__) def round_up_multiple(num, mult): @@ -175,7 +178,7 @@ def expire_codes_(self, batch_samples): if not torch.any(expired_codes): return else: - print(f"VQ expire infos: num_expire={sum(expired_codes)}, cluster_size[:5]={cluster_size[:5]}") + logger.info("VQ expire infos: num_expire=%s, cluster_size[:5]=%s", sum(expired_codes), cluster_size[:5]) batch_samples = rearrange(batch_samples, "... d -> (...) 
d") self.replace_(batch_samples, mask=expired_codes) From 10db95f9a9e9f718db289fce2bd769a9888a497f Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Fri, 3 Apr 2026 10:05:19 -0600 Subject: [PATCH 038/204] [Bugfix] Fix Flux2 Dev Guidance (#2433) Signed-off-by: Alex Brooks --- vllm_omni/diffusion/models/flux2/flux2_transformer.py | 2 ++ vllm_omni/diffusion/models/flux2/pipeline_flux2.py | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/models/flux2/flux2_transformer.py b/vllm_omni/diffusion/models/flux2/flux2_transformer.py index 116e499b0e..0a4452197f 100644 --- a/vllm_omni/diffusion/models/flux2/flux2_transformer.py +++ b/vllm_omni/diffusion/models/flux2/flux2_transformer.py @@ -578,9 +578,11 @@ def __init__( guidance_embeds: bool = True, ): super().__init__() + self.guidance_embeds = guidance_embeds self.stacked_params_mapping = None self.out_channels = out_channels or in_channels self.inner_dim = num_attention_heads * attention_head_dim + self.config = SimpleNamespace( patch_size=patch_size, in_channels=in_channels, diff --git a/vllm_omni/diffusion/models/flux2/pipeline_flux2.py b/vllm_omni/diffusion/models/flux2/pipeline_flux2.py index 1da0f0cdaf..c5bf9b77d9 100644 --- a/vllm_omni/diffusion/models/flux2/pipeline_flux2.py +++ b/vllm_omni/diffusion/models/flux2/pipeline_flux2.py @@ -928,6 +928,7 @@ def forward( self._attention_kwargs = attention_kwargs self._current_timestep = None self._interrupt = False + guidance_tensor = None # 2. Define call parameters if prompt is not None and isinstance(prompt, str): @@ -1017,6 +1018,11 @@ def forward( ) self._num_timesteps = len(timesteps) + # handle guidance + if self.transformer.guidance_embeds is not None: + guidance_tensor = torch.full([1], self.guidance_scale, device=device, dtype=torch.float32) + guidance_tensor = guidance_tensor.expand(latents.shape[0]) + # 7. Denoising loop # We set the index here to remove DtoH sync, helpful especially during compilation. 
# Check out more details here: https://github.com/huggingface/diffusers/pull/11696 @@ -1038,7 +1044,7 @@ def forward( noise_pred = self.transformer( hidden_states=latent_model_input, # (B, image_seq_len, C) timestep=timestep / 1000, - guidance=None, + guidance=guidance_tensor, encoder_hidden_states=prompt_embeds, txt_ids=text_ids, # B, text_seq_len, 4 img_ids=latent_image_ids, # B, image_seq_len, 4 From 0e83ebe1d47cdc605637db5f4ef5c8765626f0a5 Mon Sep 17 00:00:00 2001 From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Date: Fri, 3 Apr 2026 16:41:20 -0400 Subject: [PATCH 039/204] [OmniVoice] Add two-stage TTS serving support (#2463) Signed-off-by: linyueqian --- .buildkite/test-ready.yml | 23 + .../offline_inference/omnivoice/README.md | 73 +++ .../offline_inference/omnivoice/end2end.py | 164 +++++ examples/online_serving/omnivoice/README.md | 131 ++++ .../online_serving/omnivoice/run_server.sh | 19 + .../online_serving/omnivoice/speech_client.py | 84 +++ pyproject.toml | 1 + tests/e2e/offline_inference/test_omnivoice.py | 84 +++ tests/e2e/online_serving/test_omnivoice.py | 84 +++ .../diffusion/models/omnivoice/__init__.py | 2 + .../models/omnivoice/pipeline_omnivoice.py | 195 ++++++ vllm_omni/diffusion/registry.py | 11 + vllm_omni/engine/arg_utils.py | 2 + vllm_omni/entrypoints/openai/api_server.py | 7 + .../entrypoints/openai/serving_speech.py | 109 +++- .../models/omnivoice/__init__.py | 2 + .../model_executor/models/omnivoice/config.py | 81 +++ .../models/omnivoice/duration.py | 281 +++++++++ .../models/omnivoice/omnivoice.py | 520 ++++++++++++++++ .../models/omnivoice/omnivoice_decoder.py | 211 +++++++ .../models/omnivoice/omnivoice_generator.py | 588 ++++++++++++++++++ vllm_omni/model_executor/models/registry.py | 5 + .../stage_configs/omnivoice.yaml | 20 + .../stage_input_processors/omnivoice.py | 41 ++ 24 files changed, 2737 insertions(+), 1 deletion(-) create mode 100644 examples/offline_inference/omnivoice/README.md create mode 100644 examples/offline_inference/omnivoice/end2end.py create mode 100644 examples/online_serving/omnivoice/README.md create mode 100755 examples/online_serving/omnivoice/run_server.sh create mode 100644 examples/online_serving/omnivoice/speech_client.py create mode 100644 tests/e2e/offline_inference/test_omnivoice.py create mode 100644 tests/e2e/online_serving/test_omnivoice.py create mode 100644 vllm_omni/diffusion/models/omnivoice/__init__.py create mode 100644 vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py create mode 100644 vllm_omni/model_executor/models/omnivoice/__init__.py create mode 100644 vllm_omni/model_executor/models/omnivoice/config.py create mode 100644 vllm_omni/model_executor/models/omnivoice/duration.py create mode 100644 vllm_omni/model_executor/models/omnivoice/omnivoice.py create mode 100644 vllm_omni/model_executor/models/omnivoice/omnivoice_decoder.py create mode 100644 vllm_omni/model_executor/models/omnivoice/omnivoice_generator.py create mode 100644 vllm_omni/model_executor/stage_configs/omnivoice.yaml create mode 100644 vllm_omni/model_executor/stage_input_processors/omnivoice.py diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 985b50fc72..1151da4672 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -320,6 +320,29 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" + - label: "OmniVoice E2E Test" + timeout_in_minutes: 20 + depends_on: upload-ready-pipeline + commands: + - | + timeout 20m bash -c ' + export VLLM_LOGGING_LEVEL=DEBUG + export 
VLLM_WORKER_MULTIPROC_METHOD=spawn + pytest -s -v tests/e2e/online_serving/test_omnivoice.py -m "core_model" --run-level "core_model" + ' + agents: + queue: "gpu_1_queue" + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + - label: "Voxtral-TTS E2E Test" timeout_in_minutes: 20 depends_on: upload-ready-pipeline diff --git a/examples/offline_inference/omnivoice/README.md b/examples/offline_inference/omnivoice/README.md new file mode 100644 index 0000000000..d804b61b57 --- /dev/null +++ b/examples/offline_inference/omnivoice/README.md @@ -0,0 +1,73 @@ +# OmniVoice + +This directory contains an offline demo for running OmniVoice TTS models with vLLM Omni. It generates speech from text and saves WAV files locally. + +## Model Overview + +[OmniVoice](https://huggingface.co/k2-fsa/OmniVoice) is a zero-shot multilingual TTS model supporting 600+ languages. It uses a diffusion language model (Qwen3-0.6B backbone) with iterative masked unmasking to generate speech. + +Three inference modes are supported: + +- **Auto Voice**: Generate speech without any reference — the model picks a voice automatically. +- **Voice Clone**: Clone a voice from a reference audio + transcription. +- **Voice Design**: Control voice style via natural language instruction (e.g., "female, low pitch, british accent"). + +## Setup + +Ensure the model is downloaded: + +```bash +huggingface-cli download k2-fsa/OmniVoice +``` + +> **Note:** Voice cloning requires `transformers>=5.3.0` for `HiggsAudioV2TokenizerModel`. Auto voice and voice design modes work with `transformers>=4.57.0`. + +## Quick Start + +Auto voice (text only): + +```bash +python end2end.py --model k2-fsa/OmniVoice --text "Hello, this is a test." +``` + +Voice design (with style instruction): + +```bash +python end2end.py --model k2-fsa/OmniVoice \ + --text "Hello, this is a test." \ + --instruct "female, low pitch, british accent" +``` + +Voice clone (with reference audio): + +```bash +python end2end.py --model k2-fsa/OmniVoice \ + --text "Hello, this is a test." \ + --ref-audio ref.wav \ + --ref-text "This is the reference transcription." +``` + +## Language Support + +Specify a language for improved quality: + +```bash +python end2end.py --model k2-fsa/OmniVoice \ + --text "你好,这是一个测试。" \ + --lang zh +``` + +## Architecture + +OmniVoice uses a two-stage pipeline: + +- **Stage 0 (Generator)**: Qwen3-0.6B transformer with 32-step iterative unmasking and classifier-free guidance. Generates 8-codebook audio tokens from text. +- **Stage 1 (Decoder)**: HiggsAudioV2 RVQ quantizer + DAC acoustic decoder. Converts tokens to 24kHz waveform. + +Both stages use `GPUGenerationWorker` with `OmniGenerationScheduler`. + +## Notes + +- Output audio is saved to `output.wav` by default. Use `--output` to change the path. +- The model estimates duration from text automatically via `RuleDurationEstimator`. +- Use `--stage-init-timeout` to increase the stage initialization timeout for first-time model downloads. 
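
## Programmatic Usage

The CLI above wraps the offline `Omni` API. Below is a minimal sketch of the same flow; reference-audio inputs, the tensor-valued audio case, and the fallback to per-output `multimodal_output` are omitted here — see `end2end.py` for the complete handling. The stage config path assumes you run from the repository root.

```python
import soundfile as sf

from vllm_omni.entrypoints.omni import Omni
from vllm_omni.inputs.data import OmniDiffusionSamplingParams

# Stage config path is relative to the repository root.
omni = Omni(
    model="k2-fsa/OmniVoice",
    stage_configs_path="vllm_omni/model_executor/stage_configs/omnivoice.yaml",
    trust_remote_code=True,
)

outputs = list(
    omni.generate(
        {"prompt": "Hello, this is a test."},
        sampling_params_list=[OmniDiffusionSamplingParams()],
    )
)

# The final output carries the synthesized audio and sample rate.
mm = outputs[-1].request_output.multimodal_output
sf.write("output.wav", mm["audio"], mm.get("sr", 24000))

omni.close()
```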
diff --git a/examples/offline_inference/omnivoice/end2end.py b/examples/offline_inference/omnivoice/end2end.py new file mode 100644 index 0000000000..b41379b011 --- /dev/null +++ b/examples/offline_inference/omnivoice/end2end.py @@ -0,0 +1,164 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""End-to-end OmniVoice TTS inference via vLLM-Omni. + +Supports: +- Auto voice mode: text only → generated speech +- Voice cloning mode: text + reference audio → cloned voice speech + +Usage: + # Auto voice + python end2end.py --model k2-fsa/OmniVoice --text "Hello world" + + # Voice cloning + python end2end.py --model k2-fsa/OmniVoice --text "Hello" \ + --ref-audio ref.wav --ref-text "reference transcription" +""" + +import argparse +import os + +import numpy as np +import soundfile as sf + +from vllm_omni.entrypoints.omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams + + +def run_e2e(): + parser = argparse.ArgumentParser(description="OmniVoice E2E TTS inference") + parser.add_argument( + "--model", + type=str, + default="k2-fsa/OmniVoice", + help="Model name or path (HuggingFace or local)", + ) + parser.add_argument( + "--stage-config", + type=str, + default="vllm_omni/model_executor/stage_configs/omnivoice.yaml", + ) + parser.add_argument( + "--text", + type=str, + default="Hello, this is a test of the OmniVoice text to speech system.", + ) + parser.add_argument( + "--ref-audio", + type=str, + default=None, + help="Reference audio for voice cloning (WAV file)", + ) + parser.add_argument( + "--ref-text", + type=str, + default=None, + help="Transcription of reference audio", + ) + parser.add_argument( + "--lang", + type=str, + default=None, + help="Language code (e.g., 'en', 'zh')", + ) + parser.add_argument( + "--instruct", + type=str, + default=None, + help="Voice design instruction (e.g., 'female, low pitch, british accent')", + ) + parser.add_argument( + "--output", + type=str, + default="output.wav", + help="Output audio file path", + ) + parser.add_argument( + "--stage-init-timeout", + type=int, + default=600, + help="Stage initialization timeout in seconds", + ) + args = parser.parse_args() + + if not os.path.exists(args.stage_config): + raise FileNotFoundError(f"Stage config not found: {args.stage_config}") + + print(f"Initializing OmniVoice with model={args.model}") + + omni = Omni( + model=args.model, + stage_configs_path=args.stage_config, + trust_remote_code=True, + log_stats=True, + ) + + print("Model initialized. 
Preparing inputs...") + + # Build prompt + mm_processor_kwargs = {} + multi_modal_data = {} + + if args.ref_audio: + if not os.path.exists(args.ref_audio): + raise FileNotFoundError(f"Reference audio not found: {args.ref_audio}") + + import librosa + + audio_signal, sr = librosa.load(args.ref_audio, sr=None) + multi_modal_data["audio"] = (audio_signal.astype(np.float32), sr) + mm_processor_kwargs["ref_text"] = args.ref_text or "" + mm_processor_kwargs["sample_rate"] = sr + + if args.lang: + mm_processor_kwargs["lang"] = args.lang + if args.instruct: + mm_processor_kwargs["instruct"] = args.instruct + + prompts = {"prompt": args.text} + if multi_modal_data: + prompts["multi_modal_data"] = multi_modal_data + if mm_processor_kwargs: + prompts["mm_processor_kwargs"] = mm_processor_kwargs + + sampling_params_list = [OmniDiffusionSamplingParams()] + + print(f"Generating speech for: {args.text}") + + outputs = list(omni.generate(prompts, sampling_params_list=sampling_params_list)) + + print(f"Received {len(outputs)} outputs.") + for i, output in enumerate(outputs): + try: + ro = output.request_output + if ro is None: + print("No request_output found.") + continue + + mm = getattr(ro, "multimodal_output", None) + if not mm and ro.outputs: + mm = getattr(ro.outputs[0], "multimodal_output", None) + + if mm: + print(f"Multimodal output keys: {mm.keys()}") + if "audio" in mm: + audio_out = mm["audio"] + sr = mm.get("sr", 24000) + if isinstance(audio_out, np.ndarray): + audio_np = audio_out + else: + audio_np = audio_out.cpu().numpy().squeeze() + out_path = args.output if i == 0 else f"output_{i}.wav" + sf.write(out_path, audio_np, sr) + print(f"Saved audio to {out_path} ({sr}Hz, {len(audio_np) / sr:.2f}s)") + else: + print("No multimodal output found.") + except Exception as e: + print(f"Error inspecting output: {e}") + + omni.close() + print("Done.") + + +if __name__ == "__main__": + run_e2e() diff --git a/examples/online_serving/omnivoice/README.md b/examples/online_serving/omnivoice/README.md new file mode 100644 index 0000000000..1d8f00421b --- /dev/null +++ b/examples/online_serving/omnivoice/README.md @@ -0,0 +1,131 @@ +# OmniVoice + +## Model Overview + +| Model | Description | +|-------|-------------| +| `k2-fsa/OmniVoice` | Zero-shot multilingual TTS (600+ languages) with diffusion language model (Qwen3-0.6B backbone) | + +> **Note:** Requires `transformers>=5.3.0` for voice cloning (HiggsAudioV2 tokenizer). Auto voice and voice design work with `transformers>=4.57.0`. 
+ +## Launch the Server + +```bash +vllm serve k2-fsa/OmniVoice \ + --omni \ + --port 8091 \ + --trust-remote-code +``` + +Or use the convenience script: + +```bash +./run_server.sh +``` + +## Send TTS Request + +### Using curl + +```bash +# Basic TTS (auto voice) +curl -X POST http://localhost:8091/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{ + "input": "Hello, how are you?", + "voice": "default", + "response_format": "wav" + }' --output output.wav +``` + +### Using Python + +```python +import httpx + +response = httpx.post( + "http://localhost:8091/v1/audio/speech", + json={ + "input": "Hello, how are you?", + "voice": "default", + "response_format": "wav", + }, + timeout=300.0, +) + +with open("output.wav", "wb") as f: + f.write(response.content) +``` + +### Using OpenAI SDK + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8091/v1", api_key="none") + +response = client.audio.speech.create( + model="k2-fsa/OmniVoice", + voice="default", + input="Hello, how are you?", +) + +response.stream_to_file("output.wav") +``` + +### Using the CLI Client + +```bash +cd examples/online_serving/omnivoice + +# Basic TTS +python speech_client.py --text "Hello, how are you?" + +# Specify language for improved quality +python speech_client.py --text "Bonjour, comment allez-vous?" --language French +``` + +The CLI client supports: + +- `--api-base`: API base URL (default: `http://localhost:8091`) +- `--model` (or `-m`): Model name (default: `k2-fsa/OmniVoice`) +- `--text`: Text to synthesize (required) +- `--response-format`: Audio format: wav, mp3, flac, pcm, aac, opus (default: wav) +- `--language`: Language hint (default: Auto) +- `--output` (or `-o`): Output file path (default: `omnivoice_output.wav`) + +## Inference Modes + +OmniVoice supports three inference modes. Currently, **auto voice** is supported through the online Speech API. Voice cloning and voice design are available via offline inference (see `examples/offline_inference/omnivoice/`). + +| Mode | Description | Online API | Offline | +|------|-------------|:----------:|:-------:| +| Auto Voice | Generate speech without reference | Yes | Yes | +| Voice Clone | Clone from reference audio + transcript | - | Yes | +| Voice Design | Control style via natural language instruction | - | Yes | + +## Architecture + +OmniVoice uses a single-stage diffusion pipeline: + +- **Stage 0 (Generator)**: Qwen3-0.6B transformer with 32-step iterative masked unmasking and classifier-free guidance. Generates 8-codebook audio tokens from text, then decodes to 24kHz waveform via HiggsAudioV2 RVQ quantizer + DAC acoustic decoder. + +## API Parameters + +OmniVoice uses the standard `/v1/audio/speech` endpoint. See the [Speech API reference](https://docs.vllm.ai/projects/vllm-omni/en/latest/serving/speech_api/) for full documentation. + +Key parameters: + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `input` | string | **required** | Text to synthesize | +| `voice` | string | "default" | Voice name | +| `response_format` | string | "wav" | Audio format: wav, mp3, flac, pcm, aac, opus | +| `speed` | float | 1.0 | Playback speed (0.25-4.0) | + +## Troubleshooting + +1. **TTS model did not produce audio output**: Ensure the model is fully downloaded (`huggingface-cli download k2-fsa/OmniVoice`) +2. **Connection refused**: Make sure the server is running on the correct port +3. **Out of memory**: Reduce `--gpu-memory-utilization` (default stage config uses 0.5) +4. 
**Slow first request**: The model performs warmup on first inference; subsequent requests are faster diff --git a/examples/online_serving/omnivoice/run_server.sh b/examples/online_serving/omnivoice/run_server.sh new file mode 100755 index 0000000000..abe9bb7989 --- /dev/null +++ b/examples/online_serving/omnivoice/run_server.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Launch vLLM-Omni server for OmniVoice TTS +# +# Usage: +# ./run_server.sh +# CUDA_VISIBLE_DEVICES=0 ./run_server.sh + +set -e + +MODEL="${MODEL:-k2-fsa/OmniVoice}" +PORT="${PORT:-8091}" + +echo "Starting OmniVoice server with model: $MODEL" + +vllm serve "$MODEL" \ + --host 0.0.0.0 \ + --port "$PORT" \ + --trust-remote-code \ + --omni diff --git a/examples/online_serving/omnivoice/speech_client.py b/examples/online_serving/omnivoice/speech_client.py new file mode 100644 index 0000000000..b8e6f38890 --- /dev/null +++ b/examples/online_serving/omnivoice/speech_client.py @@ -0,0 +1,84 @@ +"""Client for OmniVoice TTS via /v1/audio/speech endpoint. + +Examples: + # Basic TTS (auto voice) + python speech_client.py --text "Hello, how are you?" + + # Specify language + python speech_client.py --text "Bonjour, comment allez-vous?" --language French +""" + +import argparse + +import httpx + +DEFAULT_API_BASE = "http://localhost:8091" +DEFAULT_API_KEY = "EMPTY" + + +def run_tts(args) -> None: + """Generate speech via /v1/audio/speech API.""" + payload = { + "model": args.model, + "input": args.text, + "voice": "default", + "response_format": args.response_format, + } + + if args.language: + payload["language"] = args.language + + print(f"Model: {args.model}") + print(f"Text: {args.text}") + if args.language: + print(f"Language: {args.language}") + print("Generating audio...") + + api_url = f"{args.api_base}/v1/audio/speech" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {args.api_key}", + } + + with httpx.Client(timeout=300.0) as client: + response = client.post(api_url, json=payload, headers=headers) + + if response.status_code != 200: + print(f"Error: {response.status_code}") + print(response.text) + return + + try: + text = response.content.decode("utf-8") + if text.startswith('{"error"'): + print(f"Error: {text}") + return + except UnicodeDecodeError: + pass + + output_path = args.output or "omnivoice_output.wav" + with open(output_path, "wb") as f: + f.write(response.content) + print(f"Audio saved to: {output_path}") + + +def main(): + parser = argparse.ArgumentParser(description="OmniVoice TTS client") + parser.add_argument("--api-base", default=DEFAULT_API_BASE, help="API base URL") + parser.add_argument("--api-key", default=DEFAULT_API_KEY, help="API key") + parser.add_argument("--model", "-m", default="k2-fsa/OmniVoice", help="Model name") + parser.add_argument("--text", required=True, help="Text to synthesize") + parser.add_argument("--language", default=None, help="Language hint (e.g., English, Chinese, French)") + parser.add_argument( + "--response-format", + default="wav", + choices=["wav", "mp3", "flac", "pcm", "aac", "opus"], + help="Audio format (default: wav)", + ) + parser.add_argument("--output", "-o", default=None, help="Output file path") + args = parser.parse_args() + run_tts(args) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 15e7c6305a..e49aa6e325 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -223,3 +223,4 @@ extend-ignore-identifiers-re = [ ue = "ue" semantics = "semantics" fullset = "fullset" +Vai = "Vai" diff --git 
a/tests/e2e/offline_inference/test_omnivoice.py b/tests/e2e/offline_inference/test_omnivoice.py new file mode 100644 index 0000000000..4b093e357d --- /dev/null +++ b/tests/e2e/offline_inference/test_omnivoice.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +E2E offline tests for OmniVoice TTS model with text input and audio output. + +Uses GPUGenerationWorker for both stages (iterative unmasking + DAC decoder). +""" + +import os + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" + +from pathlib import Path + +import numpy as np +import pytest + +from tests.utils import hardware_test + +MODEL = "k2-fsa/OmniVoice" + + +def get_stage_config(): + return str( + Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "omnivoice.yaml" + ) + + +@pytest.mark.advanced_model +@pytest.mark.omni +@hardware_test(res={"cuda": "L4"}, num_cards=1) +def test_omnivoice_text_to_audio() -> None: + """ + Test OmniVoice text-to-audio generation via offline Omni runner. + Deploy Setting: omnivoice.yaml (enforce_eager=true) + Input Modal: text + Output Modal: audio + """ + from vllm_omni.entrypoints.omni import Omni + + omni = Omni( + model=MODEL, + stage_configs_path=get_stage_config(), + trust_remote_code=True, + log_stats=True, + ) + + try: + prompts = {"prompt": "Hello, this is a test for text to audio."} + + from vllm_omni.inputs.data import OmniDiffusionSamplingParams + + sampling_params_list = [OmniDiffusionSamplingParams()] + + outputs = list(omni.generate(prompts, sampling_params_list=sampling_params_list)) + + assert len(outputs) > 0, "No outputs generated" + + # Check final output has audio + final_output = outputs[-1] + ro = final_output.request_output + assert ro is not None, "No request_output" + + mm = getattr(ro, "multimodal_output", None) + if not mm and ro.outputs: + mm = getattr(ro.outputs[0], "multimodal_output", None) + + assert mm is not None, "No multimodal_output" + assert "audio" in mm, f"No 'audio' key in multimodal_output: {mm.keys()}" + + audio = mm["audio"] + if isinstance(audio, np.ndarray): + audio_np = audio + else: + audio_np = audio.cpu().numpy().squeeze() + + assert audio_np.size > 0, "Audio output is empty" + rms = np.sqrt(np.mean(audio_np**2)) + assert rms > 0.01, f"Audio RMS too low ({rms:.4f}), likely silence" + + print(f"Generated audio: {len(audio_np) / 24000:.2f}s, rms={rms:.4f}") + finally: + omni.close() diff --git a/tests/e2e/online_serving/test_omnivoice.py b/tests/e2e/online_serving/test_omnivoice.py new file mode 100644 index 0000000000..ec1981aab2 --- /dev/null +++ b/tests/e2e/online_serving/test_omnivoice.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +E2E Online tests for OmniVoice TTS model via /v1/audio/speech endpoint. + +Tests verify that the OmniVoice model generates valid audio when +accessed through the standard OpenAI-compatible speech API. 
+""" + +import os + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" + +from pathlib import Path + +import httpx +import pytest + +from tests.conftest import OmniServerParams +from tests.utils import hardware_test + +MODEL = "k2-fsa/OmniVoice" + +STAGE_CONFIG = str( + Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "omnivoice.yaml" +) +EXTRA_ARGS = [ + "--trust-remote-code", + "--disable-log-stats", +] +TEST_PARAMS = [ + OmniServerParams( + model=MODEL, + stage_config_path=STAGE_CONFIG, + server_args=EXTRA_ARGS, + ) +] + +MIN_AUDIO_BYTES = 5000 + + +def make_speech_request( + host: str, + port: int, + text: str, + timeout: float = 180.0, +) -> httpx.Response: + """Make a request to the /v1/audio/speech endpoint for OmniVoice.""" + url = f"http://{host}:{port}/v1/audio/speech" + payload = {"input": text} + + with httpx.Client(timeout=timeout) as client: + return client.post(url, json=payload) + + +def verify_wav_audio(content: bytes) -> bool: + """Verify that content is valid WAV audio data.""" + if len(content) < 44: + return False + return content[:4] == b"RIFF" and content[8:12] == b"WAVE" + + +@pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True) +class TestOmniVoiceTTS: + """E2E tests for OmniVoice TTS model.""" + + @pytest.mark.core_model + @pytest.mark.omni + @hardware_test(res={"cuda": "L4"}, num_cards=1) + def test_speech_auto_voice(self, omni_server) -> None: + """Test auto voice TTS generation (text only, no reference audio).""" + response = make_speech_request( + host=omni_server.host, + port=omni_server.port, + text="Hello, this is a test of the OmniVoice text to speech system.", + ) + + assert response.status_code == 200, f"Request failed: {response.text}" + assert response.headers.get("content-type") == "audio/wav" + assert verify_wav_audio(response.content), "Response is not valid WAV audio" + assert len(response.content) > MIN_AUDIO_BYTES, ( + f"Audio too small ({len(response.content)} bytes), expected > {MIN_AUDIO_BYTES}" + ) diff --git a/vllm_omni/diffusion/models/omnivoice/__init__.py b/vllm_omni/diffusion/models/omnivoice/__init__.py new file mode 100644 index 0000000000..208f01a7cb --- /dev/null +++ b/vllm_omni/diffusion/models/omnivoice/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py b/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py new file mode 100644 index 0000000000..568e2f5164 --- /dev/null +++ b/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py @@ -0,0 +1,195 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +OmniVoice TTS Pipeline for vLLM-Omni diffusion engine. + +Single-stage pipeline that runs the full text-to-speech flow: + text → tokenize → 32-step iterative unmasking → 8-codebook tokens → DAC decode → 24kHz audio + +Uses request-mode execution (all steps in one forward() call). 
+""" + +from __future__ import annotations + +import json +import os +from collections.abc import Iterable +from typing import ClassVar + +import torch +from tokenizers import Tokenizer as HFTokenizer +from torch import nn +from vllm.logger import init_logger + +from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig +from vllm_omni.diffusion.distributed.utils import get_local_device +from vllm_omni.diffusion.models.interface import SupportAudioOutput +from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig +from vllm_omni.model_executor.models.omnivoice.duration import RuleDurationEstimator +from vllm_omni.model_executor.models.omnivoice.omnivoice_decoder import OmniVoiceDecoder +from vllm_omni.model_executor.models.omnivoice.omnivoice_generator import OmniVoiceGenerator + +logger = init_logger(__name__) + + +def get_omnivoice_post_process_func(od_config: OmniDiffusionConfig): + """Post-processing: convert audio tensor to numpy for WAV encoding.""" + + def post_process_func(audio: torch.Tensor, output_type: str = "np"): + if output_type == "pt": + return audio + return audio.cpu().float().numpy() + + return post_process_func + + +class OmniVoicePipeline(nn.Module, SupportAudioOutput): + """OmniVoice text-to-speech pipeline for the diffusion engine. + + Wraps OmniVoiceGenerator (32-step iterative unmasking) and + OmniVoiceDecoder (HiggsAudioV2 RVQ + DAC) into a single forward() call. + """ + + support_audio_output: ClassVar[bool] = True + + def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = ""): + super().__init__() + self.od_config = od_config + self.device = get_local_device() + self.model_path = od_config.model + + # Resolve model path (HF hub ID → local cache) + if not os.path.isdir(self.model_path): + from huggingface_hub import snapshot_download + + self.model_path = snapshot_download(self.model_path) + + # Load OmniVoice config + config_path = os.path.join(self.model_path, "config.json") + with open(config_path) as f: + hf_config = json.load(f) + self.config = OmniVoiceConfig(**hf_config) + + # Build generator and decoder + self.generator = OmniVoiceGenerator(self.config) + self.decoder = OmniVoiceDecoder(self.config) + + # Tokenizer (low-level, avoids HF tokenizer extra_special_tokens issue) + tokenizer_path = os.path.join(self.model_path, "tokenizer.json") + self.tokenizer = HFTokenizer.from_file(tokenizer_path) + + # Duration estimator + self.duration_estimator = RuleDurationEstimator() + + # Generation parameters + self.num_step = self.config.num_step + self.guidance_scale = self.config.guidance_scale + self.t_shift = self.config.t_shift + self.layer_penalty_factor = self.config.layer_penalty_factor + self.position_temperature = self.config.position_temperature + self.class_temperature = self.config.class_temperature + self.sample_rate = self.config.sample_rate + + @torch.inference_mode() + def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: + """Generate speech audio from text. + + Args: + req: Diffusion request containing text prompt(s). 
+ + Returns: + DiffusionOutput with audio tensor in .output + """ + # Extract text from request + prompt = req.prompts[0] if req.prompts else "" + if isinstance(prompt, dict): + text = prompt.get("input", prompt.get("text", str(prompt))) + else: + text = str(prompt) + + if not text: + return DiffusionOutput(error="Empty text prompt") + + device = self.device + num_cb = self.config.num_audio_codebook + mask_id = self.config.audio_mask_id + + # Estimate target duration + target_len = self.duration_estimator.estimate_duration(text, "Nice to meet you.", 25) + target_len = max(1, int(target_len)) + + # Tokenize with control tokens + style = "<|denoise|><|lang_start|>None<|lang_end|><|instruct_start|>None<|instruct_end|>" + full_prompt = f"{style}<|text_start|>{text}<|text_end|>" + encoding = self.tokenizer.encode(full_prompt) + text_tokens = torch.tensor(encoding.ids, dtype=torch.long, device=device) + text_len = text_tokens.shape[0] + + # Build conditional + unconditional batches [2, 8, max_len] + text_ids = text_tokens.unsqueeze(0).repeat(num_cb, 1) + target_ids = torch.full((num_cb, target_len), mask_id, dtype=torch.long, device=device) + cond_ids = torch.cat([text_ids, target_ids], dim=1) + cond_len = cond_ids.shape[1] + + uncond_ids = target_ids.clone() + uncond_len = target_len + max_len = max(cond_len, uncond_len) + if uncond_len < max_len: + pad = torch.full( + (num_cb, max_len - uncond_len), + mask_id, + dtype=torch.long, + device=device, + ) + uncond_ids = torch.cat([uncond_ids, pad], dim=1) + + batch_input_ids = torch.stack([cond_ids, uncond_ids]) + + batch_audio_mask = torch.zeros(2, max_len, dtype=torch.bool, device=device) + batch_audio_mask[0, text_len:cond_len] = True + batch_audio_mask[1, :uncond_len] = True + + batch_attn_mask = torch.zeros(2, 1, max_len, max_len, dtype=torch.bool, device=device) + batch_attn_mask[0, :, :cond_len, :cond_len] = True + batch_attn_mask[1, :, :uncond_len, :uncond_len] = True + + # Run 32-step iterative unmasking + tokens = self.generator( + input_ids=batch_input_ids, + audio_mask=batch_audio_mask, + attention_mask=batch_attn_mask, + target_lens=[target_len], + num_step=self.num_step, + guidance_scale=self.guidance_scale, + t_shift=self.t_shift, + layer_penalty_factor=self.layer_penalty_factor, + position_temperature=self.position_temperature, + class_temperature=self.class_temperature, + ) + + # Decode tokens to audio + audio = self.decoder(tokens) # [1, 1, samples] + + return DiffusionOutput(output=audio) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load weights from model directory (not from the iterator). + + The diffusion model loader passes HF safetensors weights, but OmniVoice + has custom weight names (llm.* → generator.*, audio_tokenizer.* → decoder.*). + We load from model_path directly and return all param names to satisfy + the loader's "all weights initialized" check. 
+ """ + # Consume the iterator (required by the loader contract) + for _ in weights: + pass + + device = self.device + self.generator.load_weights(self.model_path, device) + self.generator = self.generator.to(device).eval() + self.decoder.load_weights(self.model_path, device) + logger.info("OmniVoice pipeline loaded on %s", device) + + # Return all parameter names to indicate they're initialized + return {name for name, _ in self.named_parameters()} diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index db88057227..c1f48137e1 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -173,6 +173,16 @@ "pipeline_hunyuan_video_1_5_i2v", "HunyuanVideo15I2VPipeline", ), + "OmniVoicePipeline": ( + "omnivoice", + "pipeline_omnivoice", + "OmniVoicePipeline", + ), + "OmniVoice": ( + "omnivoice", + "pipeline_omnivoice", + "OmniVoicePipeline", + ), } @@ -358,6 +368,7 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) - "Flux2Pipeline": "get_flux2_post_process_func", "HunyuanVideo15Pipeline": "get_hunyuan_video_15_post_process_func", "HunyuanVideo15ImageToVideoPipeline": "get_hunyuan_video_15_i2v_post_process_func", + "OmniVoicePipeline": "get_omnivoice_post_process_func", } _DIFFUSION_PRE_PROCESS_FUNCS = { diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index f4a082cffb..a1dc373dd9 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -18,6 +18,7 @@ def _register_omni_hf_configs() -> None: from transformers import AutoConfig from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config + from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig from vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts import ( Qwen3TTSConfig, ) @@ -31,6 +32,7 @@ def _register_omni_hf_configs() -> None: for model_type, config_cls in [ ("qwen3_tts", Qwen3TTSConfig), ("cosyvoice3", CosyVoice3Config), + ("omnivoice", OmniVoiceConfig), ("voxtral_tts", VoxtralTTSConfig), ]: try: diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 0ffe33abde..acf45b4fe6 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -510,6 +510,13 @@ async def omni_init_app_state( stage_configs=diffusion_stage_configs, ) + state.openai_serving_speech = OmniOpenAIServingSpeech.for_diffusion( + diffusion_engine=engine_client, + model_name=model_name, + stage_configs=diffusion_stage_configs, + ) + state.openai_streaming_speech = None + state.enable_server_load_tracking = getattr(args, "enable_server_load_tracking", False) state.server_load_metrics = 0 logger.info("Pure diffusion API server initialized for model: %s", model_name) diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index b483181fd5..75279f0755 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -47,7 +47,10 @@ _VOXTRAL_TTS_MODEL_STAGES = {"audio_generation"} _QWEN3_TTS_MODEL_STAGES = {"qwen3_tts"} _FISH_TTS_MODEL_STAGES = {"fish_speech_slow_ar"} -_TTS_MODEL_STAGES: set[str] = _VOXTRAL_TTS_MODEL_STAGES | _QWEN3_TTS_MODEL_STAGES | _FISH_TTS_MODEL_STAGES +_OMNIVOICE_TTS_MODEL_STAGES = {"omnivoice_generator"} +_TTS_MODEL_STAGES: set[str] = ( + _VOXTRAL_TTS_MODEL_STAGES | _QWEN3_TTS_MODEL_STAGES | _FISH_TTS_MODEL_STAGES | _OMNIVOICE_TTS_MODEL_STAGES +) _TTS_LANGUAGES: set[str] = 
{ "Auto", "Chinese", @@ -145,6 +148,27 @@ def _validate_path_within_directory(file_path: Path, directory: Path) -> bool: class OmniOpenAIServingSpeech(OpenAIServing, AudioMixin): + _diffusion_mode: bool = False + + @classmethod + def for_diffusion( + cls, + diffusion_engine: "Any", + model_name: str, + stage_configs: "list[Any] | None" = None, + ) -> "OmniOpenAIServingSpeech": + """Create a speech serving instance for pure diffusion TTS models. + + Bypasses OpenAIServing.__init__ which requires a fully configured + engine client that pure diffusion engines don't provide. + """ + instance = cls.__new__(cls) + instance._diffusion_mode = True + instance._diffusion_engine = diffusion_engine + instance._diffusion_model_name = model_name + instance._diffusion_stage_configs = stage_configs + return instance + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Initialize uploaded speakers storage @@ -240,6 +264,8 @@ def _detect_tts_model_type(self) -> str | None: return "voxtral_tts" if model_stage in _FISH_TTS_MODEL_STAGES: return "fish_tts" + if model_stage in _OMNIVOICE_TTS_MODEL_STAGES: + return "omnivoice" return None def _compute_max_instructions_length(self) -> int: @@ -1203,6 +1229,9 @@ async def _prepare_speech_generation( ref_audio_data = (wav_list, sr) prompt = self._build_fish_speech_prompt(request, ref_audio_data=ref_audio_data) tts_params = {} + elif self._tts_model_type == "omnivoice": + tts_params = {} + prompt = request.input # Diffusion engine takes raw text elif self._is_tts: validation_error = self._validate_tts_request(request) if validation_error: @@ -1324,6 +1353,79 @@ async def _generate_audio_bytes( audio_response: AudioResponse = self.create_audio(audio_obj) return audio_response.audio_data, audio_response.media_type + async def _create_diffusion_speech( + self, + request: OpenAICreateSpeechRequest, + ) -> Response: + """Handle speech generation for pure diffusion TTS models (e.g. OmniVoice).""" + from vllm_omni.outputs import OmniRequestOutput + + try: + request_id = f"speech-{random_uuid()}" + prompt = request.input + + logger.info( + "Diffusion TTS speech request %s: text=%r", + request_id, + prompt[:50] + "..." 
if len(prompt) > 50 else prompt, + ) + + generator = self._diffusion_engine.generate( + prompt=prompt, + request_id=request_id, + sampling_params_list=self._diffusion_engine.default_sampling_params_list, + output_modalities=["audio"], + ) + + final_output: OmniRequestOutput | None = None + async for res in generator: + final_output = res + + if final_output is None: + raise ValueError("No output generated from the model.") + + audio_output, audio_key = self._extract_audio_output(final_output) + if audio_key is None: + raise ValueError("TTS model did not produce audio output.") + + audio_tensor = audio_output[audio_key] + sr_raw = audio_output.get("sr", 24000) + sr_val = sr_raw[-1] if isinstance(sr_raw, list) and sr_raw else sr_raw + sample_rate = sr_val.item() if hasattr(sr_val, "item") else int(sr_val) + + if isinstance(audio_tensor, list): + non_empty = [c for c in audio_tensor if c.numel() > 0] + audio_tensor = torch.cat(non_empty, dim=-1) if non_empty else np.zeros((0,), dtype=np.float32) + if hasattr(audio_tensor, "float"): + audio_tensor = audio_tensor.float().detach().cpu().numpy() + if audio_tensor.ndim > 1: + audio_tensor = audio_tensor.squeeze() + + audio_obj = CreateAudio( + audio_tensor=audio_tensor, + sample_rate=sample_rate, + response_format=request.response_format or "wav", + speed=request.speed or 1.0, + stream_format=request.stream_format, + base64_encode=False, + ) + audio_response: AudioResponse = self.create_audio(audio_obj) + return Response(content=audio_response.audio_data, media_type=audio_response.media_type) + + except asyncio.CancelledError: + return self._diffusion_error_response("Client disconnected") + except ValueError as e: + return self._diffusion_error_response(str(e)) + except Exception as e: + logger.exception("Diffusion speech generation failed: %s", e) + return self._diffusion_error_response(f"Speech generation failed: {e}") + + @staticmethod + def _diffusion_error_response(message: str) -> Response: + """Create a JSON error response without depending on OpenAIServing.""" + error_body = json.dumps({"error": {"message": message, "type": "server_error", "param": None, "code": 500}}) + return Response(content=error_body, media_type="application/json", status_code=500) + async def create_speech( self, request: OpenAICreateSpeechRequest, @@ -1349,6 +1451,9 @@ async def create_speech( Each Code2Wav chunk is yielded as raw audio bytes as soon as it is decoded. For WAV format, a header with placeholder size values is emitted first. """ + if self._diffusion_mode: + return await self._create_diffusion_speech(request) + error_check_ret = await self._check_model(request) if error_check_ret is not None: logger.error("Error with model %s", error_check_ret) @@ -1426,6 +1531,8 @@ async def create_speech_batch( batch_request: BatchSpeechRequest, ) -> BatchSpeechResponse | ErrorResponse: """Generate speech for multiple items concurrently.""" + if self._diffusion_mode: + raise ValueError("Batch speech is not supported in diffusion mode") if len(batch_request.items) > self._batch_max_items: raise ValueError( f"Batch contains {len(batch_request.items)} items, exceeding the maximum of {self._batch_max_items}." 
diff --git a/vllm_omni/model_executor/models/omnivoice/__init__.py b/vllm_omni/model_executor/models/omnivoice/__init__.py new file mode 100644 index 0000000000..208f01a7cb --- /dev/null +++ b/vllm_omni/model_executor/models/omnivoice/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm_omni/model_executor/models/omnivoice/config.py b/vllm_omni/model_executor/models/omnivoice/config.py new file mode 100644 index 0000000000..a24176bcf2 --- /dev/null +++ b/vllm_omni/model_executor/models/omnivoice/config.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""OmniVoice configuration for vLLM-Omni two-stage pipeline.""" + +from transformers.configuration_utils import PretrainedConfig + + +class OmniVoiceConfig(PretrainedConfig): + """Configuration for OmniVoice model in vLLM-Omni. + + This mirrors the HuggingFace OmniVoiceConfig but adds fields needed + for the two-stage serving pipeline. + """ + + model_type = "omnivoice" + + def get_text_config(self, **kwargs): + """Return self so vLLM uses our top-level config (which has + num_attention_heads etc.) instead of trying to extract a sub-config.""" + return self + + def __init__(self, **kwargs): + # HF repos (e.g. k2-fsa/OmniVoice) may nest generation hyperparameters. + gen_cfg = kwargs.pop("generation_config", None) + if isinstance(gen_cfg, dict): + for k, v in gen_cfg.items(): + kwargs.setdefault(k, v) + + super().__init__(**kwargs) + + # Audio codec params (prefer values set by PretrainedConfig from config.json) + self.audio_vocab_size = getattr(self, "audio_vocab_size", 1025) + self.audio_mask_id = getattr(self, "audio_mask_id", 1024) + self.num_audio_codebook = getattr(self, "num_audio_codebook", 8) + self.audio_codebook_weights = getattr( + self, + "audio_codebook_weights", + [8, 8, 6, 6, 4, 4, 2, 2], + ) + + # LLM backbone params (Qwen3-0.6B defaults from HF config) + llm_config = getattr(self, "llm_config", None) or {} + if isinstance(llm_config, PretrainedConfig): + llm_config = llm_config.to_dict() + elif not isinstance(llm_config, dict): + llm_config = {} + self.llm_hidden_size = llm_config.get("hidden_size", 1024) + self.llm_num_hidden_layers = llm_config.get("num_hidden_layers", 28) + self.llm_num_attention_heads = llm_config.get("num_attention_heads", 16) + self.llm_num_key_value_heads = llm_config.get("num_key_value_heads", 8) + self.llm_intermediate_size = llm_config.get("intermediate_size", 3072) + self.llm_vocab_size = llm_config.get("vocab_size", 151676) + self.llm_max_position_embeddings = llm_config.get("max_position_embeddings", 40960) + self.llm_rope_theta = llm_config.get("rope_theta", 1000000.0) + self.llm_rms_norm_eps = llm_config.get("rms_norm_eps", 1e-6) + self.llm_head_dim = llm_config.get("head_dim", self.llm_hidden_size // self.llm_num_attention_heads) + + # Expose LLM params at top level for vLLM ModelConfig compatibility + # (vLLM expects num_attention_heads, hidden_size, etc. 
on the config) + self.num_attention_heads = self.llm_num_attention_heads + self.num_key_value_heads = self.llm_num_key_value_heads + self.num_hidden_layers = self.llm_num_hidden_layers + self.hidden_size = self.llm_hidden_size + self.head_dim = self.llm_head_dim + if not hasattr(self, "vocab_size"): + self.vocab_size = self.llm_vocab_size + + # Generation params (defaults from OmniVoiceGenerationConfig) + self.num_step = getattr(self, "num_step", 32) + self.guidance_scale = getattr(self, "guidance_scale", 2.0) + self.t_shift = getattr(self, "t_shift", 0.1) + self.layer_penalty_factor = getattr(self, "layer_penalty_factor", 5.0) + self.position_temperature = getattr(self, "position_temperature", 5.0) + self.class_temperature = getattr(self, "class_temperature", 0.0) + + # Audio output + self.sample_rate = getattr(self, "sample_rate", 24000) + self.frame_rate = getattr(self, "frame_rate", 25) + + # Serving + self.speculative_config = None diff --git a/vllm_omni/model_executor/models/omnivoice/duration.py b/vllm_omni/model_executor/models/omnivoice/duration.py new file mode 100644 index 0000000000..8343362a2e --- /dev/null +++ b/vllm_omni/model_executor/models/omnivoice/duration.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +# Copyright 2026 Xiaomi Corp. (authors: Han Zhu) +# +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Text duration estimation for TTS generation. + +Provides ``RuleDurationEstimator``, which estimates audio duration from text +using character phonetic weights across 600+ languages. Used by +``OmniVoice.generate()`` to determine output length when no duration is specified. +""" + +import bisect +import unicodedata +from functools import lru_cache + + +class RuleDurationEstimator: + def __init__(self): + # ========================================== + # 1. Phonetic Weights Table + # ========================================== + # The weight represents the relative speaking time compared to + # a standard Latin letter. + # Benchmark: 1.0 = One Latin Character (~40-50ms) + self.weights = { + # --- Logographic (1 char = full syllable/word) --- + "cjk": 3.0, # Chinese, Japanese Kanji, etc. + # --- Syllabic / Blocks + "hangul": 2.5, # Korean Hangul + "kana": 2.2, # Japanese Hiragana/Katakana + "ethiopic": 3.0, # Amharic/Ge'ez + "yi": 3.0, # Yi script + # --- Abugida (Consonant-Vowel complexes) --- + "indic": 1.8, # Hindi, Bengali, Tamil, etc. + "thai_lao": 1.5, # Thai, Lao + "khmer_myanmar": 1.8, # Khmer, Myanmar + # --- Abjad (Consonant-heavy) --- + "arabic": 1.5, # Arabic, Persian, Urdu + "hebrew": 1.5, # Hebrew + # --- Alphabet (Segmental) --- + "latin": 1.0, # English, Spanish, French, Vietnamese, etc. 
(Baseline) + "cyrillic": 1.0, # Russian, Ukrainian + "greek": 1.0, # Greek + "armenian": 1.0, # Armenian + "georgian": 1.0, # Georgian + # --- Symbols & Misc --- + "punctuation": 0.5, # Pause capability + "space": 0.2, # Word boundary/Breath (0.05 / 0.22) + "digit": 3.5, # Numbers + "mark": 0.0, # Diacritics/Accents (Silent modifiers) + "default": 1.0, # Fallback for unknown scripts + } + + # ========================================== + # 2. Unicode Range Mapping + # ========================================== + # Format: (End_Codepoint, Type_Key) + # Used for fast binary search (bisect). + self.ranges = [ + (0x02AF, "latin"), # Latin (Basic, Supplement, Ext, IPA) + (0x03FF, "greek"), # Greek & Coptic + (0x052F, "cyrillic"), # Cyrillic + (0x058F, "armenian"), # Armenian + (0x05FF, "hebrew"), # Hebrew + (0x077F, "arabic"), # Arabic, Syriac, Arabic Supplement + (0x089F, "arabic"), # Arabic Extended-B (+ Syriac Supp) + (0x08FF, "arabic"), # Arabic Extended-A + (0x097F, "indic"), # Devanagari + (0x09FF, "indic"), # Bengali + (0x0A7F, "indic"), # Gurmukhi + (0x0AFF, "indic"), # Gujarati + (0x0B7F, "indic"), # Oriya + (0x0BFF, "indic"), # Tamil + (0x0C7F, "indic"), # Telugu + (0x0CFF, "indic"), # Kannada + (0x0D7F, "indic"), # Malayalam + (0x0DFF, "indic"), # Sinhala + (0x0EFF, "thai_lao"), # Thai & Lao + (0x0FFF, "indic"), # Tibetan (Abugida) + (0x109F, "khmer_myanmar"), # Myanmar + (0x10FF, "georgian"), # Georgian + (0x11FF, "hangul"), # Hangul Jamo + (0x137F, "ethiopic"), # Ethiopic + (0x139F, "ethiopic"), # Ethiopic Supplement + (0x13FF, "default"), # Cherokee + (0x167F, "default"), # Canadian Aboriginal Syllabics + (0x169F, "default"), # Ogham + (0x16FF, "default"), # Runic + (0x171F, "default"), # Tagalog (Baybayin) + (0x173F, "default"), # Hanunoo + (0x175F, "default"), # Buhid + (0x177F, "default"), # Tagbanwa + (0x17FF, "khmer_myanmar"), # Khmer + (0x18AF, "default"), # Mongolian + (0x18FF, "default"), # Canadian Aboriginal Syllabics Ext + (0x194F, "indic"), # Limbu + (0x19DF, "indic"), # Tai Le & New Tai Lue + (0x19FF, "khmer_myanmar"), # Khmer Symbols + (0x1A1F, "indic"), # Buginese + (0x1AAF, "indic"), # Tai Than + (0x1B7F, "indic"), # Balinese + (0x1BBF, "indic"), # Sundanese + (0x1BFF, "indic"), # Batak + (0x1C4F, "indic"), # Lepcha + (0x1C7F, "indic"), # Ol Chiki (Santali) + (0x1C8F, "cyrillic"), # Cyrillic Extended-C + (0x1CBF, "georgian"), # Georgian Extended + (0x1CCF, "indic"), # Sundanese Supplement + (0x1CFF, "indic"), # Vedic Extensions + (0x1D7F, "latin"), # Phonetic Extensions + (0x1DBF, "latin"), # Phonetic Extensions Supplement + (0x1DFF, "default"), # Combining Diacritical Marks Supplement + (0x1EFF, "latin"), # Latin Extended Additional (Vietnamese) + (0x309F, "kana"), # Hiragana + (0x30FF, "kana"), # Katakana + (0x312F, "cjk"), # Bopomofo (Pinyin) + (0x318F, "hangul"), # Hangul Compatibility Jamo + (0x9FFF, "cjk"), # CJK Unified Ideographs (Main) + (0xA4CF, "yi"), # Yi Syllables + (0xA4FF, "default"), # Lisu + (0xA63F, "default"), # Vai + (0xA69F, "cyrillic"), # Cyrillic Extended-B + (0xA6FF, "default"), # Bamum + (0xA7FF, "latin"), # Latin Extended-D + (0xA82F, "indic"), # Syloti Nagri + (0xA87F, "default"), # Phags-pa + (0xA8DF, "indic"), # Saurashtra + (0xA8FF, "indic"), # Devanagari Extended + (0xA92F, "indic"), # Kayah Li + (0xA95F, "indic"), # Rejang + (0xA97F, "hangul"), # Hangul Jamo Extended-A + (0xA9DF, "indic"), # Javanese + (0xA9FF, "khmer_myanmar"), # Myanmar Extended-B + (0xAA5F, "indic"), # Cham + (0xAA7F, "khmer_myanmar"), # Myanmar Extended-A + 
(0xAADF, "indic"), # Tai Viet + (0xAAFF, "indic"), # Meetei Mayek Extensions + (0xAB2F, "ethiopic"), # Ethiopic Extended-A + (0xAB6F, "latin"), # Latin Extended-E + (0xABBF, "default"), # Cherokee Supplement + (0xABFF, "indic"), # Meetei Mayek + (0xD7AF, "hangul"), # Hangul Syllables + (0xFAFF, "cjk"), # CJK Compatibility + (0xFDFF, "arabic"), # Arabic Presentation Forms-A + (0xFE6F, "default"), # Variation Selectors + (0xFEFF, "arabic"), # Arabic Presentation Forms-B + (0xFFEF, "latin"), # Fullwidth Latin + ] + self.breakpoints = [r[0] for r in self.ranges] + + @lru_cache(maxsize=4096) + def _get_char_weight(self, char): + """Determines the weight of a single character.""" + code = ord(char) + if (65 <= code <= 90) or (97 <= code <= 122): + return self.weights["latin"] + if code == 32: + return self.weights["space"] + + # Ignore arabic Tatweel + if code == 0x0640: + return self.weights["mark"] + + category = unicodedata.category(char) + + if category.startswith("M"): + return self.weights["mark"] + + if category.startswith("P") or category.startswith("S"): + return self.weights["punctuation"] + + if category.startswith("Z"): + return self.weights["space"] + + if category.startswith("N"): + return self.weights["digit"] + + # 3. Binary search for Unicode Block (此时区间里绝不会再混进标点符号) + idx = bisect.bisect_left(self.breakpoints, code) + if idx < len(self.ranges): + script_type = self.ranges[idx][1] + return self.weights.get(script_type, self.weights["default"]) + + # 4. Handle upper planes (CJK Ext B/C/D, Historic scripts) + if code > 0x20000: + return self.weights["cjk"] + + return self.weights["default"] + + def calculate_total_weight(self, text): + """Sums up the normalized weights for a string.""" + return sum(self._get_char_weight(c) for c in text) + + def estimate_duration( + self, + target_text: str, + ref_text: str, + ref_duration: float, + low_threshold: float | None = 50, + boost_strength: float = 3, + ) -> float: + """ + + Args: + target_text (str): The text for which we want to estimate the duration. + ref_text (str): The reference text that was used to measure + the ref_duration. + ref_duration (float): The actual duration it took + to speak the ref_text. + low_threshold (float): The minimum duration threshold below which the + estimation will be considered unreliable. + boost_strength (float): Controls the power-curve boost for short durations. + Higher values boost small durations more aggressively. + 1 = no boost (linear), 2 = sqrt-like + + Returns: + float: The estimated duration for the target_text based + on the ref_text and ref_duration. + """ + if ref_duration <= 0 or not ref_text: + return 0.0 + + ref_weight = self.calculate_total_weight(ref_text) + if ref_weight == 0: + return 0.0 + + speed_factor = ref_weight / ref_duration + target_weight = self.calculate_total_weight(target_text) + + estimated_duration = target_weight / speed_factor + if low_threshold is not None and estimated_duration < low_threshold: + alpha = 1.0 / boost_strength + return low_threshold * (estimated_duration / low_threshold) ** alpha + else: + return estimated_duration + + +# ========================================== +# Example Usage +# ========================================== +if __name__ == "__main__": + estimator = RuleDurationEstimator() + + ref_txt = "Hello, world." 
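+    # Short-output boost, worked example with illustrative numbers: with the
+    # defaults low_threshold=50 and boost_strength=3, a raw estimate of 10 is
+    # boosted to 50 * (10 / 50) ** (1 / 3) ≈ 29.2 rather than returned as-is
+    # (units follow whatever ref_duration is expressed in).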
+ ref_dur = 1.5 + + test_cases = [ + ("Hindi (With complex marks)", "नमस्ते दुनिया"), + ("Arabic (With vowels)", "مَرْحَبًا بِالْعَالَم"), + ("Vietnamese (Lots of diacritics)", "Chào thế giới"), + ("Chinese", "你好,世界!"), + ("Mixed Emoji", "Hello 🌍! This is fun 🎉"), + ] + + print("--- Reference ---") + print(f"Reference Text: '{ref_txt}'") + print(f"Reference Duration: {ref_dur}s") + print("-" * 30) + + for lang, txt in test_cases: + est_time = estimator.estimate_duration(txt, ref_txt, ref_dur) + weight = estimator.calculate_total_weight(txt) + + print(f"[{lang}]") + print(f"Text: {txt}") + print(f"Total Weight: {weight:.2f}") + print(f"Estimated Duration: {est_time:.2f} s") + print("-" * 30) diff --git a/vllm_omni/model_executor/models/omnivoice/omnivoice.py b/vllm_omni/model_executor/models/omnivoice/omnivoice.py new file mode 100644 index 0000000000..a3603a3c39 --- /dev/null +++ b/vllm_omni/model_executor/models/omnivoice/omnivoice.py @@ -0,0 +1,520 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +OmniVoice model for vLLM-Omni two-stage TTS pipeline. + +Stage 0 (Generator): Qwen3 backbone + iterative unmasking → 8-codebook tokens +Stage 1 (Decoder): HiggsAudioV2 decoder → 24kHz waveform +""" + +from __future__ import annotations + +import os +from collections.abc import Iterable, Mapping, Sequence + +import numpy as np +import torch +import torch.nn as nn +from transformers.feature_extraction_utils import BatchFeature +from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions +from vllm.logger import init_logger +from vllm.multimodal.inputs import ( + MultiModalDataDict, + MultiModalFieldConfig, + MultiModalKwargsItems, +) +from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser +from vllm.multimodal.processing import ( + BaseDummyInputsBuilder, + BaseMultiModalProcessor, + BaseProcessingInfo, + ProcessorInputs, + PromptIndexTargets, + PromptInsertion, + PromptUpdate, +) +from vllm.sequence import IntermediateTensors + +from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig +from vllm_omni.model_executor.models.output_templates import OmniOutput + +logger = init_logger(__name__) + + +# --------------------------------------------------------------------------- +# Multimodal processing +# --------------------------------------------------------------------------- + + +class OmniVoiceMultiModalProcessingInfo(BaseProcessingInfo): + def get_hf_config(self): + return self.ctx.get_hf_config(OmniVoiceConfig) + + def get_supported_mm_limits(self) -> Mapping[str, int | None]: + return {"audio": None} + + def get_data_parser(self): + return MultiModalDataParser( + target_sr=self.ctx.get_hf_config().sample_rate, + expected_hidden_size=self._get_expected_hidden_size(), + ) + + +class OmniVoiceMultiModalProcessor(BaseMultiModalProcessor[OmniVoiceMultiModalProcessingInfo]): + """Processes text + optional reference audio for OmniVoice. 
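+
+    The text side is rendered into a control-token prompt (built in
+    _call_hf_processor below); with the defaults (denoise=True, no lang/instruct)
+    it looks like:
+
+        <|denoise|><|lang_start|>None<|lang_end|><|instruct_start|>None<|instruct_end|><|text_start|>Hello.<|text_end|>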
+ + For voice cloning: text + reference audio → tokenized reference + For auto voice: text only + """ + + def _ensure_cached_runtime_components(self, model_dir: str, config: OmniVoiceConfig) -> None: + cached_model_dir = getattr(self, "_cached_model_dir", None) + if cached_model_dir == model_dir: + return + + from transformers import AutoTokenizer + + self.text_tokenizer = AutoTokenizer.from_pretrained(model_dir) + + # Audio tokenizer for encoding reference audio + audio_tokenizer_path = os.path.join(model_dir, "audio_tokenizer") + if os.path.isdir(audio_tokenizer_path): + try: + from transformers import ( + AutoFeatureExtractor, + HiggsAudioV2TokenizerModel, + ) + except ImportError as e: + raise ImportError( + "OmniVoice voice cloning requires transformers with " + "HiggsAudioV2TokenizerModel. Upgrade transformers or " + "use text-only mode (no reference audio)." + ) from e + + self.audio_tokenizer = HiggsAudioV2TokenizerModel.from_pretrained(audio_tokenizer_path, device_map="cpu") + self.feature_extractor = AutoFeatureExtractor.from_pretrained(audio_tokenizer_path) + self.audio_tokenizer.eval() + else: + self.audio_tokenizer = None + self.feature_extractor = None + logger.warning( + "audio_tokenizer not found at %s, voice cloning disabled", + audio_tokenizer_path, + ) + + self._cached_model_dir = model_dir + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + config = self.info.ctx.get_hf_config() + model_dir = self.info.ctx.model_config.model + self._ensure_cached_runtime_components(model_dir, config) + + audio = mm_data.get("audio", None) + if audio is None: + audio = mm_data.get("audios") + if audio is not None: + audio = audio[0], config.sample_rate + + # Build text prompt with control tokens + lang = mm_kwargs.get("lang", None) + instruct = mm_kwargs.get("instruct", None) + denoise = mm_kwargs.get("denoise", True) + ref_text = mm_kwargs.get("ref_text", None) + + # Construct the style + text portion + style_text = "" + if denoise: + style_text += "<|denoise|>" + lang_str = lang if lang else "None" + instruct_str = instruct if instruct else "None" + style_text += f"<|lang_start|>{lang_str}<|lang_end|>" + style_text += f"<|instruct_start|>{instruct_str}<|instruct_end|>" + + # Combine ref_text and main text + if ref_text: + full_text = f"{ref_text} {prompt}" + else: + full_text = prompt + + text_prompt = f"{style_text}<|text_start|>{full_text}<|text_end|>" + text_tokens = self.text_tokenizer(text_prompt, return_tensors="pt").input_ids.squeeze(0) # [N_text] + + if audio is None: + # Text-only path (auto voice mode) + return BatchFeature( + { + "input_ids": text_tokens, + "input_len": [len(text_tokens)], + } + ) + + # Voice cloning: encode reference audio to tokens + audio_signal, sr = audio + if isinstance(audio_signal, np.ndarray): + audio_signal = torch.from_numpy(audio_signal).float() + if audio_signal.dim() == 1: + audio_signal = audio_signal.unsqueeze(0) + + # Resample to tokenizer sample rate if needed + if self.feature_extractor is not None: + target_sr = self.feature_extractor.sampling_rate + if sr != target_sr: + import torchaudio + + audio_signal = torchaudio.functional.resample(audio_signal, sr, target_sr) + + # Encode reference audio to 8-codebook tokens + if self.audio_tokenizer is not None: + with torch.inference_mode(): + ref_audio_tokens = self.audio_tokenizer.encode(audio_signal) # [8, T_ref] + if ref_audio_tokens.dim() == 3: + ref_audio_tokens = 
ref_audio_tokens.squeeze(0) # [8, T_ref] + else: + raise RuntimeError( + "Audio tokenizer not available for voice cloning. Ensure audio_tokenizer/ exists in model directory." + ) + + ft = BatchFeature( + { + "input_ids": text_tokens, + "ref_audio_tokens": ref_audio_tokens, # [8, T_ref] + "ref_audio_len": [ref_audio_tokens.shape[1]], + } + ) + return ft + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return { + "ref_audio_tokens": MultiModalFieldConfig.batched("audio"), + "ref_audio_len": MultiModalFieldConfig.batched("audio"), + } + + def _hf_processor_applies_updates( + self, + prompt_text: str, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + tokenization_kwargs: Mapping[str, object], + ) -> bool: + return False + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + def insertion_end(item_idx): + if "audio" in out_mm_kwargs and out_mm_kwargs["audio"]: + ref_len = out_mm_kwargs["audio"][0]["ref_audio_len"].data[0].item() + return [1] * ref_len + return [] + + return [ + PromptInsertion( + modality="audio", + target=PromptIndexTargets.start(), + insertion=insertion_end, + ), + ] + + +class OmniVoiceDummyInputsBuilder(BaseDummyInputsBuilder[OmniVoiceMultiModalProcessingInfo]): + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + return "Hello, this is a test of the OmniVoice system." + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + mm_options: Mapping[str, BaseDummyOptions] | None = None, + ) -> MultiModalDataDict: + num_audios = mm_counts.get("audio") + max_prompt_seconds = 10 + prompt_sample_rate = 24000 + target_audio_length = max_prompt_seconds * prompt_sample_rate + + audio_overrides = mm_options.get("audio") if mm_options else None + mm_data = { + "audio": ( + self._get_dummy_audios( + length=target_audio_length, + num_audios=num_audios, + overrides=audio_overrides, + )[0], + 24000, + ), + } + return mm_data + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + mm_options: Mapping[str, BaseDummyOptions] | None = None, + ) -> ProcessorInputs: + inputs = super().get_dummy_processor_inputs(seq_len, mm_counts, mm_options) + inputs.hf_processor_mm_kwargs = {"ref_text": "Testing voice cloning."} + return inputs + + +# --------------------------------------------------------------------------- +# Main model class +# --------------------------------------------------------------------------- + + +class OmniVoiceModel( + nn.Module, +): + """OmniVoice model for vLLM-Omni two-stage pipeline. + + Routes to generator (Stage 0) or decoder (Stage 1) based on model_stage. 
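+
+    Per-stage I/O (shapes as used in the forward paths below; sr is
+    config.sample_rate, 24 kHz by default):
+
+        omnivoice_generator: text token IDs          -> {"audio_tokens": [1, 8, T]}
+        omnivoice_decoder:   audio_tokens [B, 8, T]  -> {"audio": waveform, "sr": sr}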
+ """ + + requires_raw_input_tokens = True + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.config = vllm_config.model_config.hf_config + self.have_multimodal_outputs = True + self.model_stage = vllm_config.model_config.model_stage + self.model_dir = vllm_config.model_config.model + + if self.model_stage == "omnivoice_generator": + from vllm_omni.model_executor.models.omnivoice.omnivoice_generator import ( + OmniVoiceGenerator, + ) + + self.generator = OmniVoiceGenerator(self.config) + self.model = self.generator + elif self.model_stage == "omnivoice_decoder": + from vllm_omni.model_executor.models.omnivoice.omnivoice_decoder import ( + OmniVoiceDecoder, + ) + + self.decoder = OmniVoiceDecoder(self.config) + self.model = self.decoder + else: + raise ValueError(f"Unsupported model_stage: {self.model_stage}") + + def embed_input_ids( + self, + input_ids: torch.Tensor, + multimodal_embeddings=None, + is_multimodal=None, + ) -> torch.Tensor: + if self.model_stage == "omnivoice_generator": + # Generator handles its own embedding in forward() + hidden = int(self.config.llm_hidden_size) + return torch.zeros((input_ids.shape[0], hidden), device=input_ids.device) + elif self.model_stage == "omnivoice_decoder": + hidden = int(self.config.llm_hidden_size) + return torch.zeros((input_ids.shape[0], hidden), device=input_ids.device) + else: + raise RuntimeError(f"embed_input_ids not valid for {self.model_stage}") + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + additional_information: dict[str, object] | None = None, + **kwargs: object, + ) -> OmniOutput: + if self.model_stage == "omnivoice_generator": + return self._forward_generator(input_ids, kwargs) + elif self.model_stage == "omnivoice_decoder": + return self._forward_decoder(input_ids, kwargs) + else: + raise ValueError(f"Unsupported model_stage: {self.model_stage}") + + def _forward_generator(self, input_ids: torch.Tensor, kwargs: dict) -> OmniOutput: + """Run generator stage: text → 8-codebook audio tokens.""" + runtime_info = kwargs.get("runtime_additional_information", []) + + if not runtime_info: + # Profiling / dummy run — return a plain tensor (not OmniOutput) + # so the v1 model runner's _dummy_run can index into it. 
+ return torch.zeros( + (input_ids.shape[0], self.config.llm_hidden_size), + device=input_ids.device, + dtype=torch.float32, + ) + + info = runtime_info[0] + device = input_ids.device + num_codebooks = self.config.num_audio_codebook + mask_id = self.config.audio_mask_id + + # Extract text tokens from input_ids + text_tokens = input_ids # [N_text] + text_len = text_tokens.shape[0] + + # Estimate target length using RuleDurationEstimator + # (same formula as reference OmniVoice: weight * 25 / 14.1) + from vllm_omni.model_executor.models.omnivoice.duration import ( + RuleDurationEstimator, + ) + + if not hasattr(self, "_duration_estimator"): + self._duration_estimator = RuleDurationEstimator() + raw_text = info.get("raw_text", "") + if raw_text: + target_len = self._duration_estimator.estimate_duration(raw_text, "Nice to meet you.", 25) + target_len = max(1, int(target_len)) + else: + # Fallback: use character weight formula on text tokens + # approximate ~1.77 frames per text token (25/14.1) + target_len = max(int(text_len * 1.77), 25) + + # Get reference audio tokens if available + ref_audio_tokens = info.get("ref_audio_tokens", None) + + # Build input_ids tensor: [2*B, 8, S] + # B=1, conditional + unconditional + + # Replicate text tokens across 8 codebooks + text_ids = text_tokens.unsqueeze(0).repeat(num_codebooks, 1) # [8, N_text] + + # Target: all MASK + target_ids = torch.full((num_codebooks, target_len), mask_id, dtype=torch.long, device=device) + + # Conditional: [text] [ref_audio?] [target_mask] + if ref_audio_tokens is not None: + ref_tokens = ref_audio_tokens.to(device) # [8, T_ref] + cond_ids = torch.cat([text_ids, ref_tokens, target_ids], dim=1) + cond_audio_start = text_ids.shape[1] + else: + cond_ids = torch.cat([text_ids, target_ids], dim=1) + cond_audio_start = text_ids.shape[1] + + cond_len = cond_ids.shape[1] + + # Unconditional: [target_mask only] + uncond_ids = target_ids.clone() + uncond_len = target_len + + # Pad to same length + max_len = max(cond_len, uncond_len) + if cond_len < max_len: + pad = torch.full( + (num_codebooks, max_len - cond_len), + mask_id, + dtype=torch.long, + device=device, + ) + cond_ids = torch.cat([cond_ids, pad], dim=1) + if uncond_len < max_len: + pad = torch.full( + (num_codebooks, max_len - uncond_len), + mask_id, + dtype=torch.long, + device=device, + ) + uncond_ids = torch.cat([uncond_ids, pad], dim=1) + + batch_input_ids = torch.stack([cond_ids, uncond_ids], dim=0) # [2, 8, max_len] + + # Audio mask: True for audio positions + batch_audio_mask = torch.zeros((2, max_len), dtype=torch.bool, device=device) + batch_audio_mask[0, cond_audio_start:cond_len] = True + batch_audio_mask[1, :uncond_len] = True + + # Attention mask: [2, 1, S, S] + batch_attention_mask = torch.zeros((2, 1, max_len, max_len), dtype=torch.bool, device=device) + batch_attention_mask[0, :, :cond_len, :cond_len] = True + batch_attention_mask[1, :, :uncond_len, :uncond_len] = True + + # Run iterative generation + tokens = self.generator( + input_ids=batch_input_ids, + audio_mask=batch_audio_mask, + attention_mask=batch_attention_mask, + target_lens=[target_len], + num_step=self.config.num_step, + guidance_scale=self.config.guidance_scale, + t_shift=self.config.t_shift, + layer_penalty_factor=self.config.layer_penalty_factor, + position_temperature=self.config.position_temperature, + class_temperature=self.config.class_temperature, + ) # [1, 8, target_len] + + return OmniOutput( + text_hidden_states=None, + multimodal_outputs={"audio_tokens": tokens}, + ) + + def 
_forward_decoder(self, input_ids: torch.Tensor, kwargs: dict) -> OmniOutput: + """Run decoder stage: 8-codebook tokens → audio waveform.""" + runtime_info = kwargs.get("runtime_additional_information", []) + + if not runtime_info: + # Profiling / dummy run — return plain tensor for v1 runner compat + return torch.zeros( + (input_ids.shape[0], self.config.llm_hidden_size), + device=input_ids.device, + dtype=torch.float32, + ) + + info = runtime_info[0] + audio_tokens = info.get("audio_tokens", None) + + if audio_tokens is None: + raise RuntimeError("No audio_tokens received from generator stage") + + if isinstance(audio_tokens, np.ndarray): + audio_tokens = torch.from_numpy(audio_tokens) + + # audio_tokens: [B, 8, T]; buffer may be CPU — move to decoder weights + if audio_tokens.dim() == 2: + audio_tokens = audio_tokens.unsqueeze(0) # Add batch dim + + dec_device = next(self.decoder.parameters()).device + audio_tokens = audio_tokens.to(device=dec_device, dtype=torch.long) + + tts_speech = self.decoder(audio_tokens) + + return OmniOutput( + text_hidden_states=None, + multimodal_outputs={ + "audio": tts_speech, + "sr": self.config.sample_rate, + }, + ) + + def _resolve_model_dir(self) -> str: + """Resolve model directory to local path (handles HF hub IDs).""" + model_dir = self.model_dir + if os.path.isdir(model_dir): + return model_dir + # HF hub model ID — resolve to local cache + from huggingface_hub import snapshot_download + + return snapshot_download(model_dir) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + try: + device = next(self.parameters()).device + except StopIteration: + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + model_dir = self._resolve_model_dir() + + if self.model_stage == "omnivoice_generator": + self.generator.load_weights(model_dir, device) + elif self.model_stage == "omnivoice_decoder": + self.decoder.load_weights(model_dir, device) + else: + raise ValueError(f"{self.model_stage} not supported!") diff --git a/vllm_omni/model_executor/models/omnivoice/omnivoice_decoder.py b/vllm_omni/model_executor/models/omnivoice/omnivoice_decoder.py new file mode 100644 index 0000000000..cf69f26587 --- /dev/null +++ b/vllm_omni/model_executor/models/omnivoice/omnivoice_decoder.py @@ -0,0 +1,211 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +OmniVoice Decoder (Stage 1) - Audio token to waveform conversion. + +Implements the HiggsAudioV2 decode path using transformers' DacModel decoder +and a custom RVQ quantizer, compatible with transformers 4.x. 
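+
+At 25 token frames per second and 24 kHz output, each frame decodes to 960 samples;
+for example, 250 frames -> 240,000 samples -> 10 s of audio.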
+ +Decode path: + audio_codes [B, 8, T] + → RVQ codebook lookup + project_out → sum → [B, 1024, T] + → fc2 Linear(1024, 256) → [B, 256, T] + → DAC acoustic decoder (conv transpose upsampling) → [B, 1, T*960] + → 24kHz waveform (25fps × 960 samples/frame) +""" + +from __future__ import annotations + +import json +import os + +import torch +import torch.nn as nn +from vllm.logger import init_logger + +from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig + +logger = init_logger(__name__) + + +class HiggsAudioVQLayer(nn.Module): + """Single VQ layer: codebook lookup + project_out.""" + + def __init__(self, codebook_size: int = 1024, codebook_dim: int = 64, hidden_size: int = 1024): + super().__init__() + self.codebook = nn.Embedding(codebook_size, codebook_dim) + self.project_out = nn.Linear(codebook_dim, hidden_size) + + def decode(self, indices: torch.Tensor) -> torch.Tensor: + """indices: [B, T] → [B, hidden_size, T]""" + quantized = self.codebook(indices) # [B, T, codebook_dim] + quantized = self.project_out(quantized) # [B, T, hidden_size] + return quantized.permute(0, 2, 1) # [B, hidden_size, T] + + +class HiggsAudioRVQ(nn.Module): + """Residual Vector Quantizer with 8 codebook layers.""" + + def __init__( + self, num_quantizers: int = 8, codebook_size: int = 1024, codebook_dim: int = 64, hidden_size: int = 1024 + ): + super().__init__() + self.quantizers = nn.ModuleList( + [HiggsAudioVQLayer(codebook_size, codebook_dim, hidden_size) for _ in range(num_quantizers)] + ) + + def decode(self, codes: torch.Tensor) -> torch.Tensor: + """codes: [num_quantizers, B, T] → [B, hidden_size, T]""" + result = torch.zeros( + codes.shape[1], + self.quantizers[0].project_out.out_features, + codes.shape[2], + device=codes.device, + dtype=torch.float32, + ) + for i, quantizer in enumerate(self.quantizers): + result = result + quantizer.decode(codes[i]) + return result + + +class OmniVoiceDecoder(nn.Module): + """OmniVoice Stage 1: Token-to-audio decoder. + + Uses DAC acoustic decoder from transformers + custom HiggsAudio RVQ + quantizer to convert 8-codebook tokens into 24kHz waveform. + """ + + def __init__(self, config: OmniVoiceConfig): + super().__init__() + self.config = config + self.sample_rate = config.sample_rate + self._loaded = False + + # These are populated by load_weights + self.quantizer = None + self.fc2 = None + self.acoustic_decoder = None + + @torch.inference_mode() + def forward(self, audio_codes: torch.Tensor) -> torch.Tensor: + """Decode audio tokens to waveform. + + Args: + audio_codes: [B, 8, T] - 8-codebook audio token IDs + + Returns: + waveform: [B, 1, audio_samples] at 24kHz + """ + if not self._loaded: + raise RuntimeError("Decoder not loaded. 
Call load_weights() first.") + + device = audio_codes.device + + # Transpose: [B, 8, T] → [8, B, T] + codes = audio_codes.transpose(0, 1).long() + + # RVQ decode: sum codebook embeddings → [B, 1024, T] + quantized = self.quantizer.decode(codes) + + # Project: [B, 1024, T] → fc2 → [B, 256, T] + quantized = self.fc2(quantized.transpose(1, 2)).transpose(1, 2) + + # Acoustic decoder: [B, 256, T] → [B, 1, T*960] + audio = self.acoustic_decoder(quantized) + + # Ensure [B, 1, samples] + if audio.dim() == 2: + audio = audio.unsqueeze(1) + + return audio.to(device) + + def _adjust_output_padding(self, decoder: nn.Module): + """Adjust ConvTranspose1d output_padding (HiggsAudioV2 modification).""" + for module in decoder.modules(): + if isinstance(module, nn.ConvTranspose1d): + stride = module.stride[0] if isinstance(module.stride, tuple) else module.stride + module.output_padding = (stride % 2,) + + def load_weights(self, model_dir: str, device: torch.device) -> None: + """Load decoder components from audio_tokenizer/model.safetensors.""" + from safetensors.torch import load_file + from transformers import DacConfig, DacModel + + audio_tokenizer_path = os.path.join(model_dir, "audio_tokenizer") + config_path = os.path.join(audio_tokenizer_path, "config.json") + weights_path = os.path.join(audio_tokenizer_path, "model.safetensors") + + if not os.path.exists(weights_path): + raise FileNotFoundError(f"Audio tokenizer weights not found at {weights_path}") + + with open(config_path) as f: + tokenizer_config = json.load(f) + + state_dict = load_file(weights_path, device=str(device)) + + # 1. Build RVQ quantizer + codebook_dim = tokenizer_config.get("codebook_dim", 64) + codebook_size = tokenizer_config.get("codebook_size", 1024) + # Hidden size = quantizer project_out output dim + hidden_size = state_dict["quantizer.quantizers.0.project_out.weight"].shape[0] + num_quantizers = sum( + 1 for k in state_dict if k.startswith("quantizer.quantizers.") and k.endswith(".codebook.embed") + ) + + self.quantizer = HiggsAudioRVQ( + num_quantizers=num_quantizers, + codebook_size=codebook_size, + codebook_dim=codebook_dim, + hidden_size=hidden_size, + ).to(device) + + # Load quantizer weights + for i in range(num_quantizers): + prefix = f"quantizer.quantizers.{i}" + embed_key = f"{prefix}.codebook.embed" + if embed_key in state_dict: + self.quantizer.quantizers[i].codebook.weight.data.copy_(state_dict[embed_key]) + proj_out_w = f"{prefix}.project_out.weight" + proj_out_b = f"{prefix}.project_out.bias" + if proj_out_w in state_dict: + self.quantizer.quantizers[i].project_out.weight.data.copy_(state_dict[proj_out_w]) + if proj_out_b in state_dict: + self.quantizer.quantizers[i].project_out.bias.data.copy_(state_dict[proj_out_b]) + + # 2. Build fc2 projection + fc2_w = state_dict["fc2.weight"] + fc2_b = state_dict["fc2.bias"] + self.fc2 = nn.Linear(fc2_w.shape[1], fc2_w.shape[0]).to(device) + self.fc2.weight.data.copy_(fc2_w) + self.fc2.bias.data.copy_(fc2_b) + + # 3. 
Build DAC acoustic decoder + dac_cfg = DacConfig(**tokenizer_config["acoustic_model_config"]) + dac_model = DacModel(dac_cfg) + self.acoustic_decoder = dac_model.decoder.to(device) + + # Load acoustic decoder weights + loaded = 0 + for name, param in self.acoustic_decoder.named_parameters(): + higgs_name = f"acoustic_decoder.{name}" + if higgs_name in state_dict: + param.data.copy_(state_dict[higgs_name]) + loaded += 1 + + # Apply HiggsAudioV2 output padding adjustment + self._adjust_output_padding(self.acoustic_decoder) + + # Remove tanh if present (HiggsAudioV2 uses Identity instead) + if hasattr(self.acoustic_decoder, "tanh"): + self.acoustic_decoder.tanh = nn.Identity() + + self.acoustic_decoder.eval() + self._loaded = True + + logger.info( + "Loaded OmniVoice decoder: %d quantizers, fc2(%d→%d), acoustic decoder (%d weights)", + num_quantizers, + fc2_w.shape[1], + fc2_w.shape[0], + loaded, + ) diff --git a/vllm_omni/model_executor/models/omnivoice/omnivoice_generator.py b/vllm_omni/model_executor/models/omnivoice/omnivoice_generator.py new file mode 100644 index 0000000000..32fe422721 --- /dev/null +++ b/vllm_omni/model_executor/models/omnivoice/omnivoice_generator.py @@ -0,0 +1,588 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +OmniVoice Generator (Stage 0) - Iterative unmasking with Qwen3 backbone. + +Generates 8-codebook audio tokens from text via 32-step non-autoregressive +iterative masked prediction with classifier-free guidance. + +Uses vLLM-Omni's DiffusionAttention for optimized full (bidirectional) attention +via FlashAttention/SageAttention/SDPA backends. +""" + +from __future__ import annotations + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from vllm.logger import init_logger + +from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig + +logger = init_logger(__name__) + + +# --------------------------------------------------------------------------- +# Unmasking schedule helpers +# --------------------------------------------------------------------------- + + +def _get_time_steps( + t_start: float, + t_end: float, + num_step: int, + t_shift: float, +) -> torch.Tensor: + """Compute the unmasking schedule with time shift. + + Returns cumulative proportions [0, ..., 1] of length num_step. 
+ Formula: r_n = t_shift * (n/N) / (1 + (t_shift - 1) * (n/N)) + """ + steps = torch.linspace(t_start, t_end, num_step) + shifted = t_shift * steps / (1.0 + (t_shift - 1.0) * steps) + return shifted + + +def _gumbel_sample(logits: torch.Tensor, temperature: float) -> torch.Tensor: + """Add Gumbel noise for stochastic position selection.""" + noise = -torch.log(-torch.log(torch.rand_like(logits).clamp(min=1e-8))) + return logits / max(temperature, 1e-8) + noise + + +# --------------------------------------------------------------------------- +# Qwen3-style transformer blocks using DiffusionAttention +# --------------------------------------------------------------------------- + + +class OmniVoiceRMSNorm(nn.Module): + def __init__(self, hidden_size: int, eps: float = 1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.eps = eps + + def forward(self, x: torch.Tensor) -> torch.Tensor: + variance = x.to(torch.float32).pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(variance + self.eps) + return self.weight * x.to(self.weight.dtype) + + +class OmniVoiceAttention(nn.Module): + """Qwen3-style GQA attention using DiffusionAttention backend.""" + + def __init__(self, config: OmniVoiceConfig): + super().__init__() + self.hidden_size = config.llm_hidden_size + self.num_heads = config.llm_num_attention_heads + self.num_kv_heads = config.llm_num_key_value_heads + self.head_dim = config.llm_head_dim + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + # Qwen3 uses per-head QK norm + self.q_norm = OmniVoiceRMSNorm(self.head_dim) + self.k_norm = OmniVoiceRMSNorm(self.head_dim) + + self.scale = 1.0 / math.sqrt(self.head_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor | None = None, + cos: torch.Tensor | None = None, + sin: torch.Tensor | None = None, + ) -> torch.Tensor: + batch_size, seq_len, _ = hidden_states.shape + + q = self.q_proj(hidden_states) + k = self.k_proj(hidden_states) + v = self.v_proj(hidden_states) + + q = q.view(batch_size, seq_len, self.num_heads, self.head_dim) + k = k.view(batch_size, seq_len, self.num_kv_heads, self.head_dim) + v = v.view(batch_size, seq_len, self.num_kv_heads, self.head_dim) + + # Per-head QK norm (Qwen3) + q = self.q_norm(q) + k = self.k_norm(k) + + # Apply RoPE + if cos is not None and sin is not None: + q = _apply_rotary_pos_emb(q, cos, sin) + k = _apply_rotary_pos_emb(k, cos, sin) + + # Expand KV heads for GQA (8 KV heads → 16 Q heads) + if self.num_kv_heads != self.num_heads: + repeat_factor = self.num_heads // self.num_kv_heads + k = k.repeat_interleave(repeat_factor, dim=2) + v = v.repeat_interleave(repeat_factor, dim=2) + + # Full bidirectional attention via SDPA with proper mask support + # Permute to (batch, heads, seq, head_dim) for SDPA + q = q.permute(0, 2, 1, 3) + k = k.permute(0, 2, 1, 3) + v = v.permute(0, 2, 1, 3) + + # Convert [B, 1, S, S] bool mask to float mask for SDPA + sdpa_mask = None + if attention_mask is not None: + sdpa_mask = attention_mask.to(dtype=q.dtype) + sdpa_mask = sdpa_mask.masked_fill(~attention_mask, float("-inf")) + sdpa_mask = sdpa_mask.masked_fill(attention_mask, 0.0) + + out = F.scaled_dot_product_attention( + q, + k, + v, + 
attn_mask=sdpa_mask, + scale=1.0 / math.sqrt(self.head_dim), + ) + + # Back to (batch, seq, heads * head_dim) + out = out.permute(0, 2, 1, 3).contiguous() + out = out.view(batch_size, seq_len, self.num_heads * self.head_dim) + return self.o_proj(out) + + +class OmniVoiceMLP(nn.Module): + """Qwen3-style MLP with SwiGLU.""" + + def __init__(self, config: OmniVoiceConfig): + super().__init__() + self.gate_proj = nn.Linear(config.llm_hidden_size, config.llm_intermediate_size, bias=False) + self.up_proj = nn.Linear(config.llm_hidden_size, config.llm_intermediate_size, bias=False) + self.down_proj = nn.Linear(config.llm_intermediate_size, config.llm_hidden_size, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x)) + + +class OmniVoiceTransformerBlock(nn.Module): + """Single Qwen3 transformer block with DiffusionAttention.""" + + def __init__(self, config: OmniVoiceConfig): + super().__init__() + self.input_layernorm = OmniVoiceRMSNorm(config.llm_hidden_size, eps=config.llm_rms_norm_eps) + self.self_attn = OmniVoiceAttention(config) + self.post_attention_layernorm = OmniVoiceRMSNorm(config.llm_hidden_size, eps=config.llm_rms_norm_eps) + self.mlp = OmniVoiceMLP(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor | None = None, + cos: torch.Tensor | None = None, + sin: torch.Tensor | None = None, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn(hidden_states, attention_mask=attention_mask, cos=cos, sin=sin) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +# --------------------------------------------------------------------------- +# RoPE +# --------------------------------------------------------------------------- + + +def _precompute_rope( + head_dim: int, + max_seq_len: int, + theta: float = 1000000.0, + device: torch.device | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + """Precompute RoPE cos/sin tensors.""" + inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2, device=device, dtype=torch.float32) / head_dim)) + t = torch.arange(max_seq_len, device=device, dtype=torch.float32) + freqs = torch.outer(t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + return cos, sin + + +def _apply_rotary_pos_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor: + """Apply rotary position embedding. x shape: (B, S, H, D).""" + seq_len = x.shape[1] + cos = cos[:seq_len].unsqueeze(0).unsqueeze(2) # (1, S, 1, D/2) + sin = sin[:seq_len].unsqueeze(0).unsqueeze(2) + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + rotated = torch.cat([-x2, x1], dim=-1) + return x * torch.cat([cos, cos], dim=-1) + rotated * torch.cat([sin, sin], dim=-1) + + +# --------------------------------------------------------------------------- +# Generator model +# --------------------------------------------------------------------------- + + +class OmniVoiceGenerator(nn.Module): + """OmniVoice Stage 0: Iterative unmasking generator. 
+ + Architecture: + - Text embedding (from Qwen3 vocab) + Audio embedding (8*1025 entries) + - 28-layer Qwen3 transformer with full bidirectional attention + - 8-codebook prediction head (single linear: hidden → 8*1025) + - 32-step iterative unmasking with classifier-free guidance + + Optimizations: + - DiffusionAttention (FlashAttn/SageAttn/SDPA auto-selected) + - TeaCache / Cache-DiT compatible (hook-based, non-intrusive) + - regionally_compile() compatible for torch.compile on repeated blocks + - Sequence parallelism via SP hooks for multi-GPU + """ + + # For regionally_compile() support + _repeated_blocks = ["layers"] + + def __init__(self, config: OmniVoiceConfig): + super().__init__() + self.config = config + + # Text embedding (shared with LLM) + self.text_embedding = nn.Embedding(config.llm_vocab_size, config.llm_hidden_size) + + # Audio embedding: 8 codebooks * 1025 tokens + self.audio_embeddings = nn.Embedding( + config.num_audio_codebook * config.audio_vocab_size, + config.llm_hidden_size, + ) + self.register_buffer( + "codebook_layer_offsets", + torch.arange(config.num_audio_codebook) * config.audio_vocab_size, + ) + + # Transformer layers + self.layers = nn.ModuleList([OmniVoiceTransformerBlock(config) for _ in range(config.llm_num_hidden_layers)]) + self.norm = OmniVoiceRMSNorm(config.llm_hidden_size, eps=config.llm_rms_norm_eps) + + # Prediction head: hidden → 8 * 1025 + self.audio_heads = nn.Linear( + config.llm_hidden_size, + config.num_audio_codebook * config.audio_vocab_size, + bias=False, + ) + + # Precompute RoPE + self._rope_cos = None + self._rope_sin = None + + def _ensure_rope(self, seq_len: int, device: torch.device) -> None: + """Lazily compute RoPE cos/sin if needed.""" + if self._rope_cos is None or self._rope_cos.shape[0] < seq_len: + max_len = max(seq_len, 4096) + self._rope_cos, self._rope_sin = _precompute_rope( + self.config.llm_head_dim, + max_len, + theta=self.config.llm_rope_theta, + device=device, + ) + + def _prepare_embeddings( + self, + input_ids: torch.Tensor, + audio_mask: torch.Tensor, + ) -> torch.Tensor: + """Prepare mixed text+audio embeddings. + + Args: + input_ids: [B, 8, S] - text tokens replicated across codebooks, + audio positions have per-codebook token IDs + audio_mask: [B, S] - True for audio positions, False for text + + Returns: + embeddings: [B, S, hidden_size] + """ + # Text embeddings from first codebook row (all rows identical for text) + text_embeds = self.text_embedding(input_ids[:, 0, :]) + + # Audio embeddings: offset per codebook, then sum across codebooks + shifted_ids = (input_ids * audio_mask.unsqueeze(1)) + self.codebook_layer_offsets.view(1, -1, 1) + audio_embeds = self.audio_embeddings(shifted_ids).sum(dim=1) + + # Merge: audio where audio_mask=True, text elsewhere + return torch.where(audio_mask.unsqueeze(-1), audio_embeds, text_embeds) + + def _transformer_forward( + self, + inputs_embeds: torch.Tensor, + attention_mask: torch.Tensor | None = None, + ) -> torch.Tensor: + """Run through transformer layers. 
+ + Args: + inputs_embeds: [B, S, hidden_size] + attention_mask: [B, 1, S, S] or None + + Returns: + hidden_states: [B, S, hidden_size] + """ + device = inputs_embeds.device + seq_len = inputs_embeds.shape[1] + self._ensure_rope(seq_len, device) + + hidden_states = inputs_embeds + cos = self._rope_cos.to(device=device, dtype=hidden_states.dtype) + sin = self._rope_sin.to(device=device, dtype=hidden_states.dtype) + + for layer in self.layers: + hidden_states = layer( + hidden_states, + attention_mask=attention_mask, + cos=cos, + sin=sin, + ) + + return self.norm(hidden_states) + + def _get_logits(self, hidden_states: torch.Tensor) -> torch.Tensor: + """Project hidden states to per-codebook logits. + + Args: + hidden_states: [B, S, hidden_size] + + Returns: + logits: [B, 8, S, 1025] + """ + batch_size, seq_len, _ = hidden_states.shape + logits_flat = self.audio_heads(hidden_states) # [B, S, 8*1025] + return logits_flat.view( + batch_size, + seq_len, + self.config.num_audio_codebook, + self.config.audio_vocab_size, + ).permute(0, 2, 1, 3) # [B, 8, S, 1025] + + @torch.inference_mode() + def forward( + self, + input_ids: torch.Tensor, + audio_mask: torch.Tensor, + attention_mask: torch.Tensor, + target_lens: list[int], + num_step: int = 32, + guidance_scale: float = 2.0, + t_shift: float = 0.1, + layer_penalty_factor: float = 5.0, + position_temperature: float = 5.0, + class_temperature: float = 0.0, + ) -> torch.Tensor: + """Run the full 32-step iterative unmasking generation. + + Args: + input_ids: [2*B, 8, S] - conditional (0:B) + unconditional (B:2B) + audio_mask: [2*B, S] - True for audio positions + attention_mask: [2*B, 1, S, S] - attention mask + target_lens: List of target audio lengths per batch item + num_step: Number of unmasking steps + guidance_scale: CFG scale + t_shift: Time shift for schedule + layer_penalty_factor: Penalty for later codebooks + position_temperature: Gumbel temperature for position selection + class_temperature: Temperature for token prediction (0=greedy) + + Returns: + tokens: [B, 8, max_target_len] - generated audio tokens + """ + B = len(target_lens) + device = input_ids.device + max_target_len = max(target_lens) + mask_id = self.config.audio_mask_id + num_codebooks = self.config.num_audio_codebook + + # Initialize all target tokens as [MASK] + tokens = torch.full( + (B, num_codebooks, max_target_len), + mask_id, + dtype=torch.long, + device=device, + ) + + # Compute unmasking schedule + timesteps = _get_time_steps(0.0, 1.0, num_step + 1, t_shift).tolist() + schedules = [] + for t_len in target_lens: + total_mask = t_len * num_codebooks + rem = total_mask + sched = [] + for step in range(num_step): + num = ( + rem + if step == num_step - 1 + else min( + math.ceil(total_mask * (timesteps[step + 1] - timesteps[step])), + rem, + ) + ) + sched.append(int(num)) + rem -= int(num) + schedules.append(sched) + + layer_ids = torch.arange(num_codebooks, device=device).view(1, -1, 1) + + # Compute c_lens for extracting target region from full sequence + c_lens = [] + for i in range(B): + # Conditional sequence length = number of non-padding positions + c_len = attention_mask[i, 0, 0].sum().item() + c_lens.append(int(c_len)) + + # Main iterative loop + for step in range(num_step): + # Prepare embeddings and run transformer + inputs_embeds = self._prepare_embeddings(input_ids, audio_mask) + hidden_states = self._transformer_forward(inputs_embeds, attention_mask) + batch_logits = self._get_logits(hidden_states).to(torch.float32) + # batch_logits: [2*B, 8, S, 1025] + + for 
i in range(B): + k = schedules[i][step] + if k <= 0: + continue + + c_len = c_lens[i] + t_len = target_lens[i] + + # Extract logits for target region + c_logits = batch_logits[i : i + 1, :, c_len - t_len : c_len, :] # [1, 8, T, 1025] + u_logits = batch_logits[B + i : B + i + 1, :, :t_len, :] # [1, 8, T, 1025] + + # Classifier-free guidance + if guidance_scale != 0: + c_log_probs = F.log_softmax(c_logits, dim=-1) + u_log_probs = F.log_softmax(u_logits, dim=-1) + log_probs = torch.log_softmax( + c_log_probs + guidance_scale * (c_log_probs - u_log_probs), + dim=-1, + ) + else: + log_probs = F.log_softmax(c_logits, dim=-1) + + # Prevent predicting [MASK] + log_probs[..., mask_id] = -float("inf") + + # Token prediction + if class_temperature > 0.0: + pred_tokens = _gumbel_sample(log_probs, class_temperature).argmax(dim=-1) + else: + pred_tokens = log_probs.argmax(dim=-1) # [1, 8, T] + + # Confidence scores + scores = log_probs.max(dim=-1)[0] # [1, 8, T] + + # Layer penalty (earlier codebooks get higher priority) + scores = scores - (layer_ids * layer_penalty_factor) + + # Gumbel noise for position selection + if position_temperature > 0.0: + scores = _gumbel_sample(scores, position_temperature) + + # Mask out already unmasked positions + sample_tokens = tokens[i : i + 1, :, :t_len] + scores.masked_fill_(sample_tokens != mask_id, -float("inf")) + + # Select top-k positions to unmask + _, topk_idx = torch.topk(scores.flatten(), k) + flat_tokens = sample_tokens.flatten().clone() + flat_tokens[topk_idx] = pred_tokens.flatten()[topk_idx] + sample_tokens.copy_(flat_tokens.view_as(sample_tokens)) + + # Update tokens and batch inputs for next iteration + tokens[i : i + 1, :, :t_len] = sample_tokens + input_ids = input_ids.clone() + input_ids[i, :, c_len - t_len : c_len] = sample_tokens.squeeze(0) + input_ids[B + i, :, :t_len] = sample_tokens.squeeze(0) + + return tokens + + def load_weights(self, model_dir: str, device: torch.device) -> None: + """Load weights from HuggingFace OmniVoice model.safetensors. + + The HF checkpoint contains: + - llm.* -> Qwen3 transformer weights + - audio_embeddings.* -> audio embedding table + - audio_heads.* -> prediction head + """ + import os + + from safetensors.torch import load_file + + weights_path = os.path.join(model_dir, "model.safetensors") + if not os.path.exists(weights_path): + raise FileNotFoundError(f"Model weights not found at {weights_path}") + + state_dict = load_file(weights_path, device=str(device)) + + # Map HF weight names to our module names + loaded_keys = set() + + # 1. Text embedding: llm.embed_tokens.weight -> text_embedding.weight + text_emb_key = "llm.embed_tokens.weight" + if text_emb_key in state_dict: + self.text_embedding.weight.data.copy_(state_dict[text_emb_key]) + loaded_keys.add(text_emb_key) + + # 2. Audio embeddings + for key in ["audio_embeddings.weight"]: + if key in state_dict: + self.audio_embeddings.weight.data.copy_(state_dict[key]) + loaded_keys.add(key) + + # 3. Audio heads + for key in ["audio_heads.weight"]: + if key in state_dict: + self.audio_heads.weight.data.copy_(state_dict[key]) + loaded_keys.add(key) + + # 4. 
Transformer layers: llm.layers.N.* -> layers.N.* + for key, value in state_dict.items(): + if key.startswith("llm.layers."): + # llm.layers.0.self_attn.q_proj.weight -> layers.0.self_attn.q_proj.weight + our_key = key.replace("llm.layers.", "layers.") + parts = our_key.split(".") + module = self + try: + for part in parts[:-1]: + if part.isdigit(): + module = module[int(part)] + else: + module = getattr(module, part) + param_name = parts[-1] + param = getattr(module, param_name) + if isinstance(param, nn.Parameter): + param.data.copy_(value) + elif isinstance(param, torch.Tensor): + param.copy_(value) + loaded_keys.add(key) + except (AttributeError, IndexError, KeyError) as e: + logger.warning("Failed to load weight %s: %s", key, e) + + # 5. Final norm: llm.norm.weight -> norm.weight + norm_key = "llm.norm.weight" + if norm_key in state_dict: + self.norm.weight.data.copy_(state_dict[norm_key]) + loaded_keys.add(norm_key) + + unloaded = set(state_dict.keys()) - loaded_keys + # Filter out audio_tokenizer weights (loaded in decoder stage) + unloaded = {k for k in unloaded if not k.startswith("audio_tokenizer.")} + if unloaded: + logger.info( + "Generator: %d/%d weights loaded, %d skipped (decoder weights)", + len(loaded_keys), + len(state_dict), + len(unloaded), + ) + else: + logger.info("Generator: all %d weights loaded", len(loaded_keys)) diff --git a/vllm_omni/model_executor/models/registry.py b/vllm_omni/model_executor/models/registry.py index b5ef92616d..1398923458 100644 --- a/vllm_omni/model_executor/models/registry.py +++ b/vllm_omni/model_executor/models/registry.py @@ -57,6 +57,11 @@ "cosyvoice3", "CosyVoice3Model", ), + "OmniVoiceModel": ( + "omnivoice", + "omnivoice", + "OmniVoiceModel", + ), "MammothModa2Qwen2ForCausalLM": ( "mammoth_moda2", "mammoth_moda2", diff --git a/vllm_omni/model_executor/stage_configs/omnivoice.yaml b/vllm_omni/model_executor/stage_configs/omnivoice.yaml new file mode 100644 index 0000000000..49f11e9674 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/omnivoice.yaml @@ -0,0 +1,20 @@ +# Stage config for OmniVoice TTS via diffusion engine +# Single stage: text → 32-step iterative unmasking → 8-codebook tokens → DAC decode → 24kHz audio + +stage_args: + - stage_id: 0 + stage_type: diffusion + is_comprehension: true + runtime: + devices: 0 + engine_args: + model_stage: dit + model_class_name: "OmniVoicePipeline" + gpu_memory_utilization: 0.5 + enforce_eager: true + trust_remote_code: true + engine_output_type: audio + distributed_executor_backend: "mp" + dtype: "float32" + final_output: true + final_output_type: audio diff --git a/vllm_omni/model_executor/stage_input_processors/omnivoice.py b/vllm_omni/model_executor/stage_input_processors/omnivoice.py new file mode 100644 index 0000000000..b7f5c102e4 --- /dev/null +++ b/vllm_omni/model_executor/stage_input_processors/omnivoice.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Inter-stage processor for OmniVoice: Generator → Decoder.""" + +from typing import Any + +from vllm.inputs import TextPrompt + +from vllm_omni.inputs.data import OmniTokensPrompt + + +def tokens2audio( + stage_list: list[Any], + engine_input_source: list[int], + prompt: OmniTokensPrompt | TextPrompt = None, + requires_multimodal_data: bool = True, +): + """Build stage-1 (decoder) inputs from stage-0 (generator) outputs. + + Takes the 8-codebook audio tokens from the generator and packages + them for the HiggsAudioV2 decoder. 
+ """ + source_stage_id = engine_input_source[0] + source_outputs = stage_list[source_stage_id].engine_outputs + + if not isinstance(prompt, list): + prompt = [prompt] + + source_output = source_outputs[0] + output = source_output.outputs[0] + + multi_modal_data = output.multimodal_output + if multi_modal_data is None: + raise RuntimeError(f"Missing multimodal_output for request {source_output.request_id}") + + # Pass audio_tokens from generator to decoder + engine_input = OmniTokensPrompt( + prompt_token_ids=output.token_ids, + additional_information=multi_modal_data, + ) + return [engine_input] From f50c5a413ff37b0314ce24a09a26b3d02e696a67 Mon Sep 17 00:00:00 2001 From: Juan Pablo Zuluaga <46724788+JuanPZuluaga@users.noreply.github.com> Date: Fri, 3 Apr 2026 22:51:58 +0200 Subject: [PATCH 040/204] [Qwen3TTS] [TTS] [Feat] Refactor voice cache manager (#2108) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: JuanPZuluaga Signed-off-by: yiliu30 Signed-off-by: gcanlin Signed-off-by: Binh Tang Signed-off-by: Binh Tang Signed-off-by: Didan Deng <33117903+wtomin@users.noreply.github.com> Signed-off-by: Rein Yang Signed-off-by: CHEN <116010019@link.cuhk.edu.cn> Signed-off-by: vraiti Signed-off-by: Songrui625 Signed-off-by: Lidang Jiang Signed-off-by: Lidang-Jiang Signed-off-by: Alex Brooks Co-authored-by: JuanPZuluaga Co-authored-by: Yi Liu Co-authored-by: Hongsheng Liu Co-authored-by: Canlin Guo Co-authored-by: Binh Tang Co-authored-by: Binh Tang Co-authored-by: Didan Deng <33117903+wtomin@users.noreply.github.com> Co-authored-by: rein yang <73573651+R2-Y@users.noreply.github.com> Co-authored-by: zhumingjue138 Co-authored-by: ChenWenjing <54166744+Shirley125@users.noreply.github.com> Co-authored-by: vraiti Co-authored-by: 汪志鹏 Co-authored-by: Sy03 <1370724210@qq.com> Co-authored-by: chickeyton Co-authored-by: Jerry Song <46962917+Songrui625@users.noreply.github.com> Co-authored-by: Lidang Jiang <119769478+Lidang-Jiang@users.noreply.github.com> Co-authored-by: Claude Opus 4.6 Co-authored-by: Alex Brooks Co-authored-by: linyueqian Co-authored-by: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> --- docs/serving/speech_api.md | 27 +- .../examples/online_serving/qwen3_tts.md | 34 ++- examples/online_serving/qwen3_tts/README.md | 10 +- .../openai_api/test_serving_speech.py | 78 ++++- tests/test_voice_cache.py | 129 +++++++++ vllm_omni/entrypoints/openai/api_server.py | 41 ++- .../entrypoints/openai/metadata_manager.py | 243 ---------------- .../entrypoints/openai/serving_speech.py | 98 +++---- .../models/qwen3_tts/qwen3_tts_talker.py | 36 +++ .../models/qwen3_tts/voice_cache_manager.py | 271 ------------------ vllm_omni/utils/voice_cache.py | 89 ++++++ 11 files changed, 442 insertions(+), 614 deletions(-) create mode 100644 tests/test_voice_cache.py delete mode 100644 vllm_omni/entrypoints/openai/metadata_manager.py delete mode 100644 vllm_omni/model_executor/models/qwen3_tts/voice_cache_manager.py create mode 100644 vllm_omni/utils/voice_cache.py diff --git a/docs/serving/speech_api.md b/docs/serving/speech_api.md index e6ab77edda..ecbe8d9ac9 100644 --- a/docs/serving/speech_api.md +++ b/docs/serving/speech_api.md @@ -118,6 +118,7 @@ Content-Type: application/json | `instructions` | string | "" | Voice style/emotion instructions | | `max_new_tokens` | integer | 2048 | Maximum tokens to generate | | `initial_codec_chunk_frames` | integer | null | Per-request initial chunk size override for TTFA tuning. 
When null, IC is computed dynamically based on server load. | +| `stream` | bool | false | Stream raw PCM chunks as they are decoded (requires `response_format="pcm"`) | **Supported languages:** Auto, Chinese, English, Japanese, Korean, German, French, Russian, Portuguese, Spanish, Italian @@ -143,9 +144,23 @@ Lists available voices for the loaded model. ```json { - "voices": ["aiden", "dylan", "eric", "ono_anna", "ryan", "serena", "sohee", "uncle_fu", "vivian"] + "voices": ["aiden", "dylan", "eric", "ono_anna", "ryan", "serena", "sohee", "uncle_fu", "vivian", "custom_voice_1"], + "uploaded_voices": [ + { + "name": "custom_voice_1", + "consent": "user_consent_id", + "created_at": 1738660000, + "file_size": 1024000, + "mime_type": "audio/wav", + "ref_text": "The exact transcript of the audio sample.", + "speaker_description": "warm narrator" + } + ] } ``` + +`uploaded_voices` is always present (empty list when no custom voices have been uploaded). Fields `ref_text` and `speaker_description` are omitted per-entry when not provided at upload time. + ``` POST /v1/audio/voices Content-Type: multipart/form-data @@ -161,6 +176,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests. | `consent` | string | Yes | Consent recording ID | | `name` | string | Yes | Name for the new voice | | `ref_text` | string | No | Transcript of the audio. When provided, enables in-context voice cloning (higher quality). Without it, only the speaker embedding is extracted. | +| `speaker_description` | string | No | Free-form description of the voice (e.g. "warm narrator", "energetic presenter"). Stored as metadata and returned in `GET /v1/audio/voices`. | **Response Example:** @@ -172,11 +188,15 @@ Upload a new voice sample for voice cloning in Base task TTS requests. "consent": "user_consent_id", "created_at": 1738660000, "mime_type": "audio/wav", - "file_size": 1024000 + "file_size": 1024000, + "ref_text": "The exact transcript of the audio sample.", + "speaker_description": "warm narrator" } } ``` +Fields `ref_text` and `speaker_description` are omitted when not provided at upload time. + **Usage Example:** ```bash @@ -184,7 +204,8 @@ curl -X POST http://localhost:8091/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ -F "consent=user_consent_id" \ -F "name=custom_voice_1" \ - -F "ref_text=The exact transcript of the audio sample." + -F "ref_text=The exact transcript of the audio sample." 
\ + -F "speaker_description=warm narrator" ``` ## Streaming Text Input (WebSocket) diff --git a/docs/user_guide/examples/online_serving/qwen3_tts.md b/docs/user_guide/examples/online_serving/qwen3_tts.md index 401a5c2e94..156c4942cd 100644 --- a/docs/user_guide/examples/online_serving/qwen3_tts.md +++ b/docs/user_guide/examples/online_serving/qwen3_tts.md @@ -159,7 +159,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \ -H "Content-Type: application/json" \ -d '{ "input": "Hello, how are you?", - "speaker": "vivian", + "voice": "vivian", "language": "English" }' --output output.wav @@ -168,7 +168,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \ -H "Content-Type: application/json" \ -d '{ "input": "I am so excited!", - "speaker": "vivian", + "voice": "vivian", "instructions": "Speak with great enthusiasm" }' --output excited.wav @@ -185,7 +185,7 @@ client = OpenAI(base_url="http://localhost:8091/v1", api_key="none") response = client.audio.speech.create( model="Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", - speaker="vivian", + voice="vivian", input="Hello, how are you?", ) @@ -201,7 +201,7 @@ response = httpx.post( "http://localhost:8091/v1/audio/speech", json={ "input": "Hello, how are you?", - "speaker": "vivian", + "voice": "vivian", "language": "English", }, timeout=300.0, @@ -237,12 +237,16 @@ List all available voices/speakers from the loaded model, including both built-i "consent": "user_consent_id", "created_at": 1738660000, "file_size": 1024000, - "mime_type": "audio/wav" + "mime_type": "audio/wav", + "ref_text": "The exact transcript of the audio sample.", + "speaker_description": "warm narrator" } ] } ``` +Fields `ref_text` and `speaker_description` are omitted per-entry when not provided at upload time. + #### POST /v1/audio/voices Upload a new voice sample for voice cloning in Base task TTS requests. @@ -252,6 +256,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests. - `consent` (required): Consent recording ID - `name` (required): Name for the new voice - `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality). +- `speaker_description` (optional): Free-form description of the voice (e.g. "warm narrator", "energetic presenter"). Stored as metadata. **Response Example:** ```json @@ -262,18 +267,23 @@ Upload a new voice sample for voice cloning in Base task TTS requests. "consent": "user_consent_id", "created_at": 1738660000, "mime_type": "audio/wav", - "file_size": 1024000 + "file_size": 1024000, + "ref_text": "The exact transcript of the audio sample.", + "speaker_description": "warm narrator" } } ``` +Fields `ref_text` and `speaker_description` are omitted when not provided at upload time. + **Usage Example:** ```bash -curl -X POST http://localhost:8000/v1/audio/voices \ +curl -X POST http://localhost:8091/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ -F "consent=user_consent_id" \ -F "name=custom_voice_1" \ - -F "ref_text=The exact transcript of the audio sample." + -F "ref_text=The exact transcript of the audio sample." 
\ + -F "speaker_description=warm narrator" ``` ### Endpoint @@ -290,7 +300,7 @@ This endpoint follows the [OpenAI Audio Speech API](https://platform.openai.com/ ```json { "input": "Text to synthesize", - "speaker": "vivian", + "voice": "vivian", "response_format": "wav", "task_type": "CustomVoice", "language": "Auto", @@ -310,7 +320,7 @@ Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/w ### Voice and language (summary) -- **Speaker**: Use the `speaker` request field to select the speaker (e.g., `vivian`, `ryan`, `aiden`). List available speakers with `GET /v1/audio/voices`. +- **Speaker**: Use the `voice` request field to select the speaker (e.g., `vivian`, `ryan`, `aiden`). List available speakers with `GET /v1/audio/voices`. - **Language**: Use the `language` field for the codec language tag (`Auto`, `Chinese`, `English`, etc.). Default is `Auto` for automatic detection. - **CustomVoice**: Requires a valid `voice` from the model’s speaker set. **VoiceDesign**: Use `instructions` to describe the voice. **Base**: Use `ref_audio` and `ref_text` for voice cloning. @@ -322,7 +332,7 @@ Returns binary audio data with appropriate `Content-Type` header (e.g., `audio/w | ----------------- | ------ | -------------- | ----------------------------------------------------------- | | `input` | string | **required** | Text to synthesize | | `model` | string | server's model | Model to use (optional, should match server if specified) | -| `speaker` | string | "vivian" | Speaker name (e.g., vivian, ryan, aiden) | +| `voice` | string | "vivian" | Speaker name (e.g., vivian, ryan, aiden) | | `response_format` | string | "wav" | Audio format: wav, mp3, flac, pcm, aac, opus | | `speed` | float | 1.0 | Playback speed (0.25-4.0, not supported with `stream=true`) | @@ -357,7 +367,7 @@ curl -X POST http://localhost:8091/v1/audio/speech \ -H "Content-Type: application/json" \ -d '{ "input": "Hello, how are you?", - "speaker": "vivian", + "voice": "vivian", "language": "English", "stream": true, "response_format": "pcm" diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md index 1b51e00f12..5504b5737a 100644 --- a/examples/online_serving/qwen3_tts/README.md +++ b/examples/online_serving/qwen3_tts/README.md @@ -233,6 +233,7 @@ Upload a new voice sample for voice cloning in Base task TTS requests. - `consent` (required): Consent recording ID - `name` (required): Name for the new voice - `ref_text` (optional): Transcript of the audio. Enables in-context voice cloning (higher quality). +- `speaker_description` (optional): Free-form description of the voice (e.g. "warm narrator", "energetic presenter"). **Response Example:** ```json @@ -243,18 +244,23 @@ Upload a new voice sample for voice cloning in Base task TTS requests. "consent": "user_consent_id", "created_at": 1738660000, "mime_type": "audio/wav", - "file_size": 1024000 + "file_size": 1024000, + "ref_text": "The exact transcript of the audio sample.", + "speaker_description": "warm narrator" } } ``` +Fields `ref_text` and `speaker_description` are omitted when not provided at upload time. + **Usage Example:** ```bash curl -X POST http://localhost:8000/v1/audio/voices \ -F "audio_sample=@/path/to/voice_sample.wav" \ -F "consent=user_consent_id" \ -F "name=custom_voice_1" \ - -F "ref_text=The exact transcript of the audio sample." + -F "ref_text=The exact transcript of the audio sample." 
\ + -F "speaker_description=warm narrator" ``` ### Endpoint diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 969df5bce0..17203cb577 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -233,17 +233,20 @@ async def list_voices(): uploaded_voices = [] if hasattr(speech_server, "uploaded_speakers"): for voice_name, info in speech_server.uploaded_speakers.items(): - uploaded_voices.append( - { - "name": info.get("name", voice_name), - "consent": info.get("consent", ""), - "created_at": info.get("created_at", 0), - "file_size": info.get("file_size", 0), - "mime_type": info.get("mime_type", ""), - "embedding_source": info.get("embedding_source", "audio"), - "embedding_dim": info.get("embedding_dim"), - } - ) + voice_entry = { + "name": info.get("name", voice_name), + "consent": info.get("consent", ""), + "created_at": info.get("created_at", 0), + "file_size": info.get("file_size", 0), + "mime_type": info.get("mime_type", ""), + "embedding_source": info.get("embedding_source", "audio"), + "embedding_dim": info.get("embedding_dim"), + } + if info.get("ref_text"): + voice_entry["ref_text"] = info["ref_text"] + if info.get("speaker_description"): + voice_entry["speaker_description"] = info["speaker_description"] + uploaded_voices.append(voice_entry) return {"voices": speakers, "uploaded_voices": uploaded_voices} app.add_api_route("/v1/audio/voices", list_voices, methods=["GET"]) @@ -255,7 +258,8 @@ async def upload_voice( speaker_embedding: str | None = Form(None), consent: str = Form(...), name: str = Form(...), - ref_text: str = Form(None), + ref_text: str | None = Form(None), + speaker_description: str | None = Form(None), ): try: if speaker_embedding is not None and audio_sample is not None: @@ -263,7 +267,13 @@ async def upload_voice( if speaker_embedding is not None: result = await speech_server.upload_voice_embedding(speaker_embedding, consent, name) elif audio_sample is not None: - result = await speech_server.upload_voice(audio_sample, consent, name, ref_text=ref_text) + result = await speech_server.upload_voice( + audio_sample, + consent, + name, + ref_text=ref_text, + speaker_description=speaker_description, + ) else: raise ValueError("Either 'audio_sample' or 'speaker_embedding' must be provided") return {"success": True, "voice": result} @@ -397,6 +407,44 @@ def test_upload_voice_with_ref_text(self, client, tmp_path): assert result["voice"].get("ref_text") == "Hello world transcript" response = client.delete("/v1/audio/voices/test_voice_rt") + def test_upload_voice_with_speaker_description(self, client, tmp_path): + """Test voice upload with speaker_description stores and returns the description.""" + # Pre-cleanup in case a previous test run left this voice behind + client.delete("/v1/audio/voices/test_voice_vd") + + audio_content = b"fake audio content" * 1000 + files = {"audio_sample": ("test.wav", audio_content, "audio/wav")} + data = {"consent": "c1", "name": "test_voice_vd", "speaker_description": " warm, energetic narrator "} + + response = client.post("/v1/audio/voices", files=files, data=data) + try: + assert response.status_code == 200 + result = response.json() + assert result["success"] is True + assert result["voice"]["name"] == "test_voice_vd" + assert result["voice"].get("speaker_description") == "warm, energetic narrator" + finally: + client.delete("/v1/audio/voices/test_voice_vd") + + def 
test_upload_voice_speaker_description_in_listing(self, client): + """Test that speaker_description survives the upload → list round-trip.""" + client.delete("/v1/audio/voices/test_voice_sd_list") + + audio_content = b"fake audio content" * 1000 + files = {"audio_sample": ("test.wav", audio_content, "audio/wav")} + data = {"consent": "c1", "name": "test_voice_sd_list", "speaker_description": "calm female narrator"} + + response = client.post("/v1/audio/voices", files=files, data=data) + try: + assert response.status_code == 200 + + listing = client.get("/v1/audio/voices").json() + uploaded = {v["name"]: v for v in listing["uploaded_voices"]} + assert "test_voice_sd_list" in uploaded + assert uploaded["test_voice_sd_list"]["speaker_description"] == "calm female narrator" + finally: + client.delete("/v1/audio/voices/test_voice_sd_list") + def test_upload_voice_file_too_large(self, client): """Test voice upload with file exceeding size limit.""" # Create a file larger than 10MB @@ -850,6 +898,7 @@ def test_build_tts_params_with_uploaded_voice(self, speech_server): "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav", "mime_type": "audio/wav", "ref_text": None, + "created_at": 1711234567.89, } } speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"} @@ -862,6 +911,7 @@ def test_build_tts_params_with_uploaded_voice(self, speech_server): assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] assert params["x_vector_only_mode"] == [True] assert params["task_type"] == ["Base"] + assert params["voice_created_at"] == [1711234567.89] assert "ref_text" not in params def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server): @@ -872,6 +922,7 @@ def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server): "file_path": "/tmp/voice_samples/custom_voice_consent_123.wav", "mime_type": "audio/wav", "ref_text": "Hello world transcript", + "created_at": 1711234567.89, } } speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"} @@ -885,6 +936,7 @@ def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server): assert params["x_vector_only_mode"] == [False] assert params["task_type"] == ["Base"] assert params["ref_text"] == ["Hello world transcript"] + assert params["voice_created_at"] == [1711234567.89] def test_build_tts_params_without_uploaded_voice(self, speech_server): """Test _build_tts_params does not auto-set ref_audio for non-uploaded voices.""" diff --git a/tests/test_voice_cache.py b/tests/test_voice_cache.py new file mode 100644 index 0000000000..69327aae57 --- /dev/null +++ b/tests/test_voice_cache.py @@ -0,0 +1,129 @@ +import threading + +import pytest + +from vllm_omni.utils.voice_cache import VoiceEmbeddingCache + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +@pytest.fixture +def cache(): + return VoiceEmbeddingCache(max_entries=4) + + +class TestVoiceEmbeddingCache: + def test_miss_returns_none(self, cache: VoiceEmbeddingCache): + assert cache.get("nonexistent") is None + assert cache.stats()["misses"] == 1 + + def test_put_and_hit(self, cache: VoiceEmbeddingCache): + cache.put("abc", {"val": 42}) + result = cache.get("abc") + assert result is not None + assert result["val"] == 42 + assert cache.stats()["hits"] == 1 + + def test_lru_eviction(self, cache: VoiceEmbeddingCache): + for i in range(5): + cache.put(f"key{i}", {"i": i}) + # key0 should have been evicted (oldest, max_entries=4) + assert cache.get("key0") is None + # key1..key4 should still be present + for i in range(1, 5): + 
assert cache.get(f"key{i}") is not None + assert cache.stats()["entries"] == 4 + + def test_lru_access_promotes(self, cache: VoiceEmbeddingCache): + cache.put("a", {"v": 1}) + cache.put("b", {"v": 2}) + cache.put("c", {"v": 3}) + cache.put("d", {"v": 4}) + # Access "a" to promote it to MRU + cache.get("a") + # Insert "e" -- should evict "b" (now the oldest), not "a" + cache.put("e", {"v": 5}) + assert cache.get("a") is not None + assert cache.get("b") is None + + def test_put_overwrites(self, cache: VoiceEmbeddingCache): + cache.put("k", {"old": True}) + cache.put("k", {"new": True}) + result = cache.get("k") + assert result is not None + assert "new" in result + assert "old" not in result + assert cache.stats()["entries"] == 1 + + def test_make_cache_key_includes_mode(self): + k1 = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=True) + k2 = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False) + assert k1 != k2 + assert "xvec" in k1 + assert "icl" in k2 + + def test_make_cache_key_deterministic(self): + k1 = VoiceEmbeddingCache.make_cache_key("bob", xvec_only=True) + k2 = VoiceEmbeddingCache.make_cache_key("bob", xvec_only=True) + assert k1 == k2 + + def test_make_cache_key_created_at_isolation(self): + """Different created_at timestamps must produce different keys (stale-cache protection).""" + k1 = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False, created_at=1000.0) + k2 = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False, created_at=2000.0) + assert k1 != k2 + + def test_stale_cache_protection(self, cache: VoiceEmbeddingCache): + """Re-upload (new created_at) must produce a cache miss, not a stale hit.""" + key_old = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False, created_at=1000.0) + key_new = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False, created_at=2000.0) + cache.put(key_old, {"ref_spk_embedding": "old_emb"}) + # Re-upload produces a new created_at → different key → cold miss + assert cache.get(key_new) is None + # Old key still in cache (not yet evicted) + assert cache.get(key_old) is not None + + def test_cache_mode_isolation(self, cache: VoiceEmbeddingCache): + """xvec entry must NOT be served for an icl request (same voice).""" + key_xvec = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=True) + key_icl = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False) + cache.put(key_xvec, {"ref_code": None, "ref_spk_embedding": "emb"}) + # icl request should miss — different key + assert cache.get(key_icl) is None + # xvec request should hit + assert cache.get(key_xvec) is not None + + def test_stats_counters(self, cache: VoiceEmbeddingCache): + cache.put("x", {"v": 1}) + cache.get("x") # hit + cache.get("x") # hit + cache.get("y") # miss + s = cache.stats() + assert s["hits"] == 2 + assert s["misses"] == 1 + assert s["entries"] == 1 + assert s["max_entries"] == 4 + + def test_thread_safety(self): + cache = VoiceEmbeddingCache(max_entries=32) + errors = [] + + def worker(thread_id: int): + try: + for i in range(50): + key = f"t{thread_id}_k{i}" + cache.put(key, {"tid": thread_id, "i": i}) + cache.get(key) + cache.get(f"t{(thread_id + 1) % 10}_k{i}") + except Exception as e: + errors.append(e) + + threads = [threading.Thread(target=worker, args=(t,)) for t in range(10)] + for t in threads: + t.start() + for t in threads: + t.join() + + assert not errors, f"Thread safety errors: {errors}" + s = cache.stats() + assert s["entries"] <= 32 diff --git a/vllm_omni/entrypoints/openai/api_server.py 
b/vllm_omni/entrypoints/openai/api_server.py index acf45b4fe6..4a7b097b2f 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1031,17 +1031,20 @@ async def list_voices(raw_request: Request): uploaded_speakers = [] if hasattr(handler, "uploaded_speakers"): for voice_name, info in handler.uploaded_speakers.items(): - uploaded_speakers.append( - { - "name": info.get("name", voice_name), - "consent": info.get("consent", ""), - "created_at": info.get("created_at", 0), - "file_size": info.get("file_size", 0), - "mime_type": info.get("mime_type", ""), - "embedding_source": info.get("embedding_source", "audio"), - "embedding_dim": info.get("embedding_dim"), - } - ) + voice_entry = { + "name": info.get("name", voice_name), + "consent": info.get("consent", ""), + "created_at": info.get("created_at", 0), + "file_size": info.get("file_size", 0), + "mime_type": info.get("mime_type", ""), + "embedding_source": info.get("embedding_source", "audio"), + "embedding_dim": info.get("embedding_dim"), + } + if info.get("ref_text"): + voice_entry["ref_text"] = info["ref_text"] + if info.get("speaker_description"): + voice_entry["speaker_description"] = info["speaker_description"] + uploaded_speakers.append(voice_entry) return JSONResponse(content={"voices": speakers, "uploaded_voices": uploaded_speakers}) @@ -1060,7 +1063,8 @@ async def upload_voice( speaker_embedding: str | None = Form(None), consent: str = Form(...), name: str = Form(...), - ref_text: str = Form(None), + ref_text: str | None = Form(None), + speaker_description: str | None = Form(None), ): """Upload a new voice for voice cloning. @@ -1079,6 +1083,11 @@ async def upload_voice( speaker_embedding: JSON-encoded float list. Mutually exclusive with audio_sample. consent: Consent recording ID name: Name for the new voice + ref_text: Optional transcript of the audio for ICL (in-context + learning) mode. When provided, voice clone requests using this + voice will produce higher quality results. + speaker_description: Optional free-form description of the voice + (e.g. "warm speaker", "energetic narrator"). raw_request: Raw FastAPI request Returns: @@ -1096,7 +1105,13 @@ async def upload_voice( if speaker_embedding is not None: result = await handler.upload_voice_embedding(speaker_embedding, consent, name) elif audio_sample is not None: - result = await handler.upload_voice(audio_sample, consent, name, ref_text=ref_text) + result = await handler.upload_voice( + audio_sample, + consent, + name, + ref_text=ref_text, + speaker_description=speaker_description, + ) else: return base(raw_request).create_error_response( message="Either 'audio_sample' or 'speaker_embedding' must be provided" diff --git a/vllm_omni/entrypoints/openai/metadata_manager.py b/vllm_omni/entrypoints/openai/metadata_manager.py deleted file mode 100644 index 4077aa23bc..0000000000 --- a/vllm_omni/entrypoints/openai/metadata_manager.py +++ /dev/null @@ -1,243 +0,0 @@ -""" -Metadata manager for voice samples and cache information. - -Provides a unified interface for managing metadata.json with -concurrency safety and data consistency across multiple processes. -""" - -import fcntl -import json -import logging -import os -import threading -import time -from collections.abc import Callable -from pathlib import Path -from typing import Any - -logger = logging.getLogger(__name__) - - -class MetadataManager: - """ - Manages metadata for uploaded speakers and cache information. - - Features: - 1. Single source of truth for metadata - 2. 
Concurrency safety with threading locks - 3. Atomic read-modify-write operations - 4. Merge updates to preserve fields from different components - """ - - def __init__(self, metadata_file: Path): - """ - Initialize the metadata manager. - - Args: - metadata_file: Path to metadata.json file - """ - self.metadata_file = metadata_file - self._lock = threading.Lock() # For intra-process concurrency - self._metadata = self._load_from_disk() - - # Create lock file for cross-process synchronization - self.lock_file = metadata_file.with_suffix(".lock") - self.lock_file.parent.mkdir(parents=True, exist_ok=True) - - def _load_from_disk(self) -> dict[str, Any]: - """Load metadata from disk.""" - if not self.metadata_file.exists(): - return {"uploaded_speakers": {}} - - try: - with open(self.metadata_file) as f: - return json.load(f) - except Exception as e: - logger.error(f"Failed to load metadata from {self.metadata_file}: {e}") - return {"uploaded_speakers": {}} - - def _save_to_disk(self, metadata: dict[str, Any]) -> bool: - """Save metadata to disk.""" - try: - self.metadata_file.parent.mkdir(parents=True, exist_ok=True) - tmp = self.metadata_file.with_suffix(".tmp") - with open(tmp, "w") as f: - json.dump(metadata, f, indent=2) - tmp.replace(self.metadata_file) - return True - except Exception as e: - logger.error(f"Failed to save metadata to {self.metadata_file}: {e}") - return False - - # ================================ - # Core fix: single flock overwrites RMW - # ================================ - def _update_with_file_lock( - self, update_fn: Callable[[dict[str, Any]], dict[str, Any] | None] - ) -> dict[str, Any] | None: - lock_fd = os.open(self.lock_file, os.O_CREAT | os.O_RDWR) - try: - fcntl.flock(lock_fd, fcntl.LOCK_EX) - - metadata = self._load_from_disk() - result = update_fn(metadata) - if result is None: - return None - - if not self._save_to_disk(metadata): - return None - - self._metadata = metadata - return result - finally: - fcntl.flock(lock_fd, fcntl.LOCK_UN) - os.close(lock_fd) - - def get_uploaded_speakers(self) -> dict[str, dict[str, Any]]: - """Get all uploaded speakers.""" - # Read directly from disk to ensure getting the latest data - metadata = self._load_from_disk() - return metadata.get("uploaded_speakers", {}).copy() - - def get_speaker(self, speaker_key: str) -> dict[str, Any] | None: - """Get specific speaker information.""" - # Read directly from disk to ensure getting the latest data - metadata = self._load_from_disk() - speakers = metadata.get("uploaded_speakers", {}) - return speakers.get(speaker_key, {}).copy() if speaker_key in speakers else None - - def update_speaker(self, speaker_key: str, updates: dict[str, Any]) -> bool: - """ - Update speaker information with merge semantics. - - Uses file locking for cross-process atomic operations. - """ - with self._lock: - - def _update(metadata: dict[str, Any]): - speakers = metadata.setdefault("uploaded_speakers", {}) - entry = speakers.get(speaker_key, {}) - entry.update(updates) - speakers[speaker_key] = entry - return True - - return self._update_with_file_lock(_update) is not None - - def create_speaker(self, speaker_key: str, speaker_data: dict[str, Any]) -> bool: - """ - Create a new speaker entry. - - Uses file locking for cross-process atomic operations. 
- """ - with self._lock: - - def _create(metadata: dict[str, Any]): - speakers = metadata.setdefault("uploaded_speakers", {}) - if speaker_key in speakers: - logger.warning(f"Speaker {speaker_key} already exists") - return None - speakers[speaker_key] = speaker_data - return True - - return self._update_with_file_lock(_create) is not None - - def update_cache_info(self, speaker_key: str, cache_file_path: Path, status: str = "ready") -> bool: - """ - Update cache information for a speaker. - """ - updates = { - "cache_status": status, - "cache_file": str(cache_file_path), - "cache_generated_at": time.time(), - } - return self.update_speaker(speaker_key, updates) - - def delete_speaker(self, speaker_key: str) -> dict[str, Any] | None: - """ - Delete a speaker from metadata and clean up associated files. - - Uses file locking for cross-process atomic operations. - - Args: - speaker_key: Speaker name (lowercase) - base_dir: Base directory for file validation (optional) - - Returns: - dict: Deleted speaker information if successful, None if speaker doesn't exist or error - """ - with self._lock: - - def _delete(metadata: dict[str, Any]): - speakers = metadata.get("uploaded_speakers", {}) - if speaker_key not in speakers: - logger.warning(f"Speaker {speaker_key} not found in metadata") - return None - - speaker_info = speakers.pop(speaker_key) - - # Clean up associated files - deleted_files = self._cleanup_speaker_files(speaker_info) - if deleted_files: - logger.info(f"Deleted {len(deleted_files)} files for speaker {speaker_key}: {deleted_files}") - - return speaker_info - - return self._update_with_file_lock(_delete) - - def _cleanup_speaker_files(self, speaker_info: dict[str, Any]) -> list[str]: - """ - Clean up files associated with a speaker. - - Args: - speaker_info: Speaker information dictionary - base_dir: Base directory for file validation (optional) - - Returns: - list: List of successfully deleted file paths - """ - deleted_files = [] - - # Helper function to safely delete a file - def safe_delete(file_path_str: str, description: str) -> bool: - if not file_path_str: - return False - - try: - file_path = Path(file_path_str) - - # Check if file exists - if not file_path.exists(): - logger.debug(f"{description} not found: {file_path}") - return False - - # Delete the file - file_path.unlink() - logger.info(f"Deleted {description}: {file_path}") - deleted_files.append(str(file_path)) - return True - - except Exception as e: - logger.error(f"Failed to delete {description} {file_path_str}: {e}") - return False - - # Delete audio file - audio_file = speaker_info.get("file_path") - if audio_file: - safe_delete(audio_file, "audio file") - - # Delete cache file - cache_file = speaker_info.get("cache_file") - if cache_file: - safe_delete(cache_file, "cache file") - - return deleted_files - - def reload_from_disk(self) -> bool: - """Force reload metadata from disk (useful for external changes).""" - with self._lock: - try: - self._metadata = self._load_from_disk() - return True - except Exception as e: - logger.error(f"Failed to reload metadata from disk: {e}") - return False diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 75279f0755..3d3ef60487 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -24,7 +24,6 @@ from vllm.utils import random_uuid from vllm_omni.entrypoints.openai.audio_utils_mixin import AudioMixin -from vllm_omni.entrypoints.openai.metadata_manager 
import MetadataManager from vllm_omni.entrypoints.openai.protocol.audio import ( AudioResponse, BatchSpeechRequest, @@ -171,14 +170,10 @@ def for_diffusion( def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # Initialize uploaded speakers storage + # Initialize uploaded speakers storage (ephemeral — cleared on restart) speech_voice_samples_dir = os.environ.get("SPEECH_VOICE_SAMPLES", "/tmp/voice_samples") self.uploaded_speakers_dir = Path(speech_voice_samples_dir) self.uploaded_speakers_dir.mkdir(parents=True, exist_ok=True) - self.metadata_file = self.uploaded_speakers_dir / "metadata.json" - - # Initialize metadata manager - self.metadata_manager = MetadataManager(self.metadata_file) # Find and cache the TTS stage (if any) during initialization self._tts_stage = self._find_tts_stage() @@ -195,17 +190,16 @@ def __init__(self, *args, **kwargs): # Cache TTS configuration values (computed once, reused per request) self._max_instructions_length = self._compute_max_instructions_length() - # Load supported speakers + # Load supported speakers (built-in only; uploaded voices start empty) self.supported_speakers = self._load_supported_speakers() - # Load uploaded speakers - self.uploaded_speakers = self.metadata_manager.get_uploaded_speakers() - - # Merge supported speakers with uploaded speakers - self.supported_speakers.update(self.uploaded_speakers.keys()) + self.uploaded_speakers: dict[str, dict] = {} + logger.warning( + "Uploaded voices are ephemeral and will be lost on server restart. " + "Re-upload voices after each restart if needed." + ) self._tts_tokenizer = None logger.info(f"Loaded {len(self.supported_speakers)} supported speakers: {sorted(self.supported_speakers)}") - logger.info(f"Loaded {len(self.uploaded_speakers)} uploaded speakers") # Batch configuration self._batch_max_items: int = getattr(self.engine_client, "tts_batch_max_items", 32) @@ -443,11 +437,20 @@ def _get_uploaded_audio_data(self, voice_name: str) -> str | None: return None async def upload_voice( - self, audio_file: UploadFile, consent: str, name: str, *, ref_text: str | None = None + self, + audio_file: UploadFile, + consent: str, + name: str, + *, + ref_text: str | None = None, + speaker_description: str | None = None, ) -> dict: - # Normalize ref_text: treat whitespace-only as absent + """Upload a new voice sample.""" + # Normalize optional strings: treat whitespace-only as absent if ref_text is not None: ref_text = ref_text.strip() or None + if speaker_description is not None: + speaker_description = speaker_description.strip() or None # Validate file size (max 10MB) MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB audio_file.file.seek(0, 2) # Seek to end @@ -499,7 +502,9 @@ async def upload_voice( # Check if voice already exists if voice_name_lower in self.uploaded_speakers: - raise ValueError(f"Voice '{name}' already exists") + raise ValueError( + f"Voice '{name}' already exists. To re-register this voice, delete it first and then upload it again." + ) # Sanitize name and consent to prevent path traversal sanitized_name = _sanitize_filename(name) @@ -549,7 +554,7 @@ async def upload_voice( raise ValueError(f"Failed to save audio file: {e}") # Create speaker data - speaker_data = { + speaker_data: dict[str, Any] = { "name": name, "consent": consent, "file_path": str(file_path), @@ -558,23 +563,13 @@ async def upload_voice( "original_filename": audio_file.filename, "file_size": file_size, "ref_text": ref_text, - "cache_status": "pending", # The initial cache state is pending. 
- "cache_file": None, # The initial cache file is empty. - "cache_generated_at": None, # The initial cache generation time is empty. "embedding_source": "audio", } - # Save metadata using metadata manager (concurrency safe) - success = self.metadata_manager.create_speaker(voice_name_lower, speaker_data) - if not success: - # Clean up the saved file if metadata creation failed - try: - file_path.unlink() - except Exception: - pass - raise ValueError(f"Failed to create metadata for voice '{name}' (possibly already exists)") + # Store voice description if provided. + if speaker_description: + speaker_data["speaker_description"] = speaker_description - # Update in-memory cache self.uploaded_speakers[voice_name_lower] = speaker_data self.supported_speakers.add(voice_name_lower) @@ -588,8 +583,10 @@ async def upload_voice( "mime_type": mime_type, "file_size": file_size, } - if ref_text is not None: - result["ref_text"] = ref_text + if speaker_data.get("ref_text"): + result["ref_text"] = speaker_data["ref_text"] + if speaker_data.get("speaker_description"): + result["speaker_description"] = speaker_data["speaker_description"] return result async def upload_voice_embedding(self, embedding_json: str, consent: str, name: str) -> dict: @@ -659,21 +656,10 @@ async def upload_voice_embedding(self, embedding_json: str, consent: str, name: "mime_type": "application/x-safetensors", "original_filename": filename, "file_size": file_path.stat().st_size, - "cache_status": "ready", - "cache_file": str(file_path), - "cache_generated_at": timestamp, "embedding_source": "direct", "embedding_dim": emb_dim, } - success = self.metadata_manager.create_speaker(voice_name_lower, speaker_data) - if not success: - try: - file_path.unlink() - except Exception: - pass - raise ValueError(f"Failed to create metadata for voice '{name}' (possibly already exists)") - self.uploaded_speakers[voice_name_lower] = speaker_data self.supported_speakers.add(voice_name_lower) @@ -699,25 +685,22 @@ async def delete_voice(self, name: str) -> bool: """ voice_name_lower = name.lower() - # Check if voice exists in memory cache if voice_name_lower not in self.uploaded_speakers: - logger.warning(f"Voice '{name}' not found in memory cache") + logger.warning(f"Voice '{name}' not found") return False - # Delete from metadata manager with file cleanup - # Pass base_dir for path validation - deleted_info = self.metadata_manager.delete_speaker(voice_name_lower) - if not deleted_info: - logger.error(f"Failed to delete voice '{name}' from metadata") - return False + speaker_info = self.uploaded_speakers.pop(voice_name_lower) + self.supported_speakers.discard(voice_name_lower) - # Update in-memory cache - if voice_name_lower in self.uploaded_speakers: - del self.uploaded_speakers[voice_name_lower] - if voice_name_lower in self.supported_speakers: - self.supported_speakers.remove(voice_name_lower) + # Clean up audio file on disk + file_path = speaker_info.get("file_path") + if file_path: + try: + Path(file_path).unlink(missing_ok=True) + except Exception as e: + logger.warning(f"Failed to delete audio file for '{name}': {e}") - logger.info(f"Deleted voice '{name}' and associated files") + logger.info(f"Deleted voice '{name}'") return True def _is_tts_model(self) -> bool: @@ -1063,6 +1046,7 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any stored_ref_text = speaker_info.get("ref_text") params["ref_audio"] = [audio_data] params["task_type"] = ["Base"] + params["voice_created_at"] = [speaker_info.get("created_at", 0)] if 
stored_ref_text: params["ref_text"] = [stored_ref_text] params["x_vector_only_mode"] = [False] diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py index 08c0f9a1e6..bc6222bbe2 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py @@ -27,6 +27,7 @@ from vllm.sequence import IntermediateTensors from vllm_omni.model_executor.models.output_templates import OmniOutput +from vllm_omni.utils.voice_cache import VoiceEmbeddingCache from .configuration_qwen3_tts import Qwen3TTSConfig, Qwen3TTSSpeakerEncoderConfig, Qwen3TTSTalkerConfig from .qwen3_tts_code_predictor_vllm import Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM @@ -406,6 +407,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._tokenizer = None self._speech_tokenizer: Qwen3TTSTokenizer | None = None + # In-memory LRU cache for voice extraction artifacts (Base voice clone). + self._voice_cache = VoiceEmbeddingCache() + # -------------------- vLLM required hooks -------------------- def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor: @@ -1326,6 +1330,25 @@ def _normalize_voice_clone_prompt(raw: object) -> dict[str, object] | None: xvec_only = bool((info_dict.get("x_vector_only_mode") or [False])[0]) in_context_mode = not xvec_only voice_clone_prompt = _normalize_voice_clone_prompt(info_dict.get("voice_clone_prompt")) + + # Voice cache: only for uploaded voices (created_at > 0) + _voice_cache_key = None + if voice_clone_prompt is None: + _speaker_list = info_dict.get("speaker") + if isinstance(_speaker_list, list) and _speaker_list: + _voice_name = str(_speaker_list[0]).lower() + _voice_created_at = float((info_dict.get("voice_created_at") or [0])[0]) + if _voice_created_at > 0: + _voice_cache_key = self._voice_cache.make_cache_key(_voice_name, xvec_only, _voice_created_at) + _cached = self._voice_cache.get(_voice_cache_key) if _voice_cache_key is not None else None + if _cached is not None: + voice_clone_prompt = { + "ref_code": _cached.get("ref_code"), + "ref_spk_embedding": _cached.get("ref_spk_embedding"), + "icl_mode": _cached.get("icl_mode"), + } + _voice_cache_key = None # hit -> don't store again + # Official implementation may pass `voice_clone_prompt.icl_mode`. if voice_clone_prompt is not None and "icl_mode" in voice_clone_prompt: icl_flag = _as_singleton(voice_clone_prompt.get("icl_mode")) @@ -1375,6 +1398,19 @@ def _normalize_voice_clone_prompt(raw: object) -> dict[str, object] | None: wav_np, sr = self._normalize_ref_audio(ref_audio_list[0]) speaker_embed = self._extract_speaker_embedding(wav_np, sr).view(1, 1, -1) + # Cache miss: store extraction result + if _voice_cache_key is not None and speaker_embed is not None: + self._voice_cache.put( + _voice_cache_key, + { + "ref_code": ref_code_prompt.detach().cpu() + if isinstance(ref_code_prompt, torch.Tensor) + else None, + "ref_spk_embedding": speaker_embed.detach().cpu().reshape(-1), + "icl_mode": in_context_mode, + }, + ) + codec_input = torch.cat([codec_input_0, speaker_embed, codec_input_1], dim=1) # Role header (<|im_start|>assistant\n) -> projected text embeds. 
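
The round trip between the serving layer's `voice_created_at` field and the talker-side lookup above can be hard to follow from the diff alone. Below is a minimal, self-contained sketch of that flow using the `VoiceEmbeddingCache` API introduced later in this patch (`vllm_omni/utils/voice_cache.py`); the `extract_voice()` helper and the tensor shapes are illustrative assumptions, not part of the model code.

```python
# Sketch only: demonstrates the cache-key / get / put round trip that the
# qwen3_tts_talker integration performs for uploaded voices.
import time

import torch

from vllm_omni.utils.voice_cache import VoiceEmbeddingCache


def extract_voice() -> dict:
    # Stand-in for the real reference-audio extraction (speaker embedding
    # plus ref_code tokens); shapes here are purely illustrative.
    return {
        "ref_code": torch.zeros(10, 4, dtype=torch.long),
        "ref_spk_embedding": torch.zeros(192),
        "icl_mode": True,
    }


cache = VoiceEmbeddingCache(max_entries=8)
created_at = time.time()  # would come from the uploaded-speaker metadata

# Only uploaded voices (created_at > 0) are cached; the key combines the
# voice name, upload timestamp, and extraction mode (icl vs. xvec-only).
key = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False, created_at=created_at)

artifacts = cache.get(key)
if artifacts is None:  # first request for this voice: extract, then store
    artifacts = extract_voice()
    cache.put(key, artifacts)

# Deleting and re-uploading "alice" yields a new created_at and therefore a
# new key, so stale embeddings are never served after a re-upload.
print(cache.stats())
```
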
diff --git a/vllm_omni/model_executor/models/qwen3_tts/voice_cache_manager.py b/vllm_omni/model_executor/models/qwen3_tts/voice_cache_manager.py deleted file mode 100644 index 1e26a161da..0000000000 --- a/vllm_omni/model_executor/models/qwen3_tts/voice_cache_manager.py +++ /dev/null @@ -1,271 +0,0 @@ -# Copyright 2026 The Alibaba Qwen team. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from dataclasses import dataclass -from pathlib import Path -from typing import Any - -import torch -from safetensors import safe_open -from safetensors.torch import save_file -from vllm.logger import init_logger - -from vllm_omni.entrypoints.openai.metadata_manager import MetadataManager - -logger = init_logger(__name__) - - -@dataclass -class VoiceClonePromptItem: - """ - Container for one sample's voice-clone prompt information that can be fed to the model. - - Fields are aligned with `Qwen3TTSForConditionalGeneration.generate(..., voice_clone_prompt=...)`. - """ - - ref_code: torch.Tensor | None # (T, Q) or (T,) depending on tokenizer 25Hz/12Hz - ref_spk_embedding: torch.Tensor # (D,) - x_vector_only_mode: bool - icl_mode: bool - ref_text: str | None = None - - -class VoiceCacheManager: - """ - Voice cache manager, responsible for managing custom voice cache functionality. - - Main features: - 1. Load uploaded speaker information from metadata.json - 2. Manage voice clone prompt cache - 3. Update cache status to metadata.json - - Security properties: - - No pickle / torch.load - - Safetensors-only - - Cache path confined to voice samples directory - """ - - def __init__(self, speech_voice_samples_dir: str | None = None, metadata_manager: MetadataManager | None = None): - """ - Initialize the voice cache manager. - - Args: - speech_voice_samples_dir: Speech voice samples directory path, - if None, get from environment variable - metadata_manager: Optional MetadataManager instance for shared metadata access. - If not provided, will create its own (less efficient). 
- """ - self.speech_voice_samples_dir = speech_voice_samples_dir or os.environ.get( - "SPEECH_VOICE_SAMPLES", "/tmp/voice_samples" - ) - - # Initialize metadata manager - if metadata_manager is not None: - self.metadata_manager = metadata_manager - else: - metadata_file = Path(self.speech_voice_samples_dir) / "metadata.json" - self.metadata_manager = MetadataManager(metadata_file) - - # ------------------------------------------------------------------ - # Metadata helpers - # ------------------------------------------------------------------ - - def load_uploaded_speakers_from_metadata(self) -> dict[str, Any] | None: - """Load uploaded speakers from metadata manager.""" - try: - return self.metadata_manager.get_uploaded_speakers() - except Exception as e: - logger.warning(f"Failed to load uploaded speakers from metadata: {e}") - return None - - def update_metadata_cache_info(self, speaker: str, cache_file_path: Path, status: str = "ready") -> bool: - """ - Update cache information using metadata manager. - - Args: - speaker: Speaker name - cache_file_path: Cache file path - status: Cache status, default is "ready" - - Returns: - bool: Whether the update was successful - """ - try: - speaker_key = speaker.lower() - return self.metadata_manager.update_cache_info( - speaker_key=speaker_key, cache_file_path=cache_file_path, status=status - ) - except Exception as e: - logger.error(f"Failed to update metadata cache info: {e}") - return False - - # ------------------------------------------------------------------ - # Cache save (SAFE) - # ------------------------------------------------------------------ - - def save_voice_cache( - self, - speaker: str, - audio_file_path: Path, - prompt_items: list[VoiceClonePromptItem], - ) -> bool: - """ - Save voice cache using safetensors (no pickle, no RCE). - """ - try: - cache_file_path = audio_file_path.with_suffix(".safetensors") - - tensors: dict[str, torch.Tensor] = {} - metadata: dict[str, str] = {} - - tensors["__len__"] = torch.tensor(len(prompt_items), dtype=torch.int64) - - for i, item in enumerate(prompt_items): - prefix = f"item_{i}_" - - tensors[prefix + "ref_spk_embedding"] = item.ref_spk_embedding.detach().cpu() - - has_ref_code = item.ref_code is not None - tensors[prefix + "has_ref_code"] = torch.tensor(int(has_ref_code), dtype=torch.int8) - - if has_ref_code: - tensors[prefix + "ref_code"] = item.ref_code.detach().cpu() - - tensors[prefix + "x_vector_only_mode"] = torch.tensor(int(item.x_vector_only_mode), dtype=torch.int8) - tensors[prefix + "icl_mode"] = torch.tensor(int(item.icl_mode), dtype=torch.int8) - - if item.ref_text is not None: - metadata[prefix + "ref_text"] = item.ref_text - - save_file(tensors, str(cache_file_path), metadata=metadata) - - return self.update_metadata_cache_info( - speaker=speaker, - cache_file_path=cache_file_path, - status="ready", - ) - - except Exception as e: - logger.error(f"Failed to save safetensors cache for speaker {speaker}: {e}") - self.update_metadata_cache_info(speaker, Path(""), "failed") - return False - - # ------------------------------------------------------------------ - # Cache load (SAFE) - # ------------------------------------------------------------------ - - def load_cached_voice_prompt( - self, - speaker: str, - device: str | None = None, - ) -> list[VoiceClonePromptItem] | None: - """ - Load cached VoiceClonePromptItem list from safetensors. 
- """ - try: - uploaded_speakers = self.load_uploaded_speakers_from_metadata() - if not uploaded_speakers: - return None - - speaker_key = speaker.lower() - if speaker_key not in uploaded_speakers: - return None - - speaker_info = uploaded_speakers[speaker_key] - if speaker_info.get("cache_status") != "ready": - return None - - cache_file_path = Path(speaker_info.get("cache_file", "")).resolve() - - base_dir = Path(self.speech_voice_samples_dir).resolve() - - # ---- Path confinement (critical security check) - if not str(cache_file_path).startswith(str(base_dir)): - logger.error(f"Illegal cache path outside base dir: {cache_file_path}") - return None - - if not cache_file_path.exists(): - return None - - if cache_file_path.suffix != ".safetensors": - logger.error(f"Legacy or unsafe cache format rejected: {cache_file_path}") - return None - - with safe_open(cache_file_path, framework="pt", device="cpu") as f: - meta = f.metadata() - - num_items = int(f.get_tensor("__len__").item()) - result: list[VoiceClonePromptItem] = [] - - for i in range(num_items): - prefix = f"item_{i}_" - - has_ref_code = bool(f.get_tensor(prefix + "has_ref_code").item()) - - ref_code = f.get_tensor(prefix + "ref_code").to(device) if has_ref_code else None - - ref_spk_embedding = f.get_tensor(prefix + "ref_spk_embedding").to(device) - - x_vector_only_mode = bool(f.get_tensor(prefix + "x_vector_only_mode").item()) - icl_mode = bool(f.get_tensor(prefix + "icl_mode").item()) - - ref_text = meta.get(prefix + "ref_text") - - result.append( - VoiceClonePromptItem( - ref_code=ref_code, - ref_spk_embedding=ref_spk_embedding, - x_vector_only_mode=x_vector_only_mode, - icl_mode=icl_mode, - ref_text=ref_text, - ) - ) - - logger.info(f"Safetensors cache loaded for speaker: {speaker}") - return result - - except Exception as e: - logger.warning(f"Failed to load safetensors cache for speaker {speaker}: {e}") - return None - - # ------------------------------------------------------------------ - # Audio path helper - # ------------------------------------------------------------------ - - def get_speaker_audio_path(self, speaker: str) -> Path | None: - """ - Get speaker's audio file path. - - Args: - speaker: Speaker name - - Returns: - Optional[Path]: Audio file path, returns None if speaker doesn't exist - """ - uploaded_speakers = self.load_uploaded_speakers_from_metadata() - if not uploaded_speakers: - return None - - speaker_key = speaker.lower() - if speaker_key not in uploaded_speakers: - return None - - audio_file_path = Path(uploaded_speakers[speaker_key]["file_path"]) - if audio_file_path.exists(): - return audio_file_path - - logger.warning(f"Audio file not found for speaker {speaker}: {audio_file_path}") - return None diff --git a/vllm_omni/utils/voice_cache.py b/vllm_omni/utils/voice_cache.py new file mode 100644 index 0000000000..2d78a5bfdb --- /dev/null +++ b/vllm_omni/utils/voice_cache.py @@ -0,0 +1,89 @@ +"""In-memory LRU cache for voice extraction artifacts. + +Keyed by voice name + extraction mode (e.g. ``"alice:icl"``). +Only named voices are cached; inline ``ref_audio`` without a voice +name is not cached. + +Usage:: + + key = VoiceEmbeddingCache.make_cache_key("alice", xvec_only=False) + cached = cache.get(key) + if cached is None: + # ... extract ... 
+ cache.put(key, {"artifact": result}) +""" + +import os +import threading +from collections import OrderedDict +from typing import Any + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +_DEFAULT_MAX_ENTRIES = 128 + + +class VoiceEmbeddingCache: + """LRU cache for voice extraction outputs. + + Each entry stores a ``dict[str, Any]`` whose contents are model-specific. + Thread-safe via a lightweight ``threading.Lock``. + """ + + def __init__(self, max_entries: int | None = None): + if max_entries is None: + max_entries = int(os.environ.get("VOICE_CACHE_MAX_ENTRIES", _DEFAULT_MAX_ENTRIES)) + self._cache: OrderedDict[str, dict[str, Any]] = OrderedDict() + self._max_entries = max_entries + self._lock = threading.Lock() + self._hits = 0 + self._misses = 0 + logger.info("Voice embedding cache initialized (max_entries=%d)", max_entries) + + @staticmethod + def make_cache_key(voice_name: str, xvec_only: bool, created_at: float = 0.0) -> str: + """Build a cache key from a voice name, upload timestamp, and extraction mode. + + Args: + voice_name: The speaker/voice name (case-insensitive, lowered + by the caller). + xvec_only: True for speaker-embedding-only mode, False for + ICL mode (speaker embedding + ref_code). + created_at: Upload timestamp from metadata. Prevents stale cache + hits after a voice is deleted and re-uploaded with the same + name but different audio. + """ + mode = "xvec" if xvec_only else "icl" + return f"{voice_name}:{created_at:.6f}:{mode}" + + def get(self, key: str) -> dict[str, Any] | None: + """Return cached artifacts or ``None`` on miss. Promotes to MRU on hit.""" + with self._lock: + if key in self._cache: + self._cache.move_to_end(key) + self._hits += 1 + logger.debug("Voice cache HIT (key=%s, hits=%d)", key, self._hits) + return self._cache[key] + self._misses += 1 + return None + + def put(self, key: str, artifacts: dict[str, Any]) -> None: + """Store *artifacts* under *key*, evicting the LRU entry if full.""" + with self._lock: + self._cache[key] = artifacts + self._cache.move_to_end(key) + while len(self._cache) > self._max_entries: + evicted_key, _ = self._cache.popitem(last=False) + logger.debug("Voice cache EVICT (key=%s)", evicted_key) + + def stats(self) -> dict[str, int]: + """Return cache statistics.""" + with self._lock: + return { + "entries": len(self._cache), + "max_entries": self._max_entries, + "hits": self._hits, + "misses": self._misses, + } From 4c031580cffa99ac8b96ba14055ba78678362436 Mon Sep 17 00:00:00 2001 From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Date: Fri, 3 Apr 2026 21:55:15 -0400 Subject: [PATCH 041/204] [CosyVoice3] Add online serving support, fix stage config, and add CI tests (#2431) Signed-off-by: linyueqian --- .buildkite/test-merge.yml | 15 +++ .buildkite/test-ready.yml | 43 ++++++ .../e2e/online_serving/test_cosyvoice3_tts.py | 124 ++++++++++++++++++ .../openai_api/test_serving_speech.py | 112 ++++++++++++++++ vllm_omni/engine/arg_utils.py | 64 +++++++++ vllm_omni/engine/async_omni_engine.py | 16 +++ .../entrypoints/openai/serving_speech.py | 74 ++++++++++- .../models/cosyvoice3/cosyvoice3.py | 35 +++-- .../model_executor/models/cosyvoice3/utils.py | 30 +++-- .../stage_configs/cosyvoice3.yaml | 12 +- 10 files changed, 497 insertions(+), 28 deletions(-) create mode 100644 tests/e2e/online_serving/test_cosyvoice3_tts.py diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index a1ce0c495f..b0b5a63961 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ 
-390,6 +390,16 @@ steps: export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" ' + + - label: "CosyVoice3-TTS E2E Test" + timeout_in_minutes: 20 + depends_on: upload-merge-pipeline + commands: + - | + timeout 20m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn + pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "advanced_model" --run-level "advanced_model" + ' agents: queue: "mithril-h100-pool" plugins: @@ -408,6 +418,11 @@ steps: env: - name: HF_HOME value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token nodeSelector: node.kubernetes.io/instance-type: gpu-h100-sxm volumes: diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 1151da4672..be528b316c 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -548,3 +548,46 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate + + - label: "CosyVoice3-TTS E2E Test" + timeout_in_minutes: 20 + depends_on: upload-ready-pipeline + commands: + - | + timeout 20m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn + pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "core_model" --run-level "core_model" + ' + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate diff --git a/tests/e2e/online_serving/test_cosyvoice3_tts.py b/tests/e2e/online_serving/test_cosyvoice3_tts.py new file mode 100644 index 0000000000..976be805c2 --- /dev/null +++ b/tests/e2e/online_serving/test_cosyvoice3_tts.py @@ -0,0 +1,124 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +E2E Online tests for CosyVoice3 TTS model with voice cloning. + +These tests verify the /v1/audio/speech endpoint works correctly with +the CosyVoice3 model, which requires reference audio for voice cloning. 
+""" + +import os + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" + +from pathlib import Path + +import pytest + +from tests.conftest import OmniServerParams +from tests.utils import hardware_test + +MODEL = "FunAudioLLM/Fun-CosyVoice3-0.5B-2512" + +# Official CosyVoice zero-shot prompt audio and its transcript +REF_AUDIO_URL = "https://raw.githubusercontent.com/FunAudioLLM/CosyVoice/main/asset/zero_shot_prompt.wav" +REF_TEXT = "希望你以后能够做的比我还好呦。" + + +def get_stage_config(name: str = "cosyvoice3.yaml"): + """Get the stage config path from vllm_omni model_executor stage_configs.""" + return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name) + + +def get_prompt(prompt_type="zh"): + prompts = { + "zh": "收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的感动让我热泪盈眶。", + "en": "Hello, this is a voice cloning test with English text.", + } + return prompts.get(prompt_type, prompts["zh"]) + + +tts_server_params = [ + pytest.param( + OmniServerParams( + model=MODEL, + stage_config_path=get_stage_config(), + server_args=["--trust-remote-code", "--disable-log-stats"], + ), + id="cosyvoice3", + ) +] + + +@pytest.mark.advanced_model +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=1) +@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True) +def test_voice_clone_zh_001(omni_server, openai_client) -> None: + """ + Test voice cloning TTS with Chinese text via OpenAI API. + Deploy Setting: default yaml + Input Modal: text + ref_audio + ref_text + Output Modal: audio + Input Setting: stream=False + Datasets: single request + """ + request_config = { + "model": omni_server.model, + "input": get_prompt("zh"), + "stream": False, + "response_format": "wav", + "ref_audio": REF_AUDIO_URL, + "ref_text": REF_TEXT, + } + openai_client.send_audio_speech_request(request_config) + + +@pytest.mark.advanced_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=1) +@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True) +def test_voice_clone_zh_002(omni_server, openai_client) -> None: + """ + Test voice cloning TTS with Chinese text via OpenAI API. + Deploy Setting: default yaml + Input Modal: text + ref_audio + ref_text + Output Modal: audio + Input Setting: stream=True + Datasets: single request + """ + request_config = { + "model": omni_server.model, + "input": get_prompt("zh"), + "stream": True, + "response_format": "wav", + "ref_audio": REF_AUDIO_URL, + "ref_text": REF_TEXT, + } + openai_client.send_audio_speech_request(request_config) + + +@pytest.mark.advanced_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=1) +@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True) +def test_voice_clone_en_001(omni_server, openai_client) -> None: + """ + Test voice cloning TTS with English text via OpenAI API. 
+ Deploy Setting: default yaml + Input Modal: text + ref_audio + ref_text + Output Modal: audio + Input Setting: stream=False + Datasets: single request + """ + request_config = { + "model": omni_server.model, + "input": get_prompt("en"), + "stream": False, + "response_format": "wav", + "ref_audio": REF_AUDIO_URL, + "ref_text": REF_TEXT, + } + openai_client.send_audio_speech_request(request_config) diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 17203cb577..83e4188c17 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -1872,3 +1872,115 @@ def test_streaming_unsupported_format_rejected(self, wav_streaming_app): for fmt in unsupported_formats: response = client.post("/v1/audio/speech", json={"input": "Hello", "stream": True, "response_format": fmt}) assert response.status_code == 422 + + +# ---- CosyVoice3 Serving Tests ---- + + +@pytest.fixture +def cosyvoice3_server(mocker: MockerFixture): + mocker.patch.object(OmniOpenAIServingSpeech, "_load_supported_speakers", return_value=set()) + mocker.patch.object(OmniOpenAIServingSpeech, "_load_codec_frame_rate", return_value=None) + + mock_engine_client = mocker.MagicMock() + mock_engine_client.errored = False + mock_engine_client.model_config = mocker.MagicMock(model="FunAudioLLM/Fun-CosyVoice3-0.5B-2512") + mock_engine_client.default_sampling_params_list = [SimpleNamespace(max_tokens=2048)] + mock_engine_client.tts_batch_max_items = 32 + mock_engine_client.generate = mocker.MagicMock(return_value="generator") + mock_engine_client.stage_configs = [ + SimpleNamespace( + engine_args=SimpleNamespace(model_stage="cosyvoice3_talker"), + tts_args={}, + ) + ] + + mock_models = mocker.MagicMock() + mock_models.is_base_model.return_value = True + + return OmniOpenAIServingSpeech( + engine_client=mock_engine_client, + models=mock_models, + request_logger=mocker.MagicMock(), + ) + + +class TestCosyVoice3Serving: + def test_cosyvoice3_model_type_detection(self, cosyvoice3_server): + assert cosyvoice3_server._tts_model_type == "cosyvoice3" + assert cosyvoice3_server._is_tts is True + assert cosyvoice3_server._is_cosyvoice3 is True + + def test_cosyvoice3_stage_registered(self): + from vllm_omni.entrypoints.openai.serving_speech import ( + _COSYVOICE3_TTS_MODEL_STAGES, + _TTS_MODEL_STAGES, + ) + + assert "cosyvoice3_talker" in _COSYVOICE3_TTS_MODEL_STAGES + assert "cosyvoice3_talker" in _TTS_MODEL_STAGES + + def test_validate_cosyvoice3_empty_input(self, cosyvoice3_server): + request = OpenAICreateSpeechRequest(input="", ref_audio="data:audio/wav;base64,abc", ref_text="hello") + error = cosyvoice3_server._validate_cosyvoice3_request(request) + assert error is not None + assert "empty" in error.lower() + + def test_validate_cosyvoice3_missing_ref_audio(self, cosyvoice3_server): + request = OpenAICreateSpeechRequest(input="Hello", ref_text="hello") + error = cosyvoice3_server._validate_cosyvoice3_request(request) + assert error is not None + assert "ref_audio" in error.lower() + + def test_validate_cosyvoice3_missing_ref_text(self, cosyvoice3_server): + request = OpenAICreateSpeechRequest(input="Hello", ref_audio="data:audio/wav;base64,abc") + error = cosyvoice3_server._validate_cosyvoice3_request(request) + assert error is not None + assert "ref_text" in error.lower() + + def test_validate_cosyvoice3_invalid_ref_audio_format(self, cosyvoice3_server): + request = OpenAICreateSpeechRequest(input="Hello", 
ref_audio="/local/path.wav", ref_text="hello") + error = cosyvoice3_server._validate_cosyvoice3_request(request) + assert error is not None + assert "url" in error.lower() or "format" in error.lower() + + def test_validate_cosyvoice3_valid_request(self, cosyvoice3_server): + request = OpenAICreateSpeechRequest( + input="Hello world", + ref_audio="data:audio/wav;base64,abc123", + ref_text="Reference transcript", + ) + error = cosyvoice3_server._validate_cosyvoice3_request(request) + assert error is None + + def test_validate_cosyvoice3_max_new_tokens_range(self, cosyvoice3_server): + request = OpenAICreateSpeechRequest( + input="Hello", + ref_audio="data:audio/wav;base64,abc", + ref_text="hello", + max_new_tokens=0, + ) + error = cosyvoice3_server._validate_cosyvoice3_request(request) + assert error is not None + assert "max_new_tokens" in error + + def test_prepare_speech_generation_cosyvoice3(self, cosyvoice3_server): + cosyvoice3_server._build_cosyvoice3_prompt = AsyncMock( + return_value={ + "prompt": "Hello", + "multi_modal_data": {"audio": (np.zeros(24000), 24000)}, + "mm_processor_kwargs": {"prompt_text": "ref text", "sample_rate": 24000}, + } + ) + + request = OpenAICreateSpeechRequest( + input="Hello", + ref_audio="data:audio/wav;base64,abc", + ref_text="Reference text", + ) + request_id, generator, tts_params = asyncio.run(cosyvoice3_server._prepare_speech_generation(request)) + + assert request_id.startswith("speech-") + assert generator == "generator" + assert tts_params == {} + cosyvoice3_server._build_cosyvoice3_prompt.assert_awaited_once() diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index a1dc373dd9..5bc51043a5 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -1,5 +1,6 @@ import argparse import dataclasses +import os from dataclasses import dataclass, field from typing import Any @@ -12,6 +13,18 @@ logger = init_logger(__name__) +# Maps model architecture names to their HuggingFace model_type values. +# Used when auto-injecting hf_overrides for models with missing config.json. +_ARCH_TO_MODEL_TYPE: dict[str, str] = { + "CosyVoice3Model": "cosyvoice3", + "OmniVoiceModel": "omnivoice", +} + +# Maps model architecture names to tokenizer subfolder paths within HF repos. +_TOKENIZER_SUBFOLDER_MAP: dict[str, str] = { + "CosyVoice3Model": "CosyVoice-BlankEN", +} + def _register_omni_hf_configs() -> None: try: @@ -29,6 +42,14 @@ def _register_omni_hf_configs() -> None: logger.warning("Skipping omni HF config registration due to import error: %s", exc) return + # Register with both transformers AutoConfig and vLLM's config registry + # so models with empty/missing config.json (e.g. CosyVoice3) can be + # resolved when model_type is injected via hf_overrides. + try: + from vllm.transformers_utils.config import _CONFIG_REGISTRY + except ImportError: + _CONFIG_REGISTRY = None + for model_type, config_cls in [ ("qwen3_tts", Qwen3TTSConfig), ("cosyvoice3", CosyVoice3Config), @@ -40,6 +61,8 @@ def _register_omni_hf_configs() -> None: except ValueError: # Already registered elsewhere; ignore. pass + if _CONFIG_REGISTRY is not None and model_type not in _CONFIG_REGISTRY: + _CONFIG_REGISTRY[model_type] = config_cls def register_omni_models_to_vllm(): @@ -129,11 +152,52 @@ def create_model_config(self) -> OmniModelConfig: # If model_arch is specified, inject it into hf_overrides so vLLM can # resolve the architecture even when config.json lacks 'architectures'. 
+ # Also inject model_type so AutoConfig can resolve the correct config + # class for models with empty or missing config.json (e.g. CosyVoice3). if self.model_arch: if self.hf_overrides is None: self.hf_overrides = {} if isinstance(self.hf_overrides, dict): self.hf_overrides.setdefault("architectures", [self.model_arch]) + if "model_type" not in self.hf_overrides: + model_type = _ARCH_TO_MODEL_TYPE.get(self.model_arch) + if model_type is not None: + self.hf_overrides.setdefault("model_type", model_type) + + # Auto-detect tokenizer for models that store it in a subdirectory + # rather than the root (e.g. CosyVoice3 uses CosyVoice-BlankEN/). + if not self.tokenizer and self.model: + model_path = self.model + if os.path.isdir(model_path) and not os.path.isfile(os.path.join(model_path, "tokenizer_config.json")): + for subfolder in sorted(os.listdir(model_path)): + candidate = os.path.join(model_path, subfolder) + if os.path.isdir(candidate) and os.path.isfile(os.path.join(candidate, "tokenizer_config.json")): + self.tokenizer = candidate + logger.info("Auto-detected tokenizer at %s", candidate) + break + elif not os.path.isdir(model_path): + subfolder = _TOKENIZER_SUBFOLDER_MAP.get(self.model_arch) + if subfolder: + # Download just the tokenizer files from the subfolder + try: + from huggingface_hub import snapshot_download + + local_dir = snapshot_download( + model_path, + allow_patterns=[ + f"{subfolder}/tokenizer*", + f"{subfolder}/special_tokens*", + f"{subfolder}/vocab*", + f"{subfolder}/merges*", + f"{subfolder}/added_tokens*", + ], + ) + candidate = os.path.join(local_dir, subfolder) + if os.path.isdir(candidate): + self.tokenizer = candidate + logger.info("Downloaded tokenizer from %s/%s", model_path, subfolder) + except Exception as e: + logger.warning("Failed to download tokenizer subfolder: %s", e) # Build the vLLM config first, then use it to create the Omni config. model_config = super().create_model_config() diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index c987106fee..d9960ecbac 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -73,6 +73,16 @@ logger = init_logger(__name__) +def _patch_generation_config_if_needed(model_config: Any) -> None: + """Ensure try_get_generation_config won't crash for models whose HF + config.json lacks model_type (e.g. CosyVoice3). We probe it once; + if it raises, we monkey-patch the method to return None.""" + try: + model_config.try_get_generation_config() + except Exception: + model_config.try_get_generation_config = lambda: {} + + def _inject_kv_stage_info(stage_cfg: Any, stage_id: int) -> None: """Inject stage_id and engine_input_source into omni_kv_config. @@ -409,6 +419,12 @@ def _attach_llm_stage( ) input_processor = None if started.stage_id == 0: + # Some omni models (e.g. CosyVoice3) have an empty HF + # config.json without model_type, which causes + # try_get_generation_config -> AutoConfig.from_pretrained + # to raise ValueError. Patch it to return None so + # InputProcessor doesn't crash. + _patch_generation_config_if_needed(started.vllm_config.model_config) input_processor = InputProcessor(vllm_config=started.vllm_config) # Use omni preprocessor so text-only prompts with # mm_processor_kwargs (e.g. 
GLM-Image t2i target_h/target_w) diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 3d3ef60487..8126fd544f 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -46,9 +46,14 @@ _VOXTRAL_TTS_MODEL_STAGES = {"audio_generation"} _QWEN3_TTS_MODEL_STAGES = {"qwen3_tts"} _FISH_TTS_MODEL_STAGES = {"fish_speech_slow_ar"} +_COSYVOICE3_TTS_MODEL_STAGES = {"cosyvoice3_talker"} _OMNIVOICE_TTS_MODEL_STAGES = {"omnivoice_generator"} _TTS_MODEL_STAGES: set[str] = ( - _VOXTRAL_TTS_MODEL_STAGES | _QWEN3_TTS_MODEL_STAGES | _FISH_TTS_MODEL_STAGES | _OMNIVOICE_TTS_MODEL_STAGES + _VOXTRAL_TTS_MODEL_STAGES + | _QWEN3_TTS_MODEL_STAGES + | _FISH_TTS_MODEL_STAGES + | _COSYVOICE3_TTS_MODEL_STAGES + | _OMNIVOICE_TTS_MODEL_STAGES ) _TTS_LANGUAGES: set[str] = { "Auto", @@ -184,6 +189,13 @@ def __init__(self, *args, **kwargs): ) self._fish_speech_tokenizer = None + self._is_cosyvoice3 = ( + self._tts_stage is not None + and getattr(getattr(self._tts_stage, "engine_args", None), "model_stage", None) + in _COSYVOICE3_TTS_MODEL_STAGES + ) + self._cosyvoice3_tokenizer = None + # Determine TTS model type or None self._tts_model_type = self._detect_tts_model_type() @@ -258,6 +270,8 @@ def _detect_tts_model_type(self) -> str | None: return "voxtral_tts" if model_stage in _FISH_TTS_MODEL_STAGES: return "fish_tts" + if model_stage in _COSYVOICE3_TTS_MODEL_STAGES: + return "cosyvoice3" if model_stage in _OMNIVOICE_TTS_MODEL_STAGES: return "omnivoice" return None @@ -713,6 +727,8 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non return self._validate_voxtral_tts_request(request) if self._tts_model_type == "fish_tts": return self._validate_fish_tts_request(request) + if self._tts_model_type == "cosyvoice3": + return self._validate_cosyvoice3_request(request) return self._validate_qwen_tts_request(request) def _validate_ref_audio_format(self, ref_audio: str) -> str | None: @@ -884,6 +900,30 @@ def _validate_fish_tts_request(self, request: OpenAICreateSpeechRequest) -> str return None + def _validate_cosyvoice3_request(self, request: OpenAICreateSpeechRequest) -> str | None: + """Validate CosyVoice3 request parameters. Returns error message or None.""" + if not request.input or not request.input.strip(): + return "Input text cannot be empty" + + # CosyVoice3 requires reference audio for voice cloning + if request.ref_audio is None: + return "CosyVoice3 requires 'ref_audio' (reference audio for voice cloning)" + + fmt_err = self._validate_ref_audio_format(request.ref_audio) + if fmt_err: + return fmt_err + + if not request.ref_text or not request.ref_text.strip(): + return "CosyVoice3 requires 'ref_text' (transcript of the reference audio)" + + if request.max_new_tokens is not None: + if request.max_new_tokens < _TTS_MAX_NEW_TOKENS_MIN: + return f"max_new_tokens must be at least {_TTS_MAX_NEW_TOKENS_MIN}" + if request.max_new_tokens > _TTS_MAX_NEW_TOKENS_MAX: + return f"max_new_tokens cannot exceed {_TTS_MAX_NEW_TOKENS_MAX}" + + return None + async def _resolve_ref_audio(self, ref_audio_str: str) -> tuple[list[float], int]: """Resolve ref_audio to (wav_samples, sample_rate). @@ -1194,6 +1234,33 @@ def _build_fish_speech_prompt( "additional_information": additional_information, } + # ---- CosyVoice3 helpers ---- + + async def _build_cosyvoice3_prompt( + self, + request: OpenAICreateSpeechRequest, + ) -> dict[str, Any]: + """Build prompt for CosyVoice3. 
+ + CosyVoice3 uses multimodal input with reference audio for voice cloning. + The prompt format matches the offline example: text prompt + audio data + + mm_processor_kwargs with prompt_text. + """ + # Resolve reference audio + wav_samples, sr = await self._resolve_ref_audio(request.ref_audio) + audio_data = (np.asarray(wav_samples, dtype=np.float32), sr) + + return { + "prompt": request.input, + "multi_modal_data": { + "audio": audio_data, + }, + "mm_processor_kwargs": { + "prompt_text": request.ref_text, + "sample_rate": sr, + }, + } + # ---- Common speech generation helpers ---- async def _prepare_speech_generation( @@ -1224,6 +1291,9 @@ async def _prepare_speech_generation( if self._tts_model_type == "voxtral_tts": prompt = await self._build_voxtral_prompt(request) tts_params = {} + elif self._tts_model_type == "cosyvoice3": + prompt = await self._build_cosyvoice3_prompt(request) + tts_params = {} else: tts_params = self._build_tts_params(request) # Resolve ref_audio (explicit or auto-set for uploaded voices) @@ -1247,6 +1317,8 @@ async def _prepare_speech_generation( model_type = "fish_speech" elif self._tts_model_type == "voxtral_tts": model_type = "voxtral_tts" + elif self._tts_model_type == "cosyvoice3": + model_type = "cosyvoice3" elif self._is_tts: model_type = tts_params.get("task_type", ["unknown"])[0] else: diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py index 87c5f323a4..784393e181 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py +++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py @@ -66,6 +66,12 @@ def _ensure_cached_runtime_components(self, model_dir: str, config: CosyVoice3Co if cached_model_dir == model_dir: return + # If model_dir is an HF repo ID (not a local path), resolve to cache + if not os.path.isdir(model_dir): + from huggingface_hub import snapshot_download + + model_dir = snapshot_download(model_dir) + import onnxruntime from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer @@ -266,9 +272,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = vllm_config.model_config.hf_config self.have_multimodal_outputs = True self.model_stage = vllm_config.model_config.model_stage - self.model_dir = vllm_config.model_config.model + model_dir = vllm_config.model_config.model + if not os.path.isdir(model_dir): + from huggingface_hub import snapshot_download + + model_dir = snapshot_download(model_dir) + self.model_dir = model_dir self.model = None - if self.model_stage == "talker": + if self.model_stage == "cosyvoice3_talker": # Initialize talker stage (text to speech tokens) from vllm_omni.model_executor.models.cosyvoice3.cosyvoice3_talker import CosyVoice3LM, VLLMQwen2Encoder @@ -286,7 +297,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # KV cache is now managed externally by vLLM's PagedAttention # No need for self.llm_cache self.model = self.talker - elif self.model_stage == "code2wav": + elif self.model_stage == "cosyvoice3_code2wav": # Initialize code2wav stage (flow matching + vocoder) from vllm_omni.model_executor.models.cosyvoice3.cosyvoice3_code2wav import CosyVoice3Code2Wav @@ -322,7 +333,7 @@ def _create_llm_vllm_config(self, parent_config: VllmConfig) -> VllmConfig: def compute_logits(self, hidden_states: torch.Tensor | OmniOutput) -> torch.Tensor | None: if isinstance(hidden_states, OmniOutput): hidden_states = hidden_states.text_hidden_states - if self.model_stage == 
"talker": + if self.model_stage == "cosyvoice3_talker": logits = self.model.llm_decoder(hidden_states) vocab_size = self.config.vocab_size pad_size = vocab_size - logits.size(-1) @@ -337,7 +348,7 @@ def compute_logits(self, hidden_states: torch.Tensor | OmniOutput) -> torch.Tens raise RuntimeError(f"compute_logits is only valid for {self.model_stage}.") def embed_multimodal(self, **kwargs: object) -> torch.Tensor: - if self.model_stage == "talker": + if self.model_stage == "cosyvoice3_talker": speech_token = kwargs["speech_token"] speech_token_emb = self.model.speech_embedding(speech_token) return speech_token_emb @@ -350,7 +361,7 @@ def embed_input_ids( multimodal_embeddings=None, is_multimodal=None, ) -> torch.Tensor: - if self.model_stage == "talker": + if self.model_stage == "cosyvoice3_talker": if is_multimodal is not None and any(is_multimodal): embed_tokens = self.model.llm.model.embed_tokens(input_ids) sos = self.model.speech_embedding.weight[self.model.sos].reshape(1, -1) @@ -363,7 +374,7 @@ def embed_input_ids( else: embed_tokens = self.model.speech_embedding.weight[input_ids] return embed_tokens - elif self.model_stage == "code2wav": + elif self.model_stage == "cosyvoice3_code2wav": assert input_ids.dim() == 1 hidden = int(self.config.hidden_size) return torch.zeros( @@ -381,7 +392,7 @@ def forward( additional_information: dict[str, object] | None = None, **kwargs: object, ) -> OmniOutput: - if self.model_stage == "talker": + if self.model_stage == "cosyvoice3_talker": if inputs_embeds is None: inputs_embeds = self.embed_input_ids(input_ids) @@ -399,7 +410,7 @@ def forward( } return OmniOutput(text_hidden_states=hidden_states, multimodal_outputs=multimodal_outputs) - elif self.model_stage == "code2wav": + elif self.model_stage == "cosyvoice3_code2wav": runtime_info = kwargs.get("runtime_additional_information", []) if not runtime_info: length = 30 * 24000 @@ -420,13 +431,13 @@ def forward( return OmniOutput( text_hidden_states=None, - multimodal_outputs={"audio": tts_speech}, + multimodal_outputs={"audio": tts_speech, "sr": 22050}, ) else: raise ValueError(f"Unsupported model_stage: {self.model_stage}") def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - if self.model_stage == "talker": + if self.model_stage == "cosyvoice3_talker": # Load weights for text to speech LM stage using vLLM's weight loading llm_weight_path = os.path.join(self.model_dir, "llm.pt") device = next(self.parameters()).device @@ -460,7 +471,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: self.model.llm_decoder.load_state_dict(llm_decoder_state) self.model.to(device).eval() - elif self.model_stage == "code2wav": + elif self.model_stage == "cosyvoice3_code2wav": # Load weights for code2wav stage (flow + hift) device = next(self.parameters()).device self.code2wav.load_weights(self.model_dir, device) diff --git a/vllm_omni/model_executor/models/cosyvoice3/utils.py b/vllm_omni/model_executor/models/cosyvoice3/utils.py index ca98e9aefb..52c52655e8 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/utils.py +++ b/vllm_omni/model_executor/models/cosyvoice3/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import logging import os from functools import cache, lru_cache @@ -10,6 +11,8 @@ import torchaudio.compliance.kaldi as kaldi from librosa.filters import mel as librosa_mel_fn +logger = logging.getLogger(__name__) + IGNORE_ID = -1 @@ -134,15 +137,24 @@ def 
mel_filters(device, n_mels: int) -> torch.Tensor: filters_path = os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz") if not os.path.exists(filters_path): source_url = "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz" - raise FileNotFoundError( - "Missing CosyVoice3 mel filter asset:\n" - f" {filters_path}\n" - "Download it manually from:\n" - f" {source_url}\n" - "Example:\n" - f" mkdir -p {os.path.dirname(filters_path)} && " - f"curl -L {source_url} -o {filters_path}" - ) + os.makedirs(os.path.dirname(filters_path), exist_ok=True) + try: + import urllib.request + + with urllib.request.urlopen(source_url, timeout=30) as resp: + with open(filters_path, "wb") as f_out: + f_out.write(resp.read()) + logger.info("Downloaded mel_filters.npz from %s", source_url) + except Exception as e: + raise FileNotFoundError( + "Missing CosyVoice3 mel filter asset:\n" + f" {filters_path}\n" + "Auto-download failed. Download it manually from:\n" + f" {source_url}\n" + "Example:\n" + f" mkdir -p {os.path.dirname(filters_path)} && " + f"curl -L {source_url} -o {filters_path}" + ) from e with np.load(filters_path, allow_pickle=False) as f: return torch.from_numpy(f[f"mel_{n_mels}"]).to(device) diff --git a/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml b/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml index 13b6ff55bd..e215f51428 100644 --- a/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml +++ b/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml @@ -9,16 +9,16 @@ stage_args: runtime: devices: 0 engine_args: - model_stage: talker + model_stage: cosyvoice3_talker worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler model_arch: CosyVoice3Model trust_remote_code: true - gpu_memory_utilization: 0.4 + gpu_memory_utilization: 0.2 engine_output_type: latent # Output speech tokens for chunk aware flow matching disable_hybrid_kv_cache_manager: true enable_prefix_caching: false - enforce_eager: false + enforce_eager: true mm_processor_cache_gb: 0 skip_mm_profiling: true dtype: "float32" @@ -27,14 +27,14 @@ stage_args: runtime: devices: 0 engine_args: - model_stage: code2wav + model_stage: cosyvoice3_code2wav model_arch: CosyVoice3Model trust_remote_code: true worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler engine_output_type: latent - gpu_memory_utilization: 0.2 - enforce_eager: false # CUDA graphs don't work with dynamic runtime_info access + gpu_memory_utilization: 0.1 + enforce_eager: true # CUDA graphs don't work with dynamic conv shapes in code2wav disable_hybrid_kv_cache_manager: true enable_prefix_caching: false skip_mm_profiling: true From 2804a85e58875f559267d2f811ca2b9e78dd2dae Mon Sep 17 00:00:00 2001 From: Zhou Taichang Date: Sat, 4 Apr 2026 15:03:32 +0800 Subject: [PATCH 042/204] [Rebase] Rebase to vllm v0.19.0 (#2475) --- .buildkite/pipeline-intel.yaml | 2 +- .buildkite/test-nightly.yml | 2 +- docker/Dockerfile.ci | 6 +- docker/Dockerfile.rocm | 2 +- docker/Dockerfile.xpu | 2 +- .../installation/gpu/cuda.inc.md | 11 +-- .../installation/gpu/rocm.inc.md | 14 ++-- docs/getting_started/quickstart.md | 4 +- docs/mkdocs/hooks/generate_argparse.py | 1 + .../quantization/test_quantization_quality.py | 8 +- .../offline_inference/test_bagel_text2img.py | 20 +++++ tests/e2e/online_serving/test_mimo_audio.py | 33 ++++---- 
tests/engine/test_async_omni_engine_abort.py | 1 + .../openai_api/test_serving_speech.py | 1 + tests/entrypoints/test_async_omni_abort.py | 4 +- tests/entrypoints/test_omni_entrypoints.py | 1 + tests/examples/conftest.py | 3 +- .../models/test_omni_processing.py | 9 ++- vllm_omni/benchmarks/patch/patch.py | 8 +- vllm_omni/core/sched/omni_ar_scheduler.py | 5 +- .../core/sched/omni_generation_scheduler.py | 7 +- .../diffusion/attention/backends/utils/fa.py | 4 +- vllm_omni/diffusion/diffusion_engine.py | 4 +- vllm_omni/diffusion/stage_diffusion_client.py | 15 +++- vllm_omni/diffusion/stage_diffusion_proc.py | 2 +- .../diffusion/worker/diffusion_worker.py | 10 ++- vllm_omni/engine/arg_utils.py | 48 +++++++++++- vllm_omni/engine/async_omni_engine.py | 40 +++++++++- vllm_omni/engine/stage_init_utils.py | 29 +++++++ vllm_omni/entrypoints/async_omni.py | 13 +++- vllm_omni/entrypoints/cli/main.py | 4 + vllm_omni/entrypoints/cli/serve.py | 32 ++++++++ vllm_omni/entrypoints/openai/api_server.py | 7 +- vllm_omni/entrypoints/openai/serving_chat.py | 36 ++++----- .../entrypoints/openai/serving_speech.py | 3 + vllm_omni/entrypoints/utils.py | 23 ++++-- vllm_omni/inputs/data.py | 6 +- vllm_omni/inputs/preprocess.py | 17 ++-- .../model_executor/models/bagel/bagel.py | 2 +- .../models/cosyvoice3/cosyvoice3.py | 3 +- .../models/glm_image/glm_image_ar.py | 2 +- .../models/hunyuan_image3/hunyuan_image3.py | 2 +- .../models/mimo_audio/mimo_audio.py | 3 +- .../models/mimo_audio/mimo_audio_llm.py | 2 +- .../models/qwen3_omni/qwen3_omni.py | 2 +- .../qwen3_omni/qwen3_omni_moe_thinker.py | 2 +- .../voxtral_tts_audio_generation.py | 2 +- vllm_omni/patch.py | 2 +- .../npu/worker/npu_ar_model_runner.py | 4 +- vllm_omni/worker/gpu_ar_model_runner.py | 16 +++- .../worker/gpu_generation_model_runner.py | 25 ++++-- vllm_omni/worker/gpu_model_runner.py | 77 ++++++++++++++----- 52 files changed, 438 insertions(+), 143 deletions(-) diff --git a/.buildkite/pipeline-intel.yaml b/.buildkite/pipeline-intel.yaml index 4334dd516b..2dc53ad963 100644 --- a/.buildkite/pipeline-intel.yaml +++ b/.buildkite/pipeline-intel.yaml @@ -10,7 +10,7 @@ steps: DOCKER_BUILDKIT: "1" # Buildkite will automatically replace this with the actual commit hash VLLM_IMAGE_TAG: "${BUILDKITE_COMMIT}" - VLLM_VERSION: "v0.18.0" + VLLM_VERSION: "v0.19.0" priority: 100 timeout_in_minutes: 60 soft_fail: true diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 32bf219bc9..9dc8885061 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -362,7 +362,7 @@ steps: - kubernetes: podSpec: containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:c392ce21e9cf9ea65c52b866447793db10e0261c + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT resources: limits: nvidia.com/gpu: 2 diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index f4253fe725..24ce39bafd 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -1,5 +1,5 @@ ARG VLLM_BASE_IMAGE=vllm/vllm-openai -ARG VLLM_BASE_TAG=v0.18.0 +ARG VLLM_BASE_TAG=v0.19.0 FROM ${VLLM_BASE_IMAGE}:${VLLM_BASE_TAG} ARG APP_DIR=/workspace/vllm-omni WORKDIR ${APP_DIR} @@ -11,9 +11,7 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# Install vllm-omni into the same uv-managed Python environment used by the base image. -# Use bash -c so that $(python3 -c ...) is expanded inside the container. 
-RUN uv pip install --system --no-cache-dir ".[dev]" +RUN uv pip install --system ".[dev]" RUN ln -sf /usr/bin/python3 /usr/bin/python diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index ce541497a3..bfbb060bcb 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.18.0 +ARG BASE_IMAGE=vllm/vllm-openai-rocm:v0.19.0 FROM ${BASE_IMAGE} AS base # Declare a variable to know if we want to use the nightly build or the stable build. diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 8901725f06..17f1aebf0d 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -76,7 +76,7 @@ ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} ENV UV_INDEX_STRATEGY="unsafe-best-match" ENV UV_LINK_MODE="copy" -ARG VLLM_VERSION=v0.18.0 +ARG VLLM_VERSION=v0.19.0 RUN git clone -b ${VLLM_VERSION} https://github.com/vllm-project/vllm WORKDIR /workspace/vllm diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index 932b54f0de..3e6600f66e 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -39,13 +39,13 @@ uv pip install 'vllm-omni[demo]' # --8<-- [start:build-wheel-from-source] #### Installation of vLLM -If you do not need to modify source code of vLLM, you can directly install the stable 0.18.0 release version of the library +If you do not need to modify source code of vLLM, you can directly install the stable 0.19.0 release version of the library ```bash -uv pip install vllm==0.18.0 --torch-backend=auto +uv pip install vllm==0.19.0 --torch-backend=auto ``` -The 0.18.0 release of vLLM ships CUDA 12.9-compatible binaries by default. If you need a different CUDA variant or want to reuse an existing PyTorch installation, build vLLM from source instead. +The 0.19.0 release of vLLM ships CUDA 13.0-compatible binaries by default. If you need a different CUDA variant or want to reuse an existing PyTorch installation, build vLLM from source instead. #### Installation of vLLM-Omni Since vllm-omni is rapidly evolving, it's recommended to install it from source @@ -66,11 +66,12 @@ If you want to check, modify or debug with source code of vLLM, install the libr ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -git checkout v0.18.0 +git checkout v0.19.0 ``` Set up environment variables to get pre-built wheels. If there are internet problems, just download the whl file manually. And set `VLLM_PRECOMPILED_WHEEL_LOCATION` as your local absolute path of whl file. ```bash -export VLLM_PRECOMPILED_WHEEL_LOCATION=https://github.com/vllm-project/vllm/releases/download/v0.18.0/vllm-0.18.0+cu129-cp38-abi3-manylinux_2_35_x86_64.whl +#For CUDA 13.0 +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://github.com/vllm-project/vllm/releases/download/v0.19.0/vllm-0.19.0+cu130-cp38-abi3-manylinux_2_35_x86_64.whl ``` Install vllm with command below (If you have no existing PyTorch). ```bash diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index da84561c96..1a683d174f 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -13,7 +13,7 @@ vLLM-Omni current recommends the steps in under setup through Docker Images. vLLM-Omni is built based on vLLM. Please install it with command below. 
```bash -uv pip install vllm==0.18.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.18.0/rocm700 +uv pip install vllm==0.19.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.19.0/rocm700 ``` #### Installation of vLLM-Omni @@ -34,13 +34,13 @@ uv pip install onnxruntime-rocm sox # --8<-- [start:build-wheel-from-source] #### Installation of vLLM -If you do not need to modify source code of vLLM, you can directly install the stable 0.18.0 release version of the library +If you do not need to modify source code of vLLM, you can directly install the stable 0.19.0 release version of the library ```bash -uv pip install vllm==0.18.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.18.0/rocm700 +uv pip install vllm==0.19.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.19.0/rocm700 ``` -The pre-built 0.18.0 vLLM wheel targets ROCm 7.0. If you need a different ROCm stack or want to reuse an existing PyTorch installation, build vLLM from source instead. +The pre-built 0.19.0 vLLM wheel targets ROCm 7.0. If you need a different ROCm stack or want to reuse an existing PyTorch installation, build vLLM from source instead. #### Installation of vLLM-Omni Since vllm-omni is rapidly evolving, it's recommended to install it from source @@ -58,7 +58,7 @@ If you want to check, modify or debug with source code of vLLM, install the libr ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -git checkout v0.18.0 +git checkout v0.19.0 python3 -m pip install -r requirements/rocm.txt python3 setup.py develop ``` @@ -130,7 +130,7 @@ docker run --rm \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=$HF_TOKEN" \ -p 8091:8091 \ - vllm/vllm-omni-rocm:v0.18.0 \ + vllm/vllm-omni-rocm:v0.19.0 \ --model Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 ``` @@ -149,7 +149,7 @@ docker run --rm -it \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=$HF_TOKEN" \ --entrypoint bash \ - vllm/vllm-omni-rocm:v0.18.0 + vllm/vllm-omni-rocm:v0.19.0 ``` # --8<-- [end:pre-built-images] diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index eef3dd1a79..45b3eab1d9 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -19,10 +19,10 @@ uv venv --python 3.12 --seed source .venv/bin/activate # On CUDA -uv pip install vllm==0.18.0 --torch-backend=auto +uv pip install vllm==0.19.0 --torch-backend=auto # On ROCm -uv pip install vllm==0.18.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.18.0/rocm700 +uv pip install vllm==0.19.0+rocm700 --extra-index-url https://wheels.vllm.ai/rocm/0.19.0/rocm700 git clone https://github.com/vllm-project/vllm-omni.git cd vllm-omni diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index 6cef7cfbd2..0aed44a0c6 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -120,6 +120,7 @@ def add_parser(self, name, **kwargs): "_FlexibleArgumentParser": _FlexibleArgumentParser, "FlexibleArgumentParser": _FlexibleArgumentParser, "make_arg_parser": lambda parser: parser, # no-op for doc + "_ensure_vllm_platform": lambda: None, # no-op for doc "VLLM_SUBCMD_PARSER_EPILOG": "", "logger": logger, "DummySubparsers": DummySubparsers, diff --git a/tests/diffusion/quantization/test_quantization_quality.py b/tests/diffusion/quantization/test_quantization_quality.py index a937a64861..3d8f187369 100644 --- a/tests/diffusion/quantization/test_quantization_quality.py +++ 
b/tests/diffusion/quantization/test_quantization_quality.py @@ -118,8 +118,12 @@ def _generate_image(omni, config: QualityTestConfig): peak_mem = torch.cuda.max_memory_allocated() / (1024**3) first = outputs[0] - req_out = first.request_output[0] if hasattr(first, "request_output") else first - return req_out.images[0], peak_mem + if hasattr(first, "images") and first.images: + return first.images[0], peak_mem + inner = first.request_output + if inner is not None and hasattr(inner, "images") and inner.images: + return inner.images[0], peak_mem + raise ValueError("Could not extract image from output.") def _generate_video(omni, config: QualityTestConfig): diff --git a/tests/e2e/offline_inference/test_bagel_text2img.py b/tests/e2e/offline_inference/test_bagel_text2img.py index c74763a35a..7cce8da3a7 100644 --- a/tests/e2e/offline_inference/test_bagel_text2img.py +++ b/tests/e2e/offline_inference/test_bagel_text2img.py @@ -229,6 +229,24 @@ def _wait_for_port(host: str, port: int, timeout: int = 30) -> bool: return False +def _is_mooncake_master_available() -> bool: + """Check if mooncake_master binary is present and can actually execute.""" + import shutil + + binary = shutil.which("mooncake_master") + if binary is None: + return False + try: + result = subprocess.run( + [binary, "--help"], + capture_output=True, + timeout=5, + ) + return result.returncode != 127 + except (subprocess.TimeoutExpired, OSError): + return True + + def _cleanup_mooncake_processes(timeout_secs: int = 5) -> None: """Clean up any existing mooncake_master processes. @@ -292,6 +310,8 @@ def _load_mooncake_config(host: str, rpc_port: int, http_port: int) -> str: @hardware_test(res={"cuda": "H100"}) def test_bagel_text2img_mooncake_connector(run_level): """Test Bagel text2img with Mooncake connector for inter-stage communication.""" + if not _is_mooncake_master_available(): + pytest.skip("mooncake_master is not available or cannot execute (missing shared libraries like libibverbs)") MOONCAKE_HOST = "127.0.0.1" MOONCAKE_RPC_PORT = _find_free_port() MOONCAKE_HTTP_PORT = _find_free_port() diff --git a/tests/e2e/online_serving/test_mimo_audio.py b/tests/e2e/online_serving/test_mimo_audio.py index 2fb63c1e42..43eeb77335 100644 --- a/tests/e2e/online_serving/test_mimo_audio.py +++ b/tests/e2e/online_serving/test_mimo_audio.py @@ -63,20 +63,27 @@ def download_tokenizer(): # CI stage config for H100 / MI325 -stage_configs = [get_chunk_config()] -tokenizer_path = download_tokenizer() -os.environ["MIMO_AUDIO_TOKENIZER_PATH"] = tokenizer_path - -# Create parameter combinations for model and stage config -test_params = [ - OmniServerParams( - model=model, - stage_config_path=stage_config, - server_args=["--chat-template", CHAT_TEMPLATE_PATH], +# Guard module-level setup so test collection doesn't fail in environments +# where the model cache is read-only or models aren't available. 
+try: + stage_configs = [get_chunk_config()] + tokenizer_path = download_tokenizer() + os.environ["MIMO_AUDIO_TOKENIZER_PATH"] = tokenizer_path + + test_params = [ + OmniServerParams( + model=model, + stage_config_path=stage_config, + server_args=["--chat-template", CHAT_TEMPLATE_PATH], + ) + for model in models + for stage_config in stage_configs + ] +except Exception as exc: + pytest.skip( + f"MiMo-Audio online serving tests skipped: module setup failed ({type(exc).__name__}: {exc})", + allow_module_level=True, ) - for model in models - for stage_config in stage_configs -] def get_prompt(prompt_type="text_only"): diff --git a/tests/engine/test_async_omni_engine_abort.py b/tests/engine/test_async_omni_engine_abort.py index a99c522c30..34fdf45ea2 100644 --- a/tests/engine/test_async_omni_engine_abort.py +++ b/tests/engine/test_async_omni_engine_abort.py @@ -60,6 +60,7 @@ async def generate( @pytest.mark.core_model @pytest.mark.omni +@pytest.mark.real_hf_config @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards=1) @pytest.mark.asyncio async def test_abort(): diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 83e4188c17..b140b7a046 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -1664,6 +1664,7 @@ def test_build_fish_prompt_normalizes_legacy_speaker_tags(self, fish_speech_serv assert all(allowed_special is None for _, _, allowed_special in tokenizer.calls) def test_build_fish_clone_prompt_normalizes_text_fields(self, fish_speech_server): + fish_speech_server._fish_speech_tokenizer = _FakeFishTokenizer() fish_speech_server._estimate_fish_prompt_len = MagicMock(return_value=123) request = OpenAICreateSpeechRequest( diff --git a/tests/entrypoints/test_async_omni_abort.py b/tests/entrypoints/test_async_omni_abort.py index 71f3e99feb..b34652162d 100644 --- a/tests/entrypoints/test_async_omni_abort.py +++ b/tests/entrypoints/test_async_omni_abort.py @@ -13,8 +13,8 @@ async def run_test(): submitted_request_ids = [] aborted_request_batches = [] - async def fake_add_request_async(*, request_id, prompt, sampling_params_list, final_stage_id): - del prompt, sampling_params_list, final_stage_id + async def fake_add_request_async(*, request_id, prompt, sampling_params_list, final_stage_id, **kwargs): + del prompt, sampling_params_list, final_stage_id, kwargs submitted_request_ids.append(request_id) async def fake_abort_async(request_ids): diff --git a/tests/entrypoints/test_omni_entrypoints.py b/tests/entrypoints/test_omni_entrypoints.py index 0aeb6158e2..3cffcd37df 100644 --- a/tests/entrypoints/test_omni_entrypoints.py +++ b/tests/entrypoints/test_omni_entrypoints.py @@ -113,6 +113,7 @@ def add_request( sampling_params_list: list[Any] | None = None, final_stage_id: int = 0, arrival_time: float | None = None, + **kwargs: Any, ) -> None: msg = { "request_id": request_id, diff --git a/tests/examples/conftest.py b/tests/examples/conftest.py index a66db90402..137d15f163 100644 --- a/tests/examples/conftest.py +++ b/tests/examples/conftest.py @@ -14,7 +14,6 @@ from pathlib import Path from typing import Any, NamedTuple, cast -import mistune import pytest import torch from safetensors.torch import save_file @@ -62,6 +61,8 @@ def extract_readme_snippets( readme_path: Path, skipif: ReadmeSnippetExtractionSkipPredicate | None = None, ) -> list["ReadmeSnippet"]: + import mistune + markdown = mistune.create_markdown(renderer="ast") tokens = 
markdown(readme_path.read_text(encoding="utf-8")) tokens = cast(list[dict[str, Any]], tokens) # mistune's AST renderer always produces a list, not a str diff --git a/tests/model_executor/models/test_omni_processing.py b/tests/model_executor/models/test_omni_processing.py index fe9c63b820..70a9ca0e90 100644 --- a/tests/model_executor/models/test_omni_processing.py +++ b/tests/model_executor/models/test_omni_processing.py @@ -21,9 +21,10 @@ ImageDummyOptions, VideoDummyOptions, ) -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict +from vllm.inputs import MultiModalDataDict, MultiModalInput +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import MultiModalProcessorOnlyCache -from vllm.multimodal.inputs import MultiModalInputs, batched_tensors_equal +from vllm.multimodal.inputs import batched_tensors_equal from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config @@ -302,8 +303,8 @@ def test_omni_processing_correctness( def _assert_inputs_equal( - a: MultiModalInputs, - b: MultiModalInputs, + a: MultiModalInput, + b: MultiModalInput, *, ignore_mm_keys: set[str] | None = None, msg: str = "", diff --git a/vllm_omni/benchmarks/patch/patch.py b/vllm_omni/benchmarks/patch/patch.py index 539af11f86..d8145c40bc 100644 --- a/vllm_omni/benchmarks/patch/patch.py +++ b/vllm_omni/benchmarks/patch/patch.py @@ -358,6 +358,7 @@ async def benchmark( lora_modules: Iterable[str] | None, extra_headers: dict | None, extra_body: dict | None, + lora_assignment: Literal["random", "round-robin"] = "random", ramp_up_strategy: Literal["linear", "exponential"] | None = None, ramp_up_start_rps: int | None = None, ramp_up_end_rps: int | None = None, @@ -454,8 +455,11 @@ async def warmup_limited_request_func(): print("Starting main benchmark run...") if lora_modules: - # For each input request, choose a LoRA module at random. - lora_modules = iter([random.choice(lora_modules) for _ in range(len(input_requests))]) + lora_modules_list = list(lora_modules) + if lora_assignment == "round-robin": + lora_modules = iter([lora_modules_list[i % len(lora_modules_list)] for i in range(len(input_requests))]) + else: + lora_modules = iter([random.choice(lora_modules_list) for _ in range(len(input_requests))]) if profile: print("Starting profiler...") diff --git a/vllm_omni/core/sched/omni_ar_scheduler.py b/vllm_omni/core/sched/omni_ar_scheduler.py index d49664161c..af178d14d2 100644 --- a/vllm_omni/core/sched/omni_ar_scheduler.py +++ b/vllm_omni/core/sched/omni_ar_scheduler.py @@ -235,7 +235,10 @@ def update_from_output( # These blocks contain externally computed tokens that failed to # load. Identify affected requests and adjust their computed token # count to trigger recomputation of the invalid blocks. - failed_kv_load_req_ids = self._handle_invalid_blocks(kv_connector_output.invalid_block_ids) + failed_kv_load_req_ids = self._handle_invalid_blocks( + kv_connector_output.invalid_block_ids, + num_scheduled_tokens, + ) # NOTE(woosuk): As len(num_scheduled_tokens) can be up to 1K or more, # the below loop can be a performance bottleneck. 
We should do our best diff --git a/vllm_omni/core/sched/omni_generation_scheduler.py b/vllm_omni/core/sched/omni_generation_scheduler.py index dded8f7aa4..1c4356d4f5 100644 --- a/vllm_omni/core/sched/omni_generation_scheduler.py +++ b/vllm_omni/core/sched/omni_generation_scheduler.py @@ -272,7 +272,7 @@ def schedule(self) -> SchedulerOutput: # KVTransfer: package metadata if self.connector is not None: - meta = self.connector.build_connector_meta(scheduler_output) + meta = self._build_kv_connector_meta(self.connector, scheduler_output) scheduler_output.kv_connector_metadata = meta # EC Connector: package metadata if self.ec_connector is not None: @@ -368,7 +368,10 @@ def update_from_output( failed_kv_load_req_ids = None if kv_connector_output and getattr(kv_connector_output, "invalid_block_ids", None): - failed_kv_load_req_ids = self._handle_invalid_blocks(kv_connector_output.invalid_block_ids) + failed_kv_load_req_ids = self._handle_invalid_blocks( + kv_connector_output.invalid_block_ids, + num_scheduled_tokens, + ) # NOTE(woosuk): As len(num_scheduled_tokens) can be up to 1K or more, # the below loop can be a performance bottleneck. We should do our best diff --git a/vllm_omni/diffusion/attention/backends/utils/fa.py b/vllm_omni/diffusion/attention/backends/utils/fa.py index 1fd47790f0..77596a1033 100644 --- a/vllm_omni/diffusion/attention/backends/utils/fa.py +++ b/vllm_omni/diffusion/attention/backends/utils/fa.py @@ -32,7 +32,9 @@ pass elif current_omni_platform.is_xpu(): try: - from vllm.v1.attention.backends.fa_utils import flash_attn_varlen_func # noqa: F401 + from vllm._xpu_ops import xpu_ops # noqa: F401 + + flash_attn_varlen_func = xpu_ops.flash_attn_varlen_func except (ImportError, ModuleNotFoundError): pass elif current_omni_platform.is_musa(): diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index 05008d7e91..784da61752 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -361,8 +361,8 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N def _dummy_run(self): """A dummy run to warm up the model.""" num_inference_steps = 1 - height = 1024 - width = 1024 + height = 512 + width = 512 if supports_image_input(self.od_config.model_class_name): # Provide a dummy image input if the model supports it color_format = image_color_format(self.od_config.model_class_name) diff --git a/vllm_omni/diffusion/stage_diffusion_client.py b/vllm_omni/diffusion/stage_diffusion_client.py index db13f99aab..77db2b1b97 100644 --- a/vllm_omni/diffusion/stage_diffusion_client.py +++ b/vllm_omni/diffusion/stage_diffusion_client.py @@ -244,7 +244,20 @@ def get_diffusion_output_nowait(self) -> OmniRequestOutput | None: return self._output_queue.get_nowait() except asyncio.QueueEmpty: if not self._shutting_down and self._proc is not None and not self._proc.is_alive(): - raise RuntimeError(f"StageDiffusionProc died unexpectedly (exit code {self._proc.exitcode})") + exitcode = self._proc.exitcode + # One final drain – the last ZMQ frame may have arrived + # between the first drain and the is_alive() check. 
+ self._drain_responses() + try: + return self._output_queue.get_nowait() + except asyncio.QueueEmpty: + pass + if exitcode is not None and exitcode > 128: + sig = exitcode - 128 + logger.warning("StageDiffusionProc was killed by signal %d; treating as external shutdown.", sig) + self._shutting_down = True + return None + raise RuntimeError(f"StageDiffusionProc died unexpectedly (exit code {exitcode})") return None async def abort_requests_async(self, request_ids: list[str]) -> None: diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py index 8677da0371..0a5fd35901 100644 --- a/vllm_omni/diffusion/stage_diffusion_proc.py +++ b/vllm_omni/diffusion/stage_diffusion_proc.py @@ -495,7 +495,7 @@ def signal_handler(signum: int, frame: Any) -> None: nonlocal shutdown_requested if not shutdown_requested: shutdown_requested = True - raise SystemExit() + raise SystemExit(128 + signum) signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py index 6e1cabba0c..ea4b9d96f7 100644 --- a/vllm_omni/diffusion/worker/diffusion_worker.py +++ b/vllm_omni/diffusion/worker/diffusion_worker.py @@ -17,7 +17,7 @@ import torch import zmq -from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config +from vllm.config import CompilationConfig, DeviceConfig, VllmConfig, set_current_vllm_config from vllm.distributed.device_communicators.shm_broadcast import MessageQueue from vllm.logger import init_logger from vllm.utils.import_utils import resolve_obj_by_qualname @@ -113,8 +113,12 @@ def init_device(self) -> None: self.device = current_omni_platform.get_torch_device(rank) current_omni_platform.set_device(self.device) - # Create vllm_config for parallel configuration - vllm_config = VllmConfig(compilation_config=CompilationConfig()) + # Create vllm_config for parallel configuration. Pass explicit device_config + # so DeviceConfig does not rely on current_platform in worker subprocesses. + vllm_config = VllmConfig( + compilation_config=CompilationConfig(), + device_config=DeviceConfig(device=self.device), + ) vllm_config.parallel_config.tensor_parallel_size = self.od_config.parallel_config.tensor_parallel_size vllm_config.parallel_config.data_parallel_size = self.od_config.parallel_config.data_parallel_size vllm_config.parallel_config.enable_expert_parallel = self.od_config.parallel_config.enable_expert_parallel diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index 5bc51043a5..b663789262 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -1,6 +1,8 @@ import argparse import dataclasses +import json import os +import tempfile from dataclasses import dataclass, field from typing import Any @@ -135,6 +137,30 @@ def _ensure_omni_models_registered(self): self._omni_models_registered = True return True + def _patch_empty_hf_config(self, model_type: str) -> None: + """For models with empty config.json (e.g. CosyVoice3), create a + patched config in a temp directory with model_type set so that + transformers AutoConfig.from_pretrained can resolve the config class. 
+ Sets self.hf_config_path to point to the patched directory.""" + try: + from transformers import PretrainedConfig + + config_dict, _ = PretrainedConfig.get_config_dict(self.model) + if config_dict.get("model_type"): + return # config.json already has model_type, no patching needed + except Exception: + return # can't load config, let vLLM handle the error + + # Create a temp dir with a patched config.json + temp_dir = tempfile.mkdtemp(prefix="omni_hf_config_") + config_dict["model_type"] = model_type + config_dict.setdefault("architectures", [self.model_arch]) + with open(os.path.join(temp_dir, "config.json"), "w") as f: + json.dump(config_dict, f) + self.hf_config_path = temp_dir + self._temp_config_dir = temp_dir + logger.info("Patched empty HF config with model_type=%s at %s", model_type, temp_dir) + def create_model_config(self) -> OmniModelConfig: """Create an OmniModelConfig from these engine arguments. Returns: @@ -164,6 +190,18 @@ def create_model_config(self) -> OmniModelConfig: if model_type is not None: self.hf_overrides.setdefault("model_type", model_type) + # For models whose HF config.json is empty or lacks model_type + # (e.g. CosyVoice3), AutoConfig.from_pretrained fails because it + # cannot determine which config class to use from the empty dict. + # hf_overrides alone is not enough since transformers reads + # model_type from config_dict before applying overrides. + # Workaround: create a patched config.json in a temp directory + # and point hf_config_path to it so vLLM reads model_type from it. + if not self.hf_config_path: + model_type = _ARCH_TO_MODEL_TYPE.get(self.model_arch) + if model_type is not None: + self._patch_empty_hf_config(model_type) + # Auto-detect tokenizer for models that store it in a subdirectory # rather than the root (e.g. CosyVoice3 uses CosyVoice-BlankEN/). if not self.tokenizer and self.model: @@ -200,7 +238,15 @@ def create_model_config(self) -> OmniModelConfig: logger.warning("Failed to download tokenizer subfolder: %s", e) # Build the vLLM config first, then use it to create the Omni config. 
- model_config = super().create_model_config() + try: + model_config = super().create_model_config() + finally: + # Clean up temp config dir if we created one + if hasattr(self, "_temp_config_dir"): + import shutil + + shutil.rmtree(self._temp_config_dir, ignore_errors=True) + del self._temp_config_dir omni_config = OmniModelConfig.from_vllm_model_config( model_config=model_config, diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index d9960ecbac..092b341e42 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -17,7 +17,7 @@ import time import uuid import weakref -from collections.abc import Sequence +from collections.abc import Mapping, Sequence from dataclasses import asdict from typing import TYPE_CHECKING, Any @@ -652,6 +652,12 @@ def _build_add_request_message( sampling_params_list: Sequence[Any] | None = None, final_stage_id: int = 0, arrival_time: float | None = None, + lora_request: Any = None, + tokenization_kwargs: dict[str, Any] | None = None, + trace_headers: Mapping[str, str] | None = None, + priority: int = 0, + data_parallel_rank: int | None = None, + reasoning_ended: bool | None = None, *, resumable: bool = False, message_type: str = "add_request", @@ -686,12 +692,20 @@ def _build_add_request_message( params=params, supported_tasks=self.supported_tasks, arrival_time=arrival_time, + lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, + trace_headers=trace_headers, + priority=priority, + data_parallel_rank=data_parallel_rank, resumable=resumable, ) # TODO (Peiqi): add this for Qwen3-TTS only. Other models don't have # additional_information field in the prompt. request = _upgrade_to_omni_request(request, prompt) + if reasoning_ended is not None: + request.reasoning_ended = reasoning_ended + # Restore external_req_id to the original user-facing request_id. 
# InputProcessor.process_inputs() renames request_id to an internal # UUID (saving the original in external_req_id), but then overwrites @@ -973,6 +987,12 @@ def add_request( sampling_params_list: Sequence[Any] | None = None, final_stage_id: int = 0, arrival_time: float | None = None, + lora_request: Any = None, + tokenization_kwargs: dict[str, Any] | None = None, + trace_headers: Mapping[str, str] | None = None, + priority: int = 0, + data_parallel_rank: int | None = None, + reasoning_ended: bool | None = None, *, resumable: bool = False, ) -> None: @@ -990,6 +1010,12 @@ def add_request( sampling_params_list=sampling_params_list, final_stage_id=final_stage_id, arrival_time=arrival_time, + lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, + trace_headers=trace_headers, + priority=priority, + data_parallel_rank=data_parallel_rank, + reasoning_ended=reasoning_ended, resumable=resumable, ) if self.request_queue is None: @@ -1013,6 +1039,12 @@ async def add_request_async( sampling_params_list: Sequence[Any] | None = None, final_stage_id: int = 0, arrival_time: float | None = None, + lora_request: Any = None, + tokenization_kwargs: dict[str, Any] | None = None, + trace_headers: Mapping[str, str] | None = None, + priority: int = 0, + data_parallel_rank: int | None = None, + reasoning_ended: bool | None = None, *, resumable: bool = False, ) -> None: @@ -1024,6 +1056,12 @@ async def add_request_async( sampling_params_list=sampling_params_list, final_stage_id=final_stage_id, arrival_time=arrival_time, + lora_request=lora_request, + tokenization_kwargs=tokenization_kwargs, + trace_headers=trace_headers, + priority=priority, + data_parallel_rank=data_parallel_rank, + reasoning_ended=reasoning_ended, resumable=resumable, ) diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 6e81372061..f71afad83b 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -467,6 +467,35 @@ def initialize_diffusion_stage( return StageDiffusionClient(model, od_config, metadata, batch_size=batch_size) +def _shutdown_or_close_resource(resource: Any, resource_name: str, stage_id: int) -> None: + """vLLM CoreEngineProcManager / coordinators use ``shutdown()``, not ``close()``.""" + if resource is None: + return + shutdown = getattr(resource, "shutdown", None) + if callable(shutdown): + try: + shutdown() + except Exception as cleanup_error: + logger.warning( + "[stage_init] Failed to shutdown launched %s for stage %s: %s", + resource_name, + stage_id, + cleanup_error, + ) + return + close = getattr(resource, "close", None) + if callable(close): + try: + close() + except Exception as cleanup_error: + logger.warning( + "[stage_init] Failed to close launched %s for stage %s: %s", + resource_name, + stage_id, + cleanup_error, + ) + + def close_started_llm_stage(started: StartedLlmStage) -> None: """Terminate the subprocess owned by a launched stage that never attached.""" if started.proc is None: diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py index 6c8022461b..129ef3c99d 100644 --- a/vllm_omni/entrypoints/async_omni.py +++ b/vllm_omni/entrypoints/async_omni.py @@ -9,7 +9,7 @@ import asyncio import time -from collections.abc import AsyncGenerator, Iterable, Sequence +from collections.abc import AsyncGenerator, Iterable, Mapping, Sequence from typing import TYPE_CHECKING, Any from vllm import TokensPrompt @@ -88,7 +88,12 @@ def __init__(self, *args: Any, model: str = "", **kwargs: Any) -> None: 
else: vllm_config = self.engine.stage_vllm_configs[stage_index] io_processor_plugin = vllm_config.model_config.io_processor_plugin - self.io_processor = get_io_processor(vllm_config, io_processor_plugin) + renderer = self.renderer + if renderer is None: + from vllm.renderers import renderer_from_config + + renderer = renderer_from_config(vllm_config) + self.io_processor = get_io_processor(vllm_config, renderer, io_processor_plugin) def _get_comprehension_stage_index(self) -> int | None: fallback_idx: int | None = None @@ -159,6 +164,10 @@ async def generate( tokenization_kwargs: dict[str, Any] | None = None, sampling_params_list: Sequence[OmniSamplingParams] | None = None, output_modalities: list[str] | None = None, + trace_headers: Mapping[str, str] | None = None, + priority: int = 0, + data_parallel_rank: int | None = None, + reasoning_ended: bool | None = None, ) -> AsyncGenerator[OmniRequestOutput, None]: """Generate outputs for the given prompt(s) asynchronously. diff --git a/vllm_omni/entrypoints/cli/main.py b/vllm_omni/entrypoints/cli/main.py index 629a4641cc..affa6c8334 100644 --- a/vllm_omni/entrypoints/cli/main.py +++ b/vllm_omni/entrypoints/cli/main.py @@ -28,6 +28,10 @@ def main(): cli_env_setup() + from vllm_omni.entrypoints.cli.serve import _ensure_vllm_platform + + _ensure_vllm_platform() + parser = FlexibleArgumentParser( description="vLLM OMNI CLI", epilog=VLLM_SUBCMD_PARSER_EPILOG.format(subcmd="[subcommand]"), diff --git a/vllm_omni/entrypoints/cli/serve.py b/vllm_omni/entrypoints/cli/serve.py index 4e1c8d3a94..b72df41cdd 100644 --- a/vllm_omni/entrypoints/cli/serve.py +++ b/vllm_omni/entrypoints/cli/serve.py @@ -42,6 +42,37 @@ """ +def _ensure_vllm_platform(): + """Ensure vLLM's current_platform is valid before arg parsing. + + Upstream vLLM's argument parser now instantiates DeviceConfig during + ``make_arg_parser``, which requires a resolved platform with a non-empty + ``device_type``. In some environments (e.g. editable installs with + broken package metadata), vLLM's own platform auto-detection may fail + and fall back to ``UnspecifiedPlatform``. When that happens, use the + Omni platform (which has its own detection logic) as a drop-in + replacement so that argument parsing succeeds. 
+ """ + from vllm import platforms as vllm_platforms + + if vllm_platforms.current_platform.is_unspecified(): + from vllm_omni.platforms import current_omni_platform + + if not current_omni_platform.is_unspecified(): + vllm_platforms.current_platform = current_omni_platform + logger.debug( + "Replaced vLLM UnspecifiedPlatform with omni platform %s", + type(current_omni_platform).__name__, + ) + else: + from vllm.platforms.cpu import CpuPlatform + + vllm_platforms.current_platform = CpuPlatform() + logger.debug( + "Both vLLM and omni platforms are unspecified, falling back to CpuPlatform for arg parsing", + ) + + class OmniServeCommand(CLISubcommand): """The `serve` subcommand for the vLLM CLI.""" @@ -82,6 +113,7 @@ def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgu usage="vllm serve [model_tag] --omni [options]", ) + _ensure_vllm_platform() serve_parser = make_arg_parser(serve_parser) serve_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(subcmd=self.name) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 4a7b097b2f..627174b20e 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -639,6 +639,7 @@ async def omni_init_app_state( OpenAIServingResponses( engine_client, state.openai_serving_models, + openai_serving_render=state.openai_serving_render, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, @@ -699,7 +700,8 @@ async def omni_init_app_state( OpenAIServingPooling( engine_client, state.openai_serving_models, - supported_tasks=supported_tasks, + state.openai_serving_render, + supported_tasks=tuple(supported_tasks), request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, @@ -746,6 +748,7 @@ async def omni_init_app_state( state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, state.openai_serving_models, + state.openai_serving_render, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, @@ -787,6 +790,7 @@ async def omni_init_app_state( reasoning_parser=args.structured_outputs_config.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, + default_chat_template_kwargs=args.default_chat_template_kwargs, ) if "generate" in supported_tasks else None @@ -795,6 +799,7 @@ async def omni_init_app_state( ServingTokens( engine_client, state.openai_serving_models, + state.openai_serving_render, request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_prompt_tokens_details=args.enable_prompt_tokens_details, diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 35f56516c7..e84a49aac2 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -62,7 +62,7 @@ from vllm.entrypoints.openai.responses.protocol import ResponsesRequest from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls from vllm.entrypoints.utils import should_include_usage -from vllm.inputs.data import PromptType +from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.reasoning import ReasoningParser @@ -818,7 +818,7 @@ async 
def chat_completion_stream_generator( # Prepare the tool parser if it's needed try: if tool_choice_auto and self.tool_parser: - tool_parsers: list[ToolParser | None] = [self.tool_parser(tokenizer)] * num_choices + tool_parsers: list[ToolParser | None] = [self.tool_parser(tokenizer, request.tools)] * num_choices else: tool_parsers = [None] * num_choices except Exception as e: @@ -1639,12 +1639,12 @@ def _create_text_choice( logprobs = None if self.use_harmony: - reasoning_content, content, _ = parse_chat_output(token_ids) + reasoning, content, _ = parse_chat_output(token_ids) if not request.include_reasoning: - reasoning_content = None + reasoning = None if self.tool_parser is not None: - tool_parser = self.tool_parser(tokenizer) + tool_parser = self.tool_parser(tokenizer, request.tools) # NOTE: We use token_ids for openai tool parser tool_call_info = tool_parser.extract_tool_calls( "", @@ -1654,14 +1654,14 @@ def _create_text_choice( content = tool_call_info.content message = ChatMessage( role=role, - reasoning_content=reasoning_content, + reasoning=reasoning, content=content, tool_calls=tool_call_info.tool_calls, ) else: message = ChatMessage( role=role, - reasoning_content=reasoning_content, + reasoning=reasoning, content=content, ) @@ -1682,11 +1682,11 @@ def _create_text_choice( if reasoning_parser: # If the reasoning parser is enabled, # tool calls are extracted exclusively from the content. - reasoning_content, content = reasoning_parser.extract_reasoning(output.text, request=request) + reasoning, content = reasoning_parser.extract_reasoning(output.text, request=request) if not request.include_reasoning: - reasoning_content = None + reasoning = None else: - reasoning_content = None + reasoning = None content = output.text auto_tools_called = False @@ -1696,14 +1696,14 @@ def _create_text_choice( not isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam) and request.tool_choice != "required" ): - message = ChatMessage(role=role, reasoning_content=reasoning_content, content=content) + message = ChatMessage(role=role, reasoning=reasoning, content=content) # if the request uses tools and specified a tool choice elif request.tool_choice and type(request.tool_choice) is ChatCompletionNamedToolChoiceParam: tool_call_class = MistralToolCall if isinstance(tokenizer, MistralTokenizer) else ToolCall message = ChatMessage( role=role, - reasoning_content=reasoning_content, + reasoning=reasoning, content="", tool_calls=[ tool_call_class( @@ -1745,13 +1745,13 @@ def _create_text_choice( ) for i, tool_call in enumerate(tool_calls) ], - reasoning_content=reasoning_content, + reasoning=reasoning, ) # if the request doesn't use tool choice # OR specifies to not use a tool elif not request.tool_choice or request.tool_choice == "none": - message = ChatMessage(role=role, reasoning_content=reasoning_content, content=content) + message = ChatMessage(role=role, reasoning=reasoning, content=content) # handle when there are tools and tool choice is auto elif ( @@ -1761,7 +1761,7 @@ def _create_text_choice( and self.tool_parser ): try: - tool_parser = self.tool_parser(tokenizer) + tool_parser = self.tool_parser(tokenizer, request.tools) except RuntimeError as e: logger.exception("Error in tool parser creation.") return self.create_error_response(e) @@ -1774,7 +1774,7 @@ def _create_text_choice( if tool_call_info.tools_called: message = ChatMessage( role=role, - reasoning_content=reasoning_content, + reasoning=reasoning, content=tool_call_info.content, tool_calls=tool_call_info.tool_calls, ) @@ 
-1790,7 +1790,7 @@ def _create_text_choice( ret_content = tool_call_info.content message = ChatMessage( role=role, - reasoning_content=reasoning_content, + reasoning=reasoning, content=ret_content, ) @@ -1800,7 +1800,7 @@ def _create_text_choice( "Error in chat_completion_full_generator - cannot determine if tools should be extracted. " "Returning a standard chat completion." ) - message = ChatMessage(role=role, reasoning_content=reasoning_content, content=content) + message = ChatMessage(role=role, reasoning=reasoning, content=content) choice_data = ChatCompletionResponseChoice( index=output.index, diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 8126fd544f..f051268824 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -1642,3 +1642,6 @@ async def _run_item(idx: int, req: OpenAICreateSpeechRequest) -> SpeechBatchItem succeeded=succeeded, failed=len(final_results) - succeeded, ) + + +ServingSpeech = OmniOpenAIServingSpeech diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index c664fe80a0..e29e9eea1c 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -1,7 +1,7 @@ import os import types from collections import Counter -from dataclasses import asdict, fields, is_dataclass +from dataclasses import fields, is_dataclass from pathlib import Path from typing import Any, get_args, get_origin @@ -145,12 +145,21 @@ def _convert_dataclasses_to_dict(obj: Any) -> Any: if isinstance(obj, set): return list(obj) # Handle dataclass objects - # Note: asdict() recursively converts nested dataclasses but not Counter objects, - # so we need to recursively process the result - if is_dataclass(obj): - result = asdict(obj) - # Recursively process the result to convert any Counter objects - return _convert_dataclasses_to_dict(result) + # Use field iteration instead of asdict() to: + # 1. Only include init fields (non-init fields cause "unexpected kwarg" errors) + # 2. Skip None values matching field defaults (avoids Pydantic validation + # failures when None is explicitly passed for non-Optional typed fields, + # e.g. CompilationConfig.cudagraph_capture_sizes: list[int] = None) + if is_dataclass(obj) and not isinstance(obj, type): + result = {} + for f in fields(obj): + if not f.init: + continue + value = getattr(obj, f.name) + if value is None and f.default is None: + continue + result[f.name] = _convert_dataclasses_to_dict(value) + return result # Handle dictionaries (recurse into values) and filter out callables(cause error in OmegaConf.create) # Note: This must come AFTER Counter check since Counter is a dict subclass if isinstance(obj, dict): diff --git a/vllm_omni/inputs/data.py b/vllm_omni/inputs/data.py index 5768c3b6d9..7824e7092d 100644 --- a/vllm_omni/inputs/data.py +++ b/vllm_omni/inputs/data.py @@ -16,7 +16,7 @@ import torch -from vllm.inputs.data import EmbedsPrompt, TextPrompt, TokenInputs, TokensPrompt +from vllm.inputs import EmbedsPrompt, TextPrompt, TokensInput, TokensPrompt class OmniTextPrompt(TextPrompt): @@ -59,10 +59,10 @@ class OmniTokensPrompt(TokensPrompt): additional_information: NotRequired[dict[str, Any]] -class OmniTokenInputs(TokenInputs): +class OmniTokenInputs(TokensInput): """Token inputs with optional embeddings and additional information. 
- Extends TokenInputs to support prompt embeddings and additional + Extends TokensInput to support prompt embeddings and additional information payloads for direct transfer between pipeline stages. Attributes: diff --git a/vllm_omni/inputs/preprocess.py b/vllm_omni/inputs/preprocess.py index 15f31627fb..c6dffd0542 100644 --- a/vllm_omni/inputs/preprocess.py +++ b/vllm_omni/inputs/preprocess.py @@ -1,10 +1,9 @@ from typing import Any from typing_extensions import assert_never -from vllm.inputs.data import EmbedsInputs, SingletonInputs +from vllm.inputs import EmbedsInput, MultiModalInput, SingletonInput from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger -from vllm.multimodal.inputs import MultiModalInputs from vllm.renderers.inputs import SingletonDictPrompt from vllm_omni.inputs.data import ( @@ -30,7 +29,7 @@ def _process_text( self, parsed_content: OmniTextPrompt, tokenization_kwargs: dict[str, Any] | None = None, - ) -> OmniTokenInputs | MultiModalInputs: + ) -> OmniTokenInputs | MultiModalInput: """Process text prompts with support for mm_processor_kwargs. Extends base class to support mm_processor_kwargs without multi_modal_data. @@ -40,7 +39,7 @@ def _process_text( prompt_text = parsed_content["prompt"] mm_processor_kwargs = parsed_content.get("mm_processor_kwargs") or {} - inputs: OmniTokenInputs | MultiModalInputs + inputs: OmniTokenInputs | MultiModalInput if multi_modal_data := parsed_content.get("multi_modal_data"): inputs = self._process_multimodal( prompt_text, @@ -86,14 +85,14 @@ def _process_tokens( self, parsed_content: OmniTokensPrompt, tokenization_kwargs: dict[str, Any] | None = None, - ) -> OmniTokenInputs | MultiModalInputs: + ) -> OmniTokenInputs | MultiModalInput: prompt_token_ids = self._truncate_inputs(parsed_content["prompt_token_ids"], tokenization_kwargs) prompt_embeds = parsed_content.get("prompt_embeds") additional_information = parsed_content.get("additional_information") multi_modal_data = parsed_content.get("multi_modal_data") - inputs: OmniTokenInputs | MultiModalInputs + inputs: OmniTokenInputs | MultiModalInput if multi_modal_data: inputs = self._process_multimodal( prompt_token_ids, @@ -123,7 +122,7 @@ def _process_tokens( def _process_embeds( self, parsed_content: OmniEmbedsPrompt, - ) -> EmbedsInputs: + ) -> EmbedsInput: """Process embeddings prompt with omni-specific extensions. Extends base _process_embeds to handle additional_information payload @@ -143,7 +142,7 @@ def _prompt_to_llm_inputs( self, prompt: SingletonDictPrompt, tokenization_kwargs: dict[str, Any] | None = None, - ) -> SingletonInputs: + ) -> SingletonInput: """ Extract the singleton inputs from a prompt. 
@@ -153,7 +152,7 @@ def _prompt_to_llm_inputs( Returns: - * [`SingletonInputs`][vllm.inputs.data.SingletonInputs] instance + * [`SingletonInput`][vllm.inputs.engine.SingletonInput] instance """ if "prompt_embeds" in prompt: return self._process_embeds(prompt) # type: ignore[arg-type] diff --git a/vllm_omni/model_executor/models/bagel/bagel.py b/vllm_omni/model_executor/models/bagel/bagel.py index e79f0212e2..934f434e64 100644 --- a/vllm_omni/model_executor/models/bagel/bagel.py +++ b/vllm_omni/model_executor/models/bagel/bagel.py @@ -8,6 +8,7 @@ from transformers import BatchFeature from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions +from vllm.inputs import MultiModalDataDict from vllm.model_executor.layers.layernorm import RMSNorm as VllmRMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, @@ -20,7 +21,6 @@ from vllm.model_executor.models.utils import AutoWeightsLoader from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( - MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, ) diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py index 784393e181..bc04aae33c 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py +++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py @@ -10,10 +10,11 @@ from transformers.feature_extraction_utils import BatchFeature from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions +from vllm.inputs import MultiModalDataDict from vllm.logger import init_logger from vllm.model_executor.models.interfaces import SupportsMultiModal from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser from vllm.multimodal.processing import ( BaseDummyInputsBuilder, diff --git a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py index f90826fd3b..31eed9b2cb 100644 --- a/vllm_omni/model_executor/models/glm_image/glm_image_ar.py +++ b/vllm_omni/model_executor/models/glm_image/glm_image_ar.py @@ -40,6 +40,7 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.distributed import utils as dist_utils +from vllm.inputs import MultiModalDataDict from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention from vllm.model_executor.layers.attention.mm_encoder_attention import ( @@ -73,7 +74,6 @@ from vllm.model_executor.models.vision import get_vit_attn_backend from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( - MultiModalDataDict, MultiModalFeatureSpec, MultiModalFieldConfig, MultiModalKwargsItems, diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index 5a0ae99657..6d25274f90 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -20,6 +20,7 @@ from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_pp_group +from vllm.inputs import 
MultiModalDataDict from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import SharedFusedMoE from vllm.model_executor.layers.linear import ( @@ -58,7 +59,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import rgba_to_rgb from vllm.multimodal.inputs import ( - MultiModalDataDict, MultiModalFeatureSpec, MultiModalFieldConfig, MultiModalKwargsItems, diff --git a/vllm_omni/model_executor/models/mimo_audio/mimo_audio.py b/vllm_omni/model_executor/models/mimo_audio/mimo_audio.py index 9acb81bce5..22a9a91113 100644 --- a/vllm_omni/model_executor/models/mimo_audio/mimo_audio.py +++ b/vllm_omni/model_executor/models/mimo_audio/mimo_audio.py @@ -10,6 +10,7 @@ from transformers import BatchFeature, Qwen2Config from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions +from vllm.inputs import ModalityData, MultiModalDataDict from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.models import SupportsPP @@ -18,9 +19,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( AudioItem, - ModalityData, MultiModalBatchedField, - MultiModalDataDict, MultiModalFieldConfig, MultiModalFieldElem, MultiModalKwargsItem, diff --git a/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py b/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py index 1424ca7756..56cb8788ee 100644 --- a/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py +++ b/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py @@ -13,6 +13,7 @@ ) from vllm.config import VllmConfig from vllm.forward_context import get_forward_context +from vllm.inputs import MultiModalDataDict from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.model_loader.weight_utils import default_weight_loader, maybe_remap_kv_scale_name @@ -34,7 +35,6 @@ ) from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( - MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, ) diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py index 04212ceeba..ed6df6af36 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py @@ -18,7 +18,7 @@ Qwen3OmniMoeThinkerConfig, ) from vllm.config import ModelConfig, VllmConfig -from vllm.inputs.data import PromptType, TokensPrompt +from vllm.inputs import PromptType, TokensPrompt from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.models.interfaces import SupportsMRoPE, SupportsMultiModal, SupportsPP, SupportsRealtime diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py index fd7b14ab42..671ffb6cb1 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py @@ -46,7 +46,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs.data import PromptType +from vllm.inputs import PromptType from 
vllm.logger import init_logger from vllm.model_executor.layers.activation import _ACTIVATION_REGISTRY from vllm.model_executor.layers.attention.mm_encoder_attention import ( diff --git a/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py b/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py index 3b8927d63d..b5d1161733 100644 --- a/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py +++ b/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py @@ -29,6 +29,7 @@ from transformers import BatchFeature from transformers.tokenization_utils_base import TextInput from vllm.config import VllmConfig +from vllm.inputs import MultiModalDataDict from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import SupportsMultiModal @@ -39,7 +40,6 @@ ) from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( - MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems, NestedTensors, diff --git a/vllm_omni/patch.py b/vllm_omni/patch.py index 6daef1a23f..eafff821a2 100644 --- a/vllm_omni/patch.py +++ b/vllm_omni/patch.py @@ -1,7 +1,7 @@ import sys from aenum import extend_enum -from vllm.inputs.data import TokensPrompt as _OriginalTokensPrompt +from vllm.inputs import TokensPrompt as _OriginalTokensPrompt from vllm.model_executor.layers.rotary_embedding import ( MRotaryEmbedding as _OriginalMRotaryEmbedding, ) diff --git a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py index 3d9cb86bac..138948064b 100644 --- a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py +++ b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py @@ -26,8 +26,8 @@ from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.structured_output.utils import apply_grammar_bitmask from vllm.v1.utils import record_function_or_nullcontext -from vllm.v1.worker import mamba_utils from vllm.v1.worker.gpu_model_runner import AsyncGPUModelRunnerOutput, PerLayerAttnMetadata +from vllm.v1.worker.mamba_utils import preprocess_mamba from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices from vllm_ascend.ascend_forward_context import set_ascend_forward_context from vllm_ascend.attention.utils import AscendCommonAttentionMetadata @@ -243,7 +243,7 @@ def execute_model( # '_update_states_after_model_execute', which is not overridden in vLLM-Ascend. # We simply utilize the implementation in vLLM. 
if self.cache_config.mamba_cache_mode == "align": - mamba_utils.preprocess_mamba( + preprocess_mamba( scheduler_output, self.kv_cache_config, self.cache_config, diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 155b75675f..f1115ab4c6 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -137,8 +137,10 @@ def execute_model( else: logger.error("RoutedExpertsCapturer not initialized.") - if scheduler_output.preempted_req_ids and has_kv_transfer_group(): - get_kv_transfer_group().handle_preemptions(scheduler_output.preempted_req_ids) + if has_kv_transfer_group(): + kv_connector_metadata = scheduler_output.kv_connector_metadata + if kv_connector_metadata is not None: + get_kv_transfer_group().handle_preemptions(kv_connector_metadata) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens with ( @@ -146,7 +148,7 @@ def execute_model( self.synchronize_input_prep(), ): # Update persistent batch states. - self._update_states(scheduler_output) + deferred_state_corrections_fn = self._update_states(scheduler_output) if has_ec_transfer() and not get_ec_transfer().is_consumer: with self.maybe_get_ec_connector_output( @@ -417,6 +419,9 @@ def execute_model( ) self.kv_connector_output = kv_connector_output + if deferred_state_corrections_fn: + deferred_state_corrections_fn() + return None @torch.inference_mode() @@ -474,8 +479,11 @@ def sample_tokens( with record_function_or_nullcontext("gpu_model_runner: sample"): sampler_output = self._sample(logits, spec_decode_metadata) + self._update_states_after_model_execute(sampler_output.sampled_token_ids, scheduler_output) + self._draft_token_ids = None self._draft_token_req_ids = None + self.valid_sampled_token_count_gpu = None self.input_batch.prev_sampled_token_ids = None def propose_draft_token_ids(sampled_token_ids): @@ -515,7 +523,7 @@ def propose_draft_token_ids(sampled_token_ids): elif self.valid_sampled_token_count_event is not None: assert spec_decode_common_attn_metadata is not None next_token_ids, valid_sampled_tokens_count = self.drafter.prepare_next_token_ids_padded( - spec_decode_common_attn_metadata, + self.optimistic_seq_lens_cpu, sampled_token_ids, self.requests, self.input_batch, diff --git a/vllm_omni/worker/gpu_generation_model_runner.py b/vllm_omni/worker/gpu_generation_model_runner.py index 4db683a8b4..d95b676f6d 100644 --- a/vllm_omni/worker/gpu_generation_model_runner.py +++ b/vllm_omni/worker/gpu_generation_model_runner.py @@ -94,8 +94,10 @@ def execute_model( else: logger.error("RoutedExpertsCapturer not initialized.") - if scheduler_output.preempted_req_ids and has_kv_transfer_group(): - get_kv_transfer_group().handle_preemptions(scheduler_output.preempted_req_ids) + if has_kv_transfer_group(): + kv_connector_metadata = scheduler_output.kv_connector_metadata + if kv_connector_metadata is not None: + get_kv_transfer_group().handle_preemptions(kv_connector_metadata) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens with ( @@ -104,7 +106,7 @@ def execute_model( ): if self.model_config.async_chunk and num_scheduled_tokens: self._update_request_states(scheduler_output) - self._update_states(scheduler_output) + deferred_state_corrections_fn = self._update_states(scheduler_output) if not scheduler_output.total_num_scheduled_tokens: return EMPTY_MODEL_RUNNER_OUTPUT @@ -309,6 +311,10 @@ def execute_model( slot_mappings, # OMNI: pass slot_mappings for upstream v1 API compatibility ) self.kv_connector_output = 
kv_connector_output + + if deferred_state_corrections_fn: + deferred_state_corrections_fn() + return None @torch.inference_mode() @@ -636,11 +642,14 @@ def _dummy_run( seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1] # type: ignore[assignment] else: seq_lens = max_query_len # type: ignore[assignment] - self.seq_lens.np[:num_reqs] = seq_lens - self.seq_lens.np[num_reqs:] = 0 - self.seq_lens.copy_to_gpu() + self.seq_lens[:num_reqs] = ( + seq_lens + if isinstance(seq_lens, int) + else torch.tensor(seq_lens, dtype=torch.int32, device=self.device) + ) + self.seq_lens[num_reqs:] = 0 - cum_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens) + cum_num_tokens = self._get_cumsum_and_arange(num_scheduled_tokens, self._arange_scratch) self.query_start_loc.np[1 : num_reqs + 1] = cum_num_tokens self.query_start_loc.copy_to_gpu() @@ -696,7 +705,7 @@ def _dummy_run( elif self.uses_xdrope_dim > 0: positions = self.xdrope_positions.gpu[:, :num_tokens_padded] else: - positions = self.positions.gpu[:num_tokens_padded] + positions = self.positions[:num_tokens_padded] if get_pp_group().is_first_rank: intermediate_tensors = None diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 8e5689986e..a7abaf7b62 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -243,7 +243,7 @@ def _fixup_precomputed_mrope_decode_positions(self, scheduler_output: "Scheduler mrope_pos_ptr += completion_part_len - def _update_states(self, scheduler_output: "SchedulerOutput") -> None: + def _update_states(self, scheduler_output: "SchedulerOutput"): """Update the cached states and the persistent batch with the scheduler output. @@ -271,7 +271,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Zero GPU memory for freshly allocated cache blocks to prevent # stale NaN/data from corrupting attention or SSM computation. - if hasattr(scheduler_output, "new_block_ids_to_zero") and scheduler_output.new_block_ids_to_zero: + if scheduler_output.new_block_ids_to_zero: self._zero_block_ids(scheduler_output.new_block_ids_to_zero) # Free the cached encoder outputs. @@ -300,7 +300,11 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: for req_id in unscheduled_req_ids: self.input_batch.remove_request(req_id) + if self.use_async_spec_decode: + self.prev_num_draft_tokens.np.fill(0) + reqs_to_add: list[CachedRequestState] = [] + deferred_spec_decode_corrections = [] # Add new requests to the cached states. for new_req_data in scheduler_output.scheduled_new_reqs: req_id = new_req_data.req_id @@ -398,10 +402,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: req_data = scheduler_output.scheduled_cached_reqs scheduled_spec_tokens = scheduler_output.scheduled_spec_decode_tokens - # Wait until valid_sampled_tokens_count is copied to cpu, - # then use it to update actual num_computed_tokens of each request. 
- valid_sampled_token_count = self._get_valid_sampled_token_count() - for i, req_id in enumerate(req_data.req_ids): req_state = self.requests[req_id] num_computed_tokens = req_data.num_computed_tokens[i] @@ -427,12 +427,18 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if req_index is None: req_state.prev_num_draft_len = 0 else: - assert self.input_batch.prev_req_id_to_index is not None - prev_req_index = self.input_batch.prev_req_id_to_index[req_id] - num_accepted = valid_sampled_token_count[prev_req_index] - 1 - num_rejected = req_state.prev_num_draft_len - num_accepted - num_computed_tokens -= num_rejected - req_state.output_token_ids.extend([-1] * num_accepted) + optimistic_num_accepted = req_state.prev_num_draft_len + req_state.output_token_ids.extend([-1] * optimistic_num_accepted) + + deferred_spec_decode_corrections.append((req_id, optimistic_num_accepted, req_state)) + + prev_req_index = ( + self.input_batch.prev_req_id_to_index.get(req_id) + if self.input_batch.prev_req_id_to_index + else None + ) + if prev_req_index is not None: + self.prev_num_draft_tokens.np[prev_req_index] = optimistic_num_accepted # Update the cached states. req_state.num_computed_tokens = num_computed_tokens @@ -449,7 +455,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: req_state.output_token_ids.extend(new_token_ids[-num_new_tokens:]) elif num_output_tokens < len(req_state.output_token_ids): # Some output tokens were discarded due to a sync-KV-load - # failure. Align the cached state. + # failure, or output_token_ids was inflated by the optimistic + # extend above (async spec decode). Align the cached state. del req_state.output_token_ids[num_output_tokens:] if req_index is not None: end_idx = self.input_batch.num_prompt_tokens[req_index] + num_output_tokens @@ -513,6 +520,35 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Refresh batch metadata with any pending updates. 
self.input_batch.refresh_metadata() + if deferred_spec_decode_corrections: + + def correct_spec_decode_token_counts(): + valid_sampled_token_count = self._get_valid_sampled_token_count() + if not valid_sampled_token_count: + return + prev_req_id_to_index = self.input_batch.prev_req_id_to_index + if not prev_req_id_to_index: + return + for ( + req_id, + optimistic_num_accepted, + req_state, + ) in deferred_spec_decode_corrections: + prev_req_index = prev_req_id_to_index.get(req_id) + if prev_req_index is None: + continue + num_accepted = valid_sampled_token_count[prev_req_index] - 1 + correction = optimistic_num_accepted - num_accepted + req_state.num_computed_tokens -= correction + cur_req_index = self.input_batch.req_id_to_index.get(req_id) + if cur_req_index is None: + continue + self.input_batch.num_computed_tokens_cpu[cur_req_index] -= correction + + return correct_spec_decode_token_counts + else: + return None + @torch.inference_mode() def extract_multimodal_outputs(self, hidden_states: torch.Tensor | list[torch.Tensor] | OmniOutput) -> dict: if ( @@ -704,11 +740,14 @@ def _dummy_run( seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1] # type: ignore[assignment] else: seq_lens = max_query_len # type: ignore[assignment] - self.seq_lens.np[:num_reqs] = seq_lens - self.seq_lens.np[num_reqs:] = 0 - self.seq_lens.copy_to_gpu() + self.seq_lens[:num_reqs] = ( + seq_lens + if isinstance(seq_lens, int) + else torch.tensor(seq_lens, dtype=torch.int32, device=self.device) + ) + self.seq_lens[num_reqs:] = 0 - cum_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens) + cum_num_tokens = self._get_cumsum_and_arange(num_scheduled_tokens, self._arange_scratch) self.query_start_loc.np[1 : num_reqs + 1] = cum_num_tokens self.query_start_loc.copy_to_gpu() @@ -759,7 +798,7 @@ def _dummy_run( elif self.uses_xdrope_dim > 0: positions = self.xdrope_positions.gpu[:, :num_tokens_padded] else: - positions = self.positions.gpu[:num_tokens_padded] + positions = self.positions[:num_tokens_padded] if get_pp_group().is_first_rank: intermediate_tensors = None @@ -1164,7 +1203,7 @@ def _preprocess( elif self.uses_xdrope_dim > 0: positions = self.xdrope_positions.gpu[:, :num_input_tokens] else: - positions = self.positions.gpu[:num_input_tokens] + positions = self.positions[:num_input_tokens] if is_first_rank: intermediate_tensors = None From 191b9a8dbf22fc494b7db34ea5ae04153f44f94d Mon Sep 17 00:00:00 2001 From: Joshna-Medisetty Date: Sat, 4 Apr 2026 00:04:04 -0700 Subject: [PATCH 043/204] Voxtral TTS: drop hardcoded CUDA in audio tokenizer; add XPU stage config (#2428) Signed-off-by: Joshna Medisetty Signed-off-by: Joshna-Medisetty --- .../voxtral_tts_audio_tokenizer.py | 8 +- .../xpu/stage_configs/voxtral_tts.yaml | 111 ++++++++++++++++++ 2 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 vllm_omni/platforms/xpu/stage_configs/voxtral_tts.yaml diff --git a/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_tokenizer.py b/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_tokenizer.py index 4f488e2fc1..fc753a58f3 100644 --- a/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_tokenizer.py +++ b/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_tokenizer.py @@ -17,6 +17,7 @@ MultimodalAudioModelArgs, from_nested_dict, ) +from vllm_omni.platforms import current_omni_platform try: from flash_attn import flash_attn_func @@ -954,7 +955,10 @@ def _tokenize_audio(self, x: torch.Tensor) -> torch.Tensor: if x.shape[-1] % self.patch_size != 0: 
pad_length = self.patch_size - (x.shape[-1] % self.patch_size) x = F.pad(x, (0, pad_length), mode="constant", value=0) - with torch.autocast(dtype=torch.bfloat16, device_type="cuda"): + with torch.autocast( + device_type=current_omni_platform.device_type, + dtype=torch.bfloat16, + ): # bf16 to use alibi bias in flash attn emb = self._forward_encoder(x) # (b, d, t) codes = self.quantizer.encode(emb) # (b, k, t) @@ -1095,7 +1099,7 @@ def decode_helper_batch_async(self, codes_list: list[torch.Tensor]) -> list[torc for i, chunk in enumerate(all_chunks): padded[i, : len(chunk)] = chunk - audio_codes = padded.to(device=torch.device("cuda")) # [B, T, K] + audio_codes = padded.to(device=current_omni_platform.device_type) # [B, T, K] audio_values = self.decode(audio_codes.transpose(1, 2), dtype=torch.bfloat16) # [B, 1, T_out] audio_values = audio_values.detach().cpu().float().squeeze(1) # [B, T_out] if torch.min(audio_values) < -1.0: diff --git a/vllm_omni/platforms/xpu/stage_configs/voxtral_tts.yaml b/vllm_omni/platforms/xpu/stage_configs/voxtral_tts.yaml new file mode 100644 index 0000000000..10051c1eda --- /dev/null +++ b/vllm_omni/platforms/xpu/stage_configs/voxtral_tts.yaml @@ -0,0 +1,111 @@ +# Voxtral TTS — Intel XPU (AR → audio tokenizer). Matches CUDA stage config knobs where noted. + +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + runtime: + process: true + devices: "0" + max_batch_size: 1 + engine_args: + max_num_seqs: 32 + model_stage: audio_generation + model_arch: VoxtralTTSForConditionalGeneration + worker_type: ar + worker_cls: vllm_omni.platforms.xpu.worker.xpu_ar_worker.XPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.7 + enforce_eager: true + trust_remote_code: true + async_scheduling: true + engine_output_type: latent + enable_prefix_caching: false + tokenizer_mode: mistral + config_format: mistral + load_format: mistral + skip_mm_profiling: true + enable_chunked_prefill: false + max_model_len: 4096 + distributed_executor_backend: "mp" + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.voxtral_tts.generator2tokenizer_async_chunk + output_connectors: + to_stage_1: connector_of_shared_memory + is_comprehension: true + final_output: false + final_output_type: text + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 2048 + seed: 42 + detokenize: True + repetition_penalty: 1.1 + + - stage_id: 1 + stage_type: llm + runtime: + process: true + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: audio_tokenizer + model_arch: VoxtralTTSForConditionalGeneration + worker_type: generation + worker_cls: vllm_omni.platforms.xpu.worker.xpu_generation_worker.XPUGenerationWorker + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + async_scheduling: false + max_num_seqs: 1 + gpu_memory_utilization: 0.28 + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + skip_mm_profiling: true + engine_output_type: audio + tokenizer_mode: mistral + config_format: mistral + load_format: mistral + max_num_batched_tokens: 16384 + max_model_len: 16384 + distributed_executor_backend: "mp" + engine_input_source: [0] + is_comprehension: false + final_output: true + final_output_type: audio + input_connectors: + from_stage_0: connector_of_shared_memory + tts_args: + max_instructions_length: 500 + default_sampling_params: + temperature: 0.9 + top_p: 0.8 + top_k: 40 + max_tokens: 2048 + seed: 42 + 
detokenize: True + repetition_penalty: 1.05 + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + codec_streaming: true + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + codec_chunk_frames: 25 + codec_chunk_frames_at_begin: 5 + codec_left_context_frames: 25 + + edges: + - from: 0 + to: 1 + window_size: -1 From 0059ec878c69061acc42206331a36065c4a4fea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Sat, 4 Apr 2026 17:29:09 +0800 Subject: [PATCH 044/204] [Model Support]: Magihuman support (#2301) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: princepride Signed-off-by: 汪志鹏 Co-authored-by: Didan Deng <33117903+wtomin@users.noreply.github.com> --- docs/models/supported_models.md | 1 + docs/user_guide/diffusion_features.md | 3 +- .../offline_inference/magi_human/README.md | 72 + .../offline_inference/magi_human/end2end.py | 117 + requirements/common.txt | 1 + .../e2e/offline_inference/test_magi_human.py | 129 + .../diffusion/models/magi_human/__init__.py | 1 + .../models/magi_human/magi_human_dit.py | 1624 ++++++++++++ .../models/magi_human/pipeline_magi_human.py | 2277 +++++++++++++++++ .../models/t5_encoder/t5_gemma_encoder.py | 309 +++ .../diffusion/offloader/module_collector.py | 4 +- vllm_omni/diffusion/registry.py | 7 + vllm_omni/diffusion/utils/media_utils.py | 75 + 13 files changed, 4617 insertions(+), 3 deletions(-) create mode 100644 examples/offline_inference/magi_human/README.md create mode 100644 examples/offline_inference/magi_human/end2end.py create mode 100644 tests/e2e/offline_inference/test_magi_human.py create mode 100644 vllm_omni/diffusion/models/magi_human/__init__.py create mode 100644 vllm_omni/diffusion/models/magi_human/magi_human_dit.py create mode 100644 vllm_omni/diffusion/models/magi_human/pipeline_magi_human.py create mode 100644 vllm_omni/diffusion/models/t5_encoder/t5_gemma_encoder.py create mode 100644 vllm_omni/diffusion/utils/media_utils.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index d611c0311c..f3d22aa768 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -37,6 +37,7 @@ th { | `LTX2TwoStagesPipeline` | LTX-2-T2V | `rootonchair/LTX-2-19b-distilled` | ✅︎ | ✅︎ | | | | `LTX2ImageToVideoTwoStagesPipeline` | LTX-2-I2V | `rootonchair/LTX-2-19b-distilled` | ✅︎ | ✅︎ | | | | `HeliosPipeline`, `HeliosPyramidPipeline` | Helios | `BestWishYsh/Helios-Base`, `BestWishYsh/Helios-Mid`, `BestWishYsh/Helios-Distilled` | ✅︎ | ✅︎ | ✅︎ | | +| `MagiHumanPipeline` | MagiHuman | `princepride/daVinci-MagiHuman` | ✅︎ | ✅︎ | | | | `OvisImagePipeline` | Ovis-Image | `OvisAI/Ovis-Image` | ✅︎ | ✅︎ | | ✅︎ | | `LongcatImagePipeline` | LongCat-Image | `meituan-longcat/LongCat-Image` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `LongCatImageEditPipeline` | LongCat-Image-Edit | `meituan-longcat/LongCat-Image-Edit` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 9cd407d377..e7f33306ec 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -109,6 +109,7 @@ The following tables show which models support each feature: | **HunyuanImage3** | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | | **LongCat-Image** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | 
**LongCat-Image-Edit** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **MagiHuman** | ❌ | ❌ | ❌ | ❓ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | | **MammothModa2(T2I)** | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | **Nextstep_1(T2I)** | ❓ | ❓ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | **OmniGen2** | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | @@ -123,7 +124,7 @@ The following tables show which models support each feature: > Notes: > 1. Nextstep_1(T2I) does not support cache acceleration methods such as TeaCache or Cache-DiT. -> 2. `Tongyi-MAI/Z-Image-Turbo` is a distilled model with minimal NFEs; CFG-Parallel is not necessary. +> 2. `Tongyi-MAI/Z-Image-Turbo` and `princepride/daVinci-MagiHuman` are distilled models with minimal NFEs; CFG-Parallel is not necessary. ### VideoGen diff --git a/examples/offline_inference/magi_human/README.md b/examples/offline_inference/magi_human/README.md new file mode 100644 index 0000000000..2b89093d94 --- /dev/null +++ b/examples/offline_inference/magi_human/README.md @@ -0,0 +1,72 @@ +# MagiHuman Generation + +MagiHuman is an advanced, omni-modality model that generates both high-quality video and lip-synced audio from a text prompt. + +Because MagiHuman is a very large model featuring a powerful DiT MoE backbone and a ~9B parameter T5Gemma text encoder, it natively supports **Tensor Parallelism (TP)** in vLLM-Omni to run efficiently across multi-GPU setups, reducing device memory bottlenecks. + +## Setup + +### Install MagiCompiler (recommended) + +MagiHuman relies on [MagiCompiler](https://github.com/SandAI-org/MagiCompiler) for custom-op registration used by the DiT attention kernels. While the pipeline can fall back to stub implementations, installing MagiCompiler is **strongly recommended** for correct behaviour. + +```bash +# Clone the repo +git clone https://github.com/SandAI-org/MagiCompiler.git +cd MagiCompiler + +# System dependencies (optional, for FX graph visualization; Debian/Ubuntu) +sudo apt update && sudo apt install -y graphviz + +# Python dependencies +pip install -r requirements.txt + +# Install MagiCompiler +pip install . # end users (recommended) +# pip install -e . # developers (editable install) +``` + +### Hardware requirements + +Ensure your hardware has enough VRAM. For a standard node with 80GB GPUs, running with `--tensor-parallel-size 4` is recommended to shard both the MoE weights and the T5Gemma text encoder across 4 GPUs, reducing the per-GPU peak VRAM overhead significantly (by roughly ~13.5GB per GPU compared to single-device inference). + +Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) for further details on allocating memory. + +## Run Examples + +Get into the example folder: +```bash +cd examples/offline_inference/magi_human +``` + +### End-to-End Generation (Text to Video+Audio) + +Generate a video with synchronized speech natively generated by the model. + +```bash +python end2end.py \ + --model /proj-tango-pvc/users/zhipeng.wang/workspace/models/daVinci-MagiHuman \ + --prompt "A young woman with long, wavy golden blonde hair..." 
\ + --tensor-parallel-size 4 \ + --output output_magihuman.mp4 +``` + +## Common Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `--model` | *(Required)* | Local model path or HuggingFace ID | +| `--prompt` | *(built-in demo prompt)* | Highly detailed text prompt dictating visual look and dialogue text | +| `--tensor-parallel-size` | `4` | Tensor parallelism size (Number of GPUs) | +| `--height` | `256` | Initial resolution height | +| `--width` | `448` | Initial resolution width | +| `--num-inference-steps` | `8` | Denoising steps | +| `--seed` | `52` | Random seed | +| `--output` | `output_magihuman.mp4` | Output video with audio path | + +## Example materials + +??? abstract "end2end.py" + ``````py + --8<-- "examples/offline_inference/magi_human/end2end.py" + `````` diff --git a/examples/offline_inference/magi_human/end2end.py b/examples/offline_inference/magi_human/end2end.py new file mode 100644 index 0000000000..39451ccc44 --- /dev/null +++ b/examples/offline_inference/magi_human/end2end.py @@ -0,0 +1,117 @@ +import argparse + +from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes +from vllm_omni.entrypoints.omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams + + +def parse_args(): + parser = argparse.ArgumentParser(description="End-to-end inference script for MagiHuman.") + parser.add_argument("--model", type=str, required=True, help="Path or ID of the MagiHuman model.") + parser.add_argument( + "--prompt", + type=str, + default="", + help="Text prompt containing visual description, dialogue, and background sound.", + ) + parser.add_argument( + "--tensor-parallel-size", "-tp", type=int, default=4, help="Tensor parallel size (number of GPUs)." + ) + parser.add_argument( + "--output", type=str, default="output_magihuman.mp4", help="Path to save the generated mp4 file." + ) + parser.add_argument("--height", type=int, default=256, help="Video height.") + parser.add_argument("--width", type=int, default=448, help="Video width.") + parser.add_argument("--num-inference-steps", type=int, default=8, help="Number of denoising steps.") + parser.add_argument("--seed", type=int, default=52, help="Random seed for generation.") + return parser.parse_args() + + +def main(): + args = parse_args() + + print(f"Initializing MagiHuman pipeline with TP={args.tensor_parallel_size}...") + omni = Omni( + model=args.model, + init_timeout=1200, + tensor_parallel_size=args.tensor_parallel_size, + devices=list(range(args.tensor_parallel_size)), + ) + + prompt = args.prompt + if not prompt: + prompt = ( + "A young woman with long, wavy golden blonde hair and bright blue eyes, " + "wearing a fitted ivory silk blouse with a delicate lace collar, sits " + "stationary in front of a softly lit, blurred warm-toned interior. Her " + "overall disposition is warm, composed, and gently confident. The camera " + "holds a static medium close-up, framing her from the shoulders up, " + "with shallow depth of field keeping her face in sharp focus. Soft " + "directional key light falls from the upper left, casting a gentle " + "highlight along her cheekbone and nose bridge. She draws a quiet breath, " + "the levator labii superiors relaxing as her lips part. 
She speaks in " + "clear, warm, unhurried American English: " + "\"The most beautiful things in life aren't things at all — " + "they're moments, feelings, and the people who make you feel truly alive.\" " + "Her jaw descends smoothly on each stressed syllable; the orbicularis oris " + "shapes each vowel with precision. A faint, genuine smile engages the " + "zygomaticus major, lifting her lip corners fractionally. Her brows rest " + "in a soft, neutral arch throughout. She maintains steady, forward-facing " + "eye contact. Head position remains level; no torso displacement occurs.\n\n" + "Dialogue:\n" + ": " + "\"The most beautiful things in life aren't things at all — " + "they're moments, feelings, and the people who make you feel truly alive.\"\n\n" + "Background Sound:\n" + "" + ) + + sampling_params = OmniDiffusionSamplingParams( + height=args.height, + width=args.width, + num_inference_steps=args.num_inference_steps, + seed=args.seed, + extra_args={ + "seconds": 5, + "sr_height": 1080, + "sr_width": 1920, + "sr_num_inference_steps": 5, + }, + ) + + print(f"Generating with prompt: {prompt[:80]}...") + outputs = omni.generate( + prompts=[prompt], + sampling_params_list=[sampling_params], + ) + + print(f"Generation complete. Output type: {type(outputs)}") + if outputs: + first = outputs[0] + + if hasattr(first, "images") and first.images: + video_frames = first.images[0] + print(f"Video frames: shape={video_frames.shape}, dtype={video_frames.dtype}") + + audio_waveform = None + if hasattr(first, "multimodal_output") and first.multimodal_output: + audio_waveform = first.multimodal_output.get("audio") + if audio_waveform is not None: + print(f"Audio waveform: shape={audio_waveform.shape}, dtype={audio_waveform.dtype}") + + video_bytes = mux_video_audio_bytes( + video_frames, + audio_waveform, + fps=25.0, + audio_sample_rate=44100, + ) + with open(args.output, "wb") as f: + f.write(video_bytes) + print(f"Saved MP4 ({len(video_bytes)} bytes) to {args.output}") + print("SUCCESS: MagiHuman pipeline generation completed.") + else: + print("WARNING: No outputs returned.") + + +if __name__ == "__main__": + main() diff --git a/requirements/common.txt b/requirements/common.txt index 138a61ed22..89eaac32bc 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,4 +1,5 @@ # Common dependencies for all platforms +av>=14.0.0 omegaconf>=2.3.0 librosa>=0.11.0 resampy>=0.4.3 diff --git a/tests/e2e/offline_inference/test_magi_human.py b/tests/e2e/offline_inference/test_magi_human.py new file mode 100644 index 0000000000..6211fdafc0 --- /dev/null +++ b/tests/e2e/offline_inference/test_magi_human.py @@ -0,0 +1,129 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""End-to-end tests for MagiHuman pipeline via vLLM-Omni.""" + +import io + +import av +import numpy as np +import pytest + +from tests.utils import hardware_test +from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes +from vllm_omni.entrypoints.omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams + + +def _validate_mp4(video_bytes: bytes, min_frames: int = 10) -> None: + """Validate that the MP4 contains meaningful video and audio tracks.""" + container = av.open(io.BytesIO(video_bytes)) + + v_streams = [s for s in container.streams if s.type == "video"] + assert len(v_streams) >= 1, "No video stream found in MP4" + + a_streams = [s for s in container.streams if s.type == "audio"] + assert len(a_streams) >= 1, "No audio 
stream found in MP4" + + v_stream = v_streams[0] + assert v_stream.width >= 1080, f"Unexpected video width: {v_stream.width}" + assert v_stream.height >= 1056, f"Unexpected video height: {v_stream.height}" + + frame_count = 0 + for frame in container.decode(video=0): + frame_count += 1 + if frame_count >= min_frames: + break + assert frame_count >= min_frames, f"Video has only {frame_count} frames (expected >= {min_frames})" + + container.close() + + +@pytest.mark.core_model +@pytest.mark.advanced_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}, num_cards=2) +def test_magi_human_e2e(run_level): + """End-to-end test for MagiHuman generating video and audio.""" + if run_level != "advanced_model": + pytest.skip("MagiHuman e2e test requires advanced_model run level with real weights.") + + model_path = "princepride/daVinci-MagiHuman" + + omni = Omni( + model=model_path, + init_timeout=1200, + tensor_parallel_size=2, + ) + + prompt = ( + "A young woman with long, wavy golden blonde hair and bright blue eyes, " + "wearing a fitted ivory silk blouse with a delicate lace collar, sits " + "stationary in front of a softly lit, blurred warm-toned interior. Her " + "overall disposition is warm, composed, and gently confident. The camera " + "holds a static medium close-up, framing her from the shoulders up, " + "with shallow depth of field keeping her face in sharp focus. Soft " + "directional key light falls from the upper left, casting a gentle " + "highlight along her cheekbone and nose bridge. She draws a quiet breath, " + "the levator labii superiors relaxing as her lips part. She speaks in " + "clear, warm, unhurried American English: " + "\"The most beautiful things in life aren't things at all — " + "they're moments, feelings, and the people who make you feel truly alive.\" " + "Her jaw descends smoothly on each stressed syllable; the orbicularis oris " + "shapes each vowel with precision. A faint, genuine smile engages the " + "zygomaticus major, lifting her lip corners fractionally. Her brows rest " + "in a soft, neutral arch throughout. She maintains steady, forward-facing " + "eye contact. 
Head position remains level; no torso displacement occurs.\n\n" + "Dialogue:\n" + ": " + "\"The most beautiful things in life aren't things at all — " + "they're moments, feelings, and the people who make you feel truly alive.\"\n\n" + "Background Sound:\n" + "" + ) + + sampling_params = OmniDiffusionSamplingParams( + height=256, + width=448, + num_inference_steps=8, + seed=52, + extra_args={ + "seconds": 5, + "sr_height": 1080, + "sr_width": 1920, + "sr_num_inference_steps": 5, + }, + ) + + try: + outputs = list( + omni.generate( + prompts=[prompt], + sampling_params_list=[sampling_params], + ) + ) + + assert len(outputs) > 0, "No outputs returned" + first = outputs[0] + + assert hasattr(first, "images") and first.images, "No video frames in output" + video_frames = first.images[0] + assert isinstance(video_frames, np.ndarray), f"Expected numpy array, got {type(video_frames)}" + assert video_frames.ndim == 4, f"Expected 4D array (T,H,W,3), got shape {video_frames.shape}" + + audio_waveform = None + if hasattr(first, "multimodal_output") and first.multimodal_output: + audio_waveform = first.multimodal_output.get("audio") + assert audio_waveform is not None, "No audio waveform in multimodal_output" + + video_bytes = mux_video_audio_bytes( + video_frames, + audio_waveform, + fps=25.0, + audio_sample_rate=44100, + ) + assert isinstance(video_bytes, bytes), f"Expected MP4 bytes, got {type(video_bytes)}" + assert len(video_bytes) > 1000, f"MP4 too small ({len(video_bytes)} bytes)" + + _validate_mp4(video_bytes) + finally: + omni.close() diff --git a/vllm_omni/diffusion/models/magi_human/__init__.py b/vllm_omni/diffusion/models/magi_human/__init__.py new file mode 100644 index 0000000000..9881313609 --- /dev/null +++ b/vllm_omni/diffusion/models/magi_human/__init__.py @@ -0,0 +1 @@ +# SPDX-License-Identifier: Apache-2.0 diff --git a/vllm_omni/diffusion/models/magi_human/magi_human_dit.py b/vllm_omni/diffusion/models/magi_human/magi_human_dit.py new file mode 100644 index 0000000000..491b1b3c40 --- /dev/null +++ b/vllm_omni/diffusion/models/magi_human/magi_human_dit.py @@ -0,0 +1,1624 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SandAI. All Rights Reserved. +# Ported from daVinci-MagiHuman inference/model/dit/dit_module.py +# Adaptations: removed Ulysses context-parallelism, inlined Modality/VarlenHandler. 
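+# With context parallelism removed, every rank attends over the full token
+# sequence; tensor parallelism only shards attention heads and MLP width via
+# the vLLM parallel-linear wrappers defined below.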
+ +from __future__ import annotations + +import importlib +from collections.abc import Callable +from dataclasses import dataclass, field +from enum import Enum, IntEnum +from typing import TYPE_CHECKING, Any, Literal + +import torch +import torch.nn as nn +from einops import rearrange, repeat +from torch.nn import Parameter +from torch.nn import functional as F +from vllm.distributed import ( + get_tensor_model_parallel_world_size, +) +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.vllm_flash_attn import flash_attn_varlen_func as _vllm_fa_varlen + +try: + from magi_compiler.api import magi_register_custom_op + from magi_compiler.config import CompileConfig +except Exception: + + class CompileConfig: # type: ignore[no-redef] + pass + + def magi_register_custom_op(*args, **kwargs): # type: ignore[no-redef] + def decorator(func): + return func + + return decorator + + +def magi_compile(*args, **kwargs): + """No-op stub — vllm-omni handles execution; magi compilation is skipped.""" + + def decorator(cls_or_fn): + return cls_or_fn + + return decorator + + +# --------------------------------------------------------------------------- +# Inlined from inference/common/sequence_schema.py +# --------------------------------------------------------------------------- +class Modality(IntEnum): + VIDEO = 0 + AUDIO = 1 + TEXT = 2 + + +@dataclass +class VarlenHandler: + cu_seqlens_q: torch.Tensor + cu_seqlens_k: torch.Tensor + max_seqlen_q: int + max_seqlen_k: int + + +def _is_hopper_arch() -> bool: + if not torch.cuda.is_available(): + return False + return torch.cuda.get_device_capability()[0] == 9 + + +# --------------------------------------------------------------------------- +# FFA handler for local / flex attention +# --------------------------------------------------------------------------- +@dataclass +class FFAHandler: + q_ranges: torch.Tensor + k_ranges: torch.Tensor + max_seqlen_q: int + max_seqlen_k: int + attn_type_map: torch.Tensor + softmax_scale: float + + +# --------------------------------------------------------------------------- +# Activation helpers +# --------------------------------------------------------------------------- +class MLPActivationType(Enum): + SWIGLU7 = "swiglu7" + GELU7 = "gelu7" + + +def swiglu7(x, alpha: float = 1.702, limit: float = 7.0, out_dtype: torch.dtype | None = None): + out_dtype = x.dtype if out_dtype is None else out_dtype + x = x.to(torch.float32) + x_glu, x_linear = x[..., ::2], x[..., 1::2] + x_glu = x_glu.clamp(min=None, max=limit) + x_linear = x_linear.clamp(min=-limit, max=limit) + out_glu = x_glu * torch.sigmoid(alpha * x_glu) + return (out_glu * (x_linear + 1)).to(out_dtype) + + +def gelu7(x, alpha: float = 1.702, limit: float = 7.0, out_dtype: torch.dtype | None = None): + out_dtype = x.dtype if out_dtype is None else out_dtype + x = x.to(torch.float32) + x_glu = x.clamp(min=None, max=limit) + out_glu = x_glu * torch.sigmoid(alpha * x_glu) + return out_glu.to(out_dtype) + + +def create_activation_func(activation_type: MLPActivationType) -> Callable: + match activation_type: + case MLPActivationType.SWIGLU7: + return swiglu7 + case MLPActivationType.GELU7: + return gelu7 + case _: + raise ValueError(f"Unknown activation type: {activation_type}") + + +# --------------------------------------------------------------------------- +# Modality dispatcher (permutation helper) +# --------------------------------------------------------------------------- +class 
ModalityDispatcher: + permuted_modality_mapping: torch.Tensor + group_size: torch.Tensor + group_size_cpu: list[int] + num_modalities: int + + def __init__(self, modality_mapping: torch.Tensor, num_modalities: int): + self.modality_mapping = modality_mapping + self.num_modalities = num_modalities + self.permuted_modality_mapping = self._precompute_permute_mapping(modality_mapping) + self.group_size = torch.bincount(self.permuted_modality_mapping, minlength=num_modalities).to(torch.int32) + self.group_size_cpu: list[int] = [int(x) for x in self.group_size.to("cpu").tolist()] + + def _precompute_permute_mapping(self, modality_mapping): + self.permute_mapping = torch.argsort(modality_mapping) + self.inv_permute_mapping = torch.argsort(self.permute_mapping) + return modality_mapping[self.permute_mapping] + + def dispatch(self, x: torch.Tensor) -> list[torch.Tensor]: + return list(torch.split(x, self.group_size_cpu, dim=0)) + + def undispatch(self, *processed_groups: list[torch.Tensor]) -> torch.Tensor: + return torch.cat(processed_groups, dim=0) + + @staticmethod + def permute(x: torch.Tensor, permute_mapping: torch.Tensor) -> torch.Tensor: + return x[permute_mapping] + + @staticmethod + def inv_permute(x: torch.Tensor, inv_permute_mapping: torch.Tensor) -> torch.Tensor: + return x[inv_permute_mapping] + + +# --------------------------------------------------------------------------- +# Positional / rotary embedding helpers +# --------------------------------------------------------------------------- +def freq_bands( + num_bands: int, temperature: float = 10000.0, step: int = 2, device: torch.device | None = None +) -> torch.Tensor: + exp = torch.arange(0, num_bands, step, dtype=torch.int64, device=device).to(torch.float32) / num_bands + return 1.0 / (temperature**exp) + + +def rotate_half(x, interleaved=False): + if not interleaved: + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + else: + x1, x2 = x[..., ::2], x[..., 1::2] + return rearrange(torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2) + + +def apply_rotary_emb_torch(x, cos, sin, interleaved=False): + ro_dim = cos.shape[-1] * 2 + assert ro_dim <= x.shape[-1] + cos = repeat(cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") + sin = repeat(sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 
1 (d 2)") + return torch.cat([x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:]], dim=-1) + + +# --------------------------------------------------------------------------- +# Fourier positional embedding +# --------------------------------------------------------------------------- +class ElementWiseFourierEmbed(nn.Module): + def __init__( + self, + dim: int, + max_res: int = 224, + temperature: float = 10000.0, + in_pixels: bool = True, + linear_bands: bool = False, + learnable: bool = False, + device: torch.device = torch.device("cpu"), + dtype: torch.dtype = torch.float32, + ): + super().__init__() + self.dim = dim + self.in_pixels = in_pixels + self.learnable = learnable + self.temperature = temperature + self.max_res = max_res + self.linear_bands = linear_bands + self.device = device + self.dtype = dtype + bands = self.get_default_bands() + self.bands = nn.Parameter(bands, requires_grad=self.learnable) + + def forward(self, coords: torch.Tensor) -> torch.Tensor: + coords_xyz = coords[:, :3] + sizes = coords[:, 3:6] + refs = coords[:, 6:9] + + scales = (refs - 1) / (sizes - 1) + scales[(refs == 1) & (sizes == 1)] = 1 + assert not scales.isnan().any(), "scales has nan" + assert not scales.isinf().any(), "scales has inf" + + centers = (sizes - 1) / 2 + centers[:, 0] = 0 + coords_xyz = coords_xyz - centers + + bands = self.bands.to(coords.device, coords.dtype) + proj = coords_xyz.unsqueeze(-1) * scales.unsqueeze(-1) * bands + sin_proj = proj.sin() + cos_proj = proj.cos() + return torch.cat((sin_proj, cos_proj), dim=1).flatten(1) + + def reset_parameters(self): + self.bands.copy_(self.get_default_bands()) + + def get_default_bands(self): + if self.in_pixels: + raise NotImplementedError("in_pixels are not implemented yet") + return freq_bands(self.dim // 8, temperature=self.temperature, step=1, device=self.device).to(self.dtype) + + +# --------------------------------------------------------------------------- +# Multi-modality RMSNorm +# --------------------------------------------------------------------------- +class MultiModalityRMSNorm(nn.Module): + __constants__ = ["dim", "eps", "num_modality"] + + def __init__(self, dim: int, eps: float = 1e-6, device: torch.device | None = None, num_modality: int = 1): + super().__init__() + self.dim = dim + self.eps = eps + self.num_modality = num_modality + self.weight = nn.Parameter(torch.zeros(dim * num_modality, device=device, dtype=torch.float32)) + if num_modality > 1: + self.forward = self.forward_multi_experts + else: + self.forward = self.forward_single_expert + self.reset_parameters() + + def reset_parameters(self): + nn.init.zeros_(self.weight) + + def rms(self, x: torch.Tensor) -> torch.Tensor: + t = x.float() + return t * torch.rsqrt(torch.mean(t**2, dim=-1, keepdim=True) + self.eps) + + def forward_multi_experts(self, x: torch.Tensor, modality_dispatcher: ModalityDispatcher) -> torch.Tensor: + original_dtype = x.dtype + t = self.rms(x) + weight_chunked = self.weight.chunk(self.num_modality, dim=0) + t_list = modality_dispatcher.dispatch(t) + for i in range(self.num_modality): + t_list[i] = t_list[i] * (weight_chunked[i] + 1) + t = modality_dispatcher.undispatch(*t_list) + return t.to(original_dtype) + + def forward_single_expert( + self, x: torch.Tensor, modality_dispatcher: ModalityDispatcher | None = None + ) -> torch.Tensor: + t, original_dtype = x.float(), x.dtype + t = t * torch.rsqrt(torch.mean(t**2, dim=-1, keepdim=True) + self.eps) + return (t * (self.weight + 1)).to(original_dtype) + + 
+# --------------------------------------------------------------------------- +# Linear layers with bf16 compute and MoE dispatch +# --------------------------------------------------------------------------- +class _BF16ComputeLinear(torch.autograd.Function): + @staticmethod + def forward( + ctx, + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor | None, + output_dtype: torch.dtype | None, + compute_dtype: torch.dtype = torch.bfloat16, + ): + input_cast = input.to(compute_dtype) + weight_cast = weight.to(compute_dtype) + output = torch.matmul(input_cast, weight_cast.t()) + if bias is not None: + output = output + bias.to(compute_dtype) + return output.to(output_dtype) + + +class BaseLinear(nn.Module): + __constants__ = ["in_features", "out_features", "num_layers", "num_experts"] + + def __init__( + self, in_features, out_features, num_layers_for_initialization, num_experts, bias=True, device=None, dtype=None + ): + super().__init__() + factory_kwargs = {"device": device, "dtype": torch.bfloat16} + self.in_features = in_features + self.out_features = out_features + self.num_layers_for_initialization = num_layers_for_initialization + self.num_experts = num_experts + self.use_bias = bias + self.weight = Parameter(torch.empty((out_features * num_experts, in_features), **factory_kwargs)) + if bias: + self.bias = Parameter(torch.empty(out_features * num_experts, **factory_kwargs)) + else: + self.register_parameter("bias", None) + + def forward( + self, + input: torch.Tensor, + output_dtype: torch.dtype | None = None, + modality_dispatcher: ModalityDispatcher | None = None, + ) -> torch.Tensor: + output_dtype = input.dtype if output_dtype is None else output_dtype + return _BF16ComputeLinear.apply(input, self.weight, self.bias, output_dtype, torch.bfloat16) + + +class NativeMoELinear(BaseLinear): + def forward( + self, + input: torch.Tensor, + output_dtype: torch.dtype | None = None, + modality_dispatcher: ModalityDispatcher | None = None, + ) -> torch.Tensor: + output_dtype = input.dtype if output_dtype is None else output_dtype + input_list = modality_dispatcher.dispatch(input) # type: ignore + weight_chunked = self.weight.chunk(self.num_experts, dim=0) + if self.bias is not None: + bias_chunked = self.bias.chunk(self.num_experts, dim=0) + for i in range(self.num_experts): + input_list[i] = _BF16ComputeLinear.apply( + input_list[i], + weight_chunked[i], + bias_chunked[i] if self.bias is not None else None, + output_dtype, + torch.bfloat16, + ) + return modality_dispatcher.undispatch(*input_list) # type: ignore + + +def create_linear( + in_features, out_features, num_layers=1, num_experts=1, bias=True, device=None, dtype=None +) -> BaseLinear | NativeMoELinear: + if num_experts == 1: + return BaseLinear(in_features, out_features, num_layers, num_experts, bias, device, dtype) + else: + return NativeMoELinear(in_features, out_features, num_layers, num_experts, bias, device, dtype) + + +# --------------------------------------------------------------------------- +# MoE TP parallel linear wrappers: per-expert vLLM parallel layers +# --------------------------------------------------------------------------- +class MoEQKVParallelLinear(nn.Module): + """Per-expert QKVParallelLinear with modality dispatch. + + Wraps ``num_experts`` independent QKVParallelLinear instances. + Forward: dispatch tokens by modality → per-expert QKV matmul (TP-sharded) + → undispatch. 
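+
+    Illustrative per-rank shape (an assumed example using the default config
+    cited in ``validate_magi_human_tp_constraints``: 40 query heads, 8 KV heads,
+    head_dim 128, tp_size 2): each expert maps ``(num_tokens, 5120)`` to
+    ``(num_tokens, (20 + 4 + 4) * 128) == (num_tokens, 3584)`` on every rank.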
+ """ + + def __init__( + self, + hidden_size: int, + head_size: int, + total_num_heads: int, + total_num_kv_heads: int, + num_experts: int, + bias: bool = False, + ): + super().__init__() + self.num_experts = num_experts + self.experts = nn.ModuleList( + [ + QKVParallelLinear( + hidden_size=hidden_size, + head_size=head_size, + total_num_heads=total_num_heads, + total_num_kv_heads=total_num_kv_heads, + bias=bias, + return_bias=False, + ) + for _ in range(num_experts) + ] + ) + # Expose per-rank head info from the first expert (all are identical). + self.num_heads = self.experts[0].num_heads + self.num_kv_heads = self.experts[0].num_kv_heads + self.head_size = head_size + + def forward( + self, + x: torch.Tensor, + modality_dispatcher: ModalityDispatcher, + ) -> torch.Tensor: + x_list = modality_dispatcher.dispatch(x) + out_list: list[torch.Tensor] = [] + for i in range(self.num_experts): + out = self.experts[i](x_list[i]) + out_list.append(out) + return modality_dispatcher.undispatch(*out_list) + + +class MoEColumnParallelLinear(nn.Module): + """Per-expert ColumnParallelLinear with modality dispatch. + + Forward: dispatch → per-expert column-parallel matmul → undispatch. + Output stays TP-local (no gather). + """ + + def __init__( + self, + input_size: int, + output_size: int, + num_experts: int, + bias: bool = False, + ): + super().__init__() + self.num_experts = num_experts + self.experts = nn.ModuleList( + [ + ColumnParallelLinear( + input_size=input_size, + output_size=output_size, + bias=bias, + gather_output=False, + return_bias=False, + ) + for _ in range(num_experts) + ] + ) + + def forward( + self, + x: torch.Tensor, + modality_dispatcher: ModalityDispatcher, + ) -> torch.Tensor: + x_list = modality_dispatcher.dispatch(x) + out_list: list[torch.Tensor] = [] + for i in range(self.num_experts): + out = self.experts[i](x_list[i]) + out_list.append(out) + return modality_dispatcher.undispatch(*out_list) + + +class MoERowParallelLinear(nn.Module): + """Per-expert RowParallelLinear with modality dispatch. + + Forward: dispatch → per-expert row-parallel matmul (includes all-reduce) + → undispatch. + """ + + def __init__( + self, + input_size: int, + output_size: int, + num_experts: int, + bias: bool = False, + ): + super().__init__() + self.num_experts = num_experts + self.experts = nn.ModuleList( + [ + RowParallelLinear( + input_size=input_size, + output_size=output_size, + bias=bias, + input_is_parallel=True, + return_bias=False, + ) + for _ in range(num_experts) + ] + ) + + def forward( + self, + x: torch.Tensor, + modality_dispatcher: ModalityDispatcher, + ) -> torch.Tensor: + x_list = modality_dispatcher.dispatch(x) + out_list: list[torch.Tensor] = [] + for i in range(self.num_experts): + out = self.experts[i](x_list[i]) + out_list.append(out) + return modality_dispatcher.undispatch(*out_list) + + +def validate_magi_human_tp_constraints( + *, + hidden_size: int, + num_heads_q: int, + num_heads_kv: int, + tensor_parallel_size: int, +) -> None: + """Validate MagiHuman TP divisibility constraints. + + Both shared layers (num_modality == 1) and MoE layers (num_modality == 3) + support TP via vLLM's parallel linear layers (QKVParallelLinear / + ColumnParallelLinear / RowParallelLinear). MoE layers use per-expert + parallel layers with modality dispatch. + + Supported tp_sizes given default config (hidden=5120, heads_q=40, kv=8): 1, 2, 4. 
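+
+    For example, tp_size=8 divides both head counts but is rejected because the
+    SWIGLU intermediate size ``int(5120 * 4 * 2 / 3) // 4 * 4 == 13652`` is not
+    divisible by 8.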
+ """ + tp = tensor_parallel_size + if tp <= 1: + return + errors: list[str] = [] + if num_heads_q % tp != 0: + errors.append(f"num_heads_q ({num_heads_q}) must be divisible by tensor_parallel_size ({tp})") + if num_heads_kv % tp != 0: + errors.append(f"num_heads_kv ({num_heads_kv}) must be divisible by tensor_parallel_size ({tp})") + # SWIGLU layers use intermediate = int(hidden * 8/3) // 4 * 4 + intermediate_swiglu = int(hidden_size * 4 * 2 / 3) // 4 * 4 + if intermediate_swiglu % tp != 0: + errors.append( + f"swiglu intermediate_size ({intermediate_swiglu}) must be divisible by " + f"tensor_parallel_size ({tp}). Supported tp values: 1, 2, 4" + ) + # GELU7 MoE layers use intermediate = hidden * 4 + intermediate_gelu = hidden_size * 4 + if intermediate_gelu % tp != 0: + errors.append(f"gelu intermediate_size ({intermediate_gelu}) must be divisible by tensor_parallel_size ({tp})") + if errors: + raise ValueError("MagiHuman TP constraint violations:\n" + "\n".join(f" - {e}" for e in errors)) + + +# --------------------------------------------------------------------------- +# Flash attention (no context-parallelism) — uses vllm's flash attention +# --------------------------------------------------------------------------- + +HAS_MAGI_ATTENTION = importlib.util.find_spec("magi_attention") is not None + + +def _fa_varlen_simple( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, +) -> torch.Tensor: + had_batch = query.ndim == 4 + if had_batch: + query = query.squeeze(0) + key = key.squeeze(0) + value = value.squeeze(0) + seq_len = query.shape[0] + cu_seqlens = torch.tensor([0, seq_len], dtype=torch.int32, device=query.device) + out = _vllm_fa_varlen( + q=query, + k=key, + v=value, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=seq_len, + max_seqlen_k=seq_len, + ) + if had_batch: + out = out.unsqueeze(0) + return out + + +@magi_register_custom_op(name="infra::flash_attn_func", is_subgraph_boundary=True) +def flash_attn_func(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor: + return _fa_varlen_simple(query, key, value) + + +def _split_q_range_with_no_overlap( + q_ranges: torch.Tensor, k_ranges: torch.Tensor +) -> tuple[list[list[int]], list[list[list[int]]]]: + range_boundary = torch.unique(q_ranges, sorted=True).tolist() + candidates = [[start, end, []] for start, end in zip(range_boundary[:-1], range_boundary[1:])] + q_ranges = q_ranges.tolist() + k_ranges = k_ranges.tolist() + for q_range, k_range in zip(q_ranges, k_ranges): + q_start, q_end = q_range + for q_range_cand in candidates: + if q_start <= q_range_cand[0] and q_range_cand[1] <= q_end: + q_range_cand[2].append(k_range) + q_ranges_out = [] + k_ranges_out = [] + for q_range_cand in candidates: + if len(q_range_cand[2]) > 0: + q_ranges_out.append(q_range_cand[0:2]) + k_ranges_out.append(q_range_cand[2]) + return q_ranges_out, k_ranges_out + + +def _flash_attn_with_correction( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + q_ranges: list[list[int]], + k_range_list: list[list[list[int]]], +): + output = torch.zeros_like(query) + output_lse = torch.zeros((query.shape[0], query.shape[1]), dtype=torch.float32, device=query.device) + + for q_range, k_ranges in zip(q_ranges, k_range_list): + q_start, q_end = q_range + q_chunk = query[q_start:q_end] + q_len = q_chunk.shape[0] + + # Concatenate all k_ranges into a single key/value block, then run one + # flash-attention call. This avoids the need to merge per-chunk LSEs. 
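+        # A single softmax over the concatenated keys is mathematically the same
+        # as running per-range attention and merging the log-sum-exp statistics,
+        # so no correction pass is needed; ``output_lse`` stays zero-filled as a
+        # placeholder (the callers in this module discard it).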
+ k_parts = [key[ks:ke] for ks, ke in k_ranges] + v_parts = [value[ks:ke] for ks, ke in k_ranges] + k_combined = torch.cat(k_parts, dim=0) if len(k_parts) > 1 else k_parts[0] + v_combined = torch.cat(v_parts, dim=0) if len(v_parts) > 1 else v_parts[0] + k_len = k_combined.shape[0] + + cu_q = torch.tensor([0, q_len], dtype=torch.int32, device=query.device) + cu_k = torch.tensor([0, k_len], dtype=torch.int32, device=query.device) + qo_out = _vllm_fa_varlen( + q=q_chunk, + k=k_combined, + v=v_combined, + cu_seqlens_q=cu_q, + cu_seqlens_k=cu_k, + max_seqlen_q=q_len, + max_seqlen_k=k_len, + ) + output[q_start:q_end] = qo_out + return output, output_lse + + +def _flex_flash_attn_func_infer_output_meta( + query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, q_ranges: torch.Tensor, k_ranges: torch.Tensor +) -> tuple[torch.Tensor, torch.Tensor]: + output = torch.empty_like(query) + output_lse = torch.empty((query.shape[0], query.shape[1]), dtype=torch.float32, device=query.device) + return output, output_lse + + +@magi_register_custom_op( + name="infra::flex_flash_attn_func", + mutates_args=(), + infer_output_meta_fn=_flex_flash_attn_func_infer_output_meta, + is_subgraph_boundary=True, +) +def flex_flash_attn_func( + query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, q_ranges: torch.Tensor, k_ranges: torch.Tensor +) -> tuple[torch.Tensor, torch.Tensor]: + if HAS_MAGI_ATTENTION and _is_hopper_arch(): + from magi_attention.api import flex_flash_attn_func as magi_flex_flash_attn_func + + return magi_flex_flash_attn_func(query, key, value, q_ranges, k_ranges) + else: + q_ranges_split, k_range_list = _split_q_range_with_no_overlap(q_ranges, k_ranges) + return _flash_attn_with_correction(query, key, value, q_ranges_split, k_range_list) + + +def flash_attn_no_cp(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor: + q, k, v = q.to(torch.bfloat16), k.to(torch.bfloat16), v.to(torch.bfloat16) + return flash_attn_func(q, k, v).squeeze(0) + + +def flex_flash_attn_no_cp( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + q_ranges: torch.Tensor, + k_ranges: torch.Tensor, +) -> torch.Tensor: + q, k, v = q.to(torch.bfloat16).squeeze(0), k.to(torch.bfloat16).squeeze(0), v.to(torch.bfloat16).squeeze(0) + out, _ = flex_flash_attn_func(q, k, v, q_ranges=q_ranges, k_ranges=k_ranges) + return out + + +# --------------------------------------------------------------------------- +# Attention module (no context-parallelism) +# --------------------------------------------------------------------------- +@dataclass +class AttentionConfig: + hidden_size: int + num_heads_q: int + num_heads_kv: int + head_dim: int + params_dtype: torch.dtype + checkpoint_qk_layernorm_rope: bool + num_modality: int + num_layers: int + use_local_attn: bool = False + enable_attn_gating: bool = False + + +class Attention(torch.nn.Module): + config: AttentionConfig + + def __init__(self, config: AttentionConfig): + super().__init__() + self.config = config + self.pre_norm = MultiModalityRMSNorm(config.hidden_size, eps=1e-6, num_modality=config.num_modality) + self.gating_size = config.num_heads_q if config.enable_attn_gating else 0 + + # Both shared blocks (num_modality == 1) and MoE blocks (num_modality > 1) + # use vLLM's parallel linear layers for TP support. + # MoE blocks wrap per-expert parallel layers with modality dispatch. + if config.num_modality == 1: + # QKVParallelLinear handles GQA head-sharding for any tp_size. 
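+            # (Example: with 40 query heads, 8 KV heads, and tp_size=4, each
+            # rank holds 10 query heads and 2 KV heads of this fused projection.)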
+ # The combined checkpoint weight [Q, K, V, G] is split during + # load_weights: Q+K+V → linear_qkv, G → linear_gating. + self.linear_qkv = QKVParallelLinear( + hidden_size=config.hidden_size, + head_size=config.head_dim, + total_num_heads=config.num_heads_q, + total_num_kv_heads=config.num_heads_kv, + bias=False, + return_bias=False, + ) + self.linear_proj = RowParallelLinear( + input_size=config.num_heads_q * config.head_dim, + output_size=config.hidden_size, + bias=False, + input_is_parallel=True, + return_bias=False, + ) + if config.enable_attn_gating: + self.linear_gating = ColumnParallelLinear( + input_size=config.hidden_size, + output_size=config.num_heads_q, + bias=False, + gather_output=False, + return_bias=False, + ) + else: + self.linear_gating = None + else: + # MoE blocks: per-expert TP-sharded parallel layers. + self.linear_qkv = MoEQKVParallelLinear( + hidden_size=config.hidden_size, + head_size=config.head_dim, + total_num_heads=config.num_heads_q, + total_num_kv_heads=config.num_heads_kv, + num_experts=config.num_modality, + bias=False, + ) + self.linear_proj = MoERowParallelLinear( + input_size=config.num_heads_q * config.head_dim, + output_size=config.hidden_size, + num_experts=config.num_modality, + bias=False, + ) + if config.enable_attn_gating: + self.linear_gating = MoEColumnParallelLinear( + input_size=config.hidden_size, + output_size=config.num_heads_q, + num_experts=config.num_modality, + bias=False, + ) + else: + self.linear_gating = None + + self.q_norm = MultiModalityRMSNorm(config.head_dim, num_modality=config.num_modality) + self.k_norm = MultiModalityRMSNorm(config.head_dim, num_modality=config.num_modality) + + # q_size / kv_size reflect the per-rank head count when tp > 1. + # Both shared and MoE QKV layers expose .num_heads / .num_kv_heads. + if config.num_modality == 1: + self.q_size = self.linear_qkv.num_heads * config.head_dim + self.kv_size = self.linear_qkv.num_kv_heads * config.head_dim + self._local_heads_q = self.linear_qkv.num_heads + self._local_heads_kv = self.linear_qkv.num_kv_heads + else: + self.q_size = self.linear_qkv.num_heads * config.head_dim + self.kv_size = self.linear_qkv.num_kv_heads * config.head_dim + self._local_heads_q = self.linear_qkv.num_heads + self._local_heads_kv = self.linear_qkv.num_kv_heads + + def forward( + self, + hidden_states: torch.Tensor, + rope: torch.Tensor, + permute_mapping: torch.Tensor, + inv_permute_mapping: torch.Tensor, + varlen_handler: VarlenHandler, + local_attn_handler: FFAHandler | None, + modality_dispatcher: ModalityDispatcher, + ) -> torch.Tensor: + hidden_states = self.pre_norm(hidden_states, modality_dispatcher=modality_dispatcher).to(torch.bfloat16) + + if self.config.num_modality == 1: + # vLLM parallel layers with return_bias=False return a single tensor. + qkv = self.linear_qkv(hidden_states).to(torch.float32) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + if self.linear_gating is not None: + g = self.linear_gating(hidden_states).to(torch.float32) + else: + g = hidden_states.new_empty(hidden_states.shape[0], 0) + else: + # MoE TP path: per-expert QKV parallel layers. 
+ qkv = self.linear_qkv(hidden_states, modality_dispatcher=modality_dispatcher).to(torch.float32) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + if self.linear_gating is not None: + g = self.linear_gating(hidden_states, modality_dispatcher=modality_dispatcher).to(torch.float32) + else: + g = hidden_states.new_empty(hidden_states.shape[0], 0) + + q = q.view(-1, self._local_heads_q, self.config.head_dim) + k = k.view(-1, self._local_heads_kv, self.config.head_dim) + v = v.view(-1, self._local_heads_kv, self.config.head_dim) + g = g.view(k.shape[0], self._local_heads_q, -1) + + q = self.q_norm(q, modality_dispatcher=modality_dispatcher) + k = self.k_norm(k, modality_dispatcher=modality_dispatcher) + + q = ModalityDispatcher.inv_permute(q, inv_permute_mapping).unsqueeze(0) + k = ModalityDispatcher.inv_permute(k, inv_permute_mapping).unsqueeze(0) + v = ModalityDispatcher.inv_permute(v, inv_permute_mapping).unsqueeze(0) + + sin_emb, cos_emb = rope.tensor_split(2, -1) + q = apply_rotary_emb_torch(q, cos_emb, sin_emb) + k = apply_rotary_emb_torch(k, cos_emb, sin_emb) + + if self.config.use_local_attn and local_attn_handler is not None: + self_attn_out = flex_flash_attn_no_cp(q, k, v, local_attn_handler.q_ranges, local_attn_handler.k_ranges) + else: + self_attn_out = flash_attn_no_cp(q, k, v) + self_attn_out = ModalityDispatcher.permute(self_attn_out, permute_mapping) + + if self.config.enable_attn_gating: + self_attn_out = self_attn_out * torch.sigmoid(g) + + self_attn_out = self_attn_out.view(-1, self._local_heads_q * self.config.head_dim).to(torch.bfloat16) + if self.config.num_modality == 1: + return self.linear_proj(self_attn_out) + return self.linear_proj(self_attn_out, modality_dispatcher=modality_dispatcher) + + +# --------------------------------------------------------------------------- +# MLP module +# --------------------------------------------------------------------------- +@dataclass +class MLPConfig: + hidden_size: int + intermediate_size: int + activation_type: MLPActivationType + params_dtype: torch.dtype + num_modality: int = 1 + num_layers: int = 1 + gated_act: bool = False + + +class MLP(torch.nn.Module): + config: MLPConfig + + def __init__(self, config: MLPConfig): + super().__init__() + num_experts = config.num_modality + self.pre_norm = MultiModalityRMSNorm(config.hidden_size, num_modality=config.num_modality) + intermediate_size_up = config.intermediate_size * 2 if config.gated_act else config.intermediate_size + + # Both shared blocks (num_experts == 1) and MoE blocks (num_experts > 1) + # use vLLM's parallel linear layers for TP support. + if num_experts == 1: + # ColumnParallelLinear shards the output dim uniformly. For + # SWIGLU7 the interleaved [up0, gate0, up1, gate1, ...] format + # is preserved within each rank's contiguous slice, so swiglu7 + # (which uses x[..., ::2] / x[..., 1::2]) still works correctly. + self.up_gate_proj = ColumnParallelLinear( + input_size=config.hidden_size, + output_size=intermediate_size_up, + bias=False, + gather_output=False, + return_bias=False, + ) + self.down_proj = RowParallelLinear( + input_size=config.intermediate_size, + output_size=config.hidden_size, + bias=False, + input_is_parallel=True, + return_bias=False, + ) + else: + # MoE blocks: per-expert TP-sharded parallel layers. 
+ self.up_gate_proj = MoEColumnParallelLinear( + input_size=config.hidden_size, + output_size=intermediate_size_up, + num_experts=num_experts, + bias=False, + ) + self.down_proj = MoERowParallelLinear( + input_size=config.intermediate_size, + output_size=config.hidden_size, + num_experts=num_experts, + bias=False, + ) + self.activation_func = create_activation_func(config.activation_type) + + def forward(self, x: torch.Tensor, modality_dispatcher: ModalityDispatcher) -> torch.Tensor: + x = self.pre_norm(x, modality_dispatcher=modality_dispatcher).to(torch.bfloat16) + if isinstance(self.up_gate_proj, ColumnParallelLinear): + x = self.up_gate_proj(x).to(torch.float32) + x = self.activation_func(x).to(torch.bfloat16) + return self.down_proj(x).to(torch.float32) + # MoE TP path: per-expert column/row parallel layers. + x = self.up_gate_proj(x, modality_dispatcher=modality_dispatcher).to(torch.float32) + x = self.activation_func(x).to(torch.bfloat16) + x = self.down_proj(x, modality_dispatcher=modality_dispatcher).to(torch.float32) + return x + + +# --------------------------------------------------------------------------- +# Adapter (per-modality embedders + RoPE) +# --------------------------------------------------------------------------- +@dataclass +class AdapterConfig: + hidden_size: int + num_attention_heads: int + text_in_channels: int + video_in_channels: int + audio_in_channels: int + params_dtype: torch.dtype + + +class Adapter(torch.nn.Module): + config: AdapterConfig + + def __init__(self, config: AdapterConfig): + super().__init__() + self.config = config + self.video_embedder = nn.Linear(config.video_in_channels, config.hidden_size, bias=True, dtype=torch.float32) + self.text_embedder = nn.Linear(config.text_in_channels, config.hidden_size, bias=True, dtype=torch.float32) + self.audio_embedder = nn.Linear(config.audio_in_channels, config.hidden_size, bias=True, dtype=torch.float32) + self.rope = ElementWiseFourierEmbed( + config.hidden_size // config.num_attention_heads, in_pixels=False, learnable=False + ) + + def forward(self, x, coords_mapping, video_mask, audio_mask, text_mask): + rope = self.rope(coords_mapping) + + text_input = x[text_mask, : self.config.text_in_channels] + audio_input = x[audio_mask, : self.config.audio_in_channels] + video_input = x[video_mask, : self.config.video_in_channels] + + text_out = self.text_embedder(text_input) + audio_out = self.audio_embedder(audio_input) + video_out = self.video_embedder(video_input) + + output_x = torch.zeros(x.shape[0], self.config.hidden_size, device=x.device, dtype=x.dtype) + output_x[text_mask] = text_out + output_x[audio_mask] = audio_out + output_x[video_mask] = video_out + return output_x, rope + + +# --------------------------------------------------------------------------- +# Transformer layer (no CP) +# --------------------------------------------------------------------------- +class TransFormerLayer(torch.nn.Module): + def __init__(self, config: Any, layer_idx: int): + super().__init__() + num_modality = 3 if layer_idx in config.mm_layers else 1 + use_local_attn = layer_idx in config.local_attn_layers + self.post_norm = layer_idx in config.post_norm_layers + attention_config = AttentionConfig( + hidden_size=config.hidden_size, + num_heads_q=config.num_heads_q, + num_heads_kv=config.num_heads_kv, + head_dim=config.head_dim, + params_dtype=config.params_dtype, + checkpoint_qk_layernorm_rope=config.checkpoint_qk_layernorm_rope, + num_modality=num_modality, + num_layers=config.num_layers, + 
use_local_attn=use_local_attn, + enable_attn_gating=config.enable_attn_gating, + ) + self.attention: Attention = Attention(attention_config) + + activation_type = MLPActivationType.GELU7 if layer_idx in config.gelu7_layers else MLPActivationType.SWIGLU7 + if activation_type == MLPActivationType.SWIGLU7: + gated_act = True + intermediate_size = int(config.hidden_size * 4 * 2 / 3) // 4 * 4 + else: + gated_act = False + intermediate_size = config.hidden_size * 4 + mlp_config = MLPConfig( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + activation_type=activation_type, + params_dtype=config.params_dtype, + num_modality=num_modality, + num_layers=config.num_layers, + gated_act=gated_act, + ) + self.mlp: MLP = MLP(mlp_config) + if self.post_norm: + self.attn_post_norm = MultiModalityRMSNorm(config.hidden_size, num_modality=num_modality) + self.mlp_post_norm = MultiModalityRMSNorm(config.hidden_size, num_modality=num_modality) + + def forward( + self, + hidden_states: torch.Tensor, + rope: torch.Tensor, + permute_mapping: torch.Tensor, + inv_permute_mapping: torch.Tensor, + varlen_handler: VarlenHandler, + local_attn_handler: FFAHandler | None, + modality_dispatcher: ModalityDispatcher, + ) -> torch.Tensor: + attn_out = self.attention( + hidden_states, + rope, + permute_mapping, + inv_permute_mapping, + varlen_handler, + local_attn_handler, + modality_dispatcher, + ) + if self.post_norm: + attn_out = self.attn_post_norm(attn_out, modality_dispatcher=modality_dispatcher) + hidden_states = hidden_states + attn_out + + mlp_out = self.mlp(hidden_states, modality_dispatcher) + if self.post_norm: + mlp_out = self.mlp_post_norm(mlp_out, modality_dispatcher=modality_dispatcher) + hidden_states = hidden_states + mlp_out + return hidden_states + + +# --------------------------------------------------------------------------- +# TransformerBlock with magi_compile +# --------------------------------------------------------------------------- +is_base_model = True + + +def config_patch(compile_config: CompileConfig) -> CompileConfig: + global is_base_model + if is_base_model: + is_base_model = False + else: + compile_config.offload_config.gpu_resident_weight_ratio = 0.0 + return compile_config + + +@magi_compile( + config_patch=config_patch, dynamic_arg_dims={"x": 0, "rope": 0, "permute_mapping": 0, "inv_permute_mapping": 0} +) +class TransformerBlock(torch.nn.Module): + def __init__(self, model_config: Any): + super().__init__() + self.layers: list[TransFormerLayer] = nn.ModuleList() + for layer_idx in range(model_config.num_layers): + self.layers.append(TransFormerLayer(model_config, layer_idx)) + + def forward( + self, + x: torch.Tensor, + rope: torch.Tensor, + permute_mapping: torch.Tensor, + inv_permute_mapping: torch.Tensor, + varlen_handler: VarlenHandler, + local_attn_handler: FFAHandler | None, + modality_dispatcher: ModalityDispatcher, + ) -> torch.Tensor: + for layer in self.layers: + x = layer( + x, rope, permute_mapping, inv_permute_mapping, varlen_handler, local_attn_handler, modality_dispatcher + ) + return x + + +# --------------------------------------------------------------------------- +# Internal config for TransformerBlock / DiTModel construction +# --------------------------------------------------------------------------- +@dataclass +class TransformerConfig: + hidden_size: int + video_in_channels: int + audio_in_channels: int + text_in_channels: int + params_dtype: torch.dtype + post_process_dtype: torch.dtype + + +# 
--------------------------------------------------------------------------- +# DiTModel (no context-parallelism) +# --------------------------------------------------------------------------- +class DiTModel(torch.nn.Module): + config: TransformerConfig + _layerwise_offload_blocks_attr = "blocks" + + @property + def blocks(self) -> nn.ModuleList: + return self.block.layers + + def __init__(self, model_config: Any): + super().__init__() + validate_magi_human_tp_constraints( + hidden_size=model_config.hidden_size, + num_heads_q=model_config.hidden_size // model_config.head_dim, + num_heads_kv=model_config.num_query_groups, + tensor_parallel_size=get_tensor_model_parallel_world_size(), + ) + self.config = TransformerConfig( + hidden_size=model_config.hidden_size, + video_in_channels=model_config.video_in_channels, + audio_in_channels=model_config.audio_in_channels, + text_in_channels=model_config.text_in_channels, + params_dtype=model_config.params_dtype, + post_process_dtype=torch.float32, + ) + adapter_config = AdapterConfig( + hidden_size=model_config.hidden_size, + num_attention_heads=model_config.num_heads_q, + text_in_channels=model_config.text_in_channels, + video_in_channels=model_config.video_in_channels, + audio_in_channels=model_config.audio_in_channels, + params_dtype=torch.float32, + ) + self.adapter: Adapter = Adapter(adapter_config) + self.block: TransformerBlock = TransformerBlock(model_config=model_config) + self.final_norm_video = MultiModalityRMSNorm(self.config.hidden_size) + self.final_norm_audio = MultiModalityRMSNorm(self.config.hidden_size) + self.final_linear_video = nn.Linear( + self.config.hidden_size, self.config.video_in_channels, bias=False, dtype=torch.float32 + ) + self.final_linear_audio = nn.Linear( + self.config.hidden_size, self.config.audio_in_channels, bias=False, dtype=torch.float32 + ) + + def forward( + self, + x: torch.Tensor, + coords_mapping: torch.Tensor, + modality_mapping: torch.Tensor, + varlen_handler: VarlenHandler, + local_attn_handler: FFAHandler | None, + ): + modality_dispatcher = ModalityDispatcher(modality_mapping, 3) + permute_mapping = modality_dispatcher.permute_mapping + inv_permute_mapping = modality_dispatcher.inv_permute_mapping + video_mask = modality_mapping == Modality.VIDEO + audio_mask = modality_mapping == Modality.AUDIO + text_mask = modality_mapping == Modality.TEXT + + x, rope = self.adapter(x, coords_mapping, video_mask, audio_mask, text_mask) + + x = x.to(self.config.params_dtype) + x = ModalityDispatcher.permute(x, permute_mapping) + + x = self.block( + x, + rope, + permute_mapping=permute_mapping, + inv_permute_mapping=inv_permute_mapping, + varlen_handler=varlen_handler, + local_attn_handler=local_attn_handler, + modality_dispatcher=modality_dispatcher, + ) + + x = ModalityDispatcher.inv_permute(x, inv_permute_mapping) + + x_video = x[video_mask].to(self.final_norm_video.weight.dtype) + x_video = self.final_norm_video(x_video) + x_video = self.final_linear_video(x_video) + + x_audio = x[audio_mask].to(self.final_norm_audio.weight.dtype) + x_audio = self.final_norm_audio(x_audio) + x_audio = self.final_linear_audio(x_audio) + + x_out = torch.zeros( + x.shape[0], + max(self.config.video_in_channels, self.config.audio_in_channels), + device=x.device, + dtype=x.dtype, + ) + x_out[video_mask, : self.config.video_in_channels] = x_video + x_out[audio_mask, : self.config.audio_in_channels] = x_audio + + return x_out + + +# --------------------------------------------------------------------------- +# Public config dataclass 
for building DiTModel from JSON +# --------------------------------------------------------------------------- +@dataclass +class MagiHumanDiTConfig: + num_layers: int = 40 + hidden_size: int = 5120 + head_dim: int = 128 + num_query_groups: int = 8 + video_in_channels: int = 48 * 4 + audio_in_channels: int = 64 + text_in_channels: int = 3584 + checkpoint_qk_layernorm_rope: bool = False + params_dtype: torch.dtype = torch.float32 + mm_layers: list = field(default_factory=lambda: [0, 1, 2, 3, 36, 37, 38, 39]) + local_attn_layers: list = field(default_factory=list) + enable_attn_gating: bool = True + gelu7_layers: list = field(default_factory=lambda: [0, 1, 2, 3]) + post_norm_layers: list = field(default_factory=list) + + def __post_init__(self): + self.num_heads_q = self.hidden_size // self.head_dim + self.num_heads_kv = self.num_query_groups + + +if TYPE_CHECKING: + from .pipeline_magi_human import EvalInput + + +# =========================================================================== +# Data proxy (ported from daVinci-MagiHuman inference/pipeline/data_proxy.py) +# =========================================================================== +def _unfold_3d( + x: torch.Tensor, + kernel_size: tuple[int, int, int], + stride: tuple[int, int, int], +) -> torch.Tensor: + """Pure-PyTorch 3D unfold matching UnfoldAnd behavior. + + After N unfold ops the shape is (batch, C, oD, oH, oW, kD, kH, kW). + UnfoldAnd permutes kernel dims next to channel before reshape so that the + col_dim axis is ordered as (C, kD, kH, kW) -- matching F.unfold semantics. + Without this permute, .view() interleaves spatial and kernel positions. + + Args: + x: (N, C, D, H, W) + kernel_size: (kD, kH, kW) + stride: (sD, sH, sW) + Returns: + (N, C*kD*kH*kW, L) where L = product of output spatial dims. 
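+
+    Example (illustrative, hypothetical sizes):
+        >>> x = torch.arange(2 * 3 * 4 * 4 * 4, dtype=torch.float32).reshape(2, 3, 4, 4, 4)
+        >>> _unfold_3d(x, kernel_size=(1, 2, 2), stride=(1, 2, 2)).shape
+        torch.Size([2, 12, 16])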
+ """ + ndim = len(kernel_size) + for d in range(ndim): + x = x.unfold(d + 2, kernel_size[d], stride[d]) + # x: (N, C, oD, oH, oW, kD, kH, kW) + # Permute to (N, C, kD, kH, kW, oD, oH, oW) so that view groups correctly + perm = [0, 1] + list(range(ndim + 2, 2 * ndim + 2)) + list(range(2, ndim + 2)) + x = x.permute(*perm).contiguous() + + batch_size = x.shape[0] + col_dim = 1 + for i in range(1, ndim + 2): + col_dim *= x.shape[i] + spatial = 1 + for i in range(ndim + 2, 2 * ndim + 2): + spatial *= x.shape[i] + return x.view(batch_size, col_dim, spatial) + + +def calc_local_qk_range( + num_video_tokens, + num_audio_and_txt_tokens, + num_frames, + frame_receptive_field, +): + token_per_frame = num_video_tokens // num_frames + total_tokens = num_video_tokens + num_audio_and_txt_tokens + + q_range_list = [] + k_range_list = [] + for i in range(num_frames): + q_range_list.append(torch.tensor([i * token_per_frame, (i + 1) * token_per_frame])) + k_range_list.append( + torch.tensor( + [ + (i - frame_receptive_field) * token_per_frame, + (i + frame_receptive_field + 1) * token_per_frame, + ] + ) + ) + local_q_range = torch.stack(q_range_list, dim=0) + local_k_range = torch.stack(k_range_list, dim=0) + + local_k_range[local_k_range < 0] = 0 + local_k_range[local_k_range > num_video_tokens] = num_video_tokens + + video_q_range = torch.tensor([[0, num_video_tokens]]) + video_k_range = torch.tensor([[num_video_tokens, num_video_tokens + num_audio_and_txt_tokens]]) + + at_q_ranges = torch.tensor([[num_video_tokens, total_tokens]]) + at_k_ranges = torch.tensor([[0, total_tokens]]) + + q_ranges = ( + torch.cat([local_q_range, video_q_range, at_q_ranges], dim=0).to(torch.int32).to("cuda", non_blocking=True) + ) + k_ranges = ( + torch.cat([local_k_range, video_k_range, at_k_ranges], dim=0).to(torch.int32).to("cuda", non_blocking=True) + ) + return q_ranges, k_ranges + + +def calc_local_attn_ffa_handler( + num_video_tokens, + num_audio_and_txt_tokens, + num_frames, + frame_receptive_field, +): + q_ranges, k_ranges = calc_local_qk_range( + num_video_tokens, + num_audio_and_txt_tokens, + num_frames, + frame_receptive_field, + ) + total = num_video_tokens + num_audio_and_txt_tokens + return FFAHandler( + q_ranges=q_ranges, + k_ranges=k_ranges, + max_seqlen_q=total, + max_seqlen_k=total, + attn_type_map=torch.zeros([q_ranges.shape[0]], device="cuda", dtype=torch.int32), + softmax_scale=None, + ) + + +def get_coords( + shape: list[int], + ref_feat_shape: list[int], + offset_thw: list[int] | None = None, + device: torch.device = torch.device("cpu"), + dtype: torch.dtype = torch.float32, +): + if offset_thw is None: + offset_thw = [0, 0, 0] + ori_t, ori_h, ori_w = shape + ref_t, ref_h, ref_w = ref_feat_shape + + offset_t, offset_h, offset_w = offset_thw + time_rng = torch.arange(ori_t, device=device, dtype=dtype) + offset_t + height_rng = torch.arange(ori_h, device=device, dtype=dtype) + offset_h + width_rng = torch.arange(ori_w, device=device, dtype=dtype) + offset_w + + time_grid, height_grid, width_grid = torch.meshgrid( + time_rng, + height_rng, + width_rng, + indexing="ij", + ) + coords_flat = torch.stack([time_grid, height_grid, width_grid], dim=-1).reshape(-1, 3) + + meta = torch.tensor( + [ori_t, ori_h, ori_w, ref_t, ref_h, ref_w], + device=device, + dtype=dtype, + ) + meta_expanded = meta.expand(coords_flat.size(0), -1) + return torch.cat([coords_flat, meta_expanded], dim=-1) + + +@dataclass +class SingleData: + video_x_t: torch.Tensor + audio_x_t: torch.Tensor + audio_feat_len: int + txt_feat: torch.Tensor 
+ txt_feat_len: int + t: int + h: int + w: int + patch_size: int + t_patch_size: int + spatial_rope_interpolation: Literal["inter", "extra"] + ref_audio_offset: int + text_offset: int + coords_style: Literal["v1", "v2"] = "v1" + + def __post_init__(self): + self.video_token_num = self.video_x_t.shape[0] + self.audio_x_t = self.audio_x_t[: self.audio_feat_len] + self.txt_feat = self.txt_feat[: self.txt_feat_len] + self.video_channel = self.video_x_t.shape[-1] + self.audio_channel = self.audio_x_t.shape[-1] + self.txt_channel = self.txt_feat.shape[-1] + + @property + def device(self): + return self.video_x_t.device + + @property + def default_dtype(self): + return self.video_x_t.dtype + + @property + def total_token_num(self): + return self.video_token_num + self.audio_feat_len + self.txt_feat_len + + @property + def token_sequence(self): + tensors = [self.video_x_t, self.audio_x_t, self.txt_feat] + max_channel = max(t.shape[-1] for t in tensors) + padded = [F.pad(t, (0, max_channel - t.shape[-1])) for t in tensors] + return torch.cat(padded, dim=0) + + @property + def modality_mapping(self): + v_map = torch.full((self.video_token_num,), Modality.VIDEO, dtype=torch.int64, device=self.device) + a_map = torch.full((self.audio_feat_len,), Modality.AUDIO, dtype=torch.int64, device=self.device) + t_map = torch.full((self.txt_feat_len,), Modality.TEXT, dtype=torch.int64, device=self.device) + return torch.cat([v_map, a_map, t_map], dim=0) + + def default_coords(self, shape, ref_feat_shape, offset_thw=None): + if offset_thw is None: + offset_thw = [0, 0, 0] + return get_coords( + shape=shape, + ref_feat_shape=ref_feat_shape, + offset_thw=offset_thw, + device=self.device, + dtype=self.default_dtype, + ) + + @property + def coords_mapping(self): + if self.spatial_rope_interpolation == "inter": + video_ref_feat_shape = (self.t // self.t_patch_size, 32, 32) + else: + video_ref_feat_shape = ( + self.t // self.t_patch_size, + self.h // self.patch_size, + self.w // self.patch_size, + ) + + video_coords = self.default_coords( + shape=( + self.t // self.t_patch_size, + self.h // self.patch_size, + self.w // self.patch_size, + ), + ref_feat_shape=video_ref_feat_shape, + ) + + if self.coords_style == "v1": + audio_coords = self.default_coords( + shape=(self.audio_feat_len, 1, 1), + ref_feat_shape=(self.t // self.t_patch_size, 1, 1), + ) + text_coords = self.default_coords( + shape=(self.txt_feat_len, 1, 1), + ref_feat_shape=(2, 1, 1), + offset_thw=[self.text_offset, 0, 0], + ) + elif self.coords_style == "v2": + magic_audio_ref_t = (self.audio_feat_len - 1) // 4 + 1 + audio_coords = self.default_coords( + shape=(self.audio_feat_len, 1, 1), + ref_feat_shape=(magic_audio_ref_t // self.t_patch_size, 1, 1), + ) + text_coords = self.default_coords( + shape=(self.txt_feat_len, 1, 1), + ref_feat_shape=(1, 1, 1), + offset_thw=[-self.txt_feat_len, 0, 0], + ) + else: + raise ValueError(f"Unknown coords_style: {self.coords_style}") + + return torch.cat([video_coords, audio_coords, text_coords], dim=0) + + def depack_token_sequence(self, token_sequence): + video_x_t = token_sequence[: self.video_token_num, : self.video_channel] + video_x_t = rearrange( + video_x_t, + "(T H W) (pT pH pW C) -> C (T pT) (H pH) (W pW)", + H=self.h // self.patch_size, + W=self.w // self.patch_size, + pT=self.t_patch_size, + pH=self.patch_size, + pW=self.patch_size, + ).contiguous() + audio_x_t = token_sequence[ + self.video_token_num : self.video_token_num + self.audio_feat_len, + : self.audio_channel, + ] + return video_x_t, audio_x_t + + 
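+# NOTE (illustrative only, hypothetical token counts with the default channel widths):
+# how SingleData packs one sample. With video_x_t of shape (1024, 192), audio_x_t of
+# shape (64, 64) and audio_feat_len=64, txt_feat of shape (32, 3584) and txt_feat_len=32:
+#   token_sequence   -> (1120, 3584); streams are concatenated video|audio|text and
+#                       each is right-padded with zeros to the widest channel dim
+#   modality_mapping -> [VIDEO] * 1024 + [AUDIO] * 64 + [TEXT] * 32
+#   total_token_num  -> 1024 + 64 + 32 = 1120
+# depack_token_sequence() slices the same ranges back out, drops the channel padding,
+# and re-folds the video tokens into a (C, T, H, W) patch volume.
+
+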
+@dataclass +class SimplePackedData: + items: list[SingleData] + + @property + def token_sequence(self): + return torch.cat([item.token_sequence for item in self.items], dim=0) + + @property + def modality_mapping(self): + return torch.cat([item.modality_mapping for item in self.items], dim=0) + + @property + def coords_mapping(self): + return torch.cat([item.coords_mapping for item in self.items], dim=0) + + @property + def total_token_num(self): + return sum(item.total_token_num for item in self.items) + + def __getitem__(self, index): + return self.items[index] + + @property + def cu_seqlen(self): + cu = torch.cumsum( + torch.tensor([item.total_token_num for item in self.items]), + dim=0, + ) + return F.pad(cu, (1, 0)) + + @property + def max_seqlen(self): + return torch.tensor(max(item.total_token_num for item in self.items)) + + def depack_token_sequence(self, token_sequence): + video_list, audio_list = [], [] + parts = torch.split( + token_sequence, + [item.total_token_num for item in self.items], + dim=0, + ) + for item, part in zip(self.items, parts): + v, a = item.depack_token_sequence(part) + video_list.append(v) + audio_list.append(a) + return torch.stack(video_list, dim=0), torch.stack(audio_list, dim=0) + + +class MagiDataProxy: + def __init__( + self, + patch_size: int = 2, + t_patch_size: int = 1, + frame_receptive_field: int = 11, + spatial_rope_interpolation: str = "extra", + ref_audio_offset: int = 1000, + text_offset: int = 0, + coords_style: str = "v2", + ): + self.patch_size = patch_size + self.t_patch_size = t_patch_size + self.frame_receptive_field = frame_receptive_field + self.spatial_rope_interpolation = spatial_rope_interpolation + self.ref_audio_offset = ref_audio_offset + self.text_offset = text_offset + self.coords_style = coords_style + self._kernel = (t_patch_size, patch_size, patch_size) + self._stride = (t_patch_size, patch_size, patch_size) + self._saved_data: dict[str, Any] = {} + + def saved_for_output(self, **kwargs): + self._saved_data.update(kwargs) + + def get_saved_data(self, key: str): + return self._saved_data[key] + + def img2tokens(self, x_t: torch.Tensor): + x_t_unfolded = _unfold_3d(x_t, self._kernel, self._stride) + return rearrange( + x_t_unfolded, + "N col_dim num_tokens -> N num_tokens col_dim", + ).contiguous() + + def process_input(self, transported_data: EvalInput): + batch_size, _, t, h, w = transported_data.x_t.shape + x_t = self.img2tokens(transported_data.x_t) + audio_x_t = transported_data.audio_x_t.contiguous() + text_in = transported_data.txt_feat.contiguous() + + simple_packed_data = SimplePackedData(items=[]) + for i in range(batch_size): + single_data = SingleData( + video_x_t=x_t[i], + audio_x_t=audio_x_t[i], + audio_feat_len=transported_data.audio_feat_len[i], + txt_feat=text_in[i], + txt_feat_len=transported_data.txt_feat_len[i], + t=t, + h=h, + w=w, + patch_size=self.patch_size, + t_patch_size=self.t_patch_size, + spatial_rope_interpolation=self.spatial_rope_interpolation, + ref_audio_offset=self.ref_audio_offset, + text_offset=self.text_offset, + coords_style=self.coords_style, + ) + simple_packed_data.items.append(single_data) + + if self.frame_receptive_field != -1: + assert batch_size == 1, "local attention only supports batch size 1" + local_attn_handler = calc_local_attn_ffa_handler( + num_video_tokens=simple_packed_data[0].video_token_num, + num_audio_and_txt_tokens=(simple_packed_data[0].audio_feat_len + simple_packed_data[0].txt_feat_len), + num_frames=t, + frame_receptive_field=self.frame_receptive_field, + ) + 
if isinstance(local_attn_handler.max_seqlen_k, torch.Tensor): + local_attn_handler.max_seqlen_k = local_attn_handler.max_seqlen_k.item() + if isinstance(local_attn_handler.max_seqlen_q, torch.Tensor): + local_attn_handler.max_seqlen_q = local_attn_handler.max_seqlen_q.item() + else: + local_attn_handler = None + + varlen_handler = VarlenHandler( + cu_seqlens_q=simple_packed_data.cu_seqlen.to(torch.int32).cuda(), + cu_seqlens_k=simple_packed_data.cu_seqlen.to(torch.int32).cuda(), + max_seqlen_q=simple_packed_data.max_seqlen.to(torch.int32).cuda(), + max_seqlen_k=simple_packed_data.max_seqlen.to(torch.int32).cuda(), + ) + + self.saved_for_output(simple_packed_data=simple_packed_data) + + x = simple_packed_data.token_sequence + coords_mapping = simple_packed_data.coords_mapping + modality_mapping = simple_packed_data.modality_mapping + return (x, coords_mapping, modality_mapping, varlen_handler, local_attn_handler) + + def process_output(self, x: torch.Tensor): + simple_packed_data: SimplePackedData = self.get_saved_data("simple_packed_data") + return simple_packed_data.depack_token_sequence(x) diff --git a/vllm_omni/diffusion/models/magi_human/pipeline_magi_human.py b/vllm_omni/diffusion/models/magi_human/pipeline_magi_human.py new file mode 100644 index 0000000000..9e6efcad39 --- /dev/null +++ b/vllm_omni/diffusion/models/magi_human/pipeline_magi_human.py @@ -0,0 +1,2277 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (c) 2026 SandAI. All Rights Reserved. +# Ported from daVinci-MagiHuman inference/pipeline/video_generate.py +# Adapted for vllm-omni: single-GPU, diffusers VAE, configurable dit_subfolder. + +from __future__ import annotations + +import json +import logging +import math +import os +from collections.abc import Iterable +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Literal + +import numpy as np +import torch +import torch.nn as nn +import whisper +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.schedulers.scheduling_utils import ( + KarrasDiffusionSchedulers, + SchedulerMixin, + SchedulerOutput, +) +from diffusers.utils import deprecate, load_image +from diffusers.utils.torch_utils import randn_tensor +from diffusers.video_processor import VideoProcessor +from einops import rearrange +from PIL import Image +from safetensors.torch import load_file +from torch.nn import functional as F +from torch.nn.utils import weight_norm +from transformers import AutoTokenizer +from transformers.models.t5gemma import T5GemmaEncoderModel +from vllm.distributed import ( + get_tensor_model_parallel_world_size, +) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader + +from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig +from vllm_omni.diffusion.distributed.autoencoders.autoencoder_kl_wan import ( + DistributedAutoencoderKLWan, +) +from vllm_omni.diffusion.model_loader.diffusers_loader import ( + DiffusersPipelineLoader, +) +from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin +from vllm_omni.diffusion.models.t5_encoder.t5_gemma_encoder import T5GemmaEncoderModelTP +from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import ( + DiffusionPipelineProfilerMixin, +) +from vllm_omni.diffusion.request import OmniDiffusionRequest + +from .magi_human_dit import ( + DiTModel, + FFAHandler, + MagiHumanDiTConfig, + Modality, + VarlenHandler, +) + +logger = logging.getLogger(__name__) + + +# 
=========================================================================== +# Scheduler (ported from daVinci-MagiHuman inference/pipeline/scheduler_unipc.py) +# =========================================================================== +class FlowUniPCMultistepScheduler(SchedulerMixin, ConfigMixin): + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + solver_order: int = 2, + prediction_type: str = "flow_prediction", + shift: float = 1.0, + use_dynamic_shifting=False, + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + predict_x0: bool = True, + solver_type: str = "bh2", + lower_order_final: bool = True, + disable_corrector: list[int] = [], + solver_p: SchedulerMixin = None, + timestep_spacing: str = "linspace", + steps_offset: int = 0, + final_sigmas_type: str | None = "zero", + ): + if solver_type not in ["bh1", "bh2"]: + if solver_type in ["midpoint", "heun", "logrho"]: + self.register_to_config(solver_type="bh2") + else: + raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}") + + self.predict_x0 = predict_x0 + self.num_inference_steps = None + alphas = np.linspace(1, 1 / num_train_timesteps, num_train_timesteps)[::-1].copy() + sigmas = 1.0 - alphas + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32) + + if not use_dynamic_shifting: + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + self.sigmas = sigmas + self.timesteps = sigmas * num_train_timesteps + + self.model_outputs = [None] * solver_order + self.timestep_list = [None] * solver_order + self.lower_order_nums = 0 + self.disable_corrector = disable_corrector + self.solver_p = solver_p + self.last_sample = None + self._step_index: int | None = None + self._begin_index: int | None = None + + self.sigmas = self.sigmas.to("cpu") + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + @property + def step_index(self): + return self._step_index + + @property + def begin_index(self): + return self._begin_index + + def set_begin_index(self, begin_index: int = 0): + self._begin_index = begin_index + + def set_timesteps( + self, + num_inference_steps: int | None = None, + device: str | torch.device = None, + sigmas: list[float] | None = None, + mu: float | None | None = None, + shift: float | None | None = None, + ): + if self.config.use_dynamic_shifting and mu is None: + raise ValueError(" you have to pass a value for `mu` when `use_dynamic_shifting` is set to be `True`") + + if sigmas is None: + sigmas = np.linspace(self.sigma_max, self.sigma_min, num_inference_steps + 1).copy()[:-1] + + if self.config.use_dynamic_shifting: + sigmas = self.time_shift(mu, 1.0, sigmas) + else: + if shift is None: + shift = self.config.shift + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + if self.config.final_sigmas_type == "sigma_min": + sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5 + elif self.config.final_sigmas_type == "zero": + sigma_last = 0 + else: + raise ValueError( + f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}" + ) + + timesteps = sigmas * self.config.num_train_timesteps + sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32) + + self.sigmas = torch.from_numpy(sigmas) + self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64) + + self.num_inference_steps = len(timesteps) + + 
self.model_outputs = [None] * self.config.solver_order + self.lower_order_nums = 0 + self.last_sample = None + if self.solver_p: + self.solver_p.set_timesteps(self.num_inference_steps, device=device) + + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") + + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() + + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + abs_sample = sample.abs() + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp(s, min=1, max=self.config.sample_max_value) + s = s.unsqueeze(1) + sample = torch.clamp(sample, -s, s) / s + sample = sample.reshape(batch_size, channels, *remaining_dims) + return sample.to(dtype) + + def _sigma_to_t(self, sigma): + return sigma * self.config.num_train_timesteps + + def _sigma_to_alpha_sigma_t(self, sigma): + return 1 - sigma, sigma + + def time_shift(self, mu: float, sigma: float, t: torch.Tensor): + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + def convert_model_output( + self, model_output: torch.Tensor, *args, sample: torch.Tensor = None, **kwargs + ) -> torch.Tensor: + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + if sample is None: + if len(args) > 1: + sample = args[1] + else: + raise ValueError("missing `sample` as a required keyword argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output " + "conversion is now handled via an internal counter `self.step_index`", + ) + + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + + if self.predict_x0: + if self.config.prediction_type == "flow_prediction": + sigma_t = self.sigmas[self.step_index] + x0_pred = sample - sigma_t * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`," + " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler." + ) + if self.config.thresholding: + x0_pred = self._threshold_sample(x0_pred) + return x0_pred + else: + if self.config.prediction_type == "flow_prediction": + sigma_t = self.sigmas[self.step_index] + epsilon = sample - (1 - sigma_t) * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`," + " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler." 
+ ) + if self.config.thresholding: + sigma_t = self.sigmas[self.step_index] + x0_pred = sample - sigma_t * model_output + x0_pred = self._threshold_sample(x0_pred) + epsilon = model_output + x0_pred + return epsilon + + def multistep_uni_p_bh_update( + self, + model_output: torch.Tensor, + *args, + sample: torch.Tensor | None = None, + order: int | None = None, + **kwargs, + ) -> torch.Tensor: + prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 1: + sample = args[1] + else: + raise ValueError(" missing `sample` as a required keyword argument") + if order is None: + if len(args) > 2: + order = args[2] + else: + raise ValueError(" missing `order` as a required keyword argument") + if prev_timestep is not None: + deprecate("prev_timestep", "1.0.0", "Passing `prev_timestep` is deprecated and has no effect.") + + model_output_list = self.model_outputs + s0 = self.timestep_list[-1] + m0 = model_output_list[-1] + x = sample + + if self.solver_p: + return self.solver_p.step(model_output, s0, x).prev_sample + + sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + h = lambda_t - lambda_s0 + device = sample.device + + rks = [] + D1s: list[Any] | None = [] + for i in range(1, order): + si = self.step_index - i + mi = model_output_list[-(i + 1)] + alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si]) + lambda_si = torch.log(alpha_si) - torch.log(sigma_si) + rk = (lambda_si - lambda_s0) / h + rks.append(rk) + D1s.append((mi - m0) / rk) + + rks.append(1.0) + rks = torch.tensor(rks, device=device) + + R = [] + b = [] + hh = -h if self.predict_x0 else h + h_phi_1 = torch.expm1(hh) + h_phi_k = h_phi_1 / hh - 1 + factorial_i = 1 + + if self.config.solver_type == "bh1": + B_h = hh + elif self.config.solver_type == "bh2": + B_h = torch.expm1(hh) + else: + raise NotImplementedError() + + for i in range(1, order + 1): + R.append(torch.pow(rks, i - 1)) + b.append(h_phi_k * factorial_i / B_h) + factorial_i *= i + 1 + h_phi_k = h_phi_k / hh - 1 / factorial_i + + R = torch.stack(R) + b = torch.tensor(b, device=device) + + if len(D1s) > 0: + D1s = torch.stack(D1s, dim=1) + if order == 2: + rhos_p = torch.tensor([0.5], dtype=x.dtype, device=device) + else: + rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1]).to(device).to(x.dtype) + else: + D1s = None + + if self.predict_x0: + x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0 + pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s) if D1s is not None else 0 + x_t = x_t_ - alpha_t * B_h * pred_res + else: + x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0 + pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s) if D1s is not None else 0 + x_t = x_t_ - sigma_t * B_h * pred_res + + return x_t.to(x.dtype) + + def multistep_uni_c_bh_update( + self, + this_model_output: torch.Tensor, + *args, + last_sample: torch.Tensor = None, + this_sample: torch.Tensor = None, + order: int | None = None, + **kwargs, + ) -> torch.Tensor: + this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None) + if last_sample is None: + if len(args) > 1: + last_sample = args[1] + else: + raise ValueError(" missing`last_sample` as a required keyword argument") + if this_sample is None: + if len(args) > 2: + this_sample = args[2] + 
else: + raise ValueError(" missing`this_sample` as a required keyword argument") + if order is None: + if len(args) > 3: + order = args[3] + else: + raise ValueError(" missing`order` as a required keyword argument") + if this_timestep is not None: + deprecate("this_timestep", "1.0.0", "Passing `this_timestep` is deprecated and has no effect.") + + model_output_list = self.model_outputs + m0 = model_output_list[-1] + x = last_sample + x_t = this_sample + model_t = this_model_output + + sigma_t, sigma_s0 = self.sigmas[self.step_index], self.sigmas[self.step_index - 1] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + h = lambda_t - lambda_s0 + device = this_sample.device + + rks = [] + D1s: list[Any] | None = [] + for i in range(1, order): + si = self.step_index - (i + 1) + mi = model_output_list[-(i + 1)] + alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si]) + lambda_si = torch.log(alpha_si) - torch.log(sigma_si) + rk = (lambda_si - lambda_s0) / h + rks.append(rk) + D1s.append((mi - m0) / rk) + + rks.append(1.0) + rks = torch.tensor(rks, device=device) + + R = [] + b = [] + hh = -h if self.predict_x0 else h + h_phi_1 = torch.expm1(hh) + h_phi_k = h_phi_1 / hh - 1 + factorial_i = 1 + + if self.config.solver_type == "bh1": + B_h = hh + elif self.config.solver_type == "bh2": + B_h = torch.expm1(hh) + else: + raise NotImplementedError() + + for i in range(1, order + 1): + R.append(torch.pow(rks, i - 1)) + b.append(h_phi_k * factorial_i / B_h) + factorial_i *= i + 1 + h_phi_k = h_phi_k / hh - 1 / factorial_i + + R = torch.stack(R) + b = torch.tensor(b, device=device) + + if len(D1s) > 0: + D1s = torch.stack(D1s, dim=1) + else: + D1s = None + + if order == 1: + rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device) + else: + rhos_c = torch.linalg.solve(R, b).to(device).to(x.dtype) + + if self.predict_x0: + x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0 + corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s) if D1s is not None else 0 + D1_t = model_t - m0 + x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t) + else: + x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0 + corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s) if D1s is not None else 0 + D1_t = model_t - m0 + x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t) + return x_t.to(x.dtype) + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + indices = (schedule_timesteps == timestep).nonzero() + pos = 1 if len(indices) > 1 else 0 + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.Tensor, + timestep: int | torch.Tensor, + sample: torch.Tensor, + return_dict: bool = True, + generator=None, + ) -> SchedulerOutput | tuple: + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if self.step_index is None: + self._init_step_index(timestep) + + use_corrector = ( + self.step_index > 0 and self.step_index - 1 not in 
self.disable_corrector and self.last_sample is not None + ) + + model_output_convert = self.convert_model_output(model_output, sample=sample) + if use_corrector: + sample = self.multistep_uni_c_bh_update( + this_model_output=model_output_convert, + last_sample=self.last_sample, + this_sample=sample, + order=self.this_order, + ) + + for i in range(self.config.solver_order - 1): + self.model_outputs[i] = self.model_outputs[i + 1] + self.timestep_list[i] = self.timestep_list[i + 1] + + self.model_outputs[-1] = model_output_convert + self.timestep_list[-1] = timestep + + if self.config.lower_order_final: + this_order = min(self.config.solver_order, len(self.timesteps) - self.step_index) + else: + this_order = self.config.solver_order + + self.this_order = min(this_order, self.lower_order_nums + 1) + assert self.this_order > 0 + + self.last_sample = sample + prev_sample = self.multistep_uni_p_bh_update(model_output=model_output, sample=sample, order=self.this_order) + + if self.lower_order_nums < self.config.solver_order: + self.lower_order_nums += 1 + + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + return SchedulerOutput(prev_sample=prev_sample) + + def step_ddim( + self, + velocity: torch.FloatTensor, + t: int, + curr_state: torch.FloatTensor, + prev_state: torch.FloatTensor | None = None, + generator: torch.Generator | None = None, + ): + device = curr_state.device + curr_t = self.sigmas[t] + prev_t = self.sigmas[t + 1] + variance_noise = randn_tensor(curr_state.shape, generator=generator, device=device, dtype=curr_state.dtype) + cur_clean_ = curr_state - curr_t * velocity + return prev_t * variance_noise + (1 - prev_t) * cur_clean_ + + def step_sde( + self, + velocity: torch.FloatTensor, + t: int, + curr_state: torch.FloatTensor, + noise_theta: float = 1.0, + prev_state: torch.FloatTensor | None = None, + generator: torch.Generator | None = None, + ): + device = curr_state.device + curr_t = self.sigmas[t] + prev_t = self.sigmas[t + 1] + cos = torch.cos(torch.tensor(noise_theta) * torch.pi / 2).to(device) + sin = torch.sin(torch.tensor(noise_theta) * torch.pi / 2).to(device) + prev_sample_mean = (1 - prev_t + prev_t * cos) * (curr_state - curr_t * velocity) + prev_t * cos * velocity + std_dev_t = prev_t * sin + std_dev_t = torch.ones((1, 1)).to(curr_state) * std_dev_t + if prev_state is None: + variance_noise = randn_tensor(curr_state.shape, generator=generator, device=device, dtype=curr_state.dtype) + prev_state = prev_sample_mean + std_dev_t * variance_noise + else: + prev_state = prev_sample_mean + (prev_state - prev_sample_mean.detach()) + return prev_state + + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: + return sample + + def add_noise( + self, original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor + ) -> torch.Tensor: + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + elif self.step_index is not None: + step_indices = [self.step_index] * timesteps.shape[0] + else: + step_indices 
= [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + return alpha_t * original_samples + sigma_t * noise + + def __len__(self): + return self.config.num_train_timesteps + + +# =========================================================================== +# Audio VAE (ported from daVinci-MagiHuman inference/model/sa_audio/) +# =========================================================================== +def _snake_beta(x, alpha, beta): + return x + (1.0 / (beta + 1e-9)) * torch.pow(torch.sin(x * alpha), 2) + + +class _SnakeBeta(nn.Module): + def __init__(self, in_features: int, alpha: float = 1.0, alpha_trainable: bool = True, alpha_logscale: bool = True): + super().__init__() + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: + self.alpha = nn.Parameter(torch.zeros(in_features) * alpha) + self.beta = nn.Parameter(torch.zeros(in_features) * alpha) + else: + self.alpha = nn.Parameter(torch.ones(in_features) * alpha) + self.beta = nn.Parameter(torch.ones(in_features) * alpha) + self.alpha.requires_grad = alpha_trainable + self.beta.requires_grad = alpha_trainable + + def forward(self, x): + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) + beta = self.beta.unsqueeze(0).unsqueeze(-1) + if self.alpha_logscale: + alpha = torch.exp(alpha) + beta = torch.exp(beta) + return _snake_beta(x, alpha, beta) + + +def _vae_sample(mean, scale): + stdev = F.softplus(scale) + 1e-4 + var = stdev * stdev + logvar = torch.log(var) + latents = torch.randn_like(mean) * stdev + mean + kl = (mean * mean + var - logvar - 1).sum(1).mean() + return latents, kl + + +class _VAEBottleneck(nn.Module): + def encode(self, x, return_info=False, **kwargs): + info = {} + mean, scale = x.chunk(2, dim=1) + x, kl = _vae_sample(mean, scale) + info["kl"] = kl + return (x, info) if return_info else x + + def decode(self, x): + return x + + +def _WNConv1d(*args, **kwargs): + return weight_norm(nn.Conv1d(*args, **kwargs)) + + +def _WNConvTranspose1d(*args, **kwargs): + return weight_norm(nn.ConvTranspose1d(*args, **kwargs)) + + +def _checkpoint(function, *args, **kwargs): + kwargs.setdefault("use_reentrant", False) + return torch.utils.checkpoint.checkpoint(function, *args, **kwargs) + + +def _get_activation(activation: Literal["elu", "snake", "none"], antialias: bool = False, channels=None) -> nn.Module: + if antialias: + raise NotImplementedError("antialias activation not supported") + if activation == "elu": + return nn.ELU() + if activation == "snake": + return _SnakeBeta(channels) + if activation == "none": + return nn.Identity() + raise ValueError(f"Unknown activation {activation}") + + +class _ResidualUnit(nn.Module): + def __init__(self, in_channels, out_channels, dilation, use_snake=False, antialias_activation=False): + super().__init__() + padding = (dilation * (7 - 1)) // 2 + self.layers = nn.Sequential( + _get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=out_channels), + _WNConv1d(in_channels, out_channels, kernel_size=7, dilation=dilation, padding=padding), + _get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=out_channels), + _WNConv1d(out_channels, out_channels, kernel_size=1), + ) + + def forward(self, x): + return (_checkpoint(self.layers, x) if self.training else self.layers(x)) + x + + +class _EncoderBlock(nn.Module): + def __init__(self, in_channels, 
out_channels, stride, use_snake=False, antialias_activation=False): + super().__init__() + self.layers = nn.Sequential( + _ResidualUnit(in_channels, in_channels, 1, use_snake=use_snake), + _ResidualUnit(in_channels, in_channels, 3, use_snake=use_snake), + _ResidualUnit(in_channels, in_channels, 9, use_snake=use_snake), + _get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=in_channels), + _WNConv1d(in_channels, out_channels, kernel_size=2 * stride, stride=stride, padding=math.ceil(stride / 2)), + ) + + def forward(self, x): + return self.layers(x) + + +class _DecoderBlock(nn.Module): + def __init__( + self, in_channels, out_channels, stride, use_snake=False, antialias_activation=False, use_nearest_upsample=False + ): + super().__init__() + if use_nearest_upsample: + upsample_layer = nn.Sequential( + nn.Upsample(scale_factor=stride, mode="nearest"), + _WNConv1d(in_channels, out_channels, kernel_size=2 * stride, stride=1, bias=False, padding="same"), + ) + else: + upsample_layer = _WNConvTranspose1d( + in_channels, out_channels, kernel_size=2 * stride, stride=stride, padding=math.ceil(stride / 2) + ) + self.layers = nn.Sequential( + _get_activation("snake" if use_snake else "elu", antialias=antialias_activation, channels=in_channels), + upsample_layer, + _ResidualUnit(out_channels, out_channels, 1, use_snake=use_snake), + _ResidualUnit(out_channels, out_channels, 3, use_snake=use_snake), + _ResidualUnit(out_channels, out_channels, 9, use_snake=use_snake), + ) + + def forward(self, x): + return self.layers(x) + + +class _OobleckEncoder(nn.Module): + def __init__( + self, + in_channels=2, + channels=128, + latent_dim=32, + c_mults=[1, 2, 4, 8], + strides=[2, 4, 8, 8], + use_snake=False, + antialias_activation=False, + ): + super().__init__() + c_mults = [1] + c_mults + depth = len(c_mults) + layers = [_WNConv1d(in_channels, c_mults[0] * channels, kernel_size=7, padding=3)] + for i in range(depth - 1): + layers.append( + _EncoderBlock(c_mults[i] * channels, c_mults[i + 1] * channels, strides[i], use_snake=use_snake) + ) + layers.extend( + [ + _get_activation( + "snake" if use_snake else "elu", antialias=antialias_activation, channels=c_mults[-1] * channels + ), + _WNConv1d(c_mults[-1] * channels, latent_dim, kernel_size=3, padding=1), + ] + ) + self.layers = nn.Sequential(*layers) + + def forward(self, x): + return self.layers(x) + + +class _OobleckDecoder(nn.Module): + def __init__( + self, + out_channels=2, + channels=128, + latent_dim=32, + c_mults=[1, 2, 4, 8], + strides=[2, 4, 8, 8], + use_snake=False, + antialias_activation=False, + use_nearest_upsample=False, + final_tanh=True, + ): + super().__init__() + c_mults = [1] + c_mults + depth = len(c_mults) + layers = [_WNConv1d(latent_dim, c_mults[-1] * channels, kernel_size=7, padding=3)] + for i in range(depth - 1, 0, -1): + layers.append( + _DecoderBlock( + c_mults[i] * channels, + c_mults[i - 1] * channels, + strides[i - 1], + use_snake=use_snake, + antialias_activation=antialias_activation, + use_nearest_upsample=use_nearest_upsample, + ) + ) + layers.extend( + [ + _get_activation( + "snake" if use_snake else "elu", antialias=antialias_activation, channels=c_mults[0] * channels + ), + _WNConv1d(c_mults[0] * channels, out_channels, kernel_size=7, padding=3, bias=False), + nn.Tanh() if final_tanh else nn.Identity(), + ] + ) + self.layers = nn.Sequential(*layers) + + def forward(self, x): + return self.layers(x) + + +class _AudioAutoencoder(nn.Module): + def __init__( + self, + encoder, + decoder, + 
latent_dim, + downsampling_ratio, + sample_rate, + io_channels=2, + bottleneck=None, + in_channels=None, + out_channels=None, + soft_clip=False, + ): + super().__init__() + self.downsampling_ratio = downsampling_ratio + self.sample_rate = sample_rate + self.latent_dim = latent_dim + self.io_channels = io_channels + self.in_channels = in_channels if in_channels is not None else io_channels + self.out_channels = out_channels if out_channels is not None else io_channels + self.bottleneck = bottleneck + self.encoder = encoder + self.decoder = decoder + self.soft_clip = soft_clip + + def encode(self, audio, skip_bottleneck=False, return_info=False, **kwargs): + info = {} + latents = self.encoder(audio) + info["pre_bottleneck_latents"] = latents + if self.bottleneck is not None and not skip_bottleneck: + latents, bottleneck_info = self.bottleneck.encode(latents, return_info=True, **kwargs) + info.update(bottleneck_info) + return (latents, info) if return_info else latents + + def decode(self, latents, skip_bottleneck=False, **kwargs): + if self.bottleneck is not None and not skip_bottleneck: + latents = self.bottleneck.decode(latents) + decoded = self.decoder(latents, **kwargs) + if self.soft_clip: + decoded = torch.tanh(decoded) + return decoded + + +def _create_encoder_from_config(cfg: dict[str, Any]): + assert cfg.get("type") == "oobleck", f"Only 'oobleck' encoder supported, got: {cfg.get('type')}" + enc = _OobleckEncoder(**cfg["config"]) + if not cfg.get("requires_grad", True): + for p in enc.parameters(): + p.requires_grad = False + return enc + + +def _create_decoder_from_config(cfg: dict[str, Any]): + assert cfg.get("type") == "oobleck", f"Only 'oobleck' decoder supported, got: {cfg.get('type')}" + dec = _OobleckDecoder(**cfg["config"]) + if not cfg.get("requires_grad", True): + for p in dec.parameters(): + p.requires_grad = False + return dec + + +def _create_bottleneck_from_config(cfg: dict[str, Any]): + assert cfg.get("type") == "vae", f"Only 'vae' bottleneck supported, got: {cfg.get('type')}" + bn = _VAEBottleneck() + if not cfg.get("requires_grad", True): + for p in bn.parameters(): + p.requires_grad = False + return bn + + +def _create_autoencoder_from_config(config: dict[str, Any]): + ae_config = config["model"] + if ae_config.get("pretransform") is not None: + raise NotImplementedError("Nested pretransform not supported") + encoder = _create_encoder_from_config(ae_config["encoder"]) + decoder = _create_decoder_from_config(ae_config["decoder"]) + bottleneck_cfg = ae_config.get("bottleneck") + bottleneck = _create_bottleneck_from_config(bottleneck_cfg) if bottleneck_cfg else None + return _AudioAutoencoder( + encoder=encoder, + decoder=decoder, + latent_dim=ae_config["latent_dim"], + downsampling_ratio=ae_config["downsampling_ratio"], + sample_rate=config["sample_rate"], + io_channels=ae_config["io_channels"], + bottleneck=bottleneck, + in_channels=ae_config.get("in_channels"), + out_channels=ae_config.get("out_channels"), + soft_clip=ae_config["decoder"].get("soft_clip", False), + ) + + +class SAAudioFeatureExtractor: + def __init__(self, device, model_path): + self.device = device + self.vae_model, self.sample_rate = self._load_vae(model_path) + self.resampler = None + + def _load_vae(self, model_path): + if not (isinstance(model_path, str) and Path(model_path).is_dir()): + raise ValueError("model_path must be a local directory") + + model_config_path = os.path.join(model_path, "model_config.json") + with open(model_config_path) as f: + full_config = json.load(f) + + vae_config = 
full_config["model"]["pretransform"]["config"] + sample_rate = full_config["sample_rate"] + + autoencoder_config = { + "model_type": "autoencoder", + "sample_rate": sample_rate, + "model": vae_config, + } + vae_model = _create_autoencoder_from_config(autoencoder_config) + + weights_path = Path(model_path) / "model.safetensors" + if not weights_path.exists(): + raise FileNotFoundError(f"Weight file does not exist: {weights_path}") + + full_state_dict = load_file(weights_path, device=str(self.device)) + vae_state_dict = {} + for key, value in full_state_dict.items(): + if key.startswith("pretransform.model."): + vae_state_dict[key[len("pretransform.model.") :]] = value + + model_keys = set(vae_model.state_dict().keys()) + vae_keys = set(vae_state_dict.keys()) + missing = model_keys - vae_keys + extra = vae_keys - model_keys + if missing: + logger.warning("Audio VAE missing keys (%d): %s", len(missing), list(missing)[:5]) + if extra: + logger.warning("Audio VAE unexpected keys (%d): %s", len(extra), list(extra)[:5]) + + vae_model.load_state_dict(vae_state_dict) + vae_model.to(self.device) + return vae_model, sample_rate + + def decode(self, latents): + with torch.no_grad(): + return self.vae_model.decode(latents) + + def encode(self, waveform): + with torch.no_grad(): + return self.vae_model.encode(waveform) + + +# =========================================================================== +# Audio utilities (ported from daVinci-MagiHuman inference/pipeline/video_process.py) +# =========================================================================== +_SAMPLE_RATE = 51200 +_AUDIO_CHUNK_DURATION = 29 +_OVERLAP_RATIO = 0.5 + + +def _merge_overlapping_vae_features(audio_feats: list[torch.Tensor], overlap_ratio: float = 0.5) -> torch.Tensor | None: + if not audio_feats: + return None + if len(audio_feats) == 1: + return audio_feats[0] + + batch_size, total_frames, feature_dim = audio_feats[0].shape + overlap_frames = int(total_frames * overlap_ratio) + step_frames = total_frames - overlap_frames + final_length = (len(audio_feats) - 1) * step_frames + total_frames + output_feat = torch.zeros( + batch_size, final_length, feature_dim, device=audio_feats[0].device, dtype=audio_feats[0].dtype + ) + + for block_idx, current_feat in enumerate(audio_feats): + output_start = block_idx * step_frames + if block_idx == 0: + output_feat[:, output_start : output_start + total_frames, :] = current_feat + continue + + non_overlap_start = output_start + overlap_frames + non_overlap_end = output_start + total_frames + output_feat[:, non_overlap_start:non_overlap_end, :] = current_feat[:, overlap_frames:, :] + + for frame_idx in range(overlap_frames): + output_pos = output_start + frame_idx + prev_weight = (overlap_frames - frame_idx) / overlap_frames + curr_weight = frame_idx / overlap_frames + output_feat[:, output_pos, :] = ( + prev_weight * output_feat[:, output_pos, :] + curr_weight * current_feat[:, frame_idx, :] + ) + return output_feat + + +def load_audio_and_encode(audio_vae, audio_path: str, seconds: int | None = None) -> torch.Tensor: + """Load audio from file and encode to latent space using the Stable Audio VAE.""" + audio_full = whisper.load_audio(audio_path, sr=_SAMPLE_RATE) + if seconds is not None: + audio_full = audio_full[: min(int(seconds * _SAMPLE_RATE), audio_full.shape[0])] + total_samples = audio_full.shape[0] + + window_size = int(_AUDIO_CHUNK_DURATION * _SAMPLE_RATE) + step_size = int(window_size * (1 - _OVERLAP_RATIO)) + if total_samples <= window_size: + audio = 
torch.from_numpy(audio_full).cuda() + audio = audio.unsqueeze(0).expand(2, -1) + return audio_vae.vae_model.encode(audio) + + encoded_chunks = [] + latent_to_audio_ratio = None + for offset_start in range(0, total_samples, step_size): + offset_end = min(offset_start + window_size, total_samples) + chunk = whisper.pad_or_trim(audio_full[offset_start:offset_end], length=window_size) + chunk_tensor = torch.from_numpy(chunk).cuda().unsqueeze(0).expand(2, -1) + encoded_chunk = audio_vae.vae_model.encode(chunk_tensor) + + if latent_to_audio_ratio is None: + latent_to_audio_ratio = encoded_chunk.shape[-1] / window_size + + encoded_chunks.append(encoded_chunk.permute(0, 2, 1)) + if offset_end >= total_samples: + break + + final_feat = _merge_overlapping_vae_features(encoded_chunks, overlap_ratio=_OVERLAP_RATIO).permute(0, 2, 1) + final_target_len = math.ceil(total_samples * latent_to_audio_ratio) + return final_feat[:, :, :final_target_len] + + +# =========================================================================== +# Data proxy (ported from daVinci-MagiHuman inference/pipeline/data_proxy.py) +# =========================================================================== +def _unfold_3d(x: torch.Tensor, kernel_size: tuple[int, int, int], stride: tuple[int, int, int]) -> torch.Tensor: + """Pure-PyTorch 3D unfold matching UnfoldAnd behavior. + + After N unfold ops the shape is (batch, C, oD, oH, oW, kD, kH, kW). + UnfoldAnd permutes kernel dims next to channel before reshape so that the + col_dim axis is ordered as (C, kD, kH, kW) -- matching F.unfold semantics. + Without this permute, .view() interleaves spatial and kernel positions. + + Args: + x: (N, C, D, H, W) + kernel_size: (kD, kH, kW) + stride: (sD, sH, sW) + Returns: + (N, C*kD*kH*kW, L) where L = product of output spatial dims. 
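+
+    Example (illustrative): with kD == 1 this reduces to 2D F.unfold applied per
+    depth slice, e.g. x of shape (1, 3, 1, 4, 4) with kernel (1, 2, 2) and stride
+    (1, 2, 2) yields a (1, 12, 4) result.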
+ """ + ndim = len(kernel_size) + for d in range(ndim): + x = x.unfold(d + 2, kernel_size[d], stride[d]) + perm = [0, 1] + list(range(ndim + 2, 2 * ndim + 2)) + list(range(2, ndim + 2)) + x = x.permute(*perm).contiguous() + + batch_size = x.shape[0] + col_dim = 1 + for i in range(1, ndim + 2): + col_dim *= x.shape[i] + spatial = 1 + for i in range(ndim + 2, 2 * ndim + 2): + spatial *= x.shape[i] + return x.view(batch_size, col_dim, spatial) + + +def _calc_local_qk_range(num_video_tokens, num_audio_and_txt_tokens, num_frames, frame_receptive_field): + token_per_frame = num_video_tokens // num_frames + total_tokens = num_video_tokens + num_audio_and_txt_tokens + + q_range_list = [] + k_range_list = [] + for i in range(num_frames): + q_range_list.append(torch.tensor([i * token_per_frame, (i + 1) * token_per_frame])) + k_range_list.append( + torch.tensor( + [ + (i - frame_receptive_field) * token_per_frame, + (i + frame_receptive_field + 1) * token_per_frame, + ] + ) + ) + local_q_range = torch.stack(q_range_list, dim=0) + local_k_range = torch.stack(k_range_list, dim=0) + + local_k_range[local_k_range < 0] = 0 + local_k_range[local_k_range > num_video_tokens] = num_video_tokens + + video_q_range = torch.tensor([[0, num_video_tokens]]) + video_k_range = torch.tensor([[num_video_tokens, num_video_tokens + num_audio_and_txt_tokens]]) + + at_q_ranges = torch.tensor([[num_video_tokens, total_tokens]]) + at_k_ranges = torch.tensor([[0, total_tokens]]) + + q_ranges = ( + torch.cat([local_q_range, video_q_range, at_q_ranges], dim=0).to(torch.int32).to("cuda", non_blocking=True) + ) + k_ranges = ( + torch.cat([local_k_range, video_k_range, at_k_ranges], dim=0).to(torch.int32).to("cuda", non_blocking=True) + ) + return q_ranges, k_ranges + + +def _calc_local_attn_ffa_handler(num_video_tokens, num_audio_and_txt_tokens, num_frames, frame_receptive_field): + q_ranges, k_ranges = _calc_local_qk_range( + num_video_tokens, num_audio_and_txt_tokens, num_frames, frame_receptive_field + ) + total = num_video_tokens + num_audio_and_txt_tokens + return FFAHandler( + q_ranges=q_ranges, + k_ranges=k_ranges, + max_seqlen_q=total, + max_seqlen_k=total, + attn_type_map=torch.zeros([q_ranges.shape[0]], device="cuda", dtype=torch.int32), + softmax_scale=None, + ) + + +def _get_coords( + shape: list[int], + ref_feat_shape: list[int], + offset_thw: list[int] | None = None, + device: torch.device = torch.device("cpu"), + dtype: torch.dtype = torch.float32, +): + if offset_thw is None: + offset_thw = [0, 0, 0] + ori_t, ori_h, ori_w = shape + ref_t, ref_h, ref_w = ref_feat_shape + + offset_t, offset_h, offset_w = offset_thw + time_rng = torch.arange(ori_t, device=device, dtype=dtype) + offset_t + height_rng = torch.arange(ori_h, device=device, dtype=dtype) + offset_h + width_rng = torch.arange(ori_w, device=device, dtype=dtype) + offset_w + + time_grid, height_grid, width_grid = torch.meshgrid(time_rng, height_rng, width_rng, indexing="ij") + coords_flat = torch.stack([time_grid, height_grid, width_grid], dim=-1).reshape(-1, 3) + + meta = torch.tensor([ori_t, ori_h, ori_w, ref_t, ref_h, ref_w], device=device, dtype=dtype) + meta_expanded = meta.expand(coords_flat.size(0), -1) + return torch.cat([coords_flat, meta_expanded], dim=-1) + + +@dataclass +class _SingleData: + video_x_t: torch.Tensor + audio_x_t: torch.Tensor + audio_feat_len: int + txt_feat: torch.Tensor + txt_feat_len: int + t: int + h: int + w: int + patch_size: int + t_patch_size: int + spatial_rope_interpolation: Literal["inter", "extra"] + ref_audio_offset: 
int + text_offset: int + coords_style: Literal["v1", "v2"] = "v1" + + def __post_init__(self): + self.video_token_num = self.video_x_t.shape[0] + self.audio_x_t = self.audio_x_t[: self.audio_feat_len] + self.txt_feat = self.txt_feat[: self.txt_feat_len] + self.video_channel = self.video_x_t.shape[-1] + self.audio_channel = self.audio_x_t.shape[-1] + self.txt_channel = self.txt_feat.shape[-1] + + @property + def device(self): + return self.video_x_t.device + + @property + def default_dtype(self): + return self.video_x_t.dtype + + @property + def total_token_num(self): + return self.video_token_num + self.audio_feat_len + self.txt_feat_len + + @property + def token_sequence(self): + tensors = [self.video_x_t, self.audio_x_t, self.txt_feat] + max_channel = max(t.shape[-1] for t in tensors) + padded = [F.pad(t, (0, max_channel - t.shape[-1])) for t in tensors] + return torch.cat(padded, dim=0) + + @property + def modality_mapping(self): + v_map = torch.full((self.video_token_num,), Modality.VIDEO, dtype=torch.int64, device=self.device) + a_map = torch.full((self.audio_feat_len,), Modality.AUDIO, dtype=torch.int64, device=self.device) + t_map = torch.full((self.txt_feat_len,), Modality.TEXT, dtype=torch.int64, device=self.device) + return torch.cat([v_map, a_map, t_map], dim=0) + + def _default_coords(self, shape, ref_feat_shape, offset_thw=None): + if offset_thw is None: + offset_thw = [0, 0, 0] + return _get_coords( + shape=shape, + ref_feat_shape=ref_feat_shape, + offset_thw=offset_thw, + device=self.device, + dtype=self.default_dtype, + ) + + @property + def coords_mapping(self): + if self.spatial_rope_interpolation == "inter": + video_ref_feat_shape = (self.t // self.t_patch_size, 32, 32) + else: + video_ref_feat_shape = (self.t // self.t_patch_size, self.h // self.patch_size, self.w // self.patch_size) + + video_coords = self._default_coords( + shape=(self.t // self.t_patch_size, self.h // self.patch_size, self.w // self.patch_size), + ref_feat_shape=video_ref_feat_shape, + ) + + if self.coords_style == "v1": + audio_coords = self._default_coords( + shape=(self.audio_feat_len, 1, 1), + ref_feat_shape=(self.t // self.t_patch_size, 1, 1), + ) + text_coords = self._default_coords( + shape=(self.txt_feat_len, 1, 1), + ref_feat_shape=(2, 1, 1), + offset_thw=[self.text_offset, 0, 0], + ) + elif self.coords_style == "v2": + magic_audio_ref_t = (self.audio_feat_len - 1) // 4 + 1 + audio_coords = self._default_coords( + shape=(self.audio_feat_len, 1, 1), + ref_feat_shape=(magic_audio_ref_t // self.t_patch_size, 1, 1), + ) + text_coords = self._default_coords( + shape=(self.txt_feat_len, 1, 1), + ref_feat_shape=(1, 1, 1), + offset_thw=[-self.txt_feat_len, 0, 0], + ) + else: + raise ValueError(f"Unknown coords_style: {self.coords_style}") + + return torch.cat([video_coords, audio_coords, text_coords], dim=0) + + def depack_token_sequence(self, token_sequence): + video_x_t = token_sequence[: self.video_token_num, : self.video_channel] + video_x_t = rearrange( + video_x_t, + "(T H W) (pT pH pW C) -> C (T pT) (H pH) (W pW)", + H=self.h // self.patch_size, + W=self.w // self.patch_size, + pT=self.t_patch_size, + pH=self.patch_size, + pW=self.patch_size, + ).contiguous() + audio_x_t = token_sequence[ + self.video_token_num : self.video_token_num + self.audio_feat_len, : self.audio_channel + ] + return video_x_t, audio_x_t + + +@dataclass +class _SimplePackedData: + items: list[_SingleData] + + @property + def token_sequence(self): + return torch.cat([item.token_sequence for item in self.items], dim=0) + 
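+    # modality_mapping and coords_mapping are concatenated in the same per-item
+    # order as token_sequence, so the three stay aligned index-for-index.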
+ @property + def modality_mapping(self): + return torch.cat([item.modality_mapping for item in self.items], dim=0) + + @property + def coords_mapping(self): + return torch.cat([item.coords_mapping for item in self.items], dim=0) + + @property + def total_token_num(self): + return sum(item.total_token_num for item in self.items) + + def __getitem__(self, index): + return self.items[index] + + @property + def cu_seqlen(self): + cu = torch.cumsum(torch.tensor([item.total_token_num for item in self.items]), dim=0) + return F.pad(cu, (1, 0)) + + @property + def max_seqlen(self): + return torch.tensor(max(item.total_token_num for item in self.items)) + + def depack_token_sequence(self, token_sequence): + video_list, audio_list = [], [] + parts = torch.split(token_sequence, [item.total_token_num for item in self.items], dim=0) + for item, part in zip(self.items, parts): + v, a = item.depack_token_sequence(part) + video_list.append(v) + audio_list.append(a) + return torch.stack(video_list, dim=0), torch.stack(audio_list, dim=0) + + +class MagiDataProxy: + def __init__( + self, + patch_size: int = 2, + t_patch_size: int = 1, + frame_receptive_field: int = 11, + spatial_rope_interpolation: str = "extra", + ref_audio_offset: int = 1000, + text_offset: int = 0, + coords_style: str = "v2", + ): + self.patch_size = patch_size + self.t_patch_size = t_patch_size + self.frame_receptive_field = frame_receptive_field + self.spatial_rope_interpolation = spatial_rope_interpolation + self.ref_audio_offset = ref_audio_offset + self.text_offset = text_offset + self.coords_style = coords_style + self._kernel = (t_patch_size, patch_size, patch_size) + self._stride = (t_patch_size, patch_size, patch_size) + self._saved_data: dict[str, Any] = {} + + def saved_for_output(self, **kwargs): + self._saved_data.update(kwargs) + + def get_saved_data(self, key: str): + return self._saved_data[key] + + def img2tokens(self, x_t: torch.Tensor): + x_t_unfolded = _unfold_3d(x_t, self._kernel, self._stride) + return rearrange(x_t_unfolded, "N col_dim num_tokens -> N num_tokens col_dim").contiguous() + + def process_input(self, transported_data: EvalInput): + batch_size, _, t, h, w = transported_data.x_t.shape + x_t = self.img2tokens(transported_data.x_t) + audio_x_t = transported_data.audio_x_t.contiguous() + text_in = transported_data.txt_feat.contiguous() + + simple_packed_data = _SimplePackedData(items=[]) + for i in range(batch_size): + single_data = _SingleData( + video_x_t=x_t[i], + audio_x_t=audio_x_t[i], + audio_feat_len=transported_data.audio_feat_len[i], + txt_feat=text_in[i], + txt_feat_len=transported_data.txt_feat_len[i], + t=t, + h=h, + w=w, + patch_size=self.patch_size, + t_patch_size=self.t_patch_size, + spatial_rope_interpolation=self.spatial_rope_interpolation, + ref_audio_offset=self.ref_audio_offset, + text_offset=self.text_offset, + coords_style=self.coords_style, + ) + simple_packed_data.items.append(single_data) + + if self.frame_receptive_field != -1: + assert batch_size == 1, "local attention only supports batch size 1" + local_attn_handler = _calc_local_attn_ffa_handler( + num_video_tokens=simple_packed_data[0].video_token_num, + num_audio_and_txt_tokens=simple_packed_data[0].audio_feat_len + simple_packed_data[0].txt_feat_len, + num_frames=t, + frame_receptive_field=self.frame_receptive_field, + ) + if isinstance(local_attn_handler.max_seqlen_k, torch.Tensor): + local_attn_handler.max_seqlen_k = local_attn_handler.max_seqlen_k.item() + if isinstance(local_attn_handler.max_seqlen_q, torch.Tensor): + 
local_attn_handler.max_seqlen_q = local_attn_handler.max_seqlen_q.item() + else: + local_attn_handler = None + + varlen_handler = VarlenHandler( + cu_seqlens_q=simple_packed_data.cu_seqlen.to(torch.int32).cuda(), + cu_seqlens_k=simple_packed_data.cu_seqlen.to(torch.int32).cuda(), + max_seqlen_q=simple_packed_data.max_seqlen.to(torch.int32).cuda(), + max_seqlen_k=simple_packed_data.max_seqlen.to(torch.int32).cuda(), + ) + + self.saved_for_output(simple_packed_data=simple_packed_data) + + x = simple_packed_data.token_sequence + coords_mapping = simple_packed_data.coords_mapping + modality_mapping = simple_packed_data.modality_mapping + return (x, coords_mapping, modality_mapping, varlen_handler, local_attn_handler) + + def process_output(self, x: torch.Tensor): + simple_packed_data: _SimplePackedData = self.get_saved_data("simple_packed_data") + return simple_packed_data.depack_token_sequence(x) + + +# =========================================================================== +# Pipeline helpers +# =========================================================================== +@dataclass +class EvalInput: + x_t: torch.Tensor + audio_x_t: torch.Tensor + audio_feat_len: torch.Tensor | list[int] + txt_feat: torch.Tensor + txt_feat_len: torch.Tensor | list[int] + + +class _T5GemmaEncoder: + def __init__(self, model_path: str, device: str, weight_dtype: torch.dtype, subfolder: str | None = None): + from vllm.distributed import get_tensor_model_parallel_world_size + + self.device = device + hf_kwargs: dict[str, Any] = {} + if subfolder is not None: + hf_kwargs["subfolder"] = subfolder + self.tokenizer = AutoTokenizer.from_pretrained(model_path, **hf_kwargs) + + tp_size = get_tensor_model_parallel_world_size() + if tp_size > 1: + from transformers.models.t5gemma.configuration_t5gemma import T5GemmaConfig + + config = T5GemmaConfig.from_pretrained(model_path, **hf_kwargs) + # The config we need is the encoder config + config_encoder = config.encoder + # Propagate some outer config values + config_encoder.vocab_size = config.vocab_size + config_encoder.rms_norm_eps = getattr(config, "rms_norm_eps", config_encoder.rms_norm_eps) + self.model = T5GemmaEncoderModelTP(config_encoder).to(device).to(weight_dtype) + self.is_tp = True + else: + self.model = T5GemmaEncoderModel.from_pretrained( + model_path, is_encoder_decoder=False, dtype=weight_dtype, **hf_kwargs + ).to(device) + self.is_tp = False + + @torch.inference_mode() + def encode(self, prompt: str) -> torch.Tensor: + inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device) + outputs = self.model(**inputs) + + if self.is_tp: + # T5GemmaEncoderModelTP just returns the hidden states tensor + return outputs.half() + else: + # HF model returns BaseModelOutput + return outputs["last_hidden_state"].half() + + +def _pad_or_trim(tensor: torch.Tensor, target_size: int, dim: int, pad_value: float = 0.0) -> tuple[torch.Tensor, int]: + current_size = tensor.size(dim) + if current_size < target_size: + padding_amount = target_size - current_size + padding_tuple = [0] * (2 * tensor.dim()) + padding_dim_index = tensor.dim() - 1 - dim + padding_tuple[2 * padding_dim_index + 1] = padding_amount + return F.pad(tensor, tuple(padding_tuple), "constant", pad_value), current_size + slicing = [slice(None)] * tensor.dim() + slicing[dim] = slice(0, target_size) + return tensor[tuple(slicing)], target_size + + +def _get_padded_t5_gemma_embedding( + prompt: str, + encoder: _T5GemmaEncoder, + target_length: int, +) -> tuple[torch.Tensor, int]: + txt_feat = 
encoder.encode(prompt) + txt_feat, original_len = _pad_or_trim(txt_feat, target_size=target_length, dim=1) + return txt_feat.to(torch.float32), original_len + + +def _resizecrop(img: Image.Image, target_height: int, target_width: int) -> Image.Image: + """Centre-crop resize keeping aspect ratio then letterbox to target.""" + pil_image = img.convert("RGB") + original_width, original_height = pil_image.size + scale_x = target_width / original_width + scale_y = target_height / original_height + scale = max(scale_x, scale_y) + new_width = int(round(original_width * scale)) + new_height = int(round(original_height * scale)) + resized_image = pil_image.resize((new_width, new_height), Image.LANCZOS) + left = (new_width - target_width) // 2 + top = (new_height - target_height) // 2 + return resized_image.crop((left, top, left + target_width, top + target_height)) + + +class ZeroSNRDDPMDiscretization: + """ZeroSNR DDPM sigma schedule, ported from daVinci-MagiHuman. + Used to compute sigma values for SR noise injection. + """ + + def __init__( + self, + linear_start: float = 0.00085, + linear_end: float = 0.0120, + num_timesteps: int = 1000, + shift_scale: float = 1.0, + keep_start: bool = False, + post_shift: bool = False, + ): + from functools import partial + + if keep_start and not post_shift: + linear_start = linear_start / (shift_scale + (1 - shift_scale) * linear_start) + self.num_timesteps = num_timesteps + betas = torch.linspace(linear_start**0.5, linear_end**0.5, num_timesteps, dtype=torch.float64) ** 2 + alphas = 1.0 - betas.cpu().numpy() + self.alphas_cumprod = np.cumprod(alphas, axis=0) + self.to_torch = partial(torch.tensor, dtype=torch.float32) + if not post_shift: + self.alphas_cumprod = self.alphas_cumprod / (shift_scale + (1 - shift_scale) * self.alphas_cumprod) + self.post_shift = post_shift + self.shift_scale = shift_scale + + def __call__( + self, + n: int, + do_append_zero: bool = True, + device: str = "cpu", + flip: bool = False, + return_idx: bool = False, + ): + from functools import partial + + if n < self.num_timesteps: + timesteps = np.linspace(self.num_timesteps - 1, 0, n, endpoint=False).astype(int)[::-1] + alphas_cumprod = self.alphas_cumprod[timesteps] + elif n == self.num_timesteps: + alphas_cumprod = self.alphas_cumprod + else: + raise ValueError(f"n={n} > num_timesteps={self.num_timesteps}") + + to_torch = partial(torch.tensor, dtype=torch.float32, device=device) + alphas_cumprod = to_torch(alphas_cumprod) + alphas_cumprod_sqrt = alphas_cumprod.sqrt() + alphas_cumprod_sqrt_0 = alphas_cumprod_sqrt[0].clone() + alphas_cumprod_sqrt_T = alphas_cumprod_sqrt[-1].clone() + alphas_cumprod_sqrt -= alphas_cumprod_sqrt_T + alphas_cumprod_sqrt *= alphas_cumprod_sqrt_0 / (alphas_cumprod_sqrt_0 - alphas_cumprod_sqrt_T) + + if self.post_shift: + alphas_cumprod_sqrt = ( + alphas_cumprod_sqrt**2 / (self.shift_scale + (1 - self.shift_scale) * alphas_cumprod_sqrt**2) + ) ** 0.5 + + sigmas = torch.flip(alphas_cumprod_sqrt, (0,)) + sigmas = torch.cat([sigmas, sigmas.new_zeros([1])]) if do_append_zero else sigmas + if return_idx: + return sigmas if not flip else torch.flip(sigmas, (0,)), timesteps + return sigmas if not flip else torch.flip(sigmas, (0,)) + + +def _schedule_latent_step( + *, + video_scheduler: FlowUniPCMultistepScheduler, + audio_scheduler: FlowUniPCMultistepScheduler, + latent_video: torch.Tensor, + latent_audio: torch.Tensor, + t, + idx: int, + steps, + v_cfg_video: torch.Tensor, + v_cfg_audio: torch.Tensor, + is_a2v: bool, + cfg_number: int, + using_sde_flag: bool, + 
use_sr_model: bool = False, +): + # Fast DDIM path for cfg_number==1, only used during the BR stage + if cfg_number == 1 and not use_sr_model: + latent_video = video_scheduler.step_ddim(v_cfg_video, idx, latent_video) + latent_audio = audio_scheduler.step_ddim(v_cfg_audio, idx, latent_audio) + return latent_video, latent_audio + + if using_sde_flag: + if use_sr_model: + # SR stage with SDE: only update video, keep audio unchanged + latent_video = video_scheduler.step(v_cfg_video, t, latent_video, return_dict=False)[0] + return latent_video, latent_audio + if idx < int(len(steps) * (3 / 4)): + noise_theta = 1.0 if (idx + 1) % 2 == 0 else 0.0 + else: + noise_theta = 1.0 if idx % 3 == 0 else 0.0 + latent_video = video_scheduler.step_sde(v_cfg_video, idx, latent_video, noise_theta=noise_theta) + if not is_a2v: + latent_audio = audio_scheduler.step_sde(v_cfg_audio, idx, latent_audio, noise_theta=noise_theta) + return latent_video, latent_audio + + latent_video = video_scheduler.step(v_cfg_video, t, latent_video, return_dict=False)[0] + # Do not update audio latent during the SR stage + if not is_a2v and not use_sr_model: + latent_audio = audio_scheduler.step(v_cfg_audio, t, latent_audio, return_dict=False)[0] + return latent_video, latent_audio + + +_NEGATIVE_PROMPT = ( + "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, " + "overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, " + "poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, " + "still picture, messy background, three legs, many people in the background, walking backwards" + ", low quality, worst quality, poor quality, noise, background noise, hiss, hum, buzz, crackle, static, " + "compression artifacts, MP3 artifacts, digital clipping, distortion, muffled, muddy, unclear, echo, " + "reverb, room echo, over-reverberated, hollow sound, distant, washed out, harsh, shrill, piercing, " + "grating, tinny, thin sound, boomy, bass-heavy, flat EQ, over-compressed, abrupt cut, jarring transition, " + "sudden silence, looping artifact, music, instrumental, sirens, alarms, crowd noise, unrelated sound " + "effects, chaotic, disorganized, messy, cheap sound" + ", emotionless, flat delivery, deadpan, lifeless, apathetic, robotic, mechanical, monotone, flat " + "intonation, undynamic, boring, reading from a script, AI voice, synthetic, text-to-speech, TTS, " + "insincere, fake emotion, exaggerated, overly dramatic, melodramatic, cheesy, cringey, hesitant, " + "unconfident, tired, weak voice, stuttering, stammering, mumbling, slurred speech, mispronounced, " + "bad articulation, lisp, vocal fry, creaky voice, mouth clicks, lip smacks, wet mouth sounds, heavy " + "breathing, audible inhales, plosives, p-pops, coughing, clearing throat, sneezing, speaking too fast, " + "rushed, speaking too slow, dragged out, unnatural pauses, awkward silence, choppy, disjointed, multiple " + "speakers, two voices, background talking, out of tune, off-key, autotune artifacts" +) + + +# =========================================================================== +# Pre/post process funcs (registered in registry) +# =========================================================================== +def get_magi_human_pre_process_func(*args, **kwargs): + def pre_process(request: OmniDiffusionRequest): + return request + + return pre_process + + +def get_magi_human_post_process_func(*args, **kwargs): + def post_process(output): + if 
isinstance(output, tuple) and len(output) == 2: + video, audio = output + return {"video": video, "audio": audio} + return output + + return post_process + + +# =========================================================================== +# HF Hub / local path helpers +# =========================================================================== + + +def _load_json(model_path: str, filename: str, local_files_only: bool = True) -> dict: + """Load a JSON config file from a local path or HuggingFace Hub repo.""" + if local_files_only: + path = os.path.join(model_path, *filename.split("/")) + with open(path) as f: + return json.load(f) + else: + from huggingface_hub import hf_hub_download + + cached = hf_hub_download(repo_id=model_path, filename=filename) + with open(cached) as f: + return json.load(f) + + +def _resolve_subdir( + model_path: str, + subfolder: str, + local_files_only: bool = True, + required_files: list[str] | None = None, +) -> str: + """Resolve a model subfolder to a local directory path. + + For HF Hub repos, downloads all ``required_files`` (default: ``["config.json"]``) + into the HF cache and returns the parent directory. + """ + if local_files_only: + return os.path.join(model_path, subfolder) + from huggingface_hub import hf_hub_download + + files = required_files or ["config.json"] + last_cached: str | None = None + for fname in files: + last_cached = hf_hub_download(repo_id=model_path, filename=f"{subfolder}/{fname}") + return os.path.dirname(last_cached) + + +# =========================================================================== +# Main Pipeline +# =========================================================================== +class MagiHumanPipeline(nn.Module, ProgressBarMixin, DiffusionPipelineProfilerMixin): + def __init__(self, od_config: OmniDiffusionConfig, **kwargs): + super().__init__() + model_path = od_config.model + local_files_only = os.path.exists(model_path) + device = f"cuda:{torch.cuda.current_device()}" + self.device_str = device + self.dtype = od_config.dtype or torch.bfloat16 + + model_index = _load_json(model_path, "model_index.json", local_files_only) + eval_cfg = model_index + dp_cfg = model_index.get("data_proxy", {}) + + dit_subfolder = "transformer" + + dit_json = _load_json(model_path, f"{dit_subfolder}/config.json", local_files_only) + dit_model_config = MagiHumanDiTConfig(**dit_json) + + self.dit = DiTModel(dit_model_config) + self.dit.eval() + + self.vae = DistributedAutoencoderKLWan.from_pretrained(model_path, subfolder="vae") + self.vae.to(device) + self.vae.eval() + vae_cfg = _load_json(model_path, "vae/config.json", local_files_only) + self.vae_latent_mean = torch.tensor(vae_cfg["latents_mean"], dtype=torch.float32) + self.vae_latent_std = torch.tensor(vae_cfg["latents_std"], dtype=torch.float32) + + self.audio_vae = SAAudioFeatureExtractor( + device=device, + model_path=_resolve_subdir( + model_path, + "audio_vae", + local_files_only, + required_files=["config.json", "model_config.json", "model.safetensors"], + ), + ) + + logger.info("Loading T5Gemma text encoder from %s (subfolder=text_encoder)", model_path) + if local_files_only: + txt_enc_path = os.path.join(model_path, "text_encoder") + txt_enc_subfolder = None + else: + txt_enc_path = model_path + txt_enc_subfolder = "text_encoder" + self.text_encoder = _T5GemmaEncoder( + model_path=txt_enc_path, + device=device, + weight_dtype=self.dtype, + subfolder=txt_enc_subfolder, + ) + + self.data_proxy = MagiDataProxy( + patch_size=dp_cfg.get("patch_size", 2), + 
t_patch_size=dp_cfg.get("t_patch_size", 1), + frame_receptive_field=dp_cfg.get("frame_receptive_field", 11), + spatial_rope_interpolation=dp_cfg.get("spatial_rope_interpolation", "extra"), + ref_audio_offset=dp_cfg.get("ref_audio_offset", 1000), + text_offset=dp_cfg.get("text_offset", 0), + coords_style=dp_cfg.get("coords_style", "v2"), + ) + # SR DataProxy forces v1 coordinate style (consistent with the original) + self.sr_data_proxy = MagiDataProxy( + patch_size=dp_cfg.get("patch_size", 2), + t_patch_size=dp_cfg.get("t_patch_size", 1), + frame_receptive_field=dp_cfg.get("frame_receptive_field", 11), + spatial_rope_interpolation=dp_cfg.get("spatial_rope_interpolation", "extra"), + ref_audio_offset=dp_cfg.get("ref_audio_offset", 1000), + text_offset=dp_cfg.get("text_offset", 0), + coords_style="v1", + ) + + self.fps = eval_cfg.get("fps", 25) + self.num_inference_steps_default = eval_cfg.get("num_inference_steps", 32) + self.video_txt_guidance_scale = eval_cfg.get("video_txt_guidance_scale", 5.0) + self.audio_txt_guidance_scale = eval_cfg.get("audio_txt_guidance_scale", 5.0) + self.shift = eval_cfg.get("shift", 5.0) + self.cfg_number = eval_cfg.get("cfg_number", 2) + self.use_cfg_trick = eval_cfg.get("use_cfg_trick", True) + self.cfg_trick_start_frame = eval_cfg.get("cfg_trick_start_frame", 13) + self.cfg_trick_value = eval_cfg.get("cfg_trick_value", 2.0) + self.using_sde_flag = eval_cfg.get("using_sde_flag", False) + self.t5_gemma_target_length = eval_cfg.get("t5_gemma_target_length", 640) + self.vae_stride = eval_cfg.get("vae_stride", [4, 16, 16]) + self.z_dim = eval_cfg.get("z_dim", 48) + self.patch_size = eval_cfg.get("patch_size", [1, 2, 2]) + # SR-specific hyperparameters + self.sr_num_inference_steps_default = eval_cfg.get("sr_num_inference_steps", 5) + self.sr_cfg_number = eval_cfg.get("sr_cfg_number", 2) + self.sr_video_txt_guidance_scale = eval_cfg.get("sr_video_txt_guidance_scale", 3.5) + self.noise_value = eval_cfg.get("noise_value", 220) + self.sr_audio_noise_scale = eval_cfg.get("sr_audio_noise_scale", 0.7) + # ZeroSNR sigma schedule for SR noise injection (flip=True, high to low) + self.zerosnr_sigmas = ZeroSNRDDPMDiscretization()(1000, do_append_zero=False, flip=True) + + self.context_null, self.original_context_null_len = _get_padded_t5_gemma_embedding( + _NEGATIVE_PROMPT, + self.text_encoder, + self.t5_gemma_target_length, + ) + self.video_processor = VideoProcessor(vae_scale_factor=16) + + # SR DiT model (loaded from the sr/ subdirectory) + sr_dit_subfolder = "sr" + sr_dit_json = _load_json(model_path, f"{sr_dit_subfolder}/config.json", local_files_only) + sr_dit_model_config = MagiHumanDiTConfig(**sr_dit_json) + self.sr_dit = DiTModel(sr_dit_model_config) + self.sr_dit.eval() + + self.weights_sources = [ + DiffusersPipelineLoader.ComponentSource( + model_or_path=model_path, + subfolder=dit_subfolder, + revision=None, + prefix="dit.", + fall_back_to_pt=True, + ), + DiffusersPipelineLoader.ComponentSource( + model_or_path=model_path, + subfolder=sr_dit_subfolder, + revision=None, + prefix="sr_dit.", + fall_back_to_pt=True, + ), + ] + if getattr(self.text_encoder, "is_tp", False): + self.weights_sources.append( + DiffusersPipelineLoader.ComponentSource( + model_or_path=model_path, + subfolder="text_encoder", + revision=None, + prefix="text_encoder.", + fall_back_to_pt=True, + ), + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + # Weight loading for MagiHuman DiT with TP support. 
+ # + # The checkpoint stores weights with these naming patterns: + # - attention.linear_qkv.weight: fused [Q, K, V, G] for shared layers, + # or stacked per-expert [expert0_Q|K|V|G, expert1_..., expert2_...] for MoE. + # - attention.linear_proj.weight: single for shared, stacked per-expert for MoE. + # - mlp.up_gate_proj.weight / mlp.down_proj.weight: similarly stacked for MoE. + # + # The model now uses per-expert vLLM parallel layers for MoE blocks: + # attention.linear_qkv.experts.{i}.weight (QKVParallelLinear per expert) + # attention.linear_gating.experts.{i}.weight (ColumnParallelLinear per expert) + # attention.linear_proj.experts.{i}.weight (RowParallelLinear per expert) + # mlp.up_gate_proj.experts.{i}.weight (ColumnParallelLinear per expert) + # mlp.down_proj.experts.{i}.weight (RowParallelLinear per expert) + # + # Shared layers keep the same naming (no .experts.). + params_dict = dict(self.named_parameters()) + modules_dict = dict(self.named_modules()) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + # ── Text Encoder weights ── + if name.startswith("text_encoder."): + if getattr(self.text_encoder, "is_tp", False): + # Strip "text_encoder." prefix for the T5Gemma TP model + # The T5GemmaEncoderModelTP load_weights handles the "encoder." prefix itself + sub_name = name[len("text_encoder.") :] + loaded_params.update( + f"text_encoder.{k}" for k in self.text_encoder.model.load_weights([(sub_name, loaded_weight)]) + ) + else: + loaded_params.add(name) + continue + + # ── Shared attention QKV + Gating split ── + # Checkpoint: attention.linear_qkv.weight = [Q, K, V, G] fused. + # Model: attention.linear_qkv.weight (QKVParallelLinear) + attention.linear_gating.weight. + if "attention.linear_qkv.weight" in name: + gating_name = name.replace("attention.linear_qkv.weight", "attention.linear_gating.weight") + # Check if this is a shared layer (direct param exists, no .experts.) + if name in params_dict and gating_name in params_dict: + qkv_param = params_dict[name] + gating_param = params_dict[gating_name] + + mod_path = name[: -len(".weight")] + qkv_mod = modules_dict.get(mod_path) + if qkv_mod is not None and hasattr(qkv_mod, "total_num_heads"): + total_heads_q = qkv_mod.total_num_heads + total_heads_kv = qkv_mod.total_num_kv_heads + head_dim = qkv_mod.head_size + else: + head_dim = 128 + tp_size = get_tensor_model_parallel_world_size() + total_heads_q = gating_param.data.shape[0] * tp_size + total_heads_kv = (loaded_weight.shape[0] - total_heads_q * head_dim - total_heads_q) // ( + 2 * head_dim + ) + + q_size = total_heads_q * head_dim + kv_size = total_heads_kv * head_dim + + q_w = loaded_weight[:q_size] + k_w = loaded_weight[q_size : q_size + kv_size] + v_w = loaded_weight[q_size + kv_size : q_size + 2 * kv_size] + g_w = loaded_weight[q_size + 2 * kv_size :] + + qkv_loader = getattr(qkv_param, "weight_loader", default_weight_loader) + qkv_loader(qkv_param, q_w, "q") + qkv_loader(qkv_param, k_w, "k") + qkv_loader(qkv_param, v_w, "v") + + gating_loader = getattr(gating_param, "weight_loader", default_weight_loader) + gating_loader(gating_param, g_w) + + loaded_params.add(name) + loaded_params.add(gating_name) + continue + + # ── MoE attention QKV + Gating split ── + # Checkpoint: attention.linear_qkv.weight = stacked [expert0_QKVG, expert1_QKVG, ...]. + # Model: attention.linear_qkv.experts.{i}.weight (QKVParallelLinear per expert) + # + attention.linear_gating.experts.{i}.weight (ColumnParallelLinear per expert). 
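+            # The stacked checkpoint tensor is recovered below by chunking it into
+            # num_experts equal slices along dim 0, then splitting each expert chunk
+            # into Q/K/V (and gating, when present).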
+ expert0_name = name.replace("attention.linear_qkv.weight", "attention.linear_qkv.experts.0.weight") + if expert0_name in params_dict: + # Determine num_experts by checking which expert indices exist. + moe_qkv_mod_path = name[: -len(".weight")] + moe_qkv_mod = modules_dict.get(moe_qkv_mod_path) + num_experts = moe_qkv_mod.num_experts if moe_qkv_mod is not None else 3 + + # Get head info from the first expert's QKVParallelLinear. + expert0_mod_path = name.replace("attention.linear_qkv.weight", "attention.linear_qkv.experts.0") + expert0_mod = modules_dict.get(expert0_mod_path) + if expert0_mod is not None and hasattr(expert0_mod, "total_num_heads"): + total_heads_q = expert0_mod.total_num_heads + total_heads_kv = expert0_mod.total_num_kv_heads + head_dim = expert0_mod.head_size + else: + head_dim = 128 + # Infer from checkpoint weight shape. + # We'll get exact sizes from model config below. + total_heads_q = 40 # fallback for default config + total_heads_kv = 8 + + q_size = total_heads_q * head_dim + kv_size = total_heads_kv * head_dim + # Check if gating is present. + gating_expert0_name = name.replace( + "attention.linear_qkv.weight", "attention.linear_gating.experts.0.weight" + ) + has_gating = gating_expert0_name in params_dict + + # Split stacked checkpoint weight into per-expert chunks. + expert_weights = loaded_weight.chunk(num_experts, dim=0) + + for i in range(num_experts): + expert_w = expert_weights[i] + # Each expert chunk: [Q, K, V, G (optional)]. + q_w = expert_w[:q_size] + k_w = expert_w[q_size : q_size + kv_size] + v_w = expert_w[q_size + kv_size : q_size + 2 * kv_size] + + expert_param_name = name.replace( + "attention.linear_qkv.weight", + f"attention.linear_qkv.experts.{i}.weight", + ) + expert_param = params_dict[expert_param_name] + expert_loader = getattr(expert_param, "weight_loader", default_weight_loader) + expert_loader(expert_param, q_w, "q") + expert_loader(expert_param, k_w, "k") + expert_loader(expert_param, v_w, "v") + loaded_params.add(expert_param_name) + + if has_gating: + g_w = expert_w[q_size + 2 * kv_size :] + gating_param_name = name.replace( + "attention.linear_qkv.weight", + f"attention.linear_gating.experts.{i}.weight", + ) + gating_param = params_dict[gating_param_name] + gating_loader = getattr(gating_param, "weight_loader", default_weight_loader) + gating_loader(gating_param, g_w) + loaded_params.add(gating_param_name) + continue + + # ── MoE stacked weight splitting for proj / MLP layers ── + # Checkpoint: x.y.weight (stacked [expert0, expert1, ...]). + # Model: x.y.experts.{i}.weight. + if name not in params_dict: + # Check if this is a stacked MoE weight by looking for .experts.0. + base, _, suffix = name.rpartition(".") + expert0_name = f"{base}.experts.0.{suffix}" if base else None + if expert0_name and expert0_name in params_dict: + # Determine num_experts. + moe_mod = modules_dict.get(base) + num_experts = getattr(moe_mod, "num_experts", 3) if moe_mod is not None else 3 + + # Split stacked weight into per-expert chunks. + expert_weights = loaded_weight.chunk(num_experts, dim=0) + for i in range(num_experts): + expert_param_name = f"{base}.experts.{i}.{suffix}" + if expert_param_name not in params_dict: + continue + expert_param = params_dict[expert_param_name] + expert_loader = getattr(expert_param, "weight_loader", default_weight_loader) + expert_loader(expert_param, expert_weights[i]) + loaded_params.add(expert_param_name) + continue + # Truly unknown weight — skip. 
+ continue + + # ── Standard weight loading (shared layers + non-MoE params) ── + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + if getattr(self.text_encoder, "is_tp", False): + self.context_null, self.original_context_null_len = _get_padded_t5_gemma_embedding( + _NEGATIVE_PROMPT, + self.text_encoder, + self.t5_gemma_target_length, + ) + + return loaded_params + + def _dit_forward(self, eval_input: EvalInput) -> tuple[torch.Tensor, torch.Tensor]: + packed = self.data_proxy.process_input(eval_input) + noise_pred = self.dit(*packed) + return self.data_proxy.process_output(noise_pred) + + def _sr_dit_forward(self, eval_input: EvalInput) -> tuple[torch.Tensor, torch.Tensor]: + """SR stage uses sr_data_proxy (coords_style=v1) and sr_dit model.""" + packed = self.sr_data_proxy.process_input(eval_input) + noise_pred = self.sr_dit(*packed) + return self.sr_data_proxy.process_output(noise_pred) + + @torch.inference_mode() + def _evaluate_with_latent( + self, + context: torch.Tensor, + original_context_len: int, + latent_image: torch.Tensor | None, + latent_video: torch.Tensor, + latent_audio: torch.Tensor, + num_inference_steps: int, + is_a2v: bool = False, + use_sr_model: bool = False, + ) -> tuple[torch.Tensor, torch.Tensor]: + # Select cfg_number and guidance_scale based on BR/SR stage + cfg_number = self.sr_cfg_number if use_sr_model else self.cfg_number + video_guidance = self.sr_video_txt_guidance_scale if use_sr_model else self.video_txt_guidance_scale + forward_fn = self._sr_dit_forward if use_sr_model else self._dit_forward + + video_scheduler = FlowUniPCMultistepScheduler() + audio_scheduler = FlowUniPCMultistepScheduler() + video_scheduler.set_timesteps(num_inference_steps, device=self.device_str, shift=self.shift) + audio_scheduler.set_timesteps(num_inference_steps, device=self.device_str, shift=self.shift) + timesteps = video_scheduler.timesteps + + latent_length = latent_video.shape[2] + cfg_trick_guidance = ( + torch.tensor(video_guidance, device=self.device_str).expand(1, 1, latent_length, 1, 1).clone() + ) + if self.use_cfg_trick: + cfg_trick_guidance[:, :, : self.cfg_trick_start_frame] = min(self.cfg_trick_value, video_guidance) + + with self.progress_bar(total=len(timesteps)) as pbar: + for idx, t in enumerate(timesteps): + if latent_image is not None: + latent_video[:, :, :1] = latent_image[:, :, :1] + + # Reduce guidance when t<=500 during BR stage (original behavior) + cur_video_guidance = video_guidance if (use_sr_model or t > 500) else 2.0 + + eval_input_cond = EvalInput( + x_t=latent_video, + audio_x_t=latent_audio, + audio_feat_len=[latent_audio.shape[1]], + txt_feat=context, + txt_feat_len=[original_context_len], + ) + + v_cond_video, v_cond_audio = forward_fn(eval_input_cond) + + if cfg_number == 1: + v_cfg_video = v_cond_video + v_cfg_audio = v_cond_audio + elif cfg_number == 2: + eval_input_uncond = EvalInput( + x_t=latent_video, + audio_x_t=latent_audio, + audio_feat_len=[latent_audio.shape[1]], + txt_feat=self.context_null, + txt_feat_len=[self.original_context_null_len], + ) + v_uncond_video, v_uncond_audio = forward_fn(eval_input_uncond) + v_cfg_video = v_uncond_video + cur_video_guidance * (v_cond_video - v_uncond_video) + v_cfg_audio = v_uncond_audio + self.audio_txt_guidance_scale * (v_cond_audio - v_uncond_audio) + else: + raise ValueError(f"Invalid cfg_number: {cfg_number}") + + latent_video, latent_audio = _schedule_latent_step( + 
video_scheduler=video_scheduler, + audio_scheduler=audio_scheduler, + latent_video=latent_video, + latent_audio=latent_audio, + t=t, + idx=idx, + steps=timesteps, + v_cfg_video=v_cfg_video, + v_cfg_audio=v_cfg_audio, + is_a2v=is_a2v, + cfg_number=cfg_number, + using_sde_flag=self.using_sde_flag, + use_sr_model=use_sr_model, + ) + + pbar.update() + + if latent_image is not None: + latent_video[:, :, :1] = latent_image[:, :, :1] + return latent_video, latent_audio + + def _encode_image(self, image: Image.Image, height: int, width: int) -> torch.Tensor: + image = load_image(image) + image = _resizecrop(image, height, width) + image = self.video_processor.preprocess(image, height=height, width=width) + image = image.to(device=self.device_str, dtype=self.dtype).unsqueeze(2) + vae_out = self.vae.encode(image) + if hasattr(vae_out, "latent_dist"): + return vae_out.latent_dist.mode().to(torch.float32) + return vae_out.to(torch.float32) + + def _decode_video(self, latent: torch.Tensor) -> list[np.ndarray]: + mean = self.vae_latent_mean.to(latent.device, dtype=latent.dtype).view(1, -1, 1, 1, 1) + std = self.vae_latent_std.to(latent.device, dtype=latent.dtype).view(1, -1, 1, 1, 1) + latent = latent * std + mean + + videos = self.vae.decode(latent.to(self.dtype)) + if hasattr(videos, "sample"): + videos = videos.sample + videos.mul_(0.5).add_(0.5).clamp_(0, 1) + videos = [v.float().cpu().permute(1, 2, 3, 0) * 255 for v in videos] + return [v.numpy().astype(np.uint8) for v in videos] + + def _decode_audio(self, latent_audio: torch.Tensor) -> np.ndarray: + latent_audio = latent_audio.squeeze(0).to(self.dtype) + audio_output = self.audio_vae.decode(latent_audio.T) + audio_np = audio_output.squeeze(0).T.float().cpu().numpy() + target_len = int(audio_np.shape[0] * 441 / 512) + from scipy.signal import resample + + return resample(audio_np, target_len) + + @torch.inference_mode() + def forward( + self, + req: OmniDiffusionRequest, + prompt: str | None = None, + height: int = 256, + width: int = 448, + num_inference_steps: int | None = None, + seconds: int = 10, + seed: int | None = None, + image_path: str | None = None, + audio_path: str | None = None, + **kwargs, + ) -> DiffusionOutput: + if len(req.prompts) >= 1: + p = req.prompts[0] + prompt = p if isinstance(p, str) else p.get("prompt", prompt) + if not isinstance(p, str): + image_path = p.get("image_path", image_path) + audio_path = p.get("audio_path", audio_path) + if prompt is None: + raise ValueError("prompt is required") + + height = req.sampling_params.height or height + width = req.sampling_params.width or width + seed = req.sampling_params.seed if req.sampling_params.seed is not None else seed + num_steps = req.sampling_params.num_inference_steps or num_inference_steps or self.num_inference_steps_default + sr_height: int | None = None + sr_width: int | None = None + sr_num_steps: int | None = None + if hasattr(req.sampling_params, "extra_args") and req.sampling_params.extra_args: + seconds = req.sampling_params.extra_args.get("seconds", seconds) + audio_path = req.sampling_params.extra_args.get("audio_path", audio_path) + image_path = req.sampling_params.extra_args.get("image_path", image_path) + sr_height = req.sampling_params.extra_args.get("sr_height", None) + sr_width = req.sampling_params.extra_args.get("sr_width", None) + sr_num_steps = req.sampling_params.extra_args.get("sr_num_inference_steps", None) + + device = self.device_str + + br_latent_height = height // self.vae_stride[1] // self.patch_size[1] * self.patch_size[1] + 
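+        # Both BR latent dimensions are snapped down to a multiple of the spatial patch
+        # size after VAE downsampling, so the DiT patchify step divides them evenly.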
br_latent_width = width // self.vae_stride[2] // self.patch_size[2] * self.patch_size[2] + br_height = br_latent_height * self.vae_stride[1] + br_width = br_latent_width * self.vae_stride[2] + + if seed is not None: + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + if audio_path is not None: + latent_audio = load_audio_and_encode(self.audio_vae, audio_path, seconds) + latent_audio = latent_audio.permute(0, 2, 1) + num_frames = latent_audio.shape[1] + is_a2v = True + else: + num_frames = seconds * self.fps + 1 + latent_audio = torch.randn(1, num_frames, 64, dtype=torch.float32, device=device) + is_a2v = False + + latent_length = (num_frames - 1) // 4 + 1 + latent_video = torch.randn( + 1, + self.z_dim, + latent_length, + br_latent_height, + br_latent_width, + dtype=torch.float32, + device=device, + ) + + context, original_context_len = _get_padded_t5_gemma_embedding( + prompt, + self.text_encoder, + self.t5_gemma_target_length, + ) + + if image_path is not None: + br_image = self._encode_image(load_image(image_path), br_height, br_width) + else: + br_image = None + + # ── BR stage ───────────────────────────────────────────────────────── + br_latent_video, br_latent_audio = self._evaluate_with_latent( + context, + original_context_len, + br_image, + latent_video.clone(), + latent_audio.clone(), + num_steps, + is_a2v, + use_sr_model=False, + ) + + # ── SR stage (optional, triggered when sr_height/sr_width are provided) ── + if sr_height is not None and sr_width is not None: + sr_latent_height = sr_height // self.vae_stride[1] // self.patch_size[1] * self.patch_size[1] + sr_latent_width = sr_width // self.vae_stride[2] // self.patch_size[2] * self.patch_size[2] + sr_height = sr_latent_height * self.vae_stride[1] + sr_width = sr_latent_width * self.vae_stride[2] + + # Image condition (at SR resolution) + if image_path is not None: + sr_image = self._encode_image(load_image(image_path), sr_height, sr_width) + else: + sr_image = None + + # Trilinear interpolation of BR latent to SR resolution + sr_latent_video = torch.nn.functional.interpolate( + br_latent_video, + size=(latent_length, sr_latent_height, sr_latent_width), + mode="trilinear", + align_corners=True, + ) + + # Noise injection: sigma-weighted blend (noise_value indexes the ZeroSNR sigma schedule) + if self.noise_value != 0: + noise = torch.randn_like(sr_latent_video) + sigma = self.zerosnr_sigmas.to(sr_latent_video.device)[self.noise_value] + sr_latent_video = sr_latent_video * sigma + noise * (1 - sigma**2) ** 0.5 + + # Audio: blend with noise (noised version used during SR inference; final audio keeps BR result) + sr_latent_audio = torch.randn_like(br_latent_audio) * self.sr_audio_noise_scale + br_latent_audio * ( + 1 - self.sr_audio_noise_scale + ) + + torch.cuda.empty_cache() + sr_steps = sr_num_steps or self.sr_num_inference_steps_default + final_latent_video, _ = self._evaluate_with_latent( + context, + original_context_len, + sr_image, + sr_latent_video.clone(), + sr_latent_audio.clone(), + sr_steps, + is_a2v, + use_sr_model=True, + ) + # SR stage does not update audio; keep the BR result + final_latent_video = final_latent_video + final_latent_audio = br_latent_audio + else: + final_latent_video = br_latent_video + final_latent_audio = br_latent_audio + + torch.cuda.empty_cache() + videos_np = self._decode_video(final_latent_video) + torch.cuda.empty_cache() + audio_np = self._decode_audio(final_latent_audio) + + return DiffusionOutput(output=(videos_np, audio_np)) diff --git 
a/vllm_omni/diffusion/models/t5_encoder/t5_gemma_encoder.py b/vllm_omni/diffusion/models/t5_encoder/t5_gemma_encoder.py new file mode 100644 index 0000000000..eca4267fa2 --- /dev/null +++ b/vllm_omni/diffusion/models/t5_encoder/t5_gemma_encoder.py @@ -0,0 +1,309 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + +from collections.abc import Iterable + +import torch +import torch.nn as nn +from transformers import PretrainedConfig +from vllm.config import VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import ( + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.model_loader.weight_utils import default_weight_loader + + +class T5GemmaRMSNorm(nn.Module): + def __init__(self, hidden_size: int, eps: float = 1e-6): + super().__init__() + # Normal RMSNorm but T5Gemma requires (1 + weight) + self.weight = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return (hidden_states * (1.0 + self.weight.float())).to(input_dtype) + + +class T5GemmaMLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + input_size=hidden_size, + output_sizes=[intermediate_size, intermediate_size], + bias=False, + gather_output=False, + ) + self.down_proj = RowParallelLinear( + input_size=intermediate_size, + output_size=hidden_size, + bias=False, + input_is_parallel=True, + ) + self.act_fn = get_act_fn(hidden_act) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + gate_up, _ = self.gate_up_proj(x) + gate, up = gate_up.chunk(2, dim=-1) + x = self.act_fn(gate) * up + x, _ = self.down_proj(x) + return x + + +class T5GemmaAttention(nn.Module): + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: int, + max_position_embeddings: int, + rope_theta: float, + cache_config: VllmConfig | None = None, + quant_config: dict | None = None, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + assert self.total_num_kv_heads % tp_size == 0 + else: + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + + self.qkv_proj = QKVParallelLinear( + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=False, + ) + self.o_proj = RowParallelLinear( + input_size=self.total_num_heads * 
self.head_dim, + output_size=hidden_size, + bias=False, + input_is_parallel=True, + ) + + self.rotary_emb = get_rope( + self.head_dim, + max_position=max_position_embeddings, + is_neox_style=True, + rope_parameters={"base": rope_theta}, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor | None = None, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + q, k = self.rotary_emb(positions, q, k) + + # Scale Q appropriately. T5Gemma uses query_pre_attn_scalar=256 => 256**-0.5 = 1/16 + # The standard scaling is head_dim**-0.5. For T5Gemma, head_dim=256. + # So we don't need to manually scale if F.scaled_dot_product_attention scales by head_dim. + # But we must reshape. + batch_size, seq_len, _ = hidden_states.shape + q = q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2) + k = k.view(batch_size, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2) + v = v.view(batch_size, seq_len, self.num_kv_heads, self.head_dim).transpose(1, 2) + + # GQA repeat KV + if self.num_kv_heads != self.num_heads: + num_repeat = self.num_heads // self.num_kv_heads + k = k.repeat_interleave(num_repeat, dim=1) + v = v.repeat_interleave(num_repeat, dim=1) + + attn_output = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attention_mask, dropout_p=0.0) + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, seq_len, self.q_size) + + output, _ = self.o_proj(attn_output) + return output + + +class T5GemmaEncoderLayer(nn.Module): + def __init__(self, config: PretrainedConfig) -> None: + super().__init__() + self.self_attn = T5GemmaAttention( + hidden_size=config.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + head_dim=config.head_dim, + max_position_embeddings=config.max_position_embeddings, + rope_theta=config.rope_theta, + ) + self.mlp = T5GemmaMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_activation, + ) + self.pre_self_attn_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_self_attn_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.pre_feedforward_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_feedforward_layernorm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor | None, + ) -> torch.Tensor: + # Self Attention + residual = hidden_states + hidden_states = self.pre_self_attn_layernorm(hidden_states) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + attention_mask=attention_mask, + ) + hidden_states = self.post_self_attn_layernorm(hidden_states) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.pre_feedforward_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_feedforward_layernorm(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +class T5GemmaEncoderModelTP(nn.Module): + def __init__(self, config: PretrainedConfig) -> None: + super().__init__() + self.config = config + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + 
config.vocab_size, + config.hidden_size, + ) + + self.layers = nn.ModuleList([T5GemmaEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + @property + def dtype(self) -> torch.dtype: + return next(self.parameters()).dtype + + @property + def device(self) -> torch.device: + return next(self.parameters()).device + + def forward( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor | None = None, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + + # Scaling inputs + normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype, device=hidden_states.device) + hidden_states = hidden_states * normalizer + + # Simple position ids for RoPE + batch_size, seq_len = input_ids.shape + positions = torch.arange(seq_len, device=input_ids.device, dtype=torch.long).unsqueeze(0).expand(batch_size, -1) + + # Build attention mask: (batch, seq) -> (batch, 1, 1, seq) + # Assuming typical bidirectional causal mask handling in HF: T5Gemma uses non-causal encoder. + if attention_mask is not None: + # HuggingFace expects boolean mask for scaled_dot_product_attention + # or additive mask (0 and -inf). Let's use boolean matching FA patterns. + # SDPA expects attention_mask to be boolean (True = keep, False = masking) + bool_mask = attention_mask.to(torch.bool) + extended_mask = bool_mask.unsqueeze(1).unsqueeze(2) # (B, 1, 1, S) + else: + extended_mask = None + + for idx, layer in enumerate(self.layers): + # T5Gemma has layer_types switching between "sliding_attention" and "full_attention" + # However, for text encoder inference, the sequences are typically < max sequence length + # and local sliding window only affects very long contexts. For simplicity we use full. + hidden_states = layer( + positions=positions, + hidden_states=hidden_states, + attention_mask=extended_mask, + ) + + hidden_states = self.norm(hidden_states) + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + # HF checkpoint keys may carry a "model." prefix (e.g. + # "model.encoder.layers.0..."). Strip it so the rest of the + # logic only needs to handle the "encoder.*" namespace. + if name.startswith("model."): + name = name[len("model.") :] + + if not name.startswith("encoder."): + continue + + # Strip "encoder." prefix as this model only wraps the encoder + name = name[len("encoder.") :] + + # Map self_attn to self_attn and correct normalization names + # HF: layers.0.pre_self_attn_layernorm.weight -> Ours: layers.0.pre_self_attn_layernorm.weight + + lookup_name = name + for param_name, weight_name, shard_id in stacked_params_mapping: + if f".{weight_name}." 
not in name: + continue + lookup_name = name.replace(f".{weight_name}.", f".{param_name}.") + if lookup_name not in params_dict: + continue + param = params_dict[lookup_name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + + loaded_params.add("encoder." + name) + loaded_params.add("encoder." + lookup_name) + + return loaded_params diff --git a/vllm_omni/diffusion/offloader/module_collector.py b/vllm_omni/diffusion/offloader/module_collector.py index 307ca53a88..d9d21b939a 100644 --- a/vllm_omni/diffusion/offloader/module_collector.py +++ b/vllm_omni/diffusion/offloader/module_collector.py @@ -21,9 +21,9 @@ class PipelineModules: class ModuleDiscovery: """Discovers pipeline components for offloading""" - DIT_ATTRS = ["transformer", "transformer_2", "dit", "language_model", "transformer_blocks"] + DIT_ATTRS = ["transformer", "transformer_2", "dit", "sr_dit", "language_model", "transformer_blocks"] ENCODER_ATTRS = ["text_encoder", "text_encoder_2", "text_encoder_3", "image_encoder"] - VAE_ATTRS = ["vae"] + VAE_ATTRS = ["vae", "audio_vae"] @staticmethod def discover(pipeline: nn.Module) -> PipelineModules: diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index c1f48137e1..97bc7fa292 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -173,6 +173,11 @@ "pipeline_hunyuan_video_1_5_i2v", "HunyuanVideo15I2VPipeline", ), + "MagiHumanPipeline": ( + "magi_human", + "pipeline_magi_human", + "MagiHumanPipeline", + ), "OmniVoicePipeline": ( "omnivoice", "pipeline_omnivoice", @@ -368,6 +373,7 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) - "Flux2Pipeline": "get_flux2_post_process_func", "HunyuanVideo15Pipeline": "get_hunyuan_video_15_post_process_func", "HunyuanVideo15ImageToVideoPipeline": "get_hunyuan_video_15_i2v_post_process_func", + "MagiHumanPipeline": "get_magi_human_post_process_func", "OmniVoicePipeline": "get_omnivoice_post_process_func", } @@ -387,6 +393,7 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) - "HeliosPipeline": "get_helios_pre_process_func", "HeliosPyramidPipeline": "get_helios_pre_process_func", "HunyuanVideo15ImageToVideoPipeline": "get_hunyuan_video_15_i2v_pre_process_func", + "MagiHumanPipeline": "get_magi_human_pre_process_func", } diff --git a/vllm_omni/diffusion/utils/media_utils.py b/vllm_omni/diffusion/utils/media_utils.py new file mode 100644 index 0000000000..ee1f8116f0 --- /dev/null +++ b/vllm_omni/diffusion/utils/media_utils.py @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Video/audio muxing utilities using PyAV (no ffmpeg binary dependency).""" + +from __future__ import annotations + +import io +from fractions import Fraction + +import av +import numpy as np + + +def mux_video_audio_bytes( + video_frames: np.ndarray, + audio_waveform: np.ndarray | None = None, + *, + fps: float = 25.0, + audio_sample_rate: int = 44100, + video_codec: str = "h264", + audio_codec: str = "aac", + crf: str = "18", +) -> bytes: + """Mux video frames and optional audio waveform into MP4 bytes. + + Args: + video_frames: uint8 array of shape ``(T, H, W, 3)`` (RGB). + audio_waveform: float32 array – mono ``(N,)`` or ``(N, C)`` / ``(C, N)``. 
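+            Two-dimensional input is assumed channel-first ``(C, N)``; if the first
+            dimension is larger, it is treated as ``(N, C)`` and transposed.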
+ fps: Video frame rate. + audio_sample_rate: Audio sample rate in Hz. + video_codec: Video codec name. + audio_codec: Audio codec name. + crf: Constant rate factor for the video encoder. + + Returns: + Raw MP4 bytes ready to be written to disk or streamed. + """ + buf = io.BytesIO() + container = av.open(buf, mode="w", format="mp4") + + v_stream = container.add_stream(video_codec, rate=Fraction(fps).limit_denominator(10000)) + v_stream.width = video_frames.shape[2] + v_stream.height = video_frames.shape[1] + v_stream.pix_fmt = "yuv420p" + v_stream.options = {"crf": crf} + + a_stream = None + if audio_waveform is not None: + samples = audio_waveform.astype(np.float32) + if samples.ndim == 1: + samples = samples.reshape(1, -1) + elif samples.ndim == 2 and samples.shape[0] > samples.shape[1]: + samples = samples.T + num_channels = samples.shape[0] + layout = "stereo" if num_channels >= 2 else "mono" + a_stream = container.add_stream(audio_codec, rate=audio_sample_rate) + a_stream.layout = layout + + for frame_data in video_frames: + frame = av.VideoFrame.from_ndarray(frame_data, format="rgb24") + for packet in v_stream.encode(frame): + container.mux(packet) + for packet in v_stream.encode(): + container.mux(packet) + + if a_stream is not None and audio_waveform is not None: + audio_frame = av.AudioFrame.from_ndarray(samples, format="fltp", layout=layout) + audio_frame.sample_rate = audio_sample_rate + for packet in a_stream.encode(audio_frame): + container.mux(packet) + for packet in a_stream.encode(): + container.mux(packet) + + container.close() + return buf.getvalue() From f2227d3c9aa7d76c2dd271ed7e1ab888e4588cc8 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Sun, 5 Apr 2026 00:21:20 +0800 Subject: [PATCH 045/204] [Docs] Update WeChat QR code for community support (#2481) Signed-off-by: david6666666 Co-authored-by: david6666666 --- docs/assets/WeChat.jpg | Bin 100428 -> 98759 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/docs/assets/WeChat.jpg b/docs/assets/WeChat.jpg index 28956a12099dfaed166e958f24218ed201858cbd..c32ece6c102f7cb76d06e6eb1194af59dcd30488 100644 GIT binary patch literal 98759 zcmc$`cQ}^+|2IyAk}_I0B~r?km6b$UrFR*jWbcfSRiUI%$QDvYg~;AfBqOrI6)Ias z3E98L>GS^F_xC=I-~IR9aa32j#(ACR>-Bs-*7Lllt$B)WE6Y|gGBP?fRpoPJWEAxH z%|b(kpX@eCio;*pX1Z$T8X9DW@R){-lAMf;5|8i~{viL~kMCOjug7xajQ{H~@iaU! 
z0OL9LK3v{DYwL?o;<+$#n~SKYU-&)zT}}~Peyb<8X%dJkzX2A7c{D`AUVQ*xpkYyV zH_6!bB|s3pYRssXA-)1%Jmx}Qtlh47GYI1L-)Xe^tJt55?fiZkCV2#eFns$XL|H(T zA5I+1W7YwjY^n+Qk;;VIn)o4snI4Q+ou>d}{2pahgh3-feKI|#@|xulY5M^?w-&rgJ@M1;dPQ15>Fn+j=M|A8q3a?4}| zSEEr{p^;hIy96mGF8blejB+u|2j6H6tC+jSo{*4bjYe+cRORr6g?HfN(}yIPAbXT* z{{ZLx9O15f{QOZ~eZZmKs4QT2JBL6qY1T(1H7VDlZ^g{{*t%{DXJ==6c2Mw}W_2gn z#f%rllJAoZQ%kcK<_9M@^RM*aHPCv;@_7;vFwNS1z6SpctE}`y~F@`uL zRXXf6Nn}_ZsJpVdDNy{s!R66Wv`LWXEWD`C@UjHH^V|xnMkp#Md17!=^cYd|A)-2R zvCKaS2pzICdF)CLDmh0y9T$d%Q4{o|jWEvY4OtG7*m;6Jb_2P^B!q2N9JMDr1S&U! zaNVcy6G!rjN-l=(YAiM(?Ye3`W68a|K>Q9F)esqo(nWyi3Mo}EPNA-H{t)Hlh-Q&wv#$ReX@u|( zC3YlPV-nX_@TR5?PC;6Rm2O>!(8`s66)K!zPi!u{jyzOfA0LsuAK?(F3EJW#Q-fXq$bAhRa(nOmHoNEFR_TX6>5!A!@NKU782_d#9GIF zkFE-0J7c&4zQ#6Y8p>bX6EMFnSgVTabUj2*48hvd9W+} zHAOm%uytj%z|ozmNiq8#LJdHKVZ+4kQyMccQIBB>98Q&F)Ok+|hW-i~rHx7O(k^}r zHiR{ypf=l>&Y6MLZZGHBj1hw?=19!H~c>MfXAn&9iUWXOvvLFw|i7o_+N;I z{faT$hK}#uR8L-By$>@t5OjAPRDP4=SOWy0y0}?|-HS}hPN3pLXw9%Rv$ag{a^2!x z7#f(Kjt4Y1o4GHXgCts5S~|B&E5}4fA3w!73NPWU1hh%!Ha5A4YqwD`Gg!ccl+DHo zbjkhbzBt&P$H=ex3~riXJxBM3W;SpVpNo6klr~xbeIRg>TR5{??FptK@Hmd}ZkvL_ zx@IE>(eh4jpoEZQghI(tOKoVche`nNh=;qhl{zgs`LT^UjYg9^tS)IELcf6GgfVMb zyviwa;H?4BUo>kIB}Mu9iiN!>pNnWImQ|_baOGV)ll5yi2)gt(7#tBvRt-_TY}PDP ztYrQ3NZ5dh=!s`@ulZ1cnbmHHyzStodt@(W9IEWvb9NY@#Pel$OLp zxu#IEuxM|Bi$Dtx!Ior~Qeu@ly5aWp2gVcS>r{p5BLg z(c5TiAe_&9C3qRUr>tuPejIrka44$l}t+jV~`P!w46gX>gerV zo@Ik**lHp?B8}lV8Q*f}BO75D$b}U@{2eW;6Z`5-DhcXGvmdAdlmf{|=0FaS^Kj!f z!0azHsS^*T$oLi_8U~Pd+wILM});M<>?z@|v0t zY^GRRyZX28TPrnccvAv+HgbK+FWb({=ILr&W&OeK)P@UdsR|A zQB(=MQO20PikkJHVXwCU2ZzL{FqzywTufZI6$*OXhL}LyCgPt^IfG2hQ;ab!*{_8n z<6iClM=K^98$tPxSS(H&l9JE0Pn|G)QJ-rc$B2J0Edg%&ce-*&-@go!Lo6p@C%YJt`C#8PzEbdn839J2O$4Wp8C~ zAw2J|uKRcYe$VUq51#Awy6=i}e8=Z=9Pf2}{nbdPw{BwEL_tBZRY~!r1_cEbEB+@( zvjIQ3)pLUxf2dv6Rl0WO3_PD8Pdl7eC#{)RvBf0Y0I`(2a&`MV4y!~gu9co|+8 zigyx!uKf%DmvQENE%9#~jnncJuWFfp;UDX4NV|jX zj^=s63E`tAb?Wq5T$}i50z@{=>1Ui=?=SMWG{9fosNe|G_A`R(j~-!gp$yYCyqQs& zc;n{ojN>P{l{J$zVn2?#tyZSUp4;_a_;dTC^R@OuH23e{=aQ8v z_IDQ&7WT>Rbl;1gb4x|^`fbw}k(Ff|YkNEiGNt(xP`gz1p=aBszNQ z0i9IO@q(qG{%ZD(>!_(u+_6%P7bzjm!8Sv$q@>+=M>W_@{i`NXe;81H-c?%_)%`}(TF)O2*_{#XR{ zdeXg^?RjJ?dh?yKuvLccuJt$!ODt~jg9TyRw~cmza15om6N9y1qP`p zy2i$~9lG$&_-U9faY`k(H&alEy!%pD$KX6SQYD>mmzJ$$cDUKhb(NBW>F5oLXh}Q6 zr(w+PIaXA4b#=*UU+yixIrBvHYo3$MfW)OdM|x{(>*PzT2A|W7DuTYgzH(wf0&hwF zbWS(@Q8>F)e(n2%H4gsxUw`}{Pinc#NkVuwlW>Pyz1>6lA3yu zfq_Auf`a#0O#Imxl7^<{c{^_3qkk5M(@XIQxgM|dvH~7PMCkZWj~A|n78M;Yv0~OH zkF|dYJ8%Q+(9JhQ)^ zjMJ$6{75)yDKtAklq7KO_SBcr29!!x)ep*=l@@e++Xrl$OlWZZK5D)xwH#0-3iIs4|#o2Yp{ z9;ydLRU)G zbyzvi{FFOvQ1Cg{JR>8+xX6<-&t?-14g1E88&$0tR7_0x7P@?B9kbJ%rU&|pyr@*I zmCv4Kv_0PV`~PV$8Pmeh@o0Rx&>?`)6R<)M2>RD&f`tHy;`C856 zQ%|LxO1KgX+!t<(OG!=F{_Lj=9VURw*5Ha&qz>K4w3n%Og=%UcPBe<=(DChM{51Y3JW6SQbjV{}B%( zceu@8DqjB8k?*2vjnx<2C0S)vc>er(r6AU0pGF)hc^62L0tVM4ckf!Z&%Ju}>TBCy z8W|t=t=G*m-PzaIrx?9KUd!+Et&N9{vQ3pZl=rI|3^}4QwZ)Qqjq9=Yq zyy>g=`pr>gyWF}m+?MP!PV8I1fW`G3S8wPjod&and%Jwhm3;cTS8v`hxUVc+PpUu8 z(2`+9{r39{T4`k!m9lOAJAC`+|DmtDp0%)!mG;HL$G^u_;^x#|KBcQa<3)|yvNG|Q zs+~CEq7~mE24+Wg6vD3JySKXcpm)?A`_B(j|IZIfx}V*amF|6haZ$&^&5Tw)-5Z@I z*l^2U(Jdn*BaQ-MCUsnag_kZ~e0DZQCa>4g#l`2r1KRQa8=1G-nI!E;DqSRGWo2)4 z1&VFkwh0nnwq-Xc%7i*#AQoM%dY&3mX?A(H|au;WO!_1 zF`lp)7gv(n$;f!GJ&n%!u(UK;S65eU_7s+?)Og;@th4%s?$I;joiVuvE-o%%ogS;v zCat6xnKP1hLnos(AFv*)j8ln>8sLf;8M!IsG-W*WPC{HfIB>6+Pz|x{jQc7>8YHrA zeDfY@{}5?HvbG)hexzq~^vtD%mF16%v2uiN@ee`Cg0QzCPo43Afed zd7@59ChQTu7VPc)PrSrN^@|q|E?BK2)tybX^z~c1s;!f-~?-ic1V zl?&?Xl$#k38&B!ulzbjN+7tGjhLYlDTMoUlva;#OAO(d*!2-|smKM`Mo8?72zo4La zJ&Tr16RPE<`Rg6;-d_JwfgW_zkjl~;`Q|}tjW%;t*yg-^<7d9 
zV@3EG<+B0j54pJ|e)ODCgIuRQ{!CJrR#a$?8vI#ae(tt3_t~Z;+k#En^>?tTl;F+p zf^yfDez&~2x>?$7@nJFveb&1+^tFzL#yVf<6 z;xX9>VZkDcg?n+av3&~}ZSQRcPX^YQc3mmuN>)Tkq(tkW^6>BwJaQyNk|p2p@@0xJ zo-^J$X?m0uR+dwJN=ceTZw-F>tFurxRX5`wNr$ofKHEYbJ$h7}X;S|T+wiqv*`{yQ zYzHfIsn>1T(DTfPXCJEV6kUv-t7Z=zi@Ko#Kalnq0NI zKSXb}(Zsk;ZPd=M2;573<;s=Da%+404Ox0;X4~7aYy?F`o3w?^%uZ^o1C=%b3@B?2q z_upPlB>B_}+1nKE@1|5xcyp|+VKYuy5WAxC^JioGn!7jrDv!se{QCIuV;N~{zPzz< z?2U1jjz;CE;9xrRU%Tk7TeegJfW(q6WW0Fs!q3Z#(k@>jh!d5)j;Xq?F0@(F@WzeA z-lk1rZi}`Z%TwPoj5HF?^q2Z?&s!>LNYacWT^QUYp#S*;?MO>jiZZ#a?V|qd28y2k z{<2K-=2w?`+o^GI#YYp+VWw(LR{ku!$S;U;$O^cBf5#yKt+%gV3x4a6(lM5rpxL~6 zyOhYxCh>{jrokAGTC<9ko^6BZ^BM{1;=Cj>ot6a!`}60|nKCY3y0pG5BP%OBadzf6 zt7XA=f%Hosc+#ud=M3Dq6FqF0PY1EdzM@yTk?%4)R2FNVl$7M1 zO;sO%#wb+N);6g$ksKX`ZL z0W)p<>s59M4f(={qeUk+mYtyCc;4qI$j$RUJ6&|4dqpX`PuJr8yoHt+-*bac_Fd0? zd$9i1tfixpRZKNK+GKx4Fb@X~3xslV%Z`+}IN$;-)yi-lPeMJ<<4S;zXi`xB%h z)U$vs?^C{;V>wmhowg;h3GF!S}wr{}&te7=3hH@1qP`*2Y)S9I5ESCl4ae-&4D zDxA%7%c=cF|5icXC__3b)M&Nb-QQQ^WGDSn!dJQ27*kqWT1MiQhJ=LR(w3E#X^5Q; zApn+m(92iC6;^$b2JRt}=4Dt6eV4|8sgIi-+`M_S{xH^|SI~t6ZP@yb&A2LalIXSwSyMerrtNAr!!Xp92{fBOO!ow#y^()5ZdjEq54{rHXUESSm;1M^pYT!D#K5w#ouVkgF?E3ZV_ zLGK)JEXU!)B(>U1S2wqA>_xgAJ5Gj&rKbxpOFIUekc`Ujd!=2-F6k+|XGVO?ZV78H z-@CXjN+d1*ojZ2zk~c9)YDl8FcJ11RUAuO@V5{cqzE6Gnq)Z88$~P1{f+?vRQc~>L zxwH88X2#3Qi?fzpMU>xqhV)ifmakwdcMT3wH#IfAC<_b-sLOpx_aY}}_vg=_Px{{` zs&PBcpVG$0M&0m~f`Yu7TJZi13W}-e>F)Hx6>seOA=(pt``OtkfN0m_N0PdFdI7t{ zK-}czUc8@8y;JfouM0(~NS+iwsH4grC)L$+(II;y&+ z<~BZs>CQHmP7v@fiG7W?Yoe*6;&_H1IHw#PkhtscMH)r{{dM8W+{)5A*ZE3g=bIggtFkyd ze9E6&to5Ba6(E$Qy?un_&QrOzzZgnOOVw&x3*1~Lrl*a_A&#I}?EL%zrur%>Dp8Vl zrdeOcf4rrxtFKQoBmmQ65wrN&_tdnkMOe2C6+zgd=(le4=jj8`k8Jcwm8 ziA2(i`Cv1+jVx~E%^#GKa$pOaOkT^N$M0_H#MIQ-01@++qpx4Te)jm>{w%$%!_Dal z9Wk}hQrRtoM4QiXTN2e~M&G{C{EDS9&pMb)R&{Gvc&QL*Yi-_+#npxStTmnc;lqJ} zTz=r;H-~9@Y~}^cB8L! 
z7!O}$NAqhiA8N@u;2#teA!*JJsLBnlCTiCTFj+a@>5(qd^y<^+WXjJA{^(OWnHE(%%^3Nh6!JZTc<9eIz$B_%Z~9s?g#)z-dZ7zB{yVrTc1BmuWn z6}T;B4rE@dVKTgVQ+Xv+t_3dwhUKKzS6O>vU<_aRl#@c z*rkV(eBG~f#3$5#`%=-ayt?>a9yq1^!=+0f?70EiA{%}chU%s906z7!i&9hH<@DN(M9iA*-|Q@iRw6l!cW?s_4@PTZ^;Cs$o<3PcNzIxYBP1kr zGD*|XG3SOv#=cgYR)=s#$2<_X8M_S!#W#&_zh3> zR|f|N<5ORreju--(==UJU0r=;eylyO7tA*2)2UeAvrm0bKVY2t zJw2xJe%mCJHel)sm-oPuL+NspacN)bbL@t9UHo7lD5s#v=%*OT=(qDOH_FySHEG{q z`<#`gEwUFwbj*iVtyjxe^yj-4@AWBeT-JC;aK6lm%K?W{(=!1dxJTOPFXXK3Bk)Hl z!7VA0Syj7>cV1Z3H+Y|Y_A@ZVScuE2!l!*Y9{+^{iZOl*v-B4|%;LwX)2%Jo<}9@K z0x>9_pJ1ZhL%@kTUc;x8MYGx094Gh>jdap*xV`jU4!B9pifUi7`I_D4;RAz$j_Pp{F*$*eGw9G5>-M$y9 z;|~wctHoy0#>at|KE;kxZ?*p27kfCZP<}Y}q5J;3D_q&)0`nGed6DuzZCWX2JeE({ z-D6mcBMu+~9B=E#yR?J(ZO`OM_1$`ap^+l4f@@C_fG5I`tj!KmN)c~Uu9(N zhO}<9Ou{TggK7Oc5y2~0jg8mcV-#p|{M^~e27Xj5)r3Mu(44%;dTui_GlDvd_Y)E( zw)BpV0qDVI?3}zZM9rGU*^ubFO^6 zqn>rQS6s-l>$vswz?r1_c#Vq}RZ%VF0LDyX($Wqs%r+Zjl@m&n(`Z(6;_wdWd8p{g zrI60pA)dsIofsP%>jTh8RpEYY-uh0x`AM%h13f*5-KT~ZFJJmYRyhNuEbz369A%rdUJW~9?W%pA3~wGFfSZ;Urqefg5kzW*Dc=yrA@xX!V)dg{PF5;D?dj<$s;__BWCg0U z&h68SGgej^<-GT-!-In~S_qSyxZbi`n;}SmGWl(%NM7*c0jc`N?i8=)w~-zciC|2^~gCN-FIO1ZBCK zHxFu)?`+*G59*oXw(|4a%V!!%ruRQuF+baL_!4v~&aNI3EF&Q>!P-hWYDB~4WHu_J~I+Uru8?tc|d)5{_x00?Z` zq5d+?*a?}Jwp59lHYjG2`xIkZ=aPNQWp0|ApGv7u*}(0Zlar$*rPgDHj`gCHeshJ@ zi^dP=ieFz{l2SLU3ViwUy%3Wl|+=xa0Pk4AZIPPt_f6k@rucxA-N`Ch~C+GP2 z^N*Tzp;!_cO|tlg4I3s}EegH#(weO9#hug8kiTSsm+s%cKUngrX@7seAiCtxJ3x!` zW^C-fMZfA3l7<`5$@RDPNQ*WSHWfQRYuT6smB6!R0^|t}S_!$;?88 z3Mnz#ntj%&Rf~|6nI%f3APu@H+S|WVAhvO7b@e@m)TpT4JR-?k^HRdc5@u#>)rVqz zKj?KAo#@`aP5Sq}fPfbTc0JYg5r;0sl2*eTOtZS)ZDkf$<7`4*U0>EMPNJu7G_!xAW7wjTMO!6dP956!V^J~6R*Co*KkVu?d~nSM(7 z^P{c2jvtmj(#i@}nLt)&4twxHoR%i11?|o!-P{*IMbpc0=*8&*8?ncXj)GwH9(y zb?q3KPPbYcmWQ@f&qgp+GgUpO^*c4xe0qw#Y4JezsHf=P1y1uTEYiU<>jy{A;ss$h z*{)!hrI|F3>imvHh0o5_8L}Mhp^H9yE`|QpDHLP!@}^U2YU!FgA9oku$$agxrbm~) z_@7SCFnymoc~8?hJzWpq-~#m$`>g~3prD(_j~^8@pBozub8&b5b-|@?+i+m1Z?j{i zCquy9U>S6FTa}G5s!%Cf6D*c5jLkNQIR;WQY}fDQpi&|A~{SP}LdCciIYhJua-fy?2k|h9`t6Lkw{xL6YYKq7` z0~LS^*33|5lJWKH-~o@-vTG_U{lM%lnxz6;)8S-?bV2?4P+25zWz0w)JC;o>Okr9XZ=k___q-?(whmWIcgYHD<7OwV6sgK{}d zbjgK_fBW_=7BHeg;?a{QVW;lx@<*wHl1`6)Gek~JORGY4iyD|IV>F;UaNxiY8GdLb zNO;u1QCMh(*RGYDwa$Rao%7$$$atF2NP43lKMu|Aa8@xRaVp$r#|3K0AnF!e-bYPJ zemLzR(T1ZC4+27o%R#V4!Yqp;5mH@$>82+SIO#&)gX^9+e}2!+)^~xXE4X=4U?RP# zrB{IJuGf;iy1FhWFR-w%bi+4(+TQo&KUskK#Wr9H8eclU2M-bgW&spDBO@8@UM=oB ze0UqqMB$i+t?ki|H%n)6LY|Ne3Rf2InXUk#$-#q2mhMPBFZ>%QuzNJSGd`eqXlRqV zy1HJ>z5DmqQFwyNtCh99zCx9go2%p028@T=xDB@Qvw-@v3u3>gss)k;cJd1f(!m_M z(pw&=HT{4^YS+N$M<0(Sl-*-IfqK9!VeMT1q3`-n}X!tBeM& zlm!I^+N|?4Grk5M%R6@+)IGj40k&>qfmq@4++G-{$FE)E(M|yR(UeVRv{W(Uqu*nJ(>zVpXg?;dJz(Koy{W?D^juR2I>N!a-^w0johj-1` zQ4*qvQc7&>y3aK=r~G&B*|QPSbXL%IHn#L;HdA0(0?e4QT!;Gof?0~{_1@CTxqIJ# zN1t9xJj9NdN>-lxtG*C4W~~K7QerCpfByVQoIa_2>h$T45O|bDED}i3`U~M+^Bpb^7OBG03l^2XvRM8`>-B_l zs@V^~w$7|6^~&Q17ngB^UYj)^PIhXAPaY;#4GWrLFuP zPA^ovyNxArcywSuXOu-RUiFbYP=n z*8aqK+0N3E9arTW)9}j&GW($SH7uJD)L%g0fyt6@(FeU>?w=|v>EcMrIVs^FRKyay6HK}{G`p|GY; zaq7VLOQ)*6UZ1G`GAb+GV|68O`iO`K_uFeVh7nw?gD zKk!gjjK|71M|dV?YHDglQaY$LtmEw8_IEQ8A|-VDjlQf$yNLF?LUVj^mT`%LA&g-W z00weDgXlKDL)@-HabFb6n6Ta~;D%f~?c`wo6As76g*9h@lDcsiz(O^+^<6i30wSN4?@mWjJe_QUq#~|$( zzU%4a@hq=lf&Vsf;=2iX0G2efb*h+tuh(A)pj(D6dm94GJqDFqge9n>qXQ#%tRte>8b&o<4^`6F|(2`}NWV^W2j#s;c#%0KQe{xHnQ%=875QUPhehV}%% zQI1xK=CS4WaFXD@c`(>szEdiKdhhp`pAB*8VwGOHttwjcM{j`5Qn1BEMdkNp$w@swMO!n^< zGNu{VMLGBye`YTrg&kcVa<}F=36EdBc=6)pHSK}& z_1m}G8wTB=@wYp4Hagw}rY6EW9U1%f?L)sCr!6zBDJ@+`*a`RDo-$&riL#O%TS&umfqe+C`zrRI&c>A6`e0o99 
z3tkNHI6&*-X0A|gyaSTY!jdLZ3v#IhCo`!Rt;1l(T`e*;At4NVR&#zBmn;;ao3xDD zr^l*OXbIiVX=!e(T_rjUh++W+4~DfEl*zR$j~$`(zIR)C*Q85`zesev9HW92!eN@F zS8*sp2YlxTBu%?~^P4v-QKKIXXsM|kFZeU1GBkP|3P@GW6S1uF0DpfD#E+hhQ9Thg z--o@e+RA$pjQmCg6`_6%M1+LMf%z*D3rR0XLltMdPj-1`yVF#kf8y*$qCA||5tlDL zCcxQrId}>jXZk$=Z~wRiptB!*)Y?;!uB$aSp`8%EF9|U?uNCipfx=dHZ^6P z<1o(Jn1}oNHCg*W`5sEb#j&;)YRm(kOP79Yg+i2=nD{JUw#6*nf3DsANch8to{&WL zXF!ESluq%X?iaAk6AB6&cI?>UhjRqIF8=n79CI$r&w8~&n3KbMsTfb zWaQ5$A8Oyp~GJ{x7QDvQd>PCe93yf!yCciF;1K>1+CqNgqrdd87q%;zs$@X5-` z+Mh8r)D=j(ef#zwvv53AzFAv4I52W=OMZ&QfZ*lHp`*tBjhE$Af0xPsg#$ZC8$lE1 z?%xGbhz$u3Ur+Iv(BU@2AcK~*GWkUM*blH{E`xRGiJCQgiyY8 z>CB{lqrmj^^z+oz`^j!7N^kxw(j-SGoQ)ygLDA4w2v;OMEWLH0zaOfvg!byYH*dV1 zXNODzHTCshP3{~yS0)UGap6L%sm-UVDoVryvQ}Dt{Lo&q`w$w+;D}UL-~z>;`L~pg zW^U`A-4|wklb<7v^)}M*^}zoAo+)47bqGWp52uDB`e$KUua|)JfU8d=YY{|H)X|A( z(ghaJJAClfSdW&3GKs{4{7r*-D*z@uHaMu!_Z~bLY@{|cGRi8i`uv%zxVX5%>e`p5 zmqyF{0fM{jtQN4xGs~TRPjLMD@s`&v6;vn;rx>7ffh!*znSqY(c)I}OA-$7RQ&Uj` z4?{!c5e-so6hIchtoao;uD^B<4ANi#qM?;h#A!mq!mwez1@=H=8P^3k;pW@F|FXVIslKxECIWGd z%Mda#PU$080g! z{138RpZWQXi;qv~;bKqGiT4jJ&mDkYF7GF_fffD1d+LagP@UiQojbYrZ>z8}hN;9v zP|bB^rZA_%nEXu^XMTAPYqj%wFIF}-YO6(KSBmt|NKiQIi~|IjmcCe?nd=t^(~gGV)7jw^HtZ$p(T>X5!0amN}y>k zM?c;4$DJ2FLUf_N#Sk+6FJbPR%$CcNSKCl*WAi@Qd_{~E>aSfp(UmPn# z3bLr5GZHWcF$ca@FQPC6t118N%-X}u_uNfm`}F<0%)@EE#Uk?Z{yz%);dNZqD6u~W zBZPYgEyweRUkfK6Q@tb5A05T3LPcri-K9ubcm>O70&h6nxe0XuN?>WM@3J3`3zFY6 zos>AzOA_f?&`LQ}F-5~I<;ZY`OKJWS42l67PCnvxCZf$DX zMUIYksxSd~{{G{KiS)qeXpK}5z)YrTW5B$1Zd0t%_SbLT^yHI2e2{{bAq@Q*IzVxe z=W2W=#Cs+WDRFU6SbgK-hYugVY;K-5sf+sWSb<7I>EPfHSb>dQ)Y0+O;UI#2f?{HW z`OKj)8@4hLT+dOU8T2>Npisu=J_U}WxV-$%yfySMPt?U@tF7O^?}k6enrY21?7n?LK|u`}d3kw? zkXsXHPt~>LSaA~f#g5I=!GV`Z>D7~EmS;2#&CGy@;F7>8Nni7rOPiY`C4;iEL>4+% zR}w@L030JBifB|g!}?Hyg7a5(fnU{N#Ux^SvS4+^<(uh+ix)qexhM#4-79+k&s?ho zJQDv1-hHu|nGgR!S%~#mU3pP>oyaV@$dDWPKH}}#3z>U1o>NyZ9vRjKhVhnJcs@Ju z#KK!IZ3B@)pZeq>hKxl-M1)@N2$BizOQT{Go=m-4_6V)>_V&&!=R0t~2X$JrjFRGe zmKmc@^6R&7y$Lj;oiF>l%ZLAZ)KEZIR~OUp^hhTZS|0?pn#55`iD(7Y@tS25)JH*3 z+B7Q&yI1B$erQ<%GZ1Nk#&@HDhj*Z$Ca)~lN{xHQ#2A<>`y2lRBKQ6I^9JH>vS-NR z^c!edKT9Wkky+Y;pn2kAKN7iw%cs}dmglq`dF7<0&RvJ}iUM{XNUkArX2H?XXR=sy z^PNQq#9>AT0VkApB17pg+OiW`i)OXyN(?&mwd)77&0BR6YW?2M^;K<;LRviR`#xIQ zr<$LOsQ9Rd*Rr|3-!Dr{9RE~o+Ql(!_OFc~zX+R}@DpGv4E*lgN5kPC$RrZ5$n*f+ zR_MdqPr^W@g6@ADDV}{O?;|a(Xb|q3_|2N&4swey?r4#Tsg&mRJt=o` z!#)G;2Vqk@5e`{QlD7NPKj_(wd=EK)_=_!Ew{va~S}TsmK)vtj?L86w008h2QWo{( zTkW~0P$Lp;K!*1r%E(bqMpTc4Y+sU%wXH1|j@{@namY$A-x6(1>tZ+lK)4kE}BWaW#be zf!R=I^K9N^W^%E4{5cXvu8ooXGEA}h!`R5^1cIiOTnXS;+aacQNJs#*5W#0961EYM zN_HS2hQR}YPSyG`R%g0g#oHv@GcH-#7$p)VC1vUN?@}oD2X`8Op5^)n|*IQ`{O4vx!7RA-%R!eDI=1 z)7!zx&3)>tzz<#?9-+zs+4=EKaA7E+4@Z--Q{MEcjqA;N8;rE3X#1i7-CiRH~ z9Pa6E>(i>z{t-!ho%V56f4P~bvm^U1Xo&?fON5%-ZDh1=GpVu~)Q@b4 zU-=#rEc0zzpmm=j5%XsfyT!Z~^T$2hZ?723-s!${CPi+lW22d~BNrhfITL0l@wGp{XK>c1xA?QZ#}8kFw-N9C{b%|MpugPE(H(L@)Q@s( zBLDXp+SuCqt|i(wgdTTv6tAZZImPCPm?A4HD;+cQLHiqAM#vgaLevu!D(Q@k<*0~` zV0IX@+eqLa`|pKzUtpIY;&d6yN+ZGBcE7mzZoTYVHMY@4)`!b#Yqy~YUqQ6}U*%9AI1 z3F}b&_g5Q8Zg-I%*nwtQ`X?)@Bk@OB+4=|pgKhYVs|!Mk`*4O{;65ji9>uWMqV4Ni z>jdsh(!t>a74xK95>rC&)H zASppy2%`_D!~`lw#J(9!VDU~ZjTdf5RIbY7AJ&8xVnU)1AvHv+PBFF*!ax=`ynekx zsuN`7ngpRs=S@S&;u5pyFpj9tFsg7I?Cb89Lmd4hS4>@<2K?V$9TKyUZt)36s`#+e z0op?yNVva!7zI$Z{sf*ko;SU1oD4>w`gbIVNN2hJw(O8dyYwLf2}QgSfqPq2K?jn{ z{1`W2uBvswg9p55Q$5l3a2g)_irJoLv5Z?JTQ|E2`IZB8`Xaf+B$;2k(ac;lahQp82VM zZ7Fl;r*RkN`tbE!!op!VqY$Mqo)BW0>)Vol^1?VLkI8*~RJvqp;% z6y_2Yjc_7mTXgb3kQyv?E{fMks>D@5tFX4-hYn`=SQFChk4%&8+9V7tA;B2-`Lk-g z#Liv2LSfmcH=Dq*{k&}=rmDJ{6Kf%EOl9+~gY3v?Tz#yGMBpz>0@_6vEYDp)D(3T1 
z!Zz+18zWsJV(UVk*Z%dqhscQIl^kW0U)dWw`9qYOmshyvbeu*AR-NXcv4zE#KbLcH zv4~KcBr*CI0#Oaeuybb!V9WhQx4lOU*G-$(3*$Nv_Yi#1|32`7v&a1~>3Cd?wW|^v z%M}_DQati8>RdO?w`Iniu<4&+(x_SBo@o_Y@h;hIx`-PbOGlQSMDm}m0^z@Zg=ZB5 zU>GY@5D}=bx@>CtNt}}+TzNf$2wKxjyLbQW6T5ZEYq+AKUr{5Owc;x-|4- zb&svZ6%`3B>1H7RC!k_!GoL2vcNfkq*kdhrLv#c?}2J8RV3qpOo2JQt3vJNg|jthtv3>}Cy# z>V1fWY9s{GaNylMaA38@_d>3@%jV9GYpmT98y6P>EAw71Es@T`RE}BAFZgzOhkM!! zJqnw|5pnFdV@$$8%@jrj)rYH)(nshixCr|oz zks&o#=gnn>L`L%Fm}8FhYN+PTo2gx7FoIGpUN>Ib53MQZ1Omn3Ju&?0$afr&BqA7O zRPv#<@6O9%Zd6)lF`$!>_s4y>bt&CqG(;*jTTp&DetcL!-frWXC-V>HHx8v{TmS5t zCqcaMTARKos;1U-w2Zr;YtbLv%TB*UD_;VQi^G?TeP9o*G))ifa2kA&Y-Z{cG<^RV z6hVm6hkfOB2izOyYAZqCV1~bML6AXl^IxxJC&v{)F*>XU} zY5m&z`t69Dm$P%8tcEQOg&Pxsh$;mZ?)^I}+{2Q9g8B8$Rh7Vs4cnM$08O%Ls~Q`3 zqO*ULB4J4sAt(TW!T9FQEktNHpA6@M!1@>i#3KEg@8W>GrFqK^Fyh_Dv8#7z*=k)L zfd=>#78aHp$uGHLwzasVq-UWMoA)bja0kI{*FoIMi#$O?>AB6Fg@xmXgx%1N$=O+M zisK?qSS7fpO3|YGd@&p59rfsudZ7am{#%nJFBZ5yvK?*;lnlCim-@#Chx8^g$lZy5 z4OOS%wCv1s3F|(Z$*HLqL1?w7tp`5uQNLi`Z2zAu0Cpop{XLkEup^YWDhv-{H)86# zF5xWOkh2)fP9k1lLRw5kOo?D#&e6ic!kke4nwbtjAH;OHdIgiXCA*TUYAORSa4VRz zZ?1vpjjx`r^X(3)Dz}3>dT5Gm$`fxb_CAp90?F2zwjb|cfuK?<&Utc}nE62LhiR}^ zd>bSSY7AZK#S}VD?qfdY{BTGFJ`^p2$*<-Xh;HV_?b|En!{&eoX{XA# zEz&HO>R{BB@3mF0Lb4N54_7+!obC(%16gFlNNe_k|sB zdvdEpXf6H07jy9OC6$XaSI8mJifybnJdIND8regpUWz{s<$s8@16*&doy3M%qrRX2 zDrsvRcx}ZJKQ*jXWE3DU45qqXK6XI!eSPzXQ28GJLG(RAWKffxpu~3}ScRkW^Y`~6 z#D$t+Xk4&?lI9kGBjf?)wNt@~d=e6P8}qWd!5`gR%1|!6Aei*tFMW$yxFNG`#1uM4 zR=+M@G&Jlvkbc@x9fD9drbJ-4Ff(rfz={pXa$k1FCI zeN!K{p{J)$ca3ITzhQ&Mpa6AMRaJvS_M5D%xPonGJO2RAPoNlU)<|KNmI#fc;80|y z5hJ*K>lR5QT@5oxjbpr-&)ZkoZ(nwsUBooK5~8L~?PZ22(W&(er(j`NSz123^#qfB zFb+4@lcApYMMtyPk>DyBBa*IXw3S8j;cg}-jfywf*}Om%s;z_l{iodK=_%MDZK+Pv z*RQV3%Nm-RDz}S5A+Vn4D(1HW#q-P0KW4{_2tpm={b>^U3jj-MjeOHHHnbN6%4D6mqS-+-&-3O_XgKn+ z&pdtl^s|k8?nm1$tKSn>^r>kC>@yJuj!5#CS8S#5Bx;S43?@^(hoKOp(jUN}NHb}s zjOCP-)se6{RIAO&*%=u}!dy#daQ8O9n!lH=sh!g3y;KA*>Zr9dD$=?^SS(Z0Btyd& zU28M*fnJ)F>nWy zV#+|YoI9}tir?p!>xPE=_9whEZxyQE_-og+nKa~!@vD^{RQoU(p+cOpNvm(uQhz_P zlFy`XH~HTAnw#-i>7e{l-n)1HS+$Kk36YW85AZ%8@2{>l8J^CqRl3#c)pD?cUv${b zE2>rHh`zxMm+;qx>w>QrKHMvD^NALJ__O{Sl~ykPw)cJ&y%c4+-78KnoV53KN1|xv zR%zdo6)oYm5JdeJvt{OaC7Le8RAqE5!ii&wr+Uz0t3P7==SqG^)N9#{HGb{Ad_m%M zNAB%eg@aXd_o>lx5j$M=CuSjLM(V}T)p)F-=+S^!Oy-dBd{erH6$*C;}&8jqR_faF5wnVBB$Vg{hk<(xIG*G zm!wbclm6Loh&b0Jhf5#-P_Mgfzl+;e=zYH;4SiaNwb_$`?Qc8ejr`xg4RP`45ho!c ztu`K6mDWMA7#_yFl+8@L%OL(s1(jiFZvH_7gP>NGSR;A&F|rQyk@En0rm-;`cuctJG#(9sc=~j05<<^?sK!hgkYyCHq8uv%0t2tsiOSOu z+zACCk8nA!M1<)gKfe~y{b2q?%z_6rhbiM$XF-h=38hs@PR>h#45yVCIOxa#PCTW> z-#9JKA0EIgWq;p+j7hgJ{5p^f1im(jRH3Y@+TaIEm4sY_W5uImF81N(nuyCE5f<)c zA#w|bwk&C|b)5!#M@Cc$pR(_zP6|5imT|H^CZqUUc`_jOg+k<;sO1wB3okcT5#+S~zyJY>4vx}3{FRlbkJPeV6#a2G)V$(Ocbs@LZSbqM!HMbpUdZ1?e z2QJn2e3wXI7R2v|Y`LcqU{;&=diLzux$>U3=7u6)J-(q)I`;d{Y(7o%wD529r^zV$ zLs)EU|0?+p3G|cTyp&f78=xDoPiy+!jc@N)oXEk$7%WAAet%;R_&UH^B#|b8~V$3kzk@S&@X>I5|0q4>oPw zgeM++5$Bx5BL0XoN*WLm6T6EEGND?r0#rEfe|_-AS}qF%{PRR8=_1Tt-V1!N$D!_;^9Ye#-Y$)k6y)8ot)F zs9Dn%_`^D*)2=~JIvjhq3JTR#J%64m?KHKH5V1wl!6yF>S|ZlD0otcl1{_{0Fu8`s zi@-HxS&yguBnaEjzzdNpbQsrPi*1Mia{Qgfq-$af0gDX7a|+bj-+Q77Vo4D9Gc5yy zkSYMTK|5JlUy0O0!%@P@Ni;(Ih+JA$mT_>qg#F0w*>-2~AWqGARSu$|+b~q)Iw}Fv z+2y;rx>jP{L=9kCUIn3{<8J2y!EM8!ZXsrP5Rlq2XZu+gfPeaNGx(G z4dd;n+1z4h4?96HHwGteup4T7FN`Pfkk=hjP<{=#M@&ez3m|2q8Tg^FaXmIzA6V*( zS0T6{Ki=CUG?)yx;WDmGZ4UPh533+u_fn(@A+E%1D~jV#5bfIZe(l_`BN!9M8qJl( z#a;vo(lc5no+E^3jpPm>8NdS!_Lp+QYgMa&2A?=La2QqRWx@15`0L0`X^%zl@t|2C zTOovL;UTf<9}W0oexC;d1;4$UKTW54;)I+0Y$kh7^$ZSHNF^ZX6GA+5D5)OH#}8TH 
zak96Mj}nqHVdPRo0!z%O7JKf@09dM8=K@c!iSQ7R`tUfTYoTGfXqvbz-R5dl8`Y2{G<$Cu0;>>{P<<>F_4Q%Qn8e~tK#pTgk8FBz>RwbM8HEgDW!~P+ z&z5iA4hsu=rc105Htz~}p z%^xLPyctlDif38HEuS^~YEHz;_HNnm?0u4yRgZ6%WMkRcb>C884XHnLPa5}aN{nwc z8xF(G-1MscQ}_W9Lp)t(bHK}#RQ=v#{@RyZFCn~eIK{P;Cws2y5L?DCm!(pT=9YWkiwjD3 zH8Jwt)^*b!5Ar?o{O9-d?VdUsWy8i_~R4r z121p=``b`XiILS|hScn_#<=D5n_nDX|4H5{pUB_#gWb1}W`6a5u!p>2wx!!Cnkstt z{VyO1l!oH9M3~|9*k6G1Kfh^DQCTtoQV8yS;`x8Rhn4oH@PEIjAx?~vFc@39Cp~d6 z^eU>+J07!6ALW*l_y?XZNH>x#q&-IEhtmlQ0!t;c{Bvh$z5rP>N2CF-&Yn~pv&^wOV z{$h-*J5}Hz4xF|_4gT}iiiL!myCysk!^lWPHdhaGY{kQVcEdFAx!G3*$$m{VkT5Xl z}}RhC!o@I)CE- zJf;XBXKga^9l~*87@1d&7R^FXpB}lmp}BlWbg}qPnGCgoknV@X;rz1WDR2pEC6y=- zD5?1#i$7mJ@KcTpMb--hp99Y?i0ycAL02~zAEkuU0yh^|u%{=*kU5brAoxR*E*{~~ zfJd=RA@?wplT}y!mfyG^?AIfbq4?c;+)jLNr!hKJ)x8jxWhiCgWn zBb2jXTq@HVKIj{3nFPH&$0K|K0*qPZ0zEWWR<9Nyu2BpxC23$5CNcb6?WI5PYm*Ed zV0kvmIG|bO;K>8pWQ=;rqc|rSoJ+)HW)bq~Ndu$Ez8;0cV5QLW9w`E%A)0nzwa8(d zYxnhamZL2qglN&2hnd`f#aJmZF-GF~JpT_}?*WeW-~JC@MbRW0(pNN4$_kAuBa}+^ z$V^5>q_WChMY0;UGP7qkl~F_?dxfmB8ze;GdA)t_`~Lrr<9VLrxPQOnzGZx_>w1s# ze63R?)5BvGahu`iI*T2S{CmAa*=s7{w+O*x-!gPe$Qm@=EyrSC7$Y9&q zz0p7ZpY;KN=%XW@ejx=W1gIXtNK(2sTUi|Yw~jhf1n6Xu-C>w*>)YvtC81wEab7F8 zs-4)Bwe4WXIrrkqp2sXk;0h`ge@(5)Ps*xfVLm5(pNmV@u*FN+=>CN}GH+L|bdTK} zfA*eOsl?B>Q3rgwkJo*z+c-QeIyLcJXsTxx>bjnymqNGKxO33sKT@}TS?=jYInxh3 z>PqLWzrQD+svY4}!e(D-p*g;Go8gd&eC|gpm1;&6{Z^v%^4h1 zG$0$fNQ4fA{WyDmEpXzy=swQBh~S{ViEiDbD?j~fB!HZ-NHHeb6^4d}(I8f`=h5`6 zKnTOB=`h6dL13%Or3eKQ9RD0Ix%W>Z>Iz-6#_}(j`Knc18~CpIu$cX?GiIKg@j@v7 z5{9?0l5~ZO;2)%Fq*^(tscoJTa$_dZbF{>bF3mbm&B5PX7G8lW0NvmyU^;^xa z_v&554@e*N(MAQIhhPDvuD8xOPQcaPkr8l40rWg}`x@{&O?bS05@Ta`I}S(OE);Y6 z^Q{2%P;5Nn$g0wt8wwwmFOS5{JT#i7*a{{#25w#|M{VWTAFpXC8Te-oiu-B}Be ziVj{0c!|)10+O|1qj$vSTW65YIKo-9%?YQslWOgyV+^N?Orvo%b)o}_9__-{FZ*!N z!y%;2%d4oUc%6+>hJ{?SJvR1?y!@7kA1L5VufkERm>c|*Eo3uX7U$G>*SNdlzzTZX z4AAk^@E@cJ)msa*`t8FH!#rYY7k<f#xEp#XzBTs8i<(amg9889rIopn@>#&EXXZ6ro9$V+b zeQ#N9C(HLF1Ab&4d|z29vsFTc+y{+w;waOEA|U*397fyhlqU*ORyOy%tZcPV1k`X| z;u;9j%qTG8*s2vZS>{4_ICdtWNm^a9+VMbt$9oUW2bPsw(^<^pk~|UHT;|TF=g~%Y z+cV`n5d5N>Ja=B(?|_J+>8a`&6`p;)*Z!RSs3#jAKV-x`UiMjCAtA+@F;1%5ky-u0 zw()ylVZXfS;GoZonVC6xb#niQ2Yg8`hx9HjRo-b}54y_kNyZyw-3k!CBDLF*SWbO-GbpI;+=KE&dM27V;|| z{@lTkl2|S1Sr@?(=-S!E9RD^^SmK3}&XQWtJPm%Ca?nP1HKmd@`(x}MZ{xh>OdE5u zr6+P9uXy}iUJypbU>_~((`UhH|Frp}CPv#*pcWsx3MV3a-a?c4pP0o8E! z3rXoNDY6T0Uwdu6#LCo1InH%+u9y=r=@)lhBc*_`6&EcV;0%q8B2EYR&_iU}g4blkb>9NBo5+Lu& zON&H#?*kEKZ6u7Ag1dLeK2?Bp86tDRacE8S!EIXGvmU<2f*b9S*~iXKL*SstB2Xnn z1B=b^M=qg17T3>zYEA(cF6!huefqT40xr*3)B-u|l8n#M{bbueeoUl}FU{esc+{$j z_Ed?H_KG5e{^XEJ<;D+Xy!bUS@vv19bjj@tIFn@0E2=aBlKlR{@z=atKAmckwC6-UhAFKNbF|1%^KMOIW{*f-mDG4q6lX@RP9uJTQ1PH`jej4KqXw zxaGikL^$upEwyD4K?p2JtRnjSvZbG|Ci5!~~8z=xj_{ZmvvRA7bp-wElDV(5&RXMXh72mU~# z`36iaX=SbGK*5}N*7!<|ZqsWJWuP3jViM8*yn$|p{5O=LmweGx0QAUs>T)gu(Ib#R4{Y^D^3i~5tao;f|{|@rUVKAGA=jQ11Tkn zAl+Mvkau~aP)wMb1qMQ7E~!KWg81sB7KYt0BdzH!and<2L7||fs*u!50RdI zc;;+}$B*$oN}_U8ErI!*4o{KoFi>>;1vIoB9Ubq!pXDLfGRhH(lAR;Wzn;6KJx5xI z)?+=3fa}?+uC5(OXVs1&0h!Xcq*Y5m*sOp!sWEv9qLy+zQkXSR6S`iR}e<&ON_b&}JSLSU=iTwtxLFA`^^PU+Ft>~y@&@*o_MdHAh zFqu=cuY%J5ptR*_nFB+Dbjg%A3sLPqP#8c^2Rs`(v1=XA`{VzFEaX8;49!~>r(e=T zB!HIu(wjd-E9nZqvnwno3K68V1dXmfI=PK@1$0Rp(C7q-TTU zwb#({e^Bcsv8D*W`e;Ous3_<8SN%ezepQ%8_KjWuy;)*7`% zA?P?&r`sb9E2!2Zs2h?CU6#aA+1*Jl`PALrhYmYUX&i!J_uO3JgsIzzHR&51j8`27 z2sVLbeP~({94E5Np05?-y;$-Ou*C!2OLLtr0AJ$>|@c64x>Rh6_=;U9U`xdl*#2$kgga6|pG#L^r?d#&J&;_2Y! 
zPeC8|c$f`HW>3VuJQB$V9vdAM#h#OhmTUZ}Bo9)WPD4+SW@K!9j{q1V`RIMtuUls{ zqn2|;zB~J}Pg*2OpSw_;UYgm=$mj}u4qt+SN$}{=(2uHcdrd$ff#b>>5}|*fIkmb5 zgtri)g9vSGu`0YTf@T{aT}IbfGfL#axM|;$;3c-d$mg-K*M?|69KFCOy}K*a7jFQC zTT+1_ZvzJs{81HOnn1@#6+>_3r$tA(2_}YWtWOCG>LP zm?bDE&-~h2__E!QkyD$6KnX^`C^e%|y@ zD2r^4;&>|hqcTJ<5GDii}}Rde|FZ!}QPv1_^`(N#}CS|9MX8JMczzI|R@s`yYR zVNS}nee8`oHN(yLBLq_Z#8KR&{qh#CMwMPZJdy_2uYVVc0O$t6m{5Vjrx!LB01vez z5&Bp!;%E1fw79(FUed>kEwq8QyxK{Cr1)NE!-w@06k%{3P9Z^1YC5TSl;0@1&nS!E z^=1*X7rxDlK$?C8TCX=RRsp%?3Zkxia(c9x4rF}PDE0?C3dzcJRjm1LG`K5-$~@C( z9sv2Y;?B56IFR7r&^v3Cob`Uz%N{ln9%8oZ*tD#aaP z;`Lawr`TVjZLenP0#tU=!QNRQuF>S=I7E=r!5z>^pK`EA&3$>2oDhwXU*>O_0I-M^ zTL5fH-YjT(8nZU+6xBSpoLiFz^dfTfFAlp{8@*Tbqt_7T=_-Q!@IOv&FY_T<6=Q*n>10hW!%CyXoDPgj)2*wpAaIWW2km{PoCA z9QEpJO!=pMI!h*T0|>0%{KxMeld_H|?W8Dvm&eb{i!C#Lbgc|AS6cLXO&|QQ!#7m< z&`GYLJ9R%|gXEc2tpx@#ihcM6Kjnw{|L^a*$7H-=7rn@{3ZLYZyPe{08q~lW?gBfL zTvQmm7b@>?&@V_0ry9ik9Vd^&!eCvJLWj~Z<|Hp?&*G^hy5zC5Df8!#%lc?sb&jiS znC1;p2n&vJN-lP$w>dGL{v^fPY5#cH9-u-*&{TX8LX;Ssf!dk4jcDcMet%u-P62q< zYR@YyyqToweyX9c_|WvDKSW`W7=0M{(P`ZV$NPN2ZC`T>`Q03>wD>;2=(J3Cp4T)p@r?gV{= zgK1}y@@A|(vpzi!4aW2QeCL@`U{kODe1F+Z#dYqsf@ZZz@nNKKAiH3o4R>Tj+Y+>`yQ1CiE?J>3e26Wq!Pye>vjCg5DwA8iy6RDVZc=f9)wgp3{axOpuQ+tOIh@fDa|CYOQ%Sj4+s|k@Hjm#1iO~fMYN?NZKOhkmh9V1Ju-hR?x~b z0uIPchmuCvq#^pvG}2A=oqO@~>V@BN#NnNh=0a}vnCpWF59(|nE4jJ&*NzKka4OLIO{m(ZovtZl!DsTG9(%|$s0^Rn{r3s@3JaAE+_xxw&K_+}cg z)k;5^^0?M~a~(j!4Mj-*bRY?jajFmjTK3upyLpk|aV|F3kl8Rui#_3Y7aaXPKktr@ zcj^z5x(}>D1kjtB;ZHrsNtBha^6qcWmk$?Cp6F^qk%u(*D@_1UJt6D7^1$;XKMliw z%3_X`!`b4W|5p~Tpy*#Qp)qwGKDWo|gBnK=xAe$JO+bsbO0=+q?5X>9DzV(jN>j(^ zqD2*DKNJYg9;n>5$G+E)I*^vN8Zm>}_A0wUF|S6l^N?r+HqXql zK0HyF+5483%}6olPi5ba1XWL9bb?j+2IZ^wA(12ze`YP+#((td0!m9=Jj?^sNO&%4 zrPp7R@uqLdmsfcd6~#crQdSPfU`|=vm4HOdj2CGmbJ^g~m-Ft$Jnr3-#=<_vcJeNo zibD<ceo7S8HKTBkltF1)_B9z{FAre`2JOYISk~3`H_wT!ms~3K~2roj%Fjc$ivJb)y z@fm4sl>h?BvT4)nZ^>W?(YYtZBRkL)E2_T@Sx+tD$(;D>?U@(r(}fUrCU^F1zI&y-*{`~&)? 
zpbgwf&4<0WV0ypK2L8q}M^@Z+g5rRP%k*i82MrAj`h_9@SAqsUQm|Y~;~3D6gepyd z6n*1w!@BUvTHqCTQ;~9iu-&+)#AyeFR)u*uKQiK{@b;qw zSaltpKKGXIL-s*syoE*H_Qni%ggT2_f&ItiwVf*Kw?tqi@ zU>0ch`!ggMOH1_I$DGsFB_Q9D;s*x?cFWdORh>R(4EJ5^)aLEmljAwha|;Wroddkv zvne7Gi^iCW13gv@x4mSIDz2-O4m`ZPP6<{R-qIL}l#e`75tD`!f&PAee!+Fbz$Qya znLs0NoaS8qFe-|l+6!tZ+t%CPf9p5mJlEF7_K6)=;Tt)hszKRucvmO~eXLkB^WtdZOzFPJ@D4|GJI6I?oLkz<`C^_5@%+anQ|YUdXYCfA zRUGzTy^}}kDmQ~9;}&UDY<59^cYMB+b2m3>?r;1&*O$BGpUwR|ItO-n4J$i19S|$g zm`tqjQHUSbF;6M3>s0dNI+B>%Bu`-y}7c7HzQWyUF@1#I2 z+OgG?-;XYDScl-33^9BreKEU}vN-5JyVYrwOU)hGY4R}R;=bOyPnUlxJxgi1Ts6(S z5nq*P?eZ##H*#2cfaB3mdB%Zgw|m~v4#8{Q*ea1H6aRnz;*rS9q$;&Ka*#ZcY;8f{ zaOVFW>hqQ)tsJgdapFt>kaK4T1vSgW#6({YnbCv*ja)q|8=F-?-LI#kC^Qp!htfbP zU0dWj8{!5rF{Q!&LwPw3GH;6VOk0v-rb_|gL11h#eiVjn{P$Vsv`);m>i$Do1*$@& zHz=#nCr_+?43CcPzw=}%lXqx$9Jc?V!be*H**!)@i7Ph}89ejWsr*VzsnNVRnGQgcec_a=!qBTKv6B1Ny58X7uraj4kV z+{n!AK*|t=$*0)3ozPn(S)KZtw_zO~CpLrGCQlyCGxAj`(>aH9FAoJd&%W{oy5WkF z!fPEOM^xbh5UPTa!5{o2vx`0pah*|*9~W}?IWtoUqOxY} z?0p1TuzaYk4V@xb1Uk@cd(-BG4OSOcQdXmm89ObDDjb!7f9)g=t#vqpFDP&$qZ)ep zvZs5XGR33Bre)!>Ty55x^0qq?A7dYCwoB8$5Oo`alFW+0M>o+mQu9#IgiV2a%ZKzj zu?`r@bLcc{wPB8{0J#*{jJvfGVwpqZxV8IH{94~3yH3f#dp!siZZmt>qi;Suf7-#-@f_y*BtI|zO1WkKroI*Ja zw{#*oHtrB4g9_NJia0|PgPUxT9x^{SCs=jm>ec(uq&#YE1`r4HVP|b5@@p-iHK*o5 z#_9vjRIN4yBo!bxJ{mGJjvmj`K!LagrsMKot#PpVVK$G(Ek%ghJV?5k-A~Lofi5TZ z%`~8kdmtWVZ2;||NtZ;}fOW3Lnwp_sBLm-|6WE2g#^o=gdc6aS z>eoo@nYU`a?i^*fkxn_Sy}S<}EIr_hRO{m)DB)Re$TQQ37)Q>m0SpFmc0XaAP zb$}m`5DAkLvIF+C1#orE_pA6p=;m)FnE#m`r-MpAsfJC4IQmE#wGkO6TI-*sPBLlsN7hp@q@YN^KSMvM~HUyV7 zffXn?3Z>dVxZN~H^JRB&ajnL^Um3^^UhOv$0)Tvuwd~h?Wor>1iMJ5{_69fnf$l4y zy%9OwEZeKfNZV23^fKwCv1W|h|B&Ep>3ZA$g2KCw3s3xu78BapcbMz*k`dTHS1M)wR=az%ATx{^9B4{I~$1uTr49B*p$u}y2zM@WK3{y6llvmBxOZVH8OPVZq-{} zrlPF5hm|+RE5n%uz5h|nl~awx(p?9qL(r#Yi2EWxox2#8d7qj4ye8EqkB=&oUoz*%k}#_({7HQKct0h8O~I3-U>678i- zH02{7&-^Bt@fHELh4j>yAKbtQpqs;WYzeJ-~HWW3z5cJ~$z~I*A5riM3^5lkpphG(O;dfQ-Id(A(7MKA( zheyQR=q({VBlz3K4i%U~Y#Q~>la|~F;G1a~MK|t3EnFoFFX5O(wXZhsmKrq$NKhO$o z2;sY6>gu(j29-G3%9rZ`fC6KEE1g$FcXtXc#1%!n;=JLsRX56(m5> zL`32l?}2-&wb{v+15_jDsM)y4uIlO;6PDhQU7MMDu zb+{NO0Z?=xUoGR~H>9J-sEz~IKZ!9WLsM{uWN0cWap+u=5t^_?MtWgFs&GPp2`IG@ zT&8R~FrG}J(yX_R6t#0-F6z;xRe)e(6`;B13p@GT7dvM}4@3mlVcQoaBp=WLEKHEr z<|d+~EFc}*#_fkrMkO=91;SjEFw3h0gSjt2TKfd6r9~%#2)7{z;5AInEw0-e_~9SX zRW$)|KGs4Q{r3Y`@esA_UZSPH|NQxk!as5pC&^9uZ`%2|cLJa6{s}#%6ck$5(;=>d zqS!9M>d%b%dV2c4HpIi0<6YAYc06~Mz`{mCf`YH13As8-JWQI;WHww+$71it#<_Ig z94mKZVBpO+D6JU-9zTv8g^Wob@~LZK=X|Zrx^u z5MBr`3Qb#ic+;_$pCNv;;5@+KOFs1`k(0Q*s1o!>M&a2rr-+{i4Q>4*KXOCC8d!}Z;PM2r%V(V{Wr3u?q-^nH8#_A#+TGyS zn@+$Z(cBWvZ;9}8M1VTXxL|lrQvUwq$6xApp+%!1k_l~cp@Hf&UV?$iFYfQ+-XA@qQu@y6RnU1w+FhKU8T`k3 zdremF^c1JBEK9Za47|8txPC6dzKK8Z#)8xw7yTzCCuU1SVH$>5%cLxUoxc6dswv#~ zmo|KZdxM|gFU7l)QZx)(B>g-l_1A9`Q&XHjWj=S_UQ~AW^u8S;?GOIXcg!B#!2l5f zempkc@xa0WzK`pdPt*7x_2~b!2JYXr4)65c)r()^@iQG|dqryWkdIZ$4Jn0x>v}vu zOmpmip03xHG1EP~UgQ7cpG|8Dnm)(e5cSfK9Gza^`R{8GiTa}A<`p}o*808M&Y3lh$8#p}!ywrqr8@i%VBx zj*osNwk1~ffk7Nq9Yt2g-=lIb=3m z2VVw8sp@d=-AfDO!1c2ABO#C$|AUz-w+VFPjeb4wE$R8616=7Q4WZzH1LI)|a3^7! 
zg@+C|HY{X^H+U71H`P&SVRStmOVtQtQiLK9_>NL}Q-K0*r%8gTe$kO7G3#tkpt~cO zr#2P47-94J*V92-7-?&VAIu_47Ru@RZ=*T&;^i=P8B?)1;fb@q9sxm2KQ5X*=dP(( zqCd^e$@%Qhk1#ZH9spb-Gp;Pz3*e!^@!Zx#b(lHWmZW?~2c}A?{NKJ;`rN#1F@XS3 z@&2ps9DOJ^*PDlhVK-bOCAjjz3okaK&8``Rc&Q)5#NI#U#$bq7R>S@)c5ki1ZV9D} z=HFqP5Es{niFIL9VrZz=@_)z8`c^#zuii4@j0z-Tq5MA~zrr1L)#mGEZGJ0Lq7bArIE)JLD;e|Gw(6Groyyk%Ug9C zV1B~6msc(==o9sO?XWJ6Q~^u}OKgLHsR1!wMXT{R_fUamzvbd!ZJ+gkuakrRO-`4$ z-b{N{RaL{SA_P$0LB*c%8(ha~fUQ)Uwd>aH;^g!=3nSe>DH@p>!)Gq~VVx#?=BHoT zlBUHhCuiNO918pmZIK)mmo-sBs?u$Oi}Y-sP1xhdD{;iiO#)1GL*{`35j)|J7&T=* ze@?;uFLpX*J@_zWz}(xf2tK}VWMtn}%vsXrxYd_eE|VA+zR;%3DkP{qvwgh-AG2!inc3FfUb5KbX9PCrL0z3;(*cHSf)|%5)e$AxL4%bAqK)ZJ6TFPX=K&Dib(^1X&}%}>Ynj6lXaTwLub)2+XUH_DgP7qVqNwud z=~G{nGe-X3{EqaLA<=!L1=1fDjSHwKBoh@-DYUPbBII2{Vq!@1ejc7TXp{Xv1|V(^ z=(l>)Usf6b@4rFq3k@fuXvWA*eUmOZ6Jb^&d=VU)S0O3 z>iVK(+>-;qgZo*ig|aEqR4mZ9);cYB8&SW|8Pgp_aM6&_5lktSQ&4D2Mw@u; z>lYB9H8l~~8I7&wD1`zFOg?}A0nFv}L>DtK1C`Y7^ssakHYepbu>A9n?)TWwP!o*6 zd6S-JH+AJ8VoW&|RV+~AJ{7*obo&}=*|%`5UcLIQw=_zJuz(LR*()AwEn9|`QT4KV zc84QLTG2@aC~pf(|3|CL%@{jwf#b8=j#WV4Rk_xL8CW~p6C^HUK%?$f36gA_Fn=8^ z`n7FraK+T#@gELBB{+Zr>e6H&K5s|A1{V}^_f4tzNwiuo!9{puJy#NR1`$@E)k2Od0w16Pz}DHSi1W6Vy+y2) zOmggwwnbajO$BOXi}A!-h6i|gw}`af2HG*PEV8Hu&I4zcmBSdJf>$&Gx;QDHi2LBW zmZ7w8tlK@nMz;qj6fkGG*0`suMBN=N=76uGFbB#K`X-=<(^tE}TSG-Dsy> z;#d^(O~bpR1B-*pCyMAAB0es*zNw05FKq4SFW;zB}(1_zbKqUkBhr3+2g+O_vcdiB$W1xb*e)PdCYzrTM zgapWKRRlK9_^=!};f5 zNDM4JtMK$6yO$)Jl-|?2wSH<}R`K3nKH=jpQzU5^HU{?mF7}q7CQth?O{M>u=c3=Q ztmBY8G^KH$6tVyLt}T;1enJr~C|>0mJv})-x}CDh%At=-IK)U6`Ld=61EbsR(*Z&u2eK@kwA=!M%D1%O z>$(Fra@HAD(F$Dp>Ft>BS!jdAzG^mZL}C-PydW6PPTFsFztI|>Msm6D5 zWraIhPp`CgE>J!K#PF?7F9-TU*1<0PH&{y}3^oHL+ zvfivSzG={372^*yb#$!E?ib}75G-4_o+CrjDSa5dU_C}8fyKqeVKzXQCvjB_TBGQCj<%wfic9sVS=+(0 zQ)cU*irO*Vad!`$QX}#yAbK(8LdqD)>NNk$!2bj2U6hyyQxieVVl?jM8N^*ZSemPM zxik$j*8?_;OEYT4_BNP&S-h;(%L2w#8;QxZ$Rz(n&4bjmto1NFs=A4F0GgkpFQ}yw zodSd_mQ);@Z*k8Fj6*IkfG6bqkSTZ*MvV0vH(sd=0C@__uhw}*V$>y(Djx}wvzx>*fzCL}MaC1PS?25@a=JG{8rzj8Uiri;wS zq1vR>`cq&lr-$doxLOr2*I=X}hT48rnAH|QPZ+!?uK%L($8UXgC1m^PdRAfj7q`Bp zSUCY2IjCFsT!qcmrS$#<&?U|Jrk`PoC1X|+%!v#N@O9wFO+=H>4>^zAbPFPWM!E_z z>ug9>{hE&bj7o^gBaATyAc6-+v=Ca<%eQDJ@Fta`lMed{qJ;w9LJ!zCW&`7=g2w3G zDQKOtKlRg4^sWDo%)*!9a<8xAe}Lw@JkkFb0Br=t#VZUT!^etZLcQ=hU^+6gO>h)p zdE3dL2CEAwEXg=$x!FEkf+fGiFm9X%uw{zcX-3-LAiH-73+FbOx1K>;2U)lp@dsg^ z@GjF^d^=IfL}S_3c9o{_#qSmu--$U)6>1^a`V%lzRewiwwV(ps16)McjR*=_0Xu*9L`D?5;CXLV5! zWm~c6?Vr6%i`$4Uee@RCfHbX~!g*6^!u*mrpwPNrynj3Dr_Wqa^sRI{Ud{n_9NQ za$owD=s^obkxVOdUSf1~R1@Q^u1J8hBRGJT*e2Q-7m~NwdKieT+<6HgVW_K}u1KK$ z=s+?|gf`}MoB$Vfg%`)YuebMO5%FYVuD7?hKU6g|GU7!XW36N$TLa=KJ_dl#{N9lz zeO{3ar=uP{D&UZ!{A~5kv5kbP5zXOEaio3@Z zaZ6;|rCfMD`9@%m>m`H%L5?dFF<% z)$|Qrtgt1CQ5qq%k}Uj&ohvRlB?NkzKS76hJd z0l-*xL@yT3GCMrgB(f;vFIfEufz;;rI9Jw!1#D|cpW|)-z>#D$b-&Fh&q%AR9y9aU zc9SII$sgyYFKL?KUsm_2v&t{K2&xlW4z0xZ0K;7WvhK< zIr(Wi$)q)$dSCwUe;Ld4-G;*Jgdh8^Ewa~ylS|gPEA1xLP`T7X5CJkbCQa^%le+K6 z)e7CK?8*DzI1MyMwljQwi!w`PtGAlAZ1b*P4?2vvhfj&OGUHpW(Z!kauaZ={JfQd0VwTI)nSmW zDB*%yRWY@N=r!j$h$9GNKrT-QK6&B;8t`&mGYZ12q@*)Ab(GOsgS|7VyKQ2!4?$}! 
z4f$b=jDCH`xv@;e+`Rw}ttGb`sEBe9a2#A*l@0myt5k3fVv6iea(&_$C_~kWjQAnG zEBL8mtTJ~FucbZMO2~IIfV%ZL##~|N3gmLW@j_mS`_jS>7lA9aEH|yKX>m+Q>ku{e z=4CRK;tCxmiVSvgh7bZlx@(s&3;KRCIM^36<)J|`H#rWiws%bRMo|)(0t&4m{L=|W z+K446%u3(X5x^2kK>iaIYiId^+x|mZGcOr&jMUtK7ei69aIQ81Sq9>0{bckzECgJNr@_t39|*JmpV!c}lC0YJQDaxU{EY)ELi`#vu5A9o?i zgh+{@+5imeLve9f&JhX@Fqwe7dP56yc zdn1e`Os7>NAxn_PYmawEX#|?mt~5PpynhqtvQQVZjGAeVQ(Qs3HP1b?gmM{(LhxKa zJ3g|%n5hi>_$bUPjpgNc=3b{GNbH$iAx4?#z{=_Zks)}*2|OKcvXfYgd^-8Q#|{G) z!>f-@Ti$WNGioC-?sXDXR<^y=-`j?+Si^=;eTNn|O0BCO7$5h+dtHHQu12rka*~jw z&?AHy?G!N$tPA^7y)%3$?BO6`8AnI)J6I4x5{Gxa4%G*4#6g9iy?eR124IK{oB9N` zEH+RqYX7NLjf)oYh|Q~)s2SsiED_UTD!#r){lLIRjZ+;*MFT;p`H<0U<7xKy;{CO# za~lx$cB~hK^dpiV0vcY@g9Z-1D9^hTX3RD2kUaGK^+eI0(LP$4q= zGRjo%sAFVa<-fv2l=Yh>em9s1v+Uei$EJh0PYW2tvuzQ2T!GgY)wOKWAw`uf-jD`C z4B)c9eS3IDW*x7n4-mMzCr9e{(Xm5zR|&fOW|b&B+|UL#G-*Q=P@sTPaFQs`n~s6_ zhv4O|jv31F?2BKDd9ZM*par+6YJ;s&)MfD%K#yuvPlZMx8p>~!E}UN$aTa2$URQ)d znP*n=CqS=;-rgu3GgJWCA7p1x0D8}FcZVRPW`Hhh?MBgx%}yV)QQPtIW>KNu5K;Q| z70zo#wQKGi`<(s1;@I;RzlH8VBzGYs7VeU&wj5(cEz!4?mEJSI(C5?)?BBD82I_^g zcQ$JO$}fER@-w*nEiV0kb{mT8AJD<_%FET1PBFvCUp`U)fUb@ z=6BuAzOd;fyW~g!IG%ElENg)Gdge>1Z9>&SW>>=2WG<~E$f7cRphaRkzx8@+)u0Ui zt9`FmLZ7r~4St<}xbv!JMaki| zl9D1b90^DqM|bSURQfx>Ftxz1aihrC(dpf&6vS6e|4GebF*)S~IYTMYrqnJ= z%!&)vOr}4gT6XqMHI&_<`H$+5kC)ej5cr*|B`Fl5zQOA{qyA#w9_wc_i)Z1JVIYpj zBqrQ$=w{tw%o%AZ6pYq7p{ExaW6rXQyf}`QjsJcVi8AucKf$Q2B+o>v+=_06j^FuH z>P)6f>&__cO%B~H&aXmvb*`AKqm5bB8fV+mqnEs8EXsP1hNEn)PS6fpaQf01%sT6g zNt-kX0Zwn1@&D(2I!XWrgc}Wk3`fJW3NDqRSsa>-i0!EEJ43}ONrO?_13j6bB9&0Q zZ~y0P4GeBYvpCY+fS$hZ8|n@J0lUMjGtOC(#`WG2#e*49WstSBCl;ltA_-c;U2{(TU*-`xGQI!4KvMueZWpF znkCaX{GZMP*AoDk6}#d6RuGbo5dRiQa~&>te7XJA{cB4r-z z^uyc%!CZ4LP=6kXBhwhf%eDn z1K`rSpu&*$Lw+6aysobsEM-Or4p9|Fzk&975{{WcYurvZfo9Ye!Nl+y7Uq#(xtRV* z>>+Jv4C>k-q2ZzCk%k5*oN*hOu_5mGTdD0g`Xx`|HJ^jpY>vo(!v$Z*~A@L zNNJU8j$0rGU>>#ovaxYF6hfiaD1erMhn+!X@cV2);U(9Fzr+f1k1uD>=uw33lAHW$ z7k;_-{N`#UkVfTDNr&d=<>cH!+Sa?T^c(=cAt4LRpZ@uiX?k>|MrlB!j@IDdg$Y2o z8&O&S=>R6t5PND}#5j;9eF)9usNfR?fB>#-Uz#Xqco-O15hnr(`m^7aFMP@L6KJeo z=cWS%go*I|*E3YK*ZO$jwOs(Ab!3h#{Y1k4Z=8gnjZjWNIHFPLjEdwVajIwuM=(}^IFsXmvjx*z5|MIWp-TUwV zc*rIsy`wK43qKmFRN_~cqmqDt zr0r7c-vXf5Cz8cw{UEGRIt&>&RmKP=2isLm%i&dYbmfo}#L{V08zINFwmugRn)k+7 z2sx}AM~`Yg7FYd%8AU|;RvnlKs<#|b1c`JSOdMTC0&1lDs)`6+*-**~6fhA~B&Ht&7bmnVIW>-QN>HtV z@V7=4o5W_!5qOS?scCf@Mzy)YyY%Q_fu$v^N4dGVm;A|8%G16qyOQZ5jZR|YXw#R2 zto9(8qd5nmMeaj1zy@=jACUfe|9O!d8y02zM-J_e*l9CcKpgk6rvJ@JfPtK~^Dwha z^BBfQd7HLeeDw~OWyC(@#@qytqOF~_q1ERc>htL^YHI@&4a#$eg@l@$J|b~0$%vbN zCC)hYb2Z=}0~29UG8;XJxK!&4gcF+SE?tZL8e1QP`bg&xyJ1h%R_!l(^yQBN1NXi# z4;L_SkNGz4ccWxZ#yX04)XlC=I`+-}WkODC^vJX#92Fzv1ARfbkw($aZo7>a8DFyd z_^Wl^eRam>Yh0S!wpk%-ry9Sk-+Sp}O=D*AHP%k_G9;h+R`ruL#;?P1$2zFVIOs(r zRbMg@7yS;?62?qZas|E6+#Bc4s*J_@*&~DQ*SbL+XyX6>zRayJHWZ1aBk@>Z!T9ratozI`-;^3B@pbaY5QmRw~z$$CcQPb(R4JkSPJ-NE5x%%jG{>&o;z95$9V z$tSU7-m`VNuAq4y7kBay!>b>PSaX{6{Dsv{CnN6R)D zzbGnFPafy;leq)KdVgIrFdpAnzEvkbv&tZJQozp<(LZR{)e4PejKYo$Hc8&J1G;&TsvnvvH* zKZ$&GSrzOw1RGHws@1k4adc+jHx8VQO1|O@r#s1FQ!P1eGe!%Ap}pmdDa$8B*3(O^ zH)XR07K;cIm>X6D1Ef$IFllb2C7Bt6b;)cv5JL$n#Nk6iu>l}#B_>Id<%QlHZZLNU z3f|S%C;D949{m(mP|z}FtG-fXgD5luuRna)@T#~N^W;_|V}U{_d`xAKUDP8fkVpsffhNcX#l{Y3Naz((PKZWs~!&r}n!u8I8rs&n-sv>hZs6bF>t zqzB$!=pT4mcIOHXx!r?ql1yhKv=)A$8;-RdhzdbXy)sRwU?VV|KD2s}Wf&0`{~Ybe+$l!dSI!GB4ia_#f%$>{xphPYa{v27VvgWdG-TaQ zIRO-dH-a?Lb+bSa@^UMj-yo*Xi8liF&--<=uzYfg&yStF#x~|aCOQIzSP8|l@4yl5 zNV3~dV)`Ka`ZS!McZQo-f>Z8x{6hk~2Dl+(vwnHp;0>*o=%#BEJy(_T0l@WtS|>}! 
z%EXTUe9NFMEhTkw_$*&-*_kq)TkHqdSq@h;96R#sPpMbulsro$3cELS?2Z)lB6OfY zq`Uen(feSd(~!_iaAf6h@o*5>G@m|Jsv`eoi{6f4jbqmlk1=^UUA918O?LW0F<|cV7kms1#F6x5RduKo%xM^ksdJKggZo$v zS5jo^VKhgK%*?8T)G~ac<=8keP@V&IVA<57$HXsHZ_C2}Z`q{W%6Ihsk|K^7lFh81 zy&4D|zP%g=bTm97p83)&Yt$!ocj;~1u#;f^jNYd{L~IZVR}a7`(E$^xrluxKr}t}^ zs@=%P;qarF4xk_TxfLJ;TL1U5Ag{PIe`?L#c|+qpTlj}G*35Bz!IirTa5e=okDmF4 zH<8VvkD|5#@u31DA|knLQvLg~oYbtA;%)_MD&l(CZc7SgeZx1#8jQ4|NYs(@k9!6< zgmLrcisD{el4YMhU4Uu<`<)J+m+uQmwj@bX^^3DVf4U#(vAv^{h&4_AKzcigbX8^; z;L|Ns8ehVCl*U#rvLSQdiI2qI9IbIT6%8UuIZAIQ*C`s`{x(*GrM6^Tc;X--Rf-D^ z(qmUhmCPd8)x+=>zXm69GSBI*#ZO?%mOnn@k_W^S@OpzQ>YGiJU+1{T3fBm%YCnv-shu%?e5azb~{PN z2_OCv0bB};zdtj;d7!^|><{$L1L&$RPZI@^wUw3844GqDTUW=I$jrX9gecqR`!Z&~ zN#s0#-m>Z>W0otum*MM;Gd<_Nirw<#Okg>8P5D`6c!#0SkQtMX;eC-{&1YG=^|whf z2J#e_e~+Af@b`<2fkxeWq2oC@V?Tk07(kGFP1(`mp1W#lXq24!}Cn`0%4u&CmmpDUsS{ z@MOe+uz23VBIP^o+>gVDLh+Vx$ox@Cq2Ns3+{|MJNyJ8kh_U;Qe{#X8Ztzg~4}{#i zAi?G7MdA`^nbipl`TR*&*s*1CJ|9@*}Q0$WSJ3uB2lpl)7`3R&+LfJ+1HE~Fo=!jR@yCalad zZL30_Kwf$j<||6zCq3v^Pr&j`JBzvG^Z+6f$25467(QgfD+#m{#K_N$5nGs0m z=SWupGPSQDSUcSl){$~N`&y_Ipy_3xEENm&W4a8~fv#9QQ(MR3s98@`nW7Fe>%sG^ zpdey?JqUGu1YHQg0IoNKi+m+6Mt7jqKVyyM^tbtPoH@0!R2N3{-HhmpJb;Z&92%ie z0H?cSCA`Mp%v%&gjur$xG3bPELN=*4a~uDka0-(V)whwvO}}xYCuGed@D-6Oy!($S z!mFUM+PXC6iSjm-OJ>UQ7zn5=puP+vNL=IB2I&JCM~2EB|JnQaQy>(kvP+RgOotjN z){5f~|hO@|w7&EkQl;GDsXs8roTAziGE4lLGNo-05S!mS?B$yK{(Nhr*iBLj=TiA-)k z85*2>Eq1aB3R{H_i|@Ky=Cx^0(iG9>7Ed2S*`u9^(~79y^z^x;>}({l+JfxE6Db^7Z^ zp|O1APY>Q%um$*>B!(R!cf(;0Sedc|vYOt~b=5%XpyaR?F#C?K2NH{C(uIIN6=!7? zUk5kQ;_&tN0G9PD-I`t2Cc@zZ$E$_s{C#~lw=pJaG@atWFh;yL=W{#+)=NXDoQLAa z@*Qb#?S~%0FZ&Rcr=(%cyLTKktJmv)8-+_%TZjRlr6ri*Ho_cmLVn?1ab8pN44HV= z*$iCuEU4Mw^RSYnYUTIvqSkV+sN{wQBkjs_+XAG>1OSaTxhiHm;hN_Ud=0|z#wO$v zauYLjx(z~*tr2!c!^0x&E6!5{zE{n#=Oo23_;L5cI-1iP^LvtL%wPMtw!Y~OkP?pmy9c_YmioXJ82W$O7 z&U(f}) zSIhwAwet2eU;&Ky%n;77y>(d_@2c{!1IWt1Dv&}^K>C`8Xg}jQmEcL3c^M53 zd)m1}VZOO$?fxN7#Sd_tNInSa@M($J?mA{hhg&l-{%xKg4({UMI;2G%H7f?`rU_Yb zFcMHOFq&8KT7ohE=;*?0c#hkTR^@}*;3YFfpNeYf=uD|Q$?giNLCp8;?6%r~{mt-P z)=jJI^Bqz?@Mcn(`$CsR&Wr$OjbSA8|w01Ekk!Zh)cWHODwC2 zrv#dV)gb0o#&O7OKnNTmwc$R=yeOfWed&Ik-49+~UIRRjfYp$k-psT(w&1`8X35`` zfnBHf>wS4*S#p3?Mkpd<5eF1 zbM#;3Tp4-|v_KqGDl4#WQW1jqpu-(h(8F*g+rxl`1 z@20Y&9;>Bmt217xuzh6c@Wyy)Z{F{?Av#JXog69V^y3da`6;|L^y1z6<2$z$i61+9 zl(guy&9_mN<2FA5rwAG?cgV(}gI-56?|Y$<*$yix6Bv{gIJP{Xxi`VV{O;1=OD!mm zHiD-hr!2f4u7G7RWHb7F>TBE>EUQb0-Tu1B74x0p6;Ga9@o-3=zTXqtr-#{(42|4a z7#w>hlisPNsI4L4l)s%Jgx$r#abjYD^0$2yty)NPKG6P~5Ie!_Y%-Y}iX(!Njco%C z3?d?gAOnBMCvoxQ!`at93gx@sIvTlLqV|>DS*0Sn{r{orJHV;_+y75?kw`^lXG9s5 zsFRUZ$S71uW(z5aj5Lg7lr3au9-~l5MP`yx3HhcXBP$6VQRM%6`##U__xoSh_qnb| zg>%m5^LgL*{aSbT-jTnZc8(J|0zQrYLio`;tuqss;Vc4{uSNphMdajGK;!ZSy`SgA zRTqsAI=N;ql|;pM{IKyQ7i&xD$;qsgvnGt)6Fsa6`V!CWjf{y`8*!4HWKj#Y+K{cp z$4;EMmA7}8f(1)2j*OsxUP+&Okk?4HnOtsAUTWs);t*KKP? 
zm%PI^b_pic=#ZO!~GBSm2ysuH4*#LYm-?(ZSDw%Lx zGm;`W z)WM3j-ssyx^$)6juP^nz$y_UGmiq9(fk|wr?RlB7?FXzhlEfMG&h+;9W#3f{ zfwnumTHVRk?;Y(+V4mve;#-H7s)b(C-~7Zmqre`^K=;{hw6g=H@`Gl36`4bPS)RGNh8f21!5;e5^2Ry?nL8*bTq=@FAA@~ z#;mEi+4IH~uE!k(Dl84LOG5V4XLA#X15O*fcIBQvH7@pW+#7r0(p>5m%QlgE>2F2h^7&1NQXHY5ix)oGc zvF7R8z=(4D-%VfACrP4+!Ur)*4+#w{X9}zLpH|h)ym(f3ETC)VKH$8 z_+ENg6L#J3?{v+K!D2p^{4!mM=t$_Qke7@W3ft_O@|+*R{vnkN<8aByHW(wGW4>_r z9(kL=iVayA9j@-~;y_8W+#ugE;1>{}x#ON{LT06#H`qPF{NF?0zyD3ot7eD&t3Y`> zwmi8QW$R=357L*&Xeo#262+DK0va%L|;4*%?_a& zdoLhv?dihbBjdjjEm(s|wYhBx1qptPDDj4ak03Rw#nVnxp8fNu4j&o)?l|=2IMZTi zb@CkiJ6tG0wcYQkLG?=5F1;L7K!Rsi9`CeNng{IfXA~Bhkgyc}X8(W%XL<&hcGEH8 zI(ab4OV!o-9ie#!xg*Qx*z*>Ciy7Qi%oi2jy@A+V9W1+k!d7OhC!R|v3zyU@F%Xb- zlbSHyXkW-pg`kSaaVPYTD5#n7t^4N3%|z4bX&yk|t_*6M6|){JhvH?it~{$(GsDQ` zhdxssTh*}db8iX2Yx#@Qrz1bb8sO)Bc|2alhyj(TcX2b9w%s^F+W#-rzO0~#!jXwl z2aE2F%RPuWUygA<0Exz{ArnFV9dN}T198@Vsi~>Yzjtnd4F43OQV7gRWb0)DP|_}A zS9LsSh;!No&%0@P^Xg^HK2xuhSX>%}@Nr$kX_Qw2De+bGC&QLNO;pIsh$T%e8Q(u_ zI7}caGFC`;LBfE*=eX*Bb?yL*O^@SdTmhRxTZJrkeO#XT!YVC*a|^g%&h+zG2n8?y z_-E+X#Lu4rV7Y%hx}vG#c-+BZDW={4q<^eGc0~ht6eGdPfC4i^p^x_j81Dx>SAIx@ zdH(&O2{Pp%ee%UqAMBjPC-T?^Lky7Vw9B{tYL2f0N*e#${10vpvTeux?!`aTGJxvC z^k8oL>fh)Hw3ci2YVEA;z~zY_v&9`FB_x`H(#Zg}NC-FW#jWeOxHVv8bq^w3FOVdL zR*nw7O59u74Ij*Kvib;aLb5wWIvF$LZphYie{I4Nswcj92uxxLb&jCi(+}NgKmQ;z*3j)RPX#Fc8K=lhc8tP6Tpq zZiO=BQMbbF;AnI(&r}4D?w5APv8q`!=&Xjt?9MafWo}g?W$Ou`z?fcX&J|Aw;;Ol^m_#cRGhQ82gUd}erIix9ZX}+=;kMg2>Pk85 z_*!xo?cd=nbKC49)ba; z2_yS*{^Ex{)fxO)FB9!9Y47Z8aN>1}Oc{Nvd6m!3{NcIJXFswpTNdJur0qSB9=2HI zpj+(+EzKiV2JcEqiX9gv2*#HRmoI+8DY(A%j&CN;ATDvK7tX_C?DVj053V|bvIy#> zSh%*JuRES`qrbq^S&7l|v%{>Z&1A9_?$(#u2-O)#U+)fVP7(dLKR&tlcBo8hd( zOZ@XFv_ESf7!Dl?u!G*Wpb|stRxF2-Nk#|15<_-}1%hY#Vbis+02yXqh$!#9MC6b) z1u}6ZCb13+2#!PnPB&0-88UIA6p--TCcu=;JI}yVBZ_zw+p(s1@9wufSRiZaTq}}F zJ0J)sb^D8K0%I7iI;c3YH$e2pEoS6%z|8FV$lhX)P{fCtBfR=n2N*ANgI9BBkf$P` zyKf(l6k}lHDOKT(+Nui7G*x5>5{#bV;g>krNJ%Wx=oqOqj-`)g#u3#>%sWoW$m^(q zzs2nA-a4G%J?LHw$}d9A3et92&(PE1vq)KL@_r1!+Bm#6d2xD0ac(zlv4l1Hi|Jrr zUmq=dB6cyv17}N+v;THHw8S6O+jv-Ywx5THe=$l2eY;_ezJttKMGz+y7qUHr=G5#g zl+0uYvcA9d3vr352yG`MUTP!esXQJl)lZ_0&PM;p0V+2?bvd{wV^s?K~ zCBO$Xq2S~Dpb|%%>iy5M$jJyN4;!s+!KsY*}_ujyjh zL~Kdm__zcGIVlukP=O4`KR-YJc}svg&9oNwWa5aiJEHy~?bx4<*QQ%zYJ4_m_>sy* zq3q(Vf;RCQgn6OP{>wDjpl_w%&0ghVfbstg!ZR=hrFV8PHGpb>UTZo26hBFo@^_=1 znS6Ip1Qw)6Ztk2}yXK|RQ@ml5E7z<}<06y%=G|;d&;j#d{Wea&>*td(G)17W;j#0_ zuOkp6TGy%GQplb0sgc`)j6viI&N*`i+7cj+Torkx#(xH~LRP&NF*zfb6@S*GF=o_6 z7kim>)|{T>VE>wU1-S2TVI54Vfcx+c+apUTF zYDh;~#SaK`)iHufeu9)AY=a^!Wcp4F>>6wZ=;sL9){=rqI%d3}G*juFPQtY!4WU>f zu_Ml|ikl2mwP6)FaI&vp)s=W7X%zLlo;|jJt}(jF#HAqa4v8*)`q_`2Xox~Y+FDZd z1*irItUgE$w1r?IF#7u+Xgd?XdF*>AdaWw0Uwdtx{GIZ+>>GDOZC>&S#7i#Y_~Gsr zJwQauzrU%x>h0~-AkzKf!nh_>8(ruO%p1#*w+k>Y#2pW66Nxym$icLVCR8h;^Uj?+ z&tV2NyGZnd!9hV>_3sc@u7iLf^NY8U`ip&AZ(2W;9A|e*hQB+Cw7Ly%7(Zld30bNz@Q+2=S0%rBDd zVqQObd~bD-ehIUXxf5Qq(LPD+v*TE=c%!rh?!+b>5X?!{KiJtj>*sJE|P)qE$ z+&7B5_+Nk$!MeJWllxTpv3^CEqN}BDI;r*I>X#bdy*q&m90ThXV9S^H;<&xNCS;e= zWzLQO6}9sY73!^CkDdX!j$%qmDn0BKB3mt$0 zK^p185L|9@U$S3NLsL^1O%lf;p@4q$oo!*824_LQ*wKG& zz{iA~@>EP3B(-Mi)~Gz(Rx&d)Gnf~oJ~5m{CJ_Rg3`|$RSal7eMGb_8!O^UN+w|V} zeq<5W;C5CoJminm)w7Kz8Ey{)*5vIZrieirncoDdbfd913e%#V^pBMV}( zezDVm`?c}#62u7_l!O0FMkFP1pbs}ao}83aU00_$?yY!#ga88L8Bt_bK=zDuGKjO? 
za4pg>KL8N_0cIqx1j9UXaZnauv-+$p7vY9&u#)}!0neey^^9j zb2JvdA#@+!{XfEs~C zogY8PWr4%W`E_oW6<1Fsw+wm64qIQ+p1<=705@|geE=VEo7@^k3Kpp=V*KK2Gl4d2 zadIR1pd&t!M?#RS9}WELKqEq;4o4x4++-BR1jodIryD(xv{KWDGsK=gX?5&arXgWX zT9J|Hm`pTrwUsf41W%2o@)6l>6DcKzhG8|6+^bfZK3M>VzW(&nXS-qKTYll&rj?GR zw+Z$0s1kgaC<3cnX{&{Vj4VgEx$cFvrA8|4|^5D_GwPKF_R(XP2yO z*bRYQw@P!+%tS-3sICl^;JMp+?trCOD))cUKu-lH#q!Zk)QH0)^_i=z}3l>VLn zWnI%tmKi#!K~9!+LOb~8O>v^XEGz|qQ^VSj&CvC6oi|n|h(ZydV5a(6$P?oMANjB# zqu{vkannlFNc&_VBcs>G_xbni*;8_R$&XJZ2$C(V*^3r^7EG4;DBjRg#Zqs~B3Kf` z>|}(`Ujk3GpQoO_z8H2log8z(ni8>{B>EiZWq~{JK;}d%Yt&*)a3#|?*i02yzUSb|8F)Irt`*FRLIFly?NxZ&_j*-xwsRxZ(LP-ay;u z&}%SF{6v85xoNSV5AF|KYrZ!7_&635Sy)(pZDg4lU(c*z>*Unjg!G2;TbyxWUylkv zycuFMk+9NoG+iXlNOe}LNN9)N(nkk5#IR-lB1JGDB8UB?-u_4Qfrx&oi*U9ux!z7#sZ0mPIxeO3_ivn?&s+} zH?!rW5&FE=R-A*Y2kj+S`8~BX+|klmX8 zZsL2RU=2QnhA&}&Kx}w6i2-s5OJ-gsDy;wuYxXoJ-BddCsT20)+MsJm@=06=L9{RG zW3jXF%Ep*Kl2U%NiULgYSB0zkd}CNmTFO_ADv8Ph!0|=XIC9j8%@;c9Y7BzH2(!cf z6-v)j>mf)zIG{JJmgji*_;DS&_;kz>U|rPA-UAX45fx=X02oQY_!WO6a%gC%H~fM~ z>+X=TEU8~c#!OSrR5t35Y7b>DEhV$3k-s9F2W*_2jwm(;JTYOzSkVLV4T)RS?O5rC?ifG<)KS8BcA@^-tYqFDAvHQmUN zTW5<$%Olu+yH=nF0ZSQ6zX1MY<+lq@0g-_Hy#Kica^xZhSqNJ&o|k;SizXRMh^4M> z_)E;i02S|lHhg~$(EXu_=AzSRk2pu2PfgJ2$188@ODx*UuAIMc;j8H<N%zN@(1O zBT;TTI#+QfLJYlx=#SAoZhW5BrwP+217wBvZ!YOw13ScxR!_Wad34~ui?}DvM?S$o zs#J9UzJ1|A?0Ac49%#6dn3(8~6e}l^dY=oPWpwNmRu+BzDI;|Ey`lm<$5*k-WpoU; z2sRB_jdst?&6$n5s2D1Mtwq~WP=3FpWaYQSeWWRYC#|CkjvRtUbUgL)^jxKOp_xT? zP9gEm8_-3gPICth+l0H0`PDnzbU_Ho zDV2n8xCHx!)l~#`xLhR%`J)8{GO-Zfin}8y-r|u)JRX})od!{q3E@lI>>*&I5R&adwUJpaT(kyiqA15Cf>^+f5^_NQ>j^uZCjB`7a{*lug_ z>0^VH`srPM(RgIwBn!aP7dm;$pYPAAXkw9vM34631I$_wmC^57gXTjpWPl#A>b`+! zFL@NhjxCmLm{K9M2qfK5tjr2x0SC-+Al4a`gUD?$6wClmBf5Umag=dgJv}|CWL%x1+e=#!0fd%VHzx`h44a;m89T_C z>N`4aQLA87h=L~ZRx5y8abQ_ls%LOKl6+ZFt(Srs4s_hHjkE)2u`y1s0f&0od&eAKG zLLJ-giVoOL;N~7pgol}rg4o~c&?+WF6yBoHpBLaD&(j7pvOKwV>zCHcJ(Jy{am~8e zJ{Q)j-{vd%>!4C%WWUAOMdJz9z|+j$2klmn8)8w zw!v&~2S=nuJ}#Kw01Zng3g(|Ap0OMu8#aO6m4v~j6-wwYGlCvLu3FS_Ksxpq=3~(l zS2;7hDgcfQvU~~)M5QVDoLy8DF|vlAx6N^TPMjEG1HRx)0>%&i5DH5u-ms$QOzKxS z#xedoB}1f%;ps$WaxrKb{Yw|2<$QYisAI?zl!p$Iq?{FI0>2X~Lve@q?4&pb*^L^q$-0QC+rwBY$D5D7LCmAa@{p3^MYBaN=KuMsl~7R>07>8*3 zV&&3fs}cX9`95icq!P})RJ#+7!$bq;AmoPHlO8M3y4$>nkxII(y(c0hXqDWgL8aPO z`poA>gvqQPPIJjpzIkoc)TV}KuVcPUgCnQ;|zc&e%yV>nm&ZPYAYDs(v90XsP-2vLP~K>(8N`A=9VDj z7zphG%K;03IA#dQu(>&+p7|-<=fEg}of*|{ut31}ucE3hAi~c6fFO~{`QzIM)U%9h zprrnS;})mT3P?XGsN+8%_=h6WPybo>@@a^~9(qklV5}uF{GVf8zx)SA+w)lg?hPoI z2>F>RWO_!P9hr}$HU3iF@k3rje=NaAO7U_w2O9Ms4Vd|Qpz=jjzkx;=x+kKo z28&115pzvx#tGIs5&^q`xZpNlLqt13U;ZNub3p<(q{j@9hO=P-3%P7ymfc23jvD&B zYjW`$qO=^xvfAb-@qg)8|N9z#B=`gv6{(60kTMo6Y1D)6d=cCyZW5diyJ@C=ma^~j zPIA6Ey-l!_l^4|))2^5n8!9M7E zP#D*|e24Ma{7UQRPoI)u4p`a?AzK7-bQh=~0__G;LoD}=;@#E*EG+Fn*A{G1DBIBl zg(!%&6NT>}c$|b7MSW6C8?$XwPmRC%paYIou?-tE_l-h6Q?S@S;tfA=gRDaxgHD5m z6ZkAKHMQ=D;S4`#{)c>nwelQ9pEL}CH@1&hPE0Hy!tI=nM#@AK7&5&L8Ot;p{qGhaijs?dMMN1%ey$!@U>VN+2hq z3v?GieTe%^PD#IZcM~0*pf?VL;ypef1-Y?{k?Ux-iL-&Z|J;z8;v)9kfb-_ zGW@&?6>;Weg_@QYPP|*UyPJSG(@&SF_Pg!`e7iq(`*uQk;%3V$0AmH`(vT&T?IN2t zh3f5}V!p*ne(&eDHP+e0&s z2RtT0NP`WbO9+R(x0`#ZCS**M-3b#*C?LW{D{rnvXbf&qQ+gBPfXDi9rd6 zf^qvedH`P(R(e@lqi&jwldh!fhR*!-1 z9FE$dw#7jY%VtVjz%lBWn#QCRV>$W_x1l$_kHQcJ=?|`sZ6sI@_{?+6Nwg_!R=Bis z-MSa~tRK!Z@LFVgkD6hFzez7C&Hj!Uob_FMuhC zCMp6d2xFpuX`^E{eCGEm{)G~1IU$rs*Xr^Gia=1frpr44=JjykipAfHNY+SHN{Uqh zK9;>zf4=YPpu&#Vk46;TP8g^;?kwXbDmdF|hhe~mkti+qJJ$Y(R}?G5&x-;BRy?f$ zv0&2*Rl*`7!_=%19r-q^oVpW_uUNWlsTW-Zl5v-l9E2bID&D5@M#dXpLVDL$ilXFa zxHFVb5>ifKbWSIarb5P&wSE0DR0pCFbV>#$WZ7^vZA3WR2F2YOAH}7eJwr!K2 zXKyG(NiDOOwvN*b$vpw5b7)x$p^7_??=uj 
zY66q6L+yOu3{*HcehKx*fa}uTJ6ar#U3cB3W^-D5aZ{gq&ONB{5oYuFF_-Rt?=P?V z3r+1uDccZjC?&*HwehuEdb~ABe?UB<4t$WlG0Qmx$yfM~1_cEVgIWY}?FiTU&&$+G zuBNGesjdsy_*x)5#B`V&|0q+rhlhnN)im>W0QuM56L?b9xxUm0jdBI7Cmn&ykE{Rv zem@sRB9#8i07L)(`H5>^RcD9_th(E?OYj-{S0p;rdsgofT>P2G6d1!%>5Hp&*z>W# z=qnbn;V(35vH>~ zI93QW4`jAh6!I#y^> z3T@$gMV3YvAze8&D0%Ja!!vjuf}Im?gDQyz3})QeQFZC!B>FI8DkLuw5Q60_P6<#S zt`@O73w8O$O2PXX3ZEEy|S8_^(kY!y^%iVhyj*KPBQ&_=Kh5fD#q; zLr`5XRS~d23C2Dv6*9-wH0%$JK>R{rY#gSJr6YSowD#_e1|hqDFc)^8e$->$E=AD; zm~HoBSk^k4iuQ63+J$|?*iaIK-SsIJIXDK70nj1iSRhy5qo>=@2Y{pw0AK69z)mTG zzj{ky$ybc&e%L^{;u-z>QwSQ{{)FzssSIxr59LJ*iCZNQM|xk$y6J%;fScRrx@rG1+WQi1eUY;o%7P7Ylv$BXCA}ue`cp4p+sE$ zOmZ4{dIQw`_pF~CnvR`^GC)_qO=jgSK*Tga$XxLPvoM2n)5FE zfLewkHOpiW?O730)AGEB`}!Ec{3slYPulWmQESRebUEDr;vsymaGU<83h~M9K#|JS z=ouT6DPv}O%P=YifMfaj;^o4vU$Oei4{j3^m3$Jsbd^X^!bq8j z2>YtVmCJSe+LFu;q_&mD18o-3AbTsep%g(~U8o+hYxcidfcNhJzx1HTGkqNd^+V)4 zjzfVY3|by89>C^0TL6UcbotewmifYvhEHfL3q8CgmPp*|K&uC;Ga#?hviytyQ4;@{ zLBF#ckc#Qp07OA_PF1WuVCcT|OMnuSoP&9w7FK|t-i}Zi#bVPTAgk&ZFZ6O+(6=o@ za;1MC^u~JqgnM&w;+yQN5M(;oyiVpMbN%>mlxLE|5wFcg2G-!%+b2X^H2AT{4lBnzuLMMjvkb2fHbX%gZ4mEiac;oBzpkZ&s!W zz|pI`*MwL^i3VV6tcV-HZ7Y(A(EyrxWG=hrhu$p2x!a~FL2>*On&t`Q)VG>fdT-BH z|NOY2w$=}veWBUjjJI4MeLJYIOyC6A+x!l1+T8ul_#6M zU{OBY*9zU|XITXJQIuM*?Ryfyg&zCO?{W;Tz=H}(u|;K{Kn^h54l4G|$K?F$VJRpD zf+fDwBphcbblM@9ODPl^1V@mSHf78GYqKg5$FtweESG-PvMU=?XUzwRqt08Fi+4Fe zHYjbi&Q$s!O*O#jiFkOlUX~wzTL%eWOA6yDRE$%N9xYv7D@2|@?ug!NUQNR%44i+P z!xKspPgq%5iwlKU^Fw%@g>kauB1`zBN|Llfu`{6r}E4#fMglRO{5RSji+ z!a`ah^Cr6b6<4;pivbU-U`e1SdWLKE=~~a5yeu6$V`I12es*6sjC)ltdxr)Fl>p#(sm5uz{Q7EATFd5eL@K;VN^uKZj4V2 zcUz+RfVUK!AL;8e$#d3&7Zxdoc7^U(;*FeZF`d&y%uSw5vfSNqQGtb77gIAbkuW2+ zwi1>dmX*>&K0Z%w8Dgk1f9N9Ol#EAe*#$v3*rG^BJpB_0do&)r#mBaq@olJHmMDTE zOeMELO@*>>mY%V{lJ0R!^G%ngU%o zKRuiC<1xbFb!rjG2bwCv{ETvP#)TAnUc4BMo+TPT6n}4;e%bR&EpV6Iu54V9D@B!9 z0U#5*2EQCFv7gX-{8$)+42ejMNV%!*sQfr@*h^cWAOT|0F&b>J5OeERw#nYYL(ycH zorM$fcf3_5u2~dH0H#U{BMF0^m-Q%tTt53b7n@ZyowhZEdHDA$Zhq*A|5KYqcd*Qc z?M3ZZ_Wl`^R{-u0S|7G=L25*+jaU%Vh)MmX zEAw<-SXwe$-l5a{x5C0vJ&D6>{~%d)2Dg=AKoEDmTPK!KG5tOTzs>Ox3`@Pv)Lv?oFhPTi>INfla@&a#?cD#LL8XHioixfeE5T9!+<9JJV|uLNh1w|U86t%o5Hc|?pDj9UZxnIs!>GejHsMk`1!@1 zx5Z=Y#iK4&B=u~5S|Ig0PI*grU_;x~(XFZ`;fPPEvvnK`ZtJC;8Y`zYN^WNo@a(Bk zfU3GLUD;nt)0e)KsPk+&=6UR%5ShjcCy4c*`ERjov%jPP`q7h5b0L6FE~>K6msjOm zjnCrd&jKSdc5e6&daF_5q3lxsO7LYl7KhG0`yGJP3*~=t`z>y@dv&%ItF86HoZJNe8u8C zY>3R>nb}!~`7Y8^j^g;L2D@o~_T(wZC)nWnE+}7*RG!X%{#m{7sWc1IVqo@_CCS(GpeqOsUPES{+Rgb;6a37)UUr@^5A zJ9ZPD)#zD5Ni17tSBUqs{>Q|GZDq0uVbF1;mV9o(R0CPC_|=SMS0hrjB?G|caJ51a znD_eiHV#eDnR*yY#U995E#Bfj6U?@JfzqniG}PfN+nkF8Eiqya8u2+yGrl&5BsV|r z-e}-9CW8<%ZbdS8G8Le?1p2BBoXl9;x5>5d*>Z%o@ycd)ETNFnOwKB}vV{Rz<+S`F zp%xhFrINuGZ-oe9gdc%((R0|OC#rP=;4J}@O5?z2uIk#l9ZBM>6F+{WoiTxX)3nk*5JSgWKK5up5yL26{QTo4EM+a?dBDe@3x3@v`F!e3!g+ z%TDYT1ng}bi=$N%S5I^B(a_MT9axAO8-}S1apB<~Huwo=fo~%zIwO;o-+MxuAj1=u zmAy+mxWw&O4K=Te(bM_q^^kU�n^T$|Dg#*Jkgx-!E~FTm{`y@g1!PM6G&R7+md7 z!NXFzw;1b9192Sfk1a)b^G~;>lR_+O|EF#n7jyp< zS>4p6JNO6@e$}9zbKSCyQaN$3zpIc%@;nYW_4%+yG=22DFo0~OpU);nOS~fjiDR~8 zMC~v=j;5i|gCz+kT%8{?d#)RP5DPRT?T=Gl-T#1@+B-D^h_5RB{4C1lH5eiHkH>Ao zGkg_a9*F&f=@N=2t-(jgAS7#_P|baOdZ3r^e$UFxTmue)_JBl2Lb&p#;ybjWX_w2> zp_Fk!oK0Ry&>0}nwAA@~u>2vWfc+aAF|5dA4$Uip*UcB-jz6xOq9H9r7gVHJ_n?`LC%mO*&%Jtipy&L$W6*A8MZjKL#!a1*TOdLWCWZ{ zHjDmq*aUGlFqOEs(Apha0W5V`078-)7zSHY;I4{bFyj<1Di81?fqm*KxtqzO@#Vt^O>=<-_7{nq4{?6BA>#gO!lYqFA zNWZgpUo%t@i)Hy%3kioA{DZbyS?JPmD>enRczLJdKy%4s-bq~7 zU(2mV8ouM6F^qP#JbveH8oa+?H>j&)kK^XfEYp!6qQS>*=Ihpjql-!Itk1)DD?_Hr?vBIJ{H- z_WJ#pWf!a^9l+JGEZa!Cs^Uro0Rr%?0f+75dsCuzP|nj>Yh(iPGLs8PVsUsuZ{(YF 
zele>=X6*m@XLcj6H}9LD+g^&dW&EBOAp;2bK4_FjfZ_K-HW(aLoKa!kMYrnk~Lj- zS*UxO`Wb2?DNHvlqwKTHGc-9{Z(9=5}wS^x24k_B$wq(!;n zEsb;=@~j`sX~B>cj4+GRFiTlU@UH2~x3e{Ygqy!_Zfk2Zrb3c_?5G^<x6(eCF{ zN$DCPc?Is37zrc^+=h5Sk{2<3%+^*Nc6)0*B8WjlvwsvG;a2DYlT#kK4cC5nha+8Y z7$~Q2X$}VciIlz&4PwGo-G)$_{YFS`Ef}rdD|AULHwQF#$(cm5QYhI1QsOUJ&b>Tk zmrkIqavyn#Xb<;PFKpwh>5E=Rn)FX6v0k0+>va(=zh7-Pf{^#H8{O4G$kdK_&dn5) z+Jg}eq7H+D(|}iz{mYm4(K&O??!gjfL%n?8Lm&oNcw1I1{L3ors{qCdY5Z6+x`Fwirg?#SHdM^*!L5=5Nc zLWh>o7ifU-*m?Z^A}m6$992P+>pun02`uxH+w}{U%cUqabwY8Vu**LxCNPXZlL8oux!EcRj z{4UlqxUuDSeN^Ygx|OGO3WPRyZrZ5Ft3Z(m7P8_|CnxWbG8!~8 zaO>nlEd-&?>s{k$a!Kgn%3+vDYoH?uzE#LcIzco9ca9@8OKoQJcYJzeGnAGE`YV!m zb?10~9jZrATrxrhhK7Dv|JsQjZ{qwIusAe^I&+m8NVO4$5rJE#3o~XeR(g&$kmGtUaMg8aQgvtc zhzJ8I!W6cTKM`!^nSV;I9d-h*c*@RBqhLD{w8;81g(_H~0`p`~zKpNQKabE(5}Q*@ zradcINLxFhr1XcQuztdQ{Ih@%>c9FRZW%s&gaueL>yP8TkNyCs!vp9fTiyZj--3WD zzs1#T5V6%$(Um*|lJoZboGPy6Gf)QHgEX9`o;VM!3ULXI+Xrxv)a9QCdi4{U_0NN7 zAILT-^Y0(|Ea3xaSQJSuBewEMwF$7Q(LJ0YIzRjzpW zum=r(yK^Gj&0$ZV;jxZ8ic0*FAr6mf0RTdxu}!IztGLPecz8DGo(8158mMN`}iIXm?V~f0e?i(r26)PIix|uj( z6Y8O1F{AYTu+K8m++p>Q7La}T2t{%kZeznS8hXbWFho)gXhS)WE=MaRE30)+2Q8$0 ze9Pf`MA0B#UR_rb%IQP}2aQ4=KE5DmX=BC8fHb86p)7^_i7J&0=l#GJa?BQ*Ak<_I zcX5nzA(ni32kobj{jSo>xn~hMl^6ZEgovXNZoNUb3P$-o?&_JR9kIz6IPD=_Cn$T{ zsE-~!T1{=hN~W|Cw-^J6*FB+|)SY|)O;sJI@qtebw z@f`nkD4$~MqxrdAe=yGcviYrwOUIQoK6&^N!JTxPk)x`65>2z88-;@I_Lt$_^!;pi z$3YzLEBOSznz<1Y{vF~nBCP!-y3zqjlqD4GdypDlL<&Y-F$hN0D}vvtp}V0tcu+CO z__^ci=JeA57y3=bL`F^+!ez6ki1aVwO}~JWF%LT6-|AU_pd{QnEhF>3UBj{(uaKai z2r)uS^7b#K#9V$YE+V4tQq^qzW@RWc^=gV`kd>?pba7uq82H$z7WBx>gS5F|`B&r7 zPt*|qxI48%|VZy{z~+vir>C`aNBJJnR)?Pd|hyHs)^;Sg}*MIW^_vDKXd6re>G&Z%xj2 zl+87(2v+>b(C+Gu-1Q;p>;)e?ou#9m`-K~mBhwR}7jqw{7h2RmVpvf+qp&4k=rK70WtXo^!G96nuc!v1U5#y7$|%U4pK!%gfUL^Yd{dJWn6$ zf4FU{0?*F2^Ire`ceb-9Oj(^71b^#Ln>NYNepQ#`IdFZYD8u6GU`I5Pj-_No1wy0XLvItkY49||hl1QoLTCScq4 z4dYBc8rEk6X_8(lS$R#I==1YL|7vY*Jwn$ZNx{e$5Ad$Uq`w=GjhKs=5zH~n@cy@r z7Mc}n3@-rR-Hls_=I;G-XbG5;!@p01&upVUd-ja1df0TS196Y&L$W)5!+T$Ycw877r z{KiTk+trRK0cZ-=-MD$Pdw#sXpUE1CTB#w9*)Le-m}M0n0Ng?YYB;(#$ymVzFMUZO z^DF)5PaCi=dZ}a&1bot^2!Z8m%Re0Mt+FzpV~*@)!}}QjTV7N|Bo+HdL5T>laPsTm z4Wb6&w=z|oPn_Itq&M4l`^{~{lO4WxJ`roOeJvW1NRKEYgUXa_)Juc-9p^;p-Vigc zaXXXYn)j=2Fz+}_zV&JwUZ3NKUH9hUL3}Em!0>xh(}VzH;Ir0S*9C6i&Fk0Uo$r7z z$V1X?Y~*?jU4!ZNBCKVyO2%+u1M6?lVv;J^yKs*zLdaDfgAnBKAOx3uXnqPvnC`!| zDm68nND#taAr%ojFeH33xR6;Bh$dyjbpP1cP2d(!ECAG=!XQ#W+_jKO#wb*Qs55S_ zM&d(L{(0=B`-Z(F=T2jWZa_gry+c!81kq1@3eMB(0F6-kOG;e8Rn!xmgC`(7Dmu517-YAz1MGfUE|WCnIZ&l=h$ePhaDdnIO4TVTGtnp7>o>g~+iC&6D-` z)`@Nay)m|zOH4zpABb3o*sTV9*nI?nVtugz$YQK4*fa#|e;=92?9Y36G(vT+Yk%?Y z+)hpegmjuft_mV4_ArtC;xk0FhgC8TCR=ikl457I{X^VKrtw>6Lj zL^|vG*)wm_mH_Iq`cxL&!?k_%Z#(Ft;J7g&2T$@t^TqbGniQEQ@f z8c~#x$?$5I9L)YCSt8RODx;w@>@u(bB0Np=K>?PQAi<-cbru#6f2apU3(?c~1_v*F zThi|Pz`1t^C&Ekj4{u;4@!*(*xEv}TKW2aQ+h`vNO32FMmKnvu0<@t|rqHQDt9e@o zH}8YmgLA(sz_s=F_j5UHKl_;~Nny6H2^*PJf=&&$O@XvbRhf#j(Q}9R#Z`!oa)yOU~I+~Gmq9lx(>6%9RnJy0Q zCWIv5LFRc+I+!$U?D8|j?oz12qEA_)(gfu|Y=k}~LZJ+5d_FK8 z=T4I?Wa*GV_;HF7zicqsuIUedmacvD25eO*0?pRz^9#1Q*0NzVsiw|6!xxoZg^~3D zszRH;3w~qIQR6T)O4Z01)P6FC##c=3L`dK|>(e=+va+1+Zf-%{KdvvRt;4dQq+==Q z1hofPDMh&8GrbM$P0GRA2xR&F=Z{Gv@7lG&%dTmBnqDg!0qfHC-$(v&Ln!}dS=q}u zCZ?tVXeF~dhH;WqdDHcxIzd;TgJG~O1(?e<S&6MS1nIUH9LlR*LNItwwfl z5xt{6fsPOJ>PXpZE?+e5OY_cRsKVD{^db1GZdsMx|UTC+O z-nXK0jeP*&G}AyA9w)pR&20)Ev&u)Se9S*hs}VUh)A%OD#2qM~mwp?+w{U+?&k>FH z(Z)ygjO=_DF6c2AHooD^GwJ4UJUF9&u=}3lCFe7&{XzUwGAAxdmCw}wtaD7^*jX0m zs(I#5aObHIBN5We)Y#*vH;1 z=U)@C;kL_xfYQ|4?4zD|Fs&56^kg&@&26m@Xy?(~tnei>rTy{OJ81e29peaeq_xys zAG>!jV=1dg;McTQ^!g7x?RS>Wo@kVOrEO0Y{O~b%E1zqCismMnZ#|7R5jFq+U*oYa 
zbK!vI9UX&{qqdS|KBIA}%s#AC^$vpIouj@;2(x*Qi0Ui}>48J`sxZ7l?!?}07~6uiEH+#~6jqoAbwR7;MX6RTgkr93lGIR4o1_s5CeNTV)8!%EmSnf}TX z$-zVGLono#alIw;sr{~jx|+D6X`h=dDL<>5zU#|*&JdmzeQ&|670LkVA{Y#cs_ljA z|61Cr#U##%TQD6!k^HP>h_mg_xwnl^_YDOi~tWPFRb_d z7YtNwcz%=oT2DUFeHAj~mgGJXnnAA-+t*&BbMhyB+c%h$6=z1FR4OenhoXa)v=x22EjKUe}^EKxsgfszb@y z2Xsp$Ux*|uLJ%E@nP9Mqtuz90Viu<4&QA-} zNA%E94`R-bB1%9kLJJ?UB_{&#BauiX7>A)qnOKLzvu&zF9OK7&tU^3J()9t|r8697 z(gL`wT4`W6Yva`&3mAY}szh^mKBOH$_etGA!1hG=!i?ic0ih`plasxr1q1&?ZCttX zM>#Bp@vra3rXl^2P;ka((H77+86rnu#1?Ho51KELv?Le;mR1`4t9RZd{$XWBme1$Y zEn!kymyw>XTzBBm*@U273o`>Kp9oUbiIvr^PhYLAQ|l$8Onrm-0>ObgeEdHeEw`*V zCK75SLTp<#!BYl&HM<5V+gr3nlAQ2?>MO^stRSTa&Nz!ET$1x1m--}RmD!{&QEv$s4{LXq4mGafFH;lw)D}m56nlo4|zNn)fQZm|DDDS^4kj z6KUA{P|UP|{N+e*Ik;%EL2CC)h}xQQ@=#*(;A6;gI*c+p2JMet1~Q0MHnA{I1>sbv z(kDKmDgDFhvYO}LEf)239y`iv7K#3JVaaMK(H|Zy`eyGG`&s}0|L}j|CKN3!l`;5G VviIDY{p15Y@T;Xs&fXIj{{xm)utxv@ From 0cf38197a0f3f977fffdc071432631a93d0836c9 Mon Sep 17 00:00:00 2001 From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Date: Sat, 4 Apr 2026 13:00:05 -0400 Subject: [PATCH 046/204] [CI] Fix missing queue for Voxtral-TTS E2E test step (#2484) Signed-off-by: linyueqian --- .buildkite/test-merge.yml | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index b0b5a63961..15f668b386 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -390,6 +390,39 @@ steps: export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" ' + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - label: "CosyVoice3-TTS E2E Test" timeout_in_minutes: 20 From d92439c155dd315a662a6fb7d7efe94103d84065 Mon Sep 17 00:00:00 2001 From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Date: Sat, 4 Apr 2026 20:45:49 -0400 Subject: [PATCH 047/204] [CosyVoice3] Fix vLLM 0.19.0 compatibility issues (#2486) --- .../e2e/online_serving/test_cosyvoice3_tts.py | 1 + vllm_omni/entrypoints/utils.py | 28 ++++++++++++++++++- .../models/cosyvoice3/config.py | 2 ++ .../models/cosyvoice3/cosyvoice3.py | 2 +- .../models/cosyvoice3/cosyvoice3_code2wav.py | 6 ++-- .../stage_configs/cosyvoice3.yaml | 5 ++++ 6 files changed, 40 insertions(+), 4 deletions(-) diff --git a/tests/e2e/online_serving/test_cosyvoice3_tts.py b/tests/e2e/online_serving/test_cosyvoice3_tts.py index 976be805c2..1845d7818a 100644 --- a/tests/e2e/online_serving/test_cosyvoice3_tts.py +++ b/tests/e2e/online_serving/test_cosyvoice3_tts.py @@ -80,6 +80,7 @@ def test_voice_clone_zh_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100"}, num_cards=1) @pytest.mark.parametrize("omni_server", tts_server_params, indirect=True) 
+@pytest.mark.skip(reason="CosyVoice3 does not support async_chunk streaming yet") def test_voice_clone_zh_002(omni_server, openai_client) -> None: """ Test voice cloning TTS with Chinese text via OpenAI API. diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index e29e9eea1c..0e1000ec95 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -182,6 +182,28 @@ def _convert_dataclasses_to_dict(obj: Any) -> Any: return obj +def _try_resolve_omni_model_type(model: str) -> str | None: + """Try to resolve model_type for omni models with empty config.json. + + Checks if any registered omni stage config file name matches a substring + in the model name (e.g. 'cosyvoice3' in 'FunAudioLLM/Fun-CosyVoice3-0.5B-2512'). + When multiple configs match, the longest stem wins to avoid ambiguity + (e.g. 'bagel_single_stage' over 'bagel'). + """ + stage_configs_dir = PROJECT_ROOT / "vllm_omni" / "model_executor" / "stage_configs" + if not stage_configs_dir.exists(): + return None + model_lower = model.lower().replace("-", "").replace("_", "") + best_match: str | None = None + best_len = 0 + for config_file in sorted(stage_configs_dir.glob("*.yaml")): + candidate = config_file.stem.replace("-", "").replace("_", "") + if candidate in model_lower and len(candidate) > best_len: + best_match = config_file.stem + best_len = len(candidate) + return best_match + + def resolve_model_config_path(model: str) -> str: """Resolve the stage config file path from the model name. @@ -220,7 +242,11 @@ def resolve_model_config_path(model: str) -> str: if config_dict and "model_type" in config_dict: model_type = config_dict["model_type"] else: - raise ValueError(f"config.json found but missing 'model_type' for model: {model}") + # For models with empty config.json (e.g. CosyVoice3), + # try matching against registered omni stage configs. + model_type = _try_resolve_omni_model_type(model) + if model_type is None: + raise ValueError(f"config.json found but missing 'model_type' for model: {model}") except Exception as e: raise ValueError(f"Failed to read config.json for model: {model}. 
Error: {e}") from e else: diff --git a/vllm_omni/model_executor/models/cosyvoice3/config.py b/vllm_omni/model_executor/models/cosyvoice3/config.py index 0c9a289979..b4e44b7a82 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/config.py +++ b/vllm_omni/model_executor/models/cosyvoice3/config.py @@ -7,6 +7,8 @@ class CosyVoice3Config(PretrainedConfig): model_type = "cosyvoice3" def __init__(self, **kwargs): + # Set speech EOS so vLLM stops generation at the right token + kwargs.setdefault("eos_token_id", 6562) super().__init__(**kwargs) self.sample_rate = 24000 self.llm_input_size = 896 diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py index bc04aae33c..18a16ba551 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py +++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py @@ -432,7 +432,7 @@ def forward( return OmniOutput( text_hidden_states=None, - multimodal_outputs={"audio": tts_speech, "sr": 22050}, + multimodal_outputs={"audio": tts_speech, "sr": torch.tensor(22050)}, ) else: raise ValueError(f"Unsupported model_stage: {self.model_stage}") diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py index f5e0d04a8a..222d6d98ac 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py +++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py @@ -192,8 +192,10 @@ def forward( # Create mask mask = (~make_pad_mask(full_token_len)).unsqueeze(-1).to(embedding) - # Token embedding - token_emb = self.input_embedding(torch.clamp(full_token, min=0)) * mask + # Token embedding (clamp to valid codebook range; EOS/padding tokens may exceed vocab_size) + token_emb = ( + self.input_embedding(torch.clamp(full_token, min=0, max=self.input_embedding.num_embeddings - 1)) * mask + ) # Pre-lookahead processing h = self.pre_lookahead_layer(token_emb) diff --git a/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml b/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml index e215f51428..bfb847f5ea 100644 --- a/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml +++ b/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml @@ -22,6 +22,9 @@ stage_args: mm_processor_cache_gb: 0 skip_mm_profiling: true dtype: "float32" + default_sampling_params: + max_tokens: 2048 + stop_token_ids: [6562] # speech EOS token - stage_id: 1 runtime: @@ -39,6 +42,8 @@ stage_args: enable_prefix_caching: false skip_mm_profiling: true dtype: "float32" + default_sampling_params: + max_tokens: 2048 engine_input_source: [0] custom_process_input_func: vllm_omni.model_executor.stage_input_processors.cosyvoice3.text2flow final_output: true From 6fc38e0467b2d8967cda0749e3d83a5e0561b4ff Mon Sep 17 00:00:00 2001 From: indevn Date: Sun, 5 Apr 2026 14:11:33 +0800 Subject: [PATCH 048/204] [Model][Core] Enable async_chunk streaming pipeline for CosyVoice3 (#1703) Signed-off-by: linyueqian Signed-off-by: indevn Co-authored-by: linyueqian --- .../test_chunk_transfer_adapter.py | 99 ++++ .../e2e/offline_inference/test_cosyvoice3.py | 218 +++++++++ .../e2e/online_serving/test_cosyvoice3_tts.py | 25 +- .../cosyvoice3/test_cosyvoice3_components.py | 31 ++ .../test_cosyvoice3_model_helpers.py | 463 ++++++++++++++++++ .../test_cosyvoice3_stage_input_processors.py | 267 ++++++++++ vllm_omni/core/sched/omni_ar_scheduler.py | 4 - .../chunk_transfer_adapter.py | 30 +- .../entrypoints/openai/serving_speech.py | 34 ++ 
.../models/cosyvoice3/assets/mel_filters.npz | Bin 0 -> 4271 bytes .../models/cosyvoice3/code2wav_core/cfm.py | 9 +- .../models/cosyvoice3/config.py | 4 +- .../models/cosyvoice3/cosyvoice3.py | 417 ++++++++++++++-- .../models/cosyvoice3/cosyvoice3_code2wav.py | 200 +++++--- .../stage_configs/cosyvoice3.yaml | 5 +- .../stage_configs/cosyvoice3_async_chunk.yaml | 85 ++++ .../stage_input_processors/cosyvoice3.py | 241 ++++++++- vllm_omni/worker/gpu_ar_model_runner.py | 83 ++++ 18 files changed, 2089 insertions(+), 126 deletions(-) create mode 100644 tests/e2e/offline_inference/test_cosyvoice3.py create mode 100644 tests/model_executor/models/cosyvoice3/test_cosyvoice3_model_helpers.py create mode 100644 tests/model_executor/stage_input_processors/test_cosyvoice3_stage_input_processors.py create mode 100644 vllm_omni/model_executor/models/cosyvoice3/assets/mel_filters.npz create mode 100644 vllm_omni/model_executor/stage_configs/cosyvoice3_async_chunk.yaml diff --git a/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py b/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py index dddf49a05d..7a3caba11e 100644 --- a/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py +++ b/tests/distributed/omni_connectors/test_chunk_transfer_adapter.py @@ -133,6 +133,22 @@ def test_save_async(build_adapter): assert task["is_finished"] is False +def test_send_single_request_cleans_up_after_finished_payload(build_adapter, monkeypatch): + adapter, _ = build_adapter(stage_id=1) + request = _req("req-finished", RequestStatus.FINISHED_STOPPED, external_req_id="ext-finished") + + adapter.custom_process_next_stage_input_func = lambda **kwargs: {"x": [1], "finished": True} + cleanup_calls = [] + monkeypatch.setattr(adapter, "cleanup", lambda *a, **kw: cleanup_calls.append((a, kw))) + + adapter._send_single_request({"pooling_output": None, "request": request, "is_finished": True}) + + assert len(cleanup_calls) == 1 + args, _ = cleanup_calls[0] + assert args[0] == "req-finished" + assert args[1] == "ext-finished" + + def test_update_request_payload(build_adapter): adapter, _ = build_adapter() @@ -409,3 +425,86 @@ def test_generation_scheduler_calls_cleanup_on_finished(monkeypatch, mocker: Moc args, _ = cleanup_calls[0] assert args[0] == "req-s1" assert args[1] == "ext-s1" + + +def test_ar_scheduler_defers_cleanup_and_queues_save_on_finished(mocker: MockerFixture): + """OmniARScheduler should enqueue save; adapter cleanup is handled in save thread.""" + cleanup_calls = [] + save_calls = [] + + adapter_mock = mocker.MagicMock() + adapter_mock.cleanup = lambda *a, **kw: cleanup_calls.append((a, kw)) + adapter_mock.save_async = lambda *a, **kw: save_calls.append((a, kw)) + + from vllm_omni.core.sched.omni_ar_scheduler import OmniARScheduler + + scheduler = mocker.MagicMock() + scheduler.chunk_transfer_adapter = adapter_mock + scheduler.connector = None + scheduler.perf_metrics = None + scheduler.log_stats = False + scheduler.recompute_kv_load_failures = False + scheduler.structured_output_manager = mocker.MagicMock() + scheduler.structured_output_manager.should_advance.return_value = False + scheduler.finished_req_ids_dict = {} + scheduler.kv_cache_manager = mocker.MagicMock() + scheduler.kv_cache_manager.take_events.return_value = None + scheduler.kv_event_publisher = mocker.MagicMock() + scheduler.waiting_for_transfer_free = set() + scheduler.transfer_triggered_requests = set() + scheduler.active_kv_transfers = set() + + request = _HashableRequest( + request_id="req-ar", + 
external_req_id="ext-ar", + status=RequestStatus.RUNNING, + is_finished=lambda: False, + num_computed_tokens=1, + num_prompt_tokens=1, + prompt_token_ids=[1], + num_output_placeholders=0, + sampling_params=None, + pooling_params=None, + stop_reason=None, + client_index=0, + take_events=lambda: [], + trace_headers=None, + num_cached_tokens=0, + num_external_computed_tokens=0, + num_nans_in_logits=0, + get_finished_reason=lambda: "stop", + ) + scheduler.requests = {"req-ar": request} + + scheduler._update_request_with_output = mocker.MagicMock(return_value=([], True)) + scheduler._process_kv_transfer_trigger = mocker.MagicMock(return_value=False) + scheduler._handle_stopped_request = mocker.MagicMock(return_value=True) + scheduler._free_request = mocker.MagicMock(return_value=None) + scheduler._get_routed_experts = mocker.MagicMock(return_value=None) + scheduler.running = [request] + scheduler.waiting = mocker.MagicMock() + scheduler.waiting.remove_requests = mocker.MagicMock() + scheduler.make_spec_decoding_stats = mocker.MagicMock(return_value=None) + scheduler.make_stats = mocker.MagicMock(return_value=None) + + scheduler_output = SimpleNamespace( + num_scheduled_tokens={"req-ar": 1}, + scheduled_spec_decode_tokens={}, + num_invalid_spec_tokens=0, + ) + model_runner_output = SimpleNamespace( + sampled_token_ids=[[123]], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=None, + num_nans_in_logits=None, + kv_connector_output=None, + cudagraph_stats=None, + req_id_to_index={"req-ar": 0}, + kv_extracted_req_ids=None, + ) + + OmniARScheduler.update_from_output(scheduler, scheduler_output, model_runner_output) + + assert len(cleanup_calls) == 0 + assert len(save_calls) == 1 diff --git a/tests/e2e/offline_inference/test_cosyvoice3.py b/tests/e2e/offline_inference/test_cosyvoice3.py new file mode 100644 index 0000000000..8c88d972d5 --- /dev/null +++ b/tests/e2e/offline_inference/test_cosyvoice3.py @@ -0,0 +1,218 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Offline E2E smoke test for CosyVoice3 zero-shot reference inference. + +This test uses the official upstream zero-shot prompt text/audio pair and +verifies a stable reference recipe: +- config-derived top_p/top_k and token-length ratios +- model EOS token as the stop token +- a conservative repetition penalty to avoid degenerate loops +""" + +from __future__ import annotations + +import functools +import io +import os +import tempfile +from pathlib import Path +from urllib.request import urlopen + +import numpy as np +import pytest +import soundfile as sf +import yaml +from huggingface_hub import snapshot_download +from vllm.sampling_params import SamplingParams + +from tests.conftest import OmniRunner +from tests.utils import hardware_test +from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config +from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer + +MODEL = "FunAudioLLM/Fun-CosyVoice3-0.5B-2512" +MODEL_DIR_ENV = "VLLM_OMNI_COSYVOICE3_MODEL_DIR" + +REFERENCE_PROMPT_WAV_URL = "https://raw.githubusercontent.com/FunAudioLLM/CosyVoice/main/asset/zero_shot_prompt.wav" +REFERENCE_PROMPT_TEXT = "You are a helpful assistant.<|endofprompt|>希望你以后能够做的比我还好呦。" +REFERENCE_SYNTH_TEXT = ( + "CosyVoice is undergoing a comprehensive upgrade, providing more accurate, " + "stable, faster, and better voice generation capabilities." 
+) +REFERENCE_STAGE0_TEMPERATURE = 1.0 +REFERENCE_STAGE0_REPETITION_PENALTY = 2.0 + + +def _stage_config(name: str) -> str: + return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name) + + +STAGE_CONFIGS = [ + _stage_config("cosyvoice3.yaml"), + _stage_config("cosyvoice3_async_chunk.yaml"), +] + + +@functools.lru_cache(maxsize=1) +def _load_reference_prompt_wav() -> tuple[np.ndarray, int]: + with urlopen(REFERENCE_PROMPT_WAV_URL, timeout=30) as resp: + data = resp.read() + audio, sr = sf.read(io.BytesIO(data), dtype="float32", always_2d=False) + if isinstance(audio, np.ndarray) and audio.ndim > 1: + audio = np.mean(audio, axis=-1) + return np.asarray(audio, dtype=np.float32), int(sr) + + +@functools.lru_cache(maxsize=1) +def _resolve_model_dir() -> Path: + override = os.environ.get(MODEL_DIR_ENV) + if override: + return Path(override).expanduser().resolve() + return Path(snapshot_download(MODEL, allow_patterns=["*"])) + + +def _reference_zero_shot_stage0_sampling(*, text: str) -> SamplingParams: + config = CosyVoice3Config() + sampling_cfg = config.llm.get("sampling", {}) + eos_token_id = int(config.llm["eos_token_id"]) + model_dir = _resolve_model_dir() + tokenizer = get_qwen_tokenizer( + token_path=str(model_dir / config.qwen_pretrain_path), + skip_special_tokens=config.skip_special_tokens, + version=config.version, + ) + text_len = max(1, len(tokenizer.encode(text, allowed_special=config.allowed_special))) + return SamplingParams( + temperature=REFERENCE_STAGE0_TEMPERATURE, + top_p=float(sampling_cfg.get("top_p", 0.8)), + top_k=int(sampling_cfg.get("top_k", 25)), + repetition_penalty=REFERENCE_STAGE0_REPETITION_PENALTY, + stop_token_ids=[eos_token_id], + min_tokens=int(text_len * config.min_token_text_ratio), + max_tokens=int(text_len * config.max_token_text_ratio), + ) + + +def _concat_audio(audio_val) -> np.ndarray: + import torch + + if isinstance(audio_val, list): + tensors = [] + for t in audio_val: + if t is None: + continue + if hasattr(t, "detach"): + t = t.detach() + if hasattr(t, "cpu"): + t = t.cpu() + if hasattr(t, "float"): + t = t.float() + if isinstance(t, torch.Tensor): + tensors.append(t.reshape(-1)) + if not tensors: + return np.zeros((0,), dtype=np.float32) + return torch.cat(tensors, dim=-1).numpy().astype(np.float32, copy=False) + + if hasattr(audio_val, "detach"): + audio_val = audio_val.detach() + if hasattr(audio_val, "cpu"): + audio_val = audio_val.cpu() + if hasattr(audio_val, "float"): + audio_val = audio_val.float() + if hasattr(audio_val, "numpy"): + audio_val = audio_val.numpy() + audio_np = np.asarray(audio_val, dtype=np.float32) + return audio_np.reshape(-1) + + +def _get_stage_engine_outputs(omni_runner: OmniRunner, stage_id: int): + stage_list = getattr(omni_runner.omni, "stage_list", None) + if stage_list is not None: + return getattr(stage_list[stage_id], "engine_outputs", None) or [] + + stage_clients = getattr(getattr(omni_runner.omni, "engine", None), "stage_clients", None) + if stage_clients is not None: + return getattr(stage_clients[stage_id], "engine_outputs", None) or [] + + raise AttributeError("Unable to locate stage outputs on Omni runner") + + +def _patched_stage_config(base_stage_config: str, model_dir: Path, tmp_dir: Path) -> str: + cfg = yaml.safe_load(Path(base_stage_config).read_text(encoding="utf-8")) + tokenizer_path = str(model_dir / "CosyVoice-BlankEN") + for stage in cfg.get("stage_args", []): + engine_args = stage.setdefault("engine_args", {}) + engine_args["tokenizer"] = 
tokenizer_path + engine_args["enforce_eager"] = True + engine_args["hf_overrides"] = {"architectures": ["CosyVoice3Model"]} + out_path = tmp_dir / Path(base_stage_config).name + out_path.write_text(yaml.safe_dump(cfg, sort_keys=False), encoding="utf-8") + return str(out_path) + + +def _build_reference_inputs(prompt_audio: tuple[np.ndarray, int]) -> list[dict[str, object]]: + return [ + { + "prompt": REFERENCE_SYNTH_TEXT, + "multi_modal_data": {"audio": prompt_audio}, + "modalities": ["audio"], + "mm_processor_kwargs": {"prompt_text": REFERENCE_PROMPT_TEXT}, + } + ] + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "L4"}, num_cards=1) +@pytest.mark.parametrize("base_stage_config", STAGE_CONFIGS) +def test_cosyvoice3_offline_reference_zero_shot(base_stage_config: str) -> None: + """CosyVoice3 zero-shot reference inference should stop cleanly and produce sane audio.""" + prompt_audio, prompt_sr = _load_reference_prompt_wav() + model_dir = _resolve_model_dir() + expected_stop_token = int(CosyVoice3Config().llm["eos_token_id"]) + + with tempfile.TemporaryDirectory(prefix="cv3-e2e-") as tmp: + stage_config = _patched_stage_config(base_stage_config, model_dir, Path(tmp)) + with OmniRunner( + str(model_dir), seed=42, stage_configs_path=stage_config, stage_init_timeout=300 + ) as omni_runner: + sampling_params_list = omni_runner.get_default_sampling_params_list() + sampling_params_list[0] = _reference_zero_shot_stage0_sampling(text=REFERENCE_SYNTH_TEXT) + + outputs = omni_runner.omni.generate( + _build_reference_inputs((prompt_audio, prompt_sr)), sampling_params_list + ) + + assert outputs, "No outputs returned" + audio_mm = outputs[0].multimodal_output + assert "audio" in audio_mm, "No audio output found" + + audio = _concat_audio(audio_mm["audio"]) + assert audio.size > 0, "Generated audio is empty" + + sr_val = audio_mm.get("sr", 24000) + if isinstance(sr_val, list) and sr_val: + sr_val = sr_val[-1] + if hasattr(sr_val, "item"): + sr_val = sr_val.item() + sr = int(sr_val) + assert sr == 24000, f"Unexpected sample_rate={sr}" + + duration_s = audio.size / sr + assert 2.8 <= duration_s <= 8.8, f"Unexpected duration={duration_s:.3f}s (samples={audio.size}, sr={sr})" + + stage0_outputs = _get_stage_engine_outputs(omni_runner, 0) + if stage0_outputs: + completion = stage0_outputs[0].outputs[0] + finish_reason = getattr(completion, "finish_reason", None) + stop_reason = getattr(completion, "stop_reason", None) + num_tokens = len(getattr(completion, "token_ids", []) or []) + + assert finish_reason == "stop", f"Stage-0 finish_reason={finish_reason}, expected 'stop'" + assert int(stop_reason) == expected_stop_token, ( + f"Stage-0 stop_reason={stop_reason}, expected {expected_stop_token}" + ) + assert 80 <= num_tokens <= 220, f"Stage-0 num_tokens={num_tokens}, expected sane stop-bound range" + else: + assert "async_chunk" in Path(base_stage_config).name, "Stage-0 produced no engine outputs" diff --git a/tests/e2e/online_serving/test_cosyvoice3_tts.py b/tests/e2e/online_serving/test_cosyvoice3_tts.py index 1845d7818a..276b1782f5 100644 --- a/tests/e2e/online_serving/test_cosyvoice3_tts.py +++ b/tests/e2e/online_serving/test_cosyvoice3_tts.py @@ -50,8 +50,18 @@ def get_prompt(prompt_type="zh"): ) ] +tts_async_chunk_server_params = [ + pytest.param( + OmniServerParams( + model=MODEL, + stage_config_path=get_stage_config("cosyvoice3_async_chunk.yaml"), + server_args=["--trust-remote-code", "--disable-log-stats"], + ), + id="cosyvoice3_async_chunk", + ) +] + 
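+# Rough sketch of the client call these async_chunk params exercise (payload
+# field names and base_url are assumptions based on the OpenAI-compatible
+# /v1/audio/speech route, not something this file asserts):
+#
+#     payload = {"model": MODEL, "input": "你好", "stream": True}
+#     with requests.post(f"{base_url}/v1/audio/speech", json=payload, stream=True) as resp:
+#         audio_bytes = b"".join(resp.iter_content(chunk_size=4096))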
-@pytest.mark.advanced_model @pytest.mark.core_model @pytest.mark.omni @hardware_test(res={"cuda": "H100"}, num_cards=1) @@ -76,17 +86,16 @@ def test_voice_clone_zh_001(omni_server, openai_client) -> None: openai_client.send_audio_speech_request(request_config) -@pytest.mark.advanced_model +@pytest.mark.core_model @pytest.mark.omni @hardware_test(res={"cuda": "H100"}, num_cards=1) -@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True) -@pytest.mark.skip(reason="CosyVoice3 does not support async_chunk streaming yet") +@pytest.mark.parametrize("omni_server", tts_async_chunk_server_params, indirect=True) def test_voice_clone_zh_002(omni_server, openai_client) -> None: """ - Test voice cloning TTS with Chinese text via OpenAI API. - Deploy Setting: default yaml + Test voice cloning TTS with Chinese text via async_chunk streaming. + Deploy Setting: cosyvoice3_async_chunk.yaml Input Modal: text + ref_audio + ref_text - Output Modal: audio + Output Modal: audio (streamed) Input Setting: stream=True Datasets: single request """ @@ -101,7 +110,7 @@ def test_voice_clone_zh_002(omni_server, openai_client) -> None: openai_client.send_audio_speech_request(request_config) -@pytest.mark.advanced_model +@pytest.mark.core_model @pytest.mark.omni @hardware_test(res={"cuda": "H100"}, num_cards=1) @pytest.mark.parametrize("omni_server", tts_server_params, indirect=True) diff --git a/tests/model_executor/models/cosyvoice3/test_cosyvoice3_components.py b/tests/model_executor/models/cosyvoice3/test_cosyvoice3_components.py index 3b1471365d..0f5202c3b9 100644 --- a/tests/model_executor/models/cosyvoice3/test_cosyvoice3_components.py +++ b/tests/model_executor/models/cosyvoice3/test_cosyvoice3_components.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Unit tests for CosyVoice3 components.""" +from types import SimpleNamespace + import pytest import torch import torch.nn as nn @@ -247,3 +249,32 @@ def test_float32_uses_sdpa(self): assert out.shape == (batch, seq_len, heads, dim) assert out.dtype == torch.float32 + + +def test_code2wav_forward_finalizes_hift_tail(): + from vllm_omni.model_executor.models.cosyvoice3.cosyvoice3_code2wav import CosyVoice3Code2Wav + + class DummyHiFT(nn.Module): + def __init__(self): + super().__init__() + self.m_source = SimpleNamespace(l_linear=SimpleNamespace(weight=torch.ones(1, dtype=torch.float32))) + self.finalize_calls: list[bool] = [] + + def inference(self, speech_feat, finalize=True): + self.finalize_calls.append(bool(finalize)) + return torch.zeros((speech_feat.shape[0], 1, speech_feat.shape[-1]), dtype=speech_feat.dtype), None + + model = object.__new__(CosyVoice3Code2Wav) + nn.Module.__init__(model) + model.hift = DummyHiFT() + model._forward_mel = lambda **_: torch.ones((1, 80, 8), dtype=torch.float32) + + out = model.forward( + token=torch.tensor([[1, 2, 3]], dtype=torch.int32), + prompt_token=torch.tensor([[4, 5]], dtype=torch.int32), + prompt_feat=torch.ones((1, 4, 80), dtype=torch.float32), + embedding=torch.ones((1, 192), dtype=torch.float32), + ) + + assert out.shape == (1, 1, 8) + assert model.hift.finalize_calls == [True] diff --git a/tests/model_executor/models/cosyvoice3/test_cosyvoice3_model_helpers.py b/tests/model_executor/models/cosyvoice3/test_cosyvoice3_model_helpers.py new file mode 100644 index 0000000000..9a78c54de6 --- /dev/null +++ b/tests/model_executor/models/cosyvoice3/test_cosyvoice3_model_helpers.py @@ -0,0 +1,463 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 
Copyright contributors to the vLLM project + +from threading import Lock +from types import SimpleNamespace + +import pytest +import torch +import torch.nn as nn +from vllm.v1.outputs import SamplerOutput +from vllm.v1.sample.logits_processor.state import LogitsProcessors +from vllm.v1.sample.metadata import SamplingMetadata + +from vllm_omni.model_executor.models.cosyvoice3.cosyvoice3 import CosyVoice3Model +from vllm_omni.worker.gpu_ar_model_runner import GPUARModelRunner + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +class _DummyCode2Wav: + def __init__( + self, + vocab_size: int, + num_samples: int = 32, + outputs: list[tuple[torch.Tensor, dict[str, object] | None]] | None = None, + ): + self.input_embedding = SimpleNamespace(num_embeddings=vocab_size) + self.num_samples = num_samples + self.outputs = list(outputs or []) + self.forward_calls: list[dict[str, object]] = [] + self.forward_streaming_calls: list[dict[str, object]] = [] + + def forward(self, **kwargs): + self.forward_calls.append(kwargs) + token = kwargs["token"] + num_samples = int(token.shape[-1]) + return torch.linspace(-1.0, 1.0, max(num_samples, 1), dtype=torch.float32).reshape(1, 1, -1) + + def forward_streaming(self, **kwargs): + self.forward_streaming_calls.append(kwargs) + if self.outputs: + return self.outputs.pop(0) + + token = kwargs["token"] + num_samples = int(token.shape[-1]) + audio = torch.linspace(-1.0, 1.0, max(num_samples, 1), dtype=torch.float32).reshape(1, 1, -1) + new_state = None + if not kwargs.get("finalize", False): + new_state = { + "mel": torch.ones((1, 80, max(num_samples, 1)), dtype=torch.float32), + "speech_offset": audio.shape[-1], + } + return audio, new_state + + +def _make_code2wav_model( + *, + with_stride_cfg: bool = False, + num_samples: int = 32, + outputs: list[tuple[torch.Tensor, dict[str, object] | None]] | None = None, +) -> CosyVoice3Model: + model = object.__new__(CosyVoice3Model) + nn.Module.__init__(model) + model.model_stage = "cosyvoice3_code2wav" + hift_cfg = {} if not with_stride_cfg else {"upsample_rates": [8, 5, 3], "istft_params": {"hop_len": 4}} + model.config = SimpleNamespace( + sample_rate=24000, + hift=hift_cfg, + token_frame_rate=25 if with_stride_cfg else 0, + token_mel_ratio=2 if with_stride_cfg else 0, + ) + model.code2wav = _DummyCode2Wav(vocab_size=4, num_samples=num_samples, outputs=outputs) + model.source_cache_len = 4 + model.speech_window = torch.hamming_window(8, periodic=False) + model._stream_audio_cache_by_req = {} + model._stream_audio_cache_lock = Lock() + model._stream_vocoder_cache_by_req = {} + return model + + +def _make_talker_model() -> CosyVoice3Model: + model = object.__new__(CosyVoice3Model) + nn.Module.__init__(model) + model.model_stage = "cosyvoice3_talker" + model.config = SimpleNamespace( + llm={ + "speech_token_size": 6561, + "eos_token_id": 6562, + "sampling": { + "top_p": 0.8, + "top_k": 25, + "win_size": 10, + "tau_r": 0.1, + }, + }, + vocab_size=151923, + ) + return model + + +def _make_sampling_metadata( + *, + output_token_ids: list[list[int]], + repetition_penalty: float = 2.0, +) -> SamplingMetadata: + return SamplingMetadata( + temperature=torch.tensor([1.0], dtype=torch.float32), + all_greedy=False, + all_random=True, + top_p=torch.tensor([0.8], dtype=torch.float32), + top_k=torch.tensor([25], dtype=torch.int32), + generators={}, + max_num_logprobs=None, + no_penalties=False, + prompt_token_ids=None, + frequency_penalties=torch.zeros(1, dtype=torch.float32), + presence_penalties=torch.zeros(1, 
dtype=torch.float32), + repetition_penalties=torch.tensor([repetition_penalty], dtype=torch.float32), + output_token_ids=output_token_ids, + allowed_token_ids_mask=None, + bad_words_token_ids={}, + logitsprocs=LogitsProcessors(), + ) + + +def test_split_request_ids_uses_seq_token_counts(): + ids = torch.tensor([10, 11, 12, 13, 14], dtype=torch.long) + chunks = CosyVoice3Model._split_request_ids(ids, [2, 2, 2]) + assert [c.tolist() for c in chunks] == [[10, 11], [12, 13], [14]] + + +def test_split_request_ids_honors_single_request_seq_token_counts(): + ids = torch.tensor([10, 11, 12, 13, 14], dtype=torch.long) + chunks = CosyVoice3Model._split_request_ids(ids, [3]) + assert [c.tolist() for c in chunks] == [[10, 11, 12]] + + +def test_sanitize_codec_tokens_filters_out_of_range(): + model = _make_code2wav_model() + raw = torch.tensor([-1, 0, 3, 4, 99], dtype=torch.long) + clean = model._sanitize_codec_tokens(raw) + assert clean.tolist() == [0, 3] + + +def test_forward_prefers_token_offset_when_present(): + model = _make_code2wav_model() + + runtime_info = [ + { + "speech_token": torch.tensor([[1, 2, 3]], dtype=torch.long), + "speech_feat": torch.tensor([[[0.1, 0.2], [0.3, 0.4]]], dtype=torch.float32), + "embedding": torch.tensor([[0.5, 0.6]], dtype=torch.float32), + "token_offset": 2, + "left_context_size": 1, + } + ] + + out = model.forward( + input_ids=torch.tensor([0, 1, 2], dtype=torch.long), + positions=torch.tensor([0, 1, 2], dtype=torch.long), + model_intermediate_buffer=runtime_info, + seq_token_counts=[3], + ) + + assert len(out.multimodal_outputs["audio"]) == 1 + assert out.multimodal_outputs["audio"][0].numel() > 0 + assert len(model.code2wav.forward_streaming_calls) == 1 + call = model.code2wav.forward_streaming_calls[0] + assert call["token"].shape == (1, 3) + assert call["token_offset_tokens"] == 2 + assert call["finalize"] is False + + +def test_forward_falls_back_to_left_context_size_for_backward_compat(): + model = _make_code2wav_model() + + runtime_info = [ + { + "speech_token": torch.tensor([[1, 2, 3]], dtype=torch.long), + "speech_feat": torch.tensor([[[0.1, 0.2], [0.3, 0.4]]], dtype=torch.float32), + "embedding": torch.tensor([[0.5, 0.6]], dtype=torch.float32), + "left_context_size": 2, + } + ] + + model.forward( + input_ids=torch.tensor([0, 1, 2], dtype=torch.long), + positions=torch.tensor([0, 1, 2], dtype=torch.long), + model_intermediate_buffer=runtime_info, + seq_token_counts=[3], + ) + + assert model.code2wav.forward_streaming_calls[0]["token_offset_tokens"] == 2 + + +def test_forward_ignores_single_request_padded_tail_tokens(): + model = _make_code2wav_model(with_stride_cfg=True) + runtime_info = [ + { + "speech_token": torch.tensor([[1, 2, 3]], dtype=torch.long), + "speech_feat": torch.tensor([[[0.1, 0.2], [0.3, 0.4]]], dtype=torch.float32), + "embedding": torch.tensor([[0.5, 0.6]], dtype=torch.float32), + "token_offset": 0, + } + ] + + out = model.forward( + input_ids=torch.tensor([0, 1, 2, 3, 3], dtype=torch.long), + positions=torch.tensor([0, 1, 2, 3, 4], dtype=torch.long), + model_intermediate_buffer=runtime_info, + seq_token_counts=[3], + ) + + # The padded tail must not contribute to code2wav length. 
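+    # Concretely: with seq_token_counts=[3] and five input_ids, only ids[:3]
+    # ([0, 1, 2]) reach the vocoder, so the dummy code2wav emits 3 samples and
+    # the padded ids[3:] never show up in forward_streaming.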
+ assert out.multimodal_outputs["audio"][0].numel() == 3 + assert model.code2wav.forward_streaming_calls[0]["token"].tolist() == [[0, 1, 2]] + + +def test_forward_uses_non_stream_decode_without_chunk_metadata(): + model = _make_code2wav_model() + + runtime_info = [ + { + "speech_token": torch.tensor([[1, 2, 3]], dtype=torch.long), + "speech_feat": torch.tensor([[[0.1, 0.2], [0.3, 0.4]]], dtype=torch.float32), + "embedding": torch.tensor([[0.5, 0.6]], dtype=torch.float32), + "prefix_ids": [101, 102], + "generated_len": 3, + } + ] + + out = model.forward( + input_ids=torch.tensor([0, 1, 2], dtype=torch.long), + positions=torch.tensor([0, 1, 2], dtype=torch.long), + model_intermediate_buffer=runtime_info, + seq_token_counts=[3], + ) + + assert out.multimodal_outputs["audio"][0].numel() == 3 + assert len(model.code2wav.forward_calls) == 1 + assert len(model.code2wav.forward_streaming_calls) == 0 + call = model.code2wav.forward_calls[0] + assert call["token"].tolist() == [[0, 1, 2]] + + +def test_forward_reuses_streaming_cache_state_between_chunks(): + model = _make_code2wav_model( + outputs=[ + ( + torch.arange(4, dtype=torch.float32).reshape(1, 1, -1), + {"mel": torch.ones((1, 80, 3), dtype=torch.float32), "speech_offset": 4}, + ), + ( + torch.full((1, 1, 2), 9.0, dtype=torch.float32), + {"mel": torch.ones((1, 80, 5), dtype=torch.float32), "speech_offset": 6}, + ), + ] + ) + runtime_info = [ + { + "req_id": ["rid-stream"], + "speech_token": torch.tensor([[1, 2, 3]], dtype=torch.long), + "speech_feat": torch.tensor([[[0.1, 0.2], [0.3, 0.4]]], dtype=torch.float32), + "embedding": torch.tensor([[0.5, 0.6]], dtype=torch.float32), + "token_offset": 0, + "stream_finished": torch.tensor(False), + } + ] + + out1 = model.forward( + input_ids=torch.tensor([0, 1, 2], dtype=torch.long), + positions=torch.tensor([0, 1, 2], dtype=torch.long), + model_intermediate_buffer=runtime_info, + seq_token_counts=[3], + ) + assert out1.multimodal_outputs["audio"][0].tolist() == [0.0, 1.0, 2.0, 3.0] + assert model.code2wav.forward_streaming_calls[0]["cache_state"] is None + + out2 = model.forward( + input_ids=torch.tensor([0, 1, 2], dtype=torch.long), + positions=torch.tensor([0, 1, 2], dtype=torch.long), + model_intermediate_buffer=runtime_info, + seq_token_counts=[3], + ) + assert out2.multimodal_outputs["audio"][0].tolist() == [9.0, 9.0] + cache_state = model.code2wav.forward_streaming_calls[1]["cache_state"] + assert cache_state is not None + assert cache_state["speech_offset"] == 4 + assert "rid-stream" in model._stream_vocoder_cache_by_req + + +def test_forward_clears_streaming_cache_on_terminal_chunk(): + model = _make_code2wav_model( + outputs=[ + ( + torch.arange(4, dtype=torch.float32).reshape(1, 1, -1), + {"mel": torch.ones((1, 80, 3), dtype=torch.float32), "speech_offset": 4}, + ), + ( + torch.full((1, 1, 1), 7.0, dtype=torch.float32), + None, + ), + ] + ) + runtime_info = [ + { + "req_id": ["rid-stream"], + "speech_token": torch.tensor([[1, 2, 3]], dtype=torch.long), + "speech_feat": torch.tensor([[[0.1, 0.2], [0.3, 0.4]]], dtype=torch.float32), + "embedding": torch.tensor([[0.5, 0.6]], dtype=torch.float32), + "token_offset": 0, + "stream_finished": torch.tensor(False), + } + ] + + model.forward( + input_ids=torch.tensor([0, 1, 2], dtype=torch.long), + positions=torch.tensor([0, 1, 2], dtype=torch.long), + model_intermediate_buffer=runtime_info, + seq_token_counts=[3], + ) + assert "rid-stream" in model._stream_vocoder_cache_by_req + + runtime_info[0]["stream_finished"] = torch.tensor(True) + out = 
model.forward( + input_ids=torch.tensor([0, 1, 2], dtype=torch.long), + positions=torch.tensor([0, 1, 2], dtype=torch.long), + model_intermediate_buffer=runtime_info, + seq_token_counts=[3], + ) + assert out.multimodal_outputs["audio"][0].tolist() == [7.0] + assert "rid-stream" not in model._stream_vocoder_cache_by_req + + +def test_sample_uses_ras_rejection_for_recent_repetition(): + model = _make_talker_model() + metadata = _make_sampling_metadata(output_token_ids=[[1] * 10]) + logits = torch.tensor([[-1e9, 10.0, 0.0]], dtype=torch.float32) + + out = model.sample(logits, metadata) + + assert out is not None + assert out.sampled_token_ids.tolist() == [[2]] + + +def test_sample_tolerates_padded_rows_without_history(): + model = _make_talker_model() + metadata = _make_sampling_metadata(output_token_ids=[[1] * 10]) + logits = torch.tensor( + [ + [-1e9, 10.0, 0.0], + [-1e9, 0.0, 10.0], + ], + dtype=torch.float32, + ) + + out = model.sample(logits, metadata) + + assert out is not None + assert out.sampled_token_ids.shape == (2, 1) + + +def test_gpu_ar_model_runner_prefers_model_sampler_when_opted_in(): + metadata = _make_sampling_metadata(output_token_ids=[[1, 2, 3]]) + expected = SamplerOutput( + sampled_token_ids=torch.tensor([[7]], dtype=torch.int32), + logprobs_tensors=None, + ) + calls: list[torch.Tensor] = [] + + class _DummyInputBatch: + def __init__(self): + self.sampling_metadata = metadata + self.updated = False + + def update_async_output_token_ids(self): + self.updated = True + + runner = object.__new__(GPUARModelRunner) + runner.input_batch = _DummyInputBatch() + runner.model = SimpleNamespace( + prefer_model_sampler=True, + sample=lambda logits, sampling_metadata: calls.append(logits.clone()) or expected, + ) + runner.sampler = lambda **_: (_ for _ in ()).throw(AssertionError("fallback sampler should not be used")) + + out = runner._sample(torch.tensor([[0.1, 0.2]], dtype=torch.float32), spec_decode_metadata=None) + + assert out is expected + assert runner.input_batch.updated is False + assert len(calls) == 1 + + +def test_gpu_ar_model_runner_supplies_req_output_history_to_model_sampler(): + metadata = _make_sampling_metadata(output_token_ids=[]) + seen_histories: list[list[list[int]]] = [] + + class _DummyInputBatch: + def __init__(self): + self.sampling_metadata = metadata + self.req_output_token_ids = [[1, 2, 3]] + self.req_ids = ["rid-1"] + self.sampled_token_ids_cpu = None + self.async_copy_ready_event = None + self.prev_req_id_to_index = None + + def update_async_output_token_ids(self): + raise AssertionError("fallback async repair should not run for model sampler path") + + runner = object.__new__(GPUARModelRunner) + runner.input_batch = _DummyInputBatch() + runner.model = SimpleNamespace( + prefer_model_sampler=True, + sample=lambda logits, sampling_metadata: seen_histories.append( + [list(x) for x in sampling_metadata.output_token_ids] + ) + or SamplerOutput(sampled_token_ids=torch.tensor([[7]], dtype=torch.int32), logprobs_tensors=None), + ) + runner.sampler = lambda **_: (_ for _ in ()).throw(AssertionError("fallback sampler should not be used")) + + runner._sample(torch.tensor([[0.1, 0.2]], dtype=torch.float32), spec_decode_metadata=None) + + assert seen_histories == [[[1, 2, 3]]] + + +def test_gpu_ar_model_runner_repairs_async_placeholders_for_model_sampler(): + metadata = _make_sampling_metadata(output_token_ids=[]) + seen_histories: list[list[list[int]]] = [] + + class _ReadyEvent: + def __init__(self): + self.synced = False + + def synchronize(self): + self.synced 
= True + + class _DummyInputBatch: + def __init__(self): + self.sampling_metadata = metadata + self.req_output_token_ids = [[11, -1]] + self.req_ids = ["rid-1"] + self.sampled_token_ids_cpu = torch.tensor([[29]], dtype=torch.int32) + self.async_copy_ready_event = _ReadyEvent() + self.prev_req_id_to_index = {"rid-1": 0} + + def update_async_output_token_ids(self): + raise AssertionError("fallback async repair should not run for model sampler path") + + runner = object.__new__(GPUARModelRunner) + runner.input_batch = _DummyInputBatch() + runner.model = SimpleNamespace( + prefer_model_sampler=True, + sample=lambda logits, sampling_metadata: seen_histories.append( + [list(x) for x in sampling_metadata.output_token_ids] + ) + or SamplerOutput(sampled_token_ids=torch.tensor([[7]], dtype=torch.int32), logprobs_tensors=None), + ) + runner.sampler = lambda **_: (_ for _ in ()).throw(AssertionError("fallback sampler should not be used")) + + runner._sample(torch.tensor([[0.1, 0.2]], dtype=torch.float32), spec_decode_metadata=None) + + assert runner.input_batch.async_copy_ready_event.synced is True + assert seen_histories == [[[11, 29]]] diff --git a/tests/model_executor/stage_input_processors/test_cosyvoice3_stage_input_processors.py b/tests/model_executor/stage_input_processors/test_cosyvoice3_stage_input_processors.py new file mode 100644 index 0000000000..e26de3022f --- /dev/null +++ b/tests/model_executor/stage_input_processors/test_cosyvoice3_stage_input_processors.py @@ -0,0 +1,267 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections import defaultdict +from types import SimpleNamespace + +import torch + +from vllm_omni.model_executor.stage_input_processors.cosyvoice3 import talker2code2wav_async_chunk, text2flow + + +def _source_output(request_id: str, prompt_ids: list[int], out_ids: list[int], mm: dict): + return SimpleNamespace( + request_id=request_id, + prompt_token_ids=prompt_ids, + outputs=[SimpleNamespace(token_ids=out_ids, multimodal_output=mm)], + ) + + +def _transfer_manager( + *, + chunk_frames: int = 2, + pre_lookahead_frames: int = 0, + stream_scale_factor: int = 1, + max_chunk_frames: int | None = None, +): + if max_chunk_frames is None: + max_chunk_frames = chunk_frames + return SimpleNamespace( + code_prompt_token_ids=defaultdict(list), + request_payload={}, + connector=SimpleNamespace( + config={ + "extra": { + "codec_chunk_frames": chunk_frames, + "codec_pre_lookahead_frames": pre_lookahead_frames, + "codec_max_chunk_frames": max_chunk_frames, + "codec_stream_scale_factor": stream_scale_factor, + "codec_vocab_size": 6561, + } + } + ), + ) + + +def test_text2flow_supports_batched_source_outputs(): + stage_list = [ + SimpleNamespace( + engine_outputs=[ + _source_output("req-0", [10, 11], [1, 2, 3], {"speech_token": torch.tensor([[1, 2]])}), + _source_output("req-1", [20, 21], [4, 5], {"speech_token": torch.tensor([[3, 4]])}), + ] + ) + ] + + outputs = text2flow(stage_list=stage_list, engine_input_source=[0], prompt=None) + + assert len(outputs) == 2 + assert outputs[0]["prompt_token_ids"] == [1, 2, 3] + assert outputs[1]["prompt_token_ids"] == [4, 5] + assert outputs[0]["additional_information"]["prefix_ids"] == [10, 11] + assert outputs[1]["additional_information"]["prefix_ids"] == [20, 21] + + +def test_talker2code2wav_async_chunk_final_payload_uses_absolute_token_offset(): + transfer_manager = _transfer_manager() + request = SimpleNamespace( + external_req_id="rid-0", + output_token_ids=[1, 2, 
6562, 3], + additional_information={ + "speech_token": [torch.tensor([[11, 12, 13]])], + "speech_feat": [torch.tensor([[[0.1, 0.2], [0.3, 0.4]]])], + "embedding": [torch.tensor([[0.5, 0.6]])], + }, + is_finished=lambda: True, + ) + + payload = talker2code2wav_async_chunk( + transfer_manager=transfer_manager, + pooling_output=None, + request=request, + is_finished=True, + ) + + assert payload is not None + assert payload["finished"].item() is True + assert payload["code_predictor_codes"] == [1, 2, 3] + assert payload["token_offset"] == 0 + assert payload["left_context_size"] == 0 + assert payload["req_id"] == ["rid-0"] + assert payload["stream_finished"].item() is True + assert "speech_token" in payload + assert "speech_feat" in payload + assert "embedding" in payload + + +def test_talker2code2wav_async_chunk_emits_eof_when_finished_without_valid_codes(): + transfer_manager = _transfer_manager(chunk_frames=25) + request = SimpleNamespace( + external_req_id="rid-eof", + output_token_ids=[6561, 6562], # all filtered out + additional_information={}, + is_finished=lambda: True, + ) + + payload = talker2code2wav_async_chunk( + transfer_manager=transfer_manager, + pooling_output=None, + request=request, + is_finished=True, + ) + + assert payload is not None + assert payload["code_predictor_codes"] == [] + assert payload["finished"].item() is True + + +def test_talker2code2wav_async_chunk_does_not_reemit_without_new_tokens(): + transfer_manager = _transfer_manager() + request = SimpleNamespace( + external_req_id="rid-stable", + output_token_ids=[1, 2], + additional_information={}, + is_finished=lambda: False, + ) + + payload1 = talker2code2wav_async_chunk( + transfer_manager=transfer_manager, + pooling_output=None, + request=request, + is_finished=False, + ) + payload2 = talker2code2wav_async_chunk( + transfer_manager=transfer_manager, + pooling_output=None, + request=request, + is_finished=False, + ) + + assert payload1 is not None + assert payload1["code_predictor_codes"] == [1, 2] + assert payload1["token_offset"] == 0 + assert payload2 is None + + +def test_talker2code2wav_async_chunk_waits_for_prelookahead_and_emits_cumulative_prefix(): + transfer_manager = _transfer_manager(pre_lookahead_frames=1) + request = SimpleNamespace( + external_req_id="rid-pre", + output_token_ids=[1, 2], + additional_information={}, + is_finished=lambda: False, + ) + + payload_pending = talker2code2wav_async_chunk( + transfer_manager=transfer_manager, + pooling_output=None, + request=request, + is_finished=False, + ) + request.output_token_ids = [1, 2, 3] + payload_ready = talker2code2wav_async_chunk( + transfer_manager=transfer_manager, + pooling_output=None, + request=request, + is_finished=False, + ) + + assert payload_pending is None + assert payload_ready is not None + assert payload_ready["code_predictor_codes"] == [1, 2, 3] + assert payload_ready["token_offset"] == 0 + assert payload_ready["finished"].item() is False + + +def test_talker2code2wav_async_chunk_final_flush_uses_previous_token_offset(): + transfer_manager = _transfer_manager(pre_lookahead_frames=1) + request = SimpleNamespace( + external_req_id="rid-tail", + output_token_ids=[3, 4, 5], + additional_information={}, + is_finished=lambda: False, + ) + + payload_stream = talker2code2wav_async_chunk( + transfer_manager=transfer_manager, + pooling_output=None, + request=request, + is_finished=False, + ) + request.output_token_ids = [3, 4, 5, 6] + payload_final = talker2code2wav_async_chunk( + transfer_manager=transfer_manager, + pooling_output=None, + 
request=request, + is_finished=True, + ) + + assert payload_stream is not None + assert payload_stream["finished"].item() is False + assert payload_stream["code_predictor_codes"] == [3, 4, 5] + assert payload_stream["token_offset"] == 0 + assert payload_final is not None + assert payload_final["finished"].item() is True + assert payload_final["code_predictor_codes"] == [3, 4, 5, 6] + assert payload_final["token_offset"] == 2 + + +def test_talker2code2wav_async_chunk_respects_prompt_token_pad_on_first_chunk(): + transfer_manager = _transfer_manager(pre_lookahead_frames=1) + request = SimpleNamespace( + external_req_id="rid-pad", + output_token_ids=[8, 9, 10], + additional_information={ + "speech_token": [torch.tensor([[1, 2, 3]])], + }, + is_finished=lambda: False, + ) + + payload_pending = talker2code2wav_async_chunk( + transfer_manager=transfer_manager, + pooling_output=None, + request=request, + is_finished=False, + ) + request.output_token_ids = [8, 9, 10, 11] + payload_ready = talker2code2wav_async_chunk( + transfer_manager=transfer_manager, + pooling_output=None, + request=request, + is_finished=False, + ) + + assert payload_pending is None + assert payload_ready is not None + assert payload_ready["code_predictor_codes"] == [8, 9, 10, 11] + assert payload_ready["token_offset"] == 0 + + +def test_talker2code2wav_async_chunk_emits_terminal_eof_without_duplicate_audio(): + transfer_manager = _transfer_manager() + request = SimpleNamespace( + external_req_id="rid-eof-tail", + output_token_ids=[3, 4], + additional_information={}, + is_finished=lambda: False, + ) + + payload_stream = talker2code2wav_async_chunk( + transfer_manager=transfer_manager, + pooling_output=None, + request=request, + is_finished=False, + ) + payload_final = talker2code2wav_async_chunk( + transfer_manager=transfer_manager, + pooling_output=None, + request=request, + is_finished=True, + ) + + assert payload_stream is not None + assert payload_stream["finished"].item() is False + assert payload_stream["code_predictor_codes"] == [3, 4] + assert payload_final is not None + assert payload_final["finished"].item() is True + assert payload_final["code_predictor_codes"] == [] diff --git a/vllm_omni/core/sched/omni_ar_scheduler.py b/vllm_omni/core/sched/omni_ar_scheduler.py index af178d14d2..0956d1856a 100644 --- a/vllm_omni/core/sched/omni_ar_scheduler.py +++ b/vllm_omni/core/sched/omni_ar_scheduler.py @@ -316,10 +316,6 @@ def update_from_output( finished = self._handle_stopped_request(request) if finished: kv_transfer_params = self._free_request(request) - if self.chunk_transfer_adapter is not None: - self.chunk_transfer_adapter.cleanup_receiver( - request.request_id, - ) if status_before_stop == RequestStatus.RUNNING: stopped_running_reqs.add(request) elif status_before_stop == RequestStatus.WAITING_FOR_CHUNK: diff --git a/vllm_omni/distributed/omni_connectors/transfer_adapter/chunk_transfer_adapter.py b/vllm_omni/distributed/omni_connectors/transfer_adapter/chunk_transfer_adapter.py index e8e00eeca2..393d0e8013 100644 --- a/vllm_omni/distributed/omni_connectors/transfer_adapter/chunk_transfer_adapter.py +++ b/vllm_omni/distributed/omni_connectors/transfer_adapter/chunk_transfer_adapter.py @@ -160,11 +160,15 @@ def _poll_single_request(self, request: Request): new_ids = payload_data.get("code_predictor_codes", []) request.prompt_token_ids = new_ids - # Pass additional fields (like left_context_size) to the request - # Only pass chunk context metadata in additional_information - request.additional_information = {} - if 
"left_context_size" in payload_data: - request.additional_information["left_context_size"] = payload_data["left_context_size"] + # Preserve previously attached request metadata (e.g. prompt + # conditioning tensors) and update only per-chunk fields. + prev_info = getattr(request, "additional_information", None) + info = dict(prev_info) if isinstance(prev_info, dict) else {} + for key, value in payload_data.items(): + if key in {"code_predictor_codes", "finished"}: + continue + info[key] = value + request.additional_information = info request.num_computed_tokens = 0 # Empty chunk with more data expected: keep polling. @@ -240,9 +244,23 @@ def _send_single_request(self, task: dict): if success: self.put_req_chunk[external_req_id] += 1 logger.debug(f"[Stage-{stage_id}] Sent {connector_put_key}") + finished_flag = payload_data.get("finished") + is_payload_finished = False + if isinstance(finished_flag, torch.Tensor): + is_payload_finished = finished_flag.numel() == 1 and bool(finished_flag.item()) + elif finished_flag is not None: + is_payload_finished = bool(finished_flag) + + # Reclaim per-request async state only after the terminal payload + # has been sent successfully. This avoids cleanup->save races. + if is_payload_finished: + self.cleanup(request.request_id, external_req_id) if is_finished: - self.cleanup_sender(external_req_id) + self.code_prompt_token_ids.pop(external_req_id, None) + cached_ic = getattr(self, "_cached_ic", None) + if cached_ic is not None: + cached_ic.pop(external_req_id, None) ######################################################################## # Cleanup diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index f051268824..0a9e11b771 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -1041,9 +1041,20 @@ def _extract_audio_output(res) -> tuple[dict | None, str | None]: streaming needs per-chunk delta slicing; non-streaming needs full concatenation. """ mm = getattr(res, "multimodal_output", None) + ro = None if not mm: ro = getattr(res, "request_output", None) mm = getattr(ro, "multimodal_output", None) if ro else None + if not mm: + if ro is None: + ro = getattr(res, "request_output", None) + outputs = getattr(ro, "outputs", None) if ro else None + if outputs: + for completion_output in outputs: + completion_mm = getattr(completion_output, "multimodal_output", None) + if completion_mm: + mm = completion_mm + break if not mm: return None, None key = "audio" if "audio" in mm else ("model_outputs" if "model_outputs" in mm else None) @@ -1332,6 +1343,29 @@ async def _prepare_speech_generation( sampling_params_list = self.engine_client.default_sampling_params_list + # CosyVoice3: set dynamic min/max tokens based on text length. + # The official model requires min_token_text_ratio to prevent early + # EOS and max_token_text_ratio to cap generation length. 
+ if self._tts_model_type == "cosyvoice3" and sampling_params_list: + import copy + + sampling_params_list = copy.deepcopy(sampling_params_list) + text_len = len(request.input) # rough char-level estimate + # Use the model's configured ratios (defaults: min=2, max=20) + hf_cfg = self.model_config.hf_config + min_ratio = getattr(hf_cfg, "min_token_text_ratio", 2) + max_ratio = getattr(hf_cfg, "max_token_text_ratio", 20) + min_tokens = max(1, int(text_len * min_ratio)) + max_tokens = min(2048, int(text_len * max_ratio)) + sampling_params_list[0].min_tokens = min_tokens + sampling_params_list[0].max_tokens = max_tokens + logger.info( + "CosyVoice3 dynamic tokens: text_len=%d, min_tokens=%d, max_tokens=%d", + text_len, + min_tokens, + max_tokens, + ) + # Fish defaults come from stage_configs YAML. Only override when the caller # explicitly requests a different generation length. if self._is_fish_speech and request.max_new_tokens is not None and sampling_params_list: diff --git a/vllm_omni/model_executor/models/cosyvoice3/assets/mel_filters.npz b/vllm_omni/model_executor/models/cosyvoice3/assets/mel_filters.npz new file mode 100644 index 0000000000000000000000000000000000000000..28ea26909dbdfd608aef67afc4d74d7961ae4bb6 GIT binary patch literal 4271 zcmZ`-cQjmYw;lx1g6JcN7QKe3LG%_Oh!VX=^k~teM-XGQ(Mu4$_Y%?jkm$lFBkB+( z3yfKIgF zxGiAhze`A@t->QRNVV!%P+W=o}VHkB) z%g>qyRHfN1IQ4-=`Y@0T9qE#o+;4E3VQ!epW1Xt=ZG`I3U|62t?<>5h*W|9VvJc`KZ+)ghnA**Z~ET21Tjf_f8oe`vy zZQNtlOx?dDhS71hnOus5cqj)hfyF@H&4y?@9z{I#&cf>A+s2~~(I>TQF}SaR3_tqa z(7&ZdN^vR*t<~?{9DEoI>0PL@Sl?wa?Z{rGX`*eEx9Nh=z*J3HZL1*Py4z$TD#+;m zSSW(kcOTe(4hqgib_W6&xx+j~-u(p)Nn6?>a%wHk=h7Ay$%lcGoo;gAY zmVV7|!Nb;w(PlH@c24{ple2Y3<*9J@jE=sfLzwu_BiAFPE$0Axp`^Nq!H}eG0?r-X zFj@Pwp^al*p>K{@_Cz`q#(N0Y=OpZy^ z{P$KjLJuk_Y%I)$mh`b{uOW5C5Xcmxk!gt_Zg zw>}6fkD4zRK9!#ems~H%U$>V;_wK38Zf-baU$S!#i;7!HWsi}GuC>%@?lMdgkUGC& zh9gC?O-5BlS2#}?7x0?eP#bOL(cqE{M%LJD$CZnplD)CgQR#KCttD=dZK+Ck5R52; z*%5hZ+SXU7)8k%Y^_1U>yI*By(INn&+ir-_4$#dUwTlMNyR@iGQIaZ+eiYqucu)CB z#i{Ru1w+aU#}DHSyzjG_9c?ToB_YjU#f;N=qel98WBIjIc1!#ePwRR+(go&-by#}@ z+M+klVke5b@lWfZ+O&|c??YvRe)&W)qAgtc>t-IZtbRTG#X}49_Q$>P%-)=0W_QY-x%DPep2Vm9#ci zyQcCc4p2&dLtV1@rPe!%>Y^#9W8#ZH&}^@wJKT7N;R9A7cEq&;Y2CYvd@R+Mn&b5O zVyfS^*H#kD74=J5uhD)o`TXoX>>Si$!cT?TXRxj2pB)w_ljjhTby&Je;X|BESZZT= zC%G5!-$BJf&a~U78d_3zBjrvrkJ0CCl@Rfcf7I(`VTNPnI^B#B$zOfPW zG&mEd?R0+W<`l08O1dkcWKS8wB!Z*Cs%I1nMs-EeB-uu5?t@PuD3|z>je8DKi#X(B z{Z=Rz{4X%?-UnxnHQtkELIZ&=J;fK_t}yu8|IxG0(85e&K>H3!!~zlhyJrgti~o1i zzBS*jTgdG~Exp#B-T)6A+PB ztD-e`j^@XAx}|L&JSEFkRvS_%3b%m86z02#Hfn{Y+qIqQ_muywgt?roUA7oiS1xBD zFxmDMsj_cbBcn*^rn^KIMP{AlHM`NiVm*D&`z~7FH#hf<$L3HmJ+=NdiY5>W?nKD? 
z8Ox6{9dKyI1o8a-j9BtV-|=lm`<`v>tR^Cln&x1dMYzu{@wq5KW!#K14_QMnpH5K%Pavag+g6(i8i-#Eq zguc}rH3?BxH4SOqZW#7m*aT(U9-n#_Xn^Q19(}eH!xG`nI!GYziVQNcA0)`FDHD%~ zz2$HnxW4BQ{#*@u`dssbAa`|fESn$8i8FdxGZh48_Uf~_Q@tv?4in)6fwSed)k&ITqu|){^(WL~J z?Lb|0ro06J^>f>^2}^e-+$u5bU4IZNfO?75v8lstS15%XYw2ac^pkU34{QhDR(umt zPu~`w2?FP|nn3!RWZ3{?=77@teulahD9*S*k5KmY3*adlM)%{SR~bkZYlx1q@fkE= zI$7+kiw5!ha=dYlO>Z5KgxnZEJsaBm%v#nkX0MN-h%n&KA?N}xU3K3o-3Jpk?ANq2n9&Lh%K_CTvfiN ze>6w~NSSl8$#NEZ^t7h9YOxI=zcAG|a+m6AWei`3Jw7K;b;T${pJa^4RwRt%F>?>M zBmoQqm1`<_W7i!5P~THp-II)Ka^u;=z;}d{;SVj{G_4`9^HaEb!=@Pa;Dw)CH^DjsGxFqmb%o$Bkop$KnH8 zDYN)Bh)5=5!-*|f0Gh4)oZG=TEBr()g^DCtSQhmT3!ZN`Qd-E%@1cE}hm8&Vq5B+C zVF2_O)9IiZ(v(xzTwJIg5|}KVuE(;}|7dVIrT`$d=q_OG|3PY}x*URYkMXXJ6PT1$IFkNyvY_(9UglDi6TaeikPS(!Bnij z;Szn+)I_oxnRz7(WTYTp+IHSWQ?Xd~tQn(Q1r)kThM?NM< z?d6LaBG!H}R$zRy!Ij(}1?xe^+o+!;tqWJ3NgjHl1XNxzusxQ0I#6qzM(_00UPMw* zF*GWW_q&fqAN=uimSKgBu_@jD%MX3hpNY|*4r=e=k1lw2r**IyD(hcq?A+HtUgUy4Dqh5D7|G9q{)TsUj{g~c!xy>9wk^(LiXA4VKGz_zMvJMX#AgsR z34T3hhJ)#&sUaQ1+0PML(?YA~{5?=(MT}X^Vib%};uoI{qGW@wgJ&_M+8S8clsNz2 zPQkxMi`#3+Khwtl>>K>wxc{71{&!qGu&Zzz_wU(7TLTyG){PAu?!cXs?Dp-y0Ekcn AQvd(} literal 0 HcmV?d00001 diff --git a/vllm_omni/model_executor/models/cosyvoice3/code2wav_core/cfm.py b/vllm_omni/model_executor/models/cosyvoice3/code2wav_core/cfm.py index 7281cd81f9..36ff0d4565 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/code2wav_core/cfm.py +++ b/vllm_omni/model_executor/models/cosyvoice3/code2wav_core/cfm.py @@ -174,7 +174,7 @@ def __init__(self, in_channels, cfm_params, n_spks=1, spk_emb_dim=64, estimator: super().__init__(in_channels, cfm_params, n_spks, spk_emb_dim, estimator) @torch.inference_mode() - def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None): + def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None, streaming: bool = False): """Forward diffusion Args: @@ -277,7 +277,9 @@ def inference( prompt_feat, prompt_feat_len, embedding, - finalize, + streaming: bool = True, + finalize: bool = False, + n_timesteps: int = 10, ): assert token.shape[0] == 1 # xvec projection @@ -314,7 +316,8 @@ def inference( mask=mask.unsqueeze(1), spks=embedding, cond=conds, - n_timesteps=10, + n_timesteps=max(1, int(n_timesteps)), + streaming=streaming, ) feat = feat[:, :, mel_len1:] diff --git a/vllm_omni/model_executor/models/cosyvoice3/config.py b/vllm_omni/model_executor/models/cosyvoice3/config.py index b4e44b7a82..518fe76b78 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/config.py +++ b/vllm_omni/model_executor/models/cosyvoice3/config.py @@ -7,7 +7,9 @@ class CosyVoice3Config(PretrainedConfig): model_type = "cosyvoice3" def __init__(self, **kwargs): - # Set speech EOS so vLLM stops generation at the right token + # Set primary speech EOS so vLLM stops generation at the right token. + # The official CosyVoice3 treats ALL tokens >= speech_token_size + # (6561-6760) as stop signals; see stop_token_ids in the YAML configs. 
kwargs.setdefault("eos_token_id", 6562) super().__init__(**kwargs) self.sample_rate = 24000 diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py index 18a16ba551..2fba8fb8af 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py +++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py @@ -2,14 +2,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os from collections.abc import Iterable, Mapping, Sequence +from dataclasses import replace from functools import partial +from threading import Lock -import numpy as np import torch import torch.nn as nn from transformers.feature_extraction_utils import BatchFeature from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions +from vllm.forward_context import get_forward_context, is_forward_context_available from vllm.inputs import MultiModalDataDict from vllm.logger import init_logger from vllm.model_executor.models.interfaces import SupportsMultiModal @@ -26,6 +28,9 @@ PromptUpdate, ) from vllm.sequence import IntermediateTensors +from vllm.v1.outputs import SamplerOutput +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.sampler import Sampler from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config from vllm_omni.model_executor.models.cosyvoice3.utils import ( @@ -267,6 +272,8 @@ class CosyVoice3Model( supports_multimodal_raw_input_only = True supports_multimodal = True requires_raw_input_tokens = True + prefer_model_sampler = True + _sampling_eps = 1e-5 def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -305,6 +312,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.code2wav = CosyVoice3Code2Wav(self.config) self.model = self.code2wav.flow_model self.hift = self.code2wav.hift + # Keep additional information synchronized for async_chunk updates. 
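+            # Per-chunk fields (token_offset, stream_finished, conditioning tensors)
+            # arrive via request.additional_information from the chunk transfer adapter.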
+ self.enable_update_additional_information = True # Expose streaming parameters self.token_overlap_len = self.code2wav.token_overlap_len @@ -313,6 +322,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.mel_cache_len = self.code2wav.mel_cache_len self.source_cache_len = self.code2wav.source_cache_len self.speech_window = self.code2wav.speech_window + self._stream_audio_cache_by_req: dict[str, torch.Tensor] = {} + self._stream_audio_cache_lock = Lock() + self._stream_vocoder_cache_by_req: dict[str, dict[str, torch.Tensor]] = {} else: raise ValueError(f"Model stage not supported {self.model_stage}") @@ -331,19 +343,277 @@ def _create_llm_vllm_config(self, parent_config: VllmConfig) -> VllmConfig: # Use parent's cache config - critical for PagedAttention to work correctly return parent_config.with_hf_config(qwen_hf_config, architectures=["Qwen2Model"]) + @staticmethod + def _as_tensor(value: object) -> torch.Tensor | None: + """Extract tensor payload from runtime info fields.""" + if isinstance(value, list): + if not value: + return None + value = value[0] + if isinstance(value, torch.Tensor): + return value + return None + + @staticmethod + def _as_str(value: object) -> str | None: + """Extract string payload from runtime info fields.""" + if isinstance(value, list): + if not value: + return None + value = value[0] + if value is None: + return None + return str(value) + + @staticmethod + def _as_bool(value: object) -> bool: + """Extract boolean payload from runtime info fields.""" + if isinstance(value, list): + if not value: + return False + value = value[0] + if isinstance(value, torch.Tensor): + if value.numel() == 0: + return False + return bool(value.reshape(-1)[0].item()) + if value is None: + return False + return bool(value) + + @staticmethod + def _cross_fade_audio(audio: torch.Tensor, prev_tail: torch.Tensor) -> torch.Tensor: + """Blend previous chunk tail into current chunk head using a Hamming window. + + This mirrors upstream CosyVoice's `fade_in_out(...)` semantics: + update the current head in-place using a 2*overlap window, then + concatenate the unchanged remainder. + """ + if audio.numel() == 0 or prev_tail.numel() == 0: + return audio + overlap = min(int(audio.numel()), int(prev_tail.numel())) + if overlap <= 0: + return audio + window = torch.hamming_window(2 * overlap, periodic=False, dtype=audio.dtype, device=audio.device) + fade_in = window[:overlap] + fade_out = window[overlap:] + blended = audio[:overlap] * fade_in + prev_tail[-overlap:].to(device=audio.device, dtype=audio.dtype) * fade_out + if overlap == int(audio.numel()): + return blended + return torch.cat([blended, audio[overlap:]], dim=0) + + def _stitch_stream_audio(self, req_id: str | None, audio: torch.Tensor, stream_finished: bool) -> torch.Tensor: + """Pass-through stitching for async_chunk. + + Chunk overlap is already removed in mel domain via token_offset_tokens. + Applying an additional waveform-domain fade/cache step introduces either + duplicated overlap (if no tail trim) or duration shrink (if tail trim). 
+ """ + if req_id is not None and stream_finished and hasattr(self, "_stream_audio_cache_by_req"): + with self._stream_audio_cache_lock: + self._stream_audio_cache_by_req.pop(req_id, None) + if hasattr(self, "_stream_vocoder_cache_by_req"): + self._stream_vocoder_cache_by_req.pop(req_id, None) + return audio + + @staticmethod + def _split_request_ids(ids: torch.Tensor, seq_token_counts: list[int] | None = None) -> list[torch.Tensor]: + """Split concatenated input_ids into per-request segments.""" + if seq_token_counts is not None: + boundaries = [0] + for count in seq_token_counts: + boundaries.append(boundaries[-1] + int(count)) + total = ids.numel() + return [ids[boundaries[i] : min(boundaries[i + 1], total)] for i in range(len(seq_token_counts))] + + if is_forward_context_available(): + slices = get_forward_context().ubatch_slices + if slices is not None and len(slices) > 1 and not any(hasattr(s, "token_slice") for s in slices): + boundaries = [0] + for s in slices: + boundaries.append(boundaries[-1] + int(s)) + return [ids[boundaries[i] : boundaries[i + 1]] for i in range(len(boundaries) - 1)] + + return [ids] + + def _sanitize_codec_tokens(self, req_ids: torch.Tensor) -> torch.Tensor: + """Filter non-code tokens before feeding flow token embedding.""" + vocab_size = int(self.code2wav.input_embedding.num_embeddings) + valid_mask = (req_ids >= 0) & (req_ids < vocab_size) + return req_ids[valid_mask] + + @staticmethod + def _req_scalar(param: torch.Tensor | None, req_idx: int, default: float | int) -> float | int: + if param is None or param.numel() == 0: + return default + index = min(req_idx, int(param.numel()) - 1) + value = param.reshape(-1)[index].item() + if isinstance(default, int): + return int(value) + return float(value) + + @staticmethod + def _multinomial_sample(probs: torch.Tensor, generator: torch.Generator | None = None) -> torch.Tensor: + return torch.multinomial(probs, 1, replacement=True, generator=generator).reshape(()) + + @classmethod + def _nucleus_sample_one( + cls, + weighted_scores: torch.Tensor, + *, + top_p: float, + top_k: int, + generator: torch.Generator | None, + ) -> int: + probs = weighted_scores.softmax(dim=0) + sorted_prob, sorted_idx = probs.sort(descending=True, stable=True) + kept_probs: list[torch.Tensor] = [] + kept_indices: list[torch.Tensor] = [] + cum_prob = 0.0 + max_keep = len(sorted_idx) if top_k <= 0 else min(int(top_k), len(sorted_idx)) + for i in range(len(sorted_idx)): + if cum_prob < top_p and len(kept_probs) < max_keep: + cum_prob += float(sorted_prob[i].item()) + kept_probs.append(sorted_prob[i]) + kept_indices.append(sorted_idx[i]) + else: + break + + if not kept_probs: + return int(sorted_idx[0].item()) + + sample_probs = torch.stack(kept_probs) + sample_idx = cls._multinomial_sample(sample_probs, generator=generator) + return int(torch.stack(kept_indices)[int(sample_idx.item())].item()) + + @classmethod + def _ras_sample_one( + cls, + weighted_scores: torch.Tensor, + decoded_tokens: Sequence[int], + *, + top_p: float, + top_k: int, + win_size: int, + tau_r: float, + generator: torch.Generator | None, + ) -> int: + top_id = cls._nucleus_sample_one( + weighted_scores, + top_p=top_p, + top_k=top_k, + generator=generator, + ) + if win_size > 0 and decoded_tokens: + recent = torch.as_tensor( + list(decoded_tokens[-win_size:]), + device=weighted_scores.device, + dtype=torch.long, + ) + rep_num = int((recent == top_id).sum().item()) + if rep_num >= win_size * tau_r: + weighted_scores = weighted_scores.clone() + weighted_scores[top_id] = 
float("-inf") + fallback_probs = weighted_scores.softmax(dim=0) + top_id = int(cls._multinomial_sample(fallback_probs, generator=generator).item()) + return top_id + + def _cosyvoice3_ras_enabled(self, sampling_metadata: SamplingMetadata) -> bool: + if self.model_stage != "cosyvoice3_talker": + return False + if sampling_metadata.max_num_logprobs is not None: + return False + if sampling_metadata.temperature is None: + return False + if bool(sampling_metadata.bad_words_token_ids): + return False + if torch.any(sampling_metadata.frequency_penalties != 0): + return False + if torch.any(sampling_metadata.presence_penalties != 0): + return False + return True + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> SamplerOutput | None: + if logits is None or logits.numel() == 0: + return None + if self.model_stage != "cosyvoice3_talker": + return None + + sampler = getattr(self, "_talker_sampler", None) + if sampler is None: + sampler = Sampler() + self._talker_sampler = sampler + + if not self._cosyvoice3_ras_enabled(sampling_metadata): + return sampler(logits=logits, sampling_metadata=sampling_metadata) + + logits = logits.to(torch.float32) + sampling_for_processors = replace(sampling_metadata, no_penalties=True) + logits = sampler.apply_logits_processors(logits, sampling_for_processors, predict_bonus_token=False) + + sampling_cfg = dict(self.config.llm.get("sampling", {})) + default_top_p = float(sampling_cfg.get("top_p", 0.8)) + default_top_k = int(sampling_cfg.get("top_k", 25)) + win_size = int(sampling_cfg.get("win_size", 10)) + tau_r = float(sampling_cfg.get("tau_r", 0.1)) + + sampled_ids: list[int] = [] + for req_idx in range(int(logits.shape[0])): + row_logits = logits[req_idx] + + temperature = float(self._req_scalar(sampling_metadata.temperature, req_idx, 1.0)) + if temperature < self._sampling_eps: + sampled_ids.append(int(torch.argmax(row_logits).item())) + continue + + top_p = float(self._req_scalar(sampling_metadata.top_p, req_idx, default_top_p)) + top_k = int(self._req_scalar(sampling_metadata.top_k, req_idx, default_top_k)) + generator = sampling_metadata.generators.get(req_idx) + weighted_scores = torch.log_softmax(row_logits / max(temperature, self._sampling_eps), dim=0) + decoded_tokens = ( + sampling_metadata.output_token_ids[req_idx] if req_idx < len(sampling_metadata.output_token_ids) else [] + ) + sampled_ids.append( + self._ras_sample_one( + weighted_scores, + decoded_tokens, + top_p=top_p, + top_k=top_k, + win_size=win_size, + tau_r=tau_r, + generator=generator, + ) + ) + + sampled = torch.tensor(sampled_ids, device=logits.device, dtype=torch.int32) + return SamplerOutput(sampled_token_ids=sampled.unsqueeze(-1), logprobs_tensors=None) + def compute_logits(self, hidden_states: torch.Tensor | OmniOutput) -> torch.Tensor | None: if isinstance(hidden_states, OmniOutput): hidden_states = hidden_states.text_hidden_states if self.model_stage == "cosyvoice3_talker": logits = self.model.llm_decoder(hidden_states) + # The decoder outputs speech_token_size + 200 logits. The official + # CosyVoice3 treats ALL tokens >= speech_token_size (the last 200) + # as stop signals. Merge their probabilities into a single EOS + # token (6562) via logsumexp so that vLLM's stop_token_ids=[6562] + # fires with the correct aggregate stop probability. 
+ speech_token_size = self.config.llm["speech_token_size"] + eos_idx = self.config.llm["eos_token_id"] + stop_logits = logits[..., speech_token_size:] # last 200 + merged_stop = torch.logsumexp(stop_logits, dim=-1, keepdim=True) + logits[..., speech_token_size:] = float("-inf") # mask all + logits[..., eos_idx] = merged_stop.squeeze(-1) # restore merged + # Pad to full vocab_size for vLLM token handling. vocab_size = self.config.vocab_size pad_size = vocab_size - logits.size(-1) - pad_shape = logits.shape[:-1] + (pad_size,) - pad = logits.new_full(pad_shape, float("-inf")) - eos_token_val = logits[..., self.config.llm["eos_token_id"]].clone() - logits[..., -200:] = float("-inf") - logits[..., self.config.llm["eos_token_id"]] = eos_token_val - logits = torch.cat([logits, pad], dim=-1) + if pad_size > 0: + pad_shape = logits.shape[:-1] + (pad_size,) + pad = logits.new_full(pad_shape, float("-inf")) + logits = torch.cat([logits, pad], dim=-1) return logits else: raise RuntimeError(f"compute_logits is only valid for {self.model_stage}.") @@ -380,6 +650,7 @@ def embed_input_ids( hidden = int(self.config.hidden_size) return torch.zeros( (input_ids.shape[0], hidden), + device=input_ids.device, ) else: raise RuntimeError(f"embed_input_ids is not valid for {self.model_stage}.") @@ -412,28 +683,116 @@ def forward( return OmniOutput(text_hidden_states=hidden_states, multimodal_outputs=multimodal_outputs) elif self.model_stage == "cosyvoice3_code2wav": - runtime_info = kwargs.get("runtime_additional_information", []) - if not runtime_info: - length = 30 * 24000 - audio = np.zeros((length,)) - return OmniOutput(text_hidden_states=None, multimodal_outputs={"audio": audio}) - - # Remove the last eos token and add batch dimension - token = input_ids[..., :-1].unsqueeze(0) - - # Generate audio using code2wav - tts_speech = self.code2wav( - token=token, - prompt_token=runtime_info[0]["speech_token"][:1], - prompt_feat=runtime_info[0]["speech_feat"][:1], - embedding=runtime_info[0]["embedding"][:1], - n_timesteps=10, - ) - - return OmniOutput( - text_hidden_states=None, - multimodal_outputs={"audio": tts_speech, "sr": torch.tensor(22050)}, - ) + runtime_info = kwargs.get("model_intermediate_buffer") + if runtime_info is None: + runtime_info = kwargs.get("runtime_additional_information", []) + if "runtime_additional_information" in kwargs and "model_intermediate_buffer" not in kwargs: + logger.warning_once("runtime_additional_information is deprecated, use model_intermediate_buffer") + + seq_token_counts = kwargs.get("seq_token_counts") + flat_ids = input_ids.reshape(-1).to(dtype=torch.long) + request_ids_list = self._split_request_ids(flat_ids, seq_token_counts) + + num_reqs = max(1, len(request_ids_list)) + sample_rate = torch.tensor(int(self.config.sample_rate), dtype=torch.int32) + empty_audio = torch.zeros((0,), dtype=torch.float32, device=input_ids.device) + audios: list[torch.Tensor] = [empty_audio] * num_reqs + srs: list[torch.Tensor] = [sample_rate] * num_reqs + if not isinstance(runtime_info, list): + runtime_info = [] + + for idx, req_ids in enumerate(request_ids_list): + info = runtime_info[idx] if idx < len(runtime_info) and isinstance(runtime_info[idx], dict) else {} + req_id = self._as_str(info.get("req_id")) if info else None + stream_finished = self._as_bool(info.get("stream_finished")) if info else False + speech_token = self._as_tensor(info.get("speech_token")) if info else None + speech_feat = self._as_tensor(info.get("speech_feat")) if info else None + embedding = 
self._as_tensor(info.get("embedding")) if info else None + if speech_token is None or speech_feat is None or embedding is None: + if stream_finished and req_id is not None and hasattr(self, "_stream_vocoder_cache_by_req"): + with self._stream_audio_cache_lock: + self._stream_vocoder_cache_by_req.pop(req_id, None) + audios[idx] = self._stitch_stream_audio(req_id, empty_audio, stream_finished) + if ( + req_ids.numel() > 0 + and info + and ("token_offset" in info or "left_context_size" in info or "generated_len" in info) + ): + info_keys = ",".join(sorted(info.keys())) if info else "" + logger.warning_once( + "CosyVoice3 code2wav missing prompt conditioning for non-empty codec tokens: " + "raw_len=%d info_keys=%s", + int(req_ids.numel()), + info_keys, + ) + continue + + token = self._sanitize_codec_tokens(req_ids) + if token.numel() == 0: + audios[idx] = self._stitch_stream_audio(req_id, empty_audio, stream_finished) + if req_ids.numel() > 0: + logger.warning_once( + "CosyVoice3 code2wav received no valid codec tokens after filtering: " + "raw_len=%d raw_range=[%d,%d] vocab_size=%d", + req_ids.numel(), + int(req_ids.min().item()), + int(req_ids.max().item()), + int(self.code2wav.input_embedding.num_embeddings), + ) + continue + + # `generated_len` is injected for many models by the generic + # runner, so only explicit chunk-routing fields should switch + # code2wav into the streaming path. + uses_streaming_decode = bool(info) and ( + "stream_finished" in info or "token_offset" in info or "left_context_size" in info + ) + if uses_streaming_decode: + token_offset = 0 + try: + if info and "token_offset" in info: + token_offset = max(0, int(info.get("token_offset", 0))) + elif info: + token_offset = max(0, int(info.get("left_context_size", 0))) + except (TypeError, ValueError): + token_offset = 0 + + cache_state = None + if req_id is not None and hasattr(self, "_stream_vocoder_cache_by_req"): + with self._stream_audio_cache_lock: + cache_state = self._stream_vocoder_cache_by_req.get(req_id) + + tts_speech, new_cache_state = self.code2wav.forward_streaming( + token=token.unsqueeze(0), + prompt_token=speech_token[:1], + prompt_feat=speech_feat[:1], + embedding=embedding[:1], + cache_state=cache_state, + n_timesteps=10, + token_offset_tokens=token_offset, + finalize=stream_finished, + ) + + if req_id is not None and hasattr(self, "_stream_vocoder_cache_by_req"): + with self._stream_audio_cache_lock: + if new_cache_state is None or stream_finished: + self._stream_vocoder_cache_by_req.pop(req_id, None) + else: + self._stream_vocoder_cache_by_req[req_id] = new_cache_state + else: + tts_speech = self.code2wav.forward( + token=token.unsqueeze(0), + prompt_token=speech_token[:1], + prompt_feat=speech_feat[:1], + embedding=embedding[:1], + n_timesteps=10, + ) + + audio = tts_speech.reshape(-1).to(dtype=torch.float32) + + audios[idx] = self._stitch_stream_audio(req_id, audio, stream_finished) + + return OmniOutput(text_hidden_states=None, multimodal_outputs={"audio": audios, "sr": srs}) else: raise ValueError(f"Unsupported model_stage: {self.model_stage}") diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py index 222d6d98ac..3ad23cdb10 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py +++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py @@ -11,11 +11,12 @@ from __future__ import annotations +from contextlib import nullcontext + import numpy as np import torch import 
torch.nn as nn from omegaconf import DictConfig -from torch.nn import functional as F from vllm.logger import init_logger from vllm_omni.diffusion.models.cosyvoice3_audio.cosyvoice3_dit import DiT @@ -29,7 +30,6 @@ ) from vllm_omni.model_executor.models.cosyvoice3.code2wav_core.layers import PreLookaheadLayer from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config -from vllm_omni.model_executor.models.cosyvoice3.utils import make_pad_mask logger = init_logger(__name__) @@ -151,84 +151,160 @@ def spk_embed_affine_layer(self) -> nn.Linear: return self.flow_model.spk_embed_affine_layer @torch.inference_mode() - def forward( + def _forward_mel( self, token: torch.Tensor, prompt_token: torch.Tensor, prompt_feat: torch.Tensor, embedding: torch.Tensor, n_timesteps: int = 10, + token_offset_tokens: int = 0, + streaming: bool = True, + finalize: bool = False, ) -> torch.Tensor: - """Generate audio waveform from speech tokens. + """Generate mel features via the upstream flow-model inference path.""" + flow_weight = next(self.flow_model.parameters()) + device = flow_weight.device + dtype = flow_weight.dtype + + token = token.to(device=device, dtype=torch.int32) + prompt_token = prompt_token.to(device=device, dtype=torch.int32) + prompt_feat = prompt_feat.to(device=device, dtype=dtype) + embedding = embedding.to(device=device, dtype=dtype) + token_len = torch.tensor([token.shape[1]], device=device, dtype=torch.int32) + prompt_token_len = torch.tensor([prompt_token.shape[1]], device=device, dtype=torch.int32) + prompt_feat_len = torch.tensor([prompt_feat.shape[1]], device=device, dtype=torch.int32) + + with nullcontext(): + feat, _ = self.flow_model.inference( + token=token, + token_len=token_len, + prompt_token=prompt_token, + prompt_token_len=prompt_token_len, + prompt_feat=prompt_feat, + prompt_feat_len=prompt_feat_len, + embedding=embedding, + streaming=streaming, + finalize=finalize, + n_timesteps=n_timesteps, + ) - Args: - token: Speech tokens from talker stage [batch, seq_len] - prompt_token: Prompt speech tokens [batch, prompt_len] - prompt_feat: Prompt mel features [batch, feat_len, mel_dim] - embedding: Speaker embedding [batch, spk_dim] - n_timesteps: Number of diffusion steps - - Returns: - Audio waveform [batch, 1, audio_len] - """ - device = token.device - dtype = next(self.flow_model.parameters()).dtype + trim_mel = max(0, int(token_offset_tokens)) * int(self.token_mel_ratio) + if trim_mel > 0: + feat = feat[:, :, trim_mel:] - # Normalize and project speaker embedding - embedding = embedding.to(device=device, dtype=dtype) - embedding = F.normalize(embedding, dim=1) - embedding = self.spk_embed_affine_layer(embedding) + return feat - # Prepare tokens - prompt_token = prompt_token.to(device=device) - token_len1, token_len2 = prompt_token.shape[1], token.shape[1] - prompt_token_len = torch.tensor([token_len1], device=device, dtype=torch.int32) - token_len = torch.tensor([token_len2], device=device, dtype=torch.int32) + @staticmethod + def _fade_speech( + speech: torch.Tensor, + prev_speech: torch.Tensor, + ) -> torch.Tensor: + """Blend previous speech tail into current speech head.""" + if speech.numel() == 0 or prev_speech.numel() == 0: + return speech + overlap = min(int(speech.shape[-1]), int(prev_speech.shape[-1])) + if overlap <= 0: + return speech + window = torch.hamming_window(2 * overlap, periodic=False, dtype=speech.dtype, device=speech.device) + fade_in = window[:overlap].view(1, -1) + fade_out = window[overlap:].view(1, -1) + blended_head = ( + speech[:, 
:overlap] * fade_in + + prev_speech[:, -overlap:].to(device=speech.device, dtype=speech.dtype) * fade_out + ) + if overlap == int(speech.shape[-1]): + return blended_head + return torch.cat([blended_head, speech[:, overlap:]], dim=-1) - # Concatenate prompt and target tokens - full_token = torch.cat([prompt_token, token], dim=1) - full_token_len = prompt_token_len + token_len + @torch.inference_mode() + def forward_streaming( + self, + token: torch.Tensor, + prompt_token: torch.Tensor, + prompt_feat: torch.Tensor, + embedding: torch.Tensor, + *, + cache_state: dict[str, torch.Tensor] | None = None, + n_timesteps: int = 10, + token_offset_tokens: int = 0, + finalize: bool = False, + ) -> tuple[torch.Tensor, dict[str, torch.Tensor] | None]: + """Decode streaming audio using cumulative mel + emitted-speech offset. + + This mirrors upstream CosyVoice3 streaming semantics more closely than + waveform-domain overlap-add: keep a cumulative mel history per request, + re-run causal HiFT on the history, and emit only the newly grown speech + suffix. That preserves causal look-right handling without double + trimming or duplicated overlap at chunk boundaries. + """ + with nullcontext(): + feat = self._forward_mel( + token=token, + prompt_token=prompt_token, + prompt_feat=prompt_feat, + embedding=embedding, + n_timesteps=n_timesteps, + token_offset_tokens=token_offset_tokens, + streaming=True, + finalize=finalize, + ) + hift_weight = self.hift.m_source.l_linear.weight + chunk_mel = feat.to(device=hift_weight.device, dtype=hift_weight.dtype) + + cached_mel = None if not cache_state else cache_state.get("mel") + speech_offset_obj = None if not cache_state else cache_state.get("speech_offset") + try: + speech_offset = int(speech_offset_obj) if speech_offset_obj is not None else 0 + except (TypeError, ValueError): + speech_offset = 0 + + if isinstance(cached_mel, torch.Tensor) and cached_mel.numel() > 0: + cached_mel = cached_mel.to(device=chunk_mel.device, dtype=chunk_mel.dtype) + tts_mel = torch.cat([cached_mel, chunk_mel], dim=-1) if chunk_mel.numel() > 0 else cached_mel + else: + tts_mel = chunk_mel - # Create mask - mask = (~make_pad_mask(full_token_len)).unsqueeze(-1).to(embedding) + if tts_mel.shape[-1] == 0: + tts_speech = torch.zeros((chunk_mel.shape[0], 1, 0), device=chunk_mel.device, dtype=chunk_mel.dtype) + else: + with nullcontext(): + tts_speech, _ = self.hift.inference(speech_feat=tts_mel, finalize=finalize) - # Token embedding (clamp to valid codebook range; EOS/padding tokens may exceed vocab_size) - token_emb = ( - self.input_embedding(torch.clamp(full_token, min=0, max=self.input_embedding.num_embeddings - 1)) * mask - ) + tts_speech = tts_speech.reshape(tts_speech.shape[0], -1) + speech_offset = max(0, min(speech_offset, int(tts_speech.shape[-1]))) + emitted_speech = tts_speech[:, speech_offset:] - # Pre-lookahead processing - h = self.pre_lookahead_layer(token_emb) - h = h.repeat_interleave(self.token_mel_ratio, dim=1) + if finalize: + return emitted_speech.reshape(emitted_speech.shape[0], 1, -1), None - # Calculate mel lengths - mel_len1 = prompt_feat.shape[1] - mel_len2 = h.shape[1] - mel_len1 + new_state = { + "mel": tts_mel.detach().cpu().contiguous(), + "speech_offset": int(tts_speech.shape[-1]), + } + return emitted_speech.reshape(emitted_speech.shape[0], 1, -1), new_state - # Build conditioning - conds = torch.zeros( - [1, mel_len1 + mel_len2, self.output_size], - device=device, - dtype=h.dtype, - ) - conds[:, :mel_len1] = prompt_feat - conds = conds.transpose(1, 2) - - # 
Create mel mask - mel_mask = (~make_pad_mask(torch.tensor([mel_len1 + mel_len2]))).to(h) - - # Run flow matching decoder - feat, _ = self.decoder( - mu=h.transpose(1, 2).contiguous(), - mask=mel_mask.unsqueeze(1), - spks=embedding, - cond=conds, + @torch.inference_mode() + def forward( + self, + token: torch.Tensor, + prompt_token: torch.Tensor, + prompt_feat: torch.Tensor, + embedding: torch.Tensor, + n_timesteps: int = 10, + ) -> torch.Tensor: + """Generate audio waveform from speech tokens.""" + feat = self._forward_mel( + token=token, + prompt_token=prompt_token, + prompt_feat=prompt_feat, + embedding=embedding, n_timesteps=n_timesteps, + token_offset_tokens=0, + streaming=False, + finalize=True, ) - # Extract generated portion (after prompt) - feat = feat[:, :, mel_len1:] - # Run vocoder hift_weight = self.hift.m_source.l_linear.weight tts_mel = feat.to(device=hift_weight.device, dtype=hift_weight.dtype) @@ -240,7 +316,7 @@ def forward( dtype=tts_mel.dtype, ) else: - tts_speech, _ = self.hift.inference(speech_feat=tts_mel) + tts_speech, _ = self.hift.inference(speech_feat=tts_mel, finalize=True) return tts_speech diff --git a/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml b/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml index bfb847f5ea..8e0582723e 100644 --- a/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml +++ b/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml @@ -24,7 +24,10 @@ stage_args: dtype: "float32" default_sampling_params: max_tokens: 2048 - stop_token_ids: [6562] # speech EOS token + stop_token_ids: [6562] # merged speech stop token (logsumexp of all 200 stop logits) + top_k: 25 + top_p: 0.8 + repetition_penalty: 1.0001 # near-identity; forces vLLM to track output_token_ids for RAS - stage_id: 1 runtime: diff --git a/vllm_omni/model_executor/stage_configs/cosyvoice3_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/cosyvoice3_async_chunk.yaml new file mode 100644 index 0000000000..ca7e9850ae --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/cosyvoice3_async_chunk.yaml @@ -0,0 +1,85 @@ +# Stage config for running CosyVoice3 with async_chunk architecture +# Stage 0: Talker (text prompt -> speech tokens streamed by chunks) +# Stage 1: Code2Wav (flow matching -> acoustic features -> waveform) +async_chunk: true + +stage_args: + - stage_id: 0 + is_comprehension: true + runtime: + devices: 0 + max_batch_size: 1 + engine_args: + model_stage: cosyvoice3_talker + model_arch: CosyVoice3Model + worker_type: ar + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.cosyvoice3.talker2code2wav_async_chunk + trust_remote_code: true + gpu_memory_utilization: 0.4 + engine_output_type: latent + disable_hybrid_kv_cache_manager: true + enable_prefix_caching: false + enforce_eager: true + mm_processor_cache_gb: 0 + skip_mm_profiling: true + dtype: "float32" + default_sampling_params: + max_tokens: 2048 + stop_token_ids: [6562] # merged speech stop token (logsumexp of all 200 stop logits) + top_k: 25 + top_p: 0.8 + repetition_penalty: 1.0001 # near-identity; forces vLLM to track output_token_ids for RAS + output_connectors: + to_stage_1: connector_of_shared_memory + + - stage_id: 1 + runtime: + devices: 0 + max_batch_size: 1 + engine_args: + model_stage: cosyvoice3_code2wav + model_arch: CosyVoice3Model + worker_type: generation + worker_cls: 
vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + trust_remote_code: true + engine_output_type: latent + gpu_memory_utilization: 0.2 + enforce_eager: true + disable_hybrid_kv_cache_manager: true + enable_prefix_caching: false + skip_mm_profiling: true + max_model_len: 32768 + dtype: "float32" + default_sampling_params: + max_tokens: 2048 + engine_input_source: [0] + final_output: true + final_output_type: audio + input_connectors: + from_stage_0: connector_of_shared_memory + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + codec_streaming: true + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + codec_chunk_frames: 25 + codec_left_context_frames: 25 + codec_vocab_size: 6561 + + edges: + - from: 0 + to: 1 + window_size: -1 diff --git a/vllm_omni/model_executor/stage_input_processors/cosyvoice3.py b/vllm_omni/model_executor/stage_input_processors/cosyvoice3.py index b7f21eca8f..c722a125e5 100644 --- a/vllm_omni/model_executor/stage_input_processors/cosyvoice3.py +++ b/vllm_omni/model_executor/stage_input_processors/cosyvoice3.py @@ -1,10 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections import defaultdict +from contextlib import nullcontext from typing import Any +import numpy as np +import torch from vllm.inputs import TextPrompt from vllm_omni.inputs.data import OmniTokensPrompt +def _ensure_list(x: Any) -> list[Any]: + if hasattr(x, "_x"): + return list(x._x) + if isinstance(x, list): + return list(x) + if isinstance(x, tuple): + return list(x) + if x is None: + return [] + try: + return list(x) + except TypeError: + return [x] + + +def _to_cpu_tensor(x: Any) -> torch.Tensor | None: + if isinstance(x, list): + if not x: + return None + x = x[0] + if isinstance(x, torch.Tensor): + return x.detach().cpu() + return None + + +def _decode_additional_information(raw_info: Any) -> dict[str, Any]: + if raw_info is None: + return {} + if isinstance(raw_info, dict): + return raw_info + + entries = getattr(raw_info, "entries", None) + if not isinstance(entries, dict): + return {} + + decoded: dict[str, Any] = {} + for key, entry in entries.items(): + tensor_data = getattr(entry, "tensor_data", None) + if tensor_data is not None: + dtype_name = getattr(entry, "tensor_dtype", "float32") + tensor_shape = getattr(entry, "tensor_shape", None) + if tensor_shape is None: + continue + dt = np.dtype(dtype_name) + arr = np.frombuffer(tensor_data, dtype=dt).reshape(tensor_shape) + decoded[key] = torch.from_numpy(arr.copy()) + else: + decoded[key] = getattr(entry, "list_data", None) + return decoded + + def text2flow( stage_list: list[Any], engine_input_source: list[int], @@ -15,18 +72,178 @@ def text2flow( source_stage_id = engine_input_source[0] source_outputs = stage_list[source_stage_id].engine_outputs - if not isinstance(prompt, list): - prompt = [prompt] + engine_inputs: list[OmniTokensPrompt] = [] + for source_output in source_outputs: + output = source_output.outputs[0] + multi_modal_data = output.multimodal_output + if multi_modal_data is None: + raise RuntimeError(f"Missing multimodal_output for request {source_output.request_id}") + + output_ids = _ensure_list(output.token_ids) + prefix_ids = _ensure_list(source_output.prompt_token_ids) + additional_info = 
dict(multi_modal_data) + additional_info["prefix_ids"] = prefix_ids + engine_inputs.append(OmniTokensPrompt(prompt_token_ids=output_ids, additional_information=additional_info)) + return engine_inputs + + +def talker2code2wav_async_chunk( + transfer_manager: Any, + pooling_output: dict[str, Any] | None, + request: Any, + is_finished: bool = False, +) -> dict[str, Any] | None: + """CosyVoice3 async_chunk processor: talker token stream -> code2wav chunks.""" + with nullcontext(): + request_id = request.external_req_id + finished = bool(is_finished or request.is_finished()) + + connector = getattr(transfer_manager, "connector", None) + raw_cfg = getattr(connector, "config", {}) or {} + cfg = raw_cfg.get("extra", raw_cfg) if isinstance(raw_cfg, dict) else {} + chunk_size = int(cfg.get("codec_chunk_frames", 25)) + code_vocab_size = int(cfg.get("codec_vocab_size", 6561)) + pre_lookahead_len = int(cfg.get("codec_pre_lookahead_frames", 3)) + max_chunk_size = int(cfg.get("codec_max_chunk_frames", 4 * chunk_size)) + stream_scale_factor = int(cfg.get("codec_stream_scale_factor", 2)) + if chunk_size <= 0 or pre_lookahead_len < 0 or max_chunk_size <= 0 or stream_scale_factor <= 0: + raise ValueError( + f"Invalid codec chunk config: codec_chunk_frames={chunk_size}, " + f"codec_pre_lookahead_frames={pre_lookahead_len}, " + f"codec_max_chunk_frames={max_chunk_size}, " + f"codec_stream_scale_factor={stream_scale_factor}" + ) + + request_state = transfer_manager.request_payload.get(request_id) + if not isinstance(request_state, dict) or "_cosyvoice3_async_state" not in request_state: + with nullcontext(): + info = _decode_additional_information(getattr(request, "additional_information", None)) + prompt_payload = {} + for key in ("speech_token", "speech_feat", "embedding"): + value = _to_cpu_tensor(info.get(key)) + if value is not None: + prompt_payload[key] = value + if isinstance(pooling_output, dict): + for key in ("speech_token", "speech_feat", "embedding"): + if key in prompt_payload: + continue + value = _to_cpu_tensor(pooling_output.get(key)) + if value is not None: + prompt_payload[key] = value + prompt_token = prompt_payload.get("speech_token") + prompt_token_len = ( + int(prompt_token.shape[1]) + if isinstance(prompt_token, torch.Tensor) and prompt_token.ndim >= 2 + else 0 + ) + prompt_token_pad = ( + ((prompt_token_len + chunk_size - 1) // chunk_size) * chunk_size - prompt_token_len + if prompt_token_len > 0 + else 0 + ) + request_state = { + "_cosyvoice3_async_state": { + "seen_len": 0, + "sent_prompt": False, + "emitted_chunks": 0, + "emitted_token_len": 0, + "token_hop_len": chunk_size, + "prompt_token_pad": prompt_token_pad, + "pre_lookahead_len": pre_lookahead_len, + "token_max_hop_len": max(chunk_size, max_chunk_size), + "stream_scale_factor": stream_scale_factor, + "terminal_sent": False, + "prompt_payload": prompt_payload, + } + } + transfer_manager.request_payload[request_id] = request_state + + state = request_state["_cosyvoice3_async_state"] + if bool(state.get("terminal_sent", False)): + return None + + with nullcontext(): + output_token_ids = _ensure_list(getattr(request, "output_token_ids", [])) + seen_len = int(state.get("seen_len", 0)) + new_tokens = output_token_ids[seen_len:] if seen_len < len(output_token_ids) else [] + state["seen_len"] = len(output_token_ids) + + if not hasattr(transfer_manager, "code_prompt_token_ids"): + transfer_manager.code_prompt_token_ids = defaultdict(list) + token_frames = transfer_manager.code_prompt_token_ids[request_id] + for tok in new_tokens: + 
tok_int = int(tok) + if 0 <= tok_int < code_vocab_size: + token_frames.append([tok_int]) + + length = len(token_frames) + if length <= 0: + if not finished: + return None + payload: dict[str, Any] = { + "code_predictor_codes": [], + "finished": torch.tensor(True, dtype=torch.bool), + } + if not state.get("sent_prompt", False): + payload.update(state.get("prompt_payload", {})) + state["sent_prompt"] = True + state["terminal_sent"] = True + return payload + + emitted_token_len = int(state.get("emitted_token_len", 0)) + if finished and length <= emitted_token_len: + payload = { + "code_predictor_codes": [], + "finished": torch.tensor(True, dtype=torch.bool), + } + if not state.get("sent_prompt", False): + payload.update(state.get("prompt_payload", {})) + state["sent_prompt"] = True + state["terminal_sent"] = True + return payload + + with nullcontext(): + token_hop_len = max(1, int(state.get("token_hop_len", chunk_size))) + prompt_token_pad = max(0, int(state.get("prompt_token_pad", 0))) + pre_lookahead_len = max(0, int(state.get("pre_lookahead_len", pre_lookahead_len))) + available = max(0, length - emitted_token_len) + this_token_hop_len = token_hop_len + prompt_token_pad if emitted_token_len == 0 else token_hop_len + required = this_token_hop_len + pre_lookahead_len + + if not finished: + if available < required: + return None + prefix_len = emitted_token_len + required + token_offset = emitted_token_len + else: + if available <= 0: + return None + prefix_len = length + token_offset = emitted_token_len + + with nullcontext(): + code_predictor_codes = [int(frame[0]) for frame in token_frames[:prefix_len]] - source_output = source_outputs[0] - output = source_output.outputs[0] + payload = { + "code_predictor_codes": code_predictor_codes, + "token_offset": token_offset, + "left_context_size": token_offset, + "req_id": [request_id], + "stream_finished": torch.tensor(finished, dtype=torch.bool), + "finished": torch.tensor(finished, dtype=torch.bool), + } + if not state.get("sent_prompt", False): + payload.update(state.get("prompt_payload", {})) + state["sent_prompt"] = True - multi_modal_data = output.multimodal_output - if multi_modal_data is None: - raise RuntimeError(f"Missing multimodal_output for request {source_output.request_id}") + if not finished: + state["emitted_token_len"] = emitted_token_len + this_token_hop_len + state["token_hop_len"] = min( + int(state.get("token_max_hop_len", chunk_size)), + max(chunk_size, token_hop_len * int(state.get("stream_scale_factor", 1))), + ) + else: + state["terminal_sent"] = True - output_ids = output.token_ids - prefix_ids = source_output.prompt_token_ids - multi_modal_data["prefix_ids"] = prefix_ids - engine_input = OmniTokensPrompt(prompt_token_ids=output_ids, additional_information=multi_modal_data) - return [engine_input] + state["emitted_chunks"] = int(state.get("emitted_chunks", 0)) + 1 + return payload diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index f1115ab4c6..01ec23acb4 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -6,7 +6,9 @@ from __future__ import annotations +from contextlib import nullcontext from copy import copy +from dataclasses import replace from typing import Any, NamedTuple import numpy as np @@ -89,6 +91,53 @@ def _make_buffer(self, *size, dtype, numpy=True): with maybe_disable_pin_memory_for_ray(self, total_bytes): return super()._make_buffer(*size, dtype=dtype, numpy=numpy) + def 
_build_model_sampler_output_token_ids(self) -> list[list[int]]: + """Build decoded-token history for custom model samplers. + + vLLM only populates sampling_metadata.output_token_ids when penalties or + logits processors require it. CosyVoice3's custom RAS sampler also + depends on this history, so we reconstruct it directly from the input + batch for prefer_model_sampler models. + """ + req_output_token_ids = getattr(self.input_batch, "req_output_token_ids", []) + req_ids = list(getattr(self.input_batch, "req_ids", [])) + output_token_ids = [list(req_output_token_ids[idx] or []) for idx in range(len(req_ids))] + + sampled_token_ids_cpu = getattr(self.input_batch, "sampled_token_ids_cpu", None) + async_copy_ready_event = getattr(self.input_batch, "async_copy_ready_event", None) + prev_req_id_to_index = getattr(self.input_batch, "prev_req_id_to_index", None) + if sampled_token_ids_cpu is None or not output_token_ids or prev_req_id_to_index is None: + return output_token_ids + + sampled_token_ids: list[list[int]] | None = None + for index, req_id in enumerate(req_ids): + prev_index = prev_req_id_to_index.get(req_id) + if prev_index is None: + continue + req_history = output_token_ids[index] + if not req_history or req_history[-1] != -1: + continue + if sampled_token_ids is None: + assert async_copy_ready_event is not None + async_copy_ready_event.synchronize() + sampled_token_ids = sampled_token_ids_cpu.tolist() + new_ids = list(sampled_token_ids[prev_index]) + if not new_ids: + continue + num_sampled_ids = len(new_ids) if new_ids[-1] != -1 else new_ids.index(-1) + first_placeholder = req_history.index(-1) + num_placeholders = len(req_history) - first_placeholder + num_to_replace = min(num_sampled_ids, num_placeholders) + req_history[first_placeholder : first_placeholder + num_to_replace] = new_ids[:num_to_replace] + + return output_token_ids + + def _sampling_metadata_for_model_sampler(self, sampling_metadata): + output_token_ids = self._build_model_sampler_output_token_ids() + if output_token_ids == sampling_metadata.output_token_ids: + return sampling_metadata + return replace(sampling_metadata, output_token_ids=output_token_ids) + @torch.inference_mode() def execute_model( self, @@ -302,6 +351,7 @@ def execute_model( # (wait_for_save + clear metadata) until after draft model runs. defer_kv_connector_finalize = self.speculative_config is not None with ( + nullcontext(), set_forward_context( attn_metadata, self.vllm_config, @@ -424,6 +474,39 @@ def execute_model( return None + def _sample( + self, + logits: torch.Tensor | None, + spec_decode_metadata: Any, + ): + sampling_metadata = self.input_batch.sampling_metadata + if spec_decode_metadata is None: + model_sample = getattr(self.model, "sample", None) + if logits is not None and callable(model_sample) and getattr(self.model, "prefer_model_sampler", False): + # Apply logit bias (min_tokens, allowed_token_ids) before + # the custom model sampler — the standard GPU sampler does + # this internally, but prefer_model_sampler bypasses it. 
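+                # The hasattr check keeps this a no-op when the sampler does not
+                # expose logit_bias_state.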
+ if hasattr(self.sampler, "logit_bias_state"): + self.sampler.logit_bias_state.apply_logit_bias( + logits, + self.input_batch.expanded_idx_mapping, + self.input_batch.idx_mapping_np, + self.input_batch.positions[self.input_batch.logits_indices], + ) + sampler_output = model_sample( + logits, + self._sampling_metadata_for_model_sampler(sampling_metadata), + ) + if sampler_output is not None: + return sampler_output + self.input_batch.update_async_output_token_ids() + return self.sampler( + logits=logits, + sampling_metadata=sampling_metadata, + ) + + return super()._sample(logits, spec_decode_metadata) + @torch.inference_mode() def sample_tokens( self, From 094907eeee6f3569a3b7c756a084b1d8026a616b Mon Sep 17 00:00:00 2001 From: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com> Date: Sun, 5 Apr 2026 14:47:08 +0800 Subject: [PATCH 049/204] [Chore] Fix Bagel model import compatibility (#2491) Signed-off-by: Yuanheng Zhao --- vllm_omni/model_executor/models/bagel/bagel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm_omni/model_executor/models/bagel/bagel.py b/vllm_omni/model_executor/models/bagel/bagel.py index 934f434e64..3b4acae515 100644 --- a/vllm_omni/model_executor/models/bagel/bagel.py +++ b/vllm_omni/model_executor/models/bagel/bagel.py @@ -8,7 +8,7 @@ from transformers import BatchFeature from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions -from vllm.inputs import MultiModalDataDict +from vllm.inputs import ModalityData, MultiModalDataDict from vllm.model_executor.layers.layernorm import RMSNorm as VllmRMSNorm from vllm.model_executor.layers.linear import ( QKVParallelLinear, @@ -27,7 +27,6 @@ from vllm.multimodal.parse import ( ImageEmbeddingItems, ImageProcessorItems, - ModalityData, ModalityDataItems, MultiModalDataItems, MultiModalDataParser, From 0824edefd5e60fdc9eaf66c16ad692f161b3c322 Mon Sep 17 00:00:00 2001 From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Date: Sun, 5 Apr 2026 03:05:52 -0400 Subject: [PATCH 050/204] ci: remove CosyVoice3 post-merge test (#2492) --- .buildkite/test-merge.yml | 43 --------------------------------------- 1 file changed, 43 deletions(-) diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 15f668b386..f98ff17140 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -423,46 +423,3 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate - - - label: "CosyVoice3-TTS E2E Test" - timeout_in_minutes: 20 - depends_on: upload-merge-pipeline - commands: - - | - timeout 20m bash -c ' - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "advanced_model" --run-level "advanced_model" - ' - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 1 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate From 832952b2beb1dcba3b328b3ea43e21f4569fc9cd Mon Sep 17 00:00:00 2001 
From: Lancer
Date: Sun, 5 Apr 2026 15:16:11 +0800
Subject: [PATCH 051/204] [Feat] add diffusion pipeline profiler and progress bar support to FluxKontextPipeline et al. (#2489)

Signed-off-by: Lancer
---
 .../models/flux/pipeline_flux_kontext.py      | 103 +++++++++--------
 .../diffusion/models/flux2/pipeline_flux2.py  |  95 +++++++++-------
 .../pipeline_hunyuan_video_1_5.py             | 103 +++++++++--------
 .../pipeline_hunyuan_video_1_5_i2v.py         | 107 ++++++++++--------
 4 files changed, 224 insertions(+), 184 deletions(-)

diff --git a/vllm_omni/diffusion/models/flux/pipeline_flux_kontext.py b/vllm_omni/diffusion/models/flux/pipeline_flux_kontext.py
index 3232b436d6..c7574c1c85 100644
--- a/vllm_omni/diffusion/models/flux/pipeline_flux_kontext.py
+++ b/vllm_omni/diffusion/models/flux/pipeline_flux_kontext.py
@@ -31,6 +31,8 @@ )
 from vllm_omni.diffusion.models.flux.flux_pipeline_mixin import FluxPipelineMixin
 from vllm_omni.diffusion.models.interface import SupportImageInput
+from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin
+from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs
 from vllm_omni.logger import init_logger
@@ -67,7 +69,9 @@ def post_process_func(images: torch.Tensor) -> list[PIL.Image.Image]:
     return post_process_func
 
 
-class FluxKontextPipeline(nn.Module, FluxPipelineMixin, SupportImageInput):
+class FluxKontextPipeline(
+    nn.Module, FluxPipelineMixin, SupportImageInput, ProgressBarMixin, DiffusionPipelineProfilerMixin
+):
     """FLUX.1-Kontext pipeline for image editing with text guidance."""
 
     support_image_input = True
@@ -148,6 +152,10 @@ def __init__(
         self._callback_tensor_inputs = ["latents", "prompt_embeds"]
         self.latent_channels = self.vae.config.latent_channels if hasattr(self.vae, "config") else 16
 
+        self.setup_diffusion_pipeline_profiler(
+            enable_diffusion_pipeline_profiler=self.od_config.enable_diffusion_pipeline_profiler
+        )
+
     def _get_t5_prompt_embeds(
         self,
         prompt: str | list[str] = None,
@@ -635,58 +643,61 @@ def forward(
 
         # 5. 
Denoising loop self.scheduler.set_begin_index(0) - for i, t in enumerate(timesteps): - if self.interrupt: - continue - - latent_model_input = latents - if image_latents is not None: - latent_model_input = torch.cat([latents, image_latents], dim=1) - timestep = t.expand(latents.shape[0]).to(latents.dtype) - - noise_pred = self.transformer( - hidden_states=latent_model_input, - timestep=timestep / 1000, - guidance=guidance, - pooled_projections=pooled_prompt_embeds, - encoder_hidden_states=prompt_embeds, - txt_ids=text_ids, - img_ids=latent_ids, - joint_attention_kwargs=self.joint_attention_kwargs, - return_dict=False, - )[0] - noise_pred = noise_pred[:, : latents.size(1)] - - if do_true_cfg: - neg_noise_pred = self.transformer( + with self.progress_bar(total=len(timesteps)) as pbar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + latent_model_input = latents + if image_latents is not None: + latent_model_input = torch.cat([latents, image_latents], dim=1) + timestep = t.expand(latents.shape[0]).to(latents.dtype) + + noise_pred = self.transformer( hidden_states=latent_model_input, timestep=timestep / 1000, guidance=guidance, - pooled_projections=negative_pooled_prompt_embeds, - encoder_hidden_states=negative_prompt_embeds, - txt_ids=negative_text_ids, + pooled_projections=pooled_prompt_embeds, + encoder_hidden_states=prompt_embeds, + txt_ids=text_ids, img_ids=latent_ids, joint_attention_kwargs=self.joint_attention_kwargs, return_dict=False, )[0] - neg_noise_pred = neg_noise_pred[:, : latents.size(1)] - noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred) - - latents_dtype = latents.dtype - latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] - - if latents.dtype != latents_dtype: - if torch.backends.mps.is_available(): - latents = latents.to(latents_dtype) - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + noise_pred = noise_pred[:, : latents.size(1)] + + if do_true_cfg: + neg_noise_pred = self.transformer( + hidden_states=latent_model_input, + timestep=timestep / 1000, + guidance=guidance, + pooled_projections=negative_pooled_prompt_embeds, + encoder_hidden_states=negative_prompt_embeds, + txt_ids=negative_text_ids, + img_ids=latent_ids, + joint_attention_kwargs=self.joint_attention_kwargs, + return_dict=False, + )[0] + neg_noise_pred = neg_noise_pred[:, : latents.size(1)] + noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred) + + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + if latents.dtype != latents_dtype: + if torch.backends.mps.is_available(): + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + + pbar.update() if output_type == "latent": image = latents else: diff --git a/vllm_omni/diffusion/models/flux2/pipeline_flux2.py b/vllm_omni/diffusion/models/flux2/pipeline_flux2.py index c5bf9b77d9..cc25c6b704 100644 --- 
a/vllm_omni/diffusion/models/flux2/pipeline_flux2.py +++ b/vllm_omni/diffusion/models/flux2/pipeline_flux2.py @@ -29,6 +29,8 @@ from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.flux2 import Flux2Transformer2DModel from vllm_omni.diffusion.models.interface import SupportImageInput +from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin +from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs from vllm_omni.model_executor.model_loader.weight_utils import download_weights_from_hf_specific @@ -331,7 +333,7 @@ def retrieve_latents(encoder_output: torch.Tensor, generator: torch.Generator = raise AttributeError("Could not access latents of provided encoder_output") -class Flux2Pipeline(nn.Module, SupportImageInput): +class Flux2Pipeline(nn.Module, SupportImageInput, ProgressBarMixin, DiffusionPipelineProfilerMixin): """Flux2 pipeline for text-to-image generation.""" _callback_tensor_inputs = ["latents", "prompt_embeds"] @@ -389,6 +391,10 @@ def __init__( self._guidance_scale = None self._attention_kwargs = None self._num_timesteps = None + + self.setup_diffusion_pipeline_profiler( + enable_diffusion_pipeline_profiler=self.od_config.enable_diffusion_pipeline_profiler + ) self._current_timestep = None self._interrupt = False @@ -1027,48 +1033,51 @@ def forward( # We set the index here to remove DtoH sync, helpful especially during compilation. # Check out more details here: https://github.com/huggingface/diffusers/pull/11696 self.scheduler.set_begin_index(0) - for i, t in enumerate(timesteps): - if self.interrupt: - continue - - self._current_timestep = t - timestep = t.expand(latents.shape[0]).to(latents.dtype) - - latent_model_input = latents.to(self.transformer.dtype) - latent_image_ids = latent_ids - - if image_latents is not None: - latent_model_input = torch.cat([latents, image_latents], dim=1).to(self.transformer.dtype) - latent_image_ids = torch.cat([latent_ids, image_latent_ids], dim=1) - - noise_pred = self.transformer( - hidden_states=latent_model_input, # (B, image_seq_len, C) - timestep=timestep / 1000, - guidance=guidance_tensor, - encoder_hidden_states=prompt_embeds, - txt_ids=text_ids, # B, text_seq_len, 4 - img_ids=latent_image_ids, # B, image_seq_len, 4 - joint_attention_kwargs=self.attention_kwargs, - return_dict=False, - )[0] - - noise_pred = noise_pred[:, : latents.size(1) :] - - # compute the previous noisy sample x_t -> x_t-1 - latents_dtype = latents.dtype - latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] - - if latents.dtype != latents_dtype and torch.backends.mps.is_available(): - latents = latents.to(latents_dtype) - - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + with self.progress_bar(total=len(timesteps)) as pbar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t + timestep = t.expand(latents.shape[0]).to(latents.dtype) + + latent_model_input = latents.to(self.transformer.dtype) + latent_image_ids = latent_ids + + if image_latents is not None: 
+ latent_model_input = torch.cat([latents, image_latents], dim=1).to(self.transformer.dtype) + latent_image_ids = torch.cat([latent_ids, image_latent_ids], dim=1) + + noise_pred = self.transformer( + hidden_states=latent_model_input, # (B, image_seq_len, C) + timestep=timestep / 1000, + guidance=guidance_tensor, + encoder_hidden_states=prompt_embeds, + txt_ids=text_ids, # B, text_seq_len, 4 + img_ids=latent_image_ids, # B, image_seq_len, 4 + joint_attention_kwargs=self.attention_kwargs, + return_dict=False, + )[0] + + noise_pred = noise_pred[:, : latents.size(1) :] + + # compute the previous noisy sample x_t -> x_t-1 + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + if latents.dtype != latents_dtype and torch.backends.mps.is_available(): + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + + pbar.update() self._current_timestep = None diff --git a/vllm_omni/diffusion/models/hunyuan_video/pipeline_hunyuan_video_1_5.py b/vllm_omni/diffusion/models/hunyuan_video/pipeline_hunyuan_video_1_5.py index 0b68676e8d..6445bfee21 100644 --- a/vllm_omni/diffusion/models/hunyuan_video/pipeline_hunyuan_video_1_5.py +++ b/vllm_omni/diffusion/models/hunyuan_video/pipeline_hunyuan_video_1_5.py @@ -24,7 +24,9 @@ from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.hunyuan_video.hunyuan_video_15_transformer import HunyuanVideo15Transformer3DModel +from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin from vllm_omni.diffusion.models.t5_encoder import T5EncoderModel +from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs from vllm_omni.platforms import current_omni_platform @@ -81,7 +83,7 @@ def post_process_func(video: torch.Tensor, output_type: str = "pil"): return post_process_func -class HunyuanVideo15Pipeline(nn.Module, CFGParallelMixin): +class HunyuanVideo15Pipeline(nn.Module, CFGParallelMixin, ProgressBarMixin, DiffusionPipelineProfilerMixin): def __init__( self, *, @@ -173,6 +175,10 @@ def __init__( self._num_timesteps = None self._current_timestep = None + self.setup_diffusion_pipeline_profiler( + enable_diffusion_pipeline_profiler=self.od_config.enable_diffusion_pipeline_profiler + ) + @property def guidance_scale(self): return self._guidance_scale @@ -445,60 +451,63 @@ def forward( timesteps = self.scheduler.timesteps self._num_timesteps = len(timesteps) - for i, t in enumerate(timesteps): - self._current_timestep = t - - latent_model_input = torch.cat([latents, cond_latents, mask], dim=1) - timestep = t.expand(latent_model_input.shape[0]).to(latent_model_input.dtype) - - timestep_r = None - if self.use_meanflow: - if i == len(timesteps) - 1: - timestep_r = torch.tensor([0.0], device=device) - else: - timestep_r = timesteps[i + 1] - timestep_r = timestep_r.expand(latents.shape[0]).to(latents.dtype) - - positive_kwargs = { - "hidden_states": latent_model_input, - "timestep": timestep, - 
"timestep_r": timestep_r, - "encoder_hidden_states": prompt_embeds, - "encoder_attention_mask": prompt_embeds_mask, - "encoder_hidden_states_2": prompt_embeds_2, - "encoder_attention_mask_2": prompt_embeds_mask_2, - "image_embeds": image_embeds, - "return_dict": False, - } - - negative_kwargs = None - if do_cfg and negative_prompt_embeds is not None: - negative_kwargs = { + with self.progress_bar(total=len(timesteps)) as pbar: + for i, t in enumerate(timesteps): + self._current_timestep = t + + latent_model_input = torch.cat([latents, cond_latents, mask], dim=1) + timestep = t.expand(latent_model_input.shape[0]).to(latent_model_input.dtype) + + timestep_r = None + if self.use_meanflow: + if i == len(timesteps) - 1: + timestep_r = torch.tensor([0.0], device=device) + else: + timestep_r = timesteps[i + 1] + timestep_r = timestep_r.expand(latents.shape[0]).to(latents.dtype) + + positive_kwargs = { "hidden_states": latent_model_input, "timestep": timestep, "timestep_r": timestep_r, - "encoder_hidden_states": negative_prompt_embeds, - "encoder_attention_mask": negative_prompt_embeds_mask, - "encoder_hidden_states_2": negative_prompt_embeds_2, - "encoder_attention_mask_2": negative_prompt_embeds_mask_2, + "encoder_hidden_states": prompt_embeds, + "encoder_attention_mask": prompt_embeds_mask, + "encoder_hidden_states_2": prompt_embeds_2, + "encoder_attention_mask_2": prompt_embeds_mask_2, "image_embeds": image_embeds, "return_dict": False, } - noise_pred = self.predict_noise_maybe_with_cfg( - do_true_cfg=do_cfg and negative_kwargs is not None, - true_cfg_scale=guidance_scale, - positive_kwargs=positive_kwargs, - negative_kwargs=negative_kwargs, - cfg_normalize=req.sampling_params.cfg_normalize, - ) + negative_kwargs = None + if do_cfg and negative_prompt_embeds is not None: + negative_kwargs = { + "hidden_states": latent_model_input, + "timestep": timestep, + "timestep_r": timestep_r, + "encoder_hidden_states": negative_prompt_embeds, + "encoder_attention_mask": negative_prompt_embeds_mask, + "encoder_hidden_states_2": negative_prompt_embeds_2, + "encoder_attention_mask_2": negative_prompt_embeds_mask_2, + "image_embeds": image_embeds, + "return_dict": False, + } + + noise_pred = self.predict_noise_maybe_with_cfg( + do_true_cfg=do_cfg and negative_kwargs is not None, + true_cfg_scale=guidance_scale, + positive_kwargs=positive_kwargs, + negative_kwargs=negative_kwargs, + cfg_normalize=req.sampling_params.cfg_normalize, + ) - latents = self.scheduler_step_maybe_with_cfg( - noise_pred, - t, - latents, - do_true_cfg=do_cfg and negative_kwargs is not None, - ) + latents = self.scheduler_step_maybe_with_cfg( + noise_pred, + t, + latents, + do_true_cfg=do_cfg and negative_kwargs is not None, + ) + + pbar.update() self._current_timestep = None diff --git a/vllm_omni/diffusion/models/hunyuan_video/pipeline_hunyuan_video_1_5_i2v.py b/vllm_omni/diffusion/models/hunyuan_video/pipeline_hunyuan_video_1_5_i2v.py index d68c43125c..c1acd1a895 100644 --- a/vllm_omni/diffusion/models/hunyuan_video/pipeline_hunyuan_video_1_5_i2v.py +++ b/vllm_omni/diffusion/models/hunyuan_video/pipeline_hunyuan_video_1_5_i2v.py @@ -38,7 +38,9 @@ retrieve_latents, ) from vllm_omni.diffusion.models.interface import SupportImageInput +from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin from vllm_omni.diffusion.models.t5_encoder import T5EncoderModel +from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest from 
vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs from vllm_omni.platforms import current_omni_platform @@ -98,7 +100,9 @@ def pre_process_func(req: OmniDiffusionRequest) -> OmniDiffusionRequest: return pre_process_func -class HunyuanVideo15I2VPipeline(nn.Module, CFGParallelMixin, SupportImageInput): +class HunyuanVideo15I2VPipeline( + nn.Module, CFGParallelMixin, SupportImageInput, ProgressBarMixin, DiffusionPipelineProfilerMixin +): support_image_input = True color_format = "RGB" @@ -199,6 +203,10 @@ def __init__( self._num_timesteps = None self._current_timestep = None + self.setup_diffusion_pipeline_profiler( + enable_diffusion_pipeline_profiler=self.od_config.enable_diffusion_pipeline_profiler + ) + @property def guidance_scale(self): return self._guidance_scale @@ -520,61 +528,64 @@ def forward( timesteps = self.scheduler.timesteps self._num_timesteps = len(timesteps) - for i, t in enumerate(timesteps): - self._current_timestep = t - - latent_model_input = torch.cat([latents, cond_latents, mask], dim=1) - timestep = t.expand(latent_model_input.shape[0]).to(latent_model_input.dtype) - - timestep_r = None - if self.use_meanflow: - if i == len(timesteps) - 1: - timestep_r = torch.tensor([0.0], device=device) - else: - timestep_r = timesteps[i + 1] - timestep_r = timestep_r.expand(latents.shape[0]).to(latents.dtype) - - positive_kwargs = { - "hidden_states": latent_model_input, - "timestep": timestep, - "timestep_r": timestep_r, - "encoder_hidden_states": prompt_embeds, - "encoder_attention_mask": prompt_embeds_mask, - "encoder_hidden_states_2": prompt_embeds_2, - "encoder_attention_mask_2": prompt_embeds_mask_2, - "image_embeds": image_embeds, - "return_dict": False, - } - - negative_kwargs = None - if do_cfg and negative_prompt_embeds is not None: - # For I2V CFG, negative still uses image embeds (only text is unconditional) - negative_kwargs = { + with self.progress_bar(total=len(timesteps)) as pbar: + for i, t in enumerate(timesteps): + self._current_timestep = t + + latent_model_input = torch.cat([latents, cond_latents, mask], dim=1) + timestep = t.expand(latent_model_input.shape[0]).to(latent_model_input.dtype) + + timestep_r = None + if self.use_meanflow: + if i == len(timesteps) - 1: + timestep_r = torch.tensor([0.0], device=device) + else: + timestep_r = timesteps[i + 1] + timestep_r = timestep_r.expand(latents.shape[0]).to(latents.dtype) + + positive_kwargs = { "hidden_states": latent_model_input, "timestep": timestep, "timestep_r": timestep_r, - "encoder_hidden_states": negative_prompt_embeds, - "encoder_attention_mask": negative_prompt_embeds_mask, - "encoder_hidden_states_2": negative_prompt_embeds_2, - "encoder_attention_mask_2": negative_prompt_embeds_mask_2, + "encoder_hidden_states": prompt_embeds, + "encoder_attention_mask": prompt_embeds_mask, + "encoder_hidden_states_2": prompt_embeds_2, + "encoder_attention_mask_2": prompt_embeds_mask_2, "image_embeds": image_embeds, "return_dict": False, } - noise_pred = self.predict_noise_maybe_with_cfg( - do_true_cfg=do_cfg and negative_kwargs is not None, - true_cfg_scale=guidance_scale, - positive_kwargs=positive_kwargs, - negative_kwargs=negative_kwargs, - cfg_normalize=req.sampling_params.cfg_normalize, - ) + negative_kwargs = None + if do_cfg and negative_prompt_embeds is not None: + # For I2V CFG, negative still uses image embeds (only text is unconditional) + negative_kwargs = { + "hidden_states": latent_model_input, + "timestep": timestep, + "timestep_r": timestep_r, + "encoder_hidden_states": 
negative_prompt_embeds, + "encoder_attention_mask": negative_prompt_embeds_mask, + "encoder_hidden_states_2": negative_prompt_embeds_2, + "encoder_attention_mask_2": negative_prompt_embeds_mask_2, + "image_embeds": image_embeds, + "return_dict": False, + } + + noise_pred = self.predict_noise_maybe_with_cfg( + do_true_cfg=do_cfg and negative_kwargs is not None, + true_cfg_scale=guidance_scale, + positive_kwargs=positive_kwargs, + negative_kwargs=negative_kwargs, + cfg_normalize=req.sampling_params.cfg_normalize, + ) - latents = self.scheduler_step_maybe_with_cfg( - noise_pred, - t, - latents, - do_true_cfg=do_cfg and negative_kwargs is not None, - ) + latents = self.scheduler_step_maybe_with_cfg( + noise_pred, + t, + latents, + do_true_cfg=do_cfg and negative_kwargs is not None, + ) + + pbar.update() self._current_timestep = None From dd9ca6feef7fe7176644ef950b744e93fe34aaf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zhengyuan=20Su=20=28=E8=8B=8F=E6=94=BF=E6=B8=8A=29?= Date: Sun, 5 Apr 2026 15:43:00 +0800 Subject: [PATCH 052/204] [Bugfix] Include uv.lock in .gitignore (#2493) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Zhengyuan Su (苏政渊) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 28d56e0f6f..7f101a784c 100644 --- a/.gitignore +++ b/.gitignore @@ -83,6 +83,9 @@ target/ profile_default/ ipython_config.py +# uv +uv.lock + # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: From 88f7ed9fb6a33c8ffd0211e891f8396543c615e9 Mon Sep 17 00:00:00 2001 From: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com> Date: Sun, 5 Apr 2026 18:46:11 +0800 Subject: [PATCH 053/204] [Bugfix] Assign original prompt back to RequestOutput (#2498) Signed-off-by: Yuanheng Zhao --- vllm_omni/engine/async_omni_engine.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 092b341e42..28c6d6caa1 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -716,6 +716,8 @@ def _build_add_request_message( # Register with stage 0's output processor. output_prompt_text = prompt_text + if output_prompt_text is None and isinstance(original_prompt, dict): + output_prompt_text = original_prompt.get("prompt") self.output_processors[0].add_request( request=request, prompt=output_prompt_text, From b2b2ab0c3c0e6999fa00c908a501f59bc33ec308 Mon Sep 17 00:00:00 2001 From: Hyoseop Song Date: Mon, 6 Apr 2026 01:48:31 +0900 Subject: [PATCH 054/204] [CI/Build] Add Dockerfile.cuda for NVIDIA GPU users [Skip-CI] (#1439) Signed-off-by: Hyoseop Song Signed-off-by: Hyoseop Song --- docker/Dockerfile.cuda | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 docker/Dockerfile.cuda diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda new file mode 100644 index 0000000000..754d491d86 --- /dev/null +++ b/docker/Dockerfile.cuda @@ -0,0 +1,22 @@ +ARG BASE_IMAGE=vllm/vllm-openai:v0.19.0 +FROM ${BASE_IMAGE} + +ARG COMMON_WORKDIR=/app + +WORKDIR ${COMMON_WORKDIR} + +# Step 1: Setup - Install system dependencies +RUN apt-get update && \ + apt-get install -y ffmpeg git sox libsox-fmt-all jq && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN mkdir -p ${COMMON_WORKDIR}/vllm-omni + +# Step 2: Copy vllm-omni code and install +COPY . 
${COMMON_WORKDIR}/vllm-omni +RUN cd ${COMMON_WORKDIR}/vllm-omni && uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir "." + +RUN ln -sf /usr/bin/python3 /usr/bin/python + +ENTRYPOINT [] From 025408f693fb3ef0f82456481f48ceda653c8909 Mon Sep 17 00:00:00 2001 From: Sy03 <1370724210@qq.com> Date: Mon, 6 Apr 2026 02:40:38 +0800 Subject: [PATCH 055/204] [Fix] [Qwen3-TTS] Qwen3-TTS streaming chunk-boundary artifacts (#2480) Signed-off-by: Sy03 <1370724210@qq.com> --- tests/dfx/perf/stage_configs/qwen3_tts.yaml | 2 +- .../qwen3_tts/test_qwen3_tts_code2wav.py | 65 +++++++++++++++++++ .../models/qwen3_tts/pipeline.yaml | 3 +- .../models/qwen3_tts/qwen3_tts_code2wav.py | 42 +++++++----- .../stage_configs/qwen3_tts.yaml | 4 +- .../stage_configs/qwen3_tts_batch.yaml | 4 +- .../npu/stage_configs/qwen3_tts.yaml | 2 +- 7 files changed, 100 insertions(+), 22 deletions(-) create mode 100644 tests/model_executor/models/qwen3_tts/test_qwen3_tts_code2wav.py diff --git a/tests/dfx/perf/stage_configs/qwen3_tts.yaml b/tests/dfx/perf/stage_configs/qwen3_tts.yaml index dd69b248d1..97b3090560 100644 --- a/tests/dfx/perf/stage_configs/qwen3_tts.yaml +++ b/tests/dfx/perf/stage_configs/qwen3_tts.yaml @@ -88,7 +88,7 @@ runtime: connector_get_max_wait_first_chunk: 3000 connector_get_max_wait: 300 codec_chunk_frames: 25 - codec_left_context_frames: 25 + codec_left_context_frames: 72 edges: - from: 0 diff --git a/tests/model_executor/models/qwen3_tts/test_qwen3_tts_code2wav.py b/tests/model_executor/models/qwen3_tts/test_qwen3_tts_code2wav.py new file mode 100644 index 0000000000..3f4e5b3ada --- /dev/null +++ b/tests/model_executor/models/qwen3_tts/test_qwen3_tts_code2wav.py @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from types import SimpleNamespace + +import pytest +import torch +import torch.nn as nn + +from vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_code2wav import Qwen3TTSCode2Wav + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +class _FakeDecoder(nn.Module): + def __init__(self, total_upsample: int = 4): + super().__init__() + self.total_upsample = total_upsample + + def chunked_decode(self, codes: torch.Tensor) -> torch.Tensor: + frames = codes.shape[-1] + wav_len = frames * self.total_upsample + 6 + wav = torch.arange(wav_len, dtype=torch.float32) + return wav.view(1, 1, -1) + + +def _make_model() -> Qwen3TTSCode2Wav: + model = Qwen3TTSCode2Wav( + vllm_config=SimpleNamespace( + model_config=SimpleNamespace(model="unused"), + device_config=SimpleNamespace(device=torch.device("cpu")), + ) + ) + model._decoder = _FakeDecoder() + model._num_quantizers = 2 + model._output_sample_rate = 24000 + model._total_upsample = 4 + model._ensure_speech_tokenizer_loaded = lambda: None + return model + + +def test_forward_trims_context_on_exact_frame_boundaries(): + model = _make_model() + + out = model.forward( + input_ids=torch.arange(12, dtype=torch.long), + runtime_additional_information=[{"left_context_size": 2}], + ) + + audio = out.multimodal_outputs["model_outputs"][0] + expected = torch.arange(8, 24, dtype=torch.float32) + torch.testing.assert_close(audio, expected) + + +def test_forward_trims_trailing_padding_without_context(): + model = _make_model() + + out = model.forward( + input_ids=torch.arange(12, dtype=torch.long), + runtime_additional_information=[{"left_context_size": 0}], + ) + + audio = out.multimodal_outputs["model_outputs"][0] + expected = torch.arange(24, 
dtype=torch.float32) + torch.testing.assert_close(audio, expected) diff --git a/vllm_omni/model_executor/models/qwen3_tts/pipeline.yaml b/vllm_omni/model_executor/models/qwen3_tts/pipeline.yaml index 6e3c78ff93..fd8ea3a3f4 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/pipeline.yaml +++ b/vllm_omni/model_executor/models/qwen3_tts/pipeline.yaml @@ -84,7 +84,8 @@ connectors: connector_get_max_wait_first_chunk: 3000 connector_get_max_wait: 300 codec_chunk_frames: 25 - codec_left_context_frames: 25 + # Match the decoder sliding attention window to avoid chunk-boundary noise. + codec_left_context_frames: 72 edges: - from: 0 diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py index f6ac91a994..79f0f4a8de 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code2wav.py @@ -41,6 +41,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._num_quantizers: int | None = None self._output_sample_rate: int | None = None self._total_upsample: int | None = None + self._decoder_sliding_window: int | None = None self._logged_codec_stats = False @staticmethod @@ -106,6 +107,7 @@ def _ensure_speech_tokenizer_loaded(self) -> None: self._num_quantizers = num_q self._output_sample_rate = out_sr self._total_upsample = int(decoder.total_upsample) + self._decoder_sliding_window = int(getattr(dec_cfg, "sliding_window", 0) or 0) # Precompute SnakeBeta exp caches (benefits both Triton and eager paths) if hasattr(decoder, "precompute_snake_caches"): @@ -128,6 +130,20 @@ def _ensure_speech_tokenizer_loaded(self) -> None: if isinstance(extra_cfg, dict): chunk_frames = int(extra_cfg.get("codec_chunk_frames") or 0) left_frames = int(extra_cfg.get("codec_left_context_frames") or 0) + if ( + chunk_frames > 0 + and left_frames > 0 + and self._decoder_sliding_window + and left_frames < self._decoder_sliding_window + ): + logger.warning( + "Qwen3-TTS streaming codec_left_context_frames=%d is smaller than " + "decoder sliding_window=%d; chunk-boundary distortion may occur. " + "Increase codec_left_context_frames to at least %d for streaming.", + left_frames, + self._decoder_sliding_window, + self._decoder_sliding_window, + ) decoder.enable_cudagraph( device=device, @@ -289,21 +305,17 @@ def forward( for j, idx in enumerate(valid_indices): ctx_frames, actual_frames = parsed[idx] wav = wav_tensors[j] - # Drop the ref_code prefix from the decoded waveform, keeping only newly generated audio. - if ctx_frames <= 0: - expected_len = actual_frames * upsample - if wav.shape[0] > expected_len: - wav = wav[:expected_len] - else: - cut = int(ctx_frames / max(actual_frames, 1) * wav.shape[0]) - if cut >= wav.shape[0]: - logger.warning( - "Context trim %d >= decoded length %d; returning empty audio.", - cut, - wav.shape[0], - ) - continue - wav = wav[cut:] + # Slice on exact codec-frame boundaries instead of proportionally. 
+            start = max(0, ctx_frames * upsample)
+            end = max(start, actual_frames * upsample)
+            if start >= wav.shape[0]:
+                logger.warning(
+                    "Context trim start %d >= decoded length %d; returning empty audio.",
+                    start,
+                    wav.shape[0],
+                )
+                continue
+            wav = wav[start : min(end, wav.shape[0])]
 
             if wav.shape[0] > 0:
                 audios[idx] = wav.to(dtype=torch.float32).reshape(-1)
diff --git a/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml b/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml
index 2c5f0a5474..a0d38eb4b9 100644
--- a/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml
+++ b/vllm_omni/model_executor/stage_configs/qwen3_tts.yaml
@@ -89,9 +89,9 @@ runtime:
       connector_get_sleep_s: 0.01
       connector_get_max_wait_first_chunk: 3000
       connector_get_max_wait: 300
-      # Align with Omni: small chunks with sufficient context overlap.
+      # Match the decoder sliding attention window to avoid chunk-boundary noise.
       codec_chunk_frames: 25
-      codec_left_context_frames: 25
+      codec_left_context_frames: 72
 
 edges:
   - from: 0
diff --git a/vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml b/vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml
index a3509bb330..75b2bab3a2 100644
--- a/vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml
+++ b/vllm_omni/model_executor/stage_configs/qwen3_tts_batch.yaml
@@ -90,9 +90,9 @@ runtime:
       connector_get_sleep_s: 0.01
       connector_get_max_wait_first_chunk: 3000
       connector_get_max_wait: 300
-      # Align with Omni: small chunks with sufficient context overlap.
+      # Match the decoder sliding attention window to avoid chunk-boundary noise.
       codec_chunk_frames: 25
-      codec_left_context_frames: 25
+      codec_left_context_frames: 72
 
 edges:
   - from: 0
diff --git a/vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml b/vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml
index a741f819a2..cd82d91b71 100644
--- a/vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml
+++ b/vllm_omni/platforms/npu/stage_configs/qwen3_tts.yaml
@@ -88,7 +88,7 @@ runtime:
       connector_get_max_wait: 300
       # Align with Omni: small chunks with sufficient context overlap.
       codec_chunk_frames: 25
-      codec_left_context_frames: 25
+      codec_left_context_frames: 72
 
 edges:
   - from: 0

From f6cfacdd160b73537019221d0b32e4d5831ac592 Mon Sep 17 00:00:00 2001
From: Sy03 <1370724210@qq.com>
Date: Mon, 6 Apr 2026 03:04:37 +0800
Subject: [PATCH 056/204] [Perf][Qwen3-TTS] Free unused decoder in Talker SpeechTokenizer to save VRAM (#2429)

Signed-off-by: Sy03 <1370724210@qq.com>
Co-authored-by: Yueqian Lin <70319226+linyueqian@users.noreply.github.com>
---
 .../models/qwen3_tts/qwen3_tts_talker.py      | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py
index bc6222bbe2..9f8aff6aff 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py
@@ -1124,14 +1124,19 @@ def _ensure_speech_tokenizer_loaded(self) -> Qwen3TTSTokenizer:
             speech_tokenizer_dir,
             torch_dtype=torch.bfloat16,
         )
-        # Prefer GPU for encoder if available; otherwise keep CPU.
+        # Only move encoder to GPU; the decoder is unused by Talker (which
+        # only calls tok.encode()) and would otherwise waste bf16 VRAM.
+        # NOTE: after this point the tokenizer instance is encode-only;
+        # calling tok.decode() will fail because tok.model.decoder is None. 
dev = next(self.parameters()).device if dev.type != "cpu": try: - tok.model.to(dev) + del tok.model.decoder + tok.model.decoder = None + tok.model.encoder.to(dev) tok.device = dev except Exception as e: - raise RuntimeError(f"Failed to move speech tokenizer to {dev}: {e}") from e + raise RuntimeError(f"Failed to move speech tokenizer encoder to {dev}: {e}") from e else: tok.device = dev self._speech_tokenizer = tok From 8b57c6205e8db6703e83402ace641ce9673d2ebf Mon Sep 17 00:00:00 2001 From: Sy03 <1370724210@qq.com> Date: Mon, 6 Apr 2026 04:49:18 +0800 Subject: [PATCH 057/204] [Perf][Fish Speech] Free unused DAC codec components to save VRAM (#2430) Signed-off-by: Sy03 <1370724210@qq.com> --- vllm_omni/model_executor/models/fish_speech/dac_encoder.py | 3 +++ .../models/fish_speech/fish_speech_dac_decoder.py | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/vllm_omni/model_executor/models/fish_speech/dac_encoder.py b/vllm_omni/model_executor/models/fish_speech/dac_encoder.py index 397530ca34..cdf0da992f 100644 --- a/vllm_omni/model_executor/models/fish_speech/dac_encoder.py +++ b/vllm_omni/model_executor/models/fish_speech/dac_encoder.py @@ -54,6 +54,9 @@ def _load_dac_codec( if "generator" in state_dict: state_dict = state_dict["generator"] codec.load_state_dict(state_dict, strict=False) + # Encoder path only uses encoder + quantizer.forward(); prune the + # decoder before moving to device to avoid unnecessary GPU allocation. + codec.decoder = None codec = codec.to(device=device, dtype=dtype) codec.eval() diff --git a/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py b/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py index e121b03371..ed42aa98c0 100644 --- a/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py +++ b/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py @@ -141,6 +141,13 @@ def _ensure_codec_loaded(self) -> None: self._bake_weight_norm(codec) self._cache_attention_masks(codec) + # Decode path only uses quantizer.decode() + decoder; prune + # encode-only components before moving to device to avoid + # unnecessary GPU allocation. + codec.encoder = None + codec.quantizer.pre_module = None + codec.quantizer.downsample = None + device = self.vllm_config.device_config.device codec = codec.to(device=device, dtype=torch.float32) codec.eval() From e23b2634d17a339a3c83002ec1aa39b1f5fcb72e Mon Sep 17 00:00:00 2001 From: "Will.hou" <1205157517@qq.com> Date: Mon, 6 Apr 2026 04:54:15 +0800 Subject: [PATCH 058/204] fix(qwen3_tts): align code predictor buffer dtype with model parameters (#2470) Signed-off-by: willamhou Co-authored-by: willamhou Co-authored-by: Claude Co-authored-by: Happy --- .../qwen3_tts/test_code_predictor_dtype.py | 258 ++++++++++++++++++ .../qwen3_tts_code_predictor_vllm.py | 18 +- 2 files changed, 272 insertions(+), 4 deletions(-) create mode 100644 tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py diff --git a/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py b/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py new file mode 100644 index 0000000000..e2970dcb2d --- /dev/null +++ b/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py @@ -0,0 +1,258 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Tests for code predictor dtype alignment (fix for #2385). 
+ +Verifies that the code predictor handles dtype mismatches between input +tensors and model parameters without raising RuntimeError. This can happen +when model weights are loaded in float16/bfloat16 but upstream modules +produce float32 hidden states. +""" + +from __future__ import annotations + +import importlib.util +import os +import sys +import types +from unittest.mock import MagicMock, patch + +import torch + +# Direct file import to avoid vllm_omni.__init__ patch dependencies. +_BASE = os.path.join( + os.path.dirname(__file__), + os.pardir, + os.pardir, + os.pardir, + os.pardir, + "vllm_omni", + "model_executor", + "models", + "qwen3_tts", +) + + +def _load_module(name: str, filename: str): + path = os.path.abspath(os.path.join(_BASE, filename)) + spec = importlib.util.spec_from_file_location(name, path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +def _build_mock_modules() -> dict[str, object]: + """Build the dict of modules to inject into sys.modules.""" + platforms_mock = MagicMock() + platforms_mock.current_omni_platform.supports_torch_inductor.return_value = False + + logger_mock = MagicMock() + logger_mock.init_logger = lambda name: MagicMock() + + vllm_config_mod = MagicMock() + vllm_config_mod.set_current_vllm_config = lambda cfg: MagicMock(__enter__=MagicMock(), __exit__=MagicMock()) + + weight_utils_mock = MagicMock() + weight_utils_mock.default_weight_loader = lambda p, w: None + + pkg = types.ModuleType("vllm_omni.model_executor.models.qwen3_tts") + pkg.__path__ = [os.path.abspath(_BASE)] + + return { + "vllm_omni": MagicMock(), + "vllm_omni.platforms": platforms_mock, + "vllm.logger": logger_mock, + "vllm.config": MagicMock(), + "vllm.config.vllm": vllm_config_mod, + "vllm.model_executor.model_loader.weight_utils": weight_utils_mock, + "vllm_omni.model_executor": types.ModuleType("vllm_omni.model_executor"), + "vllm_omni.model_executor.models": types.ModuleType("vllm_omni.model_executor.models"), + "vllm_omni.model_executor.models.qwen3_tts": pkg, + } + + +def _load_target_classes(): + """Load config and code predictor modules with mocked dependencies. + + Uses patch.dict to ensure sys.modules is always restored, even on failure. 
+ """ + mocks = _build_mock_modules() + with patch.dict(sys.modules, mocks): + config_mod = _load_module( + "vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts", + "configuration_qwen3_tts.py", + ) + sys.modules["vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts"] = config_mod + + cp_mod = _load_module( + "vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_code_predictor_vllm", + "qwen3_tts_code_predictor_vllm.py", + ) + + return config_mod, cp_mod + + +_config_mod, _cp_mod = _load_target_classes() + +Qwen3TTSTalkerCodePredictorConfig = _config_mod.Qwen3TTSTalkerCodePredictorConfig +Qwen3TTSTalkerConfig = _config_mod.Qwen3TTSTalkerConfig +CodePredictorWrapper = _cp_mod.Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM +CodePredictorModel = _cp_mod.Qwen3TTSTalkerCodePredictorModelVLLM + + +def _make_tiny_config() -> tuple: + """Create minimal configs for a tiny code predictor model.""" + cp_config = Qwen3TTSTalkerCodePredictorConfig( + vocab_size=64, + hidden_size=32, + intermediate_size=64, + num_hidden_layers=1, + num_attention_heads=4, + num_key_value_heads=2, + head_dim=8, + num_code_groups=4, + rms_norm_eps=1e-6, + ) + talker_config = Qwen3TTSTalkerConfig( + hidden_size=32, + num_code_groups=4, + ) + return cp_config, talker_config + + +def _make_vllm_config(max_num_seqs: int = 4) -> MagicMock: + """Create a mock VllmConfig with scheduler_config.""" + vllm_config = MagicMock() + vllm_config.scheduler_config.max_num_seqs = max_num_seqs + return vllm_config + + +class TestCodePredictorDtypeAlignment: + """Test that code predictor buffers match model parameter dtype.""" + + def test_ensure_buffers_uses_given_dtype(self) -> None: + """_ensure_buffers should create proj_buf with the given dtype.""" + cp_config, talker_config = _make_tiny_config() + vllm_config = _make_vllm_config() + + predictor = CodePredictorWrapper( + vllm_config=vllm_config, + config=cp_config, + talker_config=talker_config, + ) + + # Create buffer in float16 + predictor._ensure_buffers(torch.device("cpu"), torch.float16) + assert predictor._proj_buf is not None + assert predictor._proj_buf.dtype == torch.float16 + + # Re-create buffer in float32 (different dtype triggers re-allocation) + predictor._ensure_buffers(torch.device("cpu"), torch.float32) + assert predictor._proj_buf.dtype == torch.float32 + + def test_warmup_aligns_buffer_to_model_params(self) -> None: + """_warmup_buckets should align proj_buf dtype to model parameters.""" + cp_config, talker_config = _make_tiny_config() + vllm_config = _make_vllm_config(max_num_seqs=2) + + predictor = CodePredictorWrapper( + vllm_config=vllm_config, + config=cp_config, + talker_config=talker_config, + ) + + # Cast model to float16 (simulating vLLM loading weights in half precision) + predictor = predictor.to(torch.float16) + + # Pre-create proj_buf with WRONG dtype (float32) — simulating the bug + predictor._ensure_buffers(torch.device("cpu"), torch.float32) + assert predictor._proj_buf.dtype == torch.float32 + + # Simulate _setup_compile having cached model dtype and compiled forward + predictor._model_dtype = torch.float16 + predictor._compiled_model_fwd = predictor.model.forward + + # _warmup_buckets should fix the dtype mismatch + predictor._warmup_buckets() + + assert predictor._proj_buf.dtype == torch.float16 + + def test_setup_compile_caches_model_dtype(self) -> None: + """_setup_compile should cache model parameter dtype.""" + cp_config, talker_config = _make_tiny_config() + vllm_config = _make_vllm_config(max_num_seqs=2) + + 
predictor = CodePredictorWrapper( + vllm_config=vllm_config, + config=cp_config, + talker_config=talker_config, + ) + predictor = predictor.to(torch.float16) + + assert predictor._model_dtype is None + predictor._setup_compile() + assert predictor._model_dtype == torch.float16 + + def test_forward_with_mismatched_input_dtype(self) -> None: + """forward() should not crash when inputs are float32 but model is float16.""" + cp_config, talker_config = _make_tiny_config() + vllm_config = _make_vllm_config(max_num_seqs=2) + + predictor = CodePredictorWrapper( + vllm_config=vllm_config, + config=cp_config, + talker_config=talker_config, + ) + + # Model in float16 + predictor = predictor.to(torch.float16) + + bsz = 1 + num_groups = cp_config.num_code_groups + hidden = talker_config.hidden_size + + # Inputs in float32 (simulating the dtype mismatch from #2385) + layer0_code = torch.zeros(bsz, dtype=torch.long) + layer0_embed = torch.randn(bsz, hidden, dtype=torch.float32) + last_talker_hidden = torch.randn(bsz, hidden, dtype=torch.float32) + + # This should NOT raise RuntimeError about dtype mismatch + result = predictor( + layer0_code=layer0_code, + layer0_embed=layer0_embed, + last_talker_hidden=last_talker_hidden, + do_sample=False, + ) + + assert result.shape == (bsz, num_groups) + assert result.dtype == torch.long + + +class TestCodePredictorModelDtype: + """Test the inner model forward with different dtypes.""" + + def test_model_forward_float16(self) -> None: + """Inner model forward should work in float16.""" + cp_config, _ = _make_tiny_config() + model = CodePredictorModel(cp_config, talker_hidden_size=32).to(torch.float16) + + bsz, seq_len = 1, 4 + inputs = torch.randn(bsz, seq_len, 32, dtype=torch.float16) + pos_ids = torch.arange(seq_len).unsqueeze(0).expand(bsz, -1) + + output = model(inputs, pos_ids) + assert output.dtype == torch.float16 + assert output.shape == (bsz, seq_len, 32) + + def test_model_forward_float32(self) -> None: + """Inner model forward should work in float32.""" + cp_config, _ = _make_tiny_config() + model = CodePredictorModel(cp_config, talker_hidden_size=32).to(torch.float32) + + bsz, seq_len = 1, 4 + inputs = torch.randn(bsz, seq_len, 32, dtype=torch.float32) + pos_ids = torch.arange(seq_len).unsqueeze(0).expand(bsz, -1) + + output = model(inputs, pos_ids) + assert output.dtype == torch.float32 + assert output.shape == (bsz, seq_len, 32) diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py index 11c0369e82..1e84eaebaa 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py @@ -348,6 +348,7 @@ def __init__( # Pre-allocated buffers (lazily initialized on first forward). self._proj_buf: torch.Tensor | None = None + self._model_dtype: torch.dtype | None = None # torch.compile + warmup state (lazily initialized in _setup_compile). self._compiled_model_fwd = None @@ -404,6 +405,10 @@ def _setup_compile(self) -> None: """Lazily set up torch.compile with manual CUDA graph capture.""" if self._compiled_model_fwd is not None: return + # Cache model parameter dtype so forward() doesn't need to query it + # on every call. Also ensures warmup buffers match model precision + # even when upstream modules produce a different dtype (#2385). 
+ self._model_dtype = next(self.model.parameters()).dtype self._lm_heads_list = list(self.lm_head) self._codec_embeds_list = list(self.model.codec_embedding) if not current_omni_platform.supports_torch_inductor(): @@ -443,6 +448,9 @@ def _warmup_buckets(self) -> None: max_seq = self._num_groups + 1 device = next(self.model.parameters()).device + # Ensure proj_buf matches model parameter dtype to avoid dtype + # mismatch during warmup compilation (see #2385). + self._ensure_buffers(device, self._model_dtype) proj_buf = self._proj_buf for bsz in self._bucket_sizes: # position_ids: [batch, seq_len] for HF-style RoPE @@ -499,13 +507,15 @@ def forward( bsz = int(layer0_code.shape[0]) num_groups = self._num_groups device = layer0_code.device - dtype = layer0_embed.dtype all_codes = torch.empty(bsz, num_groups, dtype=torch.long, device=device) all_codes[:, 0] = layer0_code.reshape(bsz) - self._ensure_buffers(device, dtype) + # _setup_compile caches _model_dtype on first call; use it for buffers + # so they always match model weight precision (#2385). self._setup_compile() + dtype = self._model_dtype + self._ensure_buffers(device, dtype) proj_buf = self._proj_buf max_seq = self._num_groups + 1 @@ -525,8 +535,8 @@ def forward( padded_bsz = self._padded_bsz(bsz) proj_buf[:padded_bsz].zero_() - proj_buf[:bsz, 0, :] = projection(last_talker_hidden.reshape(bsz, 1, -1)).reshape(bsz, -1) - proj_buf[:bsz, 1, :] = projection(layer0_embed.reshape(bsz, 1, -1)).reshape(bsz, -1) + proj_buf[:bsz, 0, :] = projection(last_talker_hidden.reshape(bsz, 1, -1).to(dtype)).reshape(bsz, -1) + proj_buf[:bsz, 1, :] = projection(layer0_embed.reshape(bsz, 1, -1).to(dtype)).reshape(bsz, -1) full_pos_ids = self._bucket_pos_ids.get(padded_bsz) if full_pos_ids is None: full_pos_ids = torch.arange(max_seq, device=device, dtype=torch.long).unsqueeze(0).expand(padded_bsz, -1) From 328de586aad4f75b6b529a9f620dff2700bb4e87 Mon Sep 17 00:00:00 2001 From: Lancer Date: Mon, 6 Apr 2026 11:49:35 +0800 Subject: [PATCH 059/204] [Feat] support for multi-block layerwise offloading, fix top-level parameters/buffers staying on CPU (#1486) Signed-off-by: Lancer Signed-off-by: Lancer <402430575@qq.com> Co-authored-by: Didan Deng <33117903+wtomin@users.noreply.github.com> --- .../model/adding_diffusion_model.md | 2 +- .../diffusion/cpu_offload_diffusion.md | 11 +- .../offloader/test_layerwise_backend.py | 117 +++++++++++++++++- .../online_serving/test_flux2_expansion.py | 45 +++++++ .../online_serving/test_zimage_expansion.py | 34 ++++- .../diffusion/models/flux/flux_transformer.py | 1 + .../flux2_klein/flux2_klein_transformer.py | 1 + .../models/helios/helios_transformer.py | 2 +- .../hunyuan_video_15_transformer.py | 2 +- .../qwen_image/qwen_image_transformer.py | 2 +- .../models/wan2_2/wan2_2_transformer.py | 2 +- .../models/z_image/z_image_transformer.py | 1 + .../diffusion/offloader/layerwise_backend.py | 106 ++++++++++++---- 13 files changed, 288 insertions(+), 38 deletions(-) diff --git a/docs/contributing/model/adding_diffusion_model.md b/docs/contributing/model/adding_diffusion_model.md index dfa550173c..8d85eb4f6e 100644 --- a/docs/contributing/model/adding_diffusion_model.md +++ b/docs/contributing/model/adding_diffusion_model.md @@ -802,7 +802,7 @@ omni = Omni(model="your-model", enable_layerwise_offload=True) ```python class WanTransformer3DModel(nn.Module): - _layerwise_offload_blocks_attr = "blocks" # Attribute name containing transformer blocks + _layerwise_offload_blocks_attrs = ["blocks"] # Attribute name containing transformer 
blocks def __init__(self): self.blocks = nn.ModuleList([...]) # Transformer blocks diff --git a/docs/user_guide/diffusion/cpu_offload_diffusion.md b/docs/user_guide/diffusion/cpu_offload_diffusion.md index 8786ae9649..be72efffa5 100644 --- a/docs/user_guide/diffusion/cpu_offload_diffusion.md +++ b/docs/user_guide/diffusion/cpu_offload_diffusion.md @@ -91,12 +91,19 @@ Models must define the blocks attribute name for layerwise offloading: ```python class WanTransformer3DModel(nn.Module): - _layerwise_offload_blocks_attr = "blocks" # Attribute name containing transformer blocks + _layerwise_offload_blocks_attrs = ["blocks"] # Attribute names containing transformer blocks def __init__(self): self.blocks = nn.ModuleList([...]) # Transformer blocks ``` +For models with multiple block types: + +```python +class Flux2Transformer2DModel(nn.Module): + _layerwise_offload_blocks_attrs = ["transformer_blocks", "single_transformer_blocks"] +``` + ### Limitations - Cold start latency increases because of 1) components are loaded to CPU first at the very first during initialization, @@ -140,4 +147,4 @@ Factory function `get_offload_backend()` selects the appropriate backend based o **Notes:** - Model-Level Offloading is expected to be supported by all common diffusion models (DiT and encoders) naturally -- Layerwise Offloading requires DiT class to define `_layerwise_offload_blocks_attr` pointing to transformer blocks +- Layerwise Offloading requires DiT class to define `_layerwise_offload_blocks_attrs` pointing to transformer blocks diff --git a/tests/diffusion/offloader/test_layerwise_backend.py b/tests/diffusion/offloader/test_layerwise_backend.py index 7df3c1bb1a..5fd80e75c2 100644 --- a/tests/diffusion/offloader/test_layerwise_backend.py +++ b/tests/diffusion/offloader/test_layerwise_backend.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Unit tests for LayerwiseOffloadHook.""" +"""Unit tests for LayerwiseOffloadHook and LayerWiseOffloadBackend utilities.""" import gc import os @@ -15,7 +15,7 @@ from torch.distributed.tensor import DeviceMesh, DTensor, Replicate import vllm_omni.diffusion.offloader.layerwise_backend as layerwise_backend_module -from vllm_omni.diffusion.offloader.layerwise_backend import LayerwiseOffloadHook +from vllm_omni.diffusion.offloader.layerwise_backend import LayerWiseOffloadBackend, LayerwiseOffloadHook from vllm_omni.platforms import current_omni_platform pytestmark = [pytest.mark.diffusion, pytest.mark.cpu, pytest.mark.core_model] @@ -127,3 +127,116 @@ def test_dtensor_wrapper_is_preserved_across_prefetch_and_offload(self, dist_gro assert current_block.weight.to_local().is_meta assert current_block.weight.to_local().shape == torch.Size([4]) assert not hook.is_materialized + + +class _DummyBlock(nn.Module): + def __init__(self): + super().__init__() + self.weight = nn.Parameter(torch.randn(10, 10)) + + +class _SingleBlockModel(nn.Module): + _layerwise_offload_blocks_attrs = ["blocks"] + + def __init__(self, num_blocks: int = 3): + super().__init__() + self.blocks = nn.ModuleList([_DummyBlock() for _ in range(num_blocks)]) + + +class _MultiBlockModel(nn.Module): + _layerwise_offload_blocks_attrs = ["transformer_blocks", "single_transformer_blocks"] + + def __init__(self, num_transformer: int = 2, num_single: int = 2): + super().__init__() + self.transformer_blocks = nn.ModuleList([_DummyBlock() for _ in range(num_transformer)]) + self.single_transformer_blocks = nn.ModuleList([_DummyBlock() for 
_ in range(num_single)]) + + +class _EmptyBlocksModel(nn.Module): + _layerwise_offload_blocks_attrs = ["blocks"] + + def __init__(self): + super().__init__() + self.blocks = nn.ModuleList([]) + + +class _InvalidAttrModel(nn.Module): + _layerwise_offload_blocks_attrs = ["nonexistent_blocks", "blocks"] + + def __init__(self, num_blocks: int = 2): + super().__init__() + self.blocks = nn.ModuleList([_DummyBlock() for _ in range(num_blocks)]) + + +class _DeprecatedSingleAttrModel(nn.Module): + _layerwise_offload_blocks_attr = "blocks" + + def __init__(self, num_blocks: int = 2): + super().__init__() + self.blocks = nn.ModuleList([_DummyBlock() for _ in range(num_blocks)]) + + +class _NoAttrsModel(nn.Module): + def __init__(self, num_blocks: int = 2): + super().__init__() + self.blocks = nn.ModuleList([_DummyBlock() for _ in range(num_blocks)]) + + +class TestGetBlocksFromDit: + def test_get_blocks_from_dit_single_block_attr(self): + model = _SingleBlockModel(num_blocks=3) + attr_names, blocks = LayerWiseOffloadBackend.get_blocks_from_dit(model) + assert attr_names == ["blocks"] + assert len(blocks) == 3 + assert all(isinstance(b, _DummyBlock) for b in blocks) + + def test_get_blocks_from_dit_multi_block_attrs(self): + model = _MultiBlockModel(num_transformer=2, num_single=3) + attr_names, blocks = LayerWiseOffloadBackend.get_blocks_from_dit(model) + assert set(attr_names) == {"transformer_blocks", "single_transformer_blocks"} + assert len(blocks) == 5 + assert all(isinstance(b, _DummyBlock) for b in blocks) + + def test_get_blocks_from_dit_empty_blocks(self): + model = _EmptyBlocksModel() + attr_names, blocks = LayerWiseOffloadBackend.get_blocks_from_dit(model) + assert attr_names == [] + assert blocks == [] + + def test_get_blocks_from_dit_invalid_attr_name(self): + model = _InvalidAttrModel(num_blocks=2) + with pytest.raises( + AttributeError, + match="Attribute 'nonexistent_blocks' declared in _layerwise_offload_blocks_attrs does not exist", + ): + LayerWiseOffloadBackend.get_blocks_from_dit(model) + + def test_get_blocks_from_dit_no_attrs_defined(self): + model = _NoAttrsModel(num_blocks=3) + attr_names, blocks = LayerWiseOffloadBackend.get_blocks_from_dit(model) + assert attr_names == [] + assert blocks == [] + + def test_get_blocks_from_dit_deprecated_single_attr(self): + model = _DeprecatedSingleAttrModel(num_blocks=2) + attr_names, blocks = LayerWiseOffloadBackend.get_blocks_from_dit(model) + assert attr_names == ["blocks"] + assert len(blocks) == 2 + + +class TestGetBlocksAttrNames: + def test_get_blocks_attr_names_new_format(self): + model = _MultiBlockModel() + attrs = LayerWiseOffloadBackend.get_blocks_attr_names(model) + assert attrs == ["transformer_blocks", "single_transformer_blocks"] + + def test_get_blocks_attr_names_no_attrs(self): + model = _NoAttrsModel() + attrs = LayerWiseOffloadBackend.get_blocks_attr_names(model) + assert attrs == [] + + def test_set_blocks_attr_names(self): + model = _NoAttrsModel() + LayerWiseOffloadBackend.set_blocks_attr_names(model, ["new_blocks"]) + assert hasattr(model.__class__, "_layerwise_offload_blocks_attrs") + assert model.__class__._layerwise_offload_blocks_attrs == ["new_blocks"] diff --git a/tests/e2e/online_serving/test_flux2_expansion.py b/tests/e2e/online_serving/test_flux2_expansion.py index 0e9e8c89a6..336bd83a1d 100644 --- a/tests/e2e/online_serving/test_flux2_expansion.py +++ b/tests/e2e/online_serving/test_flux2_expansion.py @@ -1,6 +1,12 @@ """ Tests for Flux2 Klein; currently Dev is implemented separately, but ideally these 
models will fold together in the future. + +Coverage: +- FP8 + CacheDiT + Ulysses=2 + TP=2 +- Layerwise CPU offload + Ulysses=2 + Ring=2 +- Layerwise CPU offload + TP=2 +- Layerwise CPU offload + HSDP """ import pytest @@ -42,6 +48,45 @@ def _get_diffusion_feature_cases(model: str): ), marks=FOUR_CARD_FEATURE_MARKS, ), + pytest.param( + OmniServerParams( + model=model, + server_args=[ + "--enable-layerwise-offload", + "--ulysses-degree", + "2", + "--ring", + "2", + ], + ), + id="layerwise_ulysses2_ring2", + marks=FOUR_CARD_FEATURE_MARKS, + ), + pytest.param( + OmniServerParams( + model=model, + server_args=[ + "--enable-layerwise-offload", + "--tensor-parallel-size", + "2", + ], + ), + id="layerwise_tp2", + marks=FOUR_CARD_FEATURE_MARKS, + ), + pytest.param( + OmniServerParams( + model=model, + server_args=[ + "--enable-layerwise-offload", + "--use-hsdp", + "--hsdp-shard-size", + "2", + ], + ), + id="layerwise_hsdp", + marks=FOUR_CARD_FEATURE_MARKS, + ), ] diff --git a/tests/e2e/online_serving/test_zimage_expansion.py b/tests/e2e/online_serving/test_zimage_expansion.py index bef12e55d1..9f90ec855b 100644 --- a/tests/e2e/online_serving/test_zimage_expansion.py +++ b/tests/e2e/online_serving/test_zimage_expansion.py @@ -3,9 +3,12 @@ for Z-Image. Coverage is intentionally limited to the minimal 4xL4 cases that -exercise Z-Image's supported parallel feature combinations: +exercise Z-Image's supported feature combinations: - CacheDiT + FP8 + Ring=2 + TP=2 - TeaCache + FP8 + Ulysses=2 + Ring=2 +- Layerwise CPU offload + Ulysses=2 + Ring=2 +- Layerwise CPU offload + TP=2 +- Layerwise CPU offload + HSDP """ import pytest @@ -64,12 +67,39 @@ def _get_diffusion_feature_cases(): OmniServerParams( model=MODEL, server_args=[ + "--enable-layerwise-offload", + "--ulysses-degree", + "2", + "--ring", + "2", + ], + ), + id="layerwise_ulysses2_ring2", + marks=FOUR_CARD_MARKS, + ), + pytest.param( + OmniServerParams( + model=MODEL, + server_args=[ + "--enable-layerwise-offload", + "--tensor-parallel-size", + "2", + ], + ), + id="layerwise_tp2", + marks=FOUR_CARD_MARKS, + ), + pytest.param( + OmniServerParams( + model=MODEL, + server_args=[ + "--enable-layerwise-offload", "--use-hsdp", "--hsdp-shard-size", "2", ], ), - id="parallel_hsdp", + id="layerwise_hsdp", marks=[*FOUR_CARD_MARKS, pytest.mark.skip(reason="issue #2435")], ), ] diff --git a/vllm_omni/diffusion/models/flux/flux_transformer.py b/vllm_omni/diffusion/models/flux/flux_transformer.py index 362fb4446f..680b8bfbbe 100644 --- a/vllm_omni/diffusion/models/flux/flux_transformer.py +++ b/vllm_omni/diffusion/models/flux/flux_transformer.py @@ -510,6 +510,7 @@ class FluxTransformer2DModel(nn.Module): # -- typically a transformer layer # used for torch compile optimizations _repeated_blocks = ["FluxTransformerBlock"] + _layerwise_offload_blocks_attrs = ["transformer_blocks", "single_transformer_blocks"] @staticmethod def _is_transformer_block(name: str, module) -> bool: diff --git a/vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py b/vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py index 1d375ca8d2..9cf2fb7568 100644 --- a/vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py +++ b/vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py @@ -742,6 +742,7 @@ class Flux2Transformer2DModel(nn.Module): """ _repeated_blocks = ["Flux2TransformerBlock", "Flux2SingleTransformerBlock"] + _layerwise_offload_blocks_attrs = ["transformer_blocks", "single_transformer_blocks"] @staticmethod def 
_is_transformer_block(name: str, module) -> bool: diff --git a/vllm_omni/diffusion/models/helios/helios_transformer.py b/vllm_omni/diffusion/models/helios/helios_transformer.py index 812da7db14..b3d2621ad8 100644 --- a/vllm_omni/diffusion/models/helios/helios_transformer.py +++ b/vllm_omni/diffusion/models/helios/helios_transformer.py @@ -576,7 +576,7 @@ class HeliosTransformer3DModel(nn.Module): """ _repeated_blocks = ["HeliosTransformerBlock"] - _layerwise_offload_blocks_attr = "blocks" + _layerwise_offload_blocks_attrs = ["blocks"] packed_modules_mapping = { "to_qkv": ["to_q", "to_k", "to_v"], } diff --git a/vllm_omni/diffusion/models/hunyuan_video/hunyuan_video_15_transformer.py b/vllm_omni/diffusion/models/hunyuan_video/hunyuan_video_15_transformer.py index 263e39e018..6600b17d5c 100644 --- a/vllm_omni/diffusion/models/hunyuan_video/hunyuan_video_15_transformer.py +++ b/vllm_omni/diffusion/models/hunyuan_video/hunyuan_video_15_transformer.py @@ -539,7 +539,7 @@ class HunyuanVideo15Transformer3DModel(nn.Module): """ _repeated_blocks = ["HunyuanVideo15TransformerBlock"] - _layerwise_offload_blocks_attr = "transformer_blocks" + _layerwise_offload_blocks_attrs = ["transformer_blocks"] packed_modules_mapping = { "to_qkv": ["to_q", "to_k", "to_v"], "add_kv_proj": ["add_q_proj", "add_k_proj", "add_v_proj"], diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py index c211567069..b34f19e954 100644 --- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py +++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py @@ -882,7 +882,7 @@ class QwenImageTransformer2DModel(CachedTransformer): # -- typically a transformer layer # used for torch compile optimizations _repeated_blocks = ["QwenImageTransformerBlock"] - _layerwise_offload_blocks_attr = "transformer_blocks" + _layerwise_offload_blocks_attrs = ["transformer_blocks"] packed_modules_mapping = { "to_qkv": ["to_q", "to_k", "to_v"], "add_kv_proj": ["add_q_proj", "add_k_proj", "add_v_proj"], diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py index c4e3b40cdd..efaab5a8f9 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py @@ -725,7 +725,7 @@ class WanTransformer3DModel(nn.Module): """ _repeated_blocks = ["WanTransformerBlock"] - _layerwise_offload_blocks_attr = "blocks" + _layerwise_offload_blocks_attrs = ["blocks"] packed_modules_mapping = { "to_qkv": ["to_q", "to_k", "to_v"], } diff --git a/vllm_omni/diffusion/models/z_image/z_image_transformer.py b/vllm_omni/diffusion/models/z_image/z_image_transformer.py index fd8b0e490f..3ffad221ba 100644 --- a/vllm_omni/diffusion/models/z_image/z_image_transformer.py +++ b/vllm_omni/diffusion/models/z_image/z_image_transformer.py @@ -579,6 +579,7 @@ class ZImageTransformer2DModel(CachedTransformer): """ _repeated_blocks = ["ZImageTransformerBlock"] + _layerwise_offload_blocks_attrs = ["layers"] @staticmethod def _is_transformer_block(name: str, module) -> bool: diff --git a/vllm_omni/diffusion/offloader/layerwise_backend.py b/vllm_omni/diffusion/offloader/layerwise_backend.py index 20af5b5d82..7876b00947 100644 --- a/vllm_omni/diffusion/offloader/layerwise_backend.py +++ b/vllm_omni/diffusion/offloader/layerwise_backend.py @@ -312,10 +312,9 @@ def enable(self, pipeline: nn.Module) -> None: dit_name = modules.dit_names[i] logger.info(f"Applying 
hooks on {dit_name} ({dit_module.__class__.__name__})") - blocks_attr_name = LayerWiseOffloadBackend.get_blocks_attr_name(dit_module) - blocks = LayerWiseOffloadBackend.get_blocks_from_dit(dit_module) + blocks_attr_names, blocks = LayerWiseOffloadBackend.get_blocks_from_dit(dit_module) - if not blocks_attr_name or not blocks: + if not blocks: logger.warning( "Target layers (blocks) not found. Skipping offloading on %s (%s)", dit_name, @@ -336,11 +335,20 @@ def enable(self, pipeline: nn.Module) -> None: # Move non-block modules to GPU (they stay resident) for name, m in dit_module.named_children(): - if name == blocks_attr_name: + if name not in blocks_attr_names: + m.to(self.device) + logger.debug(f"Moved {name} to device {self.device}") + else: logger.debug(f"Skipped blocks module {name}") - continue - m.to(self.device) - logger.debug(f"Moved {name} to device {self.device}") + + # Move top-level params/buffers to GPU (dit_module's own, not sub-modules) + for param in dit_module._parameters.values(): + if param is not None: + param.data = param.data.to(self.device, non_blocking=True) + + for buffer in dit_module._buffers.values(): + if buffer is not None: + buffer.data = buffer.data.to(self.device, non_blocking=True) # Pre-fetch the first layer by manually calling the hook function on the last layer; # For subsequent requests, the first layer/block will be pre-fetched @@ -395,40 +403,84 @@ def disable(self) -> None: logger.info("Layer-wise offloading disabled") @staticmethod - def get_blocks_attr_name(model: nn.Module) -> str | None: - """Retrieve blocks attribute name from provided DiT model""" - return getattr(model.__class__, "_layerwise_offload_blocks_attr", None) + def get_blocks_attr_names(model: nn.Module) -> list[str]: + """Get block attribute names from model class.""" + attrs: list[str] = getattr(model.__class__, "_layerwise_offload_blocks_attrs", []) + + if not attrs: + old_attr = getattr(model.__class__, "_layerwise_offload_blocks_attr", None) + if old_attr is not None: + logger.warning( + "'_layerwise_offload_blocks_attr' is deprecated, " + "please use '_layerwise_offload_blocks_attrs' instead. " + "Example: _layerwise_offload_blocks_attrs = ['blocks']" + ) + attrs = [old_attr] if isinstance(old_attr, str) else list(old_attr) + + return attrs @staticmethod - def set_blocks_attr_name(model: nn.Module, name: str) -> None: - if not hasattr(model.__class__, "_layerwise_offload_blocks_attr"): - setattr(model.__class__, "_layerwise_offload_blocks_attr", name) + def set_blocks_attr_names(model: nn.Module, names: list[str]) -> None: + if not hasattr(model.__class__, "_layerwise_offload_blocks_attrs"): + setattr(model.__class__, "_layerwise_offload_blocks_attrs", names) @staticmethod - def get_blocks_from_dit(model: nn.Module) -> list[nn.Module]: + def get_blocks_from_dit(model: nn.Module) -> tuple[list[str], list[nn.Module]]: """ - Retrieve a list of blocks from provided DiT model. Blocks attribute name - are found by `_layerwise_offload_blocks_attr` set to DiT models. For example, + Retrieve blocks and attribute names from provided DiT model. Blocks attribute names + are found by `_layerwise_offload_blocks_attrs` set to DiT models. 
For example, ``` class WanTransformer3DModel(nn.Module): - _layerwise_offload_blocks_attr = "blocks" + _layerwise_offload_blocks_attrs = ["blocks"] ``` + + Returns: + Tuple of (blocks_attr_names, blocks) """ - blocks_attr_name = LayerWiseOffloadBackend.get_blocks_attr_name(model) - if blocks_attr_name is None: + blocks_attr_names = LayerWiseOffloadBackend.get_blocks_attr_names(model) + if not blocks_attr_names: logger.warning( - f"No _layerwise_offload_blocks_attr defined for {model.__class__.__name__}, " + f"No _layerwise_offload_blocks_attrs defined for {model.__class__.__name__}, " "skipping layerwise offloading" ) - return [] + return [], [] + + blocks = [] + for name in blocks_attr_names: + attr = getattr(model, name, None) + if attr is None: + raise AttributeError( + f"Attribute '{name}' declared in _layerwise_offload_blocks_attrs " + f"does not exist on model {model.__class__.__name__}" + ) + try: + attr_iter = iter(attr) + except TypeError: + if isinstance(attr, nn.Module): + logger.warning( + "Attribute '%s' on %s is not iterable; treating it as one block.", + name, + model.__class__.__name__, + ) + blocks.append(attr) + continue - _blocks = getattr(model, blocks_attr_name, None) - if _blocks is None: + logger.warning( + "Attribute '%s' on %s is not iterable (got %s); skipping it.", + name, + model.__class__.__name__, + type(attr).__name__, + ) + else: + blocks.extend(attr_iter) + + if not blocks: logger.warning( - f"Blocks (layers) '{blocks_attr_name}' not found on {model.__class__.__name__}, " - "skipping layerwise offloading" + "No blocks found in %s for %s, skipping layerwise offloading", + blocks_attr_names, + model.__class__.__name__, ) - return [] + return [], [] - return list(_blocks) + return blocks_attr_names, blocks From 486d77d7970e6deb88fab915e224c7659e379e10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zhengyuan=20Su=20=28=E8=8B=8F=E6=94=BF=E6=B8=8A=29?= Date: Mon, 6 Apr 2026 19:30:52 +0800 Subject: [PATCH 060/204] [Feature] Enable LoRA adapter injection for BAGEL (#2490) Signed-off-by: Zhengyuan Su Signed-off-by: Claude Co-authored-by: Claude --- tests/diffusion/lora/conftest.py | 56 ++++ tests/diffusion/lora/test_lora_manager.py | 80 +++--- .../diffusion/models/bagel/test_bagel_lora.py | 248 ++++++++++++++++++ vllm_omni/diffusion/lora/manager.py | 9 +- .../models/bagel/bagel_transformer.py | 14 + .../model_executor/models/bagel/bagel.py | 16 ++ 6 files changed, 392 insertions(+), 31 deletions(-) create mode 100644 tests/diffusion/lora/conftest.py create mode 100644 tests/diffusion/models/bagel/test_bagel_lora.py diff --git a/tests/diffusion/lora/conftest.py b/tests/diffusion/lora/conftest.py new file mode 100644 index 0000000000..8b9b1ef4d2 --- /dev/null +++ b/tests/diffusion/lora/conftest.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Shared test helpers for diffusion LoRA tests.""" + +from __future__ import annotations + +import torch +from vllm.model_executor.layers.linear import LinearBase + + +class FakeLinearBase(LinearBase): + """Minimal LinearBase stub for LoRA layer discovery.""" + + def __init__(self): + torch.nn.Module.__init__(self) + + +class DummyBaseLayerWithLoRA(torch.nn.Module): + """Fake LoRA wrapper that records set/reset/create calls.""" + + def __init__(self, base_layer: torch.nn.Module): + super().__init__() + self.base_layer = base_layer + + self.set_calls: list[ + tuple[list[torch.Tensor | None] | torch.Tensor, list[torch.Tensor | None] | torch.Tensor] + ] = [] 
+ self.reset_calls: int = 0 + self.create_calls: int = 0 + + def set_lora(self, index: int, lora_a, lora_b): + assert index == 0 + self.set_calls.append((lora_a, lora_b)) + + def reset_lora(self, index: int): + assert index == 0 + self.reset_calls += 1 + + def create_lora_weights(self, max_loras, lora_config, model_config): + self.create_calls += 1 + + +def fake_replace_submodule( + root: torch.nn.Module, + module_name: str, + submodule: torch.nn.Module, + replace_calls: list[str] | None = None, +) -> None: + """Replace a submodule by traversing dotted paths correctly.""" + if replace_calls is not None: + replace_calls.append(module_name) + parts = module_name.split(".") + parent = root + for attr in parts[:-1]: + parent = getattr(parent, attr) + setattr(parent, parts[-1], submodule) diff --git a/tests/diffusion/lora/test_lora_manager.py b/tests/diffusion/lora/test_lora_manager.py index 8d4a1487fd..83ac7a1144 100644 --- a/tests/diffusion/lora/test_lora_manager.py +++ b/tests/diffusion/lora/test_lora_manager.py @@ -7,8 +7,12 @@ import torch from vllm.lora.lora_weights import LoRALayerWeights from vllm.lora.utils import get_supported_lora_modules -from vllm.model_executor.layers.linear import LinearBase +from tests.diffusion.lora.conftest import ( + DummyBaseLayerWithLoRA, + FakeLinearBase, + fake_replace_submodule, +) from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager from vllm_omni.lora.request import LoRARequest @@ -33,35 +37,9 @@ def reset_lora(self, index: int): self.reset_calls += 1 -class _FakeLinearBase(LinearBase): - def __init__(self): - torch.nn.Module.__init__(self) - - -class _DummyBaseLayerWithLoRA(torch.nn.Module): - def __init__(self, base_layer: torch.nn.Module): - super().__init__() - self.base_layer = base_layer - - self.set_calls: list[ - tuple[list[torch.Tensor | None] | torch.Tensor, list[torch.Tensor | None] | torch.Tensor] - ] = [] - self.reset_calls: int = 0 - self.create_calls: int = 0 - - def set_lora(self, index: int, lora_a, lora_b): - assert index == 0 - self.set_calls.append((lora_a, lora_b)) - - def reset_lora(self, index: int): - assert index == 0 - self.reset_calls += 1 - - def create_lora_weights(self, max_loras, lora_config, model_config): - # Needs to be callable for scale test when rank changes, but not - # actually used since we mock everything and check everything based - # on set calls. 
- self.create_calls += 1 +# Aliases for backward compatibility within this file +_FakeLinearBase = FakeLinearBase +_DummyBaseLayerWithLoRA = DummyBaseLayerWithLoRA class _DummyPipeline(torch.nn.Module): @@ -555,3 +533,45 @@ def _fake_load(_req: LoRARequest): req1 = _dummy_lora_request(1) with pytest.raises(ValueError): manager.add_adapter(req1) + + +def test_lora_manager_discovers_bagel_component(monkeypatch): + """Verify that _replace_layers_with_lora finds layers under 'bagel'.""" + import vllm_omni.diffusion.lora.manager as manager_mod + + monkeypatch.setattr(manager_mod, "BaseLayerWithLoRA", _DummyBaseLayerWithLoRA) + + def _fake_from_layer_diffusion(*, layer: torch.nn.Module, **_kwargs): + if isinstance(layer, _FakeLinearBase): + return _DummyBaseLayerWithLoRA(layer) + return layer + + replace_calls: list[str] = [] + + monkeypatch.setattr(manager_mod, "from_layer_diffusion", _fake_from_layer_diffusion) + monkeypatch.setattr( + manager_mod, + "replace_submodule", + lambda root, name, sub: fake_replace_submodule(root, name, sub, replace_calls), + ) + + # Pipeline with a 'bagel' component (no 'transformer') + pipeline = torch.nn.Module() + pipeline.bagel = torch.nn.Module() + pipeline.bagel.language_model = torch.nn.Module() + pipeline.bagel.language_model.qkv_proj = _FakeLinearBase() + + manager = DiffusionLoRAManager( + pipeline=pipeline, + device=torch.device("cpu"), + dtype=torch.bfloat16, + max_cached_adapters=1, + ) + + peft_helper = type("_PH", (), {"r": 1})() + manager._replace_layers_with_lora(peft_helper) + + assert "language_model.qkv_proj" in replace_calls + assert "bagel.language_model.qkv_proj" in manager._lora_modules + # Verify the module was actually replaced in the tree (not just recorded) + assert isinstance(pipeline.bagel.language_model.qkv_proj, _DummyBaseLayerWithLoRA) diff --git a/tests/diffusion/models/bagel/test_bagel_lora.py b/tests/diffusion/models/bagel/test_bagel_lora.py new file mode 100644 index 0000000000..8cb3446ed5 --- /dev/null +++ b/tests/diffusion/models/bagel/test_bagel_lora.py @@ -0,0 +1,248 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for BAGEL LoRA support across Stage 0 (Thinker) and Stage 1 (DiT).""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest +import torch +from safetensors.torch import save_file + +from tests.diffusion.lora.conftest import ( + DummyBaseLayerWithLoRA, + FakeLinearBase, + fake_replace_submodule, +) +from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager +from vllm_omni.lora.request import LoRARequest + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + +_FakeLinearBase = FakeLinearBase + + +# --------------------------------------------------------------------------- +# Stage 0 (Thinker / AR) -- packed_modules_mapping on the AR model class +# --------------------------------------------------------------------------- + + +class TestStage0ThinkerLoRA: + """Validate that OmniBagelForConditionalGeneration declares correct LoRA metadata.""" + + def test_omni_bagel_supports_lora(self): + from vllm_omni.model_executor.models.bagel.bagel import ( + OmniBagelForConditionalGeneration, + ) + + assert getattr(OmniBagelForConditionalGeneration, "supports_lora", False) is True + + def test_omni_bagel_packed_modules_mapping_complete(self): + from vllm_omni.model_executor.models.bagel.bagel import ( + OmniBagelForConditionalGeneration, + ) + + mapping = 
OmniBagelForConditionalGeneration.packed_modules_mapping + # Standard Qwen2 projections + assert mapping["qkv_proj"] == ["q_proj", "k_proj", "v_proj"] + assert mapping["gate_up_proj"] == ["gate_proj", "up_proj"] + # MoE generation-mode projections + assert mapping["qkv_proj_moe_gen"] == [ + "q_proj_moe_gen", + "k_proj_moe_gen", + "v_proj_moe_gen", + ] + assert mapping["mlp_moe_gen.gate_up_proj"] == [ + "mlp_moe_gen.gate_proj", + "mlp_moe_gen.up_proj", + ] + + +# --------------------------------------------------------------------------- +# Stage 1 (DiT / Diffusion) -- DiffusionLoRAManager with bagel component +# --------------------------------------------------------------------------- + + +class TestStage1DiTLoRA: + """Validate DiffusionLoRAManager discovers BAGEL's packed modules.""" + + def test_diffusion_lora_manager_discovers_bagel_packed_modules(self): + """Manager should derive packed→sublayer mapping from stacked_params_mapping.""" + pipeline = torch.nn.Module() + pipeline.bagel = torch.nn.Module() + + # Simulate a submodule that exposes stacked_params_mapping + # (as Bagel does after load_weights()) + language_model = torch.nn.Module() + language_model.stacked_params_mapping = [ + (".qkv_proj_moe_gen", ".q_proj_moe_gen", "q"), + (".qkv_proj_moe_gen", ".k_proj_moe_gen", "k"), + (".qkv_proj_moe_gen", ".v_proj_moe_gen", "v"), + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] + pipeline.bagel.language_model = language_model + + manager = DiffusionLoRAManager( + pipeline=pipeline, + device=torch.device("cpu"), + dtype=torch.bfloat16, + max_cached_adapters=1, + ) + + mapping = manager._packed_modules_mapping + assert mapping["qkv_proj"] == ["q_proj", "k_proj", "v_proj"] + assert mapping["qkv_proj_moe_gen"] == [ + "q_proj_moe_gen", + "k_proj_moe_gen", + "v_proj_moe_gen", + ] + assert mapping["gate_up_proj"] == ["gate_proj", "up_proj"] + + def test_diffusion_lora_manager_replaces_bagel_packed_layer_via_sublayer_target(self, monkeypatch): + """Targeting sublayer 'q_proj' should replace the fused 'qkv_proj' under bagel.""" + import vllm_omni.diffusion.lora.manager as manager_mod + + monkeypatch.setattr(manager_mod, "BaseLayerWithLoRA", DummyBaseLayerWithLoRA) + + def _fake_from_layer_diffusion(*, layer, **_kwargs): + return DummyBaseLayerWithLoRA(layer) + + replace_calls: list[str] = [] + + monkeypatch.setattr(manager_mod, "from_layer_diffusion", _fake_from_layer_diffusion) + monkeypatch.setattr( + manager_mod, + "replace_submodule", + lambda root, name, sub: fake_replace_submodule(root, name, sub, replace_calls), + ) + + # Build pipeline with bagel component + pipeline = torch.nn.Module() + pipeline.bagel = torch.nn.Module() + lm = torch.nn.Module() + lm.stacked_params_mapping = [ + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + ] + lm.attn = torch.nn.Module() + lm.attn.qkv_proj = _FakeLinearBase() + pipeline.bagel.language_model = lm + + manager = DiffusionLoRAManager( + pipeline=pipeline, + device=torch.device("cpu"), + dtype=torch.bfloat16, + max_cached_adapters=1, + ) + + # Treat qkv_proj as 3-slice packed layer + monkeypatch.setattr(manager, "_get_packed_modules_list", lambda _module: ["q", "k", "v"]) + + # Target sublayer "q_proj" -- manager should replace the packed "qkv_proj" + peft_helper = type("_PH", (), {"r": 1, "target_modules": ["q_proj"]})() + manager._replace_layers_with_lora(peft_helper) + + 
assert "language_model.attn.qkv_proj" in replace_calls + assert "bagel.language_model.attn.qkv_proj" in manager._lora_modules + # Verify the module was actually replaced in the tree (not just recorded) + assert isinstance(pipeline.bagel.language_model.attn.qkv_proj, DummyBaseLayerWithLoRA) + + +# --------------------------------------------------------------------------- +# Round-trip: synthetic checkpoint → set_active_adapter → verify weights +# --------------------------------------------------------------------------- + + +def _write_synthetic_lora( + adapter_dir: Path, + module_name: str, + rank: int, + in_dim: int, + out_dim: int, +) -> str: + """Write a minimal LoRA adapter (safetensors + config) to *adapter_dir*.""" + adapter_dir.mkdir(parents=True, exist_ok=True) + lora_a = torch.ones((rank, in_dim), dtype=torch.float32) + lora_b = torch.ones((out_dim, rank), dtype=torch.float32) * 2.0 + save_file( + { + f"base_model.model.{module_name}.lora_A.weight": lora_a, + f"base_model.model.{module_name}.lora_B.weight": lora_b, + }, + str(adapter_dir / "adapter_model.safetensors"), + ) + (adapter_dir / "adapter_config.json").write_text( + json.dumps({"r": rank, "lora_alpha": rank, "target_modules": [module_name]}), + encoding="utf-8", + ) + return str(adapter_dir) + + +class TestBagelLoRARoundTrip: + """End-to-end: synthetic checkpoint → load → activate → verify weights in fused layer.""" + + def test_set_active_adapter_loads_and_activates_bagel_lora(self, tmp_path, monkeypatch): + """Full round-trip through set_active_adapter for a bagel component module.""" + import vllm_omni.diffusion.lora.manager as manager_mod + + monkeypatch.setattr(manager_mod, "BaseLayerWithLoRA", DummyBaseLayerWithLoRA) + + # Build pipeline with bagel.language_model.foo (simple non-packed layer) + pipeline = torch.nn.Module() + pipeline.bagel = torch.nn.Module() + lm = torch.nn.Module() + lm.foo = _FakeLinearBase() + pipeline.bagel.language_model = lm + + def _fake_from_layer(*, layer, **_kwargs): + if isinstance(layer, FakeLinearBase): + return DummyBaseLayerWithLoRA(layer) + return layer + + monkeypatch.setattr(manager_mod, "from_layer_diffusion", _fake_from_layer) + monkeypatch.setattr( + manager_mod, + "replace_submodule", + lambda root, name, sub: fake_replace_submodule(root, name, sub), + ) + + manager = DiffusionLoRAManager( + pipeline=pipeline, + device=torch.device("cpu"), + dtype=torch.bfloat16, + max_cached_adapters=1, + ) + + # Write synthetic adapter targeting bagel.language_model.foo + module_name = "bagel.language_model.foo" + rank = 2 + in_dim = 4 + out_dim = 4 + lora_dir = _write_synthetic_lora(tmp_path / "lora", module_name, rank, in_dim, out_dim) + + lora_request = LoRARequest( + lora_name="test_bagel", + lora_int_id=42, + lora_path=lora_dir, + ) + + # Full round-trip: load from disk → replace layer → activate weights + manager.set_active_adapter(lora_request, lora_scale=0.5) + + # Verify the layer was replaced and weights were set + replaced_layer = pipeline.bagel.language_model.foo + assert isinstance(replaced_layer, DummyBaseLayerWithLoRA), "Layer should be wrapped with LoRA" + assert len(replaced_layer.set_calls) == 1, "set_lora should have been called once" + + lora_a, lora_b = replaced_layer.set_calls[0] + # A weights should be ones (as written) + assert torch.all(lora_a == 1.0), f"lora_a should be all ones, got {lora_a}" + # B weights should be 2.0 * scale(0.5) = 1.0 + assert torch.allclose(lora_b, torch.ones_like(lora_b)), f"lora_b should be 2.0 * 0.5 = 1.0, got {lora_b}" diff --git 
a/vllm_omni/diffusion/lora/manager.py b/vllm_omni/diffusion/lora/manager.py index 5f75e26cb1..63e8d9a96f 100644 --- a/vllm_omni/diffusion/lora/manager.py +++ b/vllm_omni/diffusion/lora/manager.py @@ -366,13 +366,17 @@ def _matches_target(module_name: str) -> bool: fully_sharded_loras=False, ) - for component_name in ("transformer", "transformer_2", "dit"): + for component_name in ("transformer", "transformer_2", "dit", "bagel"): if not hasattr(self.pipeline, component_name): continue component = getattr(self.pipeline, component_name) if not isinstance(component, nn.Module): continue + # Collect replacements first to avoid mutating the module tree + # while iterating over named_modules(). + pending_replacements: list[tuple[str, str, nn.Module, list[str]]] = [] + for module_name, module in component.named_modules(remove_duplicate=False): # Don't recurse into already-replaced LoRA wrappers. Their # original LinearBase lives under "base_layer", and replacing @@ -401,6 +405,9 @@ def _matches_target(module_name: str) -> bool: if not should_replace: continue + pending_replacements.append((module_name, full_module_name, module, packed_modules_list)) + + for module_name, full_module_name, module, packed_modules_list in pending_replacements: lora_layer = from_layer_diffusion( layer=module, max_loras=1, diff --git a/vllm_omni/diffusion/models/bagel/bagel_transformer.py b/vllm_omni/diffusion/models/bagel/bagel_transformer.py index 685d14729e..bbcd09dd51 100644 --- a/vllm_omni/diffusion/models/bagel/bagel_transformer.py +++ b/vllm_omni/diffusion/models/bagel/bagel_transformer.py @@ -929,17 +929,31 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: (".qkv_proj", ".q_proj", "q"), (".qkv_proj", ".k_proj", "k"), (".qkv_proj", ".v_proj", "v"), + # MLP gate/up projections — the DiT uses separate + # ColumnParallelLinear layers (no fused gate_up_proj), but + # these entries are needed so that DiffusionLoRAManager can + # derive the packed→sublayer mapping for LoRA checkpoints + # that store weights under fused gate_up_proj keys. + # The weight loader gracefully falls through to the + # non-stacked path when the fused parameter doesn't exist. + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), ] + self.stacked_params_mapping = stacked_params_mapping params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: + original_name = name for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue name = name.replace(weight_name, param_name) param = params_dict.get(name) if param is None: + # Fused param doesn't exist (e.g. gate_up_proj on DiT); + # restore original name and fall through to non-stacked path. + name = original_name break weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight, shard_id) diff --git a/vllm_omni/model_executor/models/bagel/bagel.py b/vllm_omni/model_executor/models/bagel/bagel.py index 3b4acae515..acbbc28b4c 100644 --- a/vllm_omni/model_executor/models/bagel/bagel.py +++ b/vllm_omni/model_executor/models/bagel/bagel.py @@ -407,6 +407,22 @@ class OmniBagelForConditionalGeneration(BagelForConditionalGeneration): the DiT's denoising loop. """ + # LoRA packed→sublayer mapping for both standard Qwen2 projections + # and the MoE generation-mode projections added by _install_mot_modules(). 
+ packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"], + "qkv_proj_moe_gen": [ + "q_proj_moe_gen", + "k_proj_moe_gen", + "v_proj_moe_gen", + ], + "mlp_moe_gen.gate_up_proj": [ + "mlp_moe_gen.gate_proj", + "mlp_moe_gen.up_proj", + ], + } + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) config = vllm_config.model_config.hf_config From e7718427815104770b0b688bcb48b5d875bfaf82 Mon Sep 17 00:00:00 2001 From: Canlin Guo Date: Mon, 6 Apr 2026 22:47:45 +0800 Subject: [PATCH 061/204] [Feature] Support vae tiling parallel encode (#2368) Signed-off-by: gcanlin Co-authored-by: Hongsheng Liu --- docs/design/feature/vae_parallel.md | 206 +++++++++++-- docs/user_guide/diffusion_features.md | 20 +- .../test_autoencoder_kl_wan_encode.py | 273 ++++++++++++++++++ .../test_distributed_vae_executor.py | 6 +- .../autoencoders/autoencoder_kl.py | 4 +- .../autoencoders/autoencoder_kl_qwenimage.py | 4 +- .../autoencoders/autoencoder_kl_wan.py | 140 ++++++++- .../autoencoders/distributed_vae_executor.py | 16 +- 8 files changed, 624 insertions(+), 45 deletions(-) create mode 100644 tests/diffusion/distributed/test_autoencoder_kl_wan_encode.py diff --git a/docs/design/feature/vae_parallel.md b/docs/design/feature/vae_parallel.md index 9009ece72a..e330b41a68 100644 --- a/docs/design/feature/vae_parallel.md +++ b/docs/design/feature/vae_parallel.md @@ -1,14 +1,15 @@ # VAE Patch Parallelism This document describes how to add **VAE Patch Parallelism** support to a diffusion model. -We use **Qwen-Image** as the reference implementation. +We use **Qwen-Image** as the reference implementation for decode parallel, and **Wan2.2** for encode parallel. --- ## Table of Contents - [Overview](#overview) -- [Step-by-Step Implementation](#step-by-step-implementation) +- [Step-by-Step Implementation (Decode)](#step-by-step-implementation-decode) +- [Encode Parallel Implementation](#encode-parallel-implementation) - [Testing](#testing) - [Reference Implementations](#reference-implementations) - [Summary](#summary) @@ -19,13 +20,13 @@ We use **Qwen-Image** as the reference implementation. ### What is Vae Patch parallel? -**VAE Patch Parallelism** is a decoding acceleration technique. Instead of decoding the entire latent tensor at once, the latent tensor is: +**VAE Patch Parallelism** is an acceleration technique for both **encoding** and **decoding**. Instead of processing the entire tensor at once, the tensor is: + Split into multiple spatial tiles + Distributed across multiple ranks -+ Decoded in parallel ++ Encoded/Decoded in parallel + Merged to reconstruct the final output @@ -35,10 +36,17 @@ This approach: + Reduces peak memory usage per device -+ Accelerates decoding latency ++ Accelerates encoding/decoding latency + +### When to Use Encode vs Decode Parallel + +| Operation | Use Case | Example | +|-----------|----------|---------| +| **Decode Parallel** | Text-to-Image, Text-to-Video | Latent → Image/Video | +| **Encode Parallel** | Image-to-Video (I2V) | Image → Latent (for conditioning) | ### Architecture -We introduce **DistributedVaeExecutor** as the core component responsible for distributed VAE decoding. +We introduce **DistributedVaeExecutor** as the core component responsible for distributed VAE encoding/decoding. 
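
In code, a model's distributed VAE wraps its three tile functions in a `DistributedOperator` and hands the full tensor to the executor. A minimal sketch, using the method names from the step-by-step examples later in this guide (not a complete implementation):

```python
# Illustrative sketch only -- mirrors the decode call site shown in Step 4 below.
result = self.distributed_executor.execute(
    z,  # full latent (decode) or already-patchified sample (encode)
    DistributedOperator(
        split=self.tile_split,   # tensor -> (list[TileTask], GridSpec)
        exec=self.tile_exec,     # TileTask -> decoded/encoded tile tensor
        merge=self.tile_merge,   # ({grid_coord: tensor}, GridSpec) -> merged tensor
    ),
    broadcast_result=True,
)
```

`broadcast_result` is typically `False` for decode, where only rank 0 needs the final output, and `True` for encode, where every rank needs the latents for the subsequent diffusion steps.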
The executor is model-agnostic and accepts three function parameters: @@ -84,7 +92,7 @@ Therefore: + Merge must perform blending to avoid seams -## Step-by-Step Implementation +## Step-by-Step Implementation (Decode) ### Step 1: Implement DistributedAutoencoderKLQwenImage `QwenImagePipeline` use `AutoencoderKLQwenImage` for vae, so implement a distributed version: @@ -205,14 +213,14 @@ def tile_merge(self, coord_tensor_map: dict[tuple[int, ...], torch.Tensor], grid We need to override tiled_decode, the main logic is: + check distributed is enabled + select split/exec/merge -+ Invoke self.distributed_decoder.execute to decode ++ Invoke self.distributed_executor.execute to decode ``` def tiled_decode(self, z: torch.Tensor, return_dict: bool = True): if not self.is_distributed_enabled(): return super().tiled_decode(z, return_dict=return_dict) logger.info("Decode run with distributed executor") - result = self.distributed_decoder.execute( + result = self.distributed_executor.execute( z, DistributedOperator(split=self.tile_split, exec=self.tile_exec, merge=self.tile_merge), broadcast_result=True, @@ -243,6 +251,166 @@ class YourModelPipeline(nn.Module): + ).to(self.device) ``` +## Encode Parallel Implementation + +For models that require VAE encoding (e.g., Image-to-Video), you can also parallelize the encode operation. We use **Wan2.2** as the reference implementation. + +### Step 1: Implement encode_tile_split + +Similar to decode, split the input tensor into tiles. Key considerations: + ++ **Patchify handling**: If the model uses `patch_size`, scale tile parameters accordingly ++ **Temporal chunking**: Video VAEs may have temporal compression (e.g., 4x) + +```python +def encode_tile_split(self, x: torch.Tensor) -> tuple[list[TileTask], GridSpec]: + _, _, num_frames, height, width = x.shape + encode_spatial_compression_ratio = self.spatial_compression_ratio + + # Scale tile parameters for patchified coordinate system + tile_sample_min_height = self.tile_sample_min_height + tile_sample_min_width = self.tile_sample_min_width + tile_sample_stride_height = self.tile_sample_stride_height + tile_sample_stride_width = self.tile_sample_stride_width + + if self.config.patch_size is not None: + # When input is patchified, scale tile parameters accordingly + encode_spatial_compression_ratio = self.spatial_compression_ratio // self.config.patch_size + tile_sample_min_height = tile_sample_min_height // self.config.patch_size + tile_sample_min_width = tile_sample_min_width // self.config.patch_size + tile_sample_stride_height = tile_sample_stride_height // self.config.patch_size + tile_sample_stride_width = tile_sample_stride_width // self.config.patch_size + + latent_height = height // encode_spatial_compression_ratio + latent_width = width // encode_spatial_compression_ratio + + tile_latent_min_height = tile_sample_min_height // encode_spatial_compression_ratio + tile_latent_min_width = tile_sample_min_width // encode_spatial_compression_ratio + tile_latent_stride_height = tile_sample_stride_height // encode_spatial_compression_ratio + tile_latent_stride_width = tile_sample_stride_width // encode_spatial_compression_ratio + + blend_height = tile_latent_min_height - tile_latent_stride_height + blend_width = tile_latent_min_width - tile_latent_stride_width + + tiletask_list = [] + # Use temporal compression ratio from config instead of hardcoding + temporal_compression = self.config.scale_factor_temporal + + for i in range(0, height, tile_sample_stride_height): + for j in range(0, width, 
tile_sample_stride_width): + time_list = [] + frame_range = 1 + (num_frames - 1) // temporal_compression + for k in range(frame_range): + if k == 0: + tile = x[:, :, :1, i : i + tile_sample_min_height, j : j + tile_sample_min_width] + else: + tile = x[ + :, :, + 1 + temporal_compression * (k - 1) : 1 + temporal_compression * k, + i : i + tile_sample_min_height, + j : j + tile_sample_min_width, + ] + time_list.append(tile) + tiletask_list.append( + TileTask(len(tiletask_list), (i // tile_sample_stride_height, j // tile_sample_stride_width), + time_list, workload=time_list[0].shape[3] * time_list[0].shape[4]) + ) + + grid_spec = GridSpec( + split_dims=(3, 4), + grid_shape=(tiletask_list[-1].grid_coord[0] + 1, tiletask_list[-1].grid_coord[1] + 1), + tile_spec={ + "latent_height": latent_height, "latent_width": latent_width, + "blend_height": blend_height, "blend_width": blend_width, + "tile_latent_stride_height": tile_latent_stride_height, + "tile_latent_stride_width": tile_latent_stride_width, + }, + output_dtype=self.dtype, + ) + return tiletask_list, grid_spec +``` + +### Step 2: Implement encode_tile_exec + +```python +def encode_tile_exec(self, task: TileTask) -> torch.Tensor: + """Encode a single sample tile into latent space.""" + self.clear_cache() + time = [] + for k, tile in enumerate(task.tensor): + self._enc_conv_idx = [0] + encoded = self.encoder(tile, feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx) + encoded = self.quant_conv(encoded) + time.append(encoded) + result = torch.cat(time, dim=2) + self.clear_cache() + return result +``` + +### Step 3: Implement encode_tile_merge + +```python +def encode_tile_merge( + self, coord_tensor_map: dict[tuple[int, ...], torch.Tensor], grid_spec: GridSpec +) -> torch.Tensor: + """Merge encoded tiles into a full latent tensor.""" + grid_h, grid_w = grid_spec.grid_shape + result_rows = [] + for i in range(grid_h): + result_row = [] + for j in range(grid_w): + tile = coord_tensor_map[(i, j)] + if i > 0: + tile = self.blend_v(coord_tensor_map[(i - 1, j)], tile, grid_spec.tile_spec["blend_height"]) + if j > 0: + tile = self.blend_h(coord_tensor_map[(i, j - 1)], tile, grid_spec.tile_spec["blend_width"]) + result_row.append(tile[:, :, :, + : grid_spec.tile_spec["tile_latent_stride_height"], + : grid_spec.tile_spec["tile_latent_stride_width"]]) + result_rows.append(torch.cat(result_row, dim=-1)) + + enc = torch.cat(result_rows, dim=3)[ + :, :, :, : grid_spec.tile_spec["latent_height"], : grid_spec.tile_spec["latent_width"] + ] + return enc +``` + +### Step 4: Override tiled_encode method + +Override `tiled_encode` instead of `encode`. The parent's `_encode()` handles patchify before calling `tiled_encode()`, so input `x` is already patchified. + +```python +def tiled_encode(self, x: torch.Tensor) -> torch.Tensor: + """ + Encode using distributed VAE executor. + + Note: x is already patchified by parent's _encode() before calling this method. 
+ """ + if not self.is_distributed_enabled(): + return super().tiled_encode(x) + + self.clear_cache() + result = self.distributed_executor.execute( + x, + DistributedOperator( + split=self.encode_tile_split, + exec=self.encode_tile_exec, + merge=self.encode_tile_merge, + ), + broadcast_result=True, # Latents needed by all ranks for diffusion + ) + self.clear_cache() + return result +``` + +**Key differences from decode parallel:** + +| Aspect | Decode Parallel | Encode Parallel | +|--------|-----------------|-----------------| +| `broadcast_result` | Often `False` (only rank 0 needs output) | `True` (all ranks need latents for diffusion) | +| Patchify | Applied in merge (unpatchify) | Handled by parent `_encode()` before `tiled_encode()` | +| Temporal chunking | Frame-by-frame | Chunk-based (e.g., 1 + 4n frames) | + ## Testing Verify numerical consistency between: + vae_patch_parallel_size = 1 @@ -272,18 +440,20 @@ When vae_patch_parallel_size is larger than the DiT world size, it will automati Complete examples in the codebase: -| Model | Path | Notes | -|-------|------|-------| -| **Z-Image** | `vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py` | Distributed AutoencoderKL | -| **Wan2.2** | `vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_wan.py` | Distributed AutoencoderKLWan | -| **Qwen-Image** | `vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_qwenimage.py` | Distributed AutoencoderKLQwenImage | +| Model | Path | Decode Parallel | Encode Parallel | +|-------|------|-----------------|-----------------| +| **Z-Image** | `vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py` | ✅ | ❌ | +| **Wan2.2** | `vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_wan.py` | ✅ | ✅ | +| **Qwen-Image** | `vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_qwenimage.py` | ✅ | ❌ | --- ## Summary -Adding Vae Patch Parallel support to diffusion model: +Adding VAE Patch Parallel support to diffusion model: -1. **Implement Distributed Vae** - mainly copy from `diffusers` tiled_decode, and refactor into split/exec/merge -2. **Change vae model in pipeline to Distributed Vae** -3. **Test** - Verify with `tensor_parallel_size=N` quality +1. **Implement Distributed VAE** - Inherit from base VAE class and `DistributedVaeMixin` +2. **Decode Parallel** - Refactor `tiled_decode` into `tile_split`/`tile_exec`/`tile_merge` +3. **Encode Parallel** (optional) - Implement `encode_tile_split`/`encode_tile_exec`/`encode_tile_merge` for I2V models +4. **Change VAE model in pipeline** - Use the distributed version +5. 
**Test** - Verify numerical consistency with `vae_patch_parallel_size=1` vs `N` diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index e7f33306ec..c151164ca0 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -114,13 +114,13 @@ The following tables show which models support each feature: | **Nextstep_1(T2I)** | ❓ | ❓ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | **OmniGen2** | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | **Ovis-Image** | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| **Qwen-Image** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| **Qwen-Image-2512** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| **Qwen-Image-Edit** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | -| **Qwen-Image-Edit-2509** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | -| **Qwen-Image-Layered** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | -| **Stable-Diffusion3.5** | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | -| **Z-Image** | ✅ | ✅ | ✅ | ❓ | ✅ (TP=2 only) | ✅ | ❌ | ✅ | ✅ | ❌ | +| **Qwen-Image** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ✅ | +| **Qwen-Image-2512** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ✅ | +| **Qwen-Image-Edit** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ❌ | ❌ | +| **Qwen-Image-Edit-2509** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ❌ | ❌ | +| **Qwen-Image-Layered** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ❌ | ❌ | +| **Stable-Diffusion3.5** | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ (decode) | ❌ | ❌ | +| **Z-Image** | ✅ | ✅ | ✅ | ❓ | ✅ (TP=2 only) | ✅ | ❌ | ✅ (decode) | ✅ | ❌ | > Notes: > 1. Nextstep_1(T2I) does not support cache acceleration methods such as TeaCache or Cache-DiT. @@ -130,11 +130,11 @@ The following tables show which models support each feature: | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| -| **Wan2.2** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | -| **Wan2.1-VACE** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| **Wan2.2** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (encode/decode) | ❌ | ❌ | +| **Wan2.1-VACE** | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ❌ | ❌ | | **LTX-2** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | **Helios** | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | -| **HunyuanVideo-1.5 T2V I2V** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | +| **HunyuanVideo-1.5 T2V I2V** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ❌ | | **DreamID-Omni** | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ### AudioGen diff --git a/tests/diffusion/distributed/test_autoencoder_kl_wan_encode.py b/tests/diffusion/distributed/test_autoencoder_kl_wan_encode.py new file mode 100644 index 0000000000..7a18fa66da --- /dev/null +++ b/tests/diffusion/distributed/test_autoencoder_kl_wan_encode.py @@ -0,0 +1,273 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Unit tests for DistributedAutoencoderKLWan encode parallel (CPU-only).""" + +import pytest +import torch + +pytestmark = [pytest.mark.cpu, pytest.mark.core_model] + + +class _DummyConfig: + def __init__(self, patch_size=None, scale_factor_temporal=4): + self.patch_size = patch_size + self.scale_factor_temporal = scale_factor_temporal + + +class _DummyWanVae: + """Minimal mock of DistributedAutoencoderKLWan for testing 
encode_tile_split.""" + + def __init__( + self, + config=None, + spatial_compression_ratio=8, + tile_sample_min_height=256, + tile_sample_min_width=256, + tile_sample_stride_height=192, + tile_sample_stride_width=192, + ): + self.config = config or _DummyConfig() + self.spatial_compression_ratio = spatial_compression_ratio + self.tile_sample_min_height = tile_sample_min_height + self.tile_sample_min_width = tile_sample_min_width + self.tile_sample_stride_height = tile_sample_stride_height + self.tile_sample_stride_width = tile_sample_stride_width + self.dtype = torch.float32 + + # Mock caches + self._enc_feat_map = None + self._enc_conv_idx = [0] + + def clear_cache(self): + self._enc_feat_map = None + self._enc_conv_idx = [0] + + def encoder(self, x, feat_cache=None, feat_idx=None): # noqa: ARG002 + # Simple mock: just return the input + return x + + def quant_conv(self, x): + return x + + def blend_v(self, _a, b, _blend_extent): + return b + + def blend_h(self, _a, b, _blend_extent): + return b + + +def _import_encode_tile_split(): + """Import the encode_tile_split method from the module.""" + from vllm_omni.diffusion.distributed.autoencoders.autoencoder_kl_wan import ( + DistributedAutoencoderKLWan, + ) + + return DistributedAutoencoderKLWan.encode_tile_split + + +def _import_encode_tile_exec(): + from vllm_omni.diffusion.distributed.autoencoders.autoencoder_kl_wan import ( + DistributedAutoencoderKLWan, + ) + + return DistributedAutoencoderKLWan.encode_tile_exec + + +def _import_encode_tile_merge(): + from vllm_omni.diffusion.distributed.autoencoders.autoencoder_kl_wan import ( + DistributedAutoencoderKLWan, + ) + + return DistributedAutoencoderKLWan.encode_tile_merge + + +class TestEncodeTileSplit: + """Tests for encode_tile_split method.""" + + def test_basic_split_without_patch_size(self): + """Test basic tile splitting without patch_size.""" + encode_tile_split = _import_encode_tile_split() + + vae = _DummyWanVae( + config=_DummyConfig(patch_size=None, scale_factor_temporal=4), + spatial_compression_ratio=8, + tile_sample_min_height=256, + tile_sample_min_width=256, + tile_sample_stride_height=192, + tile_sample_stride_width=192, + ) + + # Input: (B, C, T, H, W) = (1, 3, 5, 256, 256) + x = torch.randn(1, 3, 5, 256, 256) + + tiletask_list, grid_spec = encode_tile_split(vae, x) + + # With stride 192 and input size 256, we should get: + # Height: ceil(256/192) = 2 positions (0, 192) but 192+256 > 256, so only 1 + # Actually for i in range(0, 256, 192): i = 0, 192 but 192 is out of bounds + # So we get 1x1 grid + assert len(tiletask_list) >= 1 + assert grid_spec.grid_shape[0] >= 1 + assert grid_spec.grid_shape[1] >= 1 + + # Check temporal chunking: 5 frames -> 1 + (5-1)//4 = 2 chunks + first_task = tiletask_list[0] + assert len(first_task.tensor) == 2 # 2 temporal chunks + + def test_split_with_patch_size_scales_coordinates(self): + """Test that patch_size properly scales tile coordinates.""" + encode_tile_split = _import_encode_tile_split() + + # Without patch_size + vae_no_patch = _DummyWanVae( + config=_DummyConfig(patch_size=None, scale_factor_temporal=4), + spatial_compression_ratio=8, + tile_sample_min_height=256, + tile_sample_min_width=256, + tile_sample_stride_height=128, + tile_sample_stride_width=128, + ) + + # With patch_size=2 (simulating patchified input) + vae_with_patch = _DummyWanVae( + config=_DummyConfig(patch_size=2, scale_factor_temporal=4), + spatial_compression_ratio=8, + tile_sample_min_height=256, + tile_sample_min_width=256, + tile_sample_stride_height=128, + 
tile_sample_stride_width=128, + ) + + # Same patchified input size + x = torch.randn(1, 3, 5, 256, 256) + + tasks_no_patch, _ = encode_tile_split(vae_no_patch, x) + tasks_with_patch, _ = encode_tile_split(vae_with_patch, x) + + # With patch_size=2, stride becomes 128//2=64, so more tiles + assert len(tasks_with_patch) >= len(tasks_no_patch) + + def test_temporal_compression_from_config(self): + """Test that temporal compression ratio is read from config.""" + encode_tile_split = _import_encode_tile_split() + + # temporal_compression=4 (default) + vae_4x = _DummyWanVae( + config=_DummyConfig(scale_factor_temporal=4), + tile_sample_min_height=512, + tile_sample_min_width=512, + tile_sample_stride_height=512, + tile_sample_stride_width=512, + ) + + # temporal_compression=2 + vae_2x = _DummyWanVae( + config=_DummyConfig(scale_factor_temporal=2), + tile_sample_min_height=512, + tile_sample_min_width=512, + tile_sample_stride_height=512, + tile_sample_stride_width=512, + ) + + # 9 frames input + x = torch.randn(1, 3, 9, 512, 512) + + tasks_4x, _ = encode_tile_split(vae_4x, x) + tasks_2x, _ = encode_tile_split(vae_2x, x) + + # With 4x compression: 1 + (9-1)//4 = 3 chunks + assert len(tasks_4x[0].tensor) == 3 + + # With 2x compression: 1 + (9-1)//2 = 5 chunks + assert len(tasks_2x[0].tensor) == 5 + + def test_grid_spec_latent_dimensions(self): + """Test that grid_spec contains correct latent dimensions.""" + encode_tile_split = _import_encode_tile_split() + + vae = _DummyWanVae( + config=_DummyConfig(patch_size=None), + spatial_compression_ratio=8, + tile_sample_min_height=512, + tile_sample_min_width=512, + tile_sample_stride_height=512, + tile_sample_stride_width=512, + ) + + # Input: 512x512 with compression 8 -> 64x64 latent + x = torch.randn(1, 3, 5, 512, 512) + + _, grid_spec = encode_tile_split(vae, x) + + assert grid_spec.tile_spec["latent_height"] == 64 + assert grid_spec.tile_spec["latent_width"] == 64 + + +class TestEncodeTileExec: + """Tests for encode_tile_exec method.""" + + def test_basic_exec(self): + """Test basic tile execution.""" + encode_tile_exec = _import_encode_tile_exec() + + vae = _DummyWanVae() + + from vllm_omni.diffusion.distributed.autoencoders.distributed_vae_executor import ( + TileTask, + ) + + # Create a simple task with 2 temporal chunks + tile1 = torch.randn(1, 3, 1, 32, 32) + tile2 = torch.randn(1, 3, 4, 32, 32) + task = TileTask(tile_id=0, grid_coord=(0, 0), tensor=[tile1, tile2]) + + result = encode_tile_exec(vae, task) + + # Result should concatenate temporal dimension + assert result.shape[2] == 5 # 1 + 4 frames + + +class TestEncodeTileMerge: + """Tests for encode_tile_merge method.""" + + def test_basic_merge(self): + """Test basic tile merging.""" + encode_tile_merge = _import_encode_tile_merge() + + vae = _DummyWanVae() + + from vllm_omni.diffusion.distributed.autoencoders.distributed_vae_executor import ( + GridSpec, + ) + + # Create 2x2 grid of tiles + tile_00 = torch.ones(1, 16, 2, 32, 32) * 0 + tile_01 = torch.ones(1, 16, 2, 32, 32) * 1 + tile_10 = torch.ones(1, 16, 2, 32, 32) * 2 + tile_11 = torch.ones(1, 16, 2, 32, 32) * 3 + + coord_tensor_map = { + (0, 0): tile_00, + (0, 1): tile_01, + (1, 0): tile_10, + (1, 1): tile_11, + } + + grid_spec = GridSpec( + split_dims=(3, 4), + grid_shape=(2, 2), + tile_spec={ + "latent_height": 48, + "latent_width": 48, + "blend_height": 8, + "blend_width": 8, + "tile_latent_stride_height": 24, + "tile_latent_stride_width": 24, + }, + ) + + result = encode_tile_merge(vae, coord_tensor_map, grid_spec) + + # Output 
should be (1, 16, 2, 48, 48) + assert result.shape == (1, 16, 2, 48, 48) diff --git a/tests/diffusion/distributed/test_distributed_vae_executor.py b/tests/diffusion/distributed/test_distributed_vae_executor.py index 42e9f3300b..93cf3d195f 100644 --- a/tests/diffusion/distributed/test_distributed_vae_executor.py +++ b/tests/diffusion/distributed/test_distributed_vae_executor.py @@ -59,9 +59,9 @@ def merge(self, coord_tensor_map, grid_spec): class DummyMixin(DistributedVaeMixin): def __init__(self): self.use_tiling = True - self.distributed_decoder = MagicMock() - self.distributed_decoder.parallel_size = 2 - self.distributed_decoder.group = None + self.distributed_executor = MagicMock() + self.distributed_executor.parallel_size = 2 + self.distributed_executor.group = None @pytest.fixture(autouse=True) diff --git a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py index 7df2d6a8ad..0084719a8a 100644 --- a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py +++ b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl.py @@ -93,7 +93,7 @@ def patch_split(self, z: torch.Tensor) -> tuple[list[TileTask], GridSpec]: _, _, latent_h, latent_w = z.shape scale = int(2 ** (len(self.config.block_out_channels) - 1)) - max_parallel_size = self.distributed_decoder.parallel_size + max_parallel_size = self.distributed_executor.parallel_size root = int(math.sqrt(max_parallel_size)) for rows in range(root, 0, -1): @@ -187,7 +187,7 @@ def decode(self, z: torch.Tensor, return_dict: bool = True, *args: Any, **kwargs if split is not None: strategy = "tile" if split == self.tile_split else "patch" logger.info(f"Decode run with distributed executor, split strategy is {strategy}") - result = self.distributed_decoder.execute( + result = self.distributed_executor.execute( z, DistributedOperator(split=split, exec=exec, merge=merge), broadcast_result=False ) if not return_dict: diff --git a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_qwenimage.py b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_qwenimage.py index 7549bbd3d5..f9dea8a36d 100644 --- a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_qwenimage.py +++ b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_qwenimage.py @@ -108,8 +108,8 @@ def tiled_decode(self, z: torch.Tensor, return_dict: bool = True): if not self.is_distributed_enabled(): return super().tiled_decode(z, return_dict=return_dict) - logger.info("Decode run with distributed executor") - result = self.distributed_decoder.execute( + logger.debug("Decode running with distributed executor") + result = self.distributed_executor.execute( z, DistributedOperator(split=self.tile_split, exec=self.tile_exec, merge=self.tile_merge), broadcast_result=True, diff --git a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_wan.py b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_wan.py index 7defbae79b..027991c3f2 100644 --- a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_wan.py +++ b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_wan.py @@ -92,6 +92,119 @@ def tile_exec(self, task: TileTask) -> torch.Tensor: result = torch.cat(time, dim=2) return result + def encode_tile_split(self, x: torch.Tensor) -> tuple[list[TileTask], GridSpec]: + _, _, num_frames, height, width = x.shape + encode_spatial_compression_ratio = self.spatial_compression_ratio + # Scale tile parameters for patchified coordinate system + tile_sample_min_height 
= self.tile_sample_min_height + tile_sample_min_width = self.tile_sample_min_width + tile_sample_stride_height = self.tile_sample_stride_height + tile_sample_stride_width = self.tile_sample_stride_width + if self.config.patch_size is not None: + assert encode_spatial_compression_ratio % self.config.patch_size == 0 + encode_spatial_compression_ratio = self.spatial_compression_ratio // self.config.patch_size + # When input is patchified, scale tile parameters accordingly + tile_sample_min_height = tile_sample_min_height // self.config.patch_size + tile_sample_min_width = tile_sample_min_width // self.config.patch_size + tile_sample_stride_height = tile_sample_stride_height // self.config.patch_size + tile_sample_stride_width = tile_sample_stride_width // self.config.patch_size + + latent_height = height // encode_spatial_compression_ratio + latent_width = width // encode_spatial_compression_ratio + + tile_latent_min_height = tile_sample_min_height // encode_spatial_compression_ratio + tile_latent_min_width = tile_sample_min_width // encode_spatial_compression_ratio + tile_latent_stride_height = tile_sample_stride_height // encode_spatial_compression_ratio + tile_latent_stride_width = tile_sample_stride_width // encode_spatial_compression_ratio + + blend_height = tile_latent_min_height - tile_latent_stride_height + blend_width = tile_latent_min_width - tile_latent_stride_width + + tiletask_list = [] + temporal_compression = self.config.scale_factor_temporal + for i in range(0, height, tile_sample_stride_height): + for j in range(0, width, tile_sample_stride_width): + time_list = [] + frame_range = 1 + (num_frames - 1) // temporal_compression + for k in range(frame_range): + if k == 0: + tile = x[:, :, :1, i : i + tile_sample_min_height, j : j + tile_sample_min_width] + else: + tile = x[ + :, + :, + 1 + temporal_compression * (k - 1) : 1 + temporal_compression * k, + i : i + tile_sample_min_height, + j : j + tile_sample_min_width, + ] + time_list.append(tile) + tiletask_list.append( + TileTask( + len(tiletask_list), + (i // tile_sample_stride_height, j // tile_sample_stride_width), + time_list, + workload=time_list[0].shape[3] * time_list[0].shape[4], + ) + ) + + grid_spec = GridSpec( + split_dims=(3, 4), + grid_shape=(tiletask_list[-1].grid_coord[0] + 1, tiletask_list[-1].grid_coord[1] + 1), + tile_spec={ + "latent_height": latent_height, + "latent_width": latent_width, + "blend_height": blend_height, + "blend_width": blend_width, + "tile_latent_stride_height": tile_latent_stride_height, + "tile_latent_stride_width": tile_latent_stride_width, + }, + output_dtype=self.dtype, + ) + return tiletask_list, grid_spec + + def encode_tile_exec(self, task: TileTask) -> torch.Tensor: + """Encode a single sample tile into latent space.""" + self.clear_cache() + time = [] + for k, tile in enumerate(task.tensor): + self._enc_conv_idx = [0] + encoded = self.encoder(tile, feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx) + encoded = self.quant_conv(encoded) + time.append(encoded) + result = torch.cat(time, dim=2) + self.clear_cache() + return result + + def encode_tile_merge( + self, coord_tensor_map: dict[tuple[int, ...], torch.Tensor], grid_spec: GridSpec + ) -> torch.Tensor: + """Merge encoded tiles into a full latent tensor.""" + grid_h, grid_w = grid_spec.grid_shape + result_rows = [] + for i in range(grid_h): + result_row = [] + for j in range(grid_w): + tile = coord_tensor_map[(i, j)] + if i > 0: + tile = self.blend_v(coord_tensor_map[(i - 1, j)], tile, grid_spec.tile_spec["blend_height"]) 
+ if j > 0: + tile = self.blend_h(coord_tensor_map[(i, j - 1)], tile, grid_spec.tile_spec["blend_width"]) + result_row.append( + tile[ + :, + :, + :, + : grid_spec.tile_spec["tile_latent_stride_height"], + : grid_spec.tile_spec["tile_latent_stride_width"], + ] + ) + result_rows.append(torch.cat(result_row, dim=-1)) + + enc = torch.cat(result_rows, dim=3)[ + :, :, :, : grid_spec.tile_spec["latent_height"], : grid_spec.tile_spec["latent_width"] + ] + return enc + def tile_merge(self, coord_tensor_map: dict[tuple[int, ...], torch.Tensor], grid_spec: GridSpec) -> torch.Tensor: """Merge decoded tiles into a full image.""" grid_h, grid_w = grid_spec.grid_shape @@ -130,8 +243,8 @@ def tiled_decode(self, z: torch.Tensor, return_dict: bool = True): if not self.is_distributed_enabled(): return super().tiled_decode(z, return_dict=return_dict) - logger.info("Decode run with distributed executor") - result = self.distributed_decoder.execute( + logger.debug("Decode running with distributed executor") + result = self.distributed_executor.execute( z, DistributedOperator(split=self.tile_split, exec=self.tile_exec, merge=self.tile_merge), broadcast_result=False, @@ -140,3 +253,26 @@ def tiled_decode(self, z: torch.Tensor, return_dict: bool = True): return (result,) return DecoderOutput(sample=result) + + def tiled_encode(self, x: torch.Tensor) -> torch.Tensor: + """ + Encode using distributed VAE executor. + + Note: x is already patchified by parent's _encode() before calling this method. + """ + if not self.is_distributed_enabled(): + return super().tiled_encode(x) + + logger.debug("Encode running with distributed executor") + self.clear_cache() + result = self.distributed_executor.execute( + x, + DistributedOperator( + split=self.encode_tile_split, + exec=self.encode_tile_exec, + merge=self.encode_tile_merge, + ), + broadcast_result=True, + ) + self.clear_cache() + return result diff --git a/vllm_omni/diffusion/distributed/autoencoders/distributed_vae_executor.py b/vllm_omni/diffusion/distributed/autoencoders/distributed_vae_executor.py index bdf664741d..ad60d164aa 100644 --- a/vllm_omni/diffusion/distributed/autoencoders/distributed_vae_executor.py +++ b/vllm_omni/diffusion/distributed/autoencoders/distributed_vae_executor.py @@ -168,25 +168,25 @@ def _sync_final_result(self, rank0_result, output_ndim, output_device, output_dt class DistributedVaeMixin: def init_distributed(self): - self.distributed_decoder = DistributedVaeExecutor() + self.distributed_executor = DistributedVaeExecutor() - def set_parallel_size(self, parallel_size: int) -> bool: - return self.distributed_decoder.set_parallel_size(parallel_size) + def set_parallel_size(self, parallel_size: int) -> None: + self.distributed_executor.set_parallel_size(parallel_size) def is_distributed_enabled(self) -> bool: if ( - self.distributed_decoder.parallel_size <= 1 + self.distributed_executor.parallel_size <= 1 or not dist.is_initialized() or not getattr(self, "use_tiling", False) ): return False - world_size = dist.get_world_size(group=self.distributed_decoder.group) - pp_size = min(int(self.distributed_decoder.parallel_size), int(world_size)) + world_size = dist.get_world_size(group=self.distributed_executor.group) + pp_size = min(int(self.distributed_executor.parallel_size), int(world_size)) if pp_size <= 1: return False - if self.distributed_decoder.parallel_size > pp_size: + if self.distributed_executor.parallel_size > pp_size: logger.warning( - f"vae_patch_parallel_size={self.distributed_decoder.parallel_size} " + 
f"vae_patch_parallel_size={self.distributed_executor.parallel_size} " f"is greater than dit_group={world_size};" f" using dit_group size={world_size}" ) From 54e964dc0ac2f717e37e33037a956c2f6a8f738f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zhengyuan=20Su=20=28=E8=8B=8F=E6=94=BF=E6=B8=8A=29?= Date: Tue, 7 Apr 2026 07:28:13 +0800 Subject: [PATCH 062/204] [Bugfix] Fix load_weights fallback for non-fused stacked_params_mapping entries (#2523) --- .../diffusion/models/bagel/bagel_transformer.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm_omni/diffusion/models/bagel/bagel_transformer.py b/vllm_omni/diffusion/models/bagel/bagel_transformer.py index bbcd09dd51..a14e875c06 100644 --- a/vllm_omni/diffusion/models/bagel/bagel_transformer.py +++ b/vllm_omni/diffusion/models/bagel/bagel_transformer.py @@ -944,26 +944,27 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded_params: set[str] = set() for name, loaded_weight in weights: - original_name = name + loaded = False for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - name = name.replace(weight_name, param_name) - param = params_dict.get(name) + stacked_name = name.replace(weight_name, param_name) + param = params_dict.get(stacked_name) if param is None: - # Fused param doesn't exist (e.g. gate_up_proj on DiT); - # restore original name and fall through to non-stacked path. - name = original_name break weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight, shard_id) + name = stacked_name + loaded = True break - else: + + if not loaded: param = params_dict.get(name) if param is None: continue weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) return loaded_params From 5b2c4f909d8f7c1cce98bc7d5a7ed65fc10eefe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Tue, 7 Apr 2026 09:29:26 +0800 Subject: [PATCH 063/204] [BugFix] Add bagel text2text/img2text think mode support (#2503) Signed-off-by: princepride --- examples/offline_inference/bagel/end2end.py | 27 ++++++++++--------- .../stage_input_processors/bagel.py | 7 +++++ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index 2153a31ba7..472d748d1e 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -2,7 +2,10 @@ import os from vllm_omni.inputs.data import OmniPromptType -from vllm_omni.model_executor.stage_input_processors.bagel import GEN_THINK_SYSTEM_PROMPT +from vllm_omni.model_executor.stage_input_processors.bagel import ( + GEN_THINK_SYSTEM_PROMPT, + VLM_THINK_SYSTEM_PROMPT, +) def parse_args(): @@ -171,7 +174,10 @@ def main(): elif args.modality == "img2text": if args.image_path: loaded_image = Image.open(args.image_path).convert("RGB") - final_prompt_text = f"<|im_start|>user\n<|image_pad|>\n{p}<|im_end|>\n<|im_start|>assistant\n" + think_prefix = f"<|im_start|>system\n{VLM_THINK_SYSTEM_PROMPT}<|im_end|>\n" if args.think else "" + final_prompt_text = ( + f"{think_prefix}<|im_start|>user\n<|image_pad|>\n{p}<|im_end|>\n<|im_start|>assistant\n" + ) prompt_dict = { "prompt": final_prompt_text, "multi_modal_data": {"image": loaded_image}, @@ -179,7 +185,8 @@ def main(): } formatted_prompts.append(prompt_dict) elif args.modality == "text2text": - 
final_prompt_text = f"<|im_start|>user\n{p}<|im_end|>\n<|im_start|>assistant\n" + think_prefix = f"<|im_start|>{VLM_THINK_SYSTEM_PROMPT}<|im_end|>" if args.think else "" + final_prompt_text = f"{think_prefix}<|im_start|>{p}<|im_end|><|im_start|>" prompt_dict = {"prompt": final_prompt_text, "modalities": ["text"]} formatted_prompts.append(prompt_dict) else: @@ -217,15 +224,11 @@ def main(): img_idx = 0 for req_output in omni_outputs: if args.think: - text_output = getattr(req_output, "text", None) or getattr(req_output, "outputs", None) - if text_output: - if isinstance(text_output, list) and text_output: - for out in text_output: - txt = getattr(out, "text", str(out)) - if txt: - print(f"[Think] {txt}") - elif isinstance(text_output, str): - print(f"[Think] {text_output}") + ro = getattr(req_output, "request_output", None) + if ro and getattr(ro, "outputs", None): + txt = "".join(getattr(o, "text", "") or "" for o in ro.outputs) + if txt: + print(txt) images = getattr(req_output, "images", None) diff --git a/vllm_omni/model_executor/stage_input_processors/bagel.py b/vllm_omni/model_executor/stage_input_processors/bagel.py index 6b88fcd4a1..bfcff0ea0f 100644 --- a/vllm_omni/model_executor/stage_input_processors/bagel.py +++ b/vllm_omni/model_executor/stage_input_processors/bagel.py @@ -135,6 +135,13 @@ def expand_cfg_prompts( "i.e. planning process here image here" ) +VLM_THINK_SYSTEM_PROMPT = ( + "You should first think about the reasoning process in the mind " + "and then provide the user with the answer. \n" + "The reasoning process is enclosed within tags, " + "i.e. reasoning process here answer here" +) + def expand_cfg_prompts_think( prompt: dict[str, Any] | str, From 8dd66ceb005a31b8802ffff113b62887e27e12f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Tue, 7 Apr 2026 09:30:01 +0800 Subject: [PATCH 064/204] =?UTF-8?q?[BugFix]=20Continue=20decode=20if=20don?= =?UTF-8?q?'t=20need=20transfer=20kv=20cache=20between=20two=20=E2=80=A6?= =?UTF-8?q?=20(#2502)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: princepride --- .../test_bagel_understanding.py | 144 ++++++++++++++++++ vllm_omni/core/sched/omni_ar_scheduler.py | 37 ++++- vllm_omni/engine/async_omni_engine.py | 38 ++++- 3 files changed, 217 insertions(+), 2 deletions(-) create mode 100644 tests/e2e/offline_inference/test_bagel_understanding.py diff --git a/tests/e2e/offline_inference/test_bagel_understanding.py b/tests/e2e/offline_inference/test_bagel_understanding.py new file mode 100644 index 0000000000..6f95e7ee00 --- /dev/null +++ b/tests/e2e/offline_inference/test_bagel_understanding.py @@ -0,0 +1,144 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +End-to-end tests for Bagel text2text and img2text (understanding) tasks. + +These tests validate that the Bagel multistage pipeline correctly generates +text output for understanding tasks, matching reference results. + +Equivalent to running: + python3 examples/offline_inference/bagel/end2end.py \ + --modality text2text \ + --prompts "Where is the capital of France?" 
+ + python3 examples/offline_inference/bagel/end2end.py \ + --modality img2text \ + --prompts "Please describe this image" \ + --image-path 2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg +""" + +import os + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" +from pathlib import Path + +import pytest +from vllm.assets.image import ImageAsset + +from tests.conftest import modify_stage_config +from tests.utils import hardware_test +from vllm_omni.entrypoints.omni import Omni + +MODEL_NAME = "ByteDance-Seed/BAGEL-7B-MoT" +STAGE_CONFIG = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") + +REFERENCE_TEXT_TEXT2TEXT = "The capital of France is Paris." + +REFERENCE_TEXT_IMG2TEXT = ( + "This is a photo of a wooden boardwalk or pathway that leads through " + "tall green grass. The path appears to be in a natural setting, possibly " + "a wetland or marsh area. The sky above is blue with some scattered " + "clouds, suggesting it might be a sunny day. The overall scene looks " + "peaceful and serene." +) + + +def _resolve_stage_config(config_path: str, run_level: str) -> str: + """Strip load_format: dummy for advanced_model (real weights).""" + if run_level == "advanced_model": + return modify_stage_config( + config_path, + deletes={ + "stage_args": { + 0: ["engine_args.load_format"], + 1: ["engine_args.load_format"], + } + }, + ) + return config_path + + +def _extract_text(omni_outputs: list) -> str: + """Extract generated text from OmniRequestOutput list.""" + for req_output in omni_outputs: + ro = getattr(req_output, "request_output", None) + if ro and getattr(ro, "outputs", None): + return "".join(getattr(o, "text", "") or "" for o in ro.outputs) + return "" + + +@pytest.mark.core_model +@pytest.mark.advanced_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}) +def test_bagel_text2text(run_level): + """Test Bagel text2text produces correct text output.""" + config_path = _resolve_stage_config(STAGE_CONFIG, run_level) + omni = Omni( + model=MODEL_NAME, + stage_configs_path=config_path, + stage_init_timeout=300, + ) + + try: + prompt = "<|im_start|>user\nWhere is the capital of France?<|im_end|>\n<|im_start|>assistant\n" + params_list = omni.default_sampling_params_list + omni_outputs = list( + omni.generate( + prompts=[{"prompt": prompt, "modalities": ["text"]}], + sampling_params_list=params_list, + ) + ) + + assert len(omni_outputs) > 0, "No outputs returned" + text = _extract_text(omni_outputs) + assert len(text) > 0, "Generated text is empty" + + if run_level == "advanced_model": + assert text == REFERENCE_TEXT_TEXT2TEXT, ( + f"Text mismatch: expected {REFERENCE_TEXT_TEXT2TEXT!r}, got {text!r}" + ) + finally: + omni.close() + + +@pytest.mark.core_model +@pytest.mark.advanced_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}) +def test_bagel_img2text(run_level): + """Test Bagel img2text produces correct text output.""" + input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB") + config_path = _resolve_stage_config(STAGE_CONFIG, run_level) + omni = Omni( + model=MODEL_NAME, + stage_configs_path=config_path, + stage_init_timeout=300, + ) + + try: + prompt = "<|im_start|>user\n<|image_pad|>\nPlease describe this image<|im_end|>\n<|im_start|>assistant\n" + params_list = omni.default_sampling_params_list + omni_outputs = list( + omni.generate( + prompts=[ + { + "prompt": prompt, + "multi_modal_data": 
{"image": input_image}, + "modalities": ["text"], + } + ], + sampling_params_list=params_list, + ) + ) + + assert len(omni_outputs) > 0, "No outputs returned" + text = _extract_text(omni_outputs) + assert len(text) > 0, "Generated text is empty" + + if run_level == "advanced_model": + assert text == REFERENCE_TEXT_IMG2TEXT, f"Text mismatch: expected {REFERENCE_TEXT_IMG2TEXT!r}, got {text!r}" + finally: + omni.close() diff --git a/vllm_omni/core/sched/omni_ar_scheduler.py b/vllm_omni/core/sched/omni_ar_scheduler.py index 0956d1856a..eac737b6e6 100644 --- a/vllm_omni/core/sched/omni_ar_scheduler.py +++ b/vllm_omni/core/sched/omni_ar_scheduler.py @@ -64,6 +64,9 @@ def __init__(self, *args, **kwargs): # Track requests that have already triggered prefill transfer to avoid duplicates self.transfer_triggered_requests: set[str] = set() + + # Cache per-request flag to avoid repeated deserialization of additional_information + self._omits_kv_transfer_cache: dict[str, bool] = {} model_config = self.vllm_config.model_config self.chunk_transfer_adapter = None if getattr(model_config, "async_chunk", False): @@ -82,6 +85,27 @@ def _get_kv_transfer_criteria(self) -> dict | None: return getattr(omni_kv_config, "kv_transfer_criteria", None) return None + def _request_omits_kv_transfer_to_next_stage(self, request: Request) -> bool: + """True when orchestrator will not run stage 1+ for this request (e.g. text-only). + + The result is cached per request to avoid repeated deserialization of + additional_information on every scheduler tick. + """ + rid = request.request_id + cached = self._omits_kv_transfer_cache.get(rid) + if cached is not None: + return cached + + payload = getattr(request, "additional_information", None) + if payload is None: + result = False + else: + info = deserialize_additional_information(payload) + result = info.get("omni_final_stage_id") == 0 + + self._omits_kv_transfer_cache[rid] = result + return result + def _process_kv_transfer_trigger(self, request: Request, new_token_ids: list[int]) -> bool: """ Check triggers and process side effects (marking transfer). @@ -91,6 +115,10 @@ def _process_kv_transfer_trigger(self, request: Request, new_token_ids: list[int if not self.kv_transfer_criteria: return False + # Text-only requests finalize at stage 0; do not prefill-stop for DiT KV. + if self._request_omits_kv_transfer_to_next_stage(request): + return False + if request.request_id in self.waiting_for_transfer_free: return False @@ -512,6 +540,8 @@ def _free_request(self, request: Request, delay_free_blocks: bool = False) -> di """Mark a request as finished and free its resources.""" assert request.is_finished() + self._omits_kv_transfer_cache.pop(request.request_id, None) + # 1. 
Standard cleanup parts from base _free_request connector_delay_free_blocks, kv_xfer_params = self._connector_finished(request) @@ -638,7 +668,12 @@ def _should_transfer_kv_for_request(self, req_id: str) -> bool: need_send = omni_kv_config.get("need_send_cache", False) else: need_send = getattr(omni_kv_config, "need_send_cache", False) - return need_send + if not need_send: + return False + request = self.requests.get(req_id) + if request is not None and self._request_omits_kv_transfer_to_next_stage(request): + return False + return True def has_requests(self) -> bool: """Check if there are any requests to process, including KV transfers.""" diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 28c6d6caa1..c802e62ef2 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -42,7 +42,10 @@ ) from vllm_omni.engine.orchestrator import Orchestrator from vllm_omni.engine.output_processor import MultimodalOutputProcessor -from vllm_omni.engine.serialization import serialize_additional_information +from vllm_omni.engine.serialization import ( + deserialize_additional_information, + serialize_additional_information, +) from vllm_omni.engine.stage_engine_core_client import StageEngineCoreClient from vllm_omni.engine.stage_engine_core_proc import ( complete_stage_handshake, @@ -170,6 +173,38 @@ def _upgrade_to_omni_request( ) +def _apply_omni_final_stage_metadata( + request: EngineCoreRequest, + final_stage_id: int, +) -> EngineCoreRequest: + """Tag EngineCoreRequest so OmniARScheduler can skip DiT KV when final_stage_id is 0.""" + merged: dict[str, Any] = {} + if isinstance(request, OmniEngineCoreRequest) and request.additional_information is not None: + merged = deserialize_additional_information(request.additional_information) + merged["omni_final_stage_id"] = final_stage_id + payload = serialize_additional_information(merged) + return OmniEngineCoreRequest( + request_id=request.request_id, + prompt_token_ids=request.prompt_token_ids, + mm_features=request.mm_features, + sampling_params=request.sampling_params, + pooling_params=request.pooling_params, + arrival_time=request.arrival_time, + lora_request=request.lora_request, + cache_salt=request.cache_salt, + data_parallel_rank=request.data_parallel_rank, + prompt_embeds=request.prompt_embeds, + client_index=request.client_index, + current_wave=request.current_wave, + priority=request.priority, + trace_headers=request.trace_headers, + resumable=request.resumable, + external_req_id=request.external_req_id, + reasoning_ended=request.reasoning_ended, + additional_information=payload, + ) + + def _weak_shutdown_async_omni_engine( orchestrator_thread: threading.Thread | None, request_queue: janus.Queue[dict[str, Any]] | None, @@ -713,6 +748,7 @@ def _build_add_request_message( # to match the key used in Orchestrator.request_states so that # output routing (output.request_id lookup) can find the req_state. request.external_req_id = request_id + request = _apply_omni_final_stage_metadata(request, final_stage_id) # Register with stage 0's output processor. output_prompt_text = prompt_text From 93a3fcf48e801dbcdebf4240e0b966ee574653df Mon Sep 17 00:00:00 2001 From: Alicia <115451386+congw729@users.noreply.github.com> Date: Tue, 7 Apr 2026 09:57:38 +0800 Subject: [PATCH 065/204] [CI] Add doc-only change detection to skip Buildkite CI. 
(#1284) Signed-off-by: Alicia <115451386+congw729@users.noreply.github.com> Signed-off-by: wangyu <410167048@qq.com> Co-authored-by: wangyu <410167048@qq.com> --- .buildkite/pipeline.yml | 19 ++- .../scripts/upload_pipeline_with_skip_ci.sh | 137 ++++++++++++++++++ 2 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 .buildkite/scripts/upload_pipeline_with_skip_ci.sh diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index f265a42f9d..d9a2315953 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,6 +1,21 @@ +# Document 1: Buildkite loads only this block on first parse. The next step resolves docs-only skip-ci +# from git diff, then uploads document 2. When docs-only skip applies, image-build still runs if nightly-test +# / main NIGHTLY so upload-nightly is not skipped together with test-ready/test-merge. +# +# Document 2: appended after `---`; same file, read by upload_pipeline_with_skip_ci.sh (not evaluated as a second pipeline by Buildkite). +steps: + - label: ":github: Resolve skip-ci & upload pipeline" + key: upload-ci-pipeline + commands: + - "bash .buildkite/scripts/upload_pipeline_with_skip_ci.sh" + agents: + queue: "cpu_queue_premerge" + +--- steps: - label: ":docker: Build image" key: image-build + if: __IMAGE_BUILD_IF__ commands: - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - "docker build --progress=plain --file docker/Dockerfile.ci -t vllm-omni-ci ." @@ -13,7 +28,7 @@ steps: - label: "Upload Ready Pipeline" depends_on: image-build key: upload-ready-pipeline - if: build.branch != "main" && build.pull_request.labels includes "ready" + if: __UPLOAD_READY_IF__ commands: - buildkite-agent pipeline upload .buildkite/test-ready.yml agents: @@ -23,7 +38,7 @@ steps: - label: "Upload Merge Pipeline" depends_on: image-build key: upload-merge-pipeline - if: build.branch == "main" && build.env("NIGHTLY") != "1" + if: __UPLOAD_MERGE_IF__ commands: - buildkite-agent pipeline upload .buildkite/test-merge.yml agents: diff --git a/.buildkite/scripts/upload_pipeline_with_skip_ci.sh b/.buildkite/scripts/upload_pipeline_with_skip_ci.sh new file mode 100644 index 0000000000..c00140de46 --- /dev/null +++ b/.buildkite/scripts/upload_pipeline_with_skip_ci.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash +# Evaluate docs-only skip-ci and upload continuation steps from the same `.buildkite/pipeline.yml` +# (YAML document after the first `---`). Buildkite `if` is evaluated at upload time. +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +PIPELINE_YML="${ROOT}/.buildkite/pipeline.yml" + +# Prints a single digit to stdout: 1 = skip image CI, 0 = run. Logs go to stderr. +is_docs_only_change() { + local file_path + local has_any=0 + + while IFS= read -r file_path; do + [[ -z "${file_path}" ]] && continue + has_any=1 + + if [[ "${file_path}" == docs/* ]]; then + continue + fi + if [[ "${file_path}" == *.md ]]; then + continue + fi + if [[ "${file_path}" == "mkdocs.yaml" ]]; then + continue + fi + return 1 + done + + [[ "${has_any}" -eq 1 ]] +} + +resolve_skip_ci() { + local is_pr_build=0 + local files + local base_branch base_ref + + if [[ "${BUILDKITE_PULL_REQUEST:-false}" != "false" && -n "${BUILDKITE_PULL_REQUEST:-}" ]]; then + is_pr_build=1 + fi + + if [[ "${is_pr_build}" -eq 1 ]]; then + base_branch="${BUILDKITE_PULL_REQUEST_BASE_BRANCH:-main}" + if ! 
git rev-parse --verify "origin/${base_branch}" >/dev/null 2>&1; then + echo "resolve_skip_ci: origin/${base_branch} not found locally; trying fetch" >&2 + git fetch --depth=200 origin "${base_branch}" >/dev/null 2>&1 || true + fi + + base_ref="" + if git rev-parse --verify "origin/${base_branch}" >/dev/null 2>&1; then + base_ref="origin/${base_branch}" + elif git rev-parse --verify "${base_branch}" >/dev/null 2>&1; then + base_ref="${base_branch}" + else + echo "resolve_skip_ci: cannot resolve PR base ${base_branch}; skip-ci=0" >&2 + echo -n 0 + return 0 + fi + + if ! files="$(git diff --name-only "${base_ref}...${BUILDKITE_COMMIT}" 2>/dev/null)"; then + echo "resolve_skip_ci: failed to compute PR changed files; skip-ci=0" >&2 + echo -n 0 + return 0 + fi + elif [[ "${BUILDKITE_BRANCH:-}" == "main" ]]; then + if ! git rev-parse --verify "${BUILDKITE_COMMIT}^" >/dev/null 2>&1; then + echo "resolve_skip_ci: commit has no parent on main; skip-ci=0" >&2 + echo -n 0 + return 0 + fi + if ! files="$(git diff --name-only "${BUILDKITE_COMMIT}^..${BUILDKITE_COMMIT}" 2>/dev/null)"; then + echo "resolve_skip_ci: failed to compute main changed files; skip-ci=0" >&2 + echo -n 0 + return 0 + fi + else + echo "resolve_skip_ci: not PR/main build; skip-ci=0" >&2 + echo -n 0 + return 0 + fi + + if is_docs_only_change <<< "${files}"; then + echo "resolve_skip_ci: docs-only change detected; skip-ci=1" >&2 + echo -n 1 + return 0 + fi + + echo "resolve_skip_ci: non-doc changes detected; skip-ci=0" >&2 + echo -n 0 +} + +SKIP_CI="$(resolve_skip_ci)" + +if [[ ! -f "${PIPELINE_YML}" ]]; then + echo "upload_pipeline_with_skip_ci: missing ${PIPELINE_YML}" >&2 + exit 1 +fi + +export ROOT SKIP_CI PIPELINE_YML +python3 <<'PY' | buildkite-agent pipeline upload +import os +import pathlib + +path = pathlib.Path(os.environ["PIPELINE_YML"]) +text = path.read_text(encoding="utf-8") +sep = "\n---\n" +if sep not in text: + raise SystemExit( + "upload_pipeline_with_skip_ci: .buildkite/pipeline.yml must contain a '\\n---\\n' separator " + "(document 1 = bootstrap, document 2 = uploaded steps)" + ) +_, continuation = text.split(sep, 1) + +skip = os.environ.get("SKIP_CI") == "1" +# When docs-only skip-ci: skip default CI image, but still build for L4 nightly (PR label nightly-test or +# main NIGHTLY=1), otherwise upload-nightly (depends_on image-build) would be skipped too. +nightly_only = ( + '(build.pull_request.labels includes "nightly-test") ' + '|| (build.branch == "main" && build.env("NIGHTLY") == "1")' +) +# Placeholder in pipeline.yml is `if: __IMAGE_BUILD_IF__` (valid YAML); replace value only. +if skip: + rep = f"'{nightly_only}'" + ready_rep = "'false'" + merge_rep = "'false'" +else: + rep = "'true'" + ready_rep = "'build.branch != \"main\" && build.pull_request.labels includes \"ready\"'" + merge_rep = "'build.branch == \"main\" && build.env(\"NIGHTLY\") != \"1\"'" +rendered = ( + continuation + .replace("__IMAGE_BUILD_IF__", rep) + .replace("__UPLOAD_READY_IF__", ready_rep) + .replace("__UPLOAD_MERGE_IF__", merge_rep) +) +print(rendered, end="") +PY From 368de99f08deb08d69c598045c92697a229b4df7 Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Tue, 7 Apr 2026 10:47:44 +0800 Subject: [PATCH 066/204] [Test] Test whether CI can be correctly skipped when the committed files only contain documentation. 
(#2534) Signed-off-by: wangyu <410167048@qq.com> --- docs/contributing/ci/CI_5levels.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index 967d0cc6d7..74ae1a38eb 100644 --- a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -271,7 +271,7 @@ Before entering specific testing levels, the project establishes two common spec L1 and L2 level testing form the foundation of the quality assurance system. L1 level testing focuses on verifying the internal logic correctness of code units (e.g., functions, classes), ensuring each independent component behaves as designed. -L2 level testing builds upon L1 by introducing GPU resources and verifying that the end-to-end (E2E) process of the model in basic deployment scenarios is smooth. For example, it uses dummy models to confirm that core interfaces like the inference pipeline, output format, and streaming response work properly. The common goal of these two levels is to provide developers with rapid feedback, discovering and fixing issues early in the development cycle . +L2 level testing builds upon L1 by introducing GPU resources and verifying that the end-to-end (E2E) process of the model in basic deployment scenarios is smooth. For example, it uses dummy models to confirm that core interfaces like the inference pipeline, output format, and streaming response work properly. The common goal of these two levels is to provide developers with rapid feedback, discovering and fixing issues early in the development cycle. From 7a72f34ce481e991bfdda69fbfb868a6fe97f030 Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Tue, 7 Apr 2026 11:15:02 +0800 Subject: [PATCH 067/204] Add supports_float64() to OmniPlatform and clean up MPS (#2488) Signed-off-by: Xiaodong Ye --- vllm_omni/diffusion/models/flux/pipeline_flux_kontext.py | 5 ----- vllm_omni/diffusion/models/flux2/pipeline_flux2.py | 4 ---- vllm_omni/diffusion/models/mammoth_moda2/rope_real.py | 4 +++- vllm_omni/diffusion/models/omnigen2/omnigen2_transformer.py | 3 ++- vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py | 3 ++- vllm_omni/platforms/interface.py | 4 ++++ vllm_omni/platforms/musa/platform.py | 5 +++++ 7 files changed, 16 insertions(+), 12 deletions(-) diff --git a/vllm_omni/diffusion/models/flux/pipeline_flux_kontext.py b/vllm_omni/diffusion/models/flux/pipeline_flux_kontext.py index c7574c1c85..c3bea7dd1c 100644 --- a/vllm_omni/diffusion/models/flux/pipeline_flux_kontext.py +++ b/vllm_omni/diffusion/models/flux/pipeline_flux_kontext.py @@ -681,13 +681,8 @@ def forward( neg_noise_pred = neg_noise_pred[:, : latents.size(1)] noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred) - latents_dtype = latents.dtype latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] - if latents.dtype != latents_dtype: - if torch.backends.mps.is_available(): - latents = latents.to(latents_dtype) - if callback_on_step_end is not None: callback_kwargs = {} for k in callback_on_step_end_tensor_inputs: diff --git a/vllm_omni/diffusion/models/flux2/pipeline_flux2.py b/vllm_omni/diffusion/models/flux2/pipeline_flux2.py index cc25c6b704..00d3288501 100644 --- a/vllm_omni/diffusion/models/flux2/pipeline_flux2.py +++ b/vllm_omni/diffusion/models/flux2/pipeline_flux2.py @@ -1062,12 +1062,8 @@ def forward( noise_pred = noise_pred[:, : latents.size(1) :] # compute the previous noisy sample x_t -> x_t-1 - latents_dtype = latents.dtype latents = 
self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] - if latents.dtype != latents_dtype and torch.backends.mps.is_available(): - latents = latents.to(latents_dtype) - if callback_on_step_end is not None: callback_kwargs = {} for k in callback_on_step_end_tensor_inputs: diff --git a/vllm_omni/diffusion/models/mammoth_moda2/rope_real.py b/vllm_omni/diffusion/models/mammoth_moda2/rope_real.py index d16181a691..64cc432486 100644 --- a/vllm_omni/diffusion/models/mammoth_moda2/rope_real.py +++ b/vllm_omni/diffusion/models/mammoth_moda2/rope_real.py @@ -18,6 +18,8 @@ from einops import repeat from torch import nn +from vllm_omni.platforms import current_omni_platform + def apply_real_rotary_emb(x: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor) -> torch.Tensor: """ @@ -119,7 +121,7 @@ def get_freqs_real( axes_dim: tuple[int, int, int], axes_lens: tuple[int, int, int], theta: int ) -> list[tuple[torch.Tensor, torch.Tensor]]: freqs_real = [] - freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64 + freqs_dtype = torch.float64 if current_omni_platform.supports_float64() else torch.float32 for i, (d, e) in enumerate(zip(axes_dim, axes_lens)): cos_emb, sin_emb = get_1d_rotary_pos_embed_real(d, e, theta=theta, freqs_dtype=freqs_dtype) freqs_real.append((cos_emb, sin_emb)) diff --git a/vllm_omni/diffusion/models/omnigen2/omnigen2_transformer.py b/vllm_omni/diffusion/models/omnigen2/omnigen2_transformer.py index b626ca1d85..9ff681a3c0 100644 --- a/vllm_omni/diffusion/models/omnigen2/omnigen2_transformer.py +++ b/vllm_omni/diffusion/models/omnigen2/omnigen2_transformer.py @@ -19,6 +19,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm_omni.diffusion.attention.layer import Attention +from vllm_omni.platforms import current_omni_platform logger = logging.getLogger(__name__) @@ -411,7 +412,7 @@ def get_freqs_cis( axes_dim: tuple[int, int, int], axes_lens: tuple[int, int, int], theta: int ) -> list[torch.Tensor]: freqs_cis = [] - freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64 + freqs_dtype = torch.float64 if current_omni_platform.supports_float64() else torch.float32 for i, (d, e) in enumerate(zip(axes_dim, axes_lens)): emb = get_1d_rotary_pos_embed(d, e, theta=theta, freqs_dtype=freqs_dtype) freqs_cis.append(emb) diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py index efaab5a8f9..65a2d4390a 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py @@ -29,6 +29,7 @@ SequenceParallelOutput, ) from vllm_omni.diffusion.forward_context import get_forward_context +from vllm_omni.platforms import current_omni_platform logger = init_logger(__name__) @@ -171,7 +172,7 @@ def __init__( # Split dimensions for temporal, height, width h_dim = w_dim = 2 * (attention_head_dim // 6) t_dim = attention_head_dim - h_dim - w_dim - freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64 + freqs_dtype = torch.float64 if current_omni_platform.supports_float64() else torch.float32 freqs_cos = [] freqs_sin = [] diff --git a/vllm_omni/platforms/interface.py b/vllm_omni/platforms/interface.py index 4325851e5f..4df297fa02 100644 --- a/vllm_omni/platforms/interface.py +++ b/vllm_omni/platforms/interface.py @@ -117,6 +117,10 @@ def get_free_memory(cls, device: torch.device | None = None) -> int: def supports_cpu_offload(cls) 
-> bool: return True + @classmethod + def supports_float64(cls) -> bool: + return True + @classmethod def set_device_control_env_var(cls, devices: str | int | None) -> None: import os diff --git a/vllm_omni/platforms/musa/platform.py b/vllm_omni/platforms/musa/platform.py index 932ce62d27..3bd520c61b 100644 --- a/vllm_omni/platforms/musa/platform.py +++ b/vllm_omni/platforms/musa/platform.py @@ -81,6 +81,11 @@ def supports_torch_inductor(cls) -> bool: """MUSA supports torch.compile with inductor backend.""" return True + @classmethod + def supports_float64(cls) -> bool: + """MUSA does not support float64 yet.""" + return False + @classmethod def get_torch_device(cls, local_rank: int | None = None) -> torch.device: """Get the torch device for MUSA platform. From 08e2e1fc8e33cf7c4022829ca9e620fc73551892 Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Mon, 6 Apr 2026 21:24:19 -0600 Subject: [PATCH 068/204] [Bugfix] Fix DataType Handling in Default Diffusion Config (#2530) Signed-off-by: Alex Brooks --- tests/entrypoints/test_utils.py | 18 +++++++ vllm_omni/engine/async_omni_engine.py | 67 ++++++++++++++------------- 2 files changed, 54 insertions(+), 31 deletions(-) diff --git a/tests/entrypoints/test_utils.py b/tests/entrypoints/test_utils.py index 352ed2aad9..6e44fe533c 100644 --- a/tests/entrypoints/test_utils.py +++ b/tests/entrypoints/test_utils.py @@ -5,14 +5,17 @@ from dataclasses import dataclass import pytest +import torch from pytest_mock import MockerFixture from vllm_omni.diffusion.data import OmniDiffusionConfig from vllm_omni.engine.arg_utils import OmniEngineArgs +from vllm_omni.engine.async_omni_engine import AsyncOmniEngine from vllm_omni.entrypoints.utils import ( _convert_dataclasses_to_dict, _filter_dict_like_object, filter_dataclass_kwargs, + load_and_resolve_stage_configs, resolve_model_config_path, ) @@ -304,3 +307,18 @@ def mock_exists(path): assert result is not None assert "glm_image.yaml" in result + + +class TestLoadAndResolveStageConfigs: + def test_load_and_resolve_with_kwargs(self): + """Ensure that dtype survives default stage creation.""" + kwargs = {"dtype": torch.float32} + config_path, stage_configs = load_and_resolve_stage_configs( + model="black-forest-labs/FLUX.2-klein-4B", + stage_configs_path=None, + kwargs=kwargs, + default_stage_cfg_factory=lambda: AsyncOmniEngine._create_default_diffusion_stage_cfg(kwargs), + ) + assert config_path is None + assert len(stage_configs) == 1 + assert "dtype" in stage_configs[0]["engine_args"] diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index c802e62ef2..8cd2d69526 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -920,6 +920,41 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list: num_devices = max(1, int(parallel_config.world_size)) devices = ",".join(str(i) for i in range(num_devices)) + stage_engine_args = { + "max_num_seqs": 1, + "parallel_config": parallel_config, + "model_class_name": kwargs.get("model_class_name", None), + "step_execution": kwargs.get("step_execution", False), + "vae_use_slicing": kwargs.get("vae_use_slicing", False), + "vae_use_tiling": kwargs.get("vae_use_tiling", False), + "cache_backend": cache_backend, + "cache_config": cache_config, + "enable_cache_dit_summary": kwargs.get("enable_cache_dit_summary", False), + "enable_cpu_offload": kwargs.get("enable_cpu_offload", False), + "enable_layerwise_offload": kwargs.get("enable_layerwise_offload", False), + "enforce_eager": 
kwargs.get("enforce_eager", False), + "diffusion_load_format": kwargs.get("diffusion_load_format", "default"), + "custom_pipeline_args": kwargs.get("custom_pipeline_args", None), + "worker_extension_cls": kwargs.get("worker_extension_cls", None), + "enable_sleep_mode": kwargs.get("enable_sleep_mode", False), + "enable_multithread_weight_load": kwargs.get("enable_multithread_weight_load", True), + "num_weight_load_threads": kwargs.get("num_weight_load_threads", 4), + "quantization": kwargs.get("quantization", None), + "enable_diffusion_pipeline_profiler": kwargs.get("enable_diffusion_pipeline_profiler", False), + **( + { + "profiler_config": asdict(kwargs["profiler_config"]) + if hasattr(kwargs["profiler_config"], "__dataclass_fields__") + else kwargs["profiler_config"] + } + if kwargs.get("profiler_config") is not None + else {} + ), + } + # Only set dtype if it was already explicitly passed and normalized + if "dtype" in normalized_kwargs: + stage_engine_args["dtype"] = normalized_kwargs["dtype"] + default_stage_cfg = [ { "stage_id": 0, @@ -928,37 +963,7 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list: "process": True, "devices": devices, }, - "engine_args": { - "max_num_seqs": 1, - "parallel_config": parallel_config, - "model_class_name": kwargs.get("model_class_name", None), - "step_execution": kwargs.get("step_execution", False), - "vae_use_slicing": kwargs.get("vae_use_slicing", False), - "vae_use_tiling": kwargs.get("vae_use_tiling", False), - "cache_backend": cache_backend, - "cache_config": cache_config, - "enable_cache_dit_summary": kwargs.get("enable_cache_dit_summary", False), - "enable_cpu_offload": kwargs.get("enable_cpu_offload", False), - "enable_layerwise_offload": kwargs.get("enable_layerwise_offload", False), - "enforce_eager": kwargs.get("enforce_eager", False), - "diffusion_load_format": kwargs.get("diffusion_load_format", "default"), - "custom_pipeline_args": kwargs.get("custom_pipeline_args", None), - "worker_extension_cls": kwargs.get("worker_extension_cls", None), - "enable_sleep_mode": kwargs.get("enable_sleep_mode", False), - "enable_multithread_weight_load": kwargs.get("enable_multithread_weight_load", True), - "num_weight_load_threads": kwargs.get("num_weight_load_threads", 4), - "quantization": kwargs.get("quantization", None), - "enable_diffusion_pipeline_profiler": kwargs.get("enable_diffusion_pipeline_profiler", False), - **( - { - "profiler_config": asdict(kwargs["profiler_config"]) - if hasattr(kwargs["profiler_config"], "__dataclass_fields__") - else kwargs["profiler_config"] - } - if kwargs.get("profiler_config") is not None - else {} - ), - }, + "engine_args": stage_engine_args, "final_output": True, "final_output_type": "image", } From 0304c975d5ba8fd8409b09fef8d8514933f4caad Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Tue, 7 Apr 2026 11:27:30 +0800 Subject: [PATCH 069/204] [Docs] Add installation guide for Moore Threads (MUSA) GPUs (#2359) Signed-off-by: Xiaodong Ye Co-authored-by: Canlin Guo --- docs/getting_started/installation/README.md | 1 + docs/getting_started/installation/gpu.md | 20 ++++++ .../installation/gpu/musa.inc.md | 65 +++++++++++++++++++ 3 files changed, 86 insertions(+) create mode 100644 docs/getting_started/installation/gpu/musa.inc.md diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index 353fbe1c07..89562c53c5 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -6,4 +6,5 @@ vLLM-Omni 
supports the following hardware platforms: - [NVIDIA CUDA](gpu.md) - [AMD ROCm](gpu.md) - [Intel XPU](gpu.md) + - [MThreads MUSA](gpu.md) - [NPU](npu.md) diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md index 297c366616..d08f134b5d 100644 --- a/docs/getting_started/installation/gpu.md +++ b/docs/getting_started/installation/gpu.md @@ -22,6 +22,10 @@ vLLM-Omni is a Python library that supports the following GPU variants. The libr --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:requirements" +=== "MThreads MUSA" + + --8<-- "docs/getting_started/installation/gpu/musa.inc.md:requirements" + ## Set up using Python ### Create a new Python environment @@ -44,6 +48,10 @@ Note: Pre-built wheels are currently available for vLLM-Omni 0.11.0rc1, 0.12.0rc --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-wheels" +=== "MThreads MUSA" + + --8<-- "docs/getting_started/installation/gpu/musa.inc.md:pre-built-wheels" + [](){ #build-from-source } ### Build wheel from source @@ -60,6 +68,10 @@ Note: Pre-built wheels are currently available for vLLM-Omni 0.11.0rc1, 0.12.0rc --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-wheel-from-source" +=== "MThreads MUSA" + + --8<-- "docs/getting_started/installation/gpu/musa.inc.md:build-wheel-from-source" + ## Set up using Docker ### Pre-built images @@ -76,6 +88,10 @@ Note: Pre-built wheels are currently available for vLLM-Omni 0.11.0rc1, 0.12.0rc --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-images" +=== "MThreads MUSA" + + --8<-- "docs/getting_started/installation/gpu/musa.inc.md:pre-built-images" + ### Build your own docker image === "AMD ROCm" @@ -85,3 +101,7 @@ Note: Pre-built wheels are currently available for vLLM-Omni 0.11.0rc1, 0.12.0rc === "Intel XPU" --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-docker" + +=== "MThreads MUSA" + + --8<-- "docs/getting_started/installation/gpu/musa.inc.md:build-docker" diff --git a/docs/getting_started/installation/gpu/musa.inc.md b/docs/getting_started/installation/gpu/musa.inc.md new file mode 100644 index 0000000000..a7cbc848f5 --- /dev/null +++ b/docs/getting_started/installation/gpu/musa.inc.md @@ -0,0 +1,65 @@ +# --8<-- [start:requirements] + +- GPU: Moore Threads GPU with MUSA SDK installed (validated on MTT S5000) + +# --8<-- [end:requirements] +# --8<-- [start:set-up-using-python] + +vLLM-Omni for MUSA requires building from source. Pre-built wheels are not currently available. + +!!! note + MUSA platform requires vLLM-MUSA to be installed first. + +# --8<-- [start:pre-built-wheels] + +# --8<-- [end:pre-built-wheels] + +# --8<-- [start:build-wheel-from-source] + +#### Prerequisites + +- **MUSA SDK**: Download from [MUSA SDK Download](https://developer.mthreads.com/sdk/download/musa) +- **torchada**: CUDA→MUSA compatibility layer for PyTorch (`pip install torchada`) +- **mthreads-ml-py**: MTML Python bindings (`pip install mthreads-ml-py`) +- **MATE**: MUSA AI Tensor Engine ([GitHub](https://github.com/MooreThreads/mate)) + +#### Installation of vLLM-MUSA + +```bash +git clone https://github.com/MooreThreads/vllm-musa.git +cd vllm-musa +git checkout v0.18.0-dev +pip install . --no-build-isolation -v +``` + +#### Installation of vLLM-Omni + +```bash +git clone https://github.com/vllm-project/vllm-omni.git +cd vllm-omni +VLLM_OMNI_TARGET_DEVICE=musa pip install -e . 
--no-build-isolation +``` + +For Gradio demos: + +```bash +pip install -e '.[demo]' --no-build-isolation +``` + +#### Environment Variables + +```bash +export MUSA_VISIBLE_DEVICES=0,1 +export VLLM_WORKER_MULTIPROC_METHOD=spawn +export VLLM_MUSA_CUSTOM_OP_USE_NATIVE=false +``` + +# --8<-- [end:build-wheel-from-source] + +# --8<-- [start:build-docker] + +# --8<-- [end:build-docker] + +# --8<-- [start:pre-built-images] + +# --8<-- [end:pre-built-images] From 5d4c9ec4fa2494682269b136790f2889719143c9 Mon Sep 17 00:00:00 2001 From: erfgss <97771661+erfgss@users.noreply.github.com> Date: Tue, 7 Apr 2026 12:06:58 +0800 Subject: [PATCH 070/204] [bugfix]bugfix dreamid (#2125) Signed-off-by: Chen Yang <2082464740@qq.com> Signed-off-by: erfgss <97771661+erfgss@users.noreply.github.com> --- .../x_to_video_audio/download_dreamid_omni.py | 7 ++++++- .../offline_inference/x_to_video_audio/x_to_video_audio.md | 4 +++- .../offline_inference/x_to_video_audio/x_to_video_audio.py | 4 ++-- .../diffusion/models/dreamid_omni/pipeline_dreamid_omni.py | 2 +- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/examples/offline_inference/x_to_video_audio/download_dreamid_omni.py b/examples/offline_inference/x_to_video_audio/download_dreamid_omni.py index 0dbf402e9e..2f66d5f778 100644 --- a/examples/offline_inference/x_to_video_audio/download_dreamid_omni.py +++ b/examples/offline_inference/x_to_video_audio/download_dreamid_omni.py @@ -82,7 +82,6 @@ def main(output_dir: str): data = { "_class_name": "DreamIDOmniPipeline", - "fusion": "DreamID-Omni/dreamid_omni.safetensors", } with open(os.path.join(output_dir, "model_index.json"), "w", encoding="utf-8") as f: @@ -90,6 +89,12 @@ def main(output_dir: str): print(f"model_index.json created at {os.path.join(output_dir, 'model_index.json')}") + transformer_dir = os.path.join(output_dir, "transformer") + os.makedirs(transformer_dir, exist_ok=True) + with open(os.path.join(transformer_dir, "config.json"), "w", encoding="utf-8") as f: + json.dump({"fusion": "DreamID-Omni/dreamid_omni.safetensors"}, f) + print(f"transformer/config.json created at {os.path.join(transformer_dir, 'config.json')}") + # now we download the dependency code download_dependency() diff --git a/examples/offline_inference/x_to_video_audio/x_to_video_audio.md b/examples/offline_inference/x_to_video_audio/x_to_video_audio.md index 59b993a728..4b5188f41b 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.md +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.md @@ -24,7 +24,9 @@ dreamid_omni/ │ ├── models_t5_umt5-xxl-enc-bf16.pth │ ├── Wan2.2_VAE.pth │ -├── model_index.json # create by download_dreamid_omni.py +├── model_index.json +└── transformer/ + └── config.json # create by download_dreamid_omni.py ``` ### Run the Inference diff --git a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py index 17d0f06c3c..e0424add69 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py @@ -132,8 +132,8 @@ def main() -> None: if not outputs: raise RuntimeError("No output returned from DreamID-Omni.") output = outputs[0].request_output - generated_video = output[0].images[0][0] - generated_audio = output[0].images[0][1] + generated_video = output.images[0][0] + generated_audio = output.images[0][1] try: from dreamid_omni.utils.io_utils import save_video except Exception as e: diff --git 
a/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py b/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py index f8074fee22..e22765f80e 100644 --- a/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py +++ b/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py @@ -116,7 +116,7 @@ def __init__( ## load audio/video model config Fusion_model = FusionModel(VIDEO_CONFIG, AUDIO_CONFIG) - checkpoint_path = self.od_config.model_config.get("fusion", None) + checkpoint_path = self.od_config.tf_model_config.get("fusion", None) assert checkpoint_path is not None, "fusion checkpoint path is None" load_fusion_checkpoint(Fusion_model, checkpoint_path=os.path.join(model, checkpoint_path)) self.model = Fusion_model From badbe8eb0fe1d3b27938a820b9f8b523a16fc695 Mon Sep 17 00:00:00 2001 From: Bvicii <98971614+scyyh11@users.noreply.github.com> Date: Mon, 6 Apr 2026 21:17:04 -0700 Subject: [PATCH 071/204] [RFC] Offload blocking TTS/speech ops to thread pool to unblock event loop (#2511) Signed-off-by: Bvicii --- .../openai_api/test_serving_speech.py | 125 +++++++++++++++++- vllm_omni/entrypoints/openai/api_server.py | 1 + .../entrypoints/openai/serving_speech.py | 25 +++- 3 files changed, 143 insertions(+), 8 deletions(-) diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index b140b7a046..da15ec8f0e 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -658,11 +658,13 @@ def speech_server(self, mocker: MockerFixture): mock_engine_client.tts_max_instructions_length = None mock_models = mocker.MagicMock() mock_models.is_base_model.return_value = True - return OmniOpenAIServingSpeech( + server = OmniOpenAIServingSpeech( engine_client=mock_engine_client, models=mock_models, request_logger=mocker.MagicMock(), ) + yield server + server.shutdown() def test_is_tts_detection_no_stage(self, speech_server): """Test TTS model detection when no TTS stage exists.""" @@ -1639,11 +1641,13 @@ def fish_speech_server(mocker: MockerFixture): mock_models = mocker.MagicMock() mock_models.is_base_model.return_value = True - return OmniOpenAIServingSpeech( + server = OmniOpenAIServingSpeech( engine_client=mock_engine_client, models=mock_models, request_logger=mocker.MagicMock(), ) + yield server + server.shutdown() class TestFishSpeechServing: @@ -1717,7 +1721,7 @@ def test_build_fish_prompt_rejects_unsafe_control_tokens(self, fish_speech_serve fish_speech_server._build_fish_speech_prompt(request) def test_prepare_speech_generation_overrides_fish_default_max_tokens(self, fish_speech_server): - fish_speech_server._build_fish_speech_prompt = MagicMock( + fish_speech_server._build_fish_speech_prompt_async = AsyncMock( return_value={ "prompt_token_ids": [1, 2, 3], "additional_information": {}, @@ -1730,13 +1734,14 @@ def test_prepare_speech_generation_overrides_fish_default_max_tokens(self, fish_ assert request_id.startswith("speech-") assert generator == "generator" + fish_speech_server._build_fish_speech_prompt_async.assert_awaited_once() fish_speech_server.engine_client.generate.assert_called_once() sampling_params_list = fish_speech_server.engine_client.generate.call_args.kwargs["sampling_params_list"] assert sampling_params_list[0].max_tokens == 4096 assert fish_speech_server.engine_client.default_sampling_params_list[0].max_tokens == 2048 def test_prepare_speech_generation_uses_stage_default_max_tokens(self, fish_speech_server): - 
fish_speech_server._build_fish_speech_prompt = MagicMock( + fish_speech_server._build_fish_speech_prompt_async = AsyncMock( return_value={ "prompt_token_ids": [1, 2, 3], "additional_information": {}, @@ -1985,3 +1990,115 @@ def test_prepare_speech_generation_cosyvoice3(self, cosyvoice3_server): assert generator == "generator" assert tts_params == {} cosyvoice3_server._build_cosyvoice3_prompt.assert_awaited_once() + + +class TestTTSAsyncOffloading: + """Tests for event-loop-safe offloading of blocking TTS operations.""" + + def test_build_voxtral_prompt_is_sync(self): + """_build_voxtral_prompt should be a regular function, not a coroutine.""" + assert not asyncio.iscoroutinefunction(OmniOpenAIServingSpeech._build_voxtral_prompt) + + @pytest.fixture + def voxtral_server(self, mocker: MockerFixture): + mocker.patch.object(OmniOpenAIServingSpeech, "_load_supported_speakers", return_value=set()) + mocker.patch.object(OmniOpenAIServingSpeech, "_load_codec_frame_rate", return_value=None) + mock_engine_client = mocker.MagicMock() + mock_engine_client.errored = False + mock_engine_client.model_config = mocker.MagicMock(model="mistralai/Voxtral") + mock_engine_client.default_sampling_params_list = [SimpleNamespace(max_tokens=2048)] + mock_engine_client.tts_batch_max_items = 32 + mock_engine_client.generate = mocker.MagicMock(return_value="generator") + mock_engine_client.stage_configs = [ + SimpleNamespace( + engine_args=SimpleNamespace(model_stage="audio_generation"), + tts_args={}, + ) + ] + mock_models = mocker.MagicMock() + mock_models.is_base_model.return_value = True + server = OmniOpenAIServingSpeech( + engine_client=mock_engine_client, + models=mock_models, + request_logger=mocker.MagicMock(), + ) + yield server + server.shutdown() + + @pytest.fixture + def qwen3_tts_server(self, mocker: MockerFixture): + mocker.patch.object(OmniOpenAIServingSpeech, "_load_supported_speakers", return_value=set()) + mocker.patch.object(OmniOpenAIServingSpeech, "_load_codec_frame_rate", return_value=None) + mock_engine_client = mocker.MagicMock() + mock_engine_client.errored = False + mock_engine_client.model_config = mocker.MagicMock(model="Qwen/Qwen3-TTS", hf_config=mocker.MagicMock()) + mock_engine_client.default_sampling_params_list = [SimpleNamespace(max_tokens=2048)] + mock_engine_client.tts_batch_max_items = 32 + mock_engine_client.generate = mocker.MagicMock(return_value="generator") + mock_engine_client.tts_max_instructions_length = None + mock_engine_client.stage_configs = [ + SimpleNamespace( + engine_args=SimpleNamespace(model_stage="qwen3_tts"), + tts_args={}, + ) + ] + mock_models = mocker.MagicMock() + mock_models.is_base_model.return_value = True + server = OmniOpenAIServingSpeech( + engine_client=mock_engine_client, + models=mock_models, + request_logger=mocker.MagicMock(), + ) + yield server + server.shutdown() + + def test_prepare_speech_generation_awaits_voxtral_async(self, voxtral_server): + """Voxtral path in _prepare_speech_generation should call the async wrapper.""" + voxtral_server._build_voxtral_prompt_async = AsyncMock( + return_value={ + "prompt_token_ids": [1, 2, 3], + "additional_information": {"voice": ["test"]}, + } + ) + request = OpenAICreateSpeechRequest(input="hello", voice="test") + asyncio.run(voxtral_server._prepare_speech_generation(request)) + voxtral_server._build_voxtral_prompt_async.assert_awaited_once() + + def test_prepare_speech_generation_awaits_qwen3_tts_async(self, qwen3_tts_server): + """Qwen3 TTS path should call _estimate_prompt_len_async.""" + 
qwen3_tts_server._validate_tts_request = MagicMock(return_value=None) + qwen3_tts_server._build_tts_params = MagicMock( + return_value={"text": ["hello"], "task_type": ["CustomVoice"], "speaker": ["Vivian"]} + ) + qwen3_tts_server._estimate_prompt_len_async = AsyncMock(return_value=512) + request = OpenAICreateSpeechRequest(input="hello") + asyncio.run(qwen3_tts_server._prepare_speech_generation(request)) + qwen3_tts_server._build_tts_params.assert_called_once() + qwen3_tts_server._estimate_prompt_len_async.assert_awaited_once() + + def test_shutdown_is_idempotent(self, mocker: MockerFixture): + """Calling shutdown() twice should not raise.""" + mocker.patch.object(OmniOpenAIServingSpeech, "_load_supported_speakers", return_value=set()) + mocker.patch.object(OmniOpenAIServingSpeech, "_load_codec_frame_rate", return_value=None) + mock_engine_client = mocker.MagicMock() + mock_engine_client.errored = False + mock_engine_client.stage_configs = [] + mock_engine_client.tts_max_instructions_length = None + mock_models = mocker.MagicMock() + mock_models.is_base_model.return_value = True + server = OmniOpenAIServingSpeech( + engine_client=mock_engine_client, + models=mock_models, + request_logger=mocker.MagicMock(), + ) + assert server._tts_executor is not None + server.shutdown() + assert server._tts_executor is None + server.shutdown() # Should not raise + assert server._tts_executor is None + + def test_diffusion_instance_shutdown_safe(self): + """Diffusion instances (created via for_diffusion) should have safe shutdown.""" + server = OmniOpenAIServingSpeech.for_diffusion(diffusion_engine=MagicMock(), model_name="test-model") + assert server._tts_executor is None + server.shutdown() # Should not raise diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 627174b20e..d15dc90fe5 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -353,6 +353,7 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None, try: await shutdown_task finally: + app.state.openai_serving_speech.shutdown() sock.close() diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 0a9e11b771..10c5fdacc5 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -8,6 +8,7 @@ import struct import tempfile import time +from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import Any @@ -22,6 +23,7 @@ from vllm.logger import init_logger from vllm.multimodal.media import MediaConnector from vllm.utils import random_uuid +from vllm.utils.async_utils import make_async from vllm_omni.entrypoints.openai.audio_utils_mixin import AudioMixin from vllm_omni.entrypoints.openai.protocol.audio import ( @@ -153,6 +155,7 @@ def _validate_path_within_directory(file_path: Path, directory: Path) -> bool: class OmniOpenAIServingSpeech(OpenAIServing, AudioMixin): _diffusion_mode: bool = False + _tts_executor: ThreadPoolExecutor | None = None @classmethod def for_diffusion( @@ -219,6 +222,14 @@ def __init__(self, *args, **kwargs): # Load speech tokenizer codec parameters for prompt length estimation self._codec_frame_rate: float | None = self._load_codec_frame_rate() + # Shared thread pool executor for blocking TTS preprocessing + # operations. max_workers=1 serializes tokenizer access to avoid + # Rust RefCell "Already borrowed" errors from concurrent use. 
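+        # make_async (imported above from vllm.utils.async_utils) wraps a blocking
+        # callable so each call is submitted to the given executor and awaited,
+        # keeping the event loop responsive while prompt construction runs off-thread.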
+ self._tts_executor = ThreadPoolExecutor(max_workers=1) + self._build_voxtral_prompt_async = make_async(self._build_voxtral_prompt, executor=self._tts_executor) + self._build_fish_speech_prompt_async = make_async(self._build_fish_speech_prompt, executor=self._tts_executor) + self._estimate_prompt_len_async = make_async(self._estimate_prompt_len, executor=self._tts_executor) + def _load_codec_frame_rate(self) -> float | None: """Load codec frame rate from speech tokenizer config for prompt length estimation.""" try: @@ -252,6 +263,12 @@ def _load_codec_frame_rate(self) -> float | None: pass return None + def shutdown(self) -> None: + """Shut down the TTS thread pool executor.""" + if self._tts_executor is not None: + self._tts_executor.shutdown(wait=False, cancel_futures=True) + self._tts_executor = None + def _find_tts_stage(self): """Find and return the TTS stage config, or None if not found.""" for stage in self.engine_client.stage_configs: @@ -1149,7 +1166,7 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any # ---- Voxtral TTS helpers ---- - async def _build_voxtral_prompt(self, request: OpenAICreateSpeechRequest) -> dict[str, Any]: + def _build_voxtral_prompt(self, request: OpenAICreateSpeechRequest) -> dict[str, Any]: """Build Voxtral TTS engine prompt from shared TTS parameters.""" from mistral_common.protocol.speech.request import SpeechRequest @@ -1289,7 +1306,7 @@ async def _prepare_speech_generation( if request.ref_audio is not None: wav_list, sr = await self._resolve_ref_audio(request.ref_audio) ref_audio_data = (wav_list, sr) - prompt = self._build_fish_speech_prompt(request, ref_audio_data=ref_audio_data) + prompt = await self._build_fish_speech_prompt_async(request, ref_audio_data=ref_audio_data) tts_params = {} elif self._tts_model_type == "omnivoice": tts_params = {} @@ -1300,7 +1317,7 @@ async def _prepare_speech_generation( raise ValueError(validation_error) if self._tts_model_type == "voxtral_tts": - prompt = await self._build_voxtral_prompt(request) + prompt = await self._build_voxtral_prompt_async(request) tts_params = {} elif self._tts_model_type == "cosyvoice3": prompt = await self._build_cosyvoice3_prompt(request) @@ -1317,7 +1334,7 @@ async def _prepare_speech_generation( wav_list, sr = await self._resolve_ref_audio(ref_audio_source) tts_params["ref_audio"] = [[wav_list, sr]] - ph_len = self._estimate_prompt_len(tts_params) + ph_len = await self._estimate_prompt_len_async(tts_params) prompt = {"prompt_token_ids": [1] * ph_len, "additional_information": tts_params} else: tts_params = {} From 0998b30cbef8e8279bae373f4bd1a5ab2b22e5c7 Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Tue, 7 Apr 2026 15:32:51 +0800 Subject: [PATCH 072/204] [Bugfix] To resolve timeout error, update nightly test commands for diffusion model (#2532) --- .buildkite/test-nightly.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 9dc8885061..15a7bba55d 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -72,7 +72,7 @@ steps: if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion" -m "advanced_model and diffusion and H100" --run-level "advanced_model" + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not 
test_wan22_expansion and not test_wan_2_1_vace_expansion" -m "advanced_model and diffusion and H100" --run-level "advanced_model" agents: queue: "mithril-h100-pool" plugins: @@ -107,13 +107,13 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - - label: ":full_moon: Diffusion Model (Wan2.2) Test with H100" + - label: ":full_moon: Diffusion Model (Wan) Test with H100" timeout_in_minutes: 90 depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py -m "advanced_model" --run-level "advanced_model" + - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py -m "advanced_model" --run-level "advanced_model" agents: queue: "mithril-h100-pool" plugins: From 9584dd6e69eeb58fbe1257d766ebaf04de1a995c Mon Sep 17 00:00:00 2001 From: skf <54565339+skf-1999@users.noreply.github.com> Date: Tue, 7 Apr 2026 15:37:17 +0800 Subject: [PATCH 073/204] [HunyuanImage3] Align system_prompt support with official implementation (#2270) Signed-off-by: skf1999 <13234016272@163.com> --- .../offline_inference/text_to_image/README.md | 3 + .../text_to_image/text_to_image.py | 17 +- .../online_serving/text_to_image/README.md | 2 + .../text_to_image/openai_chat_client.py | 27 +- .../test_hunyuanimage3_text2img.py | 347 ++++++++++++++++++ .../pipeline_hunyuan_image_3.py | 8 +- .../models/hunyuan_image_3/system_prompt.py | 215 +++++++++++ vllm_omni/entrypoints/openai/api_server.py | 8 +- .../entrypoints/openai/protocol/images.py | 18 + 9 files changed, 638 insertions(+), 7 deletions(-) create mode 100644 tests/e2e/offline_inference/test_hunyuanimage3_text2img.py create mode 100644 vllm_omni/diffusion/models/hunyuan_image_3/system_prompt.py diff --git a/examples/offline_inference/text_to_image/README.md b/examples/offline_inference/text_to_image/README.md index 235b710a68..4796a17692 100644 --- a/examples/offline_inference/text_to_image/README.md +++ b/examples/offline_inference/text_to_image/README.md @@ -33,6 +33,7 @@ This folder provides several entrypoints for experimenting with text-to-image di | `black-forest-labs/FLUX.2-klein-4B` | 1024 x 1024 | 72.7 | 14.9 | | `black-forest-labs/FLUX.2-klein-9B` | 1024 x 1024 | 37.1 | 32.3 | | `black-forest-labs/FLUX.2-dev` | 1024 x 1024 | 65.7 | >80 (CPU offload required) | +| `HunyuanImage-3.0` | 1024 x 1024 | 80.0 (TP≥3) | 160 | !!! info *Peak VRAM: based on basic single-card usage, batch size =1, without any acceleration/optimization features. FLUX.2-dev requires `--enable-cpu-offload` on a single 80 GiB GPU. @@ -90,6 +91,8 @@ python text_to_image.py \ | `--enable-cpu-offload` | flag | off | Enable CPU offloading for diffusion models | | `--lora-path` | str | — | Path to PEFT LoRA adapter folder | | `--lora-scale` | float | `1.0` | Scale factor for LoRA weights | +| `--use-system-prompt` | str | `None` | System prompt preset: `en_unified`, `en_vanilla`, `en_recaption`, `en_think_recaption`, `dynamic`, `None`, or custom text. Recommended: `en_unified`. Only for HunyuanImage-3.0.| +| `--system-prompt` | str | `None` | Custom system prompt text. Only used when `--use-system-prompt` is set to `custom`. 
Only for HunyuanImage-3.0.| **NextStep-1.1 specific arguments:** diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py index 927b0f0b08..42e44abb89 100644 --- a/examples/offline_inference/text_to_image/text_to_image.py +++ b/examples/offline_inference/text_to_image/text_to_image.py @@ -242,6 +242,19 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Enable logging of diffusion pipeline stats.", ) + parser.add_argument( + "--use-system-prompt", + type=str, + default=None, + choices=["None", "dynamic", "en_vanilla", "en_recaption", "en_think_recaption", "en_unified", "custom"], + help="System prompt preset for generation. Recommended: en_unified.", + ) + parser.add_argument( + "--system-prompt", + type=str, + default=None, + help=("Custom system prompt. Used when --use-system-prompt is custom. "), + ) return parser.parse_args() @@ -382,13 +395,13 @@ def main(): ) generation_start = time.perf_counter() - extra_args = { "timesteps_shift": args.timesteps_shift, "cfg_schedule": args.cfg_schedule, "use_norm": args.use_norm, + "use_system_prompt": args.use_system_prompt, + "system_prompt": args.system_prompt, } - if lora_request: extra_args["lora_request"] = lora_request extra_args["lora_scale"] = args.lora_scale diff --git a/examples/online_serving/text_to_image/README.md b/examples/online_serving/text_to_image/README.md index 87b6a56438..17d377ea3e 100644 --- a/examples/online_serving/text_to_image/README.md +++ b/examples/online_serving/text_to_image/README.md @@ -231,6 +231,8 @@ count, use `size` and `n` rather than `height`, `width`, or | `seed` | int | None | Random seed (reproducible) | | `negative_prompt` | str | None | Negative prompt | | `num_outputs_per_prompt` | int | 1 | Number of images to generate | +| `use_system_prompt` | str | None | System prompt preset: `en_unified`, `en_vanilla`, `en_recaption`, `en_think_recaption`, `dynamic`, `None`, or custom text string. Only for HunyuanImage-3.0. | +| `system_prompt` | str | None | Custom system prompt text. Only used when `use_system_prompt` is set to `custom`. Only for HunyuanImage-3.0. | ## Response Format diff --git a/examples/online_serving/text_to_image/openai_chat_client.py b/examples/online_serving/text_to_image/openai_chat_client.py index 828827aba2..f3c43086a1 100644 --- a/examples/online_serving/text_to_image/openai_chat_client.py +++ b/examples/online_serving/text_to_image/openai_chat_client.py @@ -28,6 +28,8 @@ def generate_image( lora_name: str | None = None, lora_scale: float | None = None, lora_int_id: int | None = None, + use_system_prompt: str | None = None, + system_prompt: str | None = None, ) -> bytes | None: """Generate an image using the images generation API. @@ -45,6 +47,8 @@ def generate_image( lora_name: LoRA name (optional, defaults to path stem) lora_scale: LoRA scale factor (default: 1.0) lora_int_id: LoRA integer ID (optional, derived from path if not provided) + use_system_prompt: System prompt for generation. + system_prompt: Custom system prompt. Returns: Image bytes or None if failed @@ -70,7 +74,10 @@ def generate_image( payload["negative_prompt"] = negative_prompt if seed is not None: payload["seed"] = seed - + if use_system_prompt is not None: + payload["use_system_prompt"] = use_system_prompt + if system_prompt is not None: + payload["system_prompt"] = system_prompt # Add LoRA if provided if lora_path: lora_body: dict = { @@ -128,9 +135,21 @@ def main(): default=None, help="LoRA integer id (cache key). 
If omitted, the server derives a stable id from lora_path.", ) - + parser.add_argument( + "--use-system-prompt", + type=str, + default=None, + help=( + "System prompt for generation. Use predefined types: 'en_unified', 'en_vanilla', 'en_recaption', 'en_think_recaption', 'dynamic', or 'None'; Or provide custom text string directly. Recommended en_unified. " + ), + ) + parser.add_argument( + "--system-prompt", + type=str, + default=None, + help=("Custom system prompt. Used when --use-system-prompt is custom. "), + ) args = parser.parse_args() - print(f"Generating image for: {args.prompt}") image_bytes = generate_image( @@ -146,6 +165,8 @@ def main(): lora_name=args.lora_name, lora_scale=args.lora_scale if args.lora_path else None, lora_int_id=args.lora_int_id if args.lora_path else None, + use_system_prompt=args.use_system_prompt, + system_prompt=args.system_prompt, ) if image_bytes: diff --git a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py new file mode 100644 index 0000000000..5522f33eaa --- /dev/null +++ b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py @@ -0,0 +1,347 @@ +# ruff: noqa: E501 +from collections.abc import Generator +from pathlib import Path + +import pytest +import torch +import torch.nn.functional as F +from PIL import Image +from transformers import CLIPModel, CLIPProcessor + +from vllm_omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.platforms import current_omni_platform + +PROMPT = "A brown and white dog is running on the grass" +MODEL_NAME = "tencent/HunyuanImage-3.0" +LOCAL_CLIP_PATH = "openai/clip-vit-base-patch32" +REPO_ROOT = Path(__file__).resolve().parents[3] +STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image_3_moe.yaml" + +pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion] + +# System prompt type. Options: None, dynamic, en_vanilla, en_recaption, en_think_recaption, en_unified +# Below are the CLIP embedding tensors from the official HunyuanImage model (seed=1234, prompt: "A brown and white dog is running on the grass"). +# SEED_1234 denotes the output without system prompt, while the remaining entries correspond to outputs generated with different system prompts. 
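+# Each reference tensor stores the 512 components of a CLIP ViT-B/32 image embedding;
+# compare_semantic() below re-normalizes both sides and takes their dot product, i.e.
+# the cosine similarity between a freshly generated image and the official reference.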
+# fmt: off +SEED_1234 = torch.tensor( + [ + 0.027797, 0.028964, -0.005051, 0.001059, 0.017021, -0.034029, 0.021989, 0.033318, -0.000308, 0.016179, 0.010504, -0.034201, 0.050230, -0.021170, 0.083530, -0.003621, + 0.040758, 0.039913, 0.044305, -0.019285, -0.058387, -0.001099, 0.042782, -0.036136, -0.014955, 0.002147, 0.009439, 0.012943, -0.028732, -0.018349, 0.002861, 0.013019, + 0.014362, -0.038833, 0.029413, 0.020724, 0.002714, 0.010416, -0.020527, 0.050266, -0.081026, -0.006814, -0.007457, -0.032333, 0.008417, -0.122455, -0.006085, -0.025610, + 0.012614, 0.025817, -0.005419, 0.038657, 0.000789, 0.067111, 0.002818, 0.028696, 0.047305, -0.009993, -0.019508, 0.038604, 0.099657, 0.026728, 0.012361, 0.013626, + 0.023164, -0.037186, 0.007535, 0.054645, -0.009012, -0.019383, -0.005234, -0.018715, -0.000346, 0.051317, -0.028744, 0.029933, -0.006382, -0.018414, -0.033906, -0.028892, + -0.015301, -0.004276, 0.014626, -0.008505, 0.013717, -0.027323, -0.001332, -0.040227, 0.047021, -0.019082, -0.037260, -0.029780, -0.594026, 0.016573, -0.010523, 0.042616, + -0.013136, 0.030540, -0.151685, -0.005367, 0.016209, -0.034183, 0.009852, 0.038452, 0.005494, -0.017887, -0.007167, 0.017262, -0.038980, 0.011995, 0.021952, -0.031660, + 0.020507, -0.035880, 0.035183, -0.026975, -0.050788, -0.002553, 0.037774, -0.020082, -0.015403, 0.045022, 0.072167, -0.029237, 0.003895, -0.051250, 0.008581, 0.023545, + -0.026827, 0.020895, 0.041780, -0.040766, -0.008146, 0.080630, 0.000404, 0.032003, -0.005279, -0.090707, -0.013813, 0.010204, -0.001513, 0.016394, -0.001321, 0.020535, + -0.038645, 0.024858, 0.024378, 0.018717, -0.056314, 0.024402, 0.018694, 0.029009, -0.008502, -0.014694, -0.028345, 0.005202, 0.046116, -0.032166, -0.030706, -0.038738, + -0.031356, -0.009683, 0.040069, 0.001596, -0.012621, 0.018590, -0.024138, 0.035330, 0.011546, 0.015791, -0.026932, 0.004531, 0.022455, -0.012871, 0.013915, -0.009567, + -0.010976, 0.013497, 0.042590, 0.002072, -0.052718, -0.045494, 0.013036, -0.005403, -0.005947, -0.003437, 0.016653, -0.016805, -0.040291, 0.007927, 0.001296, -0.008319, + 0.021514, -0.001452, -0.121998, 0.015396, -0.022594, -0.006977, -0.040108, -0.035550, -0.021872, -0.014721, 0.019799, 0.036556, 0.015072, -0.057988, -0.011684, -0.045220, + -0.026295, 0.052647, 0.013741, -0.013428, 0.061794, 0.021431, -0.011316, -0.009963, 0.008198, 0.027746, 0.074219, -0.019499, 0.042673, 0.016028, 0.007214, -0.010650, + -0.019682, 0.001902, 0.038867, -0.007333, 0.031749, 0.004391, 0.018688, 0.044654, 0.030615, -0.027816, 0.031711, -0.056952, -0.033499, -0.039368, 0.025801, -0.027610, + -0.009329, -0.001799, 0.024061, -0.012593, -0.050266, -0.012512, 0.019528, -0.083434, 0.018238, 0.034138, -0.020120, -0.009910, -0.002280, 0.035325, 0.034440, -0.055205, + -0.017698, -0.000439, -0.034703, 0.013356, -0.037287, 0.048494, -0.018570, 0.028069, 0.019269, -0.007263, -0.008521, 0.000426, -0.016677, 0.056162, -0.011944, 0.017322, + 0.022219, -0.014266, -0.009292, -0.009979, 0.014973, 0.011623, -0.017799, 0.032925, -0.024668, 0.007312, -0.025035, -0.008967, -0.026827, 0.011889, -0.138517, -0.009608, + -0.020592, -0.001272, 0.015676, -0.025706, 0.031775, -0.004195, 0.026876, -0.014748, -0.025966, -0.008741, 0.035437, 0.017139, -0.005140, -0.007101, -0.012510, -0.023600, + 0.032969, -0.005510, 0.020010, 0.032567, 0.015558, 0.004265, -0.036300, 0.048210, 0.080424, -0.052820, -0.002063, -0.020875, 0.052530, -0.001638, -0.020299, -0.035202, + 0.087818, 0.034614, -0.032735, 0.033201, -0.001751, 0.029574, 0.009926, 0.011619, -0.001267, 
-0.020149, -0.003826, -0.029860, 0.011437, -0.051276, 0.024344, 0.003096, + -0.011573, 0.038228, -0.005730, -0.052328, 0.001909, -0.025877, 0.019976, -0.010160, 0.023892, 0.049161, -0.028978, 0.018700, -0.026460, 0.001090, -0.072128, -0.008406, + 0.010828, 0.020621, -0.005706, 0.023797, 0.036231, -0.112069, 0.017601, 0.007496, 0.045999, 0.016771, 0.021977, 0.022305, 0.018377, 0.002036, -0.029815, -0.082922, + -0.012710, -0.026355, 0.003790, 0.017472, -0.023148, -0.002901, -0.057854, 0.028393, 0.230866, -0.023486, 0.051094, 0.047508, 0.018957, -0.037130, 0.001054, -0.026126, + 0.021970, -0.046915, -0.019419, -0.014077, 0.002502, -0.079454, -0.057149, -0.081701, 0.041979, -0.043074, -0.009425, -0.035776, -0.021794, -0.004826, -0.057263, -0.072940, + 0.037651, -0.013991, -0.043863, -0.020581, 0.034319, -0.052566, -0.010355, -0.022963, 0.027144, -0.017339, 0.088930, -0.000670, -0.026547, -0.026586, -0.032531, 0.040314, + 0.010148, 0.021104, 0.009228, -0.073227, 0.036650, -0.019337, 0.010211, -0.089620, -0.024676, -0.020729, -0.004070, 0.000784, -0.110561, 0.015390, 0.027151, -0.003228, + -0.066704, -0.004797, -0.026117, -0.018131, -0.090114, 0.020659, -0.007157, 0.013608, -0.022324, 0.027487, 0.018873, 0.027854, 0.045085, -0.039992, -0.017829, 0.011071, + -0.011393, -0.004454, -0.037189, -0.030299, 0.059668, 0.005064, 0.024655, -0.037239, 0.046882, -0.010356, -0.009690, 0.061909, -0.024736, 0.016849, 0.000784, 0.000201, + 0.066165, 0.010234, -0.012134, -0.002823, -0.060847, 0.008953, 0.010348, 0.022292, -0.044602, -0.020981, 0.038839, 0.006616, -0.016836, -0.043995, -0.005463, -0.036413, + 0.034895, -0.018008, -0.009543, -0.025080, -0.035243, 0.042696, -0.028911, -0.030676, -0.038542, -0.027798, -0.026607, 0.019467, 0.070629, -0.037356, -0.042648, -0.000284, + 0.033095, 0.077781, -0.052930, 0.022515, -0.029926, -0.033821, -0.003277, -0.000038, -0.026871, 0.018223, -0.004221, 0.023454, -0.030611, -0.006396, -0.009873, -0.008402, + ], + dtype=torch.float32, +) +SYSTEM_PROMPT_DYNAMIC = torch.tensor( + [ + 0.010809, 0.021177, -0.017600, -0.016814, 0.012351, -0.024554, 0.018299, 0.039305, 0.003331, 0.030473, 0.005557, -0.040898, 0.047294, -0.016136, 0.076989, -0.002723, + 0.017622, 0.042330, 0.058266, -0.016232, -0.029502, 0.004529, 0.033543, -0.041481, -0.017631, 0.002727, 0.018874, 0.019932, -0.030052, -0.009997, 0.004582, 0.002135, + -0.003720, -0.030923, 0.021174, 0.034033, -0.007096, 0.011522, -0.009518, 0.055688, -0.092351, -0.003914, 0.004589, -0.032635, 0.012479, -0.140607, -0.014141, -0.031821, + 0.001396, 0.026780, -0.007623, 0.039957, 0.006434, 0.047516, 0.014377, 0.015237, 0.034212, 0.003576, -0.027357, 0.038888, 0.087272, 0.020248, 0.015165, 0.016002, + 0.020781, -0.040509, -0.008929, 0.080857, -0.002642, -0.009738, -0.005683, -0.000615, -0.012801, 0.046457, -0.045004, 0.024689, 0.002498, -0.017333, -0.027366, -0.023231, + -0.006064, -0.021505, 0.007405, -0.021249, 0.026252, -0.018690, 0.020093, -0.036954, 0.037510, -0.032027, -0.030871, -0.011173, -0.618627, 0.021213, -0.004366, 0.029555, + -0.004324, 0.020221, -0.143832, -0.021386, 0.010482, -0.042113, 0.016164, 0.040350, 0.014627, -0.011778, -0.018102, 0.035380, -0.020305, 0.010590, 0.009227, -0.011415, + 0.018623, -0.036384, 0.031003, -0.017073, -0.056456, -0.010423, 0.033029, -0.023511, -0.008717, 0.045716, 0.068273, -0.027886, 0.009665, -0.039801, 0.001465, 0.024361, + -0.015039, 0.022903, 0.033362, -0.022804, 0.008631, 0.076518, 0.000619, 0.022786, -0.015435, -0.095242, -0.006092, 0.015496, -0.009081, 0.015740, 0.004280, 
0.013103, + -0.031836, 0.034241, 0.031836, 0.032636, -0.053721, 0.034370, 0.019172, 0.018383, 0.006907, -0.036039, -0.027927, 0.008646, 0.040496, -0.060314, -0.039116, -0.021488, + -0.031682, -0.005077, 0.034920, 0.002148, -0.008087, 0.002024, -0.008480, 0.041096, 0.011401, 0.020380, -0.025078, 0.005002, 0.022252, -0.014577, 0.008051, -0.014476, + -0.007078, 0.021075, 0.036965, 0.005343, -0.038671, -0.037222, 0.014052, -0.009952, -0.003958, -0.001878, 0.017848, -0.016608, -0.030813, 0.010921, 0.001068, 0.003095, + 0.007076, -0.001936, -0.102996, 0.006838, -0.005243, -0.009140, -0.043796, -0.027227, -0.008426, -0.013177, 0.015602, 0.021036, 0.025484, -0.064836, -0.003593, -0.038036, + -0.023102, 0.064053, 0.007850, 0.000771, 0.039297, 0.011903, -0.015866, -0.017612, 0.006308, 0.024342, 0.086761, -0.016705, 0.039239, 0.025079, -0.006452, 0.003174, + -0.010146, 0.010787, 0.035932, -0.015346, 0.037191, 0.010990, 0.011573, 0.044958, 0.035560, -0.017339, 0.018878, -0.025394, -0.044339, -0.029852, 0.015951, -0.032248, + -0.012019, 0.013497, 0.012224, -0.001284, -0.034041, -0.015768, 0.000230, -0.086076, 0.024878, 0.031929, -0.016668, -0.019815, -0.001325, 0.007944, 0.017674, -0.036097, + -0.019651, -0.001272, -0.032842, 0.002056, -0.037140, 0.043191, -0.003710, 0.011767, 0.020313, -0.018396, -0.015935, 0.010228, -0.017349, 0.049363, -0.010007, 0.019533, + 0.018076, 0.016608, -0.005523, -0.007793, 0.016868, 0.019341, -0.008236, 0.026765, -0.025324, -0.007849, -0.023648, -0.007791, -0.018508, 0.015357, -0.166499, -0.003718, + -0.035447, -0.005229, 0.019327, -0.014207, 0.028433, -0.002619, 0.013888, -0.033146, -0.017015, 0.004677, 0.039554, 0.003803, -0.014592, -0.018886, -0.023868, -0.022708, + 0.033661, 0.008626, 0.015687, 0.046395, 0.014173, 0.015083, -0.025994, 0.039120, 0.076334, -0.061165, 0.001791, -0.017579, 0.067567, -0.002415, -0.032495, -0.025576, + 0.079027, 0.036370, -0.013303, 0.030510, -0.009061, 0.019135, 0.015627, 0.024864, 0.015093, -0.017066, -0.014075, -0.021907, 0.017388, -0.033492, 0.013317, -0.000040, + 0.003396, 0.044030, -0.009194, -0.049524, -0.005015, -0.040007, 0.009104, 0.000580, 0.005603, 0.035891, -0.038913, 0.023239, -0.017022, -0.002695, -0.095759, 0.018503, + 0.017365, 0.011104, -0.003433, 0.024113, 0.052609, -0.085274, 0.027565, -0.005833, 0.020700, 0.015842, 0.019148, 0.020203, -0.000698, -0.005337, -0.037400, -0.060144, + -0.031893, -0.038396, -0.001949, 0.018901, -0.014268, -0.004721, -0.055913, 0.013814, 0.215024, -0.011357, 0.057530, 0.050092, 0.016513, -0.059254, 0.001494, -0.031472, + 0.032190, -0.047512, -0.020501, -0.002571, 0.007844, -0.063630, -0.043938, -0.079595, 0.032820, -0.021659, -0.003738, -0.035267, -0.013794, -0.021172, -0.046356, -0.077079, + 0.021526, -0.007447, -0.050276, -0.029743, 0.022208, -0.039137, -0.021426, -0.029825, 0.029390, -0.002943, 0.073158, -0.000435, -0.032029, -0.038524, -0.029886, 0.017473, + 0.013513, 0.022738, 0.000632, -0.073718, 0.029219, -0.018896, 0.007302, -0.116122, -0.013324, -0.012214, -0.005960, -0.003720, -0.155869, 0.019896, 0.016919, -0.021133, + -0.066911, -0.000926, -0.020871, -0.015295, -0.086108, 0.014918, -0.009284, 0.001689, -0.038155, 0.039163, 0.015988, 0.014413, 0.034205, -0.053273, 0.001687, 0.012227, + -0.007341, -0.006123, -0.005731, -0.026863, 0.060196, 0.028929, 0.019328, -0.033709, 0.038789, -0.015624, 0.013323, 0.053821, -0.015538, -0.001610, 0.012959, -0.013897, + 0.082010, 0.012866, -0.017269, 0.000017, -0.059458, 0.015870, 0.028455, 0.025234, -0.051163, -0.022976, 0.011866, -0.005613, 
-0.008738, -0.047658, -0.002155, -0.029432, + 0.039242, -0.013491, -0.001641, -0.024210, -0.019187, 0.026716, -0.025698, -0.027591, -0.034678, -0.002473, -0.019391, 0.017597, 0.064385, -0.029104, -0.034501, -0.004955, + 0.015008, 0.060749, -0.051693, 0.020279, -0.027170, -0.027003, 0.000254, 0.011352, -0.028116, 0.028938, -0.007224, 0.019978, -0.025379, -0.004874, -0.019361, -0.020278, + ], + dtype=torch.float32, +) +SYSTEM_EN_RECAPTION = torch.tensor( + [ + 0.007721, 0.015421, -0.019305, -0.000920, 0.016031, -0.019730, 0.029683, 0.026810, -0.010510, 0.021463, 0.008833, -0.040851, 0.043260, -0.007042, 0.057224, 0.011995, + 0.007818, 0.046369, 0.059838, -0.028548, -0.047399, -0.000983, 0.024343, -0.052259, -0.013638, 0.006856, 0.009186, 0.014235, -0.031497, -0.008644, -0.009349, 0.018900, + 0.002913, -0.022475, 0.039518, 0.019052, -0.007600, 0.010634, -0.011830, 0.075675, -0.071738, -0.014947, 0.004995, -0.025804, -0.002553, -0.093262, 0.002881, -0.033744, + -0.007234, 0.013659, 0.009897, 0.039185, -0.005366, 0.041534, -0.005924, 0.019786, 0.048566, -0.009356, -0.027360, 0.042557, 0.091286, 0.009286, 0.015410, 0.028166, + 0.022476, -0.025162, 0.012144, 0.084603, -0.003150, -0.008549, -0.002099, -0.014987, -0.019480, 0.046843, -0.030613, 0.015557, -0.008965, -0.008798, -0.027032, -0.014112, + 0.018703, -0.014749, -0.000928, -0.024660, 0.024004, 0.004560, 0.028156, -0.028467, 0.025444, -0.038699, -0.014927, -0.031593, -0.648498, 0.018529, 0.003378, 0.030188, + -0.002314, 0.014950, -0.146615, -0.009005, 0.016579, -0.037867, 0.020907, 0.033160, 0.007877, -0.026345, -0.056428, 0.031255, -0.018404, 0.013334, 0.009988, -0.022790, + 0.020803, -0.036862, 0.036222, -0.006646, -0.058084, -0.012036, 0.044199, -0.027665, -0.015779, 0.051554, 0.059970, -0.025977, 0.003967, -0.035247, -0.000488, 0.023182, + 0.000468, 0.019190, 0.047268, -0.032279, -0.005302, 0.078669, -0.001915, 0.024918, -0.014952, -0.078905, -0.018333, 0.001362, -0.015115, 0.005435, 0.002313, 0.018766, + -0.032773, 0.037344, 0.024061, 0.012143, -0.057106, 0.029490, 0.019537, 0.009099, 0.026064, -0.015927, -0.037047, 0.006002, 0.025191, -0.035318, -0.032245, -0.047822, + -0.023568, -0.004533, 0.025100, 0.002758, -0.002649, -0.012287, -0.012139, 0.043080, 0.003295, 0.024667, -0.021050, 0.006752, 0.025315, -0.011127, 0.009800, -0.021343, + -0.024866, 0.010098, 0.026954, 0.012467, -0.035866, -0.031780, 0.007479, -0.003388, -0.012619, -0.012099, 0.014974, -0.001908, -0.032700, 0.004703, 0.003238, -0.007498, + 0.023241, 0.002715, -0.111739, 0.003317, 0.006475, -0.019792, -0.046558, -0.032593, -0.020762, -0.005059, 0.016934, 0.029195, 0.028744, -0.050633, 0.001907, -0.028791, + -0.016695, 0.052143, 0.010439, 0.007204, 0.028502, 0.012607, -0.012414, -0.031238, 0.007305, 0.032309, 0.087924, -0.010530, 0.029925, 0.032666, -0.002202, 0.017539, + -0.009091, -0.001631, 0.024906, -0.013102, 0.031772, 0.018465, 0.012035, 0.031460, 0.030193, 0.005289, 0.025859, -0.038971, -0.046577, -0.025852, 0.035235, -0.038514, + 0.001042, 0.013012, 0.023701, -0.014630, -0.029269, -0.011981, 0.008219, -0.067347, -0.003456, 0.028198, -0.008657, -0.017773, 0.010540, 0.023964, 0.021012, -0.034465, + -0.023748, 0.004065, -0.021598, 0.008440, -0.031533, 0.038390, -0.007680, -0.003852, 0.016136, -0.017906, -0.008927, 0.006300, -0.001251, 0.029337, -0.008632, 0.020568, + 0.021560, -0.007222, 0.005313, -0.013089, 0.012299, 0.031303, -0.013951, 0.016547, -0.024771, -0.008753, -0.030908, -0.014421, -0.017656, 0.014044, -0.114986, 0.000956, + -0.035588, 0.003756, 0.015383, 
-0.013358, 0.009385, -0.001359, 0.012623, -0.028724, 0.001607, 0.012809, 0.032668, 0.011834, -0.015587, -0.007170, -0.021344, -0.019664, + 0.017690, -0.014538, 0.016511, 0.038037, 0.029919, 0.020907, -0.018565, 0.032964, 0.078548, -0.050386, -0.003012, -0.016965, 0.064131, 0.008077, -0.025879, -0.035820, + 0.095075, 0.019901, -0.019114, 0.022832, 0.003741, 0.027148, 0.018231, 0.027741, 0.020328, 0.001700, -0.006939, -0.024154, 0.018523, -0.029819, 0.008050, -0.004477, + 0.006087, 0.056878, -0.009083, -0.061537, -0.011531, -0.037551, 0.000434, -0.005843, 0.024739, 0.032020, -0.053119, 0.020704, -0.012385, -0.002726, -0.082489, 0.009072, + 0.013341, 0.000316, 0.001899, 0.022868, 0.034407, -0.066857, 0.020589, 0.012195, 0.023211, -0.001520, 0.000897, 0.029670, -0.015930, 0.006509, -0.035172, -0.061215, + -0.014099, -0.038584, -0.012213, 0.018613, -0.012365, -0.002777, -0.055184, 0.017146, 0.214358, -0.015750, 0.052488, 0.045205, 0.025334, -0.054615, 0.002117, -0.038122, + 0.012402, -0.053418, -0.025405, 0.007235, 0.013208, -0.092481, -0.048700, -0.085186, 0.029039, -0.036767, -0.000777, -0.017625, -0.012556, -0.004887, -0.033660, -0.082310, + 0.013387, -0.003256, -0.062981, -0.019886, 0.017624, -0.037421, -0.020743, -0.020894, 0.041974, -0.008502, 0.088413, -0.018697, -0.029398, -0.029389, -0.043721, 0.013872, + 0.003944, 0.030361, 0.005355, -0.081355, 0.041843, -0.016395, 0.011954, -0.060440, -0.000966, -0.019101, 0.006803, -0.011310, -0.148581, 0.020342, 0.012795, -0.016473, + -0.053300, -0.012340, -0.016640, -0.029834, -0.082405, 0.011859, -0.004255, -0.004396, -0.012515, 0.031962, 0.030438, 0.013792, 0.031557, -0.047200, 0.006485, 0.024815, + -0.019376, -0.011454, -0.034184, -0.021329, 0.050115, 0.021720, 0.002874, -0.047163, 0.044031, -0.014663, 0.020534, 0.056017, 0.007017, 0.003323, 0.005734, -0.002777, + 0.082836, 0.012048, -0.023236, -0.007401, -0.071598, 0.016760, 0.017282, 0.028306, -0.026220, -0.008016, -0.000202, -0.020271, -0.019828, -0.046986, -0.005805, -0.039647, + 0.042879, -0.004463, 0.007753, -0.028916, -0.020612, 0.028833, -0.039839, -0.052447, -0.013275, -0.002407, -0.018937, 0.033216, 0.075535, -0.045026, -0.009901, 0.016637, + -0.000322, 0.073925, -0.055701, 0.014912, -0.045671, -0.021189, 0.006761, -0.002015, -0.027410, 0.018250, -0.015916, 0.016254, -0.044964, 0.029261, -0.029319, -0.005222, + ], + dtype=torch.float32, +) +SYSTEM_EN_THINK_RECAPTION = torch.tensor( + [ + 0.011004, 0.017341, -0.019959, -0.018314, 0.016520, -0.027395, 0.017946, 0.039665, 0.000645, 0.035903, 0.002499, -0.045664, 0.039472, -0.013479, 0.081302, 0.000182, + 0.006947, 0.042845, 0.059741, -0.010796, -0.035240, 0.004176, 0.029557, -0.043467, -0.017271, 0.006896, 0.010997, 0.022498, -0.023308, -0.013046, -0.000742, 0.016209, + -0.007152, -0.029868, 0.028747, 0.033743, -0.000227, 0.018419, -0.015023, 0.050376, -0.098475, -0.002375, 0.007897, -0.023936, 0.007843, -0.122463, -0.011680, -0.027267, + -0.007270, 0.021869, -0.011415, 0.043770, 0.000551, 0.048573, 0.003132, 0.014233, 0.037080, -0.004818, -0.028738, 0.044468, 0.073843, 0.016947, 0.014484, 0.021931, + 0.020110, -0.032309, -0.003811, 0.095704, -0.006950, -0.007237, -0.005529, -0.020573, -0.016259, 0.041909, -0.038748, 0.018029, 0.005066, -0.021186, -0.020102, -0.019719, + 0.006239, -0.021284, 0.004213, -0.024963, 0.032345, -0.012557, 0.037268, -0.038075, 0.040998, -0.032766, -0.023509, -0.016426, -0.627412, 0.022675, 0.000101, 0.023162, + -0.002081, 0.015922, -0.138671, -0.027995, 0.011579, -0.042859, 0.019935, 0.038077, 0.012640, 
-0.017377, -0.027456, 0.035151, -0.015756, 0.018530, 0.004646, -0.002589, + 0.019645, -0.043736, 0.034947, -0.010166, -0.061165, -0.019195, 0.028909, -0.019415, -0.009485, 0.049566, 0.068621, -0.038644, 0.011278, -0.036133, 0.000564, 0.022611, + -0.013612, 0.020854, 0.030614, -0.025578, 0.005673, 0.076526, -0.004887, 0.027769, -0.022605, -0.092657, -0.013218, 0.008081, -0.015227, 0.018031, -0.005145, 0.015028, + -0.027193, 0.034767, 0.028710, 0.032007, -0.053175, 0.033528, 0.019437, 0.011517, 0.012107, -0.027679, -0.026937, 0.008612, 0.036909, -0.051484, -0.039971, -0.034372, + -0.023825, -0.003025, 0.033648, -0.001852, 0.007309, 0.000714, -0.001075, 0.038534, 0.007586, 0.016213, -0.025223, -0.001099, 0.015852, -0.011477, 0.020635, -0.010696, + -0.019634, 0.025613, 0.034374, 0.007169, -0.035000, -0.032268, 0.015114, -0.014217, -0.005229, -0.005495, 0.018189, -0.011360, -0.026755, 0.007036, -0.002333, -0.001174, + 0.014729, 0.001739, -0.108591, 0.004699, 0.002048, -0.014801, -0.042855, -0.028846, -0.009609, -0.004500, 0.019466, 0.021848, 0.022140, -0.063035, -0.004272, -0.030798, + -0.018452, 0.055169, 0.012240, -0.003555, 0.038293, 0.008503, -0.016608, -0.021309, 0.000690, 0.027093, 0.088054, -0.008881, 0.034087, 0.030647, 0.003284, 0.005038, + -0.008359, 0.006311, 0.032462, -0.009699, 0.035283, 0.015261, 0.012827, 0.038169, 0.033959, -0.018048, 0.018122, -0.025259, -0.040084, -0.030879, 0.019853, -0.042558, + -0.011938, 0.019602, 0.016537, -0.003378, -0.027890, -0.014909, -0.005464, -0.071862, 0.012335, 0.021899, -0.017008, -0.023228, 0.003263, 0.004571, 0.016447, -0.029446, + -0.022645, -0.001261, -0.018573, 0.007431, -0.027587, 0.035362, -0.006785, -0.000614, 0.026044, -0.009056, -0.009843, 0.010467, -0.011929, 0.042025, -0.014068, 0.023113, + 0.023880, 0.014948, 0.004370, -0.005262, 0.012587, 0.021608, -0.001783, 0.023697, -0.024945, -0.011533, -0.020953, -0.007205, -0.024693, 0.012961, -0.168760, 0.001767, + -0.041265, -0.007044, 0.015021, -0.008407, 0.029642, -0.000956, 0.008607, -0.035365, -0.012187, 0.011744, 0.032612, 0.006226, -0.015891, -0.017747, -0.022565, -0.024505, + 0.031279, 0.004188, 0.011939, 0.038032, 0.008798, 0.012314, -0.024830, 0.034484, 0.076395, -0.060108, 0.001019, -0.016138, 0.067729, 0.003899, -0.029845, -0.019960, + 0.086663, 0.040965, -0.010458, 0.027808, -0.006394, 0.017343, 0.014788, 0.024756, 0.016446, -0.012537, -0.008406, -0.028109, 0.013369, -0.033571, 0.012170, -0.002199, + 0.005263, 0.052280, -0.018171, -0.047898, -0.010087, -0.038632, 0.006773, -0.000838, 0.011197, 0.038187, -0.049525, 0.021689, -0.007385, -0.005987, -0.094551, 0.019019, + 0.012760, 0.009617, -0.002262, 0.030228, 0.047823, -0.079764, 0.023391, -0.005561, 0.018866, 0.012817, 0.020878, 0.027037, -0.013905, -0.002874, -0.035522, -0.046266, + -0.032448, -0.036010, -0.007776, 0.016512, -0.012279, -0.005665, -0.057974, 0.016967, 0.202836, -0.009066, 0.066093, 0.045689, 0.018319, -0.048465, 0.000242, -0.040874, + 0.027824, -0.049045, -0.015616, -0.000307, 0.009163, -0.072975, -0.042979, -0.082254, 0.040549, -0.027049, 0.000725, -0.034118, -0.019604, -0.019097, -0.042483, -0.075446, + 0.019387, -0.005218, -0.053573, -0.029975, 0.008195, -0.036608, -0.018920, -0.025610, 0.028426, -0.002688, 0.074996, -0.003423, -0.032505, -0.030565, -0.028142, 0.014437, + 0.013359, 0.019376, 0.008356, -0.069731, 0.031824, -0.011103, 0.019327, -0.117090, -0.009352, -0.010290, -0.002129, -0.009198, -0.172915, 0.021232, 0.017274, -0.030060, + -0.061449, -0.006598, -0.013069, -0.012857, -0.081220, 0.019058, 
-0.004841, 0.003066, -0.037741, 0.041806, 0.018281, 0.009458, 0.036761, -0.044987, 0.003557, 0.008890, + -0.008011, -0.004063, -0.013474, -0.022090, 0.055398, 0.037475, 0.006991, -0.035962, 0.045503, -0.017162, 0.022391, 0.052754, -0.005924, -0.005936, 0.012673, -0.017922, + 0.084548, 0.014695, -0.013817, 0.000421, -0.065167, 0.018269, 0.023317, 0.023523, -0.034229, -0.019588, 0.007911, -0.002426, -0.017109, -0.050870, 0.002848, -0.033077, + 0.043451, -0.010609, -0.000375, -0.023206, -0.018155, 0.027102, -0.036006, -0.035115, -0.023922, 0.005989, -0.015372, 0.027123, 0.075210, -0.035302, -0.029799, 0.003642, + 0.007714, 0.063498, -0.053234, 0.015699, -0.040459, -0.027354, -0.002433, 0.010923, -0.020134, 0.029292, -0.010176, 0.013508, -0.032403, 0.004323, -0.017504, -0.015237, + ], + dtype=torch.float32, +) +SYSTEM_EN_VANILLA = torch.tensor( + [ + 0.010809, 0.021177, -0.017600, -0.016814, 0.012351, -0.024554, 0.018299, 0.039305, 0.003331, 0.030473, 0.005557, -0.040898, 0.047294, -0.016136, 0.076989, -0.002723, + 0.017622, 0.042330, 0.058266, -0.016232, -0.029502, 0.004529, 0.033543, -0.041481, -0.017631, 0.002727, 0.018874, 0.019932, -0.030052, -0.009997, 0.004582, 0.002135, + -0.003720, -0.030923, 0.021174, 0.034033, -0.007096, 0.011522, -0.009518, 0.055688, -0.092351, -0.003914, 0.004589, -0.032635, 0.012479, -0.140607, -0.014141, -0.031821, + 0.001396, 0.026780, -0.007623, 0.039957, 0.006434, 0.047516, 0.014377, 0.015237, 0.034212, 0.003576, -0.027357, 0.038888, 0.087272, 0.020248, 0.015165, 0.016002, + 0.020781, -0.040509, -0.008929, 0.080857, -0.002642, -0.009738, -0.005683, -0.000615, -0.012801, 0.046457, -0.045004, 0.024689, 0.002498, -0.017333, -0.027366, -0.023231, + -0.006064, -0.021505, 0.007405, -0.021249, 0.026252, -0.018690, 0.020093, -0.036954, 0.037510, -0.032027, -0.030871, -0.011173, -0.618627, 0.021213, -0.004366, 0.029555, + -0.004324, 0.020221, -0.143832, -0.021386, 0.010482, -0.042113, 0.016164, 0.040350, 0.014627, -0.011778, -0.018102, 0.035380, -0.020305, 0.010590, 0.009227, -0.011415, + 0.018623, -0.036384, 0.031003, -0.017073, -0.056456, -0.010423, 0.033029, -0.023511, -0.008717, 0.045716, 0.068273, -0.027886, 0.009665, -0.039801, 0.001465, 0.024361, + -0.015039, 0.022903, 0.033362, -0.022804, 0.008631, 0.076518, 0.000619, 0.022786, -0.015435, -0.095242, -0.006092, 0.015496, -0.009081, 0.015740, 0.004280, 0.013103, + -0.031836, 0.034241, 0.031836, 0.032636, -0.053721, 0.034370, 0.019172, 0.018383, 0.006907, -0.036039, -0.027927, 0.008646, 0.040496, -0.060314, -0.039116, -0.021488, + -0.031682, -0.005077, 0.034920, 0.002148, -0.008087, 0.002024, -0.008480, 0.041096, 0.011401, 0.020380, -0.025078, 0.005002, 0.022252, -0.014577, 0.008051, -0.014476, + -0.007078, 0.021075, 0.036965, 0.005343, -0.038671, -0.037222, 0.014052, -0.009952, -0.003958, -0.001878, 0.017848, -0.016608, -0.030813, 0.010921, 0.001068, 0.003095, + 0.007076, -0.001936, -0.102996, 0.006838, -0.005243, -0.009140, -0.043796, -0.027227, -0.008426, -0.013177, 0.015602, 0.021036, 0.025484, -0.064836, -0.003593, -0.038036, + -0.023102, 0.064053, 0.007850, 0.000771, 0.039297, 0.011903, -0.015866, -0.017612, 0.006308, 0.024342, 0.086761, -0.016705, 0.039239, 0.025079, -0.006452, 0.003174, + -0.010146, 0.010787, 0.035932, -0.015346, 0.037191, 0.010990, 0.011573, 0.044958, 0.035560, -0.017339, 0.018878, -0.025394, -0.044339, -0.029852, 0.015951, -0.032248, + -0.012019, 0.013497, 0.012224, -0.001284, -0.034041, -0.015768, 0.000230, -0.086076, 0.024878, 0.031929, -0.016668, -0.019815, -0.001325, 0.007944, 
0.017674, -0.036097, + -0.019651, -0.001272, -0.032842, 0.002056, -0.037140, 0.043191, -0.003710, 0.011767, 0.020313, -0.018396, -0.015935, 0.010228, -0.017349, 0.049363, -0.010007, 0.019533, + 0.018076, 0.016608, -0.005523, -0.007793, 0.016868, 0.019341, -0.008236, 0.026765, -0.025324, -0.007849, -0.023648, -0.007791, -0.018508, 0.015357, -0.166499, -0.003718, + -0.035447, -0.005229, 0.019327, -0.014207, 0.028433, -0.002619, 0.013888, -0.033146, -0.017015, 0.004677, 0.039554, 0.003803, -0.014592, -0.018886, -0.023868, -0.022708, + 0.033661, 0.008626, 0.015687, 0.046395, 0.014173, 0.015083, -0.025994, 0.039120, 0.076334, -0.061165, 0.001791, -0.017579, 0.067567, -0.002415, -0.032495, -0.025576, + 0.079027, 0.036370, -0.013303, 0.030510, -0.009061, 0.019135, 0.015627, 0.024864, 0.015093, -0.017066, -0.014075, -0.021907, 0.017388, -0.033492, 0.013317, -0.000040, + 0.003396, 0.044030, -0.009194, -0.049524, -0.005015, -0.040007, 0.009104, 0.000580, 0.005603, 0.035891, -0.038913, 0.023239, -0.017022, -0.002695, -0.095759, 0.018503, + 0.017365, 0.011104, -0.003433, 0.024113, 0.052609, -0.085274, 0.027565, -0.005833, 0.020700, 0.015842, 0.019148, 0.020203, -0.000698, -0.005337, -0.037400, -0.060144, + -0.031893, -0.038396, -0.001949, 0.018901, -0.014268, -0.004721, -0.055913, 0.013814, 0.215024, -0.011357, 0.057530, 0.050092, 0.016513, -0.059254, 0.001494, -0.031472, + 0.032190, -0.047512, -0.020501, -0.002571, 0.007844, -0.063630, -0.043938, -0.079595, 0.032820, -0.021659, -0.003738, -0.035267, -0.013794, -0.021172, -0.046356, -0.077079, + 0.021526, -0.007447, -0.050276, -0.029743, 0.022208, -0.039137, -0.021426, -0.029825, 0.029390, -0.002943, 0.073158, -0.000435, -0.032029, -0.038524, -0.029886, 0.017473, + 0.013513, 0.022738, 0.000632, -0.073718, 0.029219, -0.018896, 0.007302, -0.116122, -0.013324, -0.012214, -0.005960, -0.003720, -0.155869, 0.019896, 0.016919, -0.021133, + -0.066911, -0.000926, -0.020871, -0.015295, -0.086108, 0.014918, -0.009284, 0.001689, -0.038155, 0.039163, 0.015988, 0.014413, 0.034205, -0.053273, 0.001687, 0.012227, + -0.007341, -0.006123, -0.005731, -0.026863, 0.060196, 0.028929, 0.019328, -0.033709, 0.038789, -0.015624, 0.013323, 0.053821, -0.015538, -0.001610, 0.012959, -0.013897, + 0.082010, 0.012866, -0.017269, 0.000017, -0.059458, 0.015870, 0.028455, 0.025234, -0.051163, -0.022976, 0.011866, -0.005613, -0.008738, -0.047658, -0.002155, -0.029432, + 0.039242, -0.013491, -0.001641, -0.024210, -0.019187, 0.026716, -0.025698, -0.027591, -0.034678, -0.002473, -0.019391, 0.017597, 0.064385, -0.029104, -0.034501, -0.004955, + 0.015008, 0.060749, -0.051693, 0.020279, -0.027170, -0.027003, 0.000254, 0.011352, -0.028116, 0.028938, -0.007224, 0.019978, -0.025379, -0.004874, -0.019361, -0.020278, + ], + dtype=torch.float32, +) +SYSTEM_EN_UNIFIED = torch.tensor( + [ + 0.011409, 0.014191, -0.023163, -0.020119, 0.019190, -0.029559, 0.019616, 0.035872, 0.010434, 0.028709, 0.011616, -0.039422, 0.038369, -0.004631, 0.081177, 0.007400, + 0.008903, 0.040408, 0.055323, -0.011950, -0.026940, 0.004916, 0.028101, -0.046200, -0.016732, 0.005115, 0.012100, 0.016136, -0.026057, -0.013827, -0.004914, 0.015261, + -0.010824, -0.028188, 0.022934, 0.026204, -0.003855, 0.013797, -0.014518, 0.050289, -0.100077, -0.002962, 0.009050, -0.028205, 0.016294, -0.128956, -0.012730, -0.023647, + -0.009306, 0.020066, 0.000033, 0.043619, 0.003250, 0.053425, 0.005889, 0.021529, 0.036032, -0.003254, -0.029715, 0.048345, 0.077978, 0.010674, 0.019296, 0.018721, + 0.019244, -0.040115, -0.004245, 0.085214, 
-0.005280, -0.010746, -0.000164, -0.023405, -0.015641, 0.040193, -0.038735, 0.018966, -0.004031, -0.017879, -0.023017, -0.030379, + 0.006468, -0.015959, 0.000532, -0.026530, 0.042640, -0.006095, 0.037899, -0.043658, 0.040965, -0.034682, -0.023729, -0.019291, -0.630840, 0.029658, 0.005462, 0.026650, + -0.000292, 0.013954, -0.149594, -0.019405, 0.015321, -0.045104, 0.030332, 0.031727, 0.012349, -0.009553, -0.022371, 0.034043, -0.014838, 0.015398, -0.003657, 0.000477, + 0.021084, -0.041406, 0.029946, -0.013832, -0.057358, -0.018086, 0.031598, -0.031835, -0.006697, 0.040866, 0.068602, -0.042203, 0.007362, -0.036959, 0.003794, 0.026533, + -0.011873, 0.017343, 0.028333, -0.021804, 0.004007, 0.075133, 0.003340, 0.025326, -0.015068, -0.092280, -0.011514, 0.006827, -0.008254, 0.021181, -0.005035, 0.022263, + -0.022443, 0.043919, 0.026637, 0.028568, -0.056881, 0.036740, 0.024430, 0.015891, 0.012257, -0.031126, -0.030108, 0.007229, 0.026998, -0.051685, -0.033003, -0.031170, + -0.024021, 0.004235, 0.030164, 0.002674, 0.008018, 0.005532, 0.001621, 0.044790, 0.006413, 0.027160, -0.015022, 0.000911, 0.019723, -0.016244, 0.020077, -0.006847, + -0.014110, 0.022461, 0.031656, 0.002760, -0.039078, -0.026893, 0.006628, -0.011775, -0.000240, -0.005908, 0.014943, -0.012131, -0.021755, 0.004732, -0.005297, -0.002922, + 0.014631, -0.002010, -0.112400, 0.000842, -0.002732, -0.014861, -0.052099, -0.034167, -0.011613, -0.006101, 0.013278, 0.018867, 0.026530, -0.068150, -0.003306, -0.032801, + -0.018523, 0.050875, 0.005488, -0.007241, 0.045707, 0.023119, -0.021519, -0.022683, 0.004806, 0.024827, 0.091371, -0.014424, 0.043836, 0.033094, 0.002390, 0.005450, + -0.004893, 0.013608, 0.031272, -0.002449, 0.031607, 0.014646, 0.014146, 0.043995, 0.028826, -0.012219, 0.021008, -0.020911, -0.036967, -0.036256, 0.013328, -0.038382, + -0.012084, 0.018183, 0.018782, -0.004697, -0.024284, -0.015474, -0.001463, -0.076015, 0.013923, 0.022125, -0.018765, -0.010793, 0.008409, 0.002067, 0.017961, -0.029716, + -0.020915, -0.001779, -0.009217, -0.001933, -0.036081, 0.042577, 0.000118, -0.013920, 0.014901, -0.016486, -0.010278, -0.000449, -0.017234, 0.042453, -0.009893, 0.021087, + 0.017671, 0.009861, -0.004210, 0.004944, 0.015627, 0.014370, -0.001128, 0.030247, -0.019552, -0.014017, -0.020859, -0.002614, -0.024405, 0.016532, -0.173204, -0.001196, + -0.037415, -0.010990, 0.010449, -0.006124, 0.019211, 0.003695, 0.011679, -0.031852, -0.009764, 0.005773, 0.035793, 0.003455, -0.011772, -0.020532, -0.027434, -0.024761, + 0.027483, -0.001554, 0.010411, 0.037888, 0.015619, 0.019186, -0.021204, 0.038158, 0.074991, -0.064521, -0.002503, -0.014499, 0.068165, 0.006145, -0.032891, -0.021540, + 0.091385, 0.047584, -0.009590, 0.028004, -0.002962, 0.021061, 0.014854, 0.025840, 0.016068, -0.014364, -0.016418, -0.033454, 0.011734, -0.036518, 0.013015, -0.003966, + 0.000855, 0.051373, -0.010960, -0.047078, -0.011048, -0.042015, 0.006818, 0.005483, 0.010251, 0.034951, -0.046162, 0.021258, -0.013397, -0.005259, -0.093775, 0.019974, + 0.014992, 0.004043, -0.005931, 0.035662, 0.050723, -0.083293, 0.028047, -0.008042, 0.020763, 0.016763, 0.022913, 0.027129, -0.014314, -0.009854, -0.039019, -0.044870, + -0.028101, -0.038026, -0.006294, 0.018265, -0.015425, -0.007866, -0.052784, 0.010470, 0.200260, -0.007798, 0.064482, 0.046612, 0.025353, -0.059695, -0.001831, -0.039643, + 0.025148, -0.042752, -0.014928, -0.010216, 0.014195, -0.069149, -0.041424, -0.078360, 0.036999, -0.021357, 0.011032, -0.026564, -0.016214, -0.023440, -0.044723, -0.064498, + 0.018283, 
-0.007165, -0.051802, -0.026299, 0.005867, -0.034691, -0.020621, -0.030512, 0.024458, -0.011330, 0.066558, -0.004069, -0.031624, -0.030639, -0.037451, 0.013079, + 0.015152, 0.008058, 0.009223, -0.069514, 0.030702, -0.009681, 0.014826, -0.115441, -0.005514, -0.011925, 0.001046, -0.007148, -0.164128, 0.018043, 0.017001, -0.026352, + -0.049691, -0.011637, -0.013045, -0.014851, -0.079469, 0.017692, -0.006575, 0.001063, -0.028299, 0.038777, 0.019930, 0.010641, 0.036955, -0.039004, -0.006477, 0.004278, + -0.001006, -0.002514, -0.017242, -0.023927, 0.049113, 0.038393, 0.011633, -0.031537, 0.041725, -0.012146, 0.023445, 0.049999, -0.008538, 0.001319, 0.012732, -0.021170, + 0.082096, 0.009610, -0.025717, 0.002566, -0.060849, 0.017403, 0.032650, 0.018658, -0.030629, -0.025032, 0.005555, 0.000522, -0.009667, -0.043099, 0.005939, -0.027156, + 0.045634, -0.011986, 0.002713, -0.032225, -0.015494, 0.028734, -0.036528, -0.033101, -0.027174, 0.009490, -0.016537, 0.029435, 0.065709, -0.037711, -0.020497, -0.005578, + 0.011768, 0.061035, -0.044676, 0.016113, -0.042945, -0.022579, 0.002430, 0.012474, -0.018198, 0.030468, -0.016646, 0.019020, -0.035804, 0.001175, -0.018312, -0.010760, + ], + dtype=torch.float32, +) +# fmt: on +SYSTEM_PROMPT_CASES = [ + pytest.param("none", None, SEED_1234, id="none"), + pytest.param("dynamic", "dynamic", SYSTEM_PROMPT_DYNAMIC, id="dynamic"), + pytest.param("en_vanilla", "en_vanilla", SYSTEM_EN_VANILLA, id="en_vanilla"), + pytest.param("en_recaption", "en_recaption", SYSTEM_EN_RECAPTION, id="en_recaption"), + pytest.param("en_think_recaption", "en_think_recaption", SYSTEM_EN_THINK_RECAPTION, id="en_think_recaption"), + pytest.param("en_unified", "en_unified", SYSTEM_EN_UNIFIED, id="en_unified"), +] + + +@pytest.fixture(scope="session") +def clip_bundle() -> tuple[CLIPModel, CLIPProcessor]: + try: + model = CLIPModel.from_pretrained(LOCAL_CLIP_PATH, local_files_only=True) + processor = CLIPProcessor.from_pretrained(LOCAL_CLIP_PATH, local_files_only=True) + except OSError as exc: + pytest.skip(f"Could not load CLIP model from local cache ({LOCAL_CLIP_PATH}): {exc}") + + model.eval() + return model, processor + + +@pytest.fixture(scope="module") +def omni() -> Generator[Omni, None, None]: + engine = Omni( + model=MODEL_NAME, + stage_configs_path=str(STAGE_CONFIG_PATH), + stage_init_timeout=600, + init_timeout=900, + ) + try: + yield engine + finally: + engine.close() + + +def _extract_generated_image(outputs: list[object]) -> Image.Image: + if not outputs: + raise AssertionError("No outputs were returned from Omni.generate()") + + first_output = outputs[0] + if images := getattr(first_output, "images", None): + return images[0] + + request_output = getattr(first_output, "request_output", None) + if request_output is not None and (images := getattr(request_output, "images", None)): + return images[0] + + raise AssertionError("No generated image found in Omni output") + + +def extract_embedding(image: Image.Image, clip_model: CLIPModel, clip_processor: CLIPProcessor) -> torch.Tensor: + inputs = clip_processor(images=image.convert("RGB"), return_tensors="pt") + with torch.inference_mode(): + features = clip_model.get_image_features(**inputs) + features = F.normalize(features, p=2, dim=-1) + return features.squeeze(0) + + +def compare_semantic( + expected_embedding: torch.Tensor, + image: Image.Image, + clip_model: CLIPModel, + clip_processor: CLIPProcessor, +) -> float: + features = extract_embedding(image, clip_model, clip_processor) + expected = F.normalize(expected_embedding, p=2, 
dim=-1) + return torch.dot(expected, features).item() + + +def _generate_image(omni: Omni, use_system_prompt: str | None) -> Image.Image: + generator_device = current_omni_platform.device_type or "cuda" + sampling_params = OmniDiffusionSamplingParams( + seed=1234, + generator=torch.Generator(device=generator_device).manual_seed(1234), + num_outputs_per_prompt=1, + ) + if use_system_prompt is not None: + sampling_params.extra_args = {"use_system_prompt": use_system_prompt} + + outputs = omni.generate({"prompt": PROMPT}, sampling_params) + return _extract_generated_image(outputs) + + +@pytest.mark.skipif(torch.cuda.device_count() < 8, reason="Need at least 8 CUDA GPUs for this test.") +@pytest.mark.parametrize("system_prompt_name,use_system_prompt,expected_embedding", SYSTEM_PROMPT_CASES) +def test_system_prompt_scores( + omni: Omni, + clip_bundle: tuple[CLIPModel, CLIPProcessor], + system_prompt_name: str, + use_system_prompt: str | None, + expected_embedding: torch.Tensor, +) -> None: + clip_model, clip_processor = clip_bundle + generated_image = _generate_image(omni, use_system_prompt) + score = compare_semantic(expected_embedding, generated_image, clip_model, clip_processor) + + print(f"{system_prompt_name}: CLIP cosine similarity = {score:.6f}") diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py b/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py index ba24818dc9..7e9e2d2787 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py +++ b/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py @@ -41,6 +41,7 @@ build_batch_2d_rope, real_batched_index_select, ) +from .system_prompt import get_system_prompt logger = logging.getLogger(__name__) @@ -991,10 +992,15 @@ def forward( width: int = 1024, num_inference_steps: int = 50, guidance_scale: float = 5.0, - system_prompt: str | None = None, generator: torch.Generator | list[torch.Generator] | None = None, **kwargs, ) -> DiffusionOutput: + extra_args = getattr(getattr(req, "sampling_params", None), "extra_args", {}) or {} + use_system_prompt = extra_args.get("use_system_prompt") + system_prompt = extra_args.get("system_prompt") + if use_system_prompt is not None: + system_prompt = get_system_prompt(use_system_prompt, "image", system_prompt) + system_prompt = system_prompt.strip() if system_prompt is not None else "" prompt = [p if isinstance(p, str) else (p.get("prompt") or "") for p in req.prompts] or prompt generator = req.sampling_params.generator or generator height = req.sampling_params.height or height diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/system_prompt.py b/vllm_omni/diffusion/models/hunyuan_image_3/system_prompt.py new file mode 100644 index 0000000000..29494fad41 --- /dev/null +++ b/vllm_omni/diffusion/models/hunyuan_image_3/system_prompt.py @@ -0,0 +1,215 @@ +# ruff: noqa: E501 +# Licensed under the TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://github.com/Tencent-Hunyuan/HunyuanImage-3.0/blob/main/LICENSE +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+
+t2i_system_prompt_en_vanilla = """
+You are an advanced AI text-to-image generation system. Given a detailed text prompt, your task is to create a high-quality, visually compelling image that accurately represents the described scene, characters, or objects. Pay careful attention to style, color, lighting, perspective, and any specific instructions provided.
+"""
+
+# 775
+t2i_system_prompt_en_recaption = """
+You are a world-class image generation prompt expert. Your task is to rewrite a user's simple description into a **structured, objective, and detail-rich** professional-level prompt.
+
+The final output must be wrapped in `<recaption>` tags.
+
+### **Universal Core Principles**
+
+When rewriting the prompt (inside the `<recaption>` tags), you must adhere to the following principles:
+
+1. **Absolute Objectivity**: Describe only what is visually present. Avoid subjective words like "beautiful" or "sad". Convey aesthetic qualities through specific descriptions of color, light, shadow, and composition.
+2. **Physical and Logical Consistency**: All scene elements (e.g., gravity, light, shadows, reflections, spatial relationships, object proportions) must strictly adhere to real-world physics and common sense. For example, tennis players must be on opposite sides of the net; objects cannot float without a cause.
+3. **Structured Description**: Strictly follow a logical order: from general to specific, background to foreground, and primary to secondary elements. Use directional terms like "foreground," "mid-ground," "background," and "left side of the frame" to clearly define the spatial layout.
+4. **Use Present Tense**: Describe the scene from an observer's perspective using the present tense, such as "A man stands..." or "Light shines on..."
+5. **Use Rich and Specific Descriptive Language**: Use precise adjectives to describe the quantity, size, shape, color, and other attributes of objects, subjects, and text. Vague expressions are strictly prohibited.
+
+If the user specifies a style (e.g., oil painting, anime, UI design, text rendering), strictly adhere to that style. Otherwise, first infer a suitable style from the user's input. If there is no clear stylistic preference, default to an **ultra-realistic photographic style**. Then, generate the detailed rewritten prompt according to the **Style-Specific Creation Guide** below:
+
+### **Style-Specific Creation Guide**
+
+Based on the determined artistic style, apply the corresponding professional knowledge.
+
+**1. Photography and Realism Style**
+* Utilize professional photography terms (e.g., lighting, lens, composition) and meticulously detail material textures, physical attributes of subjects, and environmental details.
+
+**2. Illustration and Painting Style**
+* Clearly specify the artistic school (e.g., Japanese Cel Shading, Impasto Oil Painting) and focus on describing its unique medium characteristics, such as line quality, brushstroke texture, or paint properties.
+
+**3. Graphic/UI/APP Design Style**
+* Objectively describe the final product, clearly defining the layout, elements, and color palette. All text on the interface must be enclosed in double quotes `""` to specify its exact content (e.g., "Login"). Vague descriptions are strictly forbidden.
+
+**4. Typographic Art**
+* The text must be described as a complete physical object. The description must begin with the text itself. Use a straightforward front-on or top-down perspective to ensure the entire text is visible without cropping.
+
+### **Final Output Requirements**
+
+1. **Output the Final Prompt Only**: Do not show any thought process, Markdown formatting, or line breaks.
+2. **Adhere to the Input**: You must retain the core concepts, attributes, and any specified text from the user's input.
+3. **Style Reinforcement**: Mention the core style 3-5 times within the prompt and conclude with a style declaration sentence.
+4. **Avoid Self-Reference**: Describe the image content directly. Remove redundant phrases like "This image shows..." or "The scene depicts..."
+5. **The final output must be wrapped in `<recaption>xxxx</recaption>` tags.**
+
+The user will now provide an input prompt. You will provide the expanded prompt.
+"""
+
+# 890
+t2i_system_prompt_en_think_recaption = """
+You will act as a top-tier Text-to-Image AI. Your core task is to deeply analyze the user's text input and transform it into a detailed, artistic, and fully user-intent-compliant image.
+
+Your workflow is divided into two phases:
+
+1. Thinking Phase (<think>): In the <think> tag, you need to conduct a structured thinking process, progressively breaking down and enriching the constituent elements of the image. This process must include, but is not limited to, the following dimensions:
+
+Subject: Clearly define the core character(s) or object(s) in the scene, including their appearance, posture, expression, and emotion.
+Composition: Set the camera angle and layout, such as close-up, long shot, bird's-eye view, golden ratio composition, etc.
+Environment/Background: Describe the scene where the subject is located, including the location, time of day, weather, and other elements in the background.
+Lighting: Define the type, direction, and quality of the light source, such as soft afternoon sunlight, cool tones of neon lights, dramatic Rembrandt lighting, etc., to create a specific atmosphere.
+Color Palette: Set the main color tone and color scheme of the image, such as vibrant and saturated, low-saturation Morandi colors, black and white, etc.
+Quality/Style: Determine the artistic style and technical details of the image. This includes user-specified styles (e.g., anime, oil painting) or the default realistic style, as well as camera parameters (e.g., focal length, aperture, depth of field).
+Details: Add minute elements that enhance the realism and narrative quality of the image, such as a character's accessories, the texture of a surface, dust particles in the air, etc.
+
+
+2. Recaption Phase (<recaption>): In the <recaption> tag, merge all the key details from the thinking process into a coherent, precise, and visually evocative final description. This description is the direct instruction for generating the image, so it must be clear, unambiguous, and organized in a way that is most suitable for an image generation engine to understand.
+
+Absolutely Objective: Describe only what is visually present. Avoid subjective words like "beautiful" or "sad." Convey aesthetic sense through concrete descriptions of colors, light, shadow, and composition.
+
+Physical and Logical Consistency: All scene elements (e.g., gravity, light and shadow, reflections, spatial relationships, object proportions) must strictly adhere to the physical laws of the real world and common sense. For example, in a tennis match, players must be on opposite sides of the net; objects cannot float without reason.
+
+Structured Description: Strictly follow a logical order: from whole to part, background to foreground, and primary to secondary. Use directional words like "foreground," "mid-ground," "background," "left side of the frame" to clearly define the spatial layout.
+
+Use Present Tense: Describe from an observer's perspective using the present tense, such as "a man stands," "light shines on..."
+Use Rich and Specific Descriptive Language: Use precise adjectives to describe the quantity, size, shape, color, and other attributes of objects/characters/text. Absolutely avoid any vague expressions.
+
+
+Output Format:
+<think>Thinking process</think><recaption>Refined image description</recaption>Generate Image
+
+
+You must strictly adhere to the following rules:
+
+1. Faithful to Intent, Reasonable Expansion: You can creatively add details to the user's description to enhance the image's realism and artistic quality. However, all additions must be highly consistent with the user's core intent and never introduce irrelevant or conflicting elements.
+2. Style Handling: When the user does not specify a style, you must default to an "Ultra-realistic, Photorealistic" style. If the user explicitly specifies a style (e.g., anime, watercolor, oil painting, cyberpunk, etc.), both your thinking process and final description must strictly follow and reflect that specified style.
+3. Text Rendering: If specific text needs to appear in the image (such as words on a sign, a book title), you must enclose this text in English double quotes (""). Descriptive text must not use double quotes.
+4. Design-related Images: You need to specify all text and graphical elements that appear in the image and clearly describe their design details, including font, color, size, position, arrangement, visual effects, etc.
+"""
+
+t2i_system_prompts = {
+    "en_vanilla": [t2i_system_prompt_en_vanilla],
+    "en_recaption": [t2i_system_prompt_en_recaption],
+    "en_think_recaption": [t2i_system_prompt_en_think_recaption],
+}
+
+
+unified_system_prompt_en = """You are an advanced multimodal model whose core mission is to analyze user intent and generate high-quality text and images.
+
+#### Four Core Capabilities
+1. **Text-to-Text (T2T):** Generate coherent text responses from text prompts.
+2. **Text-to-Image (T2I):** Generate high-quality images from text prompts.
+3. **Text & Image to Text (TI2T):** Generate accurate text responses based on a combination of images and text.
+4. **Text & Image to Image (TI2I):** Generate modified images based on a reference image and editing instructions.
+
+---
+### Image Generation Protocol (for T2I & TI2I)
+You will operate in one of two modes, determined by the user's starting tag:
+#### **<recaption> Mode (Prompt Rewriting)**:
+* **Trigger:** Input begins with `<recaption>`.
+* **Task:** Immediately rewrite the user's text into a structured, objective, and detail-rich professional-grade prompt.
+* **Output:** Output only the rewritten prompt within `<recaption>` tags: `<recaption>Rewritten professional-grade prompt</recaption>`
+
+#### **<think> Mode (Think + Rewrite)**:
+* **Trigger:** Input begins with `<think>`.
+* **Task:** First, conduct a structured analysis of the request within `<think>` tags. Then, output the professional prompt, rewritten based on the analysis, within `<recaption>` tags.
+* **Output:** Strictly adhere to the format: `Analysis processRewritten prompt` + +--- +### Execution Standards and Guidelines +#### **`` Phase: Analysis Guidelines** +**For T2I (New Image Generation):** +Deconstruct the user's request into the following core visual components: +* **Subject:** Key features of the main character/object, including appearance, pose, expression, and emotion. +* **Composition:** Camera angle, lens type, and layout. +* **Environment/Background:** The setting, time of day, weather, and background elements. +* **Lighting:** Technical details such as light source type, direction, and quality. +* **Color Palette:** The dominant hues and overall color scheme. +* **Style/Quality:** The artistic style, clarity, depth of field, and other technical details. +* **Text:** Identify any text to be rendered in the image, including its content, style, and position. +* **Details:** Small elements that add narrative depth and realism. + +**For TI2I (Image Editing):** +Adopt a task-diagnostic approach: +1. **Diagnose Task:** Identify the edit type and analyze key requirements. +2. **Prioritize Analysis:** + * **Adding:** Analyze the new element's position and appearance, ensuring seamless integration with the original image's lighting, shadows, and style. + * **Removing:** Identify the target for removal and determine how to logically fill the resulting space using surrounding textures and lighting. + * **Modifying:** Analyze what to change and what it should become, while emphasizing which elements must remain unchanged. + * **Style Transfer:** Deconstruct the target style into specific features (e.g., brushstrokes, color palette) and apply them to the original image. + * **Text Editing:** Ensure correct content and format. Consider the text's visual style (e.g., font, color, material) and how it adapts to the surface's perspective, curvature, and lighting. + * **Reference Editing:** Extract specific visual elements (e.g., appearance, posture, composition, lines, depth) from the reference image to generate an image that aligns with the text description while also incorporating the referenced content. + * **Inferential Editing:** Identify vague requests (e.g., "make it more professional") and translate them into concrete visual descriptions. + +#### `` Phase: Professional-Grade Prompt Generation Rules +**General Rewriting Principles (for T2I & TI2I):** +1. **Structure & Logic:** Start with a global description. Use positional words (e.g., "foreground", "background") to define the layout. +2. **Absolute Objectivity:** Avoid subjective terms. Convey aesthetics through precise descriptions of color, light, shadow, and materials. +3. **Physical & Logical Consistency:** Ensure all descriptions adhere to the laws of physics and common sense. +4. **Fidelity to User Intent:** Preserve the user's core concepts, subjects, and attributes. Text to be rendered in the image **must be enclosed in double quotes ("")**. +5. **Camera & Resolution:** Translate camera parameters into descriptions of visual effects. Convert resolution information into natural language. + +**T2I-Specific Guidelines:** +* **Style Adherence & Inference:** Strictly follow the specified style. If none is given, infer the most appropriate style and detail it using professional terminology. +* **Style Detailing:** + * **Photography/Realism:** Use professional photography terms to describe lighting, lens effects, and material textures. + * **Painting/Illustration:** Specify the art movement or medium's characteristics. 
+ * **UI/Design:** Objectively describe the final product. Define layout, elements, and typography. Text content must be specific and unambiguous. + +**TI2I-Specific Guidelines:** +* **Preserve Unchanged Elements:** Emphasize elements that **remain unchanged**. Unless explicitly instructed, never alter a character's identity/appearance, the core background, camera angle, or overall style. +* **Clear Editing Instructions:** + * **Replacement:** Use the logic "**replace B with A**," and provide a detailed description of A. + * **Addition:** Clearly state what to add, where, and what it looks like. +* **Unambiguous Referencing:** Avoid vague references (e.g., "that person"). Use specific descriptions of appearance. +""" + + +def get_system_prompt(sys_type, bot_task, system_prompt=None): + # No system prompt, return None directly + if sys_type == "None": + return None + # Use the unified English system prompt (combined T2I and TI2I guidelines) + elif sys_type == "en_unified": + return unified_system_prompt_en + # Use predefined English system prompts: vanilla (basic), recaption, think_recaption + elif sys_type in ["en_vanilla", "en_recaption", "en_think_recaption"]: + return t2i_system_prompts[sys_type][0] + # Dynamic mode: automatically select system prompt based on bot_task type + elif sys_type == "dynamic": + # Think task: use chain-of-thought recaption prompt + if bot_task == "think": + return t2i_system_prompts["en_think_recaption"][0] + # Recaption task: use recaption prompt + elif bot_task == "recaption": + return t2i_system_prompts["en_recaption"][0] + # Image generation task: use vanilla prompt + elif bot_task == "image": + return t2i_system_prompts["en_vanilla"][0].strip("\n") + # Other tasks: use user-provided custom prompt + else: + return system_prompt + # Custom mode: use the user-provided system_prompt parameter directly + elif sys_type == "custom": + return system_prompt + # Unsupported type: raise NotImplementedError + else: + raise NotImplementedError(f"Unsupported system prompt type: {sys_type}") + + +__all__ = ["get_system_prompt"] diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index d15dc90fe5..38d32f7198 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1312,7 +1312,13 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) if request.negative_prompt is not None: prompt["negative_prompt"] = request.negative_prompt gen_params = OmniDiffusionSamplingParams(num_outputs_per_prompt=request.n) - + extra_args = {} + if request.use_system_prompt is not None: + extra_args["use_system_prompt"] = request.use_system_prompt + if request.system_prompt is not None: + extra_args["system_prompt"] = request.system_prompt + if extra_args: + gen_params.extra_args = extra_args # Parse per-request LoRA (compatible with chat's extra_body.lora shape). 
lora_request, lora_scale = _parse_lora_request(request.lora) _update_if_not_none(gen_params, "lora_request", lora_request) diff --git a/vllm_omni/entrypoints/openai/protocol/images.py b/vllm_omni/entrypoints/openai/protocol/images.py index 5f76bbd6b8..6a2dd43be5 100644 --- a/vllm_omni/entrypoints/openai/protocol/images.py +++ b/vllm_omni/entrypoints/openai/protocol/images.py @@ -81,6 +81,24 @@ def validate_layers(cls, v): # vllm-omni extensions for diffusion control negative_prompt: str | None = Field(default=None, description="Text describing what to avoid in the image") + system_prompt: str | None = Field( + default=None, description="Custom system prompt. Used when --use_system_prompt is custom" + ) + use_system_prompt: str | None = Field( + default=None, + description="System prompt type. Options: None, dynamic, en_vanilla, " + "en_recaption, en_think_recaption, en_unified, custom", + ) + + @field_validator("use_system_prompt") + @classmethod + def validate_use_system_prompt(cls, v): + """Validate system prompt type.""" + valid_types = [None, "dynamic", "en_vanilla", "en_recaption", "en_think_recaption", "en_unified", "custom"] + if v not in valid_types: + raise ValueError(f"Invalid use_system_prompt type: {v}. Must be one of: {valid_types[1:] + [None]}") + return v + num_inference_steps: int | None = Field( default=None, ge=1, From 340cba7b04237e5374f4e3642483117f4bfc42b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Tue, 7 Apr 2026 17:16:12 +0800 Subject: [PATCH 074/204] [daVinci-MagiHuman][Doc][BugFix] Update model support for daVici-MagiHuman and fix media utils bug (#2542) Signed-off-by: princepride --- docs/models/supported_models.md | 2 +- docs/user_guide/diffusion_features.md | 2 +- tests/e2e/offline_inference/test_magi_human.py | 2 +- vllm_omni/diffusion/utils/media_utils.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index f3d22aa768..8eab20edc8 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -37,7 +37,7 @@ th { | `LTX2TwoStagesPipeline` | LTX-2-T2V | `rootonchair/LTX-2-19b-distilled` | ✅︎ | ✅︎ | | | | `LTX2ImageToVideoTwoStagesPipeline` | LTX-2-I2V | `rootonchair/LTX-2-19b-distilled` | ✅︎ | ✅︎ | | | | `HeliosPipeline`, `HeliosPyramidPipeline` | Helios | `BestWishYsh/Helios-Base`, `BestWishYsh/Helios-Mid`, `BestWishYsh/Helios-Distilled` | ✅︎ | ✅︎ | ✅︎ | | -| `MagiHumanPipeline` | MagiHuman | `princepride/daVinci-MagiHuman` | ✅︎ | ✅︎ | | | +| `MagiHumanPipeline` | MagiHuman | `SII-GAIR/daVinci-MagiHuman-Base-1080p` | ✅︎ | ✅︎ | | | | `OvisImagePipeline` | Ovis-Image | `OvisAI/Ovis-Image` | ✅︎ | ✅︎ | | ✅︎ | | `LongcatImagePipeline` | LongCat-Image | `meituan-longcat/LongCat-Image` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | | `LongCatImageEditPipeline` | LongCat-Image-Edit | `meituan-longcat/LongCat-Image-Edit` | ✅︎ | ✅︎ | ✅︎ | ✅︎ | diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index c151164ca0..d4d9ce6a3d 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -124,7 +124,7 @@ The following tables show which models support each feature: > Notes: > 1. Nextstep_1(T2I) does not support cache acceleration methods such as TeaCache or Cache-DiT. -> 2. `Tongyi-MAI/Z-Image-Turbo` and `princepride/daVinci-MagiHuman` are distilled models with minimal NFEs; CFG-Parallel is not necessary. +> 2. 
`Tongyi-MAI/Z-Image-Turbo` and `SII-GAIR/daVinci-MagiHuman-Base-1080p` are distilled models with minimal NFEs; CFG-Parallel is not necessary. ### VideoGen diff --git a/tests/e2e/offline_inference/test_magi_human.py b/tests/e2e/offline_inference/test_magi_human.py index 6211fdafc0..cb711edb57 100644 --- a/tests/e2e/offline_inference/test_magi_human.py +++ b/tests/e2e/offline_inference/test_magi_human.py @@ -47,7 +47,7 @@ def test_magi_human_e2e(run_level): if run_level != "advanced_model": pytest.skip("MagiHuman e2e test requires advanced_model run level with real weights.") - model_path = "princepride/daVinci-MagiHuman" + model_path = "SII-GAIR/daVinci-MagiHuman-Base-1080p" omni = Omni( model=model_path, diff --git a/vllm_omni/diffusion/utils/media_utils.py b/vllm_omni/diffusion/utils/media_utils.py index ee1f8116f0..f96a28fbd7 100644 --- a/vllm_omni/diffusion/utils/media_utils.py +++ b/vllm_omni/diffusion/utils/media_utils.py @@ -50,7 +50,7 @@ def mux_video_audio_bytes( if samples.ndim == 1: samples = samples.reshape(1, -1) elif samples.ndim == 2 and samples.shape[0] > samples.shape[1]: - samples = samples.T + samples = np.ascontiguousarray(samples.T) num_channels = samples.shape[0] layout = "stereo" if num_channels >= 2 else "mono" a_stream = container.add_stream(audio_codec, rate=audio_sample_rate) From 408365fa0d542ccffbf659c82b06b374d02dfc4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Tue, 7 Apr 2026 18:02:51 +0800 Subject: [PATCH 075/204] [Bagel]Fused gate_proj and up_proj (#2546) Signed-off-by: princepride --- .../models/bagel/bagel_transformer.py | 30 ++++++------------- .../diffusion/models/bagel/pipeline_bagel.py | 2 ++ 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/vllm_omni/diffusion/models/bagel/bagel_transformer.py b/vllm_omni/diffusion/models/bagel/bagel_transformer.py index a14e875c06..d32a6d8aca 100644 --- a/vllm_omni/diffusion/models/bagel/bagel_transformer.py +++ b/vllm_omni/diffusion/models/bagel/bagel_transformer.py @@ -25,6 +25,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import ( ColumnParallelLinear, + MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear, ) @@ -157,21 +158,12 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - self.gate_proj = ColumnParallelLinear( + self.gate_up_proj = MergedColumnParallelLinear( hidden_size, - intermediate_size, - bias=False, - gather_output=False, - quant_config=quant_config, - prefix=f"{prefix}.gate_proj", - ) - self.up_proj = ColumnParallelLinear( - hidden_size, - intermediate_size, + [intermediate_size, intermediate_size], bias=False, - gather_output=False, quant_config=quant_config, - prefix=f"{prefix}.up_proj", + prefix=f"{prefix}.gate_up_proj", ) self.down_proj = RowParallelLinear( intermediate_size, @@ -186,8 +178,8 @@ def __init__( self.act_fn = nn.SiLU() def forward(self, x): - gate, _ = self.gate_proj(x) - up, _ = self.up_proj(x) + gate_up, _ = self.gate_up_proj(x) + gate, up = gate_up.chunk(2, dim=-1) x = self.act_fn(gate) * up x, _ = self.down_proj(x) return x @@ -929,13 +921,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: (".qkv_proj", ".q_proj", "q"), (".qkv_proj", ".k_proj", "k"), (".qkv_proj", ".v_proj", "v"), - # MLP gate/up projections — the DiT uses separate - # ColumnParallelLinear layers (no fused gate_up_proj), but - # these entries are needed so that DiffusionLoRAManager can - # derive the packed→sublayer mapping for LoRA checkpoints - # 
that store weights under fused gate_up_proj keys. - # The weight loader gracefully falls through to the - # non-stacked path when the fused parameter doesn't exist. + # MLP gate/up projections — fused into MergedColumnParallelLinear. + # HF checkpoints store separate gate_proj / up_proj weights; + # these entries remap them to the fused gate_up_proj parameter. (".gate_up_proj", ".gate_proj", 0), (".gate_up_proj", ".up_proj", 1), ] diff --git a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py index 3e053cbda5..84f177e01a 100644 --- a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py +++ b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py @@ -675,6 +675,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: (".qkv_proj_moe_gen", ".q_proj_moe_gen"), (".qkv_proj_moe_gen", ".k_proj_moe_gen"), (".qkv_proj_moe_gen", ".v_proj_moe_gen"), + (".gate_up_proj", ".gate_proj"), + (".gate_up_proj", ".up_proj"), ] stacked_source_names: set[str] = set() for name in list(allowed): From feefdaee9fdbb62ce3fdbeb2814e20d70a245573 Mon Sep 17 00:00:00 2001 From: Markus / Mark <46672778+marksverdhei@users.noreply.github.com> Date: Tue, 7 Apr 2026 17:07:01 +0200 Subject: [PATCH 076/204] [Bugfix] Accept 'speaker' as alias for 'voice' in TTS speech API (#2424) Signed-off-by: marksverdhei Signed-off-by: marksverdhai <249650165+marksverdhai@users.noreply.github.com> Co-authored-by: marksverdhai <249650165+marksverdhai@users.noreply.github.com> --- .../qwen3_tts/openai_speech_client.py | 2 +- .../openai_api/test_serving_speech.py | 175 ++++++++++++++++++ .../entrypoints/openai/protocol/audio.py | 6 +- .../entrypoints/openai/serving_speech.py | 88 +++++++-- 4 files changed, 255 insertions(+), 16 deletions(-) diff --git a/examples/online_serving/qwen3_tts/openai_speech_client.py b/examples/online_serving/qwen3_tts/openai_speech_client.py index 4741a47158..77e13b08ed 100644 --- a/examples/online_serving/qwen3_tts/openai_speech_client.py +++ b/examples/online_serving/qwen3_tts/openai_speech_client.py @@ -71,7 +71,7 @@ def run_tts_generation(args) -> None: payload = { "model": args.model, "input": args.text, - "speaker": args.speaker, + "voice": args.speaker, "response_format": args.response_format, } diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index da15ec8f0e..334264602e 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -1028,6 +1028,181 @@ def test_get_uploaded_audio_data_voice_not_found(self, speech_server): assert result is None + # ── speaker field alias ── + + def test_speaker_alias_accepted_as_voice(self): + """The 'speaker' JSON key should be accepted as an alias for 'voice'.""" + req = OpenAICreateSpeechRequest.model_validate({"input": "Hello", "speaker": "custom_voice"}) + assert req.voice == "custom_voice" + + def test_voice_field_still_accepted(self): + """The canonical 'voice' JSON key should still work.""" + req = OpenAICreateSpeechRequest.model_validate({"input": "Hello", "voice": "custom_voice"}) + assert req.voice == "custom_voice" + + def test_speaker_alias_in_base_task_with_uploaded_voice(self, speech_server): + """Using 'speaker' key with an uploaded voice should work for Base task.""" + speech_server.uploaded_speakers = { + "utesf": { + "name": "UTESF", + "file_path": "/tmp/voice_samples/utesf.wav", + "mime_type": "audio/wav", + "ref_text": None, + } + } + req = 
OpenAICreateSpeechRequest.model_validate({"input": "Hello", "speaker": "UTESF", "task_type": "Base"}) + assert req.voice == "UTESF" + with patch("pathlib.Path.exists", return_value=True): + result = speech_server._validate_qwen_tts_request(req) + assert result is None + + # ── uploaded voice with embedding ── + + def test_build_tts_params_with_uploaded_voice_embedding(self, speech_server): + """Test _build_tts_params loads embedding for embedding-uploaded voices.""" + speech_server.uploaded_speakers = { + "emb_voice": { + "name": "emb_voice", + "file_path": "/tmp/voice_samples/emb_voice.safetensors", + "mime_type": "application/x-safetensors", + "embedding_source": "direct", + "embedding_dim": 1024, + "cache_status": "ready", + "cache_file": "/tmp/voice_samples/emb_voice.safetensors", + } + } + speech_server.supported_speakers = {"ryan", "vivian", "emb_voice"} + + fake_embedding = [0.1] * 1024 + with patch.object(speech_server, "_get_uploaded_speaker_embedding") as mock_get_emb: + mock_get_emb.return_value = fake_embedding + req = OpenAICreateSpeechRequest(input="Hello", voice="emb_voice") + params = speech_server._build_tts_params(req) + + assert "voice_clone_prompt" in params + assert params["voice_clone_prompt"][0]["ref_spk_embedding"] == fake_embedding + assert params["task_type"] == ["Base"] + assert params["x_vector_only_mode"] == [True] + assert "ref_audio" not in params + + # ── regression: full flow from issue #1603 ── + + def test_regression_1603_speaker_key_with_uploaded_audio_voice(self, speech_server): + """Regression test for #1603: upload audio voice, then invoke TTS with 'speaker' key. + + Verifies the full validate → build_params pipeline works end-to-end. + """ + speech_server.uploaded_speakers = { + "utesf": { + "name": "UTESF", + "file_path": "/tmp/voice_samples/utesf.wav", + "mime_type": "audio/wav", + "ref_text": "Hola, esta es una prueba.", + } + } + # Parse with 'speaker' alias (the key users actually send) + req = OpenAICreateSpeechRequest.model_validate( + {"input": "Hello world", "speaker": "UTESF", "task_type": "Base"} + ) + assert req.voice == "UTESF" + + # Validation should pass (file exists) + with patch("pathlib.Path.exists", return_value=True): + err = speech_server._validate_qwen_tts_request(req) + assert err is None, f"Validation failed: {err}" + + # Build params should auto-set ref_audio from stored file + with patch.object(speech_server, "_get_uploaded_audio_data") as mock_audio: + mock_audio.return_value = "data:audio/wav;base64,ZmFrZQ==" + params = speech_server._build_tts_params(req) + + assert params["task_type"] == ["Base"] + assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZQ=="] + assert params["ref_text"] == ["Hola, esta es una prueba."] + assert params["x_vector_only_mode"] == [False] + assert params["speaker"] == ["utesf"] + + def test_regression_1603_speaker_key_with_uploaded_embedding_voice(self, speech_server): + """Regression test for #1603: upload embedding voice, then invoke TTS with 'speaker' key. + + Verifies embedding-uploaded voices are loaded as voice_clone_prompt, not as audio. 
+ """ + speech_server.uploaded_speakers = { + "myvoice": { + "name": "myvoice", + "file_path": "/tmp/voice_samples/myvoice.safetensors", + "mime_type": "application/x-safetensors", + "embedding_source": "direct", + "embedding_dim": 1024, + "cache_status": "ready", + "cache_file": "/tmp/voice_samples/myvoice.safetensors", + } + } + # Parse with 'speaker' alias + req = OpenAICreateSpeechRequest.model_validate( + {"input": "Hello world", "speaker": "myvoice", "task_type": "Base"} + ) + assert req.voice == "myvoice" + + # Validation should pass + with patch("pathlib.Path.exists", return_value=True): + err = speech_server._validate_qwen_tts_request(req) + assert err is None, f"Validation failed: {err}" + + # Build params should use embedding, NOT audio + fake_emb = [0.1] * 1024 + with patch.object(speech_server, "_get_uploaded_speaker_embedding") as mock_emb: + mock_emb.return_value = fake_emb + params = speech_server._build_tts_params(req) + + assert params["task_type"] == ["Base"] + assert params["x_vector_only_mode"] == [True] + assert "voice_clone_prompt" in params + assert params["voice_clone_prompt"][0]["ref_spk_embedding"] == fake_emb + # Must NOT have ref_audio — that would fail for safetensors files + assert "ref_audio" not in params + + def test_validate_rejects_embedding_voice_with_pending_cache(self, speech_server): + """Validation should reject embedding voices whose cache is not yet ready.""" + speech_server.uploaded_speakers = { + "myvoice": { + "name": "myvoice", + "file_path": "/tmp/myvoice.safetensors", + "mime_type": "application/x-safetensors", + "embedding_source": "direct", + "cache_status": "pending", + "cache_file": None, + } + } + req = OpenAICreateSpeechRequest.model_validate({"input": "Hello", "speaker": "myvoice", "task_type": "Base"}) + with patch("pathlib.Path.exists", return_value=True): + err = speech_server._validate_qwen_tts_request(req) + assert err is not None + assert "not yet ready" in err + + def test_x_vector_only_mode_not_overwritten_for_uploaded_embedding(self, speech_server): + """x_vector_only_mode set by uploaded embedding must not be overwritten by request field.""" + speech_server.uploaded_speakers = { + "emb_voice": { + "name": "emb_voice", + "file_path": "/tmp/emb_voice.safetensors", + "mime_type": "application/x-safetensors", + "embedding_source": "direct", + "embedding_dim": 1024, + "cache_status": "ready", + "cache_file": "/tmp/emb_voice.safetensors", + } + } + fake_emb = [0.1] * 1024 + with patch.object(speech_server, "_get_uploaded_speaker_embedding") as mock_emb: + mock_emb.return_value = fake_emb + # Client explicitly sends x_vector_only_mode=False, but embedding requires True + req = OpenAICreateSpeechRequest(input="Hello", voice="emb_voice", x_vector_only_mode=False) + params = speech_server._build_tts_params(req) + + assert params["x_vector_only_mode"] == [True] + assert "voice_clone_prompt" in params + def test_max_instructions_length_default(self, speech_server): """Test default max instructions length (500) when no config provided.""" # Fixture creates server with no CLI override and no TTS stage diff --git a/vllm_omni/entrypoints/openai/protocol/audio.py b/vllm_omni/entrypoints/openai/protocol/audio.py index 89d2dc02f6..8468efd861 100644 --- a/vllm_omni/entrypoints/openai/protocol/audio.py +++ b/vllm_omni/entrypoints/openai/protocol/audio.py @@ -2,7 +2,7 @@ from typing import Literal import numpy as np -from pydantic import BaseModel, Field, field_validator, model_validator +from pydantic import AliasChoices, BaseModel, Field, 
field_validator, model_validator _MAX_EMBEDDING_DIM = 8192 @@ -10,8 +10,12 @@ class OpenAICreateSpeechRequest(BaseModel): input: str model: str | None = None + # Accept both "voice" (OpenAI convention) and "speaker" (model/internal + # convention) as input keys. Intentionally global — all TTS backends + # (Qwen3-TTS, Voxtral, Fish Speech) use this field for the speaker name. voice: str | None = Field( default=None, + validation_alias=AliasChoices("voice", "speaker"), description="Speaker/voice to use. For Qwen3-TTS: vivian, ryan, aiden, etc.", ) instructions: str | None = Field( diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 10c5fdacc5..a4b0293932 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -467,6 +467,48 @@ def _get_uploaded_audio_data(self, voice_name: str) -> str | None: logger.error(f"Could not read audio file for voice {voice_name}: {e}") return None + def _get_uploaded_speaker_embedding(self, voice_name: str) -> list[float] | None: + """Load pre-computed speaker embedding for an uploaded voice. + + Returns the embedding as a list of floats, or None if the voice + was not uploaded with an embedding (i.e. it has audio instead). + """ + voice_name_lower = voice_name.lower() + if voice_name_lower not in self.uploaded_speakers: + return None + + speaker_info = self.uploaded_speakers[voice_name_lower] + if speaker_info.get("embedding_source") != "direct": + return None + + cache_file = speaker_info.get("cache_file") + if not cache_file or not Path(cache_file).exists(): + logger.warning("Embedding file not found for voice %s: %s", voice_name, cache_file) + return None + + if not _validate_path_within_directory(Path(cache_file), self.uploaded_speakers_dir): + logger.error("Cache file path traversal detected for voice %s: %s", voice_name, cache_file) + return None + + try: + from safetensors.torch import load_file + except ImportError: + logger.error( + "The 'safetensors' package is required to load speaker embeddings. 
" + "Install it with: pip install safetensors" + ) + return None + + try: + tensors = load_file(cache_file) + if "speaker_embedding" not in tensors: + logger.warning("Key 'speaker_embedding' not found in %s for voice %s", cache_file, voice_name) + return None + return tensors["speaker_embedding"].squeeze().tolist() + except Exception as e: + logger.error("Could not load embedding for voice %s: %s", voice_name, e) + return None + async def upload_voice( self, audio_file: UploadFile, @@ -858,11 +900,17 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str # voice is not None voice_lower = request.voice.lower() if voice_lower in self.uploaded_speakers: - # Check if audio file exists for uploaded speaker + # Check if data file exists for uploaded speaker speaker_info = self.uploaded_speakers[voice_lower] file_path = Path(speaker_info["file_path"]) if not file_path.exists(): - return f"Audio file for uploaded speaker '{request.voice}' not found on disk" + return f"Data file for uploaded speaker '{request.voice}' not found on disk" + # For embedding-uploaded voices, verify the cache is ready + if speaker_info.get("embedding_source") == "direct": + cache_file = speaker_info.get("cache_file") + if not cache_file or not Path(cache_file).exists(): + status = speaker_info.get("cache_status", "unknown") + return f"Speaker embedding for '{request.voice}' is not yet ready (cache_status='{status}')" else: # need ref_audio for built-in speaker if request.ref_audio is None: @@ -1107,20 +1155,32 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any # Uploaded voices use task_type="Base" (CustomVoice requires built-in spk_id). # If ref_text was provided at upload time, use in-context cloning; otherwise x_vector only. if request.voice.lower() in self.uploaded_speakers and request.ref_audio is None: - audio_data = self._get_uploaded_audio_data(request.voice) - if not audio_data: - raise ValueError(f"Audio file for uploaded voice '{request.voice}' is missing or corrupted") speaker_info = self.uploaded_speakers[request.voice.lower()] - stored_ref_text = speaker_info.get("ref_text") - params["ref_audio"] = [audio_data] - params["task_type"] = ["Base"] - params["voice_created_at"] = [speaker_info.get("created_at", 0)] - if stored_ref_text: - params["ref_text"] = [stored_ref_text] - params["x_vector_only_mode"] = [False] + + # Check if this voice was uploaded with a pre-computed embedding. + # Populate request.speaker_embedding so the existing code path + # (below) handles voice_clone_prompt and x_vector_only_mode. 
+ embedding = self._get_uploaded_speaker_embedding(request.voice) + if embedding is not None: + request.speaker_embedding = embedding + params["task_type"] = ["Base"] + logger.info("Auto-set speaker_embedding for uploaded voice: %s", request.voice) else: - params["x_vector_only_mode"] = [True] - logger.info("Auto-set ref_audio for uploaded voice: %s (icl=%s)", request.voice, bool(stored_ref_text)) + audio_data = self._get_uploaded_audio_data(request.voice) + if not audio_data: + raise ValueError(f"Audio file for uploaded voice '{request.voice}' is missing or corrupted") + stored_ref_text = speaker_info.get("ref_text") + params["ref_audio"] = [audio_data] + params["task_type"] = ["Base"] + params["voice_created_at"] = [speaker_info.get("created_at", 0)] + if stored_ref_text: + params["ref_text"] = [stored_ref_text] + params["x_vector_only_mode"] = [False] + else: + params["x_vector_only_mode"] = [True] + logger.info( + "Auto-set ref_audio for uploaded voice: %s (icl=%s)", request.voice, bool(stored_ref_text) + ) elif params["task_type"][0] == "CustomVoice": params["speaker"] = ["Vivian"] # Default for CustomVoice From c9dbc0955ba37843c576b46842acd2212c56b7a9 Mon Sep 17 00:00:00 2001 From: pikaxinge <68273313+pikaxinge@users.noreply.github.com> Date: Wed, 8 Apr 2026 00:48:26 +0800 Subject: [PATCH 077/204] [Bugfix] Prevent Silent Stage Dropouts: fix coordinator reconnect bug, close/update race, and heartbeat stall (#1899) Signed-off-by: pikaxinge <2392811793@qq.com> Co-authored-by: Alicia <115451386+congw729@users.noreply.github.com> --- .../test_omni_coord_client_for_stage.py | 204 +++++++++++++++++- .../omni_coord_client_for_stage.py | 143 ++++++------ 2 files changed, 285 insertions(+), 62 deletions(-) diff --git a/tests/distributed/omni_coordinator/test_omni_coord_client_for_stage.py b/tests/distributed/omni_coordinator/test_omni_coord_client_for_stage.py index b74a48f49c..0ba19c7fff 100644 --- a/tests/distributed/omni_coordinator/test_omni_coord_client_for_stage.py +++ b/tests/distributed/omni_coordinator/test_omni_coord_client_for_stage.py @@ -2,13 +2,20 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json +import threading +import pytest import zmq from vllm_omni.distributed.omni_coordinator import ( OmniCoordClientForStage, StageStatus, ) +from vllm_omni.distributed.omni_coordinator import ( + omni_coord_client_for_stage as stage_client_module, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] def _bind_router() -> tuple[zmq.Context, zmq.Socket, str]: @@ -19,7 +26,8 @@ def _bind_router() -> tuple[zmq.Context, zmq.Socket, str]: return ctx, router, endpoint -def _recv_event(router: zmq.Socket) -> dict: +def _recv_event(router: zmq.Socket, timeout_ms: int = 2000) -> dict: + assert router.poll(timeout=timeout_ms) != 0, "Timed out waiting for coordinator event" frames = router.recv_multipart() # ROUTER adds identity frame; the last frame is the payload. payload = frames[-1] @@ -108,3 +116,197 @@ def test_stage_client_close_sends_down_status(): router.close(0) ctx.term() + + +def test_stage_client_reconnects_after_send_failure(mocker): + """Verify send failure path invokes reconnect before retrying send.""" + ctx, router, endpoint = _bind_router() + + client = OmniCoordClientForStage( + endpoint, + "tcp://stage:reconnect-in", + "tcp://stage:reconnect-out", + 0, + ) + + # Discard initial registration event from the real socket. 
+ _recv_event(router) + + class _FlakySocket: + def __init__(self): + self.send_calls = 0 + self.closed = False + + def send(self, *_args, **_kwargs): + self.send_calls += 1 + if self.send_calls == 1: + raise RuntimeError("simulated send failure") + + def close(self, *_args, **_kwargs): + self.closed = True + + flaky_socket = _FlakySocket() + client._socket = flaky_socket + client._reconnect = mocker.Mock(return_value=True) + + client.update_info(queue_length=1) + + client._reconnect.assert_called_once_with(max_retries=3) + assert flaky_socket.send_calls == 2 + + client.close() + router.close(0) + ctx.term() + + +def test_stage_client_raises_when_reconnect_fails(mocker): + """Verify send failure is propagated when reconnect cannot recover.""" + ctx, router, endpoint = _bind_router() + + client = OmniCoordClientForStage( + endpoint, + "tcp://stage:reconnect-fail-in", + "tcp://stage:reconnect-fail-out", + 0, + ) + + # Discard initial registration event from the real socket. + _recv_event(router) + + class _AlwaysFailSocket: + def send(self, *_args, **_kwargs): + raise RuntimeError("simulated send failure") + + def close(self, *_args, **_kwargs): + pass + + client._socket = _AlwaysFailSocket() + client._reconnect = mocker.Mock(return_value=False) + + with pytest.raises(RuntimeError, match="simulated send failure"): + client.update_info(queue_length=2) + + client._reconnect.assert_called_once_with(max_retries=3) + client.close() + router.close(0) + ctx.term() + + +def test_stage_client_close_handles_runtime_error_in_final_update(mocker): + """Verify close() still releases resources when final update raises RuntimeError.""" + ctx, router, endpoint = _bind_router() + + client = OmniCoordClientForStage( + endpoint, + "tcp://stage:close-runtime-in", + "tcp://stage:close-runtime-out", + 0, + ) + + # Discard initial registration event from the real socket. + _recv_event(router) + + client._send_event = mocker.Mock(side_effect=RuntimeError("simulated close-time failure")) + client.close() + + assert client._closed + assert client._socket.closed + + router.close(0) + ctx.term() + + +def test_reconnect_respects_retry_limit(monkeypatch): + """Verify _reconnect stops after max_retries on repeated failures.""" + attempts = {"connect": 0} + + class _FailSocket: + def close(self, *_args, **_kwargs): + pass + + def connect(self, *_args, **_kwargs): + attempts["connect"] += 1 + raise zmq.ZMQError("simulated reconnect failure") + + class _FailContext: + def socket(self, *_args, **_kwargs): + return _FailSocket() + + def term(self): + pass + + client = OmniCoordClientForStage.__new__(OmniCoordClientForStage) + client._closed = False + client._coord_zmq_addr = "tcp://127.0.0.1:9999" + client._stop_event = threading.Event() + client._send_lock = threading.RLock() + client._socket = _FailSocket() + client._ctx = _FailContext() + + monkeypatch.setattr(stage_client_module.zmq, "Context", lambda: _FailContext()) + monkeypatch.setattr(stage_client_module.time, "sleep", lambda *_args, **_kwargs: None) + + assert client._reconnect(max_retries=3, retry_interval=5.0) is False + assert attempts["connect"] == 3 + + +def test_heartbeat_loop_retries_after_transient_send_failure(): + """Verify heartbeat loop continues after one transient send failure.""" + + class _FakeStopEvent: + def __init__(self): + self.wait_calls = 0 + self._set = False + + def wait(self, timeout=None): + _ = timeout + self.wait_calls += 1 + # Run two loop iterations, then stop. 
+ return self._set or self.wait_calls >= 3 + + def is_set(self): + return self._set + + def set(self): + self._set = True + + client = OmniCoordClientForStage.__new__(OmniCoordClientForStage) + client._closed = False + client._heartbeat_interval = 0.0 + client._stop_event = _FakeStopEvent() + + calls = {"count": 0} + + def _fake_send(event_type): + assert event_type == "heartbeat" + calls["count"] += 1 + if calls["count"] == 1: + raise RuntimeError("transient heartbeat failure") + + client._send_event = _fake_send + + client._heartbeat_loop() + + assert calls["count"] == 2 + + +def test_update_info_rejected_while_closing(): + """Verify update_info is rejected once client enters closing state.""" + ctx, router, endpoint = _bind_router() + + client = OmniCoordClientForStage( + endpoint, + "tcp://stage:closing-in", + "tcp://stage:closing-out", + 0, + ) + _recv_event(router) + + client._closing = True + with pytest.raises(RuntimeError, match="closing"): + client.update_info(queue_length=3) + + client._closing = False + client.close() + router.close(0) + ctx.term() diff --git a/vllm_omni/distributed/omni_coordinator/omni_coord_client_for_stage.py b/vllm_omni/distributed/omni_coordinator/omni_coord_client_for_stage.py index cd5c357bb4..cd3c99ab81 100644 --- a/vllm_omni/distributed/omni_coordinator/omni_coord_client_for_stage.py +++ b/vllm_omni/distributed/omni_coordinator/omni_coord_client_for_stage.py @@ -45,9 +45,10 @@ def __init__( self._status = StageStatus.UP self._queue_length = 0 self._closed = False + self._closing = False self._heartbeat_interval = 5.0 self._stop_event = threading.Event() - self._send_lock = threading.Lock() + self._send_lock = threading.RLock() self._send_event("update") @@ -57,38 +58,45 @@ def __init__( ) self._heartbeat_thread.start() - def _reconnect(self) -> bool: + def _reconnect(self, max_retries: int = 3, retry_interval: float = 5.0) -> bool: """Best-effort reconnect with up to ``max_retries`` attempts. - Each attempt closes the current socket/context, sleeps 5 seconds, - then creates a new DEALER socket and reconnects to the coordinator. - Caller must hold ``_send_lock``. + Each attempt closes the current socket/context, sleeps ``retry_interval`` + seconds, then creates a new DEALER socket and reconnects to the coordinator. Returns True on success, False if all attempts fail. 
""" - while not self._stop_event.is_set() and not self._closed: - try: - self._socket.close(0) - except zmq.ZMQError: - pass - try: - self._ctx.term() - except zmq.ZMQError: - pass + if max_retries <= 0: + return False - time.sleep(5.0) + for attempt in range(1, max_retries + 1): + with self._send_lock: + if self._stop_event.is_set() or self._closed: + return False + try: + self._socket.close(0) + except zmq.ZMQError: + pass + try: + self._ctx.term() + except zmq.ZMQError: + pass - try: - self._ctx = zmq.Context() - self._socket = self._ctx.socket(zmq.DEALER) - self._socket.connect(self._coord_zmq_addr) - return True - except zmq.ZMQError as e: - logger.error( - "Stage client reconnect failed, will retry in 5s (coord=%s)", - self._coord_zmq_addr, - exc_info=e, - ) - continue + try: + self._ctx = zmq.Context() + self._socket = self._ctx.socket(zmq.DEALER) + self._socket.connect(self._coord_zmq_addr) + return True + except zmq.ZMQError as e: + logger.error( + "Stage client reconnect failed (attempt=%d/%d, coord=%s)", + attempt, + max_retries, + self._coord_zmq_addr, + exc_info=e, + ) + + if retry_interval > 0: + time.sleep(retry_interval) return False def _send_event(self, event_type: str) -> None: @@ -102,20 +110,20 @@ def _send_event(self, event_type: str) -> None: to 3 times (5s sleep each) and retries the send once after a successful reconnect. Raises if reconnect or the retry send fails. """ - if self._closed: - raise RuntimeError("Client already closed") - - event = InstanceEvent( - input_addr=self._input_addr, - output_addr=self._output_addr, - stage_id=self._stage_id, - event_type=event_type, - status=self._status, - queue_length=self._queue_length, - ) - data = json.dumps(asdict(event)).encode("utf-8") - with self._send_lock: + if self._closed: + raise RuntimeError("Client already closed") + + event = InstanceEvent( + input_addr=self._input_addr, + output_addr=self._output_addr, + stage_id=self._stage_id, + event_type=event_type, + status=self._status, + queue_length=self._queue_length, + ) + data = json.dumps(asdict(event)).encode("utf-8") + try: self._socket.send(data, flags=zmq.NOBLOCK) return @@ -124,7 +132,7 @@ def _send_event(self, event_type: str) -> None: return except (RuntimeError, zmq.ZMQError) as e: # First send failed; try reconnecting a few times. 
- if not self._reconnect: + if not self._reconnect(max_retries=3): logger.error("Failed to send event and reconnect to coordinator", exc_info=e) raise @@ -149,12 +157,16 @@ def update_info( if status is None and queue_length is None: raise ValueError("At least one of status or queue_length must be provided") - if status is not None: - self._status = status - if queue_length is not None: - self._queue_length = queue_length + with self._send_lock: + if self._closed or self._closing: + raise RuntimeError("Client is closing or already closed") + + if status is not None: + self._status = status + if queue_length is not None: + self._queue_length = queue_length - self._send_event("update") + self._send_event("update") def _heartbeat_loop(self) -> None: """Periodically send heartbeat events while the client is alive.""" @@ -164,8 +176,11 @@ def _heartbeat_loop(self) -> None: try: self._send_event("heartbeat") - except (RuntimeError, zmq.ZMQError): - break + except (RuntimeError, zmq.ZMQError) as e: + if self._closed or self._stop_event.is_set(): + break + logger.warning("Heartbeat send failed; will retry on next interval", exc_info=e) + continue def close(self) -> None: """Send a final down event and close the underlying socket.""" @@ -177,17 +192,23 @@ def close(self) -> None: if hasattr(self, "_heartbeat_thread"): self._heartbeat_thread.join(timeout=1.0) - # Mark status as DOWN and send one last update. - self._status = StageStatus.DOWN - try: - self._send_event("update") - except zmq.ZMQError: - pass # Socket may already be broken, proceed with close + with self._send_lock: + if self._closed: + raise RuntimeError("Client already closed") - # Close DEALER socket and terminate this client's context. - self._socket.close(0) - try: - self._ctx.term() - except zmq.ZMQError: - pass - self._closed = True + self._closing = True + + # Mark status as DOWN and send one last update. + self._status = StageStatus.DOWN + try: + self._send_event("update") + except (RuntimeError, zmq.ZMQError): + pass # Socket may already be broken, proceed with close + + # Close DEALER socket and terminate this client's context. + self._socket.close(0) + try: + self._ctx.term() + except zmq.ZMQError: + pass + self._closed = True From bc5e94554410aa3e85fc85e0544269b0792fd494 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Tue, 7 Apr 2026 14:50:11 -0700 Subject: [PATCH 078/204] [release] Fix release script (#2566) Signed-off-by: khluu --- .../scripts/generate-and-upload-nightly-index.sh | 11 ++++++----- .buildkite/scripts/generate-nightly-index.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.buildkite/scripts/generate-and-upload-nightly-index.sh b/.buildkite/scripts/generate-and-upload-nightly-index.sh index 6624af3230..b09c13f5cf 100755 --- a/.buildkite/scripts/generate-and-upload-nightly-index.sh +++ b/.buildkite/scripts/generate-and-upload-nightly-index.sh @@ -19,7 +19,7 @@ has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) if [[ "$has_new_python" -eq 0 ]]; then # use new python from docker docker pull python:3-slim - PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3" + PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3" fi echo "Using python interpreter: $PYTHON" @@ -36,7 +36,7 @@ mkdir -p "$INDICES_OUTPUT_DIR" # HACK: we do not need regex module here, but it is required by pre-commit hook # To avoid any external dependency, we simply replace it back to the stdlib re module -sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py +sed -i.bak 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py && rm -f .buildkite/scripts/generate-nightly-index.py.bak # Generate indices -- the version is just the commit hash (not omni/{commit}) # because relative paths are computed between the index and wheel directories, @@ -73,15 +73,16 @@ echo "Pure version (without variant): $pure_version" # re-generate and copy to /omni/{version}/ only if it does not have "dev" in the version if [[ "$version" != *"dev"* ]]; then - echo "Re-generating indices for /omni/$pure_version/" + s3_version="v$pure_version" + echo "Re-generating indices for /omni/$s3_version/" rm -rf "${INDICES_OUTPUT_DIR:?}" mkdir -p "$INDICES_OUTPUT_DIR" # wheel-dir is overridden to be the commit directory, so that the indices point to the correct wheel path $PYTHON .buildkite/scripts/generate-nightly-index.py \ - --version "$pure_version" \ + --version "$s3_version" \ --wheel-dir "$BUILDKITE_COMMIT" \ --current-objects "$obj_json" \ --output-dir "$INDICES_OUTPUT_DIR" \ --comment "version $pure_version" - aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/omni/$pure_version/" + aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/omni/$s3_version/" fi diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index c616c446b0..b78df41a19 100755 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -11,7 +11,7 @@ from typing import Any from urllib.parse import quote -import regex as re +import re def normalize_package_name(name: str) -> str: From b246617fe51700d9692d8fdecf080d806019cfa7 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Tue, 7 Apr 2026 18:47:42 -0700 Subject: [PATCH 079/204] [release] Fix lint issue (#2567) Signed-off-by: khluu --- .buildkite/scripts/generate-nightly-index.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index b78df41a19..bb4a74a704 100755 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -4,6 +4,7 @@ import argparse import json +import re import sys from dataclasses import asdict, dataclass from datetime import datetime @@ -11,8 +12,6 @@ from typing import Any from urllib.parse import quote -import re - def normalize_package_name(name: str) -> str: """Normalize package name per PEP 503.""" From 8a55d3d01f3463748519351847ac45a1d4ce6d60 Mon Sep 17 00:00:00 2001 From: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com> Date: Wed, 8 Apr 2026 09:55:09 +0800 Subject: [PATCH 080/204] [Feat] Enable Layerwise CPU offloading for SD3.5, Ovis-Image, Nextstep_1, LongCat-Image (#2339) Signed-off-by: Yuanheng Zhao Signed-off-by: yuanheng --- docs/user_guide/diffusion/cpu_offload_diffusion.md | 12 ++++++++---- docs/user_guide/diffusion_features.md | 10 +++++----- .../offline_inference/text_to_image/text_to_image.py | 2 +- .../longcat_image/longcat_image_transformer.py | 1 + .../models/nextstep_1_1/modeling_nextstep.py | 2 ++ .../models/ovis_image/ovis_image_transformer.py | 1 + vllm_omni/diffusion/models/sd3/sd3_transformer.py | 1 + vllm_omni/diffusion/offloader/module_collector.py | 2 +- 8 files changed, 20 insertions(+), 11 deletions(-) diff --git a/docs/user_guide/diffusion/cpu_offload_diffusion.md b/docs/user_guide/diffusion/cpu_offload_diffusion.md index be72efffa5..f80005ccb7 100644 --- a/docs/user_guide/diffusion/cpu_offload_diffusion.md +++ b/docs/user_guide/diffusion/cpu_offload_diffusion.md @@ -139,11 +139,15 @@ Factory function `get_offload_backend()` selects the appropriate backend based o ## Supported Models -| Architecture | Example Models | DiT Class | Model-Level Offload | Layerwise Offload | Blocks Attr (Layerwise specific) | -|--------------|----------------|-----------|---------------------|-------------------|-------------| -| Wan22Pipeline | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `WanTransformer3DModel` | ✓ | ✓ | `"blocks"` | -| Wan22I2VPipeline | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | `WanTransformer3DModel` | ✓ | ✓ | `"blocks"` | +| Architecture | Example Models | DiT Class | Model-Level Offload | Layerwise Offload | Blocks Attrs (Layerwise specific) | +|--------------|----------------|-----------|---------------------|-------------------|-----------------------------------| +| LongCatImagePipeline | `meituan-longcat/LongCat-Image` | `LongCatImageTransformer2DModel` | - | ✓ | `"transformer_blocks"`, `"single_transformer_blocks"` | +| NextStep11Pipeline | `stepfun-ai/NextStep-1.1` | `NextStepModel` | - | ✓ | `"layers"` | +| OvisImagePipeline | `AIDC-AI/Ovis-Image-7B` | `OvisImageTransformer2DModel` | - | ✓ | `"transformer"` | | QwenImagePipeline | `Qwen/Qwen-Image` | `QwenImageTransformer2DModel` | ✓ | ✓ | `"transformer_blocks"` | +| StableDiffusion3Pipeline | `stabilityai/stable-diffusion-3.5-medium` | `SD3Transformer2DModel` | - | ✓ | `"transformer_blocks"` | +| Wan22I2VPipeline | `Wan-AI/Wan2.2-I2V-A14B-Diffusers` | `WanTransformer3DModel` | ✓ | ✓ | `"blocks"` | +| Wan22Pipeline | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | `WanTransformer3DModel` | ✓ | ✓ | `"blocks"` | **Notes:** - Model-Level 
Offloading is expected to be supported by all common diffusion models (DiT and encoders) naturally diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index d4d9ce6a3d..2f04e35687 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -107,19 +107,19 @@ The following tables show which models support each feature: | **FLUX.2-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **GLM-Image** | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **HunyuanImage3** | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | -| **LongCat-Image** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | -| **LongCat-Image-Edit** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **LongCat-Image** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | +| **LongCat-Image-Edit** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | | **MagiHuman** | ❌ | ❌ | ❌ | ❓ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | | **MammothModa2(T2I)** | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| **Nextstep_1(T2I)** | ❓ | ❓ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **Nextstep_1(T2I)** | ❓ | ❓ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | | **OmniGen2** | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| **Ovis-Image** | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **Ovis-Image** | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | | **Qwen-Image** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ✅ | | **Qwen-Image-2512** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ✅ | | **Qwen-Image-Edit** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ❌ | ❌ | | **Qwen-Image-Edit-2509** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ❌ | ❌ | | **Qwen-Image-Layered** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ❌ | ❌ | -| **Stable-Diffusion3.5** | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ (decode) | ❌ | ❌ | +| **Stable-Diffusion3.5** | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ (decode) | ❌ | ❌ | | **Z-Image** | ✅ | ✅ | ✅ | ❓ | ✅ (TP=2 only) | ✅ | ❌ | ✅ (decode) | ✅ | ❌ | > Notes: diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py index 42e44abb89..615e4067ed 100644 --- a/examples/offline_inference/text_to_image/text_to_image.py +++ b/examples/offline_inference/text_to_image/text_to_image.py @@ -376,7 +376,7 @@ def main(): f"vae_patch_parallel_size={args.vae_patch_parallel_size}, " f"enable_expert_parallel={args.enable_expert_parallel}." 
) - print(f" CPU offload: {args.enable_cpu_offload}") + print(f" CPU offload: {args.enable_cpu_offload}; CPU Layerwise Offload: {args.enable_layerwise_offload}") print(f" Image size: {args.width}x{args.height}") if args.lora_path: print(f" LoRA: scale={args.lora_scale}") diff --git a/vllm_omni/diffusion/models/longcat_image/longcat_image_transformer.py b/vllm_omni/diffusion/models/longcat_image/longcat_image_transformer.py index 8d8e523d60..8f0ff446af 100644 --- a/vllm_omni/diffusion/models/longcat_image/longcat_image_transformer.py +++ b/vllm_omni/diffusion/models/longcat_image/longcat_image_transformer.py @@ -582,6 +582,7 @@ class LongCatImageTransformer2DModel(nn.Module): """ _repeated_blocks = ["LongCatImageTransformerBlock", "LongCatImageSingleTransformerBlock"] + _layerwise_offload_blocks_attrs = ["transformer_blocks", "single_transformer_blocks"] # Sequence Parallelism for LongCat (following diffusers' _cp_plan pattern) _sp_plan = { diff --git a/vllm_omni/diffusion/models/nextstep_1_1/modeling_nextstep.py b/vllm_omni/diffusion/models/nextstep_1_1/modeling_nextstep.py index ded3079265..d2b3eb81e3 100644 --- a/vllm_omni/diffusion/models/nextstep_1_1/modeling_nextstep.py +++ b/vllm_omni/diffusion/models/nextstep_1_1/modeling_nextstep.py @@ -114,6 +114,8 @@ def from_json(cls, path: str) -> NextStepConfig: class NextStepModel(nn.Module): + _layerwise_offload_blocks_attrs = ["layers"] + def __init__(self, config: NextStepConfig): super().__init__() self.config = config diff --git a/vllm_omni/diffusion/models/ovis_image/ovis_image_transformer.py b/vllm_omni/diffusion/models/ovis_image/ovis_image_transformer.py index bd2a3b4834..0e98729c3d 100644 --- a/vllm_omni/diffusion/models/ovis_image/ovis_image_transformer.py +++ b/vllm_omni/diffusion/models/ovis_image/ovis_image_transformer.py @@ -366,6 +366,7 @@ class OvisImageTransformer2DModel(nn.Module): """ _repeated_blocks = ["OvisImageTransformerBlock", "OvisImageSingleTransformerBlock"] + _layerwise_offload_blocks_attrs = ["transformer_blocks", "single_transformer_blocks"] def __init__( self, diff --git a/vllm_omni/diffusion/models/sd3/sd3_transformer.py b/vllm_omni/diffusion/models/sd3/sd3_transformer.py index 308bd35a13..89f0615775 100644 --- a/vllm_omni/diffusion/models/sd3/sd3_transformer.py +++ b/vllm_omni/diffusion/models/sd3/sd3_transformer.py @@ -387,6 +387,7 @@ class SD3Transformer2DModel(nn.Module): """ _repeated_blocks = ["SD3TransformerBlock"] + _layerwise_offload_blocks_attrs = ["transformer_blocks"] def __init__( self, diff --git a/vllm_omni/diffusion/offloader/module_collector.py b/vllm_omni/diffusion/offloader/module_collector.py index d9d21b939a..a09a337001 100644 --- a/vllm_omni/diffusion/offloader/module_collector.py +++ b/vllm_omni/diffusion/offloader/module_collector.py @@ -21,7 +21,7 @@ class PipelineModules: class ModuleDiscovery: """Discovers pipeline components for offloading""" - DIT_ATTRS = ["transformer", "transformer_2", "dit", "sr_dit", "language_model", "transformer_blocks"] + DIT_ATTRS = ["transformer", "transformer_2", "dit", "sr_dit", "language_model", "transformer_blocks", "model"] ENCODER_ATTRS = ["text_encoder", "text_encoder_2", "text_encoder_3", "image_encoder"] VAE_ATTRS = ["vae", "audio_vae"] From 6433847249a6aef8deafd1210958698376fc39e0 Mon Sep 17 00:00:00 2001 From: skf <54565339+skf-1999@users.noreply.github.com> Date: Wed, 8 Apr 2026 10:35:14 +0800 Subject: [PATCH 081/204] [skipCI][Docs] Add expert_parallel.md (#2471) Signed-off-by: skf1999 <13234016272@163.com> Co-authored-by: Canlin Guo --- 
docs/.nav.yml | 1 + docs/design/feature/expert_parallel.md | 221 +++++++++++++++++++++++++ 2 files changed, 222 insertions(+) create mode 100644 docs/design/feature/expert_parallel.md diff --git a/docs/.nav.yml b/docs/.nav.yml index a4939961e8..86ce4a3b0c 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -98,6 +98,7 @@ nav: - design/feature/ray_based_execution.md - design/feature/omni_connectors/ - design/feature/cfg_parallel.md + - design/feature/expert_parallel.md - design/feature/sequence_parallel.md - design/feature/tensor_parallel.md - design/feature/vae_parallel.md diff --git a/docs/design/feature/expert_parallel.md b/docs/design/feature/expert_parallel.md new file mode 100644 index 0000000000..9a7c4cdbac --- /dev/null +++ b/docs/design/feature/expert_parallel.md @@ -0,0 +1,221 @@ +# Expert Parallel + +This section describes how to add Expert Parallel (EP) to a diffusion transformer that uses Mixture-of-Experts (MoE) layers. +We use **HunyuanImage3.0** as the reference implementation. + +--- + +## Table of Contents + +- [Overview](#overview) +- [Step-by-Step Implementation](#step-by-step-implementation) +- [Testing](#testing) +- [Reference Implementations](#reference-implementations) +- [Summary](#summary) + +--- + +## Overview + +### What is Expert Parallel? + +**Expert Parallel** is a parallelism strategy in Mixture-of-Experts (MoE) models that distributes different expert networks across distinct computational devices. Each device holds and computes only a subset of experts (local experts), with tokens dispatched to and gathered from remote devices via collective communication operations (e.g., All-to-All, All-Gather). + +| Backend | Description | +|---------|-------------| +| `allgather_reducescatter` | Default backend based on allgather/reducescatter primitives, suitable for general EP+DP deployments.| + +## Configuration + +Enable EP by setting the `--enable-expert-parallel` flag. The EP size is automatically calculated as: + +```text +EP_SIZE = TP_SIZE × SP_SIZE × CFG_SIZE × DP_SIZE +``` + + +Where: + +- `TP_SIZE`: Tensor parallel size +- `SP_SIZE`: Sequence parallel size +- `CFG_SIZE`: Classifier-free guidance parallel size +- `DP_SIZE`: Data parallel size +- `EP_SIZE`: Expert parallel size (computed automatically) + +Note: +- Expert parallelism is only applicable to Mixture-of-Experts (MoE) models. +- The EP group is created **per pipeline stage**, meaning it includes all ranks that participate in model parallelism except pipeline parallelism. +- The underlying communication pattern for expert parallelism is **All-to-All** among the ranks in the EP group. + +For example, consider a configuration with `TP=2`, `SP=1`, `CFG=2`, and `DP=4` (total 2×1×2×4 = 16 GPUs). + +- Expert layers are handled by an EP group of size 16. + +- Attention layers use tensor parallelism of size 2 within each of the 8 DP groups (because `DP×CFG×SP = 4×2×1 = 8` groups, each containing the 2 TP ranks). Inside each such group, the attention weights are sharded across the 2 GPUs. + + +## Step-by-Step Implementation + +### Step 1: Configure Expert Parallelism Settings + +Calculate local experts per rank: + +``` +ep_size = 8 # Expert Parallel size (typically equals TP size) +num_experts = 64 +num_local_experts = num_experts // ep_size # 8 experts per card + +# Check divisibility +assert num_experts % ep_size == 0, "Experts must be divisible by EP size" +``` + +### Step 2: Use Sparse MoE Block to enable EP routing. 
+ +Example: +``` +from vllm.model_executor.layers.linear import ReplicatedLinear +class HunYuanSparseMoeBlock(nn.Module): + def __init__( + self, + config: PretrainedConfig, + layer_id: int = -1, + prefix: str = "", + ): + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.n_routed_experts = config.num_experts # 64 + + # Calculate local experts per rank (key for EP) + if self.tp_size > self.n_routed_experts: + raise ValueError(f"TP size {self.tp_size} > experts {self.n_routed_experts}") + + # Routing gate (replicated on all ranks, computes scores for all tokens to all experts) + self.gate = ReplicatedLinear( + config.hidden_size, + config.num_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate", + ) + + # EP expert layer (factory loads platform-specific implementation) + self.experts = HunyuanFusedMoE(...) +``` +**Key Points:** +- gate is **ReplicatedLinear** (replicated on all ranks) +- experts is created via **HunyuanFusedMoE factory**, which automatically handles EP dispatch + +### Step 3: Initialize EP Runtime + +Initialize the EP communication context before model loading. +``` +from vllm.utils.import_utils import resolve_obj_by_qualname +# Call during __init__ or model loading +op_name = "hunyuan_fused_moe" + +# Prepare EP runtime: establish communication groups, assign local expert indices, init _expert_map +current_omni_platform.prepare_diffusion_op_runtime(op_name) + +# Factory automatically resolves platform implementation (GPU: FusedMoE / NPU: AscendFusedMoE) +impl = resolve_obj_by_qualname( + current_omni_platform.get_diffusion_model_impl_qualname(op_name) +) +``` + +### Step 4: Expert Weight Mapping & Loading + +Each rank loads only the expert weights assigned to its local allocation. +``` +# Get expert parameter mapping (different per rank) +expert_mapping = HunyuanFusedMoE.make_expert_params_mapping( + model=self, + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=64, + num_redundant_experts=0, +) +# Returns: [(param_name, weight_name, expert_id, shard_id), ...] +# Note: Each rank only contains mappings for its local expert_ids + +# Filter non-local experts during loading +for name, loaded_weight in weights: + if "mlp.experts" in name: + # Parse expert_id from weight name (implementation needed) + expert_id = parse_expert_id_from_name(name) + local_expert_start = (ep_rank) * num_local_experts + local_expert_end = (ep_rank + 1) * num_local_experts + + if not (local_expert_start <= expert_id < local_expert_end): + continue # Skip non-local expert weights +``` +### Step 5: Forward Pass with EP + +Example (MoE Forward): +``` +def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + orig_shape = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + + # 1. Global routing computation (all tokens, all expert scores) + # hidden_states: [num_tokens, hidden_dim] (full tensor) + router_logits, _ = self.gate(hidden_states) # [num_tokens, num_experts] + + # 2. EP dispatch and compute (HunyuanFusedMoE handles all_to_all internally) + # - Dispatch: Send tokens to target ranks based on router_logits + # - Local Compute: Each rank processes only its num_local_experts + # - Combine: Results returned to original token positions + final_hidden_states = self.experts( + hidden_states=hidden_states, + router_logits=router_logits, + ) + + # 3. 
Add shared expert output (not EP, computed on all ranks) + if self.shared_mlp is not None: + shared_out = self.shared_mlp(hidden_states) + final_hidden_states = final_hidden_states + shared_out + + # 4. Tensor Parallel All-Reduce (synchronize across TP group) + if self.tp_size > 1: + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states + ) + + return final_hidden_states.view(orig_shape) +``` + +## Testing +After adding Expert Parallel support, test via command line: +```bash +cd examples/offline_inference/text_to_image +python text_to_image.py \ + --model Your-org/your-model \ + --prompt "a cup of coffee on the table" \ + --output "ep_enabled.png" \ + --num-inference-steps 50 \ + --guidance-scale 5.0 \ + --tensor-parallel-size 8 \ + --seed 1234 \ + --enable-expert-parallel +``` + +vLLM‑Omni currently focuses on core diffusion model inference acceleration, so the Expert Parallel implementation includes only the basic multi‑GPU expert sharding functionality (enabled via --enable-expert-parallel). Advanced features such as communication backend selection (--all2all-backend), load balancing (--enable-eplb and its configuration), and multi‑node deployment belong to the extended capabilities of the main vLLM project and have not yet been integrated into Omni. + +## Reference Implementations + +Complete examples in the codebase: + +| Model | Path | Pattern | Notes | +|-------|------|---------|-------| +| **HunyuanImage3.0** | `vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py` | Standard EP | Full implementation with validation | +| **EP Tests** | `vllm-omni/tests/e2e/offline_inference/test_expert_parallel.py` | E2E testing | EP correctness and performance | +| **Constraint Tests** | `vllm-omni/tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py` | Unit testing | Validation logic | + +--- +## Summary + +Adding Expert Parallel support to diffusion model: + +1. **Identify MoE layers** - Locate the router and expert networks in each transformer block. +2. **Validate EP constraints** – Ensure num_experts is divisible by expert_parallel_size. +3. **Test** - Run with enable-expert-parallel, check memory reduction, speedup, and output quality against single‑GPU baseline. 
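One piece the Step 4 weight-loading snippet leaves open is the weight-name parser (`parse_expert_id_from_name` is marked "implementation needed"). The sketch below is illustrative only: the helper name is taken from that snippet, and the `...mlp.experts.<expert_id>.<proj>.weight` naming layout is an assumption, so the regular expression should be adapted to the actual checkpoint naming scheme.

```
import re

# Hypothetical helper for Step 4: pull the expert id out of a checkpoint
# weight name such as "model.layers.3.mlp.experts.17.gate_proj.weight".
# The ".experts.<id>." layout is an assumption about the checkpoint format.
_EXPERT_ID_PATTERN = re.compile(r"\.experts\.(\d+)\.")


def parse_expert_id_from_name(weight_name: str) -> int:
    match = _EXPERT_ID_PATTERN.search(weight_name)
    if match is None:
        raise ValueError(f"No expert id found in weight name: {weight_name}")
    return int(match.group(1))


# With ep_rank=1 and num_local_experts=8, this rank keeps experts 8..15,
# so expert 17 in the name below would be skipped during weight loading.
assert parse_expert_id_from_name("model.layers.3.mlp.experts.17.gate_proj.weight") == 17
```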
From cb6a8739c19bb27483c43f9d5e567d2bcfc9f628 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zhengyuan=20Su=20=28=E8=8B=8F=E6=94=BF=E6=B8=8A=29?= Date: Wed, 8 Apr 2026 10:43:01 +0800 Subject: [PATCH 082/204] [Feature] Add trajectory recording to BAGEL denoising loop (#2483) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Zhengyuan Su Signed-off-by: Zhengyuan Su (苏政渊) Co-authored-by: Claude Co-authored-by: Samit <285365963@qq.com> --- .../models/bagel/test_trajectory_recording.py | 236 ++++++++++++++++++ vllm_omni/diffusion/data.py | 11 +- vllm_omni/diffusion/diffusion_engine.py | 8 + vllm_omni/diffusion/ipc.py | 32 ++- .../models/bagel/bagel_transformer.py | 77 +++++- .../diffusion/models/bagel/pipeline_bagel.py | 32 ++- vllm_omni/diffusion/stage_diffusion_proc.py | 21 ++ vllm_omni/entrypoints/omni_base.py | 5 + vllm_omni/outputs.py | 12 + 9 files changed, 412 insertions(+), 22 deletions(-) create mode 100644 tests/diffusion/models/bagel/test_trajectory_recording.py diff --git a/tests/diffusion/models/bagel/test_trajectory_recording.py b/tests/diffusion/models/bagel/test_trajectory_recording.py new file mode 100644 index 0000000000..7518388d28 --- /dev/null +++ b/tests/diffusion/models/bagel/test_trajectory_recording.py @@ -0,0 +1,236 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for BAGEL trajectory recording in the denoising loop.""" + +import types +from dataclasses import dataclass +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from vllm_omni.diffusion.models.bagel.bagel_transformer import ( + Bagel, + NaiveCache, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + +NUM_TOKENS = 8 +HIDDEN_DIM = 16 +NUM_TIMESTEPS = 5 +# generate_image uses timesteps[:-1], so actual steps = NUM_TIMESTEPS - 1 +EXPECTED_STEPS = NUM_TIMESTEPS - 1 + + +def _make_mock_bagel(): + """Create a mock Bagel with _forward_flow returning constant velocity.""" + mock = MagicMock(spec=Bagel) + mock._sp_size = 1 + + # _forward_flow returns a small constant velocity so x_t changes each step + def fake_forward_flow(self, x_t, **kwargs): + return torch.ones_like(x_t) * 0.1 + + mock._forward_flow = types.MethodType(fake_forward_flow, mock) + # _merge_naive_caches is called in the batched CFG path + mock._merge_naive_caches = types.MethodType(lambda self, caches: NaiveCache(1), mock) + + # Bind the real generate_image to our mock + mock.generate_image = types.MethodType(Bagel.generate_image, mock) + return mock + + +def _make_generate_args(num_tokens=NUM_TOKENS, hidden_dim=HIDDEN_DIM, cfg=False): + """Tensor arguments for generate_image. + + Args: + cfg: If True, enable batched CFG path (cfg_text_scale > 1.0). 
+ """ + seq_len = num_tokens + 2 # packed_seqlens includes 2 extra tokens + base = dict( + packed_text_ids=torch.zeros(2, dtype=torch.long), + packed_text_indexes=torch.tensor([0, 1], dtype=torch.long), + packed_init_noises=torch.randn(num_tokens, hidden_dim), + packed_vae_position_ids=torch.arange(num_tokens, dtype=torch.long), + packed_vae_token_indexes=torch.arange(2, seq_len, dtype=torch.long), + packed_seqlens=torch.tensor([seq_len], dtype=torch.int), + packed_position_ids=torch.arange(seq_len, dtype=torch.long), + packed_indexes=torch.arange(seq_len, dtype=torch.long), + past_key_values=NaiveCache(1), + key_values_lens=torch.tensor([0], dtype=torch.int), + packed_key_value_indexes=torch.zeros(0, dtype=torch.long), + num_timesteps=NUM_TIMESTEPS, + timestep_shift=1.0, + cfg_text_scale=1.0, + cfg_img_scale=1.0, + ) + if cfg: + base |= dict( + cfg_text_scale=4.0, + cfg_text_packed_query_indexes=torch.arange(seq_len, dtype=torch.long), + cfg_text_packed_position_ids=torch.arange(seq_len, dtype=torch.long), + cfg_text_past_key_values=NaiveCache(1), + cfg_text_key_values_lens=torch.tensor([0], dtype=torch.int), + cfg_text_packed_key_value_indexes=torch.zeros(0, dtype=torch.long), + ) + return base + + +@pytest.fixture(params=[False, True], ids=["no_cfg", "batched_cfg"]) +def bagel_and_args(request): + """Mock Bagel instance and generate_image arguments. + + Parametrized over CFG mode so every test runs on both the no-CFG + and batched-CFG code paths. + """ + cfg = request.param + with patch( + "vllm_omni.diffusion.models.bagel.bagel_transformer.get_classifier_free_guidance_world_size", + return_value=1, + ): + yield _make_mock_bagel(), _make_generate_args(cfg=cfg) + + +class TestTrajectoryRecording: + """Tests for trajectory latent/timestep recording in generate_image.""" + + def test_trajectory_disabled_returns_none(self, bagel_and_args): + bagel, args = bagel_and_args + + unpacked, trajectory_latents, trajectory_timesteps, trajectory_log_probs = bagel.generate_image( + **args, return_trajectory_latents=False + ) + + assert isinstance(unpacked, (list, tuple)) + assert len(unpacked) == 1 # one sequence + assert trajectory_latents is None + assert trajectory_timesteps is None + assert trajectory_log_probs is None + + def test_trajectory_enabled_returns_correct_count(self, bagel_and_args): + bagel, args = bagel_and_args + + _, trajectory_latents, trajectory_timesteps, trajectory_log_probs = bagel.generate_image( + **args, return_trajectory_latents=True + ) + + assert trajectory_latents is not None + assert trajectory_timesteps is not None + assert len(trajectory_latents) == EXPECTED_STEPS + assert len(trajectory_timesteps) == EXPECTED_STEPS + # log_probs is None without a scheduler (default ODE path) + assert trajectory_log_probs is None + + def test_trajectory_latents_shape_matches_input(self, bagel_and_args): + bagel, args = bagel_and_args + expected_shape = args["packed_init_noises"].shape + + _, trajectory_latents, *_ = bagel.generate_image(**args, return_trajectory_latents=True) + + for i, lat in enumerate(trajectory_latents): + assert lat.shape == expected_shape, f"Step {i}: expected {expected_shape}, got {lat.shape}" + + def test_trajectory_latents_are_distinct(self, bagel_and_args): + bagel, args = bagel_and_args + + _, trajectory_latents, *_ = bagel.generate_image(**args, return_trajectory_latents=True) + + for i in range(1, len(trajectory_latents)): + assert not torch.equal(trajectory_latents[i], trajectory_latents[i - 1]), ( + f"Steps {i - 1} and {i} should differ" + ) + + def 
test_trajectory_timesteps_are_decreasing(self, bagel_and_args): + bagel, args = bagel_and_args + + _, _, trajectory_timesteps, _ = bagel.generate_image(**args, return_trajectory_latents=True) + + for i in range(1, len(trajectory_timesteps)): + assert trajectory_timesteps[i] < trajectory_timesteps[i - 1], ( + f"Timestep {i} ({trajectory_timesteps[i]:.4f}) should be less than " + f"timestep {i - 1} ({trajectory_timesteps[i - 1]:.4f})" + ) + + def test_trajectory_final_latent_matches_output(self, bagel_and_args): + bagel, args = bagel_and_args + + unpacked, trajectory_latents, *_ = bagel.generate_image(**args, return_trajectory_latents=True) + + # Reconstruct the full final latent from unpacked pieces + final_latent = torch.cat(unpacked, dim=0) + assert torch.allclose(trajectory_latents[-1], final_latent, atol=1e-6), ( + "Last trajectory latent should match the final output" + ) + + +# --------------------------------------------------------------------------- +# Mock scheduler for log-prob tests +# --------------------------------------------------------------------------- + + +@dataclass +class _MockStepOutput: + prev_sample: torch.Tensor + log_prob: torch.Tensor + + +class _MockScheduler: + """Minimal scheduler: Euler step + constant log-prob per step.""" + + def step(self, model_output, sigma, sample, dt, **kwargs): + prev_sample = sample - model_output * dt + log_prob = torch.tensor(-1.0) + return _MockStepOutput(prev_sample=prev_sample, log_prob=log_prob) + + +class TestTrajectoryLogProbs: + """Tests for log-prob recording when a scheduler is provided.""" + + @pytest.fixture() + def bagel_scheduler_args(self): + with patch( + "vllm_omni.diffusion.models.bagel.bagel_transformer.get_classifier_free_guidance_world_size", + return_value=1, + ): + yield _make_mock_bagel(), _make_generate_args(), _MockScheduler() + + def test_log_probs_recorded_with_scheduler(self, bagel_scheduler_args): + bagel, args, scheduler = bagel_scheduler_args + + _, _, _, trajectory_log_probs = bagel.generate_image( + **args, return_trajectory_latents=True, scheduler=scheduler + ) + + assert trajectory_log_probs is not None + assert len(trajectory_log_probs) == EXPECTED_STEPS + + def test_log_probs_are_finite(self, bagel_scheduler_args): + bagel, args, scheduler = bagel_scheduler_args + + _, _, _, trajectory_log_probs = bagel.generate_image( + **args, return_trajectory_latents=True, scheduler=scheduler + ) + + for i, lp in enumerate(trajectory_log_probs): + assert torch.isfinite(lp).all(), f"Step {i}: log_prob is not finite" + + def test_log_probs_none_without_scheduler(self, bagel_scheduler_args): + bagel, args, _ = bagel_scheduler_args + + _, _, _, trajectory_log_probs = bagel.generate_image(**args, return_trajectory_latents=True, scheduler=None) + + assert trajectory_log_probs is None + + def test_scheduler_updates_latents(self, bagel_scheduler_args): + """Verify the scheduler's prev_sample is used (not the raw Euler step).""" + bagel, args, scheduler = bagel_scheduler_args + + _, traj_with_sched, *_ = bagel.generate_image(**args, return_trajectory_latents=True, scheduler=scheduler) + _, traj_without, *_ = bagel.generate_image(**args, return_trajectory_latents=True, scheduler=None) + + # Mock scheduler does the same Euler step, so latents should match + for i in range(len(traj_with_sched)): + assert torch.allclose(traj_with_sched[i], traj_without[i], atol=1e-5), ( + f"Step {i}: scheduler and ODE paths should produce same latents" + ) diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py index 
3071fd9d56..56a891aa5c 100644 --- a/vllm_omni/diffusion/data.py +++ b/vllm_omni/diffusion/data.py @@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, Any import torch +from PIL import Image from pydantic import model_validator from typing_extensions import Self from vllm.config.utils import config @@ -701,10 +702,12 @@ class DiffusionOutput: Final output (after pipeline completion) """ - output: torch.Tensor | None = None - trajectory_timesteps: list[torch.Tensor] | None = None - trajectory_latents: torch.Tensor | None = None - trajectory_decoded: list[torch.Tensor] | None = None + # Fields may be replaced with SHM handle dicts by ipc.pack_diffusion_output_shm + output: torch.Tensor | dict | None = None + trajectory_timesteps: torch.Tensor | dict | None = None + trajectory_latents: torch.Tensor | dict | None = None + trajectory_log_probs: torch.Tensor | dict | None = None + trajectory_decoded: list[Image.Image] | None = None error: str | None = None aborted: bool = False abort_message: str | None = None diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index 784da61752..8d3c02b7ab 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -209,6 +209,10 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: prompt=prompt, metrics=metrics, latents=output.trajectory_latents, + trajectory_latents=output.trajectory_latents, + trajectory_timesteps=output.trajectory_timesteps, + trajectory_log_probs=output.trajectory_log_probs, + trajectory_decoded=output.trajectory_decoded, custom_output=output.custom_output or {}, multimodal_output=mm_output, stage_durations=output.stage_durations, @@ -267,6 +271,10 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: prompt=prompt, metrics=metrics, latents=output.trajectory_latents, + trajectory_latents=output.trajectory_latents, + trajectory_timesteps=output.trajectory_timesteps, + trajectory_log_probs=output.trajectory_log_probs, + trajectory_decoded=output.trajectory_decoded, custom_output=output.custom_output or {}, multimodal_output=mm_output, stage_durations=output.stage_durations, diff --git a/vllm_omni/diffusion/ipc.py b/vllm_omni/diffusion/ipc.py index 9aafc1cf17..6a96533fd4 100644 --- a/vllm_omni/diffusion/ipc.py +++ b/vllm_omni/diffusion/ipc.py @@ -78,13 +78,29 @@ def _tensor_from_shm(handle: dict[str, Any]) -> torch.Tensor: return tensor +def _pack_tensor_if_large(val: torch.Tensor) -> torch.Tensor | dict: + """Replace a tensor with an SHM handle if it exceeds the threshold.""" + if val.nelement() * val.element_size() > _SHM_TENSOR_THRESHOLD: + return _tensor_to_shm(val) + return val + + +def _unpack_if_shm_handle(val: object) -> object: + """Reconstruct a tensor from an SHM handle dict, or return as-is.""" + if isinstance(val, dict) and val.get("__tensor_shm__"): + return _tensor_from_shm(val) + return val + + def _pack_diffusion_fields(output: DiffusionOutput) -> DiffusionOutput: if output.output is not None and isinstance(output.output, torch.Tensor): - if output.output.nelement() * output.output.element_size() > _SHM_TENSOR_THRESHOLD: - output.output = _tensor_to_shm(output.output) + output.output = _pack_tensor_if_large(output.output) if output.trajectory_latents is not None and isinstance(output.trajectory_latents, torch.Tensor): - if output.trajectory_latents.nelement() * output.trajectory_latents.element_size() > _SHM_TENSOR_THRESHOLD: - output.trajectory_latents = _tensor_to_shm(output.trajectory_latents) + 
output.trajectory_latents = _pack_tensor_if_large(output.trajectory_latents) + if output.trajectory_timesteps is not None and isinstance(output.trajectory_timesteps, torch.Tensor): + output.trajectory_timesteps = _pack_tensor_if_large(output.trajectory_timesteps) + if output.trajectory_log_probs is not None and isinstance(output.trajectory_log_probs, torch.Tensor): + output.trajectory_log_probs = _pack_tensor_if_large(output.trajectory_log_probs) return output @@ -104,10 +120,10 @@ def pack_diffusion_output_shm(output: object) -> object: def _unpack_diffusion_fields(output: DiffusionOutput) -> DiffusionOutput: - if isinstance(output.output, dict) and output.output.get("__tensor_shm__"): - output.output = _tensor_from_shm(output.output) - if isinstance(output.trajectory_latents, dict) and output.trajectory_latents.get("__tensor_shm__"): - output.trajectory_latents = _tensor_from_shm(output.trajectory_latents) + output.output = _unpack_if_shm_handle(output.output) + output.trajectory_latents = _unpack_if_shm_handle(output.trajectory_latents) + output.trajectory_timesteps = _unpack_if_shm_handle(output.trajectory_timesteps) + output.trajectory_log_probs = _unpack_if_shm_handle(output.trajectory_log_probs) return output diff --git a/vllm_omni/diffusion/models/bagel/bagel_transformer.py b/vllm_omni/diffusion/models/bagel/bagel_transformer.py index d32a6d8aca..a04ded3765 100644 --- a/vllm_omni/diffusion/models/bagel/bagel_transformer.py +++ b/vllm_omni/diffusion/models/bagel/bagel_transformer.py @@ -1655,6 +1655,9 @@ def generate_image( cfg_img_past_key_values: NaiveCache | None = None, cfg_img_key_values_lens: torch.IntTensor | None = None, cfg_img_packed_key_value_indexes: torch.LongTensor | None = None, + return_trajectory_latents: bool = False, + scheduler: object | None = None, + scheduler_kwargs: dict | None = None, ): x_t = packed_init_noises @@ -1663,6 +1666,14 @@ def generate_image( dts = timesteps[:-1] - timesteps[1:] timesteps = timesteps[:-1] + # Optional trajectory recording for RL rollout data collection + trajectory_latents: list[torch.Tensor] | None = [] if return_trajectory_latents else None + trajectory_timesteps: list[torch.Tensor] | None = [] if return_trajectory_latents else None + trajectory_log_probs: list[torch.Tensor] | None = ( + [] if (return_trajectory_latents and scheduler is not None) else None + ) + _sched_kw = scheduler_kwargs or {} + use_cfg_text = cfg_text_scale > 1.0 use_cfg_img = cfg_img_scale > 1.0 @@ -1699,6 +1710,9 @@ def generate_image( cfg_img_past_key_values=cfg_img_past_key_values, cfg_img_key_values_lens=cfg_img_key_values_lens, cfg_img_packed_key_value_indexes=cfg_img_packed_key_value_indexes, + return_trajectory_latents=return_trajectory_latents, + scheduler=scheduler, + scheduler_kwargs=scheduler_kwargs, ) # ── SP + CFG: sequential single-branch forwards ── @@ -1758,10 +1772,19 @@ def generate_image( cfg_renorm_min, ) - x_t = x_t - v_t.to(x_t.device) * dts[i] + if scheduler is not None: + out = scheduler.step(v_t.to(x_t.device), timesteps[i], x_t, dts[i], **_sched_kw) + x_t = out.prev_sample + if trajectory_log_probs is not None and out.log_prob is not None: + trajectory_log_probs.append(out.log_prob) + else: + x_t = x_t - v_t.to(x_t.device) * dts[i] + if return_trajectory_latents: + trajectory_latents.append(x_t.clone()) + trajectory_timesteps.append(timesteps[i] - dts[i]) unpacked_latent = x_t.split((packed_seqlens - 2).tolist()) - return unpacked_latent + return unpacked_latent, trajectory_latents, trajectory_timesteps, trajectory_log_probs # ── 
SP without CFG: direct single-branch loop ── if use_sp: @@ -1781,10 +1804,20 @@ def generate_image( past_key_values=past_key_values, packed_key_value_indexes=packed_key_value_indexes, ) - x_t = x_t - v_t.to(x_t.device) * dts[i] + if scheduler is not None: + out = scheduler.step(v_t.to(x_t.device), timesteps[i], x_t, dts[i], **_sched_kw) + x_t = out.prev_sample + out_log_prob = getattr(out, "log_prob", None) + if trajectory_log_probs is not None and out_log_prob is not None: + trajectory_log_probs.append(out_log_prob) + else: + x_t = x_t - v_t.to(x_t.device) * dts[i] + if return_trajectory_latents: + trajectory_latents.append(x_t.clone()) + trajectory_timesteps.append(timesteps[i] - dts[i]) unpacked_latent = x_t.split((packed_seqlens - 2).tolist()) - return unpacked_latent + return unpacked_latent, trajectory_latents, trajectory_timesteps, trajectory_log_probs # ── Batched CFG mode (cfg_parallel_size=1, no SP) ── cfg_batched = None @@ -1870,10 +1903,19 @@ def generate_image( cfg_batched=cfg_batched, ) - x_t = x_t - v_t.to(x_t.device) * dts[i] # velocity pointing from data to noise + if scheduler is not None: + out = scheduler.step(v_t.to(x_t.device), timesteps[i], x_t, dts[i], **_sched_kw) + x_t = out.prev_sample + if trajectory_log_probs is not None and out.log_prob is not None: + trajectory_log_probs.append(out.log_prob) + else: + x_t = x_t - v_t.to(x_t.device) * dts[i] # velocity pointing from data to noise + if return_trajectory_latents: + trajectory_latents.append(x_t.clone()) + trajectory_timesteps.append(timesteps[i] - dts[i]) unpacked_latent = x_t.split((packed_seqlens - 2).tolist()) - return unpacked_latent + return unpacked_latent, trajectory_latents, trajectory_timesteps, trajectory_log_probs def _generate_image_parallel( self, @@ -1905,6 +1947,9 @@ def _generate_image_parallel( cfg_img_past_key_values: NaiveCache | None, cfg_img_key_values_lens: torch.IntTensor | None, cfg_img_packed_key_value_indexes: torch.LongTensor | None, + return_trajectory_latents: bool = False, + scheduler: object | None = None, + scheduler_kwargs: dict | None = None, ): """CFG parallel denoising loop: each rank computes one CFG branch. 
@@ -1961,6 +2006,13 @@ def _generate_image_parallel( else: raise RuntimeError(f"Unexpected cfg_rank={cfg_rank} for Bagel 3-branch CFG parallel") + trajectory_latents: list[torch.Tensor] | None = [] if return_trajectory_latents else None + trajectory_timesteps: list[torch.Tensor] | None = [] if return_trajectory_latents else None + trajectory_log_probs: list[torch.Tensor] | None = ( + [] if (return_trajectory_latents and scheduler is not None) else None + ) + _sched_kw = scheduler_kwargs or {} + for i, t in enumerate(timesteps): timestep = torch.tensor([t] * x_t.shape[0], device=x_t.device) use_cfg_this_step = t > cfg_interval[0] and t <= cfg_interval[1] and cfg_text_scale > 1.0 @@ -2009,10 +2061,19 @@ def _generate_image_parallel( packed_key_value_indexes=packed_key_value_indexes, ) - x_t = x_t - v_t.to(x_t.device) * dts[i] + if scheduler is not None: + out = scheduler.step(v_t.to(x_t.device), timesteps[i], x_t, dts[i], **_sched_kw) + x_t = out.prev_sample + if trajectory_log_probs is not None and out.log_prob is not None: + trajectory_log_probs.append(out.log_prob) + else: + x_t = x_t - v_t.to(x_t.device) * dts[i] + if return_trajectory_latents: + trajectory_latents.append(x_t.clone()) + trajectory_timesteps.append(timesteps[i] - dts[i]) unpacked_latent = x_t.split((packed_seqlens - 2).tolist()) - return unpacked_latent + return unpacked_latent, trajectory_latents, trajectory_timesteps, trajectory_log_probs @staticmethod def _combine_cfg( diff --git a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py index 84f177e01a..2c72d98908 100644 --- a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py +++ b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py @@ -159,6 +159,9 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = ""): self.od_config = od_config self.device = get_local_device() + self._scheduler: object | None = None + self._scheduler_kwargs: dict = {} + model = od_config.model local_files_only = os.path.exists(model) if local_files_only: @@ -630,7 +633,7 @@ def vae_transforms(img): enabled=self.device.type != "cpu", dtype=self.od_config.dtype, ): - latents = self.bagel.generate_image( + latents, trajectory_latents, trajectory_timesteps, trajectory_log_probs = self.bagel.generate_image( past_key_values=gen_context["past_key_values"], cfg_text_past_key_values=cfg_text_context["past_key_values"], cfg_img_past_key_values=cfg_img_context["past_key_values"], @@ -650,11 +653,36 @@ def vae_transforms(img): cfg_img_packed_query_indexes=generation_input_cfg_img["cfg_packed_query_indexes"], cfg_img_key_values_lens=generation_input_cfg_img["cfg_key_values_lens"], cfg_img_packed_key_value_indexes=generation_input_cfg_img["cfg_packed_key_value_indexes"], + return_trajectory_latents=req.sampling_params.return_trajectory_latents, + scheduler=self._scheduler, + scheduler_kwargs=self._scheduler_kwargs, ) img = self._decode_image_from_latent(self.bagel, self.vae, latents[0], image_shape) + + # Build trajectory output when requested + trajectory_latents_stacked: torch.Tensor | None = None + trajectory_timesteps_stacked: torch.Tensor | None = None + trajectory_decoded: list[Image.Image] | None = None + if trajectory_latents: + trajectory_latents_stacked = torch.stack(trajectory_latents) + trajectory_timesteps_stacked = torch.stack(trajectory_timesteps) + if req.sampling_params.return_trajectory_decoded: + trajectory_decoded = [ + self._decode_image_from_latent(self.bagel, self.vae, lat, image_shape) for lat in trajectory_latents + ] + 
+ trajectory_log_probs_stacked: torch.Tensor | None = None + if trajectory_log_probs: + trajectory_log_probs_stacked = torch.stack(trajectory_log_probs) + return DiffusionOutput( - output=img, stage_durations=self.stage_durations if hasattr(self, "stage_durations") else None + output=img, + trajectory_latents=trajectory_latents_stacked, + trajectory_timesteps=trajectory_timesteps_stacked, + trajectory_log_probs=trajectory_log_probs_stacked, + trajectory_decoded=trajectory_decoded, + stage_durations=self.stage_durations if hasattr(self, "stage_durations") else None, ) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py index 0a5fd35901..bcc3bef15d 100644 --- a/vllm_omni/diffusion/stage_diffusion_proc.py +++ b/vllm_omni/diffusion/stage_diffusion_proc.py @@ -14,8 +14,10 @@ from typing import TYPE_CHECKING, Any import msgspec +import torch import zmq import zmq.asyncio +from PIL import Image from vllm.logger import init_logger from vllm.transformers_utils.config import get_hf_file_to_dict from vllm.utils.network_utils import get_open_zmq_ipc_path, zmq_socket_ctx @@ -174,8 +176,13 @@ async def _process_batch_request( merged_mm: dict[str, Any] = {} merged_metrics: dict[str, Any] = {} merged_durations: dict[str, float] = {} + merged_custom: dict[str, Any] = {} peak_mem = 0.0 latents = None + trajectory_latents: list[torch.Tensor] | None = None + trajectory_timesteps: list[torch.Tensor] | None = None + trajectory_log_probs: torch.Tensor | None = None + trajectory_decoded: list[Image.Image] | None = None final_output_type = "image" for r in results: @@ -183,9 +190,18 @@ async def _process_batch_request( merged_mm.update(r._multimodal_output) merged_metrics.update(r.metrics) merged_durations.update(r.stage_durations) + merged_custom.update(r._custom_output) peak_mem = max(peak_mem, r.peak_memory_mb) if latents is None and r.latents is not None: latents = r.latents + if trajectory_latents is None: + trajectory_latents = r.trajectory_latents + if trajectory_timesteps is None: + trajectory_timesteps = r.trajectory_timesteps + if trajectory_log_probs is None: + trajectory_log_probs = r.trajectory_log_probs + if trajectory_decoded is None: + trajectory_decoded = r.trajectory_decoded if r.final_output_type != "image": final_output_type = r.final_output_type @@ -195,6 +211,11 @@ async def _process_batch_request( prompt=prompts[0] if len(prompts) == 1 else None, metrics=merged_metrics, latents=latents, + trajectory_latents=trajectory_latents, + trajectory_timesteps=trajectory_timesteps, + trajectory_log_probs=trajectory_log_probs, + trajectory_decoded=trajectory_decoded, + custom_output=merged_custom or None, multimodal_output=merged_mm or None, final_output_type=final_output_type, stage_durations=merged_durations, diff --git a/vllm_omni/entrypoints/omni_base.py b/vllm_omni/entrypoints/omni_base.py index 96df0591ea..1a7ffc4a50 100644 --- a/vllm_omni/entrypoints/omni_base.py +++ b/vllm_omni/entrypoints/omni_base.py @@ -282,6 +282,11 @@ def _process_single_result( final_output_type=stage_meta["final_output_type"], request_output=engine_outputs, images=images, + trajectory_latents=getattr(engine_outputs, "trajectory_latents", None), + trajectory_timesteps=getattr(engine_outputs, "trajectory_timesteps", None), + trajectory_log_probs=getattr(engine_outputs, "trajectory_log_probs", None), + trajectory_decoded=getattr(engine_outputs, "trajectory_decoded", None), + 
_custom_output=getattr(engine_outputs, "_custom_output", {}), stage_durations=stage_durations, peak_memory_mb=peak_memory_mb, ) diff --git a/vllm_omni/outputs.py b/vllm_omni/outputs.py index ca3ba271a1..4a775356ee 100644 --- a/vllm_omni/outputs.py +++ b/vllm_omni/outputs.py @@ -58,6 +58,10 @@ class OmniRequestOutput: images: list[Image.Image] = field(default_factory=list) prompt: OmniPromptType | None = None latents: torch.Tensor | None = None + trajectory_latents: torch.Tensor | None = None + trajectory_timesteps: torch.Tensor | None = None + trajectory_log_probs: torch.Tensor | None = None + trajectory_decoded: list | None = None metrics: dict[str, Any] = field(default_factory=dict) _multimodal_output: dict[str, Any] = field(default_factory=dict) _custom_output: dict[str, Any] = field(default_factory=dict) @@ -101,6 +105,10 @@ def from_diffusion( prompt: OmniPromptType | None = None, metrics: dict[str, Any] | None = None, latents: torch.Tensor | None = None, + trajectory_latents: torch.Tensor | None = None, + trajectory_timesteps: torch.Tensor | None = None, + trajectory_log_probs: torch.Tensor | None = None, + trajectory_decoded: list | None = None, multimodal_output: dict[str, Any] | None = None, custom_output: dict[str, Any] | None = None, final_output_type: str = "image", @@ -129,6 +137,10 @@ def from_diffusion( images=images, prompt=prompt, latents=latents, + trajectory_latents=trajectory_latents, + trajectory_timesteps=trajectory_timesteps, + trajectory_log_probs=trajectory_log_probs, + trajectory_decoded=trajectory_decoded, metrics=metrics or {}, _multimodal_output=multimodal_output or {}, _custom_output=custom_output or {}, From ec082add35d0ed41b90fb5ceda5f31c243267aeb Mon Sep 17 00:00:00 2001 From: "Y. Fisher" Date: Wed, 8 Apr 2026 11:26:39 +0800 Subject: [PATCH 083/204] [Perf] Wan2.2 I2V optimization: convert datatype from FP32 to BF16 in vae (#2391) Signed-off-by: KexiongYu Signed-off-by: Canlin Guo <961750412@qq.com> Co-authored-by: Canlin Guo <961750412@qq.com> --- .../distributed/test_autoencoder_kl_wan.py | 43 +++++++++++++++++++ .../autoencoders/autoencoder_kl_wan.py | 41 +++++++++++++++--- .../models/wan2_2/pipeline_wan2_2.py | 2 +- .../models/wan2_2/pipeline_wan2_2_i2v.py | 2 +- .../models/wan2_2/pipeline_wan2_2_ti2v.py | 6 +-- vllm_omni/platforms/interface.py | 21 +++++++++ vllm_omni/platforms/npu/platform.py | 19 ++++++++ 7 files changed, 123 insertions(+), 11 deletions(-) create mode 100644 tests/diffusion/distributed/test_autoencoder_kl_wan.py diff --git a/tests/diffusion/distributed/test_autoencoder_kl_wan.py b/tests/diffusion/distributed/test_autoencoder_kl_wan.py new file mode 100644 index 0000000000..2ea1c1214b --- /dev/null +++ b/tests/diffusion/distributed/test_autoencoder_kl_wan.py @@ -0,0 +1,43 @@ +import pytest +import torch + +from vllm_omni.diffusion.distributed.autoencoders import autoencoder_kl_wan as wan_vae_module +from vllm_omni.diffusion.distributed.autoencoders.autoencoder_kl_wan import OmniAutoencoderKLWan + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +class _DummyOmniAutoencoderKLWan(OmniAutoencoderKLWan): + def __init__(self, *, dtype: torch.dtype): + torch.nn.Module.__init__(self) + self.register_parameter("dummy_weight", torch.nn.Parameter(torch.ones(1, dtype=dtype))) + + +def test_wan_vae_execution_context_handles_fp32(): + model = _DummyOmniAutoencoderKLWan(dtype=torch.float32) + with model._execution_context(): + output = model.dummy_weight + 1 + assert output.dtype == torch.float32 + + +def 
test_wan_vae_execution_context_handles_bf16(): + model = _DummyOmniAutoencoderKLWan(dtype=torch.bfloat16) + with model._execution_context(): + output = model.dummy_weight + 1 + assert output.dtype == torch.bfloat16 + + +def test_wan_vae_execution_context_uses_platform_autocast(mocker): + sentinel = object() + platform = mocker.Mock() + platform.create_autocast_context.return_value = sentinel + mocker.patch.object(wan_vae_module, "current_omni_platform", platform) + + model = _DummyOmniAutoencoderKLWan(dtype=torch.bfloat16) + + assert model._execution_context() is sentinel + platform.create_autocast_context.assert_called_once_with( + device_type=model.dummy_weight.device.type, + dtype=torch.bfloat16, + enabled=True, + ) diff --git a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_wan.py b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_wan.py index 027991c3f2..35c9434d06 100644 --- a/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_wan.py +++ b/vllm_omni/diffusion/distributed/autoencoders/autoencoder_kl_wan.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from contextlib import nullcontext from typing import Any import torch @@ -15,11 +16,38 @@ GridSpec, TileTask, ) +from vllm_omni.platforms import current_omni_platform logger = init_logger(__name__) -class DistributedAutoencoderKLWan(AutoencoderKLWan, DistributedVaeMixin): +class OmniAutoencoderKLWan(AutoencoderKLWan): + def _execution_context(self): + try: + first_param = next(self.parameters()) + except StopIteration: + return nullcontext() + + dtype = first_param.dtype + if dtype not in (torch.float16, torch.bfloat16): + return nullcontext() + + return current_omni_platform.create_autocast_context( + device_type=first_param.device.type, + dtype=dtype, + enabled=True, + ) + + def encode(self, x: torch.Tensor, return_dict: bool = True): + with self._execution_context(): + return super().encode(x, return_dict=return_dict) + + def decode(self, z: torch.Tensor, return_dict: bool = True): + with self._execution_context(): + return super().decode(z, return_dict=return_dict) + + +class DistributedAutoencoderKLWan(OmniAutoencoderKLWan, DistributedVaeMixin): @classmethod def from_pretrained(cls, *args: Any, **kwargs: Any): model = super().from_pretrained(*args, **kwargs) @@ -84,11 +112,12 @@ def tile_exec(self, task: TileTask) -> torch.Tensor: """Decode a single latent tile into RGB space.""" self.clear_cache() time = [] - for k in range(len(task.tensor)): - self._conv_idx = [0] - tile = self.post_quant_conv(task.tensor[k]) - decoded = self.decoder(tile, feat_cache=self._feat_map, feat_idx=self._conv_idx, first_chunk=(k == 0)) - time.append(decoded) + with self._execution_context(): + for k in range(len(task.tensor)): + self._conv_idx = [0] + tile = self.post_quant_conv(task.tensor[k]) + decoded = self.decoder(tile, feat_cache=self._feat_map, feat_idx=self._conv_idx, first_chunk=(k == 0)) + time.append(decoded) result = torch.cat(time, dim=2) return result diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py index d2d2bb8602..a550e576f0 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py @@ -272,7 +272,7 @@ def __init__( model, subfolder="text_encoder", torch_dtype=dtype, local_files_only=local_files_only ).to(self.device) self.vae = DistributedAutoencoderKLWan.from_pretrained( - model, 
subfolder="vae", torch_dtype=torch.float32, local_files_only=local_files_only + model, subfolder="vae", torch_dtype=dtype, local_files_only=local_files_only ).to(self.device) # Initialize transformers with correct config (weights loaded via load_weights) diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py index 1e8a94eb3c..c05ecc9c9a 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py @@ -217,7 +217,7 @@ def __init__( # VAE self.vae = DistributedAutoencoderKLWan.from_pretrained( - model, subfolder="vae", torch_dtype=torch.float32, local_files_only=local_files_only + model, subfolder="vae", torch_dtype=dtype, local_files_only=local_files_only ).to(self.device) # Transformers (weights loaded via load_weights) diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py index f116834cf2..261f62fb79 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py @@ -24,13 +24,13 @@ import numpy as np import PIL.Image import torch -from diffusers import AutoencoderKLWan from diffusers.utils.torch_utils import randn_tensor from torch import nn from transformers import AutoTokenizer, UMT5EncoderModel from vllm.model_executor.models.utils import AutoWeightsLoader from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig +from vllm_omni.diffusion.distributed.autoencoders.autoencoder_kl_wan import OmniAutoencoderKLWan from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader @@ -174,8 +174,8 @@ def __init__( ).to(self.device) # VAE - self.vae = AutoencoderKLWan.from_pretrained( - model, subfolder="vae", torch_dtype=torch.float32, local_files_only=local_files_only + self.vae = OmniAutoencoderKLWan.from_pretrained( + model, subfolder="vae", torch_dtype=dtype, local_files_only=local_files_only ).to(self.device) # Single transformer (TI2V uses dense 5B model, not MoE) diff --git a/vllm_omni/platforms/interface.py b/vllm_omni/platforms/interface.py index 4df297fa02..8f1e66747d 100644 --- a/vllm_omni/platforms/interface.py +++ b/vllm_omni/platforms/interface.py @@ -1,12 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from contextlib import nullcontext from enum import Enum from typing import Any import torch +from vllm.logger import init_logger from vllm.platforms import Platform +logger = init_logger(__name__) + class OmniPlatformEnum(Enum): """Enum for supported Omni platforms.""" @@ -113,6 +117,23 @@ def synchronize(cls) -> None: def get_free_memory(cls, device: torch.device | None = None) -> int: raise NotImplementedError + @classmethod + def create_autocast_context( + cls, + *, + device_type: str, + dtype: torch.dtype, + enabled: bool = True, + ): + if not enabled: + return nullcontext() + + try: + return torch.autocast(device_type=device_type, dtype=dtype, enabled=True) + except (RuntimeError, TypeError, ValueError) as exc: + logger.warning("autocast unavailable for device_type=%s dtype=%s: %s", device_type, dtype, exc) + return nullcontext() + @classmethod def supports_cpu_offload(cls) -> bool: return True diff --git a/vllm_omni/platforms/npu/platform.py 
b/vllm_omni/platforms/npu/platform.py index 1d6bea7cb5..c40dd6fea1 100644 --- a/vllm_omni/platforms/npu/platform.py +++ b/vllm_omni/platforms/npu/platform.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from contextlib import nullcontext from typing import Any import torch @@ -106,6 +107,24 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: device_props = torch.npu.get_device_properties(device_id) return device_props.total_memory + @classmethod + def create_autocast_context(cls, *, device_type, dtype, enabled=True): + if device_type != "npu": + return super().create_autocast_context( + device_type=device_type, + dtype=dtype, + enabled=enabled, + ) + if not enabled: + return nullcontext() + + # NPU-specific fallback + try: + return torch.npu.amp.autocast(dtype=dtype) + except (RuntimeError, TypeError, ValueError) as exc: + logger.warning("autocast unavailable for device_type=%s dtype=%s: %s", device_type, dtype, exc) + return nullcontext() + @classmethod def get_profiler_cls(cls) -> str: return "vllm_omni.platforms.npu.profiler.NPUTorchProfilerWrapper" From 1cd52104404ab87db3057d0a5bf96646da53db9f Mon Sep 17 00:00:00 2001 From: Yangshen Deng Date: Wed, 8 Apr 2026 04:44:04 +0100 Subject: [PATCH 084/204] [Diffusion] Refactor LTX2 to use unified CFG parallel framework (#2160) Signed-off-by: Yangshen Deng Co-authored-by: Claude Opus 4.6 (1M context) --- tests/dfx/perf/tests/test_ltx2_vllm_omni.json | 217 +++++++++ .../ltx2/test_ltx2_cfg_parallel_adaptation.py | 58 +++ .../test_ltx2_cfg_parallel_parity.py | 243 ++++++++++ .../diffusion/models/ltx2/pipeline_ltx2.py | 422 +++++++----------- .../models/ltx2/pipeline_ltx2_image2video.py | 256 ++++------- 5 files changed, 768 insertions(+), 428 deletions(-) create mode 100644 tests/dfx/perf/tests/test_ltx2_vllm_omni.json create mode 100644 tests/diffusion/models/ltx2/test_ltx2_cfg_parallel_adaptation.py create mode 100644 tests/e2e/offline_inference/test_ltx2_cfg_parallel_parity.py diff --git a/tests/dfx/perf/tests/test_ltx2_vllm_omni.json b/tests/dfx/perf/tests/test_ltx2_vllm_omni.json new file mode 100644 index 0000000000..4a6f9e3501 --- /dev/null +++ b/tests/dfx/perf/tests/test_ltx2_vllm_omni.json @@ -0,0 +1,217 @@ +[ + { + "test_name": "test_ltx2_baseline_eager", + "description": "Single-device baseline with enforce-eager (no torch.compile)", + "server_type": "vllm-omni", + "server_params": { + "model": "Lightricks/LTX-2", + "serve_args": { + "enforce-eager": true, + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "256x256_145f_steps6", + "dataset": "random", + "task": "t2v", + "backend": "v1/videos", + "width": 256, + "height": 256, + "num-frames": 145, + "fps": 24, + "num-inference-steps": 6, + "num-prompts": 3, + "max-concurrency": 1, + "enable-negative-prompt": true + }, + { + "name": "480x768_41f_steps20", + "dataset": "random", + "task": "t2v", + "backend": "v1/videos", + "width": 768, + "height": 480, + "num-frames": 41, + "fps": 24, + "num-inference-steps": 20, + "num-prompts": 3, + "max-concurrency": 1, + "enable-negative-prompt": true + } + ] + }, + + { + "test_name": "test_ltx2_torch_compile", + "description": "Single-device with torch.compile (default, no enforce-eager)", + "server_type": "vllm-omni", + "server_params": { + "model": "Lightricks/LTX-2", + "serve_args": { + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "256x256_145f_steps6", + "dataset": 
"random", + "task": "t2v", + "backend": "v1/videos", + "width": 256, + "height": 256, + "num-frames": 145, + "fps": 24, + "num-inference-steps": 6, + "num-prompts": 3, + "max-concurrency": 1, + "enable-negative-prompt": true + }, + { + "name": "480x768_41f_steps20", + "dataset": "random", + "task": "t2v", + "backend": "v1/videos", + "width": 768, + "height": 480, + "num-frames": 41, + "fps": 24, + "num-inference-steps": 20, + "num-prompts": 3, + "max-concurrency": 1, + "enable-negative-prompt": true + } + ] + }, + + { + "test_name": "test_ltx2_cfg2_eager", + "description": "CFG-parallel=2 with enforce-eager", + "server_type": "vllm-omni", + "server_params": { + "model": "Lightricks/LTX-2", + "serve_args": { + "cfg-parallel-size": 2, + "enforce-eager": true, + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "256x256_145f_steps6", + "dataset": "random", + "task": "t2v", + "backend": "v1/videos", + "width": 256, + "height": 256, + "num-frames": 145, + "fps": 24, + "num-inference-steps": 6, + "num-prompts": 3, + "max-concurrency": 1, + "enable-negative-prompt": true + }, + { + "name": "480x768_41f_steps20", + "dataset": "random", + "task": "t2v", + "backend": "v1/videos", + "width": 768, + "height": 480, + "num-frames": 41, + "fps": 24, + "num-inference-steps": 20, + "num-prompts": 3, + "max-concurrency": 1, + "enable-negative-prompt": true + } + ] + }, + + { + "test_name": "test_ltx2_cfg2_compile", + "description": "CFG-parallel=2 with torch.compile", + "server_type": "vllm-omni", + "server_params": { + "model": "Lightricks/LTX-2", + "serve_args": { + "cfg-parallel-size": 2, + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "256x256_145f_steps6", + "dataset": "random", + "task": "t2v", + "backend": "v1/videos", + "width": 256, + "height": 256, + "num-frames": 145, + "fps": 24, + "num-inference-steps": 6, + "num-prompts": 3, + "max-concurrency": 1, + "enable-negative-prompt": true + }, + { + "name": "480x768_41f_steps20", + "dataset": "random", + "task": "t2v", + "backend": "v1/videos", + "width": 768, + "height": 480, + "num-frames": 41, + "fps": 24, + "num-inference-steps": 20, + "num-prompts": 3, + "max-concurrency": 1, + "enable-negative-prompt": true + } + ] + }, + + { + "test_name": "test_ltx2_cache_dit_eager", + "description": "CacheDiT with enforce-eager", + "server_type": "vllm-omni", + "server_params": { + "model": "Lightricks/LTX-2", + "serve_args": { + "cache-backend": "cache_dit", + "enforce-eager": true, + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "256x256_145f_steps6", + "dataset": "random", + "task": "t2v", + "backend": "v1/videos", + "width": 256, + "height": 256, + "num-frames": 145, + "fps": 24, + "num-inference-steps": 6, + "num-prompts": 3, + "max-concurrency": 1, + "enable-negative-prompt": true + }, + { + "name": "480x768_41f_steps20", + "dataset": "random", + "task": "t2v", + "backend": "v1/videos", + "width": 768, + "height": 480, + "num-frames": 41, + "fps": 24, + "num-inference-steps": 20, + "num-prompts": 3, + "max-concurrency": 1, + "enable-negative-prompt": true + } + ] + } +] diff --git a/tests/diffusion/models/ltx2/test_ltx2_cfg_parallel_adaptation.py b/tests/diffusion/models/ltx2/test_ltx2_cfg_parallel_adaptation.py new file mode 100644 index 0000000000..bbfe63dfa5 --- /dev/null +++ b/tests/diffusion/models/ltx2/test_ltx2_cfg_parallel_adaptation.py @@ -0,0 +1,58 @@ +from types import SimpleNamespace + +import pytest +import torch + 
+from vllm_omni.diffusion.models.ltx2.pipeline_ltx2 import LTX2Pipeline + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def _make_pipeline(sequence_parallel_size: int = 1) -> LTX2Pipeline: + pipeline = object.__new__(LTX2Pipeline) + torch.nn.Module.__init__(pipeline) + pipeline.audio_vae_temporal_compression_ratio = 4 + pipeline.audio_vae_mel_compression_ratio = 4 + pipeline.od_config = SimpleNamespace(parallel_config=SimpleNamespace(sequence_parallel_size=sequence_parallel_size)) + # Mock audio_vae with identity normalization (mean=0, std=1) so + # _normalize_audio_latents is a no-op and test values are preserved. + pipeline.audio_vae = SimpleNamespace( + latents_mean=torch.tensor(0.0), + latents_std=torch.tensor(1.0), + ) + return pipeline + + +def test_prepare_audio_latents_pads_packed_sequence_dim_for_provided_latents(): + pipeline = _make_pipeline(sequence_parallel_size=4) + latents = torch.arange(40, dtype=torch.float32).view(1, 10, 4) + + padded, original_num_frames, padded_num_frames = pipeline.prepare_audio_latents( + batch_size=1, + num_channels_latents=2, + num_mel_bins=8, + audio_latent_length=10, + dtype=torch.float32, + device=torch.device("cpu"), + latents=latents, + ) + + assert original_num_frames == 10 + assert padded_num_frames == 12 + assert padded.shape == (1, 12, 4) + torch.testing.assert_close(padded[:, :10], latents) + torch.testing.assert_close(padded[:, 10:], torch.zeros(1, 2, 4)) + + +def test_unpad_audio_latents_restores_original_frames_before_unpack(): + pipeline = _make_pipeline() + original = torch.arange(40, dtype=torch.float32).view(1, 10, 4) + padded = torch.cat([original, torch.full((1, 2, 4), 999.0)], dim=1) + + unpadded = pipeline._unpad_audio_latents(padded, 10) + unpacked = pipeline._unpack_audio_latents(unpadded, latent_length=10, num_mel_bins=2) + expected = pipeline._unpack_audio_latents(original, latent_length=10, num_mel_bins=2) + + assert unpacked.shape == (1, 2, 10, 2) + assert not (unpacked == 999.0).any() + torch.testing.assert_close(unpacked, expected) diff --git a/tests/e2e/offline_inference/test_ltx2_cfg_parallel_parity.py b/tests/e2e/offline_inference/test_ltx2_cfg_parallel_parity.py new file mode 100644 index 0000000000..659040929e --- /dev/null +++ b/tests/e2e/offline_inference/test_ltx2_cfg_parallel_parity.py @@ -0,0 +1,243 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import hashlib +import os +import subprocess +import sys +from pathlib import Path + +import numpy as np +import pytest +from PIL import Image + +from tests.utils import hardware_test + +REPO_ROOT = Path(__file__).resolve().parents[3] +T2V_EXAMPLE = REPO_ROOT / "examples" / "offline_inference" / "text_to_video" / "text_to_video.py" +I2V_EXAMPLE = REPO_ROOT / "examples" / "offline_inference" / "image_to_video" / "image_to_video.py" + +T2V_PROMPT = ( + "At sunrise, a glowing paper lantern boat drifts through a narrow canal between mossy stone walls, " + "soft fog above the water, the camera slowly gliding forward as golden reflections shimmer across " + "the ripples, cinematic, realistic, highly detailed." 
+) +T2V_NEGATIVE_PROMPT = "worst quality, blurry, jittery motion, distorted, oversaturated, artifacts" +I2V_PROMPT = "A cinematic dolly shot of a boat drifting on calm water at sunset" +I2V_NEGATIVE_PROMPT = "worst quality, blurry, jittery motion" + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +def _get_ltx2_model() -> str: + return os.environ.get("VLLM_TEST_LTX2_MODEL", "Lightricks/LTX-2") + + +def _md5(path: Path) -> str: + digest = hashlib.md5(usedforsecurity=False) + with path.open("rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def _make_deterministic_test_image(path: Path) -> None: + """Create a deterministic 256x256 test image for I2V tests.""" + rng = np.random.RandomState(42) + img = Image.fromarray(rng.randint(0, 255, (256, 256, 3), dtype=np.uint8)) + img.save(path) + + +def _run_and_check(cmd: list[str], env: dict, output_path: Path, expected_md5: str) -> None: + result = subprocess.run(cmd, cwd=REPO_ROOT, env=env, capture_output=True, text=True, check=False) + assert result.returncode == 0, ( + f"Command failed (exit {result.returncode}).\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}" + ) + generated_md5 = _md5(output_path) + assert generated_md5 == expected_md5, ( + f"Unexpected output md5: {generated_md5} != {expected_md5}.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}" + ) + + +# ── T2V tests ── + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@pytest.mark.parallel +@pytest.mark.slow +@hardware_test(res={"cuda": "L4"}, num_cards=2) +def test_ltx2_t2v_cfg_parallel(tmp_path: Path): + """T2V with CFG=4.0, cfg-parallel-size=2.""" + output = tmp_path / "t2v_cfg4.mp4" + env = os.environ.copy() + env.setdefault("CUDA_VISIBLE_DEVICES", "0,1") + cmd = [ + sys.executable, + str(T2V_EXAMPLE), + "--model", + _get_ltx2_model(), + "--prompt", + T2V_PROMPT, + "--negative-prompt", + T2V_NEGATIVE_PROMPT, + "--height", + "256", + "--width", + "256", + "--num-frames", + "145", + "--num-inference-steps", + "6", + "--guidance-scale", + "4.0", + "--frame-rate", + "24", + "--fps", + "24", + "--seed", + "42", + "--cfg-parallel-size", + "2", + "--enforce-eager", + "--output", + str(output), + ] + _run_and_check(cmd, env, output, expected_md5="08e606b9c522fee4b6f30cee8b77db40") + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@pytest.mark.slow +@hardware_test(res={"cuda": "L4"}, num_cards=1) +def test_ltx2_t2v_no_cfg(tmp_path: Path): + """T2V with CFG=1.0 (no classifier-free guidance).""" + output = tmp_path / "t2v_nocfg.mp4" + env = os.environ.copy() + env.setdefault("CUDA_VISIBLE_DEVICES", "0") + cmd = [ + sys.executable, + str(T2V_EXAMPLE), + "--model", + _get_ltx2_model(), + "--prompt", + T2V_PROMPT, + "--height", + "256", + "--width", + "256", + "--num-frames", + "145", + "--num-inference-steps", + "6", + "--guidance-scale", + "1.0", + "--frame-rate", + "24", + "--fps", + "24", + "--seed", + "42", + "--enforce-eager", + "--output", + str(output), + ] + _run_and_check(cmd, env, output, expected_md5="a83994b94b6e67c54a524e0383c45ce8") + + +# ── I2V tests ── + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@pytest.mark.parallel +@pytest.mark.slow +@hardware_test(res={"cuda": "L4"}, num_cards=2) +def test_ltx2_i2v_cfg_parallel(tmp_path: Path): + """I2V with CFG=4.0, cfg-parallel-size=2.""" + test_image = tmp_path / "test_input.png" + _make_deterministic_test_image(test_image) + output = tmp_path / "i2v_cfg4.mp4" + env = os.environ.copy() + 
env.setdefault("CUDA_VISIBLE_DEVICES", "0,1") + cmd = [ + sys.executable, + str(I2V_EXAMPLE), + "--model", + _get_ltx2_model(), + "--model-class-name", + "LTX2ImageToVideoPipeline", + "--image", + str(test_image), + "--prompt", + I2V_PROMPT, + "--negative-prompt", + I2V_NEGATIVE_PROMPT, + "--height", + "256", + "--width", + "256", + "--num-frames", + "73", + "--num-inference-steps", + "6", + "--guidance-scale", + "4.0", + "--frame-rate", + "24", + "--fps", + "24", + "--seed", + "42", + "--cfg-parallel-size", + "2", + "--enforce-eager", + "--output", + str(output), + ] + _run_and_check(cmd, env, output, expected_md5="aed7e56084b36373244d8f839b16d115") + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@pytest.mark.slow +@hardware_test(res={"cuda": "L4"}, num_cards=1) +def test_ltx2_i2v_no_cfg(tmp_path: Path): + """I2V with CFG=1.0 (no classifier-free guidance).""" + test_image = tmp_path / "test_input.png" + _make_deterministic_test_image(test_image) + output = tmp_path / "i2v_nocfg.mp4" + env = os.environ.copy() + env.setdefault("CUDA_VISIBLE_DEVICES", "0") + cmd = [ + sys.executable, + str(I2V_EXAMPLE), + "--model", + _get_ltx2_model(), + "--model-class-name", + "LTX2ImageToVideoPipeline", + "--image", + str(test_image), + "--prompt", + I2V_PROMPT, + "--height", + "256", + "--width", + "256", + "--num-frames", + "73", + "--num-inference-steps", + "6", + "--guidance-scale", + "1.0", + "--frame-rate", + "24", + "--fps", + "24", + "--seed", + "42", + "--enforce-eager", + "--output", + str(output), + ] + _run_and_check(cmd, env, output, expected_md5="81b21ede12753e9e14a357a6c548b666") diff --git a/vllm_omni/diffusion/models/ltx2/pipeline_ltx2.py b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2.py index efc342e932..c60b192f0a 100644 --- a/vllm_omni/diffusion/models/ltx2/pipeline_ltx2.py +++ b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2.py @@ -28,8 +28,6 @@ from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin from vllm_omni.diffusion.distributed.parallel_state import ( - get_cfg_group, - get_classifier_free_guidance_rank, get_classifier_free_guidance_world_size, ) from vllm_omni.diffusion.distributed.utils import get_local_device @@ -122,6 +120,31 @@ def calculate_shift( return mu +class _VideoAudioScheduler: + """Composite scheduler dispatching to video and audio schedulers.""" + + def __init__(self, video_scheduler, audio_scheduler): + self.video_scheduler = video_scheduler + self.audio_scheduler = audio_scheduler + + def step(self, noise_pred, t, latents, return_dict=False, generator=None): + video_out = self.video_scheduler.step( + noise_pred[0], + t[0], + latents[0], + return_dict=False, + generator=generator, + )[0] + audio_out = self.audio_scheduler.step( + noise_pred[1], + t[1], + latents[1], + return_dict=False, + generator=generator, + )[0] + return ((video_out, audio_out),) + + class LTX2Pipeline(nn.Module, CFGParallelMixin, ProgressBarMixin): def __init__( self, @@ -542,6 +565,10 @@ def _unpack_audio_latents( latents = latents.unflatten(2, (-1, num_mel_bins)).transpose(1, 2) return latents + @staticmethod + def _unpad_audio_latents(latents: torch.Tensor, num_frames: int) -> torch.Tensor: + return latents[:, :num_frames] + def prepare_latents( self, batch_size: int = 1, @@ -597,25 +624,49 @@ def prepare_audio_latents( noise_scale: float = 0.0, dtype: torch.dtype | None = None, device: torch.device | None = None, - generator: torch.Generator | None = None, + generator: 
torch.Generator | list[torch.Generator] | None = None, latents: torch.Tensor | None = None, - ) -> tuple[torch.Tensor, int]: + ) -> tuple[torch.Tensor, int, int]: + original_latent_length = audio_latent_length + padded_latent_length = original_latent_length + + latent_mel_bins = num_mel_bins // self.audio_vae_mel_compression_ratio + + sp_size = getattr(self.od_config.parallel_config, "sequence_parallel_size", 1) + if sp_size > 1: + padded_latent_length += (sp_size - (original_latent_length % sp_size)) % sp_size + if latents is not None: if latents.ndim == 4: # latents are of shape [B, C, L, M], need to be packed latents = self._pack_audio_latents(latents) if latents.ndim != 3: raise ValueError( - f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is [batch_size, num_seq, num_features]." # noqa + f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is " + "[batch_size, num_seq, num_features] or [batch_size, num_channels, audio_length, mel_bins]." ) latents = self._normalize_audio_latents(latents, self.audio_vae.latents_mean, self.audio_vae.latents_std) latents = self._create_noised_state(latents, noise_scale, generator) - return latents.to(device=device, dtype=dtype) - # TODO: confirm whether this logic is correct - latent_mel_bins = num_mel_bins // self.audio_vae_mel_compression_ratio + if latents.shape[1] not in {original_latent_length, padded_latent_length}: + raise ValueError( + "Provided `audio_latents` has incompatible audio frame count " + f"{latents.shape[1]}; expected {original_latent_length} or {padded_latent_length}." + ) - shape = (batch_size, num_channels_latents, audio_latent_length, latent_mel_bins) + if latents.shape[1] == original_latent_length and padded_latent_length > original_latent_length: + padding = torch.zeros( + latents.shape[0], + padded_latent_length - original_latent_length, + latents.shape[2], + dtype=latents.dtype, + device=latents.device, + ) + latents = torch.cat([latents, padding], dim=1) + + return latents.to(device=device, dtype=dtype), original_latent_length, padded_latent_length + + shape = (batch_size, num_channels_latents, padded_latent_length, latent_mel_bins) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( @@ -625,7 +676,7 @@ def prepare_audio_latents( latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) latents = self._pack_audio_latents(latents) - return latents + return latents, original_latent_length, padded_latent_length @property def guidance_scale(self): @@ -655,147 +706,44 @@ def attention_kwargs(self): def interrupt(self): return self._interrupt - def _is_cfg_parallel_enabled(self, do_true_cfg: bool) -> bool: - return do_true_cfg and get_classifier_free_guidance_world_size() > 1 - def _transformer_cache_context(self, context_name: str): cache_context = getattr(self.transformer, "cache_context", None) if callable(cache_context): return cache_context(context_name) return nullcontext() - def _predict_noise_av(self, **kwargs) -> tuple[torch.Tensor, torch.Tensor]: + def predict_noise(self, **kwargs): with self._transformer_cache_context("cond_uncond"): noise_pred_video, noise_pred_audio = self.transformer(**kwargs) - return noise_pred_video, noise_pred_audio - - def predict_noise_av_maybe_with_cfg( - self, - do_true_cfg: bool, - true_cfg_scale: float, - positive_kwargs: dict[str, Any], - negative_kwargs: dict[str, Any] | None, - guidance_rescale: float = 0.0, - cfg_normalize: bool = False, - ) -> tuple[torch.Tensor | None, 
torch.Tensor | None]: - if do_true_cfg: - cfg_parallel_ready = get_classifier_free_guidance_world_size() > 1 - - if cfg_parallel_ready: - cfg_group = get_cfg_group() - cfg_rank = get_classifier_free_guidance_rank() - - if cfg_rank == 0: - noise_pred_video, noise_pred_audio = self._predict_noise_av(**positive_kwargs) - else: - noise_pred_video, noise_pred_audio = self._predict_noise_av(**negative_kwargs) - - noise_pred_video = noise_pred_video.float() - noise_pred_audio = noise_pred_audio.float() - - gathered_video = cfg_group.all_gather(noise_pred_video, separate_tensors=True) - gathered_audio = cfg_group.all_gather(noise_pred_audio, separate_tensors=True) - - if cfg_rank == 0: - noise_pred_video_text = gathered_video[0] - noise_pred_video_uncond = gathered_video[1] - noise_pred_audio_text = gathered_audio[0] - noise_pred_audio_uncond = gathered_audio[1] - - noise_pred_video = self.combine_cfg_noise( - noise_pred_video_text, - noise_pred_video_uncond, - true_cfg_scale, - cfg_normalize, - ) - noise_pred_audio = self.combine_cfg_noise( - noise_pred_audio_text, - noise_pred_audio_uncond, - true_cfg_scale, - cfg_normalize, - ) - - if guidance_rescale > 0: - noise_pred_video = rescale_noise_cfg( - noise_pred_video, - noise_pred_video_text, - guidance_rescale=guidance_rescale, - ) - noise_pred_audio = rescale_noise_cfg( - noise_pred_audio, - noise_pred_audio_text, - guidance_rescale=guidance_rescale, - ) - return noise_pred_video, noise_pred_audio - return None, None - - noise_pred_video_text, noise_pred_audio_text = self._predict_noise_av(**positive_kwargs) - noise_pred_video_uncond, noise_pred_audio_uncond = self._predict_noise_av(**negative_kwargs) - - noise_pred_video_text = noise_pred_video_text.float() - noise_pred_audio_text = noise_pred_audio_text.float() - noise_pred_video_uncond = noise_pred_video_uncond.float() - noise_pred_audio_uncond = noise_pred_audio_uncond.float() - - noise_pred_video = self.combine_cfg_noise( - noise_pred_video_text, - noise_pred_video_uncond, - true_cfg_scale, - cfg_normalize, - ) - noise_pred_audio = self.combine_cfg_noise( - noise_pred_audio_text, - noise_pred_audio_uncond, - true_cfg_scale, - cfg_normalize, - ) - - if guidance_rescale > 0: - noise_pred_video = rescale_noise_cfg( - noise_pred_video, - noise_pred_video_text, - guidance_rescale=guidance_rescale, - ) - noise_pred_audio = rescale_noise_cfg( - noise_pred_audio, - noise_pred_audio_text, - guidance_rescale=guidance_rescale, - ) - - return noise_pred_video, noise_pred_audio - - noise_pred_video, noise_pred_audio = self._predict_noise_av(**positive_kwargs) return noise_pred_video.float(), noise_pred_audio.float() - def _scheduler_step_video_audio_maybe_with_cfg( + def combine_cfg_noise(self, positive_noise_pred, negative_noise_pred, true_cfg_scale, cfg_normalize=False): + """Per-element CFG combine with guidance_rescale support.""" + (video_pos, audio_pos) = positive_noise_pred + (video_neg, audio_neg) = negative_noise_pred + video_combined = super().combine_cfg_noise(video_pos, video_neg, true_cfg_scale, cfg_normalize) + audio_combined = super().combine_cfg_noise(audio_pos, audio_neg, true_cfg_scale, cfg_normalize) + if self._guidance_rescale and self._guidance_rescale > 0: + video_combined = rescale_noise_cfg(video_combined, video_pos, guidance_rescale=self._guidance_rescale) + audio_combined = rescale_noise_cfg(audio_combined, audio_pos, guidance_rescale=self._guidance_rescale) + return (video_combined, audio_combined) + + def _synchronize_cfg_parallel_step_output( self, - noise_pred_video: 
torch.Tensor | None, - noise_pred_audio: torch.Tensor | None, - t: torch.Tensor, - latents: torch.Tensor, - audio_latents: torch.Tensor, - audio_scheduler: FlowMatchEulerDiscreteScheduler, + latents: tuple[torch.Tensor, torch.Tensor], do_true_cfg: bool, ) -> tuple[torch.Tensor, torch.Tensor]: - cfg_parallel_ready = self._is_cfg_parallel_enabled(do_true_cfg) - - if cfg_parallel_ready: - cfg_group = get_cfg_group() - cfg_rank = get_classifier_free_guidance_rank() - - if cfg_rank == 0: - latents = self.scheduler.step(noise_pred_video, t, latents, return_dict=False)[0] - audio_latents = audio_scheduler.step(noise_pred_audio, t, audio_latents, return_dict=False)[0] - - latents = latents.contiguous() - audio_latents = audio_latents.contiguous() - cfg_group.broadcast(latents, src=0) - cfg_group.broadcast(audio_latents, src=0) - return latents, audio_latents - - latents = self.scheduler.step(noise_pred_video, t, latents, return_dict=False)[0] - audio_latents = audio_scheduler.step(noise_pred_audio, t, audio_latents, return_dict=False)[0] - return latents, audio_latents + if not (do_true_cfg and get_classifier_free_guidance_world_size() > 1): + return latents + + # Without this sync, CUDA async execution causes non-deterministic + # numerical drift across denoising steps in CFG parallel mode, + # producing different video outputs across runs. + latents = tuple(tensor.contiguous() for tensor in latents) + device = next((tensor.device for tensor in latents if tensor.is_cuda), None) + if device is not None: + torch.cuda.current_stream(device).synchronize() + return latents @torch.no_grad() def forward( @@ -828,6 +776,8 @@ def forward( attention_kwargs: dict[str, Any] | None = None, max_sequence_length: int | None = None, ) -> DiffusionOutput: + # Extract prompt/negative_prompt from request. + # Input format: req.prompts is a list of str or dict with "prompt"/"negative_prompt" keys. prompt = [p if isinstance(p, str) else (p.get("prompt") or "") for p in req.prompts] or prompt if all(isinstance(p, str) or p.get("negative_prompt") is None for p in req.prompts): negative_prompt = None @@ -869,6 +819,7 @@ def forward( else req.sampling_params.extra_args.get("audio_latents", audio_latents) ) + # Override with pre-computed embeddings if provided in request. 
req_prompt_embeds = [_get_prompt_field(p, "prompt_embeds") for p in req.prompts] if any(p is not None for p in req_prompt_embeds): prompt_embeds = torch.stack(req_prompt_embeds) # type: ignore[arg-type] @@ -939,20 +890,17 @@ def forward( max_sequence_length=max_sequence_length, device=device, ) - cfg_parallel_ready = self._is_cfg_parallel_enabled(self.do_classifier_free_guidance) - if self.do_classifier_free_guidance and not cfg_parallel_ready: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) - prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0) - + # Compute positive prompt connectors additive_attention_mask = (1 - prompt_attention_mask.to(prompt_embeds.dtype)) * -1000000.0 connector_prompt_embeds, connector_audio_prompt_embeds, connector_attention_mask = self.connectors( prompt_embeds, additive_attention_mask, additive_mask=True ) + # Compute negative prompt connectors when CFG is enabled negative_connector_prompt_embeds = None negative_connector_audio_prompt_embeds = None negative_connector_attention_mask = None - if cfg_parallel_ready: + if self.do_classifier_free_guidance: negative_additive_attention_mask = ( 1 - negative_prompt_attention_mask.to(negative_prompt_embeds.dtype) ) * -1000000.0 @@ -1027,20 +975,7 @@ def forward( num_channels_latents_audio = ( self.audio_vae.config.latent_channels if getattr(self, "audio_vae", None) is not None else 8 ) - - # padding audio_latents if needed - sp_size = getattr(self.od_config.parallel_config, "sequence_parallel_size", 1) - if sp_size > 1: - pad_len = (sp_size - (audio_num_frames % sp_size)) % sp_size - if pad_len > 0: - if audio_latents is not None: - pad_shape = list(audio_latents.shape) - pad_shape[2] = pad_len - padding = torch.zeros(pad_shape, dtype=audio_latents.dtype, device=audio_latents.device) - audio_latents = torch.cat([audio_latents, padding], dim=2) - audio_num_frames += pad_len - - audio_latents = self.prepare_audio_latents( + audio_latents, original_audio_num_frames, padded_audio_num_frames = self.prepare_audio_latents( batch_size * num_videos_per_prompt, num_channels_latents=num_channels_latents_audio, audio_latent_length=audio_num_frames, @@ -1061,6 +996,7 @@ def forward( self.scheduler.config.get("max_shift", 2.05), ) audio_scheduler = copy.deepcopy(self.scheduler) + video_audio_scheduler = _VideoAudioScheduler(self.scheduler, audio_scheduler) _ = retrieve_timesteps( audio_scheduler, num_inference_steps, @@ -1083,12 +1019,10 @@ def forward( latents.shape[0], latent_num_frames, latent_height, latent_width, latents.device, fps=frame_rate ) audio_coords = self.transformer.audio_rope.prepare_audio_coords( - audio_latents.shape[0], audio_num_frames, audio_latents.device + audio_latents.shape[0], padded_audio_num_frames, audio_latents.device ) - # Duplicate the positional ids as well if using CFG - if self.do_classifier_free_guidance and not cfg_parallel_ready: - video_coords = video_coords.repeat((2,) + (1,) * (video_coords.ndim - 1)) # Repeat twice in batch dim - audio_coords = audio_coords.repeat((2,) + (1,) * (audio_coords.ndim - 1)) + # No coord duplication needed: mixin handles CFG via separate forward calls, + # not batch=2. Each forward gets batch=1 coords directly. 
with self.progress_bar(total=len(timesteps)) as pbar: for i, t in enumerate(timesteps): @@ -1097,119 +1031,60 @@ def forward( self._current_timestep = t - if cfg_parallel_ready: - latent_model_input = latents.to(prompt_embeds.dtype) - audio_latent_model_input = audio_latents.to(prompt_embeds.dtype) - timestep = t.expand(latent_model_input.shape[0]) - - positive_kwargs = { - "hidden_states": latent_model_input, - "audio_hidden_states": audio_latent_model_input, - "encoder_hidden_states": connector_prompt_embeds, - "audio_encoder_hidden_states": connector_audio_prompt_embeds, - "timestep": timestep, - "encoder_attention_mask": connector_attention_mask, - "audio_encoder_attention_mask": connector_attention_mask, - "num_frames": latent_num_frames, - "height": latent_height, - "width": latent_width, - "fps": frame_rate, - "audio_num_frames": audio_num_frames, - "video_coords": video_coords, - "audio_coords": audio_coords, - "attention_kwargs": attention_kwargs, - "return_dict": False, - } - negative_kwargs = { - "hidden_states": latent_model_input, - "audio_hidden_states": audio_latent_model_input, + latent_model_input = latents.to(prompt_embeds.dtype) + audio_latent_model_input = audio_latents.to(prompt_embeds.dtype) + timestep = t.expand(latent_model_input.shape[0]) + do_true_cfg = self.do_classifier_free_guidance + + positive_kwargs = { + "hidden_states": latent_model_input, + "audio_hidden_states": audio_latent_model_input, + "encoder_hidden_states": connector_prompt_embeds, + "audio_encoder_hidden_states": connector_audio_prompt_embeds, + "timestep": timestep, + "encoder_attention_mask": connector_attention_mask, + "audio_encoder_attention_mask": connector_attention_mask, + "num_frames": latent_num_frames, + "height": latent_height, + "width": latent_width, + "fps": frame_rate, + "audio_num_frames": padded_audio_num_frames, + "video_coords": video_coords, + "audio_coords": audio_coords, + "attention_kwargs": attention_kwargs, + "return_dict": False, + } + negative_kwargs = ( + { + **positive_kwargs, "encoder_hidden_states": negative_connector_prompt_embeds, "audio_encoder_hidden_states": negative_connector_audio_prompt_embeds, - "timestep": timestep, "encoder_attention_mask": negative_connector_attention_mask, "audio_encoder_attention_mask": negative_connector_attention_mask, - "num_frames": latent_num_frames, - "height": latent_height, - "width": latent_width, - "fps": frame_rate, - "audio_num_frames": audio_num_frames, - "video_coords": video_coords, - "audio_coords": audio_coords, - "attention_kwargs": attention_kwargs, - "return_dict": False, } + if do_true_cfg + else None + ) - noise_pred_video, noise_pred_audio = self.predict_noise_av_maybe_with_cfg( - do_true_cfg=True, - true_cfg_scale=guidance_scale, - positive_kwargs=positive_kwargs, - negative_kwargs=negative_kwargs, - guidance_rescale=guidance_rescale, - cfg_normalize=False, - ) - - latents, audio_latents = self._scheduler_step_video_audio_maybe_with_cfg( - noise_pred_video, - noise_pred_audio, - t, - latents, - audio_latents, - audio_scheduler, - do_true_cfg=True, - ) - else: - latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents - latent_model_input = latent_model_input.to(prompt_embeds.dtype) - audio_latent_model_input = ( - torch.cat([audio_latents] * 2) if self.do_classifier_free_guidance else audio_latents - ) - audio_latent_model_input = audio_latent_model_input.to(prompt_embeds.dtype) - - timestep = t.expand(latent_model_input.shape[0]) - - with 
self._transformer_cache_context("cond_uncond"): - noise_pred_video, noise_pred_audio = self.transformer( - hidden_states=latent_model_input, - audio_hidden_states=audio_latent_model_input, - encoder_hidden_states=connector_prompt_embeds, - audio_encoder_hidden_states=connector_audio_prompt_embeds, - timestep=timestep, - encoder_attention_mask=connector_attention_mask, - audio_encoder_attention_mask=connector_attention_mask, - num_frames=latent_num_frames, - height=latent_height, - width=latent_width, - fps=frame_rate, - audio_num_frames=audio_num_frames, - video_coords=video_coords, - audio_coords=audio_coords, - attention_kwargs=attention_kwargs, - return_dict=False, - ) - noise_pred_video = noise_pred_video.float() - noise_pred_audio = noise_pred_audio.float() - - if self.do_classifier_free_guidance: - noise_pred_video_uncond, noise_pred_video_text = noise_pred_video.chunk(2) - noise_pred_video = noise_pred_video_uncond + guidance_scale * ( - noise_pred_video_text - noise_pred_video_uncond - ) - - noise_pred_audio_uncond, noise_pred_audio_text = noise_pred_audio.chunk(2) - noise_pred_audio = noise_pred_audio_uncond + guidance_scale * ( - noise_pred_audio_text - noise_pred_audio_uncond - ) - - if guidance_rescale > 0: - noise_pred_video = rescale_noise_cfg( - noise_pred_video, noise_pred_video_text, guidance_rescale=guidance_rescale - ) - noise_pred_audio = rescale_noise_cfg( - noise_pred_audio, noise_pred_audio_text, guidance_rescale=guidance_rescale - ) - - latents = self.scheduler.step(noise_pred_video, t, latents, return_dict=False)[0] - audio_latents = audio_scheduler.step(noise_pred_audio, t, audio_latents, return_dict=False)[0] + noise_pred_video, noise_pred_audio = self.predict_noise_maybe_with_cfg( + do_true_cfg=do_true_cfg, + true_cfg_scale=guidance_scale, + positive_kwargs=positive_kwargs, + negative_kwargs=negative_kwargs, + cfg_normalize=False, + ) + + latents, audio_latents = self.scheduler_step_maybe_with_cfg( + (noise_pred_video, noise_pred_audio), + (t, t), + (latents, audio_latents), + do_true_cfg=do_true_cfg, + per_request_scheduler=video_audio_scheduler, + ) + latents, audio_latents = self._synchronize_cfg_parallel_step_output( + (latents, audio_latents), + do_true_cfg=do_true_cfg, + ) pbar.update() @@ -1225,10 +1100,15 @@ def forward( latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor ) + audio_latents = self._unpad_audio_latents(audio_latents, original_audio_num_frames) audio_latents = self._denormalize_audio_latents( audio_latents, self.audio_vae.latents_mean, self.audio_vae.latents_std ) - audio_latents = self._unpack_audio_latents(audio_latents, audio_num_frames, num_mel_bins=latent_mel_bins) + audio_latents = self._unpack_audio_latents( + audio_latents, + original_audio_num_frames, + num_mel_bins=latent_mel_bins, + ) if output_type == "latent": video = latents diff --git a/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_image2video.py b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_image2video.py index 11091518b4..65e7454b73 100644 --- a/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_image2video.py +++ b/vllm_omni/diffusion/models/ltx2/pipeline_ltx2_image2video.py @@ -14,7 +14,7 @@ import torch.nn as nn from diffusers import FlowMatchEulerDiscreteScheduler from diffusers.pipelines.ltx2.utils import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES -from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg, retrieve_timesteps +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion 
import retrieve_timesteps from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_latents from diffusers.utils.torch_utils import randn_tensor from diffusers.video_processor import VideoProcessor @@ -22,7 +22,6 @@ from vllm.model_executor.models.utils import AutoWeightsLoader from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig -from vllm_omni.diffusion.distributed.parallel_state import get_cfg_group, get_classifier_free_guidance_rank from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.lora.manager import DiffusionLoRAManager from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader @@ -46,6 +45,32 @@ def get_ltx2_post_process_func(od_config: OmniDiffusionConfig): return _get_ltx2_post_process_func(od_config) +class _I2VVideoAudioScheduler: + """Composite scheduler for I2V: uses _step_video_latents_i2v for video, standard step for audio.""" + + def __init__(self, pipeline, audio_scheduler, latent_num_frames, latent_height, latent_width): + self.video_scheduler = pipeline.scheduler + self.audio_scheduler = audio_scheduler + self._pipeline = pipeline + self._latent_num_frames = latent_num_frames + self._latent_height = latent_height + self._latent_width = latent_width + + def step(self, noise_pred, t, latents, return_dict=False, generator=None): + video_out = self._pipeline._step_video_latents_i2v( + noise_pred[0], + latents[0], + t[0], + self._latent_num_frames, + self._latent_height, + self._latent_width, + ) + audio_out = self.audio_scheduler.step(noise_pred[1], t[1], latents[1], return_dict=False, generator=generator)[ + 0 + ] + return ((video_out, audio_out),) + + class LTX2ImageToVideoPipeline(LTX2Pipeline): support_image_input = True @@ -287,6 +312,8 @@ def forward( attention_kwargs: dict[str, Any] | None = None, max_sequence_length: int | None = None, ) -> DiffusionOutput: + # Extract prompt/negative_prompt from request. + # Input format: req.prompts is a list of str or dict with "prompt"/"negative_prompt" keys. prompt = [p if isinstance(p, str) else (p.get("prompt") or "") for p in req.prompts] or prompt if all(isinstance(p, str) or p.get("negative_prompt") is None for p in req.prompts): negative_prompt = None @@ -328,6 +355,7 @@ def forward( else req.sampling_params.extra_args.get("audio_latents", audio_latents) ) + # Override with pre-computed embeddings if provided in request. 
req_prompt_embeds = [_get_prompt_field(p, "prompt_embeds") for p in req.prompts] if any(p is not None for p in req_prompt_embeds): prompt_embeds = torch.stack(req_prompt_embeds) # type: ignore[arg-type] @@ -429,20 +457,17 @@ def forward( max_sequence_length=max_sequence_length, device=device, ) - cfg_parallel_ready = self._is_cfg_parallel_enabled(self.do_classifier_free_guidance) - if self.do_classifier_free_guidance and not cfg_parallel_ready: - prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) - prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0) - + # Compute positive prompt connectors additive_attention_mask = (1 - prompt_attention_mask.to(prompt_embeds.dtype)) * -1000000.0 connector_prompt_embeds, connector_audio_prompt_embeds, connector_attention_mask = self.connectors( prompt_embeds, additive_attention_mask, additive_mask=True ) + # Compute negative prompt connectors when CFG is enabled negative_connector_prompt_embeds = None negative_connector_audio_prompt_embeds = None negative_connector_attention_mask = None - if cfg_parallel_ready: + if self.do_classifier_free_guidance: negative_additive_attention_mask = ( 1 - negative_prompt_attention_mask.to(negative_prompt_embeds.dtype) ) * -1000000.0 @@ -500,8 +525,6 @@ def forward( generator, latents, ) - if self.do_classifier_free_guidance and not cfg_parallel_ready: - conditioning_mask = torch.cat([conditioning_mask, conditioning_mask]) duration_s = num_frames / frame_rate audio_latents_per_second = ( @@ -529,20 +552,7 @@ def forward( num_channels_latents_audio = ( self.audio_vae.config.latent_channels if getattr(self, "audio_vae", None) is not None else 8 ) - - # padding audio_latents if needed - sp_size = getattr(self.od_config.parallel_config, "sequence_parallel_size", 1) - if sp_size > 1: - pad_len = (sp_size - (audio_num_frames % sp_size)) % sp_size - if pad_len > 0: - if audio_latents is not None: - pad_shape = list(audio_latents.shape) - pad_shape[2] = pad_len - padding = torch.zeros(pad_shape, dtype=audio_latents.dtype, device=audio_latents.device) - audio_latents = torch.cat([audio_latents, padding], dim=2) - audio_num_frames += pad_len - - audio_latents = self.prepare_audio_latents( + audio_latents, original_audio_num_frames, padded_audio_num_frames = self.prepare_audio_latents( batch_size * num_videos_per_prompt, num_channels_latents=num_channels_latents_audio, audio_latent_length=audio_num_frames, @@ -585,12 +595,17 @@ def forward( latents.shape[0], latent_num_frames, latent_height, latent_width, latents.device, fps=frame_rate ) audio_coords = self.transformer.audio_rope.prepare_audio_coords( - audio_latents.shape[0], audio_num_frames, audio_latents.device + audio_latents.shape[0], padded_audio_num_frames, audio_latents.device ) - # Duplicate the positional ids as well if using CFG - if self.do_classifier_free_guidance and not cfg_parallel_ready: - video_coords = video_coords.repeat((2,) + (1,) * (video_coords.ndim - 1)) # Repeat twice in batch dim - audio_coords = audio_coords.repeat((2,) + (1,) * (audio_coords.ndim - 1)) + + i2v_scheduler = _I2VVideoAudioScheduler( + pipeline=self, + audio_scheduler=audio_scheduler, + latent_num_frames=latent_num_frames, + latent_height=latent_height, + latent_width=latent_width, + ) + # No coord duplication needed: mixin handles CFG via separate forward calls. 
with self.progress_bar(total=len(timesteps)) as pbar: for i, t in enumerate(timesteps): @@ -599,140 +614,62 @@ def forward( self._current_timestep = t - if cfg_parallel_ready: - latent_model_input = latents.to(prompt_embeds.dtype) - audio_latent_model_input = audio_latents.to(prompt_embeds.dtype) - - timestep = t.expand(latent_model_input.shape[0]) - video_timestep = timestep.unsqueeze(-1) * (1 - conditioning_mask) - - positive_kwargs = { - "hidden_states": latent_model_input, - "audio_hidden_states": audio_latent_model_input, - "encoder_hidden_states": connector_prompt_embeds, - "audio_encoder_hidden_states": connector_audio_prompt_embeds, - "timestep": video_timestep, - "audio_timestep": timestep, - "encoder_attention_mask": connector_attention_mask, - "audio_encoder_attention_mask": connector_attention_mask, - "num_frames": latent_num_frames, - "height": latent_height, - "width": latent_width, - "fps": frame_rate, - "audio_num_frames": audio_num_frames, - "video_coords": video_coords, - "audio_coords": audio_coords, - "attention_kwargs": attention_kwargs, - "return_dict": False, - } - negative_kwargs = { - "hidden_states": latent_model_input, - "audio_hidden_states": audio_latent_model_input, + latent_model_input = latents.to(prompt_embeds.dtype) + audio_latent_model_input = audio_latents.to(prompt_embeds.dtype) + timestep = t.expand(latent_model_input.shape[0]) + video_timestep = timestep.unsqueeze(-1) * (1 - conditioning_mask) + do_true_cfg = self.do_classifier_free_guidance + + positive_kwargs = { + "hidden_states": latent_model_input, + "audio_hidden_states": audio_latent_model_input, + "encoder_hidden_states": connector_prompt_embeds, + "audio_encoder_hidden_states": connector_audio_prompt_embeds, + "timestep": video_timestep, + "audio_timestep": timestep, + "encoder_attention_mask": connector_attention_mask, + "audio_encoder_attention_mask": connector_attention_mask, + "num_frames": latent_num_frames, + "height": latent_height, + "width": latent_width, + "fps": frame_rate, + "audio_num_frames": padded_audio_num_frames, + "video_coords": video_coords, + "audio_coords": audio_coords, + "attention_kwargs": attention_kwargs, + "return_dict": False, + } + negative_kwargs = ( + { + **positive_kwargs, "encoder_hidden_states": negative_connector_prompt_embeds, "audio_encoder_hidden_states": negative_connector_audio_prompt_embeds, - "timestep": video_timestep, - "audio_timestep": timestep, "encoder_attention_mask": negative_connector_attention_mask, "audio_encoder_attention_mask": negative_connector_attention_mask, - "num_frames": latent_num_frames, - "height": latent_height, - "width": latent_width, - "fps": frame_rate, - "audio_num_frames": audio_num_frames, - "video_coords": video_coords, - "audio_coords": audio_coords, - "attention_kwargs": attention_kwargs, - "return_dict": False, } + if do_true_cfg + else None + ) - noise_pred_video, noise_pred_audio = self.predict_noise_av_maybe_with_cfg( - do_true_cfg=True, - true_cfg_scale=guidance_scale, - positive_kwargs=positive_kwargs, - negative_kwargs=negative_kwargs, - guidance_rescale=guidance_rescale, - cfg_normalize=False, - ) - - if get_classifier_free_guidance_rank() == 0: - latents = self._step_video_latents_i2v( - noise_pred_video, - latents, - t, - latent_num_frames, - latent_height, - latent_width, - ) - audio_latents = audio_scheduler.step(noise_pred_audio, t, audio_latents, return_dict=False)[0] - - cfg_group = get_cfg_group() - latents = latents.contiguous() - audio_latents = audio_latents.contiguous() - 
cfg_group.broadcast(latents, src=0) - cfg_group.broadcast(audio_latents, src=0) - else: - latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents - latent_model_input = latent_model_input.to(prompt_embeds.dtype) - audio_latent_model_input = ( - torch.cat([audio_latents] * 2) if self.do_classifier_free_guidance else audio_latents - ) - audio_latent_model_input = audio_latent_model_input.to(prompt_embeds.dtype) - - timestep = t.expand(latent_model_input.shape[0]) - video_timestep = timestep.unsqueeze(-1) * (1 - conditioning_mask) - - with self._transformer_cache_context("cond_uncond"): - noise_pred_video, noise_pred_audio = self.transformer( - hidden_states=latent_model_input, - audio_hidden_states=audio_latent_model_input, - encoder_hidden_states=connector_prompt_embeds, - audio_encoder_hidden_states=connector_audio_prompt_embeds, - timestep=video_timestep, - audio_timestep=timestep, - encoder_attention_mask=connector_attention_mask, - audio_encoder_attention_mask=connector_attention_mask, - num_frames=latent_num_frames, - height=latent_height, - width=latent_width, - fps=frame_rate, - audio_num_frames=audio_num_frames, - video_coords=video_coords, - audio_coords=audio_coords, - attention_kwargs=attention_kwargs, - return_dict=False, - ) - noise_pred_video = noise_pred_video.float() - noise_pred_audio = noise_pred_audio.float() - - if self.do_classifier_free_guidance: - noise_pred_video_uncond, noise_pred_video_text = noise_pred_video.chunk(2) - noise_pred_video = noise_pred_video_uncond + guidance_scale * ( - noise_pred_video_text - noise_pred_video_uncond - ) - - noise_pred_audio_uncond, noise_pred_audio_text = noise_pred_audio.chunk(2) - noise_pred_audio = noise_pred_audio_uncond + guidance_scale * ( - noise_pred_audio_text - noise_pred_audio_uncond - ) + noise_pred_video, noise_pred_audio = self.predict_noise_maybe_with_cfg( + do_true_cfg=do_true_cfg, + true_cfg_scale=guidance_scale, + positive_kwargs=positive_kwargs, + negative_kwargs=negative_kwargs, + cfg_normalize=False, + ) - if guidance_rescale > 0: - noise_pred_video = rescale_noise_cfg( - noise_pred_video, noise_pred_video_text, guidance_rescale=guidance_rescale - ) - noise_pred_audio = rescale_noise_cfg( - noise_pred_audio, noise_pred_audio_text, guidance_rescale=guidance_rescale - ) - - latents = self._step_video_latents_i2v( - noise_pred_video, - latents, - t, - latent_num_frames, - latent_height, - latent_width, - ) - - audio_latents = audio_scheduler.step(noise_pred_audio, t, audio_latents, return_dict=False)[0] + latents, audio_latents = self.scheduler_step_maybe_with_cfg( + (noise_pred_video, noise_pred_audio), + (t, t), + (latents, audio_latents), + do_true_cfg=do_true_cfg, + per_request_scheduler=i2v_scheduler, + ) + latents, audio_latents = self._synchronize_cfg_parallel_step_output( + (latents, audio_latents), + do_true_cfg=do_true_cfg, + ) pbar.update() @@ -748,10 +685,15 @@ def forward( latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor ) + audio_latents = self._unpad_audio_latents(audio_latents, original_audio_num_frames) audio_latents = self._denormalize_audio_latents( audio_latents, self.audio_vae.latents_mean, self.audio_vae.latents_std ) - audio_latents = self._unpack_audio_latents(audio_latents, audio_num_frames, num_mel_bins=latent_mel_bins) + audio_latents = self._unpack_audio_latents( + audio_latents, + original_audio_num_frames, + num_mel_bins=latent_mel_bins, + ) if output_type == "latent": video = latents From 
8609bc8ed963b5e7e199efdbbdd88f283be24aa6 Mon Sep 17 00:00:00 2001 From: Lancer <402430575@qq.com> Date: Wed, 8 Apr 2026 14:41:24 +0800 Subject: [PATCH 085/204] [Feat] image2image for Z-Image (#1580) Signed-off-by: Lancer Signed-off-by: Lancer <402430575@qq.com> --- .../image_to_image/image_to_image.md | 1 + .../online_serving/image_to_image/README.md | 1 + .../models/z_image/pipeline_z_image.py | 209 +++++++++++++++--- vllm_omni/entrypoints/openai/api_server.py | 2 + vllm_omni/inputs/data.py | 1 + 5 files changed, 185 insertions(+), 29 deletions(-) diff --git a/examples/offline_inference/image_to_image/image_to_image.md b/examples/offline_inference/image_to_image/image_to_image.md index 2df248e034..1c1a5ff3a7 100644 --- a/examples/offline_inference/image_to_image/image_to_image.md +++ b/examples/offline_inference/image_to_image/image_to_image.md @@ -51,5 +51,6 @@ Key arguments: - `--vae-use-tiling`: enable VAE tiling for memory optimization. - `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](../../../docs/user_guide/diffusion/parallelism_acceleration.md#cfg-parallel). - `--enable-cpu-offload`: enable CPU offloading for diffusion models. +- `--strength`: **Z-Image only** - controls the denoising start timestep for I2I (default: 0.6). Range: [0.0, 1.0]. Lower values preserve more of the original image; higher values allow more creative changes. > ℹ️ If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage. diff --git a/examples/online_serving/image_to_image/README.md b/examples/online_serving/image_to_image/README.md index 789258473f..59b1f0e2c1 100644 --- a/examples/online_serving/image_to_image/README.md +++ b/examples/online_serving/image_to_image/README.md @@ -314,6 +314,7 @@ count, use `size` and `n` rather than `height`, `width`, or | `seed` | int | None | Random seed (reproducible) | | `negative_prompt` | str | None | Negative prompt | | `num_outputs_per_prompt` | int | 1 | Number of images to generate | +| `strength` | float | 0.6 | **Z-Image only** - Denoising start timestep for I2I. Range: [0.0, 1.0]. Lower preserves more of original image. 
| | `layers` | int | 4 | Number of layers (Qwen-Image-Layered) | | `resolution` | int | 640 | Resolution, 640 or 1024 (Qwen-Image-Layered) | diff --git a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py index b9aceed2e5..5bea59a209 100644 --- a/vllm_omni/diffusion/models/z_image/pipeline_z_image.py +++ b/vllm_omni/diffusion/models/z_image/pipeline_z_image.py @@ -21,9 +21,10 @@ from collections.abc import Callable, Iterable from typing import Any +import PIL.Image import torch import torch.nn as nn -from diffusers.image_processor import VaeImageProcessor +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor from diffusers.schedulers import FlowMatchEulerDiscreteScheduler from diffusers.utils import logging from diffusers.utils.torch_utils import randn_tensor @@ -59,7 +60,7 @@ def get_post_process_func( vae_config = json.load(f) vae_scale_factor = 2 ** (len(vae_config["block_out_channels"]) - 1) if "block_out_channels" in vae_config else 8 - image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor * 2) + image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor * 2, do_convert_rgb=True) def post_process_func( images: torch.Tensor, @@ -83,6 +84,20 @@ def calculate_shift( return mu +# Copied from diffusers +def retrieve_latents( + encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps def retrieve_timesteps( scheduler, @@ -187,6 +202,8 @@ def __init__( enable_diffusion_pipeline_profiler=self.od_config.enable_diffusion_pipeline_profiler ) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2, do_convert_rgb=True) + def encode_prompt( self, prompt: str | list[str], @@ -282,12 +299,45 @@ def prepare_latents( device, generator, latents=None, + image=None, + timestep=None, ): height = 2 * (int(height) // (self.vae_scale_factor * 2)) width = 2 * (int(width) // (self.vae_scale_factor * 2)) shape = (batch_size, num_channels_latents, height, width) + if image is not None: + if latents is not None: + return latents.to(device=device, dtype=dtype) + + image = image.to(device=device, dtype=dtype) + if image.shape[1] != num_channels_latents: + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor + else: + image_latents = image + + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + 
f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." + ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + latents = self.scheduler.scale_noise(image_latents, timestep, noise) + return latents + if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) else: @@ -296,6 +346,14 @@ def prepare_latents( latents = latents.to(device) return latents + def get_timesteps(self, num_inference_steps, strength, device): + init_timestep = min(num_inference_steps * strength, num_inference_steps) + t_start = int(max(num_inference_steps - init_timestep, 0)) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + return timesteps, num_inference_steps - t_start + @property def guidance_scale(self): return self._guidance_scale @@ -320,6 +378,8 @@ def forward( self, req: OmniDiffusionRequest, prompt: str | list[str] | None = None, + image: PipelineImageInput = None, + strength: float = 0.6, height: int = 1024, width: int = 1024, num_inference_steps: int = 50, @@ -347,6 +407,11 @@ def forward( prompt (`str` or `list[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. + image (`PipelineImageInput`, *optional*): + The image to use for img2img generation. If provided, the pipeline + will perform img2img instead of text-to-image. + strength (`float`, *optional*, defaults to 0.6): + Indicates extent to transform the reference `image`. Must be between 0 and 1. height (`int`, *optional*, defaults to 1024): The height in pixels of the generated image. width (`int`, *optional*, defaults to 1024): @@ -425,6 +490,34 @@ def forward( elif req.prompts: negative_prompt = ["" if isinstance(p, str) else (p.get("negative_prompt") or "") for p in req.prompts] + # Handle img2img: extract image from request + if image is None and req.prompts: + if len(req.prompts) > 1: + logger.warning( + "This model only supports a single prompt for img2img, not a batched request. " + "Taking only the first image for now." + ) + first_prompt = req.prompts[0] + if not isinstance(first_prompt, str): + raw_image = first_prompt.get("multi_modal_data", {}).get("image") + if raw_image is not None: + if isinstance(raw_image, list): + image = [PIL.Image.open(im) if isinstance(im, str) else raw_image[0] for im in raw_image[:1]] + else: + image = PIL.Image.open(raw_image) if isinstance(raw_image, str) else raw_image + + # strength is currently only applicable for Z-Image I2I; other pipelines ignore this parameter + strength = req.sampling_params.strength if req.sampling_params.strength is not None else strength + if strength is not None and image is None: + logger.warning( + "strength parameter (%.2f) is only applicable for image-to-image (I2I) generation. " + "It will be ignored for text-to-image (T2I) generation.", + strength, + ) + strength = None + if image is not None and strength is not None and (strength < 0 or strength > 1): + raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}") + height = req.sampling_params.height or height width = req.sampling_params.width or width num_inference_steps = req.sampling_params.num_inference_steps or num_inference_steps @@ -491,16 +584,71 @@ def forward( # 4. 
Prepare latent variables num_channels_latents = self.transformer.in_channels - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - num_channels_latents, - height, - width, - torch.float32, - device, - generator, - latents, - ) + # img2img mode: prepare latents from input image + if image is not None: + # Handle image list - take first image + if isinstance(image, list): + image = image[0] + + # Prepare image for VAE encoding using image_processor + if not isinstance(image, torch.Tensor): + init_image = self.image_processor.preprocess(image, height, width) + image = init_image.to(dtype=torch.float32, device=device) + + # Initialize scheduler kwargs for img2img + mu = calculate_shift( + (height // self.vae_scale_factor // 2) * (width // self.vae_scale_factor // 2), + self.scheduler.config.get("base_image_seq_len", 256), + self.scheduler.config.get("max_image_seq_len", 4096), + self.scheduler.config.get("base_shift", 0.5), + self.scheduler.config.get("max_shift", 1.15), + ) + self.scheduler.sigma_min = 0.0 + scheduler_kwargs = {"mu": mu} + + # First initialize timesteps in scheduler + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + sigmas=sigmas, + **scheduler_kwargs, + ) + + # Then adjust timesteps based on strength + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + + if num_inference_steps < 1: + raise ValueError( + f"After adjusting the num_inference_steps by strength parameter: " + f"{strength}, the number of pipeline steps is {num_inference_steps} " + f"which is < 1 and not appropriate for this pipeline." + ) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds[0].dtype, + device, + generator, + latents, + image, + latent_timestep, + ) + else: + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + torch.float32, + device, + generator, + latents, + ) # Repeat prompt_embeds for num_images_per_prompt if num_images_per_prompt > 1: @@ -509,25 +657,28 @@ def forward( negative_prompt_embeds = [npe for npe in negative_prompt_embeds for _ in range(num_images_per_prompt)] actual_batch_size = batch_size * num_images_per_prompt - image_seq_len = (latents.shape[2] // 2) * (latents.shape[3] // 2) # 5. 
Prepare timesteps - mu = calculate_shift( - image_seq_len, - self.scheduler.config.get("base_image_seq_len", 256), - self.scheduler.config.get("max_image_seq_len", 4096), - self.scheduler.config.get("base_shift", 0.5), - self.scheduler.config.get("max_shift", 1.15), - ) - self.scheduler.sigma_min = 0.0 - scheduler_kwargs = {"mu": mu} - timesteps, num_inference_steps = retrieve_timesteps( - self.scheduler, - num_inference_steps, - device, - sigmas=sigmas, - **scheduler_kwargs, - ) + if image is None: + image_seq_len = (latents.shape[2] // 2) * (latents.shape[3] // 2) + mu = calculate_shift( + image_seq_len, + self.scheduler.config.get("base_image_seq_len", 256), + self.scheduler.config.get("max_image_seq_len", 4096), + self.scheduler.config.get("base_shift", 0.5), + self.scheduler.config.get("max_shift", 1.15), + ) + self.scheduler.sigma_min = 0.0 + scheduler_kwargs = {"mu": mu} + + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + sigmas=sigmas, + **scheduler_kwargs, + ) + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) self._num_timesteps = len(timesteps) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 38d32f7198..ebe4cf30bf 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1421,6 +1421,7 @@ async def edit_images( negative_prompt: str | None = Form(None), num_inference_steps: int | None = Form(None), guidance_scale: float | None = Form(None), + strength: float | None = Form(None), true_cfg_scale: float | None = Form(None), seed: int | None = Form(None), generator_device: str | None = Form(None), @@ -1551,6 +1552,7 @@ async def edit_images( # 3.4 Add optional parameters ONLY if provided _update_if_not_none(gen_params, "num_inference_steps", num_inference_steps) _update_if_not_none(gen_params, "guidance_scale", guidance_scale) + _update_if_not_none(gen_params, "strength", strength) _update_if_not_none(gen_params, "true_cfg_scale", true_cfg_scale) # If seed is not provided, generate a random one to ensure # a proper generator is initialized in the backend. 
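
Editorial aside (not part of the patch): the `strength` knob threaded through this patch (the `--strength` CLI flag, the `edit_images` form field above, and `OmniDiffusionSamplingParams.strength`) only shifts where denoising starts. Below is a minimal, self-contained sketch of that mapping, mirroring the `get_timesteps` logic in the Z-Image pipeline earlier in this patch and assuming a first-order scheduler; the helper name `starting_step` is illustrative, not an API from the patch.

```python
# Illustrative only: mirrors the strength-to-start-step mapping used by the
# Z-Image I2I flow (scheduler.order assumed to be 1). Not part of the patch.
def starting_step(num_inference_steps: int, strength: float) -> int:
    # strength == 1.0 -> denoise from (almost) pure noise, all steps run;
    # strength == 0.0 -> no denoising steps, the encoded image latents pass through.
    init_timestep = min(num_inference_steps * strength, num_inference_steps)
    return int(max(num_inference_steps - init_timestep, 0))


assert starting_step(50, 0.6) == 20   # default strength: runs the last 30 of 50 steps
assert starting_step(50, 1.0) == 0    # full denoising
assert starting_step(50, 0.0) == 50   # skips every step
```

With the default `strength=0.6` and 50 inference steps, the I2I path skips the first 20 steps, so the encoded image latents are only partially re-noised before denoising resumes, which is why lower strength preserves more of the input image.
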
diff --git a/vllm_omni/inputs/data.py b/vllm_omni/inputs/data.py index 7824e7092d..9cb6c44335 100644 --- a/vllm_omni/inputs/data.py +++ b/vllm_omni/inputs/data.py @@ -241,6 +241,7 @@ class OmniDiffusionSamplingParams: guidance_scale_provided: bool = False guidance_scale_2: float | None = None guidance_rescale: float = 0.0 + strength: float | None = None # I2I: Z-Image specific now, uses to control denoising start timestep decode_timestep: float | list[float] | None = None decode_noise_scale: float | list[float] | None = None eta: float = 0.0 From c3c736d488264d340e049e79759927be61641888 Mon Sep 17 00:00:00 2001 From: Jinheng Date: Wed, 8 Apr 2026 14:55:35 +0800 Subject: [PATCH 086/204] [Feature] Port Bagel RDMA flow to latest main (#2000) Signed-off-by: Jinheng Li Signed-off-by: natureofnature Co-authored-by: ahengljh Co-authored-by: natureofnature --- .../mooncake_transfer_engine_connector.md | 16 +- tests/diffusion/test_stage_diffusion_proc.py | 65 +++ .../omni_connectors/test_basic_connectors.py | 58 ++ .../omni_connectors/test_kv_flow.py | 227 +++++++- .../test_orchestrator_kv_sender_info.py | 207 +++++++ vllm_omni/diffusion/request.py | 2 + vllm_omni/diffusion/stage_diffusion_client.py | 12 +- vllm_omni/diffusion/stage_diffusion_proc.py | 38 +- .../mooncake_transfer_engine_connector.py | 10 +- .../omni_connectors/kv_transfer_manager.py | 522 ++++++++++++++++-- .../omni_connectors/utils/initialization.py | 85 ++- vllm_omni/engine/async_omni_engine.py | 8 +- vllm_omni/engine/orchestrator.py | 43 +- vllm_omni/engine/stage_engine_core_client.py | 121 ++++ .../model_executor/stage_configs/bagel.yaml | 27 + 15 files changed, 1376 insertions(+), 65 deletions(-) create mode 100644 tests/diffusion/test_stage_diffusion_proc.py create mode 100644 tests/engine/test_orchestrator_kv_sender_info.py diff --git a/docs/design/feature/omni_connectors/mooncake_transfer_engine_connector.md b/docs/design/feature/omni_connectors/mooncake_transfer_engine_connector.md index 798644b96f..306a0620b4 100644 --- a/docs/design/feature/omni_connectors/mooncake_transfer_engine_connector.md +++ b/docs/design/feature/omni_connectors/mooncake_transfer_engine_connector.md @@ -33,8 +33,8 @@ runtime: zmq_port: 50051 # ZMQ base port (see "Port Offset Scheme" below) protocol: "rdma" # "rdma" or "tcp" device_name: "" # RDMA device (e.g., "mlx5_0"), empty for auto-detect - memory_pool_size: 2147483648 # 2GB memory pool - memory_pool_device: "cpu" # "cpu" for pinned memory, "cuda" for GPUDirect RDMA + memory_pool_size: 4294967296 # 4 GB (CPU); use 2147483648 (2 GB) for GPU + memory_pool_device: "cpu" # "cpu" for pinned memory (recommended), "cuda" for GPUDirect RDMA ``` Wire stages to the connector: @@ -64,8 +64,8 @@ stage_args: | Parameter | Default | Description | |---|---|---| -| `memory_pool_size` | 1 GB | Total size of the RDMA-registered memory pool in bytes. | -| `memory_pool_device` | `"cpu"` | `"cpu"`: pinned host memory (recommended). `"cuda"`: GPU VRAM for GPUDirect RDMA (requires NIC-GPU direct PCIe connectivity). | +| `memory_pool_size` | 4 GB (CPU) / 2 GB (GPU) | Total size of the RDMA-registered memory pool in bytes. Recommended 4 GB for CPU pinned memory; 2 GB for GPU VRAM to conserve device memory. | +| `memory_pool_device` | `"cpu"` | `"cpu"`: pinned host memory (recommended, works on all topologies). `"cuda"`: GPU VRAM for GPUDirect RDMA (requires NIC-GPU direct PCIe connectivity, PIX topology). 
| ### Networking @@ -107,10 +107,10 @@ receiver_connect = remote_side_channel_port + tp_rank ## Memory Pool Modes -| Mode | Config | Data Flow | Best For | -|---|---|---|---| -| CPU Pinned | `memory_pool_device: "cpu"` | GPU → CPU pool → RDMA → CPU pool → GPU | Most hardware topologies (recommended) | -| GPUDirect | `memory_pool_device: "cuda"` | GPU → GPU pool → RDMA (NIC reads GPU BAR1) → GPU pool | NIC-GPU direct PCIe (PIX topology) | +| Mode | Config | Recommended Pool Size | Data Flow | Best For | +|---|---|---|---|---| +| CPU Pinned | `memory_pool_device: "cpu"` | 4 GB | GPU → CPU pool → RDMA → CPU pool → GPU | Most hardware topologies (recommended) | +| GPUDirect | `memory_pool_device: "cuda"` | 2 GB | GPU → GPU pool → RDMA (NIC reads GPU BAR1) → GPU pool | NIC-GPU direct PCIe (PIX topology) | > **Note**: GPUDirect RDMA requires the NIC and GPU to share a direct PCIe > switch (PIX topology). On systems where they are connected via PXB or NODE, diff --git a/tests/diffusion/test_stage_diffusion_proc.py b/tests/diffusion/test_stage_diffusion_proc.py new file mode 100644 index 0000000000..c26070ad43 --- /dev/null +++ b/tests/diffusion/test_stage_diffusion_proc.py @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import asyncio +from concurrent.futures import ThreadPoolExecutor +from dataclasses import asdict +from types import SimpleNamespace + +import pytest + +from vllm_omni.diffusion.stage_diffusion_proc import StageDiffusionProc +from vllm_omni.inputs.data import OmniDiffusionSamplingParams + +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] + + +def test_process_batch_request_preserves_parent_request_id_and_kv_sender_info(): + async def run_test(): + captured = {} + + def step(request): + captured["request"] = request + return [ + SimpleNamespace( + images=["img-1"], + _multimodal_output={}, + metrics={}, + stage_durations={}, + peak_memory_mb=0.0, + latents=None, + final_output_type="image", + ), + SimpleNamespace( + images=["img-2"], + _multimodal_output={}, + metrics={}, + stage_durations={}, + peak_memory_mb=0.0, + latents=None, + final_output_type="image", + ), + ] + + proc = object.__new__(StageDiffusionProc) + proc._engine = SimpleNamespace(step=step) + proc._executor = ThreadPoolExecutor(max_workers=1) + + try: + result = await proc._process_batch_request( + request_id="req-parent", + prompts=["hello", "world"], + sampling_params_dict=asdict(OmniDiffusionSamplingParams()), + kv_sender_info={0: {"host": "10.0.0.2", "zmq_port": 50151}}, + ) + finally: + proc._executor.shutdown(wait=True) + + request = captured["request"] + assert request.request_id == "req-parent" + assert request.request_ids == ["req-parent-0", "req-parent-1"] + assert request.kv_sender_info == {0: {"host": "10.0.0.2", "zmq_port": 50151}} + assert result.request_id == "req-parent" + assert result.images == ["img-1", "img-2"] + + asyncio.run(run_test()) diff --git a/tests/distributed/omni_connectors/test_basic_connectors.py b/tests/distributed/omni_connectors/test_basic_connectors.py index 1b1965355e..bca96e790d 100644 --- a/tests/distributed/omni_connectors/test_basic_connectors.py +++ b/tests/distributed/omni_connectors/test_basic_connectors.py @@ -120,3 +120,61 @@ def test_get_invalid_metadata(shm_connector): result = shm_connector.get("stage_0", "stage_1", "req_3", {"unknown": "format"}) assert result is None + + +def test_mooncake_connector_defaults_missing_host_to_detected_ip(monkeypatch: 
pytest.MonkeyPatch): + import vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector as mooncake_module + + class _FakePool: + is_cuda = False + + def pin_memory(self): + return self + + def data_ptr(self): + return 1234 + + class _FakeTransferEngine: + def initialize(self, host, mode, protocol, device_name): + self.host = host + self.mode = mode + self.protocol = protocol + self.device_name = device_name + return 0 + + def get_rpc_port(self): + return 23456 + + def register_memory(self, base_ptr, pool_size): + del base_ptr, pool_size + return 0 + + def unregister_memory(self, base_ptr): + del base_ptr + return 0 + + monkeypatch.setattr(mooncake_module, "TransferEngine", _FakeTransferEngine) + monkeypatch.setattr(mooncake_module.torch, "empty", lambda *args, **kwargs: _FakePool()) + monkeypatch.setattr( + mooncake_module.MooncakeTransferEngineConnector, + "_get_local_ip", + lambda self: "10.20.30.40", + ) + monkeypatch.setattr( + mooncake_module.MooncakeTransferEngineConnector, + "_zmq_listener_loop", + lambda self: self._listener_ready.set(), + ) + + connector = mooncake_module.MooncakeTransferEngineConnector( + { + "zmq_port": 50051, + "memory_pool_size": 4096, + } + ) + try: + assert connector.host == "10.20.30.40" + assert connector.engine.host == "10.20.30.40" + assert connector.get_connection_info()["host"] == "10.20.30.40" + finally: + connector.close() diff --git a/tests/distributed/omni_connectors/test_kv_flow.py b/tests/distributed/omni_connectors/test_kv_flow.py index b12fc013b7..cea1860193 100644 --- a/tests/distributed/omni_connectors/test_kv_flow.py +++ b/tests/distributed/omni_connectors/test_kv_flow.py @@ -1,8 +1,14 @@ +import json +import struct + +import numpy as np import pytest import torch +import vllm_omni.distributed.omni_connectors.kv_transfer_manager as kv_transfer_manager_module from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.distributed.omni_connectors.kv_transfer_manager import ( + KVCacheTransferData, OmniKVCacheConfig, OmniKVTransferManager, ) @@ -60,6 +66,35 @@ def common_constants(): } +def _decode_stored_payload(data): + if isinstance(data, torch.Tensor) and data.dtype == torch.uint8 and data.dim() == 1: + return KVCacheTransferData.from_bytes(data.cpu().numpy().tobytes()) + + if isinstance(data, (bytes, bytearray, memoryview)): + return KVCacheTransferData.from_bytes(data) + + return data + + +def _make_serialized_payload() -> tuple[bytes, torch.Tensor]: + key_tensor = torch.arange(12, dtype=torch.float32).reshape(3, 4) + payload = KVCacheTransferData( + request_id="req-payload", + layer_blocks={"key_cache": [key_tensor], "value_cache": [None]}, + block_ids=[1], + metadata={"seq_len": 3}, + ).to_bytes() + return payload, key_tensor + + +def _rewrite_serialized_header(payload: bytes, mutate_header) -> bytes: + header_len = struct.unpack(">I", payload[:4])[0] + header = json.loads(payload[4 : 4 + header_len]) + mutate_header(header) + new_header = json.dumps(header, separators=(",", ":")).encode("utf-8") + return struct.pack(">I", len(new_header)) + new_header + payload[4 + header_len :] + + def test_manager_extraction(kv_config, mock_connector, common_constants): """Test extraction and sending logic in OmniKVTransferManager.""" num_layers = common_constants["num_layers"] @@ -95,7 +130,7 @@ def test_manager_extraction(kv_config, mock_connector, common_constants): expected_key = f"stage1->stage2:{full_request_id}" assert expected_key in mock_connector.store - data = mock_connector.store[expected_key] + 
data = _decode_stored_payload(mock_connector.store[expected_key]) assert data["request_id"] == req_id assert "layer_blocks" in data assert len(data["layer_blocks"]["key_cache"]) == num_layers @@ -106,6 +141,116 @@ def test_manager_extraction(kv_config, mock_connector, common_constants): assert data["layer_blocks"]["key_cache"][0].shape == expected_shape +def test_from_bytes_rejects_out_of_bounds_header_len(): + payload, _ = _make_serialized_payload() + bad_payload = struct.pack(">I", len(payload)) + payload[4:] + + with pytest.raises(ValueError, match="header_len"): + KVCacheTransferData.from_bytes(bad_payload) + + with pytest.raises(ValueError, match="header_len"): + KVCacheTransferData.from_bytes_gpu(torch.tensor(list(bad_payload), dtype=torch.uint8)) + + +def test_from_bytes_rejects_out_of_bounds_tensor_span(): + payload, _ = _make_serialized_payload() + bad_payload = _rewrite_serialized_header(payload, lambda header: header["td"][0].update({"o": 4096})) + + with pytest.raises(ValueError, match="tensor span"): + KVCacheTransferData.from_bytes(bad_payload) + + with pytest.raises(ValueError, match="tensor span"): + KVCacheTransferData.from_bytes_gpu(torch.tensor(list(bad_payload), dtype=torch.uint8)) + + +def test_from_bytes_rejects_unsupported_dtype(): + payload, _ = _make_serialized_payload() + bad_payload = _rewrite_serialized_header(payload, lambda header: header["td"][0].update({"d": "cuda"})) + + with pytest.raises(ValueError, match="Unsupported dtype"): + KVCacheTransferData.from_bytes(bad_payload) + + with pytest.raises(ValueError, match="Unsupported dtype"): + KVCacheTransferData.from_bytes_gpu(torch.tensor(list(bad_payload), dtype=torch.uint8)) + + +def test_from_bytes_uses_explicit_layer_index_descriptor(): + payload, key_tensor = _make_serialized_payload() + payload_with_explicit_index = _rewrite_serialized_header( + payload, + lambda header: header["td"][0].update({"n": "key_cache_extra_suffix", "i": 0}), + ) + + data = KVCacheTransferData.from_bytes(payload_with_explicit_index) + + assert torch.equal(data["layer_blocks"]["key_cache"][0], key_tensor) + + +def test_update_sender_info_uses_configured_source_stage(): + config = OmniKVCacheConfig( + connector_config={"type": "mock"}, + stage_id=2, + engine_input_source=[1], + need_recv_cache=True, + ) + manager = OmniKVTransferManager(config) + + manager.update_sender_info( + { + 0: {"host": "10.0.0.1", "zmq_port": 50151}, + 1: {"host": "10.0.0.2", "zmq_port": 50152}, + } + ) + + assert manager.config.connector_config["sender_host"] == "10.0.0.2" + assert manager.config.connector_config["sender_zmq_port"] == 50152 + + +def test_clone_received_payload_tensors_breaks_buffer_alias(): + payload, key_tensor = _make_serialized_payload() + raw = np.frombuffer(bytearray(payload), dtype=np.uint8) + data = KVCacheTransferData.from_bytes(memoryview(raw)) + + OmniKVTransferManager._clone_received_payload_tensors(data) + raw[:] = 0 + + assert torch.equal(data["layer_blocks"]["key_cache"][0], key_tensor) + + +def test_receive_kv_cache_uses_exponential_backoff(monkeypatch): + config = OmniKVCacheConfig( + connector_config={"type": "mock"}, + from_stage="sender", + stage_id="receiver", + need_recv_cache=True, + recv_timeout=0.3, + ) + manager = OmniKVTransferManager(config) + + class _NeverReadyConnector: + def get(self, **kwargs): + del kwargs + return None + + manager._connector = _NeverReadyConnector() + + now = {"value": 0.0} + sleep_intervals = [] + + monkeypatch.setattr(kv_transfer_manager_module.time, "time", lambda: now["value"]) + + def 
_fake_sleep(interval: float) -> None: + sleep_intervals.append(interval) + now["value"] += interval + + monkeypatch.setattr(kv_transfer_manager_module.time, "sleep", _fake_sleep) + + data, size = manager.receive_kv_cache_for_request("req-backoff") + + assert (data, size) == (None, 0) + assert sleep_intervals == pytest.approx([0.01, 0.02, 0.04, 0.08, 0.16]) + + def test_manager_extraction_tuple_layout(kv_config, mock_connector, common_constants): """Test extraction with tuple layout.""" num_layers = common_constants["num_layers"] @@ -135,7 +280,7 @@ def test_manager_extraction_tuple_layout(kv_config, mock_connector, common_const expected_key = f"stage1->stage2:{full_request_id}" assert expected_key in mock_connector.store - data = mock_connector.store[expected_key] + data = _decode_stored_payload(mock_connector.store[expected_key]) expected_shape = (seq_len, num_heads, head_dim) for idx in range(len(kv_caches)): assert data["layer_blocks"]["key_cache"][idx].shape == expected_shape @@ -165,7 +310,7 @@ def test_manager_extraction_mismatched_kv_block_counts(kv_config, mock_connector expected_key = f"stage1->stage2:{full_request_id}" assert expected_key in mock_connector.store - data = mock_connector.store[expected_key] + data = _decode_stored_payload(mock_connector.store[expected_key]) expected_shape = (2 * block_size, num_heads, head_dim) assert data["layer_blocks"]["key_cache"][0].shape == expected_shape assert data["layer_blocks"]["value_cache"][0].shape == expected_shape @@ -254,6 +399,82 @@ def test_manager_reception(kv_config, mock_connector, common_constants): assert req.kv_metadata["seq_len"] == seq_len +def test_manager_reception_prefers_parent_request_id_for_batched_request(kv_config, mock_connector, common_constants): + """Batched diffusion requests must fetch KV using the parent/global request ID.""" + num_layers = common_constants["num_layers"] + num_heads = common_constants["num_heads"] + head_dim = common_constants["head_dim"] + seq_len = common_constants["seq_len"] + parent_req_id = common_constants["req_id"] + + expected_shape = (seq_len, num_heads, head_dim) + key_cache = [torch.randn(expected_shape) for _ in range(num_layers)] + value_cache = [torch.randn(expected_shape) for _ in range(num_layers)] + + data_to_receive = { + "request_id": parent_req_id, + "layer_blocks": {"key_cache": key_cache, "value_cache": value_cache}, + "metadata": {"seq_len": seq_len}, + "block_ids": [], + } + + manager = OmniKVTransferManager(kv_config) + manager._connector = mock_connector + + full_request_id = f"omni_stage1_to_stage2_kv_cache_{parent_req_id}" + store_key = f"stage1->stage2:{full_request_id}" + mock_connector.store[store_key] = data_to_receive + + req = OmniDiffusionRequest( + prompts=["prompt-a", "prompt-b"], + sampling_params=OmniDiffusionSamplingParams(), + request_ids=[f"{parent_req_id}-0", f"{parent_req_id}-1"], + request_id=parent_req_id, + ) + + success = manager.receive_kv_cache(req, target_device=torch.device("cpu")) + + assert success + assert req.kv_metadata["seq_len"] == seq_len + assert torch.allclose(req.past_key_values.key_cache[0], key_cache[0]) + + +def test_receive_multi_kv_cache_uses_parent_request_id_for_cfg_collection(kv_config): + manager = OmniKVTransferManager(kv_config) + + seen = {} + + def collect_cfg(request_id, cfg_request_ids, kv_transfer_manager, target_device): + seen["request_id"] = request_id + seen["cfg_request_ids"] = cfg_request_ids + seen["kv_transfer_manager"] = kv_transfer_manager + seen["target_device"] = target_device + return 
{"cfg_text_kv_metadata": {"ok": True}} + + req = OmniDiffusionRequest( + prompts=["prompt-a", "prompt-b"], + sampling_params=OmniDiffusionSamplingParams(), + request_ids=["req-parent-0", "req-parent-1"], + request_id="req-parent", + ) + req.sampling_params.cfg_kv_request_ids = {"cfg_text": "req-parent__cfg_text"} + + manager.receive_kv_cache = lambda request, target_device=None: request is req + + success = manager.receive_multi_kv_cache( + req, + cfg_kv_collect_func=collect_cfg, + target_device=torch.device("cpu"), + ) + + assert success + assert seen["request_id"] == "req-parent" + assert seen["cfg_request_ids"] == {"cfg_text": "req-parent__cfg_text"} + assert seen["kv_transfer_manager"] is manager + assert seen["target_device"] == torch.device("cpu") + assert req.sampling_params.cfg_text_kv_metadata == {"ok": True} + + def test_integration_flow(common_constants): """Simulate extraction -> connector -> reception.""" num_layers = common_constants["num_layers"] diff --git a/tests/engine/test_orchestrator_kv_sender_info.py b/tests/engine/test_orchestrator_kv_sender_info.py new file mode 100644 index 0000000000..94da4ce717 --- /dev/null +++ b/tests/engine/test_orchestrator_kv_sender_info.py @@ -0,0 +1,207 @@ +import asyncio +from types import SimpleNamespace + +import pytest +from vllm import SamplingParams + +from vllm_omni.engine.orchestrator import Orchestrator, OrchestratorRequestState +from vllm_omni.engine.stage_engine_core_client import StageEngineCoreClient +from vllm_omni.inputs.data import OmniDiffusionSamplingParams + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +class _DummySenderStage: + def __init__(self, sender_info): + self._sender_info = sender_info + self.engine_outputs = None + + def set_engine_outputs(self, outputs): + self.engine_outputs = outputs + + def get_kv_sender_info(self): + return self._sender_info + + +class _DummyDiffusionStage: + stage_type = "diffusion" + custom_process_input_func = None + + def __init__(self, engine_input_source=None): + self.engine_input_source = engine_input_source or [0] + self.calls = [] + + async def add_request_async(self, request_id, prompt, sampling_params, kv_sender_info=None): + self.calls.append( + { + "request_id": request_id, + "prompt": prompt, + "sampling_params": sampling_params, + "kv_sender_info": kv_sender_info, + } + ) + + +def test_stage_engine_core_client_builds_kv_sender_info_from_tcp_address(): + client = object.__new__(StageEngineCoreClient) + client.stage_id = 0 + client.client_addresses = {"input_address": "tcp://10.20.30.40:1234"} + client._omni_kv_config = None + client._kv_sender_info = None + client._kv_sender_initialized = False + client._kv_sender_host = client._resolve_contact_host() + client._initialize_kv_sender_endpoint() + + assert client.get_kv_sender_info() == { + "host": "10.20.30.40", + "zmq_port": 50151, + } + + +def test_stage_engine_core_client_falls_back_to_detected_ip_for_loopback(monkeypatch): + client = object.__new__(StageEngineCoreClient) + client.stage_id = 1 + client.client_addresses = {"input_address": "tcp://127.0.0.1:1234"} + client._omni_kv_config = None + client._kv_sender_info = None + client._kv_sender_initialized = False + monkeypatch.setattr(client, "_detect_local_ip", lambda: "192.168.0.12") + client._kv_sender_host = client._resolve_contact_host() + client._initialize_kv_sender_endpoint() + + assert client.get_kv_sender_info() == { + "host": "192.168.0.12", + "zmq_port": 50152, + } + + +def test_stage_engine_core_client_uses_connector_config_for_sender_port(): + 
client = object.__new__(StageEngineCoreClient) + client.stage_id = 3 + client.client_addresses = {"input_address": "tcp://10.20.30.40:1234"} + client._kv_sender_info = None + client._kv_sender_initialized = False + client._omni_kv_config = { + "omni_from_stage": "3", + "connector_config": { + "type": "MooncakeTransferEngineConnector", + "role": "sender", + "host": "10.20.30.99", + "zmq_port": 51000, + }, + } + client._kv_sender_host = client._resolve_contact_host() + client._initialize_kv_sender_endpoint() + + assert client.get_kv_sender_info() == { + "host": "10.20.30.99", + "zmq_port": 51103, + } + + +def test_stage_engine_core_client_preserves_explicit_loopback_sender_host(): + client = object.__new__(StageEngineCoreClient) + client.stage_id = 2 + client.client_addresses = {"input_address": "tcp://10.20.30.40:1234"} + client._kv_sender_info = None + client._kv_sender_initialized = False + client._omni_kv_config = { + "omni_from_stage": "2", + "connector_config": { + "type": "MooncakeTransferEngineConnector", + "role": "sender", + "host": "127.0.0.1", + "zmq_port": 51000, + }, + } + client._kv_sender_host = client._resolve_contact_host() + client._initialize_kv_sender_endpoint() + + assert client.get_kv_sender_info() == { + "host": "127.0.0.1", + "zmq_port": 51102, + } + + +def test_forward_to_diffusion_attaches_kv_sender_info(): + orchestrator = object.__new__(Orchestrator) + sender_stage = _DummySenderStage({"host": "10.0.0.2", "zmq_port": 50151}) + diffusion_stage = _DummyDiffusionStage(engine_input_source=[0]) + + orchestrator.num_stages = 2 + orchestrator.stage_clients = [sender_stage, diffusion_stage] + orchestrator._companion_map = {} + orchestrator.stage_vllm_configs = [None, None] + orchestrator.output_processors = [None, None] + + params = OmniDiffusionSamplingParams() + req_state = OrchestratorRequestState( + request_id="req-1", + prompt={"prompt": "hello"}, + sampling_params_list=[SamplingParams(max_tokens=4), params], + final_stage_id=1, + ) + + output = SimpleNamespace(request_id="req-1", finished=True) + asyncio.run(Orchestrator._forward_to_next_stage(orchestrator, "req-1", 0, output, req_state)) + + assert sender_stage.engine_outputs == [output] + assert diffusion_stage.calls[0]["request_id"] == "req-1" + assert diffusion_stage.calls[0]["kv_sender_info"] == { + 0: {"host": "10.0.0.2", "zmq_port": 50151}, + } + assert req_state.stage_submit_ts[1] > 0 + + +def test_forward_to_diffusion_uses_engine_input_source_for_kv_sender_info(): + orchestrator = object.__new__(Orchestrator) + source_stage = _DummySenderStage({"host": "10.0.0.2", "zmq_port": 50151}) + previous_stage = _DummySenderStage({"host": "10.0.0.9", "zmq_port": 59999}) + diffusion_stage = _DummyDiffusionStage(engine_input_source=[0]) + + orchestrator.num_stages = 3 + orchestrator.stage_clients = [source_stage, previous_stage, diffusion_stage] + orchestrator._companion_map = {} + orchestrator.stage_vllm_configs = [None, None, None] + orchestrator.output_processors = [None, None, None] + + params = OmniDiffusionSamplingParams() + req_state = OrchestratorRequestState( + request_id="req-3", + prompt={"prompt": "hello"}, + sampling_params_list=[SamplingParams(max_tokens=4), SamplingParams(max_tokens=4), params], + final_stage_id=2, + ) + + output = SimpleNamespace(request_id="req-3", finished=True) + asyncio.run(Orchestrator._forward_to_next_stage(orchestrator, "req-3", 1, output, req_state)) + + assert previous_stage.engine_outputs == [output] + assert diffusion_stage.calls[0]["kv_sender_info"] == { + 0: {"host": 
"10.0.0.2", "zmq_port": 50151}, + } + + +def test_prewarm_diffusion_attaches_kv_sender_info(): + orchestrator = object.__new__(Orchestrator) + sender_stage = _DummySenderStage({"host": "10.0.0.3", "zmq_port": 50151}) + diffusion_stage = _DummyDiffusionStage(engine_input_source=[0]) + + orchestrator.stage_clients = [sender_stage, diffusion_stage] + orchestrator.num_stages = 2 + + req_state = OrchestratorRequestState( + request_id="req-2", + prompt={"prompt": "hello"}, + sampling_params_list=[SamplingParams(max_tokens=4), OmniDiffusionSamplingParams()], + final_stage_id=1, + ) + + stage0_request = SimpleNamespace(prompt_token_ids=[1, 2, 3]) + asyncio.run(Orchestrator._prewarm_async_chunk_stages(orchestrator, "req-2", stage0_request, req_state)) + + assert diffusion_stage.calls[0]["request_id"] == "req-2" + assert diffusion_stage.calls[0]["kv_sender_info"] == { + 0: {"host": "10.0.0.3", "zmq_port": 50151}, + } + assert req_state.stage_submit_ts[1] > 0 diff --git a/vllm_omni/diffusion/request.py b/vllm_omni/diffusion/request.py index 1d6d64905a..4d4328d251 100644 --- a/vllm_omni/diffusion/request.py +++ b/vllm_omni/diffusion/request.py @@ -26,6 +26,8 @@ class OmniDiffusionRequest: sampling_params: OmniDiffusionSamplingParams request_ids: list[str] = field(default_factory=list) + request_id: str | None = None + kv_sender_info: dict | None = None def __post_init__(self): """Initialize dependent fields after dataclass initialization.""" diff --git a/vllm_omni/diffusion/stage_diffusion_client.py b/vllm_omni/diffusion/stage_diffusion_client.py index 77db2b1b97..a1a4766de2 100644 --- a/vllm_omni/diffusion/stage_diffusion_client.py +++ b/vllm_omni/diffusion/stage_diffusion_client.py @@ -179,6 +179,7 @@ async def add_request_async( request_id: str, prompt: OmniPromptType, sampling_params: OmniDiffusionSamplingParams, + kv_sender_info: dict[int, dict[str, Any]] | None = None, ) -> None: self._request_socket.send( self._encoder.encode( @@ -187,6 +188,7 @@ async def add_request_async( "request_id": request_id, "prompt": prompt, "sampling_params": self._sampling_params_to_dict(sampling_params), + "kv_sender_info": kv_sender_info, } ) ) @@ -198,6 +200,7 @@ async def add_batch_request_async( request_id: str, prompts: list[OmniPromptType], sampling_params: OmniDiffusionSamplingParams, + kv_sender_info: dict[int, dict[str, Any]] | None = None, ) -> None: """Submit a list of prompts as a single batched engine call. @@ -206,7 +209,12 @@ async def add_batch_request_async( *request_id*. 
""" task = asyncio.create_task( - self._run_batch(request_id, prompts, sampling_params), + self._run_batch( + request_id, + prompts, + sampling_params, + kv_sender_info, + ), name=f"diffusion-batch-{request_id}", ) self._tasks[request_id] = task @@ -216,6 +224,7 @@ async def _run_batch( request_id: str, prompts: list[OmniPromptType], sampling_params: OmniDiffusionSamplingParams, + kv_sender_info: dict[int, dict[str, Any]] | None = None, ) -> None: try: self._request_socket.send( @@ -225,6 +234,7 @@ async def _run_batch( "request_id": request_id, "prompts": prompts, "sampling_params": self._sampling_params_to_dict(sampling_params), + "kv_sender_info": kv_sender_info, } ) ) diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py index bcc3bef15d..9d8c06cce9 100644 --- a/vllm_omni/diffusion/stage_diffusion_proc.py +++ b/vllm_omni/diffusion/stage_diffusion_proc.py @@ -130,6 +130,7 @@ async def _process_request( request_id: str, prompt: Any, sampling_params_dict: dict, + kv_sender_info: dict[str, Any] | None = None, ) -> OmniRequestOutput: """Build a diffusion request and run DiffusionEngine.step().""" sampling_params = self._reconstruct_sampling_params(sampling_params_dict) @@ -138,6 +139,8 @@ async def _process_request( prompts=[prompt], sampling_params=sampling_params, request_ids=[request_id], + request_id=request_id, + kv_sender_info=kv_sender_info, ) loop = asyncio.get_running_loop() @@ -152,6 +155,7 @@ async def _process_batch_request( request_id: str, prompts: list[Any], sampling_params_dict: dict, + kv_sender_info: dict[str, Any] | None = None, ) -> OmniRequestOutput: """Build a batched diffusion request and run DiffusionEngine.step(). @@ -165,7 +169,9 @@ async def _process_batch_request( request = OmniDiffusionRequest( prompts=prompts, sampling_params=sampling_params, - request_ids=[request_id] * len(prompts), + request_ids=[f"{request_id}-{i}" for i in range(len(prompts))], + request_id=request_id, + kv_sender_info=kv_sender_info, ) loop = asyncio.get_running_loop() @@ -346,10 +352,20 @@ async def run_loop( tasks: dict[str, asyncio.Task] = {} - async def _dispatch_request(request_id: str, prompt: Any, sampling_params_dict: dict) -> None: + async def _dispatch_request( + request_id: str, + prompt: Any, + sampling_params_dict: dict, + kv_sender_info: dict[str, Any] | None = None, + ) -> None: """Process a single diffusion request and send the response.""" try: - result = await self._process_request(request_id, prompt, sampling_params_dict) + result = await self._process_request( + request_id, + prompt, + sampling_params_dict, + kv_sender_info=kv_sender_info, + ) await response_socket.send(encoder.encode({"type": "result", "output": result})) except DiffusionRequestAbortedError as e: logger.info( @@ -384,6 +400,7 @@ async def _dispatch_request(request_id: str, prompt: Any, sampling_params_dict: request_id, msg["prompt"], msg["sampling_params"], + msg.get("kv_sender_info"), ) ) tasks[request_id] = task @@ -391,9 +408,19 @@ async def _dispatch_request(request_id: str, prompt: Any, sampling_params_dict: elif msg_type == "add_batch_request": request_id = msg["request_id"] - async def _dispatch_batch(rid: str, prompts: list, sp_dict: dict) -> None: + async def _dispatch_batch( + rid: str, + prompts: list, + sp_dict: dict, + kv_sender_info: dict[str, Any] | None = None, + ) -> None: try: - result = await self._process_batch_request(rid, prompts, sp_dict) + result = await self._process_batch_request( + rid, + prompts, + sp_dict, + 
kv_sender_info=kv_sender_info, + ) await response_socket.send(encoder.encode({"type": "result", "output": result})) except DiffusionRequestAbortedError as e: logger.info( @@ -420,6 +447,7 @@ async def _dispatch_batch(rid: str, prompts: list, sp_dict: dict) -> None: request_id, msg["prompts"], msg["sampling_params"], + msg.get("kv_sender_info"), ) ) tasks[request_id] = task diff --git a/vllm_omni/distributed/omni_connectors/connectors/mooncake_transfer_engine_connector.py b/vllm_omni/distributed/omni_connectors/connectors/mooncake_transfer_engine_connector.py index b1dc8b8987..96a528963f 100644 --- a/vllm_omni/distributed/omni_connectors/connectors/mooncake_transfer_engine_connector.py +++ b/vllm_omni/distributed/omni_connectors/connectors/mooncake_transfer_engine_connector.py @@ -277,13 +277,15 @@ def __init__(self, config: dict[str, Any]): } self.config = config - host_config = config.get("host", "127.0.0.1") - # Support "auto" to auto-detect local IP address - if host_config.lower() == "auto": + host_config = config.get("host") + host_value = "auto" if host_config is None else str(host_config) + # Default sender/receiver bootstrap to a routable local IP so the + # advertised endpoint matches the interface Mooncake binds. + if host_value.lower() == "auto" or host_value in {"", "*", "0.0.0.0", "::"}: self.host = self._get_local_ip() logger.info(f"Auto-detected local IP for RDMA: {self.host}") else: - self.host = host_config + self.host = host_value self.zmq_port = config.get("zmq_port", 50051) self.protocol = config.get("protocol", "rdma") diff --git a/vllm_omni/distributed/omni_connectors/kv_transfer_manager.py b/vllm_omni/distributed/omni_connectors/kv_transfer_manager.py index 1f49384383..1958c9d40a 100644 --- a/vllm_omni/distributed/omni_connectors/kv_transfer_manager.py +++ b/vllm_omni/distributed/omni_connectors/kv_transfer_manager.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Unified OmniConnector and KV cache transfer management.""" +import json +import struct import time from collections.abc import Callable from dataclasses import asdict, dataclass @@ -12,12 +14,36 @@ from .factory import OmniConnectorFactory from .utils.config import ConnectorSpec +from .utils.initialization import KV_TRANSFER_PORT_OFFSET from .utils.kv_utils import normalize_layer_kv logger = init_logger(__name__) LayerKV = torch.Tensor | tuple[torch.Tensor, torch.Tensor] +_SAFE_TORCH_DTYPES = { + name: dtype + for name in ( + "bool", + "uint8", + "int8", + "int16", + "int32", + "int64", + "float16", + "float32", + "float64", + "bfloat16", + "complex64", + "complex128", + "float8_e4m3fn", + "float8_e4m3fnuz", + "float8_e5m2", + "float8_e5m2fnuz", + ) + if isinstance((dtype := getattr(torch, name, None)), torch.dtype) +} + @dataclass class OmniKVCacheConfig: @@ -46,6 +72,242 @@ def to_dict(self) -> dict[str, Any]: """Convert to dictionary for serialization.""" return asdict(self) + def to_bytes(self) -> bytes: + """Convert to compact binary format for fast transfer.""" + tensors_desc: list[dict[str, Any]] = [] + tensor_bufs: list[bytes] = [] + data_offset = 0 + + for cache_name in ("key_cache", "value_cache"): + cache_list = self.layer_blocks.get(cache_name, []) + for layer_idx, tensor in enumerate(cache_list): + if tensor is None: + tensors_desc.append({"n": f"{cache_name}_{layer_idx}", "x": True}) + continue + + t = tensor.detach().cpu().contiguous() + dtype_str = str(t.dtype).removeprefix("torch.") + raw = t.view(torch.uint8).numpy().tobytes() + tensors_desc.append( 
+ { + "n": f"{cache_name}_{layer_idx}", + "i": layer_idx, + "d": dtype_str, + "s": list(t.shape), + "o": data_offset, + "b": len(raw), + } + ) + tensor_bufs.append(raw) + data_offset += len(raw) + + header = json.dumps( + { + "rid": self.request_id, + "bids": self.block_ids, + "meta": self.metadata, + "td": tensors_desc, + "nl": len(self.layer_blocks.get("key_cache", [])), + }, + separators=(",", ":"), + ).encode("utf-8") + return b"".join([struct.pack(">I", len(header)), header] + tensor_bufs) + + def to_gpu_tensor(self) -> torch.Tensor: + """Convert to a packed GPU tensor for raw-data connectors.""" + tensors_desc: list[dict[str, Any]] = [] + gpu_tensors: list[torch.Tensor] = [] + data_offset = 0 + device = None + + for cache_name in ("key_cache", "value_cache"): + cache_list = self.layer_blocks.get(cache_name, []) + for layer_idx, tensor in enumerate(cache_list): + if tensor is None: + tensors_desc.append({"n": f"{cache_name}_{layer_idx}", "x": True}) + continue + + t = tensor.detach().contiguous() + if device is None and t.is_cuda: + device = t.device + dtype_str = str(t.dtype).removeprefix("torch.") + nbytes = t.numel() * t.element_size() + tensors_desc.append( + { + "n": f"{cache_name}_{layer_idx}", + "i": layer_idx, + "d": dtype_str, + "s": list(t.shape), + "o": data_offset, + "b": nbytes, + } + ) + gpu_tensors.append(t.view(torch.uint8).flatten()) + data_offset += nbytes + + if device is None: + raise RuntimeError("No CUDA tensors found, use to_bytes() instead") + + header = json.dumps( + { + "rid": self.request_id, + "bids": self.block_ids, + "meta": self.metadata, + "td": tensors_desc, + "nl": len(self.layer_blocks.get("key_cache", [])), + }, + separators=(",", ":"), + ).encode("utf-8") + + header_prefix = struct.pack(">I", len(header)) + header + total_size = len(header_prefix) + data_offset + output = torch.empty(total_size, dtype=torch.uint8, device=device) + header_tensor = torch.frombuffer(bytearray(header_prefix), dtype=torch.uint8) + output[: len(header_prefix)].copy_(header_tensor) + + pos = len(header_prefix) + for t_flat in gpu_tensors: + n = t_flat.numel() + output[pos : pos + n].copy_(t_flat) + pos += n + + return output + + @staticmethod + def _load_header_from_memoryview(raw_mv: memoryview) -> tuple[dict[str, Any], memoryview]: + if len(raw_mv) < 4: + raise ValueError("Corrupted KV payload: missing 4-byte header length") + + header_len = struct.unpack(">I", raw_mv[:4])[0] + if header_len > len(raw_mv) - 4: + raise ValueError(f"Corrupted KV payload: header_len={header_len} exceeds buffer size={len(raw_mv)}") + + return json.loads(bytes(raw_mv[4 : 4 + header_len])), raw_mv[4 + header_len :] + + @staticmethod + def _load_header_from_tensor(gpu_tensor: torch.Tensor) -> tuple[dict[str, Any], int]: + if gpu_tensor.dtype != torch.uint8 or gpu_tensor.dim() != 1: + raise ValueError("Packed GPU KV payload must be a 1-D uint8 tensor") + + total_bytes = int(gpu_tensor.numel()) + if total_bytes < 4: + raise ValueError("Corrupted KV payload: missing 4-byte header length") + + header_len = struct.unpack(">I", gpu_tensor[:4].cpu().numpy().tobytes())[0] + if header_len > total_bytes - 4: + raise ValueError(f"Corrupted KV payload: header_len={header_len} exceeds buffer size={total_bytes}") + + header_bytes = gpu_tensor[4 : 4 + header_len].cpu().numpy().tobytes() + return json.loads(header_bytes), 4 + header_len + + @staticmethod + def _validate_tensor_span(name: str, info: dict[str, Any], tensor_data_bytes: int) -> tuple[int, int]: + offset = info["o"] + nbytes = info["b"] + if 
offset < 0 or nbytes < 0 or offset + nbytes > tensor_data_bytes: + raise ValueError( + f"Corrupted KV payload tensor span for {name}: " + f"offset={offset}, bytes={nbytes}, tensor_data_bytes={tensor_data_bytes}" + ) + return offset, nbytes + + @staticmethod + def _resolve_torch_dtype(dtype_name: Any) -> torch.dtype: + torch_dtype = _SAFE_TORCH_DTYPES.get(str(dtype_name)) + if torch_dtype is None: + raise ValueError(f"Unsupported dtype in KV payload: {dtype_name}") + return torch_dtype + + @staticmethod + def _resolve_layer_idx(info: dict[str, Any], num_layers: int) -> int: + layer_idx = info.get("i") + if layer_idx is None: + name = info.get("n") + if isinstance(name, str) and name.startswith("key_cache_"): + layer_idx = int(name.removeprefix("key_cache_")) + elif isinstance(name, str) and name.startswith("value_cache_"): + layer_idx = int(name.removeprefix("value_cache_")) + else: + raise ValueError(f"Invalid KV tensor name in payload: {name}") + + if not isinstance(layer_idx, int): + raise ValueError(f"Invalid layer index in KV payload: {layer_idx}") + if layer_idx < 0 or layer_idx >= num_layers: + raise ValueError(f"Invalid layer index in KV payload: {layer_idx} (num_layers={num_layers})") + return layer_idx + + @staticmethod + def from_bytes(raw: "bytes | bytearray | memoryview") -> dict[str, Any]: + """Reconstruct KV cache data from the packed bytes format.""" + raw_mv = memoryview(raw) if not isinstance(raw, memoryview) else raw + header, tensor_data_mv = KVCacheTransferData._load_header_from_memoryview(raw_mv) + + num_layers = header["nl"] + key_cache: list[torch.Tensor | None] = [None] * num_layers + value_cache: list[torch.Tensor | None] = [None] * num_layers + + for info in header["td"]: + if info.get("x"): + continue + + name: str = info["n"] + torch_dtype = KVCacheTransferData._resolve_torch_dtype(info["d"]) + offset, nbytes = KVCacheTransferData._validate_tensor_span(name, info, len(tensor_data_mv)) + t = ( + torch.frombuffer( + tensor_data_mv, + dtype=torch.uint8, + offset=offset, + count=nbytes, + ) + .view(torch_dtype) + .reshape(info["s"]) + ) + layer_idx = KVCacheTransferData._resolve_layer_idx(info, num_layers) + if name.startswith("key_cache_"): + key_cache[layer_idx] = t + elif name.startswith("value_cache_"): + value_cache[layer_idx] = t + + return { + "request_id": header["rid"], + "layer_blocks": {"key_cache": key_cache, "value_cache": value_cache}, + "block_ids": header["bids"], + "metadata": header["meta"], + } + + @staticmethod + def from_bytes_gpu(gpu_tensor: torch.Tensor) -> dict[str, Any]: + """Reconstruct KV cache data from a packed GPU tensor.""" + header, data_start = KVCacheTransferData._load_header_from_tensor(gpu_tensor) + + num_layers = header["nl"] + key_cache: list[torch.Tensor | None] = [None] * num_layers + value_cache: list[torch.Tensor | None] = [None] * num_layers + tensor_data_bytes = int(gpu_tensor.numel()) - data_start + + for info in header["td"]: + if info.get("x"): + continue + + name: str = info["n"] + torch_dtype = KVCacheTransferData._resolve_torch_dtype(info["d"]) + offset, nbytes = KVCacheTransferData._validate_tensor_span(name, info, tensor_data_bytes) + t = gpu_tensor[data_start + offset : data_start + offset + nbytes].clone() + t = t.view(torch_dtype).reshape(info["s"]) + layer_idx = KVCacheTransferData._resolve_layer_idx(info, num_layers) + if name.startswith("key_cache_"): + key_cache[layer_idx] = t + elif name.startswith("value_cache_"): + value_cache[layer_idx] = t + + return { + "request_id": header["rid"], + "layer_blocks": 
{"key_cache": key_cache, "value_cache": value_cache}, + "block_ids": header["bids"], + "metadata": header["meta"], + } + class OmniKVTransferManager: """Unified management for OmniConnector and KV cache transfer. @@ -79,6 +341,13 @@ def __init__(self, config: OmniKVCacheConfig): else (None, None) ) + if config.need_send_cache and config.connector_config: + try: + _ = self.connector + logger.info("Sender connector eagerly initialized") + except Exception as e: + logger.warning("Failed to eagerly initialize sender connector: %s", e) + @classmethod def _create(cls, cfg: dict | None) -> "OmniKVTransferManager": """Create manager from raw config dict.""" @@ -140,8 +409,39 @@ def connector(self): cfg = self.config.connector_config if cfg and (c_type := cfg.get("type")): try: - logger.info(f"Initializing OmniConnector with config: {cfg}") c_extra = {k: v for k, v in cfg.items() if k != "type"} + if c_type == "MooncakeTransferEngineConnector": + base_port = c_extra.get("zmq_port", 50051) + c_extra["from_stage"] = ( + str(self.config.from_stage) if self.config.from_stage is not None else "0" + ) + c_extra["to_stage"] = str(self.config.to_stage) if self.config.to_stage is not None else "1" + + if self.config.need_send_cache: + c_extra["role"] = "sender" + from_stage = self.config.from_stage + if from_stage is not None: + try: + c_extra["zmq_port"] = base_port + KV_TRANSFER_PORT_OFFSET + int(from_stage) + except (TypeError, ValueError): + c_extra["zmq_port"] = base_port + KV_TRANSFER_PORT_OFFSET + elif self.config.need_recv_cache: + c_extra["role"] = "receiver" + from_stage = self.config.from_stage + sender_port = base_port + KV_TRANSFER_PORT_OFFSET + if from_stage is not None: + try: + sender_port = base_port + KV_TRANSFER_PORT_OFFSET + int(from_stage) + except (TypeError, ValueError): + pass + c_extra.setdefault("sender_host", c_extra.get("host", "127.0.0.1")) + c_extra.setdefault("sender_zmq_port", sender_port) + + logger.info( + "Initializing OmniConnector (purpose=kv_transfer) with config: %s, role: %s", + cfg, + c_extra.get("role", "N/A"), + ) self._connector = OmniConnectorFactory.create_connector(ConnectorSpec(name=c_type, extra=c_extra)) except Exception as e: logger.error(f"Failed to initialize OmniConnector: {e}") @@ -157,6 +457,85 @@ def get_connector(self): """Get connector (compatibility wrapper for existing code).""" return self.connector + def _resolve_sender_info( + self, sender_info: dict[str, Any], sender_stage_id: str | int | None = None + ) -> dict[str, Any] | None: + if not sender_info: + return None + + if "host" in sender_info: + return sender_info + + if not isinstance(sender_info, dict): + return None + + preferred_keys: list[str | int] = [] + if sender_stage_id is None: + recv_from, _ = self.recv_stages + sender_stage_id = recv_from + + if sender_stage_id is not None: + preferred_keys.append(sender_stage_id) + preferred_keys.append(str(sender_stage_id)) + try: + preferred_keys.append(int(sender_stage_id)) + except (TypeError, ValueError): + pass + + for key in dict.fromkeys(preferred_keys): + info = sender_info.get(key) + if isinstance(info, dict) and "host" in info: + return info + + candidates = [info for info in sender_info.values() if isinstance(info, dict) and "host" in info] + if len(candidates) == 1: + return candidates[0] + + if candidates: + logger.warning( + "Ambiguous sender_info for sender_stage_id=%s: " + "expected caller to resolve a single sender entry, got %s", + sender_stage_id, + sender_info, + ) + return None + + @staticmethod + def 
_clone_received_payload_tensors(data: dict[str, Any]) -> dict[str, Any]: + if not isinstance(data, dict) or "layer_blocks" not in data: + return data + + layer_blocks = data["layer_blocks"] + for cache_name in ("key_cache", "value_cache"): + cache_list = layer_blocks.get(cache_name, []) + for idx, tensor in enumerate(cache_list): + if isinstance(tensor, torch.Tensor): + cache_list[idx] = tensor.clone() + return data + + def update_sender_info(self, sender_info: dict[str, Any], sender_stage_id: str | int | None = None) -> None: + """Update receiver-side sender info before loading remote KV cache.""" + if not self.config.need_recv_cache: + return + + actual_info = self._resolve_sender_info(sender_info, sender_stage_id=sender_stage_id) + if not actual_info or "host" not in actual_info: + logger.warning("Invalid sender_info format: %s", sender_info) + return + + if self.config.connector_config: + self.config.connector_config["sender_host"] = actual_info.get("host") + self.config.connector_config["sender_zmq_port"] = actual_info.get("zmq_port") + + if self._connector and hasattr(self._connector, "update_sender_info"): + try: + self._connector.update_sender_info(actual_info.get("host"), actual_info.get("zmq_port")) + except Exception: + if hasattr(self._connector, "sender_host"): + self._connector.sender_host = actual_info.get("host") + if hasattr(self._connector, "sender_zmq_port"): + self._connector.sender_zmq_port = actual_info.get("zmq_port") + def handle_finished_requests_kv_transfer( self, finished_reqs: dict[str, dict[str, Any]], @@ -203,7 +582,8 @@ def handle_finished_requests_kv_transfer( custom_metadata = data.get("custom_metadata") - # Extract KV cache from GPU blocks -> CPU tensors + # Extract KV cache from GPU blocks and keep it on-device when + # possible so raw-data connectors can use the fast path. 
kv_data = self._extract_kv_cache( req_id, block_ids, seq_len, kv_caches, block_size, cache_dtype, custom_metadata ) @@ -280,9 +660,8 @@ def _extract_kv_cache( flat_k = flat_k[:seq_len] flat_v = flat_v[:seq_len] - # Move to CPU - key_cache[layer_idx] = flat_k.detach().cpu().contiguous() - value_cache[layer_idx] = flat_v.detach().cpu().contiguous() + key_cache[layer_idx] = flat_k.detach().contiguous() + value_cache[layer_idx] = flat_v.detach().contiguous() if not any(k is not None for k in key_cache): return None @@ -311,14 +690,40 @@ def _transfer_kv_cache(self, kv_data: KVCacheTransferData, transfer_req_id: str) if not from_stage or not to_stage: raise ValueError("Transfer stages (omni_from_stage, omni_to_stage) not configured") - # Prepare data and transfer with retry - data_dict = kv_data.to_dict() - data_dict["request_id"] = transfer_req_id + kv_data.request_id = transfer_req_id + serialization_start = time.perf_counter() + transfer_data: torch.Tensor | bytes | dict[str, Any] + supports_raw = getattr(self.connector, "supports_raw_data", False) + + try: + if supports_raw: + transfer_data = kv_data.to_gpu_tensor() + else: + raise RuntimeError("Connector does not support raw tensor") + except Exception: + try: + transfer_data = kv_data.to_bytes() + except Exception: + data_dict = kv_data.to_dict() + data_dict["request_id"] = transfer_req_id + transfer_data = data_dict + + serialization_ms = (time.perf_counter() - serialization_start) * 1000 + logger.info("KV cache serialized for %s in %.1f ms", transfer_req_id, serialization_ms) - success, size, _ = self._transfer_with_retry(from_stage, to_stage, f"kv_cache_{transfer_req_id}", data_dict) + transfer_start = time.perf_counter() + success, size, _ = self._transfer_with_retry(from_stage, to_stage, f"kv_cache_{transfer_req_id}", transfer_data) + elapsed = time.perf_counter() - transfer_start if success: - logger.info(f"KV transfer OK: {transfer_req_id}, {size} bytes") + mbps = (size / 1024 / 1024) / elapsed if elapsed > 0 else 0 + logger.info( + "KV transfer OK: %s, %s bytes, %.3fs, %.1f MB/s", + transfer_req_id, + size, + elapsed, + mbps, + ) else: logger.error(f"KV transfer FAILED: {transfer_req_id}") @@ -327,7 +732,7 @@ def _transfer_with_retry( from_stage: str, to_stage: str, request_id: str, - data: dict[str, Any], + data: "dict[str, Any] | bytes | torch.Tensor", max_retries: int = 3, ) -> tuple[bool, int, dict[str, Any] | None]: """Transfer data with retry and exponential backoff. 
@@ -393,6 +798,8 @@ def receive_kv_cache_for_request( timeout = self.config.recv_timeout start_time = time.time() + poll_interval = 0.01 + max_poll_interval = 0.5 logger.info(f"Wait for KV cache for request {request_id} from stage {from_stage} to {to_stage}...") @@ -400,33 +807,74 @@ def receive_kv_cache_for_request( while True: # Build the full key for connector full_request_id = f"omni_{from_stage}_to_{to_stage}_kv_cache_{request_id}" + link_start = time.perf_counter() result = self.connector.get( from_stage=from_stage, to_stage=to_stage, get_key=full_request_id, ) if result: - data, size = result - logger.info(f"Successfully received KV cache for {request_id}, {size} bytes") - - # Move tensors to target device if specified - if target_device is not None and isinstance(data, dict) and "layer_blocks" in data: - layer_blocks = data["layer_blocks"] - for cache_list in [ - layer_blocks.get("key_cache", []), - layer_blocks.get("value_cache", []), - ]: - for i, tensor in enumerate(cache_list): - if isinstance(tensor, torch.Tensor) and tensor.device != target_device: - cache_list[i] = tensor.to(target_device).contiguous() - + raw_data, size = result + elapsed = time.time() - start_time + link_ms = (time.perf_counter() - link_start) * 1000 + managed_buffer = None + + if hasattr(raw_data, "tensor") and hasattr(raw_data, "release"): + managed_buffer = raw_data + try: + buf_tensor = raw_data.tensor + if buf_tensor.is_cuda: + data = KVCacheTransferData.from_bytes_gpu(buf_tensor) + raw_data.release() + managed_buffer = None + else: + data = KVCacheTransferData.from_bytes(memoryview(buf_tensor.numpy())) + data = self._clone_received_payload_tensors(data) + raw_data.release() + managed_buffer = None + except Exception as e: + logger.error("Failed to deserialize KV cache from ManagedBuffer: %s", e) + if managed_buffer is not None: + raw_data.release() + return None, 0 + elif isinstance(raw_data, (bytes, bytearray)): + data = KVCacheTransferData.from_bytes(raw_data) + elif isinstance(raw_data, torch.Tensor) and raw_data.dtype == torch.uint8 and raw_data.dim() == 1: + data = KVCacheTransferData.from_bytes(raw_data.cpu().numpy().tobytes()) + else: + data = raw_data + + try: + if isinstance(data, dict) and "layer_blocks" in data: + layer_blocks = data["layer_blocks"] + for cache_list in [ + layer_blocks.get("key_cache", []), + layer_blocks.get("value_cache", []), + ]: + for i, tensor in enumerate(cache_list): + if not isinstance(tensor, torch.Tensor): + continue + if target_device is not None and tensor.device != target_device: + cache_list[i] = tensor.to(target_device).contiguous() + finally: + if managed_buffer is not None: + managed_buffer.release() + + logger.info( + "Successfully received KV cache for %s, %s bytes, wait=%.3fs, link=%.1fms", + request_id, + size, + elapsed, + link_ms, + ) return data, size if time.time() - start_time > timeout: logger.error(f"Timeout waiting for KV cache for request {request_id} after {timeout}s") return None, 0 - time.sleep(0.5) + time.sleep(poll_interval) + poll_interval = min(poll_interval * 2, max_poll_interval) except Exception as e: logger.error(f"Error receiving KV cache for {request_id}: {e}") @@ -459,6 +907,16 @@ def apply_kv_cache_to_request(self, req: Any, data: dict[str, Any]) -> None: if hasattr(req, "sampling_params") and req.sampling_params is not None: req.sampling_params.kv_metadata = data["metadata"] + @staticmethod + def _resolve_request_id(req: Any) -> str | None: + """Resolve the logical request ID used for KV transfer lookups.""" + request_id = 
getattr(req, "request_id", None) + if request_id: + return request_id + if hasattr(req, "request_ids") and req.request_ids: + return req.request_ids[0] + return None + # Legacy compatibility method def receive_kv_cache(self, req: Any, target_device: torch.device | None = None) -> bool: """Receive KV cache and populate request object (legacy interface). @@ -470,11 +928,11 @@ def receive_kv_cache(self, req: Any, target_device: torch.device | None = None) Returns: True if successful, False otherwise """ - request_id = getattr(req, "request_id", None) - if not request_id and hasattr(req, "request_ids") and req.request_ids: - # Adaptation for new OmniDiffusionRequest which has list of prompts/ids - request_id = req.request_ids[0] + kv_sender_info = getattr(req, "kv_sender_info", None) + if kv_sender_info: + self.update_sender_info(kv_sender_info, sender_stage_id=self.recv_stages[0]) + request_id = self._resolve_request_id(req) if not request_id: logger.warning("Request has no ID, cannot receive KV cache") return False @@ -513,9 +971,7 @@ def receive_multi_kv_cache( cfg_ids = getattr(getattr(req, "sampling_params", None), "cfg_kv_request_ids", None) if cfg_ids and cfg_kv_collect_func: - request_id = getattr(req, "request_id", None) or ( - req.request_ids[0] if hasattr(req, "request_ids") and req.request_ids else None - ) + request_id = self._resolve_request_id(req) try: cfg_kvs = cfg_kv_collect_func( request_id, diff --git a/vllm_omni/distributed/omni_connectors/utils/initialization.py b/vllm_omni/distributed/omni_connectors/utils/initialization.py index aaa222b4c5..37b7d0d7f8 100644 --- a/vllm_omni/distributed/omni_connectors/utils/initialization.py +++ b/vllm_omni/distributed/omni_connectors/utils/initialization.py @@ -19,9 +19,17 @@ logger = get_connector_logger(__name__) +# Reserve a separate port range for KV-transfer sockets so they do not +# collide with request-forwarding endpoints that share the same base port. +KV_TRANSFER_PORT_OFFSET = 100 + def initialize_connectors_from_config( - config_path: str | Path | None = None, default_shm_threshold: int = 65536 + config_path: str | Path | None = None, + default_shm_threshold: int = 65536, + purpose: str = "request_forwarding", + caller_stage_id: int | str | None = None, + is_sender: bool | None = None, ) -> tuple[OmniTransferConfig | None, dict[tuple[str, str], OmniConnectorBase]]: """ Initialize connectors from configuration file. @@ -36,12 +44,20 @@ def initialize_connectors_from_config( return None, {} # create connectors from config - connectors = create_connectors_from_config(transfer_config.connectors) + connectors = create_connectors_from_config( + transfer_config.connectors, + purpose=purpose, + caller_stage_id=caller_stage_id, + is_sender=is_sender, + ) return transfer_config, connectors def create_connectors_from_config( connectors_config: dict[tuple[str, str], ConnectorSpec], + purpose: str = "request_forwarding", + caller_stage_id: int | str | None = None, + is_sender: bool | None = None, ) -> dict[tuple[str, str], OmniConnectorBase]: """ Create connectors from config. @@ -52,12 +68,59 @@ def create_connectors_from_config( Returns: A dictionary of connectors. 
""" + purpose_port_offsets = { + "request_forwarding": 0, + "kv_transfer": KV_TRANSFER_PORT_OFFSET, + } + port_offset = purpose_port_offsets.get(purpose, 0) + orchestrator_port_offset = 200 + connectors = {} for edge_key, connector_spec in connectors_config.items(): + from_stage, to_stage = edge_key try: - connector = OmniConnectorFactory.create_connector(connector_spec) + if connector_spec.name == "MooncakeTransferEngineConnector": + extra = dict(connector_spec.extra) if connector_spec.extra else {} + base_port = extra.get("zmq_port", 50051) + try: + stage_offset = int(from_stage) + except (TypeError, ValueError): + stage_offset = 0 + + if str(caller_stage_id) == "orchestrator": + adjusted_port = base_port + orchestrator_port_offset + stage_offset + else: + adjusted_port = base_port + port_offset + stage_offset + extra["zmq_port"] = adjusted_port + + if is_sender is not None: + extra["role"] = "sender" if is_sender else "receiver" + if not is_sender: + extra.setdefault("sender_host", extra.get("host", "127.0.0.1")) + extra.setdefault("sender_zmq_port", adjusted_port) + elif caller_stage_id is not None: + caller_str = str(caller_stage_id) + if caller_str == from_stage: + extra["role"] = "sender" + elif caller_str == to_stage: + extra["role"] = "receiver" + extra.setdefault("sender_host", extra.get("host", "127.0.0.1")) + extra.setdefault("sender_zmq_port", adjusted_port) + else: + extra["role"] = "sender" + else: + extra["role"] = extra.get("role", "auto") + + connector = OmniConnectorFactory.create_connector(ConnectorSpec(name=connector_spec.name, extra=extra)) + else: + connector = OmniConnectorFactory.create_connector(connector_spec) connectors[edge_key] = connector - logger.info(f"Created connector for {edge_key[0]} -> {edge_key[1]}: {type(connector).__name__}") + logger.info( + "Created connector for %s -> %s: %s", + from_stage, + to_stage, + type(connector).__name__, + ) except Exception as e: raise RuntimeError(f"Failed to initialize connector for edge {edge_key}: {e}") from e @@ -289,7 +352,11 @@ def initialize_orchestrator_connectors( else: default_shm_threshold = max(0, shm_threshold_bytes) transfer_config, connectors = initialize_connectors_from_config( - config_path, default_shm_threshold=default_shm_threshold + config_path, + default_shm_threshold=default_shm_threshold, + purpose="request_forwarding", + caller_stage_id="orchestrator", + is_sender=True, ) return transfer_config, connectors @@ -316,6 +383,7 @@ def get_stage_connector_config( def build_stage_connectors( stage_id: int, connectors_config: dict[str, Any], + purpose: str = "request_forwarding", ) -> dict[tuple[str, str], Any] | None: """Instantiate OmniConnectors for a stage based on config.""" if not connectors_config: @@ -352,7 +420,12 @@ def build_stage_connectors( try: # Use unified connector creation logic - connectors = create_connectors_from_config(stage_connector_specs) + connectors = create_connectors_from_config( + stage_connector_specs, + purpose=purpose, + caller_stage_id=stage_id, + is_sender=False, + ) except Exception as exc: # pragma: no cover - defensive logging # Fail fast so the stage does not start with missing connectors. 
logger.exception("[Stage-%s] Failed to initialize connectors: %s", stage_id, exc) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 8cd2d69526..f7e7d53d58 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -395,15 +395,17 @@ def _launch_llm_stage( proc=proc, addresses=addresses, ) + logger.info("[AsyncOmniEngine] Stage %s engine launch started", metadata.stage_id) + # Keep the stage-specific device visibility until vLLM + # finishes starting all child processes. + complete_stage_handshake(proc, handshake_address, addresses, vllm_config) + logger.info("[AsyncOmniEngine] Stage %s engine startup completed", metadata.stage_id) finally: if previous_visible_devices is None: current_omni_platform.unset_device_control_env_var() else: current_omni_platform.set_device_control_env_var(previous_visible_devices) - logger.info("[AsyncOmniEngine] Stage %s engine launch started", metadata.stage_id) - complete_stage_handshake(proc, handshake_address, addresses, vllm_config) - logger.info("[AsyncOmniEngine] Stage %s engine startup completed", metadata.stage_id) assert started_stage is not None return started_stage except Exception: diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index 8ea9a5096c..20dce1f0ff 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -477,6 +477,30 @@ def _build_stage_metrics( ), ) + def _build_kv_sender_info(self, sender_stage_ids: list[int]) -> dict[int, dict[str, Any]] | None: + """Build per-request sender info for diffusion KV-transfer receivers.""" + sender_infos: dict[int, dict[str, Any]] = {} + for sender_stage_id in dict.fromkeys(sender_stage_ids): + if sender_stage_id < 0 or sender_stage_id >= self.num_stages: + continue + + sender_stage = self.stage_clients[sender_stage_id] + get_sender_info = getattr(sender_stage, "get_kv_sender_info", None) + if not callable(get_sender_info): + continue + + sender_info = get_sender_info() + if not sender_info: + logger.warning( + "[Orchestrator] Stage-%s has no KV sender info available", + sender_stage_id, + ) + continue + + sender_infos[sender_stage_id] = sender_info + + return sender_infos or None + async def _forward_to_next_stage( self, req_id: str, @@ -522,14 +546,22 @@ async def _forward_to_next_stage( req_id, ) + source_stage_ids = list(getattr(next_client, "engine_input_source", None) or [stage_id]) + kv_sender_info = self._build_kv_sender_info(sender_stage_ids=source_stage_ids) if isinstance(diffusion_prompt, list): await next_client.add_batch_request_async( req_id, diffusion_prompt, params, + kv_sender_info=kv_sender_info, ) else: - await next_client.add_request_async(req_id, diffusion_prompt, params) + await next_client.add_request_async( + req_id, + diffusion_prompt, + params, + kv_sender_info=kv_sender_info, + ) req_state.stage_submit_ts[next_stage_id] = _time.time() return @@ -731,7 +763,14 @@ async def _prewarm_async_chunk_stages( params = req_state.sampling_params_list[next_stage_id] if next_client.stage_type == "diffusion": - await next_client.add_request_async(request_id, req_state.prompt, params) + source_stage_ids = list(getattr(next_client, "engine_input_source", None) or [next_stage_id - 1]) + kv_sender_info = self._build_kv_sender_info(sender_stage_ids=source_stage_ids) + await next_client.add_request_async( + request_id, + req_state.prompt, + params, + kv_sender_info=kv_sender_info, + ) req_state.stage_submit_ts[next_stage_id] = _time.time() continue 
diff --git a/vllm_omni/engine/stage_engine_core_client.py b/vllm_omni/engine/stage_engine_core_client.py index e08ce78011..71a0aee4a4 100644 --- a/vllm_omni/engine/stage_engine_core_client.py +++ b/vllm_omni/engine/stage_engine_core_client.py @@ -6,12 +6,15 @@ from __future__ import annotations +import socket from typing import TYPE_CHECKING, Any +from urllib.parse import urlparse from vllm.logger import init_logger from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import AsyncMPClient +from vllm_omni.distributed.omni_connectors.utils.initialization import KV_TRANSFER_PORT_OFFSET from vllm_omni.engine.stage_init_utils import StageMetadata if TYPE_CHECKING: @@ -76,6 +79,11 @@ def __init__( self.engine_outputs: Any = None self._proc = proc + self.client_addresses = dict(client_addresses or {}) + self._omni_kv_config = getattr(getattr(vllm_config, "model_config", None), "omni_kv_config", None) + self._kv_sender_host = self._resolve_contact_host() + self._kv_sender_info: dict[str, Any] | None = None + self._kv_sender_initialized = False logger.info( "[StageEngineCoreClient] Stage-%s initializing EngineCore", @@ -104,6 +112,7 @@ def __init__( shutdown_error, ) raise + self._initialize_kv_sender_endpoint() logger.info( "[StageEngineCoreClient] Stage-%s EngineCore running", self.stage_id, @@ -118,6 +127,118 @@ async def add_request_async(self, request: EngineCoreRequest) -> None: # ==================== Stage Methods ==================== + @staticmethod + def _detect_local_ip() -> str | None: + """Best-effort local IP detection for cross-node connector bootstrap.""" + try: + with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock: + sock.connect(("8.8.8.8", 80)) + return sock.getsockname()[0] + except Exception: + try: + return socket.gethostbyname(socket.gethostname()) + except Exception: + return None + + def _resolve_contact_host(self) -> str | None: + """Resolve a routable host for this stage from its client addresses.""" + for key in ("input_address", "output_address", "stats_update_address"): + address = self.client_addresses.get(key) + if not address: + continue + host = urlparse(address).hostname + if host in {None, "", "*", "0.0.0.0", "::"}: + continue + if host in {"localhost", "127.0.0.1"}: + detected = self._detect_local_ip() + if detected: + return detected + continue + return host + return self._detect_local_ip() + + def _get_kv_connector_config(self) -> dict[str, Any] | None: + omni_kv_config = getattr(self, "_omni_kv_config", None) + if not isinstance(omni_kv_config, dict): + return None + connector_config = omni_kv_config.get("connector_config") + if not isinstance(connector_config, dict): + return None + return connector_config + + def _resolve_sender_host_from_config(self, connector_config: dict[str, Any]) -> str | None: + host = connector_config.get("sender_host") or connector_config.get("host") + if host in {None, "", "auto", "*", "0.0.0.0", "::"}: + return self._resolve_contact_host() + return str(host) + + def _initialize_kv_sender_endpoint(self) -> None: + if self._kv_sender_initialized: + return + self._kv_sender_initialized = True + connector_config = self._get_kv_connector_config() + if connector_config is None or connector_config.get("role") != "sender": + return + + sender_host = self._resolve_sender_host_from_config(connector_config) + if sender_host is not None: + self._kv_sender_host = sender_host + + sender_port = connector_config.get("sender_zmq_port") + if sender_port is None: + base_port = connector_config.get("zmq_port") + if 
base_port is None: + return + + omni_kv_config = getattr(self, "_omni_kv_config", None) + from_stage = self.stage_id + if isinstance(omni_kv_config, dict): + from_stage = omni_kv_config.get("omni_from_stage", from_stage) + + try: + sender_port = int(base_port) + KV_TRANSFER_PORT_OFFSET + int(from_stage) + except (TypeError, ValueError): + logger.warning( + "[StageEngineCoreClient] Stage-%s could not resolve sender_zmq_port " + "from base_port=%s and from_stage=%s", + self.stage_id, + base_port, + from_stage, + ) + return + + if self._kv_sender_host is None: + return + + self._kv_sender_info = { + "host": str(self._kv_sender_host), + "zmq_port": int(sender_port), + } + + def get_kv_sender_info( + self, + *, + base_port: int = 50051, + kv_transfer_port_offset: int = KV_TRANSFER_PORT_OFFSET, + ) -> dict[str, Any] | None: + """Build sender bootstrap info for diffusion KV transfer receivers. + + ``base_port`` and ``kv_transfer_port_offset`` are only used by the + legacy fallback path when no connector-level sender endpoint is + configured in ``omni_kv_config``. + """ + if self._kv_sender_info is not None: + return dict(self._kv_sender_info) + + if self._kv_sender_host is None: + self._kv_sender_host = self._resolve_contact_host() + if self._kv_sender_host is None: + return None + return { + "host": self._kv_sender_host, + "zmq_port": base_port + kv_transfer_port_offset + int(self.stage_id), + } + def set_engine_outputs(self, engine_outputs: EngineCoreOutput) -> None: """Set engine outputs (called by orchestrator).""" self.engine_outputs = engine_outputs diff --git a/vllm_omni/model_executor/stage_configs/bagel.yaml b/vllm_omni/model_executor/stage_configs/bagel.yaml index b0c1b04803..d1031b574a 100644 --- a/vllm_omni/model_executor/stage_configs/bagel.yaml +++ b/vllm_omni/model_executor/stage_configs/bagel.yaml @@ -1,5 +1,9 @@ # Stage 0: Thinker (multimodal understanding + text generation) +# By default this config uses the shared-memory connector for stage-0 -> stage-1 forwarding. +# To switch to RDMA, add output_connectors/input_connectors that point to +# rdma_connector and keep the rest of the pipeline unchanged. + stage_args: - stage_id: 0 stage_type: llm @@ -36,6 +40,9 @@ stage_args: seed: 52 detokenize: True repetition_penalty: 1.05 + # Optional RDMA override: + # output_connectors: + # to_stage_1: rdma_connector - stage_id: 1 stage_type: diffusion @@ -62,6 +69,9 @@ stage_args: is_comprehension: false default_sampling_params: seed: 52 + # Optional RDMA override: + # input_connectors: + # from_stage_0: rdma_connector # Runtime edges runtime: @@ -78,6 +88,23 @@ runtime: extra: shm_threshold_bytes: 65536 # 64KB threshold + # Optional RDMA connector template for Bagel. To enable it, point + # stage-0 output_connectors/to_stage_1 and stage-1 input_connectors/from_stage_0 + # to rdma_connector instead of relying on the default shared-memory path. + rdma_connector: + name: MooncakeTransferEngineConnector + extra: + host: "auto" + zmq_port: 50051 + protocol: "rdma" + device_name: "" + # Memory pool for RDMA-registered buffers. + # Supports both CPU pinned memory ("cpu") and GPU VRAM ("cuda"). + # CPU mode works on all topologies; GPU mode (GPUDirect RDMA) requires + # NIC-GPU direct PCIe connectivity (PIX topology). + # Recommended: 4 GB for CPU, 2 GB for GPU (to conserve VRAM). 
+ memory_pool_size: 4294967296 # 4 GB + memory_pool_device: "cpu" edges: - from: 0 From fb3c6bd9b131479b9f7c76afed47bf8d87ca9718 Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Wed, 8 Apr 2026 16:01:30 +0800 Subject: [PATCH 087/204] [Feat] Add MUSA flash attention support via mate package (#2451) Signed-off-by: Xiaodong Ye --- .../diffusion/attention/backends/flash_attn.py | 10 ++++++++++ .../diffusion/attention/backends/utils/fa.py | 14 ++++++++++++-- vllm_omni/diffusion/envs.py | 5 +++++ vllm_omni/diffusion/layers/rope.py | 8 ++++++++ vllm_omni/platforms/musa/platform.py | 18 ++++++++++++++---- 5 files changed, 49 insertions(+), 6 deletions(-) diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py index 5c586c0631..b6ab3a57ad 100644 --- a/vllm_omni/diffusion/attention/backends/flash_attn.py +++ b/vllm_omni/diffusion/attention/backends/flash_attn.py @@ -209,3 +209,13 @@ def forward_npu( layout="BNSD", ) return output + + def forward_musa( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_metadata: AttentionMetadata = None, + ) -> torch.Tensor: + # XXX (MUSA): MUSA uses the same implementation as XPU (mate only provides flash_attn_varlen_func) + return self.forward_xpu(query, key, value, attn_metadata) diff --git a/vllm_omni/diffusion/attention/backends/utils/fa.py b/vllm_omni/diffusion/attention/backends/utils/fa.py index 77596a1033..fe6051f8ba 100644 --- a/vllm_omni/diffusion/attention/backends/utils/fa.py +++ b/vllm_omni/diffusion/attention/backends/utils/fa.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_flash_attention_utils.py +from functools import lru_cache + import torch import torch.nn.functional as F @@ -38,8 +40,10 @@ except (ImportError, ModuleNotFoundError): pass elif current_omni_platform.is_musa(): - # XXX (MUSA): Add MUSA-specific Flash Attention when available - pass + try: + from mate import flash_attn_varlen_func # noqa: F401 + except (ImportError, ModuleNotFoundError): + pass else: # CUDA: try FA3 -> FA2 fallback chain # Try FA3 from fa3-fwd PyPI package @@ -76,6 +80,12 @@ HAS_FLASH_ATTN = flash_attn_func is not None or flash_attn_varlen_func is not None +@lru_cache(maxsize=1) +def is_mate_available() -> bool: + """Check if MATE (MUSA Flash Attention) is available.""" + return current_omni_platform.is_musa() and flash_attn_varlen_func is not None + + def _index_first_axis(tensor, indices): """ A local implementation of the PyTorch indexing operation `tensor[indices]` on the first axis, diff --git a/vllm_omni/diffusion/envs.py b/vllm_omni/diffusion/envs.py index a71dc2e8e1..ea7b2c24c8 100644 --- a/vllm_omni/diffusion/envs.py +++ b/vllm_omni/diffusion/envs.py @@ -7,6 +7,7 @@ from vllm.logger import init_logger +from vllm_omni.diffusion.attention.backends.utils.fa import is_mate_available from vllm_omni.platforms import current_omni_platform if TYPE_CHECKING: @@ -52,6 +53,10 @@ def _check_flash_attn(self, packages_info) -> bool: """Check if flash attention is available and compatible.""" platform = current_omni_platform + # MUSA uses MATE for flash attention + if platform.is_musa(): + return is_mate_available() + # Flash attention requires CUDA-like platforms (CUDA or ROCm) if not platform.is_cuda_alike(): return False diff --git a/vllm_omni/diffusion/layers/rope.py b/vllm_omni/diffusion/layers/rope.py index 
65d37d0b01..61ddb4d84a 100644 --- a/vllm_omni/diffusion/layers/rope.py +++ b/vllm_omni/diffusion/layers/rope.py @@ -145,6 +145,14 @@ def forward_xpu( ) -> torch.Tensor: return self.forward_native(x, cos, sin) + def forward_musa( + self, + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ) -> torch.Tensor: + return self.forward_native(x, cos, sin) + def forward_native( self, x: torch.Tensor, diff --git a/vllm_omni/platforms/musa/platform.py b/vllm_omni/platforms/musa/platform.py index 3bd520c61b..fe1ccc6d0b 100644 --- a/vllm_omni/platforms/musa/platform.py +++ b/vllm_omni/platforms/musa/platform.py @@ -8,6 +8,7 @@ from vllm_musa.platform import MUSAPlatformBase from vllm_omni.diffusion.attention.backends.registry import DiffusionAttentionBackendEnum +from vllm_omni.diffusion.attention.backends.utils.fa import is_mate_available from vllm_omni.platforms.interface import OmniPlatform, OmniPlatformEnum logger = init_logger(__name__) @@ -54,9 +55,7 @@ def get_diffusion_attn_backend_cls( ) -> str: """Get the diffusion attention backend class path for MUSA platform. - MUSA currently supports SDPA (Scaled Dot Product Attention) as the - primary backend. Flash Attention support may be added in future - when MUSA-specific implementations are available. + MUSA supports FLASH_ATTN via the mate package, and SDPA as fallback. Args: selected_backend: User-selected backend name (e.g., "FLASH_ATTN", @@ -66,13 +65,24 @@ def get_diffusion_attn_backend_cls( Returns: Fully qualified class path of the selected backend. """ + + flash_attn_available = is_mate_available() + if selected_backend is not None: backend_upper = selected_backend.upper() + if backend_upper == "FLASH_ATTN" and not flash_attn_available: + logger.warning("Flash Attention (mate package) not available. 
Falling back to TORCH_SDPA backend.") + logger.info("Defaulting to diffusion attention backend SDPA") + return DiffusionAttentionBackendEnum.TORCH_SDPA.get_path() backend = DiffusionAttentionBackendEnum[backend_upper] logger.info("Using diffusion attention backend '%s'", backend_upper) return backend.get_path() - # Default to SDPA for MUSA as it's the most compatible backend + # Default to FLASH_ATTN if mate is available, otherwise SDPA + if flash_attn_available: + logger.info("Defaulting to diffusion attention backend FLASH_ATTN") + return DiffusionAttentionBackendEnum.FLASH_ATTN.get_path() + logger.info("Defaulting to diffusion attention backend SDPA") return DiffusionAttentionBackendEnum.TORCH_SDPA.get_path() From aefa2ee45cd0f7145436e265596bb22853e18241 Mon Sep 17 00:00:00 2001 From: Jinheng Date: Wed, 8 Apr 2026 17:45:10 +0800 Subject: [PATCH 088/204] [Fix] Align diffusion proc test mock with current output fields (#2584) Signed-off-by: Jinheng Li --- tests/diffusion/test_stage_diffusion_proc.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/diffusion/test_stage_diffusion_proc.py b/tests/diffusion/test_stage_diffusion_proc.py index c26070ad43..f1cf4f9b7d 100644 --- a/tests/diffusion/test_stage_diffusion_proc.py +++ b/tests/diffusion/test_stage_diffusion_proc.py @@ -24,19 +24,29 @@ def step(request): SimpleNamespace( images=["img-1"], _multimodal_output={}, + _custom_output={}, metrics={}, stage_durations={}, peak_memory_mb=0.0, latents=None, + trajectory_latents=None, + trajectory_timesteps=None, + trajectory_log_probs=None, + trajectory_decoded=None, final_output_type="image", ), SimpleNamespace( images=["img-2"], _multimodal_output={}, + _custom_output={}, metrics={}, stage_durations={}, peak_memory_mb=0.0, latents=None, + trajectory_latents=None, + trajectory_timesteps=None, + trajectory_log_probs=None, + trajectory_decoded=None, final_output_type="image", ), ] From 7e7efdd1ab0f94069dcb9bdc85709dfb65c4928f Mon Sep 17 00:00:00 2001 From: Dnoob Date: Wed, 8 Apr 2026 19:00:53 +0800 Subject: [PATCH 089/204] [Bugfix] Fix benchmark Total input tokens for multimodal requests (#2540) (#2549) Signed-off-by: Dnoob --- tests/benchmarks/metrics/test_metrics.py | 67 ++++++++++++++++++++++++ tests/benchmarks/patch/test_patch.py | 54 +++++++++++++++++++ vllm_omni/benchmarks/metrics/metrics.py | 2 +- vllm_omni/benchmarks/patch/patch.py | 4 ++ 4 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 tests/benchmarks/metrics/test_metrics.py diff --git a/tests/benchmarks/metrics/test_metrics.py b/tests/benchmarks/metrics/test_metrics.py new file mode 100644 index 0000000000..f531a5026a --- /dev/null +++ b/tests/benchmarks/metrics/test_metrics.py @@ -0,0 +1,67 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Unit tests for metrics.py +""" + +import pytest +from vllm.benchmarks.serve import TaskType + +from vllm_omni.benchmarks.metrics.metrics import calculate_metrics +from vllm_omni.benchmarks.patch.patch import MixRequestFuncOutput + +pytestmark = [pytest.mark.core_model, pytest.mark.benchmark, pytest.mark.cpu] + + +def _make_output(prompt_len: int, output_tokens: int = 10) -> MixRequestFuncOutput: + """Build a minimal successful MixRequestFuncOutput for metrics aggregation.""" + output = MixRequestFuncOutput() + output.success = True + output.prompt_len = prompt_len + output.output_tokens = output_tokens + output.generated_text = "x" * output_tokens + output.ttft = 0.1 + output.text_latency = 1.0 
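+    # text_latency is the text-modality latency consumed by the TPOT calculation in
+    # calculate_metrics (text_latency - ttft); latency below is the end-to-end request
+    # time (timestamp - st in patch.py).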
+ output.latency = 1.0 + output.start_time = 0.0 + output.itl = [0.1] * max(output_tokens - 1, 0) + output.audio_ttfp = 0.0 + output.audio_rtf = 0.0 + output.audio_duration = 0.0 + output.audio_frames = 0 + output.input_audio_duration = 0.0 + output.error = "" + return output + + +# ============================================================================ +# total_input Tests +# ============================================================================ + + +def test_total_input_aggregated_from_output_prompt_len(): + """Test that total_input sums outputs[i].prompt_len, not input_requests[i].prompt_len.""" + outputs = [_make_output(4992), _make_output(3000)] + + metrics, _ = calculate_metrics( + input_requests=[], + outputs=outputs, + dur_s=10.0, + tokenizer=None, + selected_percentiles=[99.0], + goodput_config_dict={}, + task_type=TaskType.GENERATION, + selected_percentile_metrics=[], + max_concurrency=None, + request_rate=float("inf"), + benchmark_duration=10.0, + ) + + assert metrics.total_input == 7992, ( + "total_input should aggregate from outputs[i].prompt_len to reflect the true multimodal input token count" + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) diff --git a/tests/benchmarks/patch/test_patch.py b/tests/benchmarks/patch/test_patch.py index 39b7f84fb4..35a18aea33 100644 --- a/tests/benchmarks/patch/test_patch.py +++ b/tests/benchmarks/patch/test_patch.py @@ -574,5 +574,59 @@ async def test_text_latency_value_consistency(self, mocker: MockerFixture): ) +# ============================================================================ +# prompt_len Tests +# ============================================================================ + + +@pytest.mark.asyncio +async def test_prompt_len_assigned_from_usage(mocker: MockerFixture): + # Arrange: request claims prompt_len=100, but server reports 4992 (multimodal). + request_input = RequestFuncInput( + model="test-model", + model_name="test-model", + prompt="test prompt", + api_url="http://test.com/v1/chat/completions", + prompt_len=100, + output_len=20, + ) + + chunks = [ + create_sse_chunk( + { + "choices": [{"delta": {"content": "Hello"}}], + "modality": "text", + } + ), + create_sse_chunk( + { + "choices": [{"delta": {"content": " world"}}], + "modality": "text", + } + ), + # Final usage chunk emitted because stream_options.include_usage=True. 
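+        # On the wire this is presumably serialized by create_sse_chunk as an SSE event
+        # of the form `data: {"choices": [], "usage": {"prompt_tokens": 4992, ...}}`,
+        # which is the payload patch.py reads to override prompt_len.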
+ create_sse_chunk( + { + "choices": [], + "usage": {"prompt_tokens": 4992, "completion_tokens": 2, "total_tokens": 4994}, + } + ), + b"data: [DONE]\n\n", + ] + + mock_response = MockResponse(200, chunks) + mock_session = mocker.AsyncMock() + mock_session.post = mocker.MagicMock(return_value=mock_response) + + # Act + output = await async_request_openai_chat_omni_completions(request_input, mock_session) + + # Assert + assert output.success is True + assert output.prompt_len == 4992, ( + "prompt_len should be overridden by usage.prompt_tokens to reflect the true multimodal input token count" + ) + + if __name__ == "__main__": pytest.main([__file__, "-v", "-s"]) diff --git a/vllm_omni/benchmarks/metrics/metrics.py b/vllm_omni/benchmarks/metrics/metrics.py index a2acc7d756..dbf764698a 100644 --- a/vllm_omni/benchmarks/metrics/metrics.py +++ b/vllm_omni/benchmarks/metrics/metrics.py @@ -185,7 +185,7 @@ def calculate_metrics( # Note : this may inflate the output token count slightly output_len = len(tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids) actual_output_lens.append(output_len) - total_input += input_requests[i].prompt_len + total_input += outputs[i].prompt_len tpot = 0 if output_len > 1: latency_minus_ttft = outputs[i].text_latency - outputs[i].ttft diff --git a/vllm_omni/benchmarks/patch/patch.py b/vllm_omni/benchmarks/patch/patch.py index d8145c40bc..343655df20 100644 --- a/vllm_omni/benchmarks/patch/patch.py +++ b/vllm_omni/benchmarks/patch/patch.py @@ -190,6 +190,10 @@ async def async_request_openai_chat_omni_completions( if metrics := data.get("metrics"): output.output_tokens = metrics.get("num_tokens_out", 0) + if usage := data.get("usage"): + if (pt := usage.get("prompt_tokens")) is not None: + output.prompt_len = pt + output.latency = timestamp - st output.generated_text = generated_text if generated_audio is not None: From fcda835f4d32737e9f1212fc95c804e37ea0ef7c Mon Sep 17 00:00:00 2001 From: Peiqi Yin <60515999+yinpeiqi@users.noreply.github.com> Date: Wed, 8 Apr 2026 20:58:14 +0800 Subject: [PATCH 090/204] [Unit Test] Add unit tests for orchestrator (#2096) Signed-off-by: yinpe <11810305@mail.sustech.edu.cn> --- tests/engine/test_orchestrator.py | 510 ++++++++++++++++++++++++++++++ 1 file changed, 510 insertions(+) create mode 100644 tests/engine/test_orchestrator.py diff --git a/tests/engine/test_orchestrator.py b/tests/engine/test_orchestrator.py new file mode 100644 index 0000000000..7bf2eccf7f --- /dev/null +++ b/tests/engine/test_orchestrator.py @@ -0,0 +1,510 @@ +from __future__ import annotations + +import asyncio +import concurrent.futures +import queue +import threading +import time +from dataclasses import dataclass +from types import SimpleNamespace +from typing import Any + +import janus +import pytest +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.sampling_params import SamplingParams + +from vllm_omni.engine.orchestrator import Orchestrator +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +@dataclass +class OrchestratorFixture: + orchestrator: Orchestrator + request_sync_q: Any + output_sync_q: Any + queues: tuple[janus.Queue, ...] 
+ thread: threading.Thread + result_future: concurrent.futures.Future[None] + + +class FakeStageClient: + def __init__( + self, + *, + stage_type: str = "llm", + final_output: bool = False, + final_output_type: str = "text", + next_inputs: list[dict] | None = None, + ) -> None: + self.stage_type = stage_type + self.final_output = final_output + self.final_output_type = final_output_type + self.next_inputs = list(next_inputs or []) + self.custom_process_input_func = None + self.add_request_calls: list[tuple] = [] + self.abort_calls: list[list[str]] = [] + self.shutdown_calls = 0 + self._engine_core_outputs = queue.Queue() + self._diffusion_outputs = queue.Queue() + + # Orchestrator-facing interface. + async def add_request_async(self, *args, **_kwargs) -> None: + self.add_request_calls.append(args) + + async def get_output_async(self): + try: + return self._engine_core_outputs.get_nowait() + except queue.Empty: + return SimpleNamespace(outputs=[]) + + def get_diffusion_output_nowait(self): + try: + return self._diffusion_outputs.get_nowait() + except queue.Empty: + return None + + def set_engine_outputs(self, outputs) -> None: + return None + + def process_engine_inputs(self, stage_list, prompt=None): + return list(self.next_inputs) + + async def abort_requests_async(self, request_ids: list[str]) -> None: + self.abort_calls.append(list(request_ids)) + + def shutdown(self) -> None: + self.shutdown_calls += 1 + + # Test helpers for seeding fake stage outputs. + def push_engine_core_outputs(self, outputs) -> None: + self._engine_core_outputs.put_nowait(outputs) + + def push_diffusion_output(self, output) -> None: + self._diffusion_outputs.put_nowait(output) + + +class FakeOutputProcessor: + def __init__(self, *, request_outputs: list[object] | None = None) -> None: + self.request_outputs = list(request_outputs or []) + + def add_request(self, *_args, **_kwargs) -> None: + return None + + def process_outputs(self, *_args, **_kwargs): + return SimpleNamespace( + request_outputs=list(self.request_outputs), + reqs_to_abort=[], + ) + + def update_scheduler_stats(self, _scheduler_stats) -> None: + return None + + +def _sampling_params(max_tokens: int = 4) -> SamplingParams: + return SamplingParams(max_tokens=max_tokens) + + +def _engine_core_outputs(tag: str, timestamp: float) -> SimpleNamespace: + return SimpleNamespace(outputs=[tag], timestamp=timestamp, scheduler_stats=None) + + +def _build_request_output( + request_id: str, + *, + token_ids: list[int] | None = None, + prompt_token_ids: list[int] | None = None, + finished: bool = True, + text: str = "test", +) -> RequestOutput: + completion = CompletionOutput( + index=0, + text=text, + token_ids=list(token_ids or [1, 2]), + cumulative_logprob=0.0, + logprobs=None, + finish_reason="stop" if finished else None, + stop_reason=None, + ) + return RequestOutput( + request_id=request_id, + prompt="prompt", + prompt_token_ids=list(prompt_token_ids or [10, 11]), + prompt_logprobs=None, + outputs=[completion], + finished=finished, + metrics=None, + lora_request=None, + ) + + +def _build_harness( + stage_clients: list[object], + *, + output_processors: list[object] | None = None, + stage_vllm_configs: list[object] | None = None, + async_chunk: bool = False, +) -> OrchestratorFixture: + if output_processors is None: + output_processors = [FakeOutputProcessor() for _ in stage_clients] + if stage_vllm_configs is None: + stage_vllm_configs = [SimpleNamespace(model_config=SimpleNamespace(max_model_len=64)) for _ in stage_clients] + + ready_future: 
concurrent.futures.Future[tuple[Orchestrator, janus.Queue, janus.Queue, janus.Queue]] = ( + concurrent.futures.Future() + ) + result_future: concurrent.futures.Future[None] = concurrent.futures.Future() + + def _runner() -> None: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + async def _run() -> None: + request_queue = janus.Queue() + output_queue = janus.Queue() + rpc_queue = janus.Queue() + orchestrator = Orchestrator( + request_async_queue=request_queue.async_q, + output_async_queue=output_queue.async_q, + rpc_async_queue=rpc_queue.async_q, + stage_clients=stage_clients, + output_processors=output_processors, + stage_vllm_configs=stage_vllm_configs, + async_chunk=async_chunk, + ) + ready_future.set_result((orchestrator, request_queue, output_queue, rpc_queue)) + await orchestrator.run() + + try: + loop.run_until_complete(_run()) + result_future.set_result(None) + except Exception as exc: + result_future.set_exception(exc) + finally: + try: + pending = [task for task in asyncio.all_tasks(loop) if not task.done()] + for task in pending: + task.cancel() + if pending: + loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True)) + loop.run_until_complete(loop.shutdown_asyncgens()) + finally: + asyncio.set_event_loop(None) + loop.close() + + thread = threading.Thread(target=_runner, daemon=True, name="test-orchestrator") + thread.start() + + orchestrator, request_queue, output_queue, rpc_queue = ready_future.result(timeout=5) + return OrchestratorFixture( + orchestrator=orchestrator, + request_sync_q=request_queue.sync_q, + output_sync_q=output_queue.sync_q, + queues=(request_queue, output_queue, rpc_queue), + thread=thread, + result_future=result_future, + ) + + +async def _shutdown_orchestrator(orchestrator_fixture: OrchestratorFixture) -> None: + orchestrator_fixture.request_sync_q.put_nowait({"type": "shutdown"}) + await asyncio.to_thread(orchestrator_fixture.thread.join, 5) + if orchestrator_fixture.thread.is_alive(): + raise AssertionError("Timed out waiting for orchestrator thread shutdown") + orchestrator_fixture.result_future.result(timeout=0) + + +async def _wait_for(predicate, *, timeout: float = 2.0) -> None: + deadline = time.monotonic() + timeout + while not predicate(): + if time.monotonic() >= deadline: + raise AssertionError("Timed out waiting for predicate") + await asyncio.sleep(0.01) + + +async def _get_output_message(orchestrator_fixture: OrchestratorFixture, *, timeout: float = 2.0) -> dict: + deadline = time.monotonic() + timeout + while True: + if time.monotonic() >= deadline: + raise AssertionError("Timed out waiting for orchestrator output") + try: + msg = orchestrator_fixture.output_sync_q.get_nowait() + except queue.Empty: + await asyncio.sleep(0.01) + continue + if msg.get("type") == "output": + return msg + + +async def _enqueue_add_request( + orchestrator_fixture: OrchestratorFixture, + *, + request_id: str, + prompt, + original_prompt, + sampling_params_list, + final_stage_id: int, +) -> None: + orchestrator_fixture.request_sync_q.put_nowait( + { + "type": "add_request", + "request_id": request_id, + "prompt": prompt, + "original_prompt": original_prompt, + "sampling_params_list": sampling_params_list, + "final_stage_id": final_stage_id, + } + ) + + +async def _enqueue_abort_request(orchestrator_fixture: OrchestratorFixture, request_ids: list[str]) -> None: + orchestrator_fixture.request_sync_q.put_nowait( + { + "type": "abort", + "request_ids": request_ids, + } + ) + + +@pytest.fixture +def orchestrator_factory(): + 
fixtures: list[OrchestratorFixture] = [] + + def _factory(*args, **kwargs) -> OrchestratorFixture: + fixture = _build_harness(*args, **kwargs) + fixtures.append(fixture) + return fixture + + yield _factory + + for fixture in fixtures: + if fixture.thread.is_alive(): + fixture.request_sync_q.put_nowait({"type": "shutdown"}) + fixture.thread.join(timeout=5) + for q in fixture.queues: + q.close() + + +@pytest.mark.asyncio +async def test_run_two_stage_llm(orchestrator_factory) -> None: + stage0 = FakeStageClient(stage_type="llm", final_output=False) + stage1 = FakeStageClient( + stage_type="llm", + final_output=True, + next_inputs=[{"prompt_token_ids": [7, 8, 9]}], + ) + processors = [ + FakeOutputProcessor(request_outputs=[_build_request_output("req-llm", token_ids=[3, 4], finished=True)]), + FakeOutputProcessor(request_outputs=[_build_request_output("req-llm", token_ids=[10, 11], finished=True)]), + ] + orchestrator_fixture = orchestrator_factory([stage0, stage1], output_processors=processors) + request = SimpleNamespace(request_id="req-llm", prompt_token_ids=[1, 2, 3]) + + try: + await _enqueue_add_request( + orchestrator_fixture, + request_id="req-llm", + prompt=request, + original_prompt={"prompt": "hello"}, + sampling_params_list=[_sampling_params(), _sampling_params()], + final_stage_id=1, + ) + + await _wait_for(lambda: len(stage0.add_request_calls) == 1) + stage0.push_engine_core_outputs(_engine_core_outputs("stage0-raw", 1.0)) + + await _wait_for(lambda: len(stage1.add_request_calls) == 1) + stage1_request = stage1.add_request_calls[0][0] + assert stage1_request.request_id == "req-llm" + assert stage1_request.prompt_token_ids == [7, 8, 9] + + stage1.push_engine_core_outputs(_engine_core_outputs("stage1-raw", 2.0)) + + output_msg = await _get_output_message(orchestrator_fixture) + + assert output_msg["request_id"] == "req-llm" + assert output_msg["stage_id"] == 1 + assert output_msg["finished"] is True + assert output_msg["engine_outputs"].request_id == "req-llm" + assert "req-llm" not in orchestrator_fixture.orchestrator.request_states + finally: + await _shutdown_orchestrator(orchestrator_fixture) + + +@pytest.mark.asyncio +async def test_run_single_stage_diffusion(orchestrator_factory) -> None: + stage0 = FakeStageClient(stage_type="diffusion", final_output=True, final_output_type="image") + orchestrator_fixture = orchestrator_factory([stage0]) + params = OmniDiffusionSamplingParams() + + try: + await _enqueue_add_request( + orchestrator_fixture, + request_id="req-diff", + prompt={"prompt": "draw a cat"}, + original_prompt={"prompt": "draw a cat"}, + sampling_params_list=[params], + final_stage_id=0, + ) + + await _wait_for(lambda: len(stage0.add_request_calls) == 1) + stage0.push_diffusion_output( + OmniRequestOutput.from_diffusion( + request_id="req-diff", + images=[], + final_output_type="image", + ) + ) + + output_msg = await _get_output_message(orchestrator_fixture) + + assert output_msg["request_id"] == "req-diff" + assert output_msg["stage_id"] == 0 + assert output_msg["finished"] is True + assert output_msg["engine_outputs"].request_id == "req-diff" + assert "req-diff" not in orchestrator_fixture.orchestrator.request_states + finally: + await _shutdown_orchestrator(orchestrator_fixture) + + +@pytest.mark.asyncio +async def test_run_llm_to_diffusion(orchestrator_factory) -> None: + stage0 = FakeStageClient(stage_type="llm", final_output=False) + stage1 = FakeStageClient(stage_type="diffusion", final_output=True, final_output_type="image") + processors = [ + 
FakeOutputProcessor(request_outputs=[_build_request_output("req-img", token_ids=[3, 4], finished=True)]), + FakeOutputProcessor(), + ] + orchestrator_fixture = orchestrator_factory([stage0, stage1], output_processors=processors) + request = SimpleNamespace(request_id="req-img", prompt_token_ids=[1, 2, 3]) + params = OmniDiffusionSamplingParams() + original_prompt = {"prompt": "draw a fox"} + + try: + await _enqueue_add_request( + orchestrator_fixture, + request_id="req-img", + prompt=request, + original_prompt=original_prompt, + sampling_params_list=[_sampling_params(), params], + final_stage_id=1, + ) + + await _wait_for(lambda: len(stage0.add_request_calls) == 1) + stage0.push_engine_core_outputs(_engine_core_outputs("stage0-raw", 1.0)) + + await _wait_for(lambda: len(stage1.add_request_calls) == 1) + assert stage1.add_request_calls[0] == ("req-img", original_prompt, params) + + stage1.push_diffusion_output( + OmniRequestOutput.from_diffusion( + request_id="req-img", + images=[], + final_output_type="image", + ) + ) + + output_msg = await _get_output_message(orchestrator_fixture) + + assert output_msg["request_id"] == "req-img" + assert output_msg["stage_id"] == 1 + assert output_msg["finished"] is True + assert output_msg["engine_outputs"].request_id == "req-img" + assert "req-img" not in orchestrator_fixture.orchestrator.request_states + finally: + await _shutdown_orchestrator(orchestrator_fixture) + + +@pytest.mark.asyncio +async def test_run_async_chunk(orchestrator_factory) -> None: + stage0 = FakeStageClient(stage_type="llm", final_output=False) + stage1 = FakeStageClient(stage_type="llm", final_output=True) + processors = [ + FakeOutputProcessor(request_outputs=[_build_request_output("req-async", token_ids=[1], finished=True)]), + FakeOutputProcessor(request_outputs=[_build_request_output("req-async", token_ids=[20, 21], finished=True)]), + ] + orchestrator_fixture = orchestrator_factory( + [stage0, stage1], + output_processors=processors, + async_chunk=True, + ) + request = SimpleNamespace(request_id="req-async", prompt_token_ids=[1, 2, 3, 4]) + + try: + await _enqueue_add_request( + orchestrator_fixture, + request_id="req-async", + prompt=request, + original_prompt={"prompt": "hello async"}, + sampling_params_list=[_sampling_params(), _sampling_params()], + final_stage_id=1, + ) + + await _wait_for(lambda: len(stage1.add_request_calls) == 1) + prewarmed_request = stage1.add_request_calls[0][0] + assert prewarmed_request.request_id == "req-async" + assert prewarmed_request.prompt_token_ids + assert all(token_id == 0 for token_id in prewarmed_request.prompt_token_ids) + + stage1.push_engine_core_outputs(_engine_core_outputs("stage1-final", 3.0)) + + output_msg = await _get_output_message(orchestrator_fixture) + + assert output_msg["request_id"] == "req-async" + assert output_msg["stage_id"] == 1 + assert output_msg["finished"] is True + assert "req-async" not in orchestrator_fixture.orchestrator.request_states + finally: + await _shutdown_orchestrator(orchestrator_fixture) + + +@pytest.mark.asyncio +async def test_run_shutdown(orchestrator_factory) -> None: + stages = [ + FakeStageClient(stage_type="llm", final_output=False), + FakeStageClient(stage_type="diffusion", final_output=True, final_output_type="image"), + ] + orchestrator_fixture = orchestrator_factory(stages) + + await _shutdown_orchestrator(orchestrator_fixture) + + assert not orchestrator_fixture.thread.is_alive() + for stage in stages: + assert stage.shutdown_calls == 1 + + +@pytest.mark.asyncio +async def 
test_run_abort(orchestrator_factory) -> None: + stages = [ + FakeStageClient(stage_type="llm", final_output=False), + FakeStageClient(stage_type="llm", final_output=True), + ] + processors = [ + FakeOutputProcessor(request_outputs=[_build_request_output("req-abort", token_ids=[1], finished=True)]), + FakeOutputProcessor(request_outputs=[_build_request_output("req-abort", token_ids=[2], finished=True)]), + ] + orchestrator_fixture = orchestrator_factory(stages, output_processors=processors) + request = SimpleNamespace(request_id="req-abort", prompt_token_ids=[1, 2, 3]) + + try: + await _enqueue_add_request( + orchestrator_fixture, + request_id="req-abort", + prompt=request, + original_prompt={"prompt": "cancel me"}, + sampling_params_list=[_sampling_params(), _sampling_params()], + final_stage_id=1, + ) + await _wait_for(lambda: len(stages[0].add_request_calls) == 1) + + await _enqueue_abort_request(orchestrator_fixture, ["req-abort"]) + await _wait_for(lambda: all(stage.abort_calls for stage in stages)) + + for stage in stages: + assert stage.abort_calls == [["req-abort"]] + assert "req-abort" not in orchestrator_fixture.orchestrator.request_states + finally: + await _shutdown_orchestrator(orchestrator_fixture) From 2c6c07c4f68385cf4f625a5cdb0dec710e9c0fff Mon Sep 17 00:00:00 2001 From: vveerrgg Date: Wed, 8 Apr 2026 15:20:33 -0700 Subject: [PATCH 091/204] [TTS] Add missing _generate_pcm_chunks for OmniOpenAIServingSpeech streaming (#2569) Signed-off-by: Yueqian Lin Signed-off-by: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Co-authored-by: vveerrgg --- .../openai_api/test_serving_speech_stream.py | 15 +++++++++++++++ vllm_omni/entrypoints/openai/serving_speech.py | 9 +++++++++ 2 files changed, 24 insertions(+) diff --git a/tests/entrypoints/openai_api/test_serving_speech_stream.py b/tests/entrypoints/openai_api/test_serving_speech_stream.py index bd136ac727..1d26b5855f 100644 --- a/tests/entrypoints/openai_api/test_serving_speech_stream.py +++ b/tests/entrypoints/openai_api/test_serving_speech_stream.py @@ -385,3 +385,18 @@ async def mock_generate_pcm_chunks(_generator, _request_id): speech_service.engine_client.abort.assert_awaited_once_with("req-abort") assert websocket.send_json.await_count == 2 + + +class TestGeneratePcmChunksContract: + """Guard: _generate_pcm_chunks must exist on OmniOpenAIServingSpeech. + + The WebSocket handler calls speech_service._generate_pcm_chunks() + at runtime. If the method is removed, all WS TTS streaming breaks + with an AttributeError. This test catches that at CI time. + """ + + def test_generate_pcm_chunks_defined(self): + assert hasattr(OmniOpenAIServingSpeech, "_generate_pcm_chunks") + assert asyncio.iscoroutinefunction(OmniOpenAIServingSpeech._generate_pcm_chunks) or callable( + OmniOpenAIServingSpeech._generate_pcm_chunks + ) diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index a4b0293932..5903c0cd60 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -1459,6 +1459,15 @@ async def _prepare_speech_generation( ) return request_id, generator, tts_params + async def _generate_pcm_chunks(self, generator, request_id: str): + """Yield raw PCM byte chunks from the engine generator. + + Delegates to ``_generate_audio_chunks`` with ``response_format="pcm"``. + Used by the WebSocket streaming handler and ``_iter_pcm_audio_bytes``. 
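+
+        A hypothetical consumer (names illustrative only)::
+
+            async for pcm in speech_service._generate_pcm_chunks(generator, request_id):
+                await websocket.send_bytes(pcm)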
+ """ + async for chunk in self._generate_audio_chunks(generator, request_id, response_format="pcm"): + yield chunk + async def _iter_pcm_audio_bytes(self, request: OpenAICreateSpeechRequest): """Yield raw PCM bytes for a speech request as soon as chunks are decoded.""" request_id, generator, _ = await self._prepare_speech_generation(request) From 149b9f179a5f85082136e15ba065756022debc69 Mon Sep 17 00:00:00 2001 From: Nick Cao Date: Wed, 8 Apr 2026 20:40:17 -0400 Subject: [PATCH 092/204] [Perf][Qwen3-TTS][Voxtral-TTS] Share CUDA graph memory pool across decoder capture sizes (#2386) Signed-off-by: Nick Cao Co-authored-by: Claude --- .../models/qwen3_tts/cuda_graph_decoder_wrapper.py | 3 ++- .../voxtral_tts/cuda_graph_acoustic_transformer_wrapper.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm_omni/model_executor/models/qwen3_tts/cuda_graph_decoder_wrapper.py b/vllm_omni/model_executor/models/qwen3_tts/cuda_graph_decoder_wrapper.py index 96f8c799c1..8f7eeb542d 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/cuda_graph_decoder_wrapper.py +++ b/vllm_omni/model_executor/models/qwen3_tts/cuda_graph_decoder_wrapper.py @@ -10,6 +10,7 @@ import torch from torch.cuda import CUDAGraph from vllm.logger import init_logger +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -129,7 +130,7 @@ def _capture(self, size: int, device: torch.device, dtype: torch.dtype): graph = CUDAGraph() with torch.no_grad(): - with torch.cuda.graph(graph): + with torch.cuda.graph(graph, pool=current_platform.get_global_graph_pool()): static_output = self.decoder(static_input) self.graphs[size] = graph diff --git a/vllm_omni/model_executor/models/voxtral_tts/cuda_graph_acoustic_transformer_wrapper.py b/vllm_omni/model_executor/models/voxtral_tts/cuda_graph_acoustic_transformer_wrapper.py index 395c0d1130..a4d58df5b1 100644 --- a/vllm_omni/model_executor/models/voxtral_tts/cuda_graph_acoustic_transformer_wrapper.py +++ b/vllm_omni/model_executor/models/voxtral_tts/cuda_graph_acoustic_transformer_wrapper.py @@ -11,6 +11,7 @@ import torch from torch.cuda import CUDAGraph from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm_omni.model_executor.models.voxtral_tts.voxtral_tts_audio_generation import ( AudioSpecialTokens, @@ -196,7 +197,7 @@ def _capture_graph_for_size( graph = CUDAGraph() with torch.no_grad(): - with torch.cuda.graph(graph): + with torch.cuda.graph(graph, pool=current_platform.get_global_graph_pool()): static_fake_eos, static_audio_codes = self._forward_cudagraph_compatible( static_input, noise=static_noise ) From c3f10420611d55b4a8cda64ce8beae9adba326ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zhengyuan=20Su=20=28=E8=8B=8F=E6=94=BF=E6=B8=8A=29?= Date: Thu, 9 Apr 2026 09:33:55 +0800 Subject: [PATCH 093/204] [Feature] End-to-end LoRA support for BAGEL (#2494) Signed-off-by: Zhengyuan Su Co-authored-by: Claude Opus 4.6 (1M context) --- .../e2e/offline_inference/test_bagel_lora.py | 198 ++++++++++++++++++ tests/engine/test_cross_stage_lora.py | 44 ++++ vllm_omni/engine/orchestrator.py | 2 +- 3 files changed, 243 insertions(+), 1 deletion(-) create mode 100644 tests/e2e/offline_inference/test_bagel_lora.py create mode 100644 tests/engine/test_cross_stage_lora.py diff --git a/tests/e2e/offline_inference/test_bagel_lora.py b/tests/e2e/offline_inference/test_bagel_lora.py new file mode 100644 index 0000000000..593a640478 --- /dev/null +++ b/tests/e2e/offline_inference/test_bagel_lora.py @@ -0,0 +1,198 @@ +# 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +End-to-end test for BAGEL LoRA support (Stage 1 / DiT). + +Validates that LoRA adapters are correctly loaded, applied with controllable +scale, and cleanly deactivated. Uses a synthetic rank-1 adapter targeting the +first decoder layer's QKV projection. + +Assertions: + (a) LoRA at scale=1.0 visibly changes the output (diff > 0.5) + (b) scale=2.0 produces a larger delta than scale=1.0 (linearity) + (c) The delta is bounded (diff < 80, not corrupted) + (d) Deactivating LoRA exactly restores the baseline (diff == 0) +""" + +import json +import os + +from vllm_omni.inputs.data import OmniSamplingParams +from vllm_omni.outputs import OmniRequestOutput + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" + +from pathlib import Path + +import numpy as np +import pytest +import torch +from PIL import Image +from safetensors.torch import save_file + +from tests.conftest import modify_stage_config +from tests.utils import hardware_test +from vllm_omni.entrypoints.omni import Omni +from vllm_omni.lora.request import LoRARequest +from vllm_omni.lora.utils import stable_lora_int_id + +MODEL = "ByteDance-Seed/BAGEL-7B-MoT" +BAGEL_STAGE_CONFIG = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") +DEFAULT_PROMPT = "<|im_start|>A cute cat<|im_end|>" + + +# --------------------------------------------------------------------------- +# Helpers (reused from test_bagel_text2img.py patterns) +# --------------------------------------------------------------------------- + + +def _resolve_stage_config(config_path: str, run_level: str) -> str: + if run_level == "advanced_model": + return modify_stage_config( + config_path, + deletes={ + "stage_args": { + 0: ["engine_args.load_format"], + 1: ["engine_args.load_format"], + } + }, + ) + return config_path + + +def _configure_sampling_params(omni: Omni, num_inference_steps: int = 10) -> list[OmniSamplingParams]: + params_list = omni.default_sampling_params_list + if len(params_list) > 1: + params_list[1].num_inference_steps = num_inference_steps + params_list[1].extra_args = { + "cfg_text_scale": 4.0, + "cfg_img_scale": 1.5, + } + return params_list + + +def _extract_generated_image(omni_outputs: list[OmniRequestOutput]) -> Image.Image | None: + for req_output in omni_outputs: + if req_output.images: + return req_output.images[0] + return None + + +def _generate_bagel_image(omni: Omni) -> Image.Image: + params_list = _configure_sampling_params(omni) + params_list[1].lora_request = None + outputs = list( + omni.generate( + prompts=[{"prompt": DEFAULT_PROMPT, "modalities": ["image"]}], + sampling_params_list=params_list, + ) + ) + img = _extract_generated_image(outputs) + assert img is not None, "No image generated" + return img + + +def _generate_bagel_image_with_lora( + omni: Omni, + lora_request: LoRARequest, + lora_scale: float = 1.0, +) -> Image.Image: + params_list = _configure_sampling_params(omni) + params_list[1].lora_request = lora_request + params_list[1].lora_scale = lora_scale + outputs = list( + omni.generate( + prompts=[{"prompt": DEFAULT_PROMPT, "modalities": ["image"]}], + sampling_params_list=params_list, + ) + ) + img = _extract_generated_image(outputs) + assert img is not None, "No image generated with LoRA" + return img + + +# BAGEL uses GQA: hidden_size=3584, 28 Q heads, 4 KV heads, head_dim=128 +# QKV packed dim = 28*128 + 4*128 + 4*128 = 3584 + 512 + 512 = 
4608 +_LORA_DIM = 3584 +_LORA_QKV_DIM = 4608 +_LORA_MODULE = "bagel.language_model.model.layers.0.self_attn.qkv_proj" +_LORA_RANK = 4 + + +def _make_file_lora_request(adapter_dir: Path) -> LoRARequest: + """Write synthetic adapter to disk and return a file-backed LoRARequest.""" + adapter_dir.mkdir(parents=True, exist_ok=True) + gen = torch.Generator().manual_seed(42) + lora_a = torch.randn((_LORA_RANK, _LORA_DIM), dtype=torch.float32, generator=gen) * 0.1 + lora_b = torch.randn((_LORA_QKV_DIM, _LORA_RANK), dtype=torch.float32, generator=gen) * 0.5 + save_file( + { + f"base_model.model.{_LORA_MODULE}.lora_A.weight": lora_a, + f"base_model.model.{_LORA_MODULE}.lora_B.weight": lora_b, + }, + str(adapter_dir / "adapter_model.safetensors"), + ) + (adapter_dir / "adapter_config.json").write_text( + json.dumps({"r": _LORA_RANK, "lora_alpha": _LORA_RANK, "target_modules": [_LORA_MODULE]}), + encoding="utf-8", + ) + lora_dir = str(adapter_dir) + return LoRARequest(lora_name="test_file", lora_int_id=stable_lora_int_id(lora_dir), lora_path=lora_dir) + + +# --------------------------------------------------------------------------- +# Test +# --------------------------------------------------------------------------- + + +@pytest.mark.core_model +@pytest.mark.advanced_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}) +def test_bagel_lora_scale_and_deactivation(run_level, tmp_path): + """Validate LoRA effect, bounded perturbation, and clean deactivation.""" + config_path = _resolve_stage_config(BAGEL_STAGE_CONFIG, run_level) + omni = Omni(model=MODEL, stage_configs_path=config_path, stage_init_timeout=300) + try: + lora_request = _make_file_lora_request(tmp_path / "bagel_lora") + + # 1) Baseline (no LoRA) + baseline = _generate_bagel_image(omni) + + # 2) LoRA with scale=1.0 + img_1x = _generate_bagel_image_with_lora(omni, lora_request, lora_scale=1.0) + + # 3) LoRA with scale=2.0 + img_2x = _generate_bagel_image_with_lora(omni, lora_request, lora_scale=2.0) + + # 4) No LoRA again (deactivation) + restored = _generate_bagel_image(omni) + + baseline_arr = np.array(baseline, dtype=np.int16) + img_1x_arr = np.array(img_1x, dtype=np.int16) + img_2x_arr = np.array(img_2x, dtype=np.int16) + restored_arr = np.array(restored, dtype=np.int16) + + diff_1x = np.abs(baseline_arr - img_1x_arr).mean() + diff_2x = np.abs(baseline_arr - img_2x_arr).mean() + diff_restored = np.abs(baseline_arr - restored_arr).mean() + + # (a) Adapter has visible effect at both scales + assert diff_1x > 0.5, f"LoRA scale=1.0 had no visible effect: diff={diff_1x}" + assert diff_2x > 0.5, f"LoRA scale=2.0 had no visible effect: diff={diff_2x}" + + # (b) Different scales produce different outputs + assert not np.isclose(diff_1x, diff_2x, atol=1.0), ( + f"LoRA scale has no effect: diff_1x={diff_1x:.2f}, diff_2x={diff_2x:.2f}" + ) + + # (c) Output is not corrupted + assert diff_1x < 80, f"LoRA output looks corrupted: diff_1x={diff_1x}" + assert diff_2x < 80, f"LoRA output looks corrupted: diff_2x={diff_2x}" + + # (d) Deactivation fully restores base model + assert diff_restored == 0.0, f"Base model not restored after LoRA deactivation: diff={diff_restored}" + finally: + omni.close() diff --git a/tests/engine/test_cross_stage_lora.py b/tests/engine/test_cross_stage_lora.py new file mode 100644 index 0000000000..1eccc5526c --- /dev/null +++ b/tests/engine/test_cross_stage_lora.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project 
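+# Background for the LoRA plumbing exercised here and in
+# tests/e2e/offline_inference/test_bagel_lora.py (illustrative, not asserted by
+# these tests): a rank-r adapter stores lora_A with shape (r, in_features) and
+# lora_B with shape (out_features, r); the effective weight update is
+# proportional to lora_B @ lora_A, scaled by alpha/r and any runtime lora_scale.
+# The synthetic BAGEL adapter above uses r=4, in_features=3584, out_features=4608.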
+"""Unit tests for cross-stage LoRA routing in the orchestrator.""" + +from __future__ import annotations + +import pytest +from vllm.lora.request import LoRARequest +from vllm.sampling_params import SamplingParams + +from vllm_omni.engine.orchestrator import build_engine_core_request_from_tokens +from vllm_omni.inputs.data import OmniDiffusionSamplingParams + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +class TestBuildEngineCoreRequestLoRA: + """Verify build_engine_core_request_from_tokens passes LoRA from params.""" + + def test_lora_extracted_from_diffusion_params(self): + lr = LoRARequest(lora_name="test", lora_int_id=1, lora_path="/tmp/fake") + params = OmniDiffusionSamplingParams(lora_request=lr) + + # OmniDiffusionSamplingParams is not a SamplingParams, so + # build_engine_core_request_from_tokens takes the pooling path. + # We only care that lora_request is extracted via getattr. + request = build_engine_core_request_from_tokens( + request_id="req-1", + prompt={"prompt_token_ids": [1, 2, 3]}, + params=params, + model_config=None, + ) + assert request.lora_request is lr + + def test_no_lora_on_sampling_params(self): + params = SamplingParams(max_tokens=10) + + request = build_engine_core_request_from_tokens( + request_id="req-2", + prompt={"prompt_token_ids": [1, 2, 3]}, + params=params, + model_config=None, + ) + assert request.lora_request is None diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index 20dce1f0ff..386b545eb7 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -79,7 +79,7 @@ def build_engine_core_request_from_tokens( sampling_params=sampling_params, pooling_params=pooling_params, arrival_time=arrival_time, - lora_request=None, + lora_request=getattr(params, "lora_request", None), cache_salt=None, data_parallel_rank=None, prompt_embeds=prompt_embeds, From e6f88f7c5a22d6494a23d95c00e2f42f084bbd0f Mon Sep 17 00:00:00 2001 From: zhumingjue138 Date: Thu, 9 Apr 2026 10:31:21 +0800 Subject: [PATCH 094/204] [CI] Reorganize the L1 L2 use cases and add markers (#2449) Signed-off-by: zhumingjue --- .buildkite/test-ready.yml | 39 +++++++++---------- .../test_generation_scheduler_restore.py | 4 ++ .../cache/test_teacache_extractors.py | 8 +++- .../test_distributed_vae_executor.py | 2 + .../distributed/test_ulysses_uaa_perf.py | 3 ++ .../models/flux2/test_flux2_transformer_tp.py | 9 +++++ .../diffusion/quantization/test_fp8_config.py | 2 +- .../diffusion/test_diffusion_model_runner.py | 9 ++++- .../omni_connectors/test_basic_connectors.py | 2 +- .../omni_coordinator/test_load_balancer.py | 4 ++ .../test_omni_coord_client_for_hub.py | 2 + .../test_omni_coord_client_for_stage.py | 2 + .../omni_coordinator/test_omni_coordinator.py | 3 ++ tests/engine/test_output_modality.py | 1 + .../openai_api/test_text_splitter.py | 2 +- tests/entrypoints/test_stage_utils.py | 6 ++- .../cosyvoice3/test_cosyvoice3_components.py | 26 +++++++++++++ .../cosyvoice3/test_cosyvoice3_utils.py | 2 + tests/test_diffusion_config_propagation.py | 3 ++ 19 files changed, 101 insertions(+), 28 deletions(-) diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index be528b316c..6f3ad6504e 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -16,11 +16,10 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Voxtral TTS CUDA Unit Test" - timeout_in_minutes: 10 + - label: "CUDA Unit Test with single card" depends_on: upload-ready-pipeline commands: - - "timeout 10m pytest -s -v 
tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py" + - timeout 10m pytest -v -s -m 'core_model and cuda and L4 and not distributed_cuda' --ignore=tests/e2e --ignore=tests/engine/test_async_omni_engine_abort.py --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml agents: queue: "gpu_1_queue" plugins: @@ -33,6 +32,22 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" + - label: "CUDA Unit Test with multi cards" + depends_on: upload-ready-pipeline + commands: + - timeout 10m pytest -v -s -m 'core_model and cuda and L4 and distributed_cuda' --ignore=tests/e2e --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml + agents: + queue: "gpu_4_queue" + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + environment: + - "HF_HOME=/fsx/hf_cache" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + - label: "Diffusion Model Test" depends_on: upload-ready-pipeline commands: @@ -152,24 +167,6 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Diffusion GPU Worker Test" - depends_on: upload-ready-pipeline - commands: - - timeout 20m pytest -s -v tests/diffusion/test_diffusion_worker.py - agents: - queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - shm-size: "8gb" - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Engine Test" depends_on: upload-ready-pipeline diff --git a/tests/core/sched/test_generation_scheduler_restore.py b/tests/core/sched/test_generation_scheduler_restore.py index 0eae3c4db9..154f40b399 100644 --- a/tests/core/sched/test_generation_scheduler_restore.py +++ b/tests/core/sched/test_generation_scheduler_restore.py @@ -9,6 +9,10 @@ import unittest from collections import deque +import pytest + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + class FakeAdapter: """Minimal mock of OmniChunkTransferAdapter tracking restore calls.""" diff --git a/tests/diffusion/cache/test_teacache_extractors.py b/tests/diffusion/cache/test_teacache_extractors.py index 5ba52ddfe2..a52e11b3d4 100644 --- a/tests/diffusion/cache/test_teacache_extractors.py +++ b/tests/diffusion/cache/test_teacache_extractors.py @@ -21,12 +21,13 @@ import pytest import torch +from tests.utils import hardware_test from vllm_omni.diffusion.cache.teacache.extractors import extract_flux2_klein_context from vllm_omni.diffusion.models.flux2_klein.flux2_klein_transformer import ( Flux2Transformer2DModel, ) -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] +pytestmark = [pytest.mark.core_model] @pytest.fixture(scope="function", autouse=True) @@ -113,6 +114,7 @@ def sample_inputs(self): def get_sample_inputs(self, sample_inputs): return sample_inputs + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_modulated_input_shape(self, flux2_klein_module, sample_inputs): """Test that modulated_input has correct shape matching the model's inner_dim. 
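For orientation, the marker and decorator combination this commit standardizes on looks like the sketch below. The decorator arguments mirror those used throughout the diff; whether @hardware_test itself applies the cuda/L4/distributed_cuda markers that the Buildkite -m expressions select on, or whether those markers come from elsewhere, is not visible in this patch, so that mapping is an assumption.

    import pytest
    from tests.utils import hardware_test

    @pytest.mark.core_model
    @hardware_test(res={"cuda": "L4"}, num_cards=4)  # multi-card case, assumed to be
    def test_needs_four_l4_gpus():                   # picked up by the multi-card lane
        ...

    @pytest.mark.core_model
    @pytest.mark.cpu
    def test_runs_on_cpu_only():
        ...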
@@ -126,16 +128,19 @@ def test_modulated_input_shape(self, flux2_klein_module, sample_inputs): inner_dim = flux2_klein_module.inner_dim assert context.modulated_input.shape == (batch_size, img_seq_len, inner_dim) + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_run_transformer_blocks_callable(self, flux2_klein_module, sample_inputs): """Test that run_transformer_blocks is callable.""" context = extract_flux2_klein_context(flux2_klein_module, **sample_inputs) assert callable(context.run_transformer_blocks) + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_postprocess_callable(self, flux2_klein_module, sample_inputs): """Test that postprocess is callable.""" context = extract_flux2_klein_context(flux2_klein_module, **sample_inputs) assert callable(context.postprocess) + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_extra_states_contains_full_transformer(self, flux2_klein_module, sample_inputs): """Test that extra_states contains run_flux2_full_transformer_with_single.""" context = extract_flux2_klein_context(flux2_klein_module, **sample_inputs) @@ -154,6 +159,7 @@ def test_without_guidance(self, flux2_klein_module, sample_inputs): assert context is not None assert context.temb is not None + @pytest.mark.cpu def test_invalid_module_raises_error(self): """Test that invalid module without transformer_blocks raises ValueError.""" invalid_module = Mock() diff --git a/tests/diffusion/distributed/test_distributed_vae_executor.py b/tests/diffusion/distributed/test_distributed_vae_executor.py index 93cf3d195f..dc491dcdaf 100644 --- a/tests/diffusion/distributed/test_distributed_vae_executor.py +++ b/tests/diffusion/distributed/test_distributed_vae_executor.py @@ -11,6 +11,8 @@ TileTask, ) +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + class E2EOperator: """tiles with (2, 3) -- (H,W)""" diff --git a/tests/diffusion/distributed/test_ulysses_uaa_perf.py b/tests/diffusion/distributed/test_ulysses_uaa_perf.py index c8b07ba152..04bbf5ee86 100644 --- a/tests/diffusion/distributed/test_ulysses_uaa_perf.py +++ b/tests/diffusion/distributed/test_ulysses_uaa_perf.py @@ -17,6 +17,7 @@ import torch import torch.distributed as dist +from tests.utils import hardware_test from vllm_omni.diffusion.attention.parallel.ulysses import ( _all_gather_int, _ulysses_all_to_all_any_o, @@ -69,6 +70,8 @@ def world_size(self) -> int: @pytest.mark.parametrize("case", PERF_CASES) +@pytest.mark.core_model +@hardware_test(res={"cuda": "L4"}, num_cards=4) def test_ulysses_advanced_uaa_comm_overhead(case: _PerfCase) -> None: available_gpus = current_omni_platform.get_device_count() if available_gpus < case.world_size: diff --git a/tests/diffusion/models/flux2/test_flux2_transformer_tp.py b/tests/diffusion/models/flux2/test_flux2_transformer_tp.py index a2d1fe6abd..faad08afd1 100644 --- a/tests/diffusion/models/flux2/test_flux2_transformer_tp.py +++ b/tests/diffusion/models/flux2/test_flux2_transformer_tp.py @@ -3,6 +3,7 @@ import pytest import torch +from tests.utils import hardware_test from vllm_omni.diffusion.models.flux2.flux2_transformer import ( Flux2PosEmbed, Flux2Transformer2DModel, @@ -24,6 +25,8 @@ def setup_tp_group(): class TestFlux2TransformerWeightLoading: """Test Flux2Transformer weight loading functionality""" + @pytest.mark.core_model + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_weight_loading_tp2(self, setup_tp_group): """Verify weights load correctly with TP=2""" # Prepare test data @@ -78,6 +81,8 @@ def test_weight_loading_tp2(self, setup_tp_group): class 
TestFlux2RopePositionEmbedding: """Test Flux2 RoPE position embedding functionality""" + @pytest.mark.core_model + @pytest.mark.cpu def test_rope_position_embedding(self): """Verify RoPE produces correct embeddings for 4D coordinates""" # Prepare test data - use model default configuration @@ -132,6 +137,8 @@ def test_rope_position_embedding(self): class TestFlux2PackedModuleMapping: """Test Flux2 packed module mapping functionality""" + @pytest.mark.core_model + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_packed_module_mapping(self, setup_tp_group): """Verify to_qkv packing matches HF checkpoint""" model = Flux2Transformer2DModel( @@ -208,6 +215,8 @@ def test_packed_module_mapping(self, setup_tp_group): f"add_kv_proj weight dimension should be {expected_add_kv_shape}, got {attn_block.add_kv_proj.weight.shape}" ) + @pytest.mark.core_model + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_packed_mapping_edge_cases(self, setup_tp_group): """Test edge cases for packed mapping""" model = Flux2Transformer2DModel( diff --git a/tests/diffusion/quantization/test_fp8_config.py b/tests/diffusion/quantization/test_fp8_config.py index 9c18c1f551..574af7a669 100644 --- a/tests/diffusion/quantization/test_fp8_config.py +++ b/tests/diffusion/quantization/test_fp8_config.py @@ -5,7 +5,7 @@ import pytest from torch import nn -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion] +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] def test_build_quant_config_fp8(): diff --git a/tests/diffusion/test_diffusion_model_runner.py b/tests/diffusion/test_diffusion_model_runner.py index 88b17147e8..8768986f01 100644 --- a/tests/diffusion/test_diffusion_model_runner.py +++ b/tests/diffusion/test_diffusion_model_runner.py @@ -8,9 +8,10 @@ import torch import vllm_omni.diffusion.worker.diffusion_model_runner as model_runner_module +from tests.utils import hardware_test from vllm_omni.diffusion.worker.diffusion_model_runner import DiffusionModelRunner -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] +pytestmark = [pytest.mark.diffusion] @contextmanager @@ -64,6 +65,8 @@ def _make_runner(cache_backend, cache_backend_name: str, enable_cache_dit_summar return runner +@pytest.mark.core_model +@hardware_test(res={"cuda": "L4"}, num_cards=1) def test_execute_model_skips_cache_summary_without_active_cache_backend(monkeypatch): """Guard cache diagnostics with runtime backend state to avoid stale-config crashes.""" runner = _make_runner(cache_backend=None, cache_backend_name="cache_dit") @@ -84,6 +87,8 @@ def test_execute_model_skips_cache_summary_without_active_cache_backend(monkeypa assert cache_summary_calls == [] +@pytest.mark.core_model +@hardware_test(res={"cuda": "L4"}, num_cards=1) def test_execute_model_emits_cache_summary_with_active_cache_dit_backend(monkeypatch): class _EnabledCacheBackend: def is_enabled(self): @@ -107,6 +112,8 @@ def is_enabled(self): assert cache_summary_calls == [(runner.pipeline, True)] +@pytest.mark.core_model +@pytest.mark.cpu def test_load_model_clears_cache_backend_for_unsupported_pipeline(monkeypatch): class _DummyLoader: def __init__(self, load_config, od_config=None): diff --git a/tests/distributed/omni_connectors/test_basic_connectors.py b/tests/distributed/omni_connectors/test_basic_connectors.py index bca96e790d..662d41fe01 100644 --- a/tests/distributed/omni_connectors/test_basic_connectors.py +++ b/tests/distributed/omni_connectors/test_basic_connectors.py @@ -9,7 +9,7 @@ from 
vllm_omni.distributed.omni_connectors.utils.config import ConnectorSpec
 from vllm_omni.distributed.omni_connectors.utils.serialization import OmniSerializer
 
-# pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 
 def test_basic_serialization():
diff --git a/tests/distributed/omni_coordinator/test_load_balancer.py b/tests/distributed/omni_coordinator/test_load_balancer.py
index c54d248940..b2d1f3ee84 100644
--- a/tests/distributed/omni_coordinator/test_load_balancer.py
+++ b/tests/distributed/omni_coordinator/test_load_balancer.py
@@ -3,12 +3,16 @@
 
 from time import time
 
+import pytest
+
 from vllm_omni.distributed.omni_coordinator import (
     InstanceInfo,
     RandomBalancer,
     StageStatus,
 )
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 def test_load_balancer_select_returns_valid_index():
     """Verify RandomBalancer.select() returns a valid index for instances."""
diff --git a/tests/distributed/omni_coordinator/test_omni_coord_client_for_hub.py b/tests/distributed/omni_coordinator/test_omni_coord_client_for_hub.py
index 24b3319232..2fbd7c85bf 100644
--- a/tests/distributed/omni_coordinator/test_omni_coord_client_for_hub.py
+++ b/tests/distributed/omni_coordinator/test_omni_coord_client_for_hub.py
@@ -12,6 +12,8 @@
     OmniCoordClientForHub,
 )
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 def _bind_pub() -> tuple[zmq.Context, zmq.Socket, str]:
     ctx = zmq.Context.instance()
diff --git a/tests/distributed/omni_coordinator/test_omni_coordinator.py b/tests/distributed/omni_coordinator/test_omni_coordinator.py
index 0c68e61bb1..38a595cc78 100644
--- a/tests/distributed/omni_coordinator/test_omni_coordinator.py
+++ b/tests/distributed/omni_coordinator/test_omni_coordinator.py
@@ -4,6 +4,7 @@
 import json
 import time
 
+import pytest
 import zmq
 from vllm.v1.utils import get_engine_client_zmq_addr
 
@@ -13,6 +14,8 @@
     StageStatus,
 )
 
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
 
 def _recv_instance_list(sub: zmq.Socket, timeout_ms: int = 2000) -> dict | None:
     """Receive InstanceList JSON from SUB socket.
Returns None on timeout.""" diff --git a/tests/engine/test_output_modality.py b/tests/engine/test_output_modality.py index 5a2a5dfc57..7a9c765028 100644 --- a/tests/engine/test_output_modality.py +++ b/tests/engine/test_output_modality.py @@ -12,6 +12,7 @@ import torch # ── Load modules without triggering vllm_omni.__init__ ───────────── +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] _ENGINE_DIR = Path(__file__).resolve().parents[2] / "vllm_omni" / "engine" diff --git a/tests/entrypoints/openai_api/test_text_splitter.py b/tests/entrypoints/openai_api/test_text_splitter.py index 23d4d191fc..a1886662ae 100644 --- a/tests/entrypoints/openai_api/test_text_splitter.py +++ b/tests/entrypoints/openai_api/test_text_splitter.py @@ -4,7 +4,7 @@ from vllm_omni.entrypoints.openai.text_splitter import SentenceSplitter -pytestmark = [pytest.mark.openai, pytest.mark.speech] +pytestmark = [pytest.mark.openai, pytest.mark.speech, pytest.mark.core_model, pytest.mark.cpu] class TestSentenceSplitterEnglish: diff --git a/tests/entrypoints/test_stage_utils.py b/tests/entrypoints/test_stage_utils.py index 2bb2231ccb..3afc6f12f5 100644 --- a/tests/entrypoints/test_stage_utils.py +++ b/tests/entrypoints/test_stage_utils.py @@ -6,8 +6,6 @@ from vllm_omni.entrypoints.stage_utils import set_stage_devices -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - def _make_dummy_torch(call_log): class _Props: @@ -55,6 +53,8 @@ def _make_mock_platform(mocker, device_type: str = "cuda", env_var: str = "CUDA_ return mock_platform +@pytest.mark.core_model +@pytest.mark.cpu @pytest.mark.usefixtures("clean_gpu_memory_between_tests") def test_set_stage_devices_respects_logical_ids(mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch): # Preserve an existing logical mapping and ensure devices "0,1" map through it. @@ -75,6 +75,8 @@ def test_set_stage_devices_respects_logical_ids(mocker: MockerFixture, monkeypat assert os.environ["CUDA_VISIBLE_DEVICES"] == "6,7" +@pytest.mark.core_model +@pytest.mark.cpu @pytest.mark.usefixtures("clean_gpu_memory_between_tests") def test_set_stage_devices_handles_not_enough_devices(mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch): # Preserve an existing logical mapping and ensure devices "0,1" map through it. 
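The marker work in this patch pairs with the new `-m "core_model and cuda and L4 and [not] distributed_cuda"` selections added to the ready pipeline above: CPU-only modules get a module-level `pytestmark`, while GPU tests are tagged through the `hardware_test` helper imported from `tests.utils`. That helper's implementation is not part of this patch; the sketch below is only an assumed illustration of the marker combination it appears to apply (the accelerator marker, the GPU tier such as `L4`, and `distributed_cuda` once more than one card is requested), not the real code.

    # Hypothetical sketch of tests/utils.hardware_test -- an assumption for illustration,
    # not the helper shipped in the repository.
    import pytest

    def hardware_test(res: dict[str, str], num_cards: int = 1):
        """Attach hardware markers so the CI `-m` expressions above can select the test."""
        def decorator(func):
            for accelerator, tier in res.items():
                func = getattr(pytest.mark, accelerator)(func)  # e.g. pytest.mark.cuda
                func = getattr(pytest.mark, tier)(func)         # e.g. pytest.mark.L4
            if num_cards > 1:
                func = pytest.mark.distributed_cuda(func)       # routed to the multi-card queue
            return func
        return decorator

Under that assumed shape, `@hardware_test(res={"cuda": "L4"}, num_cards=4)` would mark a test so that only the multi-card `gpu_4_queue` step collects it, while the single-card step's `not distributed_cuda` filter skips it.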
diff --git a/tests/model_executor/models/cosyvoice3/test_cosyvoice3_components.py b/tests/model_executor/models/cosyvoice3/test_cosyvoice3_components.py index 0f5202c3b9..ec24f6949f 100644 --- a/tests/model_executor/models/cosyvoice3/test_cosyvoice3_components.py +++ b/tests/model_executor/models/cosyvoice3/test_cosyvoice3_components.py @@ -8,6 +8,8 @@ import torch import torch.nn as nn +from tests.utils import hardware_test + class TestPreLookaheadLayer: """Tests for PreLookaheadLayer.""" @@ -18,6 +20,8 @@ def layer(self): return PreLookaheadLayer(in_channels=512, channels=512, pre_lookahead_len=3) + @pytest.mark.core_model + @pytest.mark.cpu def test_forward_shape(self, layer): """Test that output shape matches input shape.""" batch, seq_len, channels = 2, 10, 512 @@ -27,6 +31,8 @@ def test_forward_shape(self, layer): assert out.shape == x.shape + @pytest.mark.core_model + @pytest.mark.cpu def test_forward_with_context(self, layer): """Test forward with context for streaming.""" batch, seq_len, channels = 1, 10, 512 @@ -38,6 +44,8 @@ def test_forward_with_context(self, layer): assert out.shape == x.shape + @pytest.mark.core_model + @pytest.mark.cpu def test_residual_connection(self, layer): """Test that residual connection is applied.""" batch, seq_len, channels = 1, 5, 512 @@ -59,6 +67,8 @@ def attention(self): return DiTAttention(dim=512, heads=8, dim_head=64, dropout=0.0) + @pytest.mark.core_model + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_forward_shape(self, attention): """Test attention output shape.""" batch, seq_len, dim = 2, 16, 512 @@ -68,6 +78,8 @@ def test_forward_shape(self, attention): assert out.shape == x.shape + @pytest.mark.core_model + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_forward_with_mask(self, attention): """Test attention with mask.""" batch, seq_len, dim = 2, 16, 512 @@ -81,6 +93,8 @@ def test_forward_with_mask(self, attention): # Masked positions should be zero assert torch.allclose(out[:, -3:], torch.zeros_like(out[:, -3:])) + @pytest.mark.core_model + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_qkv_projections(self, attention): """Test that Q/K/V projections exist and have correct dimensions.""" assert hasattr(attention, "to_q") @@ -100,6 +114,8 @@ def block(self): return DiTBlock(dim=512, heads=8, dim_head=64, ff_mult=4, dropout=0.0) + @pytest.mark.core_model + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_forward_shape(self, block): """Test block output shape.""" batch, seq_len, dim = 2, 16, 512 @@ -110,6 +126,8 @@ def test_forward_shape(self, block): assert out.shape == x.shape + @pytest.mark.core_model + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_adalayernorm_modulation(self, block): """Test that AdaLayerNorm modulates based on timestep.""" batch, seq_len, dim = 1, 8, 512 @@ -144,6 +162,8 @@ def dit(self): long_skip_connection=True, ) + @pytest.mark.core_model + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_forward_shape(self, dit): """Test DiT forward output shape.""" batch, mel_dim, seq_len = 1, 80, 32 @@ -158,6 +178,8 @@ def test_forward_shape(self, dit): assert out.shape == (batch, mel_dim, seq_len) + @pytest.mark.core_model + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_timestep_embedding(self, dit): """Test that different timesteps produce different outputs.""" batch, mel_dim, seq_len = 1, 80, 16 @@ -190,6 +212,8 @@ def forward(self, x, mask, mu, t, spks=None, cond=None): return DummyEstimator() + @pytest.mark.core_model + @pytest.mark.cpu def 
test_causal_conditional_cfm_forward(self, dummy_estimator): """Test CausalConditionalCFM forward pass.""" from omegaconf import DictConfig @@ -228,6 +252,8 @@ def test_causal_conditional_cfm_forward(self, dummy_estimator): class TestSDPAFallback: """Test SDPA fallback for float32 inputs.""" + @pytest.mark.core_model + @hardware_test(res={"cuda": "L4"}, num_cards=1) def test_float32_uses_sdpa(self): """Test that float32 inputs use SDPA fallback.""" from vllm_omni.diffusion.attention.layer import Attention diff --git a/tests/model_executor/models/cosyvoice3/test_cosyvoice3_utils.py b/tests/model_executor/models/cosyvoice3/test_cosyvoice3_utils.py index 828bb2b147..76428ed582 100644 --- a/tests/model_executor/models/cosyvoice3/test_cosyvoice3_utils.py +++ b/tests/model_executor/models/cosyvoice3/test_cosyvoice3_utils.py @@ -5,6 +5,8 @@ import pytest import torch +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + class TestMakePadMask: """Tests for make_pad_mask utility.""" diff --git a/tests/test_diffusion_config_propagation.py b/tests/test_diffusion_config_propagation.py index 58eb6097ca..7d6d9c43f0 100644 --- a/tests/test_diffusion_config_propagation.py +++ b/tests/test_diffusion_config_propagation.py @@ -7,6 +7,7 @@ from collections.abc import Mapping +import pytest import torch from vllm_omni.config.stage_config import StageConfigFactory @@ -15,6 +16,8 @@ OmniDiffusionConfig, ) +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + def _roundtrip_diffusion_config(**kwargs) -> OmniDiffusionConfig: """Simulate the real path: create_default_diffusion → OmniDiffusionConfig. From 3bd8a5239e3df20e001e5046ca000d6a9b17d515 Mon Sep 17 00:00:00 2001 From: Nick Cao Date: Wed, 8 Apr 2026 23:54:12 -0400 Subject: [PATCH 095/204] [Bugfix] Enforce --max-generated-image-size on /v1/images/generations (#2599) Signed-off-by: Nick Cao Co-authored-by: Claude --- .../openai_api/test_image_server.py | 19 +++++-- vllm_omni/entrypoints/openai/api_server.py | 51 ++++++++++++------- 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py index d68143dae8..c91c5a5c75 100644 --- a/tests/entrypoints/openai_api/test_image_server.py +++ b/tests/entrypoints/openai_api/test_image_server.py @@ -178,7 +178,7 @@ def test_client(mock_async_diffusion): ) app.state.args = Namespace( default_sampling_params='{"0": {"num_inference_steps":4, "guidance_scale":7.5}}', - max_generated_image_size=4096, # 64*64 + max_generated_image_size=1024 * 1792, ) return TestClient(app) @@ -245,7 +245,7 @@ def async_omni_stage_configs_only_client(): # AsyncOmni exposes stage_configs on the engine instance. 
app.state.args = Namespace( default_sampling_params='{"1": {"num_inference_steps":4, "guidance_scale":7.5}}', - max_generated_image_size=4096, # 64*64 + max_generated_image_size=1024 * 1792, ) return TestClient(app) @@ -392,6 +392,18 @@ def test_image_edits_async_omni_stage_configs_only(async_omni_stage_configs_only assert len(captured) == 2 +def test_generate_images_max_size_rejected(async_omni_test_client): + """Test that a size exceeding max_generated_image_size returns 400.""" + response = async_omni_test_client.post( + "/v1/images/generations", + json={ + "prompt": "a cat", + "size": "2048x2048", # 4,194,304 pixels > max_generated_image_size (1,048,576) + }, + ) + assert response.status_code == 400 + + def test_generate_multiple_images(test_client): """Test generating multiple images""" response = test_client.post( @@ -982,12 +994,13 @@ def test_image_edit_parameter_default_single_stage(test_client): assert captured_sampling_params.num_inference_steps == 4 assert captured_sampling_params.guidance_scale == 7.5 + # Size exceeding max_generated_image_size (1024*1792) returns 400 response = test_client.post( "/v1/images/edits", files=[("image", img_bytes_1)], data={ "prompt": "hello world.", - "size": "96x96", + "size": "2048x2048", }, ) assert response.status_code == 400 diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index ebe4cf30bf..25817d6a79 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1331,6 +1331,10 @@ async def generate_images(request: ImageGenerationRequest, raw_request: Request) size_str = f"{width}x{height}" else: size_str = "model default" + + app_state_args = getattr(raw_request.app.state, "args", None) + _check_max_generated_image_size(app_state_args, width, height) + _update_if_not_none(gen_params, "width", width) _update_if_not_none(gen_params, "height", height) @@ -1517,7 +1521,6 @@ async def edit_images( ) # 3.3 Parse and add size if provided - max_generated_image_size = getattr(app_state_args, "max_generated_image_size", None) width, height = None, None if size.lower() == "auto": if resolution is None: @@ -1527,23 +1530,7 @@ async def edit_images( else: width, height = parse_size(size) - # Check max_generated_image_size - if max_generated_image_size is not None: - if width is not None and height is not None: - if width * height > max_generated_image_size: - raise HTTPException( - status_code=HTTPStatus.BAD_REQUEST.value, - detail=f"Requested image size {width}x{height} exceeds the maximum allowed " - f"size of {max_generated_image_size} pixels.", - ) - elif resolution is not None: - # When resolution is set, the output size is resolution * resolution - if resolution * resolution > max_generated_image_size: - raise HTTPException( - status_code=HTTPStatus.BAD_REQUEST.value, - detail=f"Requested resolution {resolution} (max {resolution}x{resolution} pixels) " - f"exceeds the maximum allowed size of {max_generated_image_size} pixels.", - ) + _check_max_generated_image_size(app_state_args, width, height, resolution) size_str = f"{width}x{height}" if width is not None and height is not None else "auto" _update_if_not_none(gen_params, "width", width) @@ -1743,6 +1730,34 @@ async def _generate_with_async_omni( return result +def _check_max_generated_image_size( + app_state_args: Any, + width: int | None, + height: int | None, + resolution: int | None = None, +) -> None: + """Raise 400 if the requested image size exceeds --max-generated-image-size.""" + 
max_generated_image_size = getattr(app_state_args, "max_generated_image_size", None) + # Check max_generated_image_size + if max_generated_image_size is None: + return + if width is not None and height is not None: + if width * height > max_generated_image_size: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail=f"Requested image size {width}x{height} exceeds the maximum allowed " + f"size of {max_generated_image_size} pixels.", + ) + elif resolution is not None: + # When resolution is set, the output size is resolution * resolution + if resolution * resolution > max_generated_image_size: + raise HTTPException( + status_code=HTTPStatus.BAD_REQUEST.value, + detail=f"Requested resolution {resolution} (max {resolution}x{resolution} pixels) " + f"exceeds the maximum allowed size of {max_generated_image_size} pixels.", + ) + + def _update_if_not_none(object: Any, key: str, val: Any) -> None: if val is not None: setattr(object, key, val) From 0edc356fc8a30199ac85383d15fa9566a40486b6 Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Thu, 9 Apr 2026 14:35:35 +0800 Subject: [PATCH 096/204] [CI]Refactor nightly test configuration in Buildkite, Add group for Omni and Diffusion models (#2582) Signed-off-by: wangyu <410167048@qq.com> --- .buildkite/test-nightly-diffusion.yml | 367 +++++++++++++++++ .buildkite/test-nightly.yml | 559 ++++++++------------------ tests/conftest.py | 28 +- 3 files changed, 567 insertions(+), 387 deletions(-) create mode 100644 .buildkite/test-nightly-diffusion.yml diff --git a/.buildkite/test-nightly-diffusion.yml b/.buildkite/test-nightly-diffusion.yml new file mode 100644 index 0000000000..73bf455113 --- /dev/null +++ b/.buildkite/test-nightly-diffusion.yml @@ -0,0 +1,367 @@ +# Nightly diffusion GPU tests — appended to the main nightly build via +# buildkite-agent pipeline upload .buildkite/test-nightly-diffusion.yml +# from test-nightly.yml (step key: nightly-diffusion-model-test). Top-level groups are +# foldable in the Buildkite UI (Other / Wan / Qwen-Image). 
+steps: + - group: ":card_index_dividers: Other Model Test" + key: nightly-other-model-test-group + steps: + - label: ":full_moon: Diffusion · Other · Function Test with H100" + timeout_in_minutes: 120 + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not test_qwen_image" -m "advanced_model and diffusion and H100" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Diffusion · Other · Function Test with L4" + timeout_in_minutes: 60 + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" + agents: + queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + + - label: ":full_moon: Diffusion · Other · Doc Test" + timeout_in_minutes: 60 + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_TEST_CLEAN_GPU_MEMORY="1" + - pytest -s -v tests/examples/online_serving/test_text_to_image.py tests/examples/offline_inference/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - group: ":card_index_dividers: Wan Series Model Test" + key: nightly-wan-model-test-group + steps: + - label: ":full_moon: Diffusion · Wan · Function Test" + timeout_in_minutes: 90 + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v 
tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py -m "advanced_model" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Diffusion · Wan · Accuracy Test" + key: nightly-wan22-i2v-accuracy + timeout_in_minutes: 180 + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level advanced_model + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - group: ":card_index_dividers: Qwen-Image Series Model Test" + key: nightly-qwen-image-edit-group + steps: + - label: ":full_moon: Diffusion · Qwen-Image · Function Test with H100" + timeout_in_minutes: 120 + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/e2e/online_serving/test_qwen_image*_expansion.py -m "advanced_model and diffusion and H100" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Diffusion · Qwen-Image · GEBench Accuracy Test" + key: nightly-gebench-accuracy + timeout_in_minutes: 60 + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model 
--gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1 + - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Diffusion · Qwen-Image · GEdit-Bench Accuracy Test" + key: nightly-gedit-bench-accuracy + timeout_in_minutes: 60 + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1 + - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv" + - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_HTTP_TIMEOUT_KEEP_ALIVE + value: "120" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Diffusion · Qwen-Image · Perf Test" + key: nightly-qwen-image-performance + timeout_in_minutes: 180 + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results + - export CACHE_DIT_VERSION=1.3.0 + - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json + - buildkite-agent artifact upload "tests/dfx/perf/results/benchmark_results_*.json" + - buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 4 + volumeMounts: + - name: devshm + mountPath: 
/dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 15a7bba55d..62f6e4dceb 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -1,228 +1,199 @@ steps: - - label: ":full_moon: Omni Model Test with H100" - timeout_in_minutes: 90 - depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - | - pytest -s -v \ - tests/examples/ \ - tests/e2e/online_serving/test_*_expansion.py \ - -m "advanced_model and H100 and omni" --run-level "advanced_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: + # Group: collapses under one heading in the Buildkite UI; child steps still run in parallel. + - group: ":card_index_dividers: Omni Model Test" + key: nightly-omni-test-group + steps: + - label: ":full_moon: Omni · Function Test with H100" + timeout_in_minutes: 90 + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and H100 and omni" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: - name: devshm - mountPath: /dev/shm + emptyDir: + medium: Memory - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - - label: ":full_moon: Omni Model Test with L4" - timeout_in_minutes: 90 - depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - pytest -s -v tests/examples/ tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model" - agents: - queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU - plugins: - - 
docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - shm-size: "8gb" - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" + - label: ":full_moon: Omni · Function Test with L4" + timeout_in_minutes: 90 + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model" + agents: + queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" - - label: ":full_moon: Diffusion Model Test with H100" - timeout_in_minutes: 120 - depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion" -m "advanced_model and diffusion and H100" --run-level "advanced_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: + - label: ":full_moon: Omni · Doc Test with L4" + timeout_in_minutes: 90 + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" + - pytest -s -v tests/examples/ -m "advanced_model and omni and L4" --run-level "advanced_model" + agents: + queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + + - label: ":full_moon: Omni · Doc Test with H100" + timeout_in_minutes: 90 + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v tests/examples/ -m "advanced_model and omni and H100" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: - name: devshm - mountPath: /dev/shm + emptyDir: + medium: Memory - name: 
hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - - label: ":full_moon: Diffusion Model (Wan) Test with H100" - timeout_in_minutes: 90 - depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py -m "advanced_model" --run-level "advanced_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: + - label: ":full_moon: Omni · Perf Test" + key: nightly-omni-performance + timeout_in_minutes: 180 + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export BENCHMARK_DIR=tests/dfx/perf/results + - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" + - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py + - buildkite-agent artifact upload "tests/dfx/perf/results/*.json" + - buildkite-agent artifact upload "tests/dfx/perf/results/*.html" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: - name: devshm - mountPath: /dev/shm + emptyDir: + medium: Memory - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion Model Test" - timeout_in_minutes: 60 - depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" - agents: - queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - shm-size: "8gb" - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" + hostPath: + path: 
/mnt/hf-cache + type: DirectoryOrCreate - - - label: ":full_moon: Doc Example Code Test with H100" - timeout_in_minutes: 60 + # Dynamically appends steps from test-nightly-diffusion.yml into this build (same mechanism as + # pipeline.yml → test-ready.yml / test-merge.yml / test-nightly.yml). Foldable groups stay in the + # uploaded YAML (Other / Wan / Qwen-Image). + - label: ":card_index_dividers: Diffusion Model Test" + key: nightly-diffusion-model-test depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - - pytest -s -v tests/examples/online_serving/test_text_to_image.py tests/examples/offline_inference/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model" + - buildkite-agent pipeline upload .buildkite/test-nightly-diffusion.yml agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate + queue: "cpu_queue_premerge" - - label: ":full_moon: Omni Model Perf Test & Testcase Statistics with H100" - key: nightly-omni-performance - timeout_in_minutes: 180 + - label: ":bar_chart: Testcase Statistics" + key: nightly-testcase-statistics + timeout_in_minutes: 120 depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export BENCHMARK_DIR=tests/dfx/perf/results - - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py - - buildkite-agent artifact upload "tests/dfx/perf/results/*.json" - python tools/nightly/buildkite_testcase_statistics.py -o tests/dfx/perf/results/buildkite_testcase_statistics.html - buildkite-agent artifact upload "tests/dfx/perf/results/*.html" agents: @@ -259,189 +230,13 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - - label: ":full_moon: GEBench Accuracy Test with H100" - key: nightly-gebench-accuracy - timeout_in_minutes: 60 - depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1 - - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 1 - volumeMounts: - - name: devshm - mountPath: 
/dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: GEdit-Bench Accuracy Test with H100" - key: nightly-gedit-bench-accuracy - timeout_in_minutes: 60 - depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1 - - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv" - - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 1 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_HTTP_TIMEOUT_KEEP_ALIVE - value: "120" - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Wan22 I2V Accuracy Test with H100" - key: nightly-wan22-i2v-accuracy - timeout_in_minutes: 180 - depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level advanced_model - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion Perf Test with H100" - key: nightly-qwen-image-performance - timeout_in_minutes: 180 - depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results - - export 
CACHE_DIT_VERSION=1.3.0 - - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json - - buildkite-agent artifact upload "tests/dfx/perf/results/benchmark_results_*.json" - - buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - # No need to run this step for PRs with label nightly-test - label: ":email: Nightly Collection & Email" key: nightly-perf-distribution depends_on: - nightly-omni-performance - nightly-qwen-image-performance + - nightly-testcase-statistics if: build.env("NIGHTLY") == "1" commands: - pip install openpyxl diff --git a/tests/conftest.py b/tests/conftest.py index 8e9a7bf928..8ac790f137 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,6 +16,7 @@ os.environ["VLLM_TARGET_DEVICE"] = "cpu" import concurrent.futures +import contextlib import gc import multiprocessing import socket @@ -52,6 +53,7 @@ logger = init_logger(__name__) + PromptAudioInput = list[tuple[Any, int]] | tuple[Any, int] | None PromptImageInput = list[Any] | Any | None PromptVideoInput = list[Any] | Any | None @@ -337,10 +339,10 @@ def log_test_name_before_test(request): def _run_pre_test_cleanup(enable_force=False): if os.getenv("VLLM_TEST_CLEAN_GPU_MEMORY", "0") != "1" and not enable_force: - print("GPU cleanup disabled") + print("\nPre-test GPU cleanup skipped(Default off is typical when one worker/instance runs many tests.)\n") return - print("Pre-test GPU status:") + print("\nPre-test GPU status:") num_gpus = torch.cuda.device_count() if num_gpus > 0: @@ -1087,6 +1089,22 @@ def _merge_base64_audio_to_segment(base64_list: list[str]): return merged +@contextlib.contextmanager +def _serialize_whisper_small_model_download(): + """Serialize Whisper ``small`` cache writes across processes (Linux; ``fcntl``).""" + import fcntl + + lock_path = Path.home() / ".cache" / "whisper" / ".small_model_download.lock" + lock_path.parent.mkdir(parents=True, exist_ok=True) + f = open(lock_path, "a+b") + try: + fcntl.flock(f.fileno(), fcntl.LOCK_EX) + yield + finally: + fcntl.flock(f.fileno(), fcntl.LOCK_UN) + f.close() + + def _whisper_transcribe_in_current_process(output_path: str) -> str: import whisper @@ -1107,7 +1125,8 @@ def _whisper_transcribe_in_current_process(output_path: str) -> str: else: use_accelerator = False device = "cpu" - model = whisper.load_model("small", device=device) + with _serialize_whisper_small_model_download(): + model = whisper.load_model("small", device=device) try: text = model.transcribe( output_path, @@ -1126,8 +1145,7 @@ def _whisper_transcribe_in_current_process(output_path: str) -> str: def convert_audio_file_to_text(output_path: str) -> str: - """Convert an audio file to text in an isolated subprocess.""" - # Import locally to avoid impacting test module import time. 
+ """Convert an audio file to text in an isolated subprocess (spawn).""" ctx = multiprocessing.get_context("spawn") with concurrent.futures.ProcessPoolExecutor(max_workers=1, mp_context=ctx) as executor: future = executor.submit(_whisper_transcribe_in_current_process, output_path) From ed7a448cd39a14a45991ed2d4200b5ccedf4fc8e Mon Sep 17 00:00:00 2001 From: pjh4993 Date: Thu, 9 Apr 2026 15:41:23 +0900 Subject: [PATCH 097/204] [Bugfix] Guard app.state access during server shutdown (#2587) Signed-off-by: pjh4993 --- vllm_omni/entrypoints/openai/api_server.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 25817d6a79..d445ad0eca 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -353,7 +353,9 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None, try: await shutdown_task finally: - app.state.openai_serving_speech.shutdown() + serving_speech = getattr(getattr(app, "state", None), "openai_serving_speech", None) + if serving_speech is not None: + serving_speech.shutdown() sock.close() From 9d87229a30bbd8a4d34f814a563f88f53b277c5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Thu, 9 Apr 2026 14:52:12 +0800 Subject: [PATCH 098/204] [MagiHuman] Fix audio sample rate and fps propagation for online serving (#2554) Signed-off-by: princepride --- .../image_to_video/image_to_video.py | 47 +++++------ .../offline_inference/magi_human/end2end.py | 13 ++- .../text_to_video/text_to_video.py | 51 +++++------ .../e2e/offline_inference/test_magi_human.py | 25 ++++-- vllm_omni/diffusion/diffusion_engine.py | 12 +++ .../models/magi_human/pipeline_magi_human.py | 7 +- vllm_omni/entrypoints/openai/serving_video.py | 42 +++++++++- .../entrypoints/openai/video_api_utils.py | 84 +++++-------------- 8 files changed, 150 insertions(+), 131 deletions(-) diff --git a/examples/offline_inference/image_to_video/image_to_video.py b/examples/offline_inference/image_to_video/image_to_video.py index c8c55c485a..7e7cfbf84e 100644 --- a/examples/offline_inference/image_to_video/image_to_video.py +++ b/examples/offline_inference/image_to_video/image_to_video.py @@ -146,7 +146,7 @@ def parse_args() -> argparse.Namespace: "--audio-sample-rate", type=int, default=24000, - help="Sample rate for audio output when saved (default: 24000 for LTX2).", + help="Sample rate for audio output when saved (default: 24000).", ) parser.add_argument( "--cache-backend", @@ -471,15 +471,9 @@ def _ensure_frame_list(video_array): video_array = _ensure_frame_list(video_array) - use_ltx2_export = is_ltx2 - encode_video = None - if use_ltx2_export: - try: - from diffusers.pipelines.ltx2.export_utils import encode_video - except ImportError: - encode_video = None + if audio is not None: + from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes - if use_ltx2_export and encode_video is not None: if isinstance(video_array, list): frames_np = np.stack(video_array, axis=0) elif isinstance(video_array, np.ndarray): @@ -490,25 +484,24 @@ def _ensure_frame_list(video_array): if frames_np.ndim == 4 and frames_np.shape[-1] == 4: frames_np = frames_np[..., :3] - audio_out = None - if audio is not None: - if isinstance(audio, list): - audio = audio[0] if audio else None - if isinstance(audio, np.ndarray): - audio = torch.from_numpy(audio) - if isinstance(audio, torch.Tensor): - audio_out = audio - if audio_out.dim() > 1: - 
audio_out = audio_out[0] - audio_out = audio_out.float().cpu() - - encode_video( - frames_np, - fps=fps, - audio=audio_out, - audio_sample_rate=args.audio_sample_rate if audio_out is not None else None, - output_path=str(output_path), + frames_u8 = (np.clip(frames_np, 0.0, 1.0) * 255).round().clip(0, 255).astype("uint8") + + audio_np = audio + if isinstance(audio_np, list): + audio_np = audio_np[0] if audio_np else None + if isinstance(audio_np, torch.Tensor): + audio_np = audio_np.detach().cpu().float().numpy() + if isinstance(audio_np, np.ndarray): + audio_np = np.squeeze(audio_np).astype(np.float32) + + video_bytes = mux_video_audio_bytes( + frames_u8, + audio_np, + fps=float(fps), + audio_sample_rate=args.audio_sample_rate, ) + with open(str(output_path), "wb") as f: + f.write(video_bytes) else: export_to_video(video_array, str(output_path), fps=fps) print(f"Saved generated video to {output_path}") diff --git a/examples/offline_inference/magi_human/end2end.py b/examples/offline_inference/magi_human/end2end.py index 39451ccc44..64f11c4658 100644 --- a/examples/offline_inference/magi_human/end2end.py +++ b/examples/offline_inference/magi_human/end2end.py @@ -94,16 +94,21 @@ def main(): print(f"Video frames: shape={video_frames.shape}, dtype={video_frames.dtype}") audio_waveform = None - if hasattr(first, "multimodal_output") and first.multimodal_output: - audio_waveform = first.multimodal_output.get("audio") + mm = first.multimodal_output or {} + if mm: + audio_waveform = mm.get("audio") if audio_waveform is not None: print(f"Audio waveform: shape={audio_waveform.shape}, dtype={audio_waveform.dtype}") + output_fps = float(mm.get("fps", 25)) + output_sr = int(mm.get("audio_sample_rate", 24000)) + print(f"Using fps={output_fps}, audio_sample_rate={output_sr} from model output") + video_bytes = mux_video_audio_bytes( video_frames, audio_waveform, - fps=25.0, - audio_sample_rate=44100, + fps=output_fps, + audio_sample_rate=output_sr, ) with open(args.output, "wb") as f: f.write(video_bytes) diff --git a/examples/offline_inference/text_to_video/text_to_video.py b/examples/offline_inference/text_to_video/text_to_video.py index 322911c993..83925cc458 100644 --- a/examples/offline_inference/text_to_video/text_to_video.py +++ b/examples/offline_inference/text_to_video/text_to_video.py @@ -160,7 +160,7 @@ def parse_args() -> argparse.Namespace: "--audio-sample-rate", type=int, default=24000, - help="Sample rate for audio output when saved (default: 24000 for LTX2).", + help="Sample rate for audio output when saved (default: 24000).", ) parser.add_argument( "--vae-patch-parallel-size", @@ -430,17 +430,8 @@ def _ensure_frame_list(video_array): video_array = _ensure_frame_list(video_array) - use_ltx2_export = False - if args.model and "ltx" in str(args.model).lower(): - use_ltx2_export = True if audio is not None: - use_ltx2_export = True - - if use_ltx2_export: - try: - from diffusers.pipelines.ltx2.export_utils import encode_video - except ImportError: - raise ImportError("diffusers is required for LTX2 encode_video.") + from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes if isinstance(video_array, list): frames_np = np.stack(video_array, axis=0) @@ -449,28 +440,24 @@ def _ensure_frame_list(video_array): else: frames_np = np.asarray(video_array) - frames_u8 = (frames_np * 255).round().clip(0, 255).astype("uint8") - video_tensor = torch.from_numpy(frames_u8) - - audio_out = None - if audio is not None: - if isinstance(audio, list): - audio = audio[0] if audio else None - if 
isinstance(audio, np.ndarray): - audio = torch.from_numpy(audio) - if isinstance(audio, torch.Tensor): - audio_out = audio - if audio_out.dim() > 1: - audio_out = audio_out[0] - audio_out = audio_out.float().cpu() - - encode_video( - video_tensor, - fps=args.fps, - audio=audio_out, - audio_sample_rate=args.audio_sample_rate if audio_out is not None else None, - output_path=str(output_path), + frames_u8 = (np.clip(frames_np, 0.0, 1.0) * 255).round().clip(0, 255).astype("uint8") + + audio_np = audio + if isinstance(audio_np, list): + audio_np = audio_np[0] if audio_np else None + if isinstance(audio_np, torch.Tensor): + audio_np = audio_np.detach().cpu().float().numpy() + if isinstance(audio_np, np.ndarray): + audio_np = np.squeeze(audio_np).astype(np.float32) + + video_bytes = mux_video_audio_bytes( + frames_u8, + audio_np, + fps=float(args.fps), + audio_sample_rate=args.audio_sample_rate, ) + with open(str(output_path), "wb") as f: + f.write(video_bytes) else: export_to_video(video_array, str(output_path), fps=args.fps) print(f"Saved generated video to {output_path}") diff --git a/tests/e2e/offline_inference/test_magi_human.py b/tests/e2e/offline_inference/test_magi_human.py index cb711edb57..8648216a92 100644 --- a/tests/e2e/offline_inference/test_magi_human.py +++ b/tests/e2e/offline_inference/test_magi_human.py @@ -110,16 +110,31 @@ def test_magi_human_e2e(run_level): assert isinstance(video_frames, np.ndarray), f"Expected numpy array, got {type(video_frames)}" assert video_frames.ndim == 4, f"Expected 4D array (T,H,W,3), got shape {video_frames.shape}" - audio_waveform = None - if hasattr(first, "multimodal_output") and first.multimodal_output: - audio_waveform = first.multimodal_output.get("audio") + mm = first.multimodal_output + assert mm, "multimodal_output is empty or missing" + + audio_waveform = mm.get("audio") assert audio_waveform is not None, "No audio waveform in multimodal_output" + audio_sample_rate = mm.get("audio_sample_rate") + assert audio_sample_rate is not None, ( + "audio_sample_rate not found in multimodal_output; model post-process must propagate it" + ) + assert isinstance(audio_sample_rate, (int, float)), ( + f"audio_sample_rate should be numeric, got {type(audio_sample_rate)}" + ) + assert int(audio_sample_rate) > 0, f"audio_sample_rate must be positive, got {audio_sample_rate}" + + fps = mm.get("fps") + assert fps is not None, "fps not found in multimodal_output; model post-process must propagate it" + assert isinstance(fps, (int, float)), f"fps should be numeric, got {type(fps)}" + assert int(fps) > 0, f"fps must be positive, got {fps}" + video_bytes = mux_video_audio_bytes( video_frames, audio_waveform, - fps=25.0, - audio_sample_rate=44100, + fps=float(fps), + audio_sample_rate=int(audio_sample_rate), ) assert isinstance(video_bytes, bytes), f"Expected MP4 bytes, got {type(video_bytes)}" assert len(video_bytes) > 1000, f"MP4 too small ({len(video_bytes)} bytes)" diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index 8d3c02b7ab..5b77c064f8 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -145,8 +145,12 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: postprocess_start_time = time.perf_counter() outputs = self.post_process_func(output_data) if self.post_process_func is not None else output_data audio_payload = None + model_audio_sample_rate = None + model_fps = None if isinstance(outputs, dict): audio_payload = outputs.get("audio") + 
model_audio_sample_rate = outputs.get("audio_sample_rate") + model_fps = outputs.get("fps") outputs = outputs.get("video", outputs) postprocess_time = time.perf_counter() - postprocess_start_time logger.info(f"Post-processing completed in {postprocess_time:.4f} seconds") @@ -202,6 +206,10 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: mm_output = {} if audio_payload is not None: mm_output["audio"] = audio_payload + if model_audio_sample_rate is not None: + mm_output["audio_sample_rate"] = model_audio_sample_rate + if model_fps is not None: + mm_output["fps"] = model_fps return [ OmniRequestOutput.from_diffusion( request_id=request_id, @@ -264,6 +272,10 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: if num_outputs == 1: sliced_audio = sliced_audio[0] mm_output["audio"] = sliced_audio + if model_audio_sample_rate is not None: + mm_output["audio_sample_rate"] = model_audio_sample_rate + if model_fps is not None: + mm_output["fps"] = model_fps results.append( OmniRequestOutput.from_diffusion( request_id=request_id, diff --git a/vllm_omni/diffusion/models/magi_human/pipeline_magi_human.py b/vllm_omni/diffusion/models/magi_human/pipeline_magi_human.py index 9e6efcad39..881c72edc6 100644 --- a/vllm_omni/diffusion/models/magi_human/pipeline_magi_human.py +++ b/vllm_omni/diffusion/models/magi_human/pipeline_magi_human.py @@ -1624,7 +1624,12 @@ def get_magi_human_post_process_func(*args, **kwargs): def post_process(output): if isinstance(output, tuple) and len(output) == 2: video, audio = output - return {"video": video, "audio": audio} + return { + "video": video, + "audio": audio, + "audio_sample_rate": 44100, + "fps": 25, + } return output return post_process diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py index 2987c81fba..bddfd48003 100644 --- a/vllm_omni/entrypoints/openai/serving_video.py +++ b/vllm_omni/entrypoints/openai/serving_video.py @@ -152,7 +152,7 @@ async def _run_and_extract( videos = self._extract_video_outputs(result) audios = self._extract_audio_outputs(result, expected_count=len(videos)) audio_sample_rate = self._resolve_audio_sample_rate(result) - output_fps = vp.fps or 24 + output_fps = vp.fps or self._resolve_fps(result) or 24 return videos, audios, audio_sample_rate, output_fps async def generate_videos( @@ -365,6 +365,46 @@ def _resolve_audio_sample_rate(self, result: Any) -> int: return 24000 + @staticmethod + def _resolve_fps(result: Any) -> int | None: + """Extract fps from multimodal_output if the model reported it.""" + multimodal_output = getattr(result, "multimodal_output", None) + if isinstance(multimodal_output, dict): + fps = multimodal_output.get("fps") + if fps is not None: + try: + fps_val = fps.item() if hasattr(fps, "item") else int(fps) + if fps_val > 0: + return fps_val + except (TypeError, ValueError): + pass + + request_output = getattr(result, "request_output", None) + if isinstance(request_output, dict): + mm = request_output.get("multimodal_output") or {} + if isinstance(mm, dict): + fps = mm.get("fps") + if fps is not None: + try: + fps_val = fps.item() if hasattr(fps, "item") else int(fps) + if fps_val > 0: + return fps_val + except (TypeError, ValueError): + pass + elif hasattr(request_output, "multimodal_output"): + mm = getattr(request_output, "multimodal_output", None) + if isinstance(mm, dict): + fps = mm.get("fps") + if fps is not None: + try: + fps_val = fps.item() if hasattr(fps, "item") else int(fps) + if fps_val > 0: + 
return fps_val + except (TypeError, ValueError): + pass + + return None + @classmethod def _extract_audio_sample_rate_from_result(cls, result: Any) -> int | None: multimodal_output = getattr(result, "multimodal_output", None) diff --git a/vllm_omni/entrypoints/openai/video_api_utils.py b/vllm_omni/entrypoints/openai/video_api_utils.py index 2ed1fd3de6..69178fb3d3 100644 --- a/vllm_omni/entrypoints/openai/video_api_utils.py +++ b/vllm_omni/entrypoints/openai/video_api_utils.py @@ -8,8 +8,6 @@ import base64 import binascii -import os -import tempfile from io import BytesIO from typing import Any @@ -160,7 +158,7 @@ def _normalize_frames(frames: list[Any]) -> list[np.ndarray]: def _coerce_video_to_frames(video: Any) -> list[np.ndarray]: - """Convert a video payload into a list of frames for export_to_video.""" + """Convert a video payload into a list of normalized float32 frames.""" if isinstance(video, torch.Tensor): video_array = _normalize_video_tensor(video) return list(video_array) @@ -186,81 +184,45 @@ def _coerce_video_to_frames(video: Any) -> list[np.ndarray]: raise ValueError(f"Unsupported video payload type: {type(video)}") -def _coerce_audio_to_waveform(audio: Any) -> torch.Tensor: - """Convert an audio payload into a 2-channel CPU float tensor for LTX2 export.""" +def _coerce_audio_to_numpy(audio: Any) -> np.ndarray: + """Convert an audio payload into a float32 numpy array for muxing.""" if isinstance(audio, torch.Tensor): - waveform = audio.detach().cpu() + arr = audio.detach().cpu().float().numpy() elif isinstance(audio, np.ndarray): - waveform = torch.from_numpy(audio) + arr = audio elif isinstance(audio, list): - waveform = torch.tensor(audio) + arr = np.array(audio) else: raise ValueError(f"Unsupported audio payload type: {type(audio)}") - waveform = waveform.squeeze() - - if waveform.ndim == 0: + arr = np.squeeze(arr) + if arr.ndim == 0: raise ValueError("Audio payload must contain at least one sample.") - if waveform.ndim == 1: - waveform = waveform.unsqueeze(0) - elif waveform.ndim == 2: - if waveform.shape[0] in (1, 2): - pass - elif waveform.shape[1] in (1, 2): - waveform = waveform.transpose(0, 1) - else: - raise ValueError(f"Unsupported audio payload shape: {tuple(waveform.shape)}") - else: - raise ValueError(f"Unsupported audio payload rank: {waveform.ndim}") - - if waveform.shape[0] == 1: - waveform = waveform.repeat(2, 1) - elif waveform.shape[0] != 2: - raise ValueError(f"Expected mono or stereo audio, got shape {tuple(waveform.shape)}") - - return waveform.float().contiguous() + return arr.astype(np.float32) def _encode_video_bytes(video: Any, fps: int, audio: Any | None = None, audio_sample_rate: int | None = None) -> bytes: """Encode a video payload into MP4 bytes, optionally muxing audio.""" - try: - from diffusers.utils import export_to_video - except ImportError as exc: # pragma: no cover - optional dependency - raise ImportError("diffusers is required for export_to_video.") from exc + from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes frames = _coerce_video_to_frames(video) if not frames: raise ValueError("No frames found to encode.") - tmp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) - tmp_file.close() - try: - if audio is not None: - from diffusers.pipelines.ltx2.export_utils import encode_video as encode_ltx2_video - - frames_np = np.stack(frames, axis=0) - if frames_np.ndim == 4 and frames_np.shape[-1] == 4: - frames_np = frames_np[..., :3] - frames_np = np.clip(frames_np, 0.0, 1.0) - frames_u8 = (frames_np * 
255).round().clip(0, 255).astype("uint8") - video_tensor = torch.from_numpy(frames_u8) - encode_ltx2_video( - video_tensor, - fps=fps, - audio=_coerce_audio_to_waveform(audio), - audio_sample_rate=audio_sample_rate, - output_path=tmp_file.name, - ) - else: - export_to_video(frames, tmp_file.name, fps=fps) - with open(tmp_file.name, "rb") as f: - return f.read() - finally: - try: - os.remove(tmp_file.name) - except OSError: - pass + frames_np = np.stack(frames, axis=0) + if frames_np.ndim == 4 and frames_np.shape[-1] == 4: + frames_np = frames_np[..., :3] + frames_u8 = (np.clip(frames_np, 0.0, 1.0) * 255).round().clip(0, 255).astype(np.uint8) + + audio_np = _coerce_audio_to_numpy(audio) if audio is not None else None + + return mux_video_audio_bytes( + frames_u8, + audio_np, + fps=float(fps), + audio_sample_rate=audio_sample_rate or 24000, + ) def encode_video_base64(video: Any, fps: int, audio: Any | None = None, audio_sample_rate: int | None = None) -> str: From 92c788e5fd77b6cb03f9aa23b5f796e9e7c575e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zhengyuan=20Su=20=28=E8=8B=8F=E6=94=BF=E6=B8=8A=29?= Date: Thu, 9 Apr 2026 14:55:15 +0800 Subject: [PATCH 099/204] [Misc] Clean up method name in BAGEL. (#2501) Signed-off-by: Zhengyuan Su Co-authored-by: Claude Opus 4.6 (1M context) --- .../models/bagel/test_trajectory_recording.py | 8 ++++---- vllm_omni/diffusion/cache/teacache/backend.py | 9 --------- .../cache/teacache/coefficient_estimator.py | 8 -------- .../models/bagel/bagel_transformer.py | 18 +++++++++--------- .../diffusion/models/bagel/pipeline_bagel.py | 8 ++++---- 5 files changed, 17 insertions(+), 34 deletions(-) diff --git a/tests/diffusion/models/bagel/test_trajectory_recording.py b/tests/diffusion/models/bagel/test_trajectory_recording.py index 7518388d28..80b3f9d9ba 100644 --- a/tests/diffusion/models/bagel/test_trajectory_recording.py +++ b/tests/diffusion/models/bagel/test_trajectory_recording.py @@ -24,15 +24,15 @@ def _make_mock_bagel(): - """Create a mock Bagel with _forward_flow returning constant velocity.""" + """Create a mock Bagel with forward returning constant velocity.""" mock = MagicMock(spec=Bagel) mock._sp_size = 1 - # _forward_flow returns a small constant velocity so x_t changes each step - def fake_forward_flow(self, x_t, **kwargs): + # forward returns a small constant velocity so x_t changes each step + def fake_forward(self, x_t, **kwargs): return torch.ones_like(x_t) * 0.1 - mock._forward_flow = types.MethodType(fake_forward_flow, mock) + mock.forward = types.MethodType(fake_forward, mock) # _merge_naive_caches is called in the batched CFG path mock._merge_naive_caches = types.MethodType(lambda self, caches: NaiveCache(1), mock) diff --git a/vllm_omni/diffusion/cache/teacache/backend.py b/vllm_omni/diffusion/cache/teacache/backend.py index a5087fe0c2..772dec7891 100644 --- a/vllm_omni/diffusion/cache/teacache/backend.py +++ b/vllm_omni/diffusion/cache/teacache/backend.py @@ -48,16 +48,7 @@ def enable_bagel_teacache(pipeline: Any, config: DiffusionCacheConfig) -> None: coefficients=config.coefficients, ) transformer = pipeline.bagel - original_forward_flow = transformer._forward_flow - - import types - - def forward_alias(self, *args, **kwargs): - return original_forward_flow(*args, **kwargs) - - transformer.forward = types.MethodType(forward_alias, transformer) apply_teacache_hook(transformer, teacache_config) - transformer._forward_flow = transformer.forward pipeline.transformer = transformer logger.info( diff --git 
a/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py b/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py index f3a278b217..5dd80718d1 100644 --- a/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py +++ b/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import types from typing import Any import numpy as np @@ -74,15 +73,8 @@ def get_transformer(pipeline: Any) -> tuple[Any, str]: @staticmethod def install_hook(transformer: Any, hook: DataCollectionHook) -> None: - original_forward_flow = transformer._forward_flow - - def forward_alias(self, *args, **kwargs): - return original_forward_flow(*args, **kwargs) - - transformer.forward = types.MethodType(forward_alias, transformer) registry = HookRegistry.get_or_create(transformer) registry.register_hook(hook._HOOK_NAME, hook) - transformer._forward_flow = transformer.forward class StableAudioAdapter: diff --git a/vllm_omni/diffusion/models/bagel/bagel_transformer.py b/vllm_omni/diffusion/models/bagel/bagel_transformer.py index a04ded3765..f848077568 100644 --- a/vllm_omni/diffusion/models/bagel/bagel_transformer.py +++ b/vllm_omni/diffusion/models/bagel/bagel_transformer.py @@ -1734,7 +1734,7 @@ def generate_image( packed_seqlens=packed_seqlens, ) - v_t = self._forward_flow_single_branch( + v_t = self.forward_single_branch( **common, packed_indexes=packed_indexes, packed_position_ids=packed_position_ids, @@ -1744,7 +1744,7 @@ def generate_image( ) if cfg_text_scale_ > 1.0: - cfg_text_v_t = self._forward_flow_single_branch( + cfg_text_v_t = self.forward_single_branch( **common, packed_indexes=cfg_text_packed_query_indexes, packed_position_ids=cfg_text_packed_position_ids, @@ -1754,7 +1754,7 @@ def generate_image( ) cfg_img_v_t = None if cfg_img_scale_ > 1.0: - cfg_img_v_t = self._forward_flow_single_branch( + cfg_img_v_t = self.forward_single_branch( **common, packed_indexes=cfg_img_packed_query_indexes, packed_position_ids=cfg_img_packed_position_ids, @@ -1790,7 +1790,7 @@ def generate_image( if use_sp: for i, t in enumerate(timesteps): timestep = torch.tensor([t] * x_t.shape[0], device=x_t.device) - v_t = self._forward_flow_single_branch( + v_t = self.forward_single_branch( x_t=x_t, timestep=timestep, packed_vae_token_indexes=packed_vae_token_indexes, @@ -1883,7 +1883,7 @@ def generate_image( else: cfg_text_scale_ = 1.0 cfg_img_scale_ = 1.0 - v_t = self._forward_flow( + v_t = self.forward( x_t=x_t, timestep=timestep, packed_vae_token_indexes=packed_vae_token_indexes, @@ -2019,7 +2019,7 @@ def _generate_image_parallel( if use_cfg_this_step: # CFG interval: each rank computes its own branch - local_v_t = self._forward_flow_single_branch( + local_v_t = self.forward_single_branch( x_t=x_t, timestep=timestep, packed_vae_token_indexes=packed_vae_token_indexes, @@ -2046,7 +2046,7 @@ def _generate_image_parallel( ) else: # Outside CFG interval: all ranks compute with gen inputs, no comm - v_t = self._forward_flow_single_branch( + v_t = self.forward_single_branch( x_t=x_t, timestep=timestep, packed_vae_token_indexes=packed_vae_token_indexes, @@ -2128,7 +2128,7 @@ def _combine_cfg( return v_t - def _forward_flow_single_branch( + def forward_single_branch( self, x_t: torch.Tensor, timestep: torch.LongTensor, @@ -2258,7 +2258,7 @@ def _forward_flow_single_branch( v_t = v_t[packed_vae_token_indexes] return v_t - def _forward_flow( + def forward( self, x_t: torch.Tensor, timestep: torch.LongTensor, 
diff --git a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py index 2c72d98908..13d0cc2093 100644 --- a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py +++ b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py @@ -159,8 +159,8 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = ""): self.od_config = od_config self.device = get_local_device() - self._scheduler: object | None = None - self._scheduler_kwargs: dict = {} + self.scheduler: object | None = None + self.scheduler_kwargs: dict = {} model = od_config.model local_files_only = os.path.exists(model) @@ -654,8 +654,8 @@ def vae_transforms(img): cfg_img_key_values_lens=generation_input_cfg_img["cfg_key_values_lens"], cfg_img_packed_key_value_indexes=generation_input_cfg_img["cfg_packed_key_value_indexes"], return_trajectory_latents=req.sampling_params.return_trajectory_latents, - scheduler=self._scheduler, - scheduler_kwargs=self._scheduler_kwargs, + scheduler=self.scheduler, + scheduler_kwargs=self.scheduler_kwargs, ) img = self._decode_image_from_latent(self.bagel, self.vae, latents[0], image_shape) From 0e8e630c5b183bcdca74194bb07f2016b7cad3aa Mon Sep 17 00:00:00 2001 From: Ting FU Date: Thu, 9 Apr 2026 15:08:08 +0800 Subject: [PATCH 100/204] [Feat] /v1/images/generations api supports request cancel (#2621) Signed-off-by: Semmer --- vllm_omni/entrypoints/openai/api_server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index d445ad0eca..0706b98987 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1282,7 +1282,8 @@ async def show_available_models(raw_request: Request) -> JSONResponse: HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse}, }, ) -async def generate_images(request: ImageGenerationRequest, raw_request: Request) -> ImageGenerationResponse: +@with_cancellation +async def generate_images(request: ImageGenerationRequest, raw_request: Request): """Generate images from text prompts using diffusion models. OpenAI DALL-E compatible endpoint for text-to-image generation. From 9225039d170607954e23ff32153bd0121ba3ce57 Mon Sep 17 00:00:00 2001 From: LiBai <91311486+RGB-loop@users.noreply.github.com> Date: Thu, 9 Apr 2026 15:35:22 +0800 Subject: [PATCH 101/204] [Bug] Lazy-import entrypoints to fix subprocess pynvml crash (#2187) Signed-off-by: Meng Jianwen --- vllm_omni/__init__.py | 18 +++++++++++++++++- vllm_omni/entrypoints/__init__.py | 17 +++++++++++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/vllm_omni/__init__.py b/vllm_omni/__init__.py index b093272d2f..cec8b0af7e 100644 --- a/vllm_omni/__init__.py +++ b/vllm_omni/__init__.py @@ -24,11 +24,27 @@ from vllm_omni.transformers_utils import configs as _configs # noqa: F401, E402 from .config import OmniModelConfig -from .entrypoints import AsyncOmni, Omni from .version import __version__, __version_tuple__ # isort:skip +def __getattr__(name: str): + # Lazy import for AsyncOmni and Omni to avoid pulling in heavy + # dependencies (vllm model_loader → fused_moe → pynvml) at package + # import time. This prevents crashes in lightweight subprocesses + # (e.g. model-architecture inspection) that lack a CUDA context. 
+ # See: https://github.com/vllm-project/vllm-omni/issues/1793 + if name == "AsyncOmni": + from .entrypoints.async_omni import AsyncOmni + + return AsyncOmni + if name == "Omni": + from .entrypoints.omni import Omni + + return Omni + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + __all__ = [ "__version__", "__version_tuple__", diff --git a/vllm_omni/entrypoints/__init__.py b/vllm_omni/entrypoints/__init__.py index 7b09adf939..b273929a8e 100644 --- a/vllm_omni/entrypoints/__init__.py +++ b/vllm_omni/entrypoints/__init__.py @@ -5,8 +5,21 @@ vLLM-Omni entrypoints module. """ -from vllm_omni.entrypoints.async_omni import AsyncOmni -from vllm_omni.entrypoints.omni import Omni + +def __getattr__(name: str): + # Lazy imports to avoid eagerly loading heavy modules (engine, + # model_loader, pynvml) when the package is imported in lightweight + # contexts such as model-architecture inspection subprocesses. + if name == "AsyncOmni": + from vllm_omni.entrypoints.async_omni import AsyncOmni + + return AsyncOmni + if name == "Omni": + from vllm_omni.entrypoints.omni import Omni + + return Omni + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + __all__ = [ "AsyncOmni", From a7bf4050deabc39a12642ab22117bca23f8fc596 Mon Sep 17 00:00:00 2001 From: Samit <285365963@qq.com> Date: Thu, 9 Apr 2026 15:54:23 +0800 Subject: [PATCH 102/204] [Docs] Add multi-thread weight loading documentation (#2445) Signed-off-by: samithuang <285365963@qq.com> --- docs/user_guide/diffusion_features.md | 65 ++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 2f04e35687..c09705ae05 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -12,7 +12,7 @@ vLLM-Omni supports various advanced features for diffusion models: -- Acceleration: **cache methods**, **parallelism methods** +- Acceleration: **cache methods**, **parallelism methods**, **startup optimizations** - Memory optimization: **cpu offloading**, **quantization** - Extensions: **LoRA inference** - Execution modes: **step execution** @@ -44,6 +44,12 @@ Parallelism methods distribute computation across GPUs without quality loss (mat | **[HSDP](diffusion/parallelism/hsdp.md)** | Weight sharding via FSDP2, redistributed on-demand at runtime | Very large models (14B+) on limited VRAM, combinable with SP | | **[Expert Parallelism](diffusion/parallelism/expert_parallel.md)** | Shards MoE expert MLP blocks across devices | MoE diffusion models (e.g., HunyuanImage3.0) | +#### Startup Optimization + +| Method | Description | Best For | +|--------|-------------|----------| +| **[Multi-Thread Weight Loading](#multi-thread-weight-loading)** | Loads safetensors shards in parallel using a thread pool | All diffusion models; reduces startup from minutes to seconds | + **Note:** Some acceleration methods can be combined together for optimized performance. See [Feature Compatibility Table](#feature-compatibility) and [Feature Compatibility Tutorial](feature_compatibility.md) for detailed configuration examples. ### Memory Optimization @@ -179,6 +185,59 @@ The following tables show which models support each feature: 6. Step Execution is not compatible with cache backends (TeaCache, Cache-DiT) or LoRA. +## Multi-Thread Weight Loading + +Large diffusion models can take several minutes to load weights at startup (e.g., ~3 min for Qwen-Image, ~5 min for Wan2.2 I2V 14B). 
Multi-thread weight loading speeds up this process by loading safetensors shards in parallel using a thread pool instead of sequentially. + +This optimization is **enabled by default** with 4 threads. No configuration is needed for the default behavior. + +### Configuration + +| Parameter | CLI Flag | Default | Description | +|-----------|----------|---------|-------------| +| `enable_multithread_weight_load` | `--disable-multithread-weight-load` | `True` (enabled) | Pass the flag to disable multi-thread loading | +| `num_weight_load_threads` | `--num-weight-load-threads` | `4` | Number of threads for parallel weight loading | + +!!! tip + The default of 4 threads balances speed and disk I/O contention. On fast NVMe storage you may benefit from more threads (e.g., 8). On HDD or network storage, the default of 4 avoids saturating I/O bandwidth. + +### Online Serving + +```bash +# Default (multi-thread enabled, 4 threads) +vllm serve Qwen/Qwen-Image --omni --port 8091 + +# Custom thread count +vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers --omni --num-weight-load-threads 8 + +# Disable multi-thread loading +vllm serve Qwen/Qwen-Image --omni --disable-multithread-weight-load +``` + +### Offline Inference + +```python +from vllm_omni import Omni + +# Default (multi-thread enabled, 4 threads) +omni = Omni(model="Qwen/Qwen-Image") + +# Custom thread count +omni = Omni( + model="Wan-AI/Wan2.2-I2V-A14B-Diffusers", + num_weight_load_threads=8, +) +``` + +### Benchmarks + +Measured on NVIDIA H800: + +| Model | Before | After | Speedup | +|-------|--------|-------|---------| +| **Qwen/Qwen-Image** (53.7 GiB) | 168s | 27s | **6.2x** | +| **Wan-AI/Wan2.2-I2V-A14B-Diffusers** (64.5 GiB) | 283s | 56s | **5.1x** | + ## Learn More **Cache Acceleration:** @@ -204,6 +263,10 @@ The following tables show which models support each feature: - **[Step Execution Guide](diffusion/step_execution.md)** - Per-step denoise execution with mid-request abort support +**Startup Optimization:** + +- **[Multi-Thread Weight Loading](#multi-thread-weight-loading)** - Speed up model startup by loading safetensors shards in parallel + **Advanced Topics:** - **[Feature Compatibility](feature_compatibility.md)** - How to combine multiple features for maximum performance From e2b0ee4b8723366c927d64bdc7e41e7e67cc9495 Mon Sep 17 00:00:00 2001 From: Dogeun Kim <82812668+DOGEUNNKIM@users.noreply.github.com> Date: Thu, 9 Apr 2026 17:37:34 +0900 Subject: [PATCH 103/204] [Model] Add Dynin-omni model in vllm-omni (#1759) Signed-off-by: kdg6245@snu.ac.kr Signed-off-by: Yejoon Lee (IPAI) Signed-off-by: aidas (arpa-kt) Signed-off-by: Dogeun Kim <82812668+DOGEUNNKIM@users.noreply.github.com> Co-authored-by: Yejoon Lee (IPAI) Co-authored-by: aidas (arpa-kt) --- docs/models/supported_models.md | 1 + .../offline_inference/dynin_omni/README.md | 110 ++ .../offline_inference/dynin_omni/end2end.py | 1448 +++++++++++++++ examples/online_serving/dynin_omni/README.md | 97 + ...letion_client_for_multimodal_generation.py | 342 ++++ .../e2e/offline_inference/test_dynin_omni.py | 419 +++++ .../test_dynin_omni_expansion.py | 160 ++ tests/e2e/stage_configs/dynin_omni_ci.yaml | 84 + .../models/dynin_omni/__init__.py | 59 + .../models/dynin_omni/dynin_omni.py | 744 ++++++++ .../models/dynin_omni/dynin_omni_common.py | 1241 +++++++++++++ .../dynin_omni/dynin_omni_token2audio.py | 274 +++ .../dynin_omni/dynin_omni_token2image.py | 150 ++ .../dynin_omni/dynin_omni_token2text.py | 1580 +++++++++++++++++ vllm_omni/model_executor/models/registry.py | 5 + 
.../stage_configs/dynin_omni.yaml | 80 +
 .../dynin_omni_multiconnector.yaml | 114 ++
 .../stage_input_processors/dynin_omni.py | 164 ++
 18 files changed, 7072 insertions(+)
 create mode 100644 examples/offline_inference/dynin_omni/README.md
 create mode 100644 examples/offline_inference/dynin_omni/end2end.py
 create mode 100644 examples/online_serving/dynin_omni/README.md
 create mode 100644 examples/online_serving/dynin_omni/openai_chat_completion_client_for_multimodal_generation.py
 create mode 100644 tests/e2e/offline_inference/test_dynin_omni.py
 create mode 100644 tests/e2e/online_serving/test_dynin_omni_expansion.py
 create mode 100644 tests/e2e/stage_configs/dynin_omni_ci.yaml
 create mode 100644 vllm_omni/model_executor/models/dynin_omni/__init__.py
 create mode 100644 vllm_omni/model_executor/models/dynin_omni/dynin_omni.py
 create mode 100644 vllm_omni/model_executor/models/dynin_omni/dynin_omni_common.py
 create mode 100644 vllm_omni/model_executor/models/dynin_omni/dynin_omni_token2audio.py
 create mode 100644 vllm_omni/model_executor/models/dynin_omni/dynin_omni_token2image.py
 create mode 100644 vllm_omni/model_executor/models/dynin_omni/dynin_omni_token2text.py
 create mode 100644 vllm_omni/model_executor/stage_configs/dynin_omni.yaml
 create mode 100644 vllm_omni/model_executor/stage_configs/dynin_omni_multiconnector.yaml
 create mode 100644 vllm_omni/model_executor/stage_input_processors/dynin_omni.py

diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 8eab20edc8..0f9c8fff60 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -60,5 +60,6 @@ th {
 | `HunyuanVideo15Pipeline` | HunyuanVideo-1.5-T2V | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_t2v`, `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v` | ✅︎ | ✅︎ | | |
 | `HunyuanVideo15ImageToVideoPipeline` | HunyuanVideo-1.5-I2V | `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-480p_i2v`, `hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_i2v` | ✅︎ | ✅︎ | | |
 | `VoxtralTTSForConditionalGeneration` | Voxtral TTS | `mistralai/Voxtral-4B-TTS-2603` | ✅︎ | ✅︎ | | |
+| `DyninOmniForConditionalGeneration` | Dynin-Omni | `snu-aidas/Dynin-Omni` | ✅︎ | | | |
 
 ✅︎ indicates the model is supported on that backend. Empty cells mean not listed as supported on that backend.
diff --git a/examples/offline_inference/dynin_omni/README.md b/examples/offline_inference/dynin_omni/README.md
new file mode 100644
index 0000000000..d28b360714
--- /dev/null
+++ b/examples/offline_inference/dynin_omni/README.md
@@ -0,0 +1,110 @@
+# Dynin-Omni Offline End2End Example
+
+This folder contains a unified offline inference entrypoint:
+
+- `end2end.py`
+
+## 1. Environment Setup
+
+Run from the repository root:
+
+```bash
+cd <path/to/vllm-omni>
+```
+
+If needed, install this repo in editable mode:
+
+```bash
+pip install -e .
+```
+
+## 2. Extra Dependencies (EMOVA)
+
+Install the following packages for EMOVA-related components:
+
+```bash
+pip install \
+    "phonemizer==3.3.0" \
+    "Unidecode==1.4.0" \
+    "hydra-core==1.3.2" \
+    "pytorch-lightning==1.1.0" \
+    "wget==3.2" \
+    "wrapt==2.1.1" \
+    "onnx==1.20.1" \
+    "frozendict==2.4.7" \
+    "inflect==7.5.0" \
+    "braceexpand==0.1.7" \
+    "webdataset==1.0.2" \
+    "torch-stft==0.1.4" \
+    "editdistance==0.8.1"
+```
+
+## 3. 
Hardware and VRAM Requirements
+
+This example uses a 3-stage pipeline on one GPU by default
+([`dynin_omni.yaml`](../../../vllm_omni/model_executor/stage_configs/dynin_omni.yaml)):
+
+- Stage-0 (`token2text`): `gpu_memory_utilization: 0.5`
+- Stage-1 (`token2image`): `gpu_memory_utilization: 0.1`
+- Stage-2 (`token2audio`): `gpu_memory_utilization: 0.1`
+
+### Requested GPU Memory Budget from `gpu_memory_utilization`
+
+| Stage | Utilization | A100 80GB | H200 141GB |
+| :-- | :-- | :-- | :-- |
+| Stage-0 (token2text) | 0.5 | ~40.0 GB | ~70.5 GB |
+| Stage-1 (token2image) | 0.1 | ~8.0 GB | ~14.1 GB |
+| Stage-2 (token2audio) | 0.1 | ~8.0 GB | ~14.1 GB |
+| Total requested budget | 0.7 | ~56.0 GB | ~98.7 GB |
+
+### Observed Runtime Signal (from a sample run)
+
+- Stage-0 reported: `Model loading took 15.12 GiB memory` (weight footprint only).
+- Stages 1 and 2 can still add runtime memory depending on the task path and backend allocations.
+- Keep extra headroom for CUDA/PyTorch overhead and temporary allocations.
+
+### GPU Compatibility
+
+- Confirmed target GPUs for this setup: **NVIDIA H200**, **NVIDIA A100**.
+- CI/e2e coverage in this repo also includes CUDA **L4** markers for Dynin tests.
+
+## 4. End2End Run Examples
+
+```bash
+# t2t
+python <repo_root>/examples/offline_inference/dynin_omni/end2end.py \
+    --task t2t --model snu-aidas/Dynin-Omni --text "<prompt>"
+
+# i2t
+python <repo_root>/examples/offline_inference/dynin_omni/end2end.py \
+    --task i2t --model snu-aidas/Dynin-Omni --image <image_path> --text "Please describe this image in detail."
+
+# s2t
+python <repo_root>/examples/offline_inference/dynin_omni/end2end.py \
+    --task s2t --model snu-aidas/Dynin-Omni --audio <audio_path> --text "Transcribe the given audio."
+
+# t2i
+python <repo_root>/examples/offline_inference/dynin_omni/end2end.py \
+    --task t2i --model snu-aidas/Dynin-Omni --text "<prompt>"
+
+# v2t
+python <repo_root>/examples/offline_inference/dynin_omni/end2end.py \
+    --task v2t --model snu-aidas/Dynin-Omni --video <video_path> --text "Describe this video in detail."
+
+# i2i
+python <repo_root>/examples/offline_inference/dynin_omni/end2end.py \
+    --task i2i --model snu-aidas/Dynin-Omni --image <image_path> --text "<edit instruction>"
+
+# t2s
+python <repo_root>/examples/offline_inference/dynin_omni/end2end.py \
+    --task t2s --model snu-aidas/Dynin-Omni --text "<text to speak>"
+```
+
+## 5. Notes
+
+- Outputs are saved under task-specific directories in `/tmp` by default.
+- You can override the output path with `--output-dir`.
+- If you want to force local config resolution, pass `--dynin-config-path <path_to_yaml>`.
+- If you see the warning
+  `max_num_batched_tokens (32768) exceeds max_num_seqs * max_model_len (4096)`,
+  reduce `max_num_batched_tokens` in the stage config (for example, `4096` in the CI config); see the illustrative snippet below.
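For reference, the kind of per-stage override described in the last note looks roughly like the sketch below. Only `gpu_memory_utilization`, `max_num_batched_tokens`, `max_num_seqs`, `max_model_len`, the stage names, and the quoted values come from the README text; the surrounding list and key layout is an illustrative assumption, so treat `vllm_omni/model_executor/stage_configs/dynin_omni.yaml` and `tests/e2e/stage_configs/dynin_omni_ci.yaml` from this patch as the authoritative schema.

```yaml
# Illustrative stage-config excerpt only (layout assumed; values quoted from the README above).
- stage: token2text              # stage-0
  gpu_memory_utilization: 0.5
  max_num_batched_tokens: 4096   # keep <= max_num_seqs * max_model_len to silence the warning
- stage: token2image             # stage-1
  gpu_memory_utilization: 0.1
- stage: token2audio             # stage-2
  gpu_memory_utilization: 0.1
```

Lowering `max_num_batched_tokens` only caps the scheduler's per-step token budget; it does not change the per-stage memory fractions listed above.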
diff --git a/examples/offline_inference/dynin_omni/end2end.py b/examples/offline_inference/dynin_omni/end2end.py new file mode 100644 index 0000000000..66047934d5 --- /dev/null +++ b/examples/offline_inference/dynin_omni/end2end.py @@ -0,0 +1,1448 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import argparse +import json +import os +import re +import sys +import time +import types +from importlib.machinery import ModuleSpec +from pathlib import Path +from typing import Any + +import numpy as np +import torch +from PIL import Image + +TASK_CHOICES = ("t2t", "t2i", "t2s", "i2i", "i2t", "s2t", "v2t") + +TASK_DEFAULT_RUNTIME = { + "t2t": ("mmu", "mmu", 0, "text"), + "t2i": ("t2i", "t2i_gen", 2, "image"), + "t2s": ("t2s_mmu_like", "t2s_gen", 1, "audio"), + "i2i": ("i2i", "i2i", 2, "image"), + "i2t": ("mmu", "mmu", 0, "text"), + "s2t": ("s2t", "s2t", 0, "text"), + "v2t": ("v2t", "v2t", 0, "text"), +} + +TASK_RUNTIME_FALLBACKS: dict[str, dict[str, Any]] = { + "t2t": { + "output_dir": "/tmp/dynin_end2end_outputs", + "prompt_max_text_len": 1024, + "max_new_tokens": 1024, + "steps": 1024, + "block_length": 16, + "temperature": 0.0, + "cfg_scale": 0.0, + }, + "t2i": { + "output_dir": "/tmp/dynin_t2i_outputs", + "prompt_max_text_len": 128, + "image_token_count": 1024, + "mask_token_id": 126336, + "codebook_size": 8192, + "timesteps": 20, + "guidance_scale": 3.5, + "temperature": 1.0, + }, + "i2i": { + "output_dir": "/tmp/dynin_i2i_outputs", + "prompt_max_text_len": 128, + "mask_token_id": 126336, + "codebook_size": 8192, + "timesteps": 64, + "guidance_scale": 3.5, + "temperature": 1.0, + "image_resolution": 336, + "use_train_i2i_prompt": True, + }, + "i2t": { + "output_dir": "/tmp/dynin_i2t_outputs", + "prompt_max_text_len": 128, + "max_new_tokens": 128, + "steps": 128, + "block_length": 2, + "temperature": 0.0, + "cfg_scale": 0.0, + "mask_token_id": 126336, + "codebook_size": 8192, + "image_resolution": 480, + "remasking": "low_confidence", + }, + "s2t": { + "output_dir": "/tmp/dynin_s2t_outputs", + "prompt_max_text_len": 1024, + "max_new_tokens": 128, + "steps": 128, + "block_length": 2, + "temperature": 0.0, + "cfg_scale": 0.0, + "mask_token_id": 126336, + "codebook_size": 8192, + "remasking": "low_confidence", + }, + "t2s": { + "output_dir": "/tmp/dynin_t2s_outputs", + "runtime_task": "t2s_mmu_like", + "prompting_task": "t2s_gen", + "prompt_max_text_len": 1024, + "t2s_token_length": 512, + "mask_token_id": 126336, + "codebook_size": 8192, + "audio_codebook_size": 4096, + "steps": 512, + "block_length": 128, + "temperature": 1.0, + "cfg_scale": 2.5, + "t2s_condition": "gender-female_emotion-neutral_speed-normal_pitch-normal", + }, + "v2t": { + "output_dir": "/tmp/dynin_v2t_outputs", + "prompt_max_text_len": 1024, + "max_new_tokens": 128, + "steps": 128, + "block_length": 2, + "temperature": 0.0, + "cfg_scale": 0.0, + "mask_token_id": 126336, + "codebook_size": 8192, + "image_resolution": 224, + "num_frames": 5, + "remasking": "low_confidence", + }, +} + +DEFAULT_I2T_QUESTION = "Please describe this image in detail." +DEFAULT_S2T_INSTRUCTION = "Transcribe the given audio." +DEFAULT_V2T_QUESTION = "Please provide a detailed description of the video." +DEFAULT_T2T_PROMPT = "Explain multimodal LLM inference in 3 sentences." +DEFAULT_T2S_INSTRUCTION = "Convert the given text into spoken audio." +DEFAULT_T2S_PROMPT = "Hello. This is a default text-to-speech sample." 
+ +DYNIN_SPECIAL_TOKENS = ( + "<|soi|>", + "<|eoi|>", + "<|sov|>", + "<|eov|>", + "<|t2i|>", + "<|mmu|>", + "<|t2v|>", + "<|v2v|>", + "<|lvg|>", + "<|i2i|>", + "<|ti2ti|>", + "<|v2t|>", + "<|v2s|>", + "<|s2t|>", + "<|t2s|>", + "<|s2s|>", + "<|soa|>", + "<|eoa|>", +) + + +def bootstrap_repo_path() -> Path: + repo_root = Path(__file__).resolve().parents[3] + repo_root_str = str(repo_root) + if repo_root_str not in sys.path: + sys.path.insert(0, repo_root_str) + return repo_root + + +def ensure_safe_import_for_vllm() -> None: + os.environ.setdefault("TRANSFORMERS_NO_TORCHVISION", "1") + try: + import torchvision # noqa: F401 + + return + except Exception: + pass + + import enum + + class _InterpolationMode(enum.Enum): + NEAREST = 0 + BILINEAR = 2 + BICUBIC = 3 + LANCZOS = 1 + HAMMING = 4 + BOX = 5 + + tv_mod = types.ModuleType("torchvision") + tv_mod.__dict__["__version__"] = "0.0-stub" + tv_mod.__spec__ = ModuleSpec(name="torchvision", loader=None) + transforms_mod = types.ModuleType("torchvision.transforms") + transforms_mod.__spec__ = ModuleSpec(name="torchvision.transforms", loader=None) + transforms_mod.InterpolationMode = _InterpolationMode + tv_mod.transforms = transforms_mod + sys.modules["torchvision"] = tv_mod + sys.modules["torchvision.transforms"] = transforms_mod + + +def sanitize_repo_id(repo_id: str) -> str: + return re.sub(r"[^a-zA-Z0-9._-]+", "_", repo_id) + + +def is_hf_repo_id(value: str) -> bool: + return isinstance(value, str) and value.count("/") == 1 and all(value.split("/", 1)) + + +def ensure_local_model_dir(model: str, cache_dir: Path, localize: bool) -> Path: + model_path = Path(model).expanduser() + if model_path.is_dir(): + return model_path.resolve() + if not localize: + return Path(model) + + from huggingface_hub import snapshot_download + + cache_dir.mkdir(parents=True, exist_ok=True) + os.environ.setdefault("HF_HOME", str(cache_dir / ".hf_home")) + local_dir = cache_dir / sanitize_repo_id(model) + if not local_dir.exists(): + print(f"[end2end] Downloading model into local cache: {local_dir}") + snapshot_download( + repo_id=model, + local_dir=str(local_dir), + local_dir_use_symlinks=True, + resume_download=True, + ) + return local_dir.resolve() + + +def resolve_local_only( + override: bool | None, + source: str, + default: bool, +) -> bool: + if override is not None: + return bool(override) + return default or Path(source).expanduser().is_dir() + + +def load_text_tokenizer(tokenizer_source: str, local_files_only: bool): + from transformers import AutoTokenizer + + kwargs = { + "trust_remote_code": True, + "padding_side": "left", + "local_files_only": bool(local_files_only), + } + try: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, **kwargs) + except TypeError: + kwargs.pop("local_files_only", None) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, **kwargs) + return tokenizer + + +def preprocess_image(image: Image.Image, resolution: int) -> torch.Tensor: + w, h = image.size + short_side = min(w, h) + scale = resolution / short_side + new_w, new_h = round(w * scale), round(h * scale) + image = image.resize((new_w, new_h), Image.BICUBIC) + left = (new_w - resolution) // 2 + top = (new_h - resolution) // 2 + image = image.crop((left, top, left + resolution, top + resolution)) + arr = np.array(image, dtype=np.float32) / 255.0 + tensor = torch.from_numpy(arr).permute(2, 0, 1) + return (tensor - 0.5) / 0.5 + + +def load_vq_image_encoder(source: str, local_files_only: bool, device: torch.device) -> Any: + from 
vllm_omni.model_executor.models.dynin_omni.dynin_omni_common import get_dynin_magvit_attr + + MAGVITv2 = get_dynin_magvit_attr("MAGVITv2", source=source, local_files_only=local_files_only) + vq_model = MAGVITv2.from_pretrained(source, local_files_only=local_files_only).to(device) + vq_model.requires_grad_(False) + vq_model.eval() + return vq_model + + +def encode_image_tokens( + image_path: Path, + vq_model: Any, + device: torch.device, + resolution: int, +) -> torch.Tensor: + image = Image.open(image_path).convert("RGB") + image_tensor = preprocess_image(image, resolution=resolution).unsqueeze(0).to(device) + with torch.no_grad(): + token_ids = vq_model.get_code(image_tensor) + token_ids = torch.as_tensor(token_ids, dtype=torch.long).detach().cpu() + if token_ids.ndim == 2 and token_ids.shape[0] == 1: + token_ids = token_ids[0] + return token_ids.contiguous() + + +def encode_video_tokens( + video_path: Path, + vq_model: Any, + device: torch.device, + resolution: int, + num_frames: int, +) -> torch.Tensor: + import cv2 + + cap = cv2.VideoCapture(str(video_path)) + frames: list[np.ndarray] = [] + while True: + ok, frame = cap.read() + if not ok: + break + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frames.append(frame) + cap.release() + if not frames: + raise ValueError(f"Video has no readable frames: {video_path}") + if len(frames) < num_frames: + raise ValueError(f"Video has {len(frames)} frames, requires >= {num_frames}: {video_path}") + + indices = np.linspace(0, len(frames) - 1, num_frames).astype(int) + token_list: list[torch.Tensor] = [] + for idx in indices: + pil = Image.fromarray(frames[int(idx)]) + frame_tensor = preprocess_image(pil, resolution=resolution).unsqueeze(0).to(device) + with torch.no_grad(): + token_list.append(torch.as_tensor(vq_model.get_code(frame_tensor), dtype=torch.long)) + merged = torch.cat(token_list, dim=1).detach().cpu() + if merged.ndim == 2 and merged.shape[0] == 1: + merged = merged[0] + return merged.contiguous() + + +def load_vq_audio_encoder(source: str, local_files_only: bool, device: torch.device) -> Any: + from transformers import AutoModel + + kwargs = { + "trust_remote_code": True, + "local_files_only": bool(local_files_only), + "low_cpu_mem_usage": False, + } + try: + model = AutoModel.from_pretrained(source, **kwargs) + except TypeError: + kwargs.pop("low_cpu_mem_usage", None) + try: + model = AutoModel.from_pretrained(source, **kwargs) + except TypeError: + kwargs.pop("local_files_only", None) + model = AutoModel.from_pretrained(source, **kwargs) + model.requires_grad_(False) + model.eval() + if hasattr(model, "to"): + model = model.to(device) + return model + + +def encode_audio_tokens(audio_path: Path, vq_audio_model: Any) -> torch.Tensor: + encoded = vq_audio_model.encode(str(audio_path)) + if isinstance(encoded, dict): + for key in ("input_ids", "token_ids", "codes", "tokens"): + if key in encoded: + encoded = encoded[key] + break + encoded = torch.as_tensor(encoded, dtype=torch.long).detach().cpu() + if encoded.ndim == 1: + encoded = encoded.unsqueeze(0) + elif encoded.ndim > 2: + encoded = encoded.view(encoded.shape[0], -1) + return encoded.contiguous() + + +def build_chat_prompt(content: str) -> str: + return ( + f"<|start_header_id|>user<|end_header_id|>\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" + ) + + +def resolve_task_text( + *, + task_name: str, + text: str, + instruction: str = "", + raw_prompt: bool = False, +) -> str: + text = str(text or "").strip() + + if task_name == "t2t" and not text: + 
return DEFAULT_T2T_PROMPT + if task_name == "i2t" and not text: + return DEFAULT_I2T_QUESTION + if task_name == "s2t" and not text: + return DEFAULT_S2T_INSTRUCTION + if task_name == "v2t" and not text: + return DEFAULT_V2T_QUESTION + if task_name in {"t2i", "i2i"} and not text: + return "A high quality detailed image." + + if task_name != "t2s": + return text + + if not text: + text = DEFAULT_T2S_PROMPT + + if raw_prompt: + return text + + instruction = str(instruction or "").strip() or DEFAULT_T2S_INSTRUCTION + return build_chat_prompt(f"{instruction}\n{text}") + + +def load_universal_prompting( + *, + tokenizer: Any, + tokenizer_source: str, + max_text_len: int, + cond_dropout_prob: float, + local_files_only: bool, + max_audio_len: int = 512, + max_audio_len_short: int = 256, +) -> Any: + from vllm_omni.model_executor.models.dynin_omni.dynin_omni_common import ( + DYNIN_REMOTE_SETTINGS, + resolve_remote_attr, + ) + + UniversalPrompting = resolve_remote_attr( + "UniversalPrompting", + module_name="prompting_utils", + settings=DYNIN_REMOTE_SETTINGS, + source=tokenizer_source, + local_files_only=bool(local_files_only), + fallback_module_names=("modeling_dynin_omni",), + ) + init_kwargs: dict[str, Any] = { + "max_text_len": int(max_text_len), + "special_tokens": DYNIN_SPECIAL_TOKENS, + "ignore_id": -100, + "cond_dropout_prob": float(cond_dropout_prob), + "use_reserved_token": True, + "max_audio_len": int(max_audio_len), + "max_audio_len_short": int(max_audio_len_short), + } + try: + return UniversalPrompting(tokenizer, **init_kwargs) + except TypeError: + init_kwargs.pop("max_audio_len", None) + init_kwargs.pop("max_audio_len_short", None) + return UniversalPrompting(tokenizer, **init_kwargs) + + +def _runtime_fallback(task: str, key: str, value: Any) -> Any: + if isinstance(value, str): + if value.strip() != "": + return value + elif value is not None: + return value + return TASK_RUNTIME_FALLBACKS.get(task, {}).get(key) + + +def _validate_generation_args(*, task: str, max_new_tokens: int, steps: int, block_length: int) -> None: + # Keep i2t/v2t generation constraints aligned with i2t.py/v2t.py. 
+ if task not in {"i2t", "v2t"}: + return + if max_new_tokens <= 0: + raise ValueError(f"{task} requires max_new_tokens > 0.") + if block_length <= 0: + raise ValueError(f"{task} requires block_length > 0.") + if steps <= 0: + raise ValueError(f"{task} requires steps > 0.") + if max_new_tokens % block_length != 0: + raise ValueError(f"{task} requires max_new_tokens % block_length == 0, got {max_new_tokens} % {block_length}") + num_blocks = max_new_tokens // block_length + if num_blocks <= 0: + raise ValueError(f"{task} has invalid num_blocks.") + if steps % num_blocks != 0: + raise ValueError( + f"{task} requires steps % (max_new_tokens // block_length) == 0, " + f"got steps={steps}, max_new_tokens={max_new_tokens}, block_length={block_length}" + ) + + +def make_prompt_payload( + *, + task: str, + text: str, + image_tokens: torch.Tensor | None, + audio_tokens: torch.Tensor | None, + video_tokens: torch.Tensor | None, + image_placeholder_tokens: int, + audio_placeholder_tokens: int, + image_token_offset: int, + speech_token_offset: int, + mask_token_id: int, + use_train_i2i_prompt: bool, +) -> tuple[Any, str]: + runtime_task, prompting_task, _, _ = TASK_DEFAULT_RUNTIME[task] + del runtime_task + + if task == "t2t": + payload = ([[]], [build_chat_prompt(text)]) + return payload, prompting_task + + if task == "i2t": + if image_tokens is None: + raise ValueError("i2t requires image tokens") + img = image_tokens.view(-1).long() + int(image_token_offset) + payload = ([[img]], [build_chat_prompt(text)]) + return payload, prompting_task + + if task == "s2t": + if audio_tokens is None: + raise ValueError("s2t requires audio tokens") + aud = audio_tokens.long() + int(speech_token_offset) + if aud.ndim == 1: + aud = aud.unsqueeze(0) + payload = ([aud], [build_chat_prompt(text)]) + return payload, prompting_task + + if task == "v2t": + if video_tokens is None: + raise ValueError("v2t requires video tokens") + vid = video_tokens.view(-1).long() + int(image_token_offset) + payload = (vid.unsqueeze(0), [build_chat_prompt(text)]) + return payload, prompting_task + + if task == "t2i": + image_placeholder = torch.full( + (1, int(image_placeholder_tokens)), + fill_value=int(mask_token_id), + dtype=torch.long, + ) + payload = ([text], image_placeholder) + return payload, prompting_task + + if task == "i2i": + if image_tokens is None: + raise ValueError("i2i requires image tokens") + src = image_tokens.view(1, -1).long() + int(image_token_offset) + target_len = int(image_placeholder_tokens) if image_placeholder_tokens > 0 else int(src.shape[1]) + image_placeholder = torch.full( + (1, target_len), + fill_value=int(mask_token_id), + dtype=torch.long, + ) + if use_train_i2i_prompt: + labels_placeholder = torch.full( + (1, target_len), + fill_value=-100, + dtype=torch.long, + ) + payload = ([text], src, image_placeholder, labels_placeholder) + return payload, "i2i" + payload = ([text], src, image_placeholder) + return payload, "i2i_gen" + + if task == "t2s": + audio_placeholder = torch.full( + (1, int(audio_placeholder_tokens)), + fill_value=int(mask_token_id), + dtype=torch.long, + ) + payload = ([text], audio_placeholder) + return payload, prompting_task + + raise ValueError(f"Unsupported task: {task}") + + +def _to_1d_int_list(value: Any) -> list[int]: + if value is None: + return [] + if isinstance(value, torch.Tensor): + tensor = value.detach().to(device="cpu", dtype=torch.long) + else: + tensor = torch.as_tensor(value, dtype=torch.long) + if tensor.ndim == 0: + tensor = tensor.view(1) + elif tensor.ndim >= 
2: + tensor = tensor.view(tensor.shape[0], -1)[0] + return [int(v) for v in tensor.tolist()] + + +def _run_uni_prompting(uni_prompting: Any, payload: Any, prompting_task: str) -> tuple[list[int], list[int]]: + prepared = uni_prompting(payload, prompting_task) + if isinstance(prepared, tuple): + prepared_input_ids = prepared[0] if len(prepared) > 0 else None + prepared_attention_mask = prepared[1] if len(prepared) > 1 else None + else: + prepared_input_ids = prepared + prepared_attention_mask = None + + input_ids = _to_1d_int_list(prepared_input_ids) + attention_mask = _to_1d_int_list(prepared_attention_mask) + if not input_ids: + raise RuntimeError(f"UniversalPrompting returned empty input_ids for task={prompting_task}") + return input_ids, attention_mask + + +def _get_special_token_id(uni_prompting: Any, token: str) -> int: + sptids = getattr(uni_prompting, "sptids_dict", None) or {} + if token not in sptids: + raise KeyError(f"Special token not found in UniversalPrompting.sptids_dict: {token}") + token_ids = _to_1d_int_list(sptids[token]) + if not token_ids: + raise ValueError(f"Special token id is empty for token: {token}") + return int(token_ids[0]) + + +def _tokenize_chat_query(tokenizer: Any, text: str) -> list[int]: + encoded = tokenizer(build_chat_prompt(text), return_tensors="pt").input_ids[0] + token_ids = _to_1d_int_list(encoded) + if not token_ids: + raise RuntimeError("Failed to tokenize chat query text.") + return token_ids + + +def _flatten_media_token_ids_with_offset(token_ids: Any, token_offset: int) -> list[int]: + media_ids = token_ids + if isinstance(media_ids, torch.Tensor): + media_ids = media_ids.detach().cpu().reshape(-1).tolist() + else: + media_ids = np.asarray(media_ids).reshape(-1).tolist() + return [int(x) + int(token_offset) for x in media_ids] + + +def _scalar_token_id(value: Any) -> int: + if isinstance(value, torch.Tensor): + if value.numel() == 0: + raise ValueError("Empty special-token tensor.") + return int(value.view(-1)[0].item()) + if isinstance(value, (list, tuple)): + if not value: + raise ValueError("Empty special-token list.") + return int(value[0]) + return int(value) + + +def build_v2t_input_ids( + *, + video_token_ids: Any, + tokenizer: Any, + uni_prompting: Any, + question: str, + image_token_offset: int, +) -> tuple[list[int], str]: + media_ids = video_token_ids + if isinstance(media_ids, torch.Tensor): + media_ids = media_ids.detach().cpu().reshape(-1).tolist() + else: + media_ids = np.asarray(media_ids).reshape(-1).tolist() + media_ids = [int(x) + int(image_token_offset) for x in media_ids] + + sptids = uni_prompting.sptids_dict + task_id = _scalar_token_id(sptids["<|v2t|>"]) + soi_id = _scalar_token_id(sptids["<|soi|>"]) + eoi_id = _scalar_token_id(sptids["<|eoi|>"]) + sot_id = _scalar_token_id(sptids["<|sot|>"]) + + prompt_text = build_v2t_chat_prompt(question) + query_ids = tokenizer(prompt_text, return_tensors="pt").input_ids[0].detach().cpu().tolist() + input_ids = [task_id, soi_id] + media_ids + [eoi_id, sot_id] + [int(v) for v in query_ids] + return input_ids, prompt_text + + +def build_i2t_input_ids( + *, + image_token_ids: Any, + tokenizer: Any, + uni_prompting: Any, + question: str, + image_token_offset: int, +) -> tuple[list[int], str]: + image_ids = image_token_ids + if isinstance(image_ids, torch.Tensor): + image_ids = image_ids.detach().cpu().reshape(-1).tolist() + else: + image_ids = np.asarray(image_ids).reshape(-1).tolist() + image_ids = [int(x) + int(image_token_offset) for x in image_ids] + + sptids = 
uni_prompting.sptids_dict + task_id = _scalar_token_id(sptids["<|mmu|>"]) + soi_id = _scalar_token_id(sptids["<|soi|>"]) + eoi_id = _scalar_token_id(sptids["<|eoi|>"]) + sot_id = _scalar_token_id(sptids["<|sot|>"]) + + prompt_text = build_i2t_chat_prompt(question) + query_ids = tokenizer(prompt_text, return_tensors="pt").input_ids[0].detach().cpu().tolist() + input_ids = [task_id, soi_id] + image_ids + [eoi_id, sot_id] + [int(v) for v in query_ids] + return input_ids, prompt_text + + +def build_v2t_chat_prompt(question: str) -> str: + return ( + f"<|start_header_id|>user<|end_header_id|>\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" + ) + + +def build_i2t_chat_prompt(question: str) -> str: + return ( + f"<|start_header_id|>user<|end_header_id|>\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" + ) + + +def make_mmu_prompt( + *, + task: str, + text: str, + tokenizer: Any, + uni_prompting: Any, + image_tokens: torch.Tensor | None, + audio_tokens: torch.Tensor | None, + video_tokens: torch.Tensor | None, + image_token_offset: int, + speech_token_offset: int, +) -> tuple[list[int], list[int]]: + query_ids = _tokenize_chat_query(tokenizer, text) + + if task == "i2t": + token_ids, _ = build_i2t_input_ids( + image_token_ids=image_tokens, + tokenizer=tokenizer, + uni_prompting=uni_prompting, + question=text, + image_token_offset=int(image_token_offset), + ) + token_ids = [int(v) for v in token_ids] + return token_ids, [1] * len(token_ids) + + if task == "v2t": + token_ids, _ = build_v2t_input_ids( + video_token_ids=video_tokens, + tokenizer=tokenizer, + uni_prompting=uni_prompting, + question=text, + image_token_offset=int(image_token_offset), + ) + token_ids = [int(v) for v in token_ids] + return token_ids, [1] * len(token_ids) + + if task == "s2t": + if audio_tokens is None: + raise ValueError("s2t requires audio tokens") + audio_ids = _to_1d_int_list(audio_tokens.long() + int(speech_token_offset)) + token_ids = [ + _get_special_token_id(uni_prompting, "<|s2t|>"), + _get_special_token_id(uni_prompting, "<|soa|>"), + *audio_ids, + _get_special_token_id(uni_prompting, "<|eoa|>"), + *query_ids, + ] + return token_ids, [1] * len(token_ids) + + raise ValueError(f"Unsupported task for validation-style MMU prompt: {task}") + + +def iter_mm_outputs(outputs: list[Any]): + for omni_out in outputs: + req_out = getattr(omni_out, "request_output", None) + req_list = req_out if isinstance(req_out, list) else [req_out] + for item in req_list: + if item is None: + continue + mm_out = getattr(item, "multimodal_output", None) or {} + if mm_out: + yield mm_out + completions = getattr(item, "outputs", None) or [] + for completion in completions: + c_mm_out = getattr(completion, "multimodal_output", None) or {} + if c_mm_out: + yield c_mm_out + omni_mm = getattr(omni_out, "multimodal_output", None) or {} + if omni_mm: + yield omni_mm + + +def _to_token_list(value: Any) -> list[int]: + if value is None: + return [] + if hasattr(value, "detach"): + value = value.detach() + if hasattr(value, "cpu"): + value = value.cpu() + if hasattr(value, "flatten"): + value = value.flatten().tolist() + if isinstance(value, tuple): + value = list(value) + if not isinstance(value, list): + return [] + out: list[int] = [] + for token in value: + if isinstance(token, bool): + continue + try: + out.append(int(token)) + except Exception: + continue + return out + + +def extract_text_output(outputs: list[Any], tokenizer: Any) -> str: + for mm_out in iter_mm_outputs(outputs): + text = 
mm_out.get("text") + if isinstance(text, list) and text: + text = text[-1] + if isinstance(text, str) and text.strip(): + return text.strip() + for key in ("text_tokens", "token_ids"): + token_ids = _to_token_list(mm_out.get(key)) + if not token_ids: + continue + decoded = tokenizer.decode(token_ids, skip_special_tokens=True) + if isinstance(decoded, str) and decoded.strip(): + return decoded.strip() + return "" + + +def extract_image_output(outputs: list[Any]) -> torch.Tensor | None: + for mm_out in iter_mm_outputs(outputs): + image = mm_out.get("image") + if isinstance(image, list) and image: + image = image[-1] + if isinstance(image, torch.Tensor): + return image + return None + + +def tensor_to_pil_image(image: torch.Tensor) -> Image.Image: + arr = image.detach().cpu().numpy() + if arr.ndim == 4: + arr = arr[0] + if arr.ndim == 3 and arr.shape[0] in (1, 3, 4): + arr = np.transpose(arr, (1, 2, 0)) + if arr.dtype != np.uint8: + arr = arr.astype(np.float32) + if arr.max() <= 1.0: + arr = arr * 255.0 + arr = np.clip(arr, 0.0, 255.0).astype(np.uint8) + if arr.ndim == 3 and arr.shape[-1] == 1: + arr = arr[..., 0] + return Image.fromarray(arr) + + +def extract_audio_output(outputs: list[Any]) -> tuple[np.ndarray, int] | None: + for mm_out in iter_mm_outputs(outputs): + audio = mm_out.get("audio") + if audio is None: + audio = mm_out.get("speech") + if audio is None: + continue + + def _to_wav_array(value: Any) -> np.ndarray: + if isinstance(value, torch.Tensor): + return value.detach().cpu().numpy().reshape(-1).astype(np.float32) + return np.asarray(value).reshape(-1).astype(np.float32) + + if isinstance(audio, list): + chunks = [_to_wav_array(chunk) for chunk in audio] + wav = np.concatenate(chunks, axis=0) if chunks else np.zeros((0,), dtype=np.float32) + else: + wav = _to_wav_array(audio) + sr = mm_out.get("sr", 24000) + if hasattr(sr, "item"): + try: + sr = int(sr.item()) + except Exception: + sr = 24000 + elif isinstance(sr, list): + sr = int(sr[0]) if sr else 24000 + else: + sr = int(sr) + return wav, sr + return None + + +def save_audio_wav(path: Path, wav: np.ndarray, sr: int) -> None: + try: + import soundfile as sf + + sf.write(str(path), wav, int(sr), format="WAV") + except Exception: + from scipy.io import wavfile + + wav_i16 = np.clip(wav, -1.0, 1.0) + wav_i16 = (wav_i16 * 32767.0).astype(np.int16) + wavfile.write(str(path), int(sr), wav_i16) + + +def parse_args(repo_root: Path) -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Dynin-Omni unified offline end2end example.") + parser.add_argument("--task", type=str, required=True, choices=TASK_CHOICES) + parser.add_argument("--model", type=str, required=True, help="HF repo id or local model directory.") + parser.add_argument( + "--stage-config-path", + type=str, + default=str(repo_root / "vllm_omni/model_executor/stage_configs/dynin_omni.yaml"), + help="Path to stage config yaml.", + ) + parser.add_argument( + "--dynin-config-path", + type=str, + default="", + help="Path to DYNIN config yaml (passed through additional_information).", + ) + parser.add_argument( + "--model-cache-dir", + type=str, + default="/tmp/dynin_localized_models", + help="Cache directory used when --model is HF repo id.", + ) + parser.add_argument( + "--localize-model", + action=argparse.BooleanOptionalAction, + default=True, + help="If true and --model is HF repo id, snapshot it under --model-cache-dir.", + ) + parser.add_argument("--text", type=str, default="", help="Prompt/edit/question text.") + parser.add_argument("--instruction", 
type=str, default="", help="Optional extra instruction.") + parser.add_argument("--raw-prompt", action=argparse.BooleanOptionalAction, default=False) + parser.add_argument("--image", type=str, default="", help="Input image path for i2i/i2t.") + parser.add_argument("--audio", type=str, default="", help="Input audio path for s2t.") + parser.add_argument("--video", type=str, default="", help="Input video path for v2t.") + parser.add_argument("--image-resolution", type=int, default=None) + parser.add_argument("--num-frames", type=int, default=None) + parser.add_argument( + "--output-dir", + type=str, + default="", + help="Directory for generated outputs.", + ) + parser.add_argument("--output-prefix", type=str, default="") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--dtype", type=str, default="auto") + parser.add_argument("--max-tokens-per-stage", type=int, default=1) + + parser.add_argument("--runtime-task", type=str, default="", help="Override runtime task key.") + parser.add_argument("--prompting-task", type=str, default="", help="Override prompting task key.") + parser.add_argument("--detok-id", type=int, default=None, help="Override detok id.") + + parser.add_argument("--prompt-max-text-len", type=int, default=None) + parser.add_argument("--cond-dropout-prob", type=float, default=0.0) + parser.add_argument("--max-new-tokens", type=int, default=None) + parser.add_argument("--steps", type=int, default=None) + parser.add_argument("--block-length", type=int, default=None) + parser.add_argument("--temperature", type=float, default=None) + parser.add_argument("--cfg-scale", type=float, default=None) + parser.add_argument("--remasking", type=str, default="low_confidence") + + parser.add_argument("--timesteps", type=int, default=None) + parser.add_argument("--guidance-scale", type=float, default=None) + parser.add_argument("--noise-type", type=str, default="mask") + parser.add_argument("--noise-schedule-name", type=str, default="cosine") + parser.add_argument("--noise-schedule-params", type=str, default="{}") + + parser.add_argument("--mask-token-id", type=int, default=None) + parser.add_argument("--codebook-size", type=int, default=None) + parser.add_argument("--audio-codebook-size", type=int, default=None) + parser.add_argument("--image-token-count", type=int, default=None) + parser.add_argument("--t2s-token-length", type=int, default=None) + parser.add_argument( + "--t2s-condition", + type=str, + default="", + ) + parser.add_argument( + "--use-train-i2i-prompt", + action="store_true", + help="Use i2i training prompt template (default behavior of i2i.py).", + ) + parser.add_argument( + "--no-use-train-i2i-prompt", + dest="use_train_i2i_prompt", + action="store_false", + help="Use i2i_gen prompt template.", + ) + parser.set_defaults(use_train_i2i_prompt=None) + + parser.add_argument("--tokenizer-path", type=str, default="") + parser.add_argument("--model-local-files-only", action=argparse.BooleanOptionalAction, default=None) + parser.add_argument("--tokenizer-local-files-only", action=argparse.BooleanOptionalAction, default=None) + + parser.add_argument("--vq-model-image-path", type=str, default="") + parser.add_argument("--vq-model-image-local-files-only", action=argparse.BooleanOptionalAction, default=None) + parser.add_argument("--vq-model-audio-path", type=str, default="") + parser.add_argument("--vq-model-audio-local-files-only", action=argparse.BooleanOptionalAction, default=None) + + parser.add_argument("--disable-hf-xet", 
action=argparse.BooleanOptionalAction, default=True) + return parser.parse_args() + + +def main() -> None: + repo_root = bootstrap_repo_path() + ensure_safe_import_for_vllm() + from vllm_omni.model_executor.models.dynin_omni.dynin_omni_common import ( + DYNIN_PROMPT_SOURCE_KEY, + DYNIN_PROMPT_SOURCE_OFFLINE_PREBUILT, + ) + + args = parse_args(repo_root) + + if args.disable_hf_xet: + os.environ.setdefault("HF_HUB_DISABLE_XET", "1") + + np.random.seed(args.seed) + torch.manual_seed(args.seed) + + model_dir = ensure_local_model_dir( + model=args.model, + cache_dir=Path(args.model_cache_dir).expanduser(), + localize=bool(args.localize_model), + ) + model_source = str(model_dir) + + task_name = str(args.task) + dynin_config_path = str(Path(args.dynin_config_path).expanduser()) + os.environ["DYNIN_CONFIG_PATH"] = dynin_config_path + default_runtime_task, default_prompting_task, default_detok_id, final_modality = TASK_DEFAULT_RUNTIME[task_name] + runtime_task = args.runtime_task.strip() or str( + _runtime_fallback(task_name, "runtime_task", None) or default_runtime_task + ) + prompting_task = args.prompting_task.strip() or str( + _runtime_fallback(task_name, "prompting_task", None) or default_prompting_task + ) + detok_id_default = _runtime_fallback(task_name, "detok_id", None) + if detok_id_default is None: + detok_id_default = default_detok_id + detok_id = int(detok_id_default if args.detok_id is None else args.detok_id) + + output_dir_default = _runtime_fallback(task_name, "output_dir", args.output_dir) + resolved_output_dir = str(output_dir_default or "/tmp/dynin_end2end_outputs") + + image_resolution_value = _runtime_fallback( + task_name, + "image_resolution", + args.image_resolution, + ) + if image_resolution_value is None: + image_resolution_value = 336 + image_resolution = int(image_resolution_value) + + num_frames_value = _runtime_fallback( + task_name, + "num_frames", + args.num_frames, + ) + if num_frames_value is None: + num_frames_value = 8 + num_frames = int(num_frames_value) + + prompt_max_text_len_value = _runtime_fallback( + task_name, + "prompt_max_text_len", + args.prompt_max_text_len, + ) + if prompt_max_text_len_value is None: + prompt_max_text_len_value = 1024 + prompt_max_text_len = int(prompt_max_text_len_value) + + max_new_tokens_value = _runtime_fallback( + task_name, + "max_new_tokens", + args.max_new_tokens, + ) + if max_new_tokens_value is None: + max_new_tokens_value = 256 + max_new_tokens = int(max_new_tokens_value) + + steps_value = _runtime_fallback( + task_name, + "steps", + args.steps, + ) + if steps_value is None: + steps_value = 256 + steps = int(steps_value) + + block_length_value = _runtime_fallback( + task_name, + "block_length", + args.block_length, + ) + if block_length_value is None: + block_length_value = 2 + block_length = int(block_length_value) + + temperature_value = _runtime_fallback( + task_name, + "temperature", + args.temperature, + ) + if temperature_value is None: + temperature_value = 0.0 + temperature = float(temperature_value) + + cfg_scale_value = _runtime_fallback( + task_name, + "cfg_scale", + args.cfg_scale, + ) + if cfg_scale_value is None: + cfg_scale_value = 0.0 + cfg_scale = float(cfg_scale_value) + + remasking = str(_runtime_fallback(task_name, "remasking", args.remasking) or "low_confidence") + + timesteps_value = _runtime_fallback( + task_name, + "timesteps", + args.timesteps, + ) + if timesteps_value is None: + timesteps_value = 20 + timesteps = int(timesteps_value) + + guidance_scale_value = _runtime_fallback( + task_name, + 
"guidance_scale", + args.guidance_scale, + ) + if guidance_scale_value is None: + guidance_scale_value = 0.0 + guidance_scale = float(guidance_scale_value) + + mask_token_id_value = _runtime_fallback( + task_name, + "mask_token_id", + args.mask_token_id, + ) + if mask_token_id_value is None: + mask_token_id_value = 126336 + mask_token_id = int(mask_token_id_value) + + codebook_size_value = _runtime_fallback( + task_name, + "codebook_size", + args.codebook_size, + ) + if codebook_size_value is None: + codebook_size_value = 8192 + codebook_size = int(codebook_size_value) + + audio_codebook_size_value = _runtime_fallback( + task_name, + "audio_codebook_size", + args.audio_codebook_size, + ) + if audio_codebook_size_value is None: + audio_codebook_size_value = 4096 + audio_codebook_size = int(audio_codebook_size_value) + + image_token_count_value = _runtime_fallback( + task_name, + "image_token_count", + args.image_token_count, + ) + image_token_count = int(image_token_count_value) if image_token_count_value is not None else 0 + + t2s_token_length_value = _runtime_fallback( + task_name, + "t2s_token_length", + args.t2s_token_length, + ) + if t2s_token_length_value is None: + t2s_token_length_value = 383 + t2s_token_length = int(t2s_token_length_value) + + t2s_condition = str( + _runtime_fallback(task_name, "t2s_condition", args.t2s_condition) + or "gender-female_emotion-neutral_speed-normal_pitch-normal" + ) + + _validate_generation_args( + task=task_name, + max_new_tokens=max_new_tokens, + steps=steps, + block_length=block_length, + ) + + use_train_i2i_prompt = _runtime_fallback(task_name, "use_train_i2i_prompt", args.use_train_i2i_prompt) + if use_train_i2i_prompt is None: + use_train_i2i_prompt = bool(task_name == "i2i") + use_train_i2i_prompt = bool(use_train_i2i_prompt) + + if task_name in {"i2i", "i2t"} and not args.image: + raise ValueError(f"--task {task_name} requires --image") + if task_name == "s2t" and not args.audio: + raise ValueError("--task s2t requires --audio") + if task_name == "v2t" and not args.video: + raise ValueError("--task v2t requires --video") + + text = resolve_task_text( + task_name=task_name, + text=args.text, + instruction=args.instruction, + raw_prompt=bool(args.raw_prompt), + ) + + tokenizer_source = args.tokenizer_path.strip() or model_source + model_local_only = resolve_local_only( + args.model_local_files_only, model_source, default=Path(model_source).is_dir() + ) + tokenizer_local_only = resolve_local_only( + args.tokenizer_local_files_only, + tokenizer_source, + default=model_local_only, + ) + tokenizer = load_text_tokenizer(tokenizer_source, local_files_only=tokenizer_local_only) + text_vocab_size = int(len(tokenizer)) + + image_tokens: torch.Tensor | None = None + audio_tokens: torch.Tensor | None = None + video_tokens: torch.Tensor | None = None + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + vq_image_source = args.vq_model_image_path.strip() or "snu-aidas/magvitv2" + vq_audio_source = args.vq_model_audio_path.strip() or "snu-aidas/emova_speech_tokenizer_vllm" + vq_image_local_only = resolve_local_only(args.vq_model_image_local_files_only, vq_image_source, default=False) + vq_audio_local_only = resolve_local_only(args.vq_model_audio_local_files_only, vq_audio_source, default=False) + + if task_name in {"i2i", "i2t", "v2t"}: + vq_image = load_vq_image_encoder(vq_image_source, vq_image_local_only, device) + if task_name in {"i2i", "i2t"}: + image_tokens = encode_image_tokens( + Path(args.image).expanduser().resolve(), + 
vq_model=vq_image, + device=device, + resolution=int(image_resolution), + ) + if task_name == "v2t": + video_tokens = encode_video_tokens( + Path(args.video).expanduser().resolve(), + vq_model=vq_image, + device=device, + resolution=int(image_resolution), + num_frames=int(num_frames), + ) + if hasattr(vq_image, "cpu"): + vq_image = vq_image.cpu() + + if task_name == "s2t": + vq_audio = load_vq_audio_encoder(vq_audio_source, vq_audio_local_only, device) + audio_tokens = encode_audio_tokens(Path(args.audio).expanduser().resolve(), vq_audio) + if hasattr(vq_audio, "cpu"): + vq_audio = vq_audio.cpu() + + noise_schedule_params: dict[str, Any] = {} + try: + parsed = json.loads(args.noise_schedule_params) + if isinstance(parsed, dict): + noise_schedule_params = {str(k): v for k, v in parsed.items()} + except Exception: + noise_schedule_params = {} + + image_token_count = int(image_token_count) + if image_token_count <= 0: + if image_tokens is not None: + image_token_count = int(image_tokens.numel()) + else: + base_res = int(image_resolution) + image_token_count = max(1, (base_res // 16) ** 2) + + uncond_input_ids: list[int] | None = None + uncond_attention_mask: list[int] | None = None + if task_name == "t2t": + messages = [{"role": "user", "content": text}] + if getattr(tokenizer, "chat_template", None): + prompt_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) + encoded = tokenizer(prompt_text, return_tensors="pt", add_special_tokens=False) + else: + encoded = tokenizer(text, return_tensors="pt", add_special_tokens=True) + prompt_token_ids = _to_1d_int_list(encoded["input_ids"]) + prompt_attention_mask = _to_1d_int_list(encoded.get("attention_mask")) + if not prompt_attention_mask: + prompt_attention_mask = [1] * len(prompt_token_ids) + else: + max_audio_len_for_prompt = int(max(t2s_token_length, 512)) + if audio_tokens is not None: + max_audio_len_for_prompt = max(max_audio_len_for_prompt, int(audio_tokens.numel())) + max_audio_len_short_for_prompt = max(256, max_audio_len_for_prompt // 2) + + uni_prompting = load_universal_prompting( + tokenizer=tokenizer, + tokenizer_source=tokenizer_source, + max_text_len=int(prompt_max_text_len), + cond_dropout_prob=float(args.cond_dropout_prob), + local_files_only=bool(tokenizer_local_only), + max_audio_len=int(max_audio_len_for_prompt), + max_audio_len_short=int(max_audio_len_short_for_prompt), + ) + prompting_text_vocab_size = int(len(uni_prompting.text_tokenizer)) + + is_mmu_task = task_name in {"i2t", "s2t", "v2t"} and not args.prompting_task.strip() + if is_mmu_task: + prompt_token_ids, prompt_attention_mask = make_mmu_prompt( + task=task_name, + text=text, + tokenizer=uni_prompting.text_tokenizer, + uni_prompting=uni_prompting, + image_tokens=image_tokens, + audio_tokens=audio_tokens, + video_tokens=video_tokens, + image_token_offset=prompting_text_vocab_size, + speech_token_offset=prompting_text_vocab_size + int(codebook_size), + ) + else: + prompt_payload, prompting_task = make_prompt_payload( + task=task_name, + text=text, + image_tokens=image_tokens, + audio_tokens=audio_tokens, + video_tokens=video_tokens, + image_placeholder_tokens=image_token_count, + audio_placeholder_tokens=int(t2s_token_length), + image_token_offset=text_vocab_size, + speech_token_offset=text_vocab_size + int(codebook_size), + mask_token_id=int(mask_token_id), + use_train_i2i_prompt=use_train_i2i_prompt, + ) + if args.prompting_task.strip(): + prompting_task = args.prompting_task.strip() + + prompt_token_ids, prompt_attention_mask 
= _run_uni_prompting( + uni_prompting, + prompt_payload, + prompting_task, + ) + + if task_name in {"i2t", "s2t", "v2t"}: + prompt_attention_mask = [1] * len(prompt_token_ids) + if not prompt_attention_mask: + prompt_attention_mask = [1] * len(prompt_token_ids) + + if task_name in {"t2i", "i2i"} and guidance_scale > 0: + uncond_payload, uncond_prompting_task = make_prompt_payload( + task=task_name, + text="", + image_tokens=image_tokens, + audio_tokens=audio_tokens, + video_tokens=video_tokens, + image_placeholder_tokens=image_token_count, + audio_placeholder_tokens=int(t2s_token_length), + image_token_offset=text_vocab_size, + speech_token_offset=text_vocab_size + int(codebook_size), + mask_token_id=int(mask_token_id), + use_train_i2i_prompt=use_train_i2i_prompt, + ) + uncond_input_ids, uncond_attention_mask = _run_uni_prompting( + uni_prompting, + uncond_payload, + args.prompting_task.strip() or uncond_prompting_task, + ) + if not uncond_attention_mask: + uncond_attention_mask = [1] * len(uncond_input_ids) + + runtime_info: dict[str, Any] = { + "task": [runtime_task], + "detok_id": [int(detok_id)], + DYNIN_PROMPT_SOURCE_KEY: [DYNIN_PROMPT_SOURCE_OFFLINE_PREBUILT], + "dynin_config_path": [str(dynin_config_path)], + "attention_mask": [prompt_attention_mask], + "prompt_max_text_len": [int(prompt_max_text_len)], + "prompting_max_text_len": [int(prompt_max_text_len)], + "cond_dropout_prob": [float(args.cond_dropout_prob)], + "prompting_cond_dropout_prob": [float(args.cond_dropout_prob)], + "tokenizer_path": [str(tokenizer_source)], + "text_vocab_size": [int(text_vocab_size)], + "model_local_files_only": [bool(model_local_only)], + "max_new_tokens": [int(max_new_tokens)], + "steps": [int(steps)], + "block_length": [int(block_length)], + "temperature": [float(temperature)], + "cfg_scale": [float(cfg_scale)], + "remasking": [str(remasking)], + "mask_id": [int(mask_token_id)], + "mask_token_id": [int(mask_token_id)], + "codebook_size": [int(codebook_size)], + "audio_codebook_size": [int(audio_codebook_size)], + "timesteps": [int(timesteps)], + "guidance_scale": [float(guidance_scale)], + "noise_type": [str(args.noise_type)], + "noise_schedule_name": [str(args.noise_schedule_name)], + "noise_schedule_params": [noise_schedule_params], + "seq_len": [int(image_token_count)], + "condition": [str(t2s_condition)], + "vq_model_image_path": [str(vq_image_source)], + "vq_model_image_local_files_only": [bool(vq_image_local_only)], + "vq_model_audio_path": [str(vq_audio_source)], + "vq_model_audio_local_files_only": [bool(vq_audio_local_only)], + } + + if task_name in {"t2t", "i2t", "s2t", "v2t"}: + runtime_info["prompt_length"] = [int(len(prompt_token_ids))] + if uncond_input_ids is not None: + runtime_info["uncond_input_ids"] = [uncond_input_ids] + if uncond_attention_mask is not None: + runtime_info["uncond_attention_mask"] = [uncond_attention_mask] + + if task_name == "t2s": + runtime_info["max_new_tokens"] = [int(t2s_token_length)] + + prompt = { + "prompt_token_ids": [int(v) for v in prompt_token_ids], + "additional_information": runtime_info, + "modalities": [final_modality], + } + + from vllm import SamplingParams + + from vllm_omni.entrypoints.omni import Omni + + stage_config_path = str(Path(args.stage_config_path).expanduser()) + omni = Omni(model=model_source, stage_configs_path=stage_config_path, dtype=args.dtype) + sampling_params_list = [ + SamplingParams(max_tokens=int(args.max_tokens_per_stage), temperature=0.0, top_p=1.0, detokenize=False) + for _ in range(omni.num_stages) + ] + + try: + 
outputs = list(omni.generate(prompt, sampling_params_list)) + finally: + omni.close() + + out_dir = Path(resolved_output_dir).expanduser() + out_dir.mkdir(parents=True, exist_ok=True) + stamp = time.strftime("%Y%m%d_%H%M%S") + prefix = args.output_prefix.strip() or f"{task_name}_{stamp}" + + if final_modality == "text": + text_out = extract_text_output(outputs, tokenizer=tokenizer) + if not text_out: + raise RuntimeError("No text output found.") + out_path = out_dir / f"{prefix}.txt" + out_path.write_text(text_out + "\n", encoding="utf-8") + print(f"[end2end] text saved: {out_path}") + print(text_out) + return + + if final_modality == "image": + image_out = extract_image_output(outputs) + if image_out is None: + raise RuntimeError("No image output found.") + pil = tensor_to_pil_image(image_out) + out_path = out_dir / f"{prefix}.png" + pil.save(out_path) + print(f"[end2end] image saved: {out_path}") + return + + if final_modality == "audio": + audio_out = extract_audio_output(outputs) + if audio_out is None: + raise RuntimeError("No audio output found.") + wav, sr = audio_out + out_path = out_dir / f"{prefix}.wav" + save_audio_wav(out_path, wav, sr) + print(f"[end2end] audio saved: {out_path} (sr={sr}, samples={wav.shape[0]})") + return + + raise RuntimeError(f"Unsupported final modality: {final_modality}") + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/dynin_omni/README.md b/examples/online_serving/dynin_omni/README.md new file mode 100644 index 0000000000..d8526d4237 --- /dev/null +++ b/examples/online_serving/dynin_omni/README.md @@ -0,0 +1,97 @@ +# Dynin-Omni Online Serving Example + +## Installation + +Please refer to [README.md](../../../README.md). + +## Launch the Server + +First, find the `transformers_modules` path: + +```bash +python - <<'PY' +from transformers.utils.hub import HF_MODULES_CACHE +print(HF_MODULES_CACHE) +PY +``` + +Then export it for both `PYTHONPATH` and `HF_MODULES_CACHE`: + +```bash +export PYTHONPATH=:$PYTHONPATH +export HF_MODULES_CACHE= +``` + +Run from repository root: + +```bash +vllm-omni serve snu-aidas/Dynin-Omni \ + --omni \ + --port 8091 \ + --stage-configs-path "$(pwd)/vllm_omni/model_executor/stage_configs/dynin_omni.yaml" +``` + +If `vllm-omni` is not in PATH, run: + +```bash +PYTHONPATH="$(pwd)" python -m vllm_omni.entrypoints.cli.main serve snu-aidas/Dynin-Omni \ + --omni \ + --port 8091 \ + --stage-configs-path "$(pwd)/vllm_omni/model_executor/stage_configs/dynin_omni.yaml" +``` + +Wait until the server logs show both `All stages initialized successfully` and +`Application startup complete.` before sending requests. + +## Send Requests via Python Client + +Move to the example directory: + +```bash +cd examples/online_serving/dynin_omni +``` + +### Text -> Image + +```bash +python openai_chat_completion_client_for_multimodal_generation.py \ + --query-type t2i \ + --prompt "A realistic indoor living room with natural daylight." +``` + +### Image -> Image + +```bash +python openai_chat_completion_client_for_multimodal_generation.py \ + --query-type i2i \ + --image-path ../../offline_inference/dynin_omni/data/image/sofa_under_water.jpg \ + --prompt "Transform this surreal underwater setting into a realistic indoor living room while preserving the sofa layout." +``` + +### Text -> Speech + +```bash +python openai_chat_completion_client_for_multimodal_generation.py \ + --query-type t2s \ + --prompt "Hello. This is Dynin-omni." 
+``` + +## CLI Arguments + +- `--query-type` (`t2i|t2s|i2i`) +- `--model` (default: `snu-aidas/Dynin-Omni`) +- `--host` / `--port` (OpenAI-compatible vLLM endpoint) +- `--prompt` (custom text) +- `--image-path` (required for `i2i`) +- `--modalities` (optional output modalities override) +- `--output-dir` (default: `/tmp/dynin_online_outputs`) + +## Notes + +- This client currently supports only `t2i`, `t2s`, and `i2i`. +- `t2t` is intentionally not exposed in this online example. +- This example intentionally uses the OpenAI-compatible chat completion endpoint. +- Task routing for non-text outputs relies on Dynin task trigger tokens (`<|t2i|>`, `<|i2i|>`, `<|t2s|>`) injected by the client. +- Outputs are saved under `/tmp/dynin_online_outputs` by default. +- Dynin stage-0 warmup can take a while on first startup; do not send requests before startup completes. +- Dynin itself can execute text-returning tasks such as `t2t`, `s2t`, `i2t`, and `v2t`, but this online serving example currently runs stage-0 in `generation` mode. In that path, the generation worker does not surface the final text as `output.text`, so OpenAI chat responses for those text-output tasks may complete internally but still return empty text. diff --git a/examples/online_serving/dynin_omni/openai_chat_completion_client_for_multimodal_generation.py b/examples/online_serving/dynin_omni/openai_chat_completion_client_for_multimodal_generation.py new file mode 100644 index 0000000000..9728555431 --- /dev/null +++ b/examples/online_serving/dynin_omni/openai_chat_completion_client_for_multimodal_generation.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import argparse +import base64 +import json +import mimetypes +import os +import time +from pathlib import Path +from typing import Any + +DEFAULT_MODEL = "snu-aidas/Dynin-Omni" +DEFAULT_OUTPUT_DIR = "/tmp/dynin_online_outputs" + +QUERY_CHOICES = ("t2i", "t2s", "i2i") +DEFAULT_PROMPT_BY_QUERY = { + "t2i": "A high quality detailed living room interior photo.", + "t2s": "Please read this sentence naturally: Hello from Dynin-Omni online serving.", + "i2i": "Transform this image into a realistic indoor living room while preserving layout.", +} +DEFAULT_MODALITIES_BY_QUERY = { + "t2i": ["image"], + "t2s": ["audio"], + "i2i": ["image"], +} +OFFLINE_PARITY_STAGE_COUNT = 3 +OFFLINE_PARITY_STAGE_SAMPLING = { + "max_tokens": 1, + "temperature": 0.0, + "top_p": 1.0, + "detokenize": False, +} + + +def _infer_mime_type(path: Path) -> str: + mime_type, _ = mimetypes.guess_type(str(path)) + return mime_type or "application/octet-stream" + + +def _encode_file_as_data_url(path: Path) -> str: + mime_type = _infer_mime_type(path) + raw = path.read_bytes() + encoded = base64.b64encode(raw).decode("utf-8") + return f"data:{mime_type};base64,{encoded}" + + +def _to_image_url(path_or_url: str) -> str: + value = str(path_or_url) + if value.startswith(("http://", "https://", "data:image/")): + return value + path = Path(value).expanduser().resolve() + if not path.exists(): + raise FileNotFoundError(f"Image file not found: {path}") + return _encode_file_as_data_url(path) + + +def _build_user_content(query_type: str, prompt: str, image_path: str | None) -> list[dict[str, Any]]: + if query_type == "t2i": + return [{"type": "text", "text": f"<|t2i|> {prompt}"}] + + if query_type == "t2s": + return [{"type": "text", "text": f"<|t2s|> {prompt}"}] + + if query_type == "i2i": + if not image_path: + raise ValueError("--image-path is 
required for query type i2i") + return [ + {"type": "text", "text": f"<|i2i|> {prompt}"}, + {"type": "image_url", "image_url": {"url": _to_image_url(image_path)}}, + ] + + raise ValueError(f"Unsupported query_type: {query_type}") + + +def _collect_text_from_content(content: Any) -> list[str]: + texts: list[str] = [] + if isinstance(content, str): + stripped = content.strip() + if stripped: + texts.append(stripped) + return texts + + if isinstance(content, dict): + for key in ("text", "content", "value", "output_text"): + text_value = content.get(key) + if isinstance(text_value, str) and text_value.strip(): + texts.append(text_value.strip()) + return texts + + if isinstance(content, list): + for item in content: + texts.extend(_collect_text_from_content(item)) + return texts + + content_text = getattr(content, "text", None) + if isinstance(content_text, str) and content_text.strip(): + texts.append(content_text.strip()) + content_value = getattr(content, "content", None) + if isinstance(content_value, str) and content_value.strip(): + texts.append(content_value.strip()) + output_text = getattr(content, "output_text", None) + if isinstance(output_text, str) and output_text.strip(): + texts.append(output_text.strip()) + return texts + + +def _extract_text_outputs(chat_completion: Any) -> list[str]: + texts: list[str] = [] + for choice in getattr(chat_completion, "choices", []) or []: + message = getattr(choice, "message", None) + if message is None: + continue + content = getattr(message, "content", None) + texts.extend(_collect_text_from_content(content)) + reasoning_content = getattr(message, "reasoning_content", None) + if isinstance(reasoning_content, str) and reasoning_content.strip(): + texts.append(reasoning_content.strip()) + choice_text = getattr(choice, "text", None) + if isinstance(choice_text, str) and choice_text.strip(): + texts.append(choice_text.strip()) + top_level_output_text = getattr(chat_completion, "output_text", None) + if isinstance(top_level_output_text, str) and top_level_output_text.strip(): + texts.append(top_level_output_text.strip()) + return texts + + +def _extract_image_data_urls(chat_completion: Any) -> list[str]: + urls: list[str] = [] + for choice in getattr(chat_completion, "choices", []) or []: + message = getattr(choice, "message", None) + if message is None: + continue + content = getattr(message, "content", None) + if not isinstance(content, list): + continue + for item in content: + if not isinstance(item, dict): + continue + if item.get("type") != "image_url": + continue + image_url = (item.get("image_url") or {}).get("url") + if isinstance(image_url, str) and image_url.startswith("data:image"): + urls.append(image_url) + return urls + + +def _extract_audio_payloads(chat_completion: Any) -> list[bytes]: + payloads: list[bytes] = [] + for choice in getattr(chat_completion, "choices", []) or []: + message = getattr(choice, "message", None) + if message is None: + continue + message_audio = getattr(message, "audio", None) + if message_audio is None: + continue + data_b64 = getattr(message_audio, "data", None) + if isinstance(data_b64, str) and data_b64: + try: + payloads.append(base64.b64decode(data_b64)) + except Exception: + continue + return payloads + + +def _decode_data_url(data_url: str) -> tuple[bytes, str]: + header, data = data_url.split(",", 1) + mime_type = "image/png" + if ";" in header and ":" in header: + mime_type = header.split(":", 1)[1].split(";", 1)[0] + return base64.b64decode(data), mime_type + + +def 
_image_extension_from_mime(mime_type: str) -> str: + if mime_type == "image/jpeg": + return ".jpg" + if mime_type == "image/webp": + return ".webp" + if mime_type == "image/gif": + return ".gif" + return ".png" + + +def _save_outputs( + *, + query_type: str, + chat_completion: Any, + output_dir: Path, +) -> None: + output_dir.mkdir(parents=True, exist_ok=True) + stamp = time.strftime("%Y%m%d_%H%M%S") + + text_outputs = _extract_text_outputs(chat_completion) + image_data_urls = _extract_image_data_urls(chat_completion) + audio_payloads = _extract_audio_payloads(chat_completion) + + if text_outputs: + text_path = output_dir / f"{query_type}_{stamp}.txt" + text_path.write_text("\n\n".join(text_outputs) + "\n", encoding="utf-8") + print(f"[dynin-online] text saved: {text_path}") + print(text_outputs[0]) + + for idx, image_url in enumerate(image_data_urls): + image_bytes, mime_type = _decode_data_url(image_url) + ext = _image_extension_from_mime(mime_type) + image_path = output_dir / f"{query_type}_{stamp}_{idx}{ext}" + image_path.write_bytes(image_bytes) + print(f"[dynin-online] image saved: {image_path}") + + for idx, audio_bytes in enumerate(audio_payloads): + audio_path = output_dir / f"{query_type}_{stamp}_{idx}.wav" + audio_path.write_bytes(audio_bytes) + print(f"[dynin-online] audio saved: {audio_path}") + + if not text_outputs and not image_data_urls and not audio_payloads: + print("[dynin-online] no output extracted from response") + raw_path = output_dir / f"{query_type}_{stamp}_raw_response.json" + try: + if hasattr(chat_completion, "model_dump_json"): + serialized = chat_completion.model_dump_json(indent=2) + else: + if hasattr(chat_completion, "model_dump"): + raw_payload: Any = chat_completion.model_dump(mode="json") + else: + raw_payload = chat_completion + try: + serialized = json.dumps(raw_payload, ensure_ascii=False, indent=2) + except Exception: + serialized = json.dumps({"repr": repr(raw_payload)}, ensure_ascii=False, indent=2) + raw_path.write_text(serialized + "\n", encoding="utf-8") + print(f"[dynin-online] raw response saved: {raw_path}") + except Exception: + pass + + +def _build_offline_parity_sampling_params_list() -> list[dict[str, Any]]: + return [dict(OFFLINE_PARITY_STAGE_SAMPLING) for _ in range(OFFLINE_PARITY_STAGE_COUNT)] + + +def run_request(args: argparse.Namespace) -> None: + from openai import OpenAI + + client = OpenAI( + api_key="EMPTY", + base_url=f"http://{args.host}:{args.port}/v1", + ) + prompt = args.prompt.strip() if args.prompt else DEFAULT_PROMPT_BY_QUERY[args.query_type] + user_content = _build_user_content( + query_type=args.query_type, + prompt=prompt, + image_path=args.image_path, + ) + if args.modalities: + modalities = [item.strip() for item in args.modalities.split(",") if item.strip()] + else: + modalities = DEFAULT_MODALITIES_BY_QUERY[args.query_type] + + extra_body = { + "sampling_params_list": _build_offline_parity_sampling_params_list(), + } + chat_completion = client.chat.completions.create( + model=args.model, + messages=[{"role": "user", "content": user_content}], + modalities=modalities, + extra_body=extra_body, + ) + _save_outputs( + query_type=args.query_type, + chat_completion=chat_completion, + output_dir=Path(args.output_dir).expanduser(), + ) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Dynin-Omni online chat completion client") + parser.add_argument( + "--query-type", + "-q", + type=str, + default="t2i", + choices=QUERY_CHOICES, + help="Dynin query type", + ) + 
parser.add_argument( + "--model", + "-m", + type=str, + default=DEFAULT_MODEL, + help="Model name/path", + ) + parser.add_argument( + "--host", + type=str, + default="localhost", + help="Host/IP of the vLLM Omni API server", + ) + parser.add_argument( + "--port", + type=int, + default=8091, + help="Port of the vLLM Omni API server", + ) + parser.add_argument( + "--prompt", + "-p", + type=str, + default="", + help="Custom prompt text", + ) + parser.add_argument( + "--image-path", + "-i", + type=str, + default=None, + help="Image path/URL for i2i", + ) + parser.add_argument( + "--modalities", + type=str, + default="", + help="Comma-separated output modalities override (e.g., text,image,audio)", + ) + parser.add_argument( + "--output-dir", + "-o", + type=str, + default=DEFAULT_OUTPUT_DIR, + help="Directory to save outputs", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + os.environ.setdefault("HF_HUB_DISABLE_XET", "1") + run_request(args) + + +if __name__ == "__main__": + main() diff --git a/tests/e2e/offline_inference/test_dynin_omni.py b/tests/e2e/offline_inference/test_dynin_omni.py new file mode 100644 index 0000000000..d17e7b8175 --- /dev/null +++ b/tests/e2e/offline_inference/test_dynin_omni.py @@ -0,0 +1,419 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +E2E offline smoke tests for Dynin-Omni. + +- model: "snu-aidas/Dynin-Omni" +- stage config: tests/e2e/stage_configs/dynin_omni_ci.yaml +""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import Any + +import numpy as np +import pytest +import torch +from transformers import AutoTokenizer + +from tests.conftest import OmniRunner +from tests.utils import hardware_test + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" + +_REPO_ROOT = Path(__file__).resolve().parents[3] +_DEFAULT_DYNIN_CONFIG_PATH: Path | None = None +_DEFAULT_STAGE_CONFIG_PATH = _REPO_ROOT / "tests" / "e2e" / "stage_configs" / "dynin_omni_ci.yaml" + +models = ["snu-aidas/Dynin-Omni"] +stage_configs = [str(_DEFAULT_STAGE_CONFIG_PATH)] +test_params = [(model, stage_config) for model in models for stage_config in stage_configs] + +DYNIN_CONFIG_PATH = str(_DEFAULT_DYNIN_CONFIG_PATH) if _DEFAULT_DYNIN_CONFIG_PATH is not None else None + +pytestmark = [ + pytest.mark.core_model, + pytest.mark.omni, +] + + +# prompting util +def _build_mmu_prompt(tokenizer: Any, question: str, dynin_config_path: str | None) -> dict[str, Any]: + encoded = tokenizer(question, return_tensors="pt", add_special_tokens=True) + token_ids = [int(v) for v in encoded["input_ids"][0].tolist()] + attention_mask = [int(v) for v in encoded["attention_mask"][0].tolist()] + additional_information: dict[str, Any] = { + "task": ["mmu"], + "detok_id": [0], + "prompt_length": [len(token_ids)], + "attention_mask": [attention_mask], + "max_new_tokens": [64], + "steps": [64], + "block_length": [16], + "temperature": [0.0], + } + if dynin_config_path: + additional_information["dynin_config_path"] = [str(dynin_config_path)] + return { + "prompt_token_ids": token_ids, + "additional_information": additional_information, + "modalities": ["text"], + } + + +def _build_mmu_multimodal_prompt( + tokenizer: Any, + question: str, + dynin_config_path: str | None, + *, + image: Any | None = None, + audio: tuple[np.ndarray, int] | None = None, +) -> dict[str, Any]: + if image is None and audio is None: + raise ValueError("At least one 
multimodal input (image or audio) must be provided.") + + prefix_chunks: list[str] = [] + mm_data: dict[str, Any] = {} + if image is not None: + prefix_chunks.append("<|soi|><|image|><|eoi|>") + mm_data["image"] = image + if audio is not None: + prefix_chunks.append("<|soa|><|audio|><|eoa|>") + mm_data["audio"] = audio + + prefixed_question = " ".join(prefix_chunks + [question]).strip() + prompt = _build_mmu_prompt( + tokenizer=tokenizer, + question=prefixed_question, + dynin_config_path=dynin_config_path, + ) + prompt["multi_modal_data"] = mm_data + prompt["modalities"] = ["text"] + return prompt + + +def _generate_synthetic_image(width: int = 224, height: int = 224) -> np.ndarray: + x = np.linspace(0, 255, width, dtype=np.uint8) + y = np.linspace(0, 255, height, dtype=np.uint8)[:, None] + red = np.tile(x, (height, 1)) + green = np.tile(y, (1, width)) + blue = ((red.astype(np.uint16) + green.astype(np.uint16)) // 2).astype(np.uint8) + return np.stack([red, green, blue], axis=-1) + + +def _generate_synthetic_audio(duration_s: int = 5, sample_rate: int = 48_000) -> tuple[np.ndarray, int]: + t = np.linspace(0, duration_s, int(sample_rate * duration_s), endpoint=False, dtype=np.float32) + waveform = 0.1 * np.sin(2.0 * np.pi * 440.0 * t) + return waveform.astype(np.float32), sample_rate + + +# prompting util +def _build_t2s_decode_prompt(dynin_config_path: str | None) -> dict[str, Any]: + # Bypass stage-0 generation and directly validate token->audio decode path. + generated_audio_token_ids = [int(v) for v in ([10, 11, 12, 13, 14] * 32)] + additional_information: dict[str, Any] = { + "task": ["t2s"], + "detok_id": [1], + "generated_token_ids": [generated_audio_token_ids], + "audio_codebook_size": [4096], + } + if dynin_config_path: + additional_information["dynin_config_path"] = [str(dynin_config_path)] + return { + "prompt_token_ids": [0], + "additional_information": additional_information, + "modalities": ["audio"], + } + + +# prompting util +def _build_t2i_decode_prompt(dynin_config_path: str | None) -> dict[str, Any]: + # Bypass stage-0 generation and directly validate token->image decode path. + # MAGVIT decode path expects a square token grid; 1024 tokens -> 32x32. + generated_image_token_ids = [int(v) for v in ([10, 11, 12, 13, 14, 15, 16, 17] * 128)] + additional_information: dict[str, Any] = { + "task": ["t2i"], + "detok_id": [2], + "generated_token_ids": [generated_image_token_ids], + "codebook_size": [8192], + } + if dynin_config_path: + additional_information["dynin_config_path"] = [str(dynin_config_path)] + return { + "prompt_token_ids": [0], + "additional_information": additional_information, + "modalities": ["image"], + } + + +def _configure_dynin_config_env() -> None: + if DYNIN_CONFIG_PATH: + os.environ["DYNIN_CONFIG_PATH"] = str(DYNIN_CONFIG_PATH) + else: + os.environ.pop("DYNIN_CONFIG_PATH", None) + + +def _is_finished_request_output(request_output: Any) -> bool: + if request_output is None: + return False + req_list = request_output if isinstance(request_output, list) else [request_output] + for req in req_list: + if req is not None and bool(getattr(req, "finished", False)): + return True + return False + + +def _find_stage_output(outputs: list[Any], output_type: str) -> Any | None: + matched = [ + stage_output for stage_output in outputs if getattr(stage_output, "final_output_type", None) == output_type + ] + if not matched: + return None + + # Prefer the latest finished chunk to avoid picking an intermediate stream output. 
+ for stage_output in reversed(matched): + if _is_finished_request_output(getattr(stage_output, "request_output", None)): + return stage_output + return matched[-1] + + +def _to_token_list(value: Any) -> list[int]: + if value is None: + return [] + if hasattr(value, "detach"): + value = value.detach() + if hasattr(value, "cpu"): + value = value.cpu() + if hasattr(value, "flatten"): + value = value.flatten().tolist() + if isinstance(value, tuple): + value = list(value) + if not isinstance(value, list): + return [] + out: list[int] = [] + for token in value: + if isinstance(token, bool): + continue + try: + out.append(int(token)) + except Exception: + continue + return out + + +def _extract_text(stage_output: Any, tokenizer: Any | None = None) -> str: + request_output = getattr(stage_output, "request_output", None) + if request_output is None: + return "" + req_list = request_output if isinstance(request_output, list) else [request_output] + for req in req_list: + completions = getattr(req, "outputs", None) or [] + if not completions: + continue + completion = completions[0] + mm_out = ( + getattr(completion, "multimodal_output", None) + or getattr(req, "multimodal_output", None) + or getattr(stage_output, "multimodal_output", None) + or {} + ) + text = mm_out.get("text") + if isinstance(text, list) and text: + text = text[-1] + if isinstance(text, str) and text.strip(): + return text.strip() + if tokenizer is not None: + for key in ("text_tokens", "token_ids"): + token_ids = _to_token_list(mm_out.get(key)) + if not token_ids: + continue + decoded = tokenizer.decode(token_ids, skip_special_tokens=True) + if isinstance(decoded, str) and decoded.strip(): + return decoded.strip() + fallback = getattr(completion, "text", None) + if isinstance(fallback, str) and fallback.strip(): + return fallback.strip() + return "" + + +def _extract_audio(stage_output: Any) -> Any | None: + request_output = getattr(stage_output, "request_output", None) + if request_output is None: + return None + req_list = request_output if isinstance(request_output, list) else [request_output] + for req in req_list: + completions = getattr(req, "outputs", None) or [] + if not completions: + continue + completion = completions[0] + mm_out = getattr(completion, "multimodal_output", None) or {} + if "audio" in mm_out: + return mm_out["audio"] + return None + + +def _extract_image(stage_output: Any) -> Any | None: + request_output = getattr(stage_output, "request_output", None) + if request_output is None: + return None + req_list = request_output if isinstance(request_output, list) else [request_output] + for req in req_list: + completions = getattr(req, "outputs", None) or [] + if not completions: + continue + completion = completions[0] + mm_out = getattr(completion, "multimodal_output", None) or {} + if "image" in mm_out: + return mm_out["image"] + return None + + +def _numel(value: Any) -> int: + if value is None: + return 0 + if isinstance(value, torch.Tensor): + return int(value.numel()) + shape = getattr(value, "shape", None) + if shape is not None: + try: + total = 1 + for dim in shape: + total *= int(dim) + return int(total) + except Exception: + pass + if isinstance(value, (list, tuple)): + return len(value) + return 0 + + +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) +@pytest.mark.parametrize("test_config", test_params) +def test_dynin_t2i_decode_to_image(test_config: tuple[str, str]) -> None: + model, stage_config_path = test_config + _configure_dynin_config_env() + prompt = 
_build_t2i_decode_prompt(dynin_config_path=DYNIN_CONFIG_PATH) + + with OmniRunner( + model, + seed=42, + stage_configs_path=stage_config_path, + stage_init_timeout=600, + init_timeout=600, + ) as runner: + outputs = runner.generate([prompt]) + + image_output = _find_stage_output(outputs, "image") + assert image_output is not None + image_value = _extract_image(image_output) + assert image_value is not None + assert _numel(image_value) > 0 + + +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) +@pytest.mark.parametrize("test_config", test_params) +def test_dynin_mmu_to_text(test_config: tuple[str, str]) -> None: + model, stage_config_path = test_config + _configure_dynin_config_env() + tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + prompt = _build_mmu_prompt( + tokenizer=tokenizer, + question="What is 2 + 2? Answer in one short sentence.", + dynin_config_path=DYNIN_CONFIG_PATH, + ) + + with OmniRunner( + model, + seed=42, + stage_configs_path=stage_config_path, + stage_init_timeout=600, + init_timeout=600, + ) as runner: + outputs = runner.generate([prompt]) + + text_output = _find_stage_output(outputs, "text") + assert text_output is not None + text_content = _extract_text(text_output, tokenizer=tokenizer) + assert text_content + + +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) +@pytest.mark.parametrize("test_config", test_params) +def test_dynin_image_to_text(test_config: tuple[str, str]) -> None: + model, stage_config_path = test_config + _configure_dynin_config_env() + tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + prompt = _build_mmu_multimodal_prompt( + tokenizer=tokenizer, + question="Describe the image briefly in one sentence.", + dynin_config_path=DYNIN_CONFIG_PATH, + image=_generate_synthetic_image(), + ) + + with OmniRunner( + model, + seed=42, + stage_configs_path=stage_config_path, + stage_init_timeout=600, + init_timeout=600, + ) as runner: + outputs = runner.generate([prompt]) + + text_output = _find_stage_output(outputs, "text") + assert text_output is not None + text_content = _extract_text(text_output, tokenizer=tokenizer) + assert text_content + + +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) +@pytest.mark.parametrize("test_config", test_params) +def test_dynin_speech_to_text(test_config: tuple[str, str]) -> None: + model, stage_config_path = test_config + _configure_dynin_config_env() + tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + prompt = _build_mmu_multimodal_prompt( + tokenizer=tokenizer, + question="Transcribe the audio briefly in one sentence.", + dynin_config_path=DYNIN_CONFIG_PATH, + audio=_generate_synthetic_audio(), + ) + + with OmniRunner( + model, + seed=42, + stage_configs_path=stage_config_path, + stage_init_timeout=600, + init_timeout=600, + ) as runner: + outputs = runner.generate([prompt]) + + text_output = _find_stage_output(outputs, "text") + assert text_output is not None + text_content = _extract_text(text_output, tokenizer=tokenizer) + assert text_content + + +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) +@pytest.mark.parametrize("test_config", test_params) +def test_dynin_t2s_decode_to_audio(test_config: tuple[str, str]) -> None: + model, stage_config_path = test_config + _configure_dynin_config_env() + prompt = _build_t2s_decode_prompt(dynin_config_path=DYNIN_CONFIG_PATH) + + with OmniRunner( + model, + seed=42, + stage_configs_path=stage_config_path, + stage_init_timeout=600, + init_timeout=600, + ) as runner: + outputs = 
runner.generate([prompt]) + + audio_output = _find_stage_output(outputs, "audio") + assert audio_output is not None + audio_value = _extract_audio(audio_output) + assert audio_value is not None + assert _numel(audio_value) > 0 diff --git a/tests/e2e/online_serving/test_dynin_omni_expansion.py b/tests/e2e/online_serving/test_dynin_omni_expansion.py new file mode 100644 index 0000000000..4648c424fe --- /dev/null +++ b/tests/e2e/online_serving/test_dynin_omni_expansion.py @@ -0,0 +1,160 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Example online tests for Dynin-Omni model. +""" + +import base64 +import gc +import os +from io import BytesIO +from pathlib import Path + +import numpy as np +import pytest +import soundfile as sf +from vllm.assets.image import ImageAsset + +from tests import conftest as tests_conftest +from tests.conftest import OmniServerParams +from tests.utils import hardware_test + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" + +MODEL = "snu-aidas/Dynin-Omni" +STAGE_CONFIG = str(Path(__file__).parent.parent / "stage_configs" / "dynin_omni_ci.yaml") +_WHISPER_SAMPLE_RATE_HZ = 16_000 + +T2I_PROMPT = "A high quality detailed living room interior photo." +T2S_PROMPT = "Please read this sentence naturally: Hello from Dynin-Omni online serving." +I2I_PROMPT = "Transform this outdoor nature boardwalk scene into a painting style with vivid colors." + +TEST_PARAMS = [OmniServerParams(model=MODEL, stage_config_path=STAGE_CONFIG)] +_STAGE_COUNT = 3 +_I2I_STAGE_SAMPLING = {"max_tokens": 1, "temperature": 0.0, "top_p": 1.0, "detokenize": False} + + +def _prepare_audio_waveform_for_whisper(audio_data: np.ndarray, samplerate: int) -> np.ndarray: + """Normalize decoded audio into a mono 16 kHz float32 waveform for Whisper.""" + if samplerate <= 0: + raise ValueError(f"Invalid audio sample rate: {samplerate}") + + waveform = np.asarray(audio_data, dtype=np.float32) + if waveform.ndim == 0: + raise ValueError("Audio waveform must have at least one dimension") + if waveform.ndim > 1: + waveform = np.mean(waveform, axis=1) + if waveform.size == 0: + raise ValueError("Empty audio waveform") + + if samplerate != _WHISPER_SAMPLE_RATE_HZ: + target_num_samples = max(int(round(waveform.shape[0] * _WHISPER_SAMPLE_RATE_HZ / samplerate)), 1) + source_positions = np.arange(waveform.shape[0], dtype=np.float64) + target_positions = np.linspace( + 0.0, + max(waveform.shape[0] - 1, 0), + num=target_num_samples, + dtype=np.float64, + ) + waveform = np.interp(target_positions, source_positions, waveform).astype(np.float32) + + return np.ascontiguousarray(np.clip(waveform, -1.0, 1.0), dtype=np.float32) + + +def _convert_audio_bytes_to_text_without_ffmpeg(raw_bytes: bytes) -> str: + """Dynin t2s keeps Whisper transcription local to this test module and avoids ffmpeg.""" + import whisper + + data, samplerate = sf.read(BytesIO(raw_bytes), dtype="float32", always_2d=True) + audio_waveform = _prepare_audio_waveform_for_whisper(data, samplerate) + + model = whisper.load_model("small", device="cpu") + try: + transcript = model.transcribe( + audio_waveform, + temperature=0.0, + word_timestamps=True, + condition_on_previous_text=False, + )["text"] + finally: + del model + gc.collect() + + return transcript or "" + + +@pytest.fixture +def dynin_t2s_openai_client(openai_client, monkeypatch): + monkeypatch.setattr( + tests_conftest, + "convert_audio_bytes_to_text", + 
_convert_audio_bytes_to_text_without_ffmpeg, + ) + return openai_client + + +def _build_t2i_messages(prompt: str) -> list[dict]: + return [{"role": "user", "content": [{"type": "text", "text": f"<|t2i|> {prompt}"}]}] + + +def _build_t2s_messages(prompt: str) -> list[dict]: + return [{"role": "user", "content": [{"type": "text", "text": f"<|t2s|> {prompt}"}]}] + + +def _build_i2i_messages(prompt: str) -> list[dict]: + input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB") + buffer = BytesIO() + input_image.save(buffer, format="JPEG") + image_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8") + return [ + { + "role": "user", + "content": [ + {"type": "text", "text": f"<|i2i|> {prompt}"}, + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}, + ], + } + ] + + +@pytest.mark.advanced_model +@pytest.mark.omni +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) +@pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True) +def test_send_i2i_request_001(omni_server, openai_client) -> None: + request_config = { + "model": omni_server.model, + "messages": _build_i2i_messages(I2I_PROMPT), + "modalities": ["image"], + "extra_body": { + "sampling_params_list": [dict(_I2I_STAGE_SAMPLING) for _ in range(_STAGE_COUNT)], + }, + } + openai_client.send_diffusion_request(request_config) + + +@pytest.mark.advanced_model +@pytest.mark.omni +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) +@pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True) +def test_send_t2i_request_001(omni_server, openai_client) -> None: + request_config = { + "model": omni_server.model, + "messages": _build_t2i_messages(T2I_PROMPT), + "modalities": ["image"], + } + openai_client.send_diffusion_request(request_config) + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) +@pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True) +def test_send_t2s_request_001(omni_server, dynin_t2s_openai_client) -> None: + request_config = { + "model": omni_server.model, + "messages": _build_t2s_messages(T2S_PROMPT), + "modalities": ["audio"], + } + dynin_t2s_openai_client.send_omni_request(request_config) diff --git a/tests/e2e/stage_configs/dynin_omni_ci.yaml b/tests/e2e/stage_configs/dynin_omni_ci.yaml new file mode 100644 index 0000000000..0240007510 --- /dev/null +++ b/tests/e2e/stage_configs/dynin_omni_ci.yaml @@ -0,0 +1,84 @@ +# stage config for running dynin_omni with a 3-stage architecture. +# this config is intended for e2e smoke tests. 
+ +stage_args: + - stage_id: 0 + stage_type: llm + runtime: + process: true + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: token2text + model_arch: DyninOmniForConditionalGeneration + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + engine_output_type: latent + trust_remote_code: true + gpu_memory_utilization: 0.5 + enforce_eager: true + enable_prefix_caching: false + async_scheduling: false + max_num_batched_tokens: 4096 + is_comprehension: true + final_output: true + final_output_type: text + + - stage_id: 1 + stage_type: llm + runtime: + process: true + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: token2image + model_arch: DyninOmniForConditionalGeneration + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + engine_output_type: latent + trust_remote_code: true + gpu_memory_utilization: 0.2 + enforce_eager: true + enable_prefix_caching: false + async_scheduling: false + max_num_batched_tokens: 4096 + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.dynin_omni.token2text_to_token2image + final_output: true + final_output_type: image + + - stage_id: 2 + stage_type: llm + runtime: + process: true + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: token2audio + model_arch: DyninOmniForConditionalGeneration + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + engine_output_type: latent + trust_remote_code: true + gpu_memory_utilization: 0.2 + enforce_eager: true + enable_prefix_caching: false + async_scheduling: false + max_num_batched_tokens: 4096 + engine_input_source: [1] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.dynin_omni.token2image_to_token2audio + final_output: true + final_output_type: audio + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 + edges: + - from: 0 + to: 1 + window_size: -1 + - from: 1 + to: 2 + window_size: -1 diff --git a/vllm_omni/model_executor/models/dynin_omni/__init__.py b/vllm_omni/model_executor/models/dynin_omni/__init__.py new file mode 100644 index 0000000000..2a3bae8a9f --- /dev/null +++ b/vllm_omni/model_executor/models/dynin_omni/__init__.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from .dynin_omni import DyninOmniForConditionalGeneration +from .dynin_omni_common import ( + get_dynin_magvit_attr, + get_dynin_modeling_attr, + get_dynin_sampling_attr, +) + +if TYPE_CHECKING: + from .dynin_omni_token2audio import DyninOmniToken2Audio + from .dynin_omni_token2image import DyninOmniToken2Image + from .dynin_omni_token2text import DyninOmniToken2Text + + +_STAGE_EXPORTS = { + "DyninOmniToken2Audio": (".dynin_omni_token2audio", "DyninOmniToken2Audio"), + "DyninOmniToken2Image": (".dynin_omni_token2image", "DyninOmniToken2Image"), + "DyninOmniToken2Text": (".dynin_omni_token2text", "DyninOmniToken2Text"), +} + +_MODELING_EXPORTS = {"DyninOmniConfig", "DyninOmniModelLM", "VideoTokenMerger"} +_MAGVIT_EXPORTS = {"VQGANEncoder", "VQGANDecoder", "LFQuantizer", "MAGVITv2"} + + +def __getattr__(name: str) -> Any: + if name in _STAGE_EXPORTS: + module_name, attr_name = _STAGE_EXPORTS[name] + module = __import__(module_name, globals(), locals(), [attr_name], 1) + return getattr(module, attr_name) + + if name in _MODELING_EXPORTS: + return get_dynin_modeling_attr(name) + 
+ if name in _MAGVIT_EXPORTS: + return get_dynin_magvit_attr(name) + + if name == "get_mask_schedule": + return get_dynin_sampling_attr("get_mask_schedule") + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +__all__ = [ + "DyninOmniForConditionalGeneration", + "DyninOmniToken2Audio", + "DyninOmniToken2Image", + "DyninOmniToken2Text", + "DyninOmniConfig", + "DyninOmniModelLM", + "VideoTokenMerger", + "VQGANEncoder", + "VQGANDecoder", + "LFQuantizer", + "MAGVITv2", + "get_mask_schedule", +] diff --git a/vllm_omni/model_executor/models/dynin_omni/dynin_omni.py b/vllm_omni/model_executor/models/dynin_omni/dynin_omni.py new file mode 100644 index 0000000000..0caae158ef --- /dev/null +++ b/vllm_omni/model_executor/models/dynin_omni/dynin_omni.py @@ -0,0 +1,744 @@ +from __future__ import annotations + +from collections.abc import Iterable, Mapping, Sequence +from functools import cached_property +from importlib import import_module +from typing import Any + +import numpy as np +import torch +import torch.nn as nn +from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions +from vllm.inputs import MultiModalDataDict +from vllm.inputs import MultiModalInput as MultiModalInputs +from vllm.model_executor.models.interfaces import SupportsMultiModal +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import ( + MultiModalFieldConfig, + MultiModalKwargsItems, + PlaceholderRange, +) +from vllm.multimodal.parse import MultiModalDataItems, MultiModalDataParser +from vllm.multimodal.processing import ( + BaseDummyInputsBuilder, + BaseMultiModalProcessor, + BaseProcessingInfo, + ProcessorInputs, + PromptUpdate, + TimingContext, +) +from vllm.sequence import IntermediateTensors +from vllm.v1.outputs import SamplerOutput +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.sampler import Sampler + +from vllm_omni.model_executor.models.output_templates import OmniOutput + +from .dynin_omni_common import build_zero_input_embeddings + +try: + from PIL import Image as PILImage +except Exception: # pragma: no cover + PILImage = None + + +_MODALITY_ORDER = ("image", "video", "audio") + +_MODALITY_ALIASES = { + "img2img": "image", +} + +_MODALITY_INPUT_KEY_BY_NAME = { + "image": "pixel_values", + "video": "pixel_values_videos", + "audio": "input_audio_features", +} + +_MODALITY_PLACEHOLDER_BY_NAME = { + "image": "<|soi|><|image|><|eoi|>", + "video": "<|sov|><|video|><|eov|>", + "audio": "<|soa|><|audio|><|eoa|>", +} + +_MODALITY_INPUT_ALIASES = { + "image": ("pixel_values", "image_embeds", "img2img"), + "video": ("pixel_values_videos", "video_embeds"), + "audio": ("input_audio_features", "audio_embeds"), +} + + +def _normalize_modality_name(modality: str) -> str: + return _MODALITY_ALIASES.get(modality, modality) + + +def _get_modality_count(mm_counts: Mapping[str, int], modality: str) -> int: + canonical = _normalize_modality_name(modality) + count = mm_counts.get(canonical, 0) + for alias, target in _MODALITY_ALIASES.items(): + if target == canonical: + count += mm_counts.get(alias, 0) + return count + + +def _normalize_mm_data_aliases(mm_data: MultiModalDataDict) -> MultiModalDataDict: + normalized: dict[str, Any] = {} + for modality, value in mm_data.items(): + canonical = _normalize_modality_name(modality) + if canonical in normalized and normalized[canonical] is not None and value is not None: + raise ValueError( + "Dynin received duplicate multimodal inputs for " + f"{canonical!r} via {modality!r}. 
" + "Provide either the canonical modality or its alias, not both." + ) + if canonical not in normalized or normalized[canonical] is None: + normalized[canonical] = value + return normalized + + +def _get_placeholder_text(modality: str) -> str | None: + modality = _normalize_modality_name(modality) + for base_modality, placeholder in _MODALITY_PLACEHOLDER_BY_NAME.items(): + if modality.startswith(base_modality): + return placeholder + return None + + +class DyninOmniProcessingInfo(BaseProcessingInfo): + def get_data_parser(self) -> MultiModalDataParser: + return DyninOmniMultiModalDataParser( + expected_hidden_size=self._get_expected_hidden_size(), + ) + + def get_supported_mm_limits(self) -> Mapping[str, int | None]: + limits = {modality: 1 for modality in _MODALITY_ORDER} + for alias, target in _MODALITY_ALIASES.items(): + if target in limits: + limits[alias] = limits[target] + return limits + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int] | None: + del seq_len, mm_counts + limits = {modality: 1 for modality in _MODALITY_ORDER} + for alias, target in _MODALITY_ALIASES.items(): + if target in limits: + limits[alias] = limits[target] + return limits + + +class DyninOmniDummyInputsBuilder(BaseDummyInputsBuilder[DyninOmniProcessingInfo]): + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + chunks: list[str] = [] + for modality in _MODALITY_ORDER: + placeholder = _get_placeholder_text(modality) + if placeholder is None: + continue + chunks.extend([placeholder] * _get_modality_count(mm_counts, modality)) + return " ".join(chunks) + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + mm_options: Mapping[str, BaseDummyOptions] | None = None, + ) -> MultiModalDataDict: + del seq_len + + mm_data: dict[str, Any] = {} + + num_images = _get_modality_count(mm_counts, "image") + if num_images > 0: + mm_data["image"] = self._get_dummy_images( + width=224, + height=224, + num_images=num_images, + overrides=mm_options.get("image") if mm_options else None, + ) + + num_videos = _get_modality_count(mm_counts, "video") + if num_videos > 0: + mm_data["video"] = self._get_dummy_videos( + width=224, + height=224, + num_frames=8, + num_videos=num_videos, + overrides=mm_options.get("video") if mm_options else None, + ) + + num_audios = _get_modality_count(mm_counts, "audio") + if num_audios > 0: + mm_data["audio"] = self._get_dummy_audios( + length=16000, + num_audios=num_audios, + overrides=mm_options.get("audio") if mm_options else None, + ) + + return mm_data + + +class DyninOmniMultiModalDataParser(MultiModalDataParser): + def parse_mm_data(self, mm_data: MultiModalDataDict) -> MultiModalDataItems: + normalized = _normalize_mm_data_aliases(mm_data) + mm_items = super().parse_mm_data(normalized) + + for alias, canonical in _MODALITY_ALIASES.items(): + if alias in mm_data and canonical in mm_items and alias not in mm_items: + mm_items[alias] = mm_items[canonical] + + return mm_items + + def _get_audio_with_sr(self, audio: Any) -> tuple[np.ndarray, float | None]: + audio_array, orig_sr = super()._get_audio_with_sr(audio) + if self.audio_resampler.target_sr is None: + return audio_array, None + return audio_array, orig_sr + + +class DyninOmniMultiModalProcessor(BaseMultiModalProcessor[DyninOmniProcessingInfo]): + @staticmethod + def _find_subsequence( + haystack: list[int], + needle: list[int], + start: int, + ) -> int | None: + if not needle: + return None + + max_start = len(haystack) - len(needle) + 
if max_start < start: + return None + + for idx in range(start, max_start + 1): + if haystack[idx : idx + len(needle)] == needle: + return idx + return None + + @staticmethod + def _make_disabled_embed_mask(length: int) -> torch.Tensor: + return torch.zeros(length, dtype=torch.bool) + + @staticmethod + def _encode_prompt_to_token_ids( + prompt: str | list[int], + tokenizer: Any | None, + ) -> list[int]: + if isinstance(prompt, str): + if tokenizer is None: + raise ValueError("Tokenizer is required to process string prompts for Dynin multimodal inputs.") + return tokenizer.encode(prompt, add_special_tokens=False) + return list(prompt) + + @staticmethod + def _ensure_non_empty_prompt_ids( + prompt_token_ids: list[int], + tokenizer: Any | None, + ) -> list[int]: + if prompt_token_ids: + return prompt_token_ids + + fallback_id = None + if tokenizer is not None: + fallback_id = getattr(tokenizer, "bos_token_id", None) + if fallback_id is None: + fallback_id = getattr(tokenizer, "eos_token_id", None) + if fallback_id is None: + fallback_id = getattr(tokenizer, "pad_token_id", None) + + return [0 if fallback_id is None else int(fallback_id)] + + @classmethod + def _image_to_chw_float_tensor(cls, image: Any) -> torch.Tensor: + if isinstance(image, torch.Tensor): + tensor = image.detach() + elif isinstance(image, np.ndarray): + tensor = torch.from_numpy(image) + elif PILImage is not None and isinstance(image, PILImage.Image): + tensor = torch.from_numpy(np.asarray(image).copy()) + else: + raise TypeError(f"Unsupported image item type: {type(image)!r}") + + if tensor.ndim == 2: + tensor = tensor.unsqueeze(-1) + if tensor.ndim != 3: + raise ValueError(f"Expected 3D image tensor, got shape={tuple(tensor.shape)}") + + if tensor.shape[-1] in (1, 3, 4) and tensor.shape[0] not in (1, 3, 4): + tensor = tensor.permute(2, 0, 1) + + if tensor.shape[0] == 1: + tensor = tensor.repeat(3, 1, 1) + if tensor.shape[0] == 4: + tensor = tensor[:3] + + tensor = tensor.to(dtype=torch.float32) + if tensor.numel() > 0 and torch.max(tensor) > 1.0: + tensor = tensor / 255.0 + return tensor.contiguous() + + @classmethod + def _video_to_tchw_float_tensor(cls, video: Any) -> torch.Tensor: + if isinstance(video, (list, tuple)) and not isinstance(video, torch.Tensor): + frames = [cls._image_to_chw_float_tensor(frame) for frame in video] + if not frames: + return torch.zeros((1, 3, 1, 1), dtype=torch.float32) + return torch.stack(frames, dim=0).contiguous() + + if isinstance(video, torch.Tensor): + tensor = video.detach() + elif isinstance(video, np.ndarray): + tensor = torch.from_numpy(video) + else: + raise TypeError(f"Unsupported video item type: {type(video)!r}") + + if tensor.ndim == 3: + return cls._image_to_chw_float_tensor(tensor).unsqueeze(0).contiguous() + + if tensor.ndim != 4: + raise ValueError(f"Expected 4D video tensor, got shape={tuple(tensor.shape)}") + + if tensor.shape[-1] in (1, 3, 4) and tensor.shape[1] not in (1, 3, 4): + tensor = tensor.permute(0, 3, 1, 2) + + if tensor.shape[1] == 1: + tensor = tensor.repeat(1, 3, 1, 1) + if tensor.shape[1] == 4: + tensor = tensor[:, :3] + + tensor = tensor.to(dtype=torch.float32) + if tensor.numel() > 0 and torch.max(tensor) > 1.0: + tensor = tensor / 255.0 + return tensor.contiguous() + + @staticmethod + def _audio_to_float_tensor(audio: Any) -> torch.Tensor: + if isinstance(audio, tuple) and len(audio) == 2: + audio = audio[0] + + if isinstance(audio, torch.Tensor): + tensor = audio.detach() + elif isinstance(audio, np.ndarray): + tensor = torch.from_numpy(audio) + 
else: + tensor = torch.as_tensor(audio) + + tensor = tensor.to(dtype=torch.float32).contiguous().view(-1) + if tensor.numel() == 0: + return torch.zeros((16000,), dtype=torch.float32) + + max_abs = torch.max(torch.abs(tensor)) + if max_abs > 1.0: + tensor = tensor / max_abs + + return tensor.contiguous() + + @classmethod + def _convert_modality_item(cls, modality: str, item: Any) -> torch.Tensor: + if modality == "image": + return cls._image_to_chw_float_tensor(item) + if modality == "video": + return cls._video_to_tchw_float_tensor(item) + if modality == "audio": + return cls._audio_to_float_tensor(item) + raise ValueError(f"Unsupported modality for Dynin processor: {modality}") + + def _build_modality_kwargs( + self, + modality: str, + modality_items: Sequence[Any], + ) -> Sequence[Any]: + modality = _normalize_modality_name(modality) + input_key = _MODALITY_INPUT_KEY_BY_NAME[modality] + tensor_items = [self._convert_modality_item(modality, item) for item in modality_items] + mm_kwargs = MultiModalKwargsItems.from_hf_inputs( + {input_key: tensor_items}, + {input_key: MultiModalFieldConfig.batched(modality)}, + ) + return mm_kwargs[modality] + + def _build_placeholder_ranges( + self, + *, + modality: str, + item_count: int, + prompt_token_ids: list[int], + tokenizer: Any | None, + search_start: int, + ) -> tuple[list[PlaceholderRange], int]: + ranges: list[PlaceholderRange] = [] + + for _ in range(item_count): + placeholder_text = _get_placeholder_text(modality) + placeholder_token_ids: list[int] = [] + + if placeholder_text and tokenizer is not None: + placeholder_token_ids = tokenizer.encode( + placeholder_text, + add_special_tokens=False, + ) + + found_offset = None + if placeholder_token_ids: + found_offset = self._find_subsequence( + prompt_token_ids, + placeholder_token_ids, + search_start, + ) + + if found_offset is None: + found_offset = min(search_start, len(prompt_token_ids) - 1) + placeholder_len = 1 + else: + placeholder_len = len(placeholder_token_ids) + + ranges.append( + PlaceholderRange( + offset=found_offset, + length=placeholder_len, + is_embed=self._make_disabled_embed_mask(placeholder_len), + ) + ) + search_start = found_offset + placeholder_len + + return ranges, search_start + + def _get_mm_fields_config( + self, + hf_inputs: Any, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + del hf_inputs, hf_processor_mm_kwargs + return {} + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + del mm_items, hf_processor_mm_kwargs, out_mm_kwargs + return [] + + def apply( + self, + inputs: ProcessorInputs, + timing_ctx: TimingContext, + ) -> MultiModalInputs: + prompt = inputs.prompt + mm_items = inputs.mm_data_items + + with timing_ctx.record("get_mm_hashes"): + mm_hashes = inputs.get_mm_hashes(self.info.model_id) + + tokenizer = self.info.ctx.tokenizer + prompt_token_ids = self._encode_prompt_to_token_ids(prompt, tokenizer) + prompt_token_ids = self._ensure_non_empty_prompt_ids(prompt_token_ids, tokenizer) + + mm_kwargs_by_modality: dict[str, Sequence[Any]] = {} + mm_placeholders: dict[str, list[PlaceholderRange]] = {} + search_start = 0 + mm_counts = mm_items.get_all_counts() + + for modality in _MODALITY_ORDER: + item_count = mm_counts.get(modality, 0) + if item_count <= 0: + continue + + modality_items = mm_items[modality].get_all() + if len(modality_items) != item_count: + raise RuntimeError( + 
f"Parsed {len(modality_items)} items but expected {item_count} for modality={modality!r}" + ) + + mm_kwargs_by_modality[modality] = self._build_modality_kwargs( + modality, + modality_items, + ) + + placeholder_ranges, search_start = self._build_placeholder_ranges( + modality=modality, + item_count=item_count, + prompt_token_ids=prompt_token_ids, + tokenizer=tokenizer, + search_start=search_start, + ) + mm_placeholders[modality] = placeholder_ranges + + return MultiModalInputs( + type="multimodal", + prompt_token_ids=prompt_token_ids, + mm_kwargs=MultiModalKwargsItems(mm_kwargs_by_modality), + mm_hashes=mm_hashes, + mm_placeholders=mm_placeholders, + ) + + +class DyninOmniStageBase(nn.Module): + stage_name = "Dynin stage" + + def make_empty_intermediate_tensors( + self, + batch_size: int, + dtype: torch.dtype, + device: torch.device, + ) -> IntermediateTensors: + del batch_size, dtype, device + return IntermediateTensors({}) + + def embed_input_ids( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Any = None, + is_multimodal: torch.Tensor | None = None, + **kwargs: Any, + ) -> torch.Tensor: + del multimodal_embeddings, is_multimodal, kwargs + return build_zero_input_embeddings( + input_ids=input_ids, + hidden_size=self.hidden_size, + stage_name=self.stage_name, + ) + + def load_weights( + self, + weights: Iterable[tuple[str, torch.Tensor]], + ) -> set[str]: + return {name for name, _ in weights} + + def compute_logits( + self, + hidden_states: torch.Tensor | OmniOutput, + sampling_metadata: Any = None, + ) -> torch.Tensor | None: + del hidden_states, sampling_metadata + return None + + +@MULTIMODAL_REGISTRY.register_processor( + DyninOmniMultiModalProcessor, + info=DyninOmniProcessingInfo, + dummy_inputs=DyninOmniDummyInputsBuilder, +) +class DyninOmniForConditionalGeneration(nn.Module, SupportsMultiModal): + supports_multimodal_raw_input_only = True + STAGE_ALIAS = { + "tokenizer": "token2text", + "token2token": "token2text", + "detok_text": "token2text", + "token2img": "token2image", + "token2wav": "token2audio", + "token2speech": "token2audio", + } + + STAGE_IMPL = { + "token2text": (".dynin_omni_token2text", "DyninOmniToken2Text"), + "token2image": (".dynin_omni_token2image", "DyninOmniToken2Image"), + "token2audio": (".dynin_omni_token2audio", "DyninOmniToken2Audio"), + } + + _STAGE_IMPL_CACHE: dict[str, type[nn.Module]] = {} + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> str | None: + del i + return _get_placeholder_text(modality) + + @classmethod + def _resolve_stage_impl_class(cls, model_stage: str) -> type[nn.Module]: + impl = cls._STAGE_IMPL_CACHE.get(model_stage) + if impl is not None: + return impl + + module_name, class_name = cls.STAGE_IMPL[model_stage] + module = import_module(module_name, package=__package__) + impl = getattr(module, class_name) + cls._STAGE_IMPL_CACHE[model_stage] = impl + return impl + + @classmethod + def _normalize_stage_name(cls, raw_stage: str) -> str: + normalized = cls.STAGE_ALIAS.get(raw_stage, raw_stage) + if normalized not in cls.STAGE_IMPL: + raise ValueError( + "Unsupported DYNIN omni model_stage: " + f"{raw_stage} (normalized={normalized}). 
" + f"Supported: {sorted(cls.STAGE_IMPL.keys())}" + ) + return normalized + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + raw_stage = str(getattr(vllm_config.model_config, "model_stage", "token2text")).lower() + self.model_stage = self._normalize_stage_name(raw_stage) + + impl_cls = self._resolve_stage_impl_class(self.model_stage) + self.impl = impl_cls(vllm_config=vllm_config, prefix=prefix) + self.model = self.impl + + self.has_preprocess = False + self.has_postprocess = False + self.have_multimodal_outputs = getattr(self.impl, "have_multimodal_outputs", True) + self.requires_raw_input_tokens = getattr(self.impl, "requires_raw_input_tokens", True) + self.language_model = self._resolve_language_model() + + def _resolve_language_model(self) -> Any | None: + if hasattr(self.impl, "get_language_model"): + language_model = self.impl.get_language_model() + if language_model is not None: + return language_model + + if hasattr(self.impl, "language_model"): + language_model = getattr(self.impl, "language_model") + if language_model is not None: + return language_model + + if self.model_stage == "token2text": + return getattr(self.impl, "model", None) + + return None + + def get_language_model(self) -> Any | None: + return self.language_model + + @cached_property + def sampler(self): + if hasattr(self.model, "sampler"): + return self.model.sampler + if self.language_model is not None and hasattr(self.language_model, "sampler"): + return self.language_model.sampler + return Sampler() + + def init_multi_modal(self, thinker_config: Any = None) -> None: + if hasattr(self.model, "init_multi_modal"): + self.model.init_multi_modal(thinker_config) + + def _collect_multimodal_inputs(self, **kwargs: Any) -> dict[str, Any]: + mm_inputs: dict[str, Any] = {} + for modality, aliases in _MODALITY_INPUT_ALIASES.items(): + for alias in aliases: + if alias in kwargs and kwargs[alias] is not None: + mm_inputs[modality] = kwargs[alias] + break + return mm_inputs + + def _normalize_loaded_weight_names( + self, + loaded: set[str], + expected_param_names: set[str], + ) -> set[str]: + if self.model_stage != "token2text": + return loaded + + normalized_loaded: set[str] = set() + prefixes = ("", "impl.", "impl.model.") + + for name in loaded: + for prefix in prefixes: + candidate = f"{prefix}{name}" if prefix else name + if candidate in expected_param_names: + normalized_loaded.add(candidate) + break + + if len(normalized_loaded) < len(expected_param_names): + normalized_loaded.update(expected_param_names) + + return normalized_loaded + + def forward( + self, + input_ids: torch.Tensor | None = None, + positions: torch.Tensor | None = None, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: Any, + ) -> OmniOutput: + return self.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **kwargs, + ) + + def make_empty_intermediate_tensors( + self, + batch_size: int, + dtype: torch.dtype, + device: torch.device, + ) -> IntermediateTensors: + return self.model.make_empty_intermediate_tensors(batch_size, dtype, device) + + def embed_input_ids( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Any = None, + is_multimodal: torch.Tensor | None = None, + **kwargs: Any, + ) -> torch.Tensor: + squeezed_batch = False + staged_input_ids = input_ids + + if input_ids.ndim == 0: + staged_input_ids = input_ids.view(1, 1) + squeezed_batch = True + 
elif input_ids.ndim == 1: + staged_input_ids = input_ids.unsqueeze(0) + squeezed_batch = True + + embeddings = self.model.embed_input_ids( + staged_input_ids, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + **kwargs, + ) + + if squeezed_batch and isinstance(embeddings, torch.Tensor): + if embeddings.ndim == 3 and embeddings.shape[0] == 1: + return embeddings.squeeze(0) + if embeddings.ndim == 2 and input_ids.ndim == 0 and embeddings.shape[0] == 1: + return embeddings + + return embeddings + + def embed_multimodal(self, **kwargs: Any) -> Any: + if hasattr(self.model, "embed_multimodal"): + return self.model.embed_multimodal(**kwargs) + + self._collect_multimodal_inputs(**kwargs) + return None + + def load_weights( + self, + weights: Iterable[tuple[str, torch.Tensor]], + ) -> set[str]: + loaded = self.model.load_weights(weights) + if loaded is None: + loaded = set() + + expected_param_names = {name for name, _ in self.named_parameters()} + if not expected_param_names: + return loaded + + return self._normalize_loaded_weight_names(loaded, expected_param_names) + + def compute_logits( + self, + hidden_states: torch.Tensor | OmniOutput, + sampling_metadata: Any = None, + ) -> torch.Tensor | None: + return self.model.compute_logits(hidden_states, sampling_metadata=sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> SamplerOutput | None: + if hasattr(self.model, "sample"): + return self.model.sample(logits, sampling_metadata) + if self.language_model is not None and hasattr(self.language_model, "sample"): + return self.language_model.sample(logits, sampling_metadata) + return None diff --git a/vllm_omni/model_executor/models/dynin_omni/dynin_omni_common.py b/vllm_omni/model_executor/models/dynin_omni/dynin_omni_common.py new file mode 100644 index 0000000000..6166d8615c --- /dev/null +++ b/vllm_omni/model_executor/models/dynin_omni/dynin_omni_common.py @@ -0,0 +1,1241 @@ +from __future__ import annotations + +import hashlib +import importlib.util +import os +import sys +import threading +import types +from collections.abc import Iterable +from dataclasses import dataclass +from enum import IntEnum +from functools import lru_cache +from pathlib import Path +from typing import Any + +import torch +from vllm.config import VllmConfig +from vllm.logger import init_logger + +logger = init_logger(__name__) + +try: + from huggingface_hub import snapshot_download +except Exception: # pragma: no cover + snapshot_download = None + + +class DetokTarget(IntEnum): + TEXT = 0 + AUDIO = 1 + IMAGE = 2 + + +TASK_TO_DETOK = { + "mmu": DetokTarget.TEXT, + "s2t": DetokTarget.TEXT, + "mmu_fast": DetokTarget.TEXT, + "mmu_fastdllm_v1": DetokTarget.TEXT, + "v2t": DetokTarget.TEXT, + "t2s": DetokTarget.AUDIO, + "t2s_mmu_like": DetokTarget.AUDIO, + "t2s_fixed": DetokTarget.AUDIO, + "s2s": DetokTarget.AUDIO, + "v2s": DetokTarget.AUDIO, + "t2i": DetokTarget.IMAGE, + "i2i": DetokTarget.IMAGE, + "ti2ti": DetokTarget.IMAGE, +} + +DEFAULT_VQ_IMAGE_SOURCE = "snu-aidas/magvitv2" +DEFAULT_VQ_AUDIO_SOURCE = "snu-aidas/emova_speech_tokenizer_vllm" +DEFAULT_MAGVIT_REMOTE_CODE_REPO = "snu-aidas/magvitv2" +DEFAULT_DYNIN_REMOTE_CODE_REPO = "snu-aidas/Dynin-Omni" +DYNIN_PROMPT_SOURCE_KEY = "dynin_prompt_source" +DYNIN_PROMPT_SOURCE_OFFLINE_PREBUILT = "offline_prebuilt" + +DYNIN_TASK_DEFAULT_RUNTIME = { + "t2t": ("mmu", "mmu", 0, "text"), + "t2i": ("t2i", "t2i_gen", 2, "image"), + "t2s": ("t2s_mmu_like", "t2s_gen", 1, "audio"), + "i2i": ("i2i", 
"i2i", 2, "image"), +} + +DYNIN_TASK_RUNTIME_FALLBACKS: dict[str, dict[str, Any]] = { + "t2t": { + "prompt_max_text_len": 1024, + "max_new_tokens": 1024, + "steps": 1024, + "block_length": 16, + "temperature": 0.0, + "cfg_scale": 0.0, + }, + "t2i": { + "prompt_max_text_len": 128, + "image_token_count": 1024, + "mask_token_id": 126336, + "codebook_size": 8192, + "timesteps": 20, + "guidance_scale": 3.5, + "temperature": 1.0, + }, + "i2i": { + "prompt_max_text_len": 128, + "mask_token_id": 126336, + "codebook_size": 8192, + "timesteps": 64, + "guidance_scale": 3.5, + "temperature": 1.0, + "image_resolution": 336, + "use_train_i2i_prompt": True, + }, + "t2s": { + "runtime_task": "t2s_mmu_like", + "prompting_task": "t2s_gen", + "prompt_max_text_len": 1024, + "t2s_token_length": 512, + "mask_token_id": 126336, + "codebook_size": 8192, + "audio_codebook_size": 4096, + "steps": 512, + "block_length": 128, + "temperature": 1.0, + "cfg_scale": 2.5, + "t2s_condition": "gender-female_emotion-neutral_speed-normal_pitch-normal", + }, +} + +DEFAULT_DYNIN_T2S_INSTRUCTION = "Please read the following text naturally." + +DYNIN_SPECIAL_TOKENS = ( + "<|soi|>", + "<|eoi|>", + "<|sov|>", + "<|eov|>", + "<|t2i|>", + "<|mmu|>", + "<|t2v|>", + "<|v2v|>", + "<|lvg|>", + "<|i2i|>", + "<|ti2ti|>", + "<|v2t|>", + "<|v2s|>", + "<|s2t|>", + "<|t2s|>", + "<|s2s|>", + "<|soa|>", + "<|eoa|>", +) + +_DYNIN_ONLINE_PROMPT_TOKEN_BY_TASK = { + "t2i": "<|t2i|>", + "i2i": "<|i2i|>", + "t2s": "<|t2s|>", +} + +_DYNIN_MODALITY_PLACEHOLDERS = ( + "<|soi|><|image|><|eoi|>", + "<|sov|><|video|><|eov|>", + "<|soa|><|audio|><|eoa|>", +) + +_DYNIN_CONFIG_CANDIDATE_RELPATHS = ( + "configs/dynin_omni.yaml", + "models/configs/dynin_omni.yaml", + "vllm_omni/model_executor/models/dynin_omni/configs/dynin_omni.yaml", + "vllm_omni/model_executor/stage_configs/dynin_omni.yaml", + "dynin_omni.yaml", +) + +_DYNIN_REMOTE_ALLOW_PATTERNS = ("*.py", "*.json", "*.yaml", "*.yml") + +_DYNIN_REMOTE_CACHE_LOCK = threading.Lock() +_DYNIN_REMOTE_PACKAGE_BY_SNAPSHOT: dict[str, str] = {} +_DYNIN_REMOTE_ATTR_CACHE: dict[tuple[str, str, str, str | None, bool], Any] = {} + + +@dataclass(frozen=True) +class DyninInferSources: + model_source: str + tokenizer_source: str + vq_image_source: str + vq_audio_source: str + model_local_files_only: bool + vq_image_local_files_only: bool + vq_audio_local_files_only: bool + config_path: str | None = None + + @property + def local_files_only(self) -> bool: + return self.model_local_files_only + + +@dataclass(frozen=True) +class RemoteCodeSettings: + default_repo: str + repo_env: str + revision_env: str + local_only_env: str + + +DYNIN_REMOTE_SETTINGS = RemoteCodeSettings( + default_repo=DEFAULT_DYNIN_REMOTE_CODE_REPO, + repo_env="DYNIN_REMOTE_CODE_REPO_ID", + revision_env="DYNIN_REMOTE_CODE_REVISION", + local_only_env="DYNIN_REMOTE_CODE_LOCAL_FILES_ONLY", +) + +MAGVIT_REMOTE_SETTINGS = RemoteCodeSettings( + default_repo=DEFAULT_MAGVIT_REMOTE_CODE_REPO, + repo_env="DYNIN_MAGVIT_REMOTE_CODE_REPO_ID", + revision_env="DYNIN_MAGVIT_REMOTE_CODE_REVISION", + local_only_env="DYNIN_MAGVIT_REMOTE_CODE_LOCAL_FILES_ONLY", +) + + +def unwrap_first_value(value: Any, default: Any = None) -> Any: + if value is None: + return default + if isinstance(value, list): + return default if not value else value[0] + if isinstance(value, torch.Tensor): + if value.numel() == 0: + return default + if value.numel() == 1: + return value.item() + return value + return value + + +def normalize_runtime_info(runtime_additional_information: Any) -> dict[str, 
Any]: + if isinstance(runtime_additional_information, list): + if not runtime_additional_information: + return {} + first = runtime_additional_information[0] + return first if isinstance(first, dict) else {} + if isinstance(runtime_additional_information, dict): + return runtime_additional_information + return {} + + +def logical_dynin_task(task: Any) -> str: + task_text = str(unwrap_first_value(task, "") or "").strip().lower() + if task_text in ("t2s", "t2s_mmu_like", "t2s_fixed"): + return "t2s" + if task_text in ("t2i", "i2i"): + return task_text + return "t2t" + + +def dynin_runtime_fallback(task: str, key: str, value: Any = None) -> Any: + if isinstance(value, str): + if value.strip() != "": + return value + elif value is not None: + return value + return DYNIN_TASK_RUNTIME_FALLBACKS.get(task, {}).get(key) + + +def coerce_token_ids_1d( + value: Any, + ref_device: torch.device | None = None, +) -> torch.Tensor: + if isinstance(value, tuple): + value = value[0] + + if isinstance(value, list): + if not value: + device = ref_device or torch.device("cpu") + return torch.empty(0, dtype=torch.long, device=device) + if isinstance(value[0], torch.Tensor): + value = value[0] + else: + value = torch.tensor( + value[0] if isinstance(value[0], list) else value, + dtype=torch.long, + ) + + if not isinstance(value, torch.Tensor): + value = torch.tensor(value, dtype=torch.long) + + if value.ndim == 0: + value = value.unsqueeze(0) + if value.ndim > 1: + value = value[0] + + if ref_device is not None and value.device != ref_device: + value = value.to(ref_device) + + return value.to(dtype=torch.long).contiguous() + + +def _first_positive_int(value: Any) -> int | None: + if value is None: + return None + if isinstance(value, torch.Tensor): + if value.numel() != 1: + return None + value = value.item() + try: + value = int(value) + except (TypeError, ValueError): + return None + return value if value > 0 else None + + +def resolve_hidden_size( + *, + vllm_config: VllmConfig, + model: Any | None = None, + default: int = 1024, +) -> int: + if model is not None: + try: + embeddings = model.get_input_embeddings() + weight = getattr(embeddings, "weight", None) + if isinstance(weight, torch.Tensor) and weight.ndim >= 2: + hidden_size = _first_positive_int(weight.shape[-1]) + if hidden_size is not None: + return hidden_size + except Exception: + pass + + model_cfg = getattr(model, "config", None) + for key in ("hidden_size", "d_model", "n_embd", "dim", "model_dim", "embed_dim"): + hidden_size = _first_positive_int(getattr(model_cfg, key, None)) + if hidden_size is not None: + return hidden_size + + for config_obj in ( + getattr(vllm_config.model_config, "hf_config", None), + getattr(vllm_config.model_config, "hf_text_config", None), + ): + if config_obj is None: + continue + for key in ("hidden_size", "d_model", "n_embd", "dim", "model_dim", "embed_dim"): + value = config_obj.get(key) if isinstance(config_obj, dict) else getattr(config_obj, key, None) + hidden_size = _first_positive_int(value) + if hidden_size is not None: + return hidden_size + + return default + + +def build_zero_input_embeddings( + *, + input_ids: torch.Tensor, + hidden_size: int, + stage_name: str, + dtype: torch.dtype = torch.bfloat16, +) -> torch.Tensor: + if input_ids.ndim == 0: + shape = (1, hidden_size) + elif input_ids.ndim == 1: + shape = (input_ids.shape[0], hidden_size) + elif input_ids.ndim == 2: + shape = (input_ids.shape[0], input_ids.shape[1], hidden_size) + else: + raise ValueError(f"Unsupported input_ids rank for {stage_name}: 
{input_ids.ndim}") + return torch.zeros(shape, dtype=dtype, device=input_ids.device) + + +def _to_bool(value: Any, default: bool = False) -> bool: + if value is None: + return default + if isinstance(value, bool): + return value + if isinstance(value, (int, float)): + return bool(value) + + text = str(value).strip().lower() + if text in ("1", "true", "yes", "y", "on"): + return True + if text in ("0", "false", "no", "n", "off", "", "none", "null"): + return False + return default + + +def _runtime_value(runtime_info: dict[str, Any], key: str) -> Any: + return unwrap_first_value(runtime_info.get(key), None) + + +def _runtime_first_value(runtime_info: dict[str, Any], keys: tuple[str, ...]) -> Any: + for key in keys: + value = _runtime_value(runtime_info, key) + if value is not None: + return value + return None + + +def _node_value(node: Any, key: str, default: Any = None) -> Any: + if node is None: + return default + if isinstance(node, dict): + return node.get(key, default) + try: + return node.get(key, default) + except Exception: + return getattr(node, key, default) + + +def _looks_like_hf_repo_id(value: str | None) -> bool: + if not isinstance(value, str): + return False + if value.count("/") != 1: + return False + org, name = value.split("/", 1) + return bool(org and name) + + +def _find_dynin_config_under_root(root: Path) -> Path | None: + for rel_path in _DYNIN_CONFIG_CANDIDATE_RELPATHS: + candidate = root.expanduser() / rel_path + if candidate.exists(): + return candidate.resolve() + return None + + +@lru_cache(maxsize=16) +def _resolve_dynin_config_from_hf_repo(repo_id: str) -> str | None: + if not _looks_like_hf_repo_id(repo_id) or snapshot_download is None: + return None + + try: + snapshot_dir = ( + Path( + snapshot_download( + repo_id=repo_id, + repo_type="model", + allow_patterns=list(_DYNIN_CONFIG_CANDIDATE_RELPATHS), + local_files_only=True, + ) + ) + .expanduser() + .resolve() + ) + except Exception: + return None + + found = _find_dynin_config_under_root(snapshot_dir) + return str(found) if found is not None else None + + +def _resolve_existing_path(path_like: Any, source_name: str) -> str | None: + if path_like is None: + return None + text = str(path_like).strip() + if not text: + return None + + path = Path(text).expanduser() + if path.is_file(): + return str(path.resolve()) + + logger.warning( + "DYNIN config path from %s does not exist: %s. 
Falling back to auto-discovery.", + source_name, + path, + ) + return None + + +def _resolve_config_path(vllm_config: VllmConfig, runtime_info: dict[str, Any]) -> str | None: + for value, name in ( + (_runtime_value(runtime_info, "dynin_config_path"), "runtime_info.dynin_config_path"), + (os.getenv("DYNIN_CONFIG_PATH"), "DYNIN_CONFIG_PATH"), + (getattr(vllm_config.model_config, "dynin_config_path", None), "vllm_config.model_config.dynin_config_path"), + ): + resolved = _resolve_existing_path(value, name) + if resolved: + return resolved + + model_source = str(getattr(vllm_config.model_config, "model", "") or "") + tokenizer_source = str(getattr(vllm_config.model_config, "tokenizer", "") or "") + hf_config = getattr(vllm_config.model_config, "hf_config", None) + hf_name_or_path = ( + hf_config.get("_name_or_path") if isinstance(hf_config, dict) else getattr(hf_config, "_name_or_path", None) + ) + + hf_repo_candidates: list[str] = [] + for source in (model_source, tokenizer_source, hf_name_or_path): + if not _looks_like_hf_repo_id(source): + continue + source = str(source) + if source not in hf_repo_candidates: + hf_repo_candidates.append(source) + + for source in hf_repo_candidates: + resolved = _resolve_dynin_config_from_hf_repo(source) + if resolved is not None: + logger.info("Resolved dynin config from Hugging Face cache for %s: %s", source, resolved) + return resolved + + for source in (model_source, tokenizer_source): + source_path = Path(source).expanduser() + if source_path.is_dir(): + found = _find_dynin_config_under_root(source_path) + if found is not None: + return str(found) + + module_root = Path(__file__).resolve().parent + for bundled in ( + module_root / "configs" / "dynin_omni.yaml", + module_root / "models" / "configs" / "dynin_omni.yaml", + module_root.parent / "stage_configs" / "dynin_omni.yaml", + ): + if bundled.exists(): + return str(bundled) + + return None + + +@lru_cache(maxsize=16) +def _load_omega_config(config_path: str) -> Any: + try: + from omegaconf import OmegaConf + except ImportError as e: + raise ImportError( + f"omegaconf is required to load Dynin config files. 
Install it to read config: {config_path}" + ) from e + return OmegaConf.load(config_path) + + +def resolve_dynin_infer_sources( + *, + vllm_config: VllmConfig, + runtime_info: dict[str, Any] | None = None, +) -> DyninInferSources: + runtime_info = runtime_info or {} + + base_model_source = str(getattr(vllm_config.model_config, "model", "")) + base_model_path = Path(base_model_source).expanduser() + local_vllm_model_source = str(base_model_path) if base_model_path.is_dir() else None + + model_source = base_model_source + tokenizer_source = model_source + vq_image_source = DEFAULT_VQ_IMAGE_SOURCE + vq_audio_source = DEFAULT_VQ_AUDIO_SOURCE + model_local_files_only = False + vq_image_local_files_only = False + vq_audio_local_files_only = False + + resolver_source: str | None = base_model_source if base_model_source else None + resolver_local_files_only: bool | None = True if base_model_path.is_dir() else None + resolve_model_pretrained_source_fn = get_dynin_config_resolver_attr( + "resolve_model_pretrained_source", + source=resolver_source, + local_files_only=resolver_local_files_only, + ) + resolve_tokenizer_source_fn = get_dynin_config_resolver_attr( + "resolve_tokenizer_source", + source=resolver_source, + local_files_only=resolver_local_files_only, + ) + resolve_model_local_files_only_fn = get_dynin_config_resolver_attr( + "resolve_model_local_files_only", + source=resolver_source, + local_files_only=resolver_local_files_only, + ) + resolve_vq_cfg_block_fn = get_dynin_config_resolver_attr( + "resolve_vq_cfg_block", + source=resolver_source, + local_files_only=resolver_local_files_only, + ) + resolve_vq_repo_source_fn = get_dynin_config_resolver_attr( + "resolve_vq_repo_source", + source=resolver_source, + local_files_only=resolver_local_files_only, + ) + + config_path = _resolve_config_path(vllm_config, runtime_info) + if config_path: + config_file = Path(config_path).expanduser() + if config_file.exists(): + try: + dynin_cfg = _load_omega_config(str(config_file)) + model_source = resolve_model_pretrained_source_fn( + dynin_cfg, + default=model_source, + ) + tokenizer_source = resolve_tokenizer_source_fn( + dynin_cfg, + default=tokenizer_source, + ) + model_local_files_only = resolve_model_local_files_only_fn( + dynin_cfg, + default=model_local_files_only, + ) + vq_image_cfg = resolve_vq_cfg_block_fn(dynin_cfg, modality="image") + vq_audio_cfg = resolve_vq_cfg_block_fn(dynin_cfg, modality="audio") + vq_image_source = resolve_vq_repo_source_fn( + vq_image_cfg, + default=vq_image_source, + ) + vq_audio_source = resolve_vq_repo_source_fn( + vq_audio_cfg, + default=vq_audio_source, + ) + vq_image_local_files_only = _to_bool( + _node_value(vq_image_cfg, "local_files_only", None), + default=model_local_files_only, + ) + vq_audio_local_files_only = _to_bool( + _node_value(vq_audio_cfg, "local_files_only", None), + default=model_local_files_only, + ) + except Exception as e: + logger.warning( + "Failed to resolve DYNIN inference config from %s: %s", + config_file, + e, + ) + else: + logger.warning("DYNIN config path does not exist: %s", config_file) + + runtime_model_source = _runtime_value(runtime_info, "dynin_model_path") + if runtime_model_source: + model_source = str(runtime_model_source) + + runtime_tokenizer_source = _runtime_value(runtime_info, "tokenizer_path") + if runtime_tokenizer_source: + tokenizer_source = str(runtime_tokenizer_source) + + runtime_vq_image_source = _runtime_value(runtime_info, "vq_model_image_path") + if runtime_vq_image_source is None: + runtime_vq_image_source = 
_runtime_value(runtime_info, "vq_model_path_image") + if runtime_vq_image_source: + vq_image_source = str(runtime_vq_image_source) + + runtime_vq_audio_source = _runtime_value(runtime_info, "vq_model_audio_path") + if runtime_vq_audio_source is None: + runtime_vq_audio_source = _runtime_value(runtime_info, "vq_model_path_audio") + if runtime_vq_audio_source: + vq_audio_source = str(runtime_vq_audio_source) + + runtime_local_global = _runtime_value(runtime_info, "local_files_only") + runtime_local_model = _runtime_first_value( + runtime_info, + ("model_local_files_only", "local_files_only_model"), + ) + runtime_local_vq_image = _runtime_first_value( + runtime_info, + ("vq_model_image_local_files_only", "local_files_only_vq_image"), + ) + runtime_local_vq_audio = _runtime_first_value( + runtime_info, + ("vq_model_audio_local_files_only", "local_files_only_vq_audio"), + ) + + if runtime_local_global is not None: + global_local = _to_bool(runtime_local_global, default=False) + if runtime_local_model is None: + model_local_files_only = global_local + if runtime_local_vq_image is None: + vq_image_local_files_only = global_local + if runtime_local_vq_audio is None: + vq_audio_local_files_only = global_local + + if runtime_local_model is not None: + model_local_files_only = _to_bool( + runtime_local_model, + default=model_local_files_only, + ) + if runtime_local_vq_image is not None: + vq_image_local_files_only = _to_bool( + runtime_local_vq_image, + default=vq_image_local_files_only, + ) + if runtime_local_vq_audio is not None: + vq_audio_local_files_only = _to_bool( + runtime_local_vq_audio, + default=vq_audio_local_files_only, + ) + + if runtime_local_global is None and runtime_local_model is None and local_vllm_model_source is not None: + model_local_files_only = True + + if local_vllm_model_source is not None: + if not runtime_model_source: + if model_source != local_vllm_model_source: + logger.info( + "DYNIN infer model source overridden to local vLLM model path: %s (from %s)", + local_vllm_model_source, + model_source, + ) + model_source = local_vllm_model_source + if not runtime_tokenizer_source: + tokenizer_source = local_vllm_model_source + + return DyninInferSources( + model_source=model_source, + tokenizer_source=tokenizer_source, + vq_image_source=vq_image_source, + vq_audio_source=vq_audio_source, + model_local_files_only=model_local_files_only, + vq_image_local_files_only=vq_image_local_files_only, + vq_audio_local_files_only=vq_audio_local_files_only, + config_path=config_path, + ) + + +def _resolve_remote_source(source: str | None, settings: RemoteCodeSettings) -> str: + if isinstance(source, str): + stripped = source.strip() + if stripped: + source_path = Path(stripped).expanduser() + if source_path.is_dir(): + return str(source_path.resolve()) + if _looks_like_hf_repo_id(stripped): + return stripped + + env_repo = os.getenv(settings.repo_env) + if _looks_like_hf_repo_id(env_repo): + return str(env_repo).strip() + + return settings.default_repo + + +def _resolve_remote_revision(revision: str | None, settings: RemoteCodeSettings) -> str | None: + if isinstance(revision, str) and revision.strip(): + return revision.strip() + env_revision = os.getenv(settings.revision_env) + if isinstance(env_revision, str) and env_revision.strip(): + return env_revision.strip() + return None + + +def _resolve_remote_local_only(local_files_only: bool | None, settings: RemoteCodeSettings) -> bool: + if local_files_only is not None: + return bool(local_files_only) + return 
_to_bool(os.getenv(settings.local_only_env), default=False) + + +def _resolve_remote_snapshot_dir( + *, + source: str, + revision: str | None, + local_files_only: bool, +) -> str: + source_path = Path(source).expanduser() + if source_path.is_dir(): + return str(source_path.resolve()) + + if snapshot_download is None: + raise RuntimeError("huggingface_hub is required to load remote code.") + + kwargs: dict[str, Any] = { + "repo_id": source, + "repo_type": "model", + "allow_patterns": list(_DYNIN_REMOTE_ALLOW_PATTERNS), + "local_files_only": bool(local_files_only), + } + if revision is not None: + kwargs["revision"] = revision + + try: + return str(snapshot_download(**kwargs)) + except TypeError: + kwargs.pop("local_files_only", None) + return str(snapshot_download(**kwargs)) + + +def _ensure_remote_package(snapshot_dir: str) -> str: + with _DYNIN_REMOTE_CACHE_LOCK: + existing = _DYNIN_REMOTE_PACKAGE_BY_SNAPSHOT.get(snapshot_dir) + if existing is not None: + return existing + + digest = hashlib.sha1(snapshot_dir.encode("utf-8")).hexdigest()[:12] + package_name = f"_dynin_hf_remote_{digest}" + + package = types.ModuleType(package_name) + package.__path__ = [snapshot_dir] # type: ignore[attr-defined] + package.__file__ = str(Path(snapshot_dir) / "__init__.py") + + sys.modules.setdefault(package_name, package) + _DYNIN_REMOTE_PACKAGE_BY_SNAPSHOT[snapshot_dir] = package_name + return package_name + + +def _load_remote_module( + *, + module_name: str, + source: str, + revision: str | None, + local_files_only: bool, +): + snapshot_dir = _resolve_remote_snapshot_dir( + source=source, + revision=revision, + local_files_only=local_files_only, + ) + + module_path = Path(snapshot_dir) / f"{module_name}.py" + if not module_path.is_file(): + raise ImportError(f"Remote code module '{module_name}.py' not found under '{snapshot_dir}'. 
source={source!r}") + + package_name = _ensure_remote_package(snapshot_dir) + full_name = f"{package_name}.{module_name}" + + existing = sys.modules.get(full_name) + if existing is not None: + return existing + + spec = importlib.util.spec_from_file_location(full_name, module_path) + if spec is None or spec.loader is None: + raise ImportError(f"Failed to create import spec for '{module_path}'.") + + module = importlib.util.module_from_spec(spec) + module.__package__ = package_name + sys.modules[full_name] = module + try: + spec.loader.exec_module(module) + except Exception: + sys.modules.pop(full_name, None) + raise + return module + + +def resolve_remote_attr( + attr_name: str, + *, + module_name: str, + settings: RemoteCodeSettings, + source: str | None = None, + revision: str | None = None, + local_files_only: bool | None = None, + fallback_module_names: Iterable[str] = (), + optional: bool = False, +) -> Any | None: + resolved_source = _resolve_remote_source(source, settings) + resolved_revision = _resolve_remote_revision(revision, settings) + resolved_local_only = _resolve_remote_local_only(local_files_only, settings) + + module_candidates = [module_name, *[m for m in fallback_module_names if m and m != module_name]] + last_error: Exception | None = None + + for candidate in module_candidates: + cache_key = (attr_name, candidate, resolved_source, resolved_revision, resolved_local_only) + cached = _DYNIN_REMOTE_ATTR_CACHE.get(cache_key) + if cached is not None: + return cached + + try: + module = _load_remote_module( + module_name=candidate, + source=resolved_source, + revision=resolved_revision, + local_files_only=resolved_local_only, + ) + if hasattr(module, attr_name): + value = getattr(module, attr_name) + _DYNIN_REMOTE_ATTR_CACHE[cache_key] = value + return value + except Exception as e: + last_error = e + + if optional: + if last_error is not None: + logger.debug( + "Optional remote attr not found: attr=%s source=%s revision=%s err=%s", + attr_name, + resolved_source, + resolved_revision, + last_error, + ) + return None + + raise ImportError( + f"Failed to resolve '{attr_name}' from remote code " + f"(source={resolved_source!r}, revision={resolved_revision!r}, modules={module_candidates})." 
+ ) from last_error + + +_DYNIN_MODELING_REMOTE_EXPORTS = { + "DyninOmniConfig": "DyninOmniConfig", + "DyninOmniModelLM": "DyninOmniModelLM", + "VideoTokenMerger": "VideoTokenMerger", +} + +_DYNIN_SAMPLING_REMOTE_EXPORTS = { + "log": "log", + "gumbel_noise": "gumbel_noise", + "gumbel_sample": "gumbel_sample", + "top_k": "top_k", + "mask_by_random_topk": "mask_by_random_topk", + "cosine_schedule": "cosine_schedule", + "linear_schedule": "linear_schedule", + "pow": "pow", + "sigmoid_schedule": "sigmoid_schedule", + "get_mask_schedule": "get_mask_schedule", + "top_k_top_p_filtering": "top_k_top_p_filtering", +} + +_DYNIN_CONFIG_RESOLVER_REMOTE_EXPORTS = { + "resolve_model_pretrained_source": "resolve_model_pretrained_source", + "resolve_tokenizer_source": "resolve_tokenizer_source", + "resolve_model_local_files_only": "resolve_model_local_files_only", + "resolve_vq_cfg_block": "resolve_vq_cfg_block", + "resolve_vq_repo_source": "resolve_vq_repo_source", +} + +_DYNIN_MAGVIT_REMOTE_EXPORTS = { + "VQGANEncoder": "VQGANEncoder", + "VQGANDecoder": "VQGANDecoder", + "LFQuantizer": "LFQuantizer", + "MAGVITv2": "MAGVITv2", +} + + +def _get_export_attr( + name: str, + export_map: dict[str, str], + *, + module_name: str, + settings: RemoteCodeSettings, + source: str | None = None, + revision: str | None = None, + local_files_only: bool | None = None, + optional: bool = False, +) -> Any | None: + attr_name = export_map.get(name) + if attr_name is None: + raise AttributeError(f"Unsupported export: {name!r}") + + return resolve_remote_attr( + attr_name, + module_name=module_name, + settings=settings, + source=source, + revision=revision, + local_files_only=local_files_only, + optional=optional, + ) + + +def get_dynin_modeling_attr(name: str) -> Any: + return _get_export_attr( + name, + _DYNIN_MODELING_REMOTE_EXPORTS, + module_name="modeling_dynin_omni", + settings=DYNIN_REMOTE_SETTINGS, + ) + + +def get_dynin_sampling_attr(name: str) -> Any: + return _get_export_attr( + name, + _DYNIN_SAMPLING_REMOTE_EXPORTS, + module_name="sampling", + settings=DYNIN_REMOTE_SETTINGS, + ) + + +def get_dynin_config_resolver_attr( + name: str, + *, + source: str | None = None, + revision: str | None = None, + local_files_only: bool | None = None, +) -> Any: + attr_name = _DYNIN_CONFIG_RESOLVER_REMOTE_EXPORTS.get(name) + if attr_name is None: + raise AttributeError(f"Unsupported Dynin config_resolver export: {name!r}") + + if source is not None: + value = resolve_remote_attr( + attr_name, + module_name="config_resolver", + settings=DYNIN_REMOTE_SETTINGS, + source=source, + revision=revision, + local_files_only=local_files_only, + optional=True, + ) + if value is not None: + return value + + return resolve_remote_attr( + attr_name, + module_name="config_resolver", + settings=DYNIN_REMOTE_SETTINGS, + source=DEFAULT_DYNIN_REMOTE_CODE_REPO, + revision=revision, + local_files_only=local_files_only, + optional=False, + ) + + +def get_dynin_magvit_attr( + name: str, + *, + source: str | None = None, + revision: str | None = None, + local_files_only: bool | None = None, +) -> Any: + attr_name = _DYNIN_MAGVIT_REMOTE_EXPORTS.get(name) + if attr_name is None: + raise AttributeError(f"Unsupported Dynin MAGVIT export: {name!r}") + + value = resolve_remote_attr( + attr_name, + module_name="modeling_magvitv2", + settings=MAGVIT_REMOTE_SETTINGS, + source=source, + revision=revision, + local_files_only=local_files_only, + optional=True, + ) + if value is not None: + return value + + resolved_source = _resolve_remote_source(source, 
MAGVIT_REMOTE_SETTINGS) + resolved_revision = _resolve_remote_revision(revision, MAGVIT_REMOTE_SETTINGS) + resolved_local_only = _resolve_remote_local_only(local_files_only, MAGVIT_REMOTE_SETTINGS) + + if resolved_source != DEFAULT_MAGVIT_REMOTE_CODE_REPO: + return resolve_remote_attr( + attr_name, + module_name="modeling_magvitv2", + settings=MAGVIT_REMOTE_SETTINGS, + source=DEFAULT_MAGVIT_REMOTE_CODE_REPO, + revision=resolved_revision, + local_files_only=resolved_local_only, + optional=False, + ) + + raise ImportError( + f"Failed to resolve MAGVIT attr '{attr_name}' from source={resolved_source!r} (revision={resolved_revision!r})." + ) + + +def build_dynin_chat_prompt(content: str) -> str: + return ( + f"<|start_header_id|>user<|end_header_id|>\n{content}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n" + ) + + +def extract_dynin_user_prompt_text(decoded_prompt: str) -> str: + text = str(decoded_prompt or "") + assistant_marker = "<|start_header_id|>assistant<|end_header_id|>" + user_marker = "<|start_header_id|>user<|end_header_id|>" + end_header_marker = "<|end_header_id|>" + eot_marker = "<|eot_id|>" + + if assistant_marker in text: + text = text.rsplit(assistant_marker, 1)[0] + if eot_marker in text: + text = text.rsplit(eot_marker, 1)[0] + if user_marker in text: + text = text.rsplit(user_marker, 1)[-1] + if end_header_marker in text: + text = text.split(end_header_marker, 1)[-1] + return text.strip() + + +def normalize_dynin_online_prompt_text(task: str, decoded_prompt: str) -> str: + text = extract_dynin_user_prompt_text(decoded_prompt) + if not text: + text = str(decoded_prompt or "") + + for placeholder in _DYNIN_MODALITY_PLACEHOLDERS: + text = text.replace(placeholder, " ") + + task_token = _DYNIN_ONLINE_PROMPT_TOKEN_BY_TASK.get(task) + if task_token: + text = text.replace(task_token, " ", 1) + + text = " ".join(text.split()).strip() + + if task == "t2s": + if not text: + text = "Hello. This is a default text-to-speech sample." + text = build_dynin_chat_prompt(f"{DEFAULT_DYNIN_T2S_INSTRUCTION}\n{text}") + elif task in {"t2i", "i2i"} and not text: + text = "A high quality detailed image." 
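+    # Illustrative normalization (hypothetical inputs, not from a real request):
+    #   task="t2s", decoded_prompt="<|t2s|> Hello there"
+    #     -> the task token is stripped, then the default T2S instruction and
+    #        "Hello there" are wrapped in the chat template via build_dynin_chat_prompt.
+    #   task="t2i", decoded_prompt="<|t2i|>"
+    #     -> empty text falls back to the generic image caption above.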
+ + return text + + +def infer_dynin_online_task( + *, + decoded_prompt: str, + has_image: bool = False, + has_audio: bool = False, + has_video: bool = False, +) -> str: + prompt = str(decoded_prompt or "") + if "<|i2i|>" in prompt: + return "i2i" + if "<|t2i|>" in prompt and not has_audio and not has_video: + return "t2i" + if "<|t2s|>" in prompt and not has_audio and not has_video: + return "t2s" + return "t2t" + + +def build_dynin_prompt_payload( + *, + task: str, + text: str, + image_tokens: torch.Tensor | None, + image_placeholder_tokens: int, + audio_placeholder_tokens: int, + image_token_offset: int, + mask_token_id: int, + use_train_i2i_prompt: bool, +) -> tuple[Any, str]: + _, prompting_task, _, _ = DYNIN_TASK_DEFAULT_RUNTIME[task] + + if task == "t2t": + payload = ([[]], [build_dynin_chat_prompt(text)]) + return payload, prompting_task + + if task == "t2i": + image_placeholder = torch.full( + (1, int(image_placeholder_tokens)), + fill_value=int(mask_token_id), + dtype=torch.long, + ) + payload = ([text], image_placeholder) + return payload, prompting_task + + if task == "i2i": + if image_tokens is None: + raise ValueError("i2i requires image tokens") + src = image_tokens.view(1, -1).long() + int(image_token_offset) + target_len = int(image_placeholder_tokens) if image_placeholder_tokens > 0 else int(src.shape[1]) + image_placeholder = torch.full( + (1, target_len), + fill_value=int(mask_token_id), + dtype=torch.long, + ) + if use_train_i2i_prompt: + labels_placeholder = torch.full( + (1, target_len), + fill_value=-100, + dtype=torch.long, + ) + payload = ([text], src, image_placeholder, labels_placeholder) + return payload, "i2i" + payload = ([text], src, image_placeholder) + return payload, "i2i_gen" + + if task == "t2s": + audio_placeholder = torch.full( + (1, int(audio_placeholder_tokens)), + fill_value=int(mask_token_id), + dtype=torch.long, + ) + payload = ([text], audio_placeholder) + return payload, prompting_task + + raise ValueError(f"Unsupported Dynin online bootstrap task: {task}") + + +def _wrap_runtime_field(value: Any) -> list[Any]: + return [value] + + +def build_dynin_online_runtime_info( + *, + task: str, + text_vocab_size: int, + infer_sources: DyninInferSources, + dynin_config_path: str | None = None, + prompting_input: Any | None = None, + attention_mask: list[int] | None = None, + prompt_length: int | None = None, + uncond_prompting_input: Any | None = None, + image_token_count: int = 0, + t2s_token_length: int | None = None, + use_train_i2i_prompt: bool | None = None, +) -> dict[str, Any]: + runtime_task, prompting_task, detok_id, _ = DYNIN_TASK_DEFAULT_RUNTIME[task] + + prompt_max_text_len = int(dynin_runtime_fallback(task, "prompt_max_text_len", None) or 1024) + max_new_tokens = int(dynin_runtime_fallback(task, "max_new_tokens", None) or 256) + steps = int(dynin_runtime_fallback(task, "steps", None) or 256) + block_length = int(dynin_runtime_fallback(task, "block_length", None) or 2) + temperature = float(dynin_runtime_fallback(task, "temperature", None) or 0.0) + cfg_scale = float(dynin_runtime_fallback(task, "cfg_scale", None) or 0.0) + remasking = str(dynin_runtime_fallback(task, "remasking", None) or "low_confidence") + timesteps = int(dynin_runtime_fallback(task, "timesteps", None) or 20) + guidance_scale = float(dynin_runtime_fallback(task, "guidance_scale", None) or 0.0) + mask_token_id = int(dynin_runtime_fallback(task, "mask_token_id", None) or 126336) + codebook_size = int(dynin_runtime_fallback(task, "codebook_size", None) or 8192) + 
audio_codebook_size = int(dynin_runtime_fallback(task, "audio_codebook_size", None) or 4096) + image_resolution = int(dynin_runtime_fallback(task, "image_resolution", None) or 336) + if image_token_count <= 0 and task in {"t2i", "i2i"}: + fallback_count = dynin_runtime_fallback(task, "image_token_count", None) + if fallback_count is not None: + image_token_count = int(fallback_count) + else: + image_token_count = max(1, (image_resolution // 16) ** 2) + + if t2s_token_length is None: + t2s_token_length = int(dynin_runtime_fallback(task, "t2s_token_length", None) or 383) + t2s_condition = str( + dynin_runtime_fallback( + task, + "t2s_condition", + None, + ) + or "gender-female_emotion-neutral_speed-normal_pitch-normal" + ) + if use_train_i2i_prompt is None: + use_train_i2i_prompt = bool(dynin_runtime_fallback(task, "use_train_i2i_prompt", task == "i2i")) + + runtime_info: dict[str, Any] = { + "task": _wrap_runtime_field(runtime_task), + "prompting_task": _wrap_runtime_field(prompting_task), + "detok_id": _wrap_runtime_field(int(detok_id)), + "prompt_max_text_len": _wrap_runtime_field(prompt_max_text_len), + "prompting_max_text_len": _wrap_runtime_field(prompt_max_text_len), + "cond_dropout_prob": _wrap_runtime_field(0.0), + "prompting_cond_dropout_prob": _wrap_runtime_field(0.0), + "tokenizer_path": _wrap_runtime_field(str(infer_sources.tokenizer_source)), + "text_vocab_size": _wrap_runtime_field(int(text_vocab_size)), + "model_local_files_only": _wrap_runtime_field(bool(infer_sources.model_local_files_only)), + "max_new_tokens": _wrap_runtime_field(int(t2s_token_length if task == "t2s" else max_new_tokens)), + "steps": _wrap_runtime_field(steps), + "block_length": _wrap_runtime_field(block_length), + "temperature": _wrap_runtime_field(temperature), + "cfg_scale": _wrap_runtime_field(cfg_scale), + "remasking": _wrap_runtime_field(remasking), + "mask_id": _wrap_runtime_field(mask_token_id), + "mask_token_id": _wrap_runtime_field(mask_token_id), + "codebook_size": _wrap_runtime_field(codebook_size), + "audio_codebook_size": _wrap_runtime_field(audio_codebook_size), + "timesteps": _wrap_runtime_field(timesteps), + "guidance_scale": _wrap_runtime_field(guidance_scale), + "noise_type": _wrap_runtime_field("mask"), + "noise_schedule_name": _wrap_runtime_field("cosine"), + "noise_schedule_params": _wrap_runtime_field({}), + "seq_len": _wrap_runtime_field(int(image_token_count)), + "condition": _wrap_runtime_field(t2s_condition), + "t2s_condition": _wrap_runtime_field(t2s_condition), + "vq_model_image_path": _wrap_runtime_field(str(infer_sources.vq_image_source)), + "vq_model_image_local_files_only": _wrap_runtime_field(bool(infer_sources.vq_image_local_files_only)), + "vq_model_audio_path": _wrap_runtime_field(str(infer_sources.vq_audio_source)), + "vq_model_audio_local_files_only": _wrap_runtime_field(bool(infer_sources.vq_audio_local_files_only)), + "image_resolution": _wrap_runtime_field(image_resolution), + "t2s_token_length": _wrap_runtime_field(int(t2s_token_length)), + "use_train_i2i_prompt": _wrap_runtime_field(bool(use_train_i2i_prompt)), + } + + if dynin_config_path: + runtime_info["dynin_config_path"] = _wrap_runtime_field(str(dynin_config_path)) + if prompting_input is not None: + runtime_info["prompting_input"] = _wrap_runtime_field(prompting_input) + if uncond_prompting_input is not None: + runtime_info["uncond_prompting_input"] = _wrap_runtime_field(uncond_prompting_input) + if attention_mask: + runtime_info["attention_mask"] = _wrap_runtime_field(list(attention_mask)) + if 
prompt_length is None and attention_mask: + prompt_length = len(attention_mask) + if prompt_length is not None: + runtime_info["prompt_length"] = _wrap_runtime_field(int(prompt_length)) + + return runtime_info diff --git a/vllm_omni/model_executor/models/dynin_omni/dynin_omni_token2audio.py b/vllm_omni/model_executor/models/dynin_omni/dynin_omni_token2audio.py new file mode 100644 index 0000000000..8b4063d079 --- /dev/null +++ b/vllm_omni/model_executor/models/dynin_omni/dynin_omni_token2audio.py @@ -0,0 +1,274 @@ +from __future__ import annotations + +import os +import tempfile +from pathlib import Path +from typing import Any + +import torch +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.sequence import IntermediateTensors + +from vllm_omni.model_executor.models.output_templates import OmniOutput + +from .dynin_omni import DyninOmniStageBase +from .dynin_omni_common import ( + DetokTarget, + _looks_like_hf_repo_id, + coerce_token_ids_1d, + normalize_runtime_info, + resolve_dynin_infer_sources, + resolve_hidden_size, + unwrap_first_value, +) + +logger = init_logger(__name__) + + +def _get_hf_token() -> str | None: + return os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_HUB_TOKEN") + + +def _ensure_remote_s2u_vendor_root( + *, + repo_id: str, + local_files_only: bool, +) -> str | None: + if local_files_only or not _looks_like_hf_repo_id(repo_id): + return None + + existing = os.environ.get("DYNIN_S2U_VENDOR_ROOT") + if existing: + existing_path = Path(existing).expanduser().resolve() + if existing_path.is_dir(): + return str(existing_path) + + try: + from huggingface_hub import snapshot_download + except Exception as e: + logger.warning("huggingface_hub unavailable; cannot fetch s2u_vendor from %s: %s", repo_id, e) + return None + + token = _get_hf_token() + last_error: Exception | None = None + revisions: list[str | None] = [None] + + for revision in revisions: + try: + snapshot_dir = snapshot_download( + repo_id=repo_id, + revision=revision, + allow_patterns=["s2u_vendor/**"], + token=token, + ) + except TypeError: + try: + snapshot_dir = snapshot_download( + repo_id=repo_id, + revision=revision, + allow_patterns=["s2u_vendor/**"], + ) + except Exception as e: + last_error = e + continue + except Exception as e: + last_error = e + continue + + vendor_root = (Path(snapshot_dir) / "s2u_vendor").resolve() + if vendor_root.is_dir(): + os.environ["DYNIN_S2U_VENDOR_ROOT"] = str(vendor_root) + logger.info("Using remote S2U vendor root: %s", vendor_root) + return str(vendor_root) + + if last_error is not None: + logger.warning("Failed to download remote s2u_vendor from %s: %s", repo_id, last_error) + return None + + +class DyninOmniToken2Audio(DyninOmniStageBase): + """Stage-3: token detokenization to speech (or pass-through).""" + + stage_name = "Dynin token2audio" + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + del prefix + super().__init__() + self.vllm_config = vllm_config + self.have_multimodal_outputs = True + self.requires_raw_input_tokens = True + self.hidden_size = resolve_hidden_size(vllm_config=vllm_config) + self._vq_audio = None + self._vq_audio_path: str | None = None + self._vq_audio_local_files_only: bool | None = None + + def forward( + self, + input_ids: torch.Tensor | None = None, + positions: torch.Tensor | None = None, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: Any, + ) -> OmniOutput: + del positions, intermediate_tensors, 
inputs_embeds + if input_ids is None: + raise ValueError("token2audio stage requires input_ids") + + runtime_info = normalize_runtime_info(kwargs.get("runtime_additional_information")) + detok_id = int(unwrap_first_value(runtime_info.get("detok_id"), 0)) + tokens = coerce_token_ids_1d(input_ids) + + if detok_id != DetokTarget.AUDIO: + return OmniOutput( + text_hidden_states=None, + multimodal_outputs={ + "token_ids": tokens, + "detok_id": torch.tensor([detok_id], dtype=torch.long, device=tokens.device), + }, + ) + + audio, sample_rate = self._decode_audio_tokens(tokens, runtime_info=runtime_info) + return OmniOutput( + text_hidden_states=None, + multimodal_outputs={ + "speech": audio, + "audio": audio, + "sr": torch.tensor([sample_rate], dtype=torch.int, device=audio.device), + "detok_id": torch.tensor([detok_id], dtype=torch.long, device=audio.device), + }, + ) + + def _decode_audio_tokens(self, tokens: torch.Tensor, runtime_info: dict[str, Any]) -> tuple[torch.Tensor, int]: + # Follow DYNIN validation path: + # token list -> "<|speech_x|>" string -> vq_model_audio.decode(...). + vq_audio = self._ensure_vq_audio(runtime_info=runtime_info, ref_device=tokens.device) + + audio_codebook_size = int(unwrap_first_value(runtime_info.get("audio_codebook_size"), 4096)) + audio_vocab_offset = unwrap_first_value( + runtime_info.get("audio_vocab_offset"), + unwrap_first_value(runtime_info.get("t2s_vocab_start"), None), + ) + + token_ids = tokens.to(torch.long) + if audio_vocab_offset is not None: + off = int(audio_vocab_offset) + token_ids = torch.where(token_ids >= off, token_ids - off, token_ids) + token_ids = token_ids[(token_ids >= 0) & (token_ids < audio_codebook_size)] + if token_ids.numel() == 0: + raise RuntimeError("Audio detokenizer got no valid audio token ids.") + + speech_unit_str = " ".join(map(str, token_ids.detach().cpu().tolist())) + speech_unit_for_decode = "".join(f"<|speech_{unit}|>" for unit in speech_unit_str.split(" ") if unit != "") + + condition = unwrap_first_value( + runtime_info.get("condition"), + unwrap_first_value(runtime_info.get("t2s_condition"), None), + ) + output_wav_file = unwrap_first_value(runtime_info.get("output_wav_file"), None) + created_tmp = False + if output_wav_file is None: + fd, tmp_wav = tempfile.mkstemp(prefix="dynin_t2s_", suffix=".wav") + os.close(fd) + output_wav_file = tmp_wav + created_tmp = True + + audio_array = vq_audio.decode(speech_unit_for_decode, condition=condition, output_wav_file=output_wav_file) + if created_tmp: + try: + os.remove(output_wav_file) + except Exception: + pass + if not isinstance(audio_array, torch.Tensor): + audio_array = torch.as_tensor(audio_array, dtype=torch.float32, device=tokens.device) + else: + audio_array = audio_array.to(device=tokens.device, dtype=torch.float32) + + if audio_array.ndim > 1: + audio_array = audio_array.reshape(-1) + audio_array = audio_array.contiguous() + + sample_rate = int( + unwrap_first_value( + runtime_info.get("sr"), + unwrap_first_value(runtime_info.get("sample_rate"), 24000), + ) + ) + try: + cfg = getattr(vq_audio, "u2s_config", None) + cfg_sr = getattr(cfg, "sampling_rate", None) + if cfg_sr is None: + cfg_sr = getattr(getattr(cfg, "data", None), "sampling_rate", None) + if cfg_sr is not None: + sample_rate = int(cfg_sr) + except Exception: + pass + return audio_array, sample_rate + + def _ensure_vq_audio(self, runtime_info: dict[str, Any], ref_device: torch.device) -> Any: + sources = resolve_dynin_infer_sources(vllm_config=self.vllm_config, runtime_info=runtime_info) + model_path 
= str(sources.vq_audio_source) + local_files_only = bool(sources.vq_audio_local_files_only) + + _ensure_remote_s2u_vendor_root( + repo_id=model_path, + local_files_only=local_files_only, + ) + + if ( + self._vq_audio is None + or self._vq_audio_path != model_path + or self._vq_audio_local_files_only != local_files_only + ): + logger.info( + "Loading DYNIN audio detokenizer from %s (local_files_only=%s)", + model_path, + local_files_only, + ) + try: + from transformers import AutoModel + except Exception as e: + raise RuntimeError( + "transformers is required to load EMOVASpeechTokenizer remote code from Hugging Face." + ) from e + + try: + self._vq_audio = AutoModel.from_pretrained( + model_path, + trust_remote_code=True, + local_files_only=local_files_only, + low_cpu_mem_usage=False, + ) + except TypeError: + try: + self._vq_audio = AutoModel.from_pretrained( + model_path, + trust_remote_code=True, + local_files_only=local_files_only, + ) + except TypeError: + self._vq_audio = AutoModel.from_pretrained( + model_path, + trust_remote_code=True, + ) + except Exception as e: + raise RuntimeError( + f"Failed to load EMOVASpeechTokenizer from Hugging Face remote code for model path '{model_path}'." + ) from e + + if not hasattr(self._vq_audio, "decode"): + raise RuntimeError( + "Loaded audio tokenizer does not expose decode(). " + "Check HF config.json auto_map/model_type and ensure trust_remote_code=True." + ) + self._vq_audio.eval() + self._vq_audio.requires_grad_(False) + self._vq_audio_path = model_path + self._vq_audio_local_files_only = local_files_only + if hasattr(self._vq_audio, "to"): + self._vq_audio = self._vq_audio.to(ref_device) + return self._vq_audio + + def embed_multimodal(self, **kwargs: Any) -> Any: + del kwargs + return None diff --git a/vllm_omni/model_executor/models/dynin_omni/dynin_omni_token2image.py b/vllm_omni/model_executor/models/dynin_omni/dynin_omni_token2image.py new file mode 100644 index 0000000000..6b5110a77e --- /dev/null +++ b/vllm_omni/model_executor/models/dynin_omni/dynin_omni_token2image.py @@ -0,0 +1,150 @@ +from __future__ import annotations + +import os +from typing import Any + +import torch +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.sequence import IntermediateTensors + +from vllm_omni.model_executor.models.output_templates import OmniOutput + +from .dynin_omni import DyninOmniStageBase +from .dynin_omni_common import ( + DetokTarget, + _to_bool, + coerce_token_ids_1d, + get_dynin_magvit_attr, + normalize_runtime_info, + resolve_dynin_infer_sources, + resolve_hidden_size, + unwrap_first_value, +) + +logger = init_logger(__name__) + + +class DyninOmniToken2Image(DyninOmniStageBase): + """Stage-2: token detokenization to image (or pass-through).""" + + stage_name = "Dynin token2image" + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + del prefix + super().__init__() + + self.vllm_config = vllm_config + self.have_multimodal_outputs = True + self.requires_raw_input_tokens = True + self.hidden_size = resolve_hidden_size(vllm_config=vllm_config) + self._vq_model = None + self._vq_model_path: str | None = None + self._vq_local_files_only: bool | None = None + + def forward( + self, + input_ids: torch.Tensor | None = None, + positions: torch.Tensor | None = None, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: Any, + ) -> OmniOutput: + del positions, intermediate_tensors, inputs_embeds + if input_ids is None: + raise 
ValueError("token2image stage requires input_ids") + runtime_info = normalize_runtime_info(kwargs.get("runtime_additional_information")) + detok_id = int(unwrap_first_value(runtime_info.get("detok_id"), 0)) + tokens = coerce_token_ids_1d(input_ids) + + if detok_id != DetokTarget.IMAGE: + return OmniOutput( + text_hidden_states=None, + multimodal_outputs={ + "token_ids": tokens, + "detok_id": torch.tensor([detok_id], dtype=torch.long, device=tokens.device), + }, + ) + + image = self._decode_image_tokens(tokens, runtime_info=runtime_info) + return OmniOutput( + text_hidden_states=None, + multimodal_outputs={ + "image": image, + "detok_id": torch.tensor([detok_id], dtype=torch.long, device=image.device), + }, + ) + + def _decode_image_tokens(self, tokens: torch.Tensor, runtime_info: dict[str, Any]) -> torch.Tensor: + # Follow DYNIN validation path: + # tokens -> clamp -> vq_model.decode_code -> (x+1)/2 -> [0,1]. + vq_model = self._ensure_vq_model(runtime_info=runtime_info, ref_device=tokens.device) + codebook_size = int(unwrap_first_value(runtime_info.get("codebook_size"), 8192)) + image_vocab_offset = unwrap_first_value(runtime_info.get("image_vocab_offset"), None) + if image_vocab_offset is None: + text_vocab_size = unwrap_first_value(runtime_info.get("text_vocab_size"), None) + num_new_special_tokens = int(unwrap_first_value(runtime_info.get("num_new_special_tokens"), 0)) + if text_vocab_size is not None: + image_vocab_offset = int(text_vocab_size) + num_new_special_tokens + + token_ids = tokens.to(torch.long) + if image_vocab_offset is not None: + off = int(image_vocab_offset) + token_ids = torch.where(token_ids >= off, token_ids - off, token_ids) + token_ids = torch.clamp(token_ids, min=0, max=max(0, codebook_size - 1)) + token_ids = token_ids.unsqueeze(0) + + decoded = vq_model.decode_code(token_ids) + decoded = torch.clamp((decoded + 1.0) / 2.0, min=0.0, max=1.0) + if decoded.ndim != 4 or decoded.shape[0] == 0: + raise RuntimeError(f"Unexpected MAGVIT decode output shape: {tuple(decoded.shape)}") + return decoded[0].contiguous() + + def _ensure_vq_model(self, runtime_info: dict[str, Any], ref_device: torch.device) -> Any: + sources = resolve_dynin_infer_sources(vllm_config=self.vllm_config, runtime_info=runtime_info) + model_path = str(sources.vq_image_source) + local_files_only = bool(sources.vq_image_local_files_only) + if self._vq_model is None or self._vq_model_path != model_path or self._vq_local_files_only != local_files_only: + disable_xet = unwrap_first_value( + runtime_info.get("hf_hub_disable_xet"), + unwrap_first_value(runtime_info.get("disable_hf_xet"), True), + ) + if _to_bool(disable_xet, default=True): + os.environ.setdefault("HF_HUB_DISABLE_XET", "1") + logger.info( + "Loading DYNIN image detokenizer from %s (local_files_only=%s)", + model_path, + local_files_only, + ) + try: + MAGVITv2 = get_dynin_magvit_attr( + "MAGVITv2", + source=model_path, + local_files_only=local_files_only, + ) + try: + self._vq_model = MAGVITv2.from_pretrained( + model_path, + local_files_only=local_files_only, + ) + except TypeError: + self._vq_model = MAGVITv2.from_pretrained(model_path) + except Exception as e: + raise RuntimeError( + "Failed to load MAGVITv2 from local DYNIN submodel implementation " + f"for model path '{model_path}'. " + "If your environment cannot access huggingface.co, set " + "additional_information.vq_model_image_path to a local MAGVITv2 directory " + "and set additional_information.vq_model_image_local_files_only=true." 
+ ) from e + self._vq_model.eval() + self._vq_model.requires_grad_(False) + self._vq_model_path = model_path + self._vq_local_files_only = local_files_only + if hasattr(self._vq_model, "to"): + self._vq_model = self._vq_model.to(ref_device) + return self._vq_model + + def embed_multimodal(self, **kwargs: Any) -> Any: + del kwargs + return None diff --git a/vllm_omni/model_executor/models/dynin_omni/dynin_omni_token2text.py b/vllm_omni/model_executor/models/dynin_omni/dynin_omni_token2text.py new file mode 100644 index 0000000000..fb5ac17029 --- /dev/null +++ b/vllm_omni/model_executor/models/dynin_omni/dynin_omni_token2text.py @@ -0,0 +1,1580 @@ +from __future__ import annotations + +import inspect +import json +from contextlib import contextmanager +from typing import Any + +import torch +import torch.nn.functional as F +from transformers import AutoTokenizer +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.sequence import IntermediateTensors + +from vllm_omni.model_executor.models.output_templates import OmniOutput + +from .dynin_omni import DyninOmniStageBase +from .dynin_omni_common import ( + DYNIN_PROMPT_SOURCE_KEY, + DYNIN_PROMPT_SOURCE_OFFLINE_PREBUILT, + DYNIN_REMOTE_SETTINGS, + DYNIN_SPECIAL_TOKENS, + TASK_TO_DETOK, + DetokTarget, + _to_bool, + build_dynin_online_runtime_info, + build_dynin_prompt_payload, + coerce_token_ids_1d, + dynin_runtime_fallback, + get_dynin_magvit_attr, + get_dynin_modeling_attr, + get_dynin_sampling_attr, + infer_dynin_online_task, + logical_dynin_task, + normalize_dynin_online_prompt_text, + normalize_runtime_info, + resolve_dynin_infer_sources, + resolve_hidden_size, + resolve_remote_attr, + unwrap_first_value, +) + +logger = init_logger(__name__) + +TASK_TO_PROMPTING_TASK = { + "t2i": "t2i_gen", + "i2i": "i2i_gen", + "ti2ti": "ti2ti_gen", + "t2s": "t2s_gen", + "t2s_mmu_like": "t2s_gen", + "t2s_fixed": "t2s_fixed_gen", + "s2s": "s2s_gen", + "v2s": "v2s_gen", + "mmu": "mmu", + "mmu_fast": "mmu", + "mmu_fastdllm_v1": "mmu", + "s2t": "s2t", + "v2t": "v2t", +} + +TASK_TO_GENERATE_FN = { + "t2i": "t2i_generate", + "i2i": "i2i_generate", + "ti2ti": "ti2ti_generate", + "t2s": "t2s_generate", + "t2s_mmu_like": "t2s_generate_mmu_like", + "t2s_fixed": "t2s_fixed_generate", + "s2s": "t2s_generate_mmu_like", + "v2s": "t2s_generate_mmu_like", + "s2t": "s2t_generate", + "mmu": "mmu_generate", + "t2t": "generate", + "mmu_fast": "mmu_generate_fast", + "mmu_fastdllm_v1": "mmu_generate_fastdllm_v1", + "v2t": "mmu_generate", +} + +TASKS_USING_UNI_PROMPTING = set(TASK_TO_PROMPTING_TASK.keys()) +PROMPT_PAYLOAD_REQUIRED_TASKS = { + "t2i", + "i2i", + "ti2ti", + "t2s", + "t2s_mmu_like", + "t2s_fixed", + "s2s", + "v2s", +} + +GENERATE_RUNTIME_KWARG_KEYS = ( + "uncond_input_ids", + "uncond_attention_mask", + "noise_schedule", + "generator", + "config", + "uni_prompting", + "resolution", + "max_new_tokens", + "steps", + "block_length", + "temperature", + "top_k", + "eot_token", + "cfg_scale", + "remasking", + "mask_id", + "attention_mask", + "timesteps", + "guidance_scale", + "noise_type", + "seq_len", + "mask_token_id", + "codebook_size", + "audio_codebook_size", + "use_cache", + "threshold", + "factor", +) + +PASSTHROUGH_GENERATE_KWARG_KEYS = ( + "attention_mask", + "uncond_input_ids", + "uncond_attention_mask", + "noise_schedule", + "uni_prompting", + "generator", + "noise_type", +) + +PROMPTING_PAYLOAD_KEYS = ( + "prompting_input", + "prompting_inputs", + "dynin_inputs", + "model_inputs", + "raw_inputs", +) + +UNCOND_PROMPTING_PAYLOAD_KEYS 
= ( + "uncond_prompting_input", + "uncond_prompting_inputs", +) + +PROMPTING_META_KEYS = ( + "uncond_prompting_input", + "uncond_prompting_inputs", + "uni_prompting", + "prompting_task", + "prompting_config", +) + +MM_INPUT_ALIASES = { + "image": ("pixel_values", "image_embeds", "img2img"), + "video": ("pixel_values_videos", "video_embeds"), + "audio": ("input_audio_features", "audio_embeds"), +} + + +class DyninOmniToken2Text(DyninOmniStageBase): + """Stage-1: DYNIN generation + text detokenization or pass-through.""" + + stage_name = "Dynin token2text" + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + del prefix + super().__init__() + + self.vllm_config = vllm_config + self.have_multimodal_outputs = True + self.requires_raw_input_tokens = True + + self._infer_sources = resolve_dynin_infer_sources(vllm_config=vllm_config) + if self._infer_sources.config_path: + logger.info( + "DYNIN token2text using inference config: %s", + self._infer_sources.config_path, + ) + + self.model = self._load_text_model( + self._infer_sources.model_source, + local_files_only=self._infer_sources.model_local_files_only, + ) + self.model.eval() + self.model.requires_grad_(False) + + self.hidden_size = resolve_hidden_size( + vllm_config=vllm_config, + model=self.model, + ) + + self.tokenizer: Any | None = None + self._tokenizer_path: str | None = None + self._uni_prompting: Any | None = None + self._uni_prompting_init_spec: tuple[Any, ...] | None = None + self._prompt_vq_model: Any | None = None + self._prompt_vq_model_path: str | None = None + self._prompt_vq_local_files_only: bool | None = None + self._cached_mm_inputs: dict[str, Any] = {} + + try: + self._set_tokenizer( + self._infer_sources.tokenizer_source, + local_files_only=self._infer_sources.model_local_files_only, + ) + except Exception: + self.tokenizer = None + self._tokenizer_path = None + + @staticmethod + def _load_text_model(model_path: str, *, local_files_only: bool = False) -> Any: + try: + dynin_model_cls = get_dynin_modeling_attr("DyninOmniModelLM") + try: + return dynin_model_cls.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + local_files_only=local_files_only, + ) + except TypeError: + return dynin_model_cls.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + ) + except Exception as e: + raise RuntimeError( + f"Failed to load DyninOmniModelLM via remote Dynin code for model path '{model_path}'." 
+ ) from e + + @staticmethod + def _load_tokenizer_from_source( + source: str, + *, + local_files_only: bool = False, + trust_remote_code: bool = False, + ) -> Any: + load_kwargs = { + "trust_remote_code": trust_remote_code, + "local_files_only": _to_bool(local_files_only, default=False), + } + try: + return AutoTokenizer.from_pretrained(source, **load_kwargs) + except TypeError: + load_kwargs.pop("local_files_only", None) + return AutoTokenizer.from_pretrained(source, **load_kwargs) + + def _set_tokenizer(self, source: str, *, local_files_only: bool) -> None: + try: + tokenizer = self._load_tokenizer_from_source( + source, + local_files_only=local_files_only, + trust_remote_code=False, + ) + except Exception as e: + logger.info( + "Falling back to trust_remote_code=True tokenizer loading for %s: %s", + source, + e, + ) + tokenizer = self._load_tokenizer_from_source( + source, + local_files_only=local_files_only, + trust_remote_code=True, + ) + + self.tokenizer = tokenizer + self._tokenizer_path = source + self._reset_uni_prompting_cache() + + def _reset_uni_prompting_cache(self) -> None: + self._uni_prompting = None + self._uni_prompting_init_spec = None + + def get_language_model(self) -> Any: + return self.model + + @staticmethod + def _merge_runtime_info_missing_values( + runtime_info: dict[str, Any], + fallback_info: dict[str, Any], + ) -> dict[str, Any]: + merged = dict(runtime_info) + for key, value in fallback_info.items(): + if unwrap_first_value(merged.get(key), None) is None: + merged[key] = value + return merged + + def _runtime_info_needs_bootstrap( + self, + runtime_info: dict[str, Any], + logical_task_name: str, + ) -> bool: + task = str(unwrap_first_value(runtime_info.get("task"), "") or "").lower() + detok_id = unwrap_first_value(runtime_info.get("detok_id"), None) + prompt_length = unwrap_first_value(runtime_info.get("prompt_length"), None) + + if not task or detok_id is None: + return True + if prompt_length is None: + return True + if ( + task in PROMPT_PAYLOAD_REQUIRED_TASKS + and self._find_first_payload( + runtime_info=runtime_info, + kwargs={}, + keys=PROMPTING_PAYLOAD_KEYS, + ) + is None + ): + return True + if logical_task_name in {"t2i", "i2i"}: + for key in ("codebook_size", "text_vocab_size", "vq_model_image_path"): + if unwrap_first_value(runtime_info.get(key), None) is None: + return True + if logical_task_name == "t2s": + for key in ("audio_codebook_size", "condition", "vq_model_audio_path"): + if unwrap_first_value(runtime_info.get(key), None) is None: + return True + return False + + def _decode_prompt_for_bootstrap( + self, + input_ids: torch.Tensor, + runtime_info: dict[str, Any], + ) -> str: + self._maybe_load_runtime_tokenizer(runtime_info) + if self.tokenizer is None: + return "" + token_ids = coerce_token_ids_1d(input_ids).detach().cpu().tolist() + try: + return str(self.tokenizer.decode(token_ids, skip_special_tokens=False)) + except Exception: + return "" + + def _bootstrap_runtime_info_if_needed( + self, + *, + input_ids: torch.Tensor, + runtime_info: dict[str, Any], + kwargs: dict[str, Any], + ) -> dict[str, Any]: + if unwrap_first_value(runtime_info.get(DYNIN_PROMPT_SOURCE_KEY), None) == DYNIN_PROMPT_SOURCE_OFFLINE_PREBUILT: + return runtime_info + + mm_inputs = self._collect_mm_inputs(**kwargs) + decoded_prompt = "" + + task_value = unwrap_first_value(runtime_info.get("task"), None) + if task_value is None: + decoded_prompt = self._decode_prompt_for_bootstrap(input_ids, runtime_info) + logical_task_name = infer_dynin_online_task( + 
decoded_prompt=decoded_prompt, + has_image="image" in mm_inputs, + has_audio="audio" in mm_inputs, + has_video="video" in mm_inputs, + ) + else: + logical_task_name = logical_dynin_task(task_value) + + if not self._runtime_info_needs_bootstrap(runtime_info, logical_task_name): + return runtime_info + + self._maybe_load_runtime_tokenizer(runtime_info) + if self.tokenizer is None: + logger.warning("Unable to bootstrap Dynin runtime info because tokenizer is unavailable.") + return runtime_info + + if not decoded_prompt: + decoded_prompt = self._decode_prompt_for_bootstrap(input_ids, runtime_info) + + text_vocab_size = int(len(self.tokenizer)) + prompt_len = int(coerce_token_ids_1d(input_ids).numel()) + dynin_config_path = self._infer_sources.config_path + + base_runtime_info = build_dynin_online_runtime_info( + task=logical_task_name, + text_vocab_size=text_vocab_size, + infer_sources=self._infer_sources, + dynin_config_path=dynin_config_path, + attention_mask=([1] * prompt_len) if logical_task_name == "t2t" else None, + prompt_length=prompt_len if logical_task_name == "t2t" else None, + ) + merged_runtime_info = self._merge_runtime_info_missing_values(runtime_info, base_runtime_info) + + payload_required = logical_task_name in {"t2i", "i2i", "t2s"} + existing_prompt_payload = self._find_first_payload( + runtime_info=merged_runtime_info, + kwargs=kwargs, + keys=PROMPTING_PAYLOAD_KEYS, + ) + has_prompt_payload = existing_prompt_payload is not None + needs_prompt_length = unwrap_first_value(merged_runtime_info.get("prompt_length"), None) is None + if not payload_required: + return merged_runtime_info + + use_train_i2i_prompt = _to_bool( + unwrap_first_value( + merged_runtime_info.get("use_train_i2i_prompt"), + dynin_runtime_fallback(logical_task_name, "use_train_i2i_prompt", logical_task_name == "i2i"), + ), + default=logical_task_name == "i2i", + ) + t2s_token_length = int( + dynin_runtime_fallback( + logical_task_name, + "t2s_token_length", + unwrap_first_value(merged_runtime_info.get("t2s_token_length"), None), + ) + or 383 + ) + image_resolution = int( + dynin_runtime_fallback( + logical_task_name, + "image_resolution", + unwrap_first_value(merged_runtime_info.get("image_resolution"), None), + ) + or 336 + ) + + image_token_count = int( + dynin_runtime_fallback( + logical_task_name, + "image_token_count", + unwrap_first_value(merged_runtime_info.get("seq_len"), None), + ) + or 0 + ) + image_tokens: torch.Tensor | None = None + if logical_task_name == "i2i" and (not has_prompt_payload or image_token_count <= 0): + image_tokens = self._encode_prompt_image_tokens( + runtime_info=merged_runtime_info, + mm_inputs=mm_inputs, + resolution=image_resolution, + ) + image_token_count = int(image_tokens.numel()) + + mask_token_id = int(unwrap_first_value(merged_runtime_info.get("mask_token_id"), 126336)) + prompting_input = self._unwrap_singleton(existing_prompt_payload) + prompting_task = str( + unwrap_first_value( + merged_runtime_info.get("prompting_task"), + TASK_TO_PROMPTING_TASK.get( + str(unwrap_first_value(merged_runtime_info.get("task"), "mmu")).lower(), + "mmu", + ), + ) + ) + if not has_prompt_payload: + prompt_text = normalize_dynin_online_prompt_text(logical_task_name, decoded_prompt) + prompting_input, prompting_task = build_dynin_prompt_payload( + task=logical_task_name, + text=prompt_text, + image_tokens=image_tokens, + image_placeholder_tokens=image_token_count, + audio_placeholder_tokens=t2s_token_length, + image_token_offset=text_vocab_size, + mask_token_id=mask_token_id, + 
use_train_i2i_prompt=use_train_i2i_prompt, + ) + + prompt_runtime_info = build_dynin_online_runtime_info( + task=logical_task_name, + text_vocab_size=text_vocab_size, + infer_sources=self._infer_sources, + dynin_config_path=dynin_config_path, + image_token_count=image_token_count, + t2s_token_length=t2s_token_length, + use_train_i2i_prompt=use_train_i2i_prompt, + ) + prompt_runtime_info["prompting_task"] = [str(prompting_task)] + prompt_runtime_info["prompting_input"] = [prompting_input] + merged_runtime_info = self._merge_runtime_info_missing_values(merged_runtime_info, prompt_runtime_info) + + if not needs_prompt_length and has_prompt_payload: + return merged_runtime_info + + uni_prompting = self._get_or_create_uni_prompting( + runtime_info=merged_runtime_info, + kwargs=kwargs, + ) + if uni_prompting is not None: + prepared_input_ids, prepared_attention_mask = self._prepare_prompting_input( + payload=prompting_input, + task=str(unwrap_first_value(merged_runtime_info.get("task"), "mmu")), + runtime_info=merged_runtime_info, + kwargs=kwargs, + uni_prompting=uni_prompting, + ref_device=input_ids.device, + ) + if prepared_input_ids is not None: + prepared_prompt_len = int(prepared_input_ids.shape[-1]) + prepared_attention_list: list[int] | None = None + if prepared_attention_mask is not None: + prepared_attention_list = prepared_attention_mask.view(-1).detach().cpu().tolist() + final_runtime_info = build_dynin_online_runtime_info( + task=logical_task_name, + text_vocab_size=text_vocab_size, + infer_sources=self._infer_sources, + dynin_config_path=dynin_config_path, + prompting_input=prompting_input, + attention_mask=prepared_attention_list, + prompt_length=prepared_prompt_len, + image_token_count=image_token_count, + t2s_token_length=t2s_token_length, + use_train_i2i_prompt=use_train_i2i_prompt, + ) + final_runtime_info["prompting_task"] = [str(prompting_task)] + + guidance_scale = float(unwrap_first_value(merged_runtime_info.get("guidance_scale"), 0.0)) + if logical_task_name in {"t2i", "i2i"} and guidance_scale > 0: + uncond_prompting_input, _ = build_dynin_prompt_payload( + task=logical_task_name, + text="", + image_tokens=image_tokens, + image_placeholder_tokens=image_token_count, + audio_placeholder_tokens=t2s_token_length, + image_token_offset=text_vocab_size, + mask_token_id=mask_token_id, + use_train_i2i_prompt=use_train_i2i_prompt, + ) + final_runtime_info["uncond_prompting_input"] = [uncond_prompting_input] + + merged_runtime_info = self._merge_runtime_info_missing_values( + merged_runtime_info, + final_runtime_info, + ) + + return merged_runtime_info + + @staticmethod + def _build_downstream_runtime_info(runtime_info: dict[str, Any]) -> dict[str, Any]: + bridge_keys = ( + "task", + "detok_id", + "dynin_config_path", + "codebook_size", + "audio_codebook_size", + "text_vocab_size", + "num_new_special_tokens", + "image_vocab_offset", + "audio_vocab_offset", + "t2s_vocab_start", + "condition", + "t2s_condition", + "vq_model_image_path", + "vq_model_image_local_files_only", + "vq_model_audio_path", + "vq_model_audio_local_files_only", + "model_local_files_only", + "local_files_only", + "hf_hub_disable_xet", + "disable_hf_xet", + ) + return {key: runtime_info[key] for key in bridge_keys if key in runtime_info} + + @staticmethod + def _jsonify_runtime_value(value: Any) -> Any: + if isinstance(value, torch.Tensor): + return value.detach().cpu().tolist() + if isinstance(value, (list, tuple)): + return [DyninOmniToken2Text._jsonify_runtime_value(item) for item in value] + if 
isinstance(value, dict): + return {str(key): DyninOmniToken2Text._jsonify_runtime_value(val) for key, val in value.items()} + if isinstance(value, (str, int, float, bool)) or value is None: + return value + return str(value) + + def _encode_runtime_info_tensor( + self, + runtime_info: dict[str, Any], + *, + device: torch.device, + ) -> torch.Tensor | None: + if not runtime_info: + return None + payload = {key: self._jsonify_runtime_value(value) for key, value in runtime_info.items()} + encoded = json.dumps( + payload, + ensure_ascii=False, + separators=(",", ":"), + sort_keys=True, + ).encode("utf-8") + if not encoded: + return None + return torch.tensor(list(encoded), dtype=torch.uint8, device=device) + + def forward( + self, + input_ids: torch.Tensor | None = None, + positions: torch.Tensor | None = None, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: Any, + ) -> OmniOutput: + del positions, intermediate_tensors, inputs_embeds + + if input_ids is None: + raise ValueError("token2text stage requires input_ids") + try: + runtime_info = normalize_runtime_info(kwargs.get("runtime_additional_information")) + runtime_info = self._bootstrap_runtime_info_if_needed( + input_ids=input_ids, + runtime_info=runtime_info, + kwargs=kwargs, + ) + task = str(unwrap_first_value(runtime_info.get("task"), "mmu")).lower() + + detok_id = int( + unwrap_first_value( + runtime_info.get("detok_id"), + TASK_TO_DETOK.get(task, DetokTarget.TEXT), + ) + ) + + token_ids = self._generate_token_ids( + task=task, + input_ids=input_ids, + runtime_info=runtime_info, + kwargs=kwargs, + ) + bridge_runtime_info = self._build_downstream_runtime_info(runtime_info) + runtime_info_tensor = self._encode_runtime_info_tensor( + bridge_runtime_info, + device=token_ids.device, + ) + + if detok_id != int(DetokTarget.TEXT): + multimodal_outputs = { + "token_ids": token_ids, + "detok_id": torch.tensor( + [detok_id], + dtype=torch.long, + device=token_ids.device, + ), + } + if runtime_info_tensor is not None: + multimodal_outputs["runtime_info_json"] = runtime_info_tensor + return OmniOutput( + text_hidden_states=None, + multimodal_outputs=multimodal_outputs, + ) + + decode_tokens = self._extract_decode_tokens(token_ids, runtime_info=runtime_info) + multimodal_outputs = { + "token_ids": token_ids, + "text_tokens": decode_tokens, + "detok_id": torch.tensor( + [detok_id], + dtype=torch.long, + device=token_ids.device, + ), + } + if runtime_info_tensor is not None: + multimodal_outputs["runtime_info_json"] = runtime_info_tensor + + return OmniOutput( + text_hidden_states=None, + multimodal_outputs=multimodal_outputs, + ) + finally: + self._cached_mm_inputs = {} + + def _generate_token_ids( + self, + task: str, + input_ids: torch.Tensor, + runtime_info: dict[str, Any], + kwargs: dict[str, Any], + ) -> torch.Tensor: + precomputed = self._get_precomputed_token_ids(runtime_info) + if precomputed is not None: + return coerce_token_ids_1d(precomputed, ref_device=input_ids.device) + + gen_fn_name = TASK_TO_GENERATE_FN.get(task, "mmu_generate") + gen_fn = self._resolve_generate_fn(gen_fn_name) + + gen_kwargs = self._collect_generate_kwargs(runtime_info=runtime_info, kwargs=kwargs) + + if "noise_schedule" not in gen_kwargs: + noise_schedule = self._resolve_noise_schedule( + runtime_info=runtime_info, + kwargs=kwargs, + ) + if noise_schedule is not None: + gen_kwargs["noise_schedule"] = noise_schedule + + if task in TASKS_USING_UNI_PROMPTING and "uni_prompting" not in gen_kwargs: + 
uni_prompting = self._get_or_create_uni_prompting( + runtime_info=runtime_info, + kwargs=kwargs, + ) + if uni_prompting is not None: + gen_kwargs["uni_prompting"] = uni_prompting + + should_prepare_prompting_inputs = task in TASKS_USING_UNI_PROMPTING or self._contains_prompting_payload( + runtime_info=runtime_info, kwargs=kwargs + ) + if should_prepare_prompting_inputs: + input_ids, gen_kwargs = self._prepare_prompting_inputs_if_needed( + task=task, + input_ids=input_ids, + runtime_info=runtime_info, + kwargs=kwargs, + gen_kwargs=gen_kwargs, + ) + + input_ids, gen_kwargs = self._normalize_generate_inputs( + input_ids=input_ids, + gen_kwargs=gen_kwargs, + ref_device=input_ids.device, + ) + gen_kwargs = self._filter_supported_generate_kwargs( + gen_fn=gen_fn, + gen_kwargs=gen_kwargs, + fn_name=gen_fn_name, + ) + + generated = self._call_generate_fn( + gen_fn=gen_fn, + input_ids=input_ids, + gen_kwargs=gen_kwargs, + ) + return coerce_token_ids_1d(generated, ref_device=input_ids.device) + + @staticmethod + def _get_precomputed_token_ids(runtime_info: dict[str, Any]) -> Any | None: + precomputed = runtime_info.get("generated_token_ids") + if precomputed is None: + precomputed = runtime_info.get("token_ids") + return precomputed + + def _resolve_generate_fn(self, fn_name: str) -> Any: + if not hasattr(self.model, fn_name): + raise RuntimeError( + f"DYNIN model does not expose '{fn_name}'. " + "Pass additional_information.generated_token_ids or adjust task mapping." + ) + return getattr(self.model, fn_name) + + @staticmethod + def _collect_generate_kwargs( + *, + runtime_info: dict[str, Any], + kwargs: dict[str, Any], + ) -> dict[str, Any]: + gen_kwargs: dict[str, Any] = {} + + for key in GENERATE_RUNTIME_KWARG_KEYS: + if key in runtime_info: + gen_kwargs[key] = unwrap_first_value(runtime_info[key]) + + for key in PASSTHROUGH_GENERATE_KWARG_KEYS: + if key not in gen_kwargs and key in kwargs: + gen_kwargs[key] = kwargs[key] + + return gen_kwargs + + @staticmethod + def _contains_prompting_payload( + runtime_info: dict[str, Any], + kwargs: dict[str, Any], + ) -> bool: + keys = PROMPTING_PAYLOAD_KEYS + PROMPTING_META_KEYS + return any(key in runtime_info for key in keys) or any(key in kwargs for key in keys) + + @staticmethod + def _filter_supported_generate_kwargs( + *, + gen_fn: Any, + gen_kwargs: dict[str, Any], + fn_name: str, + ) -> dict[str, Any]: + if not gen_kwargs: + return gen_kwargs + + try: + signature = inspect.signature(gen_fn) + except (TypeError, ValueError): + return gen_kwargs + + params = signature.parameters + accepts_var_kwargs = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values()) + if accepts_var_kwargs: + return gen_kwargs + + allowed_keys = { + name + for name, param in params.items() + if param.kind + in ( + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + ) + } + filtered = {k: v for k, v in gen_kwargs.items() if k in allowed_keys} + + removed_keys = sorted(set(gen_kwargs.keys()) - set(filtered.keys())) + if removed_keys: + logger.debug("Filtered unsupported kwargs for %s: %s", fn_name, removed_keys) + + return filtered + + @staticmethod + def _call_generate_fn( + *, + gen_fn: Any, + input_ids: torch.Tensor, + gen_kwargs: dict[str, Any], + ) -> Any: + try: + signature = inspect.signature(gen_fn) + params = signature.parameters + except (TypeError, ValueError): + params = {} + + if "idx" in params: + return gen_fn(idx=input_ids, **gen_kwargs) + if "input_ids" in params: + return gen_fn(input_ids=input_ids, **gen_kwargs) + + try: 
+ return gen_fn(input_ids, **gen_kwargs) + except TypeError: + try: + return gen_fn(idx=input_ids, **gen_kwargs) + except TypeError: + return gen_fn(input_ids=input_ids, **gen_kwargs) + + def _normalize_generate_inputs( + self, + *, + input_ids: torch.Tensor, + gen_kwargs: dict[str, Any], + ref_device: torch.device, + ) -> tuple[torch.Tensor, dict[str, Any]]: + normalized_input_ids = self._coerce_long_tensor_2d(input_ids, ref_device) + if normalized_input_ids is None: + normalized_input_ids = input_ids + + normalized_kwargs = dict(gen_kwargs) + for key in ("attention_mask", "uncond_input_ids", "uncond_attention_mask"): + if key not in normalized_kwargs: + continue + normalized_value = self._coerce_long_tensor_2d( + normalized_kwargs[key], + ref_device, + ) + if normalized_value is not None: + normalized_kwargs[key] = normalized_value + + return normalized_input_ids, normalized_kwargs + + def _get_or_create_uni_prompting( + self, + runtime_info: dict[str, Any], + kwargs: dict[str, Any], + ) -> Any | None: + runtime_uni_prompting = runtime_info.get("uni_prompting") + if runtime_uni_prompting is not None: + runtime_uni_prompting = self._unwrap_singleton(runtime_uni_prompting) + if runtime_uni_prompting is not None: + return runtime_uni_prompting + + kwargs_uni_prompting = self._unwrap_singleton(kwargs.get("uni_prompting")) + if kwargs_uni_prompting is not None: + return kwargs_uni_prompting + + self._maybe_load_runtime_tokenizer(runtime_info) + if self.tokenizer is None: + return None + + use_reserved_token = _to_bool( + unwrap_first_value( + runtime_info.get("use_reserved_token"), + unwrap_first_value(runtime_info.get("prompting_use_reserved_token"), True), + ), + default=True, + ) + + max_text_len_value = unwrap_first_value( + runtime_info.get("prompt_max_text_len"), + unwrap_first_value( + runtime_info.get("prompting_max_text_len"), + unwrap_first_value(runtime_info.get("max_text_len"), None), + ), + ) + cond_dropout_value = unwrap_first_value( + runtime_info.get("cond_dropout_prob"), + unwrap_first_value(runtime_info.get("prompting_cond_dropout_prob"), None), + ) + max_audio_len_value = unwrap_first_value( + runtime_info.get("max_audio_len"), + unwrap_first_value(runtime_info.get("t2s_token_length"), None), + ) + max_audio_len_short_value = unwrap_first_value( + runtime_info.get("max_audio_len_short"), + None, + ) + + max_text_len: int | None = None + if max_text_len_value is not None: + try: + parsed = int(max_text_len_value) + if parsed > 0: + max_text_len = parsed + except Exception: + pass + + cond_dropout_prob: float | None = None + if cond_dropout_value is not None: + try: + cond_dropout_prob = float(cond_dropout_value) + except Exception: + pass + + max_audio_len: int | None = None + if max_audio_len_value is not None: + try: + parsed = int(max_audio_len_value) + if parsed > 0: + max_audio_len = max(parsed, 512) + except Exception: + pass + + max_audio_len_short: int | None = None + if max_audio_len_short_value is not None: + try: + parsed = int(max_audio_len_short_value) + if parsed > 0: + max_audio_len_short = parsed + except Exception: + pass + elif max_audio_len is not None: + max_audio_len_short = max(256, max_audio_len // 2) + + if self._uni_prompting is not None: + if max_text_len is None and hasattr(self._uni_prompting, "max_text_len"): + try: + existing_max_text_len = int(getattr(self._uni_prompting, "max_text_len")) + if existing_max_text_len > 0: + max_text_len = existing_max_text_len - 1 + except Exception: + pass + if cond_dropout_prob is None and 
hasattr(self._uni_prompting, "cond_dropout_prob"): + try: + cond_dropout_prob = float(getattr(self._uni_prompting, "cond_dropout_prob")) + except Exception: + pass + + desired_spec = ( + id(self.tokenizer), + use_reserved_token, + max_text_len, + cond_dropout_prob, + max_audio_len, + max_audio_len_short, + ) + + if self._uni_prompting is not None and self._uni_prompting_init_spec != desired_spec: + self._reset_uni_prompting_cache() + + if self._uni_prompting is None: + try: + universal_prompting_cls = resolve_remote_attr( + "UniversalPrompting", + module_name="prompting_utils", + settings=DYNIN_REMOTE_SETTINGS, + source=self._infer_sources.model_source, + local_files_only=self._infer_sources.model_local_files_only, + fallback_module_names=("modeling_dynin_omni",), + optional=True, + ) + except Exception: + universal_prompting_cls = None + + try: + if universal_prompting_cls is None: + raise ImportError("UniversalPrompting is not available in the configured remote Dynin code.") + + init_kwargs: dict[str, Any] = { + "use_reserved_token": use_reserved_token, + "special_tokens": DYNIN_SPECIAL_TOKENS, + "ignore_id": -100, + } + if max_text_len is not None: + init_kwargs["max_text_len"] = max_text_len + if cond_dropout_prob is not None: + init_kwargs["cond_dropout_prob"] = cond_dropout_prob + if max_audio_len is not None: + init_kwargs["max_audio_len"] = max_audio_len + if max_audio_len_short is not None: + init_kwargs["max_audio_len_short"] = max_audio_len_short + + try: + self._uni_prompting = universal_prompting_cls(self.tokenizer, **init_kwargs) + except TypeError: + trimmed_audio_kwargs = dict(init_kwargs) + trimmed_audio_kwargs.pop("max_audio_len", None) + trimmed_audio_kwargs.pop("max_audio_len_short", None) + try: + self._uni_prompting = universal_prompting_cls(self.tokenizer, **trimmed_audio_kwargs) + except TypeError: + minimal_kwargs = dict(trimmed_audio_kwargs) + minimal_kwargs.pop("special_tokens", None) + minimal_kwargs.pop("ignore_id", None) + self._uni_prompting = universal_prompting_cls(self.tokenizer, **minimal_kwargs) + self._uni_prompting_init_spec = desired_spec + except Exception as e: + logger.warning("Failed to initialize UniversalPrompting: %s", e) + self._reset_uni_prompting_cache() + + return self._uni_prompting + + @staticmethod + def _unwrap_singleton(value: Any) -> Any: + if isinstance(value, list) and len(value) == 1: + return value[0] + return value + + @classmethod + def _coerce_schedule_params(cls, value: Any) -> dict[str, Any]: + value = cls._unwrap_singleton(value) + if value is None: + return {} + if isinstance(value, dict): + return {str(k): v for k, v in value.items()} + if hasattr(value, "items"): + try: + return {str(k): v for k, v in dict(value).items()} + except Exception: + return {} + if isinstance(value, str): + text = value.strip() + if not text: + return {} + try: + parsed = json.loads(text) + except Exception: + return {} + if isinstance(parsed, dict): + return {str(k): v for k, v in parsed.items()} + return {} + + def _resolve_noise_schedule( + self, + runtime_info: dict[str, Any], + kwargs: dict[str, Any], + ) -> Any | None: + runtime_noise_schedule = unwrap_first_value( + runtime_info.get("noise_schedule"), + kwargs.get("noise_schedule"), + ) + runtime_noise_schedule = self._unwrap_singleton(runtime_noise_schedule) + if callable(runtime_noise_schedule): + return runtime_noise_schedule + + schedule_name: str | None = None + if isinstance(runtime_noise_schedule, str) and runtime_noise_schedule.strip(): + schedule_name = 
runtime_noise_schedule.strip() + + if schedule_name is None: + for key in ("noise_schedule_name", "mask_schedule", "schedule"): + value = unwrap_first_value(runtime_info.get(key), None) + if value is None and key in kwargs: + value = self._unwrap_singleton(kwargs.get(key)) + if isinstance(value, str) and value.strip(): + schedule_name = value.strip() + break + + if schedule_name is None: + return None + + schedule_params = self._coerce_schedule_params( + unwrap_first_value( + runtime_info.get("noise_schedule_params"), + kwargs.get("noise_schedule_params"), + ) + ) + + try: + get_mask_schedule = get_dynin_sampling_attr("get_mask_schedule") + return get_mask_schedule(schedule_name, **schedule_params) + except Exception as e: + logger.warning( + "Failed to resolve mask schedule '%s' with params=%s: %s", + schedule_name, + schedule_params, + e, + ) + return None + + @staticmethod + def _coerce_long_tensor_2d( + value: Any, + device: torch.device, + ) -> torch.Tensor | None: + if value is None: + return None + out = value if isinstance(value, torch.Tensor) else torch.as_tensor(value) + if out.ndim == 1: + out = out.unsqueeze(0) + if out.ndim > 2: + out = out.view(out.shape[0], -1) + return out.to(device=device, dtype=torch.long).contiguous() + + @staticmethod + def _config_get(config_obj: Any, key: str) -> Any: + if config_obj is None: + return None + if isinstance(config_obj, dict): + return config_obj.get(key) + if hasattr(config_obj, "get"): + try: + return config_obj.get(key) + except Exception: + return None + return None + + @classmethod + def _is_numeric_token_structure(cls, value: Any) -> bool: + if isinstance(value, torch.Tensor): + return True + if isinstance(value, bool): + return True + if isinstance(value, int): + return True + if isinstance(value, float): + return float(value).is_integer() + if isinstance(value, (list, tuple)): + if not value: + return False + return all(cls._is_numeric_token_structure(v) for v in value) + return False + + @classmethod + def _materialize_prompting_payload(cls, value: Any, ref_device: torch.device) -> Any: + if isinstance(value, torch.Tensor): + return value.to(device=ref_device, dtype=torch.long).contiguous() + if isinstance(value, dict): + return {k: cls._materialize_prompting_payload(v, ref_device) for k, v in value.items()} + if isinstance(value, (list, tuple)): + if cls._is_numeric_token_structure(value): + try: + return torch.as_tensor(value, dtype=torch.long, device=ref_device) + except Exception: + pass + converted = [cls._materialize_prompting_payload(v, ref_device) for v in value] + return tuple(converted) if isinstance(value, tuple) else converted + return value + + @contextmanager + def _temporary_prompting_overrides(self, uni_prompting: Any, prompting_cfg: Any): + restore_values: dict[str, Any] = {} + try: + max_text_len_override = self._config_get(prompting_cfg, "max_text_len_override") + if max_text_len_override is not None and hasattr(uni_prompting, "max_text_len"): + try: + override_int = int(max_text_len_override) + if override_int > 0: + restore_values["max_text_len"] = getattr(uni_prompting, "max_text_len") + setattr(uni_prompting, "max_text_len", override_int + 1) + except Exception: + pass + yield + finally: + for attr_name, original_value in restore_values.items(): + try: + setattr(uni_prompting, attr_name, original_value) + except Exception: + pass + + def _prepare_prompting_input( + self, + *, + payload: Any, + task: str, + runtime_info: dict[str, Any], + kwargs: dict[str, Any], + uni_prompting: Any, + ref_device: 
torch.device, + ) -> tuple[torch.Tensor | None, torch.Tensor | None]: + if payload is None: + return None, None + + payload = self._unwrap_singleton(payload) + prompting_task = str( + self._unwrap_singleton( + unwrap_first_value( + runtime_info.get("prompting_task"), + TASK_TO_PROMPTING_TASK.get(task, task), + ) + ) + ) + prompting_cfg = self._unwrap_singleton( + unwrap_first_value( + runtime_info.get("prompting_config"), + kwargs.get("prompting_config"), + ) + ) + + if isinstance(payload, dict): + if payload.get("task") is not None: + prompting_task = str(payload["task"]) + if payload.get("config") is not None: + prompting_cfg = payload["config"] + payload = payload.get("input", payload.get("inputs", payload.get("data", payload))) + + payload = self._materialize_prompting_payload(payload, ref_device) + + try: + with self._temporary_prompting_overrides(uni_prompting, prompting_cfg): + prepared = uni_prompting(payload, prompting_task, config=prompting_cfg) + except Exception as e: + logger.warning( + "UniversalPrompting failed for task=%s prompting_task=%s: %s", + task, + prompting_task, + e, + ) + return None, None + + if isinstance(prepared, tuple): + prepared_input_ids = prepared[0] if len(prepared) > 0 else None + prepared_attention_mask = prepared[1] if len(prepared) > 1 else None + else: + prepared_input_ids = prepared + prepared_attention_mask = None + + return ( + self._coerce_long_tensor_2d(prepared_input_ids, ref_device), + self._coerce_long_tensor_2d(prepared_attention_mask, ref_device), + ) + + def _prepare_prompting_inputs_if_needed( + self, + *, + task: str, + input_ids: torch.Tensor, + runtime_info: dict[str, Any], + kwargs: dict[str, Any], + gen_kwargs: dict[str, Any], + ) -> tuple[torch.Tensor, dict[str, Any]]: + uni_prompting = gen_kwargs.get("uni_prompting") + if uni_prompting is None: + uni_prompting = self._get_or_create_uni_prompting( + runtime_info=runtime_info, + kwargs=kwargs, + ) + if uni_prompting is not None: + gen_kwargs["uni_prompting"] = uni_prompting + + if uni_prompting is None: + return input_ids, gen_kwargs + + payload = self._find_first_payload( + runtime_info=runtime_info, + kwargs=kwargs, + keys=PROMPTING_PAYLOAD_KEYS, + ) + + if payload is not None: + prepared_input_ids, prepared_attention_mask = self._prepare_prompting_input( + payload=payload, + task=task, + runtime_info=runtime_info, + kwargs=kwargs, + uni_prompting=uni_prompting, + ref_device=input_ids.device, + ) + if prepared_input_ids is not None: + input_ids = prepared_input_ids + if prepared_attention_mask is not None and "attention_mask" not in gen_kwargs: + gen_kwargs["attention_mask"] = prepared_attention_mask + + uncond_payload = self._find_first_payload( + runtime_info=runtime_info, + kwargs=kwargs, + keys=UNCOND_PROMPTING_PAYLOAD_KEYS, + ) + if uncond_payload is not None and "uncond_input_ids" not in gen_kwargs: + uncond_input_ids, uncond_attention_mask = self._prepare_prompting_input( + payload=uncond_payload, + task=task, + runtime_info=runtime_info, + kwargs=kwargs, + uni_prompting=uni_prompting, + ref_device=input_ids.device, + ) + if uncond_input_ids is not None: + gen_kwargs["uncond_input_ids"] = uncond_input_ids + if uncond_attention_mask is not None and "uncond_attention_mask" not in gen_kwargs: + gen_kwargs["uncond_attention_mask"] = uncond_attention_mask + + return input_ids, gen_kwargs + + @staticmethod + def _find_first_payload( + *, + runtime_info: dict[str, Any], + kwargs: dict[str, Any], + keys: tuple[str, ...], + ) -> Any | None: + for key in keys: + if key in 
runtime_info: + return runtime_info[key] + if key in kwargs: + return kwargs[key] + return None + + def _extract_decode_tokens( + self, + tokens: torch.Tensor, + runtime_info: dict[str, Any], + ) -> torch.Tensor: + prompt_len = int( + unwrap_first_value( + runtime_info.get("prompt_length"), + unwrap_first_value( + runtime_info.get("prompt_len"), + unwrap_first_value(runtime_info.get("prompt_token_len"), 0), + ), + ) + ) + + decode_tokens = tokens + if 0 < prompt_len < tokens.numel(): + decode_tokens = tokens[prompt_len:] + + text_vocab_size = unwrap_first_value(runtime_info.get("text_vocab_size"), None) + if text_vocab_size is None and self.tokenizer is not None: + text_vocab_size = len(self.tokenizer) + + if text_vocab_size is not None: + vocab_size = int(text_vocab_size) + valid = decode_tokens[(decode_tokens >= 0) & (decode_tokens < vocab_size)] + if valid.numel() > 0: + decode_tokens = valid + + return decode_tokens.contiguous() + + def _decode_text(self, tokens: torch.Tensor, runtime_info: dict[str, Any]) -> str: + self._maybe_load_runtime_tokenizer(runtime_info) + if self.tokenizer is None: + return "" + try: + return self.tokenizer.decode( + tokens.detach().cpu().tolist(), + skip_special_tokens=True, + ) + except Exception: + return "" + + def _maybe_load_runtime_tokenizer(self, runtime_info: dict[str, Any]) -> None: + tokenizer_path = unwrap_first_value(runtime_info.get("tokenizer_path"), None) + if tokenizer_path is not None: + tokenizer_path = str(tokenizer_path) + + runtime_local_files_only = unwrap_first_value( + runtime_info.get("local_files_only_model"), + unwrap_first_value( + runtime_info.get("model_local_files_only"), + unwrap_first_value( + runtime_info.get("local_files_only"), + self._infer_sources.model_local_files_only, + ), + ), + ) + local_only = _to_bool( + runtime_local_files_only, + default=self._infer_sources.model_local_files_only, + ) + + if tokenizer_path and tokenizer_path != self._tokenizer_path: + try: + logger.info("Loading DYNIN text tokenizer from %s", tokenizer_path) + self._set_tokenizer(tokenizer_path, local_files_only=local_only) + except Exception as e: + logger.warning("Failed to load tokenizer from %s: %s", tokenizer_path, e) + + def _ensure_prompt_vq_model(self, runtime_info: dict[str, Any], ref_device: torch.device) -> Any: + sources = resolve_dynin_infer_sources(vllm_config=self.vllm_config, runtime_info=runtime_info) + model_path = str(sources.vq_image_source) + local_files_only = bool(sources.vq_image_local_files_only) + if ( + self._prompt_vq_model is None + or self._prompt_vq_model_path != model_path + or self._prompt_vq_local_files_only != local_files_only + ): + logger.info( + "Loading DYNIN prompt VQ encoder from %s (local_files_only=%s)", + model_path, + local_files_only, + ) + magvit_cls = get_dynin_magvit_attr( + "MAGVITv2", + source=model_path, + local_files_only=local_files_only, + ) + try: + self._prompt_vq_model = magvit_cls.from_pretrained( + model_path, + local_files_only=local_files_only, + ) + except TypeError: + self._prompt_vq_model = magvit_cls.from_pretrained(model_path) + self._prompt_vq_model.eval() + self._prompt_vq_model.requires_grad_(False) + self._prompt_vq_model_path = model_path + self._prompt_vq_local_files_only = local_files_only + if hasattr(self._prompt_vq_model, "to"): + self._prompt_vq_model = self._prompt_vq_model.to(ref_device) + return self._prompt_vq_model + + @staticmethod + def _prepare_prompt_image_tensor( + image: Any, + *, + resolution: int, + device: torch.device, + ) -> torch.Tensor: + tensor = 
image if isinstance(image, torch.Tensor) else torch.as_tensor(image) + if tensor.ndim == 4: + tensor = tensor[0] + if tensor.ndim != 3: + raise ValueError(f"Unsupported image tensor shape for Dynin bootstrap: {tuple(tensor.shape)}") + + if tensor.shape[0] not in (1, 3, 4) and tensor.shape[-1] in (1, 3, 4): + tensor = tensor.permute(2, 0, 1) + if tensor.shape[0] == 1: + tensor = tensor.repeat(3, 1, 1) + if tensor.shape[0] == 4: + tensor = tensor[:3] + + tensor = tensor.to(device=device, dtype=torch.float32) + if tensor.numel() > 0 and tensor.max() > 1.0: + tensor = tensor / 255.0 + + tensor = tensor.unsqueeze(0) + _, _, height, width = tensor.shape + short_side = max(1, min(int(height), int(width))) + scale = float(resolution) / float(short_side) + new_height = max(1, int(round(height * scale))) + new_width = max(1, int(round(width * scale))) + tensor = F.interpolate( + tensor, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + top = max(0, (new_height - resolution) // 2) + left = max(0, (new_width - resolution) // 2) + tensor = tensor[:, :, top : top + resolution, left : left + resolution] + if tensor.shape[-2:] != (resolution, resolution): + tensor = F.interpolate( + tensor, + size=(resolution, resolution), + mode="bicubic", + align_corners=False, + ) + tensor = torch.clamp(tensor, min=0.0, max=1.0) + return ((tensor - 0.5) / 0.5).contiguous() + + def _encode_prompt_image_tokens( + self, + *, + runtime_info: dict[str, Any], + mm_inputs: dict[str, Any], + resolution: int, + ) -> torch.Tensor: + image_value = mm_inputs.get("image") + image_items = self._split_mm_items(image_value) + if not image_items: + raise ValueError("Dynin online i2i bootstrap requires an image input.") + + device = self._default_mm_device() + image_tensor = self._prepare_prompt_image_tensor( + image_items[0], + resolution=resolution, + device=device, + ) + vq_model = self._ensure_prompt_vq_model(runtime_info=runtime_info, ref_device=device) + with torch.no_grad(): + token_ids = vq_model.get_code(image_tensor) + token_ids = torch.as_tensor(token_ids, dtype=torch.long).detach().cpu() + if token_ids.ndim == 2 and token_ids.shape[0] == 1: + token_ids = token_ids[0] + return token_ids.contiguous() + + @staticmethod + def _split_mm_items(value: Any) -> list[Any]: + if value is None: + return [] + if isinstance(value, torch.Tensor): + if value.ndim == 0: + return [value] + return [value[i] for i in range(value.shape[0])] + if isinstance(value, list): + return value + if isinstance(value, tuple): + if len(value) == 2 and isinstance(value[1], (int, float)): + return [value] + return list(value) + return [value] + + def _default_mm_device(self) -> torch.device: + try: + return next(self.model.parameters()).device + except StopIteration: + return torch.device("cpu") + + @staticmethod + def _coerce_mm_item_to_float_tensor( + item: Any, + *, + device: torch.device, + ) -> torch.Tensor: + if isinstance(item, tuple) and len(item) == 2 and isinstance(item[1], (int, float)): + item = item[0] + + if isinstance(item, torch.Tensor): + tensor = item.detach().to(device=device, dtype=torch.float32) + else: + tensor = torch.as_tensor(item, dtype=torch.float32, device=device) + + return tensor.contiguous() + + def _build_deterministic_mm_embedding( + self, + item: Any, + *, + device: torch.device, + ) -> torch.Tensor: + tensor = self._coerce_mm_item_to_float_tensor(item, device=device) + if tensor.numel() == 0: + return torch.zeros((1, self.hidden_size), dtype=torch.bfloat16, device=device) + + flattened = 
tensor.view(-1) + first = flattened[0] + last = flattened[-1] + mean = flattened.mean() + std = flattened.std(unbiased=False) + abs_mean = flattened.abs().mean() + max_abs = flattened.abs().max() + l2 = torch.linalg.vector_norm(flattened) / max(float(flattened.numel()), 1.0) + + base = torch.stack([first, last, mean, std, abs_mean, max_abs, l2], dim=0) + denom = torch.clamp(base.abs().max(), min=1.0) + base = base / denom + + repeats = (self.hidden_size + base.numel() - 1) // base.numel() + embedding = base.repeat(repeats)[: self.hidden_size].to(dtype=torch.bfloat16) + return embedding.unsqueeze(0).contiguous() + + def _collect_mm_inputs(self, **kwargs: Any) -> dict[str, Any]: + mm_inputs: dict[str, Any] = {} + for modality, aliases in MM_INPUT_ALIASES.items(): + for alias in aliases: + if alias in kwargs and kwargs[alias] is not None: + mm_inputs[modality] = kwargs[alias] + break + for modality, value in self._cached_mm_inputs.items(): + if modality not in mm_inputs and value is not None: + mm_inputs[modality] = value + return mm_inputs + + def embed_multimodal(self, **kwargs: Any) -> Any: + mm_inputs = self._collect_mm_inputs(**kwargs) + self._cached_mm_inputs = dict(mm_inputs) + if not mm_inputs: + return None + + device = self._default_mm_device() + mm_embeddings: list[torch.Tensor] = [] + + for modality in ("image", "video", "audio"): + value = mm_inputs.get(modality) + if value is None: + continue + for item in self._split_mm_items(value): + mm_embeddings.append(self._build_deterministic_mm_embedding(item, device=device)) + + return tuple(mm_embeddings) if mm_embeddings else None diff --git a/vllm_omni/model_executor/models/registry.py b/vllm_omni/model_executor/models/registry.py index 1398923458..3b51f20023 100644 --- a/vllm_omni/model_executor/models/registry.py +++ b/vllm_omni/model_executor/models/registry.py @@ -157,6 +157,11 @@ "VoxtralTTSAudioGenerationForConditionalGeneration", ), "VoxtralTTSAudioTokenizer": ("voxtral_tts", "voxtral_tts_audio_tokenizer", "VoxtralTTSAudioTokenizer"), + "DyninOmniForConditionalGeneration": ( + "dynin_omni", + "dynin_omni", + "DyninOmniForConditionalGeneration", + ), } diff --git a/vllm_omni/model_executor/stage_configs/dynin_omni.yaml b/vllm_omni/model_executor/stage_configs/dynin_omni.yaml new file mode 100644 index 0000000000..0724146aa7 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/dynin_omni.yaml @@ -0,0 +1,80 @@ +stage_args: + - stage_id: 0 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: token2text + model_arch: DyninOmniForConditionalGeneration + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + engine_output_type: latent + trust_remote_code: true + gpu_memory_utilization: 0.5 + enforce_eager: true + enable_prefix_caching: false + async_scheduling: false + max_num_batched_tokens: 32768 + is_comprehension: true + final_output: true + final_output_type: text + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: token2image + model_arch: DyninOmniForConditionalGeneration + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + engine_output_type: latent + trust_remote_code: true + gpu_memory_utilization: 0.1 + enforce_eager: true + enable_prefix_caching: false + async_scheduling: false + max_num_batched_tokens: 32768 + engine_input_source: [0] + custom_process_input_func: 
vllm_omni.model_executor.stage_input_processors.dynin_omni.token2text_to_token2image + final_output: true + final_output_type: image + + - stage_id: 2 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: token2audio + model_arch: DyninOmniForConditionalGeneration + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + engine_output_type: latent + trust_remote_code: true + gpu_memory_utilization: 0.1 + enforce_eager: true + enable_prefix_caching: false + async_scheduling: false + max_num_batched_tokens: 32768 + engine_input_source: [1] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.dynin_omni.token2image_to_token2audio + final_output: true + final_output_type: audio + +# Top-level runtime config (concise): default windows and stage edges +runtime: + enabled: true + defaults: + window_size: -1 # Simplified: trigger downstream only after full upstream completion + max_inflight: 1 # Simplified: process serially within each stage + + edges: + - from: 0 + to: 1 + window_size: -1 + - from: 1 + to: 2 + window_size: -1 diff --git a/vllm_omni/model_executor/stage_configs/dynin_omni_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/dynin_omni_multiconnector.yaml new file mode 100644 index 0000000000..7259daa9ea --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/dynin_omni_multiconnector.yaml @@ -0,0 +1,114 @@ +stage_args: + - stage_id: 0 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: token2text + model_arch: DyninOmniForConditionalGeneration + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + engine_output_type: latent + trust_remote_code: false + enforce_eager: true + enable_prefix_caching: false + async_scheduling: false + max_num_batched_tokens: 32768 + output_connectors: + to_stage_1: mooncake_connector + final_output: true + final_output_type: text + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: token2image + model_arch: DyninOmniForConditionalGeneration + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + engine_output_type: latent + trust_remote_code: false + enforce_eager: true + enable_prefix_caching: false + async_scheduling: false + max_num_batched_tokens: 32768 + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.dynin_omni.token2text_to_token2image + final_output: true + final_output_type: image + input_connectors: + from_stage_0: mooncake_connector + output_connectors: + to_stage_2: mooncake_connector + + - stage_id: 2 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: token2audio + model_arch: DyninOmniForConditionalGeneration + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + engine_output_type: latent + trust_remote_code: false + enforce_eager: true + enable_prefix_caching: false + async_scheduling: false + max_num_batched_tokens: 32768 + engine_input_source: [1] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.dynin_omni.token2image_to_token2audio + final_output: true + final_output_type: audio + input_connectors: + from_stage_1: mooncake_connector + +# Top-level runtime config (concise): default windows and stage 
edges +runtime: + enabled: true + defaults: + window_size: -1 # Simplified: trigger downstream only after full upstream completion + max_inflight: 1 # Simplified: process serially within each stage + #### + # same as Qwen2.5_omni version + # Distributed connectors configuration (optional) + # More connectors will be supported in the future. + connectors: + # Mooncake connector for cross-node/intra-node communication + mooncake_connector: + name: MooncakeConnector + extra: + host: "127.0.0.1" + metadata_server: "http://10.90.67.86:8080/metadata" + master: "10.90.67.86:50051" + segment: 512000000 # 512MB + localbuf: 64000000 # 64MB + proto: "tcp" + + # Yuanrong connector for cross-node/intra-node communication + yuanrong_connector: + name: YuanrongConnector + extra: + host: "127.0.0.1" + port: "35000" + + # SharedMemory connector for intra-node communication + # Alternative SHM connector with different threshold + shared_memory_connector: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 # 64KB threshold + #### + + edges: + - from: 0 + to: 1 + window_size: -1 + - from: 1 + to: 2 + window_size: -1 diff --git a/vllm_omni/model_executor/stage_input_processors/dynin_omni.py b/vllm_omni/model_executor/stage_input_processors/dynin_omni.py new file mode 100644 index 0000000000..9ec8497998 --- /dev/null +++ b/vllm_omni/model_executor/stage_input_processors/dynin_omni.py @@ -0,0 +1,164 @@ +from __future__ import annotations + +import json +from typing import Any + +import torch +from vllm.inputs import TextPrompt + +from vllm_omni.inputs.data import OmniTokensPrompt + + +def _to_prompt_dict(prompt_item: OmniTokensPrompt | TextPrompt | str | None) -> dict[str, Any]: + if isinstance(prompt_item, dict): + return prompt_item + return {} + + +def _to_token_id_list(value: Any) -> list[int]: + if isinstance(value, torch.Tensor): + value = value.detach().to("cpu") + if value.ndim == 0: + return [int(value.item())] + if value.ndim > 1: + value = value[0] + return [int(x) for x in value.tolist()] + if isinstance(value, list): + if not value: + return [] + if isinstance(value[0], list): + return [int(x) for x in value[0]] + return [int(x) for x in value] + if value is None: + return [] + return [int(value)] + + +def _to_int(value: Any, default: int = 0) -> int: + if isinstance(value, torch.Tensor): + if value.numel() == 0: + return default + return int(value.view(-1)[0].item()) + if isinstance(value, list): + if not value: + return default + return int(value[0]) + if value is None: + return default + return int(value) + + +def _normalize_additional_info(value: Any) -> dict[str, Any]: + if not isinstance(value, dict): + return {} + normalized: dict[str, Any] = {} + for key, val in value.items(): + if isinstance(val, list): + normalized[key] = val + else: + normalized[key] = [val] + return normalized + + +def _decode_runtime_bridge_info(value: Any) -> dict[str, Any]: + if isinstance(value, torch.Tensor): + tensor = value.detach().to("cpu").reshape(-1).to(torch.uint8) + raw = bytes(tensor.tolist()) + elif isinstance(value, (bytes, bytearray)): + raw = bytes(value) + elif isinstance(value, list): + try: + raw = bytes(int(item) for item in value) + except Exception: + return {} + elif value is None: + return {} + else: + return value if isinstance(value, dict) else {} + + if not raw: + return {} + + try: + decoded = json.loads(raw.decode("utf-8")) + except Exception: + return {} + return decoded if isinstance(decoded, dict) else {} + + +def _bridge_tokens( + stage_list, + engine_input_source, + 
prompt: OmniTokensPrompt | TextPrompt = None, + requires_multimodal_data: bool = False, +): + if not engine_input_source: + raise ValueError("engine_input_source cannot be empty") + + source_stage_id = engine_input_source[0] + if source_stage_id >= len(stage_list): + raise IndexError(f"Invalid stage_id: {source_stage_id}") + + if stage_list[source_stage_id].engine_outputs is None: + raise RuntimeError(f"Stage {source_stage_id} has no outputs yet") + + source_outputs = stage_list[source_stage_id].engine_outputs + next_inputs = [] + if not isinstance(prompt, list): + prompt = [prompt] + + prompt_meta_by_reqid = {src_out.request_id: _to_prompt_dict(p) for src_out, p in zip(source_outputs, prompt)} + + for source_output in source_outputs: + output = source_output.outputs[0] + mm_out = getattr(output, "multimodal_output", None) or {} + + token_ids = _to_token_id_list(mm_out.get("token_ids")) + if not token_ids: + token_ids = _to_token_id_list(mm_out.get("text_tokens")) + if not token_ids: + token_ids = list(getattr(output, "token_ids", []) or []) + if not token_ids: + raise RuntimeError( + f"Stage {source_stage_id} output for request {source_output.request_id} has no token_ids" + ) + + detok_id = _to_int(mm_out.get("detok_id"), default=0) + src_prompt = prompt_meta_by_reqid.get(source_output.request_id, {}) + src_additional_info = src_prompt.get("additional_information", {}) or {} + runtime_bridge_info = _decode_runtime_bridge_info(mm_out.get("runtime_info_json")) + if not runtime_bridge_info: + runtime_bridge_info = mm_out.get("runtime_info", {}) or {} + + additional_information: dict[str, Any] = _normalize_additional_info(src_additional_info) + additional_information.update(_normalize_additional_info(runtime_bridge_info)) + additional_information["detok_id"] = [detok_id] + + next_inputs.append( + OmniTokensPrompt( + prompt_token_ids=token_ids, + additional_information=additional_information, + multi_modal_data=(src_prompt.get("multi_modal_data") if requires_multimodal_data else None), + mm_processor_kwargs=None, + ) + ) + + return next_inputs + + +def token2text_to_token2image( + stage_list, + engine_input_source, + prompt: OmniTokensPrompt | TextPrompt = None, + requires_multimodal_data: bool = False, +): + return _bridge_tokens(stage_list, engine_input_source, prompt, requires_multimodal_data) + + +def token2image_to_token2audio( + stage_list, + engine_input_source, + prompt: OmniTokensPrompt | TextPrompt = None, + requires_multimodal_data: bool = False, +): + return _bridge_tokens(stage_list, engine_input_source, prompt, requires_multimodal_data) From 2d980133fee0c9d6c2ec09e839366368f8f555e3 Mon Sep 17 00:00:00 2001 From: Haco <75477391+xiaohajiayou@users.noreply.github.com> Date: Thu, 9 Apr 2026 16:44:31 +0800 Subject: [PATCH 104/204] [Bugfix] Fix precedence between caller runtime args and default stage configs (#2076) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: xiaohajiayou <923390377@qq.com> Co-authored-by: 汪志鹏 Co-authored-by: SYLAR <125541396+lishunyang12@users.noreply.github.com> --- tests/entrypoints/test_stage_utils.py | 7 ++--- vllm_omni/engine/stage_init_utils.py | 23 +++++++++++++++- vllm_omni/entrypoints/stage_utils.py | 38 ++++++++++++++++++++------- vllm_omni/entrypoints/utils.py | 21 ++++++++++++--- 4 files changed, 73 insertions(+), 16 deletions(-) diff --git a/tests/entrypoints/test_stage_utils.py b/tests/entrypoints/test_stage_utils.py index 3afc6f12f5..15ee9c32a4 100644 --- 
a/tests/entrypoints/test_stage_utils.py +++ b/tests/entrypoints/test_stage_utils.py @@ -92,9 +92,10 @@ def test_set_stage_devices_handles_not_enough_devices(mocker: MockerFixture, mon mock_platform, ) - # Raise since we need 4 GPUs, but we only have 2 visible - with pytest.raises(ValueError): - set_stage_devices(stage_id=0, devices="0,1,2,3") + # Keep the logical mapping and resolve to the visible subset. + set_stage_devices(stage_id=0, devices="0,1,2,3") + + assert os.environ["CUDA_VISIBLE_DEVICES"] == "6,7" @pytest.mark.usefixtures("clean_gpu_memory_between_tests") diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index f71afad83b..e6f603d2a9 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -336,7 +336,13 @@ def acquire_device_locks( num_devices = current_omni_platform.get_device_count() physical_devices = list(range(num_devices)) - num_devices_to_lock = min(num_devices_per_stage, len(physical_devices)) + if len(physical_devices) < num_devices_per_stage: + raise RuntimeError( + f"Stage {stage_id} requires {num_devices_per_stage} device(s) based on parallel_config, " + f"but only {len(physical_devices)} device(s) are available: {physical_devices}" + ) + + num_devices_to_lock = num_devices_per_stage devices_to_lock = sorted(physical_devices[:num_devices_to_lock]) logger.debug( @@ -462,6 +468,21 @@ def initialize_diffusion_stage( model=model, **_to_dict(stage_cfg.engine_args), ) + num_devices_per_stage = od_config.parallel_config.world_size + device_control_env = current_omni_platform.device_control_env_var + visible_devices_str = os.environ.get(device_control_env) + if visible_devices_str: + physical_devices = [device.strip() for device in visible_devices_str.split(",") if device.strip()] + else: + physical_devices = list(range(current_omni_platform.get_device_count())) + + if len(physical_devices) < num_devices_per_stage: + raise ValueError( + f"Stage {metadata.stage_id} requires {num_devices_per_stage} device(s) based on parallel_config, " + f"but {len(physical_devices)} device(s) are available: {physical_devices}" + ) + + od_config.num_gpus = num_devices_per_stage if metadata.cfg_kv_collect_func is not None: od_config.cfg_kv_collect_func = metadata.cfg_kv_collect_func return StageDiffusionClient(model, od_config, metadata, batch_size=batch_size) diff --git a/vllm_omni/entrypoints/stage_utils.py b/vllm_omni/entrypoints/stage_utils.py index 8674d3c33d..7b725f469e 100644 --- a/vllm_omni/entrypoints/stage_utils.py +++ b/vllm_omni/entrypoints/stage_utils.py @@ -78,7 +78,7 @@ def _parse_device_list(devices: str | int) -> list[str]: def _map_device_list(stage_id: int, device_list: list[str], visible_device_list: list[str]) -> list[str]: - """Maps logical to physical devices if we have enough visible devices available. + """Map logical stage devices onto the currently available device pool. Args: stage_id: The stage ID currently configuring devices. @@ -87,22 +87,42 @@ def _map_device_list(stage_id: int, device_list: list[str], visible_device_list: visible_device_list: List of physical devices available. 
""" num_visible = len(visible_device_list) - num_logical = len(device_list) - if num_visible < num_logical: - raise ValueError(f"Stage {stage_id} requires {num_logical} devices, but only {num_visible} devices are visible") # Ensure that the logical IDs are actually in range to avoid index errors; - # If the check above passes and those below fail, the logical devices are wrong, - # i.e., not actually 0, 1, ..., n + # if some requested ids exceed the available pool, we will fall back to the + # subset that can be mapped and leave the final capacity check to the later + # parallel-config validation path. if not all(device.isdigit() for device in device_list): raise ValueError("Logical devices must be non-negative integers") logical_ids = [int(device) for device in device_list] - if max(logical_ids) >= num_visible: + mapped_devices = [visible_device_list[idx] for idx in logical_ids if idx < num_visible] + mapping_pairs = [ + f"{logical_id}->{visible_device_list[logical_id]}" for logical_id in logical_ids if logical_id < num_visible + ] + if not mapped_devices: raise ValueError( - f"Stage {stage_id} has logical IDs {device_list}, one or more of which exceed the number of visible devices" + f"Stage {stage_id} has logical IDs {device_list}, none of which map to the visible devices " + f"{visible_device_list}" ) - return [visible_device_list[idx] for idx in logical_ids] + if len(mapped_devices) < len(logical_ids): + logger.warning( + "Stage %s requested logical devices %s, but only %d device(s) are currently available: %s. " + "Resolved logical-to-physical mapping: %s. Falling back to mapped subset %s", + stage_id, + device_list, + num_visible, + visible_device_list, + ", ".join(mapping_pairs) if mapping_pairs else "(none)", + mapped_devices, + ) + else: + logger.info( + "Stage %s logical-to-physical device mapping: %s", + stage_id, + ", ".join(mapping_pairs), + ) + return mapped_devices def serialize_obj(obj: Any) -> bytes: diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index 0e1000ec95..c5e49a9336 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -299,11 +299,19 @@ def load_stage_configs_from_model(model: str, base_engine_args: dict | None = No stage_config_path = resolve_model_config_path(model) if stage_config_path is None: return [] - stage_configs = load_stage_configs_from_yaml(config_path=stage_config_path, base_engine_args=base_engine_args) + stage_configs = load_stage_configs_from_yaml( + config_path=stage_config_path, + base_engine_args=base_engine_args, + prefer_stage_engine_args=False, + ) return stage_configs -def load_stage_configs_from_yaml(config_path: str, base_engine_args: dict | None = None) -> list: +def load_stage_configs_from_yaml( + config_path: str, + base_engine_args: dict | None = None, + prefer_stage_engine_args: bool = True, +) -> list: """Load stage configurations from a YAML file. .. deprecated:: @@ -311,6 +319,9 @@ def load_stage_configs_from_yaml(config_path: str, base_engine_args: dict | None Args: config_path: Path to the YAML configuration file + base_engine_args: Engine args supplied by the caller. + prefer_stage_engine_args: When True, YAML stage args override caller + engine args. When False, caller engine args override YAML defaults. 
Returns: List of stage configuration dictionaries from the file's stage_args @@ -327,7 +338,11 @@ def load_stage_configs_from_yaml(config_path: str, base_engine_args: dict | None base_engine_args_tmp = base_engine_args.copy() # Update base_engine_args with stage-specific engine_args if they exist if hasattr(stage_arg, "engine_args") and stage_arg.engine_args is not None: - base_engine_args_tmp = create_config(merge_configs(base_engine_args_tmp, stage_arg.engine_args)) + if prefer_stage_engine_args: + merged_engine_args = merge_configs(base_engine_args_tmp, stage_arg.engine_args) + else: + merged_engine_args = merge_configs(stage_arg.engine_args, base_engine_args_tmp) + base_engine_args_tmp = create_config(merged_engine_args) stage_type = getattr(stage_arg, "stage_type", "llm") if hasattr(stage_arg, "runtime") and stage_arg.runtime is not None and stage_type != "diffusion": base_engine_args_tmp.async_chunk = global_async_chunk From d2aa9cf08ad6bdd44b55a32b7dcc8c4393a89dda Mon Sep 17 00:00:00 2001 From: Ziming Huang Date: Thu, 9 Apr 2026 17:19:58 +0800 Subject: [PATCH 105/204] Revert "[Fix] Fix slow hasattr in CUDAGraphWrapper.__getattr__ (#1982)" (#2639) Signed-off-by: ZeldaHuang --- tests/worker/test_cudagraph_wrapper_perf.py | 185 -------------------- vllm_omni/worker/gpu_model_runner.py | 19 +- 2 files changed, 1 insertion(+), 203 deletions(-) delete mode 100644 tests/worker/test_cudagraph_wrapper_perf.py diff --git a/tests/worker/test_cudagraph_wrapper_perf.py b/tests/worker/test_cudagraph_wrapper_perf.py deleted file mode 100644 index d73fe46c90..0000000000 --- a/tests/worker/test_cudagraph_wrapper_perf.py +++ /dev/null @@ -1,185 +0,0 @@ -"""Tests for CUDAGraphWrapper.__getattr__ performance optimization. - -This module tests that the patched CUDAGraphWrapper avoids expensive __repr__ -calls when hasattr() is used for non-existent attributes. The original vLLM -implementation includes {self.runnable} in the AttributeError message, which -triggers model tree traversal and can take ~6ms on large models. 
-""" - -import time - -import pytest -import torch -import torch.nn as nn - -from vllm_omni.worker.gpu_model_runner import CUDAGraphWrapper - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -class SlowReprModel(nn.Module): - """A mock model with artificially slow __repr__ to detect unwanted calls.""" - - def __init__(self, repr_delay_ms: float = 10.0): - super().__init__() - self.linear = nn.Linear(16, 16) - self.repr_delay_ms = repr_delay_ms - self.repr_call_count = 0 - - def forward(self, x): - return self.linear(x) - - def __repr__(self): - self.repr_call_count += 1 - # Simulate expensive repr by sleeping - time.sleep(self.repr_delay_ms / 1000.0) - return f"SlowReprModel(delay={self.repr_delay_ms}ms)" - - -class MockCUDAGraphWrapper: - """A minimal mock that mimics CUDAGraphWrapper structure for CPU testing.""" - - def __init__(self, runnable): - # Store in __dict__ directly to avoid triggering __getattr__ - object.__setattr__(self, "runnable", runnable) - - def __getattr__(self, key: str): - # This is the optimized implementation we're testing - runnable = object.__getattribute__(self, "runnable") - if hasattr(runnable, key): - return getattr(runnable, key) - # Key optimization: DO NOT include {self.runnable} in error message - # as it triggers expensive __repr__ on large models - raise AttributeError(f"Attribute {key} not exists in the runnable of cudagraph wrapper") - - -def test_hasattr_nonexistent_does_not_trigger_repr(): - """Verify that hasattr for non-existent attributes doesn't call __repr__.""" - model = SlowReprModel(repr_delay_ms=100.0) # Very slow repr - wrapper = MockCUDAGraphWrapper(model) - - # Reset counter - model.repr_call_count = 0 - - # Call hasattr for non-existent attribute multiple times - for _ in range(10): - result = hasattr(wrapper, "nonexistent_attribute_xyz") - assert result is False - - # __repr__ should never have been called - assert model.repr_call_count == 0, ( - f"__repr__ was called {model.repr_call_count} times when checking " - "for non-existent attributes. This indicates the AttributeError " - "message contains {self.runnable} which triggers expensive repr." - ) - - -def test_hasattr_nonexistent_is_fast(): - """Verify that hasattr for non-existent attributes is fast (<1ms per call).""" - model = SlowReprModel(repr_delay_ms=100.0) - wrapper = MockCUDAGraphWrapper(model) - - num_iterations = 100 - start = time.perf_counter() - for _ in range(num_iterations): - hasattr(wrapper, "nonexistent_attribute_xyz") - elapsed_ms = (time.perf_counter() - start) * 1000 - - avg_ms = elapsed_ms / num_iterations - # If __repr__ were being called, each would take ~100ms - # We expect <1ms per call with the fix - assert avg_ms < 1.0, ( - f"hasattr for non-existent attribute took {avg_ms:.2f}ms on average. " - "Expected <1ms. This suggests __repr__ is being triggered." 
- ) - - -def test_hasattr_existing_attribute_works(): - """Verify that hasattr for existing attributes returns True and works correctly.""" - model = SlowReprModel() - wrapper = MockCUDAGraphWrapper(model) - - # 'forward' exists on nn.Module - assert hasattr(wrapper, "forward") is True - - # 'linear' exists on our model - assert hasattr(wrapper, "linear") is True - - # Can actually access the attribute - linear = wrapper.linear - assert isinstance(linear, nn.Linear) - - -def test_getattr_existing_attribute_returns_value(): - """Verify that getattr for existing attributes returns the correct value.""" - model = SlowReprModel() - wrapper = MockCUDAGraphWrapper(model) - - # Access forward method - forward_method = wrapper.forward - assert callable(forward_method) - - # Access linear layer - linear = wrapper.linear - assert isinstance(linear, nn.Linear) - assert linear.in_features == 16 - assert linear.out_features == 16 - - -def test_getattr_nonexistent_raises_attribute_error(): - """Verify that getattr for non-existent attributes raises AttributeError.""" - model = SlowReprModel() - wrapper = MockCUDAGraphWrapper(model) - - with pytest.raises(AttributeError) as exc_info: - _ = wrapper.nonexistent_attribute - - # Verify error message format (should NOT contain model repr) - error_msg = str(exc_info.value) - assert "nonexistent_attribute" in error_msg - assert "cudagraph wrapper" in error_msg - # Should NOT contain the slow repr output - assert "SlowReprModel(delay=" not in error_msg - - -def test_attribute_error_message_does_not_contain_runnable_repr(): - """Explicitly verify the error message doesn't trigger runnable repr.""" - model = SlowReprModel(repr_delay_ms=100.0) - wrapper = MockCUDAGraphWrapper(model) - model.repr_call_count = 0 - - try: - _ = wrapper.nonexistent_attr - except AttributeError: - pass - - # __repr__ should not have been called during error construction - assert model.repr_call_count == 0, ( - "AttributeError message construction triggered __repr__. The error message should not include {self.runnable}." - ) - - -@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") -def test_real_cudagraph_wrapper_hasattr_performance(): - """Test the actual CUDAGraphWrapper from vllm_omni (requires CUDA).""" - from vllm.config import CUDAGraphMode - - model = SlowReprModel(repr_delay_ms=50.0).cuda() - model.repr_call_count = 0 - - # Create actual CUDAGraphWrapper - try: - wrapper = CUDAGraphWrapper(model, runtime_mode=CUDAGraphMode.NONE) - except Exception: - pytest.skip("Could not create CUDAGraphWrapper") - - # Test hasattr performance - num_iterations = 50 - start = time.perf_counter() - for _ in range(num_iterations): - hasattr(wrapper, "nonexistent_xyz") - elapsed_ms = (time.perf_counter() - start) * 1000 - - avg_ms = elapsed_ms / num_iterations - assert avg_ms < 1.0, f"Real CUDAGraphWrapper hasattr took {avg_ms:.2f}ms avg. Expected <1ms with the optimization." 
- assert model.repr_call_count == 0, f"__repr__ called {model.repr_call_count} times" diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index a7abaf7b62..35e1598435 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -1,9 +1,8 @@ -import sys from typing import TYPE_CHECKING, Any, cast import numpy as np import torch -from vllm.compilation.cuda_graph import CUDAGraphWrapper as _OriginalCUDAGraphWrapper +from vllm.compilation.cuda_graph import CUDAGraphWrapper from vllm.config import CUDAGraphMode from vllm.distributed.parallel_state import get_pp_group from vllm.forward_context import set_forward_context @@ -38,22 +37,6 @@ logger = init_logger(__name__) -class CUDAGraphWrapper(_OriginalCUDAGraphWrapper): - def __getattr__(self, key: str) -> Any: - # allow accessing the attributes of the runnable. - if hasattr(self.runnable, key): - return getattr(self.runnable, key) - raise AttributeError(f"Attribute {key} not exists in the runnable of cudagraph wrapper") - - -# Patch vLLM's CUDAGraphWrapper with our optimized version -for _module_name, _module in sys.modules.items(): - if "vllm" not in _module_name: - continue - if hasattr(_module, "CUDAGraphWrapper") and _module.CUDAGraphWrapper is _OriginalCUDAGraphWrapper: - _module.CUDAGraphWrapper = CUDAGraphWrapper - - class OmniGPUModelRunner(GPUModelRunner): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) From 956f53b2781dde13480b9082fd62fd1a42df1fc2 Mon Sep 17 00:00:00 2001 From: Samit <285365963@qq.com> Date: Thu, 9 Apr 2026 18:06:40 +0800 Subject: [PATCH 106/204] [Refactor] Use trajectory_* fields for Qwen-Image structured RL outputs (#2513) Signed-off-by: samithuang <285365963@qq.com> --- .../qwen_image_pipeline_with_logprob.py | 9 ++++---- .../test_async_omni_qwen_image_generate.py | 22 ++++++++++++++----- vllm_omni/diffusion/diffusion_engine.py | 8 +++++++ vllm_omni/outputs.py | 6 ++++- 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/tests/e2e/offline_inference/custom_pipeline/qwen_image_pipeline_with_logprob.py b/tests/e2e/offline_inference/custom_pipeline/qwen_image_pipeline_with_logprob.py index ed5b219f80..709c665556 100644 --- a/tests/e2e/offline_inference/custom_pipeline/qwen_image_pipeline_with_logprob.py +++ b/tests/e2e/offline_inference/custom_pipeline/qwen_image_pipeline_with_logprob.py @@ -6,7 +6,8 @@ This pipeline follows the structure of the user's reference implementation: - supports pre-tokenized prompt IDs via OmniCustomPrompt-style dict input - uses an SDE scheduler that can return step logprobs -- returns rich custom_output fields for testing +- returns structured trajectory_* fields (latents, timesteps, log_probs) + consistent with the BAGEL trajectory recording design """ from __future__ import annotations @@ -393,10 +394,10 @@ def forward( return DiffusionOutput( output=_maybe_to_cpu(image), + trajectory_latents=_maybe_to_cpu(all_latents), + trajectory_log_probs=_maybe_to_cpu(all_log_probs), + trajectory_timesteps=_maybe_to_cpu(all_timesteps), custom_output={ - "all_latents": _maybe_to_cpu(all_latents), - "all_log_probs": _maybe_to_cpu(all_log_probs), - "all_timesteps": _maybe_to_cpu(all_timesteps), "prompt_embeds": _maybe_to_cpu(prompt_embeds), "prompt_embeds_mask": _maybe_to_cpu(prompt_embeds_mask), "negative_prompt_embeds": _maybe_to_cpu(negative_prompt_embeds), diff --git a/tests/e2e/offline_inference/custom_pipeline/test_async_omni_qwen_image_generate.py 
b/tests/e2e/offline_inference/custom_pipeline/test_async_omni_qwen_image_generate.py index f1b4595c9d..03bd12efae 100644 --- a/tests/e2e/offline_inference/custom_pipeline/test_async_omni_qwen_image_generate.py +++ b/tests/e2e/offline_inference/custom_pipeline/test_async_omni_qwen_image_generate.py @@ -1,7 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""E2E tests for AsyncOmni Qwen-Image generation flow (no Ray, no HTTP server).""" +"""E2E tests for AsyncOmni Qwen-Image generation with trajectory_* fields. + +Validates that the custom Qwen-Image pipeline returns structured trajectory +outputs (latents, timesteps, log_probs) via OmniRequestOutput's trajectory_* +fields instead of the legacy custom_output dict. +""" from __future__ import annotations @@ -191,10 +196,17 @@ async def test_async_omni_generate_with_logprobs(): _assert_valid_image_output(output) - all_log_probs = output.custom_output.get("all_log_probs") - assert all_log_probs is not None, "all_log_probs should be present when logprobs=True" - assert hasattr(all_log_probs, "shape") - assert all_log_probs.numel() > 0 + assert output.trajectory_latents is not None, "trajectory_latents should be present" + assert hasattr(output.trajectory_latents, "shape") + assert output.trajectory_latents.numel() > 0 + + assert output.trajectory_timesteps is not None, "trajectory_timesteps should be present" + assert hasattr(output.trajectory_timesteps, "shape") + assert output.trajectory_timesteps.numel() > 0 + + assert output.trajectory_log_probs is not None, "trajectory_log_probs should be present when logprobs=True" + assert hasattr(output.trajectory_log_probs, "shape") + assert output.trajectory_log_probs.numel() > 0 @pytest.mark.core_model diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index 5b77c064f8..422ef479b0 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -196,6 +196,10 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: prompt=prompt, metrics=metrics, latents=output.trajectory_latents, + trajectory_latents=output.trajectory_latents, + trajectory_timesteps=output.trajectory_timesteps, + trajectory_log_probs=output.trajectory_log_probs, + trajectory_decoded=output.trajectory_decoded, multimodal_output={"audio": request_audio_payload}, final_output_type="audio", stage_durations=output.stage_durations, @@ -252,6 +256,10 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: prompt=prompt, metrics=metrics, latents=output.trajectory_latents, + trajectory_latents=output.trajectory_latents, + trajectory_timesteps=output.trajectory_timesteps, + trajectory_log_probs=output.trajectory_log_probs, + trajectory_decoded=output.trajectory_decoded, multimodal_output={"audio": request_audio_payload}, final_output_type="audio", stage_durations=output.stage_durations, diff --git a/vllm_omni/outputs.py b/vllm_omni/outputs.py index 4a775356ee..9a7bb67065 100644 --- a/vllm_omni/outputs.py +++ b/vllm_omni/outputs.py @@ -123,8 +123,12 @@ def from_diffusion( prompt: The prompt used metrics: Generation metrics latents: Optional latent tensors + trajectory_latents: Optional stacked trajectory latent tensors + trajectory_timesteps: Optional stacked trajectory timestep tensors + trajectory_log_probs: Optional stacked trajectory log-probability tensors + trajectory_decoded: Optional list of decoded trajectory images multimodal_output: Optional 
multimodal output dict - custom_output: Optional custom output dict (e.g. latent trajectories, prompt embeds) + custom_output: Optional custom output dict (e.g. prompt embeds) stage_durations: Optional stage durations (execution time of each stage) dict peak_memory_mb: Peak memory usage in MB From 85d63c47f90ae9b29c5b866ca486cffc369b4fdf Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Thu, 9 Apr 2026 20:51:01 +0800 Subject: [PATCH 107/204] [Bugfix] Fix Qwen-Image min-size normalization for tiny requests (#2637) Signed-off-by: David Chen <530634352@qq.com> --- .../qwen_image/test_qwen_image_size_utils.py | 26 +++++++++++++++++++ .../models/qwen_image/pipeline_qwen_image.py | 4 +++ .../qwen_image/pipeline_qwen_image_edit.py | 11 ++++---- .../pipeline_qwen_image_edit_plus.py | 11 ++++---- .../qwen_image/pipeline_qwen_image_layered.py | 11 ++++---- vllm_omni/diffusion/utils/size_utils.py | 20 ++++++++++++++ 6 files changed, 65 insertions(+), 18 deletions(-) create mode 100644 tests/diffusion/models/qwen_image/test_qwen_image_size_utils.py create mode 100644 vllm_omni/diffusion/utils/size_utils.py diff --git a/tests/diffusion/models/qwen_image/test_qwen_image_size_utils.py b/tests/diffusion/models/qwen_image/test_qwen_image_size_utils.py new file mode 100644 index 0000000000..7ba8f108a1 --- /dev/null +++ b/tests/diffusion/models/qwen_image/test_qwen_image_size_utils.py @@ -0,0 +1,26 @@ +import pytest + +from vllm_omni.diffusion.utils.size_utils import ( + normalize_min_aligned_size, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +@pytest.mark.parametrize( + ("height", "width", "expected"), + [ + (1, 1, (16, 16)), + (15, 15, (16, 16)), + (17, 17, (16, 16)), + (31, 33, (16, 32)), + (64, 80, (64, 80)), + ], +) +def test_normalize_min_aligned_size_clamps_to_minimum_aligned_shape(height, width, expected): + assert normalize_min_aligned_size(height, width, alignment=16) == expected + + +def test_normalize_min_aligned_size_rejects_invalid_alignment(): + with pytest.raises(ValueError, match="positive alignment"): + normalize_min_aligned_size(16, 16, alignment=0) diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py index 5056b5342e..9f75c84538 100644 --- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py +++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py @@ -34,6 +34,9 @@ ) from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.diffusion.utils.size_utils import ( + normalize_min_aligned_size, +) from vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs if TYPE_CHECKING: @@ -938,6 +941,7 @@ def forward( height = req.sampling_params.height or self.default_sample_size * self.vae_scale_factor width = req.sampling_params.width or self.default_sample_size * self.vae_scale_factor + height, width = normalize_min_aligned_size(height, width, self.vae_scale_factor * 2) num_inference_steps = req.sampling_params.num_inference_steps or num_inference_steps sigmas = req.sampling_params.sigmas or sigmas max_sequence_length = req.sampling_params.max_sequence_length or max_sequence_length diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py index 3d0cd2a6d4..dd77d71b1e 100644 --- 
a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py +++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py @@ -37,6 +37,9 @@ ) from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.diffusion.utils.size_utils import ( + normalize_min_aligned_size, +) from vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs from vllm_omni.inputs.data import OmniTextPrompt from vllm_omni.model_executor.model_loader.weight_utils import ( @@ -97,9 +100,7 @@ def pre_process_func( width = request.sampling_params.width or calculated_width # Ensure dimensions are multiples of vae_scale_factor * 2 - multiple_of = vae_scale_factor * 2 - height = height // multiple_of * multiple_of - width = width // multiple_of * multiple_of + height, width = normalize_min_aligned_size(height, width, vae_scale_factor * 2) # Store calculated dimensions in request prompt["additional_information"]["calculated_height"] = calculated_height @@ -661,9 +662,7 @@ def forward( height = height or calculated_height width = width or calculated_width - multiple_of = self.vae_scale_factor * 2 - width = width // multiple_of * multiple_of - height = height // multiple_of * multiple_of + height, width = normalize_min_aligned_size(height, width, self.vae_scale_factor * 2) if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels): image = self.image_processor.resize(image, calculated_height, calculated_width) diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py index cb5a36579f..6f6c9d2ba3 100644 --- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py +++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py @@ -40,6 +40,9 @@ ) from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.diffusion.utils.size_utils import ( + normalize_min_aligned_size, +) from vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs from vllm_omni.inputs.data import OmniTextPrompt from vllm_omni.model_executor.model_loader.weight_utils import ( @@ -99,9 +102,7 @@ def pre_process_func( width = request.sampling_params.width or calculated_width # Ensure dimensions are multiples of vae_scale_factor * 2 - multiple_of = vae_scale_factor * 2 - height = height // multiple_of * multiple_of - width = width // multiple_of * multiple_of + height, width = normalize_min_aligned_size(height, width, vae_scale_factor * 2) # Store calculated dimensions in request prompt["additional_information"]["calculated_height"] = calculated_height @@ -604,9 +605,7 @@ def forward( height = height or calculated_height width = width or calculated_width - multiple_of = self.vae_scale_factor * 2 - width = width // multiple_of * multiple_of - height = height // multiple_of * multiple_of + height, width = normalize_min_aligned_size(height, width, self.vae_scale_factor * 2) condition_images = [] vae_images = [] diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py index f1d28f0685..38866d89c5 100644 --- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py +++ 
b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py @@ -36,6 +36,9 @@ ) from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.diffusion.utils.size_utils import ( + normalize_min_aligned_size, +) from vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs from vllm_omni.inputs.data import OmniTextPrompt from vllm_omni.model_executor.model_loader.weight_utils import ( @@ -109,9 +112,7 @@ def pre_process_func( height = calculated_height width = calculated_width - multiple_of = vae_scale_factor * 2 - width = width // multiple_of * multiple_of - height = height // multiple_of * multiple_of + height, width = normalize_min_aligned_size(height, width, vae_scale_factor * 2) # Store calculated dimensions in request prompt["additional_information"]["calculated_height"] = calculated_height @@ -665,9 +666,7 @@ def forward( height = calculated_height width = calculated_width - multiple_of = self.vae_scale_factor * 2 - width = width // multiple_of * multiple_of - height = height // multiple_of * multiple_of + height, width = normalize_min_aligned_size(height, width, self.vae_scale_factor * 2) if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels): image = self.image_processor.resize(image, calculated_height, calculated_width) diff --git a/vllm_omni/diffusion/utils/size_utils.py b/vllm_omni/diffusion/utils/size_utils.py new file mode 100644 index 0000000000..030e542f17 --- /dev/null +++ b/vllm_omni/diffusion/utils/size_utils.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Shared size normalization helpers for diffusion pipelines.""" + + +def normalize_min_aligned_size(height: int, width: int, alignment: int) -> tuple[int, int]: + """Clamp dimensions to the minimum valid aligned size. + + This preserves floor-to-alignment behavior for normal requests while + preventing very small dimensions from collapsing to zero after alignment. 
+ """ + + alignment = int(alignment) + if alignment <= 0: + raise ValueError(f"Expected positive alignment, got {alignment}") + + normalized_height = max(alignment, (int(height) // alignment) * alignment) + normalized_width = max(alignment, (int(width) // alignment) * alignment) + return normalized_height, normalized_width From 694be6f2e1792603ec87c912644b83e1d5a9f80e Mon Sep 17 00:00:00 2001 From: Sy03 <1370724210@qq.com> Date: Fri, 10 Apr 2026 04:34:34 +0800 Subject: [PATCH 108/204] [Bugfix] Fix Fish Speech voice clone FileNotFoundError on multi-GPU (#2606) Signed-off-by: Sy03 <1370724210@qq.com> --- examples/offline_inference/fish_speech/end2end.py | 10 +--------- tests/entrypoints/openai_api/test_serving_speech.py | 4 ++-- .../models/test_fish_speech_regressions.py | 4 +--- vllm_omni/entrypoints/openai/serving_speech.py | 11 +++-------- .../models/fish_speech/fish_speech_slow_ar.py | 13 +++++++------ 5 files changed, 14 insertions(+), 28 deletions(-) diff --git a/examples/offline_inference/fish_speech/end2end.py b/examples/offline_inference/fish_speech/end2end.py index 31c24d3d5d..60830d06b7 100644 --- a/examples/offline_inference/fish_speech/end2end.py +++ b/examples/offline_inference/fish_speech/end2end.py @@ -18,7 +18,6 @@ import logging import math import os -import tempfile import time import numpy as np @@ -88,17 +87,10 @@ def build_prompt( semantic_len, ) - # The model-side structured clone prefill consumes a temporary .npy file and - # removes it after loading. Abnormal termination can still leave the file - # behind, which is acceptable for this offline example. - with tempfile.NamedTemporaryFile(prefix="fish_ref_", suffix=".npy", delete=False) as f: - np.save(f, np.asarray(ref_audio_wav, dtype=np.float32)) - ref_audio_npy_path = f.name - additional_information = { "text": normalized_text, "ref_text": normalized_ref_text, - "ref_audio_path": ref_audio_npy_path, + "ref_audio_wav": torch.from_numpy(np.asarray(ref_audio_wav, dtype=np.float32)), "ref_audio_sr": int(ref_audio_sr), "fish_structured_voice_clone": True, } diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 334264602e..57aeef8f9d 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -1861,8 +1861,8 @@ def test_build_fish_clone_prompt_normalizes_text_fields(self, fish_speech_server assert info["text"] == "<|speaker:1|>你好,欢迎回来。" assert info["ref_text"] == "<|speaker:0|>参考音频的原始文本。" assert info["fish_structured_voice_clone"] is True - assert os.path.exists(info["ref_audio_path"]) - os.remove(info["ref_audio_path"]) + assert isinstance(info["ref_audio_wav"], torch.Tensor) + assert info["ref_audio_wav"].dtype == torch.float32 fish_speech_server._estimate_fish_prompt_len.assert_called_once_with( "<|speaker:1|>你好,欢迎回来。", "<|speaker:0|>参考音频的原始文本。", diff --git a/tests/model_executor/models/test_fish_speech_regressions.py b/tests/model_executor/models/test_fish_speech_regressions.py index 1f8c3cf71e..04d1b20dff 100644 --- a/tests/model_executor/models/test_fish_speech_regressions.py +++ b/tests/model_executor/models/test_fish_speech_regressions.py @@ -80,8 +80,6 @@ def test_structured_voice_clone_prefill_adds_full_codebooks_with_decode_scale(mo model.codebook_embeddings = codebook_embed model._get_tokenizer = lambda: _FakeTokenizer({"<|audio_start|>": 10, "<|audio_end|>": 11}) - monkeypatch.setattr(slow_ar_module.np, "load", lambda path: [0.0]) - monkeypatch.setattr(slow_ar_module.os, 
"remove", lambda path: None) monkeypatch.setattr( slow_ar_module, "encode_reference_audio_codes", @@ -97,7 +95,7 @@ def test_structured_voice_clone_prefill_adds_full_codebooks_with_decode_scale(mo { "ref_text": "ref", "text": "target", - "ref_audio_path": "unused.npy", + "ref_audio_wav": torch.tensor([0.0]), "ref_audio_sr": 16000, } ) diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 5903c0cd60..494c977d77 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -6,7 +6,6 @@ import os import re import struct -import tempfile import time from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -1301,17 +1300,13 @@ def _build_fish_speech_prompt( wav_samples, sr = ref_audio_data normalized_text, normalized_ref_text = normalize_fish_voice_clone_texts(request.input, request.ref_text) ph_len = self._estimate_fish_prompt_len(normalized_text, normalized_ref_text, ref_audio_data) - with tempfile.NamedTemporaryFile(prefix="fish_ref_", suffix=".npy", delete=False) as f: - np.save(f, np.asarray(wav_samples, dtype=np.float32)) - ref_audio_path = f.name - # Structured clone metadata is consumed directly by - # FishSpeechSlowARForConditionalGeneration.preprocess(), so keep these - # values as scalars instead of the list-wrapped prompt-dict convention. + # Structured clone: scalars (not list-wrapped) because model-side + # preprocess() consumes per-request fields directly. additional_information = { "text": normalized_text, "ref_text": normalized_ref_text, - "ref_audio_path": ref_audio_path, + "ref_audio_wav": torch.from_numpy(np.asarray(wav_samples, dtype=np.float32)), "ref_audio_sr": int(sr), "fish_structured_voice_clone": True, } diff --git a/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py b/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py index 4ad2a1fa63..9333400593 100644 --- a/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py +++ b/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py @@ -14,7 +14,6 @@ import dataclasses import math -import os from collections.abc import Iterable from typing import Any @@ -518,17 +517,19 @@ def _build_structured_voice_clone_prefill_embeds(self, info_dict: dict[str, Any] tokenizer = self._get_tokenizer() ref_text = info_dict.get("ref_text") text = info_dict.get("text") - ref_audio_path = info_dict.get("ref_audio_path") ref_audio_sr = info_dict.get("ref_audio_sr") if not isinstance(ref_text, str) or not isinstance(text, str): raise ValueError("Fish Speech structured voice clone requires string text and ref_text") - if not isinstance(ref_audio_path, str) or not ref_audio_path: - raise ValueError("Fish Speech structured voice clone requires ref_audio_path") if not isinstance(ref_audio_sr, int): raise ValueError("Fish Speech structured voice clone requires integer ref_audio_sr") - ref_audio_wav = np.load(ref_audio_path) - os.remove(ref_audio_path) + ref_audio_wav_raw = info_dict.get("ref_audio_wav") + if ref_audio_wav_raw is None: + raise ValueError("Fish Speech structured voice clone requires ref_audio_wav") + if isinstance(ref_audio_wav_raw, torch.Tensor): + ref_audio_wav = ref_audio_wav_raw.cpu().numpy() + else: + ref_audio_wav = np.asarray(ref_audio_wav_raw, dtype=np.float32) ref_codes_fq = encode_reference_audio_codes( self.model_path, From 4b6d92963e6c07692a670ecaf392f32a63b51ba9 Mon Sep 17 00:00:00 2001 From: wangyu 
<53896905+yenuo26@users.noreply.github.com> Date: Fri, 10 Apr 2026 06:43:16 +0800 Subject: [PATCH 109/204] [CI][Bugfix] Update environment variables for test configurations in Buildkite YAML files to resolve HF timeout (#2628) Signed-off-by: wangyu <410167048@qq.com> --- .buildkite/test-merge.yml | 13 +++++-------- .buildkite/test-nightly-diffusion.yml | 14 +++++--------- .buildkite/test-nightly.yml | 11 +++++------ .buildkite/test-ready.yml | 17 +++++------------ 4 files changed, 20 insertions(+), 35 deletions(-) diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index f98ff17140..7355e2b4c7 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -1,3 +1,8 @@ +env: + VLLM_WORKER_MULTIPROC_METHOD: spawn + HF_HUB_DOWNLOAD_TIMEOUT: 300 + HF_HUB_ETAG_TIMEOUT: 60 + steps: - label: "Simple Unit Test" depends_on: upload-merge-pipeline @@ -169,7 +174,6 @@ steps: commands: - | timeout 15m bash -c ' - export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/engine/test_async_omni_engine_abort.py ' agents: @@ -191,7 +195,6 @@ steps: depends_on: upload-merge-pipeline commands: - export VLLM_LOGGING_LEVEL=DEBUG - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py tests/e2e/online_serving/test_qwen2_5_omni.py -m "advanced_model" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU @@ -212,7 +215,6 @@ steps: - | timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py tests/e2e/offline_inference/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model" ' @@ -235,7 +237,6 @@ steps: - | timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py tests/e2e/offline_inference/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model" ' @@ -256,7 +257,6 @@ steps: timeout_in_minutes: 30 depends_on: upload-merge-pipeline commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py tests/e2e/online_serving/test_mimo_audio.py -m "advanced_model" --run-level "advanced_model" agents: @@ -297,7 +297,6 @@ steps: timeout_in_minutes: 20 depends_on: upload-merge-pipeline commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py agents: queue: "mithril-h100-pool" @@ -340,7 +339,6 @@ steps: - | timeout 55m bash -c ' set -e - export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_TEST_CLEAN_GPU_MEMORY=1 export VLLM_IMAGE_FETCH_TIMEOUT=60 pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory" @@ -387,7 +385,6 @@ steps: - | timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" ' agents: diff --git a/.buildkite/test-nightly-diffusion.yml b/.buildkite/test-nightly-diffusion.yml index 73bf455113..742624e8b5 100644 --- a/.buildkite/test-nightly-diffusion.yml +++ b/.buildkite/test-nightly-diffusion.yml 
@@ -2,6 +2,11 @@ # buildkite-agent pipeline upload .buildkite/test-nightly-diffusion.yml # from test-nightly.yml (step key: nightly-diffusion-model-test). Top-level groups are # foldable in the Buildkite UI (Other / Wan / Qwen-Image). +env: + VLLM_WORKER_MULTIPROC_METHOD: spawn + HF_HUB_DOWNLOAD_TIMEOUT: 300 + HF_HUB_ETAG_TIMEOUT: 60 + steps: - group: ":card_index_dividers: Other Model Test" key: nightly-other-model-test-group @@ -10,7 +15,6 @@ steps: timeout_in_minutes: 120 if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not test_qwen_image" -m "advanced_model and diffusion and H100" --run-level "advanced_model" agents: queue: "mithril-h100-pool" @@ -50,7 +54,6 @@ steps: timeout_in_minutes: 60 if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU @@ -70,7 +73,6 @@ steps: timeout_in_minutes: 60 if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - pytest -s -v tests/examples/online_serving/test_text_to_image.py tests/examples/offline_inference/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model" agents: @@ -114,7 +116,6 @@ steps: timeout_in_minutes: 90 if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py -m "advanced_model" --run-level "advanced_model" agents: queue: "mithril-h100-pool" @@ -155,7 +156,6 @@ steps: timeout_in_minutes: 180 if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level advanced_model agents: queue: "mithril-h100-pool" @@ -198,7 +198,6 @@ steps: timeout_in_minutes: 120 if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_qwen_image*_expansion.py -m "advanced_model and diffusion and H100" --run-level "advanced_model" agents: queue: "mithril-h100-pool" @@ -239,7 +238,6 @@ steps: timeout_in_minutes: 60 if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1 - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json" agents: @@ -281,7 +279,6 @@ steps: timeout_in_minutes: 60 if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v 
tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1 - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv" - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json" @@ -326,7 +323,6 @@ steps: timeout_in_minutes: 180 if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results - export CACHE_DIT_VERSION=1.3.0 - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 62f6e4dceb..0d1c8eaccf 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -1,3 +1,8 @@ +env: + VLLM_WORKER_MULTIPROC_METHOD: spawn + HF_HUB_DOWNLOAD_TIMEOUT: 300 + HF_HUB_ETAG_TIMEOUT: 60 + steps: # Group: collapses under one heading in the Buildkite UI; child steps still run in parallel. - group: ":card_index_dividers: Omni Model Test" @@ -8,7 +13,6 @@ steps: depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and H100 and omni" --run-level "advanced_model" agents: queue: "mithril-h100-pool" @@ -49,7 +53,6 @@ steps: depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model" agents: @@ -71,7 +74,6 @@ steps: depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/examples/ -m "advanced_model and omni and L4" --run-level "advanced_model" agents: @@ -93,7 +95,6 @@ steps: depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/examples/ -m "advanced_model and omni and H100" --run-level "advanced_model" agents: queue: "mithril-h100-pool" @@ -135,7 +136,6 @@ steps: depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export BENCHMARK_DIR=tests/dfx/perf/results - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py @@ -193,7 +193,6 @@ steps: depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - python tools/nightly/buildkite_testcase_statistics.py -o tests/dfx/perf/results/buildkite_testcase_statistics.html - buildkite-agent artifact upload "tests/dfx/perf/results/*.html" agents: diff 
--git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 6f3ad6504e..2f1f05463a 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -1,3 +1,8 @@ +env: + VLLM_WORKER_MULTIPROC_METHOD: spawn + HF_HUB_DOWNLOAD_TIMEOUT: 300 + HF_HUB_ETAG_TIMEOUT: 60 + steps: - label: "Simple Unit Test" depends_on: upload-ready-pipeline @@ -173,7 +178,6 @@ steps: commands: - | timeout 15m bash -c ' - export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/engine/test_async_omni_engine_abort.py ' agents: @@ -197,7 +201,6 @@ steps: - | timeout 17m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/e2e/online_serving/test_qwen2_5_omni.py -m "core_model" --run-level "core_model" ' agents: @@ -218,7 +221,6 @@ steps: commands: - | timeout 20m bash -c ' - export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model" ' agents: @@ -256,7 +258,6 @@ steps: - | timeout 30m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/e2e/online_serving/test_mimo_audio.py -m "core_model" --run-level "core_model" ' agents: @@ -299,7 +300,6 @@ steps: - | timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "core_model" --run-level "core_model" ' @@ -324,7 +324,6 @@ steps: - | timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/e2e/online_serving/test_omnivoice.py -m "core_model" --run-level "core_model" ' agents: @@ -347,7 +346,6 @@ steps: - | timeout 20m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "core_model" --run-level "core_model" ' agents: @@ -384,7 +382,6 @@ steps: # commands: # - | # timeout 20m bash -c ' - # export VLLM_WORKER_MULTIPROC_METHOD=spawn # pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py # ' # agents: @@ -421,7 +418,6 @@ steps: commands: - | timeout 30m bash -c ' - export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "core_model" --run-level "core_model" ' @@ -464,7 +460,6 @@ steps: commands: - | timeout 30m bash -c ' - export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "core_model" --run-level "core_model" ' @@ -507,7 +502,6 @@ steps: commands: - | timeout 40m bash -c ' - export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_TEST_CLEAN_GPU_MEMORY=1 export VLLM_IMAGE_FETCH_TIMEOUT=60 pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "core_model" --run-level "core_model" @@ -552,7 +546,6 @@ steps: commands: - | timeout 20m bash -c ' - export VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "core_model" --run-level "core_model" ' agents: From 0c46ba57aa6b434e0c5a4cfe2669e4cbbe987351 Mon Sep 17 00:00:00 2001 From: Haco <75477391+xiaohajiayou@users.noreply.github.com> Date: Fri, 10 Apr 2026 10:24:35 +0800 Subject: [PATCH 110/204] [Bugfix] restore legacy stage config precedence (#2663) Signed-off-by: xiaohajiayou <923390377@qq.com> --- vllm_omni/entrypoints/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index c5e49a9336..84391c2ea8 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -302,7 +302,7 @@ def load_stage_configs_from_model(model: str, base_engine_args: dict | None = No stage_configs = load_stage_configs_from_yaml( config_path=stage_config_path, base_engine_args=base_engine_args, - prefer_stage_engine_args=False, + prefer_stage_engine_args=True, ) return stage_configs From 94232436bb1d76845a3d0d4abaa21a229bb4ecfd Mon Sep 17 00:00:00 2001 From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Date: Thu, 9 Apr 2026 22:34:03 -0400 Subject: [PATCH 111/204] [Feat][FishSpeech] Cache DAC-encoded ref audio for voice cloning (#2609) Signed-off-by: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> --- benchmarks/fish-speech/bench_voice_cache.py | 290 ++++++++++ benchmarks/fish-speech/fish_bench_utils.py | 501 ++++++++++++++++++ .../models/test_fish_speech_voice_cache.py | 218 ++++++++ tests/test_fish_speech_voice_cache.py | 218 ++++++++ .../entrypoints/openai/serving_speech.py | 32 +- .../models/fish_speech/fish_speech_slow_ar.py | 56 ++ 6 files changed, 1313 insertions(+), 2 deletions(-) create mode 100644 benchmarks/fish-speech/bench_voice_cache.py create mode 100644 benchmarks/fish-speech/fish_bench_utils.py create mode 100644 tests/model_executor/models/test_fish_speech_voice_cache.py create mode 100644 tests/test_fish_speech_voice_cache.py diff --git a/benchmarks/fish-speech/bench_voice_cache.py b/benchmarks/fish-speech/bench_voice_cache.py new file mode 100644 index 0000000000..8d465d6489 --- /dev/null +++ b/benchmarks/fish-speech/bench_voice_cache.py @@ -0,0 +1,290 @@ +"""Benchmark Fish Speech voice cache: inline ref_audio vs uploaded voice. + +Measures TTFP improvement from DAC-code caching when using uploaded voices. + +Setup: + 1. Start vllm-omni with Fish Speech S2 Pro (use our feat branch) + 2. Provide a reference audio file for voice cloning + +Usage: + python bench_voice_cache.py \ + --ref-audio /path/to/reference.wav \ + --ref-text "Transcript of the reference audio." 
\ + --num-prompts 20 \ + --port 8091 + +The script runs two rounds: + A) Inline ref_audio: every request sends base64 audio (no cache) + B) Uploaded voice: upload once, then use voice name (cache hits after 1st) +""" + +import argparse +import asyncio +import base64 +import json +import os +import sys +import time +from pathlib import Path + +import aiohttp + +# Allow imports from benchmarks/fish-speech/ +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +from fish_bench_utils import ( # noqa: E402 + BenchmarkResult, + RequestResult, + compute_stats, + print_benchmark_results, + send_streaming_request, +) + +SAMPLE_RATE = 44100 +SAMPLE_WIDTH = 2 + +PROMPTS = [ + "Hello, welcome to the voice synthesis benchmark test.", + "She said she would be here by noon, but nobody showed up.", + "The quick brown fox jumps over the lazy dog near the riverbank.", + "I can't believe how beautiful the sunset looks from up here.", + "Please remember to bring your identification documents tomorrow morning.", + "Have you ever wondered what it would be like to travel through time?", + "The restaurant on the corner serves the best pasta I have ever tasted.", + "After the meeting, we should discuss the quarterly results.", + "Learning a new language takes patience and genuine curiosity.", + "The train leaves at half past seven, so we need to arrive early.", + "Could you please turn down the music, I'm trying to concentrate.", + "It was a dark and stormy night when the keeper heard a knock.", +] + + +def encode_audio_to_base64(audio_path: str) -> str: + """Encode a local audio file to base64 data URL.""" + ext = audio_path.lower().rsplit(".", 1)[-1] + mime_map = {"wav": "audio/wav", "mp3": "audio/mpeg", "flac": "audio/flac"} + mime_type = mime_map.get(ext, "audio/wav") + with open(audio_path, "rb") as f: + audio_b64 = base64.b64encode(f.read()).decode("utf-8") + return f"data:{mime_type};base64,{audio_b64}" + + +async def upload_voice( + host: str, + port: int, + audio_path: str, + ref_text: str, + voice_name: str = "bench_voice", +) -> dict: + """Upload a voice via POST /v1/audio/voices.""" + url = f"http://{host}:{port}/v1/audio/voices" + data = aiohttp.FormData() + data.add_field("name", voice_name) + data.add_field("consent", "true") + if ref_text: + data.add_field("ref_text", ref_text) + data.add_field( + "audio_sample", + open(audio_path, "rb"), + filename=os.path.basename(audio_path), + content_type="audio/wav", + ) + + async with aiohttp.ClientSession() as session: + async with session.post(url, data=data) as resp: + result = await resp.json() + print(f" Upload response ({resp.status}): {json.dumps(result, indent=2)}") + return result + + +async def delete_voice(host: str, port: int, voice_name: str) -> None: + """Delete an uploaded voice.""" + url = f"http://{host}:{port}/v1/audio/voices/{voice_name}" + async with aiohttp.ClientSession() as session: + async with session.delete(url) as resp: + if resp.status == 200: + print(f" Deleted voice '{voice_name}'") + + +async def run_round( + host: str, + port: int, + num_prompts: int, + create_payload_fn, + label: str, + num_warmups: int = 2, + timeout_s: float = 120.0, +) -> BenchmarkResult: + """Run one benchmark round and return results.""" + api_url = f"http://{host}:{port}/v1/audio/speech" + connector = aiohttp.TCPConnector(limit=1, limit_per_host=1) + session = aiohttp.ClientSession( + connector=connector, + timeout=aiohttp.ClientTimeout(total=timeout_s), + ) + + try: + # Warmup. 
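+            # Note: warmups reuse create_payload_fn, so with an uploaded voice
+            # they can already populate the server-side DAC-code cache before
+            # the timed round begins.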
+ if num_warmups > 0: + print(f" [{label}] Warming up ({num_warmups} requests)...") + for i in range(num_warmups): + payload = create_payload_fn(PROMPTS[i % len(PROMPTS)]) + r = await send_streaming_request( + session, + api_url, + payload, + SAMPLE_RATE, + SAMPLE_WIDTH, + ) + status = "OK" if r.success else f"FAIL: {r.error[:80]}" + print(f" warmup {i + 1}: ttfp={r.ttfp * 1000:.0f}ms {status}") + + # Benchmark. + print(f" [{label}] Running {num_prompts} requests (concurrency=1)...") + results: list[RequestResult] = [] + start = time.perf_counter() + for i in range(num_prompts): + prompt = PROMPTS[i % len(PROMPTS)] + payload = create_payload_fn(prompt) + r = await send_streaming_request( + session, + api_url, + payload, + SAMPLE_RATE, + SAMPLE_WIDTH, + ) + results.append(r) + tag = "HIT" if i > 0 and label == "uploaded_voice" else "" + print( + f" req {i + 1:3d}: ttfp={r.ttfp * 1000:7.1f}ms " + f"e2e={r.e2e * 1000:7.1f}ms " + f"{'OK' if r.success else 'FAIL'} {tag}" + ) + wall_time = time.perf_counter() - start + finally: + await session.close() + + bench = compute_stats(results, wall_time) + bench.concurrency = 1 + bench.num_prompts = num_prompts + bench.config_name = label + return bench + + +async def main(): + parser = argparse.ArgumentParser( + description="Benchmark Fish Speech voice cache (inline vs uploaded)", + ) + parser.add_argument("--host", default="127.0.0.1") + parser.add_argument("--port", type=int, default=8091) + parser.add_argument("--ref-audio", required=True, help="Path to reference audio file") + parser.add_argument("--ref-text", required=True, help="Transcript of reference audio") + parser.add_argument("--num-prompts", type=int, default=20) + parser.add_argument("--num-warmups", type=int, default=2) + parser.add_argument("--voice-name", default="bench_voice") + args = parser.parse_args() + + if not os.path.exists(args.ref_audio): + print(f"Error: ref_audio not found: {args.ref_audio}") + sys.exit(1) + + ref_audio_b64 = encode_audio_to_base64(args.ref_audio) + print(f"Reference audio: {args.ref_audio} ({len(ref_audio_b64) // 1024}KB base64)") + + # ---- Round A: Inline ref_audio (no cache) ---- + print(f"\n{'=' * 60}") + print("Round A: INLINE ref_audio (every request sends full audio)") + print(f"{'=' * 60}") + + def make_inline_payload(prompt: str) -> dict: + return { + "input": prompt, + "voice": "default", + "stream": True, + "response_format": "pcm", + "ref_audio": ref_audio_b64, + "ref_text": args.ref_text, + "max_new_tokens": 2048, + } + + bench_inline = await run_round( + args.host, + args.port, + args.num_prompts, + make_inline_payload, + "inline_ref_audio", + num_warmups=args.num_warmups, + ) + print_benchmark_results(bench_inline) + + # ---- Upload voice ---- + print(f"\n{'=' * 60}") + print("Uploading voice for cache test...") + print(f"{'=' * 60}") + await delete_voice(args.host, args.port, args.voice_name) + await upload_voice( + args.host, + args.port, + args.ref_audio, + args.ref_text, + args.voice_name, + ) + + # ---- Round B: Uploaded voice (cache hits after 1st request) ---- + print(f"\n{'=' * 60}") + print("Round B: UPLOADED VOICE (cache hits after 1st request)") + print(f"{'=' * 60}") + + def make_uploaded_payload(prompt: str) -> dict: + return { + "input": prompt, + "voice": args.voice_name, + "stream": True, + "response_format": "pcm", + "ref_text": args.ref_text, + "max_new_tokens": 2048, + } + + bench_cached = await run_round( + args.host, + args.port, + args.num_prompts, + make_uploaded_payload, + "uploaded_voice", + 
num_warmups=args.num_warmups, + ) + print_benchmark_results(bench_cached) + + # ---- Comparison ---- + print(f"\n{'=' * 60}") + print("COMPARISON: Inline ref_audio vs Uploaded voice (cached)") + print(f"{'=' * 60}") + print(f"{'Metric':<30} {'Inline':>12} {'Cached':>12} {'Speedup':>10}") + print(f"{'-' * 64}") + + def fmt_speedup(inline_val: float, cached_val: float) -> str: + if cached_val > 0 and inline_val > 0: + ratio = inline_val / cached_val + return f"{ratio:.2f}x" + return "N/A" + + rows = [ + ("Mean TTFP (ms)", bench_inline.mean_ttfp_ms, bench_cached.mean_ttfp_ms), + ("Median TTFP (ms)", bench_inline.median_ttfp_ms, bench_cached.median_ttfp_ms), + ("P99 TTFP (ms)", bench_inline.p99_ttfp_ms, bench_cached.p99_ttfp_ms), + ("Mean E2E (ms)", bench_inline.mean_e2e_ms, bench_cached.mean_e2e_ms), + ("Median E2E (ms)", bench_inline.median_e2e_ms, bench_cached.median_e2e_ms), + ("Mean RTF", bench_inline.mean_rtf, bench_cached.mean_rtf), + ] + for label, a, b in rows: + print(f"{label:<30} {a:>12.1f} {b:>12.1f} {fmt_speedup(a, b):>10}") + + print("\nNote: Round B request #1 is a cache MISS (cold start).") + print(" Requests #2+ are cache HITs (skip DAC encoding).") + + # Cleanup. + await delete_voice(args.host, args.port, args.voice_name) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/benchmarks/fish-speech/fish_bench_utils.py b/benchmarks/fish-speech/fish_bench_utils.py new file mode 100644 index 0000000000..cc84c4037f --- /dev/null +++ b/benchmarks/fish-speech/fish_bench_utils.py @@ -0,0 +1,501 @@ +"""Shared benchmark infrastructure for Fish Speech serving benchmarks. + +Provides common dataclasses, metrics computation, streaming HTTP client, +and result formatting used by model-specific benchmark scripts. + +Model-specific scripts supply a ``create_payload_fn(prompt) -> dict`` +callback and audio parameters; everything else is handled here. 
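+
+A minimal usage sketch (assumes a compatible server is already running on
+port 8091; the payload fields mirror ``bench_voice_cache.py`` and may differ
+for other models)::
+
+    import asyncio
+    from fish_bench_utils import run_benchmark_sweep
+
+    def make_payload(prompt: str) -> dict:
+        return {"input": prompt, "voice": "default",
+                "stream": True, "response_format": "pcm"}
+
+    asyncio.run(run_benchmark_sweep(
+        host="127.0.0.1", port=8091, num_prompts=20,
+        concurrency_levels=[1, 4], create_payload_fn=make_payload,
+        sample_rate=44100, config_name="example"))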
+""" + +import asyncio +import base64 +import json +import time +from collections.abc import Callable +from dataclasses import asdict, dataclass, field +from datetime import datetime +from pathlib import Path + +import aiohttp +import numpy as np +from tqdm.asyncio import tqdm + +# --------------------------------------------------------------------------- +# Shared test prompts (varying length for realistic workload) +# --------------------------------------------------------------------------- +PROMPTS = [ + "Hello, welcome to the voice synthesis benchmark test.", + "She said she would be here by noon, but nobody showed up.", + "The quick brown fox jumps over the lazy dog near the riverbank.", + "I can't believe how beautiful the sunset looks from up here on the mountain.", + "Please remember to bring your identification documents to the appointment tomorrow morning.", + "Have you ever wondered what it would be like to travel through time and visit ancient civilizations?", + "The restaurant on the corner serves the best pasta I have ever tasted in my entire life.", + "After the meeting, we should discuss the quarterly results and plan for the next phase.", + "Learning a new language takes patience, practice, and a genuine curiosity about other cultures.", + "The train leaves at half past seven, so we need to arrive at the station before then.", + "Could you please turn down the music a little bit, I'm trying to concentrate on my work.", + "It was a dark and stormy night when the old lighthouse keeper heard a knock at the door.", +] + + +# --------------------------------------------------------------------------- +# Dataclasses +# --------------------------------------------------------------------------- +@dataclass +class RequestResult: + success: bool = False + ttfp: float = 0.0 # Time to first audio packet (seconds) + e2e: float = 0.0 # End-to-end latency (seconds) + audio_bytes: int = 0 # Total audio bytes received + audio_duration: float = 0.0 # Audio duration in seconds + rtf: float = 0.0 # Real-time factor = e2e / audio_duration + prompt: str = "" + error: str = "" + + +@dataclass +class BenchmarkResult: + config_name: str = "" + concurrency: int = 0 + num_prompts: int = 0 + completed: int = 0 + failed: int = 0 + duration_s: float = 0.0 + # TTFP stats (ms) + mean_ttfp_ms: float = 0.0 + median_ttfp_ms: float = 0.0 + std_ttfp_ms: float = 0.0 + p90_ttfp_ms: float = 0.0 + p95_ttfp_ms: float = 0.0 + p99_ttfp_ms: float = 0.0 + # E2E stats (ms) + mean_e2e_ms: float = 0.0 + median_e2e_ms: float = 0.0 + std_e2e_ms: float = 0.0 + p90_e2e_ms: float = 0.0 + p95_e2e_ms: float = 0.0 + p99_e2e_ms: float = 0.0 + # RTF stats + mean_rtf: float = 0.0 + median_rtf: float = 0.0 + std_rtf: float = 0.0 + p99_rtf: float = 0.0 + # Audio stats + mean_audio_duration_s: float = 0.0 + total_audio_duration_s: float = 0.0 + audio_throughput: float = 0.0 # audio_duration / wall_time + request_throughput: float = 0.0 # requests / second + # Per-request details + per_request: list = field(default_factory=list) + + +# --------------------------------------------------------------------------- +# Audio helpers +# --------------------------------------------------------------------------- +def pcm_bytes_to_duration( + num_bytes: int, + sample_rate: int = 24000, + sample_width: int = 2, +) -> float: + """Convert raw PCM byte count to duration in seconds.""" + return num_bytes / sample_width / sample_rate + + +def _is_sse_response(response: aiohttp.ClientResponse) -> bool: + content_type = 
(response.headers.get("Content-Type") or "").lower() + return "text/event-stream" in content_type + + +async def _read_raw_audio_stream( + response: aiohttp.ClientResponse, + *, + start_time: float, +) -> tuple[int, float]: + first_audio_at = 0.0 + total_bytes = 0 + + async for chunk in response.content.iter_any(): + if chunk and first_audio_at <= 0: + first_audio_at = time.perf_counter() - start_time + total_bytes += len(chunk) + + return total_bytes, first_audio_at + + +def _extract_sse_payload(raw_event: bytes) -> bytes | None: + data_lines: list[bytes] = [] + for raw_line in raw_event.splitlines(): + line = raw_line.rstrip(b"\r") + if line.startswith(b"data: "): + data_lines.append(line[6:]) + elif line.startswith(b"data:"): + data_lines.append(line[5:].lstrip()) + + if not data_lines: + return None + return b"\n".join(data_lines).strip() + + +async def _read_sse_audio_stream( + response: aiohttp.ClientResponse, + *, + start_time: float, +) -> tuple[int, float]: + """Decode SSE events and count raw audio bytes from base64 payloads.""" + first_audio_at = 0.0 + total_bytes = 0 + pending = b"" + + async for chunk in response.content.iter_any(): + if not chunk: + continue + pending += chunk + pending = pending.replace(b"\r\n", b"\n") + + while b"\n\n" in pending: + raw_event, pending = pending.split(b"\n\n", 1) + payload_bytes = _extract_sse_payload(raw_event) + if payload_bytes is None: + continue + if payload_bytes == b"[DONE]": + return total_bytes, first_audio_at + + try: + payload = json.loads(payload_bytes) + except json.JSONDecodeError as exc: + raise ValueError(f"Invalid SSE JSON payload: {exc}") from exc + + audio = payload.get("audio") + if not isinstance(audio, dict): + continue + + audio_b64 = audio.get("data") + if not audio_b64: + continue + + try: + audio_bytes = base64.b64decode(audio_b64) + except Exception as exc: + raise ValueError(f"Invalid base64 audio chunk: {exc}") from exc + + if audio_bytes and first_audio_at <= 0: + first_audio_at = time.perf_counter() - start_time + total_bytes += len(audio_bytes) + + return total_bytes, first_audio_at + + +# --------------------------------------------------------------------------- +# Metrics +# --------------------------------------------------------------------------- +def compute_stats( + results: list[RequestResult], + wall_time: float, +) -> BenchmarkResult: + """Compute aggregate statistics from per-request results.""" + successful = [r for r in results if r.success] + failed = [r for r in results if not r.success] + + bench = BenchmarkResult( + completed=len(successful), + failed=len(failed), + duration_s=wall_time, + ) + + if not successful: + return bench + + ttfps = [r.ttfp * 1000 for r in successful] + e2es = [r.e2e * 1000 for r in successful] + rtfs = [r.rtf for r in successful] + audio_durs = [r.audio_duration for r in successful] + + bench.mean_ttfp_ms = float(np.mean(ttfps)) + bench.median_ttfp_ms = float(np.median(ttfps)) + bench.std_ttfp_ms = float(np.std(ttfps)) + bench.p90_ttfp_ms = float(np.percentile(ttfps, 90)) + bench.p95_ttfp_ms = float(np.percentile(ttfps, 95)) + bench.p99_ttfp_ms = float(np.percentile(ttfps, 99)) + + bench.mean_e2e_ms = float(np.mean(e2es)) + bench.median_e2e_ms = float(np.median(e2es)) + bench.std_e2e_ms = float(np.std(e2es)) + bench.p90_e2e_ms = float(np.percentile(e2es, 90)) + bench.p95_e2e_ms = float(np.percentile(e2es, 95)) + bench.p99_e2e_ms = float(np.percentile(e2es, 99)) + + bench.mean_rtf = float(np.mean(rtfs)) + bench.median_rtf = float(np.median(rtfs)) + bench.std_rtf = 
float(np.std(rtfs)) + bench.p99_rtf = float(np.percentile(rtfs, 99)) + + bench.mean_audio_duration_s = float(np.mean(audio_durs)) + bench.total_audio_duration_s = float(np.sum(audio_durs)) + bench.audio_throughput = bench.total_audio_duration_s / wall_time + bench.request_throughput = len(successful) / wall_time + + bench.per_request = [ + { + "ttfp_ms": r.ttfp * 1000, + "e2e_ms": r.e2e * 1000, + "rtf": r.rtf, + "audio_duration_s": r.audio_duration, + "prompt": r.prompt, + } + for r in successful + ] + + return bench + + +# --------------------------------------------------------------------------- +# Output formatting +# --------------------------------------------------------------------------- +def print_benchmark_results(bench: BenchmarkResult) -> None: + """Print benchmark results in standardized format.""" + W = 50 + print("") + print(f"{'=' * W}") + print(f"{'Serving Benchmark Result':^{W}}") + print(f"{'=' * W}") + print(f"{'Successful requests:':<40}{bench.completed:<10}") + print(f"{'Failed requests:':<40}{bench.failed:<10}") + print(f"{'Maximum request concurrency:':<40}{bench.concurrency:<10}") + print(f"{'Benchmark duration (s):':<40}{bench.duration_s:<10.2f}") + print(f"{'Request throughput (req/s):':<40}{bench.request_throughput:<10.2f}") + print(f"{'-' * W}") + print(f"{'End-to-end Latency':^{W}}") + print(f"{'-' * W}") + print(f"{'Mean E2EL (ms):':<40}{bench.mean_e2e_ms:<10.2f}") + print(f"{'Median E2EL (ms):':<40}{bench.median_e2e_ms:<10.2f}") + print(f"{'P99 E2EL (ms):':<40}{bench.p99_e2e_ms:<10.2f}") + print(f"{'=' * W}") + print(f"{'Audio Result':^{W}}") + print(f"{'=' * W}") + print(f"{'Total audio duration generated (s):':<40}{bench.total_audio_duration_s:<10.2f}") + print(f"{'Audio throughput (audio duration/s):':<40}{bench.audio_throughput:<10.2f}") + print(f"{'-' * W}") + print(f"{'Time to First Packet':^{W}}") + print(f"{'-' * W}") + print(f"{'Mean AUDIO_TTFP (ms):':<40}{bench.mean_ttfp_ms:<10.2f}") + print(f"{'Median AUDIO_TTFP (ms):':<40}{bench.median_ttfp_ms:<10.2f}") + print(f"{'P99 AUDIO_TTFP (ms):':<40}{bench.p99_ttfp_ms:<10.2f}") + print(f"{'-' * W}") + print(f"{'Real Time Factor':^{W}}") + print(f"{'-' * W}") + print(f"{'Mean AUDIO_RTF:':<40}{bench.mean_rtf:<10.3f}") + print(f"{'Median AUDIO_RTF:':<40}{bench.median_rtf:<10.3f}") + print(f"{'P99 AUDIO_RTF:':<40}{bench.p99_rtf:<10.3f}") + print(f"{'=' * W}") + print("") + + +def save_results( + all_results: list[dict], + result_dir: str, + config_name: str, +) -> Path: + """Save benchmark results as JSON and return the file path.""" + out = Path(result_dir) + out.mkdir(parents=True, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + result_file = out / f"bench_{config_name}_{timestamp}.json" + + with open(result_file, "w") as f: + json.dump(all_results, f, indent=2) + print(f"Results saved to {result_file}") + return result_file + + +# --------------------------------------------------------------------------- +# Streaming HTTP client +# --------------------------------------------------------------------------- +async def send_streaming_request( + session: aiohttp.ClientSession, + api_url: str, + payload: dict, + sample_rate: int, + sample_width: int, + pbar: tqdm | None = None, +) -> RequestResult: + """Send a streaming TTS request and measure latency metrics.""" + result = RequestResult(prompt=payload.get("input", "")) + st = time.perf_counter() + + try: + async with session.post(api_url, json=payload) as response: + if response.status != 200: + result.error = f"HTTP 
{response.status}: {await response.text()}" + else: + if _is_sse_response(response): + total_bytes, result.ttfp = await _read_sse_audio_stream( + response, + start_time=st, + ) + else: + total_bytes, result.ttfp = await _read_raw_audio_stream( + response, + start_time=st, + ) + + result.e2e = time.perf_counter() - st + result.audio_bytes = total_bytes + result.audio_duration = pcm_bytes_to_duration(total_bytes, sample_rate, sample_width) + + if total_bytes <= 0 or result.ttfp <= 0: + result.error = "HTTP 200 but no audio bytes were received" + else: + if result.audio_duration > 0: + result.rtf = result.e2e / result.audio_duration + result.success = True + + except Exception as e: + result.error = str(e) + result.e2e = time.perf_counter() - st + + finally: + if pbar: + pbar.update(1) + return result + + +# --------------------------------------------------------------------------- +# Benchmark runner +# --------------------------------------------------------------------------- +async def run_benchmark( + host: str, + port: int, + num_prompts: int, + max_concurrency: int, + create_payload_fn: Callable[[str], dict], + sample_rate: int, + sample_width: int = 2, + num_warmups: int = 3, + request_timeout_s: float = 120.0, +) -> BenchmarkResult: + """Run a TTS streaming benchmark at a given concurrency level. + + Args: + create_payload_fn: Model-specific function that takes a prompt string + and returns the request JSON payload dict. + sample_rate: PCM sample rate for audio duration calculation. + sample_width: PCM sample width in bytes (default 2 for 16-bit). + """ + api_url = f"http://{host}:{port}/v1/audio/speech" + + connector = aiohttp.TCPConnector( + limit=max_concurrency, + limit_per_host=max_concurrency, + keepalive_timeout=60, + ) + session = aiohttp.ClientSession( + connector=connector, + timeout=aiohttp.ClientTimeout( + total=request_timeout_s, + connect=min(10.0, request_timeout_s), + sock_connect=min(10.0, request_timeout_s), + sock_read=request_timeout_s, + ), + ) + + try: + # Warmup + if num_warmups > 0: + print(f" Warming up with {num_warmups} requests...") + warmup_tasks = [ + send_streaming_request( + session, + api_url, + create_payload_fn(PROMPTS[i % len(PROMPTS)]), + sample_rate, + sample_width, + ) + for i in range(num_warmups) + ] + warmup_results = await asyncio.gather(*warmup_tasks) + warmup_ok = sum(1 for r in warmup_results if r.success) + if warmup_ok == 0: + print(" WARNING: All warmup requests failed!") + for r in warmup_results: + if r.error: + print(f" {r.error[:200]}") + print(f" Warmup done ({warmup_ok}/{num_warmups} succeeded).") + + # Build request list + request_prompts = [PROMPTS[i % len(PROMPTS)] for i in range(num_prompts)] + + # Run + print(f" Running {num_prompts} requests with concurrency={max_concurrency}...") + semaphore = asyncio.Semaphore(max_concurrency) + pbar = tqdm(total=num_prompts, desc=f" concurrency={max_concurrency}") + + async def limited_request(prompt: str) -> RequestResult: + async with semaphore: + return await send_streaming_request( + session, + api_url, + create_payload_fn(prompt), + sample_rate, + sample_width, + pbar, + ) + + start_time = time.perf_counter() + tasks = [asyncio.create_task(limited_request(p)) for p in request_prompts] + results: list[RequestResult] = await asyncio.gather(*tasks) + wall_time = time.perf_counter() - start_time + pbar.close() + + finally: + await session.close() + + # Compute stats + bench = compute_stats(results, wall_time) + bench.concurrency = max_concurrency + bench.num_prompts = num_prompts + + 
print_benchmark_results(bench) + + # Print sample errors + failed = [r for r in results if not r.success] + if failed: + for r in failed[:3]: + print(f" [ERROR] {r.error[:200]}") + + return bench + + +async def run_benchmark_sweep( + host: str, + port: int, + num_prompts: int, + concurrency_levels: list[int], + create_payload_fn: Callable[[str], dict], + sample_rate: int, + sample_width: int = 2, + num_warmups: int = 3, + request_timeout_s: float = 120.0, + config_name: str = "benchmark", + result_dir: str = "results", +) -> list[dict]: + """Run benchmarks across multiple concurrency levels and save results.""" + all_results = [] + + for concurrency in concurrency_levels: + result = await run_benchmark( + host=host, + port=port, + num_prompts=num_prompts, + max_concurrency=concurrency, + create_payload_fn=create_payload_fn, + sample_rate=sample_rate, + sample_width=sample_width, + num_warmups=num_warmups, + request_timeout_s=request_timeout_s, + ) + result.config_name = config_name + all_results.append(asdict(result)) + + save_results(all_results, result_dir, config_name) + return all_results diff --git a/tests/model_executor/models/test_fish_speech_voice_cache.py b/tests/model_executor/models/test_fish_speech_voice_cache.py new file mode 100644 index 0000000000..8fe7a4a4d1 --- /dev/null +++ b/tests/model_executor/models/test_fish_speech_voice_cache.py @@ -0,0 +1,218 @@ +"""Tests for Fish Speech DAC-code caching via VoiceEmbeddingCache. + +Covers: + - Cache miss → DAC encode → store + - Cache hit → skip DAC encode, reuse cached ref_codes_fq + - Inline ref_audio (no voice name) → no caching, full encode path + - Stale-cache protection via created_at + - Temp file cleanup on cache hit +""" + +import os +import tempfile +from unittest.mock import MagicMock, patch + +import numpy as np +import pytest +import torch + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def _make_info_dict( + *, + text: str = "Hello world", + ref_text: str = "Reference transcript", + ref_audio_sr: int = 44100, + voice_name: str | None = None, + voice_created_at: float | None = None, + ref_audio_path: str | None = None, +) -> dict: + """Build a minimal info_dict for _build_structured_voice_clone_prefill_embeds.""" + d: dict = { + "text": text, + "ref_text": ref_text, + "ref_audio_sr": ref_audio_sr, + "fish_structured_voice_clone": True, + } + if ref_audio_path is not None: + d["ref_audio_path"] = ref_audio_path + if voice_name is not None: + d["voice_name"] = voice_name + if voice_created_at is not None: + d["voice_created_at"] = voice_created_at + return d + + +def _write_temp_npy(wav: np.ndarray | None = None) -> str: + """Write a temporary .npy file with dummy audio and return its path.""" + if wav is None: + wav = np.random.randn(44100).astype(np.float32) # 1 second @ 44.1kHz + with tempfile.NamedTemporaryFile(prefix="fish_test_", suffix=".npy", delete=False) as f: + np.save(f, wav) + return f.name + + +# Fake ref_codes_fq: [frames, codebooks] +_FAKE_REF_CODES = torch.randint(0, 1024, (10, 10), dtype=torch.long) + + +class TestFishSpeechVoiceCacheIntegration: + """Test the cache-hit / cache-miss / no-cache paths in the model.""" + + @pytest.fixture + def mock_model(self): + """Create a mock FishSpeechSlowARForConditionalGeneration with cache.""" + from vllm_omni.utils.voice_cache import VoiceEmbeddingCache + + model = MagicMock() + model._voice_cache = VoiceEmbeddingCache(max_entries=4) + model._semantic_begin_id = 151678 + model._num_codebooks = 10 + model._codebook_size = 4096 + model.model_path = 
"/fake/model" + model.codebook_embeddings = MagicMock() + model.codebook_embeddings.weight = MagicMock() + model.codebook_embeddings.weight.device = torch.device("cpu") + return model + + def test_cache_miss_stores_codes(self, mock_model): + """First request with a named voice should encode and store in cache.""" + cache = mock_model._voice_cache + voice_name = "alice" + created_at = 1712345678.0 + + # Verify cache starts empty. + key = cache.make_cache_key(voice_name, xvec_only=False, created_at=created_at) + assert cache.get(key) is None + + # Simulate a cache store (what the model does on miss). + cache.put(key, {"ref_codes_fq": _FAKE_REF_CODES.detach().cpu()}) + + # Verify it's now cached. + cached = cache.get(key) + assert cached is not None + assert torch.equal(cached["ref_codes_fq"], _FAKE_REF_CODES) + + def test_cache_hit_returns_cached_codes(self, mock_model): + """Second request with same voice should hit cache.""" + cache = mock_model._voice_cache + voice_name = "alice" + created_at = 1712345678.0 + + key = cache.make_cache_key(voice_name, xvec_only=False, created_at=created_at) + cache.put(key, {"ref_codes_fq": _FAKE_REF_CODES.detach().cpu()}) + + # Hit. + cached = cache.get(key) + assert cached is not None + ref_codes = cached["ref_codes_fq"].to(device=torch.device("cpu"), dtype=torch.long) + assert torch.equal(ref_codes, _FAKE_REF_CODES) + assert cache.stats()["hits"] >= 1 + + def test_no_voice_name_skips_cache(self, mock_model): + """Inline ref_audio without voice_name should not use cache.""" + cache = mock_model._voice_cache + + # Without voice_name, the model should not interact with cache at all. + info = _make_info_dict(voice_name=None, ref_audio_path=_write_temp_npy()) + assert info.get("voice_name") is None + # Cache should remain untouched. + assert cache.stats()["hits"] == 0 + assert cache.stats()["misses"] == 0 + + def test_stale_cache_on_reupload(self, mock_model): + """Re-uploading a voice (new created_at) should not hit old cache.""" + cache = mock_model._voice_cache + voice_name = "alice" + + key_old = cache.make_cache_key(voice_name, xvec_only=False, created_at=1000.0) + cache.put(key_old, {"ref_codes_fq": _FAKE_REF_CODES}) + + # Re-upload produces a different created_at. + key_new = cache.make_cache_key(voice_name, xvec_only=False, created_at=2000.0) + assert cache.get(key_new) is None # miss + assert cache.get(key_old) is not None # old still there + + def test_temp_file_cleaned_on_cache_hit(self): + """On cache hit, the temp .npy file written by the entrypoint should be deleted.""" + tmp_path = _write_temp_npy() + assert os.path.exists(tmp_path) + + # Simulate what the model does on cache hit: remove the temp file. + try: + os.remove(tmp_path) + except OSError: + pass + assert not os.path.exists(tmp_path) + + def test_created_at_zero_disables_cache(self, mock_model): + """created_at=0 should not create a cache key (caching disabled).""" + cache = mock_model._voice_cache + + info = _make_info_dict( + voice_name="bob", + voice_created_at=0.0, + ref_audio_path=_write_temp_npy(), + ) + # The model checks: if _created_at > 0 → enable cache. + # With 0.0, no cache interaction should happen. 
+ _created_at = float(info.get("voice_created_at", 0)) + assert _created_at <= 0 + assert cache.stats()["hits"] == 0 + assert cache.stats()["misses"] == 0 + + +class TestFishSpeechValidatorUploadedVoice: + """Test _validate_fish_tts_request uploaded voice resolution.""" + + def test_uploaded_voice_resolves_ref_audio(self): + """When voice matches an uploaded speaker, ref_audio should be auto-set.""" + request = MagicMock() + request.input = "Hello" + request.voice = "alice" + request.ref_audio = None + request.ref_text = None + request.max_new_tokens = None + + # Uploaded speaker with ref_text. + uploaded_speakers = { + "alice": { + "file_path": "/tmp/fake_audio.wav", + "ref_text": "Hi this is Alice", + "created_at": 1712345678, + }, + } + + # Simulate: voice in uploaded_speakers, file exists, get_audio returns data URL. + with patch("pathlib.Path.exists", return_value=True): + voice_lower = request.voice.lower() + assert voice_lower in uploaded_speakers + + speaker_info = uploaded_speakers[voice_lower] + ref_text_from_upload = speaker_info.get("ref_text") + assert ref_text_from_upload == "Hi this is Alice" + + def test_uploaded_voice_without_ref_text_uses_request_ref_text(self): + """If upload has no ref_text but request provides it, use request's.""" + request = MagicMock() + request.input = "Hello" + request.voice = "bob" + request.ref_audio = None + request.ref_text = "Request-level transcript" + request.max_new_tokens = None + + uploaded_speakers = { + "bob": { + "file_path": "/tmp/fake_audio.wav", + "ref_text": None, + "created_at": 1712345678, + }, + } + + voice_lower = request.voice.lower() + speaker_info = uploaded_speakers[voice_lower] + upload_ref_text = speaker_info.get("ref_text") + # Upload has no ref_text, so request.ref_text should remain. + assert upload_ref_text is None + assert request.ref_text == "Request-level transcript" diff --git a/tests/test_fish_speech_voice_cache.py b/tests/test_fish_speech_voice_cache.py new file mode 100644 index 0000000000..8fe7a4a4d1 --- /dev/null +++ b/tests/test_fish_speech_voice_cache.py @@ -0,0 +1,218 @@ +"""Tests for Fish Speech DAC-code caching via VoiceEmbeddingCache. 
+ +Covers: + - Cache miss → DAC encode → store + - Cache hit → skip DAC encode, reuse cached ref_codes_fq + - Inline ref_audio (no voice name) → no caching, full encode path + - Stale-cache protection via created_at + - Temp file cleanup on cache hit +""" + +import os +import tempfile +from unittest.mock import MagicMock, patch + +import numpy as np +import pytest +import torch + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def _make_info_dict( + *, + text: str = "Hello world", + ref_text: str = "Reference transcript", + ref_audio_sr: int = 44100, + voice_name: str | None = None, + voice_created_at: float | None = None, + ref_audio_path: str | None = None, +) -> dict: + """Build a minimal info_dict for _build_structured_voice_clone_prefill_embeds.""" + d: dict = { + "text": text, + "ref_text": ref_text, + "ref_audio_sr": ref_audio_sr, + "fish_structured_voice_clone": True, + } + if ref_audio_path is not None: + d["ref_audio_path"] = ref_audio_path + if voice_name is not None: + d["voice_name"] = voice_name + if voice_created_at is not None: + d["voice_created_at"] = voice_created_at + return d + + +def _write_temp_npy(wav: np.ndarray | None = None) -> str: + """Write a temporary .npy file with dummy audio and return its path.""" + if wav is None: + wav = np.random.randn(44100).astype(np.float32) # 1 second @ 44.1kHz + with tempfile.NamedTemporaryFile(prefix="fish_test_", suffix=".npy", delete=False) as f: + np.save(f, wav) + return f.name + + +# Fake ref_codes_fq: [frames, codebooks] +_FAKE_REF_CODES = torch.randint(0, 1024, (10, 10), dtype=torch.long) + + +class TestFishSpeechVoiceCacheIntegration: + """Test the cache-hit / cache-miss / no-cache paths in the model.""" + + @pytest.fixture + def mock_model(self): + """Create a mock FishSpeechSlowARForConditionalGeneration with cache.""" + from vllm_omni.utils.voice_cache import VoiceEmbeddingCache + + model = MagicMock() + model._voice_cache = VoiceEmbeddingCache(max_entries=4) + model._semantic_begin_id = 151678 + model._num_codebooks = 10 + model._codebook_size = 4096 + model.model_path = "/fake/model" + model.codebook_embeddings = MagicMock() + model.codebook_embeddings.weight = MagicMock() + model.codebook_embeddings.weight.device = torch.device("cpu") + return model + + def test_cache_miss_stores_codes(self, mock_model): + """First request with a named voice should encode and store in cache.""" + cache = mock_model._voice_cache + voice_name = "alice" + created_at = 1712345678.0 + + # Verify cache starts empty. + key = cache.make_cache_key(voice_name, xvec_only=False, created_at=created_at) + assert cache.get(key) is None + + # Simulate a cache store (what the model does on miss). + cache.put(key, {"ref_codes_fq": _FAKE_REF_CODES.detach().cpu()}) + + # Verify it's now cached. + cached = cache.get(key) + assert cached is not None + assert torch.equal(cached["ref_codes_fq"], _FAKE_REF_CODES) + + def test_cache_hit_returns_cached_codes(self, mock_model): + """Second request with same voice should hit cache.""" + cache = mock_model._voice_cache + voice_name = "alice" + created_at = 1712345678.0 + + key = cache.make_cache_key(voice_name, xvec_only=False, created_at=created_at) + cache.put(key, {"ref_codes_fq": _FAKE_REF_CODES.detach().cpu()}) + + # Hit. 
+ cached = cache.get(key) + assert cached is not None + ref_codes = cached["ref_codes_fq"].to(device=torch.device("cpu"), dtype=torch.long) + assert torch.equal(ref_codes, _FAKE_REF_CODES) + assert cache.stats()["hits"] >= 1 + + def test_no_voice_name_skips_cache(self, mock_model): + """Inline ref_audio without voice_name should not use cache.""" + cache = mock_model._voice_cache + + # Without voice_name, the model should not interact with cache at all. + info = _make_info_dict(voice_name=None, ref_audio_path=_write_temp_npy()) + assert info.get("voice_name") is None + # Cache should remain untouched. + assert cache.stats()["hits"] == 0 + assert cache.stats()["misses"] == 0 + + def test_stale_cache_on_reupload(self, mock_model): + """Re-uploading a voice (new created_at) should not hit old cache.""" + cache = mock_model._voice_cache + voice_name = "alice" + + key_old = cache.make_cache_key(voice_name, xvec_only=False, created_at=1000.0) + cache.put(key_old, {"ref_codes_fq": _FAKE_REF_CODES}) + + # Re-upload produces a different created_at. + key_new = cache.make_cache_key(voice_name, xvec_only=False, created_at=2000.0) + assert cache.get(key_new) is None # miss + assert cache.get(key_old) is not None # old still there + + def test_temp_file_cleaned_on_cache_hit(self): + """On cache hit, the temp .npy file written by the entrypoint should be deleted.""" + tmp_path = _write_temp_npy() + assert os.path.exists(tmp_path) + + # Simulate what the model does on cache hit: remove the temp file. + try: + os.remove(tmp_path) + except OSError: + pass + assert not os.path.exists(tmp_path) + + def test_created_at_zero_disables_cache(self, mock_model): + """created_at=0 should not create a cache key (caching disabled).""" + cache = mock_model._voice_cache + + info = _make_info_dict( + voice_name="bob", + voice_created_at=0.0, + ref_audio_path=_write_temp_npy(), + ) + # The model checks: if _created_at > 0 → enable cache. + # With 0.0, no cache interaction should happen. + _created_at = float(info.get("voice_created_at", 0)) + assert _created_at <= 0 + assert cache.stats()["hits"] == 0 + assert cache.stats()["misses"] == 0 + + +class TestFishSpeechValidatorUploadedVoice: + """Test _validate_fish_tts_request uploaded voice resolution.""" + + def test_uploaded_voice_resolves_ref_audio(self): + """When voice matches an uploaded speaker, ref_audio should be auto-set.""" + request = MagicMock() + request.input = "Hello" + request.voice = "alice" + request.ref_audio = None + request.ref_text = None + request.max_new_tokens = None + + # Uploaded speaker with ref_text. + uploaded_speakers = { + "alice": { + "file_path": "/tmp/fake_audio.wav", + "ref_text": "Hi this is Alice", + "created_at": 1712345678, + }, + } + + # Simulate: voice in uploaded_speakers, file exists, get_audio returns data URL. 
+ with patch("pathlib.Path.exists", return_value=True): + voice_lower = request.voice.lower() + assert voice_lower in uploaded_speakers + + speaker_info = uploaded_speakers[voice_lower] + ref_text_from_upload = speaker_info.get("ref_text") + assert ref_text_from_upload == "Hi this is Alice" + + def test_uploaded_voice_without_ref_text_uses_request_ref_text(self): + """If upload has no ref_text but request provides it, use request's.""" + request = MagicMock() + request.input = "Hello" + request.voice = "bob" + request.ref_audio = None + request.ref_text = "Request-level transcript" + request.max_new_tokens = None + + uploaded_speakers = { + "bob": { + "file_path": "/tmp/fake_audio.wav", + "ref_text": None, + "created_at": 1712345678, + }, + } + + voice_lower = request.voice.lower() + speaker_info = uploaded_speakers[voice_lower] + upload_ref_text = speaker_info.get("ref_text") + # Upload has no ref_text, so request.ref_text should remain. + assert upload_ref_text is None + assert request.ref_text == "Request-level transcript" diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 494c977d77..87ef6a4e9b 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -945,10 +945,32 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str return None def _validate_fish_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None: - """Validate Fish Speech request parameters. Returns error message or None.""" + """Validate Fish Speech request parameters. Returns error message or None. + + Side effect: if request.voice references an uploaded speaker, resolves + it to request.ref_audio and request.ref_text for voice cloning. + """ if not request.input or not request.input.strip(): return "Input text cannot be empty" + # Support uploaded voices: auto-resolve voice → ref_audio + ref_text. + if request.voice is not None and request.ref_audio is None: + voice_lower = request.voice.lower() + if voice_lower in self.uploaded_speakers: + speaker_info = self.uploaded_speakers[voice_lower] + file_path = Path(speaker_info["file_path"]) + if not file_path.exists(): + return f"Audio file for uploaded voice '{request.voice}' not found on disk" + audio_data_url = self._get_uploaded_audio_data(voice_lower) + if audio_data_url is None: + return f"Could not load audio for uploaded voice '{request.voice}'" + request.ref_audio = audio_data_url + # Use ref_text from upload metadata if not provided in request. + if not request.ref_text or not request.ref_text.strip(): + upload_ref_text = speaker_info.get("ref_text") + if upload_ref_text and upload_ref_text.strip(): + request.ref_text = upload_ref_text + if request.ref_audio is not None: fmt_err = self._validate_ref_audio_format(request.ref_audio) if fmt_err: @@ -1303,13 +1325,19 @@ def _build_fish_speech_prompt( # Structured clone: scalars (not list-wrapped) because model-side # preprocess() consumes per-request fields directly. - additional_information = { + additional_information: dict[str, Any] = { "text": normalized_text, "ref_text": normalized_ref_text, "ref_audio_wav": torch.from_numpy(np.asarray(wav_samples, dtype=np.float32)), "ref_audio_sr": int(sr), "fish_structured_voice_clone": True, } + # Pass voice identity for model-side DAC code caching. 
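+        # voice_created_at is part of the model-side cache key, so re-uploading
+        # a voice (new created_at) invalidates previously cached DAC codes.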
+ if request.voice is not None: + voice_lower = request.voice.lower() + if voice_lower in self.uploaded_speakers: + additional_information["voice_name"] = voice_lower + additional_information["voice_created_at"] = self.uploaded_speakers[voice_lower].get("created_at", 0) if request.max_new_tokens is not None: additional_information["max_new_tokens"] = request.max_new_tokens return { diff --git a/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py b/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py index 9333400593..3813597caa 100644 --- a/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py +++ b/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py @@ -32,6 +32,7 @@ from vllm.sequence import IntermediateTensors from vllm_omni.model_executor.models.output_templates import OmniOutput +from vllm_omni.utils.voice_cache import VoiceEmbeddingCache from .configuration_fish_speech import FishSpeechConfig, FishSpeechFastARConfig, FishSpeechSlowARConfig from .dac_encoder import _load_dac_codec, encode_reference_audio_codes @@ -249,6 +250,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): semantic_mask[im_end_id] = True self.register_buffer("_semantic_allowed_mask", semantic_mask, persistent=False) + # In-memory LRU cache for DAC-encoded reference audio codes. + self._voice_cache = VoiceEmbeddingCache() + # Tokeniser (lazy). self._tokenizer = None @@ -520,6 +524,39 @@ def _build_structured_voice_clone_prefill_embeds(self, info_dict: dict[str, Any] ref_audio_sr = info_dict.get("ref_audio_sr") if not isinstance(ref_text, str) or not isinstance(text, str): raise ValueError("Fish Speech structured voice clone requires string text and ref_text") + + # --- Voice cache: reuse DAC codes for uploaded (named) voices --- + _voice_cache_key: str | None = None + voice_name = info_dict.get("voice_name") + voice_created_at = info_dict.get("voice_created_at") + if isinstance(voice_name, str) and voice_name: + _created_at = float(voice_created_at) if voice_created_at is not None else 0.0 + if _created_at <= 0: + logger.warning( + "Voice '%s' has no created_at timestamp; DAC code caching disabled for this request", + voice_name, + ) + else: + _voice_cache_key = self._voice_cache.make_cache_key( + voice_name, + xvec_only=False, + created_at=_created_at, + ) + _cached = self._voice_cache.get(_voice_cache_key) + if _cached is not None: + ref_codes_fq = _cached["ref_codes_fq"].to( + device=self.codebook_embeddings.weight.device, + dtype=torch.long, + ) + _voice_cache_key = None # hit → don't store again + logger.debug("Voice cache HIT for Fish Speech voice '%s'", voice_name) + return self._apply_codebook_embeddings( + tokenizer, + text, + ref_text, + ref_codes_fq, + ) + if not isinstance(ref_audio_sr, int): raise ValueError("Fish Speech structured voice clone requires integer ref_audio_sr") @@ -537,6 +574,25 @@ def _build_structured_voice_clone_prefill_embeds(self, info_dict: dict[str, Any] ref_audio_sr, device=self.codebook_embeddings.weight.device, ) + + # Cache miss: store DAC codes for future reuse. 
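+        # _voice_cache_key is None for inline/unnamed voices and after a cache
+        # hit above, so only genuine misses for named voices are stored here.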
+ if _voice_cache_key is not None: + self._voice_cache.put( + _voice_cache_key, + {"ref_codes_fq": ref_codes_fq.detach().cpu()}, + ) + logger.debug("Voice cache STORE for Fish Speech voice '%s'", voice_name) + + return self._apply_codebook_embeddings(tokenizer, text, ref_text, ref_codes_fq) + + def _apply_codebook_embeddings( + self, + tokenizer: Any, + text: str, + ref_text: str, + ref_codes_fq: torch.Tensor, + ) -> torch.Tensor: + """Build prefill embeddings from DAC codes and inject codebook conditioning.""" semantic_token_ids = (ref_codes_fq[:, 0] + self._semantic_begin_id).tolist() prompt_ids, _, _ = build_fish_voice_clone_prompt_ids( tokenizer, From 86985ed9db1cd76cd67de2405eccfadc82c677a9 Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Fri, 10 Apr 2026 11:05:19 +0800 Subject: [PATCH 112/204] [CI] Update merge condition in upload_pipeline_with_skip_ci.sh to include 'merge-test' label for non-main branches (#2666) Signed-off-by: wangyu <410167048@qq.com> Co-authored-by: Hongsheng Liu --- .buildkite/scripts/upload_pipeline_with_skip_ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/scripts/upload_pipeline_with_skip_ci.sh b/.buildkite/scripts/upload_pipeline_with_skip_ci.sh index c00140de46..6259d39b29 100644 --- a/.buildkite/scripts/upload_pipeline_with_skip_ci.sh +++ b/.buildkite/scripts/upload_pipeline_with_skip_ci.sh @@ -126,7 +126,7 @@ if skip: else: rep = "'true'" ready_rep = "'build.branch != \"main\" && build.pull_request.labels includes \"ready\"'" - merge_rep = "'build.branch == \"main\" && build.env(\"NIGHTLY\") != \"1\"'" + merge_rep = "'(build.branch == \"main\" && build.env(\"NIGHTLY\") != \"1\") || (build.branch != \"main\" && build.pull_request.labels includes \"merge-test\")'" rendered = ( continuation .replace("__IMAGE_BUILD_IF__", rep) From f3f2dc590c73d06a47608f2b78e13804d1032f32 Mon Sep 17 00:00:00 2001 From: JohnJan Date: Fri, 10 Apr 2026 13:55:21 +0800 Subject: [PATCH 113/204] [Feature]: support Flux.2-dev CFG-Parallel (#2010) --- docs/user_guide/diffusion_features.md | 2 +- .../offline_inference/text_to_image/README.md | 2 +- .../test_flux_2_dev_expansion.py | 15 +++ .../diffusion/models/flux2/pipeline_flux2.py | 96 +++++++++++++++---- 4 files changed, 97 insertions(+), 18 deletions(-) diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index c09705ae05..7e08851812 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -110,7 +110,7 @@ The following tables show which models support each feature: | **FLUX.1-dev** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | | **FLUX.2-klein** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | | **FLUX.1-Kontext-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **FLUX.2-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | +| **FLUX.2-dev** | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **GLM-Image** | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **HunyuanImage3** | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | | **LongCat-Image** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | diff --git a/examples/offline_inference/text_to_image/README.md b/examples/offline_inference/text_to_image/README.md index 4796a17692..cc295e8279 100644 --- a/examples/offline_inference/text_to_image/README.md +++ b/examples/offline_inference/text_to_image/README.md @@ -247,7 +247,7 @@ python examples/offline_inference/text_to_image/text_to_image.py \ #### CFG Parallel Set `--cfg-parallel-size 2` to enable CFG Parallel for faster inference 
on multi-GPU setups. -See more examples in the [diffusion acceleration user guide](../../../docs/user_guide/diffusion_acceleration.md#using-cfg-parallel). +See more examples in the [cfg_parallel user guide](../../../docs/user_guide/parallelism/cfg_parallel.md#using-cfg-parallel). #### LoRA diff --git a/tests/e2e/online_serving/test_flux_2_dev_expansion.py b/tests/e2e/online_serving/test_flux_2_dev_expansion.py index eba0fbda22..c7140769ba 100644 --- a/tests/e2e/online_serving/test_flux_2_dev_expansion.py +++ b/tests/e2e/online_serving/test_flux_2_dev_expansion.py @@ -29,6 +29,7 @@ NEGATIVE_PROMPT = "low quality, blurry, distorted, deformed, watermark" SINGLE_CARD_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}) +PARALLEL_FEATURE_MARKS = hardware_marks(res={"cuda": "L4"}, num_cards=2) def _get_flux_2_dev_feature_cases(model: str): @@ -47,6 +48,20 @@ def _get_flux_2_dev_feature_cases(model: str): id="cache_dit_cpu_offload", marks=SINGLE_CARD_FEATURE_MARKS, ), + pytest.param( + OmniServerParams( + model=model, + server_args=[ + "--cache-backend", + "cache_dit", + "--enable-cpu-offload", + "--cfg-parallel-size", + "2", + ], + ), + id="parallel_cfg_2", + marks=PARALLEL_FEATURE_MARKS, + ), ] diff --git a/vllm_omni/diffusion/models/flux2/pipeline_flux2.py b/vllm_omni/diffusion/models/flux2/pipeline_flux2.py index 00d3288501..404f05b606 100644 --- a/vllm_omni/diffusion/models/flux2/pipeline_flux2.py +++ b/vllm_omni/diffusion/models/flux2/pipeline_flux2.py @@ -25,6 +25,8 @@ from vllm.model_executor.models.utils import AutoWeightsLoader from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig +from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin +from vllm_omni.diffusion.distributed.parallel_state import get_classifier_free_guidance_world_size from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.flux2 import Flux2Transformer2DModel @@ -333,7 +335,7 @@ def retrieve_latents(encoder_output: torch.Tensor, generator: torch.Generator = raise AttributeError("Could not access latents of provided encoder_output") -class Flux2Pipeline(nn.Module, SupportImageInput, ProgressBarMixin, DiffusionPipelineProfilerMixin): +class Flux2Pipeline(nn.Module, CFGParallelMixin, SupportImageInput, ProgressBarMixin, DiffusionPipelineProfilerMixin): """Flux2 pipeline for text-to-image generation.""" _callback_tensor_inputs = ["latents", "prompt_embeds"] @@ -854,6 +856,21 @@ def current_timestep(self): def interrupt(self): return self._interrupt + def check_cfg_parallel_validity(self, true_cfg_scale: float, has_neg_prompt: bool): + if get_classifier_free_guidance_world_size() == 1: + return True + + if true_cfg_scale <= 1: + logger.warning("CFG parallel is NOT working correctly when true_cfg_scale <= 1.") + return False + + if not has_neg_prompt: + logger.warning( + "CFG parallel is NOT working correctly when there is no negative prompt or negative prompt embeddings." 
+ ) + return False + return True + def forward( self, req: OmniDiffusionRequest, @@ -921,6 +938,14 @@ def forward( # And `torch.stack` automatically raises an exception for us prompt_embeds = torch.stack(req_prompt_embeds) # type: ignore # intentionally expect TypeError + req_negative_prompt_embeds = [ + p.get("negative_prompt_embeds") if not isinstance(p, str) else None for p in req.prompts + ] + if all(p is not None for p in req_negative_prompt_embeds): + negative_prompt_embeds = torch.stack(req_negative_prompt_embeds) # type: ignore # intentionally expect TypeError + + req_negative_prompt = ["" if isinstance(p, str) else (p.get("negative_prompt") or "") for p in req.prompts] + # 1. Check inputs. Raise error if not correct self.check_inputs( prompt=prompt, @@ -958,6 +983,22 @@ def forward( text_encoder_out_layers=text_encoder_out_layers, ) + has_neg_prompt = negative_prompt_embeds is not None or any(req_negative_prompt) + do_true_cfg = self.guidance_scale > 1 and has_neg_prompt + + self.check_cfg_parallel_validity(self.guidance_scale, has_neg_prompt) + negative_text_ids = None + if do_true_cfg: + negative_prompt = req_negative_prompt + negative_prompt_embeds, negative_text_ids = self.encode_prompt( + prompt=negative_prompt, + prompt_embeds=negative_prompt_embeds, + device=device, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + text_encoder_out_layers=text_encoder_out_layers, + ) + # 4. process images if image is not None and not isinstance(image, list): image = [image] @@ -1029,6 +1070,9 @@ def forward( guidance_tensor = torch.full([1], self.guidance_scale, device=device, dtype=torch.float32) guidance_tensor = guidance_tensor.expand(latents.shape[0]) + # For editing pipelines, we need to slice the output to remove condition latents + output_slice = latents.size(1) if image_latents is not None else None + # 7. Denoising loop # We set the index here to remove DtoH sync, helpful especially during compilation. 
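`predict_noise_maybe_with_cfg` and `scheduler_step_maybe_with_cfg` used in the loop below come from `CFGParallelMixin`, whose body is not part of this diff. As a rough single-process mental model only (a sketch, not the mixin's code), true classifier-free guidance combines the positive and negative branches as shown here; the helper name `combine_true_cfg` and the optional renormalization are illustrative assumptions.

```python
import torch


def combine_true_cfg(
    noise_pred_pos: torch.Tensor,
    noise_pred_neg: torch.Tensor,
    true_cfg_scale: float,
    cfg_normalize: bool = False,
) -> torch.Tensor:
    # Standard true-CFG update: move away from the negative-prompt branch
    # by the guidance scale.
    noise_pred = noise_pred_neg + true_cfg_scale * (noise_pred_pos - noise_pred_neg)
    if cfg_normalize:
        # Optional rescaling toward the positive branch's magnitude,
        # sometimes used to curb over-saturation at large scales.
        pos_norm = noise_pred_pos.norm(dim=-1, keepdim=True)
        cfg_norm = noise_pred.norm(dim=-1, keepdim=True).clamp_min(1e-8)
        noise_pred = noise_pred * (pos_norm / cfg_norm)
    return noise_pred
```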
# Check out more details here: https://github.com/huggingface/diffusers/pull/11696 @@ -1048,21 +1092,41 @@ def forward( latent_model_input = torch.cat([latents, image_latents], dim=1).to(self.transformer.dtype) latent_image_ids = torch.cat([latent_ids, image_latent_ids], dim=1) - noise_pred = self.transformer( - hidden_states=latent_model_input, # (B, image_seq_len, C) - timestep=timestep / 1000, - guidance=guidance_tensor, - encoder_hidden_states=prompt_embeds, - txt_ids=text_ids, # B, text_seq_len, 4 - img_ids=latent_image_ids, # B, image_seq_len, 4 - joint_attention_kwargs=self.attention_kwargs, - return_dict=False, - )[0] - - noise_pred = noise_pred[:, : latents.size(1) :] - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + positive_kwargs = { + "hidden_states": latent_model_input, + "timestep": timestep / 1000, + "guidance": guidance_tensor, + "encoder_hidden_states": prompt_embeds, + "txt_ids": text_ids, + "img_ids": latent_image_ids, + "joint_attention_kwargs": self.attention_kwargs, + "return_dict": False, + } + if do_true_cfg: + negative_kwargs = { + "hidden_states": latent_model_input, + "timestep": timestep / 1000, + "guidance": guidance_tensor, + "encoder_hidden_states": negative_prompt_embeds, + "txt_ids": negative_text_ids, + "img_ids": latent_image_ids, + "joint_attention_kwargs": self.attention_kwargs, + "return_dict": False, + } + else: + negative_kwargs = None + + noise_pred = self.predict_noise_maybe_with_cfg( + do_true_cfg=do_true_cfg, + true_cfg_scale=self.guidance_scale, + positive_kwargs=positive_kwargs, + negative_kwargs=negative_kwargs, + cfg_normalize=False, + output_slice=output_slice, + ) + + # Compute the previous noisy sample x_t -> x_t-1 with automatic CFG sync + latents = self.scheduler_step_maybe_with_cfg(noise_pred, t, latents, do_true_cfg) if callback_on_step_end is not None: callback_kwargs = {} From cb91cbe61e87c4a2fec5c11d1597e4c2bd922ad2 Mon Sep 17 00:00:00 2001 From: wuhang Date: Fri, 10 Apr 2026 14:40:37 +0800 Subject: [PATCH 114/204] [Entrypoint][Refactor]Stage CLI Refactor (#2020) Signed-off-by: wuhang Signed-off-by: Didan Deng <33117903+wtomin@users.noreply.github.com> Co-authored-by: Didan Deng <33117903+wtomin@users.noreply.github.com> --- .../bagel/run_server_stage_cli.sh | 182 +- tests/conftest.py | 247 ++- tests/dfx/perf/scripts/run_benchmark.py | 2 +- .../test_flux_2_dev_expansion.py | 16 +- .../test_qwen3_omni_expansion.py | 10 +- tests/engine/test_arg_utils.py | 12 + .../test_async_omni_engine_stage_init.py | 23 +- tests/engine/test_single_stage_mode.py | 1645 +++++++++++++++++ tests/entrypoints/test_serve.py | 195 ++ tests/entrypoints/test_utils.py | 69 + vllm_omni/diffusion/stage_diffusion_client.py | 55 +- vllm_omni/diffusion/stage_diffusion_proc.py | 9 +- vllm_omni/engine/arg_utils.py | 23 +- vllm_omni/engine/async_omni_engine.py | 526 ++++-- vllm_omni/engine/stage_engine_core_client.py | 85 +- vllm_omni/engine/stage_engine_startup.py | 599 ++++++ vllm_omni/engine/stage_init_utils.py | 107 +- vllm_omni/entrypoints/cli/serve.py | 234 ++- vllm_omni/entrypoints/openai/api_server.py | 3 +- 19 files changed, 3771 insertions(+), 271 deletions(-) create mode 100644 tests/engine/test_single_stage_mode.py create mode 100644 tests/entrypoints/test_serve.py create mode 100644 vllm_omni/engine/stage_engine_startup.py diff --git a/examples/online_serving/bagel/run_server_stage_cli.sh b/examples/online_serving/bagel/run_server_stage_cli.sh index 51639153f7..2d0b4bc369 
100644 --- a/examples/online_serving/bagel/run_server_stage_cli.sh +++ b/examples/online_serving/bagel/run_server_stage_cli.sh @@ -1,34 +1,164 @@ #!/bin/bash -# Bagel multi-stage online serving startup script -# Starts stage 0 as master with API server, and stage 1 in headless mode +# Bagel multi-stage online serving startup script. +# +# Usage: +# ./run_server_stage_cli.sh --stage 0 +# ./run_server_stage_cli.sh --stage 1 +# ./run_server_stage_cli.sh --stage 0 -- --tensor-parallel-size 2 +# ./run_server_stage_cli.sh --stage 1 -- --gpu-memory-utilization 0.9 +# +# By default, `--stage all` keeps the old behavior and launches both stages in +# one session. Use `--stage 0` / `--stage 1` to launch each stage separately in +# different terminal sessions, with stage-specific extra CLI arguments passed +# after `--`. + +set -euo pipefail MODEL="${MODEL:-ByteDance-Seed/BAGEL-7B-MoT}" PORT="${PORT:-8091}" MASTER_ADDRESS="${MASTER_ADDRESS:-127.0.0.1}" MASTER_PORT="${MASTER_PORT:-8092}" -STAGE_CONFIGS_PATH="$(dirname "$0")/../../../vllm_omni/model_executor/stage_configs/bagel.yaml" +STAGE="all" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +STAGE_CONFIGS_PATH="${STAGE_CONFIGS_PATH:-$SCRIPT_DIR/../../../vllm_omni/model_executor/stage_configs/bagel.yaml}" +EXTRA_ARGS=() + +usage() { + cat <&2 + usage + exit 1 + ;; + esac +done + +if [[ "$STAGE" != "0" && "$STAGE" != "1" && "$STAGE" != "all" ]]; then + echo "Invalid --stage value: $STAGE" >&2 + usage + exit 1 +fi + +print_config() { + echo "Model: $MODEL" + echo "API Port: $PORT" + echo "Master Address: $MASTER_ADDRESS" + echo "Master Port: $MASTER_PORT" + echo "Stage Configs: $STAGE_CONFIGS_PATH" + echo "Selected Stage: $STAGE" + if [[ ${#EXTRA_ARGS[@]} -gt 0 ]]; then + echo "Extra Args: ${EXTRA_ARGS[*]}" + fi +} + +run_stage_0() { + echo "Starting Stage 0 (Thinker) as master..." + vllm serve "$MODEL" --omni \ + --port "$PORT" \ + --stage-configs-path "$STAGE_CONFIGS_PATH" \ + --stage-id 0 \ + -oma "$MASTER_ADDRESS" \ + -omp "$MASTER_PORT" \ + "${EXTRA_ARGS[@]}" +} + +run_stage_1() { + echo "Starting Stage 1 (DiT) in headless mode..." + vllm serve "$MODEL" --omni \ + --stage-configs-path "$STAGE_CONFIGS_PATH" \ + --stage-id 1 \ + --headless \ + -oma "$MASTER_ADDRESS" \ + -omp "$MASTER_PORT" \ + "${EXTRA_ARGS[@]}" +} echo "Starting Bagel multi-stage server..." -echo "Model: $MODEL" -echo "API Port: $PORT" -echo "Master Address: $MASTER_ADDRESS" -echo "Master Port: $MASTER_PORT" -echo "Stage Configs: $STAGE_CONFIGS_PATH" - -# Start stage 1 (DiT) in headless mode first -echo "Starting Stage 1 (DiT) in headless mode..." -vllm serve "$MODEL" --omni \ - --stage-configs-path "$STAGE_CONFIGS_PATH" \ - --stage-id 1 \ - --headless \ - -oma "$MASTER_ADDRESS" \ - -omp "$MASTER_PORT" & - -# Start stage 0 (Thinker) as master with API server -echo "Starting Stage 0 (Thinker) as master..." -vllm serve "$MODEL" --omni \ - --port "$PORT" \ - --stage-configs-path "$STAGE_CONFIGS_PATH" \ - --stage-id 0 \ - -oma "$MASTER_ADDRESS" \ - -omp "$MASTER_PORT" +print_config + +case "$STAGE" in + 0) + run_stage_0 + ;; + 1) + run_stage_1 + ;; + all) + echo "Launching both stages in one session (legacy mode)..." + echo "Starting Stage 0 (Thinker) in background first..." + run_stage_0 & + STAGE_0_PID=$! + + cleanup() { + if [[ -n "${STAGE_0_PID:-}" ]]; then + kill "$STAGE_0_PID" 2>/dev/null || true + wait "$STAGE_0_PID" 2>/dev/null || true + fi + } + + trap cleanup EXIT INT TERM + + echo "Waiting briefly for Stage 0 to initialize..." 
+ sleep 2 + run_stage_1 + ;; +esac diff --git a/tests/conftest.py b/tests/conftest.py index 8ac790f137..27833fe282 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -73,6 +73,8 @@ class OmniServerParams(NamedTuple): server_args: list[str] | None = None env_dict: dict[str, str] | None = None use_omni: bool = True + use_stage_cli: bool = False + init_timeout: int | None = None def assert_image_diffusion_response( @@ -1546,6 +1548,183 @@ def __exit__(self, exc_type, exc_val, exc_tb): cleanup_dist_env_and_memory() +class OmniServerStageCli(OmniServer): + """Omni server harness that exercises the stage CLI flow.""" + + def __init__( + self, + model: str, + stage_config_path: str, + serve_args: list[str] | None = None, + *, + stage_ids: list[int] | None = None, + port: int | None = None, + env_dict: dict[str, str] | None = None, + ) -> None: + super().__init__(model, serve_args or [], port=port, env_dict=env_dict, use_omni=True) + self.stage_config_path = stage_config_path + self.master_port = get_open_port() + self.visible_device_list = self._load_visible_device_list(env_dict) + self.stage_runtime_devices = self._load_stage_runtime_devices(stage_config_path) + self.stage_ids = stage_ids or self._load_stage_ids(stage_config_path) + if 0 not in self.stage_ids: + raise ValueError(f"Stage CLI test requires stage_id=0 in config: {stage_config_path}") + self.stage_procs: dict[int, subprocess.Popen] = {} + self.proc = None + + @staticmethod + def _load_stage_ids(stage_config_path: str) -> list[int]: + with open(stage_config_path, encoding="utf-8") as f: + cfg = yaml.safe_load(f) or {} + + stage_ids = [stage["stage_id"] for stage in cfg.get("stage_args", []) if "stage_id" in stage] + if not stage_ids: + raise ValueError(f"No stage IDs found in config: {stage_config_path}") + return stage_ids + + @staticmethod + def _load_stage_runtime_devices(stage_config_path: str) -> dict[int, str]: + with open(stage_config_path, encoding="utf-8") as f: + cfg = yaml.safe_load(f) or {} + + runtime_devices: dict[int, str] = {} + for stage in cfg.get("stage_args", []): + stage_id = stage.get("stage_id") + devices = stage.get("runtime", {}).get("devices") + if stage_id is not None and devices: + runtime_devices[int(stage_id)] = str(devices) + return runtime_devices + + @classmethod + def _parse_device_list(cls, devices: str | int) -> list[str]: + if isinstance(devices, int): + if devices < 0: + raise ValueError("Device IDs must be non-negative integers") + return [str(devices)] + return [token.strip() for token in str(devices).split(",") if token.strip()] + + @classmethod + def _load_visible_device_list(cls, env_dict: dict[str, str] | None) -> list[str] | None: + env = os.environ.copy() + if env_dict is not None: + env.update(env_dict) + + env_var = getattr(current_omni_platform, "device_control_env_var", None) + if env_var and env_var in env: + return [token.strip() for token in env[env_var].split(",") if token.strip()] + return None + + @classmethod + def _map_stage_devices(cls, stage_id: int, visible_device_list: list[str] | None, devices: str) -> str: + device_list = cls._parse_device_list(devices) + + if visible_device_list is None: + return ",".join(device_list) + + if not all(device.isdigit() for device in device_list): + raise ValueError("Logical devices must be non-negative integers") + + logical_ids = [int(device) for device in device_list] + if logical_ids and max(logical_ids) >= len(visible_device_list): + raise ValueError( + f"Stage {stage_id} has logical IDs {device_list}, one or more of which 
exceed the number of visible devices" + ) + + return ",".join(visible_device_list[idx] for idx in logical_ids) + + def _set_stage_device_env(self, stage_id: int, env: dict[str, str], devices: str) -> None: + mapped_devices = self._map_stage_devices(stage_id, self.visible_device_list, devices) + env_var = getattr(current_omni_platform, "device_control_env_var", None) + if env_var: + env[env_var] = mapped_devices + + def _build_stage_cmd(self, stage_id: int, *, headless: bool) -> list[str]: + cmd = [ + sys.executable, + "-m", + "vllm_omni.entrypoints.cli.main", + "serve", + self.model, + "--omni", + "--stage-configs-path", + self.stage_config_path, + "--stage-id", + str(stage_id), + "--omni-master-address", + self.host, + "--omni-master-port", + str(self.master_port), + ] + + if headless: + cmd.append("--headless") + else: + cmd += ["--host", self.host, "--port", str(self.port)] + + cmd += self.serve_args + return cmd + + def _launch_stage(self, stage_id: int, *, headless: bool) -> None: + env = os.environ.copy() + env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + if self.env_dict is not None: + env.update(self.env_dict) + + devices = self.stage_runtime_devices.get(stage_id) + if devices: + self._set_stage_device_env(stage_id, env, devices) + + cmd = self._build_stage_cmd(stage_id, headless=headless) + print(f"Launching OmniServerStageCli stage {stage_id}: {' '.join(cmd)}") + proc = subprocess.Popen( + cmd, + env=env, + cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + ) + self.stage_procs[stage_id] = proc + if stage_id == 0: + self.proc = proc + + def _ensure_stage_processes_alive(self) -> None: + for stage_id, proc in self.stage_procs.items(): + ret = proc.poll() + if ret is not None: + raise RuntimeError(f"Stage {stage_id} exited with code {ret} before API server became ready.") + + def _start_server(self) -> None: + ordered_stage_ids = [0, *[stage_id for stage_id in self.stage_ids if stage_id != 0]] + + self._launch_stage(0, headless=False) + time.sleep(2) + self._ensure_stage_processes_alive() + + for stage_id in ordered_stage_ids[1:]: + self._launch_stage(stage_id, headless=True) + + max_wait = 1200 + start_time = time.time() + while time.time() - start_time < max_wait: + self._ensure_stage_processes_alive() + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.settimeout(1) + result = sock.connect_ex((self.host, self.port)) + if result == 0: + print(f"OmniServerStageCli ready on {self.host}:{self.port}") + return + time.sleep(2) + + raise RuntimeError(f"OmniServerStageCli failed to start within {max_wait} seconds") + + def __exit__(self, exc_type, exc_val, exc_tb): + for stage_id in sorted(self.stage_procs, reverse=True): + proc = self.stage_procs[stage_id] + if proc.poll() is None: + self._kill_process_tree(proc.pid) + _run_pre_test_cleanup(enable_force=True) + _run_post_test_cleanup(enable_force=True) + cleanup_dist_env_and_memory() + + def pytest_addoption(parser): parser.addoption( "--run-level", @@ -1568,9 +1747,11 @@ def run_level(request) -> str: @pytest.fixture(scope="module") def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: str) -> Generator[OmniServer, Any, None]: - """Start vLLM-Omni server as a subprocess with actual model weights. - Uses session scope so the server starts only once for the entire test session. - Multi-stage initialization can take 10-20+ minutes. + """Start vLLM-Omni through the standard or stage-CLI launcher. + + The fixture stays module-scoped because multi-stage initialization is costly. 
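The `_map_stage_devices` helper above resolves a stage's logical `runtime.devices` against whatever the platform's device-control variable (for example `CUDA_VISIBLE_DEVICES` on CUDA) already exposes. A standalone restatement of that rule with a worked example; the simplified `map_stage_devices` name is hypothetical and the validation shown above is omitted.

```python
def map_stage_devices(visible: list[str] | None, devices: str) -> str:
    # Logical IDs in the stage config index into the already-visible devices.
    logical = [tok.strip() for tok in devices.split(",") if tok.strip()]
    if visible is None:
        return ",".join(logical)
    return ",".join(visible[int(idx)] for idx in logical)


# With CUDA_VISIBLE_DEVICES=4,5,6,7 and a stage configured with devices "0,1",
# the stage subprocess is pinned to physical GPUs 4 and 5.
assert map_stage_devices(["4", "5", "6", "7"], "0,1") == "4,5"
assert map_stage_devices(None, "2,3") == "2,3"
```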
+ The ``use_stage_cli`` flag on ``OmniServerParams`` routes the setup through the + stage-CLI harness while still reusing the same fixture grouping semantics. """ with _omni_server_lock: params: OmniServerParams = request.param @@ -1589,28 +1770,47 @@ def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: st server_args = params.server_args or [] if params.use_omni: server_args = ["--stage-init-timeout", "120", *server_args] - if stage_config_path is not None: - server_args += ["--stage-configs-path", stage_config_path] - - with ( - OmniServer( + if params.init_timeout is not None: + server_args = [*server_args, "--init-timeout", str(params.init_timeout)] + if params.use_stage_cli: + if not params.use_omni: + raise ValueError("omni_server with use_stage_cli=True requires use_omni=True") + if stage_config_path is None: + raise ValueError("omni_server with use_stage_cli=True requires a stage_config_path") + + with OmniServerStageCli( model, + stage_config_path, server_args, port=port, env_dict=params.env_dict, - use_omni=params.use_omni, - ) - if port - else OmniServer( - model, - server_args, - env_dict=params.env_dict, - use_omni=params.use_omni, - ) - ) as server: - print("OmniServer started successfully") - yield server - print("OmniServer stopping...") + ) as server: + print("OmniServer started successfully") + yield server + print("OmniServer stopping...") + else: + if stage_config_path is not None: + server_args += ["--stage-configs-path", stage_config_path] + + with ( + OmniServer( + model, + server_args, + port=port, + env_dict=params.env_dict, + use_omni=params.use_omni, + ) + if port + else OmniServer( + model, + server_args, + env_dict=params.env_dict, + use_omni=params.use_omni, + ) + ) as server: + print("OmniServer started successfully") + yield server + print("OmniServer stopping...") print("OmniServer stopped") @@ -2653,10 +2853,11 @@ def _build_url(self, path: str) -> str: @pytest.fixture -def openai_client(omni_server: OmniServer, run_level: str): +def openai_client(request: pytest.FixtureRequest, run_level: str): """Create OpenAIClientHandler fixture to facilitate communication with OmniServer with encapsulated request sending, concurrent requests, response handling, and validation.""" - return OpenAIClientHandler(host=omni_server.host, port=omni_server.port, api_key="EMPTY", run_level=run_level) + server = request.getfixturevalue("omni_server") + return OpenAIClientHandler(host=server.host, port=server.port, api_key="EMPTY", run_level=run_level) class OmniRunner: diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index 9e375fa9fe..c625239e5c 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -43,7 +43,7 @@ def omni_server(request): print(f"Starting OmniServer with test: {test_name}, model: {model}") - server_args = ["--stage-init-timeout", "120"] + server_args = ["--stage-init-timeout", "120", "--init-timeout", "900"] if stage_config_path: server_args = ["--stage-configs-path", stage_config_path] + server_args with OmniServer(model, server_args) as server: diff --git a/tests/e2e/online_serving/test_flux_2_dev_expansion.py b/tests/e2e/online_serving/test_flux_2_dev_expansion.py index c7140769ba..9d96a48c0c 100644 --- a/tests/e2e/online_serving/test_flux_2_dev_expansion.py +++ b/tests/e2e/online_serving/test_flux_2_dev_expansion.py @@ -2,13 +2,11 @@ End-to-end diffusion coverage for FLUX.2-dev in online serving mode. 
Coverage: -- Cache-DiT cache acceleration backend - CPU offload -This test verifies that FLUX.2-dev can be launched with the Cache-DiT backend -and CPU offload enabled, accepts text-to-image requests through the -OpenAI-compatible API, and returns valid generated images with the requested -resolution. +This test verifies that FLUX.2-dev can be launched with CPU offload enabled, +accepts text-to-image requests through the OpenAI-compatible API, and returns +valid generated images with the requested resolution. assert_diffusion_response validates successful generation and the expected image resolution. @@ -33,19 +31,17 @@ def _get_flux_2_dev_feature_cases(model: str): - """Return FLUX.2-dev diffusion feature cases for Cache-DiT + CPU offload.""" + """Return FLUX.2-dev diffusion feature cases for CPU offload.""" return [ pytest.param( OmniServerParams( model=model, server_args=[ - "--cache-backend", - "cache_dit", "--enable-cpu-offload", ], ), - id="cache_dit_cpu_offload", + id="cpu_offload", marks=SINGLE_CARD_FEATURE_MARKS, ), pytest.param( @@ -76,7 +72,7 @@ def test_flux_2_dev( omni_server: OmniServer, openai_client: OpenAIClientHandler, ): - """Validate FLUX.2-dev online serving with Cache-DiT and CPU offload.""" + """Validate FLUX.2-dev online serving with CPU offload.""" messages = dummy_messages_from_mix_data(content_text=PROMPT) diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index 0bcc86840b..1637627695 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -67,13 +67,17 @@ def get_batch_token_config(default_path): # Create parameter combinations for model and stage config test_params = [ - pytest.param(OmniServerParams(model=model, stage_config_path=default_path), id="default"), - pytest.param(OmniServerParams(model=model, stage_config_path=get_chunk_config(default_path)), id="async_chunk"), + pytest.param(OmniServerParams(model=model, stage_config_path=default_path, use_stage_cli=True), id="default"), + pytest.param( + OmniServerParams(model=model, stage_config_path=get_chunk_config(default_path), use_stage_cli=True), + id="async_chunk", + ), ] test_token_params = [ pytest.param( - OmniServerParams(model=model, stage_config_path=get_batch_token_config(default_path)), id="batch_token_64" + OmniServerParams(model=model, stage_config_path=get_batch_token_config(default_path), use_stage_cli=True), + id="batch_token_64", ) ] diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 7ba1cebece..5584b15d9f 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -24,6 +24,18 @@ def test_sync_config_is_omni(): assert isinstance(cfg, OmniModelConfig) +def test_default_stage_id_is_concrete_int(): + """Ensure `stage_id` stays safe for downstream arithmetic/indexing.""" + engine_args = OmniEngineArgs() + + assert engine_args.stage_id == 0 + assert isinstance(engine_args.stage_id, int) + assert engine_args.log_stats is False + + cfg = engine_args.create_model_config() + assert cfg.stage_id == 0 + + def test_multimodal_kwarg_overrides(): """Ensure that overrides in the multimodal config are preserved.""" # Get a different value than the default for a multimodal field diff --git a/tests/engine/test_async_omni_engine_stage_init.py b/tests/engine/test_async_omni_engine_stage_init.py index 9f47fd449d..31d3ed7751 100644 --- a/tests/engine/test_async_omni_engine_stage_init.py +++ 
b/tests/engine/test_async_omni_engine_stage_init.py @@ -1,3 +1,4 @@ +import importlib import os import types @@ -8,6 +9,17 @@ pytestmark = [pytest.mark.core_model, pytest.mark.cpu] +def test_stage_engine_core_client_module_reload_keeps_forward_refs_deferred(): + """Regression test for forward references in make_async_mp_client.""" + import vllm_omni.engine.stage_engine_core_client as client_mod + + importlib.reload(client_mod) + + assert client_mod.StageEngineCoreClientBase.make_async_mp_client.__annotations__["return"] == ( + "StageEngineCoreClient | DPLBStageEngineCoreClient" + ) + + def test_initialize_stages_restores_device_visibility_after_diffusion_init(monkeypatch): """Regression test for stage device env leakage across stage init. @@ -23,6 +35,9 @@ def test_initialize_stages_restores_device_visibility_after_diffusion_init(monke engine.num_stages = 1 engine.async_chunk = False engine.diffusion_batch_size = 1 + engine.single_stage_mode = False + engine._single_stage_id_filter = None + engine._omni_master_server = None engine.stage_configs = [types.SimpleNamespace(stage_id=0, stage_type="diffusion")] env_var = current_omni_platform.device_control_env_var @@ -49,7 +64,7 @@ def _fake_setup_stage_devices(_stage_id, _runtime_cfg): current_omni_platform.set_device_control_env_var("1") monkeypatch.setattr(engine_mod, "setup_stage_devices", _fake_setup_stage_devices) - monkeypatch.setattr(engine_mod, "_inject_kv_stage_info", lambda *_: None) + monkeypatch.setattr(engine_mod, "inject_kv_stage_info", lambda *_: None) monkeypatch.setattr(engine_mod, "initialize_diffusion_stage", lambda *_, **__: diffusion_client) monkeypatch.setattr( engine_mod, @@ -101,7 +116,11 @@ def __init__(self, vllm_config, renderer=None): self.vllm_config = vllm_config self.renderer = renderer - monkeypatch.setattr(engine_mod, "StageEngineCoreClient", DummyStageEngineCoreClient) + monkeypatch.setattr( + engine_mod.StageEngineCoreClientBase, + "make_async_mp_client", + staticmethod(lambda **kwargs: DummyStageEngineCoreClient(**kwargs)), + ) monkeypatch.setattr(engine_mod, "MultimodalOutputProcessor", DummyOutputProcessor) monkeypatch.setattr(engine_mod, "InputProcessor", DummyInputProcessor) monkeypatch.setattr(engine_mod, "OmniInputPreprocessor", DummyOmniInputPreprocessor) diff --git a/tests/engine/test_single_stage_mode.py b/tests/engine/test_single_stage_mode.py new file mode 100644 index 0000000000..627a98395f --- /dev/null +++ b/tests/engine/test_single_stage_mode.py @@ -0,0 +1,1645 @@ +"""Unit tests for AsyncOmniEngine single-stage mode and OmniMasterServer. + +These tests cover: +- OmniMasterServer address pre-allocation & ZMQ registration handshake +- AsyncOmniEngine single_stage_mode detection / _single_stage_id_filter setup +- _initialize_stages stage routing (local launch vs. remote-wait) in + single_stage_mode +- _create_remote_llm_stage delegation to connect_remote_engine_cores +- _launch_llm_stage delegation to launch_omni_core_engines in + single_stage_mode + +All tests run without real hardware by mocking ZMQ, vllm_config, and the +heavy initialization helpers. 
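Taken together with the tests that follow, the intended flow is: pre-allocate per-stage handshake/input/output addresses up front, serve registrations on a ROUTER socket, and hand out matching bind/connect address pairs. A minimal usage sketch, assuming only the constructor and accessors exercised by these tests rather than a documented public API.

```python
from vllm.utils.network_utils import get_open_port

from vllm_omni.engine.stage_engine_startup import OmniMasterServer

# Pre-allocate addresses for two stages, then serve registrations
# until every listed stage has checked in.
server = OmniMasterServer(master_address="127.0.0.1", master_port=get_open_port(), stage_ids=[0, 1])
server.start()
try:
    frontend_addrs = server.get_zmq_addresses(0)       # bind addresses used by the local client
    engine_addrs = server.get_engine_zmq_addresses(0)  # connect addresses handed to the engine core
    print(frontend_addrs.inputs, engine_addrs.inputs)
finally:
    # The serving thread exits once every stage has registered; stop() joins it.
    server.stop()
```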
+""" + +from __future__ import annotations + +import threading +from contextlib import contextmanager +from typing import Any +from unittest.mock import MagicMock, Mock, patch + +import pytest +from vllm.v1.engine.utils import EngineZmqAddresses + +from vllm_omni.engine.async_omni_engine import AsyncOmniEngine +from vllm_omni.engine.stage_engine_startup import ( + OmniMasterServer, + StageAllocation, + StageCoordinatorAddresses, + connect_remote_engine_cores, + launch_omni_core_engines, +) +from vllm_omni.engine.stage_init_utils import StartedLlmStage + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_stage_cfg(stage_id: int, stage_type: str = "llm") -> Mock: + """Return a lightweight stage config mock.""" + cfg = Mock() + cfg.stage_id = stage_id + cfg.stage_type = stage_type + cfg.engine_args = MagicMock() + cfg.engine_args.async_chunk = False + cfg.engine_args.model_stage = None + cfg.engine_args.engine_output_type = None + return cfg + + +def _make_started_llm_stage(stage_id: int) -> StartedLlmStage: + """Return a minimal StartedLlmStage for mocking.""" + addresses = Mock() + addresses.inputs = ["tcp://127.0.0.1:5000"] + addresses.outputs = ["tcp://127.0.0.1:5001"] + addresses.frontend_stats_publish_address = None + return StartedLlmStage( + stage_id=stage_id, + metadata=Mock(stage_id=stage_id), + vllm_config=Mock(), + executor_class=Mock(), + engine_manager=Mock(), + coordinator=Mock(), + addresses=addresses, + ) + + +# --------------------------------------------------------------------------- +# OmniMasterServer – address pre-allocation +# --------------------------------------------------------------------------- + + +class TestOmniMasterServerAllocation: + """Test address pre-allocation in OmniMasterServer.__init__.""" + + def test_public_address_and_port_properties_expose_registration_endpoint(self): + server = OmniMasterServer( + master_address="127.0.0.1", + master_port=15000, + stage_ids=[0], + ) + assert server.address == "127.0.0.1" + assert server.port == 15000 + + def test_allocations_created_for_each_stage_id(self): + server = OmniMasterServer( + master_address="127.0.0.1", + master_port=15000, + stage_ids=[0, 1, 2], + ) + assert set(server._allocations.keys()) == {0, 1, 2} + + def test_each_allocation_is_stage_allocation(self): + server = OmniMasterServer( + master_address="127.0.0.1", + master_port=15000, + stage_ids=[0, 1], + ) + for sid in (0, 1): + alloc = server._allocations[sid] + assert isinstance(alloc, StageAllocation) + + def test_allocation_addresses_reference_master_address(self): + server = OmniMasterServer( + master_address="192.168.1.10", + master_port=20000, + stage_ids=[0], + ) + alloc = server._allocations[0] + for addr in ( + alloc.handshake_bind_address, + alloc.handshake_connect_address, + alloc.input_bind_address, + alloc.input_connect_address, + alloc.output_bind_address, + alloc.output_connect_address, + ): + assert "192.168.1.10" in addr, f"Expected master address in {addr}" + + def test_port_uniqueness_within_single_allocation(self): + """Each allocation uses three distinct ports.""" + server = OmniMasterServer( + master_address="127.0.0.1", + master_port=15001, + stage_ids=[0], + ) + alloc = server._allocations[0] + hs_port = int(alloc.handshake_bind_address.split(":")[-1]) + inp_port = int(alloc.input_bind_address.split(":")[-1]) + out_port = 
int(alloc.output_bind_address.split(":")[-1]) + assert len({hs_port, inp_port, out_port}) == 3, "Expected three distinct ports per stage allocation" + + def test_get_zmq_addresses_returns_bind_addresses(self): + server = OmniMasterServer( + master_address="127.0.0.1", + master_port=15002, + stage_ids=[0], + ) + alloc = server._allocations[0] + zmq_addrs = server.get_zmq_addresses(0) + assert zmq_addrs.inputs == [alloc.input_bind_address] + assert zmq_addrs.outputs == [alloc.output_bind_address] + + def test_get_engine_zmq_addresses_returns_connect_addresses(self): + server = OmniMasterServer( + master_address="127.0.0.1", + master_port=15003, + stage_ids=[0], + ) + alloc = server._allocations[0] + engine_addrs = server.get_engine_zmq_addresses(0) + assert engine_addrs.inputs == [alloc.input_connect_address] + assert engine_addrs.outputs == [alloc.output_connect_address] + + def test_get_allocation_returns_correct_object(self): + server = OmniMasterServer( + master_address="127.0.0.1", + master_port=15004, + stage_ids=[3], + ) + assert server.get_allocation(3) is server._allocations[3] + + +# --------------------------------------------------------------------------- +# OmniMasterServer – ZMQ registration flow +# --------------------------------------------------------------------------- + + +class TestOmniMasterServerRegistration: + """Test that the server correctly handles a stage registration.""" + + def test_registration_reply_contains_handshake_address(self): + """A DEALER client that sends a registration msg gets the handshake + address back from the ROUTER registration socket.""" + import msgspec + import zmq + from vllm.utils.network_utils import get_open_port + + master_port = get_open_port() + server = OmniMasterServer( + master_address="127.0.0.1", + master_port=master_port, + stage_ids=[0], + ) + server.start() + expected_hs = server._allocations[0].handshake_connect_address + + ctx = zmq.Context() + try: + sock = ctx.socket(zmq.DEALER) + sock.connect(f"tcp://127.0.0.1:{master_port}") + sock.send(msgspec.msgpack.encode({"stage_id": 0})) + if not sock.poll(timeout=5_000): + pytest.fail("No reply received from OmniMasterServer within 5 s") + reply = msgspec.msgpack.decode(sock.recv()) + assert reply["handshake_address"] == expected_hs + finally: + sock.close(linger=0) + ctx.term() + server.stop() + + def test_server_handles_unknown_stage_id_gracefully(self): + """A registration for an unrecognised stage_id must not crash the server.""" + import msgspec + import zmq + from vllm.utils.network_utils import get_open_port + + master_port = get_open_port() + server = OmniMasterServer( + master_address="127.0.0.1", + master_port=master_port, + stage_ids=[0], + ) + server.start() + + ctx = zmq.Context() + try: + bad_sock = ctx.socket(zmq.DEALER) + bad_sock.connect(f"tcp://127.0.0.1:{master_port}") + # Send unknown stage_id=99 + bad_sock.send(msgspec.msgpack.encode({"stage_id": 99})) + # Server should NOT reply for an unknown id; wait briefly + has_reply = bad_sock.poll(timeout=500) + assert not has_reply, "Server should not reply to unknown stage_id" + # Then register the valid stage so the server thread can exit + good_sock = ctx.socket(zmq.DEALER) + good_sock.connect(f"tcp://127.0.0.1:{master_port}") + good_sock.send(msgspec.msgpack.encode({"stage_id": 0})) + good_sock.poll(timeout=2_000) + finally: + for s in (bad_sock, good_sock): + try: + s.close(linger=0) + except Exception: + pass + ctx.term() + server.stop() + + def test_registration_stores_stage_config(self): + """Stage 
registration should persist the sender's stage config.""" + import msgspec + import zmq + from vllm.utils.network_utils import get_open_port + + master_port = get_open_port() + server = OmniMasterServer( + master_address="127.0.0.1", + master_port=master_port, + stage_ids=[0], + ) + server.start() + + payload = { + "stage_id": 0, + "stage_config": { + "stage_id": 0, + "stage_type": "llm", + "engine_args": {"model": "fake-model"}, + }, + } + + ctx = zmq.Context() + try: + sock = ctx.socket(zmq.DEALER) + sock.connect(f"tcp://127.0.0.1:{master_port}") + sock.send(msgspec.msgpack.encode(payload)) + assert sock.poll(timeout=5_000) + sock.recv() + + stored = server.get_stage_config(0, timeout_s=0.1) + assert stored == payload["stage_config"] + finally: + sock.close(linger=0) + ctx.term() + server.stop() + + def test_registration_stores_coordinator_addresses(self): + """Stage registration should persist optional coordinator addresses.""" + import msgspec + import zmq + from vllm.utils.network_utils import get_open_port + + master_port = get_open_port() + server = OmniMasterServer( + master_address="127.0.0.1", + master_port=master_port, + stage_ids=[0], + ) + server.start() + + payload = { + "stage_id": 0, + "stage_config": {"stage_id": 0}, + "coordinator_input": "tcp://127.0.0.1:31001", + "coordinator_output": "tcp://127.0.0.1:31002", + "frontend_stats_publish_address": "tcp://127.0.0.1:31003", + } + + ctx = zmq.Context() + try: + sock = ctx.socket(zmq.DEALER) + sock.connect(f"tcp://127.0.0.1:{master_port}") + sock.send(msgspec.msgpack.encode(payload)) + assert sock.poll(timeout=5_000) + sock.recv() + + stored = server.get_stage_coordinator_addresses(0, timeout_s=0.1) + assert stored == StageCoordinatorAddresses( + coordinator_input=payload["coordinator_input"], + coordinator_output=payload["coordinator_output"], + frontend_stats_publish_address=payload["frontend_stats_publish_address"], + ) + finally: + sock.close(linger=0) + ctx.term() + server.stop() + + def test_stop_joins_server_thread(self): + from vllm.utils.network_utils import get_open_port + + master_port = get_open_port() + server = OmniMasterServer( + master_address="127.0.0.1", + master_port=master_port, + stage_ids=[], # no stages → thread exits immediately + ) + server.start() + assert server._thread is not None + server.stop() + # Thread should have exited (joined with timeout=10 inside stop()) + assert not server._thread.is_alive() + + +# --------------------------------------------------------------------------- +# AsyncOmniEngine – single_stage_mode detection in __init__ +# --------------------------------------------------------------------------- + + +class TestSingleStageModeDetection: + """Test __init__ single_stage_mode / _single_stage_id_filter setup. + + We bypass the real __init__ by patching _resolve_stage_configs and + the orchestrator thread, so no actual engines are started. 
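The registration tests above talk to the ROUTER socket the way a headless stage process would. A client-side sketch of that handshake, assuming only the message shape exercised here (a msgpack payload keyed by `stage_id`, with optional `stage_config` and coordinator addresses, answered with a `handshake_address`); the `register_stage` helper is hypothetical.

```python
import msgspec
import zmq


def register_stage(master_address: str, master_port: int, stage_id: int) -> str:
    """Send one registration message and return the handshake address to use next."""
    ctx = zmq.Context()
    sock = ctx.socket(zmq.DEALER)
    try:
        sock.connect(f"tcp://{master_address}:{master_port}")
        payload = {"stage_id": stage_id, "stage_config": {"stage_id": stage_id}}
        sock.send(msgspec.msgpack.encode(payload))
        if not sock.poll(timeout=5_000):
            raise TimeoutError("OmniMasterServer did not reply to stage registration")
        reply = msgspec.msgpack.decode(sock.recv())
        return reply["handshake_address"]
    finally:
        sock.close(linger=0)
        ctx.term()
```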
+ """ + + def _make_engine_no_thread(self, **kwargs: Any) -> AsyncOmniEngine: + """Create an AsyncOmniEngine without starting the orchestrator thread.""" + stage_cfg = _make_stage_cfg(0) + mock_stage_configs = [stage_cfg] + + with ( + patch.object( + AsyncOmniEngine, + "_resolve_stage_configs", + return_value=("/fake/path", mock_stage_configs), + ), + patch.object( + AsyncOmniEngine, + "_bootstrap_orchestrator", + ), + patch("threading.Thread") as mock_thread_cls, + patch("concurrent.futures.Future") as mock_future_cls, + ): + mock_future = Mock() + mock_future.result.return_value = Mock() # simulates a loop + mock_future_cls.return_value = mock_future + + mock_thread = Mock() + mock_thread.is_alive.return_value = False + mock_thread_cls.return_value = mock_thread + + engine = AsyncOmniEngine(model="fake-model", **kwargs) + return engine + + def test_explicit_single_stage_mode_true(self): + engine = self._make_engine_no_thread( + single_stage_mode=True, + omni_master_address="127.0.0.1", + omni_master_port=20000, + ) + assert engine.single_stage_mode is True + + def test_stage_id_kwarg_promotes_to_single_stage_mode(self): + engine = self._make_engine_no_thread( + stage_id=0, + omni_master_address="127.0.0.1", + omni_master_port=20001, + ) + assert engine.single_stage_mode is True + + def test_stage_id_kwarg_sets_filter(self): + engine = self._make_engine_no_thread( + stage_id=1, + omni_master_address="127.0.0.1", + omni_master_port=20002, + ) + assert engine._single_stage_id_filter == 1 + + def test_no_stage_id_no_single_stage_mode(self): + engine = self._make_engine_no_thread() + assert engine.single_stage_mode is False + assert engine._single_stage_id_filter is None + + def test_single_stage_mode_without_stage_id_has_no_filter(self): + engine = self._make_engine_no_thread( + single_stage_mode=True, + omni_master_address="127.0.0.1", + omni_master_port=20003, + ) + assert engine._single_stage_id_filter is None + + def test_master_address_and_port_stored(self): + engine = self._make_engine_no_thread( + stage_id=0, + omni_master_address="10.0.0.1", + omni_master_port=12345, + ) + assert engine._omni_master_address == "10.0.0.1" + assert engine._omni_master_port == 12345 + + def test_omni_master_server_starts_as_none(self): + engine = self._make_engine_no_thread() + assert engine._omni_master_server is None + + +# --------------------------------------------------------------------------- +# AsyncOmniEngine – _initialize_stages stage routing +# --------------------------------------------------------------------------- + + +class TestInitializeStagesRouting: + """Verify that _initialize_stages routes each stage to the correct launch + function depending on single_stage_mode and _single_stage_id_filter.""" + + _COMMON_PATCHES = [ + "vllm_omni.engine.async_omni_engine.prepare_engine_environment", + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + ] + + def _build_engine_skeleton( + self, + stage_cfgs: list[Mock], + single_stage_mode: bool, + stage_id_filter: int | None, + omni_master_address: str = "127.0.0.1", + omni_master_port: int = 25000, + ) -> AsyncOmniEngine: + """Build a bare AsyncOmniEngine without launching any threads.""" + engine = object.__new__(AsyncOmniEngine) + engine.model = 
"fake-model" + engine.config_path = "/fake" + engine.stage_configs = stage_cfgs + engine.num_stages = len(stage_cfgs) + engine.async_chunk = False + engine.single_stage_mode = single_stage_mode + engine._single_stage_id_filter = stage_id_filter + engine._omni_master_address = omni_master_address + engine._omni_master_port = omni_master_port + engine._omni_master_server = None + engine._llm_stage_launch_lock = __import__("threading").Lock() + engine.diffusion_batch_size = 1 + engine.stage_clients = [] + engine.stage_vllm_configs = [] + engine.output_processors = [] + engine.input_processor = None + engine.supported_tasks = ("generate",) + engine.default_sampling_params_list = [] + engine.stage_metadata = [] + engine.prompt_expand_func = None + return engine + + def _fake_metadata(self, stage_id: int, stage_type: str = "llm") -> Mock: + meta = Mock() + meta.stage_id = stage_id + meta.stage_type = stage_type + meta.runtime_cfg = {} + meta.prompt_expand_func = None + meta.engine_output_type = None + meta.is_comprehension = False + meta.final_output = True if stage_id == 0 else False + meta.final_output_type = None + return meta + + def _run_initialize_stages_mocked( + self, + engine: AsyncOmniEngine, + stage_cfgs: list[Mock], + *, + launch_side_effect: Any = None, + remote_side_effect: Any = None, + attach_result: Any = None, + ) -> tuple[Mock, Mock]: + """Execute _initialize_stages with all heavy helpers mocked. + + Returns (mock_launch_llm_stage, mock_create_remote_llm_stage). + """ + started_by_stage: dict[int, StartedLlmStage] = { + cfg.stage_id: _make_started_llm_stage(cfg.stage_id) + for cfg in stage_cfgs + if getattr(cfg, "stage_type", "llm") != "diffusion" + } + + default_attach = (Mock(), Mock(), Mock(), Mock()) + + mock_launch = Mock( + side_effect=launch_side_effect + or (lambda cfg, meta, spec, timeout, llm_stage_launch_lock, kv: started_by_stage[meta.stage_id]) + ) + mock_remote = Mock( + side_effect=remote_side_effect or (lambda cfg, meta, spec, timeout, srv: started_by_stage[meta.stage_id]) + ) + mock_attach = Mock(return_value=attach_result or default_attach) + + mock_oms = Mock(spec=OmniMasterServer) + mock_oms.get_zmq_addresses.side_effect = lambda sid: Mock() + + finalized = ( + [Mock() for _ in stage_cfgs], + [Mock() for _ in stage_cfgs], + [{"final_output": True, "final_output_type": None, "stage_type": "llm"} for _ in stage_cfgs], + ) + + with ( + patch.object(engine, "_launch_llm_stage", mock_launch), + patch.object(engine, "_create_remote_llm_stage", mock_remote), + patch.object(engine, "_attach_llm_stage", mock_attach), + patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), + patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), + patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ), + patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ), + patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ), + patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(cfg.stage_id, getattr(cfg, "stage_type", "llm")), + ), + patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ), + ): + engine._initialize_stages(stage_init_timeout=60) + + return mock_launch, mock_remote + + # -- single-stage mode: stage matches filter → local launch --------------- + + def 
test_matching_stage_uses_launch_llm_stage(self): + """stage_id == _single_stage_id_filter → _launch_llm_stage is called.""" + stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] + engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) + mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + + launched_ids = [c.args[1].stage_id for c in mock_launch.call_args_list] + assert 0 in launched_ids, "_launch_llm_stage should be called for stage 0" + + def test_non_matching_stage_uses_create_remote_llm_stage(self): + """stage_id != _single_stage_id_filter → _create_remote_llm_stage is called.""" + stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] + engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) + mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + + remote_ids = [c.args[1].stage_id for c in mock_remote.call_args_list] + assert 1 in remote_ids, "_create_remote_llm_stage should be called for stage 1" + + def test_filter_1_routes_correctly(self): + """With filter=1, stage 0 is remote and stage 1 is local.""" + stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] + engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=1) + mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + + launched_ids = [c.args[1].stage_id for c in mock_launch.call_args_list] + remote_ids = [c.args[1].stage_id for c in mock_remote.call_args_list] + assert 1 in launched_ids, "stage 1 should be launched locally with filter=1" + assert 0 in remote_ids, "stage 0 should use remote path with filter=1" + + def test_no_filter_all_stages_use_launch_path(self): + """single_stage_mode=True but no filter → all stages use _launch_llm_stage.""" + stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] + engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=None) + mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + + assert mock_remote.call_count == 0, "No remote launches without a filter" + launched_ids = [c.args[1].stage_id for c in mock_launch.call_args_list] + assert set(launched_ids) == {0, 1} + + def test_non_single_stage_mode_never_calls_create_remote(self): + """Outside single_stage_mode, _create_remote_llm_stage must not be called.""" + stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] + engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=False, stage_id_filter=None) + mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + + assert mock_remote.call_count == 0 + + def test_omni_master_server_started_in_single_stage_mode(self): + """OmniMasterServer.start() must be called when single_stage_mode=True.""" + stage_cfgs = [_make_stage_cfg(0)] + engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) + mock_oms = Mock(spec=OmniMasterServer) + mock_oms.get_zmq_addresses.return_value = Mock() + finalized = ([Mock()], [Mock()], [{"final_output": True, "final_output_type": None, "stage_type": "llm"}]) + + with ( + patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)), + patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(0)), + patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), + patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), + 
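The routing tests above pin down the dispatch rule for LLM stages in single-stage mode. A compact restatement of the behavior they assert, using a hypothetical `select_llm_launch_path` helper rather than the engine's actual source.

```python
def select_llm_launch_path(stage_id: int, single_stage_mode: bool, stage_id_filter: int | None) -> str:
    """Restates the dispatch behavior asserted above, not the engine implementation."""
    if single_stage_mode and stage_id_filter is not None and stage_id != stage_id_filter:
        return "remote"  # wait for the stage registered from another process
    return "local"  # launch engine cores inside this process


assert select_llm_launch_path(0, single_stage_mode=True, stage_id_filter=0) == "local"
assert select_llm_launch_path(1, single_stage_mode=True, stage_id_filter=0) == "remote"
assert select_llm_launch_path(1, single_stage_mode=True, stage_id_filter=None) == "local"
assert select_llm_launch_path(1, single_stage_mode=False, stage_id_filter=None) == "local"
```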
patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), + patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), + patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), + patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) + ), + patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(cfg.stage_id), + ), + patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), + ): + engine._initialize_stages(stage_init_timeout=60) + + mock_oms.start.assert_called_once() + + def test_omni_master_server_uses_configured_stage_ids(self): + """Configured stage IDs, not list indexes, should drive pre-allocation.""" + stage_cfgs = [_make_stage_cfg(7), _make_stage_cfg(11)] + engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=7) + mock_oms = Mock(spec=OmniMasterServer) + mock_oms.get_zmq_addresses.return_value = Mock() + finalized = ( + [Mock(), Mock()], + [Mock(), Mock()], + [{"final_output": False, "final_output_type": None, "stage_type": "llm"} for _ in stage_cfgs], + ) + + with ( + patch.object( + engine, "_launch_llm_stage", side_effect=[_make_started_llm_stage(7), _make_started_llm_stage(11)] + ), + patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(11)), + patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), + patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms) as mock_oms_cls, + patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), + patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), + patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), + patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) + ), + patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(cfg.stage_id), + ), + patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), + ): + engine._initialize_stages(stage_init_timeout=60) + + mock_oms_cls.assert_called_once_with( + master_address=engine._omni_master_address, + master_port=engine._omni_master_port, + stage_ids=[7, 11], + ) + + def test_single_stage_filter_uses_configured_stage_ids(self): + """Local/remote dispatch should compare against configured stage IDs.""" + stage_cfgs = [_make_stage_cfg(7), _make_stage_cfg(11)] + engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=7) + mock_oms = Mock(spec=OmniMasterServer) + finalized = ( + [Mock(), Mock()], + [Mock(), Mock()], + [{"final_output": False, "final_output_type": None, "stage_type": "llm"} for _ in stage_cfgs], + ) + + with ( + patch.object(engine, "_launch_llm_stage", side_effect=[_make_started_llm_stage(7)]) as mock_launch, + patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(11)) as mock_remote, + patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), + patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), + patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), + 
patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), + patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), + patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) + ), + patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(cfg.stage_id), + ), + patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), + ): + engine._initialize_stages(stage_init_timeout=60) + + assert [call.args[1].stage_id for call in mock_launch.call_args_list] == [7] + assert [call.args[1].stage_id for call in mock_remote.call_args_list] == [11] + + def test_omni_master_server_preallocates_diffusion_stage_ids(self): + """Diffusion stages should also receive OmniMasterServer allocations.""" + stage_cfgs = [_make_stage_cfg(7), _make_stage_cfg(11, stage_type="diffusion")] + engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=7) + mock_oms = Mock(spec=OmniMasterServer) + finalized = ( + [Mock(), Mock()], + [Mock(), Mock()], + [ + {"final_output": False, "final_output_type": None, "stage_type": "llm"}, + {"final_output": False, "final_output_type": None, "stage_type": "diffusion"}, + ], + ) + + with ( + patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(7)), + patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(7)), + patch.object(engine, "_launch_diffusion_stage", return_value=Mock()), + patch.object(engine, "_create_remote_diffusion_stage", return_value=Mock()), + patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), + patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms) as mock_oms_cls, + patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), + patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), + patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), + patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) + ), + patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(cfg.stage_id, getattr(cfg, "stage_type", "llm")), + ), + patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), + ): + engine._initialize_stages(stage_init_timeout=60) + + mock_oms_cls.assert_called_once_with( + master_address=engine._omni_master_address, + master_port=engine._omni_master_port, + stage_ids=[7, 11], + ) + + def test_duplicate_llm_stage_ids_raise(self): + """Duplicate configured LLM stage IDs should fail fast.""" + stage_cfgs = [_make_stage_cfg(3), _make_stage_cfg(3)] + engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=3) + + with ( + patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), + patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), + pytest.raises(ValueError, match="Duplicate stage_id"), + ): + engine._initialize_stages(stage_init_timeout=60) + + def test_omni_master_server_not_started_in_normal_mode(self): + """OmniMasterServer must NOT be instantiated outside single_stage_mode.""" + stage_cfgs = [_make_stage_cfg(0)] + engine = self._build_engine_skeleton(stage_cfgs, 
single_stage_mode=False, stage_id_filter=None) + finalized = ([Mock()], [Mock()], [{"final_output": True, "final_output_type": None, "stage_type": "llm"}]) + + with ( + patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)), + patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), + patch("vllm_omni.engine.async_omni_engine.OmniMasterServer") as mock_oms_cls, + patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), + patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), + patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), + patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) + ), + patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(cfg.stage_id), + ), + patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), + ): + engine._initialize_stages(stage_init_timeout=60) + + mock_oms_cls.assert_not_called() + + def test_single_stage_mode_missing_master_address_raises(self): + """single_stage_mode without master address/port raises ValueError.""" + stage_cfgs = [_make_stage_cfg(0)] + engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) + engine._omni_master_address = None # missing + engine._omni_master_port = None + + with ( + patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), + patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), + pytest.raises(ValueError, match="omni_master_address"), + ): + engine._initialize_stages(stage_init_timeout=60) + + def test_matching_diffusion_stage_uses_local_registered_launch(self): + """A local diffusion stage should use the registered single-stage launch path.""" + stage_cfgs = [_make_stage_cfg(0, stage_type="diffusion"), _make_stage_cfg(1)] + engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) + mock_oms = Mock(spec=OmniMasterServer) + diffusion_client = Mock(stage_type="diffusion") + finalized = ( + [diffusion_client, Mock()], + [Mock(), Mock()], + [ + {"final_output": False, "final_output_type": None, "stage_type": "diffusion"}, + {"final_output": False, "final_output_type": None, "stage_type": "llm"}, + ], + ) + + with ( + patch.object(engine, "_launch_diffusion_stage", return_value=diffusion_client) as mock_local_diff, + patch.object(engine, "_create_remote_diffusion_stage") as mock_remote_diff, + patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(1)), + patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(1)), + patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), + patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), + patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), + patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), + patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), + patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) + ), + patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(cfg.stage_id, getattr(cfg, "stage_type", "llm")), + ), 
+ patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), + ): + engine._initialize_stages(stage_init_timeout=60) + + assert mock_local_diff.call_count == 1 + assert mock_local_diff.call_args.args[1].stage_id == 0 + mock_remote_diff.assert_not_called() + + def test_non_matching_diffusion_stage_uses_remote_diffusion_client(self): + """A non-local diffusion stage should attach via the remote diffusion path.""" + stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1, stage_type="diffusion")] + engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) + mock_oms = Mock(spec=OmniMasterServer) + remote_diffusion_client = Mock(stage_type="diffusion") + finalized = ( + [Mock(), remote_diffusion_client], + [Mock(), Mock()], + [ + {"final_output": False, "final_output_type": None, "stage_type": "llm"}, + {"final_output": False, "final_output_type": None, "stage_type": "diffusion"}, + ], + ) + + with ( + patch.object(engine, "_launch_diffusion_stage") as mock_local_diff, + patch.object( + engine, "_create_remote_diffusion_stage", return_value=remote_diffusion_client + ) as mock_remote_diff, + patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)), + patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(0)), + patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), + patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), + patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), + patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), + patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), + patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) + ), + patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(cfg.stage_id, getattr(cfg, "stage_type", "llm")), + ), + patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), + ): + engine._initialize_stages(stage_init_timeout=60) + + mock_local_diff.assert_not_called() + assert mock_remote_diff.call_count == 1 + assert mock_remote_diff.call_args.args[0].stage_id == 1 + + +# --------------------------------------------------------------------------- +# AsyncOmniEngine – _launch_diffusion_stage +# --------------------------------------------------------------------------- + + +class TestLaunchDiffusionStage: + """Test local diffusion stage launch wiring.""" + + def test_registers_stage_with_public_master_properties(self): + engine = object.__new__(AsyncOmniEngine) + engine.model = "fake-model" + engine.diffusion_batch_size = 4 + + stage_cfg = _make_stage_cfg(5, stage_type="diffusion") + metadata = Mock(stage_id=5) + omni_master_server = Mock(spec=OmniMasterServer) + omni_master_server.address = "127.0.0.1" + omni_master_server.port = 25000 + + proc = Mock() + diffusion_client = Mock() + + with ( + patch("vllm_omni.engine.async_omni_engine.build_diffusion_config", return_value="diffusion-config"), + patch( + "vllm_omni.engine.async_omni_engine.register_stage_with_omni_master", + return_value=( + "tcp://127.0.0.1:25001", + "tcp://127.0.0.1:25002", + "tcp://127.0.0.1:25003", + ), + ) as mock_register, + patch( + "vllm_omni.engine.async_omni_engine.spawn_diffusion_proc", + return_value=(proc, None, None, None), + ) as 
mock_spawn, + patch("vllm_omni.engine.async_omni_engine.complete_diffusion_handshake") as mock_handshake, + patch( + "vllm_omni.engine.async_omni_engine.StageDiffusionClient.from_addresses", + return_value=diffusion_client, + ) as mock_from_addresses, + ): + result = engine._launch_diffusion_stage( + stage_cfg=stage_cfg, + metadata=metadata, + omni_master_server=omni_master_server, + ) + + mock_register.assert_called_once_with( + omni_master_address="127.0.0.1", + omni_master_port=25000, + omni_stage_id=5, + omni_stage_config=stage_cfg, + return_addresses=True, + ) + mock_spawn.assert_called_once_with( + "fake-model", + "diffusion-config", + handshake_address="tcp://127.0.0.1:25001", + request_address="tcp://127.0.0.1:25002", + response_address="tcp://127.0.0.1:25003", + ) + mock_handshake.assert_called_once_with(proc, "tcp://127.0.0.1:25001") + mock_from_addresses.assert_called_once_with( + metadata, + request_address="tcp://127.0.0.1:25002", + response_address="tcp://127.0.0.1:25003", + proc=proc, + batch_size=4, + ) + assert result is diffusion_client + + +# --------------------------------------------------------------------------- +# AsyncOmniEngine – _create_remote_llm_stage +# --------------------------------------------------------------------------- + + +class TestCreateRemoteLlmStage: + """Test _create_remote_llm_stage delegates correctly.""" + + def _engine(self) -> AsyncOmniEngine: + engine = object.__new__(AsyncOmniEngine) + engine.model = "fake-model" + engine.single_stage_mode = True + engine._single_stage_id_filter = 0 + engine._omni_master_server = Mock(spec=OmniMasterServer) + engine._omni_master_server.get_zmq_addresses.return_value = Mock() + engine._omni_master_server.get_allocation.return_value = Mock() + engine._omni_master_server.get_stage_config.return_value = { + "stage_id": 0, + "stage_type": "llm", + "engine_args": {}, + } + return engine + + @contextmanager + def _patch_build_and_connect(self, stage_id: int): + fake_vllm_config = Mock() + fake_executor_cls = Mock() + fake_addresses = Mock() + fake_addresses.inputs = ["tcp://127.0.0.1:5000"] + fake_addresses.outputs = ["tcp://127.0.0.1:5001"] + fake_addresses.frontend_stats_publish_address = None + + eng_mgr = Mock() + coordinator = Mock() + + @contextmanager + def fake_connect_cm(*args, **kwargs): + yield eng_mgr, coordinator, fake_addresses + + with ( + patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": stage_id}, + ), + patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(fake_vllm_config, fake_executor_cls), + ), + patch( + "vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", + return_value=fake_connect_cm(), + ) as mock_connect, + ): + yield mock_connect, fake_vllm_config, fake_executor_cls, fake_addresses + + def test_returns_started_llm_stage_with_correct_stage_id(self): + engine = self._engine() + stage_cfg = _make_stage_cfg(1) + metadata = Mock(stage_id=1) + omni_ms = engine._omni_master_server + omni_ms.get_stage_config.return_value = { + "stage_id": 1, + "stage_type": "llm", + "engine_args": {}, + } + + with self._patch_build_and_connect(1): + result = engine._create_remote_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + omni_master_server=omni_ms, + ) + assert isinstance(result, StartedLlmStage) + assert result.stage_id == 1 + + def test_connect_remote_engine_cores_called_with_stage_id(self): + engine = self._engine() + stage_cfg = 
_make_stage_cfg(2) + metadata = Mock(stage_id=2) + omni_ms = engine._omni_master_server + omni_ms.get_zmq_addresses.return_value = Mock(inputs=["x"], outputs=["y"]) + omni_ms.get_stage_config.return_value = { + "stage_id": 2, + "stage_type": "llm", + "engine_args": {}, + } + + fake_vllm_config = Mock() + fake_executor_cls = Mock() + fake_addresses = Mock() + fake_addresses.inputs = ["tcp://127.0.0.1:5000"] + fake_addresses.outputs = ["tcp://127.0.0.1:5001"] + fake_addresses.frontend_stats_publish_address = None + + @contextmanager + def fake_connect_cm(*args, **kwargs): + yield Mock(), Mock(), fake_addresses + + with ( + patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 2}, + ), + patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(fake_vllm_config, fake_executor_cls), + ), + patch( + "vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", return_value=fake_connect_cm() + ) as mock_connect, + ): + engine._create_remote_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + omni_master_server=omni_ms, + ) + + mock_connect.assert_called_once() + _, kwargs = mock_connect.call_args + assert kwargs.get("stage_id") == 2 or mock_connect.call_args.args[-1] == 2 + omni_ms.get_stage_config.assert_called_once_with(2, timeout_s=60) + + def test_missing_registered_stage_config_raises_value_error(self): + engine = self._engine() + stage_cfg = _make_stage_cfg(3) + metadata = Mock(stage_id=3) + omni_ms = engine._omni_master_server + omni_ms.get_stage_config.return_value = None + + with patch("vllm_omni.engine.async_omni_engine.build_engine_args_dict") as mock_build_args: + with pytest.raises( + ValueError, + match="Remote stage 3 registered without stage config", + ): + engine._create_remote_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + omni_master_server=omni_ms, + ) + + mock_build_args.assert_not_called() + + def test_exception_during_connect_closes_started_stage(self): + """If an error occurs after StartedLlmStage creation, close_started_llm_stage is called.""" + engine = self._engine() + stage_cfg = _make_stage_cfg(1) + metadata = Mock(stage_id=1) + omni_ms = engine._omni_master_server + omni_ms.get_stage_config.return_value = { + "stage_id": 1, + "stage_type": "llm", + "engine_args": {}, + } + + @contextmanager + def boom(*args, **kwargs): + yield Mock(), Mock(), Mock() + raise RuntimeError("handshake failed") + + with ( + patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 1}, + ), + patch("vllm_omni.engine.async_omni_engine.build_vllm_config", return_value=(Mock(), Mock())), + patch("vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", return_value=boom()), + patch("vllm_omni.engine.async_omni_engine.close_started_llm_stage") as mock_close, + ): + with pytest.raises(RuntimeError, match="handshake failed"): + engine._create_remote_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + omni_master_server=omni_ms, + ) + mock_close.assert_called_once() + + +class TestConnectRemoteEngineCoresCoordinator: + """Test coordinator launch parity with launch_core_engines.""" + + @staticmethod + def _build_vllm_config(*, dp_rank: int = 0, offline_mode: bool = False, needs_dp_coordinator: bool = True) -> Mock: + parallel_config = Mock() + 
parallel_config.data_parallel_size_local = 1 + parallel_config.data_parallel_size = 2 + parallel_config.data_parallel_rank = dp_rank + parallel_config.data_parallel_rank_local = 0 if offline_mode else None + + vllm_config = Mock() + vllm_config.parallel_config = parallel_config + vllm_config.needs_dp_coordinator = needs_dp_coordinator + vllm_config.model_config = Mock(is_moe=False) + return vllm_config + + def test_uses_registered_coordinator_addresses(self): + vllm_config = self._build_vllm_config(dp_rank=0, offline_mode=False, needs_dp_coordinator=True) + + omni_master_server = Mock(spec=OmniMasterServer) + omni_master_server.get_zmq_addresses.return_value = EngineZmqAddresses( + inputs=["tcp://client-in"], outputs=["tcp://client-out"] + ) + omni_master_server.get_allocation.return_value = Mock(handshake_bind_address="tcp://127.0.0.1:26001") + omni_master_server.get_stage_coordinator_addresses.return_value = StageCoordinatorAddresses( + coordinator_input="tcp://coord-in", + coordinator_output="tcp://coord-out", + frontend_stats_publish_address="tcp://stats", + ) + + @contextmanager + def fake_socket_ctx(*args, **kwargs): + yield Mock() + + with ( + patch("vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", return_value=fake_socket_ctx()), + patch("vllm_omni.engine.stage_engine_startup._wait_for_omni_engine_startup") as mock_wait, + ): + with connect_remote_engine_cores( + vllm_config=vllm_config, + omni_master_server=omni_master_server, + stage_id=7, + ) as (_, yielded_coordinator, yielded_addresses): + assert yielded_coordinator is None + assert yielded_addresses.coordinator_input == "tcp://coord-in" + assert yielded_addresses.coordinator_output == "tcp://coord-out" + assert yielded_addresses.frontend_stats_publish_address == "tcp://stats" + + omni_master_server.get_stage_coordinator_addresses.assert_called_once_with(7) + mock_wait.assert_called_once() + + def test_defaults_to_no_coordinator_addresses_when_none_registered(self): + vllm_config = self._build_vllm_config( + dp_rank=0, + offline_mode=False, + needs_dp_coordinator=True, + ) + + omni_master_server = Mock(spec=OmniMasterServer) + omni_master_server.get_zmq_addresses.return_value = EngineZmqAddresses( + inputs=["tcp://client-in"], outputs=["tcp://client-out"] + ) + omni_master_server.get_allocation.return_value = Mock(handshake_bind_address="tcp://127.0.0.1:26001") + omni_master_server.get_stage_coordinator_addresses.return_value = StageCoordinatorAddresses() + + @contextmanager + def fake_socket_ctx(*args, **kwargs): + yield Mock() + + with ( + patch("vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", return_value=fake_socket_ctx()), + patch("vllm_omni.engine.stage_engine_startup._wait_for_omni_engine_startup"), + ): + with connect_remote_engine_cores( + vllm_config=vllm_config, + omni_master_server=omni_master_server, + stage_id=7, + ) as (_, yielded_coordinator, yielded_addresses): + assert yielded_coordinator is None + assert yielded_addresses.coordinator_input is None + assert yielded_addresses.coordinator_output is None + assert yielded_addresses.frontend_stats_publish_address is None + + +class TestLaunchOmniCoreEngines: + """Tests for local omni engine launch wiring.""" + + def test_registers_stage_once_and_reuses_handshake_for_all_local_engines(self): + parallel_config = Mock( + data_parallel_size_local=2, + data_parallel_size=4, + data_parallel_rank=3, + ) + vllm_config = Mock(parallel_config=parallel_config) + + omni_master_server = Mock(spec=OmniMasterServer) + omni_master_server.address = "127.0.0.1" + 
omni_master_server.port = 26000 + omni_master_server.get_allocation.return_value = Mock(handshake_bind_address="tcp://127.0.0.1:26001") + + stage_config = {"stage_id": 7, "stage_type": "llm"} + local_engine_manager = Mock() + + @contextmanager + def fake_socket_ctx(*args, **kwargs): + yield Mock() + + with ( + patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value="tcp://127.0.0.1:26001", + ) as mock_register, + patch("vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", return_value=fake_socket_ctx()), + patch( + "vllm_omni.engine.stage_engine_startup.CoreEngineProcManager", + return_value=local_engine_manager, + ) as mock_manager_cls, + patch("vllm_omni.engine.stage_engine_startup.wait_for_engine_startup"), + ): + with launch_omni_core_engines( + vllm_config=vllm_config, + executor_class=Mock(), + log_stats=False, + omni_master_server=omni_master_server, + stage_id=7, + stage_config=stage_config, + ) as (yielded_manager, yielded_coordinator, yielded_addresses): + assert yielded_manager is local_engine_manager + assert yielded_coordinator is None + + mock_register.assert_called_once_with( + omni_master_address="127.0.0.1", + omni_master_port=26000, + omni_stage_id=7, + omni_stage_config=stage_config, + coordinator=None, + ) + mock_manager_cls.assert_called_once() + manager_kwargs = mock_manager_cls.call_args.kwargs + assert manager_kwargs["local_engine_count"] == 2 + assert manager_kwargs["start_index"] == 3 + assert manager_kwargs["local_start_index"] == 0 + assert manager_kwargs["vllm_config"] is vllm_config + assert manager_kwargs["local_client"] is True + assert manager_kwargs["handshake_address"] == "tcp://127.0.0.1:26001" + assert manager_kwargs["executor_class"] is not None + + def test_registers_stage_with_coordinator_when_started(self): + parallel_config = Mock( + data_parallel_size_local=1, + data_parallel_size=2, + data_parallel_rank=0, + ) + vllm_config = Mock(parallel_config=parallel_config) + vllm_config.needs_dp_coordinator = True + vllm_config.model_config = Mock(is_moe=False) + + omni_master_server = Mock(spec=OmniMasterServer) + omni_master_server.address = "127.0.0.1" + omni_master_server.port = 26000 + omni_master_server.get_zmq_addresses.return_value = EngineZmqAddresses( + inputs=["tcp://client-in"], outputs=["tcp://client-out"] + ) + omni_master_server.get_allocation.return_value = Mock(handshake_bind_address="tcp://127.0.0.1:26001") + + coordinator = Mock() + coordinator.proc.pid = 1234 + coordinator.get_engine_socket_addresses.return_value = ("tcp://coord-in", "tcp://coord-out") + coordinator.get_stats_publish_address.return_value = "tcp://stats" + + @contextmanager + def fake_socket_ctx(*args, **kwargs): + yield Mock() + + with ( + patch("vllm_omni.engine.stage_engine_startup.DPCoordinator", return_value=coordinator), + patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value="tcp://127.0.0.1:26001", + ) as mock_register, + patch("vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", return_value=fake_socket_ctx()), + patch( + "vllm_omni.engine.stage_engine_startup.CoreEngineProcManager", + return_value=Mock(), + ) as mock_manager_cls, + patch("vllm_omni.engine.stage_engine_startup.wait_for_engine_startup") as mock_wait, + ): + with launch_omni_core_engines( + vllm_config=vllm_config, + executor_class=Mock(), + log_stats=False, + omni_master_server=omni_master_server, + stage_id=7, + stage_config={"stage_id": 7}, + ): + pass + + mock_register.assert_called_once_with( + 
omni_master_address="127.0.0.1", + omni_master_port=26000, + omni_stage_id=7, + omni_stage_config={"stage_id": 7}, + coordinator=coordinator, + ) + manager_kwargs = mock_manager_cls.call_args.kwargs + assert manager_kwargs["log_stats"] is False + mock_wait.assert_called_once() + + +# --------------------------------------------------------------------------- +# AsyncOmniEngine – _launch_llm_stage single_stage_mode codepath +# --------------------------------------------------------------------------- + + +class TestLaunchLlmStageSingleStageMode: + """Test that _launch_llm_stage selects launch_omni_core_engines when + single_stage_mode=True and _omni_master_server is set.""" + + def _build_engine_with_oms(self) -> AsyncOmniEngine: + engine = object.__new__(AsyncOmniEngine) + engine.model = "fake-model" + engine.single_stage_mode = True + engine._single_stage_id_filter = 0 + engine._llm_stage_launch_lock = threading.Lock() + mock_oms = Mock(spec=OmniMasterServer) + mock_oms.address = "127.0.0.1" + mock_oms.port = 25000 + alloc = Mock() + alloc.handshake_bind_address = "tcp://127.0.0.1:25001" + mock_oms.get_allocation.return_value = alloc + fake_addresses = Mock() + fake_addresses.inputs = ["tcp://127.0.0.1:5000"] + fake_addresses.outputs = ["tcp://127.0.0.1:5001"] + fake_addresses.frontend_stats_publish_address = None + mock_oms.get_zmq_addresses.return_value = fake_addresses + engine._omni_master_server = mock_oms + return engine + + @contextmanager + def _patch_launch_omni_cm(self, stage_id: int): + fake_vllm_config = Mock() + fake_executor_cls = Mock() + fake_addresses = Mock() + fake_addresses.inputs = ["tcp://127.0.0.1:5000"] + fake_addresses.outputs = ["tcp://127.0.0.1:5001"] + fake_addresses.frontend_stats_publish_address = None + + eng_mgr = Mock() + + @contextmanager + def fake_launch_omni(*args, **kwargs): + yield eng_mgr, None, fake_addresses + + with ( + patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), + patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": stage_id}, + ), + patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(fake_vllm_config, fake_executor_cls), + ), + patch( + "vllm_omni.engine.async_omni_engine.acquire_device_locks", + return_value=[], + ), + patch( + "vllm_omni.engine.async_omni_engine.release_device_locks", + ), + patch( + "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", + return_value=fake_launch_omni(), + ) as mock_launch_omni, + ): + yield mock_launch_omni + + def test_launch_omni_core_engines_used_in_single_stage_mode(self): + """single_stage_mode + _omni_master_server → launch_omni_core_engines.""" + engine = self._build_engine_with_oms() + metadata = Mock(stage_id=0, runtime_cfg={}) + stage_cfg = _make_stage_cfg(0) + + with self._patch_launch_omni_cm(0) as mock_launch_omni: + result = engine._launch_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + llm_stage_launch_lock=threading.Lock(), + ) + + mock_launch_omni.assert_called_once() + assert mock_launch_omni.call_args.kwargs["stage_config"] is stage_cfg + assert isinstance(result, StartedLlmStage) + assert result.stage_id == 0 + + def test_spawn_stage_core_used_in_normal_mode(self): + """~single_stage_mode → spawn_stage_core + complete_stage_handshake.""" + engine = object.__new__(AsyncOmniEngine) + engine.model = "fake-model" + engine.single_stage_mode = False + engine._omni_master_server = None + 
engine._llm_stage_launch_lock = threading.Lock() + + fake_vllm_config = Mock() + fake_executor_cls = Mock() + fake_addresses = Mock() + fake_addresses.inputs = ["tcp://127.0.0.1:5000"] + fake_addresses.outputs = ["tcp://127.0.0.1:5001"] + fake_addresses.frontend_stats_publish_address = None + + fake_proc = Mock() + fake_handshake_address = "ipc:///tmp/fake-handshake" + + with ( + patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), + patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 0}, + ), + patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(fake_vllm_config, fake_executor_cls), + ), + patch("vllm_omni.engine.async_omni_engine.acquire_device_locks", return_value=[]), + patch("vllm_omni.engine.async_omni_engine.release_device_locks"), + patch( + "vllm_omni.engine.async_omni_engine.spawn_stage_core", + return_value=(fake_addresses, fake_proc, fake_handshake_address), + ) as mock_spawn, + patch("vllm_omni.engine.async_omni_engine.complete_stage_handshake") as mock_handshake, + patch("vllm_omni.engine.async_omni_engine.launch_omni_core_engines") as mock_omni, + ): + metadata = Mock(stage_id=0, runtime_cfg={}) + result = engine._launch_llm_stage( + stage_cfg=_make_stage_cfg(0), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + llm_stage_launch_lock=threading.Lock(), + ) + + mock_spawn.assert_called_once_with( + vllm_config=fake_vllm_config, + executor_class=fake_executor_cls, + log_stats=False, + ) + mock_handshake.assert_called_once_with( + fake_proc, + fake_handshake_address, + fake_addresses, + fake_vllm_config, + ) + mock_omni.assert_not_called() + assert isinstance(result, StartedLlmStage) + assert result.proc is fake_proc + + def test_launch_omni_passes_stage_id_and_master_server(self): + """launch_omni_core_engines receives the correct stage_id and omni_master_server.""" + engine = self._build_engine_with_oms() + metadata = Mock(stage_id=0, runtime_cfg={}) + + captured_kwargs: dict[str, Any] = {} + + @contextmanager + def capturing_launch(*args, **kwargs): + captured_kwargs.update(kwargs) + fake_addresses = Mock() + fake_addresses.inputs = ["tcp://127.0.0.1:5000"] + fake_addresses.outputs = ["tcp://127.0.0.1:5001"] + fake_addresses.frontend_stats_publish_address = None + yield Mock(), None, fake_addresses + + with ( + patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), + patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 0}, + ), + patch("vllm_omni.engine.async_omni_engine.build_vllm_config", return_value=(Mock(), Mock())), + patch("vllm_omni.engine.async_omni_engine.acquire_device_locks", return_value=[]), + patch("vllm_omni.engine.async_omni_engine.release_device_locks"), + patch("vllm_omni.engine.async_omni_engine.launch_omni_core_engines", side_effect=capturing_launch), + ): + engine._launch_llm_stage( + stage_cfg=_make_stage_cfg(0), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + llm_stage_launch_lock=threading.Lock(), + ) + + assert captured_kwargs.get("stage_id") == 0 + assert captured_kwargs.get("omni_master_server") is engine._omni_master_server + + def test_launch_omni_context_exits_before_stage_cleanup_on_error(self): + """Errors after entering the omni launch context still unwind it first.""" + engine = self._build_engine_with_oms() + metadata = Mock(stage_id=0, runtime_cfg={}) + + fake_addresses = Mock() + fake_addresses.inputs = 
["tcp://127.0.0.1:5000"] + fake_addresses.outputs = ["tcp://127.0.0.1:5001"] + fake_addresses.frontend_stats_publish_address = None + + events: list[str] = [] + + @contextmanager + def fake_launch_omni(*args, **kwargs): + try: + yield Mock(), None, fake_addresses + finally: + events.append("launch_exit") + + with ( + patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), + patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 0}, + ), + patch("vllm_omni.engine.async_omni_engine.build_vllm_config", return_value=(Mock(), Mock())), + patch("vllm_omni.engine.async_omni_engine.acquire_device_locks", return_value=[]), + patch("vllm_omni.engine.async_omni_engine.release_device_locks"), + patch( + "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", + return_value=fake_launch_omni(), + ), + patch("vllm_omni.engine.async_omni_engine.logger.info", side_effect=RuntimeError("boom")), + patch( + "vllm_omni.engine.async_omni_engine.close_started_llm_stage", + side_effect=lambda _started: events.append("stage_close"), + ) as mock_close_stage, + ): + with pytest.raises(RuntimeError, match="boom"): + engine._launch_llm_stage( + stage_cfg=_make_stage_cfg(0), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + llm_stage_launch_lock=threading.Lock(), + ) + + mock_close_stage.assert_called_once() + assert events == ["launch_exit", "stage_close"] + + def test_base_exception_propagates_without_started_stage_cleanup(self): + """BaseException subclasses should bypass the Exception cleanup path.""" + engine = self._build_engine_with_oms() + metadata = Mock(stage_id=0, runtime_cfg={}) + + fake_addresses = Mock() + fake_addresses.inputs = ["tcp://127.0.0.1:5000"] + fake_addresses.outputs = ["tcp://127.0.0.1:5001"] + fake_addresses.frontend_stats_publish_address = None + + events: list[str] = [] + + class FatalLaunchInterrupt(BaseException): + pass + + @contextmanager + def fake_launch_omni(*args, **kwargs): + try: + yield Mock(), None, fake_addresses + finally: + events.append("launch_exit") + + with ( + patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), + patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 0}, + ), + patch("vllm_omni.engine.async_omni_engine.build_vllm_config", return_value=(Mock(), Mock())), + patch("vllm_omni.engine.async_omni_engine.acquire_device_locks", return_value=[]), + patch("vllm_omni.engine.async_omni_engine.release_device_locks"), + patch( + "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", + return_value=fake_launch_omni(), + ), + patch( + "vllm_omni.engine.async_omni_engine.logger.info", + side_effect=FatalLaunchInterrupt("stop"), + ), + patch("vllm_omni.engine.async_omni_engine.close_started_llm_stage") as mock_close_stage, + ): + with pytest.raises(FatalLaunchInterrupt, match="stop"): + engine._launch_llm_stage( + stage_cfg=_make_stage_cfg(0), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + llm_stage_launch_lock=threading.Lock(), + ) + + mock_close_stage.assert_not_called() + assert events == ["launch_exit"] diff --git a/tests/entrypoints/test_serve.py b/tests/entrypoints/test_serve.py new file mode 100644 index 0000000000..916db3cc22 --- /dev/null +++ b/tests/entrypoints/test_serve.py @@ -0,0 +1,195 @@ +"""Unit tests for the Omni serve CLI helpers.""" + +from __future__ import annotations + +import argparse +from unittest.mock import Mock, patch + +import pytest + 
+from vllm_omni.entrypoints.cli.serve import run_headless + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def _make_headless_args() -> argparse.Namespace: + return argparse.Namespace( + model="fake-model", + stage_id=3, + omni_master_address="127.0.0.1", + omni_master_port=26000, + api_server_count=0, + worker_backend="multi_process", + stage_configs_path=None, + log_stats=False, + disable_log_stats=False, + ) + + +def test_run_headless_registers_stage_once_and_launches_all_local_engines() -> None: + args = _make_headless_args() + stage_cfg = Mock(stage_id=3) + stage_cfgs = [stage_cfg] + parallel_config = Mock( + data_parallel_size_local=2, + data_parallel_rank=4, + data_parallel_rank_local=1, + node_rank_within_dp=0, + ) + vllm_config = Mock(parallel_config=parallel_config) + executor_class = Mock() + engine_manager = Mock() + + with ( + patch( + "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", + return_value=("/fake/stages.yaml", stage_cfgs), + ), + patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment"), + patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=Mock()), + patch("vllm_omni.engine.stage_init_utils.get_stage_connector_spec", return_value={}), + patch("vllm_omni.engine.stage_init_utils.build_engine_args_dict", return_value={}), + patch( + "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ), + patch( + "vllm_omni.engine.stage_init_utils.build_vllm_config", + return_value=(vllm_config, executor_class), + ) as mock_build_vllm_config, + patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value="tcp://127.0.0.1:26001", + ) as mock_register, + patch("vllm.v1.engine.utils.CoreEngineProcManager", return_value=engine_manager) as mock_manager_cls, + patch("signal.signal"), + ): + run_headless(args) + + mock_build_vllm_config.assert_called_once_with( + stage_cfg, + "fake-model", + stage_connector_spec={}, + engine_args_dict={}, + headless=True, + ) + mock_register.assert_called_once_with( + omni_master_address="127.0.0.1", + omni_master_port=26000, + omni_stage_id=3, + omni_stage_config=stage_cfg, + coordinator=None, + ) + mock_manager_cls.assert_called_once() + manager_kwargs = mock_manager_cls.call_args.kwargs + assert manager_kwargs["local_engine_count"] == 2 + assert manager_kwargs["start_index"] == 4 + assert manager_kwargs["local_start_index"] == 0 + assert manager_kwargs["local_client"] is False + assert manager_kwargs["handshake_address"] == "tcp://127.0.0.1:26001" + assert manager_kwargs["log_stats"] is False + engine_manager.join_first.assert_called_once_with() + engine_manager.shutdown.assert_called_once_with() + + +def test_run_headless_honors_explicit_log_stats_flag() -> None: + args = _make_headless_args() + args.log_stats = True + stage_cfg = Mock(stage_id=3) + stage_cfgs = [stage_cfg] + parallel_config = Mock( + data_parallel_size_local=2, + data_parallel_rank=4, + data_parallel_rank_local=1, + node_rank_within_dp=0, + ) + vllm_config = Mock(parallel_config=parallel_config) + executor_class = Mock() + engine_manager = Mock() + + with ( + patch( + "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", + return_value=("/fake/stages.yaml", stage_cfgs), + ), + patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment"), + patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=Mock()), + 
patch("vllm_omni.engine.stage_init_utils.get_stage_connector_spec", return_value={}), + patch("vllm_omni.engine.stage_init_utils.build_engine_args_dict", return_value={}), + patch( + "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ), + patch( + "vllm_omni.engine.stage_init_utils.build_vllm_config", + return_value=(vllm_config, executor_class), + ), + patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value="tcp://127.0.0.1:26001", + ), + patch("vllm.v1.engine.utils.CoreEngineProcManager", return_value=engine_manager) as mock_manager_cls, + patch("signal.signal"), + ): + run_headless(args) + + manager_kwargs = mock_manager_cls.call_args.kwargs + assert manager_kwargs["log_stats"] is True + + +def test_run_headless_launches_diffusion_stage_via_omni_master() -> None: + args = _make_headless_args() + stage_cfg = Mock(stage_id=3, stage_type="diffusion") + stage_cfg.engine_args = Mock() + stage_cfg.engine_input_source = [] + stage_cfgs = [stage_cfg] + metadata = Mock(stage_id=3) + od_config = Mock() + proc = Mock() + proc.exitcode = 0 + proc.is_alive.return_value = False + + with ( + patch( + "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", + return_value=("/fake/stages.yaml", stage_cfgs), + ), + patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment"), + patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=Mock()), + patch( + "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ), + patch("vllm_omni.engine.stage_init_utils.extract_stage_metadata", return_value=metadata), + patch("vllm_omni.engine.stage_init_utils.inject_kv_stage_info") as mock_inject_stage_info, + patch("vllm_omni.engine.stage_init_utils.build_diffusion_config", return_value=od_config), + patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value=("tcp://127.0.0.1:26001", "tcp://127.0.0.1:26002", "tcp://127.0.0.1:26003"), + ) as mock_register, + patch( + "vllm_omni.diffusion.stage_diffusion_proc.spawn_diffusion_proc", + return_value=(proc, "tcp://127.0.0.1:26001", "tcp://127.0.0.1:26002", "tcp://127.0.0.1:26003"), + ) as mock_spawn, + patch("vllm_omni.diffusion.stage_diffusion_proc.complete_diffusion_handshake") as mock_handshake, + patch("signal.signal"), + ): + run_headless(args) + + mock_inject_stage_info.assert_called_once_with(stage_cfg, 3) + mock_register.assert_called_once_with( + omni_master_address="127.0.0.1", + omni_master_port=26000, + omni_stage_id=3, + omni_stage_config=stage_cfg, + return_addresses=True, + ) + mock_spawn.assert_called_once_with( + "fake-model", + od_config, + handshake_address="tcp://127.0.0.1:26001", + request_address="tcp://127.0.0.1:26002", + response_address="tcp://127.0.0.1:26003", + ) + mock_handshake.assert_called_once_with(proc, "tcp://127.0.0.1:26001") + proc.join.assert_called_once_with() diff --git a/tests/entrypoints/test_utils.py b/tests/entrypoints/test_utils.py index 6e44fe533c..94e254c250 100644 --- a/tests/entrypoints/test_utils.py +++ b/tests/entrypoints/test_utils.py @@ -8,6 +8,7 @@ import torch from pytest_mock import MockerFixture +from vllm_omni.config.yaml_util import create_config from vllm_omni.diffusion.data import OmniDiffusionConfig from vllm_omni.engine.arg_utils import OmniEngineArgs from vllm_omni.engine.async_omni_engine import AsyncOmniEngine @@ -16,6 +17,7 @@ 
_filter_dict_like_object, filter_dataclass_kwargs, load_and_resolve_stage_configs, + load_stage_configs_from_yaml, resolve_model_config_path, ) @@ -322,3 +324,70 @@ def test_load_and_resolve_with_kwargs(self): assert config_path is None assert len(stage_configs) == 1 assert "dtype" in stage_configs[0]["engine_args"] + + +class TestLoadStageConfigsFromYaml: + """Regression tests for stage-config loading and merging.""" + + def test_deep_merges_stage_engine_args(self, mocker: MockerFixture): + yaml_config = create_config( + { + "async_chunk": True, + "stage_args": [ + { + "stage_id": 0, + "runtime": {"device": 0}, + "engine_args": { + "parallel_config": {"tensor_parallel_size": 4}, + }, + } + ], + } + ) + mocker.patch( + "vllm_omni.entrypoints.utils.load_yaml_config", + return_value=yaml_config, + ) + + stages = load_stage_configs_from_yaml( + "fake.yaml", + base_engine_args={ + "parallel_config": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 2, + }, + "model": "base-model", + }, + ) + + merged_engine_args = stages[0]["engine_args"] + assert merged_engine_args["parallel_config"]["tensor_parallel_size"] == 4 + assert merged_engine_args["parallel_config"]["pipeline_parallel_size"] == 2 + assert merged_engine_args["model"] == "base-model" + assert merged_engine_args["async_chunk"] is True + + def test_merges_nested_stage_engine_args(self, mocker: MockerFixture): + yaml_config = create_config( + { + "stage_args": [ + { + "stage_id": 0, + "engine_args": { + "nested": {"override": 2}, + }, + } + ], + } + ) + mocker.patch( + "vllm_omni.entrypoints.utils.load_yaml_config", + return_value=yaml_config, + ) + + stages = load_stage_configs_from_yaml( + "fake.yaml", + base_engine_args={"nested": {"base": 1}}, + ) + + assert stages[0]["engine_args"]["nested"]["base"] == 1 + assert stages[0]["engine_args"]["nested"]["override"] == 2 diff --git a/vllm_omni/diffusion/stage_diffusion_client.py b/vllm_omni/diffusion/stage_diffusion_client.py index a1a4766de2..cd7159b683 100644 --- a/vllm_omni/diffusion/stage_diffusion_client.py +++ b/vllm_omni/diffusion/stage_diffusion_client.py @@ -51,6 +51,41 @@ def __init__( od_config: OmniDiffusionConfig, metadata: StageMetadata, batch_size: int = 1, + ) -> None: + # Spawn StageDiffusionProc subprocess and wait for READY. + proc, handshake_address, request_address, response_address = spawn_diffusion_proc(model, od_config) + complete_diffusion_handshake(proc, handshake_address) + self._initialize_client(metadata, request_address, response_address, proc=proc, batch_size=batch_size) + + @classmethod + def from_addresses( + cls, + metadata: StageMetadata, + request_address: str, + response_address: str, + *, + proc: Any = None, + batch_size: int = 1, + ) -> StageDiffusionClient: + """Create a client for an already-running diffusion subprocess.""" + client = cls.__new__(cls) + client._initialize_client( + metadata, + request_address, + response_address, + proc=proc, + batch_size=batch_size, + ) + return client + + def _initialize_client( + self, + metadata: StageMetadata, + request_address: str, + response_address: str, + *, + proc: Any, + batch_size: int, ) -> None: self.stage_id = metadata.stage_id self.final_output = metadata.final_output @@ -58,13 +93,9 @@ def __init__( self.default_sampling_params = metadata.default_sampling_params self.custom_process_input_func = metadata.custom_process_input_func self.engine_input_source = metadata.engine_input_source - - # Spawn StageDiffusionProc subprocess and wait for READY. 
- proc, handshake_address, request_address, response_address = spawn_diffusion_proc(model, od_config) - complete_diffusion_handshake(proc, handshake_address) self._proc = proc + self._owns_process = proc is not None - # ZMQ sockets (sync) for communicating with the subprocess. self._zmq_ctx = zmq.Context() self._request_socket = self._zmq_ctx.socket(zmq.PUSH) self._request_socket.connect(request_address) @@ -74,14 +105,18 @@ def __init__( self._encoder = OmniMsgpackEncoder() self._decoder = OmniMsgpackDecoder() - # Buffers for demultiplexing response messages. self._output_queue: asyncio.Queue[OmniRequestOutput] = asyncio.Queue() self._rpc_results: dict[str, Any] = {} self._pending_rpcs: set[str] = set() self._tasks: dict[str, asyncio.Task] = {} self._shutting_down = False - logger.info("[StageDiffusionClient] Stage-%s initialized (batch_size=%d)", self.stage_id, batch_size) + logger.info( + "[StageDiffusionClient] Stage-%s initialized (owns_process=%s, batch_size=%d)", + self.stage_id, + self._owns_process, + batch_size, + ) # ------------------------------------------------------------------ # Internal helpers @@ -253,7 +288,7 @@ def get_diffusion_output_nowait(self) -> OmniRequestOutput | None: try: return self._output_queue.get_nowait() except asyncio.QueueEmpty: - if not self._shutting_down and self._proc is not None and not self._proc.is_alive(): + if not self._shutting_down and self._owns_process and self._proc is not None and not self._proc.is_alive(): exitcode = self._proc.exitcode # One final drain – the last ZMQ frame may have arrived # between the first drain and the is_alive() check. @@ -325,7 +360,7 @@ async def collective_rpc_async( self._drain_responses() if rpc_id in self._rpc_results: return self._rpc_results.pop(rpc_id) - if self._proc is not None and not self._proc.is_alive(): + if self._owns_process and self._proc is not None and not self._proc.is_alive(): raise RuntimeError( f"StageDiffusionProc died while waiting for " f"collective_rpc '{method}' (exit code {self._proc.exitcode})" @@ -343,7 +378,7 @@ def shutdown(self) -> None: except Exception: pass - if self._proc is not None and self._proc.is_alive(): + if self._owns_process and self._proc is not None and self._proc.is_alive(): self._proc.join(timeout=10) terminate_alive_proc(self._proc) diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py index 9d8c06cce9..2bba419250 100644 --- a/vllm_omni/diffusion/stage_diffusion_proc.py +++ b/vllm_omni/diffusion/stage_diffusion_proc.py @@ -580,14 +580,17 @@ def signal_handler(signum: int, frame: Any) -> None: def spawn_diffusion_proc( model: str, od_config: OmniDiffusionConfig, + handshake_address: str | None = None, + request_address: str | None = None, + response_address: str | None = None, ) -> tuple[BaseProcess, str, str, str]: """Spawn a StageDiffusionProc subprocess. Returns ``(proc, handshake_address, request_address, response_address)``. 
""" - handshake_address = get_open_zmq_ipc_path() - request_address = get_open_zmq_ipc_path() - response_address = get_open_zmq_ipc_path() + handshake_address = handshake_address or get_open_zmq_ipc_path() + request_address = request_address or get_open_zmq_ipc_path() + response_address = response_address or get_open_zmq_ipc_path() ctx = get_mp_context() proc = ctx.Process( diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index b663789262..d43f1b8fdc 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -86,7 +86,11 @@ class OmniEngineArgs(EngineArgs): Adds omni-specific configuration fields for multi-stage pipeline processing and output type specification. Args: - stage_id: Identifier for the stage in a multi-stage pipeline (default: 0) + stage_id: Identifier for the stage in a multi-stage pipeline. + Defaults to 0 for per-stage engine construction. The CLI-level + single-stage selector remains optional on the parsed argparse + namespace and should not be forwarded as a nullable per-stage + engine argument. model_stage: Stage type identifier, e.g., "thinker" or "talker" (default: "thinker") model_arch: Model architecture name @@ -105,6 +109,18 @@ class OmniEngineArgs(EngineArgs): worker_type: Model Type, e.g., "ar" or "generation" task_type: Default task type for TTS models (CustomVoice, VoiceDesign, or Base). If not specified, will be inferred from model path. + omni_master_address: TCP address that the OmniMasterServer (running + inside AsyncOmniEngine) listens on for engine core registrations. + Required when single-stage mode is active. + omni_master_port: TCP port for the OmniMasterServer registration + socket. Required when single-stage mode is active. + stage_configs_path: Optional path to a JSON/YAML file containing + stage configurations for the multi-stage pipeline. If None, + stage configs are resolved from the model's default configuration. + output_modalities: Optional list of output modality names to enable + (e.g. ["text", "audio"]). If None, all modalities supported by + the model are used. + log_stats: Whether to log engine statistics. Defaults to False. 
""" stage_id: int = 0 @@ -119,6 +135,11 @@ class OmniEngineArgs(EngineArgs): quantization_config: Any | None = None worker_type: str | None = None task_type: str | None = None + omni_master_address: str | None = None + omni_master_port: int | None = None + stage_configs_path: str | None = None + output_modalities: list[str] | None = None + log_stats: bool = False def __post_init__(self) -> None: load_omni_general_plugins() diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index f7e7d53d58..7dc5db0acd 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -18,12 +18,10 @@ import uuid import weakref from collections.abc import Mapping, Sequence +from contextlib import ExitStack from dataclasses import asdict from typing import TYPE_CHECKING, Any -if TYPE_CHECKING: - from vllm_omni.engine.arg_utils import OmniEngineArgs - import janus import torch from omegaconf import OmegaConf @@ -34,26 +32,36 @@ from vllm.v1.engine.input_processor import InputProcessor from vllm_omni.diffusion.data import DiffusionParallelConfig +from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient +from vllm_omni.diffusion.stage_diffusion_proc import ( + complete_diffusion_handshake, + spawn_diffusion_proc, +) from vllm_omni.distributed.omni_connectors.utils.initialization import ( resolve_omni_kv_config_for_stage, ) -from vllm_omni.engine import ( - OmniEngineCoreRequest, -) +from vllm_omni.engine import OmniEngineCoreRequest from vllm_omni.engine.orchestrator import Orchestrator from vllm_omni.engine.output_processor import MultimodalOutputProcessor from vllm_omni.engine.serialization import ( deserialize_additional_information, serialize_additional_information, ) -from vllm_omni.engine.stage_engine_core_client import StageEngineCoreClient +from vllm_omni.engine.stage_engine_core_client import StageEngineCoreClientBase from vllm_omni.engine.stage_engine_core_proc import ( complete_stage_handshake, spawn_stage_core, ) +from vllm_omni.engine.stage_engine_startup import ( + OmniMasterServer, + connect_remote_engine_cores, + launch_omni_core_engines, + register_stage_with_omni_master, +) from vllm_omni.engine.stage_init_utils import ( StartedLlmStage, acquire_device_locks, + build_diffusion_config, build_engine_args_dict, build_vllm_config, cleanup_failed_stage_initialization, @@ -62,17 +70,20 @@ finalize_initialized_stages, get_stage_connector_spec, initialize_diffusion_stage, + inject_kv_stage_info, load_omni_transfer_config_for_model, prepare_engine_environment, release_device_locks, setup_stage_devices, + terminate_alive_proc, ) -from vllm_omni.entrypoints.utils import ( - load_and_resolve_stage_configs, -) +from vllm_omni.entrypoints.utils import load_and_resolve_stage_configs from vllm_omni.inputs.preprocess import OmniInputPreprocessor from vllm_omni.platforms import current_omni_platform +if TYPE_CHECKING: + from vllm_omni.engine.arg_utils import OmniEngineArgs + logger = init_logger(__name__) @@ -86,39 +97,6 @@ def _patch_generation_config_if_needed(model_config: Any) -> None: model_config.try_get_generation_config = lambda: {} -def _inject_kv_stage_info(stage_cfg: Any, stage_id: int) -> None: - """Inject stage_id and engine_input_source into omni_kv_config. - - OmniKVTransferManager needs stage_id to compute recv_stages for the - receiving side. In the old Omni architecture, OmniDiffusion.__init__ - performed this injection; replicate it here for AsyncOmniEngine. 
- """ - try: - engine_args = stage_cfg.engine_args - if hasattr(engine_args, "get"): - omni_kv = engine_args.get("omni_kv_config", None) - else: - omni_kv = getattr(engine_args, "omni_kv_config", None) - - if omni_kv is None: - return - - if hasattr(omni_kv, "setdefault"): - omni_kv.setdefault("stage_id", stage_id) - elif hasattr(omni_kv, "__setitem__"): - if "stage_id" not in omni_kv: - omni_kv["stage_id"] = stage_id - - engine_input_source = getattr(stage_cfg, "engine_input_source", None) - if engine_input_source is not None: - if hasattr(omni_kv, "setdefault"): - omni_kv.setdefault("engine_input_source", list(engine_input_source)) - elif hasattr(omni_kv, "__setitem__") and "engine_input_source" not in omni_kv: - omni_kv["engine_input_source"] = list(engine_input_source) - except Exception as e: - logger.debug("Failed to inject stage info into omni_kv_config: %s", e) - - def _inject_global_id(target: Any, request_id: str) -> None: """Inject global_request_id into a prompt dict's additional_information.""" if isinstance(target, dict): @@ -255,6 +233,7 @@ def __init__( stage_init_timeout: int = 300, init_timeout: int = 600, diffusion_batch_size: int = 1, + single_stage_mode: bool = False, **kwargs: Any, ) -> None: self.model = model @@ -274,6 +253,31 @@ def __init__( ea_dict.pop("model", None) kwargs = {**ea_dict, **kwargs} + # ------------------------------------------------------------------ # + # Single-stage mode detection # + # ------------------------------------------------------------------ # + # Single-stage mode is enabled when the caller explicitly passes # + # single_stage_mode=True, or when a stage_id is provided in the args. # + _stage_id_kwarg = kwargs.get("stage_id") + if isinstance(_stage_id_kwarg, int) and not single_stage_mode: + single_stage_mode = True + + self.single_stage_mode: bool = single_stage_mode + self._single_stage_id_filter: int | None = ( + int(_stage_id_kwarg) if single_stage_mode and isinstance(_stage_id_kwarg, int) else None + ) + self._omni_master_address: str | None = kwargs.get("omni_master_address") + self._omni_master_port: int | None = kwargs.get("omni_master_port") + self._omni_master_server: OmniMasterServer | None = None + + if single_stage_mode: + logger.info( + "[AsyncOmniEngine] Single-stage mode enabled (stage_id_filter=%s, master=%s:%s)", + self._single_stage_id_filter, + self._omni_master_address, + self._omni_master_port, + ) + self.config_path, self.stage_configs = self._resolve_stage_configs(model, kwargs) self.num_stages = len(self.stage_configs) @@ -350,61 +354,89 @@ def _launch_llm_stage( started_stage: StartedLlmStage | None = None lock_fds: list[int] = [] device_control_env = current_omni_platform.device_control_env_var - try: - with llm_stage_launch_lock: - previous_visible_devices = os.environ.get(device_control_env) - try: - setup_stage_devices(metadata.stage_id, metadata.runtime_cfg) - engine_args_dict = build_engine_args_dict( - stage_cfg, - self.model, - stage_connector_spec=stage_connector_spec, - ) - omni_conn_cfg, omni_from, omni_to = omni_kv_connector - if omni_conn_cfg: - omni_kv = engine_args_dict.get("omni_kv_config") or {} - if not isinstance(omni_kv, dict): - omni_kv = dict(omni_kv) - omni_kv["connector_config"] = omni_conn_cfg - omni_kv["omni_from_stage"] = omni_from - omni_kv["omni_to_stage"] = omni_to - omni_kv.setdefault("stage_id", metadata.stage_id) - engine_args_dict["omni_kv_config"] = omni_kv - vllm_config, executor_class = build_vllm_config( - stage_cfg, - self.model, - 
stage_connector_spec=stage_connector_spec, - engine_args_dict=engine_args_dict, - ) - lock_fds = acquire_device_locks( - metadata.stage_id, - engine_args_dict, - stage_init_timeout, - ) - addresses, proc, handshake_address = spawn_stage_core( - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=False, - ) - started_stage = StartedLlmStage( - stage_id=metadata.stage_id, - metadata=metadata, - vllm_config=vllm_config, - executor_class=executor_class, - proc=proc, - addresses=addresses, - ) - logger.info("[AsyncOmniEngine] Stage %s engine launch started", metadata.stage_id) - # Keep the stage-specific device visibility until vLLM - # finishes starting all child processes. - complete_stage_handshake(proc, handshake_address, addresses, vllm_config) - logger.info("[AsyncOmniEngine] Stage %s engine startup completed", metadata.stage_id) - finally: - if previous_visible_devices is None: - current_omni_platform.unset_device_control_env_var() - else: - current_omni_platform.set_device_control_env_var(previous_visible_devices) + proc = None + handshake_address = None + with ExitStack() as launch_stack: + with llm_stage_launch_lock: + previous_visible_devices = os.environ.get(device_control_env) + try: + setup_stage_devices(metadata.stage_id, metadata.runtime_cfg) + engine_args_dict = build_engine_args_dict( + stage_cfg, + self.model, + stage_connector_spec=stage_connector_spec, + ) + omni_conn_cfg, omni_from, omni_to = omni_kv_connector + if omni_conn_cfg: + omni_kv = engine_args_dict.get("omni_kv_config") or {} + if not isinstance(omni_kv, dict): + omni_kv = dict(omni_kv) + omni_kv["connector_config"] = omni_conn_cfg + omni_kv["omni_from_stage"] = omni_from + omni_kv["omni_to_stage"] = omni_to + omni_kv.setdefault("stage_id", metadata.stage_id) + engine_args_dict["omni_kv_config"] = omni_kv + vllm_config, executor_class = build_vllm_config( + stage_cfg, + self.model, + stage_connector_spec=stage_connector_spec, + engine_args_dict=engine_args_dict, + ) + lock_fds = acquire_device_locks( + metadata.stage_id, + engine_args_dict, + stage_init_timeout, + ) + if self.single_stage_mode and self._omni_master_server is not None: + engine_manager, coordinator, addresses = launch_stack.enter_context( + launch_omni_core_engines( + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=False, + omni_master_server=self._omni_master_server, + stage_id=metadata.stage_id, + stage_config=stage_cfg, + ) + ) + started_stage = StartedLlmStage( + stage_id=metadata.stage_id, + metadata=metadata, + vllm_config=vllm_config, + executor_class=executor_class, + addresses=addresses, + engine_manager=engine_manager, + coordinator=coordinator, + ) + else: + addresses, proc, handshake_address = spawn_stage_core( + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=False, + ) + started_stage = StartedLlmStage( + stage_id=metadata.stage_id, + metadata=metadata, + vllm_config=vllm_config, + executor_class=executor_class, + addresses=addresses, + proc=proc, + ) + logger.info("[AsyncOmniEngine] Stage %s engine launch started", metadata.stage_id) + # Keep the stage-specific device visibility until vLLM + # finishes starting all child processes. 
+ if self.single_stage_mode and self._omni_master_server is not None: + launch_stack.close() + else: + assert proc is not None + assert handshake_address is not None + complete_stage_handshake(proc, handshake_address, addresses, vllm_config) + logger.info("[AsyncOmniEngine] Stage %s engine startup completed", metadata.stage_id) + finally: + if previous_visible_devices is None: + current_omni_platform.unset_device_control_env_var() + else: + current_omni_platform.set_device_control_env_var(previous_visible_devices) assert started_stage is not None return started_stage @@ -416,13 +448,138 @@ def _launch_llm_stage( if lock_fds: release_device_locks(lock_fds) + def _create_remote_llm_stage( + self, + stage_cfg: Any, + metadata: Any, + stage_connector_spec: dict[str, Any], + stage_init_timeout: int, + omni_master_server: OmniMasterServer, + ) -> StartedLlmStage: + """Attach to a remote engine core and wait for its startup handshake.""" + started_stage: StartedLlmStage | None = None + try: + raw_stage_cfg = omni_master_server.get_stage_config( + metadata.stage_id, + timeout_s=stage_init_timeout, + ) + if raw_stage_cfg is None: + raise ValueError(f"Remote stage {metadata.stage_id} registered without stage config") + stage_cfg = OmegaConf.create(raw_stage_cfg) + engine_args_dict = build_engine_args_dict( + stage_cfg, + self.model, + stage_connector_spec=stage_connector_spec, + ) + vllm_config, executor_class = build_vllm_config( + stage_cfg, + self.model, + stage_connector_spec=stage_connector_spec, + engine_args_dict=engine_args_dict, + ) + vllm_config.parallel_config.data_parallel_size_local = 0 + launch_cm = connect_remote_engine_cores( + vllm_config=vllm_config, + omni_master_server=omni_master_server, + stage_id=metadata.stage_id, + ) + logger.info("[AsyncOmniEngine] Stage %s remote engine handshake started", metadata.stage_id) + with launch_cm as (engine_manager, coordinator, addresses): + started_stage = StartedLlmStage( + stage_id=metadata.stage_id, + metadata=metadata, + vllm_config=vllm_config, + executor_class=executor_class, + engine_manager=engine_manager, + coordinator=coordinator, + addresses=addresses, + ) + logger.info("[AsyncOmniEngine] Stage %s remote engine startup completed", metadata.stage_id) + assert started_stage is not None + return started_stage + except Exception: + if started_stage is not None: + close_started_llm_stage(started_stage) + raise + + def _launch_diffusion_stage( + self, + stage_cfg: Any, + metadata: Any, + omni_master_server: OmniMasterServer, + ) -> StageDiffusionClient: + """Launch a local diffusion stage on OmniMasterServer-allocated sockets.""" + proc = None + try: + od_config = build_diffusion_config(self.model, stage_cfg, metadata) + handshake_address, request_address, response_address = register_stage_with_omni_master( + omni_master_address=omni_master_server.address, + omni_master_port=omni_master_server.port, + omni_stage_id=metadata.stage_id, + omni_stage_config=stage_cfg, + return_addresses=True, + ) + logger.info( + "[AsyncOmniEngine] Stage %s diffusion registration completed", + metadata.stage_id, + ) + proc, _, _, _ = spawn_diffusion_proc( + self.model, + od_config, + handshake_address=handshake_address, + request_address=request_address, + response_address=response_address, + ) + complete_diffusion_handshake(proc, handshake_address) + logger.info( + "[AsyncOmniEngine] Stage %s diffusion startup completed", + metadata.stage_id, + ) + return StageDiffusionClient.from_addresses( + metadata, + request_address=request_address, + 
response_address=response_address, + proc=proc, + batch_size=self.diffusion_batch_size, + ) + except Exception: + if proc is not None: + terminate_alive_proc(proc) + raise + + def _create_remote_diffusion_stage( + self, + metadata: Any, + stage_init_timeout: int, + omni_master_server: OmniMasterServer, + ) -> StageDiffusionClient: + """Attach to a remote diffusion stage registered with OmniMasterServer.""" + remote_stage_cfg = OmegaConf.create( + omni_master_server.get_stage_config( + metadata.stage_id, + timeout_s=stage_init_timeout, + ) + ) + remote_metadata = extract_stage_metadata(remote_stage_cfg) + addresses = omni_master_server.get_zmq_addresses(metadata.stage_id) + logger.info( + "[AsyncOmniEngine] Stage %s remote diffusion startup completed", + metadata.stage_id, + ) + return StageDiffusionClient.from_addresses( + remote_metadata, + request_address=addresses.inputs[0], + response_address=addresses.outputs[0], + batch_size=self.diffusion_batch_size, + ) + def _attach_llm_stage( self, started: StartedLlmStage, ) -> tuple[Any, Any, Any, InputProcessor | None]: """Attach a READY LLM stage to the orchestrator event loop.""" - client_addresses = { + client_addresses: dict[str, str] = { "input_address": started.addresses.inputs[0], "output_address": started.addresses.outputs[0], } @@ -430,14 +587,18 @@ def _attach_llm_stage( client_addresses["stats_update_address"] = started.addresses.frontend_stats_publish_address try: - stage_client = StageEngineCoreClient( + stage_client = StageEngineCoreClientBase.make_async_mp_client( vllm_config=started.vllm_config, executor_class=started.executor_class, metadata=started.metadata, client_addresses=client_addresses, proc=started.proc, + engine_manager=started.engine_manager, + coordinator=started.coordinator, ) started.proc = None + started.engine_manager = None + started.coordinator = None except Exception: close_started_llm_stage(started) raise @@ -493,7 +654,7 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: output_processors: list[Any | None] = [None] * num_stages stage_vllm_configs: list[Any | None] = [None] * num_stages input_processor: InputProcessor | None = None - llm_stage_ids: list[int] = [] + llm_stage_positions: list[int] = [] llm_launch_futures: dict[int, concurrent.futures.Future[StartedLlmStage]] = {} started_llm_stages: dict[int, StartedLlmStage] = {} llm_stage_launch_lock = threading.Lock() @@ -507,45 +668,102 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: prepare_engine_environment() omni_transfer_config = load_omni_transfer_config_for_model(self.model, self.config_path) + # ------------------------------------------------------------------ # + # Single-stage mode: start OmniMasterServer before launching stages. # + # ------------------------------------------------------------------ # + if self.single_stage_mode: + if not self._omni_master_address or not self._omni_master_port: + raise ValueError( + "AsyncOmniEngine single_stage_mode requires both " + "omni_master_address and omni_master_port to be set." + ) + # Collect all configured stage IDs for pre-allocation. + all_stage_ids: list[int] = [] + seen_stage_ids: set[int] = set() + for i, sc in enumerate(self.stage_configs): + stage_id = int(getattr(sc, "stage_id", i)) + if stage_id in seen_stage_ids: + raise ValueError( + f"Duplicate stage_id {stage_id!r} detected among configured stages; stage_ids must be unique." 
+ ) + seen_stage_ids.add(stage_id) + all_stage_ids.append(stage_id) + self._omni_master_server = OmniMasterServer( + master_address=self._omni_master_address, + master_port=self._omni_master_port, + stage_ids=all_stage_ids, + ) + self._omni_master_server.start() + logger.info( + "[AsyncOmniEngine] OmniMasterServer started for stages %s", + all_stage_ids, + ) + try: with concurrent.futures.ThreadPoolExecutor( max_workers=max(1, llm_stage_count), thread_name_prefix="llm-stage-launch", ) as launch_executor: - for stage_id, stage_cfg in enumerate(self.stage_configs): - logger.info("[AsyncOmniEngine] Initializing stage %s", stage_id) + for stage_idx, stage_cfg in enumerate(self.stage_configs): metadata = extract_stage_metadata(stage_cfg) + configured_stage_id = metadata.stage_id + logger.info("[AsyncOmniEngine] Initializing stage %s", configured_stage_id) if metadata.prompt_expand_func is not None: prompt_expand_func = metadata.prompt_expand_func + if self.single_stage_mode: + metadata.runtime_cfg = None + stage_connector_spec = get_stage_connector_spec( omni_transfer_config=omni_transfer_config, - stage_id=stage_id, + stage_id=configured_stage_id, async_chunk=async_chunk, ) - omni_kv_connector = resolve_omni_kv_config_for_stage(omni_transfer_config, stage_id) + omni_kv_connector = resolve_omni_kv_config_for_stage(omni_transfer_config, configured_stage_id) if metadata.stage_type == "diffusion": + is_remote_diffusion_stage = ( + self.single_stage_mode + and self._single_stage_id_filter is not None + and configured_stage_id != self._single_stage_id_filter + ) + if is_remote_diffusion_stage: + assert self._omni_master_server is not None + stage_clients[stage_idx] = self._create_remote_diffusion_stage( + metadata, + stage_init_timeout, + self._omni_master_server, + ) + continue + with llm_stage_launch_lock: previous_visible_devices = os.environ.get(device_control_env) try: - setup_stage_devices(stage_id, metadata.runtime_cfg) + setup_stage_devices(configured_stage_id, metadata.runtime_cfg) omni_conn_cfg, omni_from, omni_to = omni_kv_connector if omni_conn_cfg: from vllm_omni.entrypoints.utils import inject_omni_kv_config inject_omni_kv_config(stage_cfg, omni_conn_cfg, omni_from, omni_to) - _inject_kv_stage_info(stage_cfg, stage_id) - stage_clients[stage_id] = initialize_diffusion_stage( - self.model, - stage_cfg, - metadata, - batch_size=self.diffusion_batch_size, - ) + inject_kv_stage_info(stage_cfg, configured_stage_id) + if self.single_stage_mode: + assert self._omni_master_server is not None + stage_clients[stage_idx] = self._launch_diffusion_stage( + stage_cfg, + metadata, + self._omni_master_server, + ) + else: + stage_clients[stage_idx] = initialize_diffusion_stage( + self.model, + stage_cfg, + metadata, + batch_size=self.diffusion_batch_size, + ) logger.info( "[AsyncOmniEngine] Stage %s initialized (diffusion, batch_size=%d)", - stage_id, + configured_stage_id, self.diffusion_batch_size, ) finally: @@ -555,30 +773,58 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: current_omni_platform.set_device_control_env_var(previous_visible_devices) continue - llm_stage_ids.append(stage_id) - llm_launch_futures[stage_id] = launch_executor.submit( - self._launch_llm_stage, - stage_cfg, - metadata, - stage_connector_spec, - stage_init_timeout, - llm_stage_launch_lock, - omni_kv_connector, - ) + llm_stage_positions.append(stage_idx) + + # In single-stage mode, stages that don't match the local + # stage_id filter are skipped. 
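+                    # "Skipped" here means not launched locally: such stages are attached
+                    # remotely through the OmniMasterServer via _create_remote_llm_stage.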
+ if ( + self.single_stage_mode + and self._single_stage_id_filter is not None + and configured_stage_id != self._single_stage_id_filter + ): + assert self._omni_master_server is not None + llm_launch_futures[stage_idx] = launch_executor.submit( + self._create_remote_llm_stage, + stage_cfg, + metadata, + stage_connector_spec, + stage_init_timeout, + self._omni_master_server, + ) + else: + llm_launch_futures[stage_idx] = launch_executor.submit( + self._launch_llm_stage, + stage_cfg, + metadata, + stage_connector_spec, + stage_init_timeout, + llm_stage_launch_lock, + omni_kv_connector, + ) concurrent.futures.wait(list(llm_launch_futures.values())) - for stage_id in llm_stage_ids: - started_llm_stages[stage_id] = llm_launch_futures[stage_id].result() + for stage_idx in llm_stage_positions: + started_llm_stages[stage_idx] = llm_launch_futures[stage_idx].result() - for stage_id in llm_stage_ids: - started = started_llm_stages[stage_id] - stage_client, output_processor, vllm_config, stage0_input_processor = self._attach_llm_stage(started) - stage_clients[stage_id] = stage_client - output_processors[stage_id] = output_processor - stage_vllm_configs[stage_id] = vllm_config - if stage0_input_processor is not None: - input_processor = stage0_input_processor + attach_futures: dict[concurrent.futures.Future[tuple[Any, Any, Any, InputProcessor | None]], int] = {} + with concurrent.futures.ThreadPoolExecutor( + max_workers=max(1, len(llm_stage_positions)), + thread_name_prefix="llm-stage-attach", + ) as attach_executor: + for stage_idx in llm_stage_positions: + attach_futures[attach_executor.submit(self._attach_llm_stage, started_llm_stages[stage_idx])] = ( + stage_idx + ) + + for future in concurrent.futures.as_completed(attach_futures): + stage_idx = attach_futures[future] + stage_client, output_processor, vllm_config, stage0_input_processor = future.result() + stage_clients[stage_idx] = stage_client + output_processors[stage_idx] = output_processor + stage_vllm_configs[stage_idx] = vllm_config + if stage0_input_processor is not None: + input_processor = stage0_input_processor initialized_stage_clients, default_sampling_params_list, stage_metadata = finalize_initialized_stages( stage_clients, @@ -595,8 +841,13 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: ) cleanup_failed_stage_initialization( stage_clients, - [started_llm_stages[stage_id] for stage_id in llm_stage_ids if stage_id in started_llm_stages], + [started_llm_stages[stage_idx] for stage_idx in llm_stage_positions if stage_idx in started_llm_stages], ) + if self._omni_master_server is not None: + try: + self._omni_master_server.stop() + except Exception: + logger.exception("[AsyncOmniEngine] Failed to stop OmniMasterServer during stage-init cleanup") raise self.stage_clients = initialized_stage_clients @@ -1310,3 +1561,10 @@ def shutdown(self) -> None: q.close() except Exception: pass + + if self._omni_master_server is not None: + try: + self._omni_master_server.stop() + except Exception: + logger.exception("[AsyncOmniEngine] Failed to stop OmniMasterServer during shutdown") + self._omni_master_server = None diff --git a/vllm_omni/engine/stage_engine_core_client.py b/vllm_omni/engine/stage_engine_core_client.py index 71a0aee4a4..52e674f476 100644 --- a/vllm_omni/engine/stage_engine_core_client.py +++ b/vllm_omni/engine/stage_engine_core_client.py @@ -12,7 +12,7 @@ from vllm.logger import init_logger from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.core_client import AsyncMPClient +from 
vllm.v1.engine.core_client import AsyncMPClient, DPLBAsyncMPClient from vllm_omni.distributed.omni_connectors.utils.initialization import KV_TRANSFER_PORT_OFFSET from vllm_omni.engine.stage_init_utils import StageMetadata @@ -25,18 +25,54 @@ logger = init_logger(__name__) -class StageEngineCoreClient(AsyncMPClient): - """Stage async client that inherits from vLLM's AsyncMPClient. +class StageEngineCoreClientBase: + """Shared stage-aware behavior for async EngineCore clients. - Fully reuses AsyncMPClient for: + The concrete transport/load-balancing behavior is supplied by the + multiprocessing client subclass in the MRO. + + Fully reuses the underlying vLLM async MP client ``__init__`` for: - ZMQ setup, sockets - outputs_queue, output_queue_task - All utility methods (get_output_async, abort_requests_async, etc.) The subprocess is spawned externally via ``spawn_stage_core`` / ``complete_stage_handshake`` from *stage_engine_core_proc.py*. + In single-stage CLI mode, the client may instead attach to an + ``engine_manager`` / ``coordinator`` pair created elsewhere. """ + @staticmethod + def make_async_mp_client( + vllm_config: Any, + executor_class: type, + metadata: StageMetadata, + client_addresses: dict[str, str] | None = None, + proc: Any = None, + engine_manager: Any = None, + coordinator: Any = None, + client_count: int = 1, + client_index: int = 0, + ) -> StageEngineCoreClient | DPLBStageEngineCoreClient: + """Create the appropriate stage async client for the DP mode.""" + parallel_config = vllm_config.parallel_config + client_args = dict( + vllm_config=vllm_config, + executor_class=executor_class, + metadata=metadata, + client_addresses=client_addresses, + proc=proc, + engine_manager=engine_manager, + coordinator=coordinator, + client_count=client_count, + client_index=client_index, + ) + + if parallel_config.data_parallel_size > 1 and not parallel_config.data_parallel_external_lb: + return DPLBStageEngineCoreClient(**client_args) + + return StageEngineCoreClient(**client_args) + def __init__( self, vllm_config: Any, @@ -85,8 +121,10 @@ def __init__( self._kv_sender_info: dict[str, Any] | None = None self._kv_sender_initialized = False + client_name = self.__class__.__name__ logger.info( - "[StageEngineCoreClient] Stage-%s initializing EngineCore", + "[%s] Stage-%s initializing EngineCore", + client_name, self.stage_id, ) try: @@ -98,23 +136,30 @@ def __init__( client_count=client_count, client_index=client_index, ) + if engine_manager is not None: + self.resources.engine_manager = engine_manager + if coordinator is not None: + self.resources.coordinator = coordinator except Exception: logger.exception( - "[StageEngineCoreClient] Stage-%s EngineCore init failed", + "[%s] Stage-%s EngineCore init failed", + client_name, self.stage_id, ) try: self.shutdown() except Exception as shutdown_error: logger.warning( - "[StageEngineCoreClient] Stage-%s cleanup after init failure failed: %s", + "[%s] Stage-%s cleanup after init failure failed: %s", + client_name, self.stage_id, shutdown_error, ) raise self._initialize_kv_sender_endpoint() logger.info( - "[StageEngineCoreClient] Stage-%s EngineCore running", + "[%s] Stage-%s EngineCore running", + client_name, self.stage_id, ) @@ -122,7 +167,12 @@ def __init__( async def add_request_async(self, request: EngineCoreRequest) -> None: """Add request to the stage engine core.""" - logger.info(f"[StageEngineCoreClient] Stage-{self.stage_id} adding request: {request.request_id}") + logger.info( + "[%s] Stage-%s adding request: %s", + 
self.__class__.__name__, + self.stage_id, + request.request_id, + ) await super().add_request_async(request) # ==================== Stage Methods ==================== @@ -287,9 +337,9 @@ async def collective_rpc_async( ) -> Any: """Forward control RPCs to the underlying AsyncMPClient stage engine. - Each ``StageEngineCoreClient`` already represents one logical stage, so - stage-scoped control operations should be executed here and then fanned - in-core across the workers managed by this EngineCore client. + Each stage client already represents one logical stage, so stage-scoped + control operations should be executed here and then fanned in-core + across the workers managed by this EngineCore client. """ return await super().collective_rpc_async( method=method, @@ -299,10 +349,19 @@ async def collective_rpc_async( ) def shutdown(self) -> None: - """Shutdown ZMQ connections and the subprocess.""" + """Shutdown managed resources and any externally spawned subprocess.""" super().shutdown() if self._proc is not None and self._proc.is_alive(): self._proc.terminate() self._proc.join(timeout=5) if self._proc.is_alive(): self._proc.kill() + self._proc = None + + +class StageEngineCoreClient(StageEngineCoreClientBase, AsyncMPClient): + """Stage async client backed by vLLM's ``AsyncMPClient``.""" + + +class DPLBStageEngineCoreClient(StageEngineCoreClientBase, DPLBAsyncMPClient): + """Stage async client backed by vLLM's ``DPLBAsyncMPClient``.""" diff --git a/vllm_omni/engine/stage_engine_startup.py b/vllm_omni/engine/stage_engine_startup.py new file mode 100644 index 0000000000..6af66c71f3 --- /dev/null +++ b/vllm_omni/engine/stage_engine_startup.py @@ -0,0 +1,599 @@ +"""Helpers for launching and handshaking omni engine cores.""" + +from __future__ import annotations + +import contextlib +import dataclasses +import threading +from collections.abc import Iterator +from dataclasses import dataclass +from typing import Any + +import msgspec +import zmq +from omegaconf import OmegaConf +from vllm.config import CacheConfig, VllmConfig +from vllm.logger import init_logger +from vllm.utils.network_utils import get_open_port, zmq_socket_ctx +from vllm.v1.engine.coordinator import DPCoordinator +from vllm.v1.engine.utils import ( + STARTUP_POLL_PERIOD_MS, + CoreEngine, + CoreEngineProcManager, + CoreEngineState, + EngineHandshakeMetadata, + EngineZmqAddresses, + wait_for_engine_startup, +) +from vllm.v1.executor import Executor + +logger = init_logger(__name__) + +# Poll period (ms) used by the registration/handshake loop. +_POLL_PERIOD_MS = 5_000 +# Default timeout (s) for a stage to send READY. 
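+# register_stage_with_omni_master also uses this value to bound how long it polls
+# for the registration reply from the OmniMasterServer.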
+_DEFAULT_STARTUP_TIMEOUT_S = 300 + + +def _serialize_stage_config(stage_config: Any) -> Any: + """Convert a stage config to msgpack-friendly builtins.""" + if stage_config is None or isinstance(stage_config, (str, bytes, int, float, bool)): + return stage_config + + if OmegaConf.is_config(stage_config): + return _serialize_stage_config(OmegaConf.to_container(stage_config, resolve=True)) + + if dataclasses.is_dataclass(stage_config): + return _serialize_stage_config(dataclasses.asdict(stage_config)) + + if isinstance(stage_config, dict): + return {key: _serialize_stage_config(value) for key, value in stage_config.items() if not callable(value)} + + if isinstance(stage_config, (list, tuple, set)): + return [_serialize_stage_config(item) for item in stage_config if not callable(item)] + + if hasattr(stage_config, "items"): + return {key: _serialize_stage_config(value) for key, value in stage_config.items() if not callable(value)} + + if hasattr(stage_config, "__dict__"): + return { + key: _serialize_stage_config(value) + for key, value in vars(stage_config).items() + if not key.startswith("_") and not callable(value) + } + + return stage_config + + +# --------------------------------------------------------------------------- +# Per-stage address allocation +# --------------------------------------------------------------------------- + + +@dataclass +class StageAllocation: + """ZMQ addresses reserved for a single stage.""" + + # Per-stage handshake socket (OmniMasterServer binds, engine connects) + handshake_bind_address: str + handshake_connect_address: str + # Input channel: client binds ROUTER, engine connects DEALER + input_bind_address: str + input_connect_address: str + # Output channel: client binds PULL, engine connects PUSH + output_bind_address: str + output_connect_address: str + + +@dataclass(frozen=True) +class StageCoordinatorAddresses: + """Optional DP coordinator addresses registered for a stage.""" + + coordinator_input: str | None = None + coordinator_output: str | None = None + frontend_stats_publish_address: str | None = None + + +# --------------------------------------------------------------------------- +# OmniMasterServer +# --------------------------------------------------------------------------- + + +class OmniMasterServer: + """Registration server for single-stage engine startup.""" + + def __init__( + self, + master_address: str, + master_port: int, + stage_ids: list[int], + ) -> None: + self._address = master_address + self._port = master_port + self._allocations: dict[int, StageAllocation] = {} + self._stage_configs: dict[int, Any] = {} + self._stage_coordinator_addresses: dict[int, StageCoordinatorAddresses] = {} + self._stage_config_events: dict[int, threading.Event] = {} + self._thread: threading.Thread | None = None + self._stop_event = threading.Event() + + for sid in stage_ids: + self._stage_config_events[sid] = threading.Event() + self._stage_coordinator_addresses[sid] = StageCoordinatorAddresses() + hs_port = get_open_port() + inp_port = get_open_port() + out_port = get_open_port() + self._allocations[sid] = StageAllocation( + handshake_bind_address=f"tcp://{master_address}:{hs_port}", + handshake_connect_address=f"tcp://{master_address}:{hs_port}", + input_bind_address=f"tcp://{master_address}:{inp_port}", + input_connect_address=f"tcp://{master_address}:{inp_port}", + output_bind_address=f"tcp://{master_address}:{out_port}", + output_connect_address=f"tcp://{master_address}:{out_port}", + ) + + logger.info( + "[OmniMasterServer] Pre-allocated 
addresses for stages %s (master=%s:%d)", + list(stage_ids), + master_address, + master_port, + ) + + # ------------------------------------------------------------------ + # Public helpers + # ------------------------------------------------------------------ + @property + def address(self) -> str: + """Return the registration address exposed to stage launchers.""" + return self._address + + @property + def port(self) -> int: + """Return the registration port exposed to stage launchers.""" + return self._port + + def get_allocation(self, stage_id: int) -> StageAllocation: + """Return the full address allocation for *stage_id*.""" + return self._allocations[stage_id] + + def register_stage_config( + self, + stage_id: int, + stage_config: Any, + coordinator_addresses: StageCoordinatorAddresses | None = None, + ) -> None: + """Store the latest stage registration payload for *stage_id*.""" + if stage_id not in self._allocations: + raise KeyError(stage_id) + self._stage_configs[stage_id] = stage_config + if coordinator_addresses is not None: + self._stage_coordinator_addresses[stage_id] = coordinator_addresses + self._stage_config_events[stage_id].set() + + def get_stage_config(self, stage_id: int, timeout_s: float | None = None) -> Any: + """Return the stage config for *stage_id*, waiting if necessary.""" + if stage_id not in self._allocations: + raise KeyError(stage_id) + + if stage_id in self._stage_configs: + return self._stage_configs[stage_id] + + if not self._stage_config_events[stage_id].wait(timeout=timeout_s): + raise TimeoutError(f"Timed out waiting for stage config for stage {stage_id}.") + + return self._stage_configs[stage_id] + + def get_stage_coordinator_addresses( + self, + stage_id: int, + timeout_s: float | None = None, + ) -> StageCoordinatorAddresses: + """Return the registered coordinator addresses for *stage_id*.""" + if stage_id not in self._allocations: + raise KeyError(stage_id) + + if not self._stage_config_events[stage_id].is_set(): + if not self._stage_config_events[stage_id].wait(timeout=timeout_s): + raise TimeoutError(f"Timed out waiting for stage registration for stage {stage_id}.") + + return self._stage_coordinator_addresses[stage_id] + + def get_client_addresses(self, stage_id: int) -> dict[str, str]: + """Return the addresses the client-side sockets should *bind* to.""" + alloc = self._allocations[stage_id] + return { + "input_address": alloc.input_bind_address, + "output_address": alloc.output_bind_address, + } + + def get_zmq_addresses(self, stage_id: int) -> EngineZmqAddresses: + """Return EngineZmqAddresses using the *bind* (client) side addresses.""" + alloc = self._allocations[stage_id] + return EngineZmqAddresses( + inputs=[alloc.input_bind_address], + outputs=[alloc.output_bind_address], + ) + + def get_engine_zmq_addresses(self, stage_id: int) -> EngineZmqAddresses: + """Return EngineZmqAddresses using the *connect* (engine) addresses.""" + alloc = self._allocations[stage_id] + return EngineZmqAddresses( + inputs=[alloc.input_connect_address], + outputs=[alloc.output_connect_address], + ) + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def start(self) -> None: + """Start the background server thread.""" + self._thread = threading.Thread( + target=self._run, + name="OmniMasterServer", + daemon=True, + ) + self._thread.start() + logger.info( + "[OmniMasterServer] Listening on tcp://%s:%d", + self.address, + self.port, + ) + + def 
stop(self) -> None: + """Signal stop and join the background thread.""" + self._stop_event.set() + if self._thread is not None: + self._thread.join(timeout=10) + + # ------------------------------------------------------------------ + # Internal server logic + # ------------------------------------------------------------------ + + def _run(self) -> None: + ctx = zmq.Context() + try: + self._serve(ctx) + except Exception: + logger.exception("[OmniMasterServer] Server thread crashed") + finally: + ctx.term() + + def _serve(self, ctx: zmq.Context) -> None: # type: ignore[type-arg] + # Registration socket for the initial stage registration. + # Per-stage handshake sockets are bound by the launch helpers. + reg_socket: zmq.Socket = ctx.socket(zmq.ROUTER) # type: ignore[attr-defined] + reg_socket.bind(f"tcp://{self.address}:{self.port}") + + poller = zmq.Poller() + poller.register(reg_socket, zmq.POLLIN) + + pending: set[int] = set(self._allocations.keys()) + + while pending and not self._stop_event.is_set(): + events: list[tuple[zmq.Socket, int]] = poller.poll(_POLL_PERIOD_MS) # type: ignore[assignment] + if not events: + logger.debug("[OmniMasterServer] Still waiting for registration from stages: %s", pending) + continue + + for sock, _ in events: + if sock is reg_socket: + sid = self._handle_registration(reg_socket) + if sid is not None: + pending.discard(sid) + + # Cleanup + reg_socket.close(linger=0) + logger.info("[OmniMasterServer] All stages registered; server thread exiting.") + + def _handle_registration(self, reg_socket: zmq.Socket) -> int | None: # type: ignore[type-arg] + """Receive a stage registration and reply with the handshake address. + + Returns the registered stage_id on success, or None on failure. + """ + frames = reg_socket.recv_multipart() + if len(frames) < 2: + logger.warning( + "[OmniMasterServer] Unexpected registration frame count: %d", + len(frames), + ) + return None + identity = frames[0] + msg_bytes = frames[-1] + try: + msg = msgspec.msgpack.decode(msg_bytes) + except Exception as exc: + logger.warning("[OmniMasterServer] Failed to decode registration message: %s", exc) + return None + + stage_id: int | None = msg.get("stage_id") + if stage_id not in self._allocations: + logger.warning( + "[OmniMasterServer] Received registration for unknown stage_id=%s", + stage_id, + ) + return None + + self.register_stage_config( + stage_id, + msg.get("stage_config"), + coordinator_addresses=StageCoordinatorAddresses( + coordinator_input=msg.get("coordinator_input"), + coordinator_output=msg.get("coordinator_output"), + frontend_stats_publish_address=msg.get("frontend_stats_publish_address"), + ), + ) + + alloc = self._allocations[stage_id] + response = msgspec.msgpack.encode( + { + "handshake_address": alloc.handshake_connect_address, + "input_address": alloc.input_bind_address, + "output_address": alloc.output_bind_address, + } + ) + # ROUTER-DEALER: reply is [identity, payload] (no empty delimiter). + reg_socket.send_multipart([identity, response]) + logger.info( + "[OmniMasterServer] Stage %d registered; assigned handshake=%s", + stage_id, + alloc.handshake_connect_address, + ) + return stage_id + + +def register_stage_with_omni_master( + *, + omni_master_address: str, + omni_master_port: int, + omni_stage_id: int, + omni_stage_config: Any = None, + coordinator: DPCoordinator | None = None, + return_addresses: bool = False, +) -> str | tuple[str, str, str]: + """Register a stage with the omni master server. + + Returns the per-stage handshake address by default. 
When + ``return_addresses`` is true, also returns the stage input/output + addresses allocated by the master. + """ + + reg_ctx = zmq.Context() + try: + reg_sock: zmq.Socket = reg_ctx.socket(zmq.DEALER) # type: ignore[attr-defined] + try: + reg_sock.connect(f"tcp://{omni_master_address}:{omni_master_port}") + payload = { + "stage_id": omni_stage_id, + "stage_config": _serialize_stage_config(omni_stage_config), + } + if coordinator is not None: + coordinator_input, coordinator_output = coordinator.get_engine_socket_addresses() + payload["coordinator_input"] = coordinator_input + payload["coordinator_output"] = coordinator_output + payload["frontend_stats_publish_address"] = coordinator.get_stats_publish_address() + + reg_sock.send(msgspec.msgpack.encode(payload)) + timeout_ms = _DEFAULT_STARTUP_TIMEOUT_S * 1_000 + if not reg_sock.poll(timeout=timeout_ms): + raise RuntimeError( + f"Timed out waiting for registration " + f"response from OmniMasterServer " + f"({omni_master_address}:{omni_master_port}) " + f"for stage {omni_stage_id}." + ) + response_bytes = reg_sock.recv() + response = msgspec.msgpack.decode(response_bytes) + handshake_address: str = response["handshake_address"] + input_address: str = response["input_address"] + output_address: str = response["output_address"] + logger.info( + "Stage %d registered; handshake_address=%s", + omni_stage_id, + handshake_address, + ) + finally: + reg_sock.close(linger=0) + finally: + reg_ctx.term() + + if return_addresses: + return handshake_address, input_address, output_address + return handshake_address + + +def _wait_for_omni_engine_startup( + handshake_socket: zmq.Socket, + engine_addresses: EngineZmqAddresses, + engines: list[CoreEngine], + cache_config: CacheConfig, +) -> None: + """Wait for omni-managed engines to finish the HELLO/READY handshake.""" + conn_pending = len(engines) + start_pending = 0 + + poller = zmq.Poller() + poller.register(handshake_socket, zmq.POLLIN) + + while conn_pending or start_pending: + events = poller.poll(STARTUP_POLL_PERIOD_MS) + if not events: + logger.debug( + "[omni] Waiting for %d engine(s) to connect, %d to start.", + conn_pending, + start_pending, + ) + continue + + eng_identity, msg_bytes = handshake_socket.recv_multipart() + eng_index = int.from_bytes(eng_identity, "little") + engine = next((e for e in engines if e.identity == eng_identity), None) + if engine is None: + raise RuntimeError(f"[omni] Handshake message from unexpected engine rank: {eng_index}") + + msg = msgspec.msgpack.decode(msg_bytes) + status: str = msg["status"] + + if status == "HELLO" and engine.state == CoreEngineState.NEW: + init_message = msgspec.msgpack.encode( + EngineHandshakeMetadata(addresses=engine_addresses, parallel_config={}) + ) + handshake_socket.send_multipart((eng_identity, init_message), copy=False) + conn_pending -= 1 + start_pending += 1 + engine.state = CoreEngineState.CONNECTED + logger.debug("[omni] HELLO from engine %d", eng_index) + + elif status == "READY" and engine.state == CoreEngineState.CONNECTED: + num_gpu_blocks = (cache_config.num_gpu_blocks or 0) + msg["num_gpu_blocks"] + cache_config.num_gpu_blocks = num_gpu_blocks + if engine_addresses.frontend_stats_publish_address is None: + engine_addresses.frontend_stats_publish_address = msg.get("dp_stats_address") + start_pending -= 1 + engine.state = CoreEngineState.READY + logger.debug("[omni] READY from engine %d (num_gpu_blocks=%d)", eng_index, msg["num_gpu_blocks"]) + + else: + raise RuntimeError(f"[omni] Unexpected status '{status}' from engine 
{eng_index} in state {engine.state}.") + + +@contextlib.contextmanager +def connect_remote_engine_cores( + vllm_config: VllmConfig, + omni_master_server: OmniMasterServer, + stage_id: int, +) -> Iterator[tuple[None, DPCoordinator | None, EngineZmqAddresses]]: + """Wait for remote engine cores to connect through the omni handshake.""" + addresses = omni_master_server.get_zmq_addresses(stage_id) + parallel_config = vllm_config.parallel_config + # Mirror the engine-count logic from launch_omni_core_engines. + remote_engine_count = ( + parallel_config.data_parallel_size_local + if parallel_config.data_parallel_size_local is not None and parallel_config.data_parallel_size_local > 0 + else max(1, parallel_config.data_parallel_size) + ) + start_index = parallel_config.data_parallel_rank if parallel_config.data_parallel_rank is not None else 0 + coordinator = None + + registered_coordinator_addresses = omni_master_server.get_stage_coordinator_addresses(stage_id) + addresses.coordinator_input = registered_coordinator_addresses.coordinator_input + addresses.coordinator_output = registered_coordinator_addresses.coordinator_output + addresses.frontend_stats_publish_address = registered_coordinator_addresses.frontend_stats_publish_address + + engines_to_handshake = [CoreEngine(index=start_index + i, local=False) for i in range(remote_engine_count)] + + logger.info( + "Waiting for %d remote engine(s) for stage %d", + remote_engine_count, + stage_id, + ) + + handshake_bind_address = omni_master_server.get_allocation(stage_id).handshake_bind_address + + with zmq_socket_ctx(handshake_bind_address, zmq.ROUTER, bind=True) as handshake_socket: + yield None, coordinator, addresses + + _wait_for_omni_engine_startup( + handshake_socket, + addresses, + engines_to_handshake, + vllm_config.cache_config, + ) + + +@contextlib.contextmanager +def launch_omni_core_engines( + vllm_config: VllmConfig, + executor_class: type[Executor], + log_stats: bool, + omni_master_server: OmniMasterServer, + stage_id: int, + stage_config: Any = None, +) -> Iterator[tuple[CoreEngineProcManager, DPCoordinator | None, EngineZmqAddresses]]: + """Launch local engine cores using the omni registration flow.""" + addresses = omni_master_server.get_zmq_addresses(stage_id) + parallel_config = vllm_config.parallel_config + # Determine the number of local engines and their ranks. + local_engine_count = ( + parallel_config.data_parallel_size_local + if parallel_config.data_parallel_size_local is not None and parallel_config.data_parallel_size_local > 0 + else max(1, parallel_config.data_parallel_size) + ) + dp_rank = parallel_config.data_parallel_rank if parallel_config.data_parallel_rank is not None else 0 + local_start_index = 0 + start_index = dp_rank + + # Run the DP Coordinator process with rank 0 when in online DP mode. + # The coordinator is needed for: + # 1. Internal/hybrid LB: collecting and publishing queue stats + # 2. 
MoE models: wave coordination in addition to stats + run_coordinator = vllm_config.needs_dp_coordinator and dp_rank == 0 + + if run_coordinator: + coordinator = DPCoordinator( + parallel_config, + enable_wave_coordination=vllm_config.model_config.is_moe, + ) + + addresses.coordinator_input, addresses.coordinator_output = coordinator.get_engine_socket_addresses() + addresses.frontend_stats_publish_address = coordinator.get_stats_publish_address() + + logger.info( + "[omni] Started DP Coordinator process for stage %d (PID: %d)", + stage_id, + coordinator.proc.pid, + ) + else: + coordinator = None + + logger.info( + "Starting %d local engine(s) for stage %d (dp_rank=%d)", + local_engine_count, + stage_id, + dp_rank, + ) + + # Register the stage once and reuse the returned per-stage handshake + # address for all local engine-core processes. + handshake_address = register_stage_with_omni_master( + omni_master_address=omni_master_server.address, + omni_master_port=omni_master_server.port, + omni_stage_id=stage_id, + omni_stage_config=stage_config, + coordinator=coordinator, + ) + + # One CoreEngine entry per local engine so wait_for_engine_startup can + # track the HELLO/READY handshake for each of them. + engines_to_handshake = [CoreEngine(index=start_index + i, local=True) for i in range(local_engine_count)] + + # Bind the pre-allocated handshake socket for this stage. + handshake_bind_address = omni_master_server.get_allocation(stage_id).handshake_bind_address + + with zmq_socket_ctx(handshake_bind_address, zmq.ROUTER, bind=True) as handshake_socket: + local_engine_manager = CoreEngineProcManager( + local_engine_count=local_engine_count, + start_index=start_index, + local_start_index=local_start_index, + vllm_config=vllm_config, + local_client=True, + handshake_address=handshake_address, + executor_class=executor_class, + log_stats=log_stats, + ) + + yield local_engine_manager, coordinator, addresses + + # Wait for all local engine-core processes to complete the + # standard HELLO/READY handshake — mirrors launch_core_engines. 
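+        # (This runs when the caller exits the context manager, i.e. after the engine
+        #  manager yielded above has been handed back to the caller.)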
+ coordinated_dp = parallel_config.data_parallel_size > 1 and vllm_config.model_config.is_moe + wait_for_engine_startup( + handshake_socket, + addresses, + engines_to_handshake, + parallel_config, + coordinated_dp, + vllm_config.cache_config, + local_engine_manager, + coordinator.proc if coordinator else None, + ) diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index e6f603d2a9..09195faeca 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -101,6 +101,34 @@ def resolve_worker_cls(engine_args: dict[str, Any]) -> None: raise ValueError(f"Unknown worker_type: {worker_type}") +def inject_kv_stage_info(stage_cfg: Any, stage_id: int) -> None: + """Inject stage metadata into omni_kv_config when present.""" + try: + engine_args = stage_cfg.engine_args + if hasattr(engine_args, "get"): + omni_kv = engine_args.get("omni_kv_config", None) + else: + omni_kv = getattr(engine_args, "omni_kv_config", None) + + if omni_kv is None: + return + + if hasattr(omni_kv, "setdefault"): + omni_kv.setdefault("stage_id", stage_id) + elif hasattr(omni_kv, "__setitem__"): + if "stage_id" not in omni_kv: + omni_kv["stage_id"] = stage_id + + engine_input_source = getattr(stage_cfg, "engine_input_source", None) + if engine_input_source is not None: + if hasattr(omni_kv, "setdefault"): + omni_kv.setdefault("engine_input_source", list(engine_input_source)) + elif hasattr(omni_kv, "__setitem__") and "engine_input_source" not in omni_kv: + omni_kv["engine_input_source"] = list(engine_input_source) + except Exception as e: + logger.debug("Failed to inject stage info into omni_kv_config: %s", e) + + @dataclass class StageMetadata: """Lightweight stage attributes extracted from stage_config.""" @@ -129,8 +157,10 @@ class StartedLlmStage: metadata: Any vllm_config: Any executor_class: type - proc: Any addresses: Any + proc: Any = None + engine_manager: Any = None + coordinator: Any = None def extract_stage_metadata(stage_config: Any) -> StageMetadata: @@ -263,6 +293,7 @@ def build_vllm_config( model: str, stage_connector_spec: dict[str, Any] | None = None, engine_args_dict: dict[str, Any] | None = None, + headless: bool = False, ) -> tuple[Any, type]: """Build engine args, then create VllmConfig and executor_class. @@ -278,7 +309,10 @@ def build_vllm_config( filtered_engine_args_dict = filter_dataclass_kwargs(OmniEngineArgs, engine_args_dict) omni_engine_args = OmniEngineArgs(**filtered_engine_args_dict) - vllm_config = omni_engine_args.create_engine_config(usage_context=UsageContext.LLM_CLASS) + vllm_config = omni_engine_args.create_engine_config( + usage_context=UsageContext.LLM_CLASS, + headless=headless, + ) executor_class = Executor.get_class(vllm_config) return vllm_config, executor_class @@ -445,32 +479,20 @@ def get_stage_connector_spec( return {} -def initialize_diffusion_stage( +def build_diffusion_config( model: str, stage_cfg: Any, metadata: StageMetadata, - batch_size: int = 1, ) -> Any: - """Build a diffusion stage client. - - Args: - model: Model name or path. - stage_cfg: Stage configuration. - metadata: Extracted stage metadata. - batch_size: Maximum number of requests to batch together in the - diffusion engine. Passed through to ``StageDiffusionClient`` - and ultimately to ``AsyncOmni``. 
- """ + """Build diffusion config for a stage.""" from vllm_omni.diffusion.data import OmniDiffusionConfig - from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient - od_config = OmniDiffusionConfig.from_kwargs( - model=model, - **_to_dict(stage_cfg.engine_args), - ) + engine_args_dict = build_engine_args_dict(stage_cfg, model) + od_config = OmniDiffusionConfig.from_kwargs(**engine_args_dict) + num_devices_per_stage = od_config.parallel_config.world_size device_control_env = current_omni_platform.device_control_env_var - visible_devices_str = os.environ.get(device_control_env) + visible_devices_str = os.environ.get(device_control_env) if device_control_env else None if visible_devices_str: physical_devices = [device.strip() for device in visible_devices_str.split(",") if device.strip()] else: @@ -485,6 +507,28 @@ def initialize_diffusion_stage( od_config.num_gpus = num_devices_per_stage if metadata.cfg_kv_collect_func is not None: od_config.cfg_kv_collect_func = metadata.cfg_kv_collect_func + return od_config + + +def initialize_diffusion_stage( + model: str, + stage_cfg: Any, + metadata: StageMetadata, + batch_size: int = 1, +) -> Any: + """Build a diffusion stage client. + + Args: + model: Model name or path. + stage_cfg: Stage configuration. + metadata: Extracted stage metadata. + batch_size: Maximum number of requests to batch together in the + diffusion engine. Passed through to ``StageDiffusionClient`` + and ultimately to ``AsyncOmni``. + """ + from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient + + od_config = build_diffusion_config(model, stage_cfg, metadata) return StageDiffusionClient(model, od_config, metadata, batch_size=batch_size) @@ -518,17 +562,18 @@ def _shutdown_or_close_resource(resource: Any, resource_name: str, stage_id: int def close_started_llm_stage(started: StartedLlmStage) -> None: - """Terminate the subprocess owned by a launched stage that never attached.""" - if started.proc is None: - return - try: - terminate_alive_proc(started.proc) - except Exception as cleanup_error: - logger.warning( - "[stage_init] Failed to terminate process for stage %s: %s", - started.stage_id, - cleanup_error, - ) + """Release resources owned by a launched stage that never attached.""" + if started.proc is not None: + try: + terminate_alive_proc(started.proc) + except Exception as cleanup_error: + logger.warning( + "[stage_init] Failed to terminate process for stage %s: %s", + started.stage_id, + cleanup_error, + ) + _shutdown_or_close_resource(started.engine_manager, "engine manager", started.stage_id) + _shutdown_or_close_resource(started.coordinator, "coordinator", started.stage_id) def finalize_initialized_stages( diff --git a/vllm_omni/entrypoints/cli/serve.py b/vllm_omni/entrypoints/cli/serve.py index b72df41cdd..6e9adc2461 100644 --- a/vllm_omni/entrypoints/cli/serve.py +++ b/vllm_omni/entrypoints/cli/serve.py @@ -8,6 +8,8 @@ import argparse import json import os +import signal +from types import FrameType from typing import Any import uvloop @@ -419,23 +421,229 @@ def _create_default_diffusion_stage_cfg(args: argparse.Namespace) -> list[dict[s def run_headless(args: argparse.Namespace) -> None: - """Run a single stage in headless mode. 
+ """Run a single stage in headless mode.""" + from vllm.v1.engine.coordinator import DPCoordinator + from vllm.v1.engine.utils import CoreEngineProcManager + from vllm.v1.executor.multiproc_executor import MultiprocExecutor + from vllm.version import __version__ as VLLM_VERSION + + from vllm_omni.diffusion.stage_diffusion_proc import ( + complete_diffusion_handshake, + spawn_diffusion_proc, + ) + from vllm_omni.distributed.omni_connectors.utils.initialization import resolve_omni_kv_config_for_stage + from vllm_omni.engine.stage_engine_startup import register_stage_with_omni_master + from vllm_omni.engine.stage_init_utils import ( + build_diffusion_config, + build_engine_args_dict, + build_vllm_config, + extract_stage_metadata, + get_stage_connector_spec, + inject_kv_stage_info, + load_omni_transfer_config_for_model, + prepare_engine_environment, + terminate_alive_proc, + ) + from vllm_omni.entrypoints.utils import inject_omni_kv_config, load_and_resolve_stage_configs + + model = args.model + stage_id: int | None = args.stage_id + omni_master_address: str | None = args.omni_master_address + omni_master_port: int | None = args.omni_master_port + + if stage_id is None: + raise ValueError("--stage-id is required in headless mode") + if omni_master_address is None or omni_master_port is None: + raise ValueError("--omni-master-address and --omni-master-port are required in headless mode") + if getattr(args, "api_server_count", 0) and args.api_server_count > 1: + raise ValueError("api_server_count can't be set in headless mode") + if args.worker_backend != "multi_process": + raise ValueError("headless mode requires worker_backend=multi_process") + + args_dict = vars(args).copy() + config_path, stage_configs = load_and_resolve_stage_configs( + model, + args_dict.get("stage_configs_path"), + args_dict, + ) + + # Locate the stage config that matches stage_id. + stage_cfg = None + for cfg in stage_configs: + if getattr(cfg, "stage_id", None) == stage_id: + stage_cfg = cfg + break + if stage_cfg is None: + raise ValueError( + f"No stage config found for stage_id={stage_id}. 
" + f"Available stage ids: {[getattr(c, 'stage_id', None) for c in stage_configs]}" + ) + + prepare_engine_environment() + omni_transfer_config = load_omni_transfer_config_for_model(model, config_path) + omni_conn_cfg, omni_from, omni_to = resolve_omni_kv_config_for_stage(omni_transfer_config, stage_id) + + if getattr(stage_cfg, "stage_type", "llm") == "diffusion": + metadata = extract_stage_metadata(stage_cfg) + if omni_conn_cfg: + inject_omni_kv_config(stage_cfg, omni_conn_cfg, omni_from, omni_to) + inject_kv_stage_info(stage_cfg, stage_id) + od_config = build_diffusion_config(model, stage_cfg, metadata) + + logger.info( + "[Headless] Launching diffusion stage %d via OmniMasterServer at %s:%d", + stage_id, + omni_master_address, + omni_master_port, + ) + + proc = None + try: + handshake_address, request_address, response_address = register_stage_with_omni_master( + omni_master_address=omni_master_address, + omni_master_port=omni_master_port, + omni_stage_id=stage_id, + omni_stage_config=stage_cfg, + return_addresses=True, + ) + proc, _, _, _ = spawn_diffusion_proc( + model, + od_config, + handshake_address=handshake_address, + request_address=request_address, + response_address=response_address, + ) + complete_diffusion_handshake(proc, handshake_address) + proc.join() + if proc.exitcode not in (None, 0): + raise RuntimeError(f"Diffusion stage {stage_id} exited with code {proc.exitcode}") + return + finally: + logger.info("[Headless] Shutting down stage %d.", stage_id) + if proc is not None and proc.is_alive(): + terminate_alive_proc(proc) + + stage_connector_spec = get_stage_connector_spec( + omni_transfer_config=omni_transfer_config, + stage_id=stage_id, + async_chunk=False, + ) - .. deprecated:: 0.x.x - Headless mode is deprecated and will be removed in a future version. - It is only compatible with the old OmniStage-based runtime. - The current AsyncOmniEngine-based runtime does not support headless mode. + # Device assignment is managed externally (e.g. CUDA_VISIBLE_DEVICES); + # runtime_cfg is intentionally ignored in headless mode. + engine_args_dict = build_engine_args_dict( + stage_cfg, + model, + stage_connector_spec=stage_connector_spec, + ) - Raises: - RuntimeError: Always raises an error indicating headless mode is deprecated. - """ - raise RuntimeError( - "Headless mode is deprecated and not supported in the current runtime. " - "Please use the standard orchestrator mode (without --headless flag). " - "If you need distributed deployment, consider using Ray backend or " - "other distributed serving solutions." + # Inject omni KV connector config so the engine runner can initialize the + # correct connector (sender/receiver role, type, addresses, etc.). 
+ if omni_conn_cfg: + omni_kv = engine_args_dict.get("omni_kv_config") or {} + if not isinstance(omni_kv, dict): + omni_kv = dict(omni_kv) + omni_kv["connector_config"] = omni_conn_cfg + omni_kv["omni_from_stage"] = omni_from + omni_kv["omni_to_stage"] = omni_to + omni_kv.setdefault("stage_id", stage_id) + engine_args_dict["omni_kv_config"] = omni_kv + + vllm_config, executor_class = build_vllm_config( + stage_cfg, + model, + stage_connector_spec=stage_connector_spec, + engine_args_dict=engine_args_dict, + headless=True, + ) + parallel_config = vllm_config.parallel_config + local_engine_count = parallel_config.data_parallel_size_local + + if local_engine_count <= 0: + raise ValueError("data_parallel_size_local must be > 0 in headless mode") + + shutdown_requested = False + + def signal_handler(signum: int, frame: FrameType | None) -> None: + nonlocal shutdown_requested + logger.debug("Received %d signal.", signum) + if not shutdown_requested: + shutdown_requested = True + raise SystemExit + + signal.signal(signal.SIGTERM, signal_handler) + signal.signal(signal.SIGINT, signal_handler) + + if parallel_config.node_rank_within_dp > 0: + head_node_address = f"{parallel_config.master_addr}:{parallel_config.master_port}" + logger.info( + "Launching vLLM-Omni (v%s) headless multiproc executor, " + "with head node address %s for torch.distributed process group.", + VLLM_VERSION, + head_node_address, + ) + + executor = MultiprocExecutor(vllm_config, monitor_workers=False) + executor.start_worker_monitor(inline=True) + return + + dp_rank = parallel_config.data_parallel_rank if parallel_config.data_parallel_rank is not None else 0 + coordinator = None + if vllm_config.needs_dp_coordinator and dp_rank == 0: + coordinator = DPCoordinator( + parallel_config, + enable_wave_coordination=vllm_config.model_config.is_moe, + ) + logger.info( + "[Headless] Started DP Coordinator process for stage %d (PID: %d)", + stage_id, + coordinator.proc.pid, + ) + + logger.info( + "[Headless] Launching %d engine core(s) for stage %d via OmniMasterServer at %s:%d", + local_engine_count, + stage_id, + omni_master_address, + omni_master_port, ) + # Headless mode launches all local engine cores for a single stage. + # The OmniMasterServer allocates one handshake endpoint per stage, so we + # register the stage once here and let every local engine core reuse the + # returned handshake address directly. 
+ handshake_address = register_stage_with_omni_master( + omni_master_address=omni_master_address, + omni_master_port=omni_master_port, + omni_stage_id=stage_id, + omni_stage_config=stage_cfg, + coordinator=coordinator, + ) + + engine_manager = None + log_stats = bool(getattr(args, "log_stats", False)) + if getattr(args, "disable_log_stats", False): + log_stats = False + + try: + engine_manager = CoreEngineProcManager( + local_engine_count=local_engine_count, + start_index=dp_rank, + local_start_index=0, + vllm_config=vllm_config, + local_client=False, + handshake_address=handshake_address, + executor_class=executor_class, + log_stats=log_stats, + ) + engine_manager.join_first() + finally: + logger.info("[Headless] Shutting down stage %d.", stage_id) + if engine_manager is not None: + engine_manager.shutdown() + if coordinator is not None: + coordinator.shutdown() + def cmd_init() -> list[CLISubcommand]: return [OmniServeCommand()] diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 0706b98987..4519ae8c0c 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -353,7 +353,8 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None, try: await shutdown_task finally: - serving_speech = getattr(getattr(app, "state", None), "openai_serving_speech", None) + state = getattr(app, "state", None) + serving_speech = getattr(state, "openai_serving_speech", None) if state is not None else None if serving_speech is not None: serving_speech.shutdown() sock.close() From c1da480bbf3d82a812a27c842e3b675aa7024788 Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:03:25 +0800 Subject: [PATCH 115/204] [CI] Update merge condition in upload_pipeline_with_skip_ci.sh to include 'merge-test' label for non-main branches (#2667) Signed-off-by: wangyu <410167048@qq.com> Co-authored-by: Hongsheng Liu --- .buildkite/test-nightly-diffusion.yml | 19 ++++++++++--------- .buildkite/test-nightly.yml | 15 ++++++++------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/.buildkite/test-nightly-diffusion.yml b/.buildkite/test-nightly-diffusion.yml index 742624e8b5..04b99c0a83 100644 --- a/.buildkite/test-nightly-diffusion.yml +++ b/.buildkite/test-nightly-diffusion.yml @@ -13,7 +13,8 @@ steps: steps: - label: ":full_moon: Diffusion · Other · Function Test with H100" timeout_in_minutes: 120 - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + # Shared nightly vs PR label conditional; referenced below as *nightly_or_pr_label + if: &nightly_or_pr_label 'build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"' commands: - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not test_qwen_image" -m "advanced_model and diffusion and H100" --run-level "advanced_model" agents: @@ -52,7 +53,7 @@ steps: - label: ":full_moon: Diffusion · Other · Function Test with L4" timeout_in_minutes: 60 - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" agents: @@ -71,7 +72,7 @@ steps: - label: ":full_moon: Diffusion · Other · Doc Test" timeout_in_minutes: 60 - if: build.env("NIGHTLY") == "1" || 
build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - pytest -s -v tests/examples/online_serving/test_text_to_image.py tests/examples/offline_inference/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model" @@ -114,7 +115,7 @@ steps: steps: - label: ":full_moon: Diffusion · Wan · Function Test" timeout_in_minutes: 90 - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py -m "advanced_model" --run-level "advanced_model" agents: @@ -154,7 +155,7 @@ steps: - label: ":full_moon: Diffusion · Wan · Accuracy Test" key: nightly-wan22-i2v-accuracy timeout_in_minutes: 180 - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level advanced_model agents: @@ -196,7 +197,7 @@ steps: steps: - label: ":full_moon: Diffusion · Qwen-Image · Function Test with H100" timeout_in_minutes: 120 - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - pytest -s -v tests/e2e/online_serving/test_qwen_image*_expansion.py -m "advanced_model and diffusion and H100" --run-level "advanced_model" agents: @@ -236,7 +237,7 @@ steps: - label: ":full_moon: Diffusion · Qwen-Image · GEBench Accuracy Test" key: nightly-gebench-accuracy timeout_in_minutes: 60 - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1 - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json" @@ -277,7 +278,7 @@ steps: - label: ":full_moon: Diffusion · Qwen-Image · GEdit-Bench Accuracy Test" key: nightly-gedit-bench-accuracy timeout_in_minutes: 60 - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1 - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv" @@ -321,7 +322,7 @@ steps: - label: ":full_moon: Diffusion · Qwen-Image · Perf Test" key: nightly-qwen-image-performance timeout_in_minutes: 180 - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results - export CACHE_DIT_VERSION=1.3.0 diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 0d1c8eaccf..06b7c14ae1 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -11,7 +11,8 @@ steps: - label: ":full_moon: Omni · Function Test with H100" timeout_in_minutes: 90 depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || 
build.pull_request.labels includes "nightly-test" + # Shared nightly vs PR label conditional; referenced below as *nightly_or_pr_label + if: &nightly_or_pr_label 'build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"' commands: - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and H100 and omni" --run-level "advanced_model" agents: @@ -51,7 +52,7 @@ steps: - label: ":full_moon: Omni · Function Test with L4" timeout_in_minutes: 90 depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model" @@ -72,7 +73,7 @@ steps: - label: ":full_moon: Omni · Doc Test with L4" timeout_in_minutes: 90 depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/examples/ -m "advanced_model and omni and L4" --run-level "advanced_model" @@ -93,7 +94,7 @@ steps: - label: ":full_moon: Omni · Doc Test with H100" timeout_in_minutes: 90 depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - pytest -s -v tests/examples/ -m "advanced_model and omni and H100" --run-level "advanced_model" agents: @@ -134,7 +135,7 @@ steps: key: nightly-omni-performance timeout_in_minutes: 180 depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - export BENCHMARK_DIR=tests/dfx/perf/results - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" @@ -181,7 +182,7 @@ steps: - label: ":card_index_dividers: Diffusion Model Test" key: nightly-diffusion-model-test depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - buildkite-agent pipeline upload .buildkite/test-nightly-diffusion.yml agents: @@ -191,7 +192,7 @@ steps: key: nightly-testcase-statistics timeout_in_minutes: 120 depends_on: upload-nightly-pipeline - if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" + if: *nightly_or_pr_label commands: - python tools/nightly/buildkite_testcase_statistics.py -o tests/dfx/perf/results/buildkite_testcase_statistics.html - buildkite-agent artifact upload "tests/dfx/perf/results/*.html" From c2ae58bb84ce56d55f2d9ce3fb62af1fd6519362 Mon Sep 17 00:00:00 2001 From: fan2956 Date: Fri, 10 Apr 2026 16:18:27 +0800 Subject: [PATCH 116/204] [Bugfix] fix mindiesd laserattention unsupported error (#2673) Signed-off-by: fan2956 --- vllm_omni/platforms/npu/platform.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm_omni/platforms/npu/platform.py b/vllm_omni/platforms/npu/platform.py index c40dd6fea1..53ffe6775a 100644 --- a/vllm_omni/platforms/npu/platform.py +++ b/vllm_omni/platforms/npu/platform.py @@ -69,6 +69,9 @@ def get_diffusion_attn_backend_cls( # Try FLASH_ATTN if mindiesd is available, otherwise fall back to SDPA if find_spec("mindiesd"): + # Configure ASCEND_CUSTOM_OPP_PATH for mindiesd custom ops upon import + import mindiesd # noqa: F401 + logger.info("Defaulting to diffusion attention backend FLASH_ATTN") return 
DiffusionAttentionBackendEnum.FLASH_ATTN.get_path() From fbb5dd57949085c8353ca4d5ffefbc0e73d32c25 Mon Sep 17 00:00:00 2001 From: bjf-frz Date: Fri, 10 Apr 2026 16:48:06 +0800 Subject: [PATCH 117/204] [Bugfix]: modify diffusion pipeline profiler result in videos (#2647) Signed-off-by: bjf-frz --- benchmarks/diffusion/backends.py | 2 + .../openai_api/test_video_server.py | 56 +++++++++++- .../test_async_omni_diffusion_config.py | 21 +++++ vllm_omni/entrypoints/openai/api_server.py | 6 +- .../entrypoints/openai/protocol/videos.py | 16 ++++ vllm_omni/entrypoints/openai/serving_video.py | 87 +++++++++++++------ 6 files changed, 158 insertions(+), 30 deletions(-) diff --git a/benchmarks/diffusion/backends.py b/benchmarks/diffusion/backends.py index fa53f87aed..13ce7c8309 100644 --- a/benchmarks/diffusion/backends.py +++ b/benchmarks/diffusion/backends.py @@ -306,6 +306,8 @@ async def async_request_v1_videos( video_bytes = await content_response.read() output.response_body = video_bytes output.success = True + if "stage_durations" in poll_json: + output.stage_durations = poll_json["stage_durations"] or {} if "peak_memory_mb" in poll_json: output.peak_memory_mb = poll_json["peak_memory_mb"] elif "peak_memory_mb" in resp_json: diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py index 7200b38abb..0fdee7a77a 100644 --- a/tests/entrypoints/openai_api/test_video_server.py +++ b/tests/entrypoints/openai_api/test_video_server.py @@ -34,12 +34,14 @@ class MockVideoResult: - def __init__(self, videos, audios=None, sample_rate=None): + def __init__(self, videos, audios=None, sample_rate=None, stage_durations=None, peak_memory_mb=0.0): self.multimodal_output = {"video": videos} if audios is not None: self.multimodal_output["audio"] = audios if sample_rate is not None: self.multimodal_output["audio_sample_rate"] = sample_rate + self.stage_durations = stage_durations or {} + self.peak_memory_mb = peak_memory_mb class FakeAsyncOmni: @@ -371,6 +373,33 @@ async def _generate(prompt, request_id, sampling_params_list): assert audio_sample_rates == [16000] +def test_video_job_persists_profiler_metadata(test_client, mocker: MockerFixture): + engine = test_client.app.state.openai_serving_video._engine_client + + async def _generate(prompt, request_id, sampling_params_list): + engine.captured_prompt = prompt + engine.captured_sampling_params_list = sampling_params_list + yield MockVideoResult( + [object()], + stage_durations={"diffuse": 2.5, "vae.decode": 0.3}, + peak_memory_mb=4096.5, + ) + + engine.generate = _generate + mocker.patch( + "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", + return_value="Zg==", + ) + + response = test_client.post("/v1/videos", data={"prompt": "profile me"}) + assert response.status_code == 200 + video_id = response.json()["id"] + completed = _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value) + + assert completed["stage_durations"] == {"diffuse": 2.5, "vae.decode": 0.3} + assert completed["peak_memory_mb"] == 4096.5 + + def test_missing_handler_returns_503(): app = FastAPI() app.include_router(router) @@ -770,6 +799,31 @@ def test_sync_t2v_returns_video_bytes(test_client, mocker: MockerFixture): assert response.headers["x-request-id"].startswith("video_sync-") assert response.headers["x-model"] == "Wan-AI/Wan2.2-T2V-A14B-Diffusers" assert float(response.headers["x-inference-time-s"]) >= 0 + assert json.loads(response.headers["x-stage-durations"]) == {} + assert 
float(response.headers["x-peak-memory-mb"]) == 0.0 + + +def test_sync_t2v_returns_profiler_headers(test_client, mocker: MockerFixture): + engine = test_client.app.state.openai_serving_video._engine_client + + async def _generate(prompt, request_id, sampling_params_list): + engine.captured_prompt = prompt + engine.captured_sampling_params_list = sampling_params_list + yield MockVideoResult( + [object()], + stage_durations={"diffuse": 1.75}, + peak_memory_mb=1234.25, + ) + + engine.generate = _generate + _mock_encode_video_bytes(mocker, b"profiled-video") + + response = test_client.post("/v1/videos/sync", data={"prompt": "sync profile"}) + + assert response.status_code == 200 + assert response.content == b"profiled-video" + assert json.loads(response.headers["x-stage-durations"]) == {"diffuse": 1.75} + assert float(response.headers["x-peak-memory-mb"]) == pytest.approx(1234.25, rel=0, abs=1e-3) def test_sync_i2v_returns_video_bytes(test_client, mocker: MockerFixture): diff --git a/tests/entrypoints/test_async_omni_diffusion_config.py b/tests/entrypoints/test_async_omni_diffusion_config.py index ca5624f2d4..a55eaf05b9 100644 --- a/tests/entrypoints/test_async_omni_diffusion_config.py +++ b/tests/entrypoints/test_async_omni_diffusion_config.py @@ -93,3 +93,24 @@ def test_serve_cli_accepts_ulysses_mode(): assert args.ulysses_mode == "advanced_uaa" assert parallel_config.ulysses_degree == 4 assert parallel_config.ulysses_mode == "advanced_uaa" + + +def test_serve_cli_accepts_diffusion_pipeline_profiler_flag(): + """Ensure diffusion serve CLI exposes the profiler switch.""" + parser = FlexibleArgumentParser() + subparsers = parser.add_subparsers(dest="command") + OmniServeCommand().subparser_init(subparsers) + + args = parser.parse_args( + [ + "serve", + "Wan-AI/Wan2.2-T2V-A14B-Diffusers", + "--omni", + "--enable-diffusion-pipeline-profiler", + ] + ) + + stage_cfg = _create_default_diffusion_stage_cfg(args)[0] + + assert args.enable_diffusion_pipeline_profiler is True + assert stage_cfg["engine_args"]["enable_diffusion_pipeline_profiler"] is True diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 4519ae8c0c..defaa9822c 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -2009,6 +2009,8 @@ async def _run_video_generation_job( "file_name": file_name, "completed_at": int(time.time()), "inference_time_s": time.perf_counter() - started_at, + "stage_durations": response.stage_durations, + "peak_memory_mb": response.peak_memory_mb, }, ) except Exception as exc: @@ -2182,7 +2184,7 @@ async def create_video_sync( request_id = f"video_sync-{random_uuid()}" started_at = time.perf_counter() try: - video_bytes = await asyncio.wait_for( + video_bytes, stage_durations, peak_memory_mb = await asyncio.wait_for( handler.generate_video_bytes(request, request_id, reference_image=reference_image), timeout=VIDEO_SYNC_TIMEOUT_S, ) @@ -2208,6 +2210,8 @@ async def create_video_sync( "X-Request-Id": request_id, "X-Model": effective_model_name, "X-Inference-Time-S": f"{inference_time_s:.3f}", + "X-Stage-Durations": json.dumps(stage_durations, separators=(",", ":")), + "X-Peak-Memory-MB": f"{peak_memory_mb:.3f}", }, ) diff --git a/vllm_omni/entrypoints/openai/protocol/videos.py b/vllm_omni/entrypoints/openai/protocol/videos.py index e180bef229..de5362dd97 100644 --- a/vllm_omni/entrypoints/openai/protocol/videos.py +++ b/vllm_omni/entrypoints/openai/protocol/videos.py @@ -201,6 +201,14 @@ class 
VideoGenerationResponse(BaseModel): created: int = Field(..., description="Unix timestamp of when the generation completed") data: list[VideoData] = Field(..., description="Array of generated videos") + stage_durations: dict[str, float] = Field( + default_factory=dict, + description="Profiler stage durations reported by the diffusion pipeline.", + ) + peak_memory_mb: float = Field( + default=0.0, + description="Peak device memory usage in MB reported by the diffusion pipeline.", + ) class VideoError(BaseModel): @@ -250,6 +258,14 @@ class VideoResponse(BaseModel): description="Filename of the saved output video files for this job.", ) inference_time_s: float | None = Field(default=None, description="End-to-end inference time in seconds.") + stage_durations: dict[str, float] = Field( + default_factory=dict, + description="Profiler stage durations reported by the diffusion pipeline.", + ) + peak_memory_mb: float = Field( + default=0.0, + description="Peak device memory usage in MB reported by the diffusion pipeline.", + ) @property def file_extension(self) -> str: diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py index bddfd48003..3e05a1eedd 100644 --- a/vllm_omni/entrypoints/openai/serving_video.py +++ b/vllm_omni/entrypoints/openai/serving_video.py @@ -33,6 +33,18 @@ class ReferenceImage: data: Image.Image +@dataclass +class VideoGenerationArtifacts: + """Normalized outputs and profiler metadata extracted from one request.""" + + videos: list[Any] + audios: list[Any | None] + audio_sample_rate: int + output_fps: int + stage_durations: dict[str, float] + peak_memory_mb: float + + class OmniOpenAIServingVideo: """OpenAI-style video generation handler for omni diffusion models.""" @@ -77,12 +89,8 @@ async def _run_and_extract( reference_id: str, *, reference_image: ReferenceImage | None = None, - ) -> tuple[list[Any], list[Any | None], int, int]: - """Run the generation pipeline and extract video/audio outputs. - - Returns: - Tuple of (videos, audios, audio_sample_rate, output_fps). 
- """ + ) -> VideoGenerationArtifacts: + """Run the generation pipeline and extract video/audio/profiler outputs.""" prompt: OmniTextPrompt = OmniTextPrompt(prompt=request.prompt) if request.negative_prompt is not None: prompt["negative_prompt"] = request.negative_prompt @@ -153,7 +161,14 @@ async def _run_and_extract( audios = self._extract_audio_outputs(result, expected_count=len(videos)) audio_sample_rate = self._resolve_audio_sample_rate(result) output_fps = vp.fps or self._resolve_fps(result) or 24 - return videos, audios, audio_sample_rate, output_fps + return VideoGenerationArtifacts( + videos=videos, + audios=audios, + audio_sample_rate=audio_sample_rate, + output_fps=output_fps, + stage_durations=self._extract_stage_durations(result), + peak_memory_mb=self._extract_peak_memory_mb(result), + ) async def generate_videos( self, @@ -162,28 +177,31 @@ async def generate_videos( *, reference_image: ReferenceImage | None = None, ) -> VideoGenerationResponse: - videos, audios, audio_sample_rate, output_fps = await self._run_and_extract( - request, reference_id, reference_image=reference_image - ) + artifacts = await self._run_and_extract(request, reference_id, reference_image=reference_image) _t_encode_start = time.perf_counter() video_data = [ VideoData( b64_json=( - encode_video_base64(video, fps=output_fps) - if audios[idx] is None + encode_video_base64(video, fps=artifacts.output_fps) + if artifacts.audios[idx] is None else encode_video_base64( video, - fps=output_fps, - audio=audios[idx], - audio_sample_rate=audio_sample_rate, + fps=artifacts.output_fps, + audio=artifacts.audios[idx], + audio_sample_rate=artifacts.audio_sample_rate, ) ) ) - for idx, video in enumerate(videos) + for idx, video in enumerate(artifacts.videos) ] _t_encode_ms = (time.perf_counter() - _t_encode_start) * 1000 logger.info("Video response encoding (MP4+base64): %.2f ms", _t_encode_ms) - return VideoGenerationResponse(created=int(time.time()), data=video_data) + return VideoGenerationResponse( + created=int(time.time()), + data=video_data, + stage_durations=artifacts.stage_durations, + peak_memory_mb=artifacts.peak_memory_mb, + ) async def generate_video_bytes( self, @@ -191,25 +209,25 @@ async def generate_video_bytes( reference_id: str, *, reference_image: ReferenceImage | None = None, - ) -> bytes: + ) -> tuple[bytes, dict[str, float], float]: """Generate a video and return raw MP4 bytes, bypassing base64 encoding.""" - videos, audios, audio_sample_rate, output_fps = await self._run_and_extract( - request, reference_id, reference_image=reference_image - ) - if len(videos) > 1: + artifacts = await self._run_and_extract(request, reference_id, reference_image=reference_image) + if len(artifacts.videos) > 1: logger.warning( - "Video request %s generated %d outputs; returning only the first.", reference_id, len(videos) + "Video request %s generated %d outputs; returning only the first.", + reference_id, + len(artifacts.videos), ) - audio = audios[0] + audio = artifacts.audios[0] _t_encode_start = time.perf_counter() video_bytes = _encode_video_bytes( - videos[0], - fps=output_fps, - **({"audio": audio, "audio_sample_rate": audio_sample_rate} if audio is not None else {}), + artifacts.videos[0], + fps=artifacts.output_fps, + **({"audio": audio, "audio_sample_rate": artifacts.audio_sample_rate} if audio is not None else {}), ) _t_encode_ms = (time.perf_counter() - _t_encode_start) * 1000 logger.info("Video response encoding (MP4 bytes): %.2f ms", _t_encode_ms) - return video_bytes + return video_bytes, 
artifacts.stage_durations, artifacts.peak_memory_mb @staticmethod def _apply_lora(lora_body: Any, gen_params: OmniDiffusionSamplingParams) -> None: @@ -483,3 +501,16 @@ def _coerce_audio_sample_rate(value: Any) -> int | None: return None return sample_rate if sample_rate > 0 else None + + @staticmethod + def _extract_stage_durations(result: Any) -> dict[str, float]: + stage_durations = getattr(result, "stage_durations", None) + return stage_durations if isinstance(stage_durations, dict) else {} + + @staticmethod + def _extract_peak_memory_mb(result: Any) -> float: + peak_memory_mb = getattr(result, "peak_memory_mb", 0.0) + try: + return float(peak_memory_mb or 0.0) + except (TypeError, ValueError): + return 0.0 From 78bef62f8260fc9be6ec25de819bdbce9826f7e9 Mon Sep 17 00:00:00 2001 From: Jinheng Date: Fri, 10 Apr 2026 18:13:12 +0800 Subject: [PATCH 118/204] [Profiler] Add Nsight Systems support for serving (#1098) Signed-off-by: Jinheng Li Signed-off-by: Canlin Guo <961750412@qq.com> Co-authored-by: Claude Opus 4.5 Co-authored-by: Canlin Guo <961750412@qq.com> --- docs/contributing/profiling.md | 268 ++++++++---------- .../test_diffusion_worker_cuda_profiler.py | 103 +++++++ vllm_omni/diffusion/diffusion_engine.py | 8 +- .../diffusion/worker/diffusion_worker.py | 69 +++-- 4 files changed, 270 insertions(+), 178 deletions(-) create mode 100644 tests/diffusion/test_diffusion_worker_cuda_profiler.py diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 7a2e64f131..418fb707ae 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -1,216 +1,192 @@ # Profiling vLLM-Omni -> **Warning:** Profiling incurs significant overhead. Use only for development and debugging, never in production. +> **Warning:** Profiling is for development and debugging only. It adds significant overhead and should not be enabled in production. -vLLM-Omni uses the PyTorch Profiler to analyze performance across both **multi-stage omni-modality models** and **diffusion models**. +vLLM-Omni supports two profiler backends through `profiler_config`: -### 1. Configure Profiling in the Stage YAML +- `torch`: detailed CPU/CUDA traces written to `torch_profiler_dir` +- `cuda`: low-overhead CUDA range control for NVIDIA Nsight Systems (`nsys`) -Enable profiling by adding `profiler_config` under `engine_args` for the stage(s) you want to profile in your stage config YAML: +## 1. Configure Profiling + +Use the same `profiler_config` shape everywhere: + +```yaml +profiler_config: + profiler: torch + torch_profiler_dir: ./perf +``` + +Supported fields: + +| Field | Description | +|---|---| +| `profiler` | Profiler backend. Supported values: `torch`, `cuda`. | +| `torch_profiler_dir` | Output directory for torch traces. Required when `profiler: torch`. | +| `delay_iterations` | Number of worker iterations to skip before profiling starts. | +| `max_iterations` | Maximum number of worker iterations to capture before auto-stop. | +| `warmup_iterations` | Torch-profiler warmup iterations. | +| `active_iterations` | Torch-profiler active iterations. | +| `wait_iterations` | Torch-profiler wait iterations before warmup. | + +For multi-stage omni pipelines, put `profiler_config` under the target stage's `engine_args`. ```yaml stage_args: - stage_id: 0 stage_type: llm engine_args: - # ... other engine args ... profiler_config: profiler: torch torch_profiler_dir: ./perf ``` -| Field | Description | -|---|---| -| `profiler` | Profiler backend to use. Currently supports `torch`. 
| -| `torch_profiler_dir` | Directory where trace files are saved. Created automatically if it doesn't exist. | - -> **Tip:** Only enable `profiler_config` on stages you actually need to profile. Stages without it will not start a profiler, keeping overhead minimal. - -### 2. Profiling Omni-Modality Models +For single-stage diffusion usage, pass `profiler_config` directly to `Omni(...)` or `vllm serve`. -**Selective Stage Profiling** +## 2. Profiling Omni Pipelines -It is highly recommended to profile specific stages to prevent producing overly large trace files: +It is usually best to profile only the stages you need. ```python -# Profile all stages -omni_llm.start_profile() +# Profile all stages. +omni.start_profile() -# Only profile Stage 1 -omni_llm.start_profile(stages=[1]) - -# Stage 0 (Thinker) and Stage 2 (Audio Decoder) for qwen omni -omni_llm.start_profile(stages=[0, 2]) +# Profile selected stages only. +omni.start_profile(stages=[0, 2]) +... +omni.stop_profile(stages=[0, 2]) ``` -> **Important:** Always pass the same `stages` list to both `start_profile()` and `stop_profile()`. If you omit `stages` from `stop_profile()`, it defaults to stopping all stages — including ones that were never started — which will produce errors. - -**Python Usage**: Wrap your generation logic with `start_profile()` and `stop_profile()`. +Always stop the same stage set that you started. If only some stages have `profiler_config`, pass an explicit `stages=[...]` list instead of relying on the default "all stages" behavior. -```python -profiler_stages = [0] # Only profile the stages you need +Examples: -# 1. Start profiling -omni.start_profile(stages=profiler_stages) +1. [Qwen2.5-Omni end2end](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py) +2. [Qwen3-Omni end2end](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py) -# Initialize generator -omni_generator = omni.generate(prompts, sampling_params_list, py_generator=args.py_generator) +## 3. Profiling Single-Stage Diffusion -total_requests = len(prompts) -processed_count = 0 +Single-stage diffusion models use the same `start_profile()` / `stop_profile()` controls, but you must provide `profiler_config` explicitly. -# Main Processing Loop -for stage_outputs in omni_generator: +### PyTorch profiler - # ... [Output processing logic for text/audio would go here] ... +```python +from vllm_omni import Omni + +omni = Omni( + model="Wan-AI/Wan2.2-I2V-A14B-Diffusers", + profiler_config={ + "profiler": "torch", + "torch_profiler_dir": "./perf", + }, +) + +omni.start_profile() +... +omni.stop_profile() +``` - # Update count to track when to stop profiling - processed_count += len(stage_outputs.request_output) +### Nsight Systems (`nsys`) - # 2. Check if all requests are done to stop the profiler safely - if profiler_enabled and processed_count >= total_requests: - print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...") +For Nsight Systems, use `profiler: cuda` and wrap the process with `nsys profile`. - # Stop the profiler while workers are still active - # Pass the same stages list used in start_profile() - omni_llm.stop_profile(stages=profiler_stages) +```bash +nsys profile \ + --trace-fork-before-exec=true \ + --cuda-graph-trace=node \ + --capture-range=cudaProfilerApi \ + --capture-range-end=repeat \ + -o diffusion_trace \ + python image_to_video.py ... 
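+
+# In the command above, --capture-range=cudaProfilerApi defers GPU capture until the
+# cudaProfilerStart() call issued by start_profile(), and --capture-range-end=repeat
+# re-arms the capture range for every subsequent start_profile()/stop_profile() pair.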
+``` - # Wait for traces to flush to disk - print("[Info] Waiting 30s for workers to write trace files to disk...") - time.sleep(30) - print("[Info] Trace export wait time finished.") +The Python process being profiled must create the diffusion engine with: -omni_llm.close() +```python +profiler_config={"profiler": "cuda"} ``` +Then call `start_profile()` before the requests you want to capture and `stop_profile()` after them. The diffusion worker processes open and close the CUDA capture range themselves, so `nsys` sees the actual GPU work instead of only the parent process. -**CLI Usage** (using `end2end.py`): -```bash -# Profile only Stage 0 (Thinker) -python end2end.py --output-wav output_audio \ - --query-type text --enable-profiler --profiler-stages 0 +Examples: -# Profile Stage 0 and Stage 2 -python end2end.py --output-wav output_audio \ - --query-type text --enable-profiler --profiler-stages 0 2 +1. [Image edit example](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py) +2. [Image to video example](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video) -# Profile all stages (omit --profiler-stages) -python end2end.py --output-wav output_audio \ - --query-type text --enable-profiler -``` +## 4. Profiling Online Serving -**Examples**: +When any stage has `profiler_config.profiler` set, the server exposes: -1. **Qwen2.5-Omni**: [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen2_5_omni/end2end.py) +- `POST /start_profile` +- `POST /stop_profile` -2. **Qwen3-Omni**: [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/qwen3_omni/end2end.py) +### Start the server -### 3. Profiling diffusion models +Multi-stage omni serving: -Diffusion profiling is End-to-End, capturing encoding, denoising loops, and decoding. Standalone diffusion scripts use `--profiler-dir` to enable profiling. - -**CLI Usage:** ```bash -python image_to_video.py \ - --model Wan-AI/Wan2.2-I2V-A14B-Diffusers \ - --image qwen-bear.png \ - --prompt "A cat playing with yarn, smooth motion" \ - --profiler-dir \ - \ - # Minimize Spatial Dimensions (Optional but helpful): - # Drastically reduces memory usage so the profiler doesn't - # crash due to overhead, though for accurate performance - # tuning you often want target resolutions. - --height 48 \ - --width 64 \ - \ - # Minimize Temporal Dimension (Frames): - # Video models process 3D tensors (Time, Height, Width). - # Reducing frames to the absolute minimum (2) keeps the - # tensor size small, ensuring the trace file doesn't become - # multi-gigabytes in size. - --num-frames 2 \ - \ - # Minimize Iteration Loop (Steps): - # This is the most critical setting for profiling. - # Diffusion models run the same loop X times. - # Profiling 2 steps gives you the exact same performance - # data as 50 steps, but saves minutes of runtime and - # prevents the trace viewer from freezing. 
- --num-inference-steps 2 \ - \ - --guidance-scale 5.0 \ - --guidance-scale-high 6.0 \ - --boundary-ratio 0.875 \ - --flow-shift 12.0 \ - --fps 16 \ - --output i2v_output.mp4 +vllm serve Qwen/Qwen2.5-Omni-7B \ + --omni \ + --stage-configs-path qwen2_5_omni.yaml \ + --port 8091 ``` -> **Note:** For diffusion stages within a multi-stage omni pipeline, use `profiler_config` in the stage YAML instead (see Section 1). - -**Examples**: - -1. **Qwen image edit**: [https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py](https://github.com/vllm-project/vllm-omni/blob/main/examples/offline_inference/image_to_image/image_edit.py) - -2. **Wan-AI/Wan2.2-I2V-A14B-Diffusers**: [https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video](https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/image_to_video) - -### 4. Profiling Online Serving - -When `profiler_config` is set in the stage YAML, the server automatically exposes `/start_profile` and `/stop_profile` HTTP endpoints. +Single-stage diffusion serving with torch profiler: -**1. Start the server** with a stage YAML that has `profiler_config` enabled: ```bash -vllm serve Qwen/Qwen2.5-Omni-7B \ - --omni \ - --stage-configs-path qwen2_5_omni.yaml \ - --port 8091 +vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers \ + --omni \ + --port 8091 \ + --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}' ``` -Or for one stage diffusion models: +Single-stage diffusion serving with Nsight Systems: ```bash -vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers --omni --port 8091 --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}' +nsys profile \ + --trace-fork-before-exec=true \ + --cuda-graph-trace=node \ + --capture-range=cudaProfilerApi \ + --capture-range-end=repeat \ + -o serving_trace \ + vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers \ + --omni \ + --port 8091 \ + --profiler-config '{"profiler": "cuda"}' ``` -**2. Start profiling** by sending a POST request: +### Control capture + ```bash -# Profile all stages that have profiler_config set +# Start profiling on all profiled stages. curl -X POST http://localhost:8091/start_profile -# Profile specific stages only +# Start profiling on selected stages. curl -X POST http://localhost:8091/start_profile \ - -H "Content-Type: application/json" \ - -d '{"stages": [0]}' -``` + -H "Content-Type: application/json" \ + -d '{"stages": [0]}' -**3. Send your inference requests** as normal while the profiler is running. - -**4. Stop profiling** and collect traces: -```bash -# Stop all stages +# Stop profiling. curl -X POST http://localhost:8091/stop_profile - -# Stop specific stages (must match the stages you started) -curl -X POST http://localhost:8091/stop_profile \ - -H "Content-Type: application/json" \ - -d '{"stages": [0]}' ``` -Trace files are written to the `torch_profiler_dir` specified in your stage YAML. +For mixed-stage pipelines, use explicit `stages` and pass the same stage list to both endpoints. + +## 5. Analyze Results -> **Important:** Always stop the same stages you started. Stopping a stage that was never started will produce errors. +Torch profiler output: -### 5. Analyzing Traces +- Chrome/Perfetto traces under `torch_profiler_dir` +- Optional aggregated CUDA-time tables under the same directory -Output files are saved to the `torch_profiler_dir` specified in your stage YAML config. 
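+As a quick check that traces were actually written (this assumes the `torch_profiler_dir: ./perf` example from section 1 and the `-o diffusion_trace` report name used above; adjust both to your own run):
+
+```bash
+# Torch traces land in torch_profiler_dir; open any of them in https://ui.perfetto.dev/
+ls ./perf/
+
+# Nsight Systems: print CLI summary tables from a captured report
+nsys stats diffusion_trace.nsys-rep
+```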
+CUDA profiler / Nsight Systems output: -**Output** -**Chrome Trace** (`.json.gz`): Visual timeline of kernels and stages. Open in Perfetto UI. +- `.nsys-rep` report files written by `nsys -o ...` -**Viewing Tools:** +Recommended viewers: -- [Perfetto](https://ui.perfetto.dev/) (recommended) -- `chrome://tracing` (Chrome only) +- [Perfetto](https://ui.perfetto.dev/) for torch traces +- `nsys stats .nsys-rep` for CLI summaries +- Nsight Systems GUI for CUDA kernel timelines -**Note**: vLLM-Omni reuses the PyTorch Profiler infrastructure from vLLM. See the official vLLM profiler documentation: [vLLM Profiling Guide](https://docs.vllm.ai/en/stable/contributing/profiling/) +vLLM-Omni reuses the vLLM profiling infrastructure where possible. For the upstream reference, see the [vLLM profiling guide](https://docs.vllm.ai/en/stable/contributing/profiling/). diff --git a/tests/diffusion/test_diffusion_worker_cuda_profiler.py b/tests/diffusion/test_diffusion_worker_cuda_profiler.py new file mode 100644 index 0000000000..ddc2aed2fc --- /dev/null +++ b/tests/diffusion/test_diffusion_worker_cuda_profiler.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import MagicMock + +import pytest +from pytest_mock import MockerFixture + +from vllm_omni.diffusion.worker.diffusion_worker import DiffusionWorker + +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] + + +@pytest.fixture +def mock_od_config(mocker: MockerFixture): + """Create a mock OmniDiffusionConfig with a CUDA profiler backend.""" + config = mocker.Mock() + config.profiler_config = mocker.Mock() + config.profiler_config.profiler = "cuda" + config.diffusion_load_format = "default" + return config + + +@pytest.fixture +def mock_diffusion_worker_dependencies(mocker: MockerFixture): + """Patch heavy worker dependencies for focused profiler tests.""" + mocker.patch.object(DiffusionWorker, "init_device") + mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.DiffusionModelRunner") + + +class TestDiffusionWorkerCudaProfiler: + def test_creates_cuda_profiler_wrapper( + self, + mocker: MockerFixture, + mock_od_config, + mock_diffusion_worker_dependencies, + ): + fake_profiler = mocker.Mock() + cuda_profiler = mocker.patch( + "vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper", + return_value=fake_profiler, + ) + create_omni_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.create_omni_profiler") + + worker = DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True) + + cuda_profiler.assert_called_once_with(mock_od_config.profiler_config) + create_omni_profiler.assert_not_called() + assert worker.profiler is fake_profiler + + def test_profile_start_stop_delegates_to_cuda_profiler( + self, + mocker: MockerFixture, + mock_od_config, + mock_diffusion_worker_dependencies, + ): + fake_profiler = mocker.Mock() + fake_profiler.start = MagicMock() + fake_profiler.stop = MagicMock() + mocker.patch( + "vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper", + return_value=fake_profiler, + ) + + worker = DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True) + + assert worker.profile(is_start=True) is None + assert worker.profile(is_start=False) is None + + fake_profiler.start.assert_called_once_with() + fake_profiler.stop.assert_called_once_with() + + def test_returns_none_when_profiler_config_is_missing( + self, + mocker: 
MockerFixture, + mock_od_config, + mock_diffusion_worker_dependencies, + ): + mock_od_config.profiler_config = None + cuda_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper") + create_omni_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.create_omni_profiler") + + worker = DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True) + + cuda_profiler.assert_not_called() + create_omni_profiler.assert_not_called() + assert worker.profiler is None + + def test_cuda_backend_does_not_use_torch_profiler_factory( + self, + mocker: MockerFixture, + mock_od_config, + mock_diffusion_worker_dependencies, + ): + mocker.patch( + "vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper", + return_value=mocker.Mock(), + ) + create_omni_profiler = mocker.patch("vllm_omni.diffusion.worker.diffusion_worker.create_omni_profiler") + + DiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config, skip_load_model=True) + + create_omni_profiler.assert_not_called() diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index 422ef479b0..52a8f38547 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -361,15 +361,11 @@ def add_req_and_wait_for_response(self, request: OmniDiffusionRequest) -> Diffus ) def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> None: - """Start or stop torch profiling on all diffusion workers. + """Start or stop profiling on all diffusion workers. Args: is_start: True to start profiling, False to stop. - profile_prefix: Optional prefix for trace filename (vLLM compat). - - Note: - Matches vLLM's worker.profile() signature for consistency. - Traces are saved automatically via on_trace_ready callback. + profile_prefix: Optional prefix for trace filename. 
""" if is_start: if profile_prefix is None: diff --git a/vllm_omni/diffusion/worker/diffusion_worker.py b/vllm_omni/diffusion/worker/diffusion_worker.py index ea4b9d96f7..160309e0d8 100644 --- a/vllm_omni/diffusion/worker/diffusion_worker.py +++ b/vllm_omni/diffusion/worker/diffusion_worker.py @@ -20,6 +20,7 @@ from vllm.config import CompilationConfig, DeviceConfig, VllmConfig, set_current_vllm_config from vllm.distributed.device_communicators.shm_broadcast import MessageQueue from vllm.logger import init_logger +from vllm.profiler.wrapper import CudaProfilerWrapper, WorkerProfiler from vllm.utils.import_utils import resolve_obj_by_qualname from vllm.utils.mem_utils import GiB_bytes from vllm.v1.worker.workspace import init_workspace_manager @@ -83,15 +84,7 @@ def __init__( od_config=self.od_config, device=self.device, ) - # Initialize profiler if configured - self.profiler: OmniTorchProfilerWrapper | None = None - profiler_config = self.od_config.profiler_config - if profiler_config and profiler_config.profiler == "torch": - self.profiler = create_omni_profiler( - profiler_config=profiler_config, - worker_name=f"diffusion_worker_{self.rank}", - local_rank=self.local_rank, - ) + self.profiler: WorkerProfiler | None = self._create_profiler() if not skip_load_model: self.load_model(load_format=self.od_config.diffusion_load_format) self.init_lora_manager() @@ -122,6 +115,7 @@ def init_device(self) -> None: vllm_config.parallel_config.tensor_parallel_size = self.od_config.parallel_config.tensor_parallel_size vllm_config.parallel_config.data_parallel_size = self.od_config.parallel_config.data_parallel_size vllm_config.parallel_config.enable_expert_parallel = self.od_config.parallel_config.enable_expert_parallel + vllm_config.profiler_config = self.od_config.profiler_config self.vllm_config = vllm_config # Initialize distributed environment @@ -147,6 +141,24 @@ def init_device(self) -> None: ) init_workspace_manager(self.device) + def _create_profiler(self) -> WorkerProfiler | None: + profiler_config = self.od_config.profiler_config + profiler_type = getattr(profiler_config, "profiler", None) + if profiler_type == "torch": + return create_omni_profiler( + profiler_config=profiler_config, + worker_name=f"diffusion_rank{self.rank}", + local_rank=self.local_rank, + ) + if profiler_type == "cuda": + return CudaProfilerWrapper(profiler_config) + if profiler_type is not None: + logger.warning("Unknown profiler backend %r on diffusion worker %s", profiler_type, self.rank) + return None + + def _get_profiler(self) -> WorkerProfiler | None: + return getattr(self, "profiler", None) + def load_model(self, load_format: str = "default", custom_pipeline_name: str | None = None) -> None: """Load the diffusion model using DiffusionModelRunner.""" with ( @@ -192,27 +204,21 @@ def profile(self, is_start: bool = True, profile_prefix: str | None = None) -> N Args: is_start: True to start profiling, False to stop. - profile_prefix: Optional prefix for trace filename (vLLM compat). - - Note: - Matches vLLM's worker.profile() signature for consistency. - Traces are saved automatically via on_trace_ready callback. + profile_prefix: Optional prefix for trace filename. 
""" - if self.profiler is None: - logger.warning("Profiler not initialized, skipping profile(%s)", is_start) + profiler = self._get_profiler() + if profiler is None: return if is_start: - from vllm_omni.profiler import OmniTorchProfilerWrapper - - if isinstance(self.profiler, OmniTorchProfilerWrapper): + if isinstance(profiler, OmniTorchProfilerWrapper): import time - filename = profile_prefix or f"diffusion_{int(time.time())}" - self.profiler.set_trace_filename(filename) - self.profiler.start() + filename = profile_prefix or f"diffusion_rank{self.rank}_{int(time.time())}" + profiler.set_trace_filename(filename) + profiler.start() else: - self.profiler.stop() + profiler.stop() def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfig) -> DiffusionOutput: """Execute a forward pass by delegating to the model runner.""" @@ -224,7 +230,13 @@ def execute_model(self, req: OmniDiffusionRequest, od_config: OmniDiffusionConfi if req.sampling_params.lora_request is not None: raise logger.warning("LoRA activation skipped: %s", exc) - return self.model_runner.execute_model(req) + profiler = self._get_profiler() + ctx = profiler.annotate_context_manager("diffusion_forward") if profiler else nullcontext() + with ctx: + output = self.model_runner.execute_model(req) + if profiler: + profiler.step() + return output def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> RunnerOutput: """Execute one diffusion step by delegating to the model runner.""" @@ -236,8 +248,13 @@ def execute_stepwise(self, scheduler_output: DiffusionSchedulerOutput) -> Runner if any(new_req.req.sampling_params.lora_request is not None for new_req in scheduler_output.scheduled_new_reqs): raise ValueError("Step mode does not support LoRA yet.") - - return self.model_runner.execute_stepwise(scheduler_output) + profiler = self._get_profiler() + ctx = profiler.annotate_context_manager("diffusion_step") if profiler else nullcontext() + with ctx: + output = self.model_runner.execute_stepwise(scheduler_output) + if profiler: + profiler.step() + return output def load_weights(self, weights) -> set[str]: """Load weights by delegating to the model runner.""" From 687405c5f2c12068701da4d3b7a12e1a6521b85b Mon Sep 17 00:00:00 2001 From: "Yiyang \"Ian\" Liu" Date: Fri, 10 Apr 2026 06:36:29 -0700 Subject: [PATCH 119/204] [Config] Remove invalid LLM-only engine_args from diffusion stage configs (#2622) Signed-off-by: Yiyang Liu Co-authored-by: Yiyang Liu Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../stage_configs/bagel_mooncake_ci.yaml | 6 -- .../stage_configs/bagel_sharedmemory_ci.yaml | 6 -- tests/test_diffusion_config_fields.py | 68 +++++++++++++++++++ .../model_executor/stage_configs/bagel.yaml | 5 -- .../stage_configs/bagel_multiconnector.yaml | 5 -- .../stage_configs/bagel_single_stage.yaml | 5 -- .../stage_configs/bagel_think.yaml | 5 -- .../stage_configs/bagel_usp2.yaml | 5 -- .../stage_configs/hunyuan_image3_moe_dit.yaml | 4 -- .../hunyuan_image3_moe_dit_2gpu_fp8.yaml | 4 -- .../stage_configs/hunyuan_image_3_moe.yaml | 4 -- .../stage_configs/omnivoice.yaml | 2 - 12 files changed, 68 insertions(+), 51 deletions(-) create mode 100644 tests/test_diffusion_config_fields.py diff --git a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml index 590244acd2..1f0d06cb8c 100644 --- a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml +++ 
b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml @@ -47,15 +47,9 @@ stage_args: engine_args: model_stage: dit max_num_seqs: 1 - gpu_memory_utilization: 0.45 enforce_eager: true trust_remote_code: true - engine_output_type: image distributed_executor_backend: mp - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 - load_format: dummy omni_kv_config: need_recv_cache: true engine_input_source: [0] diff --git a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml index b7999652e2..36b1d2bbe4 100644 --- a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml +++ b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml @@ -46,15 +46,9 @@ stage_args: engine_args: model_stage: dit max_num_seqs: 1 - gpu_memory_utilization: 0.45 enforce_eager: true trust_remote_code: true - engine_output_type: image distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 - load_format: dummy omni_kv_config: need_recv_cache: true engine_input_source: [0] diff --git a/tests/test_diffusion_config_fields.py b/tests/test_diffusion_config_fields.py new file mode 100644 index 0000000000..b87ceec1df --- /dev/null +++ b/tests/test_diffusion_config_fields.py @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Ensure diffusion stage YAML configs only use valid OmniDiffusionConfig fields. + +Regression test for https://github.com/vllm-project/vllm-omni/issues/2563 +""" + +from dataclasses import fields +from pathlib import Path + +import pytest +import yaml + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + +try: + from vllm_omni.diffusion.data import OmniDiffusionConfig +except Exception: + OmniDiffusionConfig = None + + +@pytest.mark.skipif( + OmniDiffusionConfig is None, + reason="OmniDiffusionConfig could not be imported (missing torch?)", +) +def test_diffusion_stage_configs_only_contain_valid_fields(): + """Diffusion stage engine_args must only contain OmniDiffusionConfig fields. 
+ + Regression test for https://github.com/vllm-project/vllm-omni/issues/2563 + """ + # Scan both main configs and test configs + repo_root = Path(__file__).parent.parent + config_dirs = [ + repo_root / "vllm_omni" / "model_executor" / "stage_configs", + ] + # Also scan test directories recursively + test_dir = repo_root / "tests" + + yaml_paths: list[Path] = [] + for config_dir in config_dirs: + yaml_paths.extend(sorted(config_dir.glob("*.yaml"))) + yaml_paths.extend(sorted(test_dir.rglob("*.yaml"))) + + valid_fields = {f.name for f in fields(OmniDiffusionConfig)} + # model_stage is consumed by the stage init layer, not OmniDiffusionConfig + valid_fields.add("model_stage") + # model_arch is consumed by the stage init layer for diffusion model class resolution + valid_fields.add("model_arch") + # "quantization" is mapped to "quantization_config" by from_kwargs() backwards-compat + valid_fields.add("quantization") + + invalid_entries: list[tuple[str, set[str]]] = [] + for yaml_path in yaml_paths: + with open(yaml_path) as fh: + config = yaml.safe_load(fh) + + stages = config.get("stage_args", config.get("stages", [])) + for stage in stages: + if stage.get("stage_type") != "diffusion": + continue + engine_args = stage.get("engine_args", {}) + invalid = set(engine_args.keys()) - valid_fields + if invalid: + invalid_entries.append((yaml_path.relative_to(repo_root), invalid)) + + assert not invalid_entries, "Diffusion stage configs contain fields not in OmniDiffusionConfig:\n" + "\n".join( + f" {name}: {sorted(bad)}" for name, bad in invalid_entries + ) diff --git a/vllm_omni/model_executor/stage_configs/bagel.yaml b/vllm_omni/model_executor/stage_configs/bagel.yaml index d1031b574a..dfe9da1c26 100644 --- a/vllm_omni/model_executor/stage_configs/bagel.yaml +++ b/vllm_omni/model_executor/stage_configs/bagel.yaml @@ -52,14 +52,9 @@ stage_args: engine_args: model_stage: dit max_num_seqs: 1 - gpu_memory_utilization: 0.45 enforce_eager: true trust_remote_code: true - engine_output_type: image distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 omni_kv_config: need_recv_cache: true engine_input_source: [0] diff --git a/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml b/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml index 4919395cad..af038f59fb 100644 --- a/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml +++ b/vllm_omni/model_executor/stage_configs/bagel_multiconnector.yaml @@ -45,14 +45,9 @@ stage_args: engine_args: model_stage: dit max_num_seqs: 1 - gpu_memory_utilization: 0.45 enforce_eager: true trust_remote_code: true - engine_output_type: image distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 omni_kv_config: need_recv_cache: true engine_input_source: [0] diff --git a/vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml b/vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml index 2c1d84af49..bb24763f90 100644 --- a/vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml +++ b/vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml @@ -9,14 +9,9 @@ stage_args: engine_args: model_stage: dit max_num_seqs: 1 - gpu_memory_utilization: 0.45 enforce_eager: true trust_remote_code: true - engine_output_type: image distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 final_output: true final_output_type: 
image diff --git a/vllm_omni/model_executor/stage_configs/bagel_think.yaml b/vllm_omni/model_executor/stage_configs/bagel_think.yaml index c4cf32c707..0d2098a203 100644 --- a/vllm_omni/model_executor/stage_configs/bagel_think.yaml +++ b/vllm_omni/model_executor/stage_configs/bagel_think.yaml @@ -49,14 +49,9 @@ stage_args: engine_args: model_stage: dit max_num_seqs: 1 - gpu_memory_utilization: 0.45 enforce_eager: true trust_remote_code: true - engine_output_type: image distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 omni_kv_config: need_recv_cache: true engine_input_source: [0] diff --git a/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml b/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml index 632c227f36..33002b9aa5 100644 --- a/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml +++ b/vllm_omni/model_executor/stage_configs/bagel_usp2.yaml @@ -45,14 +45,9 @@ stage_args: max_batch_size: 1 engine_args: model_stage: dit - gpu_memory_utilization: 0.45 enforce_eager: true trust_remote_code: true - engine_output_type: image distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 1 parallel_config: ulysses_degree: 2 # ring_degree: 2 diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit.yaml index 0b812ff376..a60fe9a5b5 100644 --- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit.yaml +++ b/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit.yaml @@ -11,13 +11,9 @@ stage_args: engine_args: max_num_seqs: 1 model_stage: dit - gpu_memory_utilization: 0.65 enforce_eager: true trust_remote_code: true - engine_output_type: image distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 parallel_config: tensor_parallel_size: 4 enable_expert_parallel: true diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit_2gpu_fp8.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit_2gpu_fp8.yaml index 51110c2858..aeef27a974 100644 --- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit_2gpu_fp8.yaml +++ b/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit_2gpu_fp8.yaml @@ -11,13 +11,9 @@ stage_args: max_batch_size: 1 engine_args: model_stage: dit - gpu_memory_utilization: 0.9 enforce_eager: true trust_remote_code: true - engine_output_type: image distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 quantization: "fp8" parallel_config: tensor_parallel_size: 2 diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image_3_moe.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image_3_moe.yaml index 6f4ba306a5..808b4619f7 100644 --- a/vllm_omni/model_executor/stage_configs/hunyuan_image_3_moe.yaml +++ b/vllm_omni/model_executor/stage_configs/hunyuan_image_3_moe.yaml @@ -50,12 +50,8 @@ stage_args: max_batch_size: 1 engine_args: model_stage: diffusion - gpu_memory_utilization: 0.9 enforce_eager: true - engine_output_type: image distributed_executor_backend: "mp" - enable_prefix_caching: false - max_num_batched_tokens: 32768 vae_use_slicing: false vae_use_tiling: false cache_backend: null diff --git a/vllm_omni/model_executor/stage_configs/omnivoice.yaml b/vllm_omni/model_executor/stage_configs/omnivoice.yaml index 49f11e9674..546e3b3dc2 100644 --- a/vllm_omni/model_executor/stage_configs/omnivoice.yaml +++ 
b/vllm_omni/model_executor/stage_configs/omnivoice.yaml @@ -10,10 +10,8 @@ stage_args: engine_args: model_stage: dit model_class_name: "OmniVoicePipeline" - gpu_memory_utilization: 0.5 enforce_eager: true trust_remote_code: true - engine_output_type: audio distributed_executor_backend: "mp" dtype: "float32" final_output: true From 2bc183f6f0e91f43aa7e74040c47fdac4a6b1f59 Mon Sep 17 00:00:00 2001 From: Nick Cao Date: Fri, 10 Apr 2026 14:11:31 -0400 Subject: [PATCH 120/204] [Refactor] Remove dependency on librosa (#2273) Signed-off-by: Nick Cao Co-authored-by: Claude --- docker/Dockerfile.ci | 2 +- docker/Dockerfile.cuda | 2 +- docker/Dockerfile.rocm | 2 +- docker/Dockerfile.xpu | 2 - docs/usage/faq.md | 8 --- .../examples/offline_inference/bagel.md | 7 -- .../examples/offline_inference/cosyvoice3.md | 2 +- .../examples/offline_inference/mimo_audio.md | 23 ------- .../offline_inference/qwen2_5_omni.md | 8 --- .../examples/offline_inference/qwen3_omni.md | 8 --- .../examples/online_serving/bagel.md | 7 -- .../examples/online_serving/qwen2_5_omni.md | 8 --- .../examples/online_serving/qwen3_omni.md | 9 --- .../examples/online_serving/qwen3_tts.md | 8 --- examples/offline_inference/bagel/README.md | 7 -- .../offline_inference/cosyvoice3/README.md | 2 +- .../cosyvoice3/verify_e2e_cosyvoice.py | 22 +------ .../offline_inference/mimo_audio/README.md | 23 ------- .../mimo_audio/message_convert.py | 4 +- .../offline_inference/omnivoice/end2end.py | 4 +- .../offline_inference/qwen2_5_omni/README.md | 8 --- .../offline_inference/qwen2_5_omni/end2end.py | 10 +-- .../offline_inference/qwen3_omni/README.md | 8 --- .../offline_inference/qwen3_omni/end2end.py | 6 +- .../qwen3_omni/end2end_async_chunk.py | 4 +- .../x_to_video_audio/x_to_video_audio.py | 4 +- examples/online_serving/bagel/README.md | 7 -- .../online_serving/qwen2_5_omni/README.md | 8 --- examples/online_serving/qwen3_omni/README.md | 10 +-- .../qwen3_omni/openai_realtime_client.py | 6 +- examples/online_serving/qwen3_tts/README.md | 8 --- .../speaker_embedding_interpolation.py | 14 ++-- requirements/common.txt | 1 - .../openai_api/test_serving_speech.py | 23 ++----- tests/utils/test_audio.py | 61 ++++++++++++++++++ vllm_omni/assets/video.py | 4 +- vllm_omni/entrypoints/chat_utils.py | 6 +- .../entrypoints/openai/audio_utils_mixin.py | 54 ++++++++++++---- .../models/cosyvoice3/assets/mel_filters.npz | Bin 4271 -> 0 bytes .../model_executor/models/cosyvoice3/utils.py | 51 ++++----------- .../models/qwen3_tts/qwen3_tts_talker.py | 21 +++--- .../models/qwen3_tts/qwen3_tts_tokenizer.py | 9 +-- .../tokenizer_25hz/vq/assets/mel_filters.npz | Bin 4271 -> 0 bytes .../qwen3_tts/tokenizer_25hz/vq/speech_vq.py | 7 +- .../tokenizer_25hz/vq/whisper_encoder.py | 19 +----- vllm_omni/utils/audio.py | 45 +++++++++++++ 46 files changed, 229 insertions(+), 323 deletions(-) create mode 100644 tests/utils/test_audio.py delete mode 100644 vllm_omni/model_executor/models/cosyvoice3/assets/mel_filters.npz delete mode 100644 vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/assets/mel_filters.npz create mode 100644 vllm_omni/utils/audio.py diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 24ce39bafd..2a98de1b81 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -7,7 +7,7 @@ COPY . . 
# Install system dependencies RUN apt-get update && \ - apt-get install -y espeak-ng ffmpeg git sox libsox-fmt-all jq && \ + apt-get install -y espeak-ng git sox libsox-fmt-all jq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda index 754d491d86..6ed5b7d277 100644 --- a/docker/Dockerfile.cuda +++ b/docker/Dockerfile.cuda @@ -7,7 +7,7 @@ WORKDIR ${COMMON_WORKDIR} # Step 1: Setup - Install system dependencies RUN apt-get update && \ - apt-get install -y ffmpeg git sox libsox-fmt-all jq && \ + apt-get install -y git sox libsox-fmt-all jq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index bfbb060bcb..8b22bee38b 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -19,7 +19,7 @@ WORKDIR ${COMMON_WORKDIR} # Step 1: Setup - Install system dependencies RUN apt-get update && \ - apt-get install -y espeak-ng ffmpeg git sox libsox-fmt-all jq && \ + apt-get install -y espeak-ng git sox libsox-fmt-all jq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 17f1aebf0d..25d5d0c800 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -15,9 +15,7 @@ RUN apt clean && apt-get update -y && \ apt-get install -y --no-install-recommends --fix-missing \ curl \ espeak-ng \ - ffmpeg \ git \ - libsndfile1 \ libsm6 \ libxext6 \ libgl1 \ diff --git a/docs/usage/faq.md b/docs/usage/faq.md index c080eae402..0539e158b0 100644 --- a/docs/usage/faq.md +++ b/docs/usage/faq.md @@ -4,14 +4,6 @@ A: Now, we support natively disaggregated deployment for different model stages within a model. There is a restriction that one chip can only have one AutoRegressive model stage. This is because the unified KV cache management of vLLM. Stages of other types can coexist within a chip. The restriction will be resolved in later version. -> Q: When trying to run examples, I encounter error about backend of librosa or soundfile. How to solve it? - -A: If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - > Q: I see GPU OOM or "free memory is less than desired GPU memory utilization" errors. How can I fix it? A: Refer to [GPU memory calculation and configuration](../configuration/gpu_memory_utilization.md) for guidance on tuning `gpu_memory_utilization` and related settings. diff --git a/docs/user_guide/examples/offline_inference/bagel.md b/docs/user_guide/examples/offline_inference/bagel.md index 5f458750b4..e626686872 100644 --- a/docs/user_guide/examples/offline_inference/bagel.md +++ b/docs/user_guide/examples/offline_inference/bagel.md @@ -250,13 +250,6 @@ For more details on the Mooncake connector and multi-node setup, see the [Moonca ## FAQ -- If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. - -```bash -sudo apt update -sudo apt install ffmpeg -``` - - If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. | Stage | VRAM | diff --git a/docs/user_guide/examples/offline_inference/cosyvoice3.md b/docs/user_guide/examples/offline_inference/cosyvoice3.md index d912f1c62e..ebb7c02efc 100644 --- a/docs/user_guide/examples/offline_inference/cosyvoice3.md +++ b/docs/user_guide/examples/offline_inference/cosyvoice3.md @@ -10,7 +10,7 @@ Install dependencies: uv pip install -e . 
``` -> **Note:** This includes required libraries such as `librosa`, `soundfile`, +> **Note:** This includes required libraries such as `soundfile`, > `onnxruntime`, `x-transformers`, and `einops` via > `requirements/common.txt` and platform-specific requirements files. diff --git a/docs/user_guide/examples/offline_inference/mimo_audio.md b/docs/user_guide/examples/offline_inference/mimo_audio.md index 1a3be15d69..4e80526971 100644 --- a/docs/user_guide/examples/offline_inference/mimo_audio.md +++ b/docs/user_guide/examples/offline_inference/mimo_audio.md @@ -189,29 +189,6 @@ Note: This task uses hardcoded message lists in the script. ## Troubleshooting -### Audio dependencies (soundfile, librosa) - -This example depends on **soundfile** (read/write WAV) and **librosa** (load audio including MP3). Install the project requirements first: - -```bash -pip install -r requirements/common.txt -# or at least: pip install soundfile>=0.13.1 librosa>=0.11.0 -``` - -- **`soundfile` / libsndfile not found** - `soundfile` uses the C library **libsndfile**. On Linux, install the system package before pip: - - Debian/Ubuntu: `sudo apt-get install libsndfile1` - - For development builds: `sudo apt-get install libsndfile1-dev` - - Then: `pip install soundfile` - -- **`librosa` fails to load MP3 or reports "No backend available"** - Loading MP3 (e.g. in `spoken_dialogue_sft_multiturn` with `.mp3` files) uses **ffmpeg** as the backend. Install ffmpeg: - - Debian/Ubuntu: `sudo apt-get install ffmpeg` - - macOS: `brew install ffmpeg` - -- **`ImportError: No module named 'soundfile'` or `ModuleNotFoundError: ... librosa`** - Ensure you are in the same Python environment where vLLM Omni and the example dependencies are installed, and that `requirements/common.txt` (or the packages above) are installed. - ### Tokenizer path - **`MIMO_AUDIO_TOKENIZER_PATH` not set or model fails to find tokenizer** diff --git a/docs/user_guide/examples/offline_inference/qwen2_5_omni.md b/docs/user_guide/examples/offline_inference/qwen2_5_omni.md index 07a56cf9a0..c54976b540 100644 --- a/docs/user_guide/examples/offline_inference/qwen2_5_omni.md +++ b/docs/user_guide/examples/offline_inference/qwen2_5_omni.md @@ -64,14 +64,6 @@ If media file paths are not provided, the script will use default assets. Suppor - `use_audio_in_video`: Extract audio from video - `text`: Text-only query -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - ## Example materials ??? abstract "end2end.py" diff --git a/docs/user_guide/examples/offline_inference/qwen3_omni.md b/docs/user_guide/examples/offline_inference/qwen3_omni.md index 6577092bbf..2d856f7380 100644 --- a/docs/user_guide/examples/offline_inference/qwen3_omni.md +++ b/docs/user_guide/examples/offline_inference/qwen3_omni.md @@ -112,14 +112,6 @@ python end2end_async_chunk.py \ > async_chunk example when you need the stage-level concurrency semantics > described in PR #962 / #1151. -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - ## Example materials ??? 
abstract "end2end.py" diff --git a/docs/user_guide/examples/online_serving/bagel.md b/docs/user_guide/examples/online_serving/bagel.md index 4a6094c089..9de31926aa 100644 --- a/docs/user_guide/examples/online_serving/bagel.md +++ b/docs/user_guide/examples/online_serving/bagel.md @@ -357,13 +357,6 @@ curl http://localhost:8091/v1/chat/completions \ ## FAQ -- If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. - -```bash -sudo apt update -sudo apt install ffmpeg -``` - - If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. | Stage | VRAM | diff --git a/docs/user_guide/examples/online_serving/qwen2_5_omni.md b/docs/user_guide/examples/online_serving/qwen2_5_omni.md index 4357646924..b3a2c9f2ac 100644 --- a/docs/user_guide/examples/online_serving/qwen2_5_omni.md +++ b/docs/user_guide/examples/online_serving/qwen2_5_omni.md @@ -218,14 +218,6 @@ The gradio script supports the following arguments: - `--port`: Port for Gradio server (default: 7861) - `--share`: Share the Gradio demo publicly (creates a public link) -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - ## Example materials ??? abstract "gradio_demo.py" diff --git a/docs/user_guide/examples/online_serving/qwen3_omni.md b/docs/user_guide/examples/online_serving/qwen3_omni.md index 69de24852f..6f6d9ae4a9 100644 --- a/docs/user_guide/examples/online_serving/qwen3_omni.md +++ b/docs/user_guide/examples/online_serving/qwen3_omni.md @@ -64,15 +64,6 @@ python openai_chat_completion_client_for_multimodal_generation.py \ bash run_curl_multimodal_generation.sh use_image ``` - -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - ## Modality control You can control output modalities to specify which types of output the model should generate. This is useful when you only need text output and want to skip audio generation stages for better performance. diff --git a/docs/user_guide/examples/online_serving/qwen3_tts.md b/docs/user_guide/examples/online_serving/qwen3_tts.md index 156c4942cd..4e632d4c28 100644 --- a/docs/user_guide/examples/online_serving/qwen3_tts.md +++ b/docs/user_guide/examples/online_serving/qwen3_tts.md @@ -211,14 +211,6 @@ with open("output.wav", "wb") as f: f.write(response.content) ``` -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - ## API Reference ### Voices Endpoint diff --git a/examples/offline_inference/bagel/README.md b/examples/offline_inference/bagel/README.md index 226c009f79..48517b1cda 100644 --- a/examples/offline_inference/bagel/README.md +++ b/examples/offline_inference/bagel/README.md @@ -247,13 +247,6 @@ For more details on the Mooncake connector and multi-node setup, see the [Moonca ## FAQ -- If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. - -```bash -sudo apt update -sudo apt install ffmpeg -``` - - If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. 
| Stage | VRAM | diff --git a/examples/offline_inference/cosyvoice3/README.md b/examples/offline_inference/cosyvoice3/README.md index 895d3f660f..e16134e6ef 100644 --- a/examples/offline_inference/cosyvoice3/README.md +++ b/examples/offline_inference/cosyvoice3/README.md @@ -7,7 +7,7 @@ Install dependencies: uv pip install -e . ``` -> **Note:** This includes required libraries such as `librosa`, `soundfile`, +> **Note:** This includes required libraries such as `soundfile`, > `onnxruntime`, `x-transformers`, and `einops` via > `requirements/common.txt` and platform-specific requirements files. diff --git a/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py b/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py index 68ab72b387..6311bbc901 100644 --- a/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py +++ b/examples/offline_inference/cosyvoice3/verify_e2e_cosyvoice.py @@ -2,13 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os -from pathlib import Path -import librosa import numpy as np import soundfile as sf from vllm import SamplingParams from vllm.assets.audio import AudioAsset +from vllm.multimodal.media.audio import load_audio from vllm_omni.entrypoints.omni import Omni from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config @@ -16,22 +15,6 @@ from vllm_omni.model_executor.models.cosyvoice3.utils import extract_text_token -def _ensure_mel_filters_asset() -> None: - repo_root = Path(__file__).resolve().parents[3] - filters_path = repo_root / "vllm_omni" / "model_executor" / "models" / "cosyvoice3" / "assets" / "mel_filters.npz" - if filters_path.exists(): - return - - source_url = "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz" - raise FileNotFoundError( - "Missing CosyVoice3 mel filter asset:\n" - f" {filters_path}\n" - "Download it with:\n" - f" mkdir -p {filters_path.parent} && " - f"curl -L {source_url} -o {filters_path}" - ) - - def run_e2e(): parser = argparse.ArgumentParser() # ""FunAudioLLM/Fun-CosyVoice3-0.5B-2512 @@ -56,7 +39,6 @@ def run_e2e(): help="Path to tokenizer directory (e.g., /CosyVoice-BlankEN).", ) args = parser.parse_args() - _ensure_mel_filters_asset() # Ensure tokenizer directory exists if not os.path.exists(args.tokenizer): raise FileNotFoundError(f"{args.tokenizer} does not exist!") @@ -85,7 +67,7 @@ def run_e2e(): if not os.path.exists(args.audio_path): raise FileNotFoundError(f"Audio file not found: {args.audio_path}") # Load at native sample rate - audio_signal, sr = librosa.load(args.audio_path, sr=None) + audio_signal, sr = load_audio(args.audio_path, sr=None) # Validate sample rate before processing (similar to original CosyVoice) min_sr = 16000 diff --git a/examples/offline_inference/mimo_audio/README.md b/examples/offline_inference/mimo_audio/README.md index 747e734cc2..596afabeef 100644 --- a/examples/offline_inference/mimo_audio/README.md +++ b/examples/offline_inference/mimo_audio/README.md @@ -190,29 +190,6 @@ Note: This task uses hardcoded message lists in the script. ## Troubleshooting -### Audio dependencies (soundfile, librosa) - -This example depends on **soundfile** (read/write WAV) and **librosa** (load audio including MP3). Install the project requirements first: - -```bash -pip install -r requirements/common.txt -# or at least: pip install soundfile>=0.13.1 librosa>=0.11.0 -``` - -- **`soundfile` / libsndfile not found** - `soundfile` uses the C library **libsndfile**. 
On Linux, install the system package before pip: - - Debian/Ubuntu: `sudo apt-get install libsndfile1` - - For development builds: `sudo apt-get install libsndfile1-dev` - - Then: `pip install soundfile` - -- **`librosa` fails to load MP3 or reports "No backend available"** - Loading MP3 (e.g. in `spoken_dialogue_sft_multiturn` with `.mp3` files) uses **ffmpeg** as the backend. Install ffmpeg: - - Debian/Ubuntu: `sudo apt-get install ffmpeg` - - macOS: `brew install ffmpeg` - -- **`ImportError: No module named 'soundfile'` or `ModuleNotFoundError: ... librosa`** - Ensure you are in the same Python environment where vLLM Omni and the example dependencies are installed, and that `requirements/common.txt` (or the packages above) are installed. - ### Tokenizer path - **`MIMO_AUDIO_TOKENIZER_PATH` not set or model fails to find tokenizer** diff --git a/examples/offline_inference/mimo_audio/message_convert.py b/examples/offline_inference/mimo_audio/message_convert.py index ebcc59c6b4..416f21ccfa 100644 --- a/examples/offline_inference/mimo_audio/message_convert.py +++ b/examples/offline_inference/mimo_audio/message_convert.py @@ -5,12 +5,12 @@ import re from collections.abc import Callable -import librosa import numpy as np import torch import torchaudio from process_speechdata import InputSegment, StreamingInputSegment from torchaudio.transforms import MelSpectrogram +from vllm.multimodal.media.audio import load_audio speech_zeroemb_idx = 151667 empty_token = "<|empty|>" @@ -685,7 +685,7 @@ def get_audio_data(audio_url): # File path audio_file = audio_url - audio_signal, sr = librosa.load(audio_file, sr=24000) + audio_signal, sr = load_audio(audio_file, sr=24000) audio_data = (audio_signal.astype(np.float32), sr) return audio_data diff --git a/examples/offline_inference/omnivoice/end2end.py b/examples/offline_inference/omnivoice/end2end.py index b41379b011..9371c95142 100644 --- a/examples/offline_inference/omnivoice/end2end.py +++ b/examples/offline_inference/omnivoice/end2end.py @@ -103,9 +103,9 @@ def run_e2e(): if not os.path.exists(args.ref_audio): raise FileNotFoundError(f"Reference audio not found: {args.ref_audio}") - import librosa + from vllm.multimodal.media.audio import load_audio - audio_signal, sr = librosa.load(args.ref_audio, sr=None) + audio_signal, sr = load_audio(args.ref_audio, sr=None) multi_modal_data["audio"] = (audio_signal.astype(np.float32), sr) mm_processor_kwargs["ref_text"] = args.ref_text or "" mm_processor_kwargs["sample_rate"] = sr diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md index 20740a0da0..e2eae8a96b 100644 --- a/examples/offline_inference/qwen2_5_omni/README.md +++ b/examples/offline_inference/qwen2_5_omni/README.md @@ -60,11 +60,3 @@ If media file paths are not provided, the script will use default assets. Suppor - `mixed_modalities`: Audio + image + video - `use_audio_in_video`: Extract audio from video - `text`: Text-only query - -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. 
-``` -sudo apt update -sudo apt install ffmpeg -``` diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py index 7bba599830..d8f1898ec9 100644 --- a/examples/offline_inference/qwen2_5_omni/end2end.py +++ b/examples/offline_inference/qwen2_5_omni/end2end.py @@ -9,7 +9,6 @@ import time from typing import NamedTuple -import librosa import numpy as np import soundfile as sf from PIL import Image @@ -17,6 +16,7 @@ from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset, video_to_ndarrays from vllm.multimodal.image import convert_image_mode +from vllm.multimodal.media.audio import load_audio from vllm.sampling_params import SamplingParams from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -96,7 +96,7 @@ def get_mixed_modalities_query( if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_signal, sr = load_audio(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate @@ -130,7 +130,7 @@ def get_use_audio_in_video_query( raise FileNotFoundError(f"Video file not found: {video_path}") video_frames = video_to_ndarrays(video_path, num_frames=num_frames) # Extract audio from video file - audio_signal, sr = librosa.load(video_path, sr=sampling_rate) + audio_signal, sr = load_audio(video_path, sr=sampling_rate) audio = (audio_signal.astype(np.float32), sr) else: asset = VideoAsset(name="baby_reading", num_frames=num_frames) @@ -165,7 +165,7 @@ def get_multi_audios_query(audio_path: str | None = None, sampling_rate: int = 1 if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_signal, sr = load_audio(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) # Use the provided audio as the first audio, default as second audio_list = [ @@ -261,7 +261,7 @@ def get_audio_query(question: str = None, audio_path: str | None = None, samplin if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_signal, sr = load_audio(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate diff --git a/examples/offline_inference/qwen3_omni/README.md b/examples/offline_inference/qwen3_omni/README.md index b3e8592532..d69ad6abfc 100644 --- a/examples/offline_inference/qwen3_omni/README.md +++ b/examples/offline_inference/qwen3_omni/README.md @@ -108,11 +108,3 @@ python end2end_async_chunk.py \ > recommended entry point for non-async-chunk workflows. Only use the > async_chunk example when you need the stage-level concurrency semantics > described in PR #962 / #1151. - -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. 
-``` -sudo apt update -sudo apt install ffmpeg -``` diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py index 155eca4ed9..056f820ff0 100644 --- a/examples/offline_inference/qwen3_omni/end2end.py +++ b/examples/offline_inference/qwen3_omni/end2end.py @@ -9,7 +9,6 @@ import time from typing import NamedTuple -import librosa import numpy as np import soundfile as sf import vllm @@ -19,6 +18,7 @@ from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset, video_to_ndarrays from vllm.multimodal.image import convert_image_mode +from vllm.multimodal.media.audio import load_audio from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni.entrypoints.omni import Omni @@ -129,7 +129,7 @@ def get_audio_query(question: str = None, audio_path: str | None = None, samplin if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_signal, sr = load_audio(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate @@ -183,7 +183,7 @@ def get_mixed_modalities_query( if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_signal, sr = load_audio(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate diff --git a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py index 8adbae9eb6..0744263130 100644 --- a/examples/offline_inference/qwen3_omni/end2end_async_chunk.py +++ b/examples/offline_inference/qwen3_omni/end2end_async_chunk.py @@ -32,13 +32,13 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -import librosa from PIL import Image from vllm import SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset, video_to_ndarrays from vllm.multimodal.image import convert_image_mode +from vllm.multimodal.media.audio import load_audio from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm_omni.entrypoints.async_omni import AsyncOmni @@ -89,7 +89,7 @@ def get_audio_query( if audio_path: if not os.path.exists(audio_path): raise FileNotFoundError(f"Audio file not found: {audio_path}") - audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_signal, sr = load_audio(audio_path, sr=sampling_rate) audio_data = (audio_signal.astype(np.float32), sr) else: audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate diff --git a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py index e0424add69..fb77b21483 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py @@ -5,8 +5,8 @@ import re import time -import librosa from PIL import Image +from vllm.multimodal.media.audio import load_audio from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.entrypoints.omni import Omni @@ -69,7 +69,7 @@ def load_image_and_audio(image_paths, audio_paths): image.append(img) for path in audio_paths: - audio_array, sr = librosa.load(path, 
sr=16000) + audio_array, sr = load_audio(path, sr=16000) audio_array = audio_array[int(sr * 1) : int(sr * 3)] audio.append(audio_array) return image, audio diff --git a/examples/online_serving/bagel/README.md b/examples/online_serving/bagel/README.md index 9b74acae10..0939bc5f38 100644 --- a/examples/online_serving/bagel/README.md +++ b/examples/online_serving/bagel/README.md @@ -354,13 +354,6 @@ curl http://localhost:8091/v1/chat/completions \ ## FAQ -- If you encounter an error about the backend of librosa, try to install ffmpeg with the command below. - -```bash -sudo apt update -sudo apt install ffmpeg -``` - - If you don’t know how much VRAM is needed for the model or encounter the OOM error, you can try to decrease the max_model_len. | Stage | VRAM | diff --git a/examples/online_serving/qwen2_5_omni/README.md b/examples/online_serving/qwen2_5_omni/README.md index 91aab3b651..c528732064 100644 --- a/examples/online_serving/qwen2_5_omni/README.md +++ b/examples/online_serving/qwen2_5_omni/README.md @@ -208,11 +208,3 @@ The gradio script supports the following arguments: - `--ip`: Host/IP for Gradio server (default: 127.0.0.1) - `--port`: Port for Gradio server (default: 7861) - `--share`: Share the Gradio demo publicly (creates a public link) - -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` diff --git a/examples/online_serving/qwen3_omni/README.md b/examples/online_serving/qwen3_omni/README.md index c3171e4366..ff02642247 100644 --- a/examples/online_serving/qwen3_omni/README.md +++ b/examples/online_serving/qwen3_omni/README.md @@ -43,11 +43,9 @@ python examples/online_serving/openai_chat_completion_client_for_multimodal_gene **Dependencies:** ```bash -pip install websockets librosa numpy +pip install websockets numpy ``` -(ffmpeg may be required by `librosa` for some formats; see the FAQ below.) - **From this directory** (`examples/online_serving/qwen3_omni`): ```bash @@ -105,12 +103,6 @@ bash run_curl_multimodal_generation.sh use_image ### FAQ -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - ## Modality control You can control output modalities to specify which types of output the model should generate. This is useful when you only need text output and want to skip audio generation stages for better performance. diff --git a/examples/online_serving/qwen3_omni/openai_realtime_client.py b/examples/online_serving/qwen3_omni/openai_realtime_client.py index 4fa043c481..660e4ac336 100644 --- a/examples/online_serving/qwen3_omni/openai_realtime_client.py +++ b/examples/online_serving/qwen3_omni/openai_realtime_client.py @@ -10,7 +10,7 @@ Requirements: - vllm with audio support - websockets -- librosa +- soundfile - numpy The script: @@ -25,10 +25,10 @@ import base64 import json -import librosa import numpy as np import websockets from vllm.assets.audio import AudioAsset +from vllm.multimodal.media.audio import load_audio def audio_to_pcm16_base64(audio_path: str) -> str: @@ -36,7 +36,7 @@ def audio_to_pcm16_base64(audio_path: str) -> str: Load an audio file and convert it to base64-encoded PCM16 @ 16kHz. 
""" # Load audio and resample to 16kHz mono - audio, _ = librosa.load(audio_path, sr=16000, mono=True) + audio, _ = load_audio(audio_path, sr=16000, mono=True) # Convert to PCM16 pcm16 = (audio * 32767).astype(np.int16) # Encode as base64 diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md index 5504b5737a..e53fa7392b 100644 --- a/examples/online_serving/qwen3_tts/README.md +++ b/examples/online_serving/qwen3_tts/README.md @@ -192,14 +192,6 @@ with open("output.wav", "wb") as f: f.write(response.content) ``` -### FAQ - -If you encounter error about backend of librosa, try to install ffmpeg with command below. -``` -sudo apt update -sudo apt install ffmpeg -``` - ## API Reference ### Voices Endpoint diff --git a/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py b/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py index e6786f8869..38a2bdea92 100644 --- a/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py +++ b/examples/online_serving/qwen3_tts/speaker_embedding_interpolation.py @@ -5,7 +5,7 @@ using SLERP and sends the result to the /v1/audio/speech API. Requirements: - pip install torch librosa soundfile numpy httpx + pip install torch resampy soundfile numpy httpx Examples: # Extract and save an embedding @@ -143,17 +143,17 @@ def _load_speaker_encoder_weights(encoder: torch.nn.Module, model_path: str) -> def compute_mel_spectrogram(audio: np.ndarray, sr: int = 24000) -> torch.Tensor: """Compute 128-bin mel spectrogram matching Qwen3-TTS's extraction pipeline.""" - import librosa + from vllm.multimodal.audio import resample_audio_resampy # Resample to 24kHz if needed if sr != 24000: - audio = librosa.resample(audio.astype(np.float32), orig_sr=sr, target_sr=24000) + audio = resample_audio_resampy(audio.astype(np.float32), orig_sr=sr, target_sr=24000) y = torch.from_numpy(audio).unsqueeze(0).float() - from librosa.filters import mel as librosa_mel_fn + from vllm_omni.utils.audio import mel_filter_bank - mel_basis = torch.from_numpy(librosa_mel_fn(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=12000)).float() + mel_basis = mel_filter_bank(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=12000) n_fft = 1024 hop_size = 256 @@ -180,9 +180,9 @@ def compute_mel_spectrogram(audio: np.ndarray, sr: int = 24000) -> torch.Tensor: @torch.inference_mode() def extract_embedding(encoder: torch.nn.Module, audio_path: str, device: str = "cpu") -> np.ndarray: """Extract a 1024-dim speaker embedding from an audio file.""" - import librosa + from vllm.multimodal.media.audio import load_audio - audio, sr = librosa.load(audio_path, sr=None, mono=True) + audio, sr = load_audio(audio_path, sr=None, mono=True) mel = compute_mel_spectrogram(audio, sr).to(device) embedding = encoder(mel.to(next(encoder.parameters()).dtype))[0] return embedding.float().cpu().numpy() diff --git a/requirements/common.txt b/requirements/common.txt index 89eaac32bc..1fff584448 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -1,7 +1,6 @@ # Common dependencies for all platforms av>=14.0.0 omegaconf>=2.3.0 -librosa>=0.11.0 resampy>=0.4.3 diffusers>=0.36.0 accelerate==1.12.0 diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 57aeef8f9d..554164a59c 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -63,14 +63,11 @@ def test_stereo_to_mono_conversion(self, audio_mixin, 
mocker: MockerFixture): adjusted_tensor = mock_speed.call_args[0][0] assert len(adjusted_tensor) == 24000 - def test_speed_adjustment(self, audio_mixin, mocker: MockerFixture): - mock_time_stretch = mocker.patch("librosa.effects.time_stretch") - mock_time_stretch.return_value = np.zeros(12000) + def test_speed_adjustment(self, audio_mixin): audio_tensor = np.random.rand(24000).astype(np.float32) adjusted_audio, _ = audio_mixin._apply_speed_adjustment(audio_tensor, speed=2.0, sample_rate=24000) - mock_time_stretch.assert_called_with(y=audio_tensor, rate=2.0) assert adjusted_audio.shape == (12000,) def test_unsupported_format_fallback(self, audio_mixin, caplog, mocker: MockerFixture): @@ -117,30 +114,22 @@ def test_stereo_audio_preservation(self, audio_mixin, mocker: MockerFixture): assert np.array_equal(output_tensor, stereo_tensor) def test_speed_adjustment_bypass(self, audio_mixin, mocker: MockerFixture): - """Test that speed=1.0 bypasses the expensive librosa time stretching.""" + """Test that speed=1.0 bypasses the expensive torchaudio time stretching.""" audio_tensor = np.random.rand(24000).astype(np.float32) - mock_time_stretch = mocker.patch("librosa.effects.time_stretch") - # speed=1.0 should return immediately without calling librosa + mock_time_stretch = mocker.patch("torchaudio.transforms.TimeStretch") + # speed=1.0 should return immediately without calling torchaudio result, _ = audio_mixin._apply_speed_adjustment(audio_tensor, speed=1.0, sample_rate=24000) mock_time_stretch.assert_not_called() assert np.array_equal(result, audio_tensor) - def test_speed_adjustment_stereo_handling(self, audio_mixin, mocker: MockerFixture): - """Test that speed adjustment is attempted on stereo inputs.""" - mock_time_stretch = mocker.patch("librosa.effects.time_stretch") + def test_speed_adjustment_stereo_handling(self, audio_mixin): + """Test that speed adjustment handles stereo (channels-last) input.""" stereo_tensor = np.random.rand(24000, 2).astype(np.float32) - # Mock return value representing a sped-up version (half length) - mock_time_stretch.return_value = np.zeros((12000, 2), dtype=np.float32) result, _ = audio_mixin._apply_speed_adjustment(stereo_tensor, speed=2.0, sample_rate=24000) - mock_time_stretch.assert_called_once() - # Ensure the stereo tensor was passed to librosa - call_args = mock_time_stretch.call_args - assert np.array_equal(call_args.kwargs["y"], stereo_tensor) - assert call_args.kwargs["rate"] == 2.0 assert result.shape == (12000, 2) diff --git a/tests/utils/test_audio.py b/tests/utils/test_audio.py new file mode 100644 index 0000000000..cfbd2501b2 --- /dev/null +++ b/tests/utils/test_audio.py @@ -0,0 +1,61 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Unit tests for vllm_omni.utils.audio.mel_filter_bank.""" + +import pytest +import torch + +from vllm_omni.utils.audio import mel_filter_bank + +# Parameter combinations used across the codebase. 
+_PARAM_SETS = [ + # Qwen3-TTS talker / speaker encoder (sr=24000) + dict(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=12000), + # CosyVoice3 whisper encoder, Qwen3-TTS 25Hz tokenizer (sr=16000, 80 mels) + dict(sr=16000, n_fft=400, n_mels=80), + # CosyVoice3 whisper encoder (sr=16000, 128 mels) + dict(sr=16000, n_fft=400, n_mels=128), +] + +_parametrize_params = pytest.mark.parametrize( + "params", _PARAM_SETS, ids=lambda p: f"{p['sr']}_{p['n_fft']}_{p['n_mels']}" +) + + +class TestMelFilterBank: + @_parametrize_params + def test_output_shape(self, params): + fb = mel_filter_bank(**params) + n_freqs = params["n_fft"] // 2 + 1 + assert fb.shape == (params["n_mels"], n_freqs) + + @_parametrize_params + def test_non_negative(self, params): + fb = mel_filter_bank(**params) + assert (fb >= 0).all() + + def test_dtype_is_float(self): + fb = mel_filter_bank(sr=16000, n_fft=400, n_mels=80) + assert fb.dtype == torch.float32 + + def test_fmax_defaults_to_nyquist(self): + """When fmax is omitted it should equal sr / 2.""" + fb_default = mel_filter_bank(sr=16000, n_fft=400, n_mels=80) + fb_explicit = mel_filter_bank(sr=16000, n_fft=400, n_mels=80, fmax=8000.0) + torch.testing.assert_close(fb_default, fb_explicit) + + def test_each_mel_band_has_nonzero_energy(self): + """Every mel band should have at least one nonzero frequency bin.""" + fb = mel_filter_bank(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=12000) + for i in range(fb.shape[0]): + assert fb[i].sum() > 0, f"mel band {i} is all zeros" + + def test_higher_fmax_extends_coverage(self): + """A higher fmax should produce nonzero weights at higher frequency bins.""" + fb_low = mel_filter_bank(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=6000) + fb_high = mel_filter_bank(sr=24000, n_fft=1024, n_mels=128, fmin=0, fmax=12000) + # The highest nonzero column should be larger for fb_high. + last_nonzero_low = (fb_low.sum(dim=0) > 0).nonzero()[-1].item() + last_nonzero_high = (fb_high.sum(dim=0) > 0).nonzero()[-1].item() + assert last_nonzero_high > last_nonzero_low diff --git a/vllm_omni/assets/video.py b/vllm_omni/assets/video.py index 98b1f7e4e2..6a5f3204a9 100644 --- a/vllm_omni/assets/video.py +++ b/vllm_omni/assets/video.py @@ -1,6 +1,6 @@ -import librosa import numpy as np from vllm.assets.video import VideoAsset +from vllm.multimodal.media.audio import load_audio def extract_video_audio(path: str = None, sampling_rate: int = 16000) -> np.ndarray: @@ -12,5 +12,5 @@ def extract_video_audio(path: str = None, sampling_rate: int = 16000) -> np.ndar """ if not path: path = VideoAsset(name="baby_reading").video_path - audio_signal, sr = librosa.load(path, sr=sampling_rate) + audio_signal, sr = load_audio(path, sr=sampling_rate) return audio_signal diff --git a/vllm_omni/entrypoints/chat_utils.py b/vllm_omni/entrypoints/chat_utils.py index 8970e58984..4c3d311ec5 100644 --- a/vllm_omni/entrypoints/chat_utils.py +++ b/vllm_omni/entrypoints/chat_utils.py @@ -2,7 +2,7 @@ async def extract_audio_from_video_async(video_url: str) -> tuple[np.ndarray, int | float]: - """Extract audio from a video URL using librosa. + """Extract audio from a video URL using vllm's load_audio. Returns a (audio_array, sample_rate) tuple compatible with audio format. All blocking I/O operations are run in a thread pool. 
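The async wrapper keeps the blocking work in small sync helpers and pushes them onto a thread pool, as the docstring above notes. A minimal sketch of that shape, assuming `asyncio.to_thread` as the offload mechanism (the hunk below only shows the sync helpers, not the executor call itself):

```python
import asyncio

import numpy as np
from vllm.multimodal.media.audio import load_audio


def _load_audio_sync(file_path: str) -> tuple[np.ndarray, int | float]:
    # Blocking decode + resample to 16 kHz, same as the helper in the hunk below.
    return load_audio(file_path, sr=16000)


async def extract_audio(file_path: str) -> tuple[np.ndarray, int | float]:
    # Run the blocking helper on a worker thread so the event loop stays responsive.
    return await asyncio.to_thread(_load_audio_sync, file_path)
```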
@@ -26,9 +26,9 @@ def _write_temp_file_sync(data: bytes, suffix: str) -> str: return temp_file.name def _load_audio_sync(file_path: str) -> tuple[np.ndarray, int | float]: - import librosa + from vllm.multimodal.media.audio import load_audio - return librosa.load(file_path, sr=16000) + return load_audio(file_path, sr=16000) def _cleanup_file_sync(file_path: str) -> None: try: diff --git a/vllm_omni/entrypoints/openai/audio_utils_mixin.py b/vllm_omni/entrypoints/openai/audio_utils_mixin.py index 13df32ebe0..b626f7eeb2 100644 --- a/vllm_omni/entrypoints/openai/audio_utils_mixin.py +++ b/vllm_omni/entrypoints/openai/audio_utils_mixin.py @@ -1,6 +1,8 @@ from io import BytesIO import numpy as np +import torch +import torchaudio from vllm.logger import init_logger from vllm_omni.entrypoints.openai.protocol.audio import AudioResponse, CreateAudio @@ -10,11 +12,6 @@ except ImportError: soundfile = None -try: - import librosa -except ImportError: - librosa = None - logger = init_logger(__name__) @@ -74,20 +71,53 @@ def create_audio(self, audio_obj: CreateAudio) -> AudioResponse: return AudioResponse(audio_data=audio_data, media_type=media_type) def _apply_speed_adjustment(self, audio_tensor: np.ndarray, speed: float, sample_rate: int): - """Apply speed adjustment to the audio tensor while preserving pitch.""" + """Apply speed adjustment to the audio tensor while preserving pitch. + + Uses torchaudio's phase vocoder (Spectrogram → TimeStretch → + InverseSpectrogram) to stretch/compress audio in time without + changing pitch. + """ if speed == 1.0: return audio_tensor, sample_rate - if librosa is None: - raise ImportError("librosa is required for speed adjustment. Please install it with: pip install librosa") - try: - # librosa.effects.time_stretch requires a float audio tensor. if not np.issubdtype(audio_tensor.dtype, np.floating): audio_tensor = audio_tensor.astype(np.float32) - stretched_audio = librosa.effects.time_stretch(y=audio_tensor, rate=speed) - return stretched_audio, sample_rate + # Stereo numpy arrays use channels-last (T, C); + # torch expects channels-first (C, T). 
+ channels_last = audio_tensor.ndim == 2 + if channels_last: + waveform = torch.from_numpy(audio_tensor.T) + else: + waveform = torch.from_numpy(audio_tensor).unsqueeze(0) + + # Match librosa.stft defaults: n_fft=2048, hop_length=n_fft//4 + n_fft = 2048 + hop_length = n_fft // 4 + to_spec = torchaudio.transforms.Spectrogram( + n_fft=n_fft, + hop_length=hop_length, + power=None, + ) + stretch = torchaudio.transforms.TimeStretch( + n_freq=n_fft // 2 + 1, + hop_length=hop_length, + ) + to_wave = torchaudio.transforms.InverseSpectrogram( + n_fft=n_fft, + hop_length=hop_length, + ) + + spec = to_spec(waveform) + stretched = stretch(spec, speed) + expected_length = int(audio_tensor.shape[0] / speed) + result = to_wave(stretched, length=expected_length) + + result = result.squeeze(0).numpy() + if channels_last: + result = result.T + return result, sample_rate except Exception as e: logger.error(f"An error occurred during speed adjustment: {e}") raise ValueError("Failed to apply speed adjustment.") from e diff --git a/vllm_omni/model_executor/models/cosyvoice3/assets/mel_filters.npz b/vllm_omni/model_executor/models/cosyvoice3/assets/mel_filters.npz deleted file mode 100644 index 28ea26909dbdfd608aef67afc4d74d7961ae4bb6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4271 zcmZ`-cQjmYw;lx1g6JcN7QKe3LG%_Oh!VX=^k~teM-XGQ(Mu4$_Y%?jkm$lFBkB+( z3yfKIgF zxGiAhze`A@t->QRNVV!%P+W=o}VHkB) z%g>qyRHfN1IQ4-=`Y@0T9qE#o+;4E3VQ!epW1Xt=ZG`I3U|62t?<>5h*W|9VvJc`KZ+)ghnA**Z~ET21Tjf_f8oe`vy zZQNtlOx?dDhS71hnOus5cqj)hfyF@H&4y?@9z{I#&cf>A+s2~~(I>TQF}SaR3_tqa z(7&ZdN^vR*t<~?{9DEoI>0PL@Sl?wa?Z{rGX`*eEx9Nh=z*J3HZL1*Py4z$TD#+;m zSSW(kcOTe(4hqgib_W6&xx+j~-u(p)Nn6?>a%wHk=h7Ay$%lcGoo;gAY zmVV7|!Nb;w(PlH@c24{ple2Y3<*9J@jE=sfLzwu_BiAFPE$0Axp`^Nq!H}eG0?r-X zFj@Pwp^al*p>K{@_Cz`q#(N0Y=OpZy^ z{P$KjLJuk_Y%I)$mh`b{uOW5C5Xcmxk!gt_Zg zw>}6fkD4zRK9!#ems~H%U$>V;_wK38Zf-baU$S!#i;7!HWsi}GuC>%@?lMdgkUGC& zh9gC?O-5BlS2#}?7x0?eP#bOL(cqE{M%LJD$CZnplD)CgQR#KCttD=dZK+Ck5R52; z*%5hZ+SXU7)8k%Y^_1U>yI*By(INn&+ir-_4$#dUwTlMNyR@iGQIaZ+eiYqucu)CB z#i{Ru1w+aU#}DHSyzjG_9c?ToB_YjU#f;N=qel98WBIjIc1!#ePwRR+(go&-by#}@ z+M+klVke5b@lWfZ+O&|c??YvRe)&W)qAgtc>t-IZtbRTG#X}49_Q$>P%-)=0W_QY-x%DPep2Vm9#ci zyQcCc4p2&dLtV1@rPe!%>Y^#9W8#ZH&}^@wJKT7N;R9A7cEq&;Y2CYvd@R+Mn&b5O zVyfS^*H#kD74=J5uhD)o`TXoX>>Si$!cT?TXRxj2pB)w_ljjhTby&Je;X|BESZZT= zC%G5!-$BJf&a~U78d_3zBjrvrkJ0CCl@Rfcf7I(`VTNPnI^B#B$zOfPW zG&mEd?R0+W<`l08O1dkcWKS8wB!Z*Cs%I1nMs-EeB-uu5?t@PuD3|z>je8DKi#X(B z{Z=Rz{4X%?-UnxnHQtkELIZ&=J;fK_t}yu8|IxG0(85e&K>H3!!~zlhyJrgti~o1i zzBS*jTgdG~Exp#B-T)6A+PB ztD-e`j^@XAx}|L&JSEFkRvS_%3b%m86z02#Hfn{Y+qIqQ_muywgt?roUA7oiS1xBD zFxmDMsj_cbBcn*^rn^KIMP{AlHM`NiVm*D&`z~7FH#hf<$L3HmJ+=NdiY5>W?nKD? 
z8Ox6{9dKyI1o8a-j9BtV-|=lm`<`v>tR^Cln&x1dMYzu{@wq5KW!#K14_QMnpH5K%Pavag+g6(i8i-#Eq zguc}rH3?BxH4SOqZW#7m*aT(U9-n#_Xn^Q19(}eH!xG`nI!GYziVQNcA0)`FDHD%~ zz2$HnxW4BQ{#*@u`dssbAa`|fESn$8i8FdxGZh48_Uf~_Q@tv?4in)6fwSed)k&ITqu|){^(WL~J z?Lb|0ro06J^>f>^2}^e-+$u5bU4IZNfO?75v8lstS15%XYw2ac^pkU34{QhDR(umt zPu~`w2?FP|nn3!RWZ3{?=77@teulahD9*S*k5KmY3*adlM)%{SR~bkZYlx1q@fkE= zI$7+kiw5!ha=dYlO>Z5KgxnZEJsaBm%v#nkX0MN-h%n&KA?N}xU3K3o-3Jpk?ANq2n9&Lh%K_CTvfiN ze>6w~NSSl8$#NEZ^t7h9YOxI=zcAG|a+m6AWei`3Jw7K;b;T${pJa^4RwRt%F>?>M zBmoQqm1`<_W7i!5P~THp-II)Ka^u;=z;}d{;SVj{G_4`9^HaEb!=@Pa;Dw)CH^DjsGxFqmb%o$Bkop$KnH8 zDYN)Bh)5=5!-*|f0Gh4)oZG=TEBr()g^DCtSQhmT3!ZN`Qd-E%@1cE}hm8&Vq5B+C zVF2_O)9IiZ(v(xzTwJIg5|}KVuE(;}|7dVIrT`$d=q_OG|3PY}x*URYkMXXJ6PT1$IFkNyvY_(9UglDi6TaeikPS(!Bnij z;Szn+)I_oxnRz7(WTYTp+IHSWQ?Xd~tQn(Q1r)kThM?NM< z?d6LaBG!H}R$zRy!Ij(}1?xe^+o+!;tqWJ3NgjHl1XNxzusxQ0I#6qzM(_00UPMw* zF*GWW_q&fqAN=uimSKgBu_@jD%MX3hpNY|*4r=e=k1lw2r**IyD(hcq?A+HtUgUy4Dqh5D7|G9q{)TsUj{g~c!xy>9wk^(LiXA4VKGz_zMvJMX#AgsR z34T3hhJ)#&sUaQ1+0PML(?YA~{5?=(MT}X^Vib%};uoI{qGW@wgJ&_M+8S8clsNz2 zPQkxMi`#3+Khwtl>>K>wxc{71{&!qGu&Zzz_wU(7TLTyG){PAu?!cXs?Dp-y0Ekcn AQvd(} diff --git a/vllm_omni/model_executor/models/cosyvoice3/utils.py b/vllm_omni/model_executor/models/cosyvoice3/utils.py index 52c52655e8..0bf0cccb16 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/utils.py +++ b/vllm_omni/model_executor/models/cosyvoice3/utils.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import logging -import os from functools import cache, lru_cache import numpy as np @@ -9,7 +8,8 @@ import torch.nn.functional as F import torchaudio import torchaudio.compliance.kaldi as kaldi -from librosa.filters import mel as librosa_mel_fn + +from vllm_omni.utils.audio import mel_filter_bank logger = logging.getLogger(__name__) @@ -34,8 +34,13 @@ def _get_mel_basis( fmax: float | None, device_str: str, ) -> torch.Tensor: - mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - return torch.from_numpy(mel).float().to(torch.device(device_str)) + return mel_filter_bank( + sr=sampling_rate, + n_fft=n_fft, + n_mels=num_mels, + fmin=fmin, + fmax=fmax, + ).to(torch.device(device_str)) @lru_cache @@ -122,42 +127,8 @@ def exact_div(x, y): @cache def mel_filters(device, n_mels: int) -> torch.Tensor: - """ - load the mel filterbank matrix for projecting STFT into a Mel spectrogram. - Allows decoupling librosa dependency; saved using: - - np.savez_compressed( - "mel_filters.npz", - mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80), - mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128), - ) - """ - assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}" - - filters_path = os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz") - if not os.path.exists(filters_path): - source_url = "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz" - os.makedirs(os.path.dirname(filters_path), exist_ok=True) - try: - import urllib.request - - with urllib.request.urlopen(source_url, timeout=30) as resp: - with open(filters_path, "wb") as f_out: - f_out.write(resp.read()) - logger.info("Downloaded mel_filters.npz from %s", source_url) - except Exception as e: - raise FileNotFoundError( - "Missing CosyVoice3 mel filter asset:\n" - f" {filters_path}\n" - "Auto-download failed. 
Download it manually from:\n" - f" {source_url}\n" - "Example:\n" - f" mkdir -p {os.path.dirname(filters_path)} && " - f"curl -L {source_url} -o {filters_path}" - ) from e - - with np.load(filters_path, allow_pickle=False) as f: - return torch.from_numpy(f[f"mel_{n_mels}"]).to(device) + """Compute mel filterbank matrix for projecting STFT into a Mel spectrogram.""" + return mel_filter_bank(sr=16000, n_fft=400, n_mels=n_mels).to(device) def log_mel_spectrogram( diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py index 9f8aff6aff..f89012ec45 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py @@ -13,7 +13,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -from librosa.filters import mel as librosa_mel_fn from transformers import AutoTokenizer from transformers.activations import ACT2FN from transformers.utils.hub import cached_file @@ -27,6 +26,7 @@ from vllm.sequence import IntermediateTensors from vllm_omni.model_executor.models.output_templates import OmniOutput +from vllm_omni.utils.audio import mel_filter_bank from vllm_omni.utils.voice_cache import VoiceEmbeddingCache from .configuration_qwen3_tts import Qwen3TTSConfig, Qwen3TTSSpeakerEncoderConfig, Qwen3TTSTalkerConfig @@ -258,14 +258,19 @@ def mel_spectrogram( fmax: int | None = None, center: bool = False, ) -> torch.Tensor: - """Calculate mel spectrogram of an input signal using librosa mel filterbank and torch STFT.""" + """Calculate mel spectrogram of an input signal using torchaudio mel filterbank and torch STFT.""" if torch.min(y) < -1.0: logger.warning("Min value of input waveform signal is %s", torch.min(y)) if torch.max(y) > 1.0: logger.warning("Max value of input waveform signal is %s", torch.max(y)) device = y.device - mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax) - mel_basis = torch.from_numpy(mel).float().to(device) + mel_basis = mel_filter_bank( + sr=sampling_rate, + n_fft=n_fft, + n_mels=num_mels, + fmin=fmin, + fmax=fmax, + ).to(device) hann_window = torch.hann_window(win_size).to(device) padding = (n_fft - hop_size) // 2 y = torch.nn.functional.pad(y.unsqueeze(1), (padding, padding), mode="reflect").squeeze(1) @@ -871,7 +876,7 @@ def _load_audio_to_np(self, x: str) -> tuple[np.ndarray, int]: Uses upstream vLLM's MediaConnector for http(s) URLs and ``file:`` URIs, with unrestricted local access (offline inference is trusted). """ - import librosa + from vllm.multimodal.media.audio import load_audio if self._is_url(x): from vllm.multimodal.media import MediaConnector @@ -883,7 +888,7 @@ def _load_audio_to_np(self, x: str) -> tuple[np.ndarray, int]: with io.BytesIO(wav_bytes) as f: audio, sr = sf.read(f, dtype="float32", always_2d=False) else: - audio, sr = librosa.load(x, sr=None, mono=True) + audio, sr = load_audio(x, sr=None, mono=True) if isinstance(audio, np.ndarray) and audio.ndim > 1: audio = np.mean(audio, axis=-1) @@ -1089,9 +1094,9 @@ def _extract_speaker_embedding(self, wav: np.ndarray, sr: int) -> torch.Tensor: # Resample to 24kHz for speaker encoder. 
target_sr = int(getattr(self.config.speaker_encoder_config, "sample_rate", 24000)) if sr != target_sr: - import librosa + from vllm.multimodal.audio import resample_audio_resampy - wav = librosa.resample(y=wav.astype(np.float32), orig_sr=int(sr), target_sr=target_sr) + wav = resample_audio_resampy(wav.astype(np.float32), orig_sr=int(sr), target_sr=target_sr) sr = target_sr # Follow official implementation: mel_spectrogram expects 24kHz. diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_tokenizer.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_tokenizer.py index 503e6bbc83..3db5cfd1b8 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_tokenizer.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_tokenizer.py @@ -17,12 +17,13 @@ import urllib.request from urllib.parse import urlparse -import librosa import numpy as np import soundfile as sf import torch from torch.nn.utils.rnn import pad_sequence from transformers import AutoConfig, AutoFeatureExtractor, AutoModel +from vllm.multimodal.audio import resample_audio_resampy +from vllm.multimodal.media.audio import load_audio as _load_audio_file from .tokenizer_12hz.configuration_qwen3_tts_tokenizer_v2 import Qwen3TTSTokenizerV2Config from .tokenizer_12hz.modeling_qwen3_tts_tokenizer_v2 import ( @@ -154,13 +155,13 @@ def load_audio( with io.BytesIO(wav_bytes) as f: audio, sr = sf.read(f, dtype="float32", always_2d=False) else: - audio, sr = librosa.load(x, sr=None, mono=True) + audio, sr = _load_audio_file(x, sr=None, mono=True) if audio.ndim > 1: audio = np.mean(audio, axis=-1) if sr != target_sr: - audio = librosa.resample(y=audio, orig_sr=sr, target_sr=target_sr) + audio = resample_audio_resampy(audio, orig_sr=sr, target_sr=target_sr) return audio.astype(np.float32) @@ -208,7 +209,7 @@ def _normalize_audio_inputs( if a.ndim > 1: a = np.mean(a, axis=-1) if int(sr) != target_sr: - a = librosa.resample(y=a.astype(np.float32), orig_sr=int(sr), target_sr=target_sr) + a = resample_audio_resampy(a.astype(np.float32), orig_sr=int(sr), target_sr=target_sr) out.append(a.astype(np.float32)) return out diff --git a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/assets/mel_filters.npz b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/assets/mel_filters.npz deleted file mode 100644 index 28ea26909dbdfd608aef67afc4d74d7961ae4bb6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4271 zcmZ`-cQjmYw;lx1g6JcN7QKe3LG%_Oh!VX=^k~teM-XGQ(Mu4$_Y%?jkm$lFBkB+( z3yfKIgF zxGiAhze`A@t->QRNVV!%P+W=o}VHkB) z%g>qyRHfN1IQ4-=`Y@0T9qE#o+;4E3VQ!epW1Xt=ZG`I3U|62t?<>5h*W|9VvJc`KZ+)ghnA**Z~ET21Tjf_f8oe`vy zZQNtlOx?dDhS71hnOus5cqj)hfyF@H&4y?@9z{I#&cf>A+s2~~(I>TQF}SaR3_tqa z(7&ZdN^vR*t<~?{9DEoI>0PL@Sl?wa?Z{rGX`*eEx9Nh=z*J3HZL1*Py4z$TD#+;m zSSW(kcOTe(4hqgib_W6&xx+j~-u(p)Nn6?>a%wHk=h7Ay$%lcGoo;gAY zmVV7|!Nb;w(PlH@c24{ple2Y3<*9J@jE=sfLzwu_BiAFPE$0Axp`^Nq!H}eG0?r-X zFj@Pwp^al*p>K{@_Cz`q#(N0Y=OpZy^ z{P$KjLJuk_Y%I)$mh`b{uOW5C5Xcmxk!gt_Zg zw>}6fkD4zRK9!#ems~H%U$>V;_wK38Zf-baU$S!#i;7!HWsi}GuC>%@?lMdgkUGC& zh9gC?O-5BlS2#}?7x0?eP#bOL(cqE{M%LJD$CZnplD)CgQR#KCttD=dZK+Ck5R52; z*%5hZ+SXU7)8k%Y^_1U>yI*By(INn&+ir-_4$#dUwTlMNyR@iGQIaZ+eiYqucu)CB z#i{Ru1w+aU#}DHSyzjG_9c?ToB_YjU#f;N=qel98WBIjIc1!#ePwRR+(go&-by#}@ z+M+klVke5b@lWfZ+O&|c??YvRe)&W)qAgtc>t-IZtbRTG#X}49_Q$>P%-)=0W_QY-x%DPep2Vm9#ci zyQcCc4p2&dLtV1@rPe!%>Y^#9W8#ZH&}^@wJKT7N;R9A7cEq&;Y2CYvd@R+Mn&b5O zVyfS^*H#kD74=J5uhD)o`TXoX>>Si$!cT?TXRxj2pB)w_ljjhTby&Je;X|BESZZT= zC%G5!-$BJf&a~U78d_3zBjrvrkJ0CCl@Rfcf7I(`VTNPnI^B#B$zOfPW 
zG&mEd?R0+W<`l08O1dkcWKS8wB!Z*Cs%I1nMs-EeB-uu5?t@PuD3|z>je8DKi#X(B z{Z=Rz{4X%?-UnxnHQtkELIZ&=J;fK_t}yu8|IxG0(85e&K>H3!!~zlhyJrgti~o1i zzBS*jTgdG~Exp#B-T)6A+PB ztD-e`j^@XAx}|L&JSEFkRvS_%3b%m86z02#Hfn{Y+qIqQ_muywgt?roUA7oiS1xBD zFxmDMsj_cbBcn*^rn^KIMP{AlHM`NiVm*D&`z~7FH#hf<$L3HmJ+=NdiY5>W?nKD? z8Ox6{9dKyI1o8a-j9BtV-|=lm`<`v>tR^Cln&x1dMYzu{@wq5KW!#K14_QMnpH5K%Pavag+g6(i8i-#Eq zguc}rH3?BxH4SOqZW#7m*aT(U9-n#_Xn^Q19(}eH!xG`nI!GYziVQNcA0)`FDHD%~ zz2$HnxW4BQ{#*@u`dssbAa`|fESn$8i8FdxGZh48_Uf~_Q@tv?4in)6fwSed)k&ITqu|){^(WL~J z?Lb|0ro06J^>f>^2}^e-+$u5bU4IZNfO?75v8lstS15%XYw2ac^pkU34{QhDR(umt zPu~`w2?FP|nn3!RWZ3{?=77@teulahD9*S*k5KmY3*adlM)%{SR~bkZYlx1q@fkE= zI$7+kiw5!ha=dYlO>Z5KgxnZEJsaBm%v#nkX0MN-h%n&KA?N}xU3K3o-3Jpk?ANq2n9&Lh%K_CTvfiN ze>6w~NSSl8$#NEZ^t7h9YOxI=zcAG|a+m6AWei`3Jw7K;b;T${pJa^4RwRt%F>?>M zBmoQqm1`<_W7i!5P~THp-II)Ka^u;=z;}d{;SVj{G_4`9^HaEb!=@Pa;Dw)CH^DjsGxFqmb%o$Bkop$KnH8 zDYN)Bh)5=5!-*|f0Gh4)oZG=TEBr()g^DCtSQhmT3!ZN`Qd-E%@1cE}hm8&Vq5B+C zVF2_O)9IiZ(v(xzTwJIg5|}KVuE(;}|7dVIrT`$d=q_OG|3PY}x*URYkMXXJ6PT1$IFkNyvY_(9UglDi6TaeikPS(!Bnij z;Szn+)I_oxnRz7(WTYTp+IHSWQ?Xd~tQn(Q1r)kThM?NM< z?d6LaBG!H}R$zRy!Ij(}1?xe^+o+!;tqWJ3NgjHl1XNxzusxQ0I#6qzM(_00UPMw* zF*GWW_q&fqAN=uimSKgBu_@jD%MX3hpNY|*4r=e=k1lw2r**IyD(hcq?A+HtUgUy4Dqh5D7|G9q{)TsUj{g~c!xy>9wk^(LiXA4VKGz_zMvJMX#AgsR z34T3hhJ)#&sUaQ1+0PML(?YA~{5?=(MT}X^Vib%};uoI{qGW@wgJ&_M+8S8clsNz2 zPQkxMi`#3+Khwtl>>K>wxc{71{&!qGu&Zzz_wU(7TLTyG){PAu?!cXs?Dp-y0Ekcn AQvd(} diff --git a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py index de2c69702c..9bb2f78c5c 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py +++ b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/speech_vq.py @@ -22,9 +22,10 @@ import torch.nn as nn import torch.nn.functional as F import torchaudio.compliance.kaldi as kaldi -from librosa.filters import mel as librosa_mel_fn from torch import Tensor +from vllm_omni.utils.audio import mel_filter_bank + from .core_vq import DistributedGroupResidualVectorQuantization from .whisper_encoder import Conv1d, ConvTranspose1d, WhisperEncoder @@ -103,14 +104,14 @@ def extract(self, audio, **kwargs): y = audio if len(list(self.mel_basis.keys())) == 0: - mel = librosa_mel_fn( + mel = mel_filter_bank( sr=self.sampling_rate, n_fft=self.filter_length, n_mels=self.n_mel_channels, fmin=self.mel_fmin, fmax=self.mel_fmax, ) - self.mel_basis[str(self.mel_fmax) + "_" + str(y.device)] = torch.from_numpy(mel).float().to(y.device) + self.mel_basis[str(self.mel_fmax) + "_" + str(y.device)] = mel.to(y.device) self.hann_window[str(y.device)] = torch.hann_window(self.win_length).to(y.device) y = torch.nn.functional.pad( diff --git a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py index e3bd6e1c3a..8464f53c9d 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py +++ b/vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py @@ -14,7 +14,6 @@ # limitations under the License. 
import math import operator -import os from functools import cache from itertools import accumulate @@ -24,6 +23,7 @@ from torch import Tensor, nn from vllm_omni.diffusion.attention.backends.utils.fa import HAS_FLASH_ATTN, flash_attn_varlen_func +from vllm_omni.utils.audio import mel_filter_bank N_FFT = 400 HOP_LENGTH = 160 @@ -31,21 +31,8 @@ @cache def mel_filters(device, n_mels: int) -> torch.Tensor: - """ - load the mel filterbank matrix for projecting STFT into a Mel spectrogram. - Allows decoupling librosa dependency; saved using: - - np.savez_compressed( - "mel_filters.npz", - mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80), - mel_128=librosa.filters.mel(sr=16000, n_fft=400, n_mels=128), - ) - """ - assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}" - - filters_path = os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz") - with np.load(filters_path, allow_pickle=False) as f: - return torch.from_numpy(f[f"mel_{n_mels}"]).to(device) + """Compute mel filterbank matrix for projecting STFT into a Mel spectrogram.""" + return mel_filter_bank(sr=16000, n_fft=N_FFT, n_mels=n_mels).to(device) def log_mel_spectrogram( diff --git a/vllm_omni/utils/audio.py b/vllm_omni/utils/audio.py new file mode 100644 index 0000000000..490737bd53 --- /dev/null +++ b/vllm_omni/utils/audio.py @@ -0,0 +1,45 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Audio utility functions shared across models and entrypoints.""" + +import torch +from torchaudio.functional import melscale_fbanks + + +def mel_filter_bank( + sr: int, + n_fft: int, + n_mels: int, + fmin: float = 0.0, + fmax: float | None = None, +) -> torch.Tensor: + """Compute a mel filterbank matrix. + + Drop-in replacement for ``librosa.filters.mel`` using + ``torchaudio.functional.melscale_fbanks``. + + Args: + sr: Sample rate of the audio. + n_fft: FFT window size. + n_mels: Number of mel bands. + fmin: Minimum frequency (Hz). + fmax: Maximum frequency (Hz). Defaults to ``sr / 2``. + + Returns: + Tensor of shape ``(n_mels, n_fft // 2 + 1)``. + """ + if fmax is None: + fmax = float(sr) / 2.0 + # Use mel_scale='slaney' and norm='slaney' to match librosa's + # default behaviour (Slaney 1998 frequency mapping with area + # normalization). 
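+    # melscale_fbanks lays the filterbank out as (n_freqs, n_mels); the
+    # transpose below returns the (n_mels, n_freqs) layout librosa callers expect.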
+ return melscale_fbanks( + n_freqs=n_fft // 2 + 1, + f_min=float(fmin), + f_max=float(fmax), + n_mels=n_mels, + sample_rate=sr, + mel_scale="slaney", + norm="slaney", + ).T From a41174e0837a0d905954404810bbf1a590eaee07 Mon Sep 17 00:00:00 2001 From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Date: Fri, 10 Apr 2026 22:21:26 -0400 Subject: [PATCH 121/204] [Model] VoxCPM2 native AR TTS support (#2658) Signed-off-by: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Co-authored-by: SYLAR --- .buildkite/test-ready.yml | 25 + examples/offline_inference/voxcpm2/README.md | 83 +++ examples/offline_inference/voxcpm2/end2end.py | 145 +++++ tests/e2e/offline_inference/test_voxcpm2.py | 101 ++++ vllm_omni/engine/arg_utils.py | 3 + vllm_omni/model_executor/models/registry.py | 6 + .../model_executor/models/voxcpm2/__init__.py | 5 + .../models/voxcpm2/voxcpm2_import_utils.py | 82 +++ .../models/voxcpm2/voxcpm2_talker.py | 569 ++++++++++++++++++ .../model_executor/stage_configs/voxcpm2.yaml | 36 ++ .../transformers_utils/configs/__init__.py | 3 + .../transformers_utils/configs/voxcpm2.py | 153 +++++ 12 files changed, 1211 insertions(+) create mode 100644 examples/offline_inference/voxcpm2/README.md create mode 100644 examples/offline_inference/voxcpm2/end2end.py create mode 100644 tests/e2e/offline_inference/test_voxcpm2.py create mode 100644 vllm_omni/model_executor/models/voxcpm2/__init__.py create mode 100644 vllm_omni/model_executor/models/voxcpm2/voxcpm2_import_utils.py create mode 100644 vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py create mode 100644 vllm_omni/model_executor/stage_configs/voxcpm2.yaml create mode 100644 vllm_omni/transformers_utils/configs/voxcpm2.py diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 2f1f05463a..f5dcbef55e 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -317,6 +317,31 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" + - label: "VoxCPM2 Native AR E2E Test" + timeout_in_minutes: 20 + depends_on: upload-ready-pipeline + commands: + - | + timeout 20m bash -c ' + pip install voxcpm + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_WORKER_MULTIPROC_METHOD=spawn + pytest -s -v tests/e2e/offline_inference/test_voxcpm2.py -m "core_model" --run-level "core_model" + ' + agents: + queue: "gpu_1_queue" + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + - label: "OmniVoice E2E Test" timeout_in_minutes: 20 depends_on: upload-ready-pipeline diff --git a/examples/offline_inference/voxcpm2/README.md b/examples/offline_inference/voxcpm2/README.md new file mode 100644 index 0000000000..df48a85f56 --- /dev/null +++ b/examples/offline_inference/voxcpm2/README.md @@ -0,0 +1,83 @@ +# VoxCPM2 Offline Inference (Native AR) + +VoxCPM2 is a 2B-parameter tokenizer-free diffusion AR TTS model. It produces 48kHz audio and supports 30+ languages with a single-stage native AR pipeline backed by MiniCPM4. 
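+
+Once the prerequisites below are installed, the `end2end.py` script in this directory wraps a small programmatic flow. A minimal sketch of that same flow (assuming the default 48 kHz output; see the script's `extract_audio` helper for the exact output layout):
+
+```python
+import soundfile as sf
+import torch
+
+from vllm_omni import Omni
+
+engine = Omni(
+    model="openbmb/VoxCPM2",
+    stage_configs_path="vllm_omni/model_executor/stage_configs/voxcpm2.yaml",
+)
+outputs = engine.generate([{"prompt": "Hello, this is a VoxCPM2 demo."}])
+
+# The output processor accumulates per-step audio; the last entry is the full waveform.
+chunks = outputs[0].outputs[0].multimodal_output["audio"]
+audio = torch.as_tensor(chunks[-1]).float().cpu().reshape(-1)
+sf.write("output.wav", audio.numpy(), 48_000, format="WAV")
+```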
+ +## Prerequisites + +Install the `voxcpm` package, or set the environment variable pointing to the source tree: + +```bash +# Option A: install package +pip install voxcpm + +# Option B: use source checkout +export VLLM_OMNI_VOXCPM_CODE_PATH=/path/to/voxcpm +``` + +## Quick Start + +Zero-shot synthesis: + +```bash +python examples/offline_inference/voxcpm2/end2end.py \ + --model openbmb/VoxCPM2 \ + --text "Hello, this is a VoxCPM2 demo." \ + --output-dir output_audio +``` + +Voice cloning with a reference audio: + +```bash +python examples/offline_inference/voxcpm2/end2end.py \ + --text "Hello, this is a voice clone demo." \ + --reference-audio /path/to/reference.wav \ + --output-dir output_clone +``` + +Prompt continuation (matched audio + text prefix): + +```bash +python examples/offline_inference/voxcpm2/end2end.py \ + --text "Continuation target sentence." \ + --prompt-audio /path/to/prompt.wav \ + --prompt-text "Transcript of the prompt audio." \ + --output-dir output_cont +``` + +The script accepts the following arguments: + +| Argument | Default | Description | +|---|---|---| +| `--model` | `openbmb/VoxCPM2` | HuggingFace repo ID or local path | +| `--text` | (example sentence) | Text to synthesize | +| `--output-dir` | `output_audio` | Directory for output WAV files | +| `--stage-configs-path` | `voxcpm2.yaml` | Stage config YAML path | +| `--reference-audio` | `None` | Reference audio for voice cloning (isolated) | +| `--prompt-audio` | `None` | Prompt audio for continuation mode | +| `--prompt-text` | `None` | Transcript matching `--prompt-audio` | + +## Performance + +Measured on a single H20 GPU (80 GB), voxcpm 0.0.0, PyTorch 2.10.0+cu128: + +| Input length | RTF | Sample rate | +|---|---|---| +| Short (~6 words) | ~0.81 | 48 kHz | +| Long (~50 words) | ~0.72 | 48 kHz | + +RTF < 1.0 means faster than real time. + +## Architecture + +VoxCPM2 uses a single-stage native AR pipeline: + +``` +feat_encoder +└─► MiniCPM4 (base LM) + └─► FSQ (finite scalar quantization) + └─► residual_lm (residual AR) + └─► LocDiT (local diffusion transformer) + └─► AudioVAE → 48 kHz waveform +``` + +All stages are fused into one vllm-native execution graph via `voxcpm2.yaml`, eliminating inter-stage coordination overhead and enabling true end-to-end batching. diff --git a/examples/offline_inference/voxcpm2/end2end.py b/examples/offline_inference/voxcpm2/end2end.py new file mode 100644 index 0000000000..2dce750897 --- /dev/null +++ b/examples/offline_inference/voxcpm2/end2end.py @@ -0,0 +1,145 @@ +"""Offline VoxCPM2 inference example (native AR pipeline). + +Uses the single-stage native AR config (voxcpm2.yaml). +Requires the `voxcpm` package or VLLM_OMNI_VOXCPM_CODE_PATH env var. 
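+
+Example (mirrors the Quick Start command in this directory's README):
+    python end2end.py --model openbmb/VoxCPM2 \
+        --text "Hello, this is a VoxCPM2 demo." --output-dir output_audio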
+""" + +from __future__ import annotations + +import os +import time +from pathlib import Path + +import soundfile as sf +import torch +from vllm.utils.argparse_utils import FlexibleArgumentParser + +from vllm_omni import Omni + +REPO_ROOT = Path(__file__).resolve().parents[3] +DEFAULT_STAGE_CONFIGS_PATH = str(REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm2.yaml") +SAMPLE_RATE = 48_000 + + +def parse_args(): + parser = FlexibleArgumentParser(description="Offline VoxCPM2 native AR inference") + parser.add_argument( + "--model", + type=str, + default="openbmb/VoxCPM2", + help="VoxCPM2 model path or HuggingFace repo ID.", + ) + parser.add_argument( + "--text", + type=str, + default="This is a VoxCPM2 native AR synthesis example running on vLLM Omni.", + help="Text to synthesize.", + ) + parser.add_argument( + "--output-dir", + type=str, + default="output_audio", + help="Directory for output WAV files.", + ) + parser.add_argument( + "--stage-configs-path", + type=str, + default=DEFAULT_STAGE_CONFIGS_PATH, + help="Path to the stage config YAML file.", + ) + parser.add_argument( + "--reference-audio", + type=str, + default=None, + help="Path to reference audio for voice cloning (isolated ref mode).", + ) + parser.add_argument( + "--prompt-audio", + type=str, + default=None, + help="Path to prompt audio for continuation mode (requires --prompt-text).", + ) + parser.add_argument( + "--prompt-text", + type=str, + default=None, + help="Text matching --prompt-audio for continuation mode.", + ) + return parser.parse_args() + + +def extract_audio(multimodal_output: dict) -> torch.Tensor: + """Extract the final complete audio tensor from multimodal output. + + The output processor accumulates per-step full audio under ``audio`` + as a list. The last element is the complete waveform. 
+ """ + audio = multimodal_output.get("audio") or multimodal_output.get("model_outputs") + if audio is None: + raise ValueError(f"No audio key in multimodal_output: {list(multimodal_output.keys())}") + + if isinstance(audio, list): + # Take the last valid tensor (most complete audio) + valid = [torch.as_tensor(a).float().cpu().reshape(-1) for a in audio if a is not None] + if not valid: + raise ValueError("Audio list is empty or all elements are None.") + return valid[-1] + + return torch.as_tensor(audio).float().cpu().reshape(-1) + + +def main(): + args = parse_args() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + engine = Omni( + model=args.model, + stage_configs_path=args.stage_configs_path, + ) + + additional: dict = {} + if args.reference_audio: + additional["reference_audio"] = args.reference_audio + if args.prompt_audio and args.prompt_text: + additional["prompt_audio"] = args.prompt_audio + additional["prompt_text"] = args.prompt_text + + prompt: dict = {"prompt": args.text} + if additional: + prompt["additional_information"] = additional + + print(f"Model : {args.model}") + print(f"Text : {args.text}") + if args.reference_audio: + print(f"Ref audio : {args.reference_audio}") + if args.prompt_audio: + print(f"Prompt audio: {args.prompt_audio}") + print(f"Prompt text : {args.prompt_text}") + print(f"Output dir : {output_dir}") + + t_start = time.perf_counter() + outputs = engine.generate([prompt]) + elapsed = time.perf_counter() - t_start + + # outputs[0].outputs[0].multimodal_output["audio"] is a list of tensors + request_output = outputs[0] + mm = request_output.outputs[0].multimodal_output + audio = extract_audio(mm) + + duration = audio.numel() / SAMPLE_RATE + rtf = elapsed / duration if duration > 0 else float("inf") + + output_path = output_dir / "output.wav" + sf.write(str(output_path), audio.numpy(), SAMPLE_RATE, format="WAV") + + print(f"Saved : {output_path}") + print(f"Duration : {duration:.2f}s") + print(f"Inference : {elapsed:.2f}s") + print(f"RTF : {rtf:.3f}") + + +if __name__ == "__main__": + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + main() diff --git a/tests/e2e/offline_inference/test_voxcpm2.py b/tests/e2e/offline_inference/test_voxcpm2.py new file mode 100644 index 0000000000..7e17c6a369 --- /dev/null +++ b/tests/e2e/offline_inference/test_voxcpm2.py @@ -0,0 +1,101 @@ +"""E2E test for VoxCPM2 native AR offline inference.""" + +import os + +import pytest +import torch + +from tests.utils import hardware_test + +VOXCPM2_MODEL = "openbmb/VoxCPM2" +STAGE_CONFIG = os.path.join( + os.path.dirname(__file__), + "..", + "..", + "..", + "vllm_omni", + "model_executor", + "stage_configs", + "voxcpm2.yaml", +) +SAMPLE_RATE = 48000 + + +@pytest.fixture(scope="module") +def voxcpm2_engine(): + """Create VoxCPM2 engine for testing.""" + from vllm_omni import Omni + + engine = Omni(model=VOXCPM2_MODEL, stage_configs_path=STAGE_CONFIG) + yield engine + + +def _extract_audio(multimodal_output: dict) -> torch.Tensor: + """Extract the final complete audio tensor from multimodal output.""" + assert isinstance(multimodal_output, dict), f"Expected dict, got {type(multimodal_output)}" + + # Output processor accumulates per-step full audio under "audio". 
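+    # Later entries supersede earlier ones, so the last sufficiently long
+    # tensor below is the most complete waveform.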
+ audio = multimodal_output.get("audio") or multimodal_output.get("model_outputs") + assert audio is not None, f"No audio key, got {list(multimodal_output.keys())}" + + if isinstance(audio, list): + valid = [x for x in audio if isinstance(x, torch.Tensor) and x.numel() > 100] + assert valid, "No valid audio tensors in output list" + audio = valid[-1] + + assert isinstance(audio, torch.Tensor), f"Expected Tensor, got {type(audio)}" + return audio + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "L4"}, num_cards=1) +def test_voxcpm2_zero_shot_001(voxcpm2_engine): + """Test zero-shot TTS produces valid audio output.""" + outputs = voxcpm2_engine.generate([{"prompt": "Hello, this is a test."}]) + assert len(outputs) == 1 + + audio = _extract_audio(outputs[0].outputs[0].multimodal_output) + duration_s = audio.shape[0] / SAMPLE_RATE + assert 0.5 < duration_s < 30.0, f"Audio duration out of range: {duration_s:.2f}s" + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "L4"}, num_cards=1) +def test_voxcpm2_voice_clone_002(voxcpm2_engine): + """Test voice cloning with a reference audio file. + + Uses the example ``reference_speaker.wav`` bundled with the voxcpm + package. Skipped if the file is not present. + """ + # Try to locate a reference wav from the voxcpm package / env override + candidates = [] + env_path = os.environ.get("VLLM_OMNI_VOXCPM_CODE_PATH") + if env_path: + candidates.append(os.path.join(env_path, "..", "examples", "reference_speaker.wav")) + try: + import voxcpm # noqa: F401 (only used to locate path) + + vox_dir = os.path.dirname(os.path.dirname(os.path.abspath(voxcpm.__file__))) + candidates.append(os.path.join(vox_dir, "examples", "reference_speaker.wav")) + except ImportError: + pass + + ref_path = next((p for p in candidates if p and os.path.exists(p)), None) + if ref_path is None: + pytest.skip("No reference audio available for voice clone test") + + outputs = voxcpm2_engine.generate( + [ + { + "prompt": "Hello, this is a voice clone demo.", + "additional_information": {"reference_audio": ref_path}, + } + ] + ) + assert len(outputs) == 1 + + audio = _extract_audio(outputs[0].outputs[0].multimodal_output) + duration_s = audio.shape[0] / SAMPLE_RATE + assert 0.5 < duration_s < 30.0, f"Audio duration out of range: {duration_s:.2f}s" diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index d43f1b8fdc..e29de3ec98 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -20,6 +20,7 @@ _ARCH_TO_MODEL_TYPE: dict[str, str] = { "CosyVoice3Model": "cosyvoice3", "OmniVoiceModel": "omnivoice", + "VoxCPM2TalkerForConditionalGeneration": "voxcpm2", } # Maps model architecture names to tokenizer subfolder paths within HF repos. 
@@ -40,6 +41,7 @@ def _register_omni_hf_configs() -> None: from vllm_omni.model_executor.models.voxtral_tts.configuration_voxtral_tts import ( VoxtralTTSConfig, ) + from vllm_omni.transformers_utils.configs.voxcpm2 import VoxCPM2Config except Exception as exc: # pragma: no cover - best-effort optional registration logger.warning("Skipping omni HF config registration due to import error: %s", exc) return @@ -57,6 +59,7 @@ def _register_omni_hf_configs() -> None: ("cosyvoice3", CosyVoice3Config), ("omnivoice", OmniVoiceConfig), ("voxtral_tts", VoxtralTTSConfig), + ("voxcpm2", VoxCPM2Config), ]: try: AutoConfig.register(model_type, config_cls) diff --git a/vllm_omni/model_executor/models/registry.py b/vllm_omni/model_executor/models/registry.py index 3b51f20023..0894088005 100644 --- a/vllm_omni/model_executor/models/registry.py +++ b/vllm_omni/model_executor/models/registry.py @@ -145,6 +145,12 @@ "fish_speech_dac_decoder", "FishSpeechDACDecoder", ), + ## VoxCPM2 + "VoxCPM2TalkerForConditionalGeneration": ( + "voxcpm2", + "voxcpm2_talker", + "VoxCPM2TalkerForConditionalGeneration", + ), ## Voxtral TTS "VoxtralTTSForConditionalGeneration": ( "voxtral_tts", diff --git a/vllm_omni/model_executor/models/voxcpm2/__init__.py b/vllm_omni/model_executor/models/voxcpm2/__init__.py new file mode 100644 index 0000000000..77bd8dfb51 --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm2/__init__.py @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from .voxcpm2_talker import VoxCPM2TalkerForConditionalGeneration + +__all__ = ["VoxCPM2TalkerForConditionalGeneration"] diff --git a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_import_utils.py b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_import_utils.py new file mode 100644 index 0000000000..231a51bbca --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_import_utils.py @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Dynamic import utilities for the native VoxCPM2 package. + +Supports three discovery modes (first match wins): +1. ``VLLM_OMNI_VOXCPM_CODE_PATH`` env var (explicit source tree) +2. Sibling ``../VoxCPM/src`` relative to the vllm-omni repo root +3. 
pip-installed ``voxcpm`` package (>= 2.0) +""" + +from __future__ import annotations + +import importlib +import os +import sys +from pathlib import Path +from typing import Any + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def _iter_voxcpm2_src_candidates() -> list[Path]: + """Yield candidate source directories for VoxCPM2.""" + candidates: list[Path] = [] + env_path = os.environ.get("VLLM_OMNI_VOXCPM_CODE_PATH") + if env_path: + candidates.append(Path(env_path).expanduser()) + + repo_root = Path(__file__).resolve().parents[4] + candidates.append(repo_root.parent / "VoxCPM" / "src") + + seen: set[str] = set() + unique: list[Path] = [] + for c in candidates: + key = str(c) + if key not in seen: + seen.add(key) + unique.append(c) + return unique + + +def _prepend_src(candidate: Path) -> None: + candidate_str = str(candidate) + if candidate_str not in sys.path: + sys.path.insert(0, candidate_str) + + +def _import_voxcpm2_attrs(module_name: str, *attr_names: str) -> tuple[Any, ...]: + """Import attributes from the voxcpm package, trying source tree first.""" + last_exc: ImportError | None = None + + for candidate in _iter_voxcpm2_src_candidates(): + if not candidate.exists(): + continue + _prepend_src(candidate) + try: + mod = importlib.import_module(module_name) + return tuple(getattr(mod, name) for name in attr_names) + except (ImportError, AttributeError) as exc: + last_exc = ImportError(str(exc)) + continue + + try: + mod = importlib.import_module(module_name) + return tuple(getattr(mod, name) for name in attr_names) + except (ImportError, AttributeError) as exc: + last_exc = ImportError(str(exc)) + + raise ImportError( + f"Could not import {attr_names} from {module_name}. " + f"Install voxcpm>=2.0: pip install voxcpm. " + f"Or set VLLM_OMNI_VOXCPM_CODE_PATH to the VoxCPM source tree. " + f"Last error: {last_exc}" + ) + + +def import_voxcpm2_core(): + """Import the VoxCPM core class used to load the native TTS model.""" + (VoxCPM,) = _import_voxcpm2_attrs("voxcpm.core", "VoxCPM") + return VoxCPM diff --git a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py new file mode 100644 index 0000000000..ade68b673b --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py @@ -0,0 +1,569 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""VoxCPM2 native AR talker — uses native MiniCPM4 base_lm directly. + +Uses native VoxCPM2 modules (no PagedAttention, manual KV cache). +Each AR decode step: + feat_encoder → base_lm → FSQ → residual_lm → LocDiT → stop + +TODO(PagedAttention): The base_lm is a MiniCPM4 variant (GQA + LongRoPE, +use_mup=False). vllm's MiniCPMModel already supports the architecture +(LongRoPE via Phi3LongRoPEScaledRotaryEmbedding, muP via config), but +two issues block replacing the native base_lm with a vllm MiniCPM4Model: + 1. Per-request state isolation — residual_lm and LocDiT diffusion use + shared native KV caches; concurrent requests clobber each other. + Fix: save/restore residual_lm cache per request, or pool N instances. + 2. Streaming audio — make_omni_output re-decodes all patches each step. + Fix: sliding-window VAE decode (decode_pad pattern from nanovllm). 
+""" + +from __future__ import annotations + +from collections.abc import Iterable +from typing import Any + +import torch +import torch.nn as nn +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.models.minicpm import MiniCPMModel +from vllm.model_executor.models.utils import ( + AutoWeightsLoader, + WeightsMapper, + maybe_prefix, +) +from vllm.sequence import IntermediateTensors + +from vllm_omni.model_executor.models.output_templates import OmniOutput + +from .voxcpm2_import_utils import import_voxcpm2_core + +logger = init_logger(__name__) + + +class VoxCPM2TalkerForConditionalGeneration(nn.Module): + """VoxCPM2 talker using native MiniCPM4 base_lm. + + Loads the full VoxCPM2 model natively and decomposes the AR loop: + each vllm decode step runs one iteration of the native generate loop. + """ + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.vllm_config = vllm_config + self.config = vllm_config.model_config.hf_config + + # Flags for OmniGPUModelRunner + self.have_multimodal_outputs = True + self.has_preprocess = True + self.has_postprocess = True + self._accumulated_patches: list[torch.Tensor] = [] + + # vllm MiniCPMModel scaffold — needed for warmup/profiling/KV cache + # sizing. Not used for actual computation (native modules are used). + self.model = MiniCPMModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors + + # Placeholder — actual native model loaded in load_weights + self._tts: nn.Module | None = None + self._device = "cuda" + self._side_dtype = torch.bfloat16 + + # Config values + self._patch_size = getattr(self.config, "patch_size", 4) + self._feat_dim = getattr(self.config, "feat_dim", 64) + self._inference_timesteps = 10 + self._cfg_value = 2.0 + + # TODO: implement sliding-window VAE decode (nanovllm pattern) + # for O(1) per-step streaming. Current impl re-decodes all patches. 
+ + @property + def tts(self) -> nn.Module: + assert self._tts is not None, "Model not loaded yet" + return self._tts + + # -------------------- vllm hooks -------------------- + + def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor: + """Embed input IDs using native base_lm with scale_emb.""" + embeds = self.tts.base_lm.embed_tokens(input_ids) + return embeds * self.tts.config.lm_config.scale_emb + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: Any, + ) -> torch.Tensor | IntermediateTensors: + """Full VoxCPM2 AR step: base_lm → FSQ → residual_lm → diffusion.""" + # Always run scaffold model to keep FlashInfer/attention happy + model_output = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) + if isinstance(model_output, IntermediateTensors): + return model_output + scaffold_hidden = model_output + if isinstance(scaffold_hidden, tuple): + scaffold_hidden = scaffold_hidden[0] + + # Real computation: use native modules + has_infos = bool(getattr(self, "_current_step_infos", None)) + is_prefill = scaffold_hidden.shape[0] > 1 + + if is_prefill and has_infos: + self._forward_prefill(inputs_embeds, scaffold_hidden.device) + # Return scaffold output (right shape for engine) — our side + # computation results are stored in instance state + return scaffold_hidden + + if not is_prefill and hasattr(self, "_prev_feat_embed"): + self._forward_decode(inputs_embeds, scaffold_hidden.device) + return scaffold_hidden + + return scaffold_hidden + + def _build_prefill_inputs(self, text: str, dev: Any): + """Build text_token / audio_feat / masks like native _generate_with_prompt_cache. + + Returns a dict with keys: text_token, audio_feat, text_mask, audio_mask, + prefix_feat_cond. Handles zero-shot, reference (voice clone), continuation, + and ref_continuation modes. 
+ """ + tts = self.tts + dtype = self._side_dtype + cache = getattr(self, "_prompt_cache", None) + mode = cache.get("mode", "continuation") if cache else "zero_shot" + + if cache is not None and mode in ("continuation", "ref_continuation"): + full_text = cache.get("prompt_text", "") + text + else: + full_text = text + + text_token = torch.LongTensor(tts.text_tokenizer(full_text)) + text_token = torch.cat( + [ + text_token, + torch.tensor([tts.audio_start_token], dtype=torch.int32, device=text_token.device), + ], + dim=-1, + ) + text_length = text_token.shape[0] + latent_dim = tts.audio_vae.latent_dim + patch_size = tts.patch_size + + if mode in ("zero_shot", "continuation"): + prompt_audio_feat = ( + cache["audio_feat"] if cache else torch.empty((0, patch_size, latent_dim), dtype=torch.float32) + ) + audio_length = prompt_audio_feat.size(0) + text_pad_token = torch.zeros(audio_length, dtype=torch.int32) + text_pad_feat = torch.zeros((text_length, patch_size, latent_dim), dtype=torch.float32) + text_token = torch.cat([text_token, text_pad_token]) + audio_feat = torch.cat([text_pad_feat, prompt_audio_feat], dim=0) + text_mask = torch.cat( + [ + torch.ones(text_length, dtype=torch.int32), + torch.zeros(audio_length, dtype=torch.int32), + ] + ) + audio_mask = torch.cat( + [ + torch.zeros(text_length, dtype=torch.int32), + torch.ones(audio_length, dtype=torch.int32), + ] + ) + elif mode == "reference": + ref_audio_feat = cache["ref_audio_feat"] + ref_tokens, ref_feats, ref_t_mask, ref_a_mask = tts._make_ref_prefix(ref_audio_feat, text_token.device) + text_pad_feat = torch.zeros((text_length, patch_size, latent_dim), dtype=torch.float32) + text_token = torch.cat([ref_tokens.cpu(), text_token]) + audio_feat = torch.cat([ref_feats.cpu(), text_pad_feat], dim=0) + text_mask = torch.cat([ref_t_mask.cpu(), torch.ones(text_length, dtype=torch.int32)]) + audio_mask = torch.cat([ref_a_mask.cpu(), torch.zeros(text_length, dtype=torch.int32)]) + else: + # ref_continuation + ref_audio_feat = cache["ref_audio_feat"] + prompt_audio_feat = cache["audio_feat"] + prompt_audio_length = prompt_audio_feat.size(0) + ref_tokens, ref_feats, ref_t_mask, ref_a_mask = tts._make_ref_prefix(ref_audio_feat, text_token.device) + prompt_pad_token = torch.zeros(prompt_audio_length, dtype=torch.int32) + text_pad_feat = torch.zeros((text_length, patch_size, latent_dim), dtype=torch.float32) + text_token = torch.cat([ref_tokens.cpu(), text_token, prompt_pad_token]) + audio_feat = torch.cat([ref_feats.cpu(), text_pad_feat, prompt_audio_feat], dim=0) + text_mask = torch.cat( + [ + ref_t_mask.cpu(), + torch.ones(text_length, dtype=torch.int32), + torch.zeros(prompt_audio_length, dtype=torch.int32), + ] + ) + audio_mask = torch.cat( + [ + ref_a_mask.cpu(), + torch.zeros(text_length, dtype=torch.int32), + torch.ones(prompt_audio_length, dtype=torch.int32), + ] + ) + + return { + "text_token": text_token.unsqueeze(0).to(dev), + "audio_feat": audio_feat.unsqueeze(0).to(dev).to(dtype), + "text_mask": text_mask.unsqueeze(0).to(dev), + "audio_mask": audio_mask.unsqueeze(0).to(dev), + } + + def _forward_prefill(self, inputs_embeds: torch.Tensor, dev: Any) -> torch.Tensor: + """Prefill: build combined embeds, run base_lm + residual_lm + first diffusion. + + Uses the same path as native ``VoxCPM2Model._inference`` so zero-shot, + voice cloning (reference), continuation, and ref_continuation modes + all share the same code. 
+ """ + tts = self.tts + dtype = self._side_dtype + text = getattr(self, "_prefill_text", None) + if text is None: + # Fallback (should not hit at runtime; preprocess sets this) + text = "" + + inputs = self._build_prefill_inputs(text, dev) + text_token = inputs["text_token"] + feat = inputs["audio_feat"] + text_mask = inputs["text_mask"] + feat_mask = inputs["audio_mask"] + + # Compose combined_embed exactly like native _inference + feat_embed = tts.feat_encoder(feat) + feat_embed = tts.enc_to_lm_proj(feat_embed) + scale_emb = tts.config.lm_config.scale_emb if tts.config.lm_config.use_mup else 1.0 + text_embed = tts.base_lm.embed_tokens(text_token) * scale_emb + combined_embed = text_mask.unsqueeze(-1) * text_embed + feat_mask.unsqueeze(-1) * feat_embed + + # last audio patch becomes initial prefix_feat_cond (zeros for zero-shot, + # last reference/prompt patch for voice clone / continuation) + prefix_feat_cond = ( + feat[:, -1, ...] + if feat.shape[1] > 0 + else torch.zeros(1, tts.patch_size, tts.feat_dim, device=dev, dtype=dtype) + ) + + # Base LM prefill + tts.base_lm.setup_cache(1, 4096, dev, dtype) + enc_out, enc_kv = tts.base_lm(inputs_embeds=combined_embed, is_causal=True) + tts.base_lm.kv_cache.fill_caches(enc_kv) + + # FSQ: identity on text positions, quantized on audio positions + enc_outputs = tts.fsq_layer(enc_out) * feat_mask.unsqueeze(-1) + enc_out * text_mask.unsqueeze(-1) + lm_hidden = enc_outputs[:, -1, :] # [1, H] + + logger.info( + "PREFILL: enc shape=%s last_norm=%.4f", + enc_outputs.shape, + lm_hidden.norm().item(), + ) + + # Residual LM prefill + tts.residual_lm.setup_cache(1, 4096, dev, dtype) + residual_input = tts.fusion_concat_proj(torch.cat([enc_outputs, feat_mask.unsqueeze(-1) * feat_embed], dim=-1)) + res_out, res_kv = tts.residual_lm(inputs_embeds=residual_input, is_causal=True) + tts.residual_lm.kv_cache.fill_caches(res_kv) + residual_hidden = res_out[:, -1, :] # [1, H] + + # Precompute stop logits for first compute_logits call + stop_logits = tts.stop_head(tts.stop_actn(tts.stop_proj(lm_hidden))) + self._precomputed_stop_logits = stop_logits.detach() + logger.info("PREFILL stop: %s", stop_logits[0].tolist()) + + # First diffusion step + dit_h = torch.cat( + [ + tts.lm_to_dit_proj(lm_hidden), + tts.res_to_dit_proj(residual_hidden), + ], + dim=-1, + ) + pred_feat = tts.feat_decoder( + mu=dit_h, + patch_size=tts.patch_size, + cond=prefix_feat_cond.transpose(1, 2).contiguous(), + n_timesteps=self._inference_timesteps, + cfg_value=self._cfg_value, + ).transpose(1, 2) # [1, P, D] + + with torch.no_grad(): + curr_embed = tts.enc_to_lm_proj(tts.feat_encoder(pred_feat.unsqueeze(1))).squeeze(1) + + # Store state for decode steps + self._curr_embed_for_next = curr_embed.detach() + self._prev_feat_embed = curr_embed.detach() + self._curr_prefix_feat_cond = pred_feat[0].detach() + self._last_audio_patch = pred_feat.reshape(1, -1).detach().cpu().float() + + logger.info( + "PREFILL patch: norm=%.4f first3=%s", + pred_feat.norm().item(), + pred_feat[0, 0, :3].tolist(), + ) + + return lm_hidden.to(dtype) + + def _forward_decode(self, inputs_embeds: torch.Tensor | None, dev: Any) -> torch.Tensor: + """Decode step: base_lm → FSQ → residual_lm → diffusion.""" + tts = self.tts + dtype = self._side_dtype + + # 1. 
Base LM step with curr_embed from previous diffusion + curr_embed = self._curr_embed_for_next.to(dev, dtype=dtype) + if curr_embed.ndim == 2: + curr_embed_3d = curr_embed.unsqueeze(0) # [1, 1, H] + else: + curr_embed_3d = curr_embed + + step_pos = torch.tensor([tts.base_lm.kv_cache.step()], device=dev) + new_hidden = tts.base_lm.forward_step(curr_embed_3d[:, 0, :], step_pos).clone() + + # 2. FSQ + new_lm_hidden = tts.fsq_layer(new_hidden) + if new_lm_hidden.ndim == 1: + new_lm_hidden = new_lm_hidden.unsqueeze(0) + + # 3. Residual LM step + prev_fe = self._prev_feat_embed.to(dtype) + if prev_fe.ndim == 1: + prev_fe = prev_fe.unsqueeze(0) + res_input = tts.fusion_concat_proj(torch.cat([new_lm_hidden, prev_fe], dim=-1)) + res_step_pos = torch.tensor([tts.residual_lm.kv_cache.step()], device=dev) + new_res_hidden = tts.residual_lm.forward_step(res_input, res_step_pos).clone() + if new_res_hidden.ndim == 1: + new_res_hidden = new_res_hidden.unsqueeze(0) + + # 4. Diffusion + p = self._patch_size + pfc = self._curr_prefix_feat_cond.to(dtype).unsqueeze(0) + + dit_h = torch.cat( + [ + tts.lm_to_dit_proj(new_lm_hidden), + tts.res_to_dit_proj(new_res_hidden), + ], + dim=-1, + ) + pred_feat = tts.feat_decoder( + mu=dit_h, + patch_size=p, + cond=pfc.transpose(1, 2).contiguous(), + n_timesteps=self._inference_timesteps, + cfg_value=self._cfg_value, + ).transpose(1, 2) # [1, P, D] + + # 5. feat_encoder → curr_embed + with torch.no_grad(): + curr_embed = tts.enc_to_lm_proj(tts.feat_encoder(pred_feat.unsqueeze(1))).squeeze(1) + + # 6. Stop logits + stop_logits = tts.stop_head(tts.stop_actn(tts.stop_proj(new_lm_hidden))) + self._precomputed_stop_logits = stop_logits.detach() + + # 7. Store state + self._curr_embed_for_next = curr_embed.detach() + self._prev_feat_embed = curr_embed.detach() + self._curr_prefix_feat_cond = pred_feat[0].detach() + self._last_audio_patch = pred_feat.reshape(1, -1).detach().cpu().float() + + return new_lm_hidden[-1:].detach() + + def compute_logits( + self, + hidden_states: torch.Tensor | OmniOutput, + sampling_metadata: Any = None, + ) -> torch.Tensor | None: + if isinstance(hidden_states, OmniOutput): + hidden_states = hidden_states.text_hidden_states + if hidden_states is None: + return None + + precomputed = getattr(self, "_precomputed_stop_logits", None) + if precomputed is not None: + self._precomputed_stop_logits = None + raw_logits = precomputed[: hidden_states.shape[0]] + else: + # Fallback for warmup + bsz = hidden_states.shape[0] + raw_logits = torch.zeros(bsz, 2, device=hidden_states.device) + raw_logits[:, 0] = 1.0 # continue + + bsz = raw_logits.shape[0] + full_logits = torch.full( + (bsz, self.config.vocab_size), + float("-inf"), + device=raw_logits.device, + dtype=raw_logits.dtype, + ) + full_logits[:, 0] = raw_logits[:, 0] # continue + full_logits[:, 1] = raw_logits[:, 1] # stop + return full_logits + + # -------------------- Omni output -------------------- + + def make_omni_output(self, model_outputs: torch.Tensor | OmniOutput, **kwargs: Any) -> OmniOutput: + if isinstance(model_outputs, OmniOutput): + return model_outputs + + hidden = model_outputs + patch = getattr(self, "_last_audio_patch", None) + mm: dict[str, Any] = {} + + if patch is not None: + self._last_audio_patch = None + self._accumulated_patches.append(patch.clone()) + + # Decode all accumulated patches → full audio waveform. + # TODO: implement sliding-window VAE decode (nanovllm pattern) + # for O(1) per-step streaming instead of O(N) re-decode. 
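+        # Shape walkthrough (defaults patch_size P=4, feat_dim D=64): each stored
+        # patch is [1, P*D] = [1, 256]; after N steps all_p is [N, 256], reshaped
+        # to [1, N*P, D] and transposed to [1, D, N*P] before the VAE decodes it
+        # into a single 1-D waveform at 48 kHz.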
+ if self._accumulated_patches: + all_p = torch.cat(self._accumulated_patches, dim=0) + d = self._feat_dim + from einops import rearrange + + feat = rearrange(all_p.float().reshape(1, -1, d), "b t d -> b d t") + with torch.no_grad(): + audio = self.tts.audio_vae.decode(feat.to(self._device)).reshape(-1).detach().cpu().float() + + mm["model_outputs"] = [audio] + mm["sr"] = [torch.tensor(48000, dtype=torch.int32)] + + return OmniOutput( + text_hidden_states=hidden, + multimodal_outputs=mm, + ) + + # -------------------- preprocess / postprocess -------------------- + + def preprocess( + self, + input_ids: torch.Tensor, + input_embeds: torch.Tensor | None, + **info_dict: Any, + ) -> tuple[torch.Tensor, torch.Tensor, dict[str, Any]]: + additional_information = info_dict.get("additional_information") + if isinstance(additional_information, dict): + merged = {k: v for k, v in info_dict.items() if k != "additional_information"} + for k, v in additional_information.items(): + merged.setdefault(k, v) + info_dict = merged + + span_len = int(input_ids.shape[0]) + dev = input_ids.device + + if span_len > 1: + # ---- Prefill ---- + # Decode the text from input_ids for native-matching tokenization. + # Speech API tokenizes with BOS; we use the detokenized string so + # native's ``text_tokenizer`` produces the exact same tokens as + # ``generate()``. + ids = input_ids.tolist() + if ids and ids[0] == self.config.bos_token_id: + ids = ids[1:] + text = self.tts.text_tokenizer.tokenizer.decode(ids, skip_special_tokens=True) + self._prefill_text = text + + # Voice clone / continuation: build prompt cache from info_dict. + ref_audio = info_dict.get("reference_audio") or info_dict.get("ref_audio") + prompt_audio = info_dict.get("prompt_audio") + prompt_text = info_dict.get("prompt_text") + if isinstance(ref_audio, list): + ref_audio = ref_audio[0] if ref_audio else None + if isinstance(prompt_audio, list): + prompt_audio = prompt_audio[0] if prompt_audio else None + if isinstance(prompt_text, list): + prompt_text = prompt_text[0] if prompt_text else None + + self._prompt_cache = None + if ref_audio or (prompt_audio and prompt_text): + try: + self._prompt_cache = self.tts.build_prompt_cache( + prompt_text=prompt_text, + prompt_wav_path=prompt_audio, + reference_wav_path=ref_audio, + ) + except Exception as e: + logger.warning("build_prompt_cache failed: %s; falling back to zero-shot", e) + self._prompt_cache = None + + # Reset per-request state (fresh generation) + self._accumulated_patches = [] + if hasattr(self, "_prev_feat_embed"): + del self._prev_feat_embed + if hasattr(self, "_curr_embed_for_next"): + del self._curr_embed_for_next + + # Store info for forward + self._current_step_infos = [{"is_prefill": True}] + + # The scaffold model still needs embeddings sized to span_len for + # its warmup/attention bookkeeping. Native modules use the full + # (potentially longer) sequence internally. Pass zeros — scaffold + # output is discarded. 
+ embeds = torch.zeros( + span_len, + self.config.hidden_size, + device=dev, + dtype=self._side_dtype, + ) + + return input_ids, embeds, {} + + # ---- Decode ---- + curr_embed = getattr(self, "_curr_embed_for_next", None) + if curr_embed is not None: + inputs_embeds = curr_embed.to(dev, dtype=self._side_dtype).reshape(1, -1) + else: + inputs_embeds = torch.zeros( + 1, + self.config.hidden_size, + device=dev, + dtype=self._side_dtype, + ) + + self._current_step_infos = [{}] + return input_ids, inputs_embeds, {} + + def postprocess(self, hidden_states: torch.Tensor, **info: Any) -> dict[str, Any]: + return {} + + # -------------------- Weight loading -------------------- + + # Weight mapping for vllm scaffold + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"base_lm.": "model."}) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load scaffold weights via vllm + native model for computation.""" + + # Filter: only pass base_lm weights to the scaffold + def _base_lm_only(ws): + for name, tensor in ws: + if name.startswith("base_lm."): + yield name, tensor + + loader = AutoWeightsLoader(self) + loaded = loader.load_weights(_base_lm_only(weights), mapper=self.hf_to_vllm_mapper) + + # Load the full native model for actual computation + model_path = self.vllm_config.model_config.model + VoxCPM = import_voxcpm2_core() + native = VoxCPM.from_pretrained(model_path, load_denoiser=False, optimize=False) + self._tts = native.tts_model.to("cuda") + self._side_dtype = self._tts.fusion_concat_proj.weight.dtype + self._device = "cuda" + + self._patch_size = self._tts.patch_size + self._feat_dim = self._tts.feat_dim + + logger.info( + "Loaded native VoxCPM2 (patch_size=%d, feat_dim=%d, dtype=%s)", + self._patch_size, + self._feat_dim, + self._side_dtype, + ) + return loaded diff --git a/vllm_omni/model_executor/stage_configs/voxcpm2.yaml b/vllm_omni/model_executor/stage_configs/voxcpm2.yaml new file mode 100644 index 0000000000..de15c88de4 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/voxcpm2.yaml @@ -0,0 +1,36 @@ +# VoxCPM2 native AR single-stage pipeline. +# Uses native MiniCPM4 base_lm + native VAE decode in one stage. +# All computation (base_lm, residual_lm, diffusion, VAE) in forward(). 
+stage_args: + - stage_id: 0 + stage_type: llm + is_comprehension: true + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: bfloat16 + model_stage: latent_generator + model_arch: VoxCPM2TalkerForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: true + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.9 + distributed_executor_backend: "mp" + max_num_batched_tokens: 4096 + max_model_len: 4096 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.0 + stop_token_ids: [1] + final_output: true + final_output_type: audio diff --git a/vllm_omni/transformers_utils/configs/__init__.py b/vllm_omni/transformers_utils/configs/__init__.py index 59b23f9149..5f957c2f6d 100644 --- a/vllm_omni/transformers_utils/configs/__init__.py +++ b/vllm_omni/transformers_utils/configs/__init__.py @@ -17,6 +17,7 @@ "FishSpeechConfig": "vllm_omni.transformers_utils.configs.fish_speech", "FishSpeechSlowARConfig": "vllm_omni.transformers_utils.configs.fish_speech", "FishSpeechFastARConfig": "vllm_omni.transformers_utils.configs.fish_speech", + "VoxCPM2Config": "vllm_omni.transformers_utils.configs.voxcpm2", } __all__ = [ @@ -27,6 +28,7 @@ "FishSpeechConfig", "FishSpeechSlowARConfig", "FishSpeechFastARConfig", + "VoxCPM2Config", ] @@ -47,3 +49,4 @@ def __dir__(): # run as soon as `vllm_omni.transformers_utils.configs` is imported. from vllm_omni.transformers_utils.configs import fish_speech as _fish_speech # noqa: F401, E402 from vllm_omni.transformers_utils.configs import mammoth_moda2 as _mammoth_moda2 # noqa: F401, E402 +from vllm_omni.transformers_utils.configs import voxcpm2 as _voxcpm2 # noqa: F401, E402 diff --git a/vllm_omni/transformers_utils/configs/voxcpm2.py b/vllm_omni/transformers_utils/configs/voxcpm2.py new file mode 100644 index 0000000000..c625284bd6 --- /dev/null +++ b/vllm_omni/transformers_utils/configs/voxcpm2.py @@ -0,0 +1,153 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import math + +from transformers import AutoConfig +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_rope_utils import rope_config_validation + + +class VoxCPM2Config(PretrainedConfig): + """Configuration for VoxCPM2 native AR integration. + + The HuggingFace checkpoint stores LM parameters inside a nested + ``lm_config`` dict. This class hoists them to top-level attributes + so that vllm's ``MiniCPMModel`` can consume them directly. + + vllm's MiniCPM **always** applies muP scaling (scale_emb, scale_depth, + dim_model_base). 
VoxCPM2 was trained with ``use_mup=false``, so we + neutralise the scalings: + * ``scale_emb = 1.0`` + * ``scale_depth = sqrt(num_hidden_layers)`` (cancels the division) + * ``dim_model_base = hidden_size`` (makes scale_width = 1.0) + """ + + model_type = "voxcpm2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + # -- top-level VoxCPM2 params -- + architecture: str = "voxcpm2", + lm_config: dict | None = None, + encoder_config: dict | None = None, + dit_config: dict | None = None, + audio_vae_config: dict | None = None, + patch_size: int = 4, + feat_dim: int = 64, + residual_lm_num_layers: int = 8, + residual_lm_no_rope: bool = True, + scalar_quantization_latent_dim: int = 512, + scalar_quantization_scale: int = 9, + max_length: int = 8192, + device: str = "cuda", + dtype: str = "bfloat16", + # -- LM defaults (overridden by lm_config if present) -- + bos_token_id: int = 1, + eos_token_id: int = 2, + vocab_size: int = 73448, + hidden_size: int = 2048, + intermediate_size: int = 6144, + max_position_embeddings: int = 32768, + num_attention_heads: int = 16, + num_hidden_layers: int = 28, + num_key_value_heads: int = 2, + rms_norm_eps: float = 1e-5, + rope_theta: float = 10000.0, + rope_scaling: dict | None = None, + **kwargs, + ): + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + self.architecture = architecture + + # -- VoxCPM2-specific fields -- + self.lm_config = lm_config or {} + self.encoder_config = encoder_config or {} + self.dit_config = dit_config or {} + self.audio_vae_config = audio_vae_config or {} + self.patch_size = patch_size + self.feat_dim = feat_dim + self.residual_lm_num_layers = residual_lm_num_layers + self.residual_lm_no_rope = residual_lm_no_rope + self.scalar_quantization_latent_dim = scalar_quantization_latent_dim + self.scalar_quantization_scale = scalar_quantization_scale + self.max_length = max_length + self.device = device + self.dtype = dtype + + # -- Hoist LM parameters to top-level for MiniCPMModel -- + lm = self.lm_config + self.vocab_size = lm.get("vocab_size", vocab_size) + self.hidden_size = lm.get("hidden_size", hidden_size) + self.intermediate_size = lm.get("intermediate_size", intermediate_size) + self.max_position_embeddings = lm.get("max_position_embeddings", max_position_embeddings) + self.num_attention_heads = lm.get("num_attention_heads", num_attention_heads) + self.num_hidden_layers = lm.get("num_hidden_layers", num_hidden_layers) + self.num_key_value_heads = lm.get("num_key_value_heads", num_key_value_heads) + self.rms_norm_eps = lm.get("rms_norm_eps", rms_norm_eps) + self.rope_theta = lm.get("rope_theta", rope_theta) + + # MiniCPM-specific: kv_channels overrides head_dim when set. + kv_channels = lm.get("kv_channels") + if kv_channels is not None: + self.head_dim = kv_channels + else: + self.head_dim = self.hidden_size // self.num_attention_heads + + # MiniCPM requires hidden_act; VoxCPM2 uses SiLU. + self.hidden_act = "silu" + self.hidden_act_param = 0.0 + self.tie_word_embeddings = False + self.num_experts = 0 + + # -- muP scaling -- + # Native VoxCPM2 MiniCPM gates scale_depth behind use_mup: + # use_mup=True → residual += h * (scale_depth / sqrt(N)) + # use_mup=False → residual += h (plain add, no scaling) + # But vllm's MiniCPMModel ALWAYS applies scale_depth / sqrt(N). + # Native applies scale_emb externally; vllm applies it in embed_input_ids. 
+ use_mup = lm.get("use_mup", False) + self.scale_emb = lm.get("scale_emb", 1.0) + if use_mup: + self.scale_depth = lm.get("scale_depth", 1.0) + self.dim_model_base = lm.get("dim_model_base", self.hidden_size) + else: + # Neutralize: scale_depth/sqrt(N) = 1.0, scale_width = 1.0 + self.scale_depth = math.sqrt(self.num_hidden_layers) + self.dim_model_base = self.hidden_size + + # -- RoPE scaling (longrope) -- + raw_rope = lm.get("rope_scaling", rope_scaling) + if raw_rope is not None: + self.rope_scaling = dict(raw_rope) + # HF expects "rope_type" not "type" + if "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling.pop("type") + # longrope requires "factor" (used by HF validation) + if "factor" not in self.rope_scaling: + self.rope_scaling["factor"] = 1.0 + rope_config_validation(self) + + # vllm's MiniCPMAttention reads config.rope_parameters (a dict + # with rope_type, theta, scaling factors, etc.). HF transformers + # only auto-computes this for known model_types; for custom + # types we must build it manually. + if not getattr(self, "rope_parameters", None): + rp = dict(self.rope_scaling) + rp["rope_theta"] = self.rope_theta + self.rope_parameters = rp + else: + self.rope_scaling = None + + def get_text_config(self, **kwargs): + """Return self as the text config — LM attributes are top-level.""" + return self + + +AutoConfig.register("voxcpm2", VoxCPM2Config) + +__all__ = ["VoxCPM2Config"] From 001f2e32e2cac7b86bdbbd9123e7d282cf59e2ca Mon Sep 17 00:00:00 2001 From: teith <123115827+teith@users.noreply.github.com> Date: Sat, 11 Apr 2026 04:46:38 +0200 Subject: [PATCH 122/204] [BUG FIX]: prevent EngineCore crash when Qwen TTS Base task is missing ref_text (#2203) Signed-off-by: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Co-authored-by: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> --- .../openai_api/test_serving_speech.py | 20 +++++++++++++++++++ .../entrypoints/openai/serving_speech.py | 7 +++++++ .../models/qwen3_tts/qwen3_tts_talker.py | 15 +++++++++----- 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 554164a59c..06b6f5c16c 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -752,6 +752,26 @@ def test_validate_tts_request_base_empty_ref_text(self, speech_server): ) assert speech_server._validate_tts_request(req) is None + @pytest.mark.parametrize( + "ref_text", + [None, "", " "], + ids=["none", "empty", "whitespace"], + ) + def test_validate_base_task_missing_ref_text_returns_400(self, speech_server, ref_text): + """Regression: Base task without ref_text must return 400, not crash EngineCore. 
+ + See https://github.com/vllm-project/vllm-omni/pull/2203 + """ + req = OpenAICreateSpeechRequest( + input="Hello", + task_type="Base", + ref_audio="data:audio/wav;base64,abc", + ref_text=ref_text, + ) + result = speech_server._validate_tts_request(req) + assert result is not None, f"ref_text={ref_text!r} should be rejected" + assert "ref_text" in result + def test_validate_tts_request_customvoice_no_speakers(self, speech_server): """CustomVoice on a model with no speakers returns 400 instead of crashing engine.""" req = OpenAICreateSpeechRequest(input="Hello", task_type="CustomVoice") diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 87ef6a4e9b..52944d5082 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -919,6 +919,13 @@ def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str fmt_err = self._validate_ref_audio_format(request.ref_audio) if fmt_err: return fmt_err + if not getattr(request, "x_vector_only_mode", False) and ( + not request.ref_text or not request.ref_text.strip() + ): + return ( + "Base task requires non-empty 'ref_text' (transcript of " + "the reference audio) unless 'x_vector_only_mode' is enabled" + ) # Validate cross-parameter dependencies if task_type != "Base": diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py index f89012ec45..6b7b688f15 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_talker.py @@ -1439,11 +1439,16 @@ def _normalize_voice_clone_prompt(raw: object) -> dict[str, object] | None: ) if ref_ids is None: ref_text = _as_singleton(info_dict.get("ref_text")) - if not isinstance(ref_text, str) or not ref_text.strip(): - raise ValueError("Base in-context voice cloning requires `ref_text` or tokenized `ref_ids`.") - ref_ids = tok(self._build_ref_text(ref_text), return_tensors="pt", padding=False)["input_ids"].to( - device=input_ids.device - ) + if isinstance(ref_text, str) and ref_text.strip(): + ref_ids = tok( + self._build_ref_text(ref_text), + return_tensors="pt", + padding=False, + )["input_ids"].to(device=input_ids.device) + else: + logger.warning("Base ICL: ref_text/ref_ids missing, falling back to x-vector-only mode.") + in_context_mode = False + if in_context_mode: icl_input_embed, trailing_text_hidden = self._generate_icl_prompt( text_id=input_ids[:, 3:-5], ref_id=ref_ids[:, 3:-2], From d1fef41266a3625675e780f8955b812ea556d50a Mon Sep 17 00:00:00 2001 From: Samit <285365963@qq.com> Date: Sat, 11 Apr 2026 16:50:03 +0800 Subject: [PATCH 123/204] [Doc] Add LTX-2 online serving deployment recipes with optimization benchmarks (#1971) Signed-off-by: samithuang <285365963@qq.com> Signed-off-by: Samit <285365963@qq.com> --- .../examples/online_serving/text_to_video.md | 108 +++++++++++++++++- .../online_serving/text_to_video/README.md | 98 +++++++++++++++- .../text_to_video/run_curl_ltx2.sh | 66 +++++++++++ .../text_to_video/run_server_ltx2.sh | 84 ++++++++++++++ 4 files changed, 348 insertions(+), 8 deletions(-) create mode 100644 examples/online_serving/text_to_video/run_curl_ltx2.sh create mode 100644 examples/online_serving/text_to_video/run_server_ltx2.sh diff --git a/docs/user_guide/examples/online_serving/text_to_video.md b/docs/user_guide/examples/online_serving/text_to_video.md index d58296fcc7..01e6d9d464 100644 --- 
a/docs/user_guide/examples/online_serving/text_to_video.md +++ b/docs/user_guide/examples/online_serving/text_to_video.md @@ -3,17 +3,28 @@ Source . -This example demonstrates how to deploy the Wan2.2 text-to-video model for online video generation using vLLM-Omni. +This example demonstrates how to deploy text-to-video models for online video generation using vLLM-Omni. -## Start Server +## Supported Models -### Basic Start +| Model | Model ID | +|-------|----------| +| Wan2.1 T2V (1.3B) | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` | +| Wan2.1 T2V (14B) | `Wan-AI/Wan2.1-T2V-14B-Diffusers` | +| Wan2.2 T2V | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | +| LTX-2 | `Lightricks/LTX-2` | + +## Wan2.2 T2V + +### Start Server + +#### Basic Start ```bash vllm serve Wan-AI/Wan2.2-T2V-A14B-Diffusers --omni --port 8091 ``` -### Start with Parameters +#### Start with Parameters Or use the startup script: @@ -234,8 +245,94 @@ while true; do done ``` +## LTX-2 + +### Start Server + +#### Basic Start + +```bash +vllm serve Lightricks/LTX-2 --omni --port 8098 \ + --enforce-eager --flow-shift 1.0 --boundary-ratio 1.0 +``` + +#### Start with Optimization Presets + +Use the LTX-2 startup script with built-in optimization presets: + +```bash +# Baseline (1 GPU, eager) +bash run_server_ltx2.sh baseline + +# 4-GPU Ulysses sequence parallelism (lossless) +bash run_server_ltx2.sh ulysses4 + +# Cache-DiT lossy acceleration (1 GPU, ~1.4× speedup) +bash run_server_ltx2.sh cache-dit + +# Best combo: 4-GPU Ulysses SP + Cache-DiT (~2.2× speedup) +bash run_server_ltx2.sh best-combo +``` + +#### Optimization Benchmarks + +Benchmarked on H800, online serving (480×768, 41 frames, 20 steps, `seed=42`). +"Inference" is the server-reported inference time; excludes HTTP/poll overhead. + +| Preset | Server Command | Inference (s) | Speedup | Type | +|--------|---------------|---------------|---------|------| +| `baseline` | `--enforce-eager` | 10.3 | 1.00× | — | +| `compile` | *(default, no --enforce-eager)* | ~10.3 (warm) | ~1.00× | Lossless | +| `ulysses4` | `--enforce-eager --usp 4` | ~10.3 | ~1.00× | Lossless | +| `cache-dit` | `--enforce-eager --cache-backend cache_dit` | 7.4 avg | ~1.4× | Lossy | +| `best-combo` | `--enforce-eager --usp 4 --cache-backend cache_dit` | 4.7 avg | **~2.2×** | Lossless + Lossy | + +**Observations**: +- **torch.compile**: On H800, warm-request inference time matches the eager baseline (~10.3s). + The first request pays ~6s compilation overhead. Benefit depends on model architecture and GPU. +- **Ulysses SP (4 GPU)**: No measurable speedup alone for 41-frame generation at this resolution. + Communication overhead outweighs gains at this sequence length. +- **Cache-DiT**: Inference varies per request (6–10s) due to dynamic caching decisions. + Average is ~7.4s (~1.4× speedup) with slight quality tradeoff. +- **Best combo**: 4-GPU Ulysses SP + Cache-DiT synergize well — Cache-DiT reduces per-step + computation, making the communication overhead of Ulysses SP worthwhile. Average ~4.7s + (~2.2× speedup). +- **FP8 quantization**: Reduces VRAM but does not speed up LTX-2 on H800 (compute-bound). 
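+
+For scripted use from Python (for example, to drive your own benchmark runs), the
+same async video-job flow shown in the curl section below can be called with
+`requests`. This is only a sketch: the endpoint paths and form-field names mirror
+the curl example; the polling interval and output path are arbitrary.
+
+```python
+import time
+
+import requests
+
+BASE = "http://localhost:8098"
+
+# Create the job (multipart form fields, same names as the curl example).
+fields = {
+    "prompt": "A serene lakeside sunrise with mist over the water.",
+    "width": "768",
+    "height": "480",
+    "num_frames": "41",
+    "fps": "24",
+    "num_inference_steps": "20",
+    "guidance_scale": "3.0",
+    "seed": "42",
+}
+resp = requests.post(f"{BASE}/v1/videos", files={k: (None, v) for k, v in fields.items()})
+video_id = resp.json()["id"]
+
+# Poll until the job leaves the queued/in_progress states.
+while True:
+    job = requests.get(f"{BASE}/v1/videos/{video_id}").json()
+    if job["status"] not in ("queued", "in_progress"):
+        break
+    time.sleep(2)
+
+if job["status"] == "completed":
+    with open("ltx2_output.mp4", "wb") as f:
+        f.write(requests.get(f"{BASE}/v1/videos/{video_id}/content").content)
+```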
+ +**Deployment Recommendations**: +- For **production with quality priority**: use `baseline` with `--enforce-eager` +- For **maximum throughput** (4 GPUs, quality tradeoff): use `best-combo` (~2.2× speedup) +- For **single-GPU throughput**: use `cache-dit` (~1.4× speedup) +- `--enforce-eager` is recommended to avoid torch.compile warmup latency on first request + +### Send Requests (curl) + +```bash +# Using the provided script +bash run_curl_ltx2.sh + +# Or directly +curl -sS -X POST http://localhost:8098/v1/videos \ + -H "Accept: application/json" \ + -F "prompt=A serene lakeside sunrise with mist over the water." \ + -F "width=768" \ + -F "height=480" \ + -F "num_frames=41" \ + -F "fps=24" \ + -F "num_inference_steps=20" \ + -F "guidance_scale=3.0" \ + -F "seed=42" +``` + ## Example materials +??? abstract "response.json" + ``````json + --8<-- "examples/online_serving/text_to_video/response.json" + `````` +??? abstract "run_curl_ltx2.sh" + ``````sh + --8<-- "examples/online_serving/text_to_video/run_curl_ltx2.sh" ??? abstract "run_curl_hunyuan_video_15.sh" ``````sh --8<-- "examples/online_serving/text_to_video/run_curl_hunyuan_video_15.sh" @@ -248,6 +345,9 @@ done ``````sh --8<-- "examples/online_serving/text_to_video/run_server.sh" `````` +??? abstract "run_server_ltx2.sh" + ``````sh + --8<-- "examples/online_serving/text_to_video/run_server_ltx2.sh" ??? abstract "run_server_hunyuan_video_15.sh" ``````sh --8<-- "examples/online_serving/text_to_video/run_server_hunyuan_video_15.sh" diff --git a/examples/online_serving/text_to_video/README.md b/examples/online_serving/text_to_video/README.md index 44e676671f..c01e0602ff 100644 --- a/examples/online_serving/text_to_video/README.md +++ b/examples/online_serving/text_to_video/README.md @@ -1,16 +1,27 @@ # Text-To-Video -This example demonstrates how to deploy the Wan2.2 text-to-video model for online video generation using vLLM-Omni. +This example demonstrates how to deploy text-to-video models for online video generation using vLLM-Omni. -## Start Server +## Supported Models -### Basic Start +| Model | Model ID | +|-------|----------| +| Wan2.1 T2V (1.3B) | `Wan-AI/Wan2.1-T2V-1.3B-Diffusers` | +| Wan2.1 T2V (14B) | `Wan-AI/Wan2.1-T2V-14B-Diffusers` | +| Wan2.2 T2V | `Wan-AI/Wan2.2-T2V-A14B-Diffusers` | +| LTX-2 | `Lightricks/LTX-2` | + +## Wan2.2 T2V + +### Start Server + +#### Basic Start ```bash vllm serve Wan-AI/Wan2.2-T2V-A14B-Diffusers --omni --port 8091 ``` -### Start with Parameters +#### Start with Parameters Or use the startup script: @@ -230,3 +241,82 @@ while true; do sleep 2 done ``` + +## LTX-2 + +### Start Server + +#### Basic Start + +```bash +vllm serve Lightricks/LTX-2 --omni --port 8098 \ + --enforce-eager --flow-shift 1.0 --boundary-ratio 1.0 +``` + +#### Start with Optimization Presets + +Use the LTX-2 startup script with built-in optimization presets: + +```bash +# Baseline (1 GPU, eager) +bash run_server_ltx2.sh baseline + +# 4-GPU Ulysses sequence parallelism (lossless) +bash run_server_ltx2.sh ulysses4 + +# Cache-DiT lossy acceleration (1 GPU, ~1.4× speedup) +bash run_server_ltx2.sh cache-dit + +# Best combo: 4-GPU Ulysses SP + Cache-DiT (~2.2× speedup) +bash run_server_ltx2.sh best-combo +``` + +#### Optimization Benchmarks + +Benchmarked on H800, online serving (480×768, 41 frames, 20 steps, `seed=42`). +"Inference" is the server-reported inference time; excludes HTTP/poll overhead. 
+ +| Preset | Server Command | Inference (s) | Speedup | Type | +|--------|---------------|---------------|---------|------| +| `baseline` | `--enforce-eager` | 10.3 | 1.00× | — | +| `compile` | *(default, no --enforce-eager)* | ~10.3 (warm) | ~1.00× | Lossless | +| `ulysses4` | `--enforce-eager --usp 4` | ~10.3 | ~1.00× | Lossless | +| `cache-dit` | `--enforce-eager --cache-backend cache_dit` | 7.4 avg | ~1.4× | Lossy | +| `best-combo` | `--enforce-eager --usp 4 --cache-backend cache_dit` | 4.7 avg | **~2.2×** | Lossless + Lossy | + +**Observations**: +- **torch.compile**: On H800, warm-request inference time matches the eager baseline (~10.3s). + The first request pays ~6s compilation overhead. Benefit depends on model architecture and GPU. +- **Ulysses SP (4 GPU)**: No measurable speedup alone for 41-frame generation at this resolution. + Communication overhead outweighs gains at this sequence length. +- **Cache-DiT**: Inference varies per request (6–10s) due to dynamic caching decisions. + Average is ~7.4s (~1.4× speedup) with slight quality tradeoff. +- **Best combo**: 4-GPU Ulysses SP + Cache-DiT synergize well — Cache-DiT reduces per-step + computation, making the communication overhead of Ulysses SP worthwhile. Average ~4.7s + (~2.2× speedup). +- **FP8 quantization**: Reduces VRAM but does not speed up LTX-2 on H800 (compute-bound). + +**Deployment Recommendations**: +- For **production with quality priority**: use `baseline` with `--enforce-eager` +- For **maximum throughput** (4 GPUs, quality tradeoff): use `best-combo` (~2.2× speedup) +- For **single-GPU throughput**: use `cache-dit` (~1.4× speedup) +- `--enforce-eager` is recommended to avoid torch.compile warmup latency on first request + +### Send Requests (curl) + +```bash +# Using the provided script +bash run_curl_ltx2.sh + +# Or directly +curl -sS -X POST http://localhost:8098/v1/videos \ + -H "Accept: application/json" \ + -F "prompt=A serene lakeside sunrise with mist over the water." \ + -F "width=768" \ + -F "height=480" \ + -F "num_frames=41" \ + -F "fps=24" \ + -F "num_inference_steps=20" \ + -F "guidance_scale=3.0" \ + -F "seed=42" +``` diff --git a/examples/online_serving/text_to_video/run_curl_ltx2.sh b/examples/online_serving/text_to_video/run_curl_ltx2.sh new file mode 100644 index 0000000000..b82f672eaa --- /dev/null +++ b/examples/online_serving/text_to_video/run_curl_ltx2.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# LTX-2 text-to-video curl example using the async video job API. +# Start the server first: bash run_server_ltx2.sh best-combo + +set -euo pipefail + +BASE_URL="${BASE_URL:-http://localhost:8098}" +OUTPUT_PATH="${OUTPUT_PATH:-ltx2_output.mp4}" +POLL_INTERVAL="${POLL_INTERVAL:-2}" + +PROMPT="${PROMPT:-A serene lakeside sunrise with mist over the water.}" + +create_response=$( + curl -sS -X POST "${BASE_URL}/v1/videos" \ + -H "Accept: application/json" \ + -F "prompt=${PROMPT}" \ + -F "width=768" \ + -F "height=480" \ + -F "num_frames=41" \ + -F "fps=24" \ + -F "num_inference_steps=20" \ + -F "guidance_scale=3.0" \ + -F "seed=42" +) + +video_id="$(echo "${create_response}" | jq -r '.id')" +if [ -z "${video_id}" ] || [ "${video_id}" = "null" ]; then + echo "Failed to create video job:" + echo "${create_response}" | jq . + exit 1 +fi + +echo "Created video job ${video_id}" +echo "${create_response}" | jq . 
+ +while true; do + status_response="$(curl -sS "${BASE_URL}/v1/videos/${video_id}")" + status="$(echo "${status_response}" | jq -r '.status')" + + case "${status}" in + queued|in_progress) + echo "Video job ${video_id} status: ${status}" + sleep "${POLL_INTERVAL}" + ;; + completed) + echo "${status_response}" | jq . + break + ;; + failed) + echo "Video generation failed:" + echo "${status_response}" | jq . + exit 1 + ;; + *) + echo "Unexpected status response:" + echo "${status_response}" | jq . + exit 1 + ;; + esac +done + +curl -sS -L "${BASE_URL}/v1/videos/${video_id}/content" -o "${OUTPUT_PATH}" +echo "Saved video to ${OUTPUT_PATH}" diff --git a/examples/online_serving/text_to_video/run_server_ltx2.sh b/examples/online_serving/text_to_video/run_server_ltx2.sh new file mode 100644 index 0000000000..f4597d3cd2 --- /dev/null +++ b/examples/online_serving/text_to_video/run_server_ltx2.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# LTX-2 online serving startup script with optimization presets. +# +# Usage: +# bash run_server_ltx2.sh # baseline (1 GPU, eager) +# bash run_server_ltx2.sh ulysses4 # 4-GPU Ulysses SP +# bash run_server_ltx2.sh cache-dit # 1 GPU + Cache-DiT +# bash run_server_ltx2.sh best-combo # 4-GPU Ulysses SP + Cache-DiT +# +# Online serving benchmarks on H800 (480×768, 41 frames, 20 steps): +# baseline : 10.3s inference (1.00×) +# compile : ~10.3s warm (~1.00×) first request +6s warmup +# ulysses4 : ~10.3s (~1.00×) no gain at 41 frames +# cache-dit : 7.4s avg (~1.4×) lossy, variable per request +# best-combo : 4.7s avg (~2.2×) 4-GPU ulysses + cache-dit + +set -euo pipefail + +MODEL="${MODEL:-Lightricks/LTX-2}" +PORT="${PORT:-8098}" +FLOW_SHIFT="${FLOW_SHIFT:-1.0}" +BOUNDARY_RATIO="${BOUNDARY_RATIO:-1.0}" + +PRESET="${1:-baseline}" + +EXTRA_ARGS=() +case "$PRESET" in + baseline) + echo "=== LTX-2 Preset: baseline (1 GPU, enforce-eager) ===" + EXTRA_ARGS+=(--enforce-eager) + ;; + ulysses2) + echo "=== LTX-2 Preset: 2-GPU Ulysses SP (lossless) ===" + EXTRA_ARGS+=(--enforce-eager --usp 2) + ;; + ulysses4) + echo "=== LTX-2 Preset: 4-GPU Ulysses SP (lossless) ===" + EXTRA_ARGS+=(--enforce-eager --usp 4) + ;; + cache-dit) + echo "=== LTX-2 Preset: Cache-DiT (1 GPU, lossy) ===" + EXTRA_ARGS+=(--enforce-eager --cache-backend cache_dit) + ;; + best-combo) + echo "=== LTX-2 Preset: 4-GPU Ulysses SP + Cache-DiT (best combo) ===" + EXTRA_ARGS+=(--enforce-eager --usp 4 --cache-backend cache_dit) + ;; + compile) + echo "=== LTX-2 Preset: torch.compile (1 GPU, lossless) ===" + # torch.compile is the default (no --enforce-eager) + ;; + *) + echo "Usage: $0 {baseline|ulysses2|ulysses4|cache-dit|best-combo|compile}" + echo "" + echo "Presets:" + echo " baseline - 1 GPU, eager execution (reference)" + echo " ulysses2 - 2-GPU Ulysses SP (lossless)" + echo " ulysses4 - 4-GPU Ulysses SP (lossless)" + echo " cache-dit - 1 GPU + Cache-DiT (lossy, ~1.4× speedup)" + echo " best-combo - 4-GPU Ulysses SP + Cache-DiT (~2.2× speedup)" + echo " compile - 1 GPU + torch.compile (slower first request)" + echo "" + echo "Environment variables:" + echo " MODEL - Model path (default: Lightricks/LTX-2)" + echo " PORT - Server port (default: 8098)" + echo " FLOW_SHIFT - Scheduler flow shift (default: 1.0)" + echo " BOUNDARY_RATIO - Boundary ratio (default: 1.0)" + exit 1 + ;; +esac + +echo "Model: $MODEL" +echo "Port: $PORT" +echo "Flow shift: $FLOW_SHIFT" +echo "Boundary ratio: $BOUNDARY_RATIO" + +vllm serve 
"$MODEL" --omni \ + --port "$PORT" \ + --flow-shift "$FLOW_SHIFT" \ + --boundary-ratio "$BOUNDARY_RATIO" \ + "${EXTRA_ARGS[@]}" From c9e8411d9111ba8605f0786af6e183d439e00182 Mon Sep 17 00:00:00 2001 From: akshatvishu <33392262+akshatvishu@users.noreply.github.com> Date: Sat, 11 Apr 2026 14:57:05 +0530 Subject: [PATCH 124/204] [feature] : add cache-dit for stable-audio-open-1.0 (#1341) Signed-off-by: akshatvishu --- .../diffusion/cache/cache_dit_backend.py | 72 +++++++++++++++++++ .../stable_audio/stable_audio_transformer.py | 2 + 2 files changed, 74 insertions(+) diff --git a/vllm_omni/diffusion/cache/cache_dit_backend.py b/vllm_omni/diffusion/cache/cache_dit_backend.py index a5055a0688..e9f79da4f3 100644 --- a/vllm_omni/diffusion/cache/cache_dit_backend.py +++ b/vllm_omni/diffusion/cache/cache_dit_backend.py @@ -464,6 +464,77 @@ def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool return refresh_cache_context +def enable_cache_for_stable_audio_open(pipeline: Any, cache_config: Any) -> Callable[[int], None]: + """Enable cache-dit for Stable Audio Open pipeline. + + Args: + pipeline: The StableAudioPipeline instance. + cache_config: DiffusionCacheConfig instance with cache configuration. + + Returns: + A refresh function that can be called to update cache context with new num_inference_steps. + """ + db_cache_config = _build_db_cache_config(cache_config) + + calibrator_config = None + if cache_config.enable_taylorseer: + taylorseer_order = cache_config.taylorseer_order + calibrator_config = TaylorSeerCalibratorConfig(taylorseer_order=taylorseer_order) + logger.info(f"TaylorSeer enabled with order={taylorseer_order}") + + # StableAudio is officially registered in CacheDiT as Pattern_3: + # https://github.com/vipshop/cache-dit/blob/69e82bd1/src/cache_dit/caching/block_adapters/__init__.py#L562 + # + # Pattern_3 is required because StableAudioDiT uses cross-attention + # with static encoder_hidden_states that do not change inside the + # transformer block loop. + cache_dit.enable_cache( + BlockAdapter( + transformer=pipeline.transformer, + blocks=pipeline.transformer.transformer_blocks, + forward_pattern=ForwardPattern.Pattern_3, + params_modifiers=[ + ParamsModifier( + cache_config=db_cache_config, + calibrator_config=calibrator_config, + ) + ], + ), + cache_config=db_cache_config, + ) + + def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool = True) -> None: + """Refresh cache context for the transformer with new num_inference_steps. + + Args: + pipeline: The StableAudioPipeline instance. + num_inference_steps: New number of inference steps. + verbose: Whether to log refresh operations. 
+ """ + # Bypass SCM for step counts that don't support predefined masks (e.g., vLLM's 1-step dummy run) + scm_supported_steps = num_inference_steps >= 8 or num_inference_steps in (4, 6) + + if cache_config.scm_steps_mask_policy is None or not scm_supported_steps: + cache_dit.refresh_context(pipeline.transformer, num_inference_steps=num_inference_steps, verbose=verbose) + else: + updated_scm_config = DBCacheConfig().reset( + num_inference_steps=num_inference_steps, + steps_computation_mask=cache_dit.steps_mask( + mask_policy=cache_config.scm_steps_mask_policy, + total_steps=num_inference_steps, + ), + steps_computation_policy=cache_config.scm_steps_policy, + ) + + cache_dit.refresh_context( + pipeline.transformer, + cache_config=updated_scm_config, + verbose=verbose, + ) + + return refresh_cache_context + + def enable_cache_for_sd3(pipeline: Any, cache_config: Any) -> Callable[[int], None]: """Enable cache-dit for StableDiffusion3Pipeline. @@ -1212,6 +1283,7 @@ def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool "Flux2KleinPipeline": enable_cache_for_flux2_klein, "LongCatImagePipeline": enable_cache_for_longcat_image, "LongCatImageEditPipeline": enable_cache_for_longcat_image, + "StableAudioPipeline": enable_cache_for_stable_audio_open, "StableDiffusion3Pipeline": enable_cache_for_sd3, "LTX2Pipeline": enable_cache_for_ltx2, "LTX2ImageToVideoPipeline": enable_cache_for_ltx2, diff --git a/vllm_omni/diffusion/models/stable_audio/stable_audio_transformer.py b/vllm_omni/diffusion/models/stable_audio/stable_audio_transformer.py index 22d56ac1fd..4a4892673f 100644 --- a/vllm_omni/diffusion/models/stable_audio/stable_audio_transformer.py +++ b/vllm_omni/diffusion/models/stable_audio/stable_audio_transformer.py @@ -375,6 +375,8 @@ class StableAudioDiTModel(nn.Module): - Output: [B, out_channels, L] """ + _repeated_blocks = ["StableAudioDiTBlock"] + def __init__( self, od_config: OmniDiffusionConfig | None = None, From 25c0566393b467cb7ff0c7dd57ff8994b5348c6f Mon Sep 17 00:00:00 2001 From: TJian Date: Sat, 11 Apr 2026 19:54:22 +0800 Subject: [PATCH 125/204] [ROCm] [CI] [Bugfix] Resurface CI Signal, fix MHA AR selection, sync with cuda tests (#2340) Signed-off-by: tjtanaa --- .buildkite/test-amd-merge.yml | 116 ++++++------ .buildkite/test-amd-ready.yaml | 179 ++++++++++++------ .buildkite/test-template-amd-omni.j2 | 3 + docker/Dockerfile.rocm | 18 ++ tests/e2e/offline_inference/test_t2i_model.py | 7 +- .../test_zimage_parallelism.py | 12 +- vllm_omni/engine/stage_init_utils.py | 14 ++ vllm_omni/platforms/rocm/platform.py | 28 +++ 8 files changed, 252 insertions(+), 125 deletions(-) diff --git a/.buildkite/test-amd-merge.yml b/.buildkite/test-amd-merge.yml index 60ba0d9d41..b6f2037d18 100644 --- a/.buildkite/test-amd-merge.yml +++ b/.buildkite/test-amd-merge.yml @@ -32,7 +32,6 @@ steps: mirror_hardwares: [amdproduction] grade: Blocking commands: - - export GPU_ARCHS=gfx942 - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - | @@ -63,13 +62,12 @@ steps: mirror_hardwares: [amdproduction] grade: Blocking commands: - - export GPU_ARCHS=gfx942 - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - timeout 15m pytest -s -v -m "core_model and cache and diffusion and not distributed_cuda and L4" -- label: "Diffusion Sequence Parallelism Test" - agent_pool: mi325_2 +- label: "Diffusion Sequence Parallelism Test (Need 4 GPUs)" + agent_pool: mi325_4 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking @@ -77,6 
+75,7 @@ steps: - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - timeout 20m pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py + - timeout 20m pytest -s -v tests/diffusion/distributed/test_ulysses_uaa_perf.py # merge-only tests - label: "Diffusion Tensor Parallelism Test" @@ -95,22 +94,14 @@ steps: commands: - timeout 20m pytest -s -v tests/diffusion/test_diffusion_worker.py -- label: "Benchmark & Engine Test" - agent_pool: mi325_2 +- label: "Engine Test" + agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - | - timeout 20m bash -c ' - set +e - pytest -s -v tests/benchmarks/test_serve_cli.py - EXIT1=\$? - pytest -s -v tests/engine/test_async_omni_engine_abort.py - EXIT2=\$? - exit \$((EXIT1 | EXIT2)) - ' + - timeout 20m pytest -s -v tests/engine/test_async_omni_engine_abort.py - label: "Omni Model Test Qwen2-5-Omni" agent_pool: mi325_2 @@ -121,6 +112,7 @@ steps: - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py + - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen2_5_omni.py -m "advanced_model" --run-level "advanced_model" - label: "Omni Model Test Qwen3-Omni" agent_pool: mi325_2 @@ -131,11 +123,10 @@ steps: - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py - - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model" + - timeout 30m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py tests/e2e/online_serving/test_mimo_audio.py -m "advanced_model" --run-level "advanced_model" - label: "Qwen3-TTS CustomVoice E2E Test" - agent_pool: mi325_2 + agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking @@ -145,21 +136,21 @@ steps: export VLLM_LOGGING_LEVEL=DEBUG export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline_inference/test_qwen3_tts_customvoice.py + pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py tests/e2e/offline_inference/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model" ' - label: "Qwen3-TTS Base E2E Test" - agent_pool: mi325_2 + agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - | - timeout 20m bash -c ' + timeout 30m bash -c ' export VLLM_LOGGING_LEVEL=DEBUG export VLLM_WORKER_MULTIPROC_METHOD=spawn export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline_inference/test_qwen3_tts_base.py + pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py tests/e2e/offline_inference/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model" ' - label: "Diffusion Image Edit Test" @@ -173,43 +164,58 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py -# split Bagel Model Test with H100 (Real Weights) into three tests -- label: "Bagel Text2Img Model Test" - 
agent_pool: mi325_1 - depends_on: amd-build - mirror_hardwares: [amdproduction] - grade: Blocking - commands: - - export GPU_ARCHS=gfx942 - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - export VLLM_LOGGING_LEVEL=DEBUG - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export VLLM_ROCM_USE_AITER_RMSNORM=0 - - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory" -k "rocm" +# TODO: Bagel test on ROCm is very unstable. @tjtanaa +# Need to debug before reneable numerical changes across large PRs +# # split Bagel Model Test with H100 (Real Weights) into three tests +# - label: "Bagel Text2Img Model Test (1/3)" +# agent_pool: mi325_1 +# depends_on: amd-build +# mirror_hardwares: [amdproduction] +# grade: Blocking +# commands: +# - export GPU_ARCHS=gfx942 +# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 +# - export VLLM_LOGGING_LEVEL=DEBUG +# - export VLLM_WORKER_MULTIPROC_METHOD=spawn +# - export VLLM_ROCM_USE_AITER_RMSNORM=0 +# - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory" -k "rocm" -- label: "Bagel Img2Img Model Test" - agent_pool: mi325_1 - depends_on: amd-build - mirror_hardwares: [amdproduction] - grade: Blocking - commands: - - export GPU_ARCHS=gfx942 - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - export VLLM_LOGGING_LEVEL=DEBUG - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export VLLM_ROCM_USE_AITER_RMSNORM=0 - - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model" -k "rocm" +# - label: "Bagel Img2Img Model Test (2/3)" +# agent_pool: mi325_1 +# depends_on: amd-build +# mirror_hardwares: [amdproduction] +# grade: Blocking +# commands: +# - export GPU_ARCHS=gfx942 +# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 +# - export VLLM_LOGGING_LEVEL=DEBUG +# - export VLLM_WORKER_MULTIPROC_METHOD=spawn +# - export VLLM_ROCM_USE_AITER_RMSNORM=0 +# - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model" -k "rocm" + +# - label: "Bagel Online Serving Test (3/3)" +# agent_pool: mi325_1 +# depends_on: amd-build +# mirror_hardwares: [amdproduction] +# grade: Blocking +# commands: +# - export GPU_ARCHS=gfx942 +# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 +# - export VLLM_IMAGE_FETCH_TIMEOUT=60 +# - export VLLM_LOGGING_LEVEL=DEBUG +# - export VLLM_WORKER_MULTIPROC_METHOD=spawn +# - export VLLM_ROCM_USE_AITER_RMSNORM=0 +# - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model" -k "rocm" -- label: "Bagel Online Serving Test" +- label: "Voxtral-TTS E2E Test" agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - - export GPU_ARCHS=gfx942 - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - export VLLM_IMAGE_FETCH_TIMEOUT=60 - - export VLLM_LOGGING_LEVEL=DEBUG - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export VLLM_ROCM_USE_AITER_RMSNORM=0 - - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model" -k "rocm" + - | + timeout 20m bash -c ' + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_WORKER_MULTIPROC_METHOD=spawn + pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" + ' diff --git a/.buildkite/test-amd-ready.yaml 
b/.buildkite/test-amd-ready.yaml index 6e31163acc..ced91635c2 100644 --- a/.buildkite/test-amd-ready.yaml +++ b/.buildkite/test-amd-ready.yaml @@ -9,13 +9,37 @@ steps: - export VLLM_ROCM_USE_AITER=0 - "timeout 20m pytest -v -s -m 'core_model and cpu' --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml" +- label: "Voxtral TTS CUDA Unit Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - timeout 10m pytest -s -v tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py + - label: "Diffusion Model Test" - agent_pool: mi325_2 + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - timeout 30m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "core_model and diffusion" --run-level "core_model" + +- label: "Diffusion Batching Test" + agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - - timeout 20m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "core_model and diffusion" --run-level "core_model" + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py -m "core_model and diffusion" --run-level "core_model" + +- label: "Custom Pipeline Test" + agent_pool: mi325_1 + depends_on: amd-build + mirror_hardwares: [amdproduction] + grade: Blocking + commands: + - timeout 20m pytest -s -v tests/e2e/offline_inference/custom_pipeline/ -m "core_model" - label: "Diffusion Model CPU offloading Test" agent_pool: mi325_1 @@ -23,7 +47,6 @@ steps: mirror_hardwares: [amdproduction] grade: Blocking commands: - - export GPU_ARCHS=gfx942 - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - | @@ -77,47 +100,58 @@ steps: commands: - timeout 20m pytest -s -v tests/diffusion/test_diffusion_worker.py -- label: "Benchmark & Engine Test" - agent_pool: mi325_2 +- label: "Engine Test" + agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - | - timeout 30m bash -c ' - set +e - pytest -s -v tests/benchmarks/test_serve_cli.py - EXIT1=\$? - pytest -s -v tests/engine/test_async_omni_engine_abort.py - EXIT2=\$? - exit \$((EXIT1 | EXIT2)) + timeout 15m bash -c ' + pytest -s -v tests/engine/test_async_omni_engine_abort.py ' -- label: "Omni Model Test Qwen2-5-Omni" - agent_pool: mi325_2 - depends_on: amd-build - mirror_hardwares: [amdproduction] - grade: Blocking - commands: - - export VLLM_LOGGING_LEVEL=DEBUG - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - timeout 17m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py -- label: "Omni Model Test Qwen3-Omni" - agent_pool: mi325_2 +# NOTE: This test is not running any thing. It is skipped and deselected. 
+# Currently it is = 1 skipped, 1 deselected, 17 warnings in 0.03s ====== +# - label: "Omni Model Test Qwen2-5-Omni" +# agent_pool: mi325_2 +# depends_on: amd-build +# mirror_hardwares: [amdproduction] +# grade: Blocking +# commands: +# - export VLLM_LOGGING_LEVEL=DEBUG +# - export VLLM_WORKER_MULTIPROC_METHOD=spawn +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py -m "core_model" --run-level "core_model" + +# - label: "Omni Model Test Qwen3-Omni" +# agent_pool: mi325_2 +# depends_on: amd-build +# mirror_hardwares: [amdproduction] +# grade: Blocking +# commands: +# - export VLLM_LOGGING_LEVEL=DEBUG +# - export VLLM_WORKER_MULTIPROC_METHOD=spawn +# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 +# - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py +# - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model" + +- label: "MiMo-Audio E2E Test with H100" + agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - - export VLLM_LOGGING_LEVEL=DEBUG - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py - - timeout 10m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model" + - | + timeout 30m bash -c ' + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_WORKER_MULTIPROC_METHOD=spawn + pytest -s -v tests/e2e/online_serving/test_mimo_audio.py -m "core_model" --run-level "core_model" + ' - label: "Qwen3-TTS E2E Test" - agent_pool: mi325_2 + agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking @@ -125,55 +159,82 @@ steps: - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "core_model" --run-level "core_model" + - timeout 30m pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "core_model" --run-level "core_model" -- label: "Diffusion Image Edit Test" +- label: "Voxtral-TTS E2E Test" agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - - export GPU_ARCHS=gfx942 - - export VLLM_LOGGING_LEVEL=DEBUG - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py + - | + timeout 20m bash -c ' + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_WORKER_MULTIPROC_METHOD=spawn + pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" + pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" + ' -- label: "Bagel Text2Img Model Test" +- label: "Diffusion Image Edit Test" agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - export GPU_ARCHS=gfx942 - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export VLLM_ROCM_USE_AITER_RMSNORM=0 - - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "core_model" --run-level "core_model" -k "rocm" + - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py -- label: "Bagel Img2Img Model Test" - agent_pool: mi325_1 - depends_on: amd-build - mirror_hardwares: [amdproduction] - grade: Blocking - commands: - - 
export GPU_ARCHS=gfx942 - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - export VLLM_LOGGING_LEVEL=DEBUG - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export VLLM_ROCM_USE_AITER_RMSNORM=0 - - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "core_model" --run-level "core_model" -k "rocm" +# TODO: Bagel test on ROCm is very unstable. @tjtanaa +# Need to debug before reneable numerical changes across large PRs +# - label: "Bagel Text2Img Model Test" +# agent_pool: mi325_1 +# depends_on: amd-build +# mirror_hardwares: [amdproduction] +# grade: Blocking +# commands: +# - export GPU_ARCHS=gfx942 +# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 +# - export VLLM_LOGGING_LEVEL=DEBUG +# - export VLLM_WORKER_MULTIPROC_METHOD=spawn +# - export VLLM_ROCM_USE_AITER_RMSNORM=0 +# - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "core_model" --run-level "core_model" -k "rocm" + +# - label: "Bagel Img2Img Model Test" +# agent_pool: mi325_1 +# depends_on: amd-build +# mirror_hardwares: [amdproduction] +# grade: Blocking +# commands: +# - export GPU_ARCHS=gfx942 +# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 +# - export VLLM_LOGGING_LEVEL=DEBUG +# - export VLLM_WORKER_MULTIPROC_METHOD=spawn +# - export VLLM_ROCM_USE_AITER_RMSNORM=0 +# - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "core_model" --run-level "core_model" -k "rocm" -- label: "Bagel Online Serving Test" +# - label: "Bagel Online Serving Test" +# agent_pool: mi325_1 +# depends_on: amd-build +# mirror_hardwares: [amdproduction] +# grade: Blocking +# commands: +# - export GPU_ARCHS=gfx942 +# - export VLLM_TEST_CLEAN_GPU_MEMORY=1 +# - export VLLM_IMAGE_FETCH_TIMEOUT=60 +# - export VLLM_LOGGING_LEVEL=DEBUG +# - export VLLM_WORKER_MULTIPROC_METHOD=spawn +# - export VLLM_ROCM_USE_AITER_RMSNORM=0 +# - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "core_model" --run-level "core_model" -k "rocm" + +- label: "CosyVoice3-TTS E2E Test" agent_pool: mi325_1 depends_on: amd-build mirror_hardwares: [amdproduction] grade: Blocking commands: - - export GPU_ARCHS=gfx942 - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - export VLLM_IMAGE_FETCH_TIMEOUT=60 - - export VLLM_LOGGING_LEVEL=DEBUG - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - export VLLM_ROCM_USE_AITER_RMSNORM=0 - - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "core_model" --run-level "core_model" -k "rocm" + - | + timeout 20m bash -c ' + pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "core_model" --run-level "core_model" + ' diff --git a/.buildkite/test-template-amd-omni.j2 b/.buildkite/test-template-amd-omni.j2 index 8dc91a1172..f4c386a5fe 100644 --- a/.buildkite/test-template-amd-omni.j2 +++ b/.buildkite/test-template-amd-omni.j2 @@ -48,6 +48,9 @@ DOCKER_BUILDKIT: "1" TEST_COMMAND: |- (command rocm-smi || true) && cd {{ (step.working_dir or default_working_dir) | safe }} +{% if "mi250" in step.agent_pool %} + python3 -m pip uninstall -y amd-aiter +{% endif %} {{ indented_cmd | safe }} priority: 100 {% if step.grade and step.grade == "Blocking" %} diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 8b22bee38b..ec0c5aab0d 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -39,6 +39,24 @@ RUN if [ "${USE_NIGHTLY_BUILD}" = "1" ]; then \ # Step 3: Copy vllm-omni code and install without uv RUN mkdir -p ${COMMON_WORKDIR}/vllm-omni COPY . 
${COMMON_WORKDIR}/vllm-omni + +# This is a workaround to ensure pytest exits with the correct status code in CI tests. +RUN printf '%s\n' \ + 'import os' \ + '' \ + '_exit_code = 1' \ + '' \ + 'def pytest_sessionfinish(session, exitstatus):' \ + ' global _exit_code' \ + ' _exit_code = int(exitstatus)' \ + '' \ + 'def pytest_unconfigure(config):' \ + ' import sys' \ + ' sys.stdout.flush()' \ + ' sys.stderr.flush()' \ + ' os._exit(_exit_code)' \ + > ${COMMON_WORKDIR}/vllm-omni/conftest.py + RUN cd ${COMMON_WORKDIR}/vllm-omni && uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]" --no-build-isolation RUN ln -sf /usr/bin/python3 /usr/bin/python diff --git a/tests/e2e/offline_inference/test_t2i_model.py b/tests/e2e/offline_inference/test_t2i_model.py index 77b2b3aaf2..55a154f61b 100644 --- a/tests/e2e/offline_inference/test_t2i_model.py +++ b/tests/e2e/offline_inference/test_t2i_model.py @@ -26,17 +26,12 @@ # TODO: When NPU support is ready, remove this branch. if current_omni_platform.is_npu(): models = ["Tongyi-MAI/Z-Image-Turbo", "Qwen/Qwen-Image"] -elif current_omni_platform.is_rocm(): - # TODO: When ROCm support is ready, remove this branch. - # Current upstream vLLM has issues running riverclouds/qwen_image_random - # on ROCm - models = ["Tongyi-MAI/Z-Image-Turbo"] @pytest.mark.core_model @pytest.mark.advanced_model @pytest.mark.diffusion -@hardware_test(res={"cuda": "L4", "rocm": "MI325", "xpu": "B60"}, num_cards={"cuda": 1, "rocm": 2, "xpu": 2}) +@hardware_test(res={"cuda": "L4", "rocm": "MI325", "xpu": "B60"}, num_cards={"cuda": 1, "rocm": 1, "xpu": 2}) @pytest.mark.parametrize("model_name", models) def test_diffusion_model(model_name: str, run_level): if run_level == "core_model" and model_name != "riverclouds/qwen_image_random": diff --git a/tests/e2e/offline_inference/test_zimage_parallelism.py b/tests/e2e/offline_inference/test_zimage_parallelism.py index 9d9db16a40..b685704ae4 100644 --- a/tests/e2e/offline_inference/test_zimage_parallelism.py +++ b/tests/e2e/offline_inference/test_zimage_parallelism.py @@ -159,8 +159,8 @@ def _run_zimage_generate( @pytest.mark.parallel @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2}) def test_zimage_tensor_parallel_tp2(tmp_path: Path): - if current_omni_platform.is_npu() or current_omni_platform.is_rocm(): - pytest.skip("Z-Image TP e2e test is only supported on CUDA for now.") + if current_omni_platform.is_npu(): + pytest.skip("Z-Image TP e2e test is only supported on CUDA and ROCm for now.") if not current_omni_platform.is_available() or current_omni_platform.device_count() < 2: pytest.skip("Z-Image TP=2 requires >= 2 devices.") @@ -211,7 +211,9 @@ def test_zimage_tensor_parallel_tp2(tmp_path: Path): ) print(f"Z-Image TP perf (lower is better): tp1_time_s={tp1_time_s:.6f}, tp2_time_s={tp2_time_s:.6f}") - assert tp2_time_s < tp1_time_s, f"Expected TP=2 to be faster than TP=1 (tp1={tp1_time_s}, tp2={tp2_time_s})" + # ROCm is not optimized TP2 can be slower than TP1 + if not current_omni_platform.is_rocm(): + assert tp2_time_s < tp1_time_s, f"Expected TP=2 to be faster than TP=1 (tp1={tp1_time_s}, tp2={tp2_time_s})" print(f"Z-Image TP peak memory (MB): tp1_peak_mem={tp1_peak_mem:.2f}, tp2_peak_mem={tp2_peak_mem:.2f}") assert tp2_peak_mem < tp1_peak_mem, ( @@ -221,8 +223,8 @@ def test_zimage_tensor_parallel_tp2(tmp_path: Path): @pytest.mark.integration def test_zimage_vae_patch_parallel_tp2(tmp_path: Path): - if current_omni_platform.is_npu() or 
current_omni_platform.is_rocm(): - pytest.skip("Z-Image VAE patch parallel e2e test is only supported on CUDA for now.") + if current_omni_platform.is_npu(): + pytest.skip("Z-Image VAE patch parallel e2e test is only supported on CUDA and ROCm for now.") if not current_omni_platform.is_available() or current_omni_platform.device_count() < 2: pytest.skip("Z-Image VAE patch parallel TP=2 requires >= 2 devices.") diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 09195faeca..272df14f80 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -168,6 +168,20 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata: stage_id: int = stage_config.stage_id stage_type: Literal["llm", "diffusion"] = getattr(stage_config, "stage_type", "llm") engine_args = stage_config.engine_args + + if current_omni_platform.is_rocm(): + if engine_args.get("attention_backend") is None: + from vllm._aiter_ops import rocm_aiter_ops + + if rocm_aiter_ops.is_enabled(): + engine_args["attention_backend"] = "ROCM_AITER_FA" + # Before vLLM v0.19.0, the default attention backend is TRITON_ATTN for ROCm. + # Since vLLM v0.19.0, the default attention backend is ROCM_ATTN for ROCm. + # However, the compatibility of ROCM_ATTN with Omni is not guaranteed. + # Therefore, we still use TRITON_ATTN as the default attention backend, + # when the selected_backend is not specified. + engine_args["attention_backend"] = "TRITON_ATTN" + runtime_cfg = getattr(stage_config, "runtime", {}) engine_input_source: list[int] = getattr(stage_config, "engine_input_source", []) final_output: bool = getattr(stage_config, "final_output", False) diff --git a/vllm_omni/platforms/rocm/platform.py b/vllm_omni/platforms/rocm/platform.py index 4479e54f2a..7b0e09c128 100644 --- a/vllm_omni/platforms/rocm/platform.py +++ b/vllm_omni/platforms/rocm/platform.py @@ -16,6 +16,34 @@ class RocmOmniPlatform(OmniPlatform, RocmPlatform): Inherits all ROCm-specific implementations from vLLM's RocmPlatform, and adds Omni-specific interfaces from OmniPlatform. + + + NOTE: AR Attention Backend Overriding Logic: + ------------------------------------------ + Since vLLM v0.19.0, the default attention backend is ROCM_ATTN for ROCm. + However, the compatibility of ROCM_ATTN with Omni is not guaranteed. + Therefore, we still use TRITON_ATTN as the default attention backend, + when the selected_backend is not specified. + + So the behaviour of the attention backend overriding logic currently lives in + extract_stage_metadata in `vllm_omni/engine/stage_init_utils.py` + + ``` + if current_omni_platform.is_rocm(): + print(f"engine_args: {str(engine_args)}") + if engine_args.get("attention_backend") is None: + from vllm._aiter_ops import rocm_aiter_ops + + if rocm_aiter_ops.is_enabled(): + engine_args["attention_backend"] = "ROCM_AITER_FA" + # Before vLLM v0.19.0, the default attention backend is TRITON_ATTN for ROCm. + # Since vLLM v0.19.0, the default attention backend is ROCM_ATTN for ROCm. + # However, the compatibility of ROCM_ATTN with Omni is not guaranteed. + # Therefore, we still use TRITON_ATTN as the default attention backend, + # when the selected_backend is not specified. 
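+            # NOTE (editorial): as written, the assignment below is not in an
+            # else branch, so it also overrides the ROCM_AITER_FA choice above.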
+ engine_args["attention_backend"] = "TRITON_ATTN" + ``` + """ _omni_enum = OmniPlatformEnum.ROCM From eccee21c6ca229286a49d23294e2b09830280fe7 Mon Sep 17 00:00:00 2001 From: Nick Cao Date: Sat, 11 Apr 2026 09:45:26 -0400 Subject: [PATCH 126/204] [Perf] Use global CUDA graph pool for MiMo Audio (#2657) Signed-off-by: Nick Cao --- .../model_executor/models/mimo_audio/mimo_audio_llm.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py b/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py index 56cb8788ee..85fe4b0051 100644 --- a/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py +++ b/vllm_omni/model_executor/models/mimo_audio/mimo_audio_llm.py @@ -50,6 +50,7 @@ PromptUpdate, PromptUpdateDetails, ) +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils.tensor_schema import TensorSchema @@ -150,7 +151,6 @@ def __init__(self, model: "MiMoAudioLLMForConditionalGeneration", max_batch_size dtype = next(model.hidden_states_downcast.parameters()).dtype hidden_size = model.local_config.hidden_size - self.pool = torch.cuda.graph_pool_handle() self.input_tensor = torch.zeros((max_batch_size, 1, hidden_size), dtype=dtype, device=device) self.sampler = MiMoLocalSamplerTensor( temperature=torch.ones(max_batch_size, dtype=torch.float32, device=device), @@ -231,7 +231,7 @@ def capture( cuda_graph = torch.cuda.CUDAGraph() if eager_run_first: model.base_local_forward(input_tensor, local_sampler=sampler) - with torch.cuda.graph(cuda_graph, buffer.pool): + with torch.cuda.graph(cuda_graph, pool=current_platform.get_global_graph_pool()): output_tensor = model.base_local_forward(input_tensor, local_sampler=sampler) return cls( @@ -263,7 +263,6 @@ def __init__(self, model: "MiMoAudioLLMForConditionalGeneration", max_batch_size hidden_size = model.input_local_config.hidden_size group_size = model.group_size - self.pool = torch.cuda.graph_pool_handle() self.input_tensor = torch.zeros((max_batch_size, group_size, hidden_size), dtype=dtype, device=device) self.lock = threading.Lock() @@ -311,7 +310,7 @@ def capture( out = model.input_local_transformer(inputs_embeds=input_tensor, return_dict=True, is_causal=False) _ = out.last_hidden_state - with torch.cuda.graph(cuda_graph, buffer.pool): + with torch.cuda.graph(cuda_graph, pool=current_platform.get_global_graph_pool()): out = model.input_local_transformer(inputs_embeds=input_tensor, return_dict=True, is_causal=False) output_tensor = out.last_hidden_state From f7e8df9dbdade383f1518e3e987858a7fad9c361 Mon Sep 17 00:00:00 2001 From: Juan Pablo Zuluaga <46724788+JuanPZuluaga@users.noreply.github.com> Date: Sat, 11 Apr 2026 18:14:03 +0200 Subject: [PATCH 127/204] [TTS][OmniVoice] Add voice cloning support for OmniVoice TTS (#2676) Signed-off-by: JuanPZuluaga --- tests/e2e/online_serving/test_omnivoice.py | 118 +++++++++++++++++- .../models/omnivoice/pipeline_omnivoice.py | 87 +++++++++++-- .../entrypoints/openai/serving_speech.py | 60 +++++++-- .../models/omnivoice/omnivoice.py | 45 +++---- 4 files changed, 260 insertions(+), 50 deletions(-) diff --git a/tests/e2e/online_serving/test_omnivoice.py b/tests/e2e/online_serving/test_omnivoice.py index ec1981aab2..4a0069f402 100644 --- a/tests/e2e/online_serving/test_omnivoice.py +++ b/tests/e2e/online_serving/test_omnivoice.py @@ -17,9 +17,16 @@ import httpx import pytest -from tests.conftest import OmniServerParams +from tests.conftest import OmniServerParams, 
generate_synthetic_audio from tests.utils import hardware_test +try: + from transformers import HiggsAudioV2TokenizerModel # noqa: F401 + + _HAS_VOICE_CLONE = True +except ImportError: + _HAS_VOICE_CLONE = False + MODEL = "k2-fsa/OmniVoice" STAGE_CONFIG = str( @@ -40,6 +47,16 @@ MIN_AUDIO_BYTES = 5000 +def _get_ref_audio_b64() -> str: + """Generate synthetic speech for reference audio. + + Returns: + Base64 data URL string (data:audio/wav;base64,...) + """ + audio_data = generate_synthetic_audio(duration=2, num_channels=1, sample_rate=24000) + return f"data:audio/wav;base64,{audio_data['base64']}" + + def make_speech_request( host: str, port: int, @@ -82,3 +99,102 @@ def test_speech_auto_voice(self, omni_server) -> None: assert len(response.content) > MIN_AUDIO_BYTES, ( f"Audio too small ({len(response.content)} bytes), expected > {MIN_AUDIO_BYTES}" ) + + +def make_voice_clone_request( + host: str, + port: int, + text: str, + ref_audio_b64: str, + ref_text: str | None = None, + timeout: float = 180.0, +) -> httpx.Response: + """Make a voice cloning request to the /v1/audio/speech endpoint. + + Args: + host: Server host + port: Server port + text: Text to synthesize + ref_audio_b64: Base64-encoded reference audio data URL + ref_text: Optional transcript of reference audio + timeout: Request timeout in seconds + + Returns: + httpx.Response object + """ + url = f"http://{host}:{port}/v1/audio/speech" + payload = { + "input": text, + "ref_audio": ref_audio_b64, + } + if ref_text: + payload["ref_text"] = ref_text + + with httpx.Client(timeout=timeout) as client: + return client.post(url, json=payload) + + +@pytest.mark.skipif(not _HAS_VOICE_CLONE, reason="Voice cloning requires transformers>=5.3.0") +@pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True) +class TestOmniVoiceVoiceCloning: + """E2E tests for OmniVoice voice cloning functionality.""" + + @pytest.mark.core_model + @pytest.mark.omni + @hardware_test(res={"cuda": "L4"}, num_cards=1) + def test_voice_clone_ref_audio_only(self, omni_server) -> None: + """Test voice cloning with ref_audio only (x_vector mode).""" + ref_audio_b64 = _get_ref_audio_b64() + + response = make_voice_clone_request( + host=omni_server.host, + port=omni_server.port, + text="Hello, this is a voice cloning test.", + ref_audio_b64=ref_audio_b64, + ) + + assert response.status_code == 200, f"Request failed: {response.text}" + assert response.headers.get("content-type") == "audio/wav" + assert verify_wav_audio(response.content), "Response is not valid WAV audio" + assert len(response.content) > MIN_AUDIO_BYTES, ( + f"Audio too small ({len(response.content)} bytes), expected > {MIN_AUDIO_BYTES}" + ) + + @pytest.mark.core_model + @pytest.mark.omni + @hardware_test(res={"cuda": "L4"}, num_cards=1) + def test_voice_clone_ref_audio_and_text(self, omni_server) -> None: + """Test voice cloning with ref_audio and ref_text (in-context mode).""" + ref_audio_b64 = _get_ref_audio_b64() + ref_text = "This is the reference transcript." 
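+        # Providing ref_text switches the pipeline to in-context cloning: the
+        # reference transcript is prepended to the target text before tokenization
+        # (see pipeline_omnivoice.py later in this patch).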
+ + response = make_voice_clone_request( + host=omni_server.host, + port=omni_server.port, + text="Hello, this is a voice cloning test with in-context learning.", + ref_audio_b64=ref_audio_b64, + ref_text=ref_text, + ) + + assert response.status_code == 200, f"Request failed: {response.text}" + assert response.headers.get("content-type") == "audio/wav" + assert verify_wav_audio(response.content), "Response is not valid WAV audio" + assert len(response.content) > MIN_AUDIO_BYTES, ( + f"Audio too small ({len(response.content)} bytes), expected > {MIN_AUDIO_BYTES}" + ) + + @pytest.mark.core_model + @pytest.mark.omni + @hardware_test(res={"cuda": "L4"}, num_cards=1) + def test_voice_clone_invalid_ref_audio_format(self, omni_server) -> None: + """Test that invalid ref_audio format returns a clear error.""" + response = make_voice_clone_request( + host=omni_server.host, + port=omni_server.port, + text="This should fail with invalid ref_audio.", + ref_audio_b64="not_a_valid_uri", + ) + + assert response.status_code in (400, 422), ( + f"Expected 400/422 for invalid ref_audio format, got {response.status_code}" + ) diff --git a/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py b/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py index 568e2f5164..c330e91de8 100644 --- a/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py +++ b/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py @@ -16,6 +16,7 @@ from collections.abc import Iterable from typing import ClassVar +import numpy as np import torch from tokenizers import Tokenizer as HFTokenizer from torch import nn @@ -30,6 +31,13 @@ from vllm_omni.model_executor.models.omnivoice.omnivoice_decoder import OmniVoiceDecoder from vllm_omni.model_executor.models.omnivoice.omnivoice_generator import OmniVoiceGenerator +try: + from transformers import HiggsAudioV2TokenizerModel +except ImportError: + HiggsAudioV2TokenizerModel = None + +import torchaudio + logger = init_logger(__name__) @@ -79,6 +87,17 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = ""): tokenizer_path = os.path.join(self.model_path, "tokenizer.json") self.tokenizer = HFTokenizer.from_file(tokenizer_path) + # Audio tokenizer for voice cloning (requires transformers>=5.3) + if HiggsAudioV2TokenizerModel is not None: + audio_tokenizer_path = os.path.join(self.model_path, "audio_tokenizer") + self.audio_tokenizer = HiggsAudioV2TokenizerModel.from_pretrained( + audio_tokenizer_path, device_map=self.device + ).eval() + logger.info("HiggsAudioV2 tokenizer loaded for voice cloning on %s", self.device) + else: + self.audio_tokenizer = None + logger.warning("Voice cloning disabled (requires transformers>=5.3.0).") + # Duration estimator self.duration_estimator = RuleDurationEstimator() @@ -91,20 +110,46 @@ def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = ""): self.class_temperature = self.config.class_temperature self.sample_rate = self.config.sample_rate + def _encode_ref_audio(self, audio_signal: torch.Tensor, sr: int) -> torch.Tensor: + """Encode reference audio to 8-codebook tokens for voice cloning.""" + if self.audio_tokenizer is None: + raise RuntimeError("Audio tokenizer not available for voice cloning") + if audio_signal.dim() == 1: + audio_signal = audio_signal.unsqueeze(0) + # Resample to tokenizer's expected sample rate + target_sr = self.audio_tokenizer.config.sample_rate + if sr != target_sr: + audio_signal = torchaudio.functional.resample(audio_signal, sr, target_sr) + # Ensure mono [B, 1, samples] + if 
audio_signal.dim() == 2: + audio_signal = audio_signal.unsqueeze(1) + with torch.inference_mode(): + tokens = self.audio_tokenizer.encode( + audio_signal.to(self.audio_tokenizer.device), return_dict=False + ) # [B, 8, T_ref] + tokens = tokens.squeeze(0) # [8, T_ref] + return tokens + @torch.inference_mode() def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: - """Generate speech audio from text. - - Args: - req: Diffusion request containing text prompt(s). + """Generate speech audio from text, optionally with voice cloning. - Returns: - DiffusionOutput with audio tensor in .output + Accepts either a plain text prompt or a structured dict: + {"text": "...", "ref_audio": (samples, sr), "ref_text": "...", + "lang": "...", "instruct": "..."} """ - # Extract text from request prompt = req.prompts[0] if req.prompts else "" + ref_audio = None + ref_text = None + lang = "None" + instruct = "None" + if isinstance(prompt, dict): text = prompt.get("input", prompt.get("text", str(prompt))) + ref_audio = prompt.get("ref_audio") + ref_text = prompt.get("ref_text") + lang = prompt.get("lang") or "None" + instruct = prompt.get("instruct") or "None" else: text = str(prompt) @@ -119,17 +164,37 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: target_len = self.duration_estimator.estimate_duration(text, "Nice to meet you.", 25) target_len = max(1, int(target_len)) - # Tokenize with control tokens - style = "<|denoise|><|lang_start|>None<|lang_end|><|instruct_start|>None<|instruct_end|>" - full_prompt = f"{style}<|text_start|>{text}<|text_end|>" + # Build text prompt with control tokens + style = f"<|denoise|><|lang_start|>{lang}<|lang_end|><|instruct_start|>{instruct}<|instruct_end|>" + if ref_text: + full_text = f"{ref_text} {text}" + else: + full_text = text + full_prompt = f"{style}<|text_start|>{full_text}<|text_end|>" encoding = self.tokenizer.encode(full_prompt) text_tokens = torch.tensor(encoding.ids, dtype=torch.long, device=device) text_len = text_tokens.shape[0] + # Encode reference audio tokens if provided + ref_audio_tokens = None + if ref_audio is not None: + if self.audio_tokenizer is None: + raise RuntimeError( + "Voice cloning requires transformers>=5.3.0. Try: uv pip install 'transformers>=5.3.0'" + ) + audio_signal, sr = ref_audio + if isinstance(audio_signal, np.ndarray): + audio_signal = torch.from_numpy(audio_signal).float() + ref_audio_tokens = self._encode_ref_audio(audio_signal, int(sr)).to(device) + # Build conditional + unconditional batches [2, 8, max_len] text_ids = text_tokens.unsqueeze(0).repeat(num_cb, 1) target_ids = torch.full((num_cb, target_len), mask_id, dtype=torch.long, device=device) - cond_ids = torch.cat([text_ids, target_ids], dim=1) + + if ref_audio_tokens is not None: + cond_ids = torch.cat([text_ids, ref_audio_tokens, target_ids], dim=1) + else: + cond_ids = torch.cat([text_ids, target_ids], dim=1) cond_len = cond_ids.shape[1] uncond_ids = target_ids.clone() diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 52944d5082..a95fa69515 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -1024,11 +1024,15 @@ async def _resolve_ref_audio(self, ref_audio_str: str) -> tuple[list[float], int URLs, ``data:`` base64 URIs, and ``file:`` local paths (the latter gated by ``--allowed-local-media-path``). 
""" - model_config = self.model_config - connector = MediaConnector( - allowed_local_media_path=model_config.allowed_local_media_path, - allowed_media_domains=model_config.allowed_media_domains, - ) + # In diffusion mode, model_config may not be available + if self._diffusion_mode: + connector = MediaConnector() + else: + model_config = self.model_config + connector = MediaConnector( + allowed_local_media_path=model_config.allowed_local_media_path, + allowed_media_domains=model_config.allowed_media_domains, + ) wav_np, sr = await connector.fetch_audio_async(ref_audio_str) wav_np = np.asarray(wav_np, dtype=np.float32) if wav_np.ndim > 1: @@ -1399,8 +1403,33 @@ async def _prepare_speech_generation( prompt = await self._build_fish_speech_prompt_async(request, ref_audio_data=ref_audio_data) tts_params = {} elif self._tts_model_type == "omnivoice": + if not request.input or not request.input.strip(): + raise ValueError("Input text cannot be empty") tts_params = {} - prompt = request.input # Diffusion engine takes raw text + prompt: dict[str, Any] = {"input": request.input} + # Resolve ref_audio: explicit request param or uploaded voice + ref_src = request.ref_audio + if not ref_src and request.voice: + vl = request.voice.lower() + if vl in self.uploaded_speakers: + sp = self.uploaded_speakers[vl] + if sp.get("embedding_source") == "audio": + ref_src = self._get_uploaded_audio_data(request.voice) + if not ref_src: + raise ValueError(f"Audio for voice '{request.voice}' missing") + prompt["ref_text"] = sp.get("ref_text") + if ref_src: + fmt_err = self._validate_ref_audio_format(ref_src) + if fmt_err: + raise ValueError(fmt_err) + wav, sr = await self._resolve_ref_audio(ref_src) + prompt["ref_audio"] = (np.asarray(wav, dtype=np.float32), sr) + if request.ref_text: + prompt["ref_text"] = request.ref_text + if request.language: + prompt["lang"] = request.language + if request.instructions: + prompt["instruct"] = request.instructions elif self._is_tts: validation_error = self._validate_tts_request(request) if validation_error: @@ -1567,13 +1596,26 @@ async def _create_diffusion_speech( from vllm_omni.outputs import OmniRequestOutput try: + if not request.input or not request.input.strip(): + raise ValueError("Input text cannot be empty") + request_id = f"speech-{random_uuid()}" - prompt = request.input + prompt: dict[str, Any] = {"input": request.input} + if request.ref_audio: + wav, sr = await self._resolve_ref_audio(request.ref_audio) + prompt["ref_audio"] = (np.asarray(wav, dtype=np.float32), sr) + if request.ref_text: + prompt["ref_text"] = request.ref_text + if request.language: + prompt["lang"] = request.language + if request.instructions: + prompt["instruct"] = request.instructions logger.info( - "Diffusion TTS speech request %s: text=%r", + "Diffusion TTS speech request %s: text=%r, voice_clone=%s", request_id, - prompt[:50] + "..." if len(prompt) > 50 else prompt, + request.input[:50] + "..." 
if len(request.input) > 50 else request.input, + "ref_audio" in prompt, ) generator = self._diffusion_engine.generate( diff --git a/vllm_omni/model_executor/models/omnivoice/omnivoice.py b/vllm_omni/model_executor/models/omnivoice/omnivoice.py index a3603a3c39..7fde8f16fa 100644 --- a/vllm_omni/model_executor/models/omnivoice/omnivoice.py +++ b/vllm_omni/model_executor/models/omnivoice/omnivoice.py @@ -15,6 +15,7 @@ import numpy as np import torch import torch.nn as nn +import torchaudio from transformers.feature_extraction_utils import BatchFeature from vllm.config import VllmConfig from vllm.config.multimodal import BaseDummyOptions @@ -77,31 +78,21 @@ def _ensure_cached_runtime_components(self, model_dir: str, config: OmniVoiceCon self.text_tokenizer = AutoTokenizer.from_pretrained(model_dir) - # Audio tokenizer for encoding reference audio + # Audio tokenizer for encoding reference audio (requires transformers>=5.3) audio_tokenizer_path = os.path.join(model_dir, "audio_tokenizer") - if os.path.isdir(audio_tokenizer_path): - try: - from transformers import ( - AutoFeatureExtractor, - HiggsAudioV2TokenizerModel, - ) - except ImportError as e: - raise ImportError( - "OmniVoice voice cloning requires transformers with " - "HiggsAudioV2TokenizerModel. Upgrade transformers or " - "use text-only mode (no reference audio)." - ) from e + try: + from transformers import ( + AutoFeatureExtractor, + HiggsAudioV2TokenizerModel, + ) self.audio_tokenizer = HiggsAudioV2TokenizerModel.from_pretrained(audio_tokenizer_path, device_map="cpu") self.feature_extractor = AutoFeatureExtractor.from_pretrained(audio_tokenizer_path) self.audio_tokenizer.eval() - else: + except ImportError: self.audio_tokenizer = None self.feature_extractor = None - logger.warning( - "audio_tokenizer not found at %s, voice cloning disabled", - audio_tokenizer_path, - ) + logger.warning("Voice cloning disabled (requires transformers>=5.3.0).") self._cached_model_dir = model_dir @@ -166,20 +157,16 @@ def _call_hf_processor( if self.feature_extractor is not None: target_sr = self.feature_extractor.sampling_rate if sr != target_sr: - import torchaudio - audio_signal = torchaudio.functional.resample(audio_signal, sr, target_sr) # Encode reference audio to 8-codebook tokens - if self.audio_tokenizer is not None: - with torch.inference_mode(): - ref_audio_tokens = self.audio_tokenizer.encode(audio_signal) # [8, T_ref] - if ref_audio_tokens.dim() == 3: - ref_audio_tokens = ref_audio_tokens.squeeze(0) # [8, T_ref] - else: - raise RuntimeError( - "Audio tokenizer not available for voice cloning. Ensure audio_tokenizer/ exists in model directory." - ) + if self.audio_tokenizer is None: + raise RuntimeError("Voice cloning requires transformers>=5.3.0. 
Try: uv pip install 'transformers>=5.3.0'") + + with torch.inference_mode(): + ref_audio_tokens = self.audio_tokenizer.encode(audio_signal) # [8, T_ref] + if ref_audio_tokens.dim() == 3: + ref_audio_tokens = ref_audio_tokens.squeeze(0) # [8, T_ref] ft = BatchFeature( { From 6e935958221b7cd50b9b81461eaf78191694fa08 Mon Sep 17 00:00:00 2001 From: TJian Date: Sun, 12 Apr 2026 00:18:15 +0800 Subject: [PATCH 128/204] [CI] [Resource] Remove unused test cases to cutdown agent resources usage (#2688) Signed-off-by: tjtanaa --- .buildkite/test-ready.yml | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index f5dcbef55e..13a812a62f 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -194,28 +194,6 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - - label: "Omni Model Test" - depends_on: upload-ready-pipeline - commands: - - | - timeout 17m bash -c ' - export VLLM_LOGGING_LEVEL=DEBUG - pytest -s -v tests/e2e/online_serving/test_qwen2_5_omni.py -m "core_model" --run-level "core_model" - ' - agents: - queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Omni Model Test with H100" depends_on: upload-ready-pipeline commands: From c20cac86ceada79a1b4fc71b36cb4a33f87fc754 Mon Sep 17 00:00:00 2001 From: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com> Date: Sun, 12 Apr 2026 00:48:47 +0800 Subject: [PATCH 129/204] [Bugfix] Restore user config/runtime stage init timeout (#2519) Signed-off-by: yuanheng Signed-off-by: Yuanheng Zhao Co-authored-by: Hongsheng Liu Co-authored-by: SYLAR <125541396+lishunyang12@users.noreply.github.com> --- .../text_to_image/text_to_image.py | 14 ++ tests/conftest.py | 7 +- tests/dfx/perf/scripts/run_benchmark.py | 3 +- tests/e2e/accuracy/conftest.py | 5 +- tests/e2e/online_serving/test_bagel_online.py | 2 +- .../test_dynin_omni_expansion.py | 2 +- .../online_serving/test_qwen3_tts_batch.py | 5 +- .../test_qwen3_tts_speaker_embedding.py | 3 +- .../test_qwen3_tts_websocket.py | 3 +- .../test_async_omni_engine_stage_init.py | 139 ++++++++++++++++++ tests/engine/test_single_stage_mode.py | 4 +- vllm_omni/diffusion/stage_diffusion_client.py | 3 +- vllm_omni/diffusion/stage_diffusion_proc.py | 14 +- vllm_omni/engine/async_omni_engine.py | 5 +- vllm_omni/engine/stage_engine_core_proc.py | 19 ++- vllm_omni/engine/stage_init_utils.py | 8 +- 16 files changed, 207 insertions(+), 29 deletions(-) diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py index 615e4067ed..3b3f8e77cf 100644 --- a/examples/offline_inference/text_to_image/text_to_image.py +++ b/examples/offline_inference/text_to_image/text_to_image.py @@ -242,6 +242,18 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Enable logging of diffusion pipeline stats.", ) + parser.add_argument( + "--init-timeout", + type=int, + default=600, + help="Timeout for initializing a single stage in seconds (default: 600s)", + ) + parser.add_argument( + "--stage-init-timeout", + type=int, + default=600, + help="Timeout for initializing a single stage in seconds (default: 600s)", + ) parser.add_argument( "--use-system-prompt", type=str, @@ -346,6 +358,8 @@ def main(): 
"mode": "text-to-image", "log_stats": args.log_stats, "enable_diffusion_pipeline_profiler": args.enable_diffusion_pipeline_profiler, + "init_timeout": args.init_timeout, + "stage_init_timeout": args.stage_init_timeout, **lora_args, **quant_kwargs, } diff --git a/tests/conftest.py b/tests/conftest.py index 27833fe282..18a0ee57d9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -75,6 +75,7 @@ class OmniServerParams(NamedTuple): use_omni: bool = True use_stage_cli: bool = False init_timeout: int | None = None + stage_init_timeout: int | None = None # None defers to the server's own default (300 s) def assert_image_diffusion_response( @@ -1768,8 +1769,8 @@ def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: st ) server_args = params.server_args or [] - if params.use_omni: - server_args = ["--stage-init-timeout", "120", *server_args] + if params.use_omni and params.stage_init_timeout is not None: + server_args = [*server_args, "--stage-init-timeout", str(params.stage_init_timeout)] if params.init_timeout is not None: server_args = [*server_args, "--init-timeout", str(params.init_timeout)] if params.use_stage_cli: @@ -3257,7 +3258,7 @@ def omni_runner(request, model_prefix): with _omni_server_lock: model, stage_config_path = request.param model = model_prefix + model - with OmniRunner(model, seed=42, stage_configs_path=stage_config_path, stage_init_timeout=300) as runner: + with OmniRunner(model, seed=42, stage_configs_path=stage_config_path) as runner: print("OmniRunner started successfully") yield runner print("OmniRunner stopping...") diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index c625239e5c..c566c2e0a0 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -23,6 +23,7 @@ CONFIG_FILE_PATH = str(Path(__file__).parent.parent / "tests" / "test.json") BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) +STAGE_INIT_TIMEOUT = 600 STAGE_CONFIGS_DIR = Path(__file__).parent.parent / "stage_configs" @@ -43,7 +44,7 @@ def omni_server(request): print(f"Starting OmniServer with test: {test_name}, model: {model}") - server_args = ["--stage-init-timeout", "120", "--init-timeout", "900"] + server_args = ["--stage-init-timeout", str(STAGE_INIT_TIMEOUT), "--init-timeout", "900"] if stage_config_path: server_args = ["--stage-configs-path", stage_config_path] + server_args with OmniServer(model, server_args) as server: diff --git a/tests/e2e/accuracy/conftest.py b/tests/e2e/accuracy/conftest.py index 0a81b02075..062750b3cd 100644 --- a/tests/e2e/accuracy/conftest.py +++ b/tests/e2e/accuracy/conftest.py @@ -114,8 +114,8 @@ def generate_server(self): params = self.generate_params model = self.model_prefix + params.model server_args = params.server_args or [] - if params.use_omni: - server_args = ["--stage-init-timeout", "120", *server_args] + if params.use_omni and params.stage_init_timeout is not None: + server_args = ["--stage-init-timeout", str(params.stage_init_timeout), *server_args] with OmniServer( model, server_args, @@ -226,6 +226,7 @@ def _build_accuracy_server_config( server_args=generate_server_args, env_dict={"CUDA_VISIBLE_DEVICES": shared_gpu}, use_omni=True, + stage_init_timeout=300, ), judge_params=OmniServerParams( model=judge_model, diff --git a/tests/e2e/online_serving/test_bagel_online.py b/tests/e2e/online_serving/test_bagel_online.py index ca24f5f81f..a3f999f13d 100644 --- a/tests/e2e/online_serving/test_bagel_online.py +++ 
b/tests/e2e/online_serving/test_bagel_online.py @@ -47,7 +47,7 @@ OmniServerParams( model=MODEL, stage_config_path=STAGE_CONFIGS_PATH, - server_args=["--stage-init-timeout", "300"], + stage_init_timeout=300, ), ] diff --git a/tests/e2e/online_serving/test_dynin_omni_expansion.py b/tests/e2e/online_serving/test_dynin_omni_expansion.py index 4648c424fe..39b6dc8e21 100644 --- a/tests/e2e/online_serving/test_dynin_omni_expansion.py +++ b/tests/e2e/online_serving/test_dynin_omni_expansion.py @@ -30,7 +30,7 @@ T2S_PROMPT = "Please read this sentence naturally: Hello from Dynin-Omni online serving." I2I_PROMPT = "Transform this outdoor nature boardwalk scene into a painting style with vivid colors." -TEST_PARAMS = [OmniServerParams(model=MODEL, stage_config_path=STAGE_CONFIG)] +TEST_PARAMS = [OmniServerParams(model=MODEL, stage_config_path=STAGE_CONFIG, stage_init_timeout=600)] _STAGE_COUNT = 3 _I2I_STAGE_SAMPLING = {"max_tokens": 1, "temperature": 0.0, "top_p": 1.0, "detokenize": False} diff --git a/tests/e2e/online_serving/test_qwen3_tts_batch.py b/tests/e2e/online_serving/test_qwen3_tts_batch.py index d0d6336618..1a453afb72 100644 --- a/tests/e2e/online_serving/test_qwen3_tts_batch.py +++ b/tests/e2e/online_serving/test_qwen3_tts_batch.py @@ -30,6 +30,7 @@ from tests.utils import hardware_test MODEL = "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice" +STAGE_INIT_TIMEOUT_S = 120 def get_stage_config(name: str = "qwen3_tts.yaml"): @@ -47,7 +48,7 @@ def omni_server(): "--stage-configs-path", stage_config_path, "--stage-init-timeout", - "120", + str(STAGE_INIT_TIMEOUT_S), "--trust-remote-code", "--enforce-eager", "--disable-log-stats", @@ -337,7 +338,7 @@ def omni_server_batch2(): "--stage-configs-path", config_path, "--stage-init-timeout", - "120", + str(STAGE_INIT_TIMEOUT_S), "--trust-remote-code", "--enforce-eager", "--disable-log-stats", diff --git a/tests/e2e/online_serving/test_qwen3_tts_speaker_embedding.py b/tests/e2e/online_serving/test_qwen3_tts_speaker_embedding.py index 64e13e1557..8c1c860819 100644 --- a/tests/e2e/online_serving/test_qwen3_tts_speaker_embedding.py +++ b/tests/e2e/online_serving/test_qwen3_tts_speaker_embedding.py @@ -23,6 +23,7 @@ MODEL_BASE = "Qwen/Qwen3-TTS-12Hz-0.6B-Base" MODEL_BASE_1_7B = "Qwen/Qwen3-TTS-12Hz-1.7B-Base" +STAGE_INIT_TIMEOUT_S = 120 # A synthetic 1024-dim speaker embedding (all 0.1 — not a real voice, but # exercises the full code path through the talker's _build_prompt_embeds). 
@@ -47,7 +48,7 @@ def _server_args(): "--stage-configs-path", get_stage_config(), "--stage-init-timeout", - "120", + str(STAGE_INIT_TIMEOUT_S), "--trust-remote-code", "--enforce-eager", "--disable-log-stats", diff --git a/tests/e2e/online_serving/test_qwen3_tts_websocket.py b/tests/e2e/online_serving/test_qwen3_tts_websocket.py index df05146011..849d1c1158 100644 --- a/tests/e2e/online_serving/test_qwen3_tts_websocket.py +++ b/tests/e2e/online_serving/test_qwen3_tts_websocket.py @@ -19,6 +19,7 @@ os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" MODEL = "Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice" +STAGE_INIT_TIMEOUT_S = 120 def get_stage_config() -> str: @@ -37,7 +38,7 @@ def omni_server(): "--stage-configs-path", stage_config_path, "--stage-init-timeout", - "120", + str(STAGE_INIT_TIMEOUT_S), "--trust-remote-code", "--enforce-eager", "--disable-log-stats", diff --git a/tests/engine/test_async_omni_engine_stage_init.py b/tests/engine/test_async_omni_engine_stage_init.py index 31d3ed7751..002e8226f6 100644 --- a/tests/engine/test_async_omni_engine_stage_init.py +++ b/tests/engine/test_async_omni_engine_stage_init.py @@ -1,5 +1,6 @@ import importlib import os +import threading import types import pytest @@ -86,6 +87,144 @@ def _fake_setup_stage_devices(_stage_id, _runtime_cfg): os.environ[env_var] = old_env +def test_initialize_stages_passes_stage_init_timeout_to_diffusion_handshake(monkeypatch): + """Regression test for stage_init_timeout passing to complete_diffusion_handshake + in the diffusion stage path. + """ + import vllm_omni.diffusion.data as diffusion_data_mod + import vllm_omni.diffusion.stage_diffusion_client as client_mod + import vllm_omni.engine.async_omni_engine as engine_mod + from vllm_omni.platforms import current_omni_platform + + engine = object.__new__(AsyncOmniEngine) + engine.model = "dummy-model" + engine.config_path = "dummy-config" + engine.num_stages = 1 + engine.async_chunk = False + engine.diffusion_batch_size = 1 + engine.single_stage_mode = False + engine.stage_configs = [types.SimpleNamespace(stage_id=0, stage_type="diffusion", engine_args={})] + + metadata = types.SimpleNamespace( + stage_id=0, + stage_type="diffusion", + runtime_cfg={"devices": "0"}, + prompt_expand_func=None, + final_output=True, + final_output_type="image", + default_sampling_params=None, + custom_process_input_func=None, + engine_input_source=None, + cfg_kv_collect_func=None, + ) + + captured_timeout = None + device_env_var = current_omni_platform.device_control_env_var + prev_device_env = os.environ.get(device_env_var) + os.environ[device_env_var] = "0" + + monkeypatch.setattr(engine_mod, "prepare_engine_environment", lambda: None) + monkeypatch.setattr(engine_mod, "load_omni_transfer_config_for_model", lambda *_: None) + monkeypatch.setattr(engine_mod, "extract_stage_metadata", lambda _cfg: metadata) + monkeypatch.setattr(engine_mod, "setup_stage_devices", lambda *_: None) + monkeypatch.setattr( + engine_mod, + "finalize_initialized_stages", + lambda stage_clients, _input_processor: ( + stage_clients, + [types.SimpleNamespace()], + [{"final_output_type": "image"}], + ), + ) + monkeypatch.setattr( + diffusion_data_mod.OmniDiffusionConfig, + "from_kwargs", + classmethod(lambda cls, **kwargs: types.SimpleNamespace(parallel_config=types.SimpleNamespace(world_size=1))), + ) + monkeypatch.setattr( + client_mod, + "spawn_diffusion_proc", + lambda model, od_cfg: (object(), "ipc://handshake", "ipc://request", "ipc://response"), + ) + + def _capture_handshake_timeout(proc, handshake_address, 
handshake_timeout): + nonlocal captured_timeout + captured_timeout = handshake_timeout + + monkeypatch.setattr(client_mod, "complete_diffusion_handshake", _capture_handshake_timeout) + monkeypatch.setattr( + client_mod.zmq, + "Context", + lambda: types.SimpleNamespace(socket=lambda _: types.SimpleNamespace(connect=lambda _: None)), + ) + + try: + engine._initialize_stages(stage_init_timeout=302) + finally: + if prev_device_env is None: + os.environ.pop(device_env_var, None) + else: + os.environ[device_env_var] = prev_device_env + + assert captured_timeout == 302 + + +def test_launch_llm_stage_passes_stage_init_timeout_to_complete_stage_handshake(monkeypatch): + """Regression test for stage_init_timeout reaching complete_stage_handshake + in the LLM stage path. + """ + import vllm_omni.engine.async_omni_engine as engine_mod + from vllm_omni.platforms import current_omni_platform + + engine = object.__new__(AsyncOmniEngine) + engine.model = "dummy-model" + engine.single_stage_mode = False + engine._omni_master_server = None + + metadata = types.SimpleNamespace(stage_id=0, runtime_cfg={"devices": "0"}) + fake_vllm_config = types.SimpleNamespace() + fake_addresses = types.SimpleNamespace() + fake_proc = types.SimpleNamespace() + + captured_timeout = None + + device_env_var = current_omni_platform.device_control_env_var + prev_device_env = os.environ.get(device_env_var) + os.environ[device_env_var] = "0" + + monkeypatch.setattr(engine_mod, "setup_stage_devices", lambda *_: None) + monkeypatch.setattr(engine_mod, "build_engine_args_dict", lambda *_, **__: {}) + monkeypatch.setattr(engine_mod, "build_vllm_config", lambda *_, **__: (fake_vllm_config, object)) + monkeypatch.setattr(engine_mod, "acquire_device_locks", lambda *_: []) + monkeypatch.setattr( + engine_mod, + "spawn_stage_core", + lambda **_: (fake_addresses, fake_proc, "ipc://handshake"), + ) + + def _capture_stage_timeout(_proc, _handshake_addr, _addresses, _vllm_cfg, handshake_timeout): + nonlocal captured_timeout + captured_timeout = handshake_timeout + + monkeypatch.setattr(engine_mod, "complete_stage_handshake", _capture_stage_timeout) + + try: + engine._launch_llm_stage( + stage_cfg=types.SimpleNamespace(engine_args={}), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=302, + llm_stage_launch_lock=threading.Lock(), + ) + finally: + if prev_device_env is None: + os.environ.pop(device_env_var, None) + else: + os.environ[device_env_var] = prev_device_env + + assert captured_timeout == 302 + + def test_attach_llm_stage_uses_omni_input_preprocessor(monkeypatch): """Regression test for GLM-Image t2i preprocessing path. 
diff --git a/tests/engine/test_single_stage_mode.py b/tests/engine/test_single_stage_mode.py index 627a98395f..2c5bf6cc79 100644 --- a/tests/engine/test_single_stage_mode.py +++ b/tests/engine/test_single_stage_mode.py @@ -1459,6 +1459,7 @@ def test_spawn_stage_core_used_in_normal_mode(self): fake_proc = Mock() fake_handshake_address = "ipc:///tmp/fake-handshake" + stage_init_timeout = 60 with ( patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), @@ -1484,7 +1485,7 @@ def test_spawn_stage_core_used_in_normal_mode(self): stage_cfg=_make_stage_cfg(0), metadata=metadata, stage_connector_spec={}, - stage_init_timeout=60, + stage_init_timeout=stage_init_timeout, llm_stage_launch_lock=threading.Lock(), ) @@ -1498,6 +1499,7 @@ def test_spawn_stage_core_used_in_normal_mode(self): fake_handshake_address, fake_addresses, fake_vllm_config, + stage_init_timeout, ) mock_omni.assert_not_called() assert isinstance(result, StartedLlmStage) diff --git a/vllm_omni/diffusion/stage_diffusion_client.py b/vllm_omni/diffusion/stage_diffusion_client.py index cd7159b683..7e740dc893 100644 --- a/vllm_omni/diffusion/stage_diffusion_client.py +++ b/vllm_omni/diffusion/stage_diffusion_client.py @@ -50,11 +50,12 @@ def __init__( model: str, od_config: OmniDiffusionConfig, metadata: StageMetadata, + stage_init_timeout: int, batch_size: int = 1, ) -> None: # Spawn StageDiffusionProc subprocess and wait for READY. proc, handshake_address, request_address, response_address = spawn_diffusion_proc(model, od_config) - complete_diffusion_handshake(proc, handshake_address) + complete_diffusion_handshake(proc, handshake_address, stage_init_timeout) self._initialize_client(metadata, request_address, response_address, proc=proc, batch_size=batch_size) @classmethod diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py index 2bba419250..cef697630f 100644 --- a/vllm_omni/diffusion/stage_diffusion_proc.py +++ b/vllm_omni/diffusion/stage_diffusion_proc.py @@ -39,8 +39,6 @@ logger = init_logger(__name__) -_HANDSHAKE_POLL_TIMEOUT_S = 600 - class StageDiffusionProc: """Subprocess entry point for diffusion inference. @@ -619,13 +617,14 @@ def spawn_diffusion_proc( def complete_diffusion_handshake( proc: BaseProcess, handshake_address: str, + handshake_timeout: int, ) -> None: """Wait for the diffusion subprocess to signal READY. On failure the process is terminated before re-raising. """ try: - _perform_diffusion_handshake(proc, handshake_address) + _perform_diffusion_handshake(proc, handshake_address, handshake_timeout) except Exception: shutdown([proc]) raise @@ -634,6 +633,7 @@ def complete_diffusion_handshake( def _perform_diffusion_handshake( proc: BaseProcess, handshake_address: str, + handshake_timeout: int, ) -> None: """Run the handshake with the diffusion subprocess.""" with zmq_socket_ctx(handshake_address, zmq.ROUTER, bind=True) as handshake_socket: @@ -641,11 +641,15 @@ def _perform_diffusion_handshake( poller.register(handshake_socket, zmq.POLLIN) poller.register(proc.sentinel, zmq.POLLIN) - timeout_ms = _HANDSHAKE_POLL_TIMEOUT_S * 1000 + timeout_ms = handshake_timeout * 1000 while True: events = dict(poller.poll(timeout=timeout_ms)) if not events: - raise TimeoutError("Timed out waiting for READY from StageDiffusionProc") + raise TimeoutError( + f"Timed out waiting for READY from StageDiffusionProc after {handshake_timeout}s. " + f"This typically indicates model loading or warmup is taking too long. " + f"Consider increasing `stage_init_timeout` for large models." 
+ ) if handshake_socket in events: identity, raw = handshake_socket.recv_multipart() msg = msgspec.msgpack.decode(raw) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 7dc5db0acd..1e92780b66 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -430,7 +430,9 @@ def _launch_llm_stage( else: assert proc is not None assert handshake_address is not None - complete_stage_handshake(proc, handshake_address, addresses, vllm_config) + complete_stage_handshake( + proc, handshake_address, addresses, vllm_config, stage_init_timeout + ) logger.info("[AsyncOmniEngine] Stage %s engine startup completed", metadata.stage_id) finally: if previous_visible_devices is None: @@ -759,6 +761,7 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: self.model, stage_cfg, metadata, + stage_init_timeout=stage_init_timeout, batch_size=self.diffusion_batch_size, ) logger.info( diff --git a/vllm_omni/engine/stage_engine_core_proc.py b/vllm_omni/engine/stage_engine_core_proc.py index 05d8f107c2..689378a798 100644 --- a/vllm_omni/engine/stage_engine_core_proc.py +++ b/vllm_omni/engine/stage_engine_core_proc.py @@ -37,8 +37,6 @@ logger = init_logger(__name__) -_HANDSHAKE_POLL_TIMEOUT_S = 600 - class StageEngineCoreProc(EngineCoreProc): """Stage-specific engine core process for vLLM-Omni. @@ -145,13 +143,14 @@ def complete_stage_handshake( handshake_address: str, addresses: EngineZmqAddresses, vllm_config: VllmConfig, + handshake_timeout: int, ) -> None: """Perform the HELLO/INIT/READY handshake with an already-spawned proc. On failure the process is terminated before re-raising. """ try: - _perform_handshake(proc, handshake_address, addresses, vllm_config) + _perform_handshake(proc, handshake_address, addresses, vllm_config, handshake_timeout) except Exception: shutdown([proc]) raise @@ -162,6 +161,7 @@ def _perform_handshake( handshake_address: str, addresses: EngineZmqAddresses, vllm_config: VllmConfig, + handshake_timeout: int, ) -> None: """Run the HELLO / INIT / READY handshake with the subprocess.""" with zmq_socket_ctx(handshake_address, zmq.ROUTER, bind=True) as handshake_socket: @@ -169,7 +169,7 @@ def _perform_handshake( poller.register(handshake_socket, zmq.POLLIN) poller.register(proc.sentinel, zmq.POLLIN) - identity, msg = _recv(poller, handshake_socket, proc, "HELLO") + identity, msg = _recv(poller, handshake_socket, proc, "HELLO", handshake_timeout) if msg.get("status") != "HELLO": raise RuntimeError(f"Expected HELLO, got: {msg}") @@ -179,7 +179,7 @@ def _perform_handshake( ) handshake_socket.send_multipart([identity, msgspec.msgpack.encode(init_payload)]) - identity, msg = _recv(poller, handshake_socket, proc, "READY") + identity, msg = _recv(poller, handshake_socket, proc, "READY", handshake_timeout) if msg.get("status") != "READY": raise RuntimeError(f"Expected READY, got: {msg}") num_gpu_blocks = msg.get("num_gpu_blocks") @@ -192,13 +192,18 @@ def _recv( handshake_socket: zmq.Socket, proc: BaseProcess, expected: str, + timeout_s: int = 600, ) -> tuple[bytes, dict]: """Wait for one handshake message; raise if the process dies first.""" - timeout_ms = _HANDSHAKE_POLL_TIMEOUT_S * 1000 + timeout_ms = timeout_s * 1000 while True: events = dict(poller.poll(timeout=timeout_ms)) if not events: - raise TimeoutError(f"Timed out waiting for {expected} from StageEngineCoreProc") + raise TimeoutError( + f"Timed out waiting for {expected} from StageEngineCoreProc after {timeout_s}s. 
" + f"This typically indicates model loading or initialization is taking too long. " + f"Consider increasing `stage_init_timeout` for large models." + ) if handshake_socket in events: identity, raw = handshake_socket.recv_multipart() return identity, msgspec.msgpack.decode(raw) diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 272df14f80..158b4c5477 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -335,7 +335,7 @@ def build_vllm_config( def acquire_device_locks( stage_id: int, engine_args_dict: dict[str, Any], - stage_init_timeout: int = 300, + stage_init_timeout: int, ) -> list[int]: """Acquire exclusive file locks on devices needed by this stage. @@ -528,6 +528,7 @@ def initialize_diffusion_stage( model: str, stage_cfg: Any, metadata: StageMetadata, + stage_init_timeout: int, batch_size: int = 1, ) -> Any: """Build a diffusion stage client. @@ -536,6 +537,7 @@ def initialize_diffusion_stage( model: Model name or path. stage_cfg: Stage configuration. metadata: Extracted stage metadata. + stage_init_timeout: Timeout in seconds for stage initialization handshake batch_size: Maximum number of requests to batch together in the diffusion engine. Passed through to ``StageDiffusionClient`` and ultimately to ``AsyncOmni``. @@ -543,7 +545,9 @@ def initialize_diffusion_stage( from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient od_config = build_diffusion_config(model, stage_cfg, metadata) - return StageDiffusionClient(model, od_config, metadata, batch_size=batch_size) + return StageDiffusionClient( + model, od_config, metadata, stage_init_timeout=stage_init_timeout, batch_size=batch_size + ) def _shutdown_or_close_resource(resource: Any, resource_name: str, stage_id: int) -> None: From 38dfe56fdfd907ee95d249ba9e5547720e51a7af Mon Sep 17 00:00:00 2001 From: Reid <61492567+reidliu41@users.noreply.github.com> Date: Sun, 12 Apr 2026 12:14:16 +0800 Subject: [PATCH 130/204] [Bugfix] Validate speaker in chat endpoint and fix case-insensitive lookup (#2407) Signed-off-by: reidliu41 Co-authored-by: Hongsheng Liu --- .../openai_api/test_serving_chat_speaker.py | 111 ++++++++++++++++++ vllm_omni/entrypoints/openai/serving_chat.py | 29 ++++- vllm_omni/entrypoints/openai/utils.py | 30 +++++ .../models/qwen3_omni/qwen3_omni.py | 5 +- 4 files changed, 169 insertions(+), 6 deletions(-) create mode 100644 tests/entrypoints/openai_api/test_serving_chat_speaker.py diff --git a/tests/entrypoints/openai_api/test_serving_chat_speaker.py b/tests/entrypoints/openai_api/test_serving_chat_speaker.py new file mode 100644 index 0000000000..3b9151120e --- /dev/null +++ b/tests/entrypoints/openai_api/test_serving_chat_speaker.py @@ -0,0 +1,111 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for chat endpoint speaker validation.""" + +import asyncio +from types import SimpleNamespace +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from vllm_omni.entrypoints.openai.utils import ( + get_supported_speakers_from_hf_config, + validate_requested_speaker, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +@pytest.fixture +def serving_chat(): + from vllm_omni.entrypoints.openai.serving_chat import OmniOpenAIServingChat + + instance = object.__new__(OmniOpenAIServingChat) + instance._supported_speakers = None + return instance + + +def _make_hf_config(*, speaker_id: dict | None = None, spk_id: dict | None = None): + 
hf_config = MagicMock() + talker_config = MagicMock() + talker_config.speaker_id = speaker_id + talker_config.spk_id = spk_id + hf_config.talker_config = talker_config + return hf_config + + +def test_validate_requested_speaker_accepts_case_insensitive_value(): + supported = {"vivian", "ethan"} + assert validate_requested_speaker("Vivian", supported) == "vivian" + assert validate_requested_speaker(" vivian ", supported) == "vivian" + + +def test_validate_requested_speaker_rejects_invalid_value_with_supported_list(): + supported = {"vivian", "ethan"} + with pytest.raises(ValueError, match="Invalid speaker 'uncle_fu'. Supported: ethan, vivian"): + validate_requested_speaker("uncle_fu", supported) + + +def test_validate_requested_speaker_skips_validation_when_supported_empty(): + assert validate_requested_speaker("anything", set()) == "anything" + assert validate_requested_speaker(" ", {"vivian"}) is None + + +def test_get_supported_speakers_from_hf_config_uses_spk_id_fallback(): + hf_config = _make_hf_config(speaker_id=None, spk_id={"Serena": 0}) + assert get_supported_speakers_from_hf_config(hf_config) == {"serena"} + + +def test_get_supported_speakers_caches_normalized_keys(serving_chat): + serving_chat.model_config = MagicMock() + serving_chat.model_config.hf_config = _make_hf_config(speaker_id={"Vivian": 0, "Ethan": 1}) + + assert serving_chat._get_supported_speakers() == {"vivian", "ethan"} + + # Cached value should be reused even if the config changes afterwards. + serving_chat.model_config.hf_config.talker_config.speaker_id = {"Serena": 2} + assert serving_chat._get_supported_speakers() == {"vivian", "ethan"} + + +def test_create_chat_completion_converts_value_error_to_error_response(serving_chat): + serving_chat._diffusion_mode = False + serving_chat._check_model = AsyncMock(return_value=None) + serving_chat.engine_client = MagicMock(errored=False) + serving_chat._maybe_get_adapters = MagicMock(return_value=None) + serving_chat.models = MagicMock() + serving_chat.models.model_name.return_value = "test-model" + serving_chat.renderer = MagicMock() + serving_chat.renderer.get_tokenizer.return_value = MagicMock() + serving_chat.reasoning_parser_cls = None + serving_chat.tool_parser = None + serving_chat.use_harmony = False + serving_chat.enable_auto_tools = False + serving_chat.exclude_tools_when_tool_choice_none = False + serving_chat.trust_request_chat_template = False + serving_chat.chat_template = None + serving_chat.chat_template_content_format = "string" + serving_chat.default_chat_template_kwargs = {} + serving_chat._validate_chat_template = MagicMock(return_value=None) + serving_chat._prepare_extra_chat_template_kwargs = MagicMock(return_value={}) + serving_chat._preprocess_chat = AsyncMock( + side_effect=ValueError("Invalid speaker 'uncle_fu'. Supported: ethan, vivian") + ) + serving_chat.create_error_response = MagicMock(return_value="error-response") + + request = SimpleNamespace( + tool_choice=None, + tools=None, + chat_template=None, + chat_template_kwargs=None, + reasoning_effort=None, + messages=[], + add_generation_prompt=False, + continue_final_message=False, + add_special_tokens=False, + request_id="speaker-test", + ) + + result = asyncio.run(serving_chat.create_chat_completion(request)) + + assert result == "error-response" + serving_chat.create_error_response.assert_called_once_with("Invalid speaker 'uncle_fu'. 
Supported: ethan, vivian") diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index e84a49aac2..39fcbc9a0a 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -85,7 +85,11 @@ from vllm_omni.entrypoints.openai.image_api_utils import validate_layered_layers from vllm_omni.entrypoints.openai.protocol import OmniChatCompletionStreamResponse from vllm_omni.entrypoints.openai.protocol.audio import AudioResponse, CreateAudio -from vllm_omni.entrypoints.openai.utils import parse_lora_request +from vllm_omni.entrypoints.openai.utils import ( + get_supported_speakers_from_hf_config, + parse_lora_request, + validate_requested_speaker, +) from vllm_omni.lora.request import LoRARequest from vllm_omni.outputs import OmniRequestOutput @@ -106,6 +110,7 @@ class OmniOpenAIServingChat(OpenAIServingChat, AudioMixin): _diffusion_mode: bool = False _diffusion_engine: AsyncOmni | None = None _diffusion_model_name: str = "" + _supported_speakers: set[str] | None = None @classmethod def for_diffusion( @@ -132,6 +137,18 @@ def for_diffusion( instance._diffusion_model_name = model_name return instance + def _get_supported_speakers(self) -> set[str]: + """Load supported speakers from model config (cached).""" + if self._supported_speakers is not None: + return self._supported_speakers + try: + self._supported_speakers = get_supported_speakers_from_hf_config(self.model_config.hf_config) + return self._supported_speakers + except Exception as e: + logger.warning("Could not load speakers from model config: %s", e) + self._supported_speakers = set() + return self._supported_speakers + async def create_chat_completion( self, request: ChatCompletionRequest, @@ -260,7 +277,10 @@ async def create_chat_completion( except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(f"{e} {e.__cause__}") + message = str(e) + if e.__cause__ is not None: + message = f"{message} {e.__cause__}" + return self.create_error_response(message) request_id = f"chatcmpl-{self._base_request_id(raw_request, request.request_id)}" @@ -540,10 +560,11 @@ async def _preprocess_chat( engine_prompt["cache_salt"] = request.cache_salt speaker = getattr(request, "speaker", None) - if speaker is not None and isinstance(speaker, str) and speaker.strip(): + normalized = validate_requested_speaker(speaker, self._get_supported_speakers()) + if normalized is not None: if "additional_information" not in engine_prompt or engine_prompt["additional_information"] is None: engine_prompt["additional_information"] = {} - engine_prompt["additional_information"]["speaker"] = [speaker.lower().strip()] + engine_prompt["additional_information"]["speaker"] = [normalized] language = getattr(request, "language", None) if language is not None and isinstance(language, str) and language.strip(): diff --git a/vllm_omni/entrypoints/openai/utils.py b/vllm_omni/entrypoints/openai/utils.py index 84b28ef5b1..f411526fdb 100644 --- a/vllm_omni/entrypoints/openai/utils.py +++ b/vllm_omni/entrypoints/openai/utils.py @@ -53,3 +53,33 @@ def parse_lora_request(lora_body: Any) -> tuple[LoRARequest | None, float | None scale = float(lora_scale) if lora_scale is not None else None return LoRARequest(str(lora_name), int(lora_int_id), str(lora_path)), scale + + +def get_supported_speakers_from_hf_config(hf_config: Any) -> set[str]: + """Extract supported speaker names from a model 
hf_config."""
+    config = (
+        hf_config.get("talker_config") if isinstance(hf_config, dict) else getattr(hf_config, "talker_config", None)
+    )
+    if config is None:
+        return set()
+
+    for spk_attr in ("speaker_id", "spk_id"):
+        speakers_dict = config.get(spk_attr) if isinstance(config, dict) else getattr(config, spk_attr, None)
+        if speakers_dict and isinstance(speakers_dict, dict):
+            return {speaker.lower() for speaker in speakers_dict}
+    return set()
+
+
+def validate_requested_speaker(speaker: str | None, supported_speakers: set[str]) -> str | None:
+    """Normalize and validate an optional speaker value.
+
+    Returns the normalized speaker string when provided, otherwise ``None``.
+    Raises ``ValueError`` when the speaker is not in the supported list.
+    """
+    if not isinstance(speaker, str) or not speaker.strip():
+        return None
+
+    normalized = speaker.lower().strip()
+    if supported_speakers and normalized not in supported_speakers:
+        raise ValueError(f"Invalid speaker '{speaker}'. Supported: {', '.join(sorted(supported_speakers))}")
+    return normalized
diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py
index ed6df6af36..7df6947973 100644
--- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py
+++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py
@@ -610,13 +610,14 @@ def _init_special_tokens_embeddings(self) -> set[str]:
         # Speaker token IDs (for voice selection)
         # In Qwen3, speaker_id mapping is in talker_config.speaker_id
+        # Keys are lowercased for case-insensitive matching with serving layer.
         if hasattr(talker_hf_config, "speaker_id") and talker_hf_config.speaker_id:
-            self.tts_text_spk_token_ids = talker_hf_config.speaker_id
+            self.tts_text_spk_token_ids = {k.lower(): v for k, v in talker_hf_config.speaker_id.items()}
         else:
             # Default to audio_start_token_id if no speaker mapping
             self.tts_text_spk_token_ids = {
                 "default": talker_hf_config.audio_start_token_id,
-                "Ethan": talker_hf_config.audio_start_token_id,
+                "ethan": talker_hf_config.audio_start_token_id,
                 "prefix_caching": talker_hf_config.audio_start_token_id,
             }

From 73fb68ad28888146fa54c05bf86c19d938f69ae8 Mon Sep 17 00:00:00 2001
From: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Date: Sun, 12 Apr 2026 12:42:44 +0800
Subject: [PATCH 131/204] [Docs] Update WeChat QR code for community support (#2701)

Signed-off-by: david6666666
Co-authored-by: david6666666
---
 docs/assets/WeChat.jpg | Bin 98759 -> 99445 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/docs/assets/WeChat.jpg b/docs/assets/WeChat.jpg
index c32ece6c102f7cb76d06e6eb1194af59dcd30488..416439f7eb07306567e4cb29f03915a409ebacd3 100644
GIT binary patch
literal 99445
[base85-encoded binary payload for docs/assets/WeChat.jpg omitted]
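For review context, a minimal usage sketch of the two helpers introduced in vllm_omni/entrypoints/openai/utils.py by PATCH 130 above. The SimpleNamespace stand-in for hf_config is illustrative only (in the server the object comes from model_config.hf_config); the expected outputs follow from the helper code in the diff.

    from types import SimpleNamespace

    from vllm_omni.entrypoints.openai.utils import (
        get_supported_speakers_from_hf_config,
        validate_requested_speaker,
    )

    # Illustrative stand-in: only talker_config.speaker_id is consulted here.
    hf_config = SimpleNamespace(
        talker_config=SimpleNamespace(speaker_id={"Vivian": 0, "Ethan": 1})
    )

    supported = get_supported_speakers_from_hf_config(hf_config)
    print(supported)  # {'ethan', 'vivian'} (keys lowercased, set order may vary)

    print(validate_requested_speaker(" Vivian ", supported))  # 'vivian' (case/whitespace normalized)
    print(validate_requested_speaker(None, supported))        # None (no speaker requested)

    try:
        validate_requested_speaker("uncle_fu", supported)
    except ValueError as err:
        print(err)  # Invalid speaker 'uncle_fu'. Supported: ethan, vivian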

z!Y*uSR0uR=)-wJW6h#Gg%-t1M`{+9@4s7Q|*y~X+sqLg$~kfBj#Qqx0JRIhQKT+Z)= z;0S^lNn11wOF`v8LFNbEP$Gx21Hl_iNaE00!8q5J&J5P{0(k2JDyKzyHAaZ)${u4J zH)%?v-7pTAtkznC2aQ$oQoHr7JQqufQ8tE7y7E`{X`w&yhbq@PG#e`s z*Xq^AyM>T_i4Cmw4#tB8Q*4K+HWEueZ|ivqTL=sXsh8_1p#Y+ny6U@lITt6VfLADU zvScpkPE#>s%#<|M+esc?Y&O=~PmFj@FpV+MENt4^b0{9y;CX&zan@}arp z`=xaO^7>B)7GUDWLJeGiEye|xqtdafs^!!JKq2j?%tw?J&t^>$Q2gz{oN{Yzm~)Dd zm6h3av{6UhCk%bH#LCX75RCR9fJt3QRCLiU6%`MA)QLwvCx0V%H0qEJS?m|#amxuO z!A$p#c$_gm@N;h(j7m;zmff*qxCP0vVf?v4FM#VvpzAt)0zQYQ$myBouMl_InXL~f z3n>4C&~)Gv<^bBAsK^-5QjP-02Z2;ywmz`T_ClwGhR`|689@M#h7~_!T_IyZxTj^D z$k3ZAunl~Cd;%laV`74^DSiGX0zdz3s)P+e5;PN`!w6?Q>V#ktI*cUbZ8>Q}pOw## zd`^-|wq1q1k6In|&**VV9ZSoOELgfG5RZar11(-im{e5E!+4E#xt#xYzlEEJzOyUjzN@{(Dr^_S=*pR>#K;6_-HJKCPdZPehsmWIvdk z^2Todpxf7Xy45tme!e|sk3{rCj{kU=PBBLc!*<0 zFUbF{3DA&xUWfOjNrjQ#K6f52F5%}_p~@KdLAfzR6w}RO9SCiu(Lfrg+8# z565fqc(uM)mH4zcV2{*HP4N3sGugHIRqCvMBRZCUc>7lRMy^O`gwJv?=)KTDQm9{$ ziqJe>L4$Yzsy|Zte(Tol4>$Nvrix&&NbK}k9b$l3=;xd6-72Hrv93fIPs!6kv;@;2DWUTb)woB z=q`sSQq4LN6!1QM_dw~wvIO1@qu#!oWSH2VuhTt z;cZ)oX+fGZ4iFY$UH+7<4?k5uK+U0VhES~d&<@$LWycOKxJun-q%r*)^i@yxIMVz^ zI-b453|%fbCSIK>LG0g3XkYG*6(YuExMgH`_yYRA!Rc%?sGXlbR~u(Rg45aGZ#e4Y zeJp-HI9D4401ka)@?qd24Y>mzicT>35#=u*Bgr9PQkoW3UDADilW`n{ya*+neKPIoj^dxfImhjQD=j8K!wozOVjnCq zSCcyq7&2@^J04J)jWa5z@V)`;%CI5;?Zw8l42P9<2-A#?)e&C>#~rP(2z#d#axDaf z0i)uCl=;AJbZZMSe#NR!9zCP`W~8SLs$d5~9{S{e{ruSpU)K*i z3jFE^TopS6w1o*HX#}nel?*+ys^8Ag36hq7+&7SxQqx`x*G%R%*3Y` z9AwsL8jgc`BC(|EETzA#kvnzS)&u|?w0ru+k8_L*Lp^O=(A81ia^79`|HSJ(o^_ex zek}**mN_|z%JSVKZ@DH#*s2Jcfz$>_g9?Px*ISSXvk-y9%~Ldn4>@)hG{_hBIz@e1 znme=4>#Jai00{(n{=pKh(cfo=7S{s6zLt9tNTJ(&{9<&vVcxxJYoe|F>TT#XOvh%1 z)Hga0*3x-ojx&18`ZF&EIg8zxz5`RcyF$k&C(w=~U zi*Z9|&ZzK5Hzx6aHh`zLKrkl_@I#LyaMqvkQ(aM(%expE}}@DcPm6WA=b4# z_edU(W9Yh_rr(FKr2Bwc70gD}tPHNZ<0Aka7SvIgpN_H!9?Tn4C6I8SdK1jX9zIxi zLSQ}$XdU>5(vIr@G(|1&K*>icz3h#%-gE=1@qEd}z(99ov&~k^ z)(nFqC8w^Oenk8$x&m2Q{EKVFif6_fF@{!M9WOcmcgn0o7W?fM{RP1xAq@a=3lHk* zh8_L2MO3u=TXkz|u@*-2VjuVXJb>ms$6)*d4BcpohwGaFCF*9q(d^bpY3PJd(c=tq ziW2zGH6#D63ft{S{Up5P0vqQ+d;olM1T2 zXu}f(%0W_l@NTBK8r<= zzZQs)?|N+%@?mw?8v_wNcVq#rX@%Y?5ZB2ndv@{6{)7^y_3bH9kXxjr! 
zajG~*9cH+P(fy-Aosz7m%kTqNc!4X%dx@0D102pc0RHsFa)qQ7s^02z=kI3_a4V@{ zsdi58HLmLsoXm}I`D^f~HBmm7m)~@HvhOORqn~f!54AHFnKsYUtSxM;xh=wb@Owsw zz$d5y{CNF!Z6lLK5(~~|6-(+{K4K;wK%B8=JMRk3JK9W5|Nc&xs2X~2ns%eRO7=XI zqH_%rSObztG`cgwsp%r*BB`_dfq>FC4hvyQzxbJgx7-|mC@Exj@>aWzL$p7lea}st z2ohh87Fj3t`RweO$Qbsqea{!iJ@(_JA7d)DO7seX6EwqS-+FXkL;&#?5e#pH{E~>} z=v>y@=9Va@sA@~hgbK6Hlqy&7EIg308N9>i1O3Ti!dKqcl)UfOm5p#x*)%IaZeG^N zoZX(G_vtF{IswtLxaeOi|2eiOk~`8#TGY5yMOWWb(tqqw5XNU4SpL$Jvg|e9H_mKf z88{wvLbVdmHCWW6Gh0i3{k1|qcXH9Azvpi(^)X;mWn>iDVEMG;-k}qQ61mlmmvcX* znEl~zs8mfE2okhQezwZKnag++7q9dQL)PV~dpK49DUbiSZf3aW$(@dpy07(jKab0g z4c~JeI`eU|E=JBZy?vYXGnmRATN-(!AbVR4NL!!BYHn@|?4a}W(G>bE^7lDh3vyu{ z3^!_@p|oh1Aw8_J_UxLUIn2Ke@`EGs;5{}M3l+10f{lx!5tjK5*$;rLY?Y9(2)!N@ z#5{vtCEV+#yr<+UZf=P)SWd|kp!wy+HHvsIcI`w|P-HU^g$?iCMYmqh%oG7bctB4( zpbni$(bX8lp0PD!N`05CB9JSa;(R~eWToYkjn5zxq*xBEW0W{~2aZfm8h2v#ID9vlm@ck7+JcrD9p5JUa5b z8caYx;)il$@7L$|5NY6Kj$%OT7_jZd>%4!2RfaP3p?NCRMmHMmX(r^?3MOUxR`zD* z32@(u=!4ni!i5lZcM%PbL?jQ=TG0tJ-`xwkpVqDcE) zMsrd7J+$co>3?p}Qd!A}V6yB}6swH-=AuRxm~NuMD%_z`0cLwpkOyue0@m~o7ZdQL zjA_IAts?qq7xMG-Z=Z|M6ad_Pvv?XS;G4Ts=msA?mtXO}Nxe`mg10$c#ZYph1~$B*lr#W+v@o7^vKf}K!kVVTbhdw zc6H6c?i%@yZK36(V(+de^S{Du&~#$XBLBHqr$+!=vo{;TYxDN&JethXEQ4uI3lNs@ zY7Yt@u4l@&yR+8PF9_pl(_owBP)f;~2~8kRt85(j4GLx`)W8-XWr z3lU_mn~(wU8lJ1e_Gq4={}Oi3IO?yLS&4Dot}my@5VXl;_pQB{0|v&JsajO3lv|k9 zkAN7Yr!qbWO-bRKLBxyRCC#m*!@b)|aVG)2y}fvJF|N_IlylL1xW{yKXeHo6;C6cf z)M=$>f8D?Bz!3t7V~wnvuxl*D@Xi>j;A2D}1S*5;G98e|Rly@KDho2}DfTt-Y8tCS zHqU-rNOIFhGi zKyKthJ82G@&^4B{=@eM#)vcU&u@bt6dvJFj(O_YC2kca|X(4c%TlS)j!Id=m_<$R3 z`2bi}+v9PUTtbS1rGLO{UN|b={c~XB_cbJ7qyXIak6KV@&VydmwK@~1M|51=9~B|o z0nnn?X7_Y;T?7tcF!JL+bVebn@{~Lt#W-rCzH`07pK6-hfRt5&SvqMy-rq0}1GCKf zAt>sGXs*Z|S{DAD<{=e89R1)3>~#DHp*8V>JUm~CxvbmgONVr7Z+}u93tU}iz8dowrHUxxStXL}kr(bLmo)(;ga3u?-mY<*-! zPk+hE;SbUU{5FB)l{S4!mjQrLFKqKJ(1{t{K6btZ~VVEj{YIHou~xDIX6*IiB(ikH$|7#<`^$Hmg#AS!ecd zGQ6kMGcyY0Nz4smqirqBh~YlDyGrNK7@h0bw0i*DR_Cdn1<+bgcN=vYMK4Rcx!nSx8_50QB_WK~+a)vkGu z+NH_;wY4LPAcPmkL9KM9=L75z8kWmfug-EdJ^0Au?nc!!t*f@hsvXRTPcypnTd#$W z#do{IRMR;XhQLCFB{E_e4&85>q#rsaJgU^#R2f`&xA0bTe~7kwp!7BIyoAHs^Cnm; zvl7n={WGt3nZT&e^5hGVLlWeF ze=n7;wF=wErih>zF0k(IGxnSw7Qq`V3)gJQjX3zgbK zYF$nwB=NpiQqi(u)W9z+7>|1I-@T8EU9ay!+IzkD>86>Dipfj(Azo%eiO_lXEpSf2 z2Prkj30~&n9yzQpynGmFLFU;OWIpzHl*tvQgEQR)7s+V_m2)(Y23XZ|6M%;2USkVp zUh{%upDP((=AAh+weqhV;#{$8*Jd|C(}C3XL%-0K*>-|D&CWm3D*0XZcO5PECNR`d z>QBmOOq$GJgxk`vWwU7~3yV3>IDcx%G<&Q_utG_EL1ygOojc$c=Wc)1!2fxlmX-!=&XXdT9$>iU*Lwvb+BpjbFEG0`IY*}vXlW-Q_F1u&I` zEyokQpP5|Z+~?8(WEib^&gfw*ZHed&+Kv9@m|m^?d+X-SgD*T+KtE^x#E-ds+%N;C zAI#>BD=`I(uuZlwj8Wpepq!%OTIdEUrpG$42XaAT4279c%&$%J%h%a zkzQ`BAS5ub98|c*d+A)VCIJSQDtrlvjCQ=dNCtodUaWaXTYjUygx%s^a6K^JlXzvF zEJ)M}Tu&yi_bB!bExlu0xWD=5qFjV5ZP9-Gccvt<9{&zOF;eFY(Z~it0f7SzKvT<+ z*eZDj6y!6Uw7siRfFc0)5Oi%L%ok}?w7pyu{;qHvDs^OzT8Yv`KE%`|y&p$`bZn*r zyIY3I>9M|jq@!%4Kws2L66eK|)=_jVN~Y)&`&S$eR#pKW}X0HM@bMT zP2d0qI4VAXh`3#dmV7^sPTmK#hlFiDGDom*Mc1riC)xvNQzCMkq@)iju+-)Y#Gpy&mi!q^(Ztos^K&P zKT&Q_APaNfQ-EVJHHeo~0R<^=sxq{HcC^uKh0C(;@T31pOiKU%0n4wIRIZAMiBXKQ zPgoe6@bcidTW~`KV9b(oHds73Fs&OUaJI1ZN$y;C$6ixYlMGmO4!3Z5BO)T0>;S+> zR<}KW&J-{2ncUlAw;#bxNDm4xb|6;*j#g9_91&Tc(ssbMp^U$6?&}5Gj_i?`)+Uez z^V65Yr|XZiCgoHDZ>%zag~W7tFYd0OP+b? zJyme~97FuhDae)X45`>I+VBMTakK*3jm7X{iDn#tK>)?x@%mb522~Cpj=BBq3^Es? 
z=as6Sf_^CJMqrJo5t1z4-PYi6q=~4SiX0m5q&*p@eZAJd3efKVe$o#5Z+x9s~hIlN}7)&~!Qj25dKi zbFvym`$}0!ip?^*Z;-#}}$+X%C^Uh59F#)w<^JahK?<+l}8RnL)| zlE;FB(^39iQ7Eg!CEC_;2>$e>t&}W5_#K!a^QD}E&0Z;mjFjjzQ$GI&E(TF0*V7>J3$_yfvy}Bw8KSy^r+?m)E(~UQ>bFm(g!D za}O;0eR6&caedxD<-78ozHkSQW!=>xHYlLjs#zOtpNT6P0RNmvCG#<6} z+H3uPTwUyJG&Y_7!P^m`-MFEThgs&C@wx@Ie&^O#Pp{zE<+N_ZizWN|f-Bi0QnjBcfUzupP zHA4dnu};xoE5b>`8vZ_H!=^wKP76jvD_kv zq}*JI(+c~3FB8e%_F?}Wo9-9D3g~wgALQoF0rcsJDaUeG3OE)=y2eM=cylC~EMMfm zyG<#>>3(i5TW_>2tIdxy@BB6ufhi!#>EEp-7|^soT3lJj9;z1`wDfoIF#BVLGIGqd z1_8kx7aJSh*4N)32uPo!85YRnEpr%7wBAp>VVos{yLcBM`@RAqX^?`ct44KH|M&0p zci>|5Ck1z6I?x%ywq`AaAjL#w(dH;Y^kdC~gM|wqH8CAsjB~G&lG265dY}V=E;56%H8WacYe z;g(u}gx8pSbhaPcR!K?ZlnxGtU~>xhT7(&sHMJY<$d;S^dHP- zHiuGvem*dmE3mNC8Nn7pC8O;WlCv6`nfk`s`foHuhA zNSui(**5ixC54Als3JqD;(jbDzL2jh`htprrjEq8F)-|pm(Pq_5L^k-R~JlC48y-C zyfGI0HCl4zIIJ-hG+x0C_97PIw2ic(vi8C5n&#mI`i0E(?qJR4lmGepew0XEzRMJK z1~Va~O*9ZS%hVs41m|s+V~WJ#!rAdgJFf7^t)TeB3`&^e0=9KM6wo8Afd-g2u;mC7 zJX~7NB&Bsj(5emzN-^en1Rgw+f&GLN_>C8iW(|gjB#3m7&WO6n>>Gy2L*u}H@N9WH zBpn*(wmmE@T?6Ls*>We)_oT=W7@x;LqYG)K5zdAfjeP{_YDbr72|&qX!IMZ0BqMov zp%YZuR5?XNVduI8)N1edAP)stP{biCWbp7d#TeTs5O=(>D?`BG?1ZOMWaMmv!{1<| zLo-dg`!8R#dfpkGkpKS4LFs>$RZ&;Msm^U++P+q~hUT2HM$o`1al2;Wd{Pq}Z=A#M z6Wz_5FC^Wb)xSuA-i7Prl3~}iSnRMXUqwB>>y4_8T!B1teE{6J>s}@280M;^#GGXE zi2?*Ar4VvgN4OKfK-h^1E*V!?VlY{VQ=jl6%4&UBaED{*Iz<+e&IId|xf{rfk1UE3 zn&}Z6l_P5nnKCzJ%(_u%IG&p@QTb$7l-g;hQ(4)7FHu1U1@`63eyDZkl@DI+!M1$~ zvKeH);ubrJ3OH>CP=m#ws2SZCEnaMa7bJh`g_w#35RjHEs5El8WIo{|!D|4lEmBn? zK&o&AnFsAr$q>Z3JRJ{o(eELrq(Dj?CLnBjK?x#x39{C)huHDRgKI39LacKf(p|&H z=e(`6;Z*8^E~Szs(#C`Z{gZVkVNIRbE3bUNMwT@7)8!P^P5T0nV4X(`!;h&Fe3h!ZzQkZk?7Z^x?| zF)+l$`0Xl`$)tw<&IRpkceub_HwQC&WFg!fJeqAe^5aJ(O3y4CT-K|`)hP5Y!C;#F zzkwXE%rh?}-@xcyQ&xkX?P6Kur{YuN+a?1YDwc%cFyv zIyyQ$fTD9|;{Z<%qTTJr5_7-B_=wK>l7JBPTZ-X`Q?cM9N)} z_Ki0XOKFg`BpIWTk#`9Qs_X6~^u`EJ)BUvxR=aDik|L^pNL7h^h3p<0#nB#*J8>RF zP3_}(FMykNLY&z?4*IzhbUgbG_9!T&pZq_n-aMSjcYPn$h(;-r%!wwMk`QG|G8R&V zA~J-Kj8RA^6e)yEnTO02p-3S^rm}>LAxfqYe&?;dzx#cUWB;>1Eo(jNS>xXB$NOLu@g%Oop#+_Oaj5ej6%NcFKv7&cvYUrT z#@sxm`U+NHZ|muOAu!gZ|2~f_P`J=-EP$1)$0rqzm?1Zg9`nxVaeCKVo`YJiWzV1n zE)cf{o*C%oJkht?v=O2JUaOG$Lt<=@I)UQZt|_$@1pyhig1L$K4vI*E$Dw)Wl|Hls z=#o=i8PUx`(AlAbj=RHN|3vO>xk`z(;p$EeWfDFQJce@>cKYnF_TY$@A6uY}F;JH> z+wNCzBgS!LI}j!mYdT_PeGlYpk9}bj?nD`PuVa3zOlh?1r-R$9N4W+njm5hjvc23D zlh^>0LIeBvizfhBd53O)GUTe@eA!09qx&vu$GKIbnWzx2HdGof$S^u^_xW^Cb~(7O zmj1iB-GEjw=09&&RrQs2>U|H2#$dUzT^D8ViApmG+5S|cS@8I+_)5Jv50zew0ba0r zH2$x8F|CYywC4`#@BFz-!;H8NQ9x?azhuXGXclfyYkM>pm@c*OZF65n^K6@g<{4=2 z$Z)y@uD)c-WbV#&*jmR$L_W=xet)O;>#wFu#mExa2Cu))+ z_i%}$M_DigNHZ6^>V4FC7sNqCY-X2ATh5=w`P(~v2os20F_-LI%tJ3YdjEo=Dx|8P(pGA!RXIE)vEP-mQSWz_|we9oqs{^!egMjA_9XzJ*AOyDb z4txsl>;)VM4it##Ze#m>T%iGPE-;K|6b+n>IokE9eLr0g-bs^P;YPwt|#>$TDaoc#-Z&psXG%1R?V+$tSA zo;FtBMnp;>)|(ipmEVEq!2CWcu$~-A$m10w1Rw#nwlka6%bs z!hj-6^kHR4?$e`~3_t}?yXq8bI~ZpYGx)YzQ(>pQaI*=gAlc7}hKR(Q^!5!6`2xIe z16@jzJ_xxOK*FAhpIUtczv2a_9bo2@f|2zIGQ+qvOvHi_+UAoxcp70-u0n|_T(uWY zF6+)>0BA5aBi<1_4F-vL1B?r9)&N`|z%1e?riv4}9BO3gbii76{m)E)jXZ?MB$4MgI z7kmp3z=iEoh$pV?8tLtgyP=0QjC^y!A`9oyNZeHIv8fTIR3=j~A} z(2T7_he7+5uENgo_b2$?x~)I1KMl(Q=q+Vq0E&=p$r>vB?!y7T9Ko24ADN7;%p0i` zHvvBI7PZN;T&4{1{j)CiD%eNh<`)WSX%(D+g5v>f=9;a5S_#W?WA%fO5Ta;NHijUm z98Ju%$TP`a*rNp9e*&IDj|?+KS$r>IfgugLyr{1ij7UKMG0Mgm2#QH$Dx-6F_3bms zmp}df^@|FVBNMWR>27lTSHtY}ZCCs9+@fVkbFo+`%uc`|9ARkI0C`|;+AkQ3Q5yr4 zOZXnpF73*S8!Xs}3^u@BM~>VI2?+rq3{WIGE$x1C8ZsD=mZKX7|1I&BLqyhur4D)#0l_U}tw4;ztXnKn zt*fD?1`L*vO=M5i>CRbm2>=zP@nbEQyv@YF7neYc?#E7C1{99w(7Y`%K3+ggX*T6G z`jx=hMqLYw5pAgx6-@@OQ>^JnLeHN)`|fX{H{dq#mkOhE2uj$qckiW7I3HG`V@Sxh 
zjRt0e{})<~Aexk$_i1kNQ=?sKmbx(uL};LjlG`HcD-VzmxSw!{h@fC&b=+lVmAuXu zNpfWAhl|`B+P#^o3}>FvsZ)NTj46m$^MK~<(lnVUO%nT>n{cFO!SqubMIwbUbvU@W z32VCX>(|KHHIbu^X9&5TOmuW*qh=U+)#64mU?thS?G%cgOUlv6m{6kRqo_g0@C$p} zK5^_oh}}GHl@6e=gd3Mjv~ktnNi)-0BMCn*0TabBzi-u$mnQ}Jl=v9m>4#bdF@teIBhb8%EVyd7MGB) zvp|;PNqL2)y-i)tBkB8igLqXVx?9oxz6GS7{GCkdU1mK+Fl`-Oux29;jO_)gfV6($ z@zH98S;LsE<8?6v3Czg7OnE9E^c(HtgiZzwBxE)#eBc#RrqcuT3%Nx z;eK;lVvuR$kxu+1+KuMmT=zIZifQF<@Ah|Gfq}2})s2uRn5>fgozP>wmfEvi0SV&J zf}d@?6nRE}k19~WbgSW$)j07)0-C_$YUR4#yeaZ2D?9s4<189y;E@pi#eV!{qoJu8 zZ&M7(VW!1=wmc(T?6o`rK~LF}8*@vnbs`Quw~g3i^~v!2_+wocufOj<8rjqBe^%k= z+nm*sr&d&{YkqySsbRjRn#pO8Zr2;W;U4H{0GX;6#O(at}-`lWpQjx<_4d|GC_S+db*H06F_DmK>LmP=elO zFhd(e+cg`TEqH)HCai_De#{w|KBf%u-rhx0hcV-{CxguEM4}`yxKB;3!LvH%j8vHw zyLOqmH3`61psb=&G;+tw>ty$U7-_^ZjILhY0EBVhGn}Gkk0tfVrl2BZV(to@x~(1O#!j>(#@$082V?*uy#lAPk>BI7ki6gX4IqdR-3Q zX%*fObmxog?3L(0$DDBlkp%LPdYqlB;micgKhHx=O^rYxu(S}p9FdSnb!|mEYQ?&1 zSKL0z2Mq85m{62gTvAfr+M3@U1vUnR$?<++6}#8agCXIiPEYMANjgFUS~${z?0D31 zalg-ByCwibi=Bm5T@8dnTHs$nS5d={g7(B02ONgKc)f~(XdICxr@G%CryHcsenxT; zH_*^rT%4W5TKERgts+A8FA5a98F(FFB9FF^YcdJghhQM>z8ROnNAk|v;$i-A)mM-_T}l=G)L zrHTZRiH~ahN&vx2#$&0m%UP0c zm`A=9rra|Ok*E#<@9^aWAinBI;Mrau6J~BOhL~9@J;4-!Z1y z2uylT03d5zxK=|GP7sK29-iNiftjdklMganCbSObR{9iKqNOg!1HF#;xFgO}Q@sFE z<{=JV>4delwFya|>V~Q*enYV7Wpwe_#6u}21hXw&dYxfe!;IQ>CXYZ&psfsF>a{|2 z#UyNUKh%K#J$}1x6fw%iZ{NO+0WpqvLIO@9W8`&ygiOpWNK{TbKQHmvEcJ&9j^WUw z$Z9_qmugl_=EXvs7Muummrk6-{-dB0%W}<{LVxSVs1q-FOUBYr;# zZ&440vk_Ht$?}`4NzKZQ>XA{F<(OjzB1T)ir?izTw~+ zn1AVHDRxI@1zH9#gk#cVV5sk1!aL@X6p0r&5Unzc<>ckdfjWjBg>CXJX!_I}3n*E- zaXT|xPVd?F0`)?kLlLINQlYNBnZAkYIItN`E-snn${0xizMn)bKfhQ(5WhY%fWbr{ zUv5tlGy(JRsKKU9vS3s7h(-J@c7Od;BBTTM2b*Y9+vAY+WI`v_gK&r?;s;4Q@#q|o zE`h-^n1G|FGd@DqOh&asgN`xn$Z`3TXWTD?38#pre6&{;nXtdUTzO_z9dQ)$X}L0O#xHW|0}>(i}Dz*Cqaq?XHLil*X|96zpte9FT>Zs4*tQj9#)Otg^>{>>yZz& z2dTF(4GKOv=KAMFyFJh=g@!NpLl4rF(1#(;y5@p)Kf zdBW!r0r-~qOXHi8PVYJgs-S%eP@WfHY@T$y*ZH_F4`{|o9a@Z>17%}q-@jpW`I%R4 zux>0T$)`iYDUGGHrfq@f(`)mdppLqY+A}mcjb#r%zcxjIW<2G!8E{T0Aq06aL(=JP zsv39va~#UG#+Yyxcf+I@6?4gNmH}fji3txL!-XekcA#D~o}Nr8-gXabO6ZvjfMua) zO=s^yl>7$@i?C`uYLAN2&eBbM97Wk}_ff95ug?oNt0a_>I}SxJ8H99nzc)A?h|_4z zaca=O0Px}HD{sJ~hz5!AQ-$GTZf*`lJ6L}*ZSAsvaSe0~<*@mOCy_}P;CTj^!4 z4>$Xc{AtFHDjf-E)DSgR7*Y6L1NPvdga0y6U4uU4)DA9SE?eg2=OJZ*xVnnWUrGdK z7mJ%@7h&F-<_qD3@f3$CmZhK1Q%)f00)d z*JB2P!qnWnIvwHCJmsyt=Rp3;0$dpz#iYUj-1D=IprhBdO_nm;!_cnkTfcF1T+o0H z5yyWS+5IS-eco5*8vNvA)GtD6qIn&%(YLM-NBS!p8@FIPPLFHd>Qhg>1;<_or4C^} z@&ukVy>g{0-3YTSuzhJY8b)@`^;ZIc$u5_J(gM5tT?%}XJ77jqrB?%8LTpW=A23G! 
z@OE)I7ll|nuLwG+)dS16nTRAFZ~Oq1;#&q#r?bZ5Em)BxbfL8x6B*EzZ(rIUatf$s zZ|h!c07%Urb3Q->lJJ2Sj`R->3ak7~wgBm0G_nV;Q&G{HRc?x?$H+~k6RV3Bp97Y8qMZ-%mNbV*dWCTj_v(PEm{Lk0)>F---$u$SWOQkaTbv|1>fkKb$*V ze4j<_)BY;TgI+bd>(I4(Jv4G_8b4Y3ppupL6tmj~zE0a8wG}e#J(i}ka&-K%vmp3U zC~ur~Eu(w*?=`5!Jz0AB2t%JA$f3V{33!(pn`^R33IK6w+?HPVX<-}9B?R{l6@v#V9@@?d}e zH-rk*E(mvFgClcLv#y>bk~WYHm@`S*Msbj)pE2IaDjgg2rPHSy*GFQ0W(TCX1*#~& zeqnD4sz@SA7)&g%BH3k_Ccxv%D_Dd{VGb0MZUC5cpE#Z{MEI7c=V8SYOrTL5!_5i; z&}+hVrzXz}A3J1V73q1X4UuU3Wql+bQ`nP*3sfP%BVmh`8W^%T_vgbbqV6KpAcc7ljc-Q%=MsZf&I|kfx7Hm37pb5;FqiP^KnZ2L zyv@PP_&k;vmvP+mFDv#0`T5lp(s}8&!0+h~T&l102GCIL1;Csb7wcI3~t|t5Vhmbyyij@q&iF84d~Ti^jdEI)o|*+gZEPL=r~vV@s(Sk3Zz^5 zxwIH4Mu$ec^=X|#vStrB2=y3sfxM6J-mMiD5?Y-{FeDX%?=X^DTzui|_`AVXnx}iX70&hg`&`5X-?R8JR#FKJ zG40qIapdG|8K3~Nu9Ps$4${(jS;8{Rq`{QIpfAYa-oT|sp zz_13Om9h?0sAL#H;X&lAB9b%oIth|P#e5q&1bgFYOcN^ZocA2T1LB)PLo@ z$&N*lBAsaJ31zEDc^ndQ2R@vOR=ODRc|u;E)|?fY&`@H86@10mOTHt5<+zVFL``fn z+)*TqZ;u8QvW`N1z<)PiC5?v!p-}q=zKQ~mE(x{SI6iwX30hvmPAk|e-{PN74(~-4 zG=hlhSRg@gA&P`Gc_cF~7!!FV z#77=FBvhmN_CHlOZalHHrinQIx(Gea*5;ltN7)$lZUPY|*hKs&8NL&# z7!39U*m_q?74_#k_oW5xwtNGQ_TyYYtpR`X4MfHm4#8=w|LTUyft}fdB(#&B{tYXs zsg(}*egd0IJhBur#8T5lK@qvj=x%IBFs(PHS~UyDhkru$JZ$h<*Rg7@;lFl>Lob>p zBocd$EP&bL3%`YIdgMZeI?*P(fq(8G$b=)rb@KOla9;o2rC^462AAiU!hjQZC%mEp?MDAUYpnnv&<`;aQ7rr!HLtmL1nW zzhby;gf4?BYL5ZL$AlLWl;N!_$GOKjh!WHEQamLmPuX|mLU<~aY%BDj7+5Y}2 zA@}%UxT^x4f6YP#VHwlg#ScYVi2}#~nUHER@3W`pxJJ1 z5mht=t$7RDNT80h(Fend0~Gh5^++d9@6(O8*RD03tOEHaKZxmWcsZiQA9AaD*Dj;E zGyg+#QTqbTh5PYPT@4b3l1#(G!_Ai)TZTyFmzc67Li(YYGcL!4Q5C(8rSoI2Ugih- z_{5f6^=DRH^jdO%nJRi`xI%GXMRTkD{JxRz?Saj6wwCPt!+*bvs~Qfy=j$96XrBI% zr}uLDIsd}$7{3b>nAs=F$3In>C@#124a79#uqY>EwEpm(kpueV@a22>susZW#T;y^Q`!^%ca7mM(vWI^bO| zPFZTLn7liWwPJtblissj58-54-a2!Sg#H zymMf+1U|fB8NO}&25zaGX{>v%RI#A-lK_RsC<1g;1poGLLzdC%&EL60$ zmJ=93f!blcGtn5fsZ}93O;LRiTQ}K*)0)!G3m-+UaME~sEd)}HZEa7!ECam&Nwr84 z@HJ>{YsJyn{{aA})Si!H!U%g@s3ZV-k`%H!kN7w-ebKNLOjp4Ra+cPi_{AXcfLd{c z-xXq-fwUU%QK_6wRNatW%KK+Nf)aKO?9ic$5GB)DR7?0DrKijf{*Vk>K$#KN`OKVBC!!C{q_q_P@Ta6Agn=f9a?<7p5Mu| zcrb7uR1>m+(Rt!#>W46nrk{pd)1RJD+eNq3i|w8G<`c5PB4ieCEdiE1@KX;& zEYxl5G9+wzRs&ZS9Kr--{ZZCZNINk9CGQ_Si104P;6&8KX4kLCYa<12JzC4R+aN^d z$FQFUkNt<`*#9r)?b4ry|EVRe$^L(|sDh1oD_~W`^GS5Ox~ehi$1xTKSrm%HiAhN` z;h-V@77t?GKt=oO?@ceLv3b-;%v`;Z_FYmw;vQCO=7Uv!VO+xqKR&WyAI|Rhjh(6- z7CfG!L|{&<$xJED8!!S2Km=88kD9eMrxd7HIaX&&#xnp`#JZ$t0lriKYS)D97Bb(B zj-7p}-4;SBOt(+DaTrN&o66ux4||PJjXDw~aPZm7c&L4`Pa3y~LQI!(GcisG>hDZf z9(P#qufZ11lMctD^r-R?lrZ89WB>dHy67J^kM!ZZoGFOKHzHx+ZQC%_ z(XQ=hGxa}ewl;)91jGXG`0`^6qjTtIJX7K)R z(J8R{sChQCVR8N|oSF4$C#Y^ggA(L|mZbN$CP+5L1u!PjV19Dgp-@K28y6pi2QU!b z@5SlMhK3#xDnE{oLOQH3D8pKk>Uj;7QINAvk?rpAr|4?4(P&3}O(`ypaC#?lYcMB+ zq?$X2OWhO?4-Xfz!0jdZb-VpZ5!SFA@_p55GgHy51Rd-d1Ex z*z_Aia>5WvQ0o5Y*zLh+MLX|Q*o+YMO%7d?M5Nw`!CyU6CN~hK-p~YKqiBu?Bq&@I zA>x`kch}N;9WVpD&?cQ_2-(KN+O72~zV|=JGZgd(s>ll(3nj&nd27(C0k>gxf@{S* zkt^*W5WAKms!&8csAvMcE)t;8pUf`OV%Aw)}2r$M;G~ zU3C;-8U63jxa#!g^Fbw>ZOJS8_`vyCo8q|GSpDzNnwhcu#0p_hBlyR;{pju~oR79S zD|kHEo%HRpVR&m(UVZ}8sfhT1^p1R1sZyV~(~^(t4=uO}=2aX|`wIo7ZH4;W3VZRF z9ok;QS_+B3`4xBC>igXZowaHWZfkStR_csfA@;Y4R@G*76p~f-m17+3DRW zwKsY}yR!l84flxv-qGfwVMf2wf28&gelXngnaegl@&?l4=SCQ1@0BjAEa8b+3`)8m zQ`yzfPWFZeOky{fZI7NSRm3MeQv1uKQ@n8pOgQjJ8_o9xpn9_X$?rrQfhX_B*fYyb z#awX%>96blz^J(L=#CJ)XwzuvZ|}AAt2qU2i@9XJGm00QfuE06G~M2AV86c7;J;TH z$q~si^zTAswGa6YbIbtk+`zX812ytByei&jHAM&S`f6yZ{k}g+&709$_C&yVf&=&U zd;hx%PNlxW35#LHYE@YcwKMP99^^-A`Z(RWP@BKwOvJvC_clum>x(Q!smAAOhOfS= ze$T1OU!aD6glM3eCF)r@4~^w%b@f}o^*fFoTY`%0!%<+&cJs;b+%Rt2c5(nsXz^pSy73fNvt8Wz0a3EJ!4AYk;)4$MLiZLp?57kVp<2p!Zt2 
zSYSo=P-La_87nH%k}1?3Iyky2fB&{s?7^&BqN5+2-nMnC=c(tv@2EOCiH$)Zt_vbl zCYyzBtI8Y?P$e>iwpIhVS_QoB=)G?epp0lFbw?o4+~<_6xeva}AYGERg2B-7JrJ$X zJw0UhCY=FYODxcKZ|qY8tzmGi>GwT92LiPlIZ1`Rxo9o{D{3^Nl3^@RMgCI=3(RD1 zPypCkEWkvu?`AM(^zld2wn_j`0F)`tGI*^9jqeFLRfZ)Q!~W>W6Hl2KDml&hHpso% zq+DTW+!QoNyp{lER!|jxQmYK{gV9lOfr<)9CGoi;2Ilo)Re))CkeBsA5@I15@YQRn zG6OZxk9VW53UZkP$#@Bo^~w}JaJp_l#Xd;lxZh0TDB;M6KnmGv^#0qLVGJaW4dEdS ztb29i)I-h7vEPKUee?u1(=9?u z4nugg0U+39`;iq#>tBJZOq~MtPL23C<#7}n-tZgH4#Oh7i3Hdr9NEY!zRe9;U2W7| zRK7kwx03Q-dHz=mfG%Q1*+}msxW~89y05WEQAb(}WpPx}*MREeMG^j*6cyDqsC>2} zM~K^2^LilnDqlQSFmZd6nj!l#El!zL(4JK+GRts*jJgO#(=YUr9!Mc46Q{8e+gtv} ztny$BLT^CAaG+Ty`OP>u4Nq+qj|NBm0<@~>r+8Y35VYHF{Kx1L#xZ=HOH6DBHo}#q ze}CUQ7v!bP1bYP3O1~?6xBdePE1vrMo|A#t^cO}jzRs#2+aZo;e+@R{KN$W(V6(Ox zg-DgTjd5D9|CIK8sjObwzil1N-ke@uUNpma;>wYkM@6-Ir1bAGsxl;Z$XVWPCRwnU zmKAeYihrpK$_NCUH{DV3F9fA~0mBjczaVIQ>1S{6M9&VLyw zBV_fj1Oj9*LcJvhs2>S@E=t5PCabDiXR&|^^gBc$P`)UQAv_7k7%2n!7u~gr5nM{} zN)K^}>X+S3e-Dd_-bKV>($pm2ENK9KO^aw(w68fh3|i!FBKBicawUJ3w8p0=C!4G7 zrlI;K!^aaG5f97+Z7(V%NaCbH$ZJUDfdNM{EJslyDS7ZHht1Uh5xb3px-TkslizV* zbOfrIAGOz+wBB*}DiaSkx6i}FI=ui;F68V-0_YaZ8(ou!Tjq^iDMbZfRv6X%98POu z|H#6d)!Hy0b2}>KJbAJ$KMaXfm2gc67ZAFE0O^X70RK0UhU-T%ekfvKOHL$D7|C9g z^k))W`Sa^5-efZ8;1Z!;B~_jt-pGE;?LacG>;QVI&$06B2OJSP!w%VT>Ua3gdePm4 z&euR+)Qv^DhPYog!(&z7@e>mBwZXx`z|!}ix}zqzKFd!6Qjr>FY7#AjA ziJ;S9f|;V?{jm>t+RBNOq#y_Porvif($TKjj$rrbMdg|h*NS1qgQ}C$(*ssr5OMm$ zl>0J&=@g*17f8kEuv`!6HvUwx2Sb{4;>=Xg?F2y75nO>b>~7(vW(FeeyUc5e9rpxE zMJqK3NFH7FGx~Rfh;znqW|Xp>j2?;=0iFoWfXR-icLA*X zhYn4>jMiC23vV=0iaKzZH7@mIc7fKwmB+i#Q?(TOV_<=LW6J3`?-SzxaXL3%4jA(z zv+2n7N(>noCbl6+E*I{!bWO{y?R*D1RK$z>FcloC?54^ zlhiR%1ML#z_=%xq&Uggb;oI%lw^}oTzV`;9r$=d6ufExB$CC#3zEHKv^_gDhG!e8i zAR(Hi52oix&K zS73xp#c%2FoWyDg%2~PX@Z~`ACqq}KVs6@c5}Px4$`Gg>J2fphFvfFJmMfj}dn}$9 z&@SZFESWx<-r~BDG4jxOT=bk%BJb;UCV5*8bFNM4yQn+ZC68mvzzZ-y<@m2( zzwkaVS0YgaAk?Kev%0%??%c*CsOl@wM1LbwraaK8L41hRSqu;8ZFPpFw7j!3>Q&U? zWZ*xLGKId=D; zyQ|nu77@gzW0gKYayrR45Xv+nZZ2X%Ize$T{t#)W9Q(GdOPEF`4t;%u81_(@!*CYw zvMWYt{2d(JNaDk{Bk2WVRlB&j1Rz>JqlUg(;$>kWr18*Zga!xi!mi3ABt(xj@`2?i zD9RobFe%HsDlRC@?gM-mqL%I4--pmzeIFhDm?nZKZD>l-UB%LE-+q^fjRG8ih>(mk zOjE-MR>FhKJVCKjg(1mG!$VNMF|n}JvXB@cBMd`~k0P58N;BiS8sIvYOiU8uqpsgx zd3s{-VEjwT2@KZ-7Ujv^Zp{T=O#pztVlE2Qa!}~>mV!lyx%? 
zg6kHFDp|N~G``>^nnm<{Yal=L0E#6Lg4t~nC}@-n6}(k|V*r5)T+VxG?%Yi>IF{pQ z2~7x7YwV2$oDPIaR7?d%7o_L=*aKTQZs>Nz#-Js6D7kYjV~ydPu-X~`{cbjMDi(x1YFggh@n3|d$1v7n6$56CZ6xE1X=rIzY~A|hvk(nxwP+kR{81u|E&~5C zzZ5&YA21#vktfhPnbC0)r4DY)YUmt3b;?QP0Ww=TY$&I;1$7ONU1!0;C%LKoSUJ!B$gs1pX8p_TI%MQ_@6xr&1}>cG8-?q z=flU3{$DWZBtj?B`t_{07&VV_C|>>HA)ZO35aD`W<}3YmN1GfvLBSTiKWYr4oPG=~ zA=V$J2ci$B8uzs*stTycJ{H$Zd-OZt5{9VuL zT?gz2goOtiKF^-TH?Fm$eNz<$QAQdy5D$NqG$8jFCM@{+Ofb1qAbT_yT~KruTO zCXv;6Nkzp!y~er~xrs&W5Sxo0*t_?1S_1C%SUB{q9tb8d5fhKx1RCi>Bz`%moT3kx zg}&j}E3{k0$h%n8MB&tAfM)Qmy^GXZ8a{g>ZVQRL9*Y3uM5$!h+JEN!d0&kLbApY#bAQ7bu zGvHAsB8pB;&odHUZ5PEd%jpZparFGNQQ~1dn!z1iP#_itYn(s7x2P01$m}iK#fb*l z&5C2Z7D+v+DyG!#vrj%#XJy6bkMGW7%gbBUeM7}OHPubhJnghnlC>{W(SVrPpptH& zR#wsbRuQwXhy}~dfx+qR34V80b5`X4xD>?gMW-7*+nArU)Z)*G0MW|DLT)oQ)4z0l zapRV{%}c}Nr5ovGuGWM*xPG5i`pzoLwOyo+*EYafBwuax`=5BLlyfH=qC0q8PH9gsI!f%%XnKpJ@k;hAHS-9y?<};G*xumff2HP>xq9!l=P8=g ztH>q1X3{d6Zw?4hTfRunQ)eicU6ZPi-Z#uEO_x+pYi4LG<=)RFqV!>V+#%7d)4h&o zc0SGf7=G;DM*78V1)ni3{G!SZF9;Wdt_DWxXKJU_L0yw^Y@7|cgmB`0j807CwR;Se zi`C~B(pfh+xsb-nggMZm7GyxUb?E}si#)0kKEPsX7eU4~v8YZ6t^0OfZBYfFC}$8gG>&t23M*CpSYnp61oq z#ltK>xt6V1wbsW3J&V7S)k>;KFeErqMBk%Fmqzyaw$-=yBAj|hZ%>b$g&)Ft!QBlk zZla+nLs0WtF-Qrdp@9bxf|*-L$w+g#_M-td)JB9C6_q0H#|C`NMz9>V0tk>QDrIG5 zwSj8$uxZvI^zQvqCOY%tb+nqteYGAnntFjrI1bm`k1r~fRaMlWV^#oN&p%knb4Ejh zP#x;Jx|HuX*GS(Y12-n~(S@?5q@;8sZsVf!6*CBx1rW$R=IlfqQuj~nBH6J1o!WG( z&jEeHOr$f7;SfNqKzb4Ee4k8@(Rxrp+vb5XMBV~(G?@rBgE4x{C7B#FzrH$j-G27W zz>@-5`6_H8WHBb>6JIfafP+O=E+G#dP>G0$K&#z&;^%&vmBr)lb_zKD>gGwhe~1Ut z&c*RScY%*K9vo#z=we8(!U!m!BkR?Uj<42G0mnc@(v1|L?6OJ8$wV}JD0~;qN{C!# zpx4;h!3dEKux3dz13ozc3aN6)vG+Ms&{bSSoYaG7&+0Xs&|hzah-ahKPC>)fgg;w` z>YI!LCD(#cH=A_RnLwDYCvGjO9>I^_02vEFL9w=gB&m^XS0?nm8t!+!z2AU=-QD)J z8Br*-FdJ|H!4h>gkftL6E^y9RwJh!BP*UrRqY;DjtPIg~mp`neB9e4KKW`uuAg{~r zKb(id?+vEjY+m|1cuby`BN@m&p`EfX4^26zQgd>8(aF$c9SfIGh|SI2Mo=GIL9re` z7nN1%&Uxgozd=q1Eo$i(_G09voRCT;-|_lz*#pTd4A{VoN6uU(cOE;W*YL2p?i2DF zypW2@beP50&u<;zx(BF4;06+fvA+B$h5`M>SSYH7KNF(BxKIjmh2u=11xsu!E`_TIVFNu`y4B#lvQ6u7pI$p%y4P z%>?nF$omf%p#^YCFmPGGg;K}(r%~QKZY^ea`W2Ou#)F)V_*(?1y?yg0l>HTeC}kxj ziwAbV6p_etXrgd(W~K_ozi%t1Ba08Drs3oUr@>evzR64nI|HwBXxY)nhyk(zWtV^O z)rUw)7e)~C?>p3x>++{^yMd1hZdBmNA4xR*jid;>h(#80=_1$KTGiu^mEy+V&LsT*X(Dz6T-lk?#|B?L|Sn?YmADXf7x#>}h z!)whgEXI9#qq^(HCnm_y=J1wWXbG+oSx4)xz@r|bFhd0gBgX*ie4q}cib{@-GHD$~ ze!sKVh1q``&5~bpE}DOmDd(UJh~^!k3q;@k*7IO2mP5R z5IVb`Ra{uvP|Fw2K*zidzV+~hF;D`7b%>rQns{hW;-aLHkLb{f-cdTUt!wstVpI9Q zl5=t1nuMRsxJ?g(qI>Z8@w<{PJp7!PQGMIb%E0P;b?7m-zegYe?wtK%dgu-I(n33= z1%1QjVNyyyk3hB(BLJ!{9%3$<+4UO>Ve^a>>Rd+t6{+&rXE10*aclwC#~~^C^@jtB z*|>PYCG@A^3lWGRfXcf=49GL)$XiaoB5jLA)kT3aoGG<#4v_G@udVH?o5Ra%53*Vb zg^}T<9b%9;lhJiy3u8#-xC_!^EDaBXNBPC>PPLCH!Ww=XFmq4SGW=CaP`t-G}3wdaF=fZ7?mTsPNG?SJF_?s&C|htTDv9xKuj{_<(! 
z3Hy{NXOF3?D4)o*DC@Hy_x7P;e30-w-{Ha2ox|2L70KgUMfo(0+?&Qfyl#;3cwlm} z&cT)DSo%~!YrE{#yZVO?<1Y(Z1J!ga0`a#xBbqYJs+L{SmR7l~UFNQWGCzWus|p?` zXO-rWD{xKz&lR}H;x#(WU0yzM<6m=s#FU!7))dt`z)i!MFCT2#6uZddm2{iAD=RZ_ zK*>H1rR~QDt)-qRH*`j<)AQ~=7?APj|3uzV$A#PqokV*3)097FWF>9cj^;`YrTR0^ zYy0z`@Uc{6O*CieI+bnj5)x7R+{A3*Pki3&8HV+=>Gu1w^x;mz)UInkXHXc1E?^p$ z0Xdl+>NM0O*KC%Yq?>U&z~=HpkVJ*exbYYNT6ulwQ{@kbC3G=`c6gYm$!y^-KNqJm z-KDX&IJ5#hEbHo8s;ftN@C!S<-5_@DRj4Q5mpm(B#=&@>O)}i26xmQR7{awJ>HBmj zJH+!dZg_t00DqD=hDkVYF^jr9vZ=`K3OEMuFFzr5Y`j>TLz2~q$m^#<9}+^|m#6_< z`i6c?E1KD!e&@sih=soax)q)O3N!0&|5+$N!xvz0cXyl{f~^M}@@5o&BJ*Fd9~ard zG;vo{t<}9YkoLj1M);6J$4{D(^7L| zIu%v+RkWh5k8Bj&++JT5Nw!e}()0Wo@X^buikWRSXM#gPV#LA6rNQdK(M@Wo@L*&N zU~@FKY67V?j%)PGTfc?qElDcCK)Al+w>8*PEgxuTY2Cr1SCFO)wS}%?9IuWQ0Xnnp z70i?uh-EYj8D!FR=x2#2p;iQrg&uT{|(84SIXKV*T*u~rOU6P5A7a663ACIX_cXLk6ul?X#> zOvB-l5e|xvH=NA5$DO&7v5c!ddD6zAklEG?$}`l${7d$37~Yg~6QKOvi2n9p#sf^v z=)^=XfA|AS!q$Ho@o4_X;K$|qOl<059&Z;a1 zZSIV#f&@ke!$cG|my7}hGrM@h$*5VFPL#zuJ3F)QTew`uyiVlW8Vx6+D3yU&kRcPs z0#?*mq3;0MMsn=gbMjCkdVa8ap|*$|9-H+&2M!t6nc#*5x7Fxop1-_%Eh#0%157F@ z$_i25Nh8|l>V+&z_@0gg=MTLXo(FntGf+0SU(B2xyup z03}4n%)BognJuf3{BubvU}mJP-jG594B^O97Q^?Xqo?Nzk82v@Ko0WWDp<*1I{dWC zN&wQ(Ffye)dhh=I=FY?|RME@W0$1_mC#AE4*Z}<{`LIW!^z{*M6hm{pTnAhAbgkl) z+l{6u^ykRHfXR4%csMiKi%Zpjj=m;JTF-^8`{GOMuL^mOHMO;6EiL&p9)nG* z46K9$WZF3 z%xZzC$xok8@apDRejL3#0;5eva5`8oc{_k**8sKFilGIqf|8&y4H-=tTdSZKVi;kU zlMDXfg~IA$2AOak%Nxzf)mdn>3c)Dl9r;9BwQ;SiXGBWMzQ5n&0z)l^n-Vu8XHO$H z0TW_3tXpT8F@!;Yz?qo)zF!w7BV+AUNLBpZw@*hKngfE9Ej&V69TIW!4?a~iHinFf zM=0>Q0-Mun%qP97cQ^;wQbrNMYUn@0j`t3ghv47=-TV#q;(DGexH?pH{V(Y%PJSD+)w`KYH$b{O369>G{+>d<=nkM<78PJCK#y2vM1r zL?Sv=N5rq^M1bQ12j5@Uj?oQY4BuQ7AiH+;MI)Ms|7rmu!Id|4o14LoUQSdQR~B%8 zOvE5Yxu7kYO2Glsb+IudJlqePLdSVPnZZZLnC+&2oF8*WuOi6Dr;-)!^B3oJv>Ny4 z{DIsDK7M{dnt^x(&({&5-d+W|)ld8dib-wqL0$_vMpOej9Xll$Ki=dMK zuyB25Wj8V$A87`2&>Vr9FDD!m$_|K)SRH-#xw&f@zGIURaYn1jJyQ_Gf1>XisMQaa zx*!RG$tddK(vy$dyYRHUI9~_z#O^C!Z|<;;au^x0#?~d0 zYKCCSp!uOOwcU_m@`znQ$c2dra>CA^0gQB4%jlFH3T8Ui>_r5&+oS9sY`74-*!S|0uvt{Ilv}et`|Q$>ORt;hhpi3yP%4==(D7;X z{&_qSDM)3XA9#pQ#PJiq9n0Jeh?OapSZ};U z?^m}Zo7c@kcq2)J(e-@XKKn3^!2_dfX51jv5t#pig#}2g&<^-)8HF+*_2mF^N0w(S z%x9USHJXtkI&=sMbGp_*)(+s$;Vj@Mr0);a)nh{?67?|_jP+X^NTj{xm6dlS_8&M< z`{N3NQ^;HxhX4$vK@N!R&nuY~?~dMspuq#S=9LT|sy=>9FAu%)4THGBSgzCL=HEMU zi-U#$1UX$;X#ld=9nM0(j@)%!z2N6XFiPK(NJUSJ{o8f$3NUI#yDVH0^d+GS8Q5Hd z-npJT7y4yAxO#^Vb{Y#+dqJ(Uu>3Ljr05iJ?#|u2c0b8EfTCrcrtG#|v(|=P`a;#& z*f4c8-m&5%vs>cd@u3=c?1rN^D4nN7pI=GE5U&3K4-SF0PjHVkMi5kil56%h0t1Bh z6K9j@SrjlubI7|ro3$t{hmqHB0DK9y0etiTqeK7Ed8$R5*n-4TasL0oj(O;P5%@cR zOkrYGfD8c=*#k9I4kZe<+VkiTp_y*ySQdrDyA(lUNm0=ijFo4}qUMl+Az;957#t+L zT5np7g~f1~10sA_WC_Iqajok<`IrVD_yxNrG+pv`BwrJECkHZ7!EAvjmTQ)`T4?k5eJ-2%OT7Jt+e3!O;7jS-J82j!4Q?N zz?C|dR68iRqm64G3u(ryqQ!&y+RVNVAJ2L4psKeY)ui)HPlwn(67!Ojo^Cnm222uL zs`)r;3(mi7umAp5N{9f5MnEcgYf+{R0^qU}0`&p30*20SoIel3a=pIu(C&nAJmMsI zmC^dyF?T7Pp+$`>011(v&_I4t3hvcL$d;@QQB!e%r0sN_P`Ov2hHpy>uxwD2m&dn0 z1iDj3q9~fxaO7s!Do(>iqV)@1wD4C49N51A+wQ;c&=sbwE9VNVSetGFrdb2hLZgst zHdL91WTzakYXW8c4JHX?G3uIqH`xcvWUQ!6}mGM}v#gevSkNSsAe#2(s z#hl4}{NMl0s!bk$wWr?CT6>HCd|T;arE5TiJH|sQYR`0^*3;XFYcYDgRPuUa{}Yr( z=MeAJG_eO^z;I15WDWu2_yyao(ANT7{fo;V=VGxfU&s zaWw)mxbI^O^W@=3Rcb7Bt_d-EOvfy6oX77LR^J6NeR$hP86Ud3x-v2Xc0EDgbG0pl)ovM7f-?2WNOl-fb#3+J-y;I4 zr^UuGlcyE_cgUj6dKfLLu(uXw*tTJc4T6i*JtLqXMDzufUBJD2>!oHo?ELaa(Kjwn z)eVQi#|9&w{93saK`>uqeRe=k>le(S6)66M2VvFgh0IJohYT;(!ib<=IXj@Wfv_lD ztn?ajn30L?uz0FY9Tyfx7ph?iEham8Ap<)}dMy>sf;LOZI}wtIOm)6uYR)XyEi z@c5+_yxwooVCp~2C(>ZOxjd^0kVcsEBX1r=I%<-m^y(%^gy{1iJX{JHgY0$Dh}mFo 
zH#uMYM2VW}Ix;^X7Ygqu<7+)}Xy$-^@c}O7V7p=MDyd&(t2)G>)jEQ`t)K0vgaVFo ztvOV{(FqCOE~U8E{puer@$!y&YzN2!4qDmLaRXy9#%^Nl);CmSn$h8uCsic~ADnRm zya%`|{kZ(_bz&K=sp;+HC*g``3iAzx!Z`T&{9Q`H8x*2N|MC;nhK}Rb;a8m4?e#MOouIIA zuuJKH48WmyJ8mEE#y>n$=O+ee{dr@Jfg$O*sBQdLE9JVeg{9I!;8W%Sl+99$RWq#|1;q^s=A4-w4i7$ z{#&ZWM)GJ=Vi$SkBA5r22ufEo>%00i8#>g}q0GF}%O5f9E>qFy`h`GjdAL0EvtAcG zD=HH?-5q#hdB`tQ|*K=t1eQQ#F*A&&5Hv~cL+dzh~Yz@4)6*KZza>cA`z6g z4{C~{U?ZWhUpe(sy>&%M8z%W2Mb!0mW>%cH9bE*;1U@h%xmQKn4P7Q16~>`O5OCpo zUyf$f#|Kv+LrqPcAii3T^_Ft3rmn6VcfSfeu-S}0UZ)T|e&&hQX8 zufep`E)eDZA*Kx~9fqR>1CqM2&f^w#3o>F0Tvl4f3S8S}|~{ zNNjE8NQ7L4%n2F@M@T$hol4)~6{qJ0eQa;afo-7SFPRKFaNb^-XDhIWq_nh0A_aRP zg@U+mI|7TV!(TyOi@9FGyOlEAJORBPIB!U7c|Ui;oVsaKwVUqdE#Ht{R6AphP)v`O zjPG3(&CI5Qs5PAuuFm@wJ%jK|{xXQRp#9a=WlvM2T ziuNMV7NlhGh(H#x7U-PrlNcJ{!e8E%l`!s*bF*cYfT%!AbTgpXqzoRh4=soBq4(~s zVzXrvMyytOTU*;Oi`XDfY*27I?J`%)IW5nC#?g=;UBt zV#rLAQB{7oY%bC~d-e@%eLs)Z&6qz+O(kOVM#Hm^4}Q^R9uxV0eWSXNMeW3KfHjMif(t zaR~4Cru{44e)0+9nR)K#zOUE4$Y9DE|vM0GrZf@aE!g?V0{ZCTTB@>~B2FWHK zC|)ER*nU`|<+_Ox2sjrpmLd9^qaTMH8;G;W6Pt0KvDWcw4e*;2LC%Km~$ur;Cp4Z7nO>>7tkoy!mfD@wI**UKL!=%3DlI zd!dV;hQw-?+@1sCwz^g`RB$Cs@pj%`1n$gYQn6ZGK3c-@AWc$y?qt9!Ymkrw(&a1) zF(D0=kz<-)@Z+kg3;Pr6x>CaJKTqct5y}!Auh0+iiKD9S%Z94le`p?dBLoQ z*lvFL1#nCYaf}B09ERbBZUpuELy$qiT_i2>R<;=Q{3?%I-FhMw z?ifs6O|M;9gMlG8)6h05?_nny1hi=n8x`sCrd0;Pc^J@myS@4<2N_A^t<>CoW43$UrhpC|5i_gXpG)8 zAUKriUkC_n$KIqFy>DJ$^=_De!tKwF&zYvI6^&F>g{q5zkNsXb`OUA%9_B2~B9o(y z&e8L}e?M&VhQh!9G0cpv?6Xl`+| zLau_bX=d+mHaD!3F79>YnM+YH6jzRjj9l%OKH9bFJ_I9k&}iNY(K_IZKri7aIH2|D z+1LAg% zX9@XcI{0sS6dpn&KL3lzVA^X0v;v_R@-8_&8_p3Iq}O)bwbgd`FVn|a0d04gm_4UZoF4o`i<$*S=L z{bD|{$Aa9)h=0WL1E{h4e6{e|D5|os?59H_cw;w}z~cBoS4lR*0#k|b6AlgWE9)ST z73FBAgh~?j zY|4jGSdw`W!cE}U?h~0@6>OA`h^R+``5JT4YHbm4$;60-xPPK?e~_0%Q7K@B%AS69 zmwB~;uL}_q1sTK`w(`--aD>+U1hwEN%$1H%M2!wiba`y96&Ti=p7*6hqne8sFWp+_ z2RGZNCbt0HR@4u3$q2(hOA4?xBFh5yf&)kLt*n(mC>&0NmM`8bcs$fz`Y$WO4N2a7 zcO~BTDJUzJ0m>q6B|KJk5L!nrGTBV+as{J+H@d zU^dAx+80Y3A^(4kwwqoS01jmbcFLJ9kL=@S$?Y^_wS0%Ugc&Tx6b~t@J{U<M~NsDctq>pGXHFR4_sx}c>OreupJoSt`a~*3yw!;MNsew6ou-? z9X~!pB(^dGM6(j%9)cFoV4g@TvC}UtDY*w3PQv&mPB{s!9o`!T@KdiGnbMr%M4*b0 zTlJT$h5p7R*#X4#LwJD`_QLEMiO_`mZ+l@FiNGx~BnV;ou!yaor1a1!TN(3Bj-l+! 
zV3{KE-^OKcK^$t12f$tv`;2izLf@R2YZ0&<-YWpSHLG;sbexSV_0H;sYL}QKSGLA) zt!Aw*I@ZxcDcr+w{E|h*x!+MX(*}mQokdlCi0U9N79kOGa9ntZ27QD+Q&$4AN{P(e zD$Ic5V6}%jW?z#4uI=Tao4fO3c>rFJrGC2fE=J=8LQcS;VH8a5m^=mXjOwfcXuL?A zU!NGKDkAIlUO=CBli<%pmX3Yz_TY*bI&S*BdGo5)rl2j?-k$rbhDL<%Y|L_4*32bh z|5>xV@~V-Oiq)mVA{wpzOr}pg3MGB~ac99G(h_E($w(Nz12YoIU9-ys>o&+XNLNvWTA_<|sE3P7 zw(N;{ZY(a3L>Vmf>qi3)-$r|NW^S$}qFVYEK%=|>ATf=Kn`Vw)p-&rqLv4M%e~(UQ zFdE^nFk1PuQAvW|WqE(rK^cRjf`TN_^+DU3xc&?#lMm#(N&8%gZm>l3`t@H(wQwjG zpYev?`At}mdoc;R0YFf9ySX^vNivoM0jVOTt=-#7tK;>Q5T2x+3-EH+65dbrv?Gw_;b#W!q)QbU5U{uw}8A;WS~h_pkW{ zSX|#&TKz&QUeWbgiL&uY>tuG1M_-HQO6w6X-cG)qhFP|XqQiKZreKcki_|4OE)$*^ zgOTitYY?)g@72G1z%Az(?qwl|fM%x_ekM2bZ z#=z1y92K~3$VuE+^o?Vs6I}VwYK&TF_o2h>L$9M$U3?F{(fk3#7;P}Il8 z3iIae{;=w_-R|2?Dg>D_Pa{0-xERe)!hNmwcS5H<8#G+MOQXdE?X#A;^~jf=UtH~!XBuB{ z*KeDW`o`EbI^Dj%aV`0p52$@+aoZdmBFGsIiEN*lk3D;Y3toz3iJnDxYi#%pp(9X{ z$(|$}Th5YP$Fm3tO+um#vxP{PF+^OiW&{{aDWmW1#F0t-31|7GuSu<_D^;WZ$@pmM zA6B8Jq^MZZG6=|pjL@MuLjX}$S{meo1)KPesD8_uM-!aR>>A5Gu#rpnIFU41OH z`N#2`(v6^?9D3;$9Ylr>+TJRKR(avV(AU%YSj3}iZ4wMTg-?vp#eV3sC!ojTNnkPZKZ zi-zIy45S4cXJ53!#S&{r7ENKsH}t3b9Ih+FM#Ybs&lhr08@YVFg7LcNa4JH@o^TuW zZ)8}-_QDLj_(w=x7*w2^@+`Y0>324&z{oV2ve>l_*qQ2V8G&}PJzv-!vB)zh=+r$;(rzTc)M;Mm2t8 z^f5GbNY#>JPyunMbY)=wndC)Ui0YQt1)4V+E6_Wy8{SF-OGS^F_xC=I-~IR9aa32j#(ACR>-Bs-*7Lllt$B)WE6Y|gGBP?fRpoPJWEAxH z%|b(kpX@eCio;*pX1Z$T8X9DW@R){-lAMf;5|8i~{viL~kMCOjug7xajQ{H~@iaU! z46h{q-uM^($kEViCjRYw?vxT)Rx{fe{z2)iq;{SLzr1Kn1Ifs^$kdb-&%4L}{OWdo z=f!2}84F$MKrXJ`J6TQME9aD5|EY3y$NZOIaci-rGM3VN2gnWDqm{2#@zn12o-8la zZN2>JW`IfZQLdkImGVx1MyfY09Cdaxj~a1qW(zG9{p2&^OAdNr05)HRM zeRY+}7}>v%8x$#MHh!o?Bg&%ipZ`~iZY8_@?^7R8(f|8QbFrIT|9L2sLQaXdAzsEz z$3?s&9yVH%DgNi-6_x*&Z^^rZi=H7jH<#=9arW8=%oLiMnp|p(8oNcT#8~zUT_$^$ zmUhD2oR{lq{Av(!+=uQ4K8%UkqMx~&YTI72f`Wn|&cv2?a#B)KZ>c%G$n-Nif7UrM z#?DyR*Vj8vbRLP8^Ok0lc2QM6YNDd3s7SwiH?yFLrQI!X^X5&mgL=8oX1acGD9Cy*?KJ=NgzoT#SM-nX9W-r8 zQhaUMy6>Pt!K=?_iBq{x_0lCaua#eQGYx{32iXHY*jD>*nHh>Qz8w`6)zXzS6eGpW zO}<+|SHX|^@Eo3VGa-S!F-DZy%iH^L$O%d**YTY^2M&<$yUn9UwO+ALjggE@%(|VX zti1ej(8tNO)n&)oVGXjIN?o72y7r8Byi7ZP-d;~n&v9lzF=gVh=c*VBoy!zS?`@v) zWR_9+)*YO3jSH*4#;=jx)X%q89Qyb;xKm9_>u$=a(BaaRxOHhsNrmP4$=N2C;YMcn z`JZ~ps|H~_G+VcB;^gE^ab;v;`nIw-OGXxul~`ITBWBb2Ao+a|ue3DFzWw`Yz1L5N zQQ|HA3j2KwbR+NX;H<`1v6{L3{6f~HEiA!QsalEKpPz}^y`c2rps+9lpVpHHWD4Pr zwf3t#sQU6nw=+vo;KMBn>etpCFVt5R!g%6E%(lvTFP$IzqCVAEet&g&;gnifbZ55S z8*}%y6_4jvMO!E4{!SM5i)6>)ysgdmI-IWKJ>>eWdb4d`nbhp{B;!gS_mSoVSywD4 z$8XgEjmwdY2Tl)kW*P*WOMCqlqN1b}>WY=Twy(CXZX{#&&zR22Xi8{w|KWdLoPSF} zz5PR9twV|Ly4-dS+34bO)?@Y$EIvKi6?Q;EV)v;Bdz1fOqtf`ErWVe7#l+<4Lm&UZ z4`Me}PoJ*d!o+*9Z?9O41B-<1UOA87%DQ#?hYLLy>|g1<;dL{P#s~89@f~KbR9Kkq zuk%@5x|VIw5P9Smso42+o#4(-iajqgOzXpnd{%EfPhom2TwPU)! z(m~s%^OYXS^>ozHU)Edy=Hr+REaDG(SK8-*0z0ow_pcnK*q0 zs-AW4-VG!v`W1^!p5fhM+5ezN-wwX>6xzl*GiPwErbFrF>rOfKs=uRgX{nb-3BBj@(l^6Jm$M=}Yr13h9c+CG~7PwTT z-5L8%1g~btaxl#{z!ywwYt7WZpTWeZsi2~A=d*C2dXNZ>c5m}zdXr*HPZJ+zW@g9l z@3uU@O1~K;gXYhT@}I7>>cp!wVnj~P#|CWYlCb^4wS1qk;a9IirJ|;02tI3@Wts-fSqkwdjjx9%uWD*; zU)fl}Tw9kGM-t7l^+w6|iHhnFpL62f!f>ESP;OhnaKd5kcV+EG$It#qQqjy46zo%G zYuF==!mdkQLUo(|DU~(pN%}SI6M?a40S`@-f@SQK@F6{NjMaD133f8M+zF;vdXO@? 
zpWBNl`20I3FFigvsr5>2YVFmXHH}0C`hTBz#GrsuUS57`=*IldEk}v>GEZ+_Lmp66!v)pvq^<{GGDxSakTqQ+D)-{VI40oetIUy$;P&=qN0K-l#W@% zzcl3TU22!H&+i9v%$s5rA0PUs$Oh|`jzl6Et@powzxl@8_yyiK z6R0_I=cOgW4`0l>j5-$6bwo&re&J_#ii<7pjdBh?jadIM9^uTm%*-P`t8-mMwa9my zHrlpro2gHBo0CjmGJPe&4&; zokDXjr@ofP{mBS8mvyOlQ(Y*h#f|><7yr=q6gp(Pn3|*Op_);XX^$=u?NQ#Rtoq)) zSaw-=dV5mQ+dOv5)~6>cENa5T!i@T_R=k!*GZnIKPxlhF>(1TGas0-4OLnrGS%#&< z%NZwKT*Q8U$?>z2(N0xo9_=l3U@&;&*p+R3cmJsnGKFuG=gV)nj<#on+h!*`dPLsb z+??Ct+4lMQ8O4;=^zV`%Sk^Q&HSY*kuAKYO|Keedx4k*1LY`&oCZa2!xU((U?;P{k z+>I}S` zkbB{8;aWawzb52W558Jh=}azMo-iRN=+!<>lpX5!MjFuQPy#N^!G#c?j*}@}JSxYSFpz z4qB|DhW0-B)jv!wKfj!H=QXih{5{l?X>jayzWBr*-5_>PPtUy~rZhY}JT#@ePtzMa zDX{3qJpY(~9~dBee){n_X)12@O%xR0mgXiJXD?mBwh{+u<2!dJF8np@cWi9<%yP`Sj`2s{Vds-PDhd57+on?}#1|#BMM( zGt+x_tHUDEo9fDyD~+p8SXBexzi&cA-5JTx!$Vo%(BB+Cmf+TRE1XZO+IMZ1^Gb}W z5e{h+H8u6~t2=42O0{!M)t+A+Y_K)H61M!EPAPE6rY)6DSw+R^NJ`hl+qO0x?R|@{ z3`=*fE%&cG*)7vE)CRLs8x+_wM5`Mv%}>&z77XjFWOtoVP#^<_+EU;C`pSu{w>2-- zBWJcOW7&S2$gUh7k3wCb+QG4Hd0~1_nAOVPUo&-_zK#~wov)bDj!&ubDzz(2u`<`+ z%kts&Klm{soHo)?sk;3^pe4fuU)0mQ@PpQkJbT2m;xfGZ7<=L#KGk;SOe4@WP13OF zDXhkJt)^v$H*{R>B%t?wg-_$9o=zUXI$NUL)c*B9Kiam*(o>CwTe*oZeDtt7o<&Bt z2}^NjKV+CHjw?M^u@+UvFSMpJZk66j!LCpD}Pm7z6JkZDLBePqZDYe zol|a`>v;P&eU&gC0IcF9<=@Q5Z-_O<9SgPTCf1Hp_me)Q_-`rHReCy zQ6rYIqnq2Fu(9d@TJ0Q@lT+$wfJ(nVSdF)5%+&KGIxaLA`z&sjb{Xa3X~Rk>UM|C{ z#@qSRFJ#6(RQFH66)S)1wRjipJxj`Rqa9XORu3ON;`X$2zUJaWiN?}mDR%l4w=x4! zilkk~tJ>1EXcI1m@ko07=5cp-kLK2E_5SsK4_-t*(V2PJ#YipeGP;z|^`E+;JUUS| zcXyn4c_EWPeF4pf)fllFDPCO6J}TCR6`KUu5B5+WF@^)XY-3YkX#L0wt>NM0$D1qH zmTZz|0czivH{Av>-P-hLdAjn%rzgk10oba~W*b*>#7fv55@QnHk)TP9<5r&*&j)zf zv}ezr?$Dujuisxq3a+bwu+)sTrCsTG@y~1T#OR>Dx!Tzen)$*yrah&(G|+&ml`jX4_xEj_XDS zym>1xPpo_V#Uc~U{Zc9br?CODtTUQWfRa6ChqWubJf5dScLeP^9KBqZ zem145wY7EB8IU`GcF$3v4v*!@LgIOi%#Y`Pc8dlE240Ts>gd>YhmJLJdN%Hu!=0ic z$yEKXC9V{}Eh(SAR{C;YpX_;mKJ8MmQ%kAav{#~Wp*{O-zB(>^^TPLTz=SH4>ge(I zS9(kY+z`vo?=E)co9%KYzBD;G`E90*Hn)CnYLH3wS?mwCOP4PB4}*3Phdw+x^zI&0 zz%V%(K>zJv)?*?~6N7`4=$4HOi?iq{any7w z0Eu!OPo>ZCXQ`d#ONOKl4JQdac)7$HjwcjLwl*94wj6 zOMk2gI^>+ds}Vc!>qlKcW>ow+9i3l(JLE1K9m6l2o#(Ckl`Fq#BL$706kk|hTm99V za>`@;-Me>}RaI%66L3XLl$HHk)zO4cU~#8Phw%_yAb7t1<;83htg2|y%~Vw1`^r7< z&)4NBpfjHI_4OUTKHgU@vxe_{x=y0>{rx>DplQxB8c$@IU8NIR)UlCAdai$p6+dJ7 za?V9d35A_(T%^0aeyTvXJa zI{^XA?F2S2zTT3Zitqa4jm0OUE;UuvDj+Gxt_;+YCkda9^?UiDmwnF?cNx|8lRoCq z_pZRM7ewr0Hpf(Np%#f#+pNI;YY;?-0sAz!J`d`vCMFRf4ib=@{9j7l z9Hyjwe0++o)KK|tTrELC!72Ue%JqtwTReg+3emnnp`lUF6RNALO&fTQVJqL(4Q+}$ z7Ux3Smv7U>va;TsAmfN7GbN3EXI5hO?6c3>qQ#UMfeqHtYPWHqEbUc3b;`M`C)bP? 
zwXJf^_w?x?wPS)sfYML0_e_$C+n)8id{}K=zy%vEz#$@(J)rdYv&P7zdFq) zvRgn5e`MvOj;d;E`mC=1`Q!2h$KjTM-6eVBPx{-d*y!dZ`7ZXUw%PaQ6k{q11TV5P zG)mmmcHU2~v^8~pA-iXmp+RX??GJ+S)MeO&bzwQ;)z~`A9{-zmHxJB{!5Y0 z?iXJ;15wT7xl>}X_H4|yr#|hzf#NA^8iPC+ea|^pK10*ZIp^e~akH;dxyhnB(CqAw zdFAG{%-XsA!za5GKBzVYD+G#%%`(y4SZ7okG@0yWvN4u7k+bG_@EKh?=bUw!_9K7< zFKy+(*mRFd0ujI3X-J_ssJwdj!^vRklkJuF;?bX(Gm05#a#YavABD5*SPC>d_ha5a z>lT;OQyVVRXTvI7kxf0vI;|zcxqq*|yfb%LGd*GJDLbCXoXLBid&!sBX`ZTB_w5EU z2sA+*pv=n3A~iH6%J&bf-@0{cW~y}BscWn~gGtP?<)){UGuS=Un&xtphM~)UXTDK| z&P{z~1t_XFw1I>*Fg3NdwEpDD6QajXp(nTB!VcIgD`DM!@StIdm|G^QJU}I7*YD^&FlR904@2|^9r%P z-it#~q=ty2hAb+f9CkChAX`bOu z1+295&sf{r+B@4b17*)2-}_#P?52Q@rKRN+!S_5zk7^dETgQW^nOUWwKy-^(qkEywn7dDril44t6KFwk&M_*#(KspQG^qL@IQSpwg6`F&iEK*;hfga;dL=IW z<1W(4F`Ze44J&TpJX-*W1jV$Vh#b{@eRHwg;HmN!Qm)zkR(%xYY6z9X`b@m)Re%5f zm377T4bn+fzha+u=${t@U_Qa-Jr&+Ek%#^XDlWHZiap@sYjr5|LIWu)Km&U(*EHzC zg9vmNzN_ACJAvYJ&BEgRb!TQ&vO={`I|gcY$O||n{Qc{GP$%`lwtMaTA|juki3%5= z1huQWMPa5&nbcwZw!U61Qor_RPyT(6h_5+UPZ7jz>C*Z0J4qGZaT%I8bIBJn_0i8G zV`I0T3gs}1?W(Wef)gG!Jqz;dzemtWsKc66bgd$h{@wfcZ-E1%$K7w-P-;unnDzES zqXJHQfBvwfq-4KV7Kk$SF?-VG7dpvGo82ce3!{hgB}P6a-o;_2di47&+s(|)?TS}T zxIW!SC!LVhvdE)$8`t?3gsc|XK$_C)lO|94Becvto&&Kx+J7p#;M?}?+fl&odL_zx zFNr~Hdewr~r>L#1T~W8`-o1N{03u2gWMqJ$3bVsav)*e1Z~Xv7-=2T@&&vzw+qGiZ zQf8*6H{ZN@BamNRQ*#CV>tvSmk6OBPowN|*#2r1l9aJyq5}$y;HeRhK0w?x8Ty5KC zp!af@!qmXK8!nSQ|3pd?0yP%;hjVA_PDdQ%?tQDED07Cx>|?~EBRY!IyJP_{HlZCn zdLAim)1Ln5GlM|2%ye0b3nfJuPmGy%Rn7bNCkyP{75SLm-90@UIyE;ZJm6LqBNU?z z(p#DQf&L_M>?$ebYGGRwY;A1^zSZ14eabvCA%PAE*3767 zfHZa5V9!acK{0z$v{V~(6NT(9+NKC7@^W%=jWMYF)_EbCX?`f;q8*QTwh&~C;0Ikt zWMo)tLpgUyNJyy1m^Vg~17s%ql4@U`Fyq<+qcF*7$;3fQ_UdbtYA<-z`8^Fl7jZ`_5>}2+hZ^yBElb^HDXlw`0z!R@|uTi z#O9go$^X%uhc49E+`K!CxN1N?%?V==-KktfL`2j}4Bn5nrU*+(>3t}9o|^i-JwvBh z6rFWPTU%RzY1nLQSr)+~CwZXHbj>AyEl z`jiV@S%yFC_X9;exPM=q+?=Y{(g1qzt+Vp?Q)R!z=8VQ zqb_@RXSJN>u8H>t+@qpD6c^M+_SF7s3GYy=r5HiS!0CB2<9O8E|BRmHj!}8A_li?$ zbo5^C7@+v8PN~P(B^_@r76cbh2tdjGP?D+lruLaS*-hV-Uw0p>JE@&ME#mPN{O_vv zVIH1$gK{(9-_vqW?Af`K=S*eZ_0;)WAXtal%eA4{i4^Y`92_jma{l>6IK0;i^)2@m zEge^*e7*Qh8aleiN5V-2<)l<#lD{tID)nflpNX#8x*;b%gZX4(q4=C1Y4Brut;D&j zyy8!p^apuDw%(t|hiYj%?pr?#g3*|OVNsFzF$4OjJ= zc0>Y8x$ojxPbQQ2`~`>q*>d@LEz``6+WY+XrVab;Q)*WRJx7Z_r{1F3Tb5qFMOd?m z`lSDdTUoarK8j_%_3-=6ig1UG=UYs!_V#pz?hN_A{#SHpEOpACUA#&`TY55L_T(QU zD7yBnkN<}9YAqLjB^aN;{mlKpP5YzcwQIr|#t{e4Dk-pdNJ&cGEH6L4-~|06snwE_ zXOklsuz`WW)DU<_Rjo>6YIS>0@$yaRm(U5Xb>*O*j!nO-Zf@Sr#>Q3@<&3Ax$jFf2 z8{DtPSX5HtB$(D}`FkY+`i;g^)YJLHX#2T;&e)8gFqNah&=nOG(Wveh1m+Gec#r?2 zT0NK2iee?|#{Q_6cc0OG=cOHNYGtY^Suew1~wag8~7gN*_bZ zRKU#?m6R*Z=OG-xdHASffu1`$H1z&bn({ryBFAq^$rA-8@3*?Rxmh%ye{r6F+m0Q3 zV%3iB;E=5%bl>baoH6(x{g#>J6jVw0hYq`17<9Sc?(1vb$rD}B3#aUKpt!fFTZhNT zKOl5m`~85`Sf}TtCEj3%B;$O}F8!0PP0qx~_`4m#;f@=go}yjXTwUKwIS+p*;Ly*E zT+FA9L*@8C3B~T+ydV_666d*PlTFs9-PNeEnJ| zReHK?cDvVnx5Xtrk6=!D4yc4%z%&_L z9gI^KzkdCC1>fejPCap1TEI0_RaGy?`l5ynOim_@{6a-~ixPOHKJ0+HD;hVosCnbS z$J4KvCUPDz@(A!GT;gQ7d*@E|GqrHEI5|0vj}iR)PwaCAyzs}<_Y`X?D^qS`lhnSW zOgOdAvNs%9Wj!dsyxzedqX`nH>4cMY;%`0wf%kQFCm>JUzhu#!YnICPNjs(nQ&ZgPL!AL4>fT^`={Q{cKHPpJPN1< zSDP(0lun%BmX?mS>I3mT(G)AbnQv3Hlrt~5`EKb%9<`Zni^QSj))^qQD^Nhyv%BG% zZITZf8H+j-DZsF{x)7lv>EZa_EI`E0XmOj}I=tArx@GRyDn4Z!6oxEHM?*wX($;?1 zI>I6AUggzvFEliC#zwQ<8bIc?#V3wb4}&~Q^70#VB1`c`*>|yGM2L=Au{a#(mDrzW z#S|~^`y}H6frf!DR1(jqtNWwJiOih=ZH$(2JKAQm`y|j6bmBW!M&QoTvK~$qpCDMV zLgb0I_WoVZr(IyHm+G@Dic4y zmhPEo&!{QWgkks&yltz2V7`8?>6Y1qARe{qfq~0qZqrX%)DwM{_`4F4`@9zn)z6%1 z&>2DN`;w$cF)R&8Qoj(g53c4N-^iS)H=*S;j5pdOY!a3(Thu304VJ7Cq`y?Z~9G$G0qxlUY2$zNKS{?s7| 
zq-&i=tRdjY$Hnp;*~Xk&Ph?}#G;iFHu2`NbZFFU0Xa5zhCHE-b+qo20E)_MkXkj?o z{ZXT`FP52t6@NDwUb+;Jsptv#OPNYe$$Dn5 z!UK-uv5TF-4w}Ud6z+d#uDm*%LgmJeE=?+R%YNvplLCX=>Fep4j`nA2OeuxD5jBvL z8(m-aUes}Pe65%OO_3HNWrJZF#Dc$&6It6?#4PE%?97Zcj(g7U{Hzif2=ij3`N!yJ zAa`_&q;gi!;8r zJr)s(lH%fPT`)0rfpc(ASR5j>gRlvStmNcmkO5Xu)Sb>mN?`H#fd!}H%G8~E_E4Wa zdzR#C-(Nu&$|*k#uK%^ng9)L5;wX4wY^J8BG^&)2zrLSTJ#*%V(G+kG)w(a7*-Mo^ z38l@@dS<`w-3sU{FXhJ2hxjZ>(GdYna(MjsFmOYp|JkLAK z_aD8W{}!He@S|p2UuA&K55-6Eue?{=Yh!y2f_s^y@aM#-#+Q;_Paz!uk?b@}Q7kU5$Nwtd8i=2c0agMD;oZcj3QT-9y|M*~($Pf1jP;)7GV04rQYb@${ zWDh~w(F`I&6bV6sdpElneRXY5Azx>OkCUUbvmaRPA@&o2^OL=PAW#DN?PbO}McG2d zB_%|)kAQqhNVs)%r^fdB@3cv^e|?feEZu`oDM(Of`M{h7@GJ%FxpUHY=-~oUp@KmQ zz9FRlMe83wb@^m=78dKJX|s_qpmFpR?Ul5TL-F8854LRClAC|l-Mz@RJ5N8;wzw}3 zvXg*bE|1%je&6-4Nt&=Rx3fz}ls1Eh{(?%U)Tt4C^it7HIufIC$A zkOONC>j2uYOs%x}4jtNxHeb7;to$9UXY$jRU;De?josMTSk~+BYiv+0t-9=rr63s{ z)X-lL5vcM*IORh?PlmJd&6`*dk5KCTbRHnZ^Z4&L^m?;~U7VbuIIWRK;#g9dMa?gt zI~QhaXZrEFM5UU&Ije$yKtMeU-e}}6OfVCOoja$6PR2l@xkvcv;LCcwb&OQ>fq`J* z%CK2e*&z7hN(Xn|3l4s-=tm`-tpjCP<&u|7qg?S``MuSID5k5 zXCi_zd7WsnMN32I;=!H3pWC}CoB;2@OSkRzL3D zJLL`FSD0kSw67YhHpaLR%E_YK0xrR@w?&MISrvHhs51f-{zwjpc7Stgr>M}SdfeZ4 zR2&9=xl$S#o^7#}}mkP3v`aPIElr@lUpL+8_zT(G?uyH{OpKYSnIl{hsUkp;9u7qfJdN}?nVF)KlL*<)zP|bSc_^+= z95T_lZfT}9xGf&*Ej^!cZqX@pFW1V$ zujD;$+!!vk14Vln8F^vgclMfbOTm_bexLdg365hRWW* z2h~zGq()A!8FJ?Y2xlfp#`f$E4@E3>m*H@=~3B9DCU>E=K z;|YtK^npB4TlO+BUCrwR-fU@W3okBro9qdIG~_XkUGxoxz(-Qw*HX7n&KF}Y5*e2c z>Z8UL0yfKxLxA$M?Ck6@%T>_s9`tz=jtCKG8(tHG-|02|!ENZ{t}Gu9?1&WC@87?_ z2100bm0KKSN^y%sIP@)XK#eiLN&$rq{SvYK*cyJfZ$}sMLqB4=?j~uIGJHQ>?d|AM zbp;9<=QsZ3@9+sR(-pJBE`uK!{0P`eeBP8akV~^OnFrL5DQP69h$N+-G{IQWj-0!e zem4lx*+*rlKvKT=O80HKJ)(0}*vZW8C0>gn7hasdSK0|7ZQHo4?;2a>-|w_j(vaJl zoR4L}F-!6Dqj1XvN^9)ZMSF zI^jI?E;PWXW^FfFjX1i1T5q9kfvUt?IfCwEq218#YV)-uUoinvLky{J(fg_58t2B5 z%HI;|J=qR>q@GS=#Xd2k;oggIW4lLlkIg@p)OT~#TJwJue6=`jNQz!sd;G?D*htgP zHRxP}F8lLBet!Zy6#?mE_bLSiEgee^=kb4?jF^TkKut0{p?wtMmG$>LS`-#+qRc3D)_0$Ox`&JcVB0>5@qT}*N6MO=q(mat!7A=RO^98l3afF% z>uvw@!|97e)Z>FflY^59hl{o{4Qr1PW3>vP9kL?G5I{wZn-~fbM<4=~@^sz73+*es ze8Y;=C;xqCtK!@@Z^m?+)&JZ$KI$U9k(HqcB%F5m;pe~Z?Zi9OAlY?!v?clUDgFzu zJS?=nv?xBB)2>D9%4UWY`(fx2fPZD zlLLp-xS0F?4mjyVS9VQ_>qJCIH4hgT8B$QiQP4r3yj?#oD@)X1k}E1aHSA9jcL5=x zhovDZDYu~d?3En`-0SP>BjvBGtRR~37$%dGqazuZ)6@`})!VMFu7MoTja%X2Om3O5 zl=iMvLI4^V8L2a@L*Yl(6-E$LSU5YrZMuc_hEryN(uBu?#a$fLP`(Pmgy(#^_N`J) zq8l|dGz6vB4`kE)=x6*X`C{Y-0&5nFJ0UhP@*{v7;g{Q05*8LU7*We9D$HZ9u{$vqe z2F@~tXbDi`q2+${mmOOh*clmb!!NS1!jf&3#s+_nlOx*oYj(EA*mt$VGE>5?$1z_E zeEvbFUcU9NNPe9=#hS>U&A?mF&CgUNyyL>Y8}~iFqWuwGgNhB}mW-2=lQA~pc0D_F z&Yk<+F6A;xPelZoi6kN}c&8-}r)7jF27PUc0&&iF@$vDpuGgnX_aGmb zWf3+Bz)?!8Iw5Y4M;Vs52Fe;aZP9da~(B6LD7I`RVB2vaCY=RsB zn4coH?yx?RXf=S0jw>hc3&K9?aL`S~A$&*m(oSpRFTZi&fT$>w@A|46k|gCneMO7a z0yq0=9vV-1KhzL4BDH)*Jt`sL$WNOyg!uF3-7CT!Kq8ArZIiOVOg}2Yi02(2I4iHO z)@?`5eWGl(S%#3hQiw?R<#&M$>aJvDKClbDy6h02a|9zzl_u6S&`As_Wcd2n=L3I- z;~bE`yS|l%<|eT4pn(m(^~12R>UK?__B**|4J?^&KqCGkm>)e(L>A@#4)}SAf-t1F zNN2BoMP$)Zj729dEFBrL8ZH@`xD5GB-JE^>Z$@#N{cvW5I>uU(Dars`RVJ_*iQJuY zrGRB`-t|d+NuL!-vYl&dkk`2oX7093KujfC!j45J0V}Zz@scQqRK!{oq1@=mATYcc zioipMBfr;SD^zuN>#0s6_wAREknphQeJhsYmGAHFDOvAvD*B|6Lr$5)_LO>K!h_+& zAlYr*?Z`{IuwLA7b^ecukYD|q?$XE1{p=23WC=uAWw@9rkc5+%tH za39k%hW@?@3oF?JyXnu=*SJL%8fV)$4tbw#5JM1xETpivx1T|xg!Yrm;%~F)

a zGMgzWtDc^`?{Cpl>L&QgsGNmtt3w(DlFEuKfBKGUrJIcxZ?&0vgnuX%LK-Jv;;1_2 zKB{xeKBu4EpEYmQm6gyU3z4h_DP-XhXzF#`C+^TRxvVa^X31~>^L&FgintgOPOqWb zG{#{3;u^x1N}{WO+O#gYeSe{yyve>h_oO9p8LxevOa*G*n4XA180#*?{KgoFyne#M z!b+kin8oDdQGoVo#Tc(1B5+ei7ykN(K zR@0(Rq#8@kBtgIOnQc0jTV4@ZD%6UvMnc3-CpkQwTp8S`$Ugk9HC8)45MB=SZIXu= zOYrMn_wl(KS?r%)*1vsAewU7wmq!Owd1-0M$-)UjH+BY1y$cXr7RB7fa1=xzL~E5q z?0WLHoOyiceL_8oFlg3(o;I9n!p@e_K{!ANsWsJ9gs7b8IE*yFN}}WA1$cB&(ps9E zLu7Pd@dgYkD%-Xsk-KpK-kCK-R*U?TaV|h698&%Ey+R7APrxxR!>}t7EiNh=Kx&ki zrwvd-qfY{ROeu1k*v+IZ3Txz)gyO^mX!uz$ydTL-!~_5XLp_<%ws2;NvUkR}|&B@>dw|wng>Nw4H*eh)7Y?{g9B|^CvFkBSNZ2$G^*CxhQPimfgyEvLhCz2^> zbbKB?*4?qR{6;axA(WT8Wb>QcSDCAscHV(bmY!BV8{z&H&9lkJ$2;CXd*)7Lsa6$s zQ;1J_wHJjS)eQQ))}Ba&QS6MMrIxr&XSr;9n)L7?eYeeS>6=7s$;|hxiB!evya9ao zi_&+-Gp>!K)F!xGI~Os3@v84W<)iJDO%32Q#8Zf21!UGVH-;53Hbl93bNyPi@+zng zHB9i+b+2ixWp&dM@^aHx;gJRR!bL)k*r9wb5~B>8Zr{G`?-h{Q5`^czCg_=Qbb3Do zhPMq3XU6vKRt%=TpBH{1GC#jEKXb>~?RJ7qpRk^Wn-gB*pb2+_o6iq}6{=yB5O{?w zTe!N7CvqQ%(FdNjf6wcDC7RJ*tbk2Z-Y~tgy zT33eWs4%bLjV(!sKPHBZ8e_$2F*EkQ;vv7h@hYrj5u9APLE z%xYXPHjYO`gmGh2FQ5chiy-f6O!By7XtlxK2jg9J1j>PT8Jx77KC(7%r~i~LUv^nHQ(lziUiNjM0ww%JUY<52nC)L1EKH^ z(w<5pUyGfoS;Z`?62?O1*Qy{zaL0hb-9n7vTQf0n#_d7RAZS?fE};&KGTW&9kU$#( z2gRcQWL;uTdHU3U5Mh97v@RzLYXlnmO5LZpkfPbvXB8PI@9!?c_|=u=Jd!$#0@uFb!NJRrW#rsv$e1Nhm@1QRCroi+Kk_K$W|o47UxvJN7^4hegv4GhKPvvB)pC;J;CeM z`v+HXJ1!n2r>y%;SP!OsoNdoSh~7Y)60ey7aty`M%4C>4@Ynq_ZDa~13!Ut21HCBr z-HX4EijdeG_Q26J{XQ$Ig?To3z^HC|+F?->?w~D9LinK7QJnun(ZM2zO+h z&tOD;N~rQ^{b58tHe{^1`T2tM-d*s_Yhd<74?pJF0*1pQ$)I3ueTY=%5pHFKlR>j> z8&beFU0K4u$J2nBv=PpN81@~X(`Y>R!nm?xpYeX7j&jchE@9zE8*W{4q2Kv8fI*_Z z>@U!@2Y&t(kYwOKbfw_s5gK6=tg=gc&fdLymo?LQy6^b5EnBWs%%tNe-IwR>Qt~An z`Wju6ze50g&{+)TLJSN_wM7eG^*6q3*~B5|6^LB2sr@u4#{jlaL`aizc9*l^UTX`B zyD&rPWBJh~_Z-oGN9x;k=zJBXn3yw>>*Id~oa>wkos*^Q^Jo98e(0;w4%;z_nuYZH z+>P=Ub*EyFSzbumk;Ua0ioAv9NThwC@g@VMICJV2*fN`&n;&A2qkoe`PIB z(038`w49m;<4S2UnE5ya+JeLm;29xP2W(UdNc~P0F9;SR?eZb16Qx9$i|b}-=k3k3 zLWd8Z`LHwzx_12d@nX?T!_o_?98!jrKI}5CTTft`}fscS5{Z)#v2i@d?rnH6Ef4T zzWL5!_z}E?3qVg1A;t%Fmfaxc2MiLin&R?hB33g|zR)LH+zDXf|H>FPu^Pld#;{Wa zNs%ZnUzqGobMbm3Z1!-@8WP>wP3=WxrL9Q9%Poj3PIEe(d?sGqP)HHm=ils>)YKq z62x2J69Y|%;b{N^QUlO$&A>Y50_kDq8m57yf#6%rfX;s{n)uS)tq4DN8^=qSs^I32 zOE2I$fXUzYEdHJaJ+}oDJX1qBw{J0YgwaES%y=>rn{E5{-4BQR<*-LQr-sVP%975e zs8EH9+kR_mS)^9%e{LA=$D?0m6`)r0ECx6jxYG{-9+RFDS@hOb2={Oo{8; zASv63BEsjIQ+plQil4AiojV$-JRu1Wdz#T}Tf)g1d`4oyMU#nshpD$0B5 z&)dO4)8|*2Om;Kw^X^oM5XaPNYN*!%wcGe;fqqeRdk%l>WZ`mir16qa6jTyDp#&2n zK~in;QqI4<33Bc@*(07R4Hc=$`Fi9G5;NOyz;RZ6m|J5-`#OLu%;(Ra2{@*{`nj`n zcg4!@Gv8vMMGj!sTEun%3CgX^)LeMB#p5*>y%K`V?0vyieV-O=2;xIg}n+Z}*K z{HHnm&zl?==${gRfv6UTn~rdCB~@J?{rsHvU2UyM2i6Z$dzN9G zm65nEzT@md&fRUyn{YnGmo8!2l*mply6RMtdOs9KWrk*^gNuN^`oLQX)eRG z31)?gTSr1Tj_VJ&_4J>|h_x<$SA4?OVR<6&+kAfFll;0r#B1Ru8x15Dw}QD{iR}WC zg2jDM@=+Nv#*65SS*#mI31%?BMXUK|n*Avxfau}l(2$UzWc9X(4ffh;=ya+yqRn^l zj`30{8ALb}ssGEd3y^&8V0qGNQVQ9A5hBdP=wUiqTBhq!!VEsklgFavedXH9Al=_V z$IwSxN5CnyYVvoSN(fub^t>F1SO2QN@a9QhVh&Q$eTGXX0dRPz)VK^y5+R~tdf6+@lP|L?lu!eP)eoGG zsG6CXktO1oEvOpBAxOAM*pt3krlN5?g|z3KiJkKuO2lmYy`UhI3YjfrNyvWP@%P`` z)DBOg-BPSEU-tNMVF`%>OJ6ooJw&xiii(7?;}D61%|Nou#F7*3A}9N^u&}4Z9$|se zv9bF2Qz@;N0o{T-S)iBobaxAPIk~yjK^Ps*a7J2_$gX8wb#Mqptn^rH2E;eZt5<6- zL87EJE_3I$VSSmPnRdd#!C_be!FwX8jC&hQbid*2Vh<(U2wb&Yu^G5eK%-mqCbBT$0IBL+ckj*YV>*nlTpRo_%nLNi8u4Y;Xwz!D2+IYm^^#f&0rFy+V?FS zSB7`Ozcc-c;rn`Tq6SgrV>MYhaR%PI=V1sPN`}{hMdtb|#Z;7p*!DiyR!Thvnd)C9 zHgDd{k3cGpS-jXdh(#4AOgqx281bR-LJeG<-MRKiZwaHfPWKD)w85O#Xm$ab;*J?~ zTKRh|yf6yIjVa6ujT*KDiO<-B%G*H`3sO9N7FuB}iMlynYPJOP;&mvp(ZkAwSVv=u)BHN!8`?A9`v#Ya(1dA1i9a&0TfhIiH4lhvTP 
zeXe87h4)CEQr5zyvrEs}CV4D84U{S)!o>^2Sqe3r^4-oBXgvK;%>M8`%ks@j4<C@cgypB5Wjiww8e$1SA$5Z4R3pTdSf$k%7g8aG@c_n z@-8;RdtF)etnTt!hGdyJ{h zq}tu7c#e&0q0hGm&TB>1VGug~QBT*Y$;A)rIzMU!-PV)Wlmmy777pi_cUJi37R(9X z$-1R2Y-8hKlUh4pFh3p5XwuYP;k)nI!B^hv=h^xyUFiIS&UzYb7cVG`w{UMS^1+)6 zFij>3a2ir1o3!^u)-X2T)P|>m`x)f<=6Vs&&p6Jx=xa^-tGDc?z9q-}kI#Y5#-0VO z&H}kw$MuKt8Np9$Ya6{=(h42~9yIWLY4>%Iw`(aQ$2U&jpjlDl+Ju(%)uzeJT34MD zypQ+}w7>Ro0G%$*uv;B$Jh9C8quu@7+3ReT`oUl;@1Rm$TYt51iqqR4K}1=>_X4n4 zk&^hBz>kzZx|Egn1mGme6~h$%FbJqzKex5zAT&qHzy9;fEx4806QgrCWHF%XmVj9F zRcq@PiUd`){qmaIEfW}@cCoac|L|ejJp#H_<@TV4??AKt^r1rj$oernd`HlzjK07J z!uc_`RPZ$z#6gM)n83vF>VCIO2;9k2PMEBO;!;{82(Q-2}b7sB0Pti1e0^6zhQbGKQ?{+^Q<4=zMZ|C z3dcdDYiV_%e>T$@X6&#uz?q)Fm`#^c!4^dd-OitnP_wff0}7}n38H>J z4r#~)S8H6w@RW{u{owfcN0K0?ybnGB3W-Q|FET}zk*=(``+##QP++G(%xyov7*SA< z8iAO>-v}ECXZPMDr)UftLrU>V^Pl&7j0X(}>pFK*wBK|3_^`VC%FKxhS0bnuU*F$f zSw*PVjWL);z2ul;z_2Ap(yRGjx&INxg7}6PfZX}_`~UmKPX(z0e{e&I8YG6Ihcho^ zvWTI3_m3R2>v^|Qf<|xgpi62L*sHN{=l1C5Sw5Pc zMS$_}EIaLug(*7364Ym35Ocu)iOwb^am}DDV!#H4m}-Nq!+zO})XM`oH@M412=3`5 zzydoF&a3g;`-w$wJ3_{rkC&Cmp|N=CFsC9wgSKfB%nVIoRG&&XIU z=5Us{;Syj`l~{IM94iRHNgxbB)GADyMGqf={gr3kaoTa;1TjuTO!LHcwYD+<_wY+H zR3QyDfB_Bl*&CQ20?hz)MWh*1N-@>|`^@klgXP>%{EOUw?bBgLc-FEfkH4+#^TZjcT@Z-mR$w#MW zjKpA6(+!s%P!03PkKe<;XU)uqJ{rMuulRzlu5N4AU!fF;03!W{n7YS3W)4Cp0avca z&tChkuQmFX0zrR6lHx%p5ghON@v=A+-ckDRe`Q$PolyNmTC* zp%s3=FBrZ29KnBRduKQ33!w};SN@ruZCJSdGcb_m&&#4oSK$4lHrWAPRJSd zX+**m)IOk~4gP2V0buRZK(q#cZ~VX zBwX!DG1u|)emH-mw;Q9tR-d13foVlb>iquxek+`i(oO)}ZR3QSZjf({d))xX>}6)Q z(zd}w4apUI#1ZzT(waTCjU;a3G^LI!^bSjAOFc0egb@OHw^U>>He$x=o%SFUq{+#l zp#WHFOksRb6QRAlO+bC%EVS)5(Y66*(Zy|Cru1>yB#Ht?u*-9B&PcA1ivy8;WcaKJ zr>Gh`6>2KP6{BLOld7qJFD7_pu_&YnPpHLkdvLLH076MI7{yE&xv7l*H7-`;5f?DMi4yv>6MD1bLYXZM2Zuf)_EU@e=iAS(J1?Icuhtd zqjU!jo|e(uM9dDw42~N+%{kckCZ-=rgUBMz&^>&5f#S@VmUTnMpMH&urN{?1WXX3D zsre%BFYU-NFJ3-2=Qexjv}c>*rsRc}|N1%)gd6zi$?}{JoxoT-)3Sval!7>=w8C2EA8``+48r659Mi47$_)^PyMTzN zg!6oSI$ywMEqhfx;^ee(!wHxpZ<5zeykhp&(9r{uIsxaU-8aI_ZqmVS;_3sFE`^8R zPX)`+DGkC$Ie7m+pFYMyW2V=;TqwyleegPQV-a1Z5-O$tynOP=h~odp)tkUmxwUV= zJE=%YnyExYMNvs)C`l1*;xw2_Dj_pv%1mUa3>gZMu~0(W5JiSW$XsNeB{P}wT}$V@ z|L^;Lzw@5=B<#JP=UHpr_ch(|Z1vQ^-gyJ-jO8q5=Q=##SXMZBR0uz_Md8d9#X;Gj z%hJD1n0F8xBAVk)`$r%?PvAjlAq(9DU24J8ln3&-QWVTaRRIX3-cEo zf-8f4uBD=4O_oJpz9ugp9|Iopo5KxQuX%t(vg=$K*RA?voLG^1vp#v7>MN}uttk&A zXFRgcVw=2z7Nn^(#l*zKaqUi7ll?sd1587?v&oJ!6c`vQ4c!%s6^ z_8f19$H!ek2CqE56!O58STo&P$yg$l2`aoU#pGnf4>qNnaWn>h?}1=ujbiAbHlR;X zSwy1uYbs4g5L&bj`$jBpH|*}F`==a!hTXZizJ6+^NBNZ>ZZ=|&eAKko)q6K z?Chm!Mkrc7{P>~ydgrvq{rk>DRgt8gq$b>pg80$U0stL_pa+2Np!ji&iQ!45-dzV^ z6A>y31Bjbo!!s<<@aVv|6rB~2njT1M#FGtPxc;{mz-w1mSHV6Qt8=HG=!R!)J#g*x zqYJ7~F7M#taxOuW=abjisNP|V+e8E&ySAyQs1S`ybrui8IhLSw%n#tjbxKOs8fD$K zjTr^1!oc~2bISnhoyha`euCBq*%bS$v(BGCPvkp!zLcvvI&Ju93IjM|j{(HGZ|8v8 z0~*z$=vrJ~5~5i&h2TP_3&|SVRSC+^-OpcOup%iq@Px%$vJ2}+*ZyOvg%~U*s12^F z)fX4XRo1Gl-HPlpnCJ`BjPTfz5S-ULuRp7$IPYLs{Kuopvw3^euD^d^$R7rlf1Ks- zKV>(C9SK_j1*N-Qh@f`9Z8~t4U<>>!ZKYBKJv3Iuh^~?hv(iT!HWF50BnB8m*EcC+iy`E51sXxvN>BC^gUmPoSFc{-Rkd*=%Tf?__A^IK z+^~8!yoK1*QtKc=x)6O976Ijm(zPc)j1^cK8SMk8^LScseC|4=E5XP(j12^b85QT@TuBNHt&YQC&I$07z8aq2CNW6j zs*b2U+8VFAM;z!fEej7Vl8%Kiji;S_CZx?3V~HqIFQ4O z@rL((e7foqfNe*%N8QUgUH;`uAsQA_TcA<5 zBFAF!iIOix-q(Ku{p2wEz}7kxpW2aGc4OVlG+!^TGm(=1g5o$*aicU;*t_=w^g_~3 zO&XZ^Owb7MeexO5$vji+cp5>%DmcUG=;+d#?+k74&CAPELlQ9_e)ZG0&<0eBwSX!+ zU%ls}P*`vF8Jw=Q7TpyQuPk#b!j4Xz$403aHiRSZ^yysKBwktG(kb#*t{$D3sI_VF zjKUeDW(`Yjq9D~w$ng82q^YpjzUy5?8Kzc$49^7%6~D1sj5_oNGP$G&Er7rnJq-a9 zKrRT>OkWMi7W{g`M3##ex)V3z?dkdTdo^}b*UqDSe5VH*P)5eZ;`9?px|*7jqFgKqbGrDY(f|K%Pw8IR 
Date: Sun, 12 Apr 2026 19:54:23 +0800
Subject: [PATCH 132/204] [Log] Wire stat loggers into AsyncOmniEngine to match
 AsyncLLM (#2551)

Signed-off-by: gcanlin
---
 .../test_async_omni_engine_do_log_stats.py    | 56 ++++++++++++++++++
 .../test_async_omni_engine_stage_init.py      |  2 +
 tests/engine/test_single_stage_mode.py        |  3 +
 vllm_omni/engine/async_omni_engine.py         | 58 ++++++++++++++++++-
 vllm_omni/engine/orchestrator.py              | 26 ++++++++-
 vllm_omni/entrypoints/async_omni.py           |  7 +--
 6 files changed, 144 insertions(+), 8 deletions(-)
 create mode 100644 tests/engine/test_async_omni_engine_do_log_stats.py

diff --git a/tests/engine/test_async_omni_engine_do_log_stats.py b/tests/engine/test_async_omni_engine_do_log_stats.py
new file mode 100644
index 0000000000..e2b8c03b93
--- /dev/null
+++ b/tests/engine/test_async_omni_engine_do_log_stats.py
@@ -0,0 +1,56 @@
+"""Guard tests for AsyncOmniEngine.do_log_stats edge cases.
+
+These are pure-Python tests that bypass __init__ and only exercise the
+no-op branches of do_log_stats, so no stage cores / threads are needed.
+"""
+
+import asyncio
+
+import pytest
+
+from vllm_omni.engine.async_omni_engine import AsyncOmniEngine
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+def _make_bare_engine() -> AsyncOmniEngine:
+    # Bypass __init__ so we don't spin up stage cores; we only need the
+    # attributes do_log_stats touches.
+ return AsyncOmniEngine.__new__(AsyncOmniEngine) + + +@pytest.mark.asyncio +async def test_do_log_stats_noop_when_manager_missing(): + engine = _make_bare_engine() + engine.logger_manager = None + engine.orchestrator_loop = None + await engine.do_log_stats() # should silently return + + +@pytest.mark.asyncio +async def test_do_log_stats_noop_when_loop_missing(): + engine = _make_bare_engine() + + class _Manager: + def log(self) -> None: # pragma: no cover - must not be called + raise AssertionError("log() should not be called without a loop") + + engine.logger_manager = _Manager() + engine.orchestrator_loop = None + await engine.do_log_stats() + + +@pytest.mark.asyncio +async def test_do_log_stats_noop_when_loop_not_running(): + engine = _make_bare_engine() + + class _Manager: + def log(self) -> None: # pragma: no cover - must not be called + raise AssertionError("log() should not be called on a stopped loop") + + dead_loop = asyncio.new_event_loop() + dead_loop.close() + + engine.logger_manager = _Manager() + engine.orchestrator_loop = dead_loop + await engine.do_log_stats() diff --git a/tests/engine/test_async_omni_engine_stage_init.py b/tests/engine/test_async_omni_engine_stage_init.py index 002e8226f6..24d2bf0cf9 100644 --- a/tests/engine/test_async_omni_engine_stage_init.py +++ b/tests/engine/test_async_omni_engine_stage_init.py @@ -31,6 +31,7 @@ def test_initialize_stages_restores_device_visibility_after_diffusion_init(monke from vllm_omni.platforms import current_omni_platform engine = object.__new__(AsyncOmniEngine) + engine.log_stats = False engine.model = "dummy-model" engine.config_path = "dummy-config" engine.num_stages = 1 @@ -280,6 +281,7 @@ def __init__(self, vllm_config, renderer=None): ) engine = object.__new__(AsyncOmniEngine) + engine.log_stats = False _stage_client, _out_proc, _vllm_cfg, input_processor = engine._attach_llm_stage(started) diff --git a/tests/engine/test_single_stage_mode.py b/tests/engine/test_single_stage_mode.py index 2c5bf6cc79..1afe2fd6d9 100644 --- a/tests/engine/test_single_stage_mode.py +++ b/tests/engine/test_single_stage_mode.py @@ -461,6 +461,7 @@ def _build_engine_skeleton( engine.stage_configs = stage_cfgs engine.num_stages = len(stage_cfgs) engine.async_chunk = False + engine.log_stats = False engine.single_stage_mode = single_stage_mode engine._single_stage_id_filter = stage_id_filter engine._omni_master_address = omni_master_address @@ -1366,6 +1367,7 @@ class TestLaunchLlmStageSingleStageMode: def _build_engine_with_oms(self) -> AsyncOmniEngine: engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" + engine.log_stats = False engine.single_stage_mode = True engine._single_stage_id_filter = 0 engine._llm_stage_launch_lock = threading.Lock() @@ -1446,6 +1448,7 @@ def test_spawn_stage_core_used_in_normal_mode(self): """~single_stage_mode → spawn_stage_core + complete_stage_handshake.""" engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" + engine.log_stats = False engine.single_stage_mode = False engine._omni_master_server = None engine._llm_stage_launch_lock = threading.Lock() diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 1e92780b66..5cba14c197 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -30,6 +30,7 @@ from vllm.tokenizers import cached_tokenizer_from_config from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.input_processor import InputProcessor +from vllm.v1.metrics.loggers import 
StatLoggerManager from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient @@ -283,6 +284,7 @@ def __init__( self.num_stages = len(self.stage_configs) stage0_args = getattr(self.stage_configs[0], "engine_args", None) if self.num_stages > 0 else None self.async_chunk = bool(getattr(stage0_args, "async_chunk", False)) + self.log_stats = not bool(getattr(stage0_args, "disable_log_stats", False)) self.stage_clients: list[Any] = [] self.stage_vllm_configs: list[Any] = [] self.output_processors: list[MultimodalOutputProcessor | None] = [] @@ -412,7 +414,7 @@ def _launch_llm_stage( addresses, proc, handshake_address = spawn_stage_core( vllm_config=vllm_config, executor_class=executor_class, - log_stats=False, + log_stats=self.log_stats, ) started_stage = StartedLlmStage( stage_id=metadata.stage_id, @@ -614,7 +616,7 @@ def _attach_llm_stage( ) output_processor = MultimodalOutputProcessor( tokenizer=tokenizer, - log_stats=False, + log_stats=self.log_stats, engine_core_output_type=started.metadata.engine_output_type, ) input_processor = None @@ -869,6 +871,30 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: self.default_sampling_params_list = default_sampling_params_list self.stage_metadata = stage_metadata + # Single StatLoggerManager for the whole pipeline, mirroring how + # vLLM AsyncLLM uses one manager with multiple engine indices for DP. + # We treat each stage as a separate "engine_idx" so logs are + # distinguishable as "Engine 000/001/002/...". Using a single manager + # also avoids PrometheusStatLogger registry collisions. + self.logger_manager: StatLoggerManager | None = None + if self.log_stats: + base_vllm_config = next( + (cfg for cfg in self.stage_vllm_configs if cfg is not None), + None, + ) + if base_vllm_config is not None: + try: + self.logger_manager = StatLoggerManager( + vllm_config=base_vllm_config, + engine_idxs=list(range(self.num_stages)), + custom_stat_loggers=None, + enable_default_loggers=True, + ) + self.logger_manager.log_engine_initialized() + except Exception: + logger.exception("[AsyncOmniEngine] Failed to build StatLoggerManager") + self.logger_manager = None + def _initialize_janus_queues(self) -> None: """Initialize janus queues inside orchestrator thread loop context.""" self.request_queue = janus.Queue() @@ -885,6 +911,10 @@ def _bootstrap_orchestrator( loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) + # Expose the orchestrator loop so other threads (API server) can + # schedule coroutines onto it via run_coroutine_threadsafe, keeping + # single-threaded access to StatLoggerManager (mirrors AsyncLLM). + self.orchestrator_loop = loop async def _run_orchestrator() -> None: self._initialize_janus_queues() @@ -898,6 +928,7 @@ async def _run_orchestrator() -> None: stage_clients=self.stage_clients, output_processors=self.output_processors, stage_vllm_configs=self.stage_vllm_configs, + logger_manager=self.logger_manager, ) if not startup_future.done(): startup_future.set_result(asyncio.get_running_loop()) @@ -1453,6 +1484,29 @@ async def abort_async(self, request_ids: list[str]) -> None: """Async abort API.""" self.abort(request_ids) + async def do_log_stats(self) -> None: + """Flush the StatLoggerManager on the orchestrator thread. + + ``StatLoggerManager`` is only safe to access from the orchestrator + loop (where ``record()`` runs). 
Schedule ``log()`` onto that loop + via ``run_coroutine_threadsafe`` so all access stays single-threaded, + matching upstream vLLM ``AsyncLLM``. + """ + manager = self.logger_manager + if manager is None: + return + loop = getattr(self, "orchestrator_loop", None) + if loop is None or not loop.is_running(): + return + + async def _log() -> None: + manager.log() + + try: + await asyncio.wrap_future(asyncio.run_coroutine_threadsafe(_log(), loop)) + except Exception: + logger.exception("[AsyncOmniEngine] do_log_stats failed") + def collective_rpc( self, method: str, diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index 386b545eb7..e64fd3685c 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -22,6 +22,8 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.v1.engine import EngineCoreOutputs +from vllm.v1.metrics.loggers import StatLoggerManager +from vllm.v1.metrics.stats import IterationStats from vllm_omni.distributed.omni_connectors.adapter import compute_talker_prompt_ids_length from vllm_omni.engine import ( @@ -122,6 +124,7 @@ def __init__( stage_vllm_configs: list[Any], *, async_chunk: bool = False, + logger_manager: StatLoggerManager | None = None, ) -> None: self.request_async_queue = request_async_queue self.output_async_queue = output_async_queue @@ -133,6 +136,8 @@ def __init__( self.stage_clients: list[Any] = stage_clients self.output_processors: list[Any] = output_processors self.stage_vllm_configs: list[Any] = stage_vllm_configs + self.logger_manager: StatLoggerManager | None = logger_manager + self.log_stats = self.logger_manager is not None # Per-request state self.request_states: dict[str, OrchestratorRequestState] = {} @@ -624,10 +629,13 @@ async def _process_stage_outputs(self, stage_id: int, raw_outputs: EngineCoreOut """ processor = self.output_processors[stage_id] + num_outputs = len(raw_outputs.outputs) + iteration_stats = IterationStats() if (self.log_stats and num_outputs) else None + processed = processor.process_outputs( raw_outputs.outputs, raw_outputs.timestamp, - None, + iteration_stats, ) if processed.reqs_to_abort: @@ -636,6 +644,22 @@ async def _process_stage_outputs(self, stage_id: int, raw_outputs: EngineCoreOut if raw_outputs.scheduler_stats is not None: processor.update_scheduler_stats(raw_outputs.scheduler_stats) + # Mirror vLLM AsyncLLM output_handler: feed stats to the logger + # manager so LoggingStatLogger can periodically print KV cache / + # prefix cache hit rate, and PrometheusStatLogger can publish. + if self.logger_manager is not None: + try: + self.logger_manager.record( + engine_idx=stage_id, + scheduler_stats=raw_outputs.scheduler_stats, + iteration_stats=iteration_stats, + ) + except Exception: + logger.exception( + "[Orchestrator] stat logger record failed for stage-%s", + stage_id, + ) + return processed.request_outputs async def _handle_add_request(self, msg: dict[str, Any]) -> None: diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py index 129ef3c99d..0b25ce7141 100644 --- a/vllm_omni/entrypoints/async_omni.py +++ b/vllm_omni/entrypoints/async_omni.py @@ -743,11 +743,8 @@ async def is_tracing_enabled(self) -> bool: return False async def do_log_stats(self) -> None: - """Log statistics. - - TODO: Forward to Orchestrator process via message. 
- """ - pass + """Log statistics via the engine, mirroring vLLM ``AsyncLLM``.""" + await self.engine.do_log_stats() async def get_supported_tasks(self) -> tuple[SupportedTask, ...]: """Return the task set exposed by the orchestrator-backed engine.""" From ef230ac720f29d30783d47af63d26a08ac774837 Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Sun, 12 Apr 2026 07:44:06 -0600 Subject: [PATCH 133/204] [Bugfix] Fix Incompatible Multihook Integration (TeaCache <-> CPU Offload) (#2689) Signed-off-by: Alex Brooks Co-authored-by: SYLAR <125541396+lishunyang12@users.noreply.github.com> --- tests/diffusion/hooks/test_hook_registry.py | 164 ++++++++++++++++++++ vllm_omni/diffusion/hooks/base.py | 92 +++++++---- 2 files changed, 230 insertions(+), 26 deletions(-) create mode 100644 tests/diffusion/hooks/test_hook_registry.py diff --git a/tests/diffusion/hooks/test_hook_registry.py b/tests/diffusion/hooks/test_hook_registry.py new file mode 100644 index 0000000000..6c8535cfec --- /dev/null +++ b/tests/diffusion/hooks/test_hook_registry.py @@ -0,0 +1,164 @@ +""" +Tests for hook registry. + +NOTE: The hook registry is also tested indirectly through a lot of +other tests, e.g., tests/diffusion/distributed/test_sp_plan_hooks.py +""" + +from typing import Any + +import pytest +from torch import nn + +from vllm_omni.diffusion.hooks.base import HookRegistry, ModelHook + +DEFAULT_OUT = "ECHO" +OVERRIDE_OUT = "OVERRIDE" +INPUT_KWARG = "inp" + + +class EchoModule(nn.Module): + """Just echo the input.""" + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + def forward(self, *args, **kwargs): + input_val = kwargs[INPUT_KWARG] + return input_val + DEFAULT_OUT + + +class AppendHook(ModelHook): + """Append an echo value to the input string on pre / post forward.""" + + def __init__(self, echo_val: str): + self.echo_val = echo_val + + def pre_forward(self, module: nn.Module, *args, **kwargs): + input_val = kwargs[INPUT_KWARG] + return (), {INPUT_KWARG: input_val + self.echo_val} + + def post_forward(self, module: nn.Module, output): + return output + self.echo_val + + +class OverrideAppendHook(AppendHook): + """Same as AppendHook, but replace the forward call with a different string.""" + + def new_forward(self, module: nn.Module, *args, **kwargs): + return kwargs[INPUT_KWARG] + OVERRIDE_OUT + + +def test_register_no_fwd_override_hooks(): + """Ensure registration is correct with no forward hooks.""" + mod = EchoModule() + registry = HookRegistry.get_or_create(mod) + first_hook = AppendHook("1") + second_hook = AppendHook("2") + sorted_no_fwd_hooks = [first_hook, second_hook] + + # Will add and sort the hook by key + registry.register_hook(name="b", hook=second_hook) + registry.register_hook(name="a", hook=first_hook) + + assert len(registry._hooks) == 2 + assert len(registry._sorted_hooks) == 2 + assert registry._new_fwd_impl_hook is None + # Ensure registering a new hook sorting alphabetically + for actual_hook, expected_hook in zip(registry._sorted_hooks, sorted_no_fwd_hooks): + assert actual_hook is expected_hook + + +def test_register_with_forward_hooks(): + """Ensure registration is correct with a forward hooks.""" + mod = EchoModule() + registry = HookRegistry.get_or_create(mod) + first_hook = AppendHook("1") + second_hook = AppendHook("2") + exec_hook = OverrideAppendHook("3") + sorted_no_fwd_hooks = [first_hook, second_hook] + + # Will add and sort the hook by key + registry.register_hook(name="b", hook=second_hook) + registry.register_hook(name="a", 
hook=first_hook)
+    registry.register_hook(name="c", hook=exec_hook)
+
+    assert len(registry._hooks) == 3
+    assert len(registry._sorted_hooks) == 3
+    assert registry._new_fwd_impl_hook is exec_hook
+    # Ensure registered hooks are kept sorted alphabetically
+    for actual_hook, expected_hook in zip(registry._sorted_hooks, sorted_no_fwd_hooks):
+        assert actual_hook is expected_hook
+
+
+def test_register_fails_with_multiple_forward_hooks():
+    """Ensure registration only allows one hook overriding new_forward."""
+    mod = EchoModule()
+    registry = HookRegistry.get_or_create(mod)
+
+    registry.register_hook(name="foo", hook=OverrideAppendHook("1"))
+    with pytest.raises(RuntimeError):
+        registry.register_hook(name="bar", hook=OverrideAppendHook("2"))
+
+
+def test_remove_hooks():
+    """Ensure removal keeps the remaining hooks sorted."""
+    mod = EchoModule()
+    registry = HookRegistry.get_or_create(mod)
+
+    first_hook = AppendHook("1")
+    second_hook = AppendHook("2")
+    exec_hook = OverrideAppendHook("3")
+
+    registry.register_hook(name="b", hook=second_hook)
+    registry.register_hook(name="a", hook=first_hook)
+    registry.register_hook(name="c", hook=exec_hook)
+    # Explicitly reorder our hooks to be in the wrong order, since register
+    # forces them to be sorted too. Ensure that removing a hook also
+    # enforces the sorted order.
+    registry._sorted_hooks = [second_hook, first_hook]
+
+    assert registry._new_fwd_impl_hook is exec_hook
+    registry.remove_hook("c")
+    assert registry._new_fwd_impl_hook is None
+
+    sorted_no_fwd_hooks = [first_hook, second_hook]
+    for actual_hook, expected_hook in zip(registry._sorted_hooks, sorted_no_fwd_hooks):
+        assert actual_hook is expected_hook
+
+
+def test_dispatch_no_fwd_override_hooks():
+    """Ensure dispatch runs hooks in deterministic sorted order."""
+    mod = EchoModule()
+    registry = HookRegistry.get_or_create(mod)
+
+    first_hook = AppendHook("1")
+    second_hook = AppendHook("2")
+
+    # Register will sort the hooks, so hook 1 will run first
+    # on preprocess and last in post process
+    registry.register_hook(name="2", hook=second_hook)
+    registry.register_hook(name="1", hook=first_hook)
+    res = registry.dispatch(inp="")
+    assert isinstance(res, str)
+    assert res == f"12{DEFAULT_OUT}21"
+
+
+def test_dispatch_with_fwd_hooks():
+    """Ensure dispatch runs hooks deterministically when one hook overrides forward."""
+    mod = EchoModule()
+    registry = HookRegistry.get_or_create(mod)
+
+    first_hook = AppendHook("1")
+    second_hook = AppendHook("2")
+    exec_hook = OverrideAppendHook("3")
+
+    # Register will sort the hooks, so hook 1 will run first on preprocess and last in
+    # post process. Since the override hook mutates forward, it will run last even
+    # though the name of the exec_hook is alphabetically before the second hook.
+ registry.register_hook(name="c", hook=second_hook) + registry.register_hook(name="a", hook=first_hook) + registry.register_hook(name="b", hook=exec_hook) + res = registry.dispatch(inp="") + assert isinstance(res, str) + assert res == f"123{OVERRIDE_OUT}321" diff --git a/vllm_omni/diffusion/hooks/base.py b/vllm_omni/diffusion/hooks/base.py index cda4201ccf..517c661587 100644 --- a/vllm_omni/diffusion/hooks/base.py +++ b/vllm_omni/diffusion/hooks/base.py @@ -8,6 +8,7 @@ from __future__ import annotations +import functools import inspect from collections.abc import Callable from dataclasses import dataclass @@ -94,10 +95,9 @@ def post_forward(self, module: nn.Module, output: Any) -> Any: return output def new_forward(self, module: nn.Module, *args: Any, **kwargs: Any) -> Any: - """Override the module's forward pass completely. - - The default implementation calls pre_forward, then the original forward, - then post_forward. Override this method for more complex behavior. + """Override the module's forward pass. This should be overridden for more complex + cases, e.g., TeaCache. If this method is overridden in a subclass, it will be called + instead of self.module._omni_original_forward when executing the hooks. Args: module: The module being called. @@ -105,11 +105,9 @@ def new_forward(self, module: nn.Module, *args: Any, **kwargs: Any) -> Any: **kwargs: Keyword arguments to forward. Returns: - The output of the forward pass. + The output of the replacement for the forward pass. """ - args, kwargs = self.pre_forward(module, *args, **kwargs) - output = module._omni_original_forward(*args, **kwargs) # type: ignore[attr-defined] - return self.post_forward(module, output) + raise NotImplementedError("By default, hooks do not implement new_forward") def reset_state(self, module: nn.Module) -> nn.Module: """Reset any state associated with this hook. @@ -136,6 +134,21 @@ def __call__(self, *args: Any, **kwargs: Any): return registry.dispatch(*args, **kwargs) +def sort_hooks_after_call(func): + """Calls the method on the hook registry, then sorts the hooks. + + This should be added to methods that mutate add or remove hooks. + """ + + @functools.wraps(func) + def wrapper(self: HookRegistry, *args, **kwargs): + res = func(self, *args, **kwargs) + self.update_sorted_hooks() + return res + + return wrapper + + class HookRegistry: """Registry of hooks attached to a module. @@ -146,6 +159,10 @@ class HookRegistry: def __init__(self, module: nn.Module): self.module = module self._hooks: dict[str, ModelHook] = {} + # Hooks sorted by execution order + self._sorted_hooks: list[ModelHook] = [] + # Hooks overriding new_forward (if any) + self._new_fwd_impl_hook: ModelHook | None = None @classmethod def get_or_create(cls, module: nn.Module) -> HookRegistry: @@ -173,6 +190,14 @@ def get_or_create(cls, module: nn.Module) -> HookRegistry: return registry + def update_sorted_hooks(self): + """Sort hooks by name, which dictates pre/post process order.""" + sorted_hooks = [self._hooks[k] for k in sorted(self._hooks) if self._hooks[k] != self._new_fwd_impl_hook] + if self._new_fwd_impl_hook is not None: + sorted_hooks.append(self._new_fwd_impl_hook) + self._sorted_hooks = sorted_hooks + + @sort_hooks_after_call def register_hook(self, name: str, hook: ModelHook) -> None: """Register a hook with the given name. 
@@ -182,7 +207,14 @@ def register_hook(self, name: str, hook: ModelHook) -> None: """ hook.initialize_hook(self.module) self._hooks[name] = hook - + # We can only have one hook that overrides new_forward, + # since we don't currently have a mechanism for combining them. + if type(hook).new_forward is not ModelHook.new_forward: + if self._new_fwd_impl_hook is not None: + raise RuntimeError("Cannot have multiple hooks that override forward active simultaneously") + self._new_fwd_impl_hook = hook + + @sort_hooks_after_call def remove_hook(self, name: str) -> None: """Remove a hook by name. @@ -190,6 +222,9 @@ def remove_hook(self, name: str) -> None: name: The name of the hook to remove. """ if name in self._hooks: + # clear the forward hook if it's the one to delete + if self._new_fwd_impl_hook is self._hooks[name]: + self._new_fwd_impl_hook = None del self._hooks[name] def get_hook(self, name: str) -> ModelHook | None: @@ -206,8 +241,18 @@ def get_hook(self, name: str) -> ModelHook | None: def dispatch(self, *args: Any, **kwargs: Any) -> Any: """Dispatch a forward call through registered hooks. - Currently supports a single active hook. Multiple hooks are called - in sorted order by name, with each hook's output passed to the next. + Multiple hooks may be used with the caveat that only one hook + may override new_forward. While it is assumed that pre/post process + on hooks are composable, the execution flow is as follows for determinism: + + - Run preprocess on all hooks in their sorted order; hooks are sorted alphabetically, + except for the hook overriding forward (`self._new_fwd_impl_hook`), which is last + if it exists. + + - If `self._new_fwd_impl_hook` isn't None, call its forward. Otherwise call the + original model forward. + + - Run post process on all hooks in the reverse sorted order. Args: *args: Positional arguments to forward. @@ -219,24 +264,19 @@ def dispatch(self, *args: Any, **kwargs: Any) -> Any: if not self._hooks: return self.module._omni_original_forward(*args, **kwargs) # type: ignore[attr-defined] - # For single hook case, call directly - if len(self._hooks) == 1: - hook = next(iter(self._hooks.values())) - return hook.new_forward(self.module, *args, **kwargs) - - # For multiple hooks, chain them in sorted order - # Each hook can modify args/kwargs via pre_forward - sorted_hooks = sorted(self._hooks.items(), key=lambda x: x[0]) - - # Apply all pre_forward hooks - for _, hook in sorted_hooks: + # Apply all pre_forward hooks; if _new_fwd_impl_hook is set, it's last + for hook in self._sorted_hooks: args, kwargs = hook.pre_forward(self.module, *args, **kwargs) - # Call original forward - output = self.module._omni_original_forward(*args, **kwargs) # type: ignore[attr-defined] + # If we have a hook that overrides new_forward, call it directly + if self._new_fwd_impl_hook is not None: + output = self._new_fwd_impl_hook.new_forward(self.module, *args, **kwargs) + # Otherwise just call the original forward. 
+ else: + output = self.module._omni_original_forward(*args, **kwargs) # type: ignore[attr-defined] - # Apply all post_forward hooks in reverse order - for _, hook in reversed(sorted_hooks): + # Apply all post_forward hooks in reverse order; if _new_fwd_impl_hook is set, it's first + for hook in reversed(self._sorted_hooks): output = hook.post_forward(self.module, output) return output From 16041ab550608b429ca96ea3f9fff100f128ca37 Mon Sep 17 00:00:00 2001 From: zhou zhuoxin Date: Sun, 12 Apr 2026 22:06:49 +0800 Subject: [PATCH 134/204] [Refactor] Extend CFG Parallel to support 3 or 4 branch dispatch across M GPUs (#2423) --- docs/design/feature/cfg_parallel.md | 70 +++- docs/user_guide/diffusion_features.md | 2 +- .../image_to_image/image_edit.py | 4 +- .../x_to_video_audio/x_to_video_audio.py | 4 +- .../distributed/test_cfg_parallel.py | 342 +++++++++++++++++- .../diffusion/distributed/cfg_parallel.py | 180 +++++++++ .../dreamid_omni/pipeline_dreamid_omni.py | 109 ++---- .../models/omnigen2/pipeline_omnigen2.py | 79 ++-- 8 files changed, 669 insertions(+), 121 deletions(-) diff --git a/docs/design/feature/cfg_parallel.md b/docs/design/feature/cfg_parallel.md index 64decbe956..c73a87749f 100644 --- a/docs/design/feature/cfg_parallel.md +++ b/docs/design/feature/cfg_parallel.md @@ -25,7 +25,9 @@ In standard Classifier-Free Guidance, each diffusion step requires two forward p 1. **Positive/Conditional**: Guided by the text prompt 2. **Negative/Unconditional**: Typically using empty or negative prompt -CFG-Parallel eliminates this bottleneck by distributing the two forward passes across different GPU ranks, allowing them to execute simultaneously rather than sequentially. +Some models require 3 or more CFG branches (see [N-Branch CFG](#n-branch-cfg-3-branches)). + +CFG-Parallel eliminates this bottleneck by distributing the forward passes across different GPU ranks, allowing them to execute simultaneously rather than sequentially. ### Architecture @@ -33,9 +35,11 @@ vLLM-omni provides `CFGParallelMixin` that encapsulates all CFG parallel logic. 
| Method | Purpose | Automatic Behavior | |--------|---------|-------------------| -| [`predict_noise_maybe_with_cfg()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Predict noise with CFG | Detects parallel mode, distributes computation, gathers results | +| [`predict_noise_maybe_with_cfg()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Predict noise with 2-branch CFG | Detects parallel mode, distributes computation, gathers results | +| [`predict_noise_with_multi_branch_cfg()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Predict noise with N-branch CFG | Round-robin dispatches N branches across M GPUs | | [`scheduler_step_maybe_with_cfg()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Step scheduler | All ranks step locally (no broadcast needed) | -| [`combine_cfg_noise()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Combine positive/negative | Applies CFG formula with optional normalization | +| [`combine_cfg_noise()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Combine 2-branch predictions | Applies CFG formula with optional normalization | +| [`combine_multi_branch_cfg_noise()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Combine N-branch predictions | Override for custom multi-branch combine logic | | [`predict_noise()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Forward pass wrapper | Override for custom transformer calls | | [`cfg_normalize_function()`](https://docs.vllm.ai/projects/vllm-omni/en/latest/api/vllm_omni/diffusion/distributed/cfg_parallel/) | Normalize CFG output | Override for custom normalization | @@ -57,6 +61,22 @@ vLLM-omni provides `CFGParallelMixin` that encapsulates all CFG parallel logic. - All ranks compute the scheduler step locally — no broadcast needed because `predict_noise_maybe_with_cfg` already ensures all ranks have identical noise predictions after `all_gather` + local combine. +### N-Branch CFG (3+ branches) + +Some models require more than 2 CFG branches. For example, Bagel and OmniGen2 use 3 branches, DreamID Omni uses 4 branches. + +`predict_noise_with_multi_branch_cfg()` handles these by automatically dispatching N branches across M GPUs using round-robin (rule: branch `i` → rank `i % M`): + +| Branches (N) | GPUs (M) | Dispatch | +|:---:|:---:|:---| +| 3 | 2 | `[[0, 2], [1]]` | +| 3 | 3 | `[[0], [1], [2]]` | +| 4 | 2 | `[[0, 2], [1, 3]]` | +| 4 | 3 | `[[0, 3], [1], [2]]` | +| 4 | 4 | `[[0], [1], [2], [3]]` | + +When a rank handles multiple branches, it runs them sequentially. After `all_gather`, all ranks execute `combine_multi_branch_cfg_noise()` locally, producing identical results. 
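For illustration, the Dispatch column above can be reproduced with a few lines of Python. This is a standalone sketch of the round-robin rule (it mirrors the private `_dispatch_branches` helper in `cfg_parallel.py` and is not a public API):

```python
# Round-robin rule: branch i -> rank i % M.
def dispatch_branches(n_branches: int, n_ranks: int) -> list[list[int]]:
    assignments: list[list[int]] = [[] for _ in range(n_ranks)]
    for branch in range(n_branches):
        assignments[branch % n_ranks].append(branch)
    return assignments

# Reproduces the rows of the table above.
assert dispatch_branches(3, 2) == [[0, 2], [1]]
assert dispatch_branches(4, 3) == [[0, 3], [1], [2]]
assert dispatch_branches(4, 4) == [[0], [1], [2], [3]]
```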
+ --- ## Step-by-Step Implementation @@ -98,6 +118,7 @@ class YourModelPipeline(nn.Module, CFGParallelMixin): - `positive_kwargs`: transformer arguments for conditional (text-guided) prediction - `negative_kwargs`: transformer arguments for unconditional prediction (set to `None` if CFG disabled) - For image editing pipelines, add `output_slice=image_seq_len` to extract the generative image portion +- For models with 3+ CFG branches, see [Multi-Branch CFG](#multi-branch-cfg-3-branches) in the Customization section ### Step 2: Call `diffuse` @@ -171,20 +192,42 @@ class LongCatImagePipeline(nn.Module, CFGParallelMixin): ``` -### Override `combine_cfg_noise()` for Multi-Output Models +### Multi-Branch CFG (3+ branches) + +For models with 3 or more CFG branches, use `predict_noise_with_multi_branch_cfg()` instead of `predict_noise_maybe_with_cfg()`, and override `combine_multi_branch_cfg_noise()` for custom combine logic. This interface also works for standard 2-branch CFG — just pass 2 branches in `branches_kwargs`. -When `predict_noise()` returns a tuple (e.g., video + audio), the default `combine_cfg_noise()` applies CFG to every element. Override it to apply different logic per element — for example, CFG on video but positive-only on audio: +**Example (3-branch with dual guidance scale):** ```python -class MyVideoAudioPipeline(nn.Module, CFGParallelMixin): - def combine_cfg_noise(self, positive_noise_pred, negative_noise_pred, scale, normalize): - (video_pos, audio_pos) = positive_noise_pred - (video_neg, audio_neg) = negative_noise_pred - video_combined = super().combine_cfg_noise(video_pos, video_neg, scale, normalize) - return (video_combined, audio_pos) # audio: positive only, no CFG +class YourMultiBranchPipeline(nn.Module, CFGParallelMixin): + def combine_multi_branch_cfg_noise(self, predictions, true_cfg_scale, cfg_normalize=False): + text_scale = true_cfg_scale["text"] + image_scale = true_cfg_scale["image"] + pos, ref, uncond = predictions + return uncond + image_scale * (ref - uncond) + text_scale * (pos - ref) + + def diffuse(self, ...): + for i, t in enumerate(timesteps): + positive_kwargs = {...} # conditional prompt + ref_neg_kwargs = {...} # negative prompt + reference + uncond_kwargs = {...} # unconditional + + noise_pred = self.predict_noise_with_multi_branch_cfg( + do_true_cfg=do_true_cfg, + true_cfg_scale={"text": text_guidance_scale, "image": image_guidance_scale}, + branches_kwargs=[positive_kwargs, ref_neg_kwargs, uncond_kwargs], + ) + latents = self.scheduler_step_maybe_with_cfg(noise_pred, t, latents, do_true_cfg) + + return latents ``` -This also requires `predict_noise()` to return a tuple (see [Override predict_noise](#override-predict_noise-for-custom-transformer-calls) above). +### Override Combine Functions + +There are two combine functions for different scenarios: + +- **`combine_cfg_noise()`** — Used by `predict_noise_maybe_with_cfg()`. Override when `predict_noise()` returns a tuple (e.g., video + audio) and you need per-element CFG logic. +- **`combine_multi_branch_cfg_noise()`** — Used by `predict_noise_with_multi_branch_cfg()`. Override to implement custom multi-branch combine formulas (see [Multi-Branch CFG](#multi-branch-cfg-3-branches) above). ### Implement a Composite Scheduler for Multi-Output Models @@ -303,4 +346,5 @@ Adding CFG-Parallel support: 1. ✅ **Create mixin** - Inherit from `CFGParallelMixin` and implement `diffuse()` method 2. ✅ **(Optional) Customize** - Override `predict_noise()` or `cfg_normalize_function()` for custom behavior -3. 
✅ **Test** - Verify with `--cfg-parallel-size 2` and compare performance +3. ✅ **(Optional) Multi-branch** - For 3+ branch models, use `predict_noise_with_multi_branch_cfg()` and override `combine_multi_branch_cfg_noise()` +4. ✅ **Test** - Verify with `--cfg-parallel-size 2` (or 3/4 for multi-branch) and compare performance diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 7e08851812..2f28131ee5 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -118,7 +118,7 @@ The following tables show which models support each feature: | **MagiHuman** | ❌ | ❌ | ❌ | ❓ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | | **MammothModa2(T2I)** | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | **Nextstep_1(T2I)** | ❓ | ❓ | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | -| **OmniGen2** | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| **OmniGen2** | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | **Ovis-Image** | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | | **Qwen-Image** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ✅ | | **Qwen-Image-2512** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ✅ | diff --git a/examples/offline_inference/image_to_image/image_edit.py b/examples/offline_inference/image_to_image/image_edit.py index a8035a3fdc..1a7e86f13c 100644 --- a/examples/offline_inference/image_to_image/image_edit.py +++ b/examples/offline_inference/image_to_image/image_edit.py @@ -297,8 +297,8 @@ def parse_args() -> argparse.Namespace: "--cfg-parallel-size", type=int, default=1, - choices=[1, 2], - help="Number of GPUs used for classifier free guidance parallel size.", + choices=[1, 2, 3], + help="Number of GPUs used for classifier free guidance parallel size (max 3 branches).", ) parser.add_argument( "--enforce-eager", diff --git a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py index fb77b21483..49a0f496f8 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py @@ -36,8 +36,8 @@ def parse_args() -> argparse.Namespace: "--cfg-parallel-size", type=int, default=1, - choices=[1, 2], - help="Number of GPUs used for classifier free guidance parallel size.", + choices=[1, 2, 3, 4], + help="Number of GPUs used for classifier free guidance parallel size (max 4 branches).", ) parser.add_argument( "--video-negative-prompt", diff --git a/tests/diffusion/distributed/test_cfg_parallel.py b/tests/diffusion/distributed/test_cfg_parallel.py index 79dbe9e6dd..bf709618de 100644 --- a/tests/diffusion/distributed/test_cfg_parallel.py +++ b/tests/diffusion/distributed/test_cfg_parallel.py @@ -2,8 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Unit tests for CFG (Classifier-Free Guidance) parallel functionality. -This test verifies that predict_noise_maybe_with_cfg produces numerically -equivalent results with and without CFG parallel using fixed random inputs. +This test verifies that predict_noise_maybe_with_cfg and +predict_noise_with_multi_branch_cfg produce numerically equivalent results +with and without CFG parallel using fixed random inputs. 
""" import os @@ -429,3 +430,340 @@ def test_predict_noise_without_cfg(dtype: torch.dtype): assert noise_pred.shape == (1, 4, 16, 16) print(f"✓ Test passed: predict_noise without CFG (dtype={dtype})") + + +class MultiBranchTestPipeline(CFGParallelMixin): + """Test pipeline with custom 3-branch combine logic (like OmniGen2).""" + + def __init__(self, in_channels: int = 4, hidden_dim: int = 128, seed: int = 42): + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + + self.transformer = SimpleTransformer(in_channels, hidden_dim) + + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + for param in self.transformer.parameters(): + torch.nn.init.normal_(param, mean=0.0, std=0.02) + + def combine_multi_branch_cfg_noise(self, predictions, true_cfg_scale, cfg_normalize=False): + """N-branch combine with weighted sum for testing. + + - 2-branch: standard CFG formula (true_cfg_scale is float) + - 3-branch: OmniGen2-style dual guidance scale (true_cfg_scale is dict) + - 4-branch: DreamID-style weighted sum (true_cfg_scale is dict) + """ + if len(predictions) == 4: + text_scale = true_cfg_scale["text"] + image_scale = true_cfg_scale["image"] + vid_ref_scale = true_cfg_scale["vid_ref"] + pos, neg, vid_neg, audio_neg = predictions + combined = ( + audio_neg + + vid_ref_scale * (vid_neg - audio_neg) + + image_scale * (neg - vid_neg) + + text_scale * (pos - neg) + ) + elif len(predictions) == 3: + text_scale = true_cfg_scale["text"] + image_scale = true_cfg_scale["image"] + pos, ref, uncond = predictions + combined = uncond + image_scale * (ref - uncond) + text_scale * (pos - ref) + else: + pos, neg = predictions[0], predictions[1] + combined = neg + true_cfg_scale * (pos - neg) + + if cfg_normalize: + combined = self.cfg_normalize_function(pos, combined) + return combined + + +def _test_multi_branch_parallel_worker( + local_rank: int, + world_size: int, + cfg_parallel_size: int, + dtype: torch.dtype, + test_config: dict, + result_queue: torch.multiprocessing.Queue, +): + """Worker function for multi-branch CFG parallel test.""" + device = torch.device(f"{current_omni_platform.device_type}:{local_rank}") + current_omni_platform.set_device(device) + + update_environment_variables( + { + "RANK": str(local_rank), + "LOCAL_RANK": str(local_rank), + "WORLD_SIZE": str(world_size), + "MASTER_ADDR": "localhost", + "MASTER_PORT": "29504", + } + ) + + init_distributed_environment() + initialize_model_parallel(cfg_parallel_size=cfg_parallel_size) + + cfg_rank = get_classifier_free_guidance_rank() + cfg_world_size = get_classifier_free_guidance_world_size() + assert cfg_world_size == cfg_parallel_size + + pipeline = MultiBranchTestPipeline( + in_channels=test_config["channels"], + hidden_dim=test_config["hidden_dim"], + seed=test_config["model_seed"], + ) + pipeline.transformer = pipeline.transformer.to(device=device, dtype=dtype) + pipeline.transformer.eval() + + n_branches = test_config["n_branches"] + batch_size = test_config["batch_size"] + channels = test_config["channels"] + height = test_config["height"] + width = test_config["width"] + + # Create N branch inputs with distinct seeds + branches_kwargs = [] + for b in range(n_branches): + torch.manual_seed(test_config["input_seed"] + b) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(test_config["input_seed"] + b) + x = torch.randn(batch_size, channels, height, width, dtype=dtype, device=device) + branches_kwargs.append({"x": x}) + + with torch.no_grad(): + noise_pred = 
pipeline.predict_noise_with_multi_branch_cfg( + do_true_cfg=True, + true_cfg_scale=test_config["cfg_scale"], + branches_kwargs=branches_kwargs, + cfg_normalize=test_config["cfg_normalize"], + ) + + assert noise_pred is not None + result_queue.put((cfg_rank, noise_pred.cpu())) + + destroy_distributed_env() + + +def _test_multi_branch_sequential_worker( + local_rank: int, + world_size: int, + dtype: torch.dtype, + test_config: dict, + result_queue: torch.multiprocessing.Queue, +): + """Worker function for sequential multi-branch CFG test (baseline).""" + device = torch.device(f"{current_omni_platform.device_type}:{local_rank}") + current_omni_platform.set_device(device) + + update_environment_variables( + { + "RANK": str(local_rank), + "LOCAL_RANK": str(local_rank), + "WORLD_SIZE": str(world_size), + "MASTER_ADDR": "localhost", + "MASTER_PORT": "29505", + } + ) + + init_distributed_environment() + initialize_model_parallel(cfg_parallel_size=1) + + cfg_world_size = get_classifier_free_guidance_world_size() + assert cfg_world_size == 1 + + pipeline = MultiBranchTestPipeline( + in_channels=test_config["channels"], + hidden_dim=test_config["hidden_dim"], + seed=test_config["model_seed"], + ) + pipeline.transformer = pipeline.transformer.to(device=device, dtype=dtype) + pipeline.transformer.eval() + + n_branches = test_config["n_branches"] + batch_size = test_config["batch_size"] + channels = test_config["channels"] + height = test_config["height"] + width = test_config["width"] + + branches_kwargs = [] + for b in range(n_branches): + torch.manual_seed(test_config["input_seed"] + b) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(test_config["input_seed"] + b) + x = torch.randn(batch_size, channels, height, width, dtype=dtype, device=device) + branches_kwargs.append({"x": x}) + + with torch.no_grad(): + noise_pred = pipeline.predict_noise_with_multi_branch_cfg( + do_true_cfg=True, + true_cfg_scale=test_config["cfg_scale"], + branches_kwargs=branches_kwargs, + cfg_normalize=test_config["cfg_normalize"], + ) + + assert noise_pred is not None + result_queue.put(noise_pred.cpu()) + + destroy_distributed_env() + + +@pytest.mark.parametrize( + "cfg_parallel_size,n_branches", + [ + (2, 2), # 2 branches on 2 GPUs: [[0],[1]] + (2, 3), # 3 branches on 2 GPUs: [[0,2],[1]] + (3, 3), # 3 branches on 3 GPUs: [[0],[1],[2]] + (2, 4), # 4 branches on 2 GPUs: [[0,2],[1,3]] + ], +) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("cfg_normalize", [False, True]) +def test_predict_noise_with_multi_branch_cfg( + cfg_parallel_size: int, + n_branches: int, + dtype: torch.dtype, + batch_size: int, + cfg_normalize: bool, +): + """ + Test that predict_noise_with_multi_branch_cfg produces identical results + with and without CFG parallel for N-branch models. 
+ + Args: + cfg_parallel_size: Number of GPUs for CFG parallel + n_branches: Number of CFG branches + dtype: Data type for computation + batch_size: Batch size for testing + cfg_normalize: Whether to normalize CFG output + """ + available_gpus = current_omni_platform.get_device_count() + if available_gpus < cfg_parallel_size: + pytest.skip(f"Test requires {cfg_parallel_size} GPUs but only {available_gpus} available") + + if n_branches == 2: + cfg_scale = 5.0 + elif n_branches == 3: + cfg_scale = {"text": 5.0, "image": 2.0} + else: + cfg_scale = {"text": 5.0, "image": 2.0, "vid_ref": 1.5} + + test_config = { + "batch_size": batch_size, + "channels": 4, + "height": 16, + "width": 16, + "hidden_dim": 128, + "cfg_scale": cfg_scale, + "cfg_normalize": cfg_normalize, + "model_seed": 42, + "input_seed": 123, + "n_branches": n_branches, + } + + mp_context = torch.multiprocessing.get_context("spawn") + manager = mp_context.Manager() + baseline_queue = manager.Queue() + cfg_parallel_queue = manager.Queue() + + # Run baseline (sequential, cfgp=1) + torch.multiprocessing.spawn( + _test_multi_branch_sequential_worker, + args=(1, dtype, test_config, baseline_queue), + nprocs=1, + ) + + # Run CFG parallel + torch.multiprocessing.spawn( + _test_multi_branch_parallel_worker, + args=(cfg_parallel_size, cfg_parallel_size, dtype, test_config, cfg_parallel_queue), + nprocs=cfg_parallel_size, + ) + + baseline_output = baseline_queue.get() + cfg_parallel_outputs = [cfg_parallel_queue.get() for _ in range(cfg_parallel_size)] + cfg_parallel_outputs.sort(key=lambda item: item[0]) + cfg_parallel_output = cfg_parallel_outputs[0][1] + + # All ranks should produce identical output + for cfg_rank, rank_output in cfg_parallel_outputs[1:]: + torch.testing.assert_close( + rank_output, + cfg_parallel_output, + rtol=0, + atol=0, + msg=f"Multi-branch CFG parallel ranks differ (rank 0 vs rank {cfg_rank})", + ) + + assert baseline_output.shape == cfg_parallel_output.shape, ( + f"Shape mismatch: baseline {baseline_output.shape} vs CFG parallel {cfg_parallel_output.shape}" + ) + + if dtype == torch.float32: + rtol, atol = 1e-5, 1e-5 + elif dtype == torch.bfloat16: + rtol, atol = 1e-2, 1e-2 + else: + rtol, atol = 1e-3, 1e-3 + + torch.testing.assert_close( + cfg_parallel_output, + baseline_output, + rtol=rtol, + atol=atol, + msg=( + f"Multi-branch CFG parallel output differs from sequential\n" + f" n_branches={n_branches}, cfg_parallel_size={cfg_parallel_size}\n" + f" dtype={dtype}, cfg_normalize={cfg_normalize}\n" + f" Max diff: {(cfg_parallel_output - baseline_output).abs().max().item():.6e}" + ), + ) + + print( + f"✓ Test passed: multi_branch n_branches={n_branches}, " + f"cfg_size={cfg_parallel_size}, dtype={dtype}, cfg_normalize={cfg_normalize}" + ) + + +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +def test_multi_branch_without_cfg(dtype: torch.dtype): + """ + Test predict_noise_with_multi_branch_cfg when do_true_cfg=False. + + When CFG is disabled, only the first branch (positive) should be computed. + This test runs on a single GPU without distributed environment. 
+ """ + available_gpus = current_omni_platform.get_device_count() + if available_gpus < 1: + pytest.skip("Test requires at least 1 GPU") + + device = torch.device(f"{current_omni_platform.device_type}:0") + current_omni_platform.set_device(device) + + pipeline = MultiBranchTestPipeline(in_channels=4, hidden_dim=128, seed=42) + pipeline.transformer = pipeline.transformer.to(device=device, dtype=dtype) + pipeline.transformer.eval() + + # Create 3 branch inputs (only first should be used) + branches_kwargs = [] + for b in range(3): + torch.manual_seed(123 + b) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(123 + b) + x = torch.randn(1, 4, 16, 16, dtype=dtype, device=device) + branches_kwargs.append({"x": x}) + + with torch.no_grad(): + noise_pred = pipeline.predict_noise_with_multi_branch_cfg( + do_true_cfg=False, # No CFG + true_cfg_scale=5.0, + branches_kwargs=branches_kwargs, + cfg_normalize=False, + ) + + assert noise_pred is not None + assert noise_pred.shape == (1, 4, 16, 16) + + print(f"✓ Test passed: multi_branch predict_noise without CFG (dtype={dtype})") diff --git a/vllm_omni/diffusion/distributed/cfg_parallel.py b/vllm_omni/diffusion/distributed/cfg_parallel.py index a8b0012f66..98757006bf 100644 --- a/vllm_omni/diffusion/distributed/cfg_parallel.py +++ b/vllm_omni/diffusion/distributed/cfg_parallel.py @@ -9,6 +9,7 @@ from typing import Any import torch +from vllm.logger import init_logger from vllm_omni.diffusion.distributed.parallel_state import ( get_cfg_group, @@ -16,6 +17,8 @@ get_classifier_free_guidance_world_size, ) +logger = init_logger(__name__) + def _wrap(pred: torch.Tensor | tuple[torch.Tensor, ...]) -> tuple[torch.Tensor, ...]: """Normalize prediction to tuple form.""" @@ -32,6 +35,24 @@ def _slice_pred(pred: tuple[torch.Tensor, ...], output_slice: int) -> tuple[torc return tuple(p[:, :output_slice] for p in pred) +def _dispatch_branches(n_branches: int, n_ranks: int) -> list[list[int]]: + """ + Round-robin dispatch N branches to M ranks. + + Rule: branch i → rank (i % n_ranks). + + Examples: + _dispatch_branches(3, 2) -> [[0, 2], [1]] + _dispatch_branches(3, 3) -> [[0], [1], [2]] + _dispatch_branches(4, 2) -> [[0, 2], [1, 3]] + _dispatch_branches(4, 4) -> [[0], [1], [2], [3]] + """ + assignments: list[list[int]] = [[] for _ in range(n_ranks)] + for i in range(n_branches): + assignments[i % n_ranks].append(i) + return assignments + + class CFGParallelMixin(metaclass=ABCMeta): """ Base Mixin class for Diffusion pipelines providing shared CFG methods. @@ -189,6 +210,165 @@ def combine_cfg_noise(self, positive_noise_pred, negative_noise_pred, scale, nor results.append(comb) return _unwrap(tuple(results)) + # ── N-branch CFG interface (for 3+ branch models) ── + + def predict_noise_with_multi_branch_cfg( + self, + do_true_cfg: bool, + true_cfg_scale: float | dict[str, float], + branches_kwargs: list[dict[str, Any]], + cfg_normalize: bool = False, + output_slice: int | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, ...]: + """ + Predict noise with N-branch CFG dispatch across M GPUs. + + This is the multi-branch counterpart of predict_noise_maybe_with_cfg(). + Use this for models with 3 or more CFG branches (e.g., OmniGen2, Bagel, + DreamID). Existing 2-branch models should continue using + predict_noise_maybe_with_cfg(). + + Args: + do_true_cfg: Whether to apply CFG. + true_cfg_scale: CFG scale factor (passed to combine_multi_branch_cfg_noise). + branches_kwargs: List of N dicts, each containing kwargs for one + predict_noise() call. 
branches_kwargs[0] is always the + positive/conditional branch. + cfg_normalize: Whether to normalize (passed to combine_multi_branch_cfg_noise). + output_slice: If set, slice each output to [:, :output_slice]. + + Returns: + Combined noise prediction, identical on all ranks in CFG parallel. + """ + if do_true_cfg: + n_branches = len(branches_kwargs) + cfg_world_size = get_classifier_free_guidance_world_size() + cfg_parallel_ready = cfg_world_size > 1 + + if cfg_parallel_ready: + return self._predict_multi_branch_parallel( + branches_kwargs, + n_branches, + cfg_world_size, + true_cfg_scale, + cfg_normalize, + output_slice, + ) + else: + # Sequential: run all N branches on single device + preds: list[torch.Tensor | tuple[torch.Tensor, ...]] = [] + for kw in branches_kwargs: + pred = _wrap(self.predict_noise(**kw)) + if output_slice is not None: + pred = _slice_pred(pred, output_slice) + preds.append(_unwrap(pred)) + return self.combine_multi_branch_cfg_noise(preds, true_cfg_scale, cfg_normalize) + else: + # No CFG: only compute positive/conditional prediction + pred = self.predict_noise(**branches_kwargs[0]) + if output_slice is not None: + pred = _unwrap(_slice_pred(_wrap(pred), output_slice)) + return pred + + def _predict_multi_branch_parallel( + self, + branches_kwargs: list[dict[str, Any]], + n_branches: int, + cfg_world_size: int, + true_cfg_scale: float, + cfg_normalize: bool, + output_slice: int | None, + ) -> torch.Tensor | tuple[torch.Tensor, ...]: + """Dispatch N branches across M ranks, all_gather, then combine.""" + cfg_group = get_cfg_group() + cfg_rank = get_classifier_free_guidance_rank() + + if cfg_world_size > n_branches: + logger.warning_once( + "cfg_parallel_size=%d > n_branches=%d, %d GPU(s) will be idle for CFG", + cfg_world_size, + n_branches, + cfg_world_size - n_branches, + ) + + # Assign branches to ranks via round-robin + assignments = _dispatch_branches(n_branches, cfg_world_size) + my_branch_ids = assignments[cfg_rank] + max_per_rank = max(len(a) for a in assignments) + + # Run assigned branches + my_preds: list[tuple[torch.Tensor, ...]] = [] + for bid in my_branch_ids: + pred = _wrap(self.predict_noise(**branches_kwargs[bid])) + if output_slice is not None: + pred = _slice_pred(pred, output_slice) + my_preds.append(pred) + + # Idle ranks (cfg_world_size > n_branches) run a forward pass to get the output shape for all_gather. + # Output shape cannot be inferred from kwargs — may be tuple, sliced, etc. + if not my_preds: + pred = _wrap(self.predict_noise(**branches_kwargs[0])) + if output_slice is not None: + pred = _slice_pred(pred, output_slice) + my_preds.append(pred) + + # Pad to max_per_rank with zeros so all ranks have same size + ref_pred = my_preds[0] + while len(my_preds) < max_per_rank: + my_preds.append(tuple(torch.zeros_like(t) for t in ref_pred)) + + # All-gather each output element separately (like predict_noise_maybe_with_cfg) + # For each slot, gather across ranks; then pick valid results by owner_rank + # all_slots[slot][elem_idx] = [rank0_tensor, rank1_tensor, ...] 
+ all_slots: list[list[list[torch.Tensor]]] = [] + for slot in range(max_per_rank): + slot_results: list[list[torch.Tensor]] = [] + for p in my_preds[slot]: + gathered = cfg_group.all_gather(p, separate_tensors=True) + slot_results.append(gathered) + all_slots.append(slot_results) + + # Reconstruct final_preds in branch order + final_preds: list[torch.Tensor | tuple[torch.Tensor, ...]] = [] + for bid in range(n_branches): + owner_rank = bid % cfg_world_size + slot_idx = bid // cfg_world_size + elements = tuple(all_slots[slot_idx][elem_idx][owner_rank] for elem_idx in range(len(ref_pred))) + final_preds.append(_unwrap(elements)) + + return self.combine_multi_branch_cfg_noise(final_preds, true_cfg_scale, cfg_normalize) + + def combine_multi_branch_cfg_noise( + self, + predictions: list[torch.Tensor | tuple[torch.Tensor, ...]], + true_cfg_scale: float | dict[str, float], + cfg_normalize: bool = False, + ) -> torch.Tensor | tuple[torch.Tensor, ...]: + """ + Combine N branch predictions. Default: standard 2-branch CFG formula. + + Override this method for custom multi-branch combine logic. + + Args: + predictions: List of N predictions, where predictions[0] is always + the positive/conditional branch. + true_cfg_scale: CFG scale factor (float for 2-branch, dict for multi-branch). + cfg_normalize: Whether to normalize the combined prediction. + + Returns: + Combined noise prediction. + """ + positive = _wrap(predictions[0]) + negative = _wrap(predictions[1]) + + results = [] + for p, n in zip(positive, negative): + comb = n + true_cfg_scale * (p - n) + if cfg_normalize: + comb = self.cfg_normalize_function(p, comb) + results.append(comb) + return _unwrap(tuple(results)) + def predict_noise(self, *args: Any, **kwargs: Any) -> torch.Tensor | tuple[torch.Tensor, ...]: """ Forward pass through transformer to predict noise. 
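The reconstruction of `final_preds` in the hunk above relies on `owner_rank = bid % cfg_world_size` and `slot_idx = bid // cfg_world_size`, which inverts the round-robin dispatch because rank `r` executes its assigned branches in increasing order, i.e. `range(r, N, M)`. A small standalone check of that invariant (illustrative only, not part of this patch):

```python
# Under round-robin dispatch (branch i -> rank i % M), rank r runs branches
# range(r, N, M) in order, so the branch at slot s on rank r is r + s * M.
for n_branches in (2, 3, 4):
    for n_ranks in (1, 2, 3, 4):
        per_rank = [list(range(r, n_branches, n_ranks)) for r in range(n_ranks)]
        for bid in range(n_branches):
            owner_rank, slot_idx = bid % n_ranks, bid // n_ranks
            assert per_rank[owner_rank][slot_idx] == bid
```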
diff --git a/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py b/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py index e22765f80e..974cc582f1 100644 --- a/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py +++ b/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py @@ -15,11 +15,6 @@ from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin -from vllm_omni.diffusion.distributed.parallel_state import ( - get_cfg_group, - get_classifier_free_guidance_rank, - get_classifier_free_guidance_world_size, -) from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.models.interface import SupportAudioInput, SupportImageInput from vllm_omni.diffusion.request import OmniDiffusionRequest @@ -249,6 +244,28 @@ def get_scheduler_time_steps(self, sampling_steps, solver_name="unipc", device=0 return sample_scheduler, timesteps + def predict_noise(self, **kwargs): + pred_vid, pred_audio = self.model(**kwargs) + return (pred_vid[0], pred_audio[0]) + + def combine_multi_branch_cfg_noise(self, predictions, true_cfg_scale, cfg_normalize=False): + vid_pos, audio_pos = predictions[0] + vid_neg, audio_neg = predictions[1] + vid_ip_neg, _ = predictions[2] + _, refaudio_neg = predictions[3] + + pred_video = ( + vid_neg + + true_cfg_scale["video_cfg_scale"] * (vid_pos - vid_neg) + + true_cfg_scale["video_ref_cfg_scale"] * (vid_pos - vid_ip_neg) + ) + pred_audio = ( + audio_neg + + true_cfg_scale["audio_cfg_scale"] * (audio_pos - audio_neg) + + true_cfg_scale["audio_ref_cfg_scale"] * (audio_pos - refaudio_neg) + ) + return (pred_video, pred_audio) + def diffuse( self, video_noise: torch.Tensor, @@ -306,72 +323,22 @@ def diffuse( "vid_context": [text_embeddings_video_neg], } - if get_classifier_free_guidance_world_size() > 1: - # Enable CFG-parallel: rank0 computes positive, rank1 computes negative. 
- cfg_group = get_cfg_group() - cfg_rank = get_classifier_free_guidance_rank() - - if cfg_rank == 0: - pred_vid, pred_audio = self.model( - vid=[model_input_video], audio=[model_input_audio], t=timestep_input, **pos_args - ) - pre_vid_ip_neg, _ = self.model( - vid=[model_input_video_neg], audio=[model_input_audio], t=timestep_input, **pos_args - ) - pred_vid_0 = pred_vid[0] - pred_audio_0 = pred_audio[0] - pre_vid_ip_0 = pre_vid_ip_neg[0] - pred_refaudio_0 = torch.zeros_like(pred_audio_0) # dummy tensor - else: - pred_vid, pred_audio = self.model( - vid=[model_input_video], audio=[model_input_audio], t=timestep_input, **neg_args - ) - _, pred_refaudio_neg = self.model( - vid=[model_input_video], audio=[model_input_audio_neg], t=timestep_input, **pos_args - ) - pred_vid_0 = pred_vid[0] - pred_audio_0 = pred_audio[0] - pre_vid_ip_0 = torch.zeros_like(pred_vid_0) # dummy tensor - pred_refaudio_0 = pred_refaudio_neg[0] - - pred_vid_gathered = cfg_group.all_gather(pred_vid_0, separate_tensors=True) - pred_audio_gathered = cfg_group.all_gather(pred_audio_0, separate_tensors=True) - pre_vid_ip_gathered = cfg_group.all_gather(pre_vid_ip_0, separate_tensors=True) - pred_refaudio_gathered = cfg_group.all_gather(pred_refaudio_0, separate_tensors=True) - - pred_vid_pos = [pred_vid_gathered[0]] - pred_vid_neg = [pred_vid_gathered[1]] - pred_audio_pos = [pred_audio_gathered[0]] - pred_audio_neg = [pred_audio_gathered[1]] - pre_vid_ip_neg = [pre_vid_ip_gathered[0]] - pred_refaudio_neg = [pred_refaudio_gathered[1]] - else: - pred_vid_pos, pred_audio_pos = self.model( - vid=[model_input_video], audio=[model_input_audio], t=timestep_input, **pos_args - ) - - pred_vid_neg, pred_audio_neg = self.model( - vid=[model_input_video], audio=[model_input_audio], t=timestep_input, **neg_args - ) - - pre_vid_ip_neg, _ = self.model( - vid=[model_input_video_neg], audio=[model_input_audio], t=timestep_input, **pos_args - ) - - _, pred_refaudio_neg = self.model( - vid=[model_input_video], audio=[model_input_audio_neg], t=timestep_input, **pos_args - ) - - pred_video_guided = ( - pred_vid_neg[0] - + self.video_cfg_scale * (pred_vid_pos[0] - pred_vid_neg[0]) - + self.video_ref_cfg_scale * (pred_vid_pos[0] - pre_vid_ip_neg[0]) - ) - - pred_audio_guided = ( - pred_audio_neg[0] - + self.audio_cfg_scale * (pred_audio_pos[0] - pred_audio_neg[0]) - + self.audio_ref_cfg_scale * (pred_audio_pos[0] - pred_refaudio_neg[0]) + branches_kwargs = [ + {"vid": [model_input_video], "audio": [model_input_audio], "t": timestep_input, **pos_args}, + {"vid": [model_input_video], "audio": [model_input_audio], "t": timestep_input, **neg_args}, + {"vid": [model_input_video_neg], "audio": [model_input_audio], "t": timestep_input, **pos_args}, + {"vid": [model_input_video], "audio": [model_input_audio_neg], "t": timestep_input, **pos_args}, + ] + + pred_video_guided, pred_audio_guided = self.predict_noise_with_multi_branch_cfg( + do_true_cfg=True, + true_cfg_scale={ + "video_cfg_scale": self.video_cfg_scale, + "video_ref_cfg_scale": self.video_ref_cfg_scale, + "audio_cfg_scale": self.audio_cfg_scale, + "audio_ref_cfg_scale": self.audio_ref_cfg_scale, + }, + branches_kwargs=branches_kwargs, ) video_noise = scheduler_video.step( pred_video_guided.unsqueeze(0), t_v, video_noise.unsqueeze(0), return_dict=False diff --git a/vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.py b/vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.py index 2d370aea19..e8e307b878 100644 --- a/vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.py +++ 
b/vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.py @@ -29,6 +29,7 @@ from vllm.model_executor.models.utils import AutoWeightsLoader from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig +from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.omnigen2.omnigen2_transformer import ( @@ -619,7 +620,7 @@ def retrieve_timesteps( return timesteps, num_inference_steps -class OmniGen2Pipeline(nn.Module): +class OmniGen2Pipeline(CFGParallelMixin, nn.Module): """ Pipeline for text-to-image generation using OmniGen2. @@ -1171,7 +1172,14 @@ def processing( self._num_timesteps = len(timesteps) for i, t in enumerate(timesteps): - model_pred = self.predict( + text_guidance_scale = ( + self.text_guidance_scale if self.cfg_range[0] <= i / len(timesteps) <= self.cfg_range[1] else 1.0 + ) + image_guidance_scale = ( + self.image_guidance_scale if self.cfg_range[0] <= i / len(timesteps) <= self.cfg_range[1] else 1.0 + ) + + positive_kwargs = dict( t=t, latents=latents, prompt_embeds=prompt_embeds, @@ -1179,15 +1187,18 @@ def processing( prompt_attention_mask=prompt_attention_mask, ref_image_hidden_states=ref_latents, ) - text_guidance_scale = ( - self.text_guidance_scale if self.cfg_range[0] <= i / len(timesteps) <= self.cfg_range[1] else 1.0 - ) - image_guidance_scale = ( - self.image_guidance_scale if self.cfg_range[0] <= i / len(timesteps) <= self.cfg_range[1] else 1.0 + uncond_kwargs = dict( + t=t, + latents=latents, + prompt_embeds=negative_prompt_embeds, + freqs_cis=freqs_cis, + prompt_attention_mask=negative_prompt_attention_mask, + ref_image_hidden_states=None, ) if text_guidance_scale > 1.0 and image_guidance_scale > 1.0: - model_pred_ref = self.predict( + # 3-branch CFG: pos + ref_neg + uncond + ref_neg_kwargs = dict( t=t, latents=latents, prompt_embeds=negative_prompt_embeds, @@ -1195,31 +1206,24 @@ def processing( prompt_attention_mask=negative_prompt_attention_mask, ref_image_hidden_states=ref_latents, ) - - model_pred_uncond = self.predict( - t=t, - latents=latents, - prompt_embeds=negative_prompt_embeds, - freqs_cis=freqs_cis, - prompt_attention_mask=negative_prompt_attention_mask, - ref_image_hidden_states=None, - ) - - model_pred = ( - model_pred_uncond - + image_guidance_scale * (model_pred_ref - model_pred_uncond) - + text_guidance_scale * (model_pred - model_pred_ref) + model_pred = self.predict_noise_with_multi_branch_cfg( + do_true_cfg=True, + true_cfg_scale={ + "text": text_guidance_scale, + "image": image_guidance_scale, + }, + branches_kwargs=[positive_kwargs, ref_neg_kwargs, uncond_kwargs], ) elif text_guidance_scale > 1.0: - model_pred_uncond = self.predict( - t=t, - latents=latents, - prompt_embeds=negative_prompt_embeds, - freqs_cis=freqs_cis, - prompt_attention_mask=negative_prompt_attention_mask, - ref_image_hidden_states=None, + # 2-branch CFG: pos + uncond + model_pred = self.predict_noise_with_multi_branch_cfg( + do_true_cfg=True, + true_cfg_scale=text_guidance_scale, + branches_kwargs=[positive_kwargs, uncond_kwargs], ) - model_pred = model_pred_uncond + text_guidance_scale * (model_pred - model_pred_uncond) + else: + # No CFG + model_pred = self.predict_noise(**positive_kwargs) latents = self.scheduler.step(model_pred, t, latents, return_dict=False)[0] @@ -1265,6 +1269,21 @@ def predict( ) return model_pred + def predict_noise(self, **kwargs): + 
"""Override CFGParallelMixin.predict_noise to use self.predict.""" + return self.predict(**kwargs) + + def combine_multi_branch_cfg_noise(self, predictions, true_cfg_scale, cfg_normalize=False): + """Override: 3-branch dual scale or 2-branch standard CFG.""" + if len(predictions) == 3: + text_scale = true_cfg_scale["text"] + image_scale = true_cfg_scale["image"] + pos, ref, uncond = predictions[0], predictions[1], predictions[2] + return uncond + image_scale * (ref - uncond) + text_scale * (pos - ref) + # 2-branch: standard CFG + pos, neg = predictions[0], predictions[1] + return neg + true_cfg_scale * (pos - neg) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) From 95b5b2ee43f636a2f1d8a4573674f2a5a4a3b6df Mon Sep 17 00:00:00 2001 From: Canlin Guo Date: Sun, 12 Apr 2026 22:14:43 +0800 Subject: [PATCH 135/204] [Bugfix] Fix UT for the missing of log_stats in Engine (#2706) Signed-off-by: gcanlin --- tests/engine/test_async_omni_engine_stage_init.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/engine/test_async_omni_engine_stage_init.py b/tests/engine/test_async_omni_engine_stage_init.py index 24d2bf0cf9..f397307936 100644 --- a/tests/engine/test_async_omni_engine_stage_init.py +++ b/tests/engine/test_async_omni_engine_stage_init.py @@ -98,6 +98,7 @@ def test_initialize_stages_passes_stage_init_timeout_to_diffusion_handshake(monk from vllm_omni.platforms import current_omni_platform engine = object.__new__(AsyncOmniEngine) + engine.log_stats = False engine.model = "dummy-model" engine.config_path = "dummy-config" engine.num_stages = 1 @@ -178,6 +179,7 @@ def test_launch_llm_stage_passes_stage_init_timeout_to_complete_stage_handshake( from vllm_omni.platforms import current_omni_platform engine = object.__new__(AsyncOmniEngine) + engine.log_stats = False engine.model = "dummy-model" engine.single_stage_mode = False engine._omni_master_server = None From 2dce02854143e2160aa8e3cf3fb5a136f4110476 Mon Sep 17 00:00:00 2001 From: TJian Date: Sun, 12 Apr 2026 23:37:17 +0800 Subject: [PATCH 136/204] [ROCm] [CI] Fix environment issue (#2708) Signed-off-by: tjtanaa --- docker/Dockerfile.rocm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index ec0c5aab0d..b344783892 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -18,8 +18,10 @@ ARG COMMON_WORKDIR=/app WORKDIR ${COMMON_WORKDIR} # Step 1: Setup - Install system dependencies +# Need to include ffmpeg because vllm rocm upstream docker image +# does not include it. 
RUN apt-get update && \ - apt-get install -y espeak-ng git sox libsox-fmt-all jq && \ + apt-get install -y espeak-ng ffmpeg git sox libsox-fmt-all jq && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* From eb1a801b216b958cba0ddd9b528329a524df2508 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zhengyuan=20Su=20=28=E8=8B=8F=E6=94=BF=E6=B8=8A=29?= Date: Mon, 13 Apr 2026 11:08:52 +0800 Subject: [PATCH 137/204] [Feat] Override single stage CLI args when stage_configs_path is set in OmniEngineArgs (#2684) Signed-off-by: Zhengyuan Su Co-authored-by: Claude Opus 4.6 (1M context) --- tests/engine/test_arg_utils.py | 93 +++++++++++++++++++++++++++ vllm_omni/engine/arg_utils.py | 9 +++ vllm_omni/engine/async_omni_engine.py | 70 +++++++++++++++++++- 3 files changed, 171 insertions(+), 1 deletion(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 5584b15d9f..cb1f31164c 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -4,6 +4,7 @@ explicitly patch values that differ from vLLM. """ +import argparse import inspect from unittest.mock import Mock @@ -14,6 +15,7 @@ from vllm_omni.config.model import OmniModelConfig from vllm_omni.engine.arg_utils import OmniEngineArgs +from vllm_omni.engine.async_omni_engine import AsyncOmniEngine pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -116,6 +118,26 @@ def test_qwen3_tts_codec_frame_rate_patching(): assert omni_config.codec_frame_rate_hz == 12.3 +def test_stage_configs_path_blocks_create_model_config(): + """create_model_config() should raise when stage_configs_path is set.""" + args = OmniEngineArgs(stage_configs_path="/some/path.yaml") + with pytest.raises(RuntimeError, match="stage_configs_path"): + args.create_model_config() + + +def test_from_cli_args_picks_up_stage_configs_path(): + """from_cli_args should pick up stage_configs_path from namespace.""" + ns = argparse.Namespace( + model="facebook/opt-125m", + stage_configs_path="/some/path.yaml", + custom_pipeline_args=None, + ) + + args = OmniEngineArgs.from_cli_args(ns) + assert args.stage_configs_path == "/some/path.yaml" + assert args.custom_pipeline_args is None + + def test_stage_specific_text_config_override(): """Ensure dependent attributes are updated when using stage-specific config.""" vllm_config = EngineArgs().create_model_config() @@ -144,3 +166,74 @@ def test_stage_specific_text_config_override(): assert omni_config.attention_chunk_size == 2048 assert omni_config.max_model_len == 4096 assert omni_config.hf_text_config.sliding_window is None + + +def test_stage_configs_path_field(): + """OmniEngineArgs with stage_configs_path should construct without error.""" + args = OmniEngineArgs(stage_configs_path="/some/path.yaml") + assert args.stage_configs_path == "/some/path.yaml" + + +def test_strip_single_engine_args(): + """_strip_single_engine_args should remove EngineArgs fields but keep omni fields.""" + kwargs = { + # Parent EngineArgs fields — should be stripped + "compilation_config": '{"cudagraph_mode": "FULL_AND_PIECEWISE"}', + "tensor_parallel_size": 4, + "gpu_memory_utilization": 0.9, + "model": "some/model", + # Parent field that should be kept (allowlisted) + "worker_extension_cls": "some.Extension", + # OmniEngineArgs-only / non-engine fields — should pass through + "stage_configs_path": "/path/to/yaml", + "custom_pipeline_args": {"pipeline_class": "my.Pipeline"}, + "mode": "text-to-image", + "lora_path": "/some/lora", + } + + filtered = AsyncOmniEngine._strip_single_engine_args(kwargs) + + # Stripped — parent 
EngineArgs fields + assert "compilation_config" not in filtered + assert "tensor_parallel_size" not in filtered + assert "gpu_memory_utilization" not in filtered + assert "model" not in filtered + + # Stripped — orchestrator-level OmniEngineArgs field + assert "stage_configs_path" not in filtered + + # Kept + assert filtered["worker_extension_cls"] == "some.Extension" + assert filtered["custom_pipeline_args"] == {"pipeline_class": "my.Pipeline"} + assert filtered["mode"] == "text-to-image" + assert filtered["lora_path"] == "/some/lora" + + +def test_strip_single_engine_args_model_does_not_trigger_warning(mocker): + """model is always in kwargs (callers set it via from_cli_args/asdict), + so it should not cause the override warning by itself or appear in it.""" + mock_warn = mocker.patch("vllm_omni.engine.async_omni_engine.logger.warning") + + # Typical caller kwargs: model is always present, no other parent + # EngineArgs fields are explicitly overridden. + AsyncOmniEngine._strip_single_engine_args( + { + "model": "some/model", + "custom_pipeline_args": {"pipeline_class": "my.Pipeline"}, + } + ) + mock_warn.assert_not_called() + + # When there *are* genuinely surprising overrides alongside model, + # the warning should mention them but not model. + AsyncOmniEngine._strip_single_engine_args( + { + "model": "some/model", + "tensor_parallel_size": 4, + "custom_pipeline_args": {"pipeline_class": "my.Pipeline"}, + } + ) + mock_warn.assert_called_once() + warned_args = mock_warn.call_args[0][-1] # the formatted arg list + assert "tensor_parallel_size" in warned_args + assert "model" not in warned_args diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index e29de3ec98..4e2ad9b257 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -124,6 +124,9 @@ class OmniEngineArgs(EngineArgs): (e.g. ["text", "audio"]). If None, all modalities supported by the model are used. log_stats: Whether to log engine statistics. Defaults to False. + custom_pipeline_args: Dictionary of arguments for custom pipeline + initialization (e.g., ``{"pipeline_class": "my.Module"}``). + Passed through to the diffusion stage engine. """ stage_id: int = 0 @@ -143,6 +146,7 @@ class OmniEngineArgs(EngineArgs): stage_configs_path: str | None = None output_modalities: list[str] | None = None log_stats: bool = False + custom_pipeline_args: dict[str, Any] | None = None def __post_init__(self) -> None: load_omni_general_plugins() @@ -190,6 +194,11 @@ def create_model_config(self) -> OmniModelConfig: Returns: OmniModelConfig instance with all configuration fields set """ + if self.stage_configs_path is not None: + raise RuntimeError( + "create_model_config() should not be called when stage_configs_path is set. " + "Per-stage model configs are resolved from the stage config YAML." 
+ ) # register omni models to avoid model not found error self._ensure_omni_models_registered() diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 5cba14c197..8e0b2b2df1 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -25,6 +25,7 @@ import janus import torch from omegaconf import OmegaConf +from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.tokenizers import cached_tokenizer_from_config @@ -1258,6 +1259,68 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list: default_stage_cfg[0]["engine_args"]["model_stage"] = "diffusion" return default_stage_cfg + @staticmethod + def _strip_single_engine_args(kwargs: dict[str, Any]) -> dict[str, Any]: + """Remove parent ``EngineArgs`` fields from *kwargs*. + + When ``stage_configs_path`` is set, per-stage engine args are defined + in the YAML. Top-level single-engine fields (``compilation_config``, + ``tensor_parallel_size``, …) must not leak into per-stage configs via + the ``base_engine_args`` merge in ``load_stage_configs_from_yaml`` — + they can cause type errors (e.g. ``compilation_config`` as a JSON + string rejected by ``VllmConfig``) or silently override YAML values. + + Logs a warning for any parent field whose value differs from the + dataclass default, so users know their explicit overrides are ignored. + """ + # worker_extension_cls is a parent field but must pass through to + # diffusion stages for colocate worker setup. + _keep = {"worker_extension_cls"} + # Orchestrator-level OmniEngineArgs fields that are consumed by + # _resolve_stage_configs and must not leak into per-stage configs + # (stage_configs_path would trigger the create_model_config guard). + _strip_omni = {"stage_configs_path"} + # Fields that are always set by callers (via from_cli_args / asdict) + # and would always appear as overridden — suppress from the warning + # so it only surfaces genuinely surprising overrides. + _no_warn = {"model"} + + parent_fields: dict[str, dataclasses.Field] = {f.name: f for f in dataclasses.fields(EngineArgs)} + overridden: list[str] = [] + result: dict[str, Any] = {} + for k, v in kwargs.items(): + if k in _strip_omni: + continue + if k not in parent_fields or k in _keep: + result[k] = v + continue + # Detect explicitly-set values that differ from the default. + # Values may have been through asdict() which converts dataclass + # defaults to dicts, so normalise before comparing. 
+ field = parent_fields[k] + if field.default is not dataclasses.MISSING: + default = field.default + elif field.default_factory is not dataclasses.MISSING: + default = field.default_factory() + else: + default = dataclasses.MISSING + if default is dataclasses.MISSING or v is None: + continue + # Normalise dataclass defaults to dicts for comparison + if dataclasses.is_dataclass(default) and not isinstance(default, type): + default = dataclasses.asdict(default) + if v != default and k not in _no_warn: + overridden.append(k) + + if overridden: + logger.warning( + "stage_configs_path is set — the following top-level engine " + "args are ignored (per-stage YAML takes precedence): %s", + ", ".join(sorted(overridden)), + ) + + return result + def _resolve_stage_configs(self, model: str, kwargs: dict[str, Any]) -> tuple[str, list[Any]]: """Resolve stage configs and inject defaults shared by orchestrator/headless.""" @@ -1269,12 +1332,17 @@ def _resolve_stage_configs(self, model: str, kwargs: dict[str, Any]) -> tuple[st "Ignoring it and resolving stages from stage_configs_path/model factory." ) + if stage_configs_path is not None: + base_kwargs = self._strip_single_engine_args(kwargs) + else: + base_kwargs = kwargs + # Use the legacy config loading path (load_and_resolve_stage_configs). # StageConfigFactory wiring will be done in config refactor [2/N]. config_path, stage_configs = load_and_resolve_stage_configs( model, stage_configs_path, - kwargs, + base_kwargs, default_stage_cfg_factory=lambda: self._create_default_diffusion_stage_cfg(kwargs), ) From e12250119bc7f90745354a5349e550f391fa123b Mon Sep 17 00:00:00 2001 From: NATURE Date: Mon, 13 Apr 2026 11:36:20 +0800 Subject: [PATCH 138/204] [Bugfix] Fix Bagel online mode for 1. Hang after several requests 2. Non-deterministic image quality regression. (#2458) Signed-off-by: natureofnature --- vllm_omni/core/sched/omni_ar_scheduler.py | 105 +++++----- .../model_executor/models/bagel/bagel.py | 195 ++++++------------ .../npu/worker/npu_ar_model_runner.py | 26 ++- vllm_omni/worker/gpu_ar_model_runner.py | 35 +++- 4 files changed, 164 insertions(+), 197 deletions(-) diff --git a/vllm_omni/core/sched/omni_ar_scheduler.py b/vllm_omni/core/sched/omni_ar_scheduler.py index eac737b6e6..0ee8cd16a3 100644 --- a/vllm_omni/core/sched/omni_ar_scheduler.py +++ b/vllm_omni/core/sched/omni_ar_scheduler.py @@ -59,6 +59,11 @@ def __init__(self, *args, **kwargs): # Track ACTIVE transfers (submitted to runner but not yet acked via kv_extracted_req_ids) self.active_kv_transfers: set[str] = set() + # Requests marked for deferred stop: keep running until KV extraction + # completes so that kv_ready can be emitted while the request is still + # alive. Stopped on the first scheduler step after extraction ack. + self.pending_stop_after_extraction: set[str] = set() + # [Omni] Pre-parse KV transfer criteria self.kv_transfer_criteria = self._get_kv_transfer_criteria() @@ -126,11 +131,16 @@ def _process_kv_transfer_trigger(self, request: Request, new_token_ids: list[int stop_decode_on_trigger = self.kv_transfer_criteria.get("stop_after_transfer", True) if request.request_id in self.transfer_triggered_requests: - # Already triggered. When stop_decode_on_trigger is True AND - # transfer was actually queued, the request was already stopped - # at trigger time (see below). Any request that reaches this - # point either has stop_decode_on_trigger=False (continue - # decoding) or was not actually queued (should not be stopped). 
+ # Deferred stop: once KV extraction is complete (no longer in + # active_kv_transfers), stop the request. This guarantees the + # kv_ready signal was emitted while the request was still alive. + if ( + request.request_id in self.pending_stop_after_extraction + and request.request_id not in self.active_kv_transfers + ): + self.pending_stop_after_extraction.discard(request.request_id) + request.status = RequestStatus.FINISHED_STOPPED + return True return False if criteria_type == "prefill_finished": @@ -140,14 +150,11 @@ def _process_kv_transfer_trigger(self, request: Request, new_token_ids: list[int actually_queued = request.request_id in self.requests_needing_kv_transfer if stop_decode_on_trigger and actually_queued: - # Stop immediately so the request is NOT scheduled in - # the next step, freeing scheduling budget for companion - # requests whose chunked-prefill boundaries must be - # deterministic. waiting_for_transfer_free keeps blocks - # alive until the model runner finishes KV extraction. - self.waiting_for_transfer_free.add(request.request_id) - request.status = RequestStatus.FINISHED_STOPPED - return True + # Defer the stop until KV extraction completes so that + # the kv_ready signal can be emitted while the request + # is still alive. The request will be stopped on the + # next scheduler step after extraction ack arrives. + self.pending_stop_after_extraction.add(request.request_id) return False @@ -167,9 +174,7 @@ def _process_kv_transfer_trigger(self, request: Request, new_token_ids: list[int actually_queued = request.request_id in self.requests_needing_kv_transfer if stop_decode_on_trigger and actually_queued: - self.waiting_for_transfer_free.add(request.request_id) - request.status = RequestStatus.FINISHED_STOPPED - return True + self.pending_stop_after_extraction.add(request.request_id) return False @@ -268,6 +273,26 @@ def update_from_output( num_scheduled_tokens, ) + # Pre-process KV extraction acks so that the per-request loop below + # can see up-to-date active_kv_transfers state and emit kv_ready + # signals while requests are still alive (before any deferred stop). + kv_extracted_ids = getattr(model_runner_output, "kv_extracted_req_ids", None) + if kv_extracted_ids: + for req_id in kv_extracted_ids: + try: + self.active_kv_transfers.discard(req_id) + req = self.requests.get(req_id) + if req is not None and not req.is_finished(): + outputs[req.client_index].append( + EngineCoreOutput( + request_id=req_id, + new_token_ids=[], + kv_transfer_params={"kv_ready": True}, + ) + ) + except Exception: + init_logger(__name__).exception("Failed to pre-process KV extraction for %s", req_id) + # NOTE(woosuk): As len(num_scheduled_tokens) can be up to 1K or more, # the below loop can be a performance bottleneck. We should do our best # to avoid expensive operations inside the loop. @@ -436,6 +461,7 @@ def update_from_output( self.transfer_triggered_requests.remove(req.request_id) if req.request_id in self.active_kv_transfers: self.active_kv_transfers.remove(req.request_id) + self.pending_stop_after_extraction.discard(req.request_id) # Same for preempted for req in stopped_preempted_reqs: @@ -444,6 +470,8 @@ def update_from_output( self.transfer_triggered_requests.remove(req.request_id) if req.request_id in self.active_kv_transfers: self.active_kv_transfers.remove(req.request_id) + self.pending_stop_after_extraction.discard(req.request_id) + # KV Connector: update state for finished KV Transfers. 
if kv_connector_output: self._update_from_kv_xfer_finished(kv_connector_output) @@ -489,35 +517,12 @@ def update_from_output( engine_core_outputs[0] = eco = EngineCoreOutputs() eco.scheduler_stats = stats - # This is where we free blocks that were held for transfer - try: - kv_extracted_ids = getattr(model_runner_output, "kv_extracted_req_ids", None) - if kv_extracted_ids: - for req_id in kv_extracted_ids: - # Emit a kv_ready signal so the orchestrator can forward - # the request to the DiT stage immediately after KV - # extraction, without waiting for AR decode to finish. - req = self.requests.get(req_id) - if req is not None and not req.is_finished(): - eco = engine_core_outputs.get(req.client_index) - if eco is None: - eco = EngineCoreOutputs() - engine_core_outputs[req.client_index] = eco - eco.outputs.append( - EngineCoreOutput( - request_id=req_id, - new_token_ids=[], - kv_transfer_params={"kv_ready": True}, - ) - ) - - # Mark transfer as finished - if req_id in self.active_kv_transfers: - self.active_kv_transfers.remove(req_id) - logger.debug(f"[Omni] KV Transfer finished for {req_id}") - + # Free blocks that were held for transfer (kv_ready and + # active_kv_transfers updates already done before the per-request loop). + if kv_extracted_ids: + for req_id in kv_extracted_ids: + try: if req_id in self.waiting_for_transfer_free: - # Now it's safe to free blocks req = self.requests.get(req_id) if req: self.kv_cache_manager.free(req) @@ -525,13 +530,12 @@ def update_from_output( del self.requests[req_id] if req_id in self.transfer_triggered_requests: self.transfer_triggered_requests.remove(req_id) - if req_id in self.active_kv_transfers: - self.active_kv_transfers.remove(req_id) - + self.active_kv_transfers.discard(req_id) + self.pending_stop_after_extraction.discard(req_id) logger.debug(f"Freed blocks for {req_id} after transfer extraction") self.waiting_for_transfer_free.remove(req_id) - except Exception: - init_logger(__name__).exception("Failed to process finished transfer requests") + except Exception: + init_logger(__name__).exception("Failed to free blocks for %s after transfer", req_id) return engine_core_outputs @@ -564,8 +568,7 @@ def _free_request(self, request: Request, delay_free_blocks: bool = False) -> di kv_xfer_params = None return kv_xfer_params elif request_id in self.waiting_for_transfer_free: - # Stopped immediately by stop_decode_on_trigger; blocks are - # held until KV extraction completes in a future step. + # Blocks held until KV extraction completes in a future step. return None else: logger.debug( diff --git a/vllm_omni/model_executor/models/bagel/bagel.py b/vllm_omni/model_executor/models/bagel/bagel.py index acbbc28b4c..cbb775680c 100644 --- a/vllm_omni/model_executor/models/bagel/bagel.py +++ b/vllm_omni/model_executor/models/bagel/bagel.py @@ -1,4 +1,3 @@ -from collections import deque from collections.abc import Iterable, Mapping, Sequence from math import isqrt from typing import Any @@ -442,14 +441,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._pending_img2img_info: list[tuple[int, int, int, int]] = [] self._ropes_pending: list[dict[str, Any]] = [] self._ropes_metadata: dict[str, dict[str, Any]] = {} - self._cfg_companion_queue: deque[tuple[tuple[int, int, int, int], int]] = deque() - - # Per-request position offset for decode after img2img prefill. - # Prefill rewrites positions (VAE→0, ViT→1, text→2..N) but the model - # runner assigns decode positions starting from prefill_len, not N+1. 
- # offset = rope - prefill_len (a negative number). - self._pending_decode_offsets: list[int] = [] - self._decode_position_offsets: dict[str, int] = {} + self._last_img2img_info: tuple[int, int, int, int] | None = None from transformers import AutoTokenizer @@ -461,7 +453,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._start_of_image_id = int(_tok.convert_tokens_to_ids("<|vision_start|>")) self._end_of_image_id = int(_tok.convert_tokens_to_ids("<|vision_end|>")) self._img2img_token_id = int(_tok.convert_tokens_to_ids("<|fim_middle|>")) - self._vae_token_mask: torch.Tensor | None = None self.device = get_local_device() self._install_mot_modules(config) @@ -540,9 +531,7 @@ def _clear_warmup_state(self): self._ropes_pending.clear() self._ropes_metadata.clear() self._pending_img2img_info.clear() - self._cfg_companion_queue.clear() - self._pending_decode_offsets.clear() - self._decode_position_offsets.clear() + self._last_img2img_info = None self._vae_token_mask = None def get_kv_transfer_metadata( @@ -554,12 +543,10 @@ def get_kv_transfer_metadata( meta = self._ropes_metadata.pop(req_id, None) if meta is None: return None - # In think-mode img2img the prefill rope doesn't account for decoded - # thinking tokens; correct it to num_computed_tokens + offset. - # Skip correction when num_computed_tokens is unavailable (None). - offset = self._decode_position_offsets.pop(req_id, 0) - if offset != 0 and "ropes" in meta and num_computed_tokens is not None: - meta["ropes"] = [num_computed_tokens + offset] + if num_computed_tokens is not None and "image_shape" in meta: + prefill_rope = meta["ropes"][0] if meta.get("ropes") else 0 + if num_computed_tokens > prefill_rope: + meta["ropes"] = [num_computed_tokens] return meta def prepare_runner_inputs( @@ -572,48 +559,29 @@ def prepare_runner_inputs( num_scheduled_tokens: list[int], input_ids_buffer: torch.Tensor | None = None, ) -> tuple[torch.Tensor | None, torch.Tensor | None]: - """Model-runner hook: adjust inputs before ``forward()``. - - Returns ``(input_ids, positions)`` — possibly modified. - - Two adjustments for BAGEL img2img: - - 1. **Restore input_ids** when ``inputs_embeds`` is present so that - ``_adjust_positions_for_img2img`` can locate the - ``<|fim_middle|>`` placeholder. - 2. **Decode position offset**: prefill rewrites positions to a - compact scheme (rope ≪ prefill_len). The runner assigns decode - positions from ``num_computed_tokens``, which is far too large; - apply the stored per-request offset. - """ + """Restore input_ids so _adjust_positions_for_img2img can locate + the <|fim_middle|> placeholder for thinking-mode pre_text_len + detection.""" if inputs_embeds is not None and input_ids is None and input_ids_buffer is not None: input_ids = input_ids_buffer - - if self._decode_position_offsets and positions is not None: - token_start = 0 - for i, rid in enumerate(req_ids): - sched = num_scheduled_tokens[i] - offset = self._decode_position_offsets.get(rid, 0) - if offset != 0 and num_computed_tokens[i] > 0: - positions[token_start : token_start + sched] += offset - token_start += sched - return input_ids, positions def flush_pending_metadata(self, req_ids: list[str]) -> None: - """Map pending metadata (batch order) to req_ids after forward().""" + """Map pending metadata (batch order) to req_ids after forward(). + + Guard: if a request already has metadata with ``image_shape`` + (written during img2img prefill), don't overwrite it with + decode-step metadata that lacks ``image_shape``. 
+ """ pending = self._ropes_pending self._ropes_pending = [] for i, meta in enumerate(pending): if i < len(req_ids): - if req_ids[i] not in self._ropes_metadata: - self._ropes_metadata[req_ids[i]] = meta - - pending_offsets = self._pending_decode_offsets - self._pending_decode_offsets = [] - for i, offset in enumerate(pending_offsets): - if i < len(req_ids) and offset != 0: - self._decode_position_offsets[req_ids[i]] = offset + rid = req_ids[i] + existing = self._ropes_metadata.get(rid) + if existing and "image_shape" in existing and "image_shape" not in meta: + continue + self._ropes_metadata[rid] = meta def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: mm_input_by_modality = {} @@ -727,16 +695,7 @@ def _process_img2img_input(self, multimodal_input): num_vit = vit_emb.shape[0] + 2 info = (num_vae, num_vit, int(H), int(W)) self._pending_img2img_info.append(info) - # Only the gen (main) request should add a companion queue entry. - # Companion requests (cfg_text, cfg_img) also call this method with - # the same image, so guard by checking whether this exact info - # tuple is already enqueued. For batched img2img with multiple - # concurrent gen requests this correctly adds one entry per unique - # image; images with identical (num_vae, num_vit, H, W) that arrive - # in the same batch are indistinguishable here and will share one - # entry, but that is an uncommon edge case. - if not any(entry[0] == info for entry in self._cfg_companion_queue): - self._cfg_companion_queue.append((info, 2)) # cfg_text + cfg_img + self._last_img2img_info = info return tuple(results) @@ -755,31 +714,18 @@ def forward( positions = self._adjust_positions_for_img2img(positions, input_ids) use_mot = True - elif self._cfg_companion_queue: - # Guard: if this looks like a pure decode step (small token count, - # no multimodal embeddings), the queue has stale entries from a - # previous prefill cycle — clear them instead of consuming. 
- if inputs_embeds is None and seq_len <= 2: - self._cfg_companion_queue.clear() - else: - cached, remaining = self._cfg_companion_queue[0] - remaining -= 1 - num_vae, num_vit, img_H, img_W = cached - num_img2img = num_vae + 1 + num_vit # +1 separator - seq_len = inputs_embeds.shape[0] if inputs_embeds is not None else positions.shape[0] - - if inputs_embeds is not None and seq_len >= num_img2img: - self._pending_img2img_info = [cached] - positions = self._adjust_positions_for_img2img(positions, input_ids) - use_mot = True - else: - rope = int(positions[seq_len - 1].item()) + 1 - self._ropes_pending.append({"ropes": [rope]}) + elif self._last_img2img_info is not None: + info = self._last_img2img_info + num_vae, num_vit, _, _ = info + num_img2img = num_vae + 1 + num_vit - if remaining == 0: - self._cfg_companion_queue.popleft() - else: - self._cfg_companion_queue[0] = (cached, remaining) + if seq_len >= num_img2img: + self._pending_img2img_info = [info] + positions = self._adjust_positions_for_img2img(positions, input_ids) + use_mot = True + else: + rope = int(positions[seq_len - 1].item()) + 1 + self._ropes_pending.append({"ropes": [rope]}) if use_mot: return self._mot_forward(input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs) @@ -790,27 +736,18 @@ def _adjust_positions_for_img2img( positions: torch.Tensor, input_ids: torch.Tensor | None = None, ) -> torch.Tensor: - """Rewrite position IDs to match the original BAGEL position scheme: - - If there are ``pre_text_len`` text tokens before the img2img block:: - - pre_text → 0, 1, ..., M-1 - VAE → M (all share) - separator→ M - ViT → M+1 (all share) - post_text→ M+2, M+3, ... + """Rewrite position IDs for img2img. - When no text precedes the img2img block (M=0), this reduces to the - simpler scheme: VAE→0, ViT→1, text→2, 3, ... + Supports an optional ``pre_text_len`` prefix (thinking-mode) detected + via the ``<|fim_middle|>`` token in *input_ids*: - Also computes ``self._vae_token_mask`` (bool tensor, True for actual - VAE latent patches that should use gen-mode weights) and pushes - per-request ropes + image_shape to the FIFO consumed by - ``get_kv_transfer_metadata``. + pre_text -> 0 .. M-1 + VAE -> M (all share) + separator-> M + ViT -> M+1 (all share) + post_text-> M+2, M+3, ... - For img2img requests, also stores a decode position offset so that - subsequent autoregressive decode steps use positions that continue - from the rewritten scheme rather than from the original prefill length. + When M=0 (standard img2img) this reduces to VAE->0, ViT->1, text->2.. """ info_list = self._pending_img2img_info self._pending_img2img_info = [] @@ -836,70 +773,64 @@ def _adjust_positions_for_img2img( req_len = end - start if img2img_idx < len(info_list): - num_vae, num_vit, img_H, img_W = info_list[img2img_idx] + cur_info = info_list[img2img_idx] + elif self._last_img2img_info is not None: + cur_info = self._last_img2img_info + else: + cur_info = None + + if cur_info is not None: + num_vae, num_vit, img_H, img_W = cur_info num_img2img = num_vae + 1 + num_vit # +1 separator if req_len >= num_img2img: - # Detect offset of img2img tokens within this request - # by searching for the img2img placeholder token ID. 
pre_text_len = 0 if input_ids is not None: - req_ids = input_ids[start:end] - mask = req_ids == self._img2img_token_id - indices = mask.nonzero(as_tuple=True)[0] + req_ids_slice = input_ids[start:end] + indices = (req_ids_slice == self._img2img_token_id).nonzero(as_tuple=True)[0] if indices.numel() > 0: pre_text_len = int(indices[0].item()) - img_start = start + pre_text_len + M = pre_text_len + img_start = start + M post_text_start = img_start + num_img2img - # pre_text_pos: position base for image tokens - pre_text_pos = pre_text_len - # Pre-image text: sequential positions 0..pre_text_pos-1 - if pre_text_len > 0: + if M > 0: new_positions[start:img_start] = torch.arange( - 0, pre_text_pos, device=positions.device, dtype=positions.dtype + 0, M, device=positions.device, dtype=positions.dtype ) - # VAE tokens: all share position pre_text_pos - new_positions[img_start : img_start + num_vae] = pre_text_pos - # Separator: position pre_text_pos - new_positions[img_start + num_vae] = pre_text_pos - # ViT tokens: all share position pre_text_pos+1 + new_positions[img_start : img_start + num_vae] = M + new_positions[img_start + num_vae] = M # separator vit_start = img_start + num_vae + 1 - new_positions[vit_start : vit_start + num_vit] = pre_text_pos + 1 + new_positions[vit_start : vit_start + num_vit] = M + 1 - # Post-image text: sequential positions pre_text_pos+2, pre_text_pos+3, ... num_post_text = end - post_text_start if num_post_text > 0: new_positions[post_text_start:end] = torch.arange( - pre_text_pos + 2, - pre_text_pos + 2 + num_post_text, + M + 2, + M + 2 + num_post_text, device=positions.device, dtype=positions.dtype, ) - # VAE gen-mode mask: only actual VAE latent patches (not markers) - vae_patches_start = img_start + 1 # skip start_marker - vae_patches_end = img_start + num_vae - 1 # before end_marker + vae_patches_start = img_start + 1 + vae_patches_end = img_start + num_vae - 1 if vae_patches_end > vae_patches_start: vae_mask[vae_patches_start:vae_patches_end] = True - rope = pre_text_pos + 2 + num_post_text + rope = M + 2 + num_post_text self._ropes_pending.append( { "ropes": [rope], "image_shape": [img_H, img_W], } ) - decode_offset = rope - req_len - self._pending_decode_offsets.append(decode_offset) img2img_idx += 1 continue rope = int(new_positions[end - 1].item()) + 1 self._ropes_pending.append({"ropes": [rope]}) - self._pending_decode_offsets.append(0) self._vae_token_mask = vae_mask if vae_mask.any() else None return new_positions diff --git a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py index 138948064b..ffb997048b 100644 --- a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py +++ b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py @@ -149,7 +149,15 @@ def execute_model( encoder_cache=self.encoder_cache, ) as ec_connector_output: self._execute_mm_encoder(scheduler_output) - return make_empty_encoder_model_runner_output(scheduler_output) + + kv_ids = self.kv_extracted_req_ids + self.kv_extracted_req_ids = None + + output = make_empty_encoder_model_runner_output(scheduler_output) + if kv_ids: + output = copy(output) + output.kv_extracted_req_ids = kv_ids + return output if not num_scheduled_tokens: if ( @@ -163,10 +171,20 @@ def execute_model( # dummy run to ensure coordinate_batch_across_dp # is called into to avoid out of sync issues. 
self._dummy_run(1) + + kv_ids = self.kv_extracted_req_ids + self.kv_extracted_req_ids = None + if not has_kv_transfer_group(): - # Return empty ModelRunnerOutput if no work to do. - return EMPTY_MODEL_RUNNER_OUTPUT - return self.kv_connector_no_forward(scheduler_output, self.vllm_config) + output = EMPTY_MODEL_RUNNER_OUTPUT + else: + output = self.kv_connector_no_forward(scheduler_output, self.vllm_config) + + if kv_ids: + output = copy(output) + output.kv_extracted_req_ids = kv_ids + + return output if self.cache_config.kv_sharing_fast_prefill: assert not self.num_prompt_logprobs, ( "--kv-sharing-fast-prefill produces incorrect " diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 01ec23acb4..554ac6355d 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -205,24 +205,39 @@ def execute_model( encoder_cache=self.encoder_cache, ) as ec_connector_output: self._execute_mm_encoder(scheduler_output) - return make_empty_encoder_model_runner_output(scheduler_output) + + kv_ids = self.kv_extracted_req_ids + self.kv_extracted_req_ids = None + + output = make_empty_encoder_model_runner_output(scheduler_output) + if kv_ids: + output = copy(output) + output.kv_extracted_req_ids = kv_ids + return output if not num_scheduled_tokens: if ( self.parallel_config.distributed_executor_backend == "external_launcher" and self.parallel_config.data_parallel_size > 1 ): - # this is a corner case when both external launcher - # and DP are enabled, num_scheduled_tokens could be - # 0, and has_unfinished_requests in the outer loop - # returns True. before returning early here we call - # dummy run to ensure coordinate_batch_across_dp - # is called into to avoid out of sync issues. self._dummy_run(1) + + # Capture KV extraction results before early return; + # sample_tokens() is skipped on this path so the IDs + # would otherwise be silently overwritten next step. + kv_ids = self.kv_extracted_req_ids + self.kv_extracted_req_ids = None + if not has_kv_transfer_group(): - # Return empty ModelRunnerOutput if no work to do. 
- return EMPTY_MODEL_RUNNER_OUTPUT - return self.kv_connector_no_forward(scheduler_output, self.vllm_config) + output = EMPTY_MODEL_RUNNER_OUTPUT + else: + output = self.kv_connector_no_forward(scheduler_output, self.vllm_config) + + if kv_ids: + output = copy(output) + output.kv_extracted_req_ids = kv_ids + + return output if self.cache_config.kv_sharing_fast_prefill: assert not self.num_prompt_logprobs, ( From cb4d13a65806d18337628da0768539ba97c6cd4d Mon Sep 17 00:00:00 2001 From: Sy03 <1370724210@qq.com> Date: Mon, 13 Apr 2026 12:53:35 +0800 Subject: [PATCH 139/204] [Perf][Fish Speech] Enable CUDA Graph capture for Fast AR code predictor (#2520) Signed-off-by: Sy03 <1370724210@qq.com> --- .../models/fish_speech/fish_speech_fast_ar.py | 22 +++++-- .../models/fish_speech/fish_speech_slow_ar.py | 39 ++++++------ vllm_omni/worker/gpu_ar_model_runner.py | 62 +++++++++++++++++++ vllm_omni/worker/gpu_model_runner.py | 6 +- 4 files changed, 99 insertions(+), 30 deletions(-) diff --git a/vllm_omni/model_executor/models/fish_speech/fish_speech_fast_ar.py b/vllm_omni/model_executor/models/fish_speech/fish_speech_fast_ar.py index 8bbb643ebe..22a2744ff5 100644 --- a/vllm_omni/model_executor/models/fish_speech/fish_speech_fast_ar.py +++ b/vllm_omni/model_executor/models/fish_speech/fish_speech_fast_ar.py @@ -310,6 +310,7 @@ def __init__( self._compiled_model_fwd: object | None = None self._compile_attempted = False self._compile_failed = False + self._disable_compile_for_graph = False def _ensure_buffers(self, bsz: int, device: torch.device, dtype: torch.dtype) -> None: max_seq = self._num_codebooks + 1 # hidden_state + num_codebooks codes @@ -327,11 +328,20 @@ def _setup_compile(self) -> None: if self._compile_attempted: return self._compile_attempted = True + if self._disable_compile_for_graph: + try: + self._compiled_model_fwd = torch.compile( + self.model.forward, + dynamic=True, + options={"epilogue_fusion": False}, + ) + except Exception as exc: + logger.warning("Fast AR torch.compile (graph mode) failed: %s", exc) + self._compiled_model_fwd = self.model.forward + return try: self._compiled_model_fwd = torch.compile( self.model.forward, - # Keep the helper compiler separate from vLLM's outer - # cudagraph-managed Stage-0 execution. mode="default", dynamic=True, fullgraph=False, @@ -366,10 +376,10 @@ def warmup_compile( @torch.inference_mode() def _run_model(self, step_input: torch.Tensor, step_pos_ids: torch.Tensor, bsz: int) -> torch.Tensor: - # Default-on compile only pays off for single-request decode. For - # batched decode, eager preserves loaded throughput and avoids the - # regression seen with batch>1 compiled execution. 
- model_fwd = self._compiled_model_fwd if bsz == 1 else self.model.forward + if self._disable_compile_for_graph: + model_fwd = self._compiled_model_fwd or self.model.forward + else: + model_fwd = self._compiled_model_fwd if bsz == 1 else self.model.forward try: return model_fwd(step_input, step_pos_ids) except Exception as exc: diff --git a/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py b/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py index 3813597caa..62776cbb31 100644 --- a/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py +++ b/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py @@ -194,6 +194,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.has_postprocess = True self.mtp_hidden_size = int(self.text_config.hidden_size) self.talker_mtp_output_key = "audio_codes" + self.talker_mtp_graph_safe = True self.gpu_resident_buffer_keys: set[str] = {"last_slow_ar_hidden"} # Qwen3 transformer backbone. @@ -236,6 +237,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): slow_ar_config=self.text_config, prefix="fast_ar", ) + if self.talker_mtp_graph_safe: + self.fast_ar._disable_compile_for_graph = True # Constant logit mask: allow only semantic tokens + im_end. vocab = int(self.text_config.vocab_size) @@ -680,18 +683,13 @@ def talker_mtp( inputs_embeds_out = input_embeds.reshape(bsz, -1).clone() semantic_mask = (input_ids[:, 0] >= self._semantic_begin_id) & (input_ids[:, 0] <= self._semantic_end_id) - if semantic_mask.any(): - semantic_codes = audio_codes[semantic_mask].clamp(min=0) - offsets = ( - torch.arange(self._num_codebooks, device=dev, dtype=semantic_codes.dtype) * self._codebook_size - ).unsqueeze(0) - codebook_sum = self.codebook_embeddings(semantic_codes + offsets).sum(dim=1).to(dtype=torch.bfloat16) - - # Normalize by sqrt(num_codebooks + 1) as in the reference model - # (scale_codebook_embeddings=True for fish_qwen3_omni). 
- inputs_embeds_out[semantic_mask] = (inputs_embeds_out[semantic_mask] + codebook_sum) / math.sqrt( - self._num_codebooks + 1 - ) + semantic_codes = audio_codes.clamp(min=0, max=self._codebook_size - 1) + offsets = ( + torch.arange(self._num_codebooks, device=dev, dtype=semantic_codes.dtype) * self._codebook_size + ).unsqueeze(0) + codebook_sum = self.codebook_embeddings(semantic_codes + offsets).sum(dim=1).to(dtype=torch.bfloat16) + norm_embeds = (inputs_embeds_out + codebook_sum) / math.sqrt(self._num_codebooks + 1) + inputs_embeds_out = torch.where(semantic_mask.unsqueeze(-1), norm_embeds, inputs_embeds_out) return inputs_embeds_out, audio_codes.to(dtype=torch.long) @@ -802,14 +800,15 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if truncated: logger.info("Truncated %d RoPE cos_sin_cache buffers to bf16 precision", truncated) - try: - self.fast_ar.warmup_compile( - device=self.codebook_embeddings.weight.device, - dtype=torch.bfloat16, - batch_sizes=(1,), - ) - except Exception as exc: - logger.warning("Fish Speech Fast AR compile warmup failed: %s", exc) + if not getattr(self, "talker_mtp_graph_safe", False): + try: + self.fast_ar.warmup_compile( + device=self.codebook_embeddings.weight.device, + dtype=torch.bfloat16, + batch_sizes=(1,), + ) + except Exception as exc: + logger.warning("Fish Speech Fast AR compile warmup failed: %s", exc) codec_device = self.codebook_embeddings.weight.device _load_dac_codec( diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 554ac6355d..72e745fb17 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -138,6 +138,68 @@ def _sampling_metadata_for_model_sampler(self, sampling_metadata): return sampling_metadata return replace(sampling_metadata, output_token_ids=output_token_ids) + def capture_model(self) -> int: + result = super().capture_model() + self._capture_talker_mtp_graphs() + return result + + def _capture_talker_mtp_graphs(self) -> None: + from vllm_omni.worker.gpu_model_runner import CUDAGraphWrapper + + if not self.has_talker_mtp or not isinstance(self.talker_mtp, CUDAGraphWrapper): + return + + from vllm.compilation.monitor import set_cudagraph_capturing_enabled + from vllm.distributed.parallel_state import graph_capture + + capture_sizes = self.compilation_config.cudagraph_capture_sizes + num_warmups = self.compilation_config.cudagraph_num_of_warmups + capture_sizes = sorted(capture_sizes, reverse=True) + logger.info("Capturing talker_mtp graphs for sizes %s", capture_sizes) + + set_cudagraph_capturing_enabled(True) + try: + with torch.inference_mode(), graph_capture(device=self.device): + for bsz in capture_sizes: + _, batch_desc, _, _, _ = self._determine_batch_execution_and_padding( + num_tokens=bsz, + num_reqs=bsz, + num_scheduled_tokens_np=np.ones(bsz, dtype=np.int32), + max_num_scheduled_tokens=1, + use_cascade_attn=False, + ) + n = batch_desc.num_tokens + ids = self.talker_mtp_input_ids.gpu[:n] + emb = self.talker_mtp_inputs_embeds.gpu[:n] + hid = self.last_talker_hidden.gpu[:n] + ts = self.text_step.gpu[:n] + + for _ in range(num_warmups): + with set_forward_context( + None, + self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.NONE, + batch_descriptor=batch_desc, + ): + self.talker_mtp(ids, emb, hid, ts) + + with set_forward_context( + None, + self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.FULL, + batch_descriptor=batch_desc, + ): + self.talker_mtp(ids, emb, hid, ts) + 
torch.cuda.synchronize() + + logger.info("Captured talker_mtp graphs for %d sizes", len(capture_sizes)) + except RuntimeError as e: + raise RuntimeError( + f"talker_mtp graph capture failed for a model that declared talker_mtp_graph_safe=True: {e}" + ) from e + finally: + set_cudagraph_capturing_enabled(False) + @torch.inference_mode() def execute_model( self, diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 35e1598435..1f678b579f 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -83,11 +83,9 @@ def load_model(self, *args, **kwargs) -> None: self.has_talker_mtp = True cudagraph_mode = self.compilation_config.cudagraph_mode assert cudagraph_mode is not None - # Only wrap talker_mtp in CUDAGraphWrapper for Omni models that - # have a separate .talker sub-module. TTS models' code predictor - # has internal AR loops / torch.multinomial — not graph-safe. has_separate_talker = getattr(self.model, "talker", None) is not None - if cudagraph_mode.has_full_cudagraphs() and has_separate_talker: + talker_mtp_graph_safe = getattr(self.model, "talker_mtp_graph_safe", False) + if cudagraph_mode.has_full_cudagraphs() and (has_separate_talker or talker_mtp_graph_safe): self.talker_mtp = CUDAGraphWrapper(talker_mtp, self.vllm_config, runtime_mode=CUDAGraphMode.FULL) # TTS exposes mtp_hidden_size; Omni uses hf_text_config.hidden_size. hidden_size = int( From 8097747a5dc0d90f267050ae4b77d53bbaea88ae Mon Sep 17 00:00:00 2001 From: Jiaqian Liu <61532106+Celeste-jq@users.noreply.github.com> Date: Mon, 13 Apr 2026 14:20:04 +0800 Subject: [PATCH 140/204] [Model] Adapt Wan2.2-I2V-A14B via LightX2V offline conversion path (#2134) Signed-off-by: Celeste-jq <591998922@qq.com> Co-authored-by: Canlin Guo --- docs/user_guide/diffusion/lora.md | 86 ++++ .../offline_inference/image_to_video.md | 6 +- .../image_to_video/README.md | 6 +- .../image_to_video/image_to_video.py | 13 + .../online_serving/image_to_video/README.md | 49 +++ .../image_to_video/run_curl_image_to_video.sh | 5 + .../openai_api/test_video_server.py | 22 + tools/wan22/assemble_wan22_i2v_diffusers.py | 385 ++++++++++++++++++ .../models/wan2_2/pipeline_wan2_2.py | 58 ++- .../models/wan2_2/pipeline_wan2_2_i2v.py | 21 +- .../models/wan2_2/pipeline_wan2_2_ti2v.py | 21 +- .../models/wan2_2/scheduling_wan_euler.py | 147 +++++++ .../models/wan2_2/wan2_2_transformer.py | 8 + vllm_omni/engine/async_omni_engine.py | 2 + 14 files changed, 804 insertions(+), 25 deletions(-) create mode 100644 tools/wan22/assemble_wan22_i2v_diffusers.py create mode 100644 vllm_omni/diffusion/models/wan2_2/scheduling_wan_euler.py diff --git a/docs/user_guide/diffusion/lora.md b/docs/user_guide/diffusion/lora.md index e45c033b84..256698752a 100644 --- a/docs/user_guide/diffusion/lora.md +++ b/docs/user_guide/diffusion/lora.md @@ -56,6 +56,92 @@ outputs = omni.generate( !!! note "Server-side Path Requirement" The LoRA adapter path (`local_path`) must be readable on the **server** machine. If your client and server are on different machines, ensure the LoRA adapter is accessible via a shared mount or copied to the server. +## Wan2.2 LightX2V Offline Assembly + +This workflow is LoRA-adjacent: it uses external LightX2V conversion plus +`Wan2.2-Distill-Loras` to bake converted Wan2.2 I2V checkpoints into a local +Diffusers directory, instead of loading LoRA adapters at runtime. 
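Before serving or running inference on an assembled directory (Step 3 below), a quick structural check can catch a bad `--output-dir` early. This is a minimal sketch, not part of the tooling; it only re-checks the layout that `tools/wan22/assemble_wan22_i2v_diffusers.py` validates, and the path is a placeholder.

```python
from pathlib import Path

# Placeholder path -- point this at the directory produced by Step 2.
root = Path("/path/to/Wan2.2-I2V-A14B-Custom-Diffusers")

# Layout checked by tools/wan22/assemble_wan22_i2v_diffusers.py.
required_files = ["model_index.json", "transformer/config.json", "transformer_2/config.json"]
required_dirs = ["tokenizer", "text_encoder", "vae", "transformer", "transformer_2"]

missing = [p for p in required_files if not (root / p).is_file()]
missing += [d for d in required_dirs if not (root / d).is_dir()]
if missing:
    raise SystemExit(f"Assembled directory is incomplete, missing: {missing}")
print("Assembled directory looks structurally complete.")
```
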
+ +### Required assets + +- Base model: `Wan-AI/Wan2.2-I2V-A14B` +- Diffusers skeleton: `Wan-AI/Wan2.2-I2V-A14B-Diffusers` +- Optional external converter from the LightX2V project (not shipped in this repository) +- Optional LoRA weights: `lightx2v/Wan2.2-Distill-Loras` + +### Step 1: Optional - convert high/low-noise DiT weights with LightX2V + +Install or clone LightX2V from the upstream repository +(`https://github.com/ModelTC/LightX2V`). After cloning, the converter used +below is available at `/tools/convert/converter.py`. + +```bash +python /path/to/lightx2v/tools/convert/converter.py \ + --source /path/to/Wan2.2-I2V-A14B/high_noise_model \ + --output /tmp/wan22_lightx2v/high_noise_out \ + --output_ext .safetensors \ + --output_name diffusion_pytorch_model \ + --model_type wan_dit \ + --direction forward \ + --lora_path /path/to/wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step_1022.safetensors \ + --lora_key_convert auto \ + --single_file + +python /path/to/lightx2v/tools/convert/converter.py \ + --source /path/to/Wan2.2-I2V-A14B/low_noise_model \ + --output /tmp/wan22_lightx2v/low_noise_out \ + --output_ext .safetensors \ + --output_name diffusion_pytorch_model \ + --model_type wan_dit \ + --direction forward \ + --lora_path /path/to/wan2.2_i2v_A14b_low_noise_lora_rank64_lightx2v_4step_1022.safetensors \ + --lora_key_convert auto \ + --single_file +``` + +If you are not using LightX2V, skip this step and either keep the original +Diffusers weights from the skeleton or point Step 2 at any other converted +`transformer/` and `transformer_2/` checkpoints. + +### Step 2: Assemble a final Diffusers-style directory + +```bash +python tools/wan22/assemble_wan22_i2v_diffusers.py \ + --diffusers-skeleton /path/to/Wan2.2-I2V-A14B-Diffusers \ + --transformer-weight /tmp/wan22_lightx2v/high_noise_out \ + --transformer-2-weight /tmp/wan22_lightx2v/low_noise_out \ + --output-dir /path/to/Wan2.2-I2V-A14B-Custom-Diffusers \ + --asset-mode symlink \ + --overwrite +``` + +`--transformer-weight` and `--transformer-2-weight` are optional. If you omit +them, the tool keeps the original weights from the Diffusers skeleton. + +### Step 3: Run offline inference + +```bash +python examples/offline_inference/image_to_video/image_to_video.py \ + --model /path/to/Wan2.2-I2V-A14B-Custom-Diffusers \ + --image /path/to/input.jpg \ + --prompt "A cat playing with yarn" \ + --num-frames 81 \ + --num-inference-steps 4 \ + --tensor-parallel-size 4 \ + --height 480 \ + --width 832 \ + --flow-shift 12 \ + --sample-solver euler \ + --guidance-scale 1.0 \ + --guidance-scale-high 1.0 \ + --boundary-ratio 0.875 +``` + +Notes: + +- This route avoids runtime LoRA loading changes in vLLM-Omni when you choose to bake converted weights into a local Diffusers directory. +- Output quality and speed depend on the replacement checkpoints and sampling params you choose. + ## See Also diff --git a/docs/user_guide/examples/offline_inference/image_to_video.md b/docs/user_guide/examples/offline_inference/image_to_video.md index 7a750aeff3..6e105741a7 100644 --- a/docs/user_guide/examples/offline_inference/image_to_video.md +++ b/docs/user_guide/examples/offline_inference/image_to_video.md @@ -62,12 +62,13 @@ Key arguments: - `--negative-prompt`: Optional list of artifacts to suppress. - `--boundary-ratio`: Boundary split ratio for two-stage MoE models. - `--flow-shift`: Scheduler flow shift (5.0 for 720p, 12.0 for 480p). +- `--sample-solver`: Wan2.2 sampling solver. 
Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints. - `--num-inference-steps`: Number of denoising steps (default 50). - `--fps`: Frames per second for the saved MP4 (requires `diffusers` export_to_video). - `--output`: Path to save the generated video. - `--vae-use-slicing`: Enable VAE slicing for memory optimization. - `--vae-use-tiling`: Enable VAE tiling for memory optimization. -- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism_acceleration.md#cfg-parallel). +- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism/cfg_parallel.md). - `--tensor-parallel-size`: tensor parallel size (effective for models that support TP, e.g. LTX2). - `--enable-cpu-offload`: enable CPU offloading for diffusion models. - `--use-hsdp`: Enable Hybrid Sharded Data Parallel to shard model weights across GPUs. @@ -78,6 +79,9 @@ Key arguments: > ℹ️ If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage. +For Wan2.2 LightX2V-converted local Diffusers directories and related LoRA +assets, see the [LoRA guide](../../diffusion/lora.md#wan22-lightx2v-offline-assembly). + ## Example materials ??? abstract "image_to_video.py" diff --git a/examples/offline_inference/image_to_video/README.md b/examples/offline_inference/image_to_video/README.md index 2692c76df2..a458850a02 100644 --- a/examples/offline_inference/image_to_video/README.md +++ b/examples/offline_inference/image_to_video/README.md @@ -59,12 +59,13 @@ Key arguments: - `--negative-prompt`: Optional list of artifacts to suppress. - `--boundary-ratio`: Boundary split ratio for two-stage MoE models. - `--flow-shift`: Scheduler flow shift (5.0 for 720p, 12.0 for 480p). +- `--sample-solver`: Wan2.2 sampling solver. Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints. - `--num-inference-steps`: Number of denoising steps (default 50). - `--fps`: Frames per second for the saved MP4 (requires `diffusers` export_to_video). - `--output`: Path to save the generated video. - `--vae-use-slicing`: Enable VAE slicing for memory optimization. - `--vae-use-tiling`: Enable VAE tiling for memory optimization. -- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](../../../docs/user_guide/diffusion/parallelism_acceleration.md#cfg-parallel). +- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism/cfg_parallel.md). - `--tensor-parallel-size`: tensor parallel size (effective for models that support TP, e.g. LTX2). - `--enable-cpu-offload`: enable CPU offloading for diffusion models. - `--use-hsdp`: Enable Hybrid Sharded Data Parallel to shard model weights across GPUs. @@ -74,3 +75,6 @@ Key arguments: > ℹ️ If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage. + +For Wan2.2 LightX2V-converted local Diffusers directories and related LoRA +assets, see the [LoRA guide](../../../docs/user_guide/diffusion/lora.md#wan22-lightx2v-offline-assembly). 
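
If you drive the engine from your own Python code rather than this example script, the solver choice and flow shift travel through the same `extra_args` dictionary that `image_to_video.py` builds from its CLI flags. A minimal sketch of just that dictionary follows; the surrounding sampling-params construction is model-specific, so see the example script for the exact call it is passed to.

```python
# Solver settings forwarded to the Wan2.2 pipeline via `extra_args`,
# mirroring what image_to_video.py assembles from --sample-solver / --flow-shift.
extra_args = {
    "sample_solver": "euler",  # "unipc" (default) or "euler" for Lightning/Distill checkpoints
    "flow_shift": 12.0,        # 12.0 for 480p, 5.0 for 720p
}
```
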
diff --git a/examples/offline_inference/image_to_video/image_to_video.py b/examples/offline_inference/image_to_video/image_to_video.py index 7e7cfbf84e..53319c8221 100644 --- a/examples/offline_inference/image_to_video/image_to_video.py +++ b/examples/offline_inference/image_to_video/image_to_video.py @@ -84,6 +84,13 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--flow-shift", type=float, default=5.0, help="Scheduler flow_shift (5.0 for 720p, 12.0 for 480p)." ) + parser.add_argument( + "--sample-solver", + type=str, + default="unipc", + choices=["unipc", "euler"], + help="Sampling solver for Wan2.2 pipelines. Use 'euler' for Lightning/Distill setups.", + ) parser.add_argument("--output", type=str, default="i2v_output.mp4", help="Path to save the video (mp4).") parser.add_argument("--fps", type=int, default=None, help="Frames per second for the output video.") parser.add_argument( @@ -305,6 +312,7 @@ def main(): print(f" Model: {args.model}") print(f" Inference steps: {args.num_inference_steps}") print(f" Frames: {args.num_frames}") + print(f" Solver: {args.sample_solver}") print( f" Parallel configuration: cfg_parallel_size={args.cfg_parallel_size}," f" tensor_parallel_size={args.tensor_parallel_size}, vae_patch_parallel_size={args.vae_patch_parallel_size}" @@ -326,9 +334,14 @@ def main(): generator=generator, guidance_scale=guidance_scale, guidance_scale_2=args.guidance_scale_high, + boundary_ratio=args.boundary_ratio, num_inference_steps=num_inference_steps, num_frames=num_frames, frame_rate=frame_rate, + extra_args={ + "sample_solver": args.sample_solver, + "flow_shift": args.flow_shift, + }, ), ) generation_end = time.perf_counter() diff --git a/examples/online_serving/image_to_video/README.md b/examples/online_serving/image_to_video/README.md index 49283bd9a0..285eeb2798 100644 --- a/examples/online_serving/image_to_video/README.md +++ b/examples/online_serving/image_to_video/README.md @@ -26,6 +26,23 @@ The script allows overriding: - `CACHE_BACKEND` (default: `none`) - `ENABLE_CACHE_DIT_SUMMARY` (default: `0`) +### Ascend / Local LightX2V Example + +For a local Wan2.2-LightX2V Diffusers directory on Ascend/NPU, you can start the server like this: + +```bash +vllm serve /path/to/Wan2.2-I2V-A14B-LightX2V-Diffusers-Lightning \ + --omni \ + --port 8091 \ + --flow-shift 12 \ + --cfg-parallel-size 1 \ + --ulysses-degree 4 \ + --use-hsdp \ + --trust-remote-code \ + --allowed-local-media-path / \ + --seed 42 +``` + ## Async Job Behavior `POST /v1/videos` is asynchronous. It creates a video job and immediately @@ -69,10 +86,35 @@ curl -X POST http://localhost:8091/v1/videos/sync \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ + -F 'extra_params={"sample_solver":"euler"}' \ -F "seed=42" \ -o sync_i2v_output.mp4 ``` +For Wan Lightning/Distill checkpoints, pass `{"sample_solver":"euler"}` via `extra_params`. The default solver is `unipc`. + +Example matching the local LightX2V deployment above: + +```bash +curl -sS -X POST http://localhost:8091/v1/videos/sync \ + -H "Accept: video/mp4" \ + -F "prompt=A cat playing with yarn" \ + -F "input_reference=@/path/to/input.jpg" \ + -F "width=832" \ + -F "height=480" \ + -F "num_frames=81" \ + -F "fps=16" \ + -F "num_inference_steps=4" \ + -F "guidance_scale=1.0" \ + -F "guidance_scale_2=1.0" \ + -F "boundary_ratio=0.875" \ + -F "seed=42" \ + -F 'extra_params={"sample_solver":"euler"}' \ + -o ./output.mp4 +``` + +Use `/v1/videos/sync` if you want to write the MP4 directly to a file. 
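
For clients that prefer Python over curl, a minimal `requests` sketch of the same synchronous call follows. The host, port, and file paths are placeholders; the form fields mirror the curl example above.

```python
import json

import requests

# Synchronous image-to-video request against the endpoint shown above.
url = "http://localhost:8091/v1/videos/sync"
data = {
    "prompt": "A cat playing with yarn",
    "width": "832",
    "height": "480",
    "num_frames": "81",
    "fps": "16",
    "num_inference_steps": "4",
    "guidance_scale": "1.0",
    "guidance_scale_2": "1.0",
    "boundary_ratio": "0.875",
    "seed": "42",
    # Lightning/Distill checkpoints: forward the solver via extra_params.
    "extra_params": json.dumps({"sample_solver": "euler"}),
}
with open("/path/to/input.jpg", "rb") as f:  # placeholder input image
    resp = requests.post(url, data=data, files={"input_reference": f}, timeout=600)
resp.raise_for_status()
with open("output.mp4", "wb") as out:
    out.write(resp.content)
```
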
`POST /v1/videos` is async and returns job metadata, not inline `b64_json`. + ## Storage Generated video files are stored on local disk by the async video API. @@ -96,6 +138,9 @@ export VLLM_OMNI_STORAGE_MAX_CONCURRENCY=8 # Basic image-to-video generation bash run_curl_image_to_video.sh +# Wan Lightning/Distill checkpoints +SAMPLE_SOLVER=euler bash run_curl_image_to_video.sh + # Or execute directly (OpenAI-style multipart) create_response=$(curl -s http://localhost:8091/v1/videos \ -H "Accept: application/json" \ @@ -111,6 +156,7 @@ create_response=$(curl -s http://localhost:8091/v1/videos \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ + -F 'extra_params={"sample_solver":"euler"}' \ -F "seed=42") video_id=$(echo "$create_response" | jq -r '.id') @@ -169,9 +215,12 @@ curl -X POST http://localhost:8091/v1/videos \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ + -F 'extra_params={"sample_solver":"euler"}' \ -F "seed=42" ``` +`sample_solver` is supported by Wan2.2 online serving through the existing `extra_params` field, which is merged into the pipeline `extra_args`. Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints. + ## Create Response Format `POST /v1/videos` returns a job record, not inline base64 video data. diff --git a/examples/online_serving/image_to_video/run_curl_image_to_video.sh b/examples/online_serving/image_to_video/run_curl_image_to_video.sh index f4c1496a69..6f6a6f96d5 100644 --- a/examples/online_serving/image_to_video/run_curl_image_to_video.sh +++ b/examples/online_serving/image_to_video/run_curl_image_to_video.sh @@ -7,6 +7,7 @@ INPUT_IMAGE="${INPUT_IMAGE:-../../offline_inference/image_to_video/qwen-bear.png BASE_URL="${BASE_URL:-http://localhost:8099}" OUTPUT_PATH="${OUTPUT_PATH:-wan22_i2v_output.mp4}" NEGATIVE_PROMPT="${NEGATIVE_PROMPT:-}" +SAMPLE_SOLVER="${SAMPLE_SOLVER:-}" POLL_INTERVAL="${POLL_INTERVAL:-2}" if [ ! 
-f "$INPUT_IMAGE" ]; then @@ -34,6 +35,10 @@ if [ -n "${NEGATIVE_PROMPT}" ]; then create_cmd+=(-F "negative_prompt=${NEGATIVE_PROMPT}") fi +if [ -n "${SAMPLE_SOLVER}" ]; then + create_cmd+=(-F "extra_params={\"sample_solver\":\"${SAMPLE_SOLVER}\"}") +fi + create_response="$("${create_cmd[@]}")" video_id="$(echo "${create_response}" | jq -r '.id')" if [ -z "${video_id}" ] || [ "${video_id}" = "null" ]; then diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py index 0fdee7a77a..fd7d4df60d 100644 --- a/tests/entrypoints/openai_api/test_video_server.py +++ b/tests/entrypoints/openai_api/test_video_server.py @@ -766,6 +766,28 @@ def test_extra_params_merged_with_existing_extra_args(test_client, mocker: Mocke assert captured.extra_args["zero_steps"] == 2 +def test_sample_solver_forwarded_via_extra_params(test_client, mocker: MockerFixture): + """sample_solver can be passed through existing extra_params for Wan2.2 online serving.""" + mocker.patch( + "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", + return_value="Zg==", + ) + response = test_client.post( + "/v1/videos", + data={ + "prompt": "A fox running through snow.", + "extra_params": json.dumps({"sample_solver": "euler"}), + }, + ) + + assert response.status_code == 200 + video_id = response.json()["id"] + _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value) + engine = test_client.app.state.openai_serving_video._engine_client + captured = engine.captured_sampling_params_list[0] + assert captured.extra_args["sample_solver"] == "euler" + + # --------------------------------------------------------------------------- # Sync endpoint tests (POST /v1/videos/sync) # --------------------------------------------------------------------------- diff --git a/tools/wan22/assemble_wan22_i2v_diffusers.py b/tools/wan22/assemble_wan22_i2v_diffusers.py new file mode 100644 index 0000000000..8e14ca3c26 --- /dev/null +++ b/tools/wan22/assemble_wan22_i2v_diffusers.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python3 +""" +Assemble a Wan2.2-I2V-A14B-Diffusers-style model directory using a Diffusers +skeleton and optional replacement transformer checkpoints. + +This tool does NOT run any external conversion step. 
You can use it in two +ways: +- keep the original weights from the Diffusers skeleton +- replace transformer/transformer_2 with converted checkpoints such as + LightX2V outputs +- use legacy LightX2V arg names (--high-noise-weight/--low-noise-weight), + which are accepted as aliases + +Typical use: + python tools/wan22/assemble_wan22_i2v_diffusers.py \ + --diffusers-skeleton /path/to/Wan2.2-I2V-A14B-Diffusers \ + --transformer-weight /path/to/high_noise_out/diffusion_pytorch_model.safetensors \ + --transformer-2-weight /path/to/low_noise_out/diffusion_pytorch_model.safetensors \ + --output-dir /path/to/Wan2.2-I2V-A14B-Custom-Diffusers +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +from dataclasses import dataclass +from pathlib import Path + +WEIGHT_CANDIDATES = ( + "diffusion_pytorch_model.safetensors", + "diffusion_pytorch_model.bin", + "diffusion_pytorch_model.pt", + "model.safetensors", + "pytorch_model.bin", + "model.pt", +) +WEIGHT_INDEX_CANDIDATES = ( + "diffusion_pytorch_model.safetensors.index.json", + "model.safetensors.index.json", + "pytorch_model.bin.index.json", +) + +ROOT_REQUIRED_FILES = ("model_index.json",) +ROOT_REQUIRED_DIRS = ("tokenizer", "text_encoder", "vae", "transformer", "transformer_2") +OPTIONAL_DIRS = ("image_encoder", "image_processor", "scheduler", "feature_extractor") + + +class AssembleError(RuntimeError): + pass + + +@dataclass(frozen=True) +class WeightSpec: + kind: str # "single" | "sharded" + single_file: Path | None = None + index_file: Path | None = None + shard_files: tuple[Path, ...] = () + + +def _load_shard_files_from_index(index_file: Path, role: str) -> tuple[Path, ...]: + try: + with index_file.open(encoding="utf-8") as f: + payload = json.load(f) + except Exception as exc: + raise AssembleError(f"Failed to parse {role} index file: {index_file}. error={exc}") from exc + + weight_map = payload.get("weight_map") + if not isinstance(weight_map, dict) or not weight_map: + raise AssembleError(f"Invalid {role} index file (missing/empty weight_map): {index_file}") + + shard_names = sorted({str(v) for v in weight_map.values()}) + shard_paths: list[Path] = [] + missing: list[str] = [] + for shard_name in shard_names: + shard_path = index_file.parent / shard_name + if not shard_path.is_file(): + missing.append(str(shard_path)) + else: + shard_paths.append(shard_path) + + if missing: + raise AssembleError(f"{role} index references missing shard file(s): " + ", ".join(missing)) + + if not shard_paths: + raise AssembleError(f"No shard files referenced by {role} index: {index_file}") + + return tuple(shard_paths) + + +def _resolve_weight_spec(path: Path, role: str) -> WeightSpec: + if path.is_file(): + return WeightSpec(kind="single", single_file=path) + + if path.is_dir(): + for name in WEIGHT_CANDIDATES: + candidate = path / name + if candidate.is_file(): + return WeightSpec(kind="single", single_file=candidate) + + for index_name in WEIGHT_INDEX_CANDIDATES: + index_file = path / index_name + if not index_file.is_file(): + continue + shard_files = _load_shard_files_from_index(index_file, role=role) + return WeightSpec( + kind="sharded", + index_file=index_file, + shard_files=shard_files, + ) + + shard_candidates = sorted(path.glob("diffusion_pytorch_model-*.safetensors")) + if shard_candidates: + raise AssembleError( + f"Detected sharded {role} files under {path}, but index json is missing. 
" + f"Expected one of: {', '.join(WEIGHT_INDEX_CANDIDATES)}" + ) + + raise AssembleError( + f"Cannot find {role} weight under directory: {path}. " + f"Expected one of single files [{', '.join(WEIGHT_CANDIDATES)}] " + f"or sharded index files [{', '.join(WEIGHT_INDEX_CANDIDATES)}]." + ) + + raise AssembleError(f"{role} path does not exist: {path}") + + +def _canonical_weight_name(weight_file: Path) -> str: + suffix = weight_file.suffix.lower() + if suffix == ".safetensors": + return "diffusion_pytorch_model.safetensors" + if suffix == ".bin": + return "diffusion_pytorch_model.bin" + if suffix == ".pt": + return "diffusion_pytorch_model.pt" + return weight_file.name + + +def _validate_skeleton(skeleton: Path) -> None: + if not skeleton.is_dir(): + raise AssembleError(f"--diffusers-skeleton is not a directory: {skeleton}") + + for file_name in ROOT_REQUIRED_FILES: + if not (skeleton / file_name).is_file(): + raise AssembleError(f"Missing required file in skeleton: {skeleton / file_name}") + + for dir_name in ROOT_REQUIRED_DIRS: + if not (skeleton / dir_name).is_dir(): + raise AssembleError(f"Missing required directory in skeleton: {skeleton / dir_name}") + + if not (skeleton / "transformer" / "config.json").is_file(): + raise AssembleError(f"Missing transformer config: {skeleton / 'transformer/config.json'}") + + if not (skeleton / "transformer_2" / "config.json").is_file(): + raise AssembleError(f"Missing transformer_2 config: {skeleton / 'transformer_2/config.json'}") + + +def _ensure_clean_output(output_dir: Path, overwrite: bool) -> None: + if output_dir.exists(): + if not overwrite: + raise AssembleError( + f"Output directory already exists: {output_dir}. Use --overwrite to remove and recreate it." + ) + shutil.rmtree(output_dir) + output_dir.mkdir(parents=True, exist_ok=False) + + +def _copy_or_link_dir(src: Path, dst: Path, asset_mode: str) -> None: + if asset_mode == "copy": + shutil.copytree(src, dst) + elif asset_mode == "symlink": + dst.symlink_to(src, target_is_directory=True) + else: + raise AssembleError(f"Unknown asset mode: {asset_mode}") + + +def _materialize_weight(weight: WeightSpec, dst_dir: Path, role: str) -> tuple[Path, ...]: + if weight.kind == "single": + assert weight.single_file is not None + dst = dst_dir / _canonical_weight_name(weight.single_file) + shutil.copy2(weight.single_file, dst) + return (dst,) + + if weight.kind == "sharded": + assert weight.index_file is not None + copied: list[Path] = [] + index_dst = dst_dir / weight.index_file.name + shutil.copy2(weight.index_file, index_dst) + copied.append(index_dst) + for shard_file in weight.shard_files: + shard_dst = dst_dir / shard_file.name + shutil.copy2(shard_file, shard_dst) + copied.append(shard_dst) + return tuple(copied) + + raise AssembleError(f"Unknown {role} weight kind: {weight.kind}") + + +def _assemble( + skeleton: Path, + output_dir: Path, + transformer_weight: WeightSpec, + transformer_2_weight: WeightSpec, + asset_mode: str, +) -> tuple[tuple[Path, ...], tuple[Path, ...]]: + shutil.copy2(skeleton / "model_index.json", output_dir / "model_index.json") + + for dir_name in ROOT_REQUIRED_DIRS: + if dir_name in ("transformer", "transformer_2"): + continue + _copy_or_link_dir(skeleton / dir_name, output_dir / dir_name, asset_mode) + + for dir_name in OPTIONAL_DIRS: + src_dir = skeleton / dir_name + if src_dir.is_dir(): + _copy_or_link_dir(src_dir, output_dir / dir_name, asset_mode) + + (output_dir / "transformer").mkdir(parents=True, exist_ok=True) + (output_dir / "transformer_2").mkdir(parents=True, 
exist_ok=True) + + shutil.copy2(skeleton / "transformer" / "config.json", output_dir / "transformer" / "config.json") + shutil.copy2(skeleton / "transformer_2" / "config.json", output_dir / "transformer_2" / "config.json") + + transformer_copied = _materialize_weight(transformer_weight, output_dir / "transformer", role="transformer") + transformer_2_copied = _materialize_weight( + transformer_2_weight, + output_dir / "transformer_2", + role="transformer_2", + ) + + return transformer_copied, transformer_2_copied + + +def _validate_output( + output_dir: Path, + transformer_copied: tuple[Path, ...], + transformer_2_copied: tuple[Path, ...], +) -> None: + if not (output_dir / "model_index.json").is_file(): + raise AssembleError("Output validation failed: model_index.json missing") + + required_paths = ( + output_dir / "tokenizer", + output_dir / "text_encoder", + output_dir / "vae", + output_dir / "transformer" / "config.json", + output_dir / "transformer_2" / "config.json", + *transformer_copied, + *transformer_2_copied, + ) + missing = [str(p) for p in required_paths if not p.exists()] + if missing: + raise AssembleError("Output validation failed, missing: " + ", ".join(missing)) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Assemble a Wan2.2-I2V-A14B-Diffusers directory while optionally " + "replacing transformer and transformer_2 weights." + ) + ) + parser.add_argument( + "--diffusers-skeleton", + type=Path, + required=True, + help="Path to a local Wan-AI/Wan2.2-I2V-A14B-Diffusers directory.", + ) + parser.add_argument( + "--transformer-weight", + type=Path, + help=( + "Optional checkpoint file, or directory containing either a single-file " + "weight or sharded index+shards for transformer/. If omitted, keep the " + "skeleton's original transformer weights." + ), + ) + parser.add_argument( + "--transformer-2-weight", + type=Path, + help=( + "Optional checkpoint file, or directory containing either a single-file " + "weight or sharded index+shards for transformer_2/. If omitted, keep the " + "skeleton's original transformer_2 weights." + ), + ) + parser.add_argument( + "--high-noise-weight", + type=Path, + help=argparse.SUPPRESS, + ) + parser.add_argument( + "--low-noise-weight", + type=Path, + help=argparse.SUPPRESS, + ) + parser.add_argument( + "--output-dir", + type=Path, + required=True, + help="Output directory for the assembled model.", + ) + parser.add_argument( + "--asset-mode", + choices=("symlink", "copy"), + default="symlink", + help=( + "How to materialize non-transformer assets (tokenizer/text_encoder/vae/optional dirs). " + "symlink saves disk and is default." 
+ ), + ) + parser.add_argument( + "--overwrite", + action="store_true", + help="Overwrite output-dir if it exists.", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + + skeleton = args.diffusers_skeleton.resolve() + output_dir = args.output_dir.resolve() + + if args.transformer_weight is not None and args.high_noise_weight is not None: + print( + "[ERROR] --transformer-weight and --high-noise-weight are aliases; please provide only one.", + file=sys.stderr, + ) + return 2 + if args.transformer_2_weight is not None and args.low_noise_weight is not None: + print( + "[ERROR] --transformer-2-weight and --low-noise-weight are aliases; please provide only one.", + file=sys.stderr, + ) + return 2 + + transformer_weight_arg = args.transformer_weight if args.transformer_weight is not None else args.high_noise_weight + transformer_2_weight_arg = ( + args.transformer_2_weight if args.transformer_2_weight is not None else args.low_noise_weight + ) + + transformer_input = ( + transformer_weight_arg.resolve() if transformer_weight_arg is not None else skeleton / "transformer" + ) + transformer_2_input = ( + transformer_2_weight_arg.resolve() if transformer_2_weight_arg is not None else skeleton / "transformer_2" + ) + + try: + _validate_skeleton(skeleton) + transformer_weight = _resolve_weight_spec(transformer_input, role="transformer") + transformer_2_weight = _resolve_weight_spec(transformer_2_input, role="transformer_2") + + _ensure_clean_output(output_dir, overwrite=args.overwrite) + transformer_copied, transformer_2_copied = _assemble( + skeleton=skeleton, + output_dir=output_dir, + transformer_weight=transformer_weight, + transformer_2_weight=transformer_2_weight, + asset_mode=args.asset_mode, + ) + _validate_output(output_dir, transformer_copied, transformer_2_copied) + except AssembleError as exc: + print(f"[ERROR] {exc}", file=sys.stderr) + return 2 + + def _weight_summary(copied: tuple[Path, ...]) -> str: + if len(copied) == 1: + return copied[0].name + return f"{copied[0].name} + {len(copied) - 1} shard files" + + print("[OK] Assembled Wan2.2 I2V Diffusers directory:") + print(f" output_dir: {output_dir}") + print(f" transformer weight: {_weight_summary(transformer_copied)}") + print(f" transformer_2 weight: {_weight_summary(transformer_2_copied)}") + print("\nUse it with vLLM-Omni, for example:") + print(f" vllm serve {output_dir} --omni --port 8091") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py index a550e576f0..84d89619e8 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py @@ -24,6 +24,7 @@ from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin, _is_rank_zero from vllm_omni.diffusion.models.schedulers import FlowUniPCMultistepScheduler +from vllm_omni.diffusion.models.wan2_2.scheduling_wan_euler import WanEulerScheduler from vllm_omni.diffusion.models.wan2_2.wan2_2_transformer import WanTransformer3DModel from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest @@ -32,6 +33,46 @@ logger = logging.getLogger(__name__) DEBUG_PERF = False +WAN_SAMPLE_SOLVER_CHOICES = {"unipc", "euler"} + + +def build_wan_scheduler(sample_solver: str, flow_shift: 
float) -> Any: + if sample_solver == "unipc": + return FlowUniPCMultistepScheduler( + num_train_timesteps=1000, + shift=flow_shift, + prediction_type="flow_prediction", + ) + if sample_solver == "euler": + return WanEulerScheduler( + num_train_timesteps=1000, + shift=flow_shift, + ) + + raise ValueError( + f"Unsupported Wan sample_solver: {sample_solver}. Expected one of: {sorted(WAN_SAMPLE_SOLVER_CHOICES)}" + ) + + +def resolve_wan_sample_solver(req: OmniDiffusionRequest, default: str = "unipc") -> str: + extra_args = getattr(req.sampling_params, "extra_args", {}) or {} + raw = extra_args.get("sample_solver", default) + sample_solver = str(raw).strip().lower() + if sample_solver not in WAN_SAMPLE_SOLVER_CHOICES: + raise ValueError(f"Invalid sample_solver={raw!r}. Expected one of: {sorted(WAN_SAMPLE_SOLVER_CHOICES)}") + return sample_solver + + +def resolve_wan_flow_shift(req: OmniDiffusionRequest, od_config: OmniDiffusionConfig) -> float: + extra_args = getattr(req.sampling_params, "extra_args", {}) or {} + raw_flow_shift = extra_args.get("flow_shift") + if raw_flow_shift is None: + raw_flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 + + try: + return float(raw_flow_shift) + except (TypeError, ValueError) as exc: + raise ValueError(f"Invalid flow_shift={raw_flow_shift!r}. flow_shift must be a float.") from exc def retrieve_latents( @@ -296,13 +337,9 @@ def __init__( else: raise RuntimeError("No transformer loaded") - # Initialize UniPC scheduler - flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 # default for 720p - self.scheduler = FlowUniPCMultistepScheduler( - num_train_timesteps=1000, - shift=flow_shift, - prediction_type="flow_prediction", - ) + self._sample_solver = "unipc" + self._flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 + self.scheduler = build_wan_scheduler(self._sample_solver, self._flow_shift) self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4 self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8 @@ -462,6 +499,13 @@ def forward( current_omni_platform.synchronize() _t_text_enc_ms = (time.perf_counter() - _t_text_enc_start) * 1000 + sample_solver = resolve_wan_sample_solver(req, default=self._sample_solver) + flow_shift = resolve_wan_flow_shift(req, self.od_config) + if sample_solver != self._sample_solver or abs(flow_shift - self._flow_shift) > 1e-6: + self.scheduler = build_wan_scheduler(sample_solver, flow_shift) + self._sample_solver = sample_solver + self._flow_shift = flow_shift + # Timesteps self.scheduler.set_timesteps(num_steps, device=device) timesteps = self.scheduler.timesteps diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py index c05ecc9c9a..46484cd789 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py @@ -24,10 +24,12 @@ from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.interface import SupportImageInput from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin, _is_rank_zero -from vllm_omni.diffusion.models.schedulers import FlowUniPCMultistepScheduler from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import ( + build_wan_scheduler, create_transformer_from_config, load_transformer_config, + resolve_wan_flow_shift, + 
resolve_wan_sample_solver, retrieve_latents, ) from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin @@ -230,13 +232,9 @@ def __init__( else: self.transformer_2 = None - # Initialize UniPC scheduler - flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 # default for 720p - self.scheduler = FlowUniPCMultistepScheduler( - num_train_timesteps=1000, - shift=flow_shift, - prediction_type="flow_prediction", - ) + self._sample_solver = "unipc" + self._flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 + self.scheduler = build_wan_scheduler(self._sample_solver, self._flow_shift) # VAE scale factors self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if hasattr(self.vae, "config") else 4 @@ -440,6 +438,13 @@ def forward( current_omni_platform.synchronize() _t_img_enc_ms = (time.perf_counter() - _t_img_enc_start) * 1000 + sample_solver = resolve_wan_sample_solver(req, default=self._sample_solver) + flow_shift = resolve_wan_flow_shift(req, self.od_config) + if sample_solver != self._sample_solver or abs(flow_shift - self._flow_shift) > 1e-6: + self.scheduler = build_wan_scheduler(sample_solver, flow_shift) + self._sample_solver = sample_solver + self._flow_shift = flow_shift + # Timesteps self.scheduler.set_timesteps(num_steps, device=device) timesteps = self.scheduler.timesteps diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py index 261f62fb79..939fe294a3 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py @@ -36,10 +36,12 @@ from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.interface import SupportImageInput from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin -from vllm_omni.diffusion.models.schedulers import FlowUniPCMultistepScheduler from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import ( + build_wan_scheduler, create_transformer_from_config, load_transformer_config, + resolve_wan_flow_shift, + resolve_wan_sample_solver, retrieve_latents, ) from vllm_omni.diffusion.request import OmniDiffusionRequest @@ -183,13 +185,9 @@ def __init__( transformer_config = load_transformer_config(model, "transformer", local_files_only) self.transformer = create_transformer_from_config(transformer_config) - # Initialize UniPC scheduler - flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 # default for 720p - self.scheduler = FlowUniPCMultistepScheduler( - num_train_timesteps=1000, - shift=flow_shift, - prediction_type="flow_prediction", - ) + self._sample_solver = "unipc" + self._flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 + self.scheduler = build_wan_scheduler(self._sample_solver, self._flow_shift) # VAE scale factors self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if hasattr(self.vae, "config") else 4 @@ -323,6 +321,13 @@ def forward( batch_size = prompt_embeds.shape[0] + sample_solver = resolve_wan_sample_solver(req, default=self._sample_solver) + flow_shift = resolve_wan_flow_shift(req, self.od_config) + if sample_solver != self._sample_solver or abs(flow_shift - self._flow_shift) > 1e-6: + self.scheduler = build_wan_scheduler(sample_solver, flow_shift) + self._sample_solver = sample_solver + self._flow_shift = flow_shift + # Timesteps 
self.scheduler.set_timesteps(num_steps, device=device) timesteps = self.scheduler.timesteps diff --git a/vllm_omni/diffusion/models/wan2_2/scheduling_wan_euler.py b/vllm_omni/diffusion/models/wan2_2/scheduling_wan_euler.py new file mode 100644 index 0000000000..25444044c2 --- /dev/null +++ b/vllm_omni/diffusion/models/wan2_2/scheduling_wan_euler.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +from dataclasses import dataclass +from types import SimpleNamespace + +import numpy as np +import torch + + +@dataclass +class WanEulerSchedulerOutput: + prev_sample: torch.FloatTensor + + +def _unsqueeze_to_ndim(in_tensor: torch.Tensor, target_ndim: int) -> torch.Tensor: + if in_tensor.ndim >= target_ndim: + return in_tensor + return in_tensor[(...,) + (None,) * (target_ndim - in_tensor.ndim)] + + +def _get_timesteps(num_steps: int, max_steps: int = 1000) -> np.ndarray: + # Keep num_steps + 1 points so Euler update can always access sigma_next. + return np.linspace(max_steps, 0, num_steps + 1, dtype=np.float32) + + +def _timestep_shift(timesteps: torch.Tensor, shift: float = 1.0) -> torch.Tensor: + return shift * timesteps / (1 + (shift - 1) * timesteps) + + +class WanEulerScheduler: + order = 1 + + def __init__( + self, + num_train_timesteps: int = 1000, + shift: float = 1.0, + device: torch.device | str = "cpu", + ) -> None: + self.num_train_timesteps = int(num_train_timesteps) + self._shift = float(shift) + self.device = device + self.config = SimpleNamespace(num_train_timesteps=self.num_train_timesteps) + self.init_noise_sigma = 1.0 + + self._step_index: int | None = None + self._begin_index: int | None = None + + self.timesteps = torch.empty(0, dtype=torch.float32) + self.sigmas = torch.empty(0, dtype=torch.float32) + self.timesteps_ori = torch.empty(0, dtype=torch.float32) + + self.set_timesteps(num_inference_steps=self.num_train_timesteps, device=self.device) + + @property + def step_index(self) -> int | None: + return self._step_index + + @property + def begin_index(self) -> int | None: + return self._begin_index + + def set_begin_index(self, begin_index: int = 0) -> None: + self._begin_index = int(begin_index) + + def index_for_timestep(self, timestep: torch.Tensor) -> int: + indices = (self.timesteps == timestep).nonzero() + if len(indices) > 0: + pos = 1 if len(indices) > 1 else 0 + return int(indices[pos].item()) + # Fallback for tiny float drift + return int(torch.argmin(torch.abs(self.timesteps - timestep)).item()) + + def _init_step_index(self, timestep: float | torch.Tensor) -> None: + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep_t = timestep.to(self.timesteps.device, dtype=self.timesteps.dtype) + else: + timestep_t = torch.tensor(timestep, device=self.timesteps.device, dtype=self.timesteps.dtype) + self._step_index = self.index_for_timestep(timestep_t) + else: + self._step_index = self._begin_index + + def set_shift(self, shift: float = 1.0) -> None: + # Compute shifted sigma schedule on [0, 1]. + sigmas_full = self.timesteps_ori / float(self.num_train_timesteps) + sigmas_full = _timestep_shift(sigmas_full, shift=float(shift)) + self.sigmas = sigmas_full + # Public timesteps are the first N points; next point is consumed as sigma_next. 
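+        # Illustrative numbers (derived from the schedule code above, not extra behavior):
+        # after set_timesteps(4) with shift=1.0, sigmas == [1.0, 0.75, 0.5, 0.25, 0.0],
+        # so timesteps == [1000, 750, 500, 250] and step() walks the sigma pairs
+        # (1.0, 0.75), (0.75, 0.5), (0.5, 0.25), (0.25, 0.0).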
+ self.timesteps = self.sigmas[:-1] * self.num_train_timesteps + self._shift = float(shift) + + def set_timesteps( + self, + num_inference_steps: int, + device: torch.device | str | int | None = None, + **kwargs, # noqa: ARG002 - kept for scheduler API compatibility + ) -> None: + timesteps = _get_timesteps( + num_steps=int(num_inference_steps), + max_steps=self.num_train_timesteps, + ) + self.timesteps_ori = torch.from_numpy(timesteps).to( + dtype=torch.float32, + device=device or self.device, + ) + self.set_shift(self._shift) + self._step_index = None + self._begin_index = None + + def scale_model_input(self, sample: torch.Tensor, timestep: int | None = None) -> torch.Tensor: # noqa: ARG002 + return sample + + def step( + self, + model_output: torch.FloatTensor, + timestep: float | torch.FloatTensor, + sample: torch.FloatTensor, + return_dict: bool = True, + **kwargs, # noqa: ARG002 - kept for scheduler API compatibility + ) -> WanEulerSchedulerOutput | tuple[torch.FloatTensor]: + if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)): + raise ValueError( + "Passing integer indices as timesteps is not supported. Use one value from scheduler.timesteps instead." + ) + + if self.step_index is None: + self._init_step_index(timestep) + assert self._step_index is not None + + sample_fp32 = sample.to(torch.float32) + sigma = _unsqueeze_to_ndim(self.sigmas[self._step_index], sample_fp32.ndim).to(sample_fp32.device) + sigma_next = _unsqueeze_to_ndim(self.sigmas[self._step_index + 1], sample_fp32.ndim).to(sample_fp32.device) + + prev_sample = sample_fp32 + (sigma_next - sigma) * model_output + prev_sample = prev_sample.to(model_output.dtype) + + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + return WanEulerSchedulerOutput(prev_sample=prev_sample) + + def __len__(self) -> int: + return self.num_train_timesteps diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py index 65a2d4390a..3b43f3eaf5 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py @@ -1015,6 +1015,14 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if ".to_out.0." in lookup_name: lookup_name = lookup_name.replace(".to_out.0.", ".to_out.") + # Compatibility: some Wan conversion pipelines still keep + # block modulation keys as `blocks.N.modulation` instead of + # `blocks.N.scale_shift_table`. 
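+            # For example, a checkpoint key `blocks.0.modulation` is remapped to
+            # `blocks.0.scale_shift_table` below whenever that name exists in params_dict.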
+ if lookup_name.endswith(".modulation"): + modulation_alias = lookup_name[: -len(".modulation")] + ".scale_shift_table" + if modulation_alias in params_dict: + lookup_name = modulation_alias + if lookup_name not in params_dict: logger.warning(f"Skipping weight {original_name} -> {lookup_name}") continue diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 8e0b2b2df1..32e8336f6d 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -1221,6 +1221,8 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list: "enable_cpu_offload": kwargs.get("enable_cpu_offload", False), "enable_layerwise_offload": kwargs.get("enable_layerwise_offload", False), "enforce_eager": kwargs.get("enforce_eager", False), + "boundary_ratio": kwargs.get("boundary_ratio", None), + "flow_shift": kwargs.get("flow_shift", None), "diffusion_load_format": kwargs.get("diffusion_load_format", "default"), "custom_pipeline_args": kwargs.get("custom_pipeline_args", None), "worker_extension_cls": kwargs.get("worker_extension_cls", None), From d9e745ce2c562be06913cf27c3c9942a56154b93 Mon Sep 17 00:00:00 2001 From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Date: Mon, 13 Apr 2026 02:30:56 -0400 Subject: [PATCH 141/204] [Fix] VoxCPM2: support raw audio for voice cloning via OpenAI API (#2720) Signed-off-by: Yueqian Lin --- examples/online_serving/voxcpm2/README.md | 42 ++++++ .../voxcpm2/openai_speech_client.py | 108 +++++++++++++++ .../models/voxcpm2/voxcpm2_talker.py | 130 +++++++++++++++++- 3 files changed, 277 insertions(+), 3 deletions(-) create mode 100644 examples/online_serving/voxcpm2/README.md create mode 100644 examples/online_serving/voxcpm2/openai_speech_client.py diff --git a/examples/online_serving/voxcpm2/README.md b/examples/online_serving/voxcpm2/README.md new file mode 100644 index 0000000000..8735180f0a --- /dev/null +++ b/examples/online_serving/voxcpm2/README.md @@ -0,0 +1,42 @@ +# VoxCPM2 Online Serving + +Serve VoxCPM2 TTS via the OpenAI-compatible `/v1/audio/speech` endpoint. + +## Start the Server + +```bash +python -m vllm_omni.entrypoints.openai.api_server \ + --model openbmb/VoxCPM2 \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm2.yaml \ + --host 0.0.0.0 --port 8000 +``` + +## Zero-shot Synthesis + +```bash +python openai_speech_client.py --text "Hello, this is VoxCPM2." +``` + +Or with curl: + +```bash +curl -X POST http://localhost:8000/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{"model": "voxcpm2", "input": "Hello, this is VoxCPM2.", "voice": "default"}' \ + --output output.wav +``` + +## Voice Cloning + +Clone a speaker's voice using a reference audio file: + +```bash +python openai_speech_client.py \ + --text "This should sound like the reference speaker." \ + --ref-audio /path/to/reference.wav +``` + +The `--ref-audio` parameter accepts: +- Local file path (auto-encoded to base64) +- URL (`https://...`) +- Base64 data URI (`data:audio/wav;base64,...`) diff --git a/examples/online_serving/voxcpm2/openai_speech_client.py b/examples/online_serving/voxcpm2/openai_speech_client.py new file mode 100644 index 0000000000..a117d24fd1 --- /dev/null +++ b/examples/online_serving/voxcpm2/openai_speech_client.py @@ -0,0 +1,108 @@ +"""OpenAI-compatible client for VoxCPM2 TTS via /v1/audio/speech endpoint. + +Examples: + # Zero-shot synthesis + python openai_speech_client.py --text "Hello, this is VoxCPM2." 
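+
+    # Zero-shot synthesis, writing to a custom file via --output
+    python openai_speech_client.py --text "Hello, this is VoxCPM2." --output hello.wav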
+ + # Voice cloning with a local reference audio file + python openai_speech_client.py --text "Hello world" \ + --ref-audio /path/to/reference.wav + + # Voice cloning with a URL + python openai_speech_client.py --text "Hello world" \ + --ref-audio "https://example.com/reference.wav" + +Server setup: + python -m vllm_omni.entrypoints.openai.api_server \ + --model openbmb/VoxCPM2 \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm2.yaml \ + --host 0.0.0.0 --port 8000 +""" + +from __future__ import annotations + +import argparse +import base64 +import os + +import httpx + +DEFAULT_API_BASE = "http://localhost:8000" +DEFAULT_API_KEY = "sk-empty" + + +def encode_audio_to_base64(audio_path: str) -> str: + """Encode a local audio file to a base64 data URL.""" + if not os.path.exists(audio_path): + raise FileNotFoundError(f"Audio file not found: {audio_path}") + + ext = audio_path.lower().rsplit(".", 1)[-1] + mime = { + "wav": "audio/wav", + "mp3": "audio/mpeg", + "flac": "audio/flac", + "ogg": "audio/ogg", + }.get(ext, "audio/wav") + + with open(audio_path, "rb") as f: + b64 = base64.b64encode(f.read()).decode("utf-8") + return f"data:{mime};base64,{b64}" + + +def main() -> None: + parser = argparse.ArgumentParser(description="VoxCPM2 OpenAI speech client") + parser.add_argument("--text", type=str, required=True, help="Text to synthesize") + parser.add_argument( + "--ref-audio", + type=str, + default=None, + help="Reference audio for voice cloning (local path, URL, or data: URI)", + ) + parser.add_argument("--model", type=str, default="voxcpm2") + parser.add_argument("--output", type=str, default="output.wav") + parser.add_argument("--api-base", type=str, default=DEFAULT_API_BASE) + parser.add_argument("--api-key", type=str, default=DEFAULT_API_KEY) + parser.add_argument("--response-format", type=str, default="wav") + args = parser.parse_args() + + # VoxCPM2 has no predefined voices. The "voice" field is required by + # the OpenAI API schema but ignored by VoxCPM2 — use any placeholder. + # For voice cloning, pass --ref-audio instead. 
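+    # Example of the request body built below (ref_audio is only present when
+    # --ref-audio is given):
+    #   {"model": "voxcpm2", "input": "...", "voice": "default",
+    #    "response_format": "wav", "ref_audio": "data:audio/wav;base64,..."}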
+ payload: dict = { + "model": args.model, + "input": args.text, + "voice": "default", + "response_format": args.response_format, + } + + if args.ref_audio: + ref = args.ref_audio + if ref.startswith(("http://", "https://", "data:")): + payload["ref_audio"] = ref + else: + payload["ref_audio"] = encode_audio_to_base64(ref) + + url = f"{args.api_base}/v1/audio/speech" + print(f"POST {url}") + print(f" text: {args.text}") + if args.ref_audio: + print(f" ref_audio: {args.ref_audio[:80]}...") + + with httpx.Client(timeout=300) as client: + resp = client.post( + url, + json=payload, + headers={"Authorization": f"Bearer {args.api_key}"}, + ) + + if resp.status_code != 200: + print(f"Error {resp.status_code}: {resp.text[:500]}") + return + + with open(args.output, "wb") as f: + f.write(resp.content) + print(f"Saved: {args.output} ({len(resp.content):,} bytes)") + + +if __name__ == "__main__": + main() diff --git a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py index ade68b673b..b9faf9fa3b 100644 --- a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py +++ b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py @@ -22,6 +22,7 @@ from collections.abc import Iterable from typing import Any +import librosa import torch import torch.nn as nn from vllm.config import VllmConfig @@ -41,6 +42,53 @@ logger = init_logger(__name__) +def _encode_raw_audio( + tts: nn.Module, + samples: list[float] | torch.Tensor, + sr: int, + padding_mode: str = "right", +) -> torch.Tensor: + """Encode raw audio samples using the native VoxCPM2 AudioVAE. + + Mirrors ``VoxCPM2Model._encode_wav`` but accepts in-memory samples + instead of a file path. This is needed for the OpenAI speech API + where ``_resolve_ref_audio`` returns decoded audio data. + + Args: + tts: Native VoxCPM2 tts_model instance. + samples: Audio samples (mono, float32). + sr: Sample rate of the input audio. + padding_mode: "right" (default) or "left" padding. + + Returns: + audio_feat: (T, P, D) tensor of latent patches. + """ + if isinstance(samples, list): + audio = torch.tensor(samples, dtype=torch.float32) + else: + audio = samples.float() + + if audio.ndim == 1: + audio = audio.unsqueeze(0) + + # Resample to the model's expected encoding sample rate + encode_sr = tts._encode_sample_rate + if sr != encode_sr: + audio_np = audio.squeeze(0).numpy() + audio_np = librosa.resample(audio_np, orig_sr=sr, target_sr=encode_sr) + audio = torch.from_numpy(audio_np).unsqueeze(0) + + # Pad to patch boundary + patch_len = tts.patch_size * tts.chunk_size + if audio.size(1) % patch_len != 0: + padding_size = patch_len - audio.size(1) % patch_len + pad = (padding_size, 0) if padding_mode == "left" else (0, padding_size) + audio = torch.nn.functional.pad(audio, pad) + + feat = tts.audio_vae.encode(audio.to(tts.device), encode_sr).cpu() + return feat.view(tts.audio_vae.latent_dim, -1, tts.patch_size).permute(1, 2, 0) + + class VoxCPM2TalkerForConditionalGeneration(nn.Module): """VoxCPM2 talker using native MiniCPM4 base_lm. @@ -83,6 +131,82 @@ def tts(self) -> nn.Module: assert self._tts is not None, "Model not loaded yet" return self._tts + def _build_prompt_cache( + self, + ref_audio: Any = None, + prompt_audio: Any = None, + prompt_text: str | None = None, + ) -> dict | None: + """Build prompt cache, handling both file paths and raw audio data. + + The OpenAI speech API sends decoded audio as [samples_list, sr] + via ``_resolve_ref_audio``, while offline usage sends file paths. 
+ This method detects the format and routes accordingly. + """ + tts = self.tts + + def _is_raw_audio(v: Any) -> bool: + """Check if value is [samples, sr] from serving_speech.""" + return ( + isinstance(v, (list, tuple)) + and len(v) == 2 + and isinstance(v[1], int) + and isinstance(v[0], (list, torch.Tensor)) + ) + + # If all inputs are file paths (or None), use native build_prompt_cache + if not _is_raw_audio(ref_audio) and not _is_raw_audio(prompt_audio): + return tts.build_prompt_cache( + prompt_text=prompt_text, + prompt_wav_path=prompt_audio, + reference_wav_path=ref_audio, + ) + + # Raw audio path: encode directly + cache: dict[str, Any] = {} + + if ref_audio is not None: + if _is_raw_audio(ref_audio): + samples, sr = ref_audio + cache["ref_audio_feat"] = _encode_raw_audio( + tts, + samples, + sr, + padding_mode="right", + ) + else: + cache["ref_audio_feat"] = tts._encode_wav( + ref_audio, + padding_mode="right", + ) + + if prompt_audio is not None and prompt_text is not None: + cache["prompt_text"] = prompt_text + if _is_raw_audio(prompt_audio): + samples, sr = prompt_audio + cache["audio_feat"] = _encode_raw_audio( + tts, + samples, + sr, + padding_mode="left", + ) + else: + cache["audio_feat"] = tts._encode_wav( + prompt_audio, + padding_mode="left", + ) + + has_ref = "ref_audio_feat" in cache + has_prompt = "audio_feat" in cache + if has_ref and has_prompt: + cache["mode"] = "ref_continuation" + elif has_ref: + cache["mode"] = "reference" + else: + cache["mode"] = "continuation" + + return cache + # -------------------- vllm hooks -------------------- def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor: @@ -482,10 +606,10 @@ def preprocess( self._prompt_cache = None if ref_audio or (prompt_audio and prompt_text): try: - self._prompt_cache = self.tts.build_prompt_cache( + self._prompt_cache = self._build_prompt_cache( + ref_audio=ref_audio, + prompt_audio=prompt_audio, prompt_text=prompt_text, - prompt_wav_path=prompt_audio, - reference_wav_path=ref_audio, ) except Exception as e: logger.warning("build_prompt_cache failed: %s; falling back to zero-shot", e) From 22261430b42b3e91d2019367da9fe1a8bac7f58a Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Mon, 13 Apr 2026 14:47:55 +0800 Subject: [PATCH 142/204] [CI][Bugfix] Refactor the test case to add support for increasing init timeout and stage init timeout in order to resolve the CI timeout error. 
(#2711) Signed-off-by: wangyu <410167048@qq.com> --- .buildkite/test-merge.yml | 2 +- .buildkite/test-nightly.yml | 3 +- tests/conftest.py | 8 +- .../offline_inference/test_bagel_img2img.py | 15 +- .../e2e/offline_inference/test_bagel_lora.py | 11 +- .../offline_inference/test_bagel_text2img.py | 32 ++-- .../test_bagel_understanding.py | 27 +-- tests/e2e/offline_inference/test_cache_dit.py | 35 +--- .../test_diffusion_cpu_offload.py | 43 ++--- .../test_diffusion_layerwise_offload.py | 56 +++--- .../offline_inference/test_diffusion_lora.py | 14 +- .../e2e/offline_inference/test_dynin_omni.py | 73 ++------ .../offline_inference/test_expert_parallel.py | 51 +++--- .../test_flux_autoround_w4a16.py | 40 ++--- .../offline_inference/test_flux_kontext.py | 97 +++++----- .../test_hunyuanimage3_text2img.py | 14 +- .../e2e/offline_inference/test_magi_human.py | 17 +- .../offline_inference/test_mammoth_moda2.py | 11 +- tests/e2e/offline_inference/test_omnivoice.py | 55 +++--- .../test_quantization_fp8.py | 19 +- .../test_qwen_image_diffusion_batching.py | 165 ++++++++---------- .../test_sequence_parallel.py | 63 ++++--- .../test_stable_audio_model.py | 21 +-- tests/e2e/offline_inference/test_t2i_model.py | 101 +++++------ tests/e2e/offline_inference/test_t2v_model.py | 51 +++--- tests/e2e/offline_inference/test_teacache.py | 37 +--- .../test_vae_decode_parallelism.py | 36 ++-- tests/e2e/offline_inference/test_voxcpm2.py | 7 +- .../e2e/offline_inference/test_voxtral_tts.py | 17 +- .../test_zimage_parallelism.py | 112 ++++++------ .../test_images_generations_lora.py | 2 +- 31 files changed, 497 insertions(+), 738 deletions(-) diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 7355e2b4c7..24fc6dd3dc 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -113,7 +113,7 @@ steps: - "/fsx/hf_cache:/fsx/hf_cache" - label: "Diffusion Sequence Parallelism Test" - timeout_in_minutes: 20 + timeout_in_minutes: 25 depends_on: upload-merge-pipeline commands: - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py tests/diffusion/distributed/test_ulysses_uaa_perf.py diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 06b7c14ae1..31b3e17976 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -141,7 +141,6 @@ steps: - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py - buildkite-agent artifact upload "tests/dfx/perf/results/*.json" - - buildkite-agent artifact upload "tests/dfx/perf/results/*.html" agents: queue: "mithril-h100-pool" plugins: @@ -244,7 +243,7 @@ steps: - export DEFAULT_OUTPUT_DIR=tests/dfx/perf/results - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-omni-performance - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-qwen-image-performance - - buildkite-agent artifact download "tests/dfx/perf/results/*.html" . --step nightly-omni-performance + - buildkite-agent artifact download "tests/dfx/perf/results/*.html" . 
--step nightly-testcase-statistics - python tools/nightly/generate_nightly_perf_excel.py - python tools/nightly/generate_nightly_perf_html.py - python tools/nightly/send_nightly_email.py --report-file "tests/dfx/perf/results/*.xlsx, tests/dfx/perf/results/*.html" diff --git a/tests/conftest.py b/tests/conftest.py index 18a0ee57d9..9c739533b8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1771,8 +1771,12 @@ def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: st server_args = params.server_args or [] if params.use_omni and params.stage_init_timeout is not None: server_args = [*server_args, "--stage-init-timeout", str(params.stage_init_timeout)] + else: + server_args = [*server_args, "--stage-init-timeout", "600"] if params.init_timeout is not None: server_args = [*server_args, "--init-timeout", str(params.init_timeout)] + else: + server_args = [*server_args, "--init-timeout", "900"] if params.use_stage_cli: if not params.use_omni: raise ValueError("omni_server with use_stage_cli=True requires use_omni=True") @@ -2870,9 +2874,9 @@ def __init__( self, model_name: str, seed: int = 42, - stage_init_timeout: int = 300, + stage_init_timeout: int = 600, batch_timeout: int = 10, - init_timeout: int = 300, + init_timeout: int = 900, shm_threshold_bytes: int = 65536, log_stats: bool = False, stage_configs_path: str | None = None, diff --git a/tests/e2e/offline_inference/test_bagel_img2img.py b/tests/e2e/offline_inference/test_bagel_img2img.py index a0c3f6cc9f..63d2a37da7 100644 --- a/tests/e2e/offline_inference/test_bagel_img2img.py +++ b/tests/e2e/offline_inference/test_bagel_img2img.py @@ -22,9 +22,9 @@ from PIL import Image from vllm.assets.image import ImageAsset -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni +from vllm_omni import Omni from vllm_omni.platforms import current_omni_platform # Reference pixel data extracted from the known-good output image @@ -210,11 +210,10 @@ def test_bagel_img2img_shared_memory_connector(run_level): input_image = _load_input_image() config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") config_path = _resolve_stage_config(config_path, run_level) - omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300) - - try: - generated_image = _generate_bagel_img2img(omni, input_image) + with OmniRunner( + "ByteDance-Seed/BAGEL-7B-MoT", + stage_configs_path=config_path, + ) as runner: + generated_image = _generate_bagel_img2img(runner.omni, input_image) if run_level == "advanced_model": _validate_pixels(generated_image) - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_bagel_lora.py b/tests/e2e/offline_inference/test_bagel_lora.py index 593a640478..501d23eaa8 100644 --- a/tests/e2e/offline_inference/test_bagel_lora.py +++ b/tests/e2e/offline_inference/test_bagel_lora.py @@ -22,7 +22,6 @@ from vllm_omni.outputs import OmniRequestOutput os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" from pathlib import Path @@ -32,9 +31,9 @@ from PIL import Image from safetensors.torch import save_file -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni +from vllm_omni import Omni from vllm_omni.lora.request import 
LoRARequest from vllm_omni.lora.utils import stable_lora_int_id @@ -154,8 +153,8 @@ def _make_file_lora_request(adapter_dir: Path) -> LoRARequest: def test_bagel_lora_scale_and_deactivation(run_level, tmp_path): """Validate LoRA effect, bounded perturbation, and clean deactivation.""" config_path = _resolve_stage_config(BAGEL_STAGE_CONFIG, run_level) - omni = Omni(model=MODEL, stage_configs_path=config_path, stage_init_timeout=300) - try: + with OmniRunner(MODEL, stage_configs_path=config_path) as runner: + omni = runner.omni lora_request = _make_file_lora_request(tmp_path / "bagel_lora") # 1) Baseline (no LoRA) @@ -194,5 +193,3 @@ def test_bagel_lora_scale_and_deactivation(run_level, tmp_path): # (d) Deactivation fully restores base model assert diff_restored == 0.0, f"Base model not restored after LoRA deactivation: diff={diff_restored}" - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_bagel_text2img.py b/tests/e2e/offline_inference/test_bagel_text2img.py index 7cce8da3a7..e45d64f2ac 100644 --- a/tests/e2e/offline_inference/test_bagel_text2img.py +++ b/tests/e2e/offline_inference/test_bagel_text2img.py @@ -16,7 +16,6 @@ import os os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" import signal import socket import subprocess @@ -28,9 +27,9 @@ import pytest from PIL import Image -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni +from vllm_omni import Omni from vllm_omni.platforms import current_omni_platform # Reference pixel data extracted from the known-good output image @@ -199,14 +198,13 @@ def test_bagel_text2img_shared_memory_connector(run_level): """Test Bagel text2img with shared memory connector.""" config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") config_path = _resolve_stage_config(config_path, run_level) - omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300) - - try: - generated_image = _generate_bagel_image(omni) + with OmniRunner( + "ByteDance-Seed/BAGEL-7B-MoT", + stage_configs_path=config_path, + ) as runner: + generated_image = _generate_bagel_image(runner.omni) if run_level == "advanced_model": _validate_pixels(generated_image) - finally: - omni.close() def _wait_for_port(host: str, port: int, timeout: int = 30) -> bool: @@ -319,7 +317,6 @@ def test_bagel_text2img_mooncake_connector(run_level): mooncake_master_proc = None temp_config_file = None - omni = None try: _cleanup_mooncake_processes() @@ -349,15 +346,16 @@ def test_bagel_text2img_mooncake_connector(run_level): ) temp_config_file = _resolve_stage_config(temp_config_file, run_level) - omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=temp_config_file, stage_init_timeout=300) - - generated_image = _generate_bagel_image(omni) - if run_level == "advanced_model": - _validate_pixels(generated_image) + with OmniRunner( + "ByteDance-Seed/BAGEL-7B-MoT", + stage_configs_path=temp_config_file, + stage_init_timeout=300, + ) as runner: + generated_image = _generate_bagel_image(runner.omni) + if run_level == "advanced_model": + _validate_pixels(generated_image) finally: - if omni: - omni.close() if temp_config_file: try: os.unlink(temp_config_file) diff --git a/tests/e2e/offline_inference/test_bagel_understanding.py b/tests/e2e/offline_inference/test_bagel_understanding.py index 6f95e7ee00..bbee329807 100644 
--- a/tests/e2e/offline_inference/test_bagel_understanding.py +++ b/tests/e2e/offline_inference/test_bagel_understanding.py @@ -21,15 +21,13 @@ import os os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" from pathlib import Path import pytest from vllm.assets.image import ImageAsset -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni MODEL_NAME = "ByteDance-Seed/BAGEL-7B-MoT" STAGE_CONFIG = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") @@ -76,13 +74,11 @@ def _extract_text(omni_outputs: list) -> str: def test_bagel_text2text(run_level): """Test Bagel text2text produces correct text output.""" config_path = _resolve_stage_config(STAGE_CONFIG, run_level) - omni = Omni( - model=MODEL_NAME, + with OmniRunner( + MODEL_NAME, stage_configs_path=config_path, - stage_init_timeout=300, - ) - - try: + ) as runner: + omni = runner.omni prompt = "<|im_start|>user\nWhere is the capital of France?<|im_end|>\n<|im_start|>assistant\n" params_list = omni.default_sampling_params_list omni_outputs = list( @@ -100,8 +96,6 @@ def test_bagel_text2text(run_level): assert text == REFERENCE_TEXT_TEXT2TEXT, ( f"Text mismatch: expected {REFERENCE_TEXT_TEXT2TEXT!r}, got {text!r}" ) - finally: - omni.close() @pytest.mark.core_model @@ -112,13 +106,12 @@ def test_bagel_img2text(run_level): """Test Bagel img2text produces correct text output.""" input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB") config_path = _resolve_stage_config(STAGE_CONFIG, run_level) - omni = Omni( - model=MODEL_NAME, + with OmniRunner( + MODEL_NAME, stage_configs_path=config_path, stage_init_timeout=300, - ) - - try: + ) as runner: + omni = runner.omni prompt = "<|im_start|>user\n<|image_pad|>\nPlease describe this image<|im_end|>\n<|im_start|>assistant\n" params_list = omni.default_sampling_params_list omni_outputs = list( @@ -140,5 +133,3 @@ def test_bagel_img2text(run_level): if run_level == "advanced_model": assert text == REFERENCE_TEXT_IMG2TEXT, f"Text mismatch: expected {REFERENCE_TEXT_IMG2TEXT!r}, got {text!r}" - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_cache_dit.py b/tests/e2e/offline_inference/test_cache_dit.py index 0e31413dc0..fc08da7bed 100644 --- a/tests/e2e/offline_inference/test_cache_dit.py +++ b/tests/e2e/offline_inference/test_cache_dit.py @@ -8,27 +8,15 @@ It uses minimal settings to keep test time short for CI. 
""" -import os -import sys -from pathlib import Path - import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams - -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" - # Use random weights model for testing models = ["riverclouds/qwen_image_random"] @@ -48,20 +36,17 @@ def test_cache_dit(model_name: str): "residual_diff_threshold": 0.24, "max_continuous_cached_steps": 3, } - m = None - try: - m = Omni( - model=model_name, - cache_backend="cache_dit", - cache_config=cache_config, - ) - + with OmniRunner( + model_name, + cache_backend="cache_dit", + cache_config=cache_config, + ) as runner: # Use minimal settings for fast testing height = 256 width = 256 num_inference_steps = 4 # Minimal steps for fast test - outputs = m.generate( + outputs = runner.omni.generate( "a photo of a cat sitting on a laptop keyboard", OmniDiffusionSamplingParams( height=height, @@ -90,9 +75,3 @@ def test_cache_dit(model_name: str): # Check image size assert images[0].width == width assert images[0].height == height - except Exception as e: - print(f"Test failed with error: {e}") - raise - finally: - if m is not None and hasattr(m, "close"): - m.close() diff --git a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py index f3830f02e9..257755ef8b 100644 --- a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py +++ b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py @@ -1,22 +1,14 @@ import gc -import sys -from pathlib import Path import pytest import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory +from tests.conftest import OmniRunner from tests.utils import DeviceMemoryMonitor, hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni - models = ["riverclouds/qwen_image_random"] @@ -27,30 +19,29 @@ def inference(model_name: str, offload: bool = True): current_omni_platform.reset_peak_memory_stats() monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - m = Omni( - model=model_name, + with OmniRunner( + model_name, # TODO: we might want to add overlapped feature e2e tests # cache_backend="cache_dit", enable_cpu_offload=offload, - ) - current_omni_platform.reset_peak_memory_stats() - height = 256 - width = 256 + ) as runner: + current_omni_platform.reset_peak_memory_stats() + height = 256 + width = 256 - m.generate( - "a photo of a cat sitting on a laptop keyboard", - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=9, - guidance_scale=0.0, - generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - ), - ) + runner.omni.generate( + "a photo of a cat sitting on a laptop keyboard", + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=9, + guidance_scale=0.0, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + ), + ) peak = 
monitor.peak_used_mb monitor.stop() - del m gc.collect() current_omni_platform.empty_cache() diff --git a/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py b/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py index 6132f1bd0e..bdfd594c77 100644 --- a/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py +++ b/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py @@ -1,21 +1,12 @@ -import sys -from pathlib import Path - import pytest import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory +from tests.conftest import OmniRunner from tests.utils import DeviceMemoryMonitor from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni - # Models to test and expected saved memory in MB, correspondingly MODELS_SAVED_MEMORY_MB = { "riverclouds/qwen_image_random": 4500, @@ -33,34 +24,33 @@ def run_inference( monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - m = Omni( - model=model_name, + with OmniRunner( + model_name, enable_layerwise_offload=layerwise_offload, # TODO: we might want to add overlapped feature e2e tests # cache_backend="cache_dit", boundary_ratio=0.875, flow_shift=5.0, - ) - - current_omni_platform.reset_peak_memory_stats() - - # Refer to tests/e2e/offline_inference/test_t2v_model.py - # Use minimal settings for testing - height = 480 - width = 640 - num_frames = 5 - - m.generate( - "A cat sitting on a table", - OmniDiffusionSamplingParams( - height=height, - width=width, - generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - guidance_scale=1.0, - num_inference_steps=num_inference_steps, - num_frames=num_frames, - ), - ) + ) as runner: + current_omni_platform.reset_peak_memory_stats() + + # Refer to tests/e2e/offline_inference/test_t2v_model.py + # Use minimal settings for testing + height = 480 + width = 640 + num_frames = 5 + + runner.omni.generate( + "A cat sitting on a table", + OmniDiffusionSamplingParams( + height=height, + width=width, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + guidance_scale=1.0, + num_inference_steps=num_inference_steps, + num_frames=num_frames, + ), + ) peak = monitor.peak_used_mb monitor.stop() diff --git a/tests/e2e/offline_inference/test_diffusion_lora.py b/tests/e2e/offline_inference/test_diffusion_lora.py index b414fe30ee..7edd03f20d 100644 --- a/tests/e2e/offline_inference/test_diffusion_lora.py +++ b/tests/e2e/offline_inference/test_diffusion_lora.py @@ -7,6 +7,7 @@ import torch from safetensors.torch import save_file +from tests.conftest import OmniRunner from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform @@ -16,15 +17,12 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) -from vllm_omni import Omni - os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" # This test is specific to Z-Image LoRA behavior. Keep it focused on a single # model to reduce runtime and avoid extra downloads. 
models = ["Tongyi-MAI/Z-Image-Turbo"] -DIFFUSION_INIT_TIMEOUT_S = 600 @pytest.mark.parametrize("model_name", models) @@ -77,12 +75,8 @@ def _write_zimage_lora(adapter_dir: Path) -> str: ) return str(adapter_dir) - m = Omni( - model=model_name, - stage_init_timeout=DIFFUSION_INIT_TIMEOUT_S, - init_timeout=DIFFUSION_INIT_TIMEOUT_S, - ) - try: + with OmniRunner(model_name) as runner: + m = runner.omni # high resolution may cause OOM on L4 height = 256 width = 256 @@ -140,5 +134,3 @@ def _write_zimage_lora(adapter_dir: Path) -> str: diff = np.abs(np.array(images[0], dtype=np.int16) - np.array(images_lora[0], dtype=np.int16)).mean() assert diff > 0.0 - finally: - m.close() diff --git a/tests/e2e/offline_inference/test_dynin_omni.py b/tests/e2e/offline_inference/test_dynin_omni.py index d17e7b8175..5388ac6746 100644 --- a/tests/e2e/offline_inference/test_dynin_omni.py +++ b/tests/e2e/offline_inference/test_dynin_omni.py @@ -18,7 +18,6 @@ import torch from transformers import AutoTokenizer -from tests.conftest import OmniRunner from tests.utils import hardware_test os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -37,6 +36,7 @@ pytestmark = [ pytest.mark.core_model, pytest.mark.omni, + pytest.mark.parametrize("omni_runner", test_params, indirect=True), ] @@ -291,20 +291,11 @@ def _numel(value: Any) -> int: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_t2i_decode_to_image(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_t2i_decode_to_image(omni_runner) -> None: _configure_dynin_config_env() prompt = _build_t2i_decode_prompt(dynin_config_path=DYNIN_CONFIG_PATH) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) image_output = _find_stage_output(outputs, "image") assert image_output is not None @@ -314,25 +305,16 @@ def test_dynin_t2i_decode_to_image(test_config: tuple[str, str]) -> None: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_mmu_to_text(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_mmu_to_text(omni_runner) -> None: _configure_dynin_config_env() - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(omni_runner.model_name, trust_remote_code=True) prompt = _build_mmu_prompt( tokenizer=tokenizer, question="What is 2 + 2? 
Answer in one short sentence.", dynin_config_path=DYNIN_CONFIG_PATH, ) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) text_output = _find_stage_output(outputs, "text") assert text_output is not None @@ -341,11 +323,9 @@ def test_dynin_mmu_to_text(test_config: tuple[str, str]) -> None: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_image_to_text(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_image_to_text(omni_runner) -> None: _configure_dynin_config_env() - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(omni_runner.model_name, trust_remote_code=True) prompt = _build_mmu_multimodal_prompt( tokenizer=tokenizer, question="Describe the image briefly in one sentence.", @@ -353,14 +333,7 @@ def test_dynin_image_to_text(test_config: tuple[str, str]) -> None: image=_generate_synthetic_image(), ) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) text_output = _find_stage_output(outputs, "text") assert text_output is not None @@ -369,11 +342,9 @@ def test_dynin_image_to_text(test_config: tuple[str, str]) -> None: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_speech_to_text(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_speech_to_text(omni_runner) -> None: _configure_dynin_config_env() - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(omni_runner.model_name, trust_remote_code=True) prompt = _build_mmu_multimodal_prompt( tokenizer=tokenizer, question="Transcribe the audio briefly in one sentence.", @@ -381,14 +352,7 @@ def test_dynin_speech_to_text(test_config: tuple[str, str]) -> None: audio=_generate_synthetic_audio(), ) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) text_output = _find_stage_output(outputs, "text") assert text_output is not None @@ -397,20 +361,11 @@ def test_dynin_speech_to_text(test_config: tuple[str, str]) -> None: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_t2s_decode_to_audio(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_t2s_decode_to_audio(omni_runner) -> None: _configure_dynin_config_env() prompt = _build_t2s_decode_prompt(dynin_config_path=DYNIN_CONFIG_PATH) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) audio_output = _find_stage_output(outputs, "audio") assert audio_output is not None diff --git a/tests/e2e/offline_inference/test_expert_parallel.py b/tests/e2e/offline_inference/test_expert_parallel.py index ba126986ec..29d84d7a3e 100644 --- 
a/tests/e2e/offline_inference/test_expert_parallel.py +++ b/tests/e2e/offline_inference/test_expert_parallel.py @@ -18,8 +18,8 @@ import torch.distributed as dist from PIL import Image +from tests.conftest import OmniRunner from tests.utils import hardware_test -from vllm_omni import Omni from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform @@ -96,12 +96,26 @@ def _run_inference( tensor_parallel_size=tensor_parallel_size, enable_expert_parallel=enable_expert_parallel, ) - omni = Omni(model=model_name, parallel_config=parallel_config) - try: - # Warmup run (not timed) - if warmup: - _ = omni.generate( + with OmniRunner(model_name, parallel_config=parallel_config) as runner: + omni = runner.omni + # Warmup run (not timed) + if warmup: + _ = omni.generate( + PROMPT, + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=DEFAULT_STEPS, + guidance_scale=guidance_scale, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), + num_outputs_per_prompt=1, + ), + ) + + # Timed run + start = time.time() + outputs = omni.generate( PROMPT, OmniDiffusionSamplingParams( height=height, @@ -112,28 +126,13 @@ def _run_inference( num_outputs_per_prompt=1, ), ) + elapsed_ms = (time.time() - start) * 1000 - # Timed run - start = time.time() - outputs = omni.generate( - PROMPT, - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=DEFAULT_STEPS, - guidance_scale=guidance_scale, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), - num_outputs_per_prompt=1, - ), - ) - elapsed_ms = (time.time() - start) * 1000 - - return InferenceResult( - images=outputs[0].images, - elapsed_ms=elapsed_ms, - ) + return InferenceResult( + images=outputs[0].images, + elapsed_ms=elapsed_ms, + ) finally: - omni.close() _cleanup_distributed() diff --git a/tests/e2e/offline_inference/test_flux_autoround_w4a16.py b/tests/e2e/offline_inference/test_flux_autoround_w4a16.py index 42aab7f26a..cbcd1009dd 100644 --- a/tests/e2e/offline_inference/test_flux_autoround_w4a16.py +++ b/tests/e2e/offline_inference/test_flux_autoround_w4a16.py @@ -8,31 +8,21 @@ """ import gc -import sys -from pathlib import Path +import os as _os import pytest import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory +from tests.conftest import OmniRunner from tests.utils import DeviceMemoryMonitor, hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni - QUANTIZED_MODEL = "vllm-project-org/FLUX.1-dev-AutoRound-w4a16" BASELINE_MODEL = "black-forest-labs/FLUX.1-dev" -# Allow overriding via environment for local testing -import os as _os - QUANTIZED_MODEL = _os.environ.get("FLUX_AUTOROUND_MODEL", QUANTIZED_MODEL) BASELINE_MODEL = _os.environ.get("FLUX_BASELINE_MODEL", BASELINE_MODEL) @@ -51,19 +41,18 @@ def _generate_image(model_name: str, **extra_kwargs) -> tuple[list, float]: monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - m = Omni(model=model_name, enforce_eager=True, **extra_kwargs) - - current_omni_platform.reset_peak_memory_stats() - outputs = m.generate( - "a photo of 
a cat sitting on a laptop keyboard", - OmniDiffusionSamplingParams( - height=HEIGHT, - width=WIDTH, - num_inference_steps=NUM_STEPS, - guidance_scale=0.0, - generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - ), - ) + with OmniRunner(model_name, enforce_eager=True, **extra_kwargs) as runner: + current_omni_platform.reset_peak_memory_stats() + outputs = runner.omni.generate( + "a photo of a cat sitting on a laptop keyboard", + OmniDiffusionSamplingParams( + height=HEIGHT, + width=WIDTH, + num_inference_steps=NUM_STEPS, + guidance_scale=0.0, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + ), + ) peak = monitor.peak_used_mb monitor.stop() @@ -74,7 +63,6 @@ def _generate_image(model_name: str, **extra_kwargs) -> tuple[list, float]: assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images") images = req_out.images - del m gc.collect() current_omni_platform.empty_cache() diff --git a/tests/e2e/offline_inference/test_flux_kontext.py b/tests/e2e/offline_inference/test_flux_kontext.py index 93dca21c9a..cd711d6b81 100644 --- a/tests/e2e/offline_inference/test_flux_kontext.py +++ b/tests/e2e/offline_inference/test_flux_kontext.py @@ -9,23 +9,14 @@ - Image editing with text guidance """ -import os -import sys -from pathlib import Path - import pytest from PIL import Image +from vllm.assets.image import ImageAsset +from tests.conftest import OmniRunner from vllm_omni.diffusion.data import DiffusionParallelConfig -from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" - MODEL = "black-forest-labs/FLUX.1-Kontext-dev" @@ -33,17 +24,15 @@ @pytest.mark.diffusion def test_flux_kontext_text_to_image(): """Test FluxKontext text-to-image generation with real model.""" - omni = Omni( - model=MODEL, + with OmniRunner( + MODEL, parallel_config=DiffusionParallelConfig( tensor_parallel_size=2, ), enable_cpu_offload=False, - ) - - try: + ) as runner: omni_outputs = list( - omni.generate( + runner.omni.generate( prompts=["A photo of a cat sitting on a laptop"], sampling_params_list=OmniDiffusionSamplingParams( height=512, @@ -54,43 +43,37 @@ def test_flux_kontext_text_to_image(): ) ) - assert len(omni_outputs) > 0 - output = omni_outputs[0] - images = None - if output.images: - images = output.images - elif hasattr(output, "request_output") and output.request_output: - for stage_out in output.request_output: - if hasattr(stage_out, "images") and stage_out.images: - images = stage_out.images - break + assert len(omni_outputs) > 0 + output = omni_outputs[0] + images = None + if output.images: + images = output.images + elif hasattr(output, "request_output") and output.request_output: + for stage_out in output.request_output: + if hasattr(stage_out, "images") and stage_out.images: + images = stage_out.images + break - assert images is not None - assert len(images) > 0 - assert isinstance(images[0], Image.Image) - assert images[0].size == (512, 512) - finally: - omni.close() + assert images is not None + assert len(images) > 0 + assert isinstance(images[0], Image.Image) + assert images[0].size == (512, 512) @pytest.mark.core_model @pytest.mark.diffusion def test_flux_kontext_image_edit(): """Test FluxKontext image-to-image editing with real model.""" - from vllm.assets.image import ImageAsset - input_image = 
ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB") - omni = Omni( - model=MODEL, + with OmniRunner( + MODEL, parallel_config=DiffusionParallelConfig( tensor_parallel_size=2, ), enable_cpu_offload=False, - ) - - try: + ) as runner: omni_outputs = list( - omni.generate( + runner.omni.generate( prompts=[ { "prompt": "Transform this image into a Vincent van Gogh style painting", @@ -107,20 +90,18 @@ def test_flux_kontext_image_edit(): ) ) - assert len(omni_outputs) > 0 - output = omni_outputs[0] - images = None - if output.images: - images = output.images - elif hasattr(output, "request_output") and output.request_output: - for stage_out in output.request_output: - if hasattr(stage_out, "images") and stage_out.images: - images = stage_out.images - break - - assert images is not None - assert len(images) > 0 - assert isinstance(images[0], Image.Image) - assert images[0].size == (512, 512) - finally: - omni.close() + assert len(omni_outputs) > 0 + output = omni_outputs[0] + images = None + if output.images: + images = output.images + elif hasattr(output, "request_output") and output.request_output: + for stage_out in output.request_output: + if hasattr(stage_out, "images") and stage_out.images: + images = stage_out.images + break + + assert images is not None + assert len(images) > 0 + assert isinstance(images[0], Image.Image) + assert images[0].size == (512, 512) diff --git a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py index 5522f33eaa..79bb64dca1 100644 --- a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py +++ b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py @@ -8,6 +8,7 @@ from PIL import Image from transformers import CLIPModel, CLIPProcessor +from tests.conftest import OmniRunner from vllm_omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform @@ -271,16 +272,11 @@ def clip_bundle() -> tuple[CLIPModel, CLIPProcessor]: @pytest.fixture(scope="module") def omni() -> Generator[Omni, None, None]: - engine = Omni( - model=MODEL_NAME, + with OmniRunner( + MODEL_NAME, stage_configs_path=str(STAGE_CONFIG_PATH), - stage_init_timeout=600, - init_timeout=900, - ) - try: - yield engine - finally: - engine.close() + ) as runner: + yield runner.omni def _extract_generated_image(outputs: list[object]) -> Image.Image: diff --git a/tests/e2e/offline_inference/test_magi_human.py b/tests/e2e/offline_inference/test_magi_human.py index 8648216a92..abb7f9c163 100644 --- a/tests/e2e/offline_inference/test_magi_human.py +++ b/tests/e2e/offline_inference/test_magi_human.py @@ -8,9 +8,9 @@ import numpy as np import pytest +from tests.conftest import OmniRunner from tests.utils import hardware_test from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes -from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -49,12 +49,6 @@ def test_magi_human_e2e(run_level): model_path = "SII-GAIR/daVinci-MagiHuman-Base-1080p" - omni = Omni( - model=model_path, - init_timeout=1200, - tensor_parallel_size=2, - ) - prompt = ( "A young woman with long, wavy golden blonde hair and bright blue eyes, " "wearing a fitted ivory silk blouse with a delicate lace collar, sits " @@ -94,7 +88,12 @@ def test_magi_human_e2e(run_level): }, ) - try: + with OmniRunner( + model_path, + init_timeout=1200, + tensor_parallel_size=2, + ) as runner: + omni = runner.omni outputs 
= list( omni.generate( prompts=[prompt], @@ -140,5 +139,3 @@ def test_magi_human_e2e(run_level): assert len(video_bytes) > 1000, f"MP4 too small ({len(video_bytes)} bytes)" _validate_mp4(video_bytes) - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_mammoth_moda2.py b/tests/e2e/offline_inference/test_mammoth_moda2.py index 5293b5ed1b..ff744c86e1 100644 --- a/tests/e2e/offline_inference/test_mammoth_moda2.py +++ b/tests/e2e/offline_inference/test_mammoth_moda2.py @@ -23,10 +23,9 @@ import torch from vllm.sampling_params import SamplingParams +from tests.conftest import OmniRunner from tests.utils import hardware_test -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" - # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- @@ -116,8 +115,6 @@ def test_mammothmoda2_t2i_e2e(): - A fixed set of pixel values matches a golden reference (regenerate with ``UPDATE_GOLDEN=1``). """ - from vllm_omni import Omni - if not Path(MODEL_PATH).exists(): pytest.skip(f"Model weights not found at {MODEL_PATH}") if not Path(T2I_STAGE_CONFIG).exists(): @@ -135,8 +132,8 @@ def test_mammothmoda2_t2i_e2e(): prompt_text = "A cat sitting on a laptop keyboard" formatted_prompt = _format_t2i_prompt(prompt_text, ar_width, ar_height) - omni = Omni(model=MODEL_PATH, stage_configs_path=T2I_STAGE_CONFIG, trust_remote_code=True) - try: + with OmniRunner(MODEL_PATH, stage_configs_path=T2I_STAGE_CONFIG, trust_remote_code=True) as runner: + omni = runner.omni # Greedy / deterministic sampling so pixel values are reproducible. ar_sampling = SamplingParams( temperature=0.0, @@ -211,5 +208,3 @@ def test_mammothmoda2_t2i_e2e(): found_image = True assert found_image, "No image tensor found in pipeline output" - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_omnivoice.py b/tests/e2e/offline_inference/test_omnivoice.py index 4b093e357d..bb4c8a5dd7 100644 --- a/tests/e2e/offline_inference/test_omnivoice.py +++ b/tests/e2e/offline_inference/test_omnivoice.py @@ -16,6 +16,7 @@ import numpy as np import pytest +from tests.conftest import OmniRunner from tests.utils import hardware_test MODEL = "k2-fsa/OmniVoice" @@ -37,48 +38,42 @@ def test_omnivoice_text_to_audio() -> None: Input Modal: text Output Modal: audio """ - from vllm_omni.entrypoints.omni import Omni + from vllm_omni.inputs.data import OmniDiffusionSamplingParams - omni = Omni( - model=MODEL, + with OmniRunner( + MODEL, stage_configs_path=get_stage_config(), trust_remote_code=True, log_stats=True, - ) - - try: + ) as runner: prompts = {"prompt": "Hello, this is a test for text to audio."} - from vllm_omni.inputs.data import OmniDiffusionSamplingParams - sampling_params_list = [OmniDiffusionSamplingParams()] - outputs = list(omni.generate(prompts, sampling_params_list=sampling_params_list)) + outputs = list(runner.omni.generate(prompts, sampling_params_list=sampling_params_list)) - assert len(outputs) > 0, "No outputs generated" + assert len(outputs) > 0, "No outputs generated" - # Check final output has audio - final_output = outputs[-1] - ro = final_output.request_output - assert ro is not None, "No request_output" + # Check final output has audio + final_output = outputs[-1] + ro = final_output.request_output + assert ro is not None, "No request_output" - mm = getattr(ro, "multimodal_output", None) - if not mm and ro.outputs: - mm = getattr(ro.outputs[0], "multimodal_output", None) + mm = getattr(ro, "multimodal_output", 
None) + if not mm and ro.outputs: + mm = getattr(ro.outputs[0], "multimodal_output", None) - assert mm is not None, "No multimodal_output" - assert "audio" in mm, f"No 'audio' key in multimodal_output: {mm.keys()}" + assert mm is not None, "No multimodal_output" + assert "audio" in mm, f"No 'audio' key in multimodal_output: {mm.keys()}" - audio = mm["audio"] - if isinstance(audio, np.ndarray): - audio_np = audio - else: - audio_np = audio.cpu().numpy().squeeze() + audio = mm["audio"] + if isinstance(audio, np.ndarray): + audio_np = audio + else: + audio_np = audio.cpu().numpy().squeeze() - assert audio_np.size > 0, "Audio output is empty" - rms = np.sqrt(np.mean(audio_np**2)) - assert rms > 0.01, f"Audio RMS too low ({rms:.4f}), likely silence" + assert audio_np.size > 0, "Audio output is empty" + rms = np.sqrt(np.mean(audio_np**2)) + assert rms > 0.01, f"Audio RMS too low ({rms:.4f}), likely silence" - print(f"Generated audio: {len(audio_np) / 24000:.2f}s, rms={rms:.4f}") - finally: - omni.close() + print(f"Generated audio: {len(audio_np) / 24000:.2f}s, rms={rms:.4f}") diff --git a/tests/e2e/offline_inference/test_quantization_fp8.py b/tests/e2e/offline_inference/test_quantization_fp8.py index f71c53de74..291779fd93 100644 --- a/tests/e2e/offline_inference/test_quantization_fp8.py +++ b/tests/e2e/offline_inference/test_quantization_fp8.py @@ -29,7 +29,6 @@ import os os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" from pathlib import Path from typing import Any @@ -37,8 +36,8 @@ import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform @@ -61,16 +60,15 @@ def _generate_single_stage_image( Returns (images, peak_memory_gib). 
""" - omni_kwargs: dict[str, Any] = {"model": model, **extra_omni_kwargs} + omni_kwargs: dict[str, Any] = dict(extra_omni_kwargs) if quantization: omni_kwargs["quantization"] = quantization - omni = Omni(**omni_kwargs) - try: + with OmniRunner(model, **omni_kwargs) as runner: torch.cuda.reset_peak_memory_stats() generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(seed) - outputs = omni.generate( + outputs = runner.omni.generate( "a photo of a cat sitting on a laptop keyboard", OmniDiffusionSamplingParams( height=height, @@ -94,8 +92,6 @@ def _generate_single_stage_image( assert images[0].height == height return images, peak_mem - finally: - omni.close() def _generate_bagel_image( @@ -115,8 +111,9 @@ def _generate_bagel_image( if quantization_config: omni_kwargs["quantization_config"] = quantization_config - omni = Omni(**omni_kwargs) - try: + model_name = omni_kwargs.pop("model") + with OmniRunner(model_name, **omni_kwargs) as runner: + omni = runner.omni torch.cuda.reset_peak_memory_stats() params_list = omni.default_sampling_params_list @@ -168,8 +165,6 @@ def _generate_bagel_image( ) return generated_image, peak_mem - finally: - omni.close() # ─── Single-stage diffusion model tests ────────────────────────────────────── diff --git a/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py b/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py index d5f82f893e..f0b0b55c9f 100644 --- a/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py +++ b/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py @@ -28,7 +28,6 @@ import argparse import asyncio -import os import sys import time import uuid @@ -37,6 +36,7 @@ import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test from vllm_omni.entrypoints.async_omni import AsyncOmni from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -48,9 +48,6 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) -from vllm_omni import Omni - -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" # ------------------------------------------------------------------ models = ["tiny-random/Qwen-Image"] @@ -391,31 +388,28 @@ async def main(model: str, num_prompts: int, mode: str, batch_size: int = 1) -> def test_diffusion_batching_sync_sequential(model_name: str): """Test that synchronous Omni can generate images for multiple prompts submitted sequentially (one at a time) and each returns a valid image.""" - m = None try: - m = Omni(model=model_name) - sp = _default_sync_sampling_params() - prompts = TEST_PROMPTS[:4] + with OmniRunner(model_name) as runner: + m = runner.omni + sp = _default_sync_sampling_params() + prompts = TEST_PROMPTS[:4] - for i, prompt in enumerate(prompts): - outputs = m.generate(prompt, sp) - first_output = outputs[0] - assert first_output.final_output_type == "image", ( - f"Expected 'image', got '{first_output.final_output_type}'" - ) + for i, prompt in enumerate(prompts): + outputs = m.generate(prompt, sp) + first_output = outputs[0] + assert first_output.final_output_type == "image", ( + f"Expected 'image', got '{first_output.final_output_type}'" + ) - # Images are surfaced both at top-level and inside request_output - images = _extract_images(first_output) - assert len(images) >= 1, f"Expected at least 1 image for prompt {i}, got {len(images)}" - assert images[0].width == 256 - assert images[0].height == 256 - print(f" prompt {i}: OK ({len(images)} images)") + # Images are surfaced both at top-level 
and inside request_output + images = _extract_images(first_output) + assert len(images) >= 1, f"Expected at least 1 image for prompt {i}, got {len(images)}" + assert images[0].width == 256 + assert images[0].height == 256 + print(f" prompt {i}: OK ({len(images)} images)") except Exception as e: print(f"Test failed with error: {e}") raise - finally: - if m is not None and hasattr(m, "close"): - m.close() @pytest.mark.core_model @@ -431,34 +425,31 @@ def test_diffusion_batching_sync_multi_prompt(model_name: str): handling at the diffusion stage, not the explicit list-batch path (which is only available via AsyncOmni). """ - m = None try: - m = Omni(model=model_name) - sp = _default_sync_sampling_params() - prompts = TEST_PROMPTS[:4] + with OmniRunner(model_name) as runner: + m = runner.omni + sp = _default_sync_sampling_params() + prompts = TEST_PROMPTS[:4] - outputs = m.generate(prompts, sp) - assert len(outputs) == len(prompts), f"Expected {len(prompts)} outputs, got {len(outputs)}" + outputs = m.generate(prompts, sp) + assert len(outputs) == len(prompts), f"Expected {len(prompts)} outputs, got {len(outputs)}" - for i, output in enumerate(outputs): - assert output.final_output_type == "image", ( - f"Output {i} final_output_type expected 'image', got '{output.final_output_type}'" - ) - images = _extract_images(output) - assert images and len(images) >= 1, f"Expected at least 1 image for prompt {i}" - assert images[0].width == 256 - assert images[0].height == 256 - print(f" prompt {i}: OK ({len(images)} images, request_id={output.request_id})") - - # Verify all request_ids are distinct - request_ids = [o.request_id for o in outputs] - assert len(set(request_ids)) == len(request_ids), f"Duplicate request_ids found: {request_ids}" + for i, output in enumerate(outputs): + assert output.final_output_type == "image", ( + f"Output {i} final_output_type expected 'image', got '{output.final_output_type}'" + ) + images = _extract_images(output) + assert images and len(images) >= 1, f"Expected at least 1 image for prompt {i}" + assert images[0].width == 256 + assert images[0].height == 256 + print(f" prompt {i}: OK ({len(images)} images, request_id={output.request_id})") + + # Verify all request_ids are distinct + request_ids = [o.request_id for o in outputs] + assert len(set(request_ids)) == len(request_ids), f"Duplicate request_ids found: {request_ids}" except Exception as e: print(f"Test failed with error: {e}") raise - finally: - if m is not None and hasattr(m, "close"): - m.close() @pytest.mark.core_model @@ -552,32 +543,29 @@ async def _inner(): def test_diffusion_batching_num_outputs(model_name: str): """Test that the diffusion model respects num_outputs_per_prompt and generates the correct number of images per request.""" - m = None try: - m = Omni(model=model_name) - num_outputs = 2 - sp = _default_sync_sampling_params(num_outputs_per_prompt=num_outputs) - - outputs = m.generate( - "a photo of a cat sitting on a laptop keyboard", - sp, - ) + with OmniRunner(model_name) as runner: + m = runner.omni + num_outputs = 2 + sp = _default_sync_sampling_params(num_outputs_per_prompt=num_outputs) + + outputs = m.generate( + "a photo of a cat sitting on a laptop keyboard", + sp, + ) - first_output = outputs[0] - assert first_output.final_output_type == "image" - images = _extract_images(first_output) - assert images is not None and len(images) == num_outputs, ( - f"Expected {num_outputs} images, got {len(images) if images else 0}" - ) - for img in images: - assert img.width == 256 - assert img.height == 
256 + first_output = outputs[0] + assert first_output.final_output_type == "image" + images = _extract_images(first_output) + assert images is not None and len(images) == num_outputs, ( + f"Expected {num_outputs} images, got {len(images) if images else 0}" + ) + for img in images: + assert img.width == 256 + assert img.height == 256 except Exception as e: print(f"Test failed with error: {e}") raise - finally: - if m is not None and hasattr(m, "close"): - m.close() @pytest.mark.core_model @@ -587,34 +575,31 @@ def test_diffusion_batching_num_outputs(model_name: str): def test_diffusion_batching_distinct_results(model_name: str): """Test that different prompts produce distinct images when batched, ensuring the batching logic does not mix up results across requests.""" - m = None try: - m = Omni(model=model_name) - sp = _default_sync_sampling_params() - prompts = [ - {"prompt": "a bright red apple on a white table", "negative_prompt": "blurry"}, - {"prompt": "a blue ocean with white waves crashing", "negative_prompt": "blurry"}, - ] - - outputs = m.generate(prompts, sp) - assert len(outputs) == len(prompts), f"Expected {len(prompts)} outputs, got {len(outputs)}" - - # Verify each output has a unique request_id - request_ids = [o.request_id for o in outputs] - assert len(set(request_ids)) == len(request_ids), f"Duplicate request_ids: {request_ids}" - - # Verify each output has images - for i, output in enumerate(outputs): - images = _extract_images(output) - assert images and len(images) >= 1, f"No images for prompt {i}" - assert images[0].width == 256 - assert images[0].height == 256 + with OmniRunner(model_name) as runner: + m = runner.omni + sp = _default_sync_sampling_params() + prompts = [ + {"prompt": "a bright red apple on a white table", "negative_prompt": "blurry"}, + {"prompt": "a blue ocean with white waves crashing", "negative_prompt": "blurry"}, + ] + + outputs = m.generate(prompts, sp) + assert len(outputs) == len(prompts), f"Expected {len(prompts)} outputs, got {len(outputs)}" + + # Verify each output has a unique request_id + request_ids = [o.request_id for o in outputs] + assert len(set(request_ids)) == len(request_ids), f"Duplicate request_ids: {request_ids}" + + # Verify each output has images + for i, output in enumerate(outputs): + images = _extract_images(output) + assert images and len(images) >= 1, f"No images for prompt {i}" + assert images[0].width == 256 + assert images[0].height == 256 except Exception as e: print(f"Test failed with error: {e}") raise - finally: - if m is not None and hasattr(m, "close"): - m.close() # ------------------------------------------------------------------ diff --git a/tests/e2e/offline_inference/test_sequence_parallel.py b/tests/e2e/offline_inference/test_sequence_parallel.py index 16239a1c52..d3abccd78c 100644 --- a/tests/e2e/offline_inference/test_sequence_parallel.py +++ b/tests/e2e/offline_inference/test_sequence_parallel.py @@ -20,8 +20,8 @@ import torch.distributed as dist from PIL import Image +from tests.conftest import OmniRunner from tests.utils import hardware_test -from vllm_omni import Omni from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform @@ -92,49 +92,48 @@ def _run_inference( warmup: If True, run one warmup iteration before the timed run. 
""" parallel_config = DiffusionParallelConfig(ulysses_degree=ulysses_degree, ring_degree=ring_degree) - omni = Omni( - model=model_name, - parallel_config=parallel_config, - dtype=dtype, - attention_backend=attn_backend, - ) - try: - # Warmup run (not timed) - if warmup: - _ = omni.generate( + with OmniRunner( + model_name, + parallel_config=parallel_config, + dtype=dtype, + attention_backend=attn_backend, + ) as runner: + omni = runner.omni + # Warmup run (not timed) + if warmup: + _ = omni.generate( + PROMPT, + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=DEFAULT_STEPS, + guidance_scale=0.0, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed + 1000), + num_outputs_per_prompt=1, + ), + ) + + # Timed run + start = time.time() + outputs = omni.generate( PROMPT, OmniDiffusionSamplingParams( height=height, width=width, num_inference_steps=DEFAULT_STEPS, guidance_scale=0.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed + 1000), + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), num_outputs_per_prompt=1, ), ) + elapsed_ms = (time.time() - start) * 1000 - # Timed run - start = time.time() - outputs = omni.generate( - PROMPT, - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=DEFAULT_STEPS, - guidance_scale=0.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), - num_outputs_per_prompt=1, - ), - ) - elapsed_ms = (time.time() - start) * 1000 - - return InferenceResult( - images=outputs[0].request_output.images, - elapsed_ms=elapsed_ms, - ) + return InferenceResult( + images=outputs[0].request_output.images, + elapsed_ms=elapsed_ms, + ) finally: - omni.close() _cleanup_distributed() diff --git a/tests/e2e/offline_inference/test_stable_audio_model.py b/tests/e2e/offline_inference/test_stable_audio_model.py index ff4d9b4017..21d75aad52 100644 --- a/tests/e2e/offline_inference/test_stable_audio_model.py +++ b/tests/e2e/offline_inference/test_stable_audio_model.py @@ -1,6 +1,3 @@ -import sys -from pathlib import Path - import numpy as np import pytest import torch @@ -10,31 +7,25 @@ from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni - # Use random weights model for CI testing (small, no authentication required) models = ["linyueqian/stable_audio_random"] +# omni_runner expects (model, stage_configs_path); single-stage diffusion has no YAML. 
+test_params = [(m, None) for m in models] + @pytest.mark.core_model @pytest.mark.diffusion @hardware_test(res={"cuda": "L4", "xpu": "B60"}) -@pytest.mark.parametrize("model_name", models) -def test_stable_audio_model(model_name: str): - m = Omni(model=model_name) - +@pytest.mark.parametrize("omni_runner", test_params, indirect=True) +def test_stable_audio_model(omni_runner): # Use minimal settings for testing # Generate a short 2-second audio clip with minimal inference steps audio_start_in_s = 0.0 audio_end_in_s = 2.0 # Short duration for fast testing sample_rate = 44100 # Stable Audio uses 44100 Hz - outputs = m.generate( + outputs = omni_runner.omni.generate( prompts={ "prompt": "The sound of a dog barking", "negative_prompt": "Low quality.", diff --git a/tests/e2e/offline_inference/test_t2i_model.py b/tests/e2e/offline_inference/test_t2i_model.py index 55a154f61b..fc54f9a7ff 100644 --- a/tests/e2e/offline_inference/test_t2i_model.py +++ b/tests/e2e/offline_inference/test_t2i_model.py @@ -1,7 +1,3 @@ -import os -import sys -from pathlib import Path - import pytest import torch @@ -10,14 +6,12 @@ from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) +# Match unprefixed HF id even when MODEL_PREFIX is set (omni_runner resolves full path). +_QWEN_IMAGE_RANDOM_ID = "riverclouds/qwen_image_random" -from vllm_omni import Omni -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" +def _is_qwen_image_random(model_path: str) -> bool: + return model_path.rstrip("/").endswith(_QWEN_IMAGE_RANDOM_ID) models = ["Tongyi-MAI/Z-Image-Turbo", "riverclouds/qwen_image_random"] @@ -27,56 +21,55 @@ if current_omni_platform.is_npu(): models = ["Tongyi-MAI/Z-Image-Turbo", "Qwen/Qwen-Image"] +# omni_runner expects (model, stage_configs_path); single-stage diffusion has no YAML. 
+test_params = [(m, None) for m in models] + @pytest.mark.core_model @pytest.mark.advanced_model @pytest.mark.diffusion @hardware_test(res={"cuda": "L4", "rocm": "MI325", "xpu": "B60"}, num_cards={"cuda": 1, "rocm": 1, "xpu": 2}) -@pytest.mark.parametrize("model_name", models) -def test_diffusion_model(model_name: str, run_level): - if run_level == "core_model" and model_name != "riverclouds/qwen_image_random": +@pytest.mark.parametrize("omni_runner", test_params, indirect=True) +def test_diffusion_model(omni_runner, run_level): + resolved = omni_runner.model_name + if run_level == "core_model" and not _is_qwen_image_random(resolved): pytest.skip() - if run_level == "advanced_model" and model_name == "riverclouds/qwen_image_random": + if run_level == "advanced_model" and _is_qwen_image_random(resolved): pytest.skip() - m = None - try: - m = Omni(model=model_name) - # high resolution may cause OOM on L4 - height = 256 - width = 256 - outputs = m.generate( - "a photo of a cat sitting on a laptop keyboard", - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=2, - guidance_scale=0.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), - num_outputs_per_prompt=2, - ), - ) - # Extract images from request_output['images'] - first_output = outputs[0] - assert first_output.final_output_type == "image" - if not hasattr(first_output, "request_output") or not first_output.request_output: - raise ValueError("No request_output found in OmniRequestOutput") - - req_out = first_output.request_output - if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"): - raise ValueError("Invalid request_output structure or missing 'images' key") - - images = req_out.images - - assert len(images) == 2 - # check image size - assert images[0].width == width - assert images[0].height == height - images[0].save("image_output.png") - except Exception as e: - print(f"Test failed with error: {e}") - raise - finally: - if m is not None and hasattr(m, "close"): - m.close() + # high resolution may cause OOM on L4 + height = 256 + width = 256 + sampling = OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=2, + guidance_scale=0.0, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), + num_outputs_per_prompt=2, + ) + + # OmniRunner.generate() is typed for list[TextPrompt]; diffusion uses Omni.generate(str, ...). 
+ outputs = omni_runner.omni.generate( + "a photo of a cat sitting on a laptop keyboard", + sampling, + ) + + # Extract images from request_output['images'] + first_output = outputs[0] + assert first_output.final_output_type == "image" + if not hasattr(first_output, "request_output") or not first_output.request_output: + raise ValueError("No request_output found in OmniRequestOutput") + + req_out = first_output.request_output + if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"): + raise ValueError("Invalid request_output structure or missing 'images' key") + + images = req_out.images + + assert len(images) == 2 + # check image size + assert images[0].width == width + assert images[0].height == height + images[0].save("image_output.png") diff --git a/tests/e2e/offline_inference/test_t2v_model.py b/tests/e2e/offline_inference/test_t2v_model.py index 94c9dedf74..6fe623cfc8 100644 --- a/tests/e2e/offline_inference/test_t2v_model.py +++ b/tests/e2e/offline_inference/test_t2v_model.py @@ -1,22 +1,13 @@ import os -import sys -from pathlib import Path import pytest import torch +from tests.conftest import OmniRunner from vllm_omni.inputs.data import OmniDiffusionSamplingParams - -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" models = ["Wan-AI/Wan2.2-T2V-A14B-Diffusers"] @@ -24,28 +15,28 @@ @pytest.mark.parametrize("model_name", models) def test_video_diffusion_model(model_name: str): - m = Omni( - model=model_name, + with OmniRunner( + model_name, boundary_ratio=0.875, flow_shift=5.0, - ) - # Use minimal settings for testing - # num_frames must satisfy: num_frames % vae_scale_factor_temporal == 1 - # For Wan2.2, vae_scale_factor_temporal=4, so valid values are 5, 9, 13, 17, ... - height = 480 - width = 640 - num_frames = 5 - outputs = m.generate( - prompts="A cat sitting on a table", - sampling_params_list=OmniDiffusionSamplingParams( - height=height, - width=width, - num_frames=num_frames, - num_inference_steps=2, - guidance_scale=1.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), - ), - ) + ) as runner: + # Use minimal settings for testing + # num_frames must satisfy: num_frames % vae_scale_factor_temporal == 1 + # For Wan2.2, vae_scale_factor_temporal=4, so valid values are 5, 9, 13, 17, ... + height = 480 + width = 640 + num_frames = 5 + outputs = runner.omni.generate( + prompts="A cat sitting on a table", + sampling_params_list=OmniDiffusionSamplingParams( + height=height, + width=width, + num_frames=num_frames, + num_inference_steps=2, + guidance_scale=1.0, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), + ), + ) first_output = outputs[0] assert first_output.final_output_type == "image" if not hasattr(first_output, "request_output") or not first_output.request_output: diff --git a/tests/e2e/offline_inference/test_teacache.py b/tests/e2e/offline_inference/test_teacache.py index efc0e43e86..7cd1c5a479 100644 --- a/tests/e2e/offline_inference/test_teacache.py +++ b/tests/e2e/offline_inference/test_teacache.py @@ -8,26 +8,14 @@ It uses minimal settings to keep test time short for CI. 
""" -import os -import sys -from pathlib import Path - import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from vllm_omni.platforms import current_omni_platform - -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni from vllm_omni.outputs import OmniRequestOutput - -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" +from vllm_omni.platforms import current_omni_platform # Use random weights model for testing models = ["riverclouds/qwen_image_random"] @@ -44,20 +32,17 @@ def test_teacache(model_name: str): cache_config = { "rel_l1_thresh": 0.2, # Default threshold } - m = None - try: - m = Omni( - model=model_name, - cache_backend="tea_cache", - cache_config=cache_config, - ) - + with OmniRunner( + model_name, + cache_backend="tea_cache", + cache_config=cache_config, + ) as runner: # Use minimal settings for fast testing height = 256 width = 256 num_inference_steps = 4 # Minimal steps for fast test - outputs = m.generate( + outputs = runner.omni.generate( "a photo of a cat sitting on a laptop keyboard", OmniDiffusionSamplingParams( height=height, @@ -86,9 +71,3 @@ def test_teacache(model_name: str): # Check image size assert images[0].width == width assert images[0].height == height - except Exception as e: - print(f"Test failed with error: {e}") - raise - finally: - if m is not None and hasattr(m, "close"): - m.close() diff --git a/tests/e2e/offline_inference/test_vae_decode_parallelism.py b/tests/e2e/offline_inference/test_vae_decode_parallelism.py index cee76fac2e..0fce28d669 100644 --- a/tests/e2e/offline_inference/test_vae_decode_parallelism.py +++ b/tests/e2e/offline_inference/test_vae_decode_parallelism.py @@ -18,7 +18,7 @@ import time -from vllm_omni import Omni +from tests.conftest import OmniRunner from vllm_omni.platforms import current_omni_platform # os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" @@ -72,23 +72,22 @@ def is_nextstep_model(model_name: str) -> bool: def model_run(model_configs, tp, out_height, out_width, out_frames, using_tile, vae_patch_parallel_size=1): - m = None - try: - parallel_config = DiffusionParallelConfig( - tensor_parallel_size=tp, - vae_patch_parallel_size=vae_patch_parallel_size, - ) + parallel_config = DiffusionParallelConfig( + tensor_parallel_size=tp, + vae_patch_parallel_size=vae_patch_parallel_size, + ) - omni_kwargs = { - "model": model_configs["model_name"], - "vae_use_tiling": using_tile, - "parallel_config": parallel_config, - } - use_nextstep = is_nextstep_model(model_configs["model_name"]) - if use_nextstep: - # NextStep-1.1 requires explicit pipeline class - omni_kwargs["model_class_name"] = "NextStep11Pipeline" - m = Omni(**omni_kwargs) + omni_kwargs = { + "vae_use_tiling": using_tile, + "parallel_config": parallel_config, + } + use_nextstep = is_nextstep_model(model_configs["model_name"]) + if use_nextstep: + # NextStep-1.1 requires explicit pipeline class + omni_kwargs["model_class_name"] = "NextStep11Pipeline" + + with OmniRunner(model_configs["model_name"], **omni_kwargs) as runner: + m = runner.omni image = Image.new("RGB", (out_width, out_height), (0, 0, 0)) start = time.perf_counter() outputs = m.generate( @@ -115,9 +114,6 @@ def model_run(model_configs, tp, out_height, out_width, out_frames, using_tile, # frames shape: (batch, num_frames, height, width, channels) cost = (end - start) * 1000 return 
frames, cost - finally: - if m is not None: - m.close() cleanup_dist_env_and_memory() diff --git a/tests/e2e/offline_inference/test_voxcpm2.py b/tests/e2e/offline_inference/test_voxcpm2.py index 7e17c6a369..4e4f635d5c 100644 --- a/tests/e2e/offline_inference/test_voxcpm2.py +++ b/tests/e2e/offline_inference/test_voxcpm2.py @@ -5,6 +5,7 @@ import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test VOXCPM2_MODEL = "openbmb/VoxCPM2" @@ -24,10 +25,8 @@ @pytest.fixture(scope="module") def voxcpm2_engine(): """Create VoxCPM2 engine for testing.""" - from vllm_omni import Omni - - engine = Omni(model=VOXCPM2_MODEL, stage_configs_path=STAGE_CONFIG) - yield engine + with OmniRunner(VOXCPM2_MODEL, stage_configs_path=STAGE_CONFIG) as runner: + yield runner.omni def _extract_audio(multimodal_output: dict) -> torch.Tensor: diff --git a/tests/e2e/offline_inference/test_voxtral_tts.py b/tests/e2e/offline_inference/test_voxtral_tts.py index b559cc252d..4f440f243b 100644 --- a/tests/e2e/offline_inference/test_voxtral_tts.py +++ b/tests/e2e/offline_inference/test_voxtral_tts.py @@ -19,7 +19,6 @@ import uuid os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" from pathlib import Path @@ -30,10 +29,9 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer from vllm import SamplingParams -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test from vllm_omni.entrypoints.async_omni import AsyncOmni -from vllm_omni.entrypoints.omni import Omni MODEL = "mistralai/Voxtral-4B-TTS-2603" STAGE_CONFIG = str( @@ -83,14 +81,12 @@ def test_voxtral_tts_offline_basic(run_level): """Test basic Voxtral TTS offline inference with a voice preset.""" stage_config = _resolve_stage_config(run_level) - omni = Omni( - model=MODEL, + with OmniRunner( + MODEL, stage_configs_path=stage_config, - stage_init_timeout=300, enforce_eager=True, - ) - - try: + ) as runner: + omni = runner.omni inputs = _compose_request(MODEL, TEST_TEXT, VOICE) sampling_params = SamplingParams(max_tokens=2500) @@ -127,9 +123,6 @@ def test_voxtral_tts_offline_basic(run_level): # Verify audio isn't all zeros / silence assert np.max(np.abs(audio_array)) > 0.01, "Audio appears to be silence" - finally: - omni.close() - @pytest.mark.advanced_model @pytest.mark.omni diff --git a/tests/e2e/offline_inference/test_zimage_parallelism.py b/tests/e2e/offline_inference/test_zimage_parallelism.py index b685704ae4..27edc48f20 100644 --- a/tests/e2e/offline_inference/test_zimage_parallelism.py +++ b/tests/e2e/offline_inference/test_zimage_parallelism.py @@ -12,7 +12,6 @@ """ import os -import sys import time from pathlib import Path @@ -20,21 +19,14 @@ import pytest import torch from PIL import Image -from vllm.distributed.parallel_state import cleanup_dist_env_and_memory +from tests.conftest import OmniRunner from tests.utils import DeviceMemoryMonitor, hardware_test -from vllm_omni import Omni from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" PROMPT = "a photo of a cat sitting on a laptop keyboard" @@ -97,61 
+89,61 @@ def _run_zimage_generate( device_index = current_omni_platform.current_device() monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - m = Omni( - model=_get_zimage_model(), - parallel_config=DiffusionParallelConfig( - tensor_parallel_size=tp_size, - vae_patch_parallel_size=vae_patch_parallel_size, - ), - enforce_eager=enforce_eager, - vae_use_tiling=vae_use_tiling, - ) try: - # NOTE: Omni closes itself when a generate() call is exhausted. - # To avoid measuring teardown time (process shutdown, memory cleanup), - # we measure the latency to produce *subsequent* outputs within a single - # generator run. - # - # This also serves as a warmup: the first output may include extra - # compilation/caching overhead, while later outputs are closer to - # steady-state inference. - gen = m.generate( - [PROMPT] * num_requests, - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=0.0, - seed=seed, - num_outputs_per_prompt=1, + # Each run needs a distinct DiffusionParallelConfig; use OmniRunner per call (not the + # parametrized omni_runner fixture, which is fixed per module). + with OmniRunner( + _get_zimage_model(), + parallel_config=DiffusionParallelConfig( + tensor_parallel_size=tp_size, + vae_patch_parallel_size=vae_patch_parallel_size, ), - py_generator=True, - ) - - warmup_output = next(gen) - - t_prev = time.perf_counter() - per_request_times_s: list[float] = [] - last_output = warmup_output - for _ in range(num_requests - 1): - last_output = next(gen) - t_now = time.perf_counter() - per_request_times_s.append(t_now - t_prev) - t_prev = t_now - - # Ensure the generator is fully consumed so it can clean up. - for _ in gen: - pass - - median_time_s = float(np.median(per_request_times_s)) - - peak_memory_mb = monitor.peak_used_mb - - return _extract_single_image([last_output]), median_time_s, peak_memory_mb + enforce_eager=enforce_eager, + vae_use_tiling=vae_use_tiling, + ) as runner: + # NOTE: Omni closes itself when a generate() call is exhausted. + # To avoid measuring teardown time (process shutdown, memory cleanup), + # we measure the latency to produce *subsequent* outputs within a single + # generator run. + # + # This also serves as a warmup: the first output may include extra + # compilation/caching overhead, while later outputs are closer to + # steady-state inference. + gen = runner.omni.generate( + [PROMPT] * num_requests, + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=num_inference_steps, + guidance_scale=0.0, + seed=seed, + num_outputs_per_prompt=1, + ), + py_generator=True, + ) + + warmup_output = next(gen) + + t_prev = time.perf_counter() + per_request_times_s: list[float] = [] + last_output = warmup_output + for _ in range(num_requests - 1): + last_output = next(gen) + t_now = time.perf_counter() + per_request_times_s.append(t_now - t_prev) + t_prev = t_now + + # Ensure the generator is fully consumed so it can clean up. 
+ for _ in gen: + pass + + median_time_s = float(np.median(per_request_times_s)) + + peak_memory_mb = monitor.peak_used_mb + + return _extract_single_image([last_output]), median_time_s, peak_memory_mb finally: monitor.stop() - m.close() - cleanup_dist_env_and_memory() @pytest.mark.advanced_model diff --git a/tests/e2e/online_serving/test_images_generations_lora.py b/tests/e2e/online_serving/test_images_generations_lora.py index 8c826591a5..fb1e3ea1e0 100644 --- a/tests/e2e/online_serving/test_images_generations_lora.py +++ b/tests/e2e/online_serving/test_images_generations_lora.py @@ -28,7 +28,7 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" MODEL = "Tongyi-MAI/Z-Image-Turbo" -DIFFUSION_INIT_TIMEOUT_S = 700 +DIFFUSION_INIT_TIMEOUT_S = 900 PROMPT = "a photo of a cat sitting on a laptop keyboard" From 2b70e89535aca2f29eff74687a6b07b5fd2bd077 Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Mon, 13 Apr 2026 14:55:16 +0800 Subject: [PATCH 143/204] =?UTF-8?q?[Revert]=20Revert=20"[Log]=20Wire=20sta?= =?UTF-8?q?t=20loggers=20into=20AsyncOmniEngine=20to=20match=20AsyncLL?= =?UTF-8?q?=E2=80=A6=20(#2716)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: amy-why-3459 --- .../test_async_omni_engine_do_log_stats.py | 56 ------------------ .../test_async_omni_engine_stage_init.py | 2 - tests/engine/test_single_stage_mode.py | 3 - vllm_omni/engine/async_omni_engine.py | 58 +------------------ vllm_omni/engine/orchestrator.py | 26 +-------- vllm_omni/entrypoints/async_omni.py | 7 ++- 6 files changed, 8 insertions(+), 144 deletions(-) delete mode 100644 tests/engine/test_async_omni_engine_do_log_stats.py diff --git a/tests/engine/test_async_omni_engine_do_log_stats.py b/tests/engine/test_async_omni_engine_do_log_stats.py deleted file mode 100644 index e2b8c03b93..0000000000 --- a/tests/engine/test_async_omni_engine_do_log_stats.py +++ /dev/null @@ -1,56 +0,0 @@ -"""Guard tests for AsyncOmniEngine.do_log_stats edge cases. - -These are pure-Python tests that bypass __init__ and only exercise the -no-op branches of do_log_stats, so no stage cores / threads are needed. -""" - -import asyncio - -import pytest - -from vllm_omni.engine.async_omni_engine import AsyncOmniEngine - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -def _make_bare_engine() -> AsyncOmniEngine: - # Bypass __init__ so we don't spin up stage cores; we only need the - # attributes do_log_stats touches. 
- return AsyncOmniEngine.__new__(AsyncOmniEngine) - - -@pytest.mark.asyncio -async def test_do_log_stats_noop_when_manager_missing(): - engine = _make_bare_engine() - engine.logger_manager = None - engine.orchestrator_loop = None - await engine.do_log_stats() # should silently return - - -@pytest.mark.asyncio -async def test_do_log_stats_noop_when_loop_missing(): - engine = _make_bare_engine() - - class _Manager: - def log(self) -> None: # pragma: no cover - must not be called - raise AssertionError("log() should not be called without a loop") - - engine.logger_manager = _Manager() - engine.orchestrator_loop = None - await engine.do_log_stats() - - -@pytest.mark.asyncio -async def test_do_log_stats_noop_when_loop_not_running(): - engine = _make_bare_engine() - - class _Manager: - def log(self) -> None: # pragma: no cover - must not be called - raise AssertionError("log() should not be called on a stopped loop") - - dead_loop = asyncio.new_event_loop() - dead_loop.close() - - engine.logger_manager = _Manager() - engine.orchestrator_loop = dead_loop - await engine.do_log_stats() diff --git a/tests/engine/test_async_omni_engine_stage_init.py b/tests/engine/test_async_omni_engine_stage_init.py index f397307936..6993f391eb 100644 --- a/tests/engine/test_async_omni_engine_stage_init.py +++ b/tests/engine/test_async_omni_engine_stage_init.py @@ -31,7 +31,6 @@ def test_initialize_stages_restores_device_visibility_after_diffusion_init(monke from vllm_omni.platforms import current_omni_platform engine = object.__new__(AsyncOmniEngine) - engine.log_stats = False engine.model = "dummy-model" engine.config_path = "dummy-config" engine.num_stages = 1 @@ -283,7 +282,6 @@ def __init__(self, vllm_config, renderer=None): ) engine = object.__new__(AsyncOmniEngine) - engine.log_stats = False _stage_client, _out_proc, _vllm_cfg, input_processor = engine._attach_llm_stage(started) diff --git a/tests/engine/test_single_stage_mode.py b/tests/engine/test_single_stage_mode.py index 1afe2fd6d9..2c5bf6cc79 100644 --- a/tests/engine/test_single_stage_mode.py +++ b/tests/engine/test_single_stage_mode.py @@ -461,7 +461,6 @@ def _build_engine_skeleton( engine.stage_configs = stage_cfgs engine.num_stages = len(stage_cfgs) engine.async_chunk = False - engine.log_stats = False engine.single_stage_mode = single_stage_mode engine._single_stage_id_filter = stage_id_filter engine._omni_master_address = omni_master_address @@ -1367,7 +1366,6 @@ class TestLaunchLlmStageSingleStageMode: def _build_engine_with_oms(self) -> AsyncOmniEngine: engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" - engine.log_stats = False engine.single_stage_mode = True engine._single_stage_id_filter = 0 engine._llm_stage_launch_lock = threading.Lock() @@ -1448,7 +1446,6 @@ def test_spawn_stage_core_used_in_normal_mode(self): """~single_stage_mode → spawn_stage_core + complete_stage_handshake.""" engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" - engine.log_stats = False engine.single_stage_mode = False engine._omni_master_server = None engine._llm_stage_launch_lock = threading.Lock() diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 32e8336f6d..0a2e02d66e 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -31,7 +31,6 @@ from vllm.tokenizers import cached_tokenizer_from_config from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.input_processor import InputProcessor -from vllm.v1.metrics.loggers import 
StatLoggerManager from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient @@ -285,7 +284,6 @@ def __init__( self.num_stages = len(self.stage_configs) stage0_args = getattr(self.stage_configs[0], "engine_args", None) if self.num_stages > 0 else None self.async_chunk = bool(getattr(stage0_args, "async_chunk", False)) - self.log_stats = not bool(getattr(stage0_args, "disable_log_stats", False)) self.stage_clients: list[Any] = [] self.stage_vllm_configs: list[Any] = [] self.output_processors: list[MultimodalOutputProcessor | None] = [] @@ -415,7 +413,7 @@ def _launch_llm_stage( addresses, proc, handshake_address = spawn_stage_core( vllm_config=vllm_config, executor_class=executor_class, - log_stats=self.log_stats, + log_stats=False, ) started_stage = StartedLlmStage( stage_id=metadata.stage_id, @@ -617,7 +615,7 @@ def _attach_llm_stage( ) output_processor = MultimodalOutputProcessor( tokenizer=tokenizer, - log_stats=self.log_stats, + log_stats=False, engine_core_output_type=started.metadata.engine_output_type, ) input_processor = None @@ -872,30 +870,6 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: self.default_sampling_params_list = default_sampling_params_list self.stage_metadata = stage_metadata - # Single StatLoggerManager for the whole pipeline, mirroring how - # vLLM AsyncLLM uses one manager with multiple engine indices for DP. - # We treat each stage as a separate "engine_idx" so logs are - # distinguishable as "Engine 000/001/002/...". Using a single manager - # also avoids PrometheusStatLogger registry collisions. - self.logger_manager: StatLoggerManager | None = None - if self.log_stats: - base_vllm_config = next( - (cfg for cfg in self.stage_vllm_configs if cfg is not None), - None, - ) - if base_vllm_config is not None: - try: - self.logger_manager = StatLoggerManager( - vllm_config=base_vllm_config, - engine_idxs=list(range(self.num_stages)), - custom_stat_loggers=None, - enable_default_loggers=True, - ) - self.logger_manager.log_engine_initialized() - except Exception: - logger.exception("[AsyncOmniEngine] Failed to build StatLoggerManager") - self.logger_manager = None - def _initialize_janus_queues(self) -> None: """Initialize janus queues inside orchestrator thread loop context.""" self.request_queue = janus.Queue() @@ -912,10 +886,6 @@ def _bootstrap_orchestrator( loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) - # Expose the orchestrator loop so other threads (API server) can - # schedule coroutines onto it via run_coroutine_threadsafe, keeping - # single-threaded access to StatLoggerManager (mirrors AsyncLLM). - self.orchestrator_loop = loop async def _run_orchestrator() -> None: self._initialize_janus_queues() @@ -929,7 +899,6 @@ async def _run_orchestrator() -> None: stage_clients=self.stage_clients, output_processors=self.output_processors, stage_vllm_configs=self.stage_vllm_configs, - logger_manager=self.logger_manager, ) if not startup_future.done(): startup_future.set_result(asyncio.get_running_loop()) @@ -1554,29 +1523,6 @@ async def abort_async(self, request_ids: list[str]) -> None: """Async abort API.""" self.abort(request_ids) - async def do_log_stats(self) -> None: - """Flush the StatLoggerManager on the orchestrator thread. - - ``StatLoggerManager`` is only safe to access from the orchestrator - loop (where ``record()`` runs). 
Schedule ``log()`` onto that loop - via ``run_coroutine_threadsafe`` so all access stays single-threaded, - matching upstream vLLM ``AsyncLLM``. - """ - manager = self.logger_manager - if manager is None: - return - loop = getattr(self, "orchestrator_loop", None) - if loop is None or not loop.is_running(): - return - - async def _log() -> None: - manager.log() - - try: - await asyncio.wrap_future(asyncio.run_coroutine_threadsafe(_log(), loop)) - except Exception: - logger.exception("[AsyncOmniEngine] do_log_stats failed") - def collective_rpc( self, method: str, diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index e64fd3685c..386b545eb7 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -22,8 +22,6 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.v1.engine import EngineCoreOutputs -from vllm.v1.metrics.loggers import StatLoggerManager -from vllm.v1.metrics.stats import IterationStats from vllm_omni.distributed.omni_connectors.adapter import compute_talker_prompt_ids_length from vllm_omni.engine import ( @@ -124,7 +122,6 @@ def __init__( stage_vllm_configs: list[Any], *, async_chunk: bool = False, - logger_manager: StatLoggerManager | None = None, ) -> None: self.request_async_queue = request_async_queue self.output_async_queue = output_async_queue @@ -136,8 +133,6 @@ def __init__( self.stage_clients: list[Any] = stage_clients self.output_processors: list[Any] = output_processors self.stage_vllm_configs: list[Any] = stage_vllm_configs - self.logger_manager: StatLoggerManager | None = logger_manager - self.log_stats = self.logger_manager is not None # Per-request state self.request_states: dict[str, OrchestratorRequestState] = {} @@ -629,13 +624,10 @@ async def _process_stage_outputs(self, stage_id: int, raw_outputs: EngineCoreOut """ processor = self.output_processors[stage_id] - num_outputs = len(raw_outputs.outputs) - iteration_stats = IterationStats() if (self.log_stats and num_outputs) else None - processed = processor.process_outputs( raw_outputs.outputs, raw_outputs.timestamp, - iteration_stats, + None, ) if processed.reqs_to_abort: @@ -644,22 +636,6 @@ async def _process_stage_outputs(self, stage_id: int, raw_outputs: EngineCoreOut if raw_outputs.scheduler_stats is not None: processor.update_scheduler_stats(raw_outputs.scheduler_stats) - # Mirror vLLM AsyncLLM output_handler: feed stats to the logger - # manager so LoggingStatLogger can periodically print KV cache / - # prefix cache hit rate, and PrometheusStatLogger can publish. - if self.logger_manager is not None: - try: - self.logger_manager.record( - engine_idx=stage_id, - scheduler_stats=raw_outputs.scheduler_stats, - iteration_stats=iteration_stats, - ) - except Exception: - logger.exception( - "[Orchestrator] stat logger record failed for stage-%s", - stage_id, - ) - return processed.request_outputs async def _handle_add_request(self, msg: dict[str, Any]) -> None: diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py index 0b25ce7141..129ef3c99d 100644 --- a/vllm_omni/entrypoints/async_omni.py +++ b/vllm_omni/entrypoints/async_omni.py @@ -743,8 +743,11 @@ async def is_tracing_enabled(self) -> bool: return False async def do_log_stats(self) -> None: - """Log statistics via the engine, mirroring vLLM ``AsyncLLM``.""" - await self.engine.do_log_stats() + """Log statistics. + + TODO: Forward to Orchestrator process via message. 
+ """ + pass async def get_supported_tasks(self) -> tuple[SupportedTask, ...]: """Return the task set exposed by the orchestrator-backed engine.""" From 0d4e975e1bf6c574babc7e8279db2b4ff612dd22 Mon Sep 17 00:00:00 2001 From: NATURE Date: Mon, 13 Apr 2026 16:01:14 +0800 Subject: [PATCH 144/204] [core]refactor communication layer: PR1(Added Refactor Infra Only) (#1555) Signed-off-by: natureofnature Co-authored-by: Hongsheng Liu --- .../test_chunk_scheduling_coordinator.py | 690 ++++++ tests/worker/test_omni_connector_mixin.py | 1419 +++++++++++ .../core/sched/omni_scheduling_coordinator.py | 380 +++ .../worker/diffusion_model_runner.py | 3 +- vllm_omni/outputs.py | 28 + vllm_omni/worker/gpu_ar_model_runner.py | 3 +- .../worker/gpu_generation_model_runner.py | 3 +- .../omni_connector_model_runner_mixin.py | 2125 +++++++++++++++++ vllm_omni/worker/payload_span.py | 64 + 9 files changed, 4712 insertions(+), 3 deletions(-) create mode 100644 tests/core/sched/test_chunk_scheduling_coordinator.py create mode 100644 tests/worker/test_omni_connector_mixin.py create mode 100644 vllm_omni/core/sched/omni_scheduling_coordinator.py create mode 100644 vllm_omni/worker/omni_connector_model_runner_mixin.py create mode 100644 vllm_omni/worker/payload_span.py diff --git a/tests/core/sched/test_chunk_scheduling_coordinator.py b/tests/core/sched/test_chunk_scheduling_coordinator.py new file mode 100644 index 0000000000..5e19465e22 --- /dev/null +++ b/tests/core/sched/test_chunk_scheduling_coordinator.py @@ -0,0 +1,690 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for OmniSchedulingCoordinator (formerly ChunkSchedulingCoordinator). + +These tests use mock request objects and mock queues. They do not require +GPU, vLLM runtime, or any connector. 
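+
+The call pattern exercised below is, roughly::
+
+    coord = OmniSchedulingCoordinator(
+        scheduler_max_num_seqs=10, stage_id=1, async_chunk=True
+    )
+    coord.process_pending_chunks(
+        waiting, running, chunk_ready_req_ids={"r1"}, chunk_finished_req_ids=set()
+    )
+    coord.restore_queues(waiting, running)
+
+where ``waiting`` / ``running`` stand in for the scheduler queues (the
+MockQueue / list pair defined in this file).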
+""" + +from __future__ import annotations + +import unittest +from types import SimpleNamespace + +import vllm_omni.core.sched.omni_scheduling_coordinator as coord_mod +from vllm_omni.core.sched.omni_scheduling_coordinator import ( + ChunkSchedulingCoordinator, + OmniSchedulingCoordinator, +) + +# ------------------------------------------------------------------ # +# Mock helpers +# ------------------------------------------------------------------ # + + +class _RequestStatus: + WAITING = "waiting" + RUNNING = "running" + WAITING_FOR_CHUNK = "waiting_for_chunk" + WAITING_FOR_INPUT = "waiting_for_input" + FINISHED_STOPPED = "finished_stopped" + + +# Patch RequestStatus for tests that don't import vllm +try: + from vllm.v1.request import RequestStatus +except ImportError: + RequestStatus = _RequestStatus # type: ignore[misc,assignment] + +if not hasattr(RequestStatus, "WAITING_FOR_INPUT"): + coord_mod.RequestStatus = _RequestStatus # type: ignore[assignment] + RequestStatus = _RequestStatus # type: ignore[misc,assignment] + + +def _make_request(req_id: str, status: str = "waiting") -> SimpleNamespace: + return SimpleNamespace( + request_id=req_id, + external_req_id=req_id, + status=status, + additional_information=None, + prompt_token_ids=[], + num_prompt_tokens=0, + num_computed_tokens=0, + _all_token_ids=[], + _output_token_ids=[], + ) + + +class MockQueue: + """Simplified queue that mimics the Scheduler waiting queue interface.""" + + def __init__(self, items: list | None = None): + self._items: list = list(items or []) + + def __iter__(self): + return iter(self._items) + + def __len__(self): + return len(self._items) + + def __contains__(self, item): + return item in self._items + + def add_request(self, request): + self._items.append(request) + + def prepend_requests(self, requests): + self._items = list(requests) + self._items + + def remove(self, request): + self._items.remove(request) + + def remove_requests(self, requests): + remove_set = set(id(r) for r in requests) + self._items = [r for r in self._items if id(r) not in remove_set] + + +# ------------------------------------------------------------------ # +# Tests +# ------------------------------------------------------------------ # + + +class TestChunkCoordinatorStateTransition(unittest.TestCase): + """Test 5: process_pending_chunks transitions WAITING_FOR_CHUNK → target.""" + + def test_ready_request_transitions_to_waiting(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1, async_chunk=True) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids={"r1"}, + chunk_finished_req_ids=set(), + ) + + self.assertEqual(req.status, RequestStatus.WAITING) + self.assertIn("r1", coord.requests_with_ready_chunks) + + def test_non_ready_stays_waiting_for_chunk(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1, async_chunk=True) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids=set(), + chunk_finished_req_ids=set(), + ) + + self.assertEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) + + def test_stage_0_is_noop(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=0) + req = _make_request("r1") + waiting = MockQueue([req]) + running: list = [] + + 
coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids={"r1"}, + chunk_finished_req_ids=set(), + ) + self.assertNotEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) + + +class TestChunkCoordinatorRestoreQueues(unittest.TestCase): + """Test 6: restore_queues returns waiting-for-chunk requests.""" + + def test_restore(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + r1 = _make_request("r1") + r2 = _make_request("r2") + coord._waiting_for_chunk_waiting.append(r1) + coord._waiting_for_chunk_running.append(r2) + + waiting = MockQueue() + running: list = [] + + coord.restore_queues(waiting, running) + + self.assertIn(r1, waiting) + self.assertIn(r2, running) + self.assertEqual(len(coord._waiting_for_chunk_waiting), 0) + self.assertEqual(len(coord._waiting_for_chunk_running), 0) + + +class TestChunkCoordinatorFinishedSignal(unittest.TestCase): + """Test 8: chunk_finished_req_ids → finished_requests.""" + + def test_finished_signal(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1, async_chunk=True) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids={"r1"}, + chunk_finished_req_ids={"r1"}, + ) + + self.assertIn("r1", coord.finished_requests) + + +class TestChunkCoordinatorUpdateRequestMetadata(unittest.TestCase): + """Test update_request_metadata applies scheduling metadata to requests.""" + + def test_ar_mode_no_longer_sets_additional_information(self): + """AR mode only processes scheduling metadata, not full payloads.""" + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + req = _make_request("r1") + requests = {"r1": req} + + # Only scheduling metadata is passed now (full payload stays in model runner) + request_metadata = {"r1": {"next_stage_prompt_len": 50}} + + coord.update_request_metadata(requests, request_metadata, model_mode="ar") + + # next_stage_prompt_len should update prompt_token_ids + self.assertEqual(len(req.prompt_token_ids), 50) + self.assertEqual(req.num_prompt_tokens, 50) + # additional_information should NOT be set + self.assertIsNone(getattr(req, "additional_information", None)) + + def test_generation_mode(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + req = _make_request("r1") + req.prompt_token_ids = [0, 0, 0] + requests = {"r1": req} + + request_metadata = { + "r1": { + "code_predictor_codes": [10, 20, 30], + "left_context_size": 25, + } + } + + coord.update_request_metadata(requests, request_metadata, model_mode="generation") + + self.assertEqual(req.prompt_token_ids, [10, 20, 30]) + self.assertEqual(req.num_computed_tokens, 0) + self.assertIsNone(req.additional_information) + self.assertEqual(req._omni_initial_model_buffer, {"left_context_size": 25}) + + +class TestChunkCoordinatorPostprocess(unittest.TestCase): + """Test postprocess_scheduler_output clears ready chunks.""" + + def test_clear_ready(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + coord.requests_with_ready_chunks = {"r1", "r2"} + + new_req = SimpleNamespace(req_id="r1") + cached_reqs = SimpleNamespace(req_ids=["r2"]) + scheduler_output = SimpleNamespace( + scheduled_new_reqs=[new_req], + scheduled_cached_reqs=cached_reqs, + ) + + coord.postprocess_scheduler_output(scheduler_output) + + self.assertEqual(coord.requests_with_ready_chunks, set()) + + +class 
TestWaitingForInputTransition(unittest.TestCase): + """Test B8: process_pending_full_payload_inputs transitions WAITING_FOR_INPUT.""" + + def test_transition_on_recv(self): + coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_full_payload_inputs( + waiting, + running, + stage_recv_req_ids={"r1"}, + ) + + self.assertEqual(req.status, RequestStatus.WAITING) + + def test_stays_waiting_for_input_if_not_received(self): + coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_full_payload_inputs( + waiting, + running, + stage_recv_req_ids=set(), + ) + + self.assertEqual(req.status, RequestStatus.WAITING_FOR_INPUT) + self.assertEqual(len(coord._waiting_for_input), 1) + + def test_stage_0_is_noop(self): + coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=0) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_full_payload_inputs( + waiting, + running, + stage_recv_req_ids={"r1"}, + ) + self.assertEqual(req.status, RequestStatus.WAITING_FOR_INPUT) + + def test_restore_queues_includes_waiting_for_input(self): + coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + r1 = _make_request("r1") + coord._waiting_for_input.append(r1) + + waiting = MockQueue() + running: list = [] + + coord.restore_queues(waiting, running) + + self.assertIn(r1, waiting) + self.assertEqual(len(coord._waiting_for_input), 0) + + def test_full_payload_mode_auto_transitions_waiting_to_waiting_for_input(self): + """In full_payload_mode (async_chunk=False), fresh WAITING requests on + non-Stage-0 should be transitioned to WAITING_FOR_INPUT.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=False, + ) + + req = _make_request("r1", status=RequestStatus.WAITING) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_full_payload_inputs( + waiting, + running, + stage_recv_req_ids=set(), + ) + + self.assertEqual(req.status, RequestStatus.WAITING_FOR_INPUT) + self.assertEqual(len(coord._waiting_for_input), 1) + self.assertEqual(len(coord.pending_input_registrations), 1) + + def test_async_chunk_mode_does_not_auto_transition(self): + """In async_chunk mode, fresh WAITING requests should NOT be + transitioned to WAITING_FOR_INPUT.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=True, + ) + + req = _make_request("r1", status=RequestStatus.WAITING) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_full_payload_inputs( + waiting, + running, + stage_recv_req_ids=set(), + ) + + self.assertEqual(req.status, RequestStatus.WAITING) + + def test_pending_input_registrations(self): + coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_full_payload_inputs( + waiting, + running, + stage_recv_req_ids=set(), + ) + + self.assertEqual(len(coord.pending_input_registrations), 1) + self.assertEqual(coord.pending_input_registrations[0].request_id, "r1") + + +class 
TestTimeoutDetection(unittest.TestCase): + """Regression tests for orphaned pending-recv timeout detection. + + Covers the full lifecycle: + 1. Request enters WAITING_FOR_CHUNK from either waiting or running queue + 2. restore_queues() moves it back to the scheduler queue + 3. Timeout fires via collect_timed_out_request_ids() + 4. Scheduler removes from both queues and calls _free_request() + """ + + def test_waiting_since_recorded_on_chunk_wait(self): + """_waiting_since is set when a request enters WAITING_FOR_CHUNK.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=True, + ) + req = _make_request("r1", status=RequestStatus.WAITING) + waiting = MockQueue([req]) + + coord.process_pending_chunks( + waiting, + [], + chunk_ready_req_ids=set(), + chunk_finished_req_ids=set(), + ) + + self.assertIn("r1", coord._waiting_since) + self.assertEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) + + def test_waiting_since_cleared_on_chunk_arrival(self): + """_waiting_since is cleared when a chunk arrives.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=True, + ) + req = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) + waiting = MockQueue([req]) + + coord.process_pending_chunks( + waiting, + [], + chunk_ready_req_ids={"r1"}, + chunk_finished_req_ids=set(), + ) + + self.assertNotIn("r1", coord._waiting_since) + + def test_waiting_since_recorded_on_input_wait(self): + """_waiting_since is set when a request enters WAITING_FOR_INPUT.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=False, + ) + req = _make_request("r1", status=RequestStatus.WAITING) + waiting = MockQueue([req]) + + coord.process_pending_full_payload_inputs( + waiting, + [], + stage_recv_req_ids=set(), + ) + + self.assertIn("r1", coord._waiting_since) + + def test_waiting_since_cleared_on_input_arrival(self): + """_waiting_since is cleared when input data arrives.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=False, + ) + req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) + coord._waiting_for_input.append(req) + coord._waiting_since["r1"] = 0.0 + + waiting = MockQueue() + coord.process_pending_full_payload_inputs( + waiting, + [], + stage_recv_req_ids={"r1"}, + ) + + self.assertNotIn("r1", coord._waiting_since) + self.assertEqual(req.status, RequestStatus.WAITING) + + def test_collect_timed_out_request_ids_no_timeout(self): + """No IDs returned when nothing has timed out.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + ) + import time + + coord._waiting_since["r1"] = time.monotonic() + + result = coord.collect_timed_out_request_ids(timeout_s=300.0) + self.assertEqual(result, set()) + + def test_collect_timed_out_request_ids_expired(self): + """Timed-out IDs are returned and _waiting_since is cleared.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + ) + coord._waiting_since["r1"] = 0.0 # epoch → definitely expired + coord._waiting_since["r2"] = 0.0 + + import time + + coord._waiting_since["r3"] = time.monotonic() + 9999 # far future + + result = coord.collect_timed_out_request_ids(timeout_s=1.0) + + self.assertEqual(result, {"r1", "r2"}) + self.assertNotIn("r1", coord._waiting_since) + self.assertNotIn("r2", coord._waiting_since) + self.assertIn("r3", coord._waiting_since) + + def test_collect_removes_from_coordinator_queues(self): + """Timed-out 
requests are defensively removed from internal queues.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + ) + r1 = _make_request("r1") + r2 = _make_request("r2") + coord._waiting_for_chunk_waiting.append(r1) + coord._waiting_for_input.append(r2) + coord._waiting_since["r1"] = 0.0 + coord._waiting_since["r2"] = 0.0 + + result = coord.collect_timed_out_request_ids(timeout_s=1.0) + + self.assertEqual(result, {"r1", "r2"}) + self.assertEqual(len(coord._waiting_for_chunk_waiting), 0) + self.assertEqual(len(coord._waiting_for_input), 0) + + def test_free_finished_request_clears_waiting_since(self): + """free_finished_request clears _waiting_since.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + ) + coord._waiting_since["r1"] = 0.0 + coord.free_finished_request("r1") + self.assertNotIn("r1", coord._waiting_since) + + def test_timeout_from_running_queue_full_lifecycle(self): + """End-to-end: request from running → WAITING_FOR_CHUNK → restore → + timeout → removed from running list. + + This is the critical regression case: WAITING_FOR_CHUNK requests + that originated from self.running are placed back into self.running + by restore_queues(), but their status remains WAITING_FOR_CHUNK. + The scheduler must remove from BOTH queues unconditionally. + """ + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=True, + ) + + # 1) Request starts in running queue with WAITING status + req = _make_request("r1", status=RequestStatus.WAITING) + running = [req] + waiting = MockQueue() + + # 2) process_pending_chunks: moves to WAITING_FOR_CHUNK + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids=set(), + chunk_finished_req_ids=set(), + ) + self.assertEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) + self.assertIn("r1", coord._waiting_since) + self.assertEqual(len(coord._waiting_for_chunk_running), 1) + + # 3) restore_queues: back to running (status stays WAITING_FOR_CHUNK) + coord.restore_queues(waiting, running) + self.assertIn(req, running) + self.assertEqual(len(coord._waiting_for_chunk_running), 0) + self.assertEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) + + # 4) Force timeout by setting _waiting_since to epoch + coord._waiting_since["r1"] = 0.0 + + timed_out_ids = coord.collect_timed_out_request_ids(timeout_s=1.0) + self.assertEqual(timed_out_ids, {"r1"}) + + # 5) Scheduler removes from both queues (simulating the scheduler path) + timed_out_id_set = {id(req)} + running = [r for r in running if id(r) not in timed_out_id_set] + waiting.remove_requests([req]) + + self.assertNotIn(req, running) + self.assertEqual(len(waiting), 0) + + def test_timeout_from_waiting_queue_full_lifecycle(self): + """End-to-end: request from waiting → WAITING_FOR_CHUNK → restore → + timeout → removed from waiting queue.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=True, + ) + + req = _make_request("r1", status=RequestStatus.WAITING) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids=set(), + chunk_finished_req_ids=set(), + ) + self.assertEqual(len(coord._waiting_for_chunk_waiting), 1) + + coord.restore_queues(waiting, running) + self.assertIn(req, waiting) + + coord._waiting_since["r1"] = 0.0 + timed_out_ids = coord.collect_timed_out_request_ids(timeout_s=1.0) + self.assertEqual(timed_out_ids, {"r1"}) + + waiting.remove_requests([req]) + 
self.assertEqual(len(waiting), 0) + + +class TestOverflowPreemption(unittest.TestCase): + """Tests for P1-1: overflow requests must get WAITING status. + + Overflow happens when multiple WAITING_FOR_CHUNK requests in + ``_waiting_for_chunk_running`` receive their chunk in the same cycle. + ``_process_chunk_queue`` restores them to RUNNING (``continue`` + path) while RUNNING requests without chunks are moved out. If the + net result exceeds ``scheduler_max_num_seqs``, the tail is pushed + to ``waiting_queue`` and must have status == WAITING. + """ + + def test_overflow_sets_waiting_status(self): + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=1, + stage_id=1, + async_chunk=True, + ) + + # r1 is currently RUNNING in the queue. + # r2, r3 were previously moved to _waiting_for_chunk_running. + r1 = _make_request("r1", status=RequestStatus.RUNNING) + r2 = _make_request("r2", status=RequestStatus.WAITING_FOR_CHUNK) + r3 = _make_request("r3", status=RequestStatus.WAITING_FOR_CHUNK) + + running = [r1] + waiting = MockQueue([]) + coord._waiting_for_chunk_running.extend([r2, r3]) + + # restore_queues puts r2, r3 back into running + coord.restore_queues(waiting, running) + self.assertEqual(len(running), 3) + + # Now process_pending_chunks with r2, r3 chunks ready: + # _process_chunk_queue will: + # r1 (RUNNING) → no chunk → move to _waiting_for_chunk_running + # r2 (WAITING_FOR_CHUNK, chunk ready) → set RUNNING, stay in running + # r3 (WAITING_FOR_CHUNK, chunk ready) → set RUNNING, stay in running + # running = [r2, r3], len=2 > max=1 → overflow + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids={"r2", "r3"}, + chunk_finished_req_ids=set(), + ) + + self.assertEqual(len(running), 1) + self.assertEqual(len(waiting), 1) + overflow_req = list(waiting)[0] + self.assertEqual( + overflow_req.status, + RequestStatus.WAITING, + f"Overflowed request should have WAITING status, got {overflow_req.status}", + ) + + def test_overflow_does_not_strand_request(self): + """Without the fix, the overflowed request would keep its + RUNNING status in the waiting queue and never be re-scheduled.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=1, + stage_id=1, + async_chunk=True, + ) + + r1 = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) + r2 = _make_request("r2", status=RequestStatus.WAITING_FOR_CHUNK) + coord._waiting_for_chunk_running.extend([r1, r2]) + + running: list = [] + waiting = MockQueue([]) + + coord.restore_queues(waiting, running) + self.assertEqual(len(running), 2) + + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids={"r1", "r2"}, + chunk_finished_req_ids=set(), + ) + + self.assertEqual(len(running), 1) + self.assertEqual(len(waiting), 1) + for req in waiting: + self.assertNotEqual(req.status, RequestStatus.RUNNING, "Overflowed request must not keep RUNNING status") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/worker/test_omni_connector_mixin.py b/tests/worker/test_omni_connector_mixin.py new file mode 100644 index 0000000000..0e162a37e5 --- /dev/null +++ b/tests/worker/test_omni_connector_mixin.py @@ -0,0 +1,1419 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for OmniConnectorModelRunnerMixin. + +These tests use a mock connector (in-memory dict store) and do not require +GPU or vLLM runtime. 
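+
+The common lifecycle exercised below looks roughly like::
+
+    host = MixinHost()
+    host.init_omni_connectors(vllm_config=None, model_config=_make_model_config())
+    ...  # send_chunk / recv_full_payload_inputs / send_kv_cache, depending on the test
+    output = host.get_omni_connector_output()  # drains the per-cycle ready/finished state
+    host.shutdown_omni_connectors()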
+""" + +from __future__ import annotations + +import time +import unittest +from types import SimpleNamespace +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from vllm_omni.outputs import OmniConnectorOutput +from vllm_omni.worker.omni_connector_model_runner_mixin import ( + OmniConnectorModelRunnerMixin, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + +# ------------------------------------------------------------------ # +# Mock helpers +# ------------------------------------------------------------------ # + + +class MockConnector: + """In-memory connector for testing (mimics OmniConnectorBase).""" + + def __init__(self, stage_id: int = 0): + self.stage_id = stage_id + self._store: dict[str, Any] = {} + + def put(self, from_stage, to_stage, put_key, data): + key = f"{from_stage}_{to_stage}_{put_key}" + self._store[key] = data + return True, len(str(data)), None + + def get(self, from_stage, to_stage, get_key, metadata=None): + key = f"{from_stage}_{to_stage}_{get_key}" + data = self._store.pop(key, None) + if data is None: + return None + return data, len(str(data)) + + def close(self): + pass + + +def _make_model_config( + stage_id: int = 0, + async_chunk: bool = False, + worker_type: str = "ar", + custom_func: str | None = None, +) -> SimpleNamespace: + return SimpleNamespace( + stage_connector_config=None, + async_chunk=async_chunk, + worker_type=worker_type, + custom_process_next_stage_input_func=custom_func, + ) + + +def _make_request(req_id: str, external_req_id: str | None = None): + r = SimpleNamespace( + request_id=req_id, + external_req_id=external_req_id or req_id, + additional_information=None, + prompt_token_ids=[], + num_computed_tokens=0, + ) + return r + + +class MixinHost(OmniConnectorModelRunnerMixin): + """Minimal class that mixes in the mixin for testing.""" + + pass + + +class _FakeTPGroup: + def __init__(self, *, world_size: int, rank_in_group: int, follower_result: Any = None): + self.world_size = world_size + self.rank_in_group = rank_in_group + self.follower_result = follower_result + self.broadcast_inputs: list[Any] = [] + + def broadcast_object(self, obj: Any | None = None, src: int = 0): + self.broadcast_inputs.append(obj) + if self.rank_in_group == src: + return obj + return self.follower_result + + +# ------------------------------------------------------------------ # +# Test cases +# ------------------------------------------------------------------ # + + +class TestMixinAsyncChunkSendRecv(unittest.TestCase): + """Test 2: Async chunk send/recv + bg threads.""" + + def test_send_chunk_passes_is_finished_and_connector(self): + connector = MockConnector(stage_id=0) + + sender = MixinHost() + sender.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=0, async_chunk=True), + ) + sender._omni_connector = connector + sender._stage_id = 0 + sender._async_chunk = True + + seen = {} + + def mock_process(transfer_manager, pooling_output, request, is_finished=False): + seen["connector"] = transfer_manager.connector + seen["is_finished"] = is_finished + return {"data": pooling_output, "finished": is_finished} + + sender._custom_process_func = mock_process + + request = _make_request("req-1", "ext-req-1") + request.is_finished = lambda: True + sender._send_single_request( + { + "stage_id": 0, + "next_stage_id": 1, + "request_id": "ext-req-1", + "request": request, + "pooling_output": {"value": 42}, + } + ) + self.assertIs(seen["connector"], connector) + 
self.assertTrue(seen["is_finished"]) + + sender.shutdown_omni_connectors() + + def test_send_chunk_does_not_retry_real_type_error(self): + connector = MockConnector(stage_id=0) + + sender = MixinHost() + sender.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=0, async_chunk=True), + ) + sender._omni_connector = connector + sender._stage_id = 0 + sender._async_chunk = True + + seen = {"calls": 0} + + def broken_process(transfer_manager, pooling_output, request, is_finished=""): + seen["calls"] += 1 + return {"data": is_finished + "tail"} + + sender._custom_process_func = broken_process + + request = _make_request("req-1", "ext-req-1") + request.is_finished = lambda: True + ok = sender.send_chunk(request, pooling_output={"value": 42}) + self.assertFalse(ok) + self.assertEqual(seen["calls"], 1) + + sender.shutdown_omni_connectors() + + +class TestMixinKVCacheTransfer(unittest.TestCase): + """Test 3: KV cache delegation to OmniKVTransferManager.""" + + def test_send_kv_delegates(self): + mock_kvm = MagicMock() + mock_kvm.handle_finished_requests_kv_transfer.return_value = ["req-1"] + + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + kv_transfer_manager=mock_kvm, + ) + + result = host.send_kv_cache( + finished_reqs={"req-1": {"seq_len": 10, "block_ids": [0]}}, + kv_caches=[], + block_size=16, + cache_dtype="float16", + ) + self.assertEqual(result, ["req-1"]) + mock_kvm.handle_finished_requests_kv_transfer.assert_called_once() + + host.shutdown_omni_connectors() + + def test_recv_kv_delegates(self): + mock_kvm = MagicMock() + mock_kvm.receive_kv_cache_for_request.return_value = ({"layer_blocks": {}}, 100) + + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + kv_transfer_manager=mock_kvm, + ) + + data, size = host.recv_kv_cache("req-1") + self.assertIsNotNone(data) + self.assertEqual(size, 100) + mock_kvm.receive_kv_cache_for_request.assert_called_once() + + host.shutdown_omni_connectors() + + def test_receive_multi_kv_fetches_companions_via_mixin(self): + mock_kvm = MagicMock() + + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + kv_transfer_manager=mock_kvm, + ) + + host.recv_kv_cache = MagicMock( + side_effect=[({"layer_blocks": {"k": [1]}}, 64), ({"layer_blocks": {"k": [2]}}, 32)] + ) + seen = {} + + def collect_cfg(request_id, cfg_role_payloads): + seen["request_id"] = request_id + seen["cfg_role_payloads"] = cfg_role_payloads + return {"cfg_text_kv_metadata": {"seq_len": 3}} + + req = SimpleNamespace( + request_id="req-1", + sampling_params=SimpleNamespace(cfg_kv_request_ids={"cfg_text": "req-1__cfg_text"}), + ) + ok = host.receive_multi_kv_cache(req, cfg_kv_collect_func=collect_cfg) + self.assertTrue(ok) + host.recv_kv_cache.assert_any_call("req-1", target_device=None) + host.recv_kv_cache.assert_any_call("req-1__cfg_text", target_device=None) + mock_kvm.apply_kv_cache_to_request.assert_called_once_with(req, {"layer_blocks": {"k": [1]}}) + self.assertEqual(seen["request_id"], "req-1") + self.assertEqual( + seen["cfg_role_payloads"], + {"cfg_text": ({"layer_blocks": {"k": [2]}}, 32)}, + ) + self.assertEqual(req.sampling_params.cfg_text_kv_metadata, {"seq_len": 3}) + + host.shutdown_omni_connectors() + + def test_receive_multi_kv_skips_inactive_request(self): + mock_kvm = MagicMock() + + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + 
model_config=_make_model_config(), + kv_transfer_manager=mock_kvm, + ) + + host.requests = {} + host.recv_kv_cache = MagicMock(return_value=({"layer_blocks": {"k": [1]}}, 64)) + req = SimpleNamespace(request_id="req-1", sampling_params=None) + + ok = host.receive_multi_kv_cache(req) + + self.assertFalse(ok) + host.recv_kv_cache.assert_not_called() + mock_kvm.apply_kv_cache_to_request.assert_not_called() + + host.shutdown_omni_connectors() + + +class TestOmniConnectorOutput(unittest.TestCase): + """Test 4: Output aggregation across transfer modes.""" + + def test_output_aggregation(self): + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + ) + + host._chunk_ready_req_ids.add("req-1") + host._chunk_finished_req_ids.add("req-2") + host._local_request_metadata["req-1"] = {"next_stage_prompt_len": 10} + host._stage_recv_req_ids.add("req-3") + + output = host.get_omni_connector_output() + self.assertIsInstance(output, OmniConnectorOutput) + self.assertEqual(output.chunk_ready_req_ids, {"req-1"}) + self.assertEqual(output.chunk_finished_req_ids, {"req-2"}) + self.assertEqual(output.request_metadata, {"req-1": {"next_stage_prompt_len": 10}}) + self.assertEqual(output.stage_recv_req_ids, {"req-3"}) + + output2 = host.get_omni_connector_output() + self.assertEqual(output2.chunk_ready_req_ids, set()) + self.assertEqual(output2.request_metadata, {}) + + host.shutdown_omni_connectors() + + +class TestMixinNoConnector(unittest.TestCase): + """Edge case: mixin works gracefully without a connector.""" + + def test_no_connector(self): + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + ) + self.assertIsNone(host._omni_connector) + + results = host.recv_full_payload_inputs(scheduler_output=None) + self.assertIsNone(results) + + sent = host.send_full_payload_outputs(None, {"req-1": {}}) + self.assertEqual(sent, []) + + ok = host.send_chunk(_make_request("req-1"), pooling_output={}) + self.assertFalse(ok) + + output = host.get_omni_connector_output() + self.assertIsInstance(output, OmniConnectorOutput) + + host.shutdown_omni_connectors() + + +class TestFinishedLoadReqsDrain(unittest.TestCase): + """Test A1 fix: get_omni_connector_output drains _finished_load_reqs.""" + + def test_finished_load_reqs_flow_to_chunk_ready(self): + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + ) + + host._finished_load_reqs.add("req-1") + host._finished_load_reqs.add("req-2") + + output = host.get_omni_connector_output() + self.assertIn("req-1", output.chunk_ready_req_ids) + self.assertIn("req-2", output.chunk_ready_req_ids) + + self.assertEqual(len(host._finished_load_reqs), 0) + self.assertEqual(len(host._chunk_ready_req_ids), 0) + + host.shutdown_omni_connectors() + + +class TestLoadCustomFuncSelection(unittest.TestCase): + def test_skips_legacy_stage_list_processors_for_full_payload_mode(self): + legacy_paths = [ + "vllm_omni.model_executor.stage_input_processors.mimo_audio.llm2code2wav", + "vllm_omni.model_executor.stage_input_processors.mammoth_moda2.ar2dit", + "vllm_omni.model_executor.stage_input_processors.cosyvoice3.text2flow", + "vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion", + ] + + for func_path in legacy_paths: + selected_path, func = MixinHost._load_custom_func( + SimpleNamespace( + async_chunk=False, + custom_process_input_func=func_path, + custom_process_next_stage_input_func=None, + ) + ) + assert selected_path != 
func_path + assert func is None or MixinHost._is_connector_payload_builder(func) + + +class TestFullPayloadSendWithCustomFunc(unittest.TestCase): + """Test B4: send_full_payload_outputs with full_payload_mode custom process func.""" + + def test_full_payload_send_passes_is_finished_and_connector(self): + seen = {} + + def full_payload_func(transfer_manager, pooling_output, request, is_finished=False): + seen["connector"] = transfer_manager.connector + seen["is_finished"] = is_finished + seen["data"] = pooling_output + seen["rid"] = request.request_id if request else None + return {"processed": True, "finished": is_finished} + + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + ) + host._omni_connector = MockConnector(stage_id=0) + host._stage_id = 0 + host._custom_process_func = full_payload_func + + req = _make_request("req-1") + req.is_finished = lambda: True + sent = host.send_full_payload_outputs( + scheduler_output=None, + outputs={"req-1": ({"raw": 100}, req)}, + ) + self.assertEqual(sent, ["req-1"]) + self.assertEqual( + seen, + { + "connector": host._omni_connector, + "is_finished": True, + "data": {"raw": 100}, + "rid": "req-1", + }, + ) + + host.shutdown_omni_connectors() + + def test_accumulate_and_flush(self): + call_log = [] + + def full_payload_func(transfer_manager, pooling_output, request): + call_log.append(request.request_id if request else None) + return {"processed": True} + + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + ) + host._omni_connector = MockConnector(stage_id=0) + host._stage_id = 0 + host._custom_process_func = full_payload_func + + req = _make_request("req-1") + host.accumulate_full_payload_output("req-1", {"raw": 42}, req) + self.assertEqual(len(host._pending_full_payload_send), 1) + + host.flush_full_payload_outputs({"req-1"}) + self.assertEqual(len(host._pending_full_payload_send), 0) + self.assertEqual(len(call_log), 1) + self.assertEqual(call_log[0], "req-1") + + time.sleep(0.1) + host.shutdown_omni_connectors() + + +class TestKVSentReqIdsAccumulation(unittest.TestCase): + """Test that kv_sent_req_ids accumulates results from send_kv_cache.""" + + def test_kv_sent_accumulation(self): + mock_kvm = MagicMock() + mock_kvm.handle_finished_requests_kv_transfer.return_value = ["req-1", "req-2"] + + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + kv_transfer_manager=mock_kvm, + ) + + host.send_kv_cache( + finished_reqs={"req-1": {}, "req-2": {}}, + kv_caches=[], + block_size=16, + cache_dtype="float16", + ) + + output = host.get_omni_connector_output() + self.assertIn("req-1", output.kv_sent_req_ids) + self.assertIn("req-2", output.kv_sent_req_ids) + + output2 = host.get_omni_connector_output() + self.assertEqual(output2.kv_sent_req_ids, []) + + host.shutdown_omni_connectors() + + +class TestChunkStreamCompletedGuard(unittest.TestCase): + """Test that register_chunk_recv is skipped after finish sentinel. + + This validates the fix for the race condition where the scheduling + coordinator re-registers a request for chunk polling after its + upstream chunk stream has already finished (is_finished sentinel + received), causing the bg recv thread to poll for a non-existent + shared-memory segment (e.g. ``_0_7`` when only 7 chunks 0–6 exist). 
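+
+    The guard itself is the ``_chunk_stream_completed`` set: once a request id
+    has been added to it, ``register_chunk_recv`` becomes a no-op for that id.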
+ """ + + def _make_host(self, stage_id: int = 1) -> MixinHost: + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=stage_id, async_chunk=True), + ) + host._omni_connector = MockConnector(stage_id=stage_id) + host._stage_id = stage_id + host._async_chunk = True + return host + + def test_register_blocked_after_finish_sentinel(self): + """register_chunk_recv must be a no-op after the finish sentinel.""" + host = self._make_host(stage_id=1) + + req = _make_request("req-1", "ext-req-1") + + # Simulate the bg thread having received the finish sentinel: + with host._lock: + host._chunk_stream_completed.add("req-1") + + # Now try to re-register — this mimics the coordinator asking + # the model runner to poll for the next (non-existent) chunk. + host.register_chunk_recv(req) + + # The request must NOT appear in _pending_load_reqs + self.assertNotIn( + "req-1", + host._pending_load_reqs, + "register_chunk_recv should skip requests whose chunk stream is already complete", + ) + + host.shutdown_omni_connectors() + + def test_register_allowed_before_finish(self): + """register_chunk_recv works normally before finish sentinel.""" + host = self._make_host(stage_id=1) + req = _make_request("req-1", "ext-req-1") + + host.register_chunk_recv(req) + self.assertIn( + "req-1", + host._pending_load_reqs, + "register_chunk_recv should add request to pending when stream is not yet complete", + ) + + host.shutdown_omni_connectors() + + def test_finish_sentinel_populates_completed_set(self): + """Receiving is_finished=True adds to _chunk_stream_completed.""" + host = self._make_host(stage_id=1) + + # Simulate _poll_single_request receiving is_finished=True + req_id = "req-1" + with host._lock: + host._chunk_finished_req_ids.add(req_id) + host._chunk_stream_completed.add(req_id) + host._local_stage_payload_cache[req_id] = {"finished": True} + host._local_request_metadata[req_id] = {} + host._finished_load_reqs.add(req_id) + host._pending_load_reqs.pop(req_id, None) + + self.assertIn(req_id, host._chunk_stream_completed) + + # Subsequent register_chunk_recv should be blocked + req = _make_request(req_id, f"ext-{req_id}") + host.register_chunk_recv(req) + self.assertNotIn(req_id, host._pending_load_reqs) + + host.shutdown_omni_connectors() + + def test_stage_0_always_skipped(self): + """Stage-0 has no upstream, register_chunk_recv is always no-op.""" + host = self._make_host(stage_id=0) + host._stage_id = 0 + + req = _make_request("req-1") + host.register_chunk_recv(req) + self.assertNotIn("req-1", host._pending_load_reqs) + + host.shutdown_omni_connectors() + + def test_full_payload_recv_guard_still_works(self): + """Pre-existing guard: staged full-payload results prevent registration.""" + host = self._make_host(stage_id=1) + + with host._lock: + host._stage_recv_req_ids.add("req-1") + + req = _make_request("req-1", "ext-req-1") + host.register_chunk_recv(req) + self.assertNotIn("req-1", host._pending_load_reqs) + + host.shutdown_omni_connectors() + + +class TestCleanupFinishedRequest(unittest.TestCase): + """Test cleanup_finished_request frees per-request mixin state.""" + + def _make_host(self, stage_id: int = 1) -> MixinHost: + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=stage_id, async_chunk=True), + ) + host._omni_connector = MockConnector(stage_id=stage_id) + host._stage_id = stage_id + host._async_chunk = True + return host + + def test_cleanup_removes_all_state(self): + 
"""cleanup_finished_request removes all tracking dicts/sets.""" + host = self._make_host(stage_id=1) + req_id = "req-1" + ext_id = "ext-req-1" + + # Simulate state accumulated during a request's lifetime + host._request_ids_mapping[req_id] = ext_id + host._put_req_chunk[ext_id] = 5 + host._get_req_chunk[req_id] = 3 + host._send_side_request_payload[ext_id] = {"some": "data"} + host._code_prompt_token_ids[ext_id] = [[1, 2, 3]] + host._chunk_stream_completed.add(req_id) + host._stage_recv_req_ids.add(req_id) + host._local_stage_payload_cache[req_id] = {"engine_inputs": {}} + host._local_request_metadata[req_id] = {"prompt_len": 10} + + # Cleanup + host.cleanup_finished_request(req_id) + + # All state should be gone + self.assertNotIn(req_id, host._request_ids_mapping) + self.assertNotIn(ext_id, host._put_req_chunk) + self.assertNotIn(req_id, host._get_req_chunk) + self.assertNotIn(ext_id, host._send_side_request_payload) + self.assertNotIn(ext_id, host._code_prompt_token_ids) + self.assertNotIn(req_id, host._chunk_stream_completed) + self.assertNotIn(req_id, host._stage_recv_req_ids) + self.assertNotIn(req_id, host._local_stage_payload_cache) + self.assertNotIn(req_id, host._local_request_metadata) + + host.shutdown_omni_connectors() + + def test_cleanup_removes_per_cycle_ready_state(self): + """cleanup_finished_request clears ready/finished carry-over for req-id reuse.""" + host = self._make_host(stage_id=1) + req_id = "req-1" + + host._pending_load_reqs[req_id] = _make_request(req_id, "ext-req-1") + host._finished_load_reqs.add(req_id) + host._chunk_ready_req_ids.add(req_id) + host._chunk_finished_req_ids.add(req_id) + + host.cleanup_finished_request(req_id) + + self.assertNotIn(req_id, host._pending_load_reqs) + self.assertNotIn(req_id, host._finished_load_reqs) + self.assertNotIn(req_id, host._chunk_ready_req_ids) + self.assertNotIn(req_id, host._chunk_finished_req_ids) + + host.shutdown_omni_connectors() + + def test_cleanup_without_mapping(self): + """cleanup works for Stage-0 where _request_ids_mapping isn't set.""" + host = self._make_host(stage_id=0) + host._stage_id = 0 + req_id = "req-1" + + # Stage-0 uses req_id directly (no ext_id mapping) + host._put_req_chunk[req_id] = 3 + host._get_req_chunk[req_id] = 0 + + host.cleanup_finished_request(req_id) + + self.assertNotIn(req_id, host._put_req_chunk) + self.assertNotIn(req_id, host._get_req_chunk) + + host.shutdown_omni_connectors() + + def test_prune_inactive_requests_cleans_stale_state_but_keeps_active(self): + """Inactive request IDs should be pruned without touching active ones.""" + host = self._make_host(stage_id=1) + active_req_id = "req-active" + stale_req_id = "req-stale" + stale_ext_id = "ext-stale" + + host._request_ids_mapping[active_req_id] = "ext-active" + host._request_ids_mapping[stale_req_id] = stale_ext_id + host._put_req_chunk[stale_ext_id] = 2 + host._get_req_chunk[stale_req_id] = 1 + host._finished_load_reqs.add(stale_req_id) + host._chunk_ready_req_ids.update({active_req_id, stale_req_id}) + host._chunk_finished_req_ids.add(stale_req_id) + host._chunk_stream_completed.add(stale_req_id) + host._stage_recv_req_ids.add(active_req_id) + host._send_side_request_payload[stale_ext_id] = {"stale": True} + host._code_prompt_token_ids[stale_ext_id] = [[1, 2, 3]] + + pruned = host.prune_inactive_requests({active_req_id}) + + self.assertEqual(pruned, {stale_req_id}) + self.assertIn(active_req_id, host._request_ids_mapping) + self.assertIn(active_req_id, host._chunk_ready_req_ids) + self.assertIn(active_req_id, 
host._stage_recv_req_ids) + self.assertNotIn(stale_req_id, host._request_ids_mapping) + self.assertNotIn(stale_ext_id, host._put_req_chunk) + self.assertNotIn(stale_req_id, host._get_req_chunk) + self.assertNotIn(stale_req_id, host._pending_load_reqs) + self.assertNotIn(stale_req_id, host._finished_load_reqs) + self.assertNotIn(stale_req_id, host._chunk_ready_req_ids) + self.assertNotIn(stale_req_id, host._chunk_finished_req_ids) + self.assertNotIn(stale_req_id, host._chunk_stream_completed) + self.assertNotIn(stale_req_id, host._stage_recv_req_ids) + self.assertNotIn(stale_ext_id, host._send_side_request_payload) + self.assertNotIn(stale_ext_id, host._code_prompt_token_ids) + + host.shutdown_omni_connectors() + + def test_prune_inactive_requests_keeps_recently_received_full_payload_state(self): + """Late bg-thread receives must survive until the scheduler catches up.""" + host = self._make_host(stage_id=1) + req_id = "req-recv-race" + ext_id = "ext-recv-race" + + host._request_ids_mapping[req_id] = ext_id + host._put_req_chunk[ext_id] = 1 + host._local_stage_payload_cache[req_id] = {"engine_inputs": {"ids": [1, 2, 3]}} + host._local_request_metadata[req_id] = {"next_stage_prompt_len": 3} + host._stage_recv_req_ids.add(req_id) + + pruned = host.prune_inactive_requests(set()) + + self.assertEqual(pruned, set()) + self.assertIn(req_id, host._request_ids_mapping) + self.assertIn(req_id, host._local_stage_payload_cache) + self.assertIn(req_id, host._local_request_metadata) + self.assertIn(req_id, host._stage_recv_req_ids) + self.assertIn(ext_id, host._put_req_chunk) + + # Once the scheduler has consumed the wake-up and the request really + # disappears from all protected sets, prune should clean it up. + host._stage_recv_req_ids.clear() + host._local_stage_payload_cache.clear() + host._local_request_metadata.clear() + + pruned = host.prune_inactive_requests(set()) + + self.assertEqual(pruned, {req_id}) + self.assertNotIn(req_id, host._request_ids_mapping) + self.assertNotIn(ext_id, host._put_req_chunk) + + host.shutdown_omni_connectors() + + +class TestSendChunkCachesMapping(unittest.TestCase): + """Test that send_chunk caches internal→external req ID mapping.""" + + def test_send_chunk_populates_request_ids_mapping(self): + """send_chunk should cache the internal→external mapping.""" + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=0, async_chunk=True), + ) + host._omni_connector = MockConnector(stage_id=0) + host._stage_id = 0 + host._async_chunk = True + + def mock_process(transfer_manager, pooling_output, request): + return {"data": "test", "finished": False} + + host._custom_process_func = mock_process + + request = _make_request("internal-1", "external-1") + host.send_chunk(request, pooling_output={"v": 1}) + + # The mapping should be cached + self.assertEqual( + host._request_ids_mapping.get("internal-1"), + "external-1", + ) + + time.sleep(0.1) + host.shutdown_omni_connectors() + + +class TestLocalPayloadCacheLifecycle(unittest.TestCase): + """Unit tests for the local payload cache API (RFC §2.4).""" + + def _make_host(self) -> MixinHost: + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=0), + ) + host._omni_connector = MockConnector(stage_id=0) + host._stage_id = 0 + return host + + def test_put_get_pop(self): + host = self._make_host() + payload = {"engine_inputs": {"ids": [1, 2, 3]}} + host.put_local_stage_payload("r1", payload) + + 
self.assertEqual(host.get_local_stage_payload("r1"), payload) + popped = host.pop_local_stage_payload("r1") + self.assertEqual(popped, payload) + self.assertIsNone(host.get_local_stage_payload("r1")) + host.shutdown_omni_connectors() + + def test_recv_full_payload_inputs_populates_local_cache(self): + host = self._make_host() + host._omni_connector = MockConnector(stage_id=0) + host._stage_id = 0 + + # Simulate a full payload already staged by the bg recv path + with host._lock: + host._local_stage_payload_cache["r1"] = {"tok": [10]} + host._stage_recv_req_ids.add("r1") + + host.recv_full_payload_inputs(scheduler_output=None) + self.assertEqual(host.get_local_stage_payload("r1"), {"tok": [10]}) + host.shutdown_omni_connectors() + + def test_rank0_only_polls_connector_for_tp_full_payload(self): + host = self._make_host() + host._omni_connector = MagicMock() + host._stage_id = 2 + host._local_rank = 0 + host._request_ids_mapping["r1"] = "ext-r1" + host._get_req_chunk["r1"] = 0 + payload = {"tok": [10], "finished": torch.tensor(True)} + connector_result = (payload, 123) + host._omni_connector.get.return_value = connector_result + tp_group = _FakeTPGroup(world_size=2, rank_in_group=0) + + with patch("vllm_omni.worker.omni_connector_model_runner_mixin.get_tp_group", return_value=tp_group): + made_progress = host._poll_single_request("r1") + + self.assertTrue(made_progress) + host._omni_connector.get.assert_called_once_with("1", "2", "ext-r1_1_0") + self.assertEqual(tp_group.broadcast_inputs, []) + self.assertEqual(host.get_local_stage_payload("r1"), payload) + self.assertIn("r1", host._full_payload_pending_broadcast_req_ids) + self.assertNotIn("r1", host._stage_recv_req_ids) + self.assertIsNone(host.get_local_request_metadata("r1")) + host.shutdown_omni_connectors() + + def test_tp_follower_skips_connector_poll_for_full_payload(self): + host = self._make_host() + host._omni_connector = MagicMock() + host._stage_id = 2 + host._local_rank = 1 + host._request_ids_mapping["r1"] = "ext-r1" + host._get_req_chunk["r1"] = 0 + tp_group = _FakeTPGroup(world_size=2, rank_in_group=1) + + with patch("vllm_omni.worker.omni_connector_model_runner_mixin.get_tp_group", return_value=tp_group): + made_progress = host._poll_single_request("r1") + + self.assertFalse(made_progress) + host._omni_connector.get.assert_not_called() + self.assertEqual(tp_group.broadcast_inputs, []) + self.assertNotIn("r1", host._local_stage_payload_cache) + host.shutdown_omni_connectors() + + def test_recv_full_payload_inputs_broadcasts_tp_leader_results_to_followers(self): + host = self._make_host() + host._omni_connector = MagicMock() + host._stage_id = 2 + host._local_rank = 1 + host._pending_load_reqs["r1"] = object() + payload = {"tok": [10], "finished": torch.tensor(True)} + tp_group = _FakeTPGroup(world_size=2, rank_in_group=1, follower_result={"r1": payload}) + + with patch("vllm_omni.worker.omni_connector_model_runner_mixin.get_tp_group", return_value=tp_group): + results = host.recv_full_payload_inputs(scheduler_output=None) + + self.assertEqual(results, {"r1": payload}) + self.assertEqual(host.get_local_stage_payload("r1"), payload) + self.assertEqual(host.get_local_request_metadata("r1"), {}) + self.assertEqual(host._stage_recv_req_ids, {"r1"}) + self.assertNotIn("r1", host._pending_load_reqs) + self.assertEqual(tp_group.broadcast_inputs, [None]) + host.shutdown_omni_connectors() + + +class TestTPAsyncChunkFanout(unittest.TestCase): + def _make_host(self, rank: int) -> MixinHost: + host = MixinHost() + 
host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=2, async_chunk=True, worker_type="gen"), + ) + host._omni_connector = MagicMock() + host._stage_id = 2 + host._async_chunk = True + host._model_mode = "gen" + host._local_rank = rank + host._request_ids_mapping["r1"] = "ext-r1" + host._get_req_chunk["r1"] = 0 + return host + + def test_rank0_only_polls_connector_for_tp_async_chunk(self): + host = self._make_host(rank=0) + payload = { + "code_predictor_codes": [10, 11], + "left_context_size": 0, + "finished": torch.tensor(False), + } + host._omni_connector.get.return_value = (payload, 123) + tp_group = _FakeTPGroup(world_size=2, rank_in_group=0) + + with patch("vllm_omni.worker.omni_connector_model_runner_mixin.get_tp_group", return_value=tp_group): + made_progress = host._poll_single_request("r1") + + self.assertTrue(made_progress) + host._omni_connector.get.assert_called_once_with("1", "2", "ext-r1_1_0") + self.assertEqual(host.get_local_stage_payload("r1"), payload) + self.assertIn("r1", host._finished_load_reqs) + self.assertIn("r1", host._async_chunk_updated_req_ids) + self.assertEqual(tp_group.broadcast_inputs, []) + host.shutdown_omni_connectors() + + def test_tp_follower_skips_connector_poll_for_async_chunk(self): + host = self._make_host(rank=1) + tp_group = _FakeTPGroup(world_size=2, rank_in_group=1) + + with patch("vllm_omni.worker.omni_connector_model_runner_mixin.get_tp_group", return_value=tp_group): + made_progress = host._poll_single_request("r1") + + self.assertFalse(made_progress) + host._omni_connector.get.assert_not_called() + self.assertIsNone(host.get_local_stage_payload("r1")) + self.assertEqual(tp_group.broadcast_inputs, []) + host.shutdown_omni_connectors() + + def test_get_output_broadcasts_tp_async_chunk_payloads_to_followers(self): + host = self._make_host(rank=1) + host._pending_load_reqs["r1"] = object() + payload = { + "code_predictor_codes": [10, 11], + "left_context_size": 0, + "finished": torch.tensor(True), + } + packet = { + "staged_payloads": {"r1": payload}, + "request_metadata": {"r1": {"code_predictor_codes": [10, 11], "left_context_size": 0}}, + "newly_finished": {"r1"}, + "chunk_finished": {"r1"}, + } + tp_group = _FakeTPGroup(world_size=2, rank_in_group=1, follower_result=packet) + + with patch("vllm_omni.worker.omni_connector_model_runner_mixin.get_tp_group", return_value=tp_group): + output = host.get_omni_connector_output() + + self.assertEqual(output.chunk_ready_req_ids, {"r1"}) + self.assertEqual(output.chunk_finished_req_ids, {"r1"}) + self.assertEqual( + output.request_metadata, + {"r1": {"code_predictor_codes": [10, 11], "left_context_size": 0}}, + ) + self.assertEqual(host.get_local_stage_payload("r1"), payload) + self.assertNotIn("r1", host._pending_load_reqs) + self.assertIn("r1", host._chunk_stream_completed) + self.assertEqual(tp_group.broadcast_inputs, [None]) + host.shutdown_omni_connectors() + + +class TestKVTransferLifecycle(unittest.TestCase): + """Unit tests for KV transfer lifecycle methods.""" + + def _make_host(self) -> MixinHost: + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=0), + ) + return host + + def test_mark_drain_ack_complete(self): + host = self._make_host() + self.assertFalse(host.has_pending_kv_work()) + + host.mark_kv_transfer("r1", seq_len=100, block_ids=[0, 1, 2]) + self.assertTrue(host.has_pending_kv_work()) + self.assertTrue(host.is_kv_transfer_triggered("r1")) + + # Drain moves pending → active + pending = 
host.drain_pending_kv_transfers() + self.assertEqual(pending, {"r1": {"seq_len": 100, "block_ids": [0, 1, 2]}}) + self.assertIn("r1", host._kv_active_transfers) + self.assertTrue(host.has_pending_kv_work()) + + # Ack moves active → completed + host.ack_kv_transfers(["r1"]) + self.assertNotIn("r1", host._kv_active_transfers) + self.assertIn("r1", host._kv_completed_transfers) + + # Drain completed + completed = host.drain_completed_kv_transfers() + self.assertEqual(completed, {"r1"}) + self.assertFalse(host.has_pending_kv_work()) + host.shutdown_omni_connectors() + + def test_mark_dedup(self): + host = self._make_host() + host.mark_kv_transfer("r1", seq_len=100, block_ids=[0]) + host.mark_kv_transfer("r1", seq_len=200, block_ids=[0, 1]) + # Second mark is a no-op + self.assertEqual(host._kv_pending_transfers["r1"]["seq_len"], 100) + host.shutdown_omni_connectors() + + def test_cleanup_removes_kv_state(self): + host = self._make_host() + host.mark_kv_transfer("r1", seq_len=50, block_ids=[0]) + host.drain_pending_kv_transfers() + host.cleanup_finished_request("r1") + self.assertFalse(host.is_kv_transfer_triggered("r1")) + self.assertNotIn("r1", host._kv_active_transfers) + self.assertFalse(host.has_pending_kv_work()) + host.shutdown_omni_connectors() + + +class TestAsyncPayloadLifecycle(unittest.TestCase): + """Regression tests for async payload delivery lifecycle.""" + + def test_send_side_request_payload_not_cleared_before_payload_is_consumable(self): + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=1, async_chunk=True, worker_type="ar"), + ) + host._request_ids_mapping["r1"] = "r1" + payload = { + "thinker_decode_embeddings": torch.ones(1, 2), + "thinker_output_token_ids": [1], + "override_keys": ["thinker_decode_embeddings", "thinker_output_token_ids"], + "finished": torch.tensor(False), + } + + host._accumulate_payload("r1", dict(payload)) + with host._lock: + host._finished_load_reqs.add("r1") + + host.get_omni_connector_output() + self.assertIn("r1", host._send_side_request_payload) + host.shutdown_omni_connectors() + + def test_payload_consumable_ignores_token_horizon_only_updates(self): + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=1, async_chunk=True, worker_type="ar"), + ) + payload = { + "thinker_output_token_ids": [1, 2, 3], + "finished": torch.tensor(False), + "override_keys": [ + "thinker_output_token_ids", + "thinker_decode_embeddings_token_start", + "thinker_decode_embeddings_token_end", + ], + "thinker_decode_embeddings_token_start": 2, + "thinker_decode_embeddings_token_end": 3, + } + self.assertFalse(host._payload_is_consumable(payload)) + host.shutdown_omni_connectors() + + def test_payload_consumable_accepts_decode_embeddings(self): + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=1, async_chunk=True, worker_type="ar"), + ) + payload = { + "thinker_output_token_ids": [1, 2, 3], + "thinker_decode_embeddings": torch.ones(1, 2), + "finished": torch.tensor(False), + } + self.assertTrue(host._payload_is_consumable(payload)) + host.shutdown_omni_connectors() + + def test_ar_metadata_only_followup_chunk_does_not_rewake_request(self): + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=1, async_chunk=True, worker_type="ar"), + ) + host._omni_connector = MagicMock() + host._stage_id = 1 + host._async_chunk = True + 
host._model_mode = "ar" + host._request_ids_mapping["r1"] = "ext-r1" + host._get_req_chunk["r1"] = 0 + + host._omni_connector.get.side_effect = [ + ( + { + "thinker_decode_embeddings": torch.ones(1, 2), + "finished": torch.tensor(False), + }, + 1, + ), + ( + { + "next_stage_prompt_len": 7, + "finished": torch.tensor(False), + }, + 1, + ), + ] + + host._poll_single_request("r1") + output1 = host.get_omni_connector_output() + self.assertEqual(output1.chunk_ready_req_ids, {"r1"}) + + host._poll_single_request("r1") + output2 = host.get_omni_connector_output() + self.assertEqual(output2.chunk_ready_req_ids, set()) + self.assertEqual(output2.request_metadata, {"r1": {"next_stage_prompt_len": 7}}) + + host.shutdown_omni_connectors() + + def test_non_ar_recv_does_not_overwrite_unconsumed_staged_chunk(self): + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=2, async_chunk=True, worker_type="gen"), + ) + host._omni_connector = MagicMock() + host._stage_id = 2 + host._async_chunk = True + host._model_mode = "gen" + host._request_ids_mapping["r1"] = "ext-r1" + host._get_req_chunk["r1"] = 1 + host._local_stage_payload_cache["r1"] = { + "code_predictor_codes": [1, 2, 3], + "left_context_size": 0, + "finished": torch.tensor(False), + } + + made_progress = host._poll_single_request("r1") + + self.assertFalse(made_progress) + host._omni_connector.get.assert_not_called() + self.assertEqual(host._get_req_chunk["r1"], 1) + + host.shutdown_omni_connectors() + + def test_non_ar_recv_waits_for_scheduler_handoff_before_fetching_next_chunk(self): + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=2, async_chunk=True, worker_type="gen"), + ) + host._omni_connector = MagicMock() + host._stage_id = 2 + host._async_chunk = True + host._model_mode = "gen" + host._request_ids_mapping["r1"] = "ext-r1" + host._get_req_chunk["r1"] = 1 + host._local_request_metadata["r1"] = { + "code_predictor_codes": [10, 11, 12], + "left_context_size": 0, + } + host._finished_load_reqs.add("r1") + + made_progress = host._poll_single_request("r1") + + self.assertFalse(made_progress) + host._omni_connector.get.assert_not_called() + self.assertEqual(host._get_req_chunk["r1"], 1) + + output = host.get_omni_connector_output() + self.assertEqual(output.request_metadata["r1"]["code_predictor_codes"], [10, 11, 12]) + self.assertEqual(output.chunk_ready_req_ids, {"r1"}) + + host._omni_connector.get.return_value = ( + { + "code_predictor_codes": [20, 21, 22], + "left_context_size": 0, + "finished": torch.tensor(False), + }, + 1, + ) + made_progress = host._poll_single_request("r1") + + self.assertTrue(made_progress) + host._omni_connector.get.assert_called_once() + self.assertEqual(host._get_req_chunk["r1"], 2) + + host.shutdown_omni_connectors() + + +class TestRankAwareKVRouting(unittest.TestCase): + def _make_host(self, *, from_tp: int, to_tp: int, local_rank: int) -> MixinHost: + host = MixinHost() + host.init_omni_connectors(vllm_config=None, model_config=_make_model_config(stage_id=1)) + host._from_tp = from_tp + host._to_tp = to_tp + host._local_rank = local_rank + return host + + def test_recv_keys_use_remote_rank_as_from_rank(self): + host = self._make_host(from_tp=4, to_tp=2, local_rank=1) + self.assertEqual( + host.get_rank_aware_kv_keys("req", from_stage=0), + ["req_0_0_2_1", "req_0_0_3_1"], + ) + host.shutdown_omni_connectors() + + def test_send_keys_route_from_rank_gt_to_rank(self): + host = 
self._make_host(from_tp=4, to_tp=2, local_rank=3) + self.assertEqual(host.get_rank_aware_kv_send_keys("req", from_stage=0), ["req_0_0_3_1"]) + host.shutdown_omni_connectors() + + def test_invalid_recv_rank_mapping_raises(self): + host = self._make_host(from_tp=3, to_tp=2, local_rank=1) + with self.assertRaises(ValueError): + host.get_rank_aware_kv_keys("req", from_stage=0) + host.shutdown_omni_connectors() + + def test_invalid_send_rank_mapping_raises(self): + host = self._make_host(from_tp=3, to_tp=2, local_rank=1) + with self.assertRaises(ValueError): + host.get_rank_aware_kv_send_keys("req", from_stage=0) + host.shutdown_omni_connectors() + + def test_merge_rank_sharded_payloads_concatenates_head_dimension(self): + host = self._make_host(from_tp=4, to_tp=2, local_rank=0) + payloads = [ + {"layer_blocks": {"key_cache": [torch.ones(2, 1, 3)], "value_cache": [torch.ones(2, 1, 3)]}}, + {"layer_blocks": {"key_cache": [torch.full((2, 1, 3), 2.0)], "value_cache": [torch.full((2, 1, 3), 2.0)]}}, + ] + merged = host._merge_rank_sharded_kv_payloads(payloads) + self.assertEqual(tuple(merged["layer_blocks"]["key_cache"][0].shape), (2, 2, 3)) + self.assertTrue(torch.equal(merged["layer_blocks"]["key_cache"][0][:, 0], torch.ones(2, 3))) + self.assertTrue(torch.equal(merged["layer_blocks"]["key_cache"][0][:, 1], torch.full((2, 3), 2.0))) + host.shutdown_omni_connectors() + + def test_slice_rank_sharded_payload_splits_head_dimension(self): + host = self._make_host(from_tp=2, to_tp=4, local_rank=1) + payload = { + "layer_blocks": { + "key_cache": [torch.arange(24, dtype=torch.float32).reshape(2, 4, 3)], + "value_cache": [torch.arange(24, dtype=torch.float32).reshape(2, 4, 3)], + }, + "metadata": {}, + } + sliced = host._slice_rank_sharded_kv_payload(payload) + self.assertEqual(tuple(sliced["layer_blocks"]["key_cache"][0].shape), (2, 2, 3)) + expected = torch.arange(24, dtype=torch.float32).reshape(2, 4, 3)[:, 2:4, :] + self.assertTrue(torch.equal(sliced["layer_blocks"]["key_cache"][0], expected)) + host.shutdown_omni_connectors() + + +class TestAttachOmniConnectorOutput(unittest.TestCase): + def test_wraps_empty_model_runner_output_when_signals_exist(self): + from vllm.v1.worker.gpu_model_runner import EMPTY_MODEL_RUNNER_OUTPUT + + host = MixinHost() + host.get_omni_connector_output = lambda: OmniConnectorOutput(chunk_ready_req_ids={"req-1"}) + + wrapped = host.attach_omni_connector_output(EMPTY_MODEL_RUNNER_OUTPUT) + + self.assertIsNot(wrapped, EMPTY_MODEL_RUNNER_OUTPUT) + self.assertEqual(wrapped.omni_connector_output.chunk_ready_req_ids, {"req-1"}) + + +class TestConnectorConfigValidation(unittest.TestCase): + def test_invalid_connector_name_raises(self): + host = MixinHost() + model_config = _make_model_config(stage_id=1) + model_config.stage_connector_config = {"name": " "} + + with self.assertRaisesRegex(RuntimeError, "missing connector name"): + host.init_omni_connectors(vllm_config=None, model_config=model_config) + + +class _FailingConnector: + """Connector whose put() fails a configurable number of times.""" + + def __init__(self, fail_count: int = 1, raise_on_fail: bool = False): + self._fail_count = fail_count + self._raise_on_fail = raise_on_fail + self.attempt = 0 + + def put(self, from_stage, to_stage, put_key, data): + self.attempt += 1 + if self.attempt <= self._fail_count: + if self._raise_on_fail: + raise ConnectionError("transient connector error") + return False, 0, None + return True, len(str(data)), None + + def get(self, *a, **kw): + return None + + def close(self): + pass + + 
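+# --------------------------------------------------------------------------- #
+# Illustrative sketch only (an assumption, not asserted by this suite): the
+# rank-aware KV routing tests above imply a head-dimension sharding scheme
+# where, for from_tp >= to_tp, each consumer rank pulls from
+# from_tp // to_tp producer ranks and each producer rank sends to exactly one
+# consumer rank. The helpers below restate that mapping in isolation; the
+# authoritative logic lives in
+# OmniConnectorModelRunnerMixin.get_rank_aware_kv_keys() and
+# get_rank_aware_kv_send_keys().
+# --------------------------------------------------------------------------- #
+def _sketch_recv_source_ranks(from_tp: int, to_tp: int, local_rank: int) -> list[int]:
+    """Producer ranks a consumer rank reads from (from_tp must divide evenly)."""
+    if from_tp % to_tp != 0:
+        raise ValueError("from_tp must be an integer multiple of to_tp")
+    ratio = from_tp // to_tp
+    return [local_rank * ratio + offset for offset in range(ratio)]
+
+
+def _sketch_send_target_rank(from_tp: int, to_tp: int, local_rank: int) -> int:
+    """Consumer rank a producer rank sends to (from_tp must divide evenly)."""
+    if from_tp % to_tp != 0:
+        raise ValueError("from_tp must be an integer multiple of to_tp")
+    return local_rank // (from_tp // to_tp)
+
+
+# Example: with from_tp=4 and to_tp=2, _sketch_recv_source_ranks(4, 2, 1) == [2, 3]
+# (consumer rank 1 pulls from producer ranks 2 and 3) and
+# _sketch_send_target_rank(4, 2, 3) == 1, matching the "req_0_0_2_1" /
+# "req_0_0_3_1" keys asserted in TestRankAwareKVRouting above.
+
+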
+class TestSendRetry(unittest.TestCase): + """Tests for P1-2: failed connector sends must be retried.""" + + def _make_sender(self, connector): + sender = MixinHost() + sender.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=0, async_chunk=True), + ) + sender._omni_connector = connector + sender._stage_id = 0 + sender._async_chunk = True + return sender + + def _make_task(self, req_id="r1"): + return { + "stage_id": 0, + "next_stage_id": 1, + "request_id": req_id, + "data": {"payload": "test"}, + } + + def test_send_single_request_returns_false_on_put_failure(self): + connector = _FailingConnector(fail_count=999) + sender = self._make_sender(connector) + + result = sender._send_single_request(self._make_task()) + self.assertFalse(result) + sender.shutdown_omni_connectors() + + def test_send_single_request_does_not_decrement_on_failure(self): + connector = _FailingConnector(fail_count=999) + sender = self._make_sender(connector) + sender._pending_save_counts["r1"] = 1 + + sender._send_single_request(self._make_task()) + self.assertEqual(sender._pending_save_counts.get("r1"), 1, "pending count must NOT be decremented on failure") + sender.shutdown_omni_connectors() + + def test_send_single_request_decrements_on_success(self): + connector = MockConnector(stage_id=0) + sender = self._make_sender(connector) + sender._pending_save_counts["r1"] = 1 + + result = sender._send_single_request(self._make_task()) + self.assertTrue(result) + self.assertNotIn("r1", sender._pending_save_counts, "pending count should be zero/removed on success") + sender.shutdown_omni_connectors() + + def test_requeue_or_drop_requeues_on_first_failure(self): + sender = self._make_sender(MockConnector(stage_id=0)) + task = self._make_task() + + sender._requeue_or_drop_failed_send(task) + + self.assertEqual(task.get("_retry_count"), 1) + with sender._lock: + dq = sender._pending_save_reqs.get("r1") + self.assertIsNotNone(dq) + self.assertEqual(len(dq), 1) + sender.shutdown_omni_connectors() + + def test_requeue_or_drop_drops_after_max_retries(self): + sender = self._make_sender(MockConnector(stage_id=0)) + sender._pending_save_counts["r1"] = 1 + task = self._make_task() + task["_retry_count"] = sender._MAX_SEND_RETRIES # already at max + + sender._requeue_or_drop_failed_send(task) + + with sender._lock: + dq = sender._pending_save_reqs.get("r1") + self.assertTrue(dq is None or len(dq) == 0, "task should NOT be re-enqueued after max retries") + self.assertNotIn("r1", sender._pending_save_counts, "pending count should be cleaned up on final drop") + sender.shutdown_omni_connectors() + + def test_save_loop_retries_on_exception(self): + """Integration: _save_loop retries a task when put() raises.""" + from collections import deque + + connector = _FailingConnector(fail_count=1, raise_on_fail=True) + sender = self._make_sender(connector) + task = self._make_task() + + with sender._lock: + sender._pending_save_reqs["r1"] = deque([task]) + sender._pending_save_counts["r1"] = 1 + + sender._stop_event.clear() + + def run_one_loop(): + sender._save_loop() + + sender._stop_event.set() # will exit after one iteration + # Run manually instead of threading + # Simulate: pop task, send fails, requeue + popped_task = None + with sender._lock: + dq = sender._pending_save_reqs.get("r1") + if dq: + popped_task = dq.popleft() + if not dq: + del sender._pending_save_reqs["r1"] + + if popped_task is not None: + success = False + try: + success = sender._send_single_request(popped_task) + except Exception: + 
pass + if not success: + sender._requeue_or_drop_failed_send(popped_task) + + # After first failure, task should be re-enqueued + with sender._lock: + dq = sender._pending_save_reqs.get("r1") + self.assertIsNotNone(dq) + self.assertEqual(len(dq), 1) + requeued = dq[0] + self.assertEqual(requeued.get("_retry_count"), 1) + + # Second attempt should succeed (connector now returns True) + success = sender._send_single_request(requeued) + self.assertTrue(success) + sender.shutdown_omni_connectors() + + +if __name__ == "__main__": + unittest.main() diff --git a/vllm_omni/core/sched/omni_scheduling_coordinator.py b/vllm_omni/core/sched/omni_scheduling_coordinator.py new file mode 100644 index 0000000000..c9d891afb4 --- /dev/null +++ b/vllm_omni/core/sched/omni_scheduling_coordinator.py @@ -0,0 +1,380 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Scheduling-side coordination for chunk and full_payload input waiting. + +Manages WAITING_FOR_CHUNK and WAITING_FOR_INPUT state transitions +based on readiness signals from OmniConnectorOutput, without ever +calling connector.put()/get(). + +This replaces the scheduling half of OmniChunkTransferAdapter; the +transport half lives in OmniConnectorModelRunnerMixin. +""" + +from __future__ import annotations + +import time +from collections import deque +from typing import Any + +from vllm.logger import init_logger +from vllm.v1.request import Request, RequestStatus + +logger = init_logger(__name__) + + +class OmniSchedulingCoordinator: + """Pure-scheduling coordinator for chunk and full_payload input waiting. + + The Scheduler owns an instance of this class. It consumes readiness + signals produced by the Model Runner's ``OmniConnectorModelRunnerMixin`` + (via ``OmniConnectorOutput``) and manages ``WAITING_FOR_CHUNK`` and + ``WAITING_FOR_INPUT`` state transitions accordingly. + """ + + def __init__(self, scheduler_max_num_seqs: int, stage_id: int = 0, async_chunk: bool = False): + self._stage_id = stage_id + self._scheduler_max_num_seqs = scheduler_max_num_seqs + self._async_chunk = async_chunk + + self.finished_requests: set[str] = set() + self.requests_with_ready_chunks: set[str] = set() + self._full_payload_input_received: set[str] = set() + + self._waiting_for_chunk_waiting: deque[Any] = deque() + self._waiting_for_chunk_running: deque[Any] = deque() + + # Request IDs that were newly registered for chunk recv this cycle. + # The engine/Model Runner should call register_chunk_recv() for these + # so the bg thread starts polling. + self.pending_chunk_registrations: list[Any] = [] + + # Requests waiting for full_payload stage input (WAITING_FOR_INPUT). + self._waiting_for_input: deque[Any] = deque() + self.pending_input_registrations: list[Any] = [] + + # Monotonic timestamp recording when each request first entered + # WAITING_FOR_CHUNK or WAITING_FOR_INPUT. Used by + # collect_timed_out_request_ids() to detect orphaned waits. + self._waiting_since: dict[str, float] = {} + + # ------------------------------------------------------------------ # + # Core scheduling methods + # ------------------------------------------------------------------ # + + def process_pending_chunks( + self, + waiting_queue: Any, + running_queue: list[Request], + chunk_ready_req_ids: set[str], + chunk_finished_req_ids: set[str], + ) -> None: + """Transition requests whose chunks have arrived. + + Args: + waiting_queue: Scheduler's waiting request queue. + running_queue: Scheduler's running request list. 
+ chunk_ready_req_ids: IDs with a newly arrived chunk this cycle. + chunk_finished_req_ids: IDs whose final chunk has arrived. + """ + if self._stage_id == 0 or not self._async_chunk: + return + + terminal_ready_req_ids = chunk_ready_req_ids.intersection(chunk_finished_req_ids) + self.finished_requests.update(chunk_finished_req_ids - terminal_ready_req_ids) + self.pending_chunk_registrations = [] + + self._process_chunk_queue( + waiting_queue, + self._waiting_for_chunk_waiting, + RequestStatus.WAITING, + chunk_ready_req_ids, + ) + self._process_chunk_queue( + running_queue, + self._waiting_for_chunk_running, + RequestStatus.RUNNING, + chunk_ready_req_ids, + ) + self.finished_requests.update(terminal_ready_req_ids) + + while len(running_queue) > self._scheduler_max_num_seqs: + request = running_queue.pop() + # Must reset status to WAITING so the scheduler treats it as + # schedulable work. KV blocks are NOT freed here (unlike a + # real preemption), so PREEMPTED would be incorrect. + request.status = RequestStatus.WAITING + waiting_queue.prepend_requests([request]) + + def process_pending_full_payload_inputs( + self, + waiting_queue: Any, + running_queue: list[Request], + stage_recv_req_ids: set[str], + ) -> None: + """Manage WAITING_FOR_INPUT lifecycle for full_payload_mode. + + For non-Stage-0 stages in full_payload_mode (``async_chunk=False``): + 1. Fresh WAITING requests are transitioned to WAITING_FOR_INPUT + and registered for bg-thread polling. + 2. WAITING_FOR_INPUT requests whose data has arrived (in + ``stage_recv_req_ids``) are transitioned back to WAITING. + """ + if self._stage_id == 0: + return + + self._full_payload_input_received.update(stage_recv_req_ids) + if not self._async_chunk and stage_recv_req_ids: + self.finished_requests.update(stage_recv_req_ids) + logger.debug( + "[Coordinator stage-%s] full_payload recv -> finished_requests: %s", + self._stage_id, + stage_recv_req_ids, + ) + self.pending_input_registrations = [] + + remaining: deque[Any] = deque() + for request in self._waiting_for_input: + if request.request_id in stage_recv_req_ids: + request.status = RequestStatus.WAITING + self._waiting_since.pop(request.request_id, None) + waiting_queue.add_request(request) + else: + remaining.append(request) + self._waiting_for_input = remaining + + if not self._async_chunk: + to_remove: list[Any] = [] + queue_snapshot = list(waiting_queue) + for request in queue_snapshot: + if request.status == RequestStatus.WAITING: + if request.request_id in self._full_payload_input_received: + continue + if request.request_id in self.requests_with_ready_chunks: + continue + if request.request_id in self.finished_requests: + continue + request.status = RequestStatus.WAITING_FOR_INPUT + self._waiting_since.setdefault(request.request_id, time.monotonic()) + to_remove.append(request) + self._waiting_for_input.append(request) + self.pending_input_registrations.append(request) + elif request.status == RequestStatus.WAITING_FOR_INPUT: + if request.request_id in stage_recv_req_ids: + request.status = RequestStatus.WAITING + self._waiting_since.pop(request.request_id, None) + else: + to_remove.append(request) + self._waiting_for_input.append(request) + self.pending_input_registrations.append(request) + for request in to_remove: + waiting_queue.remove(request) + + def process_pending_full_payload_inputs_legacy( + self, + waiting_queue: Any, + running_queue: list[Request], + stage_recv_req_ids: set[str], + ) -> None: + """Compatibility wrapper for ``process_pending_full_payload_inputs``.""" + 
self.process_pending_full_payload_inputs(waiting_queue, running_queue, stage_recv_req_ids) + + def free_finished_request(self, request_id: str) -> None: + """Prune internal tracking sets for a freed request to prevent unbounded growth.""" + self._full_payload_input_received.discard(request_id) + self.finished_requests.discard(request_id) + self.requests_with_ready_chunks.discard(request_id) + self._waiting_since.pop(request_id, None) + + def collect_timed_out_request_ids( + self, + timeout_s: float, + ) -> set[str]: + """Return IDs of requests that have been waiting longer than *timeout_s*. + + Uses ``_waiting_since`` timestamps (always up-to-date) to detect + timed-out requests. This method is safe to call at any point in + the scheduling cycle — it does **not** rely on coordinator internal + queues (which are empty after ``restore_queues()``). + + Clears ``_waiting_since`` for timed-out IDs and defensively removes + them from coordinator internal queues if present. The caller + (scheduler) should then remove the requests from its queues, + set ``FINISHED_ERROR``, and call ``_free_request()`` so that + ``cleanup_finished_request()`` fires in the model runner mixin. + """ + if timeout_s <= 0: + return set() + now = time.monotonic() + timed_out_ids: set[str] = set() + for req_id, start_time in self._waiting_since.items(): + if now - start_time > timeout_s: + timed_out_ids.add(req_id) + if not timed_out_ids: + return set() + + # Defensively remove from coordinator internal queues (may already + # be empty if restore_queues() has run). + for queue_attr in ( + "_waiting_for_chunk_waiting", + "_waiting_for_chunk_running", + "_waiting_for_input", + ): + queue = getattr(self, queue_attr) + remaining: deque[Any] = deque() + for request in queue: + if request.request_id not in timed_out_ids: + remaining.append(request) + setattr(self, queue_attr, remaining) + + for req_id in timed_out_ids: + self._waiting_since.pop(req_id, None) + logger.warning( + "[Coordinator stage-%s] Request %s timed out waiting for chunk/input (waited > %.0fs)", + self._stage_id, + req_id, + timeout_s, + ) + + return timed_out_ids + + def restore_queues( + self, + waiting_queue: Any, + running_queue: list[Request], + ) -> None: + """Return waiting-for-chunk/input requests to scheduling queues.""" + for request in self._waiting_for_chunk_waiting: + waiting_queue.add_request(request) + self._waiting_for_chunk_waiting = deque() + + if self._waiting_for_chunk_running: + running_queue.extend(self._waiting_for_chunk_running) + self._waiting_for_chunk_running = deque() + + for request in self._waiting_for_input: + waiting_queue.add_request(request) + self._waiting_for_input = deque() + + def update_request_metadata( + self, + requests: dict[str, Request], + request_metadata: dict[str, dict[str, Any]], + model_mode: str = "ar", + ) -> None: + """Apply received scheduling metadata to request objects. + + For AR mode: only scheduler-visible metadata is applied locally. + For Generation mode: updates ``request.prompt_token_ids``. + + Additionally, if the payload contains ``next_stage_prompt_len``, + updates the request's ``prompt_token_ids`` to the correct length. + """ + for req_id, metadata in request_metadata.items(): + request = requests.get(req_id) + if request is None: + continue + + # Handle next_stage_prompt_len if present (for models like Qwen3-Omni). + # Only apply when the request has not started decoding yet + # (no output tokens). Resetting a mid-decode request would + # destroy generated tokens and desync KV cache state. 
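+            # Illustrative example: upstream may publish metadata such as
+            # {"next_stage_prompt_len": 7}; the block below then resizes the
+            # prompt to 7 placeholder tokens, but only while the request has
+            # produced no decode output yet.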
+ if "next_stage_prompt_len" in metadata: + next_len = metadata["next_stage_prompt_len"] + if isinstance(next_len, int) and next_len > 0: + output_token_ids = getattr(request, "_output_token_ids", None) + has_decode_output = output_token_ids is not None and len(output_token_ids) > 0 + if has_decode_output: + logger.debug( + "[Coordinator stage-%s] Skipping prompt resize for req %s: " + "request already has %s output tokens", + self._stage_id, + req_id, + len(output_token_ids), + ) + else: + current_prompt_ids = getattr(request, "prompt_token_ids", []) or [] + current_prompt_len = len(current_prompt_ids) + if current_prompt_len != next_len or getattr(request, "num_prompt_tokens", None) != next_len: + new_prompt = [0] * next_len + request.prompt_token_ids = new_prompt + request.num_prompt_tokens = next_len + request._all_token_ids.clear() + request._all_token_ids.extend(new_prompt) + request._output_token_ids.clear() + request.num_computed_tokens = 0 + logger.debug( + "[Coordinator stage-%s] Updated prompt_token_ids length to %s for req %s", + self._stage_id, + next_len, + req_id, + ) + + if model_mode != "ar": + new_ids = metadata.get("code_predictor_codes", []) + runtime_seed = None + if "left_context_size" in metadata: + runtime_seed = { + "left_context_size": metadata["left_context_size"], + } + request._omni_initial_model_buffer = runtime_seed + if new_ids: + request.prompt_token_ids = new_ids + request.num_computed_tokens = 0 + + def postprocess_scheduler_output( + self, + scheduler_output: Any, + requests: dict[str, Request] | None = None, + ) -> None: + """Clear per-cycle ready state after scheduler output is materialized.""" + self._clear_chunk_ready(scheduler_output) + + # ------------------------------------------------------------------ # + # Internal helpers + # ------------------------------------------------------------------ # + + def _process_chunk_queue( + self, + queue: Any, + waiting_for_chunk_list: deque[Any], + target_status: RequestStatus, + chunk_ready_req_ids: set[str], + ) -> None: + queue_snapshot = list(queue) + for request in queue_snapshot: + if request.status != RequestStatus.WAITING_FOR_CHUNK: + if request.request_id in self.requests_with_ready_chunks: + continue + if request.request_id in self.finished_requests: + continue + if request.status == RequestStatus.WAITING_FOR_INPUT: + continue + if request.request_id in chunk_ready_req_ids: + self.requests_with_ready_chunks.add(request.request_id) + continue + self.pending_chunk_registrations.append(request) + request.status = RequestStatus.WAITING_FOR_CHUNK + self._waiting_since.setdefault(request.request_id, time.monotonic()) + else: + if request.request_id in chunk_ready_req_ids: + request.status = target_status + self.requests_with_ready_chunks.add(request.request_id) + self._waiting_since.pop(request.request_id, None) + continue + queue.remove(request) + waiting_for_chunk_list.append(request) + + def _clear_chunk_ready(self, scheduler_output: Any) -> None: + if scheduler_output.scheduled_new_reqs: + for req_data in scheduler_output.scheduled_new_reqs: + self.requests_with_ready_chunks.discard( + getattr(req_data, "req_id", None), + ) + + if scheduler_output.scheduled_cached_reqs: + for req_id in scheduler_output.scheduled_cached_reqs.req_ids: + self.requests_with_ready_chunks.discard(req_id) + + +# Backward-compatible alias +ChunkSchedulingCoordinator = OmniSchedulingCoordinator diff --git a/vllm_omni/diffusion/worker/diffusion_model_runner.py b/vllm_omni/diffusion/worker/diffusion_model_runner.py index 
32ea5bf64d..535f053c38 100644 --- a/vllm_omni/diffusion/worker/diffusion_model_runner.py +++ b/vllm_omni/diffusion/worker/diffusion_model_runner.py @@ -35,11 +35,12 @@ from vllm_omni.diffusion.worker.utils import DiffusionRequestState, RunnerOutput from vllm_omni.distributed.omni_connectors.kv_transfer_manager import OmniKVTransferManager from vllm_omni.platforms import current_omni_platform +from vllm_omni.worker.omni_connector_model_runner_mixin import OmniConnectorModelRunnerMixin logger = init_logger(__name__) -class DiffusionModelRunner: +class DiffusionModelRunner(OmniConnectorModelRunnerMixin): """ Model runner that handles model loading and execution for diffusion models. diff --git a/vllm_omni/outputs.py b/vllm_omni/outputs.py index 9a7bb67065..2c2c1d21c1 100644 --- a/vllm_omni/outputs.py +++ b/vllm_omni/outputs.py @@ -9,6 +9,33 @@ from vllm_omni.inputs.data import OmniPromptType +@dataclass +class OmniConnectorOutput: + """Communication results from Model Runner to Scheduler. + + Carries transfer readiness signals so the Scheduler can make scheduling + decisions without ever calling connector.put()/get() directly. + + Attributes: + chunk_ready_req_ids: Request IDs with newly arrived chunks this cycle. + chunk_finished_req_ids: Request IDs whose final chunk has arrived. + request_metadata: Lightweight scheduling metadata keyed by request ID + (e.g. next_stage_prompt_len, code_predictor_codes, left_context_size). + Full payloads are owned by the Model Runner's local cache. + kv_sent_req_ids: Request IDs whose KV cache was successfully sent. + stage_recv_req_ids: Request IDs that received batch stage inputs. + has_pending_kv_work: True if the mixin has pending, active, or + completed KV transfers that the scheduler should account for. + """ + + chunk_ready_req_ids: set[str] = field(default_factory=set) + chunk_finished_req_ids: set[str] = field(default_factory=set) + request_metadata: dict[str, dict[str, Any]] = field(default_factory=dict) + kv_sent_req_ids: list[str] = field(default_factory=list) + stage_recv_req_ids: set[str] = field(default_factory=set) + has_pending_kv_work: bool = False + + class OmniModelRunnerOutput(ModelRunnerOutput): """Model runner output for omni models. @@ -24,6 +51,7 @@ class OmniModelRunnerOutput(ModelRunnerOutput): # IDs of requests whose KV cache has been extracted from GPU/NPU to CPU. # The Scheduler can safely free the block tables for these requests. kv_extracted_req_ids: list[str] | None = None + omni_connector_output: OmniConnectorOutput | None = None @dataclass diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 72e745fb17..868140d265 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -40,6 +40,7 @@ from vllm_omni.distributed.omni_connectors.kv_transfer_manager import OmniKVTransferManager from vllm_omni.outputs import OmniModelRunnerOutput from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner +from vllm_omni.worker.omni_connector_model_runner_mixin import OmniConnectorModelRunnerMixin logger = init_logger(__name__) @@ -60,7 +61,7 @@ class ExecuteModelState(NamedTuple): slot_mappings: dict[str, torch.Tensor] | list[dict[str, torch.Tensor]] | None = None -class GPUARModelRunner(OmniGPUModelRunner): +class GPUARModelRunner(OmniGPUModelRunner, OmniConnectorModelRunnerMixin): """Autoregressive GPU model runner that returns hidden states per request. 
Follows the v0.12 two-phase execute/sample flow from GPUModelRunner, and diff --git a/vllm_omni/worker/gpu_generation_model_runner.py b/vllm_omni/worker/gpu_generation_model_runner.py index d95b676f6d..f10115c8e9 100644 --- a/vllm_omni/worker/gpu_generation_model_runner.py +++ b/vllm_omni/worker/gpu_generation_model_runner.py @@ -39,11 +39,12 @@ from vllm_omni.outputs import OmniModelRunnerOutput from vllm_omni.worker.gpu_ar_model_runner import ExecuteModelState from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner +from vllm_omni.worker.omni_connector_model_runner_mixin import OmniConnectorModelRunnerMixin logger = logging.getLogger(__name__) -class GPUGenerationModelRunner(OmniGPUModelRunner): +class GPUGenerationModelRunner(OmniGPUModelRunner, OmniConnectorModelRunnerMixin): """Generation model runner for vLLM-Omni (non-autoregressive). - Reuses GPUModelRunner preparation, multimodal handling, and TP/PP/DP glue. diff --git a/vllm_omni/worker/omni_connector_model_runner_mixin.py b/vllm_omni/worker/omni_connector_model_runner_mixin.py new file mode 100644 index 0000000000..e0df3ba3d7 --- /dev/null +++ b/vllm_omni/worker/omni_connector_model_runner_mixin.py @@ -0,0 +1,2125 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unified data-plane communication mixin for Model Runners. + +All connector.put()/get() calls are consolidated here. Background I/O +threads handle async_chunk and full_payload_mode transfers; KV cache is delegated to +the existing OmniKVTransferManager (to be absorbed later). + +The mixin reports transfer results via OmniConnectorOutput so that the +Scheduler can make scheduling decisions without ever touching a connector. +""" + +from __future__ import annotations + +import importlib +import inspect +import os +import threading +from collections import defaultdict, deque +from types import SimpleNamespace +from typing import TYPE_CHECKING, Any + +import torch +from vllm.distributed.parallel_state import get_tp_group +from vllm.logger import init_logger + +from vllm_omni.distributed.omni_connectors.factory import OmniConnectorFactory +from vllm_omni.distributed.omni_connectors.utils.config import ConnectorSpec +from vllm_omni.outputs import OmniConnectorOutput +from vllm_omni.worker.payload_span import ( + THINKER_DECODE_EMBEDDINGS_KEY, + THINKER_DECODE_TOKEN_END_KEY, + THINKER_DECODE_TOKEN_START_KEY, + THINKER_OUTPUT_TOKEN_IDS_KEY, + get_tensor_span, + merge_tensor_spans, +) + +if TYPE_CHECKING: + from vllm_omni.distributed.omni_connectors.connectors.base import ( + OmniConnectorBase, + ) + from vllm_omni.distributed.omni_connectors.kv_transfer_manager import ( + OmniKVTransferManager, + ) + +logger = init_logger(__name__) + + +class OmniConnectorModelRunnerMixin: + """Unified data-plane communication mixin for Model Runners. + + Provides three transfer modes through a single pair of bg I/O threads: + - **full_payload_mode**: ``recv_full_payload_inputs`` / ``send_full_payload_outputs`` + - **Streaming (async_chunk)**: ``recv_chunk`` / ``send_chunk`` + - **KV cache**: ``send_kv_cache`` / ``recv_kv_cache`` (delegates to + the existing ``OmniKVTransferManager``) + + The mixin owns connector instances and background threads. It never + touches scheduling queues -- readiness is communicated to the Scheduler + via ``OmniConnectorOutput``. 
+ """ + + # ------------------------------------------------------------------ # + # Init / Shutdown + # ------------------------------------------------------------------ # + + def init_omni_connectors( + self, + vllm_config: Any, + model_config: Any, + kv_transfer_manager: OmniKVTransferManager | None = None, + ) -> None: + """Initialize connectors and background threads. + + Args: + vllm_config: Full vLLM config object. + model_config: Stage-level model config with connector settings. + kv_transfer_manager: Existing KV transfer manager to delegate to. + """ + self._omni_connector: OmniConnectorBase | None = self._create_connector(model_config) + self._kv_transfer_manager = kv_transfer_manager + + self._async_chunk: bool = getattr(model_config, "async_chunk", False) + self._model_mode: str = getattr(model_config, "worker_type", "ar") + stage_id = getattr(model_config, "stage_id", 0) + if isinstance(stage_id, str): + stage_id = int(stage_id) + self._stage_id: int = stage_id if isinstance(stage_id, int) else 0 + + self._custom_process_func_path, self._custom_process_func = self._load_custom_func(model_config) + self._custom_process_supports_is_finished = self._custom_process_supports_is_finished_kwarg() + logger.info( + "[Stage-%s] init_omni_connectors: async_chunk=%s, custom_process_func=%s, connector=%s, func_path=%s", + self._stage_id, + self._async_chunk, + self._custom_process_func, + type(self._omni_connector).__name__ if self._omni_connector else None, + self._custom_process_func_path, + ) + + # -- next stage ID (from connector config or default stage_id + 1) -- + self._next_stage_id: int = self._resolve_next_stage_id(model_config) + + # -- heterogeneous TP rank support -- + rank_cfg = self._parse_rank_mapping(model_config) + self._from_tp: int = rank_cfg["from_tp"] + self._to_tp: int = rank_cfg["to_tp"] + self._local_rank: int = rank_cfg["local_rank"] + if self._kv_transfer_manager is not None: + self._kv_transfer_manager.kv_send_key_builder = self.get_rank_aware_kv_send_keys + self._kv_transfer_manager.kv_recv_key_builder = self.get_rank_aware_kv_keys + self._kv_transfer_manager.kv_payload_merger = self._merge_rank_sharded_kv_payloads + self._kv_transfer_manager.kv_payload_slicer = self._slice_rank_sharded_kv_payload + + # -- chunk index tracking (ported from OmniChunkTransferAdapter) -- + self._put_req_chunk: dict[str, int] = defaultdict(int) + self._get_req_chunk: dict[str, int] = defaultdict(int) + # Send-side async accumulation / staging buffer. Receive-side payload + # ownership lives in ``_local_stage_payload_cache``. + self._send_side_request_payload: dict[str, dict[str, Any]] = {} + self._code_prompt_token_ids: dict[str, list[list[int]]] = defaultdict(list) + self._request_ids_mapping: dict[str, str] = {} + + # -- async I/O state (shared by chunk + full_payload_mode) -- + self._pending_load_reqs: dict[str, Any] = {} + self._finished_load_reqs: set[str] = set() + self._pending_save_reqs: dict[str, deque] = {} + self._pending_save_counts: dict[str, int] = defaultdict(int) + self._deferred_send_cleanup: set[str] = set() + # -- per-cycle output accumulator -- + self._chunk_ready_req_ids: set[str] = set() + self._chunk_finished_req_ids: set[str] = set() + self._stage_recv_req_ids: set[str] = set() + self._full_payload_pending_broadcast_req_ids: set[str] = set() + self._async_chunk_updated_req_ids: set[str] = set() + + # -- Model Runner local payload cache (RFC §2.4) -- + # Full stage payloads land here first on the recv side. 
We + # intentionally do not write connector recv results straight into + # `model_intermediate_buffer`: runner-owned runtime state is + # materialized later by `_sync_local_stage_payloads()` on the + # model thread. This keeps recv timing separate from execute-step + # visibility and avoids mixing connector I/O with model runtime + # ownership. + self._local_stage_payload_cache: dict[str, dict[str, Any]] = {} + # Lightweight scheduling metadata pending delivery to the Scheduler. + self._local_request_metadata: dict[str, dict[str, Any]] = {} + + # -- persistent set of request IDs whose chunk stream is complete -- + # Prevents re-registration after the finish sentinel has been received. + self._chunk_stream_completed: set[str] = set() + + # -- full_payload_mode: accumulate latest pooler_output per request, + # send only when the request finishes (next-cycle flush) -- + self._pending_full_payload_send: dict[str, tuple[Any, Any]] = {} + + # -- KV sent accumulator -- + self._kv_sent_req_ids: list[str] = [] + + # -- KV transfer lifecycle (absorbed from scheduler) -- + # Requests marked for KV transfer: {req_id: {seq_len, block_ids}} + self._kv_pending_transfers: dict[str, dict[str, Any]] = {} + # Requests whose KV transfer has been submitted but not yet acked + self._kv_active_transfers: set[str] = set() + # Requests whose KV transfer is complete (acked by kv_extracted_req_ids) + self._kv_completed_transfers: set[str] = set() + # Dedup guard: requests that have already triggered KV transfer + self._kv_triggered_requests: set[str] = set() + + self._lock = threading.Lock() + self._stop_event = threading.Event() + self._work_available = threading.Event() + + # Start background threads only when there's a connector + self._recv_thread: threading.Thread | None = None + self._save_thread: threading.Thread | None = None + if self._omni_connector is not None: + self._recv_thread = threading.Thread( + target=self._recv_loop, + daemon=True, + name="omni-mixin-recv", + ) + self._recv_thread.start() + self._save_thread = threading.Thread( + target=self._save_loop, + daemon=True, + name="omni-mixin-save", + ) + self._save_thread.start() + + def shutdown_omni_connectors(self) -> None: + """Stop background threads and release connector resources.""" + self._stop_event.set() + if self._recv_thread is not None: + self._recv_thread.join(timeout=5) + if self._save_thread is not None: + self._save_thread.join(timeout=5) + if self._omni_connector is not None: + try: + self._omni_connector.close() + except Exception: + pass + + def cleanup_finished_request(self, req_id: str) -> None: + """Clean up per-request state after a request is fully finished. + + Call this when a request is freed from the model runner to prevent + memory leaks in the mixin's tracking dicts/sets. The external + request ID is resolved before cleaning up ``_put_req_chunk`` which + is keyed by external ID. 
+ """ + ext_id = self._request_ids_mapping.pop(req_id, None) + send_req_id = ext_id if ext_id is not None else req_id + + with self._lock: + if self._pending_save_counts.get(send_req_id, 0): + self._deferred_send_cleanup.add(send_req_id) + else: + self._put_req_chunk.pop(send_req_id, None) + self._send_side_request_payload.pop(send_req_id, None) + self._code_prompt_token_ids.pop(send_req_id, None) + self._kv_pending_transfers.pop(req_id, None) + self._kv_active_transfers.discard(req_id) + self._kv_completed_transfers.discard(req_id) + self._kv_triggered_requests.discard(req_id) + self._cleanup_recv_delivery_state(req_id) + + def drop_inactive_request_delivery_state(self, req_id: str) -> None: + """Clear recv-side state for inactive requests.""" + ext_id = self._request_ids_mapping.pop(req_id, None) + if hasattr(self, "_lock"): + with self._lock: + self._drop_send_side_payload_state(req_id, ext_id) + else: + self._drop_send_side_payload_state(req_id, ext_id) + self._cleanup_recv_delivery_state(req_id) + + def _drop_send_side_payload_state(self, req_id: str, ext_id: str | None) -> None: + if ext_id is not None: + self._send_side_request_payload.pop(ext_id, None) + self._send_side_request_payload.pop(req_id, None) + + def _cleanup_recv_delivery_state(self, req_id: str) -> None: + """Clear recv-side delivery-cycle state.""" + if hasattr(self, "_lock"): + with self._lock: + self._clear_recv_delivery_state(req_id) + else: + self._clear_recv_delivery_state(req_id) + + def _clear_recv_delivery_state(self, req_id: str) -> None: + self._get_req_chunk.pop(req_id, None) + self._pending_load_reqs.pop(req_id, None) + self._finished_load_reqs.discard(req_id) + self._chunk_ready_req_ids.discard(req_id) + self._chunk_finished_req_ids.discard(req_id) + self._chunk_stream_completed.discard(req_id) + self._stage_recv_req_ids.discard(req_id) + self._full_payload_pending_broadcast_req_ids.discard(req_id) + self._async_chunk_updated_req_ids.discard(req_id) + self._local_stage_payload_cache.pop(req_id, None) + self._local_request_metadata.pop(req_id, None) + + def prune_inactive_requests(self, active_req_ids: Any) -> set[str]: + """Drop connector state for requests that no longer exist locally. + + Preempted / unscheduled requests are expected to stay in + ``self.requests`` and therefore remain untouched. This only prunes + stale request IDs that have already fallen out of the active request + map, preventing background recv/send bookkeeping from outliving the + request lifecycle. + """ + if active_req_ids is None: + return set() + + active_req_ids = set(active_req_ids) + pending_req_ids = set(getattr(self, "_pending_load_reqs", {}).keys()) + received_req_ids = set(getattr(self, "_stage_recv_req_ids", set())) + received_req_ids.update(getattr(self, "_full_payload_pending_broadcast_req_ids", set())) + received_req_ids.update(getattr(self, "_local_request_metadata", {}).keys()) + # Pending recv requests may not yet be in the caller's active set + # (e.g. WAITING_FOR_CHUNK requests live in the coordinator's internal + # queues, not in model runner self.requests). Protect them so that + # legitimate waiting requests are not pruned. + # + # Likewise, a full payload can arrive on the background recv thread + # after the scheduler_output snapshot for the current execute_model() + # cycle was already materialized. Those requests may briefly live only + # in recv-side buffers/local cache until the next scheduler cycle wakes + # them up; pruning them here drops the payload before stage_recv can be + # published. 
+ active_req_ids.update(pending_req_ids) + active_req_ids.update(received_req_ids) + stale_req_ids: set[str] = set() + + # NOTE: _pending_load_reqs is excluded from the scan list because + # all its entries are unconditionally protected above. The mixin + # cannot distinguish a legitimately-waiting pending recv from an + # orphaned one (only the coordinator/scheduler knows). + # + # Requests with freshly received full payloads / local stage payloads + # are also protected above. Their scheduler wake-up may lag the recv + # thread by one execute_model() cycle, especially when the request was + # added after the current scheduler_output snapshot. + # + # Orphaned pending recv entries (e.g. from upstream stage crash) + # are handled by OmniSchedulingCoordinator.collect_timed_out_request_ids() + # which detects wait-time violations. The scheduler then removes the + # request from its queues, sets FINISHED_ERROR, and calls _free_request() + # which ultimately triggers cleanup_finished_request() here. + for attr_name in ( + "_request_ids_mapping", + "_get_req_chunk", + "_finished_load_reqs", + "_chunk_ready_req_ids", + "_chunk_finished_req_ids", + "_chunk_stream_completed", + "_stage_recv_req_ids", + "_full_payload_pending_broadcast_req_ids", + "_async_chunk_updated_req_ids", + "_local_stage_payload_cache", + "_local_request_metadata", + "_kv_pending_transfers", + "_kv_active_transfers", + "_kv_completed_transfers", + "_kv_triggered_requests", + ): + state = getattr(self, attr_name, None) + if isinstance(state, dict): + stale_req_ids.update(req_id for req_id in state if req_id not in active_req_ids) + elif isinstance(state, set): + stale_req_ids.update(req_id for req_id in state if req_id not in active_req_ids) + + for req_id in stale_req_ids: + self.cleanup_finished_request(req_id) + + return stale_req_ids + + # ------------------------------------------------------------------ # + # Local payload cache (RFC §2.4 – Model Runner ownership) + # ------------------------------------------------------------------ # + + def put_local_stage_payload(self, req_id: str, payload: dict[str, Any]) -> None: + """Store a full stage payload in the local cache.""" + self._local_stage_payload_cache[req_id] = payload + + def get_local_stage_payload(self, req_id: str) -> dict[str, Any] | None: + """Read a stage payload without removing it.""" + return self._local_stage_payload_cache.get(req_id) + + def pop_local_stage_payload(self, req_id: str) -> dict[str, Any] | None: + """Remove and return a stage payload (consume after use).""" + return self._local_stage_payload_cache.pop(req_id, None) + + def put_local_request_metadata(self, req_id: str, metadata: dict[str, Any]) -> None: + """Store lightweight scheduling metadata for a request.""" + self._local_request_metadata[req_id] = metadata + + def get_local_request_metadata(self, req_id: str) -> dict[str, Any] | None: + """Retrieve scheduling metadata for a request.""" + return self._local_request_metadata.get(req_id) + + # ------------------------------------------------------------------ # + # Scheduling metadata extraction + # ------------------------------------------------------------------ # + + _SCHEDULING_METADATA_KEYS = ( + "next_stage_prompt_len", + "code_predictor_codes", + "left_context_size", + ) + + @classmethod + def _extract_scheduling_metadata(cls, payload: dict[str, Any]) -> dict[str, Any]: + """Extract only the fields the scheduler needs from a full payload.""" + return {k: payload[k] for k in cls._SCHEDULING_METADATA_KEYS if k in payload} + + 
_NON_CONSUMABLE_PAYLOAD_KEYS = { + "finished", + "override_keys", + "next_stage_prompt_len", + "left_context_size", + THINKER_OUTPUT_TOKEN_IDS_KEY, + THINKER_DECODE_TOKEN_START_KEY, + THINKER_DECODE_TOKEN_END_KEY, + } + + @staticmethod + def _payload_value_has_content(value: Any) -> bool: + if value is None: + return False + if isinstance(value, torch.Tensor): + return value.numel() > 0 + if isinstance(value, (list, tuple, dict, set)): + return len(value) > 0 + return True + + @classmethod + def _payload_is_consumable(cls, payload: dict[str, Any] | None) -> bool: + """Return True when an async payload can drive a real forward step. + + Metadata-only wake-ups should not transition WAITING_FOR_CHUNK requests + back to schedulable state. In particular, a widened token horizon without + any newly visible thinker decode embeds should not force a placeholder-only + talker decode step. + """ + if not isinstance(payload, dict) or not payload: + return False + + decode_embeddings = payload.get(THINKER_DECODE_EMBEDDINGS_KEY) + if isinstance(decode_embeddings, torch.Tensor): + if decode_embeddings.ndim == 0: + return True + return decode_embeddings.numel() > 0 and decode_embeddings.shape[0] > 0 + + if "code_predictor_codes" in payload: + code_predictor_codes = payload.get("code_predictor_codes") + if isinstance(code_predictor_codes, torch.Tensor): + return code_predictor_codes.numel() > 0 + # Codec code 0 is valid; non-empty code payloads are consumable. + if hasattr(code_predictor_codes, "__len__"): + return len(code_predictor_codes) > 0 + else: + return code_predictor_codes is not None + + for key, value in payload.items(): + if key in cls._NON_CONSUMABLE_PAYLOAD_KEYS: + continue + if cls._payload_value_has_content(value): + return True + return False + + @staticmethod + def _get_local_tp_group() -> Any | None: + """Return the local TP group when tensor parallelism is initialized.""" + try: + return get_tp_group() + except Exception: + return None + + def _recv_ordinary_stage_result( + self, + connector: OmniConnectorBase, + from_stage: str, + to_stage: str, + connector_get_key: str, + ) -> Any: + """Receive one ordinary non-KV stage payload on the local leader rank only.""" + tp_group = self._get_local_tp_group() + if tp_group is None or getattr(tp_group, "world_size", 1) <= 1: + return connector.get(from_stage, to_stage, connector_get_key) + if not self.is_data_transfer_rank(): + return None + return connector.get(from_stage, to_stage, connector_get_key) + + def _recv_full_payload_result( + self, + connector: OmniConnectorBase, + from_stage: str, + to_stage: str, + connector_get_key: str, + ) -> Any: + """Receive one full-payload transfer on the local leader rank only.""" + return self._recv_ordinary_stage_result( + connector, + from_stage, + to_stage, + connector_get_key, + ) + + def _recv_async_chunk_result( + self, + connector: OmniConnectorBase, + from_stage: str, + to_stage: str, + connector_get_key: str, + ) -> Any: + """Receive one ordinary async chunk on the local leader rank only.""" + return self._recv_ordinary_stage_result( + connector, + from_stage, + to_stage, + connector_get_key, + ) + + @staticmethod + def _snapshot_payload(payload: Any) -> Any: + if isinstance(payload, dict): + return dict(payload) + return payload + + def _broadcast_tp_payload_packet(self, packet: Any) -> Any: + """Broadcast one ordinary payload packet from TP rank 0 when TP is active.""" + tp_group = self._get_local_tp_group() + if tp_group is None or getattr(tp_group, "world_size", 1) <= 1: + return packet + 
leader_packet = packet if self.is_data_transfer_rank() else None + return tp_group.broadcast_object(leader_packet, src=0) + + def _apply_staged_payloads_locked(self, staged_payloads: dict[str, Any]) -> None: + for req_id, payload in staged_payloads.items(): + self._local_stage_payload_cache[req_id] = self._snapshot_payload(payload) + + def _collect_full_payload_results_locked(self) -> dict[str, Any] | None: + if not self._full_payload_pending_broadcast_req_ids: + return None + results: dict[str, Any] = {} + missing_req_ids: list[str] = [] + for req_id in tuple(self._full_payload_pending_broadcast_req_ids): + payload = self._local_stage_payload_cache.get(req_id) + if payload is None: + missing_req_ids.append(req_id) + continue + results[req_id] = self._snapshot_payload(payload) + self._full_payload_pending_broadcast_req_ids.discard(req_id) + if missing_req_ids: + logger.warning( + "[Stage-%s] _collect_full_payload_results_locked: " + "pending full-payload reqs missing from local cache: %s", + self._stage_id, + missing_req_ids, + ) + return results or None + + def _collect_async_chunk_fanout_packet_locked(self) -> dict[str, Any] | None: + payload_req_ids = set(self._async_chunk_updated_req_ids) + payload_req_ids.update(self._finished_load_reqs) + payload_req_ids.update(self._chunk_finished_req_ids) + payload_req_ids.update(self._local_request_metadata) + if not ( + payload_req_ids or self._finished_load_reqs or self._chunk_finished_req_ids or self._local_request_metadata + ): + return None + + staged_payloads = { + req_id: self._snapshot_payload(self._local_stage_payload_cache[req_id]) + for req_id in payload_req_ids + if req_id in self._local_stage_payload_cache + } + packet = { + "staged_payloads": staged_payloads, + "request_metadata": dict(self._local_request_metadata), + "newly_finished": set(self._finished_load_reqs), + "chunk_finished": set(self._chunk_finished_req_ids), + } + + self._async_chunk_updated_req_ids.clear() + self._finished_load_reqs.clear() + self._chunk_finished_req_ids.clear() + self._local_request_metadata.clear() + + for req_id in packet["chunk_finished"]: + if req_id not in self._local_stage_payload_cache: + continue + ext_req_id = self._request_ids_mapping.get(req_id, req_id) + self._send_side_request_payload.pop(ext_req_id, None) + if ext_req_id != req_id: + self._send_side_request_payload.pop(req_id, None) + + return packet + + def _apply_async_chunk_fanout_packet(self, packet: dict[str, Any]) -> None: + staged_payloads = packet.get("staged_payloads", {}) + chunk_finished = set(packet.get("chunk_finished", ())) + with self._lock: + self._apply_staged_payloads_locked(staged_payloads) + for req_id in chunk_finished: + self._pending_load_reqs.pop(req_id, None) + self._chunk_stream_completed.add(req_id) + + # ------------------------------------------------------------------ # + # full_payload_mode (recv_full_payload_inputs / send_full_payload_outputs) + # ------------------------------------------------------------------ # + + def recv_full_payload_inputs(self, scheduler_output: Any) -> dict[str, Any] | None: + """Check for incoming full_payload_mode stage inputs (non-blocking). + + Returns a dict mapping ``request_id -> engine_inputs`` for data + that has arrived, or ``None`` if nothing is ready. Stores full + payloads in the local cache and extracts scheduling metadata. 
+ """ + with self._lock: + results = self._collect_full_payload_results_locked() if self.is_data_transfer_rank() else None + results = self._broadcast_tp_payload_packet(results) + if not results: + return None + with self._lock: + self._stage_recv_req_ids.update(results.keys()) + for req_id in results: + self._pending_load_reqs.pop(req_id, None) + self._apply_staged_payloads_locked(results) + for req_id, payload in results.items(): + self._local_request_metadata[req_id] = self._extract_scheduling_metadata(payload) + logger.info( + "[Stage-%s] recv_full_payload_inputs: consumed %s reqs: %s, stage_recv_req_ids now=%s", + self._stage_id, + len(results), + list(results.keys()), + self._stage_recv_req_ids, + ) + return results + + @staticmethod + def _is_all_zero_tensor(t: Any) -> bool: + """Return True if *t* is a torch.Tensor whose elements are all zero.""" + return isinstance(t, torch.Tensor) and t.numel() > 0 and not t.any() + + def accumulate_full_payload_output( + self, + req_id: str, + pooler_output: Any, + request: Any, + ) -> None: + """Accumulate pooler_output for a request across steps (full_payload_mode). + + Per-token tensors (2-D+, matching trailing dims) are concatenated + along dim-0. Scalar / global tensors (1-D or 0-D) are replaced + with the latest value. + + All-zero tensors (e.g. ``code_predictor_codes`` emitted during + prefill) are dropped so that they do not pollute downstream stages + with garbage / noise frames. + + The data is actually sent when ``flush_full_payload_outputs`` is called + with the finished request IDs from the next scheduler cycle. + """ + # ---- Filter out all-zero tensors from the incoming pooler_output ---- + filtered: dict[str, Any] = {} + dropped_zero_keys: list[tuple[str, tuple[int, ...]]] = [] + for k, v in pooler_output.items(): + if self._is_all_zero_tensor(v): + dropped_zero_keys.append((k, tuple(v.shape))) + continue # skip prefill zero-filled placeholders + filtered[k] = v + if dropped_zero_keys: + logger.info( + "[Stage-%s] accumulate_full_payload_output: req=%s dropped_zero_keys=%s", + self._stage_id, + req_id, + dropped_zero_keys, + ) + pooler_output = filtered + + existing = self._pending_full_payload_send.get(req_id) + if existing is None: + self._pending_full_payload_send[req_id] = (pooler_output, request) + return + + prev_output, _ = existing + merged: dict[str, Any] = {} + for k in set(prev_output) | set(pooler_output): + v_new = pooler_output.get(k) + v_old = prev_output.get(k) + if v_new is None: + merged[k] = v_old + elif v_old is None: + merged[k] = v_new + elif ( + isinstance(v_new, torch.Tensor) + and isinstance(v_old, torch.Tensor) + and v_new.dim() >= 2 + and v_old.dim() >= 2 + and v_new.shape[1:] == v_old.shape[1:] + ): + merged[k] = torch.cat([v_old, v_new], dim=0) + else: + merged[k] = v_new + self._pending_full_payload_send[req_id] = (merged, request) + + def flush_full_payload_outputs(self, finished_req_ids: set[str]) -> None: + """Send accumulated full_payload outputs for requests that just finished.""" + logger.info( + "[Stage-%s] flush_full_payload_outputs: finished_req_ids=%s, pending=%s", + self._stage_id, + finished_req_ids, + list(self._pending_full_payload_send.keys()), + ) + to_send: dict[str, tuple[Any, Any]] = {} + for req_id in finished_req_ids: + entry = self._pending_full_payload_send.pop(req_id, None) + if entry is not None: + to_send[req_id] = entry + logger.info("[Stage-%s] flush_full_payload_outputs: to_send=%s", self._stage_id, list(to_send.keys())) + if to_send: + 
self.send_full_payload_outputs(scheduler_output=None, outputs=to_send) + + def send_full_payload_outputs( + self, + scheduler_output: Any, + outputs: dict[str, tuple[Any, Any] | Any], + ) -> list[str]: + """Send full_payload stage outputs to the next stage via connector. + + Args: + outputs: Mapping of ``req_id`` to either a + ``(pooling_output, request)`` tuple (preferred) or a raw + payload dict. When a tuple is supplied the request object + is forwarded to ``custom_process_stage_input_func``. + + Returns list of request IDs successfully enqueued. + """ + if self._omni_connector is None: + logger.info("[Stage-%s] send_full_payload_outputs: connector is None, skip", self._stage_id) + return [] + if not self.is_data_transfer_rank(): + logger.info( + "[Stage-%s] send_full_payload_outputs: not data_transfer_rank (rank=%s), skip", + self._stage_id, + self._local_rank, + ) + return list(outputs.keys()) + sent_ids: list[str] = [] + next_stage_id = self._next_stage_id + for req_id, value in outputs.items(): + if isinstance(value, tuple) and len(value) == 2: + raw_output, request = value + else: + raw_output, request = value, None + + payload = raw_output + if self._custom_process_func is not None: + payload = self._build_custom_process_payload( + request_id=req_id, + request=request, + pooling_output=raw_output, + ) + if payload is None: + continue + if payload is None: + logger.info("[Stage-%s] send_full_payload_outputs: payload is None for %s", self._stage_id, req_id) + continue + if isinstance(payload, dict): + code_predictor_codes = payload.get("code_predictor_codes") + if isinstance(code_predictor_codes, torch.Tensor): + code_len = int(code_predictor_codes.numel()) + elif hasattr(code_predictor_codes, "__len__"): + code_len = len(code_predictor_codes) + else: + code_len = None + logger.info( + "[Stage-%s] send_full_payload_outputs: req=%s payload_keys=%s code_len=%s left_context_size=%s", + self._stage_id, + req_id, + sorted(payload.keys()), + code_len, + payload.get("left_context_size"), + ) + + external_req_id = self._resolve_external_req_id(request, req_id) + chunk_id = self._put_req_chunk[req_id] + self._put_req_chunk[req_id] += 1 + connector_put_key = f"{external_req_id}_{self._stage_id}_{chunk_id}" + + logger.info( + "[Stage-%s] send_full_payload_outputs: enqueue req=%s put_key=%s next_stage=%s", + self._stage_id, + req_id, + connector_put_key, + next_stage_id, + ) + task = { + "stage_id": self._stage_id, + "next_stage_id": next_stage_id, + "put_key": connector_put_key, + "data": payload, + "request_id": req_id, + } + with self._lock: + self._pending_save_reqs.setdefault(req_id, deque()).append(task) + self._pending_save_counts[req_id] += 1 + sent_ids.append(req_id) + if sent_ids: + self._work_available.set() + return sent_ids + + def recv_stage_inputs(self, scheduler_output: Any) -> dict[str, Any] | None: + """Compatibility wrapper for ``recv_full_payload_inputs``.""" + return self.recv_full_payload_inputs(scheduler_output) + + def accumulate_batch_output( + self, + req_id: str, + pooler_output: Any, + request: Any, + ) -> None: + """Compatibility wrapper for ``accumulate_full_payload_output``.""" + self.accumulate_full_payload_output(req_id, pooler_output, request) + + def flush_batch_outputs(self, finished_req_ids: set[str]) -> None: + """Compatibility wrapper for ``flush_full_payload_outputs``.""" + self.flush_full_payload_outputs(finished_req_ids) + + def send_stage_outputs( + self, + scheduler_output: Any, + outputs: dict[str, tuple[Any, Any] | Any], + ) -> list[str]: + 
"""Compatibility wrapper for ``send_full_payload_outputs``.""" + return self.send_full_payload_outputs(scheduler_output, outputs) + + # ------------------------------------------------------------------ # + # Streaming chunk mode (recv_chunk / send_chunk) + # ------------------------------------------------------------------ # + + def register_chunk_recv(self, request: Any) -> None: + """Register a request for async chunk retrieval by the bg thread. + + Stage-0 has no upstream producer so this is a no-op there. + Skips requests whose batch data has already been received to + prevent the bg thread from polling for non-existent chunks. + """ + if self._stage_id == 0: + return + request_id = request.request_id + self._request_ids_mapping[request_id] = getattr( + request, + "external_req_id", + request_id, + ) + with self._lock: + if request_id in self._stage_recv_req_ids: + return + # Don't re-register if the finish sentinel was already received + if request_id in self._chunk_stream_completed: + return + self._pending_load_reqs[request_id] = request + self._work_available.set() + + def recv_chunk(self) -> dict[str, Any]: + """Collect chunks received by the bg thread since last call. + + Returns a dict ``{request_id: chunk_payload}`` for newly arrived + chunks. Empty dict when nothing is ready. + + This method reads from ``_finished_load_reqs`` without clearing + it -- ``get_omni_connector_output()`` is the sole consumer that + drains and resets ``_finished_load_reqs`` at the end of each + ``execute_model`` cycle. + + Returns **shallow copies** of the cached payloads so that the + caller can read them without racing against the background recv + thread, which may concurrently mutate the live cache entries via + ``dict.update()``. + """ + with self._lock: + finished = set(self._finished_load_reqs) + if not finished: + return {} + # Snapshot the payloads under the lock to avoid racing with + # _poll_single_request which does existing.update(payload_data) + # on the same dict objects. + result = {} + for rid in finished: + payload = self._local_stage_payload_cache.get(rid) + result[rid] = dict(payload) if isinstance(payload, dict) else payload + + self._chunk_ready_req_ids.update(finished) + return result + + def send_chunk( + self, + request: Any, + pooling_output: Any | None = None, + ) -> bool: + """Derive and enqueue one chunk for async sending. + + Payload extraction runs in the caller thread (via + ``custom_process_stage_input_func``); the actual + ``connector.put()`` is done by the background save thread. + Non-KV data is identical across TP ranks; only rank 0 sends. + """ + if self._omni_connector is None: + logger.warning("[Stage-%s] send_chunk: connector is None", self._stage_id) + return False + if not self.is_data_transfer_rank(): + return True + raw_req_id = getattr(request, "request_id", None) or getattr(request, "req_id", None) + request_id = self._resolve_external_req_id(request, raw_req_id) + # Cache the internal→external mapping so that finish sentinels can + # resolve the external ID even after the request is freed. 
+ if raw_req_id and raw_req_id != request_id: + self._request_ids_mapping.setdefault(raw_req_id, request_id) + chunk_id = self._put_req_chunk[request_id] + + payload_data = self._build_custom_process_payload( + request_id=request_id, + request=request, + pooling_output=pooling_output, + ) + if payload_data is None: + if chunk_id == 0: + logger.warning( + "[Stage-%s] send_chunk: payload is None for req=%s chunk=%s (process_func=%s)", + self._stage_id, + request_id, + chunk_id, + self._custom_process_func, + ) + return False + + self._put_req_chunk[request_id] += 1 + next_stage_id = self._next_stage_id + connector_put_key = f"{request_id}_{self._stage_id}_{chunk_id}" + + if chunk_id == 0: + logger.info( + "[Stage-%s] send_chunk: first chunk enqueued, req=%s key=%s", + self._stage_id, + request_id, + connector_put_key, + ) + + task = { + "stage_id": self._stage_id, + "next_stage_id": next_stage_id, + "put_key": connector_put_key, + "data": payload_data, + "request_id": request_id, + } + with self._lock: + self._pending_save_reqs.setdefault(request_id, deque()).append(task) + self._pending_save_counts[request_id] += 1 + self._work_available.set() + return True + + # ------------------------------------------------------------------ # + # KV cache (delegates to OmniKVTransferManager) + # ------------------------------------------------------------------ # + + def send_kv_cache( + self, + finished_reqs: dict[str, dict[str, Any]], + kv_caches: list[torch.Tensor], + block_size: int, + cache_dtype: str, + request_id_resolver: Any | None = None, + ) -> list[str]: + """Send KV cache for finished requests. + + Delegates to the existing ``OmniKVTransferManager``. + """ + if self._kv_transfer_manager is None: + return list(finished_reqs.keys()) if finished_reqs else [] + result = self._kv_transfer_manager.handle_finished_requests_kv_transfer( + finished_reqs=finished_reqs, + kv_caches=kv_caches, + block_size=block_size, + cache_dtype=cache_dtype, + request_id_resolver=request_id_resolver, + ) + if result: + self._kv_sent_req_ids.extend(result) + return result + + def recv_kv_cache( + self, + request_id: str, + target_device: torch.device | None = None, + ) -> tuple[dict[str, Any] | None, int]: + """Receive KV cache for a request. + + Delegates to the existing ``OmniKVTransferManager``. + """ + if self._kv_transfer_manager is None: + return None, 0 + return self._kv_transfer_manager.receive_kv_cache_for_request( + request_id=request_id, + target_device=target_device, + ) + + def receive_cfg_companion_kv_payloads( + self, + cfg_request_ids: dict[str, str], + target_device: torch.device | None = None, + ) -> dict[str, tuple[dict[str, Any] | None, int]]: + """Receive raw CFG companion KV payloads keyed by role.""" + return { + role: self.recv_kv_cache(companion_rid, target_device=target_device) + for role, companion_rid in cfg_request_ids.items() + } + + def receive_multi_kv_cache( + self, + req: Any, + cfg_kv_collect_func: Any | None = None, + target_device: torch.device | None = None, + ) -> bool: + """Receive primary and optional companion KV caches for a request. + + The mixin owns the runner-facing orchestration: primary KV receive, + companion payload fetch, and applying any model-specific CFG fields back + onto ``req.sampling_params``. 
+ """ + if self._kv_transfer_manager is None: + return False + + request_id = getattr(req, "request_id", None) or ( + req.request_ids[0] if hasattr(req, "request_ids") and req.request_ids else None + ) + if not request_id: + logger.warning("Request has no ID, cannot receive KV cache") + return False + + active_requests = getattr(self, "requests", None) + if active_requests is not None and request_id not in active_requests: + logger.info("Skip receiving KV cache for inactive request %s", request_id) + return False + + primary_ok = False + data, _size = self.recv_kv_cache(request_id, target_device=target_device) + if data: + self._kv_transfer_manager.apply_kv_cache_to_request(req, data) + primary_ok = True + + cfg_ids = getattr(getattr(req, "sampling_params", None), "cfg_kv_request_ids", None) + if cfg_ids and cfg_kv_collect_func: + try: + cfg_role_payloads = self.receive_cfg_companion_kv_payloads( + cfg_ids, + target_device=target_device, + ) + cfg_kvs = cfg_kv_collect_func(request_id, cfg_role_payloads) + if cfg_kvs and hasattr(req, "sampling_params") and req.sampling_params is not None: + for key, value in cfg_kvs.items(): + setattr(req.sampling_params, key, value) + logger.info("Applied CFG KV caches: %s", list(cfg_kvs.keys())) + except Exception: + logger.exception("Failed to collect CFG KV caches for %s", request_id) + + return primary_ok + + # ------------------------------------------------------------------ # + # Rank-aware KV transfer routing + # ------------------------------------------------------------------ # + + def get_rank_aware_kv_keys( + self, + req_id: str, + from_stage: int, + to_stage: int | None = None, + chunk_id: int = 0, + ) -> list[str]: + """Build recv-side connector keys for all remote ranks this rank needs. + + For heterogeneous TP receive, the local rank is the target rank and must + fetch one or more source-rank shards keyed as ``from_rank -> to_rank``. 
+ """ + remote_ranks = self.get_kv_remote_ranks() + return [ + self.get_kv_connector_key( + req_id=req_id, + from_stage=from_stage, + chunk_id=chunk_id, + from_rank=remote_rank, + to_rank=self._local_rank, + ) + for remote_rank in remote_ranks + ] + + def get_kv_target_ranks_for_send(self) -> list[int]: + """Determine which target ranks this local rank should send KV shards to.""" + self._validate_kv_tp_topology() + if self._from_tp == self._to_tp: + return [self._local_rank] + if self._from_tp > self._to_tp: + tp_ratio = self._from_tp // self._to_tp + return [self._local_rank // tp_ratio] + tp_ratio = self._to_tp // self._from_tp + base_rank = self._local_rank * tp_ratio + return [base_rank + i for i in range(tp_ratio)] + + def get_rank_aware_kv_send_keys( + self, + req_id: str, + from_stage: int, + to_stage: int | None = None, + chunk_id: int = 0, + ) -> list[str]: + """Build send-side connector keys for this rank's KV shard(s).""" + target_ranks = self.get_kv_target_ranks_for_send() + return [ + self.get_kv_connector_key( + req_id=req_id, + from_stage=from_stage, + chunk_id=chunk_id, + from_rank=self._local_rank, + to_rank=target_rank, + ) + for target_rank in target_ranks + ] + + @staticmethod + def _merge_rank_sharded_kv_payloads(payloads: list[dict[str, Any]]) -> dict[str, Any] | None: + """Merge multiple source-rank KV shards for one target rank.""" + payloads = [payload for payload in payloads if isinstance(payload, dict)] + if not payloads: + return None + if len(payloads) == 1: + return payloads[0] + + merged = dict(payloads[0]) + layer_blocks = merged.get("layer_blocks") + if not isinstance(layer_blocks, dict): + return merged + + def _merge_tensor_lists(name: str) -> list[torch.Tensor | None]: + merged_list: list[torch.Tensor | None] = [] + cache_lists = [payload.get("layer_blocks", {}).get(name, []) for payload in payloads] + max_len = max((len(cache_list) for cache_list in cache_lists), default=0) + for idx in range(max_len): + tensors = [cache_list[idx] for cache_list in cache_lists if idx < len(cache_list)] + tensors = [tensor for tensor in tensors if isinstance(tensor, torch.Tensor)] + if not tensors: + merged_list.append(None) + elif len(tensors) == 1: + merged_list.append(tensors[0]) + else: + merged_list.append(torch.cat(tensors, dim=-2).contiguous()) + return merged_list + + merged["layer_blocks"] = { + "key_cache": _merge_tensor_lists("key_cache"), + "value_cache": _merge_tensor_lists("value_cache"), + } + metadata = dict(merged.get("metadata", {})) + metadata["merged_remote_rank_count"] = len(payloads) + merged["metadata"] = metadata + return merged + + def _slice_rank_sharded_kv_payload(self, payload: dict[str, Any] | None) -> dict[str, Any] | None: + """Slice a duplicated source-rank KV shard for ``from_tp < to_tp`` cases.""" + if payload is None or self._from_tp >= self._to_tp: + return payload + + tp_ratio = self._to_tp // self._from_tp + shard_index = self._local_rank % tp_ratio + layer_blocks = payload.get("layer_blocks") if isinstance(payload, dict) else None + if not isinstance(layer_blocks, dict): + return payload + + def _slice_tensor_list(name: str) -> list[torch.Tensor | None]: + sliced: list[torch.Tensor | None] = [] + for tensor in layer_blocks.get(name, []): + if not isinstance(tensor, torch.Tensor) or tensor.ndim < 2: + sliced.append(tensor) + continue + head_dim = tensor.shape[-2] + if head_dim % tp_ratio != 0: + sliced.append(tensor) + continue + per_rank = head_dim // tp_ratio + start = shard_index * per_rank + sliced.append(tensor.narrow(-2, 
start, per_rank).contiguous()) + return sliced + + payload = dict(payload) + payload["layer_blocks"] = { + "key_cache": _slice_tensor_list("key_cache"), + "value_cache": _slice_tensor_list("value_cache"), + } + metadata = dict(payload.get("metadata", {})) + metadata["sliced_for_local_rank"] = self._local_rank + payload["metadata"] = metadata + return payload + + def should_replicate_payload(self) -> bool: + """Whether non-KV payloads should be replicated across ranks. + + Data payloads (stage inputs, chunks) are identical after all-gather, + so only rank 0 transfers them. KV payloads are rank-specific and + all ranks participate. + """ + return self._local_rank != 0 + + def get_kv_rank_mapping(self) -> dict[str, Any]: + """Return the current rank mapping configuration. + + Useful for debugging and for downstream code that needs to know + the TP topology without re-parsing model config. + """ + return { + "from_tp": self._from_tp, + "to_tp": self._to_tp, + "local_rank": self._local_rank, + "remote_ranks": self.get_kv_remote_ranks(), + "is_data_transfer_rank": self.is_data_transfer_rank(), + } + + # ------------------------------------------------------------------ # + # KV transfer lifecycle (RFC – mixin-owned) + # ------------------------------------------------------------------ # + + def mark_kv_transfer( + self, + req_id: str, + seq_len: int, + block_ids: list[int], + custom_metadata: dict[str, Any] | None = None, + ) -> None: + """Mark a request as needing KV cache transfer. + + Called by the scheduler when a transfer trigger fires. The mixin + owns the lifecycle from this point: pending → active → completed. + """ + if req_id in self._kv_pending_transfers: + return + self._kv_triggered_requests.add(req_id) + transfer = { + "seq_len": seq_len, + "block_ids": block_ids, + } + if custom_metadata is not None: + transfer["custom_metadata"] = custom_metadata + self._kv_pending_transfers[req_id] = transfer + + def drain_pending_kv_transfers(self) -> dict[str, dict[str, Any]]: + """Drain pending KV transfers and move them to active. + + Returns ``{req_id: {seq_len, block_ids}}`` for the model runner + to submit to ``send_kv_cache``. + """ + if not self._kv_pending_transfers: + return {} + pending = dict(self._kv_pending_transfers) + self._kv_active_transfers.update(pending.keys()) + self._kv_pending_transfers.clear() + return pending + + def ack_kv_transfers(self, req_ids: list[str] | set[str]) -> None: + """Acknowledge completed KV transfers (from kv_extracted_req_ids). + + Moves requests from active to completed so the scheduler can + safely free their blocks. + """ + for req_id in req_ids: + self._kv_active_transfers.discard(req_id) + self._kv_completed_transfers.add(req_id) + + def drain_completed_kv_transfers(self) -> set[str]: + """Drain and return completed KV transfer request IDs. + + The scheduler calls this to know which requests' blocks can be freed. 
+ """ + completed = set(self._kv_completed_transfers) + self._kv_completed_transfers.clear() + return completed + + def is_kv_transfer_triggered(self, req_id: str) -> bool: + """Check if a request has already triggered KV transfer.""" + return req_id in self._kv_triggered_requests + + def has_pending_kv_work(self) -> bool: + """True if any KV transfers are pending, active, or awaiting ack.""" + return bool(self._kv_pending_transfers or self._kv_active_transfers or self._kv_completed_transfers) + + # Output aggregation + # ------------------------------------------------------------------ # + + def _empty_output_with_connector_signals(self) -> Any: + """Return a minimal ModelRunnerOutput carrying pending connector signals. + + Used by early-return paths (e.g. ``num_scheduled_tokens == 0``) + that still need to deliver ``omni_connector_output`` to the + Scheduler so that WAITING_FOR_INPUT / WAITING_FOR_CHUNK + transitions are not lost. + """ + from vllm_omni.outputs import OmniModelRunnerOutput + + output = OmniModelRunnerOutput(req_ids=[], req_id_to_index={}) + output.omni_connector_output = self.get_omni_connector_output() + return output + + def get_omni_connector_output(self) -> OmniConnectorOutput: + """Collect and reset transfer results for this execute_model cycle. + + ``request_metadata`` carries only lightweight scheduling metadata. + Full payloads remain owned by the Model Runner local cache for all + paths. + """ + if not hasattr(self, "_lock"): + return OmniConnectorOutput() + + tp_group = self._get_local_tp_group() + if self._async_chunk and tp_group is not None and getattr(tp_group, "world_size", 1) > 1: + if self.is_data_transfer_rank(): + with self._lock: + fanout_packet = self._collect_async_chunk_fanout_packet_locked() + else: + fanout_packet = None + fanout_packet = self._broadcast_tp_payload_packet(fanout_packet) + if fanout_packet is None: + newly_finished = set() + chunk_finished = set() + request_metadata = {} + else: + if not self.is_data_transfer_rank(): + self._apply_async_chunk_fanout_packet(fanout_packet) + newly_finished = set(fanout_packet["newly_finished"]) + chunk_finished = set(fanout_packet["chunk_finished"]) + request_metadata = dict(fanout_packet["request_metadata"]) + else: + with self._lock: + newly_finished = set(self._finished_load_reqs) + self._finished_load_reqs.clear() + chunk_finished = set(self._chunk_finished_req_ids) + self._chunk_finished_req_ids.clear() + request_metadata = dict(self._local_request_metadata) + self._local_request_metadata.clear() + # _send_side_request_payload is the async accumulation buffer for + # future recv chunks. Clearing it on every consumable wake-up drops + # intermediate + # thinker decode spans before the model side can consume them. + # Only terminal chunk_finished requests may release that buffer. 
+ for req_id in chunk_finished: + if req_id not in self._local_stage_payload_cache: + continue + ext_req_id = self._request_ids_mapping.get(req_id, req_id) + self._send_side_request_payload.pop(ext_req_id, None) + if ext_req_id != req_id: + self._send_side_request_payload.pop(req_id, None) + self._chunk_ready_req_ids.update(newly_finished) + + output = OmniConnectorOutput( + chunk_ready_req_ids=set(self._chunk_ready_req_ids), + chunk_finished_req_ids=chunk_finished, + request_metadata=request_metadata, + kv_sent_req_ids=list(self._kv_sent_req_ids), + stage_recv_req_ids=set(self._stage_recv_req_ids), + has_pending_kv_work=self.has_pending_kv_work(), + ) + if output.stage_recv_req_ids or chunk_finished or newly_finished: + logger.info( + "[Stage-%s] get_omni_connector_output: stage_recv=%s, chunk_finished=%s, chunk_ready=%s", + self._stage_id, + output.stage_recv_req_ids, + chunk_finished, + output.chunk_ready_req_ids, + ) + self._chunk_ready_req_ids.clear() + self._kv_sent_req_ids.clear() + self._stage_recv_req_ids.clear() + return output + + @staticmethod + def _connector_output_has_signals(output: OmniConnectorOutput) -> bool: + return bool( + output.chunk_ready_req_ids + or output.chunk_finished_req_ids + or output.request_metadata + or output.kv_sent_req_ids + or output.stage_recv_req_ids + or output.has_pending_kv_work + ) + + def attach_omni_connector_output(self, result: Any | None) -> Any: + omni_output = self.get_omni_connector_output() + if not self._connector_output_has_signals(omni_output): + return result + + from copy import copy + + from vllm.v1.worker.gpu_model_runner import EMPTY_MODEL_RUNNER_OUTPUT + + wrapped = copy(result if result is not None else EMPTY_MODEL_RUNNER_OUTPUT) + wrapped.omni_connector_output = omni_output + return wrapped + + # ------------------------------------------------------------------ # + # Properties for compatibility with custom_process funcs that access + # transfer_manager.put_req_chunk / request_payload / code_prompt_token_ids + # ------------------------------------------------------------------ # + + @property + def put_req_chunk(self) -> dict[str, int]: + return self._put_req_chunk + + @property + def request_payload(self) -> dict[str, dict[str, Any]]: + return self._send_side_request_payload + + @request_payload.setter + def request_payload(self, value: dict[str, dict[str, Any]]) -> None: + self._send_side_request_payload = value + + @property + def code_prompt_token_ids(self) -> dict[str, list[list[int]]]: + return self._code_prompt_token_ids + + @property + def connector(self) -> Any | None: + return self._omni_connector + + # ------------------------------------------------------------------ # + # Background I/O threads + # ------------------------------------------------------------------ # + + def _recv_loop(self) -> None: + """Background thread: poll connector for incoming data.""" + _recv_poll_count = 0 + while not self._stop_event.is_set(): + with self._lock: + pending_ids = list(self._pending_load_reqs.keys()) + + if not pending_ids: + self._work_available.wait(timeout=0.01) + self._work_available.clear() + continue + + _recv_poll_count += 1 + if _recv_poll_count % 5000 == 1: + logger.info( + "[Stage-%s] _recv_loop: polling %s pending reqs: %s (poll#%s)", + self._stage_id, + len(pending_ids), + pending_ids[:5], + _recv_poll_count, + ) + + made_progress = False + for req_id in pending_ids: + if self._stop_event.is_set(): + break + try: + made_progress = self._poll_single_request(req_id) or made_progress + except Exception: + 
logger.warning("Error receiving data for %s", req_id, exc_info=True) + + if not made_progress and not self._stop_event.is_set(): + self._work_available.wait(timeout=0.001) + self._work_available.clear() + + _MAX_SEND_RETRIES = 3 + + def _save_loop(self) -> None: + """Background thread: send outgoing data via connector.""" + while not self._stop_event.is_set(): + task = None + with self._lock: + for req_id in list(self._pending_save_reqs.keys()): + dq = self._pending_save_reqs[req_id] + if dq: + task = dq.popleft() + if not dq: + del self._pending_save_reqs[req_id] + break + del self._pending_save_reqs[req_id] + + if task is not None: + success = False + try: + success = self._send_single_request(task) + except Exception: + logger.error( + "Error saving data for %s", + task.get("request_id"), + exc_info=True, + ) + if not success: + self._requeue_or_drop_failed_send(task) + continue + + self._work_available.wait(timeout=0.01) + self._work_available.clear() + + def _requeue_or_drop_failed_send(self, task: dict) -> None: + """Re-enqueue a failed send task or drop it after max retries.""" + retry_count = task.get("_retry_count", 0) + 1 + req_id = task.get("request_id") + if retry_count <= self._MAX_SEND_RETRIES: + task["_retry_count"] = retry_count + logger.warning( + "[Stage-%s] Re-enqueuing failed send for %s (retry %d/%d)", + getattr(self, "_stage_id", "?"), + req_id, + retry_count, + self._MAX_SEND_RETRIES, + ) + with self._lock: + dq = self._pending_save_reqs.setdefault(req_id, deque()) + dq.appendleft(task) + else: + logger.error( + "[Stage-%s] Giving up on send for %s after %d retries", + getattr(self, "_stage_id", "?"), + req_id, + self._MAX_SEND_RETRIES, + ) + self._decrement_pending_save_count(req_id) + + # ------------------------------------------------------------------ # + # Chunk-level poll / send (ported from OmniChunkTransferAdapter) + # ------------------------------------------------------------------ # + + def _poll_single_request(self, req_id: str) -> bool: + """Poll connector for one chunk of a request (non-blocking).""" + connector = self._omni_connector + if connector is None: + return False + + if self._async_chunk and self._model_mode != "ar": + with self._lock: + staged_payload = self._local_stage_payload_cache.get(req_id) + metadata_in_flight = req_id in self._local_request_metadata + scheduler_wakeup_pending = req_id in self._finished_load_reqs + if self._payload_is_consumable(staged_payload) or metadata_in_flight or scheduler_wakeup_pending: + logger.debug( + "[Stage-%s] delaying recv for req=%s until staged async payload is handed to scheduler", + self._stage_id, + req_id, + ) + return False + + target_stage_id = self._stage_id - 1 + chunk_id = self._get_req_chunk[req_id] + external_req_id = self._request_ids_mapping.get(req_id, req_id) + connector_get_key = f"{external_req_id}_{target_stage_id}_{chunk_id}" + + if self._async_chunk: + result = self._recv_async_chunk_result( + connector, + str(target_stage_id), + str(self._stage_id), + connector_get_key, + ) + else: + result = self._recv_full_payload_result( + connector, + str(target_stage_id), + str(self._stage_id), + connector_get_key, + ) + + if result is None: + return False + + payload_data, _size = result + if not payload_data: + return False + if isinstance(payload_data, dict): + logger.info( + "[Stage-%s] recv_chunk_result: req=%s ext=%s key=%s keys=%s finished=%s", + self._stage_id, + req_id, + external_req_id, + connector_get_key, + sorted(payload_data.keys()), + bool(payload_data.get("finished")) if 
"finished" in payload_data else None, + ) + + self._get_req_chunk[req_id] += 1 + + if self._async_chunk: + is_finished = bool(payload_data.get("finished")) + incoming_payload_consumable = self._payload_is_consumable(payload_data) + + if self._model_mode == "ar": + payload_data = self._accumulate_payload(external_req_id, payload_data) + payload_consumable = incoming_payload_consumable + else: + new_ids = payload_data.get("code_predictor_codes", []) + if not new_ids and not is_finished: + return False + payload_consumable = self._payload_is_consumable(payload_data) + + with self._lock: + if is_finished: + self._chunk_finished_req_ids.add(req_id) + self._chunk_stream_completed.add(req_id) + # Local cache (RFC §2.4) — merge, don't replace, so that + # earlier chunk keys (e.g. thinker_prefill_embeddings from + # chunk 0) are not overwritten by later chunks. + existing = self._local_stage_payload_cache.get(req_id) + if existing is not None and isinstance(existing, dict) and isinstance(payload_data, dict): + existing.update(payload_data) + else: + self._local_stage_payload_cache[req_id] = payload_data + staged_payload = self._local_stage_payload_cache[req_id] + self._async_chunk_updated_req_ids.add(req_id) + self.put_local_request_metadata(req_id, self._extract_scheduling_metadata(staged_payload)) + # A finish-only sentinel still needs one terminal wake-up so + # the downstream stage can sync the merged local payload and + # flush/finish even when the last recv carries no new + # consumable chunk bytes. + if payload_consumable or is_finished: + self._finished_load_reqs.add(req_id) + if is_finished and not payload_consumable: + logger.debug( + "[Stage-%s] finish sentinel arrived for req=%s without new consumable payload", + self._stage_id, + req_id, + ) + elif not payload_consumable: + logger.debug( + "[Stage-%s] req=%s received metadata-only / non-consumable async payload; delaying wake-up", + self._stage_id, + req_id, + ) + if is_finished: + self._pending_load_reqs.pop(req_id, None) + else: + # full_payload_mode: the complete payload arrives in a single get(), + # so always unregister immediately. + if isinstance(payload_data, dict): + engine_inputs = payload_data.get("engine_inputs", payload_data) + else: + engine_inputs = payload_data + with self._lock: + self._local_stage_payload_cache[req_id] = self._snapshot_payload(engine_inputs) + # Publish full-payload readiness only after the aligned TP broadcast + # path in recv_full_payload_inputs() has materialized the payload on all + # local ranks. Publishing metadata / stage_recv from the background recv + # thread can let the scheduler observe a request before the payload is + # actually visible to the model thread. 
+ self._full_payload_pending_broadcast_req_ids.add(req_id) + self._pending_load_reqs.pop(req_id, None) + logger.info( + "[Stage-%s] full_payload recv complete: req=%s key=%s payload_type=%s", + self._stage_id, + req_id, + connector_get_key, + type(engine_inputs).__name__, + ) + + logger.debug("[Stage-%s] Received data for key %s", self._stage_id, connector_get_key) + return True + + def _build_custom_process_payload( + self, + request_id: str | None, + request: Any | None, + pooling_output: Any | None, + ) -> Any | None: + """Run the custom process hook with a best-effort finished kwarg.""" + if self._custom_process_func is None: + return None + + kwargs = { + "transfer_manager": self, + "pooling_output": pooling_output, + "request": request, + } + supports_is_finished = getattr( + self, + "_custom_process_supports_is_finished", + self._custom_process_supports_is_finished_kwarg(), + ) + is_finished_fn = getattr(request, "is_finished", None) + if callable(is_finished_fn): + try: + if supports_is_finished is not False: + kwargs["is_finished"] = bool(is_finished_fn()) + except Exception: + logger.debug("request.is_finished() failed for %s", request_id, exc_info=True) + + try: + return self._custom_process_func(**kwargs) + except TypeError as exc: + if "is_finished" not in kwargs or not self._is_unexpected_is_finished_kwarg_error(exc): + logger.exception("custom_process_stage_input_func failed for chunk %s", request_id) + return None + kwargs.pop("is_finished", None) + try: + return self._custom_process_func(**kwargs) + except Exception: + logger.exception("custom_process_stage_input_func failed for chunk %s", request_id) + return None + except Exception: + logger.exception("custom_process_stage_input_func failed for chunk %s", request_id) + return None + + def _custom_process_supports_is_finished_kwarg(self) -> bool | None: + """Return whether the custom process hook accepts `is_finished`.""" + if self._custom_process_func is None: + return None + try: + signature = inspect.signature(self._custom_process_func) + except (TypeError, ValueError): + return None + + for param in signature.parameters.values(): + if param.kind == inspect.Parameter.VAR_KEYWORD: + return True + + is_finished_param = signature.parameters.get("is_finished") + if is_finished_param is None: + return False + return is_finished_param.kind in ( + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + ) + + @staticmethod + def _is_unexpected_is_finished_kwarg_error(exc: TypeError) -> bool: + message = str(exc) + return ( + "unexpected keyword argument 'is_finished'" in message + or 'unexpected keyword argument "is_finished"' in message + or "positional-only arguments passed as keyword arguments: 'is_finished'" in message + ) + + def _send_single_request(self, task: dict) -> bool: + """Send one queued task via connector.put(). + + Returns True on success. On failure (put() raises or returns + ``success=False``), returns False **without** decrementing + ``_pending_save_counts`` so the caller can retry or clean up. 
+ """ + connector = self._omni_connector + if connector is None: + return True + + request_id = task.get("request_id") + payload_data = task.get("data") + if payload_data is None and task.get("request") is not None: + payload_data = self._build_custom_process_payload( + request_id=request_id, + request=task.get("request"), + pooling_output=task.get("pooling_output"), + ) + put_key = task.get("put_key") + + success, _size, _metadata = connector.put( + from_stage=str(task["stage_id"]), + to_stage=str(task["next_stage_id"]), + put_key=put_key, + data=payload_data, + ) + logger.info( + "[Stage-%s] _send_single_request: put_key=%s success=%s size=%s", + task["stage_id"], + put_key, + success, + _size, + ) + + if not success: + return False + + self._decrement_pending_save_count(request_id) + return True + + def _decrement_pending_save_count(self, request_id: str) -> None: + """Decrement pending save count and run deferred cleanup if zero.""" + cleanup_req_id = None + with self._lock: + remaining = self._pending_save_counts.get(request_id, 0) + if remaining > 1: + self._pending_save_counts[request_id] = remaining - 1 + elif remaining == 1: + self._pending_save_counts.pop(request_id, None) + if request_id in self._deferred_send_cleanup: + self._deferred_send_cleanup.remove(request_id) + cleanup_req_id = request_id + if cleanup_req_id is not None: + self._put_req_chunk.pop(cleanup_req_id, None) + self._send_side_request_payload.pop(cleanup_req_id, None) + self._code_prompt_token_ids.pop(cleanup_req_id, None) + + # ------------------------------------------------------------------ # + # Payload accumulation (ported from OmniChunkTransferAdapter) + # ------------------------------------------------------------------ # + + def _accumulate_payload(self, req_id: str, payload_data: dict[str, Any]) -> dict[str, Any]: + """Accumulate chunk payloads (concat tensors, extend lists). + + Returns a **shallow copy** of the accumulated state so callers + (e.g. ``_poll_single_request``) can store it in + ``_local_stage_payload_cache`` without aliasing the authoritative + ``_send_side_request_payload`` dict. 
+ """ + if req_id not in self._send_side_request_payload: + self._send_side_request_payload[req_id] = dict(payload_data) + return dict(self._send_side_request_payload[req_id]) + + origin = self._send_side_request_payload[req_id] + merged = dict(origin) + override_keys = payload_data.get("override_keys", ()) + drop_decode_span = False + decode_span_handled = False + for key, value in payload_data.items(): + if key == "finished": + merged[key] = value + continue + if key == THINKER_DECODE_EMBEDDINGS_KEY: + merged_span = merge_tensor_spans( + get_tensor_span( + origin, + tensor_key=THINKER_DECODE_EMBEDDINGS_KEY, + start_key=THINKER_DECODE_TOKEN_START_KEY, + end_key=THINKER_DECODE_TOKEN_END_KEY, + ), + get_tensor_span( + payload_data, + tensor_key=THINKER_DECODE_EMBEDDINGS_KEY, + start_key=THINKER_DECODE_TOKEN_START_KEY, + end_key=THINKER_DECODE_TOKEN_END_KEY, + ), + ) + if merged_span is not None: + merged[key], merged[THINKER_DECODE_TOKEN_START_KEY], merged[THINKER_DECODE_TOKEN_END_KEY] = ( + merged_span + ) + decode_span_handled = True + continue + if isinstance(value, torch.Tensor) and key in origin: + if ( + THINKER_DECODE_TOKEN_START_KEY in origin + or THINKER_DECODE_TOKEN_END_KEY in origin + or THINKER_DECODE_TOKEN_START_KEY in payload_data + or THINKER_DECODE_TOKEN_END_KEY in payload_data + ): + logger.warning( + "[Stage-%s] req=%s falling back to legacy thinker decode " + "merge due to missing/invalid/non-contiguous span " + "metadata", + self._stage_id, + req_id, + ) + drop_decode_span = True + merged[key] = torch.cat([origin[key], value], dim=0) + continue + merged[key] = value + continue + if key in {THINKER_DECODE_TOKEN_START_KEY, THINKER_DECODE_TOKEN_END_KEY}: + if decode_span_handled or drop_decode_span: + continue + merged[key] = value + continue + if key in override_keys: + merged[key] = value + continue + if isinstance(value, torch.Tensor) and key in origin: + merged[key] = torch.cat([origin[key], value], dim=0) + elif isinstance(value, list) and key in origin: + merged[key] = origin[key] + value + else: + merged[key] = value + + if drop_decode_span: + merged.pop(THINKER_DECODE_TOKEN_START_KEY, None) + merged.pop(THINKER_DECODE_TOKEN_END_KEY, None) + self._send_side_request_payload[req_id] = merged + return dict(merged) + + def drop_inactive_request_runtime_state(self, req_id: str) -> None: + """Clear inactive request state used by both the runner and mixin. + + This centralizes the model-runner-side cleanup pattern so + ``OmniGPUModelRunner`` can reuse it instead of open-coding the same + inactive-request state mutations. 
+ """ + if hasattr(self, "model_intermediate_buffer"): + self.model_intermediate_buffer.pop(req_id, None) + self.drop_inactive_request_delivery_state(req_id) + + # ------------------------------------------------------------------ # + # Helpers + # ------------------------------------------------------------------ # + + @staticmethod + def _freeze_request_attr(value: Any) -> Any: + if isinstance(value, list): + return list(value) + if isinstance(value, tuple): + return list(value) + if isinstance(value, torch.Tensor): + return value.clone() + raw_list = getattr(value, "_x", None) + if raw_list is not None: + return list(raw_list) + return value + + def _snapshot_request_for_send(self, request: Any, external_req_id: str) -> Any: + finished = bool(getattr(request, "is_finished", lambda: False)()) + attrs: dict[str, Any] = {} + try: + attrs.update(vars(request)) + except TypeError: + pass + + for name in ( + "request_id", + "req_id", + "external_req_id", + "prompt_token_ids", + "output_token_ids", + "all_token_ids", + "additional_information", + "sampling_params", + "multi_modal_data", + "mm_hashes", + ): + if hasattr(request, name): + attrs[name] = self._freeze_request_attr(getattr(request, name)) + + attrs["external_req_id"] = external_req_id + attrs["_frozen_is_finished"] = finished + snapshot = SimpleNamespace(**attrs) + snapshot.is_finished = lambda: finished + return snapshot + + @staticmethod + def _create_connector(model_config: Any) -> OmniConnectorBase | None: + """Create a connector from model_config, or None if unconfigured.""" + connector_config = getattr(model_config, "stage_connector_config", None) + if connector_config is None: + return None + + if not isinstance(connector_config, dict): + connector_config = { + "name": getattr(connector_config, "name", None), + "extra": getattr(connector_config, "extra", None), + } + + name = connector_config.get("name") + if not isinstance(name, str) or not name.strip(): + raise RuntimeError("Invalid stage connector config: missing connector name") + name = name.strip() + + extra = connector_config.get("extra") + if extra is None: + extra = {} + elif not isinstance(extra, dict): + raise RuntimeError(f"Invalid extra config for connector {name}: expected dict, got {type(extra).__name__}") + + spec = ConnectorSpec(name=name, extra=extra) + try: + return OmniConnectorFactory.create_connector(spec) + except Exception as exc: + raise RuntimeError(f"Failed to create connector {name}") from exc + + @staticmethod + def _load_custom_func(model_config: Any) -> tuple[str | None, Any | None]: + """Load the connector payload builder for the downstream stage. + + Preferred source is ``custom_process_next_stage_input_func``. Some + full_payload_mode configs (async_chunk=false) only expose the next-stage prompt builder via + ``custom_process_input_func`` (for example ``thinker2talker``), while the + connector payload builder lives beside it as ``thinker2talker_full_payload``. + In that case, derive the full_payload_mode builder path automatically. 
+ """ + candidates: list[str] = [] + + next_stage_func = getattr(model_config, "custom_process_next_stage_input_func", None) + if isinstance(next_stage_func, str) and next_stage_func: + candidates.append(next_stage_func) + + if not getattr(model_config, "async_chunk", False): + input_func = getattr(model_config, "custom_process_input_func", None) + if isinstance(input_func, str) and input_func: + try: + module_path, func_name = input_func.rsplit(".", 1) + if func_name.endswith("_full_payload") or func_name.endswith("_batch"): + candidates.append(f"{module_path}.{func_name}") + else: + candidates.append(f"{module_path}.{func_name}_full_payload") + candidates.append(f"{module_path}.{func_name}_batch") + candidates.append(input_func) + except ValueError: + candidates.append(input_func) + + tried: set[str] = set() + for func_path in candidates: + if func_path in tried: + continue + tried.add(func_path) + try: + module_path, func_name = func_path.rsplit(".", 1) + module = importlib.import_module(module_path) + func = getattr(module, func_name, None) + if callable(func): + if not OmniConnectorModelRunnerMixin._is_connector_payload_builder(func): + logger.debug( + "Skipping incompatible connector payload hook %s; signature=%s", + func_path, + inspect.signature(func), + ) + continue + return func_path, func + except Exception: + logger.warning("Failed to load custom func: %s", func_path, exc_info=True) + + return None, None + + @staticmethod + def _is_connector_payload_builder(func: Any) -> bool: + """Whether *func* matches the mixin payload-builder contract.""" + try: + signature = inspect.signature(func) + except (TypeError, ValueError): + return False + + params = signature.parameters + if any(param.kind == inspect.Parameter.VAR_KEYWORD for param in params.values()): + return True + + required = {"transfer_manager", "pooling_output", "request"} + supported = { + name + for name, param in params.items() + if param.kind + in ( + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + ) + } + return required.issubset(supported) + + def _resolve_external_req_id(self, request: Any, fallback_req_id: str) -> str: + """Resolve the external request ID consistently. + + Checks ``_request_ids_mapping`` first (populated by + ``register_chunk_recv``), then falls back to the request's + ``external_req_id`` attribute, and finally to the given + ``fallback_req_id``. + """ + mapped = self._request_ids_mapping.get(fallback_req_id) + if mapped is not None: + return mapped + if request is not None: + return getattr(request, "external_req_id", fallback_req_id) + return fallback_req_id + + def _resolve_next_stage_id(self, model_config: Any) -> int: + """Determine the downstream stage ID from connector config. + + Falls back to ``stage_id + 1`` when the config does not specify + a ``to_stage`` explicitly. + """ + connector_config = getattr(model_config, "stage_connector_config", None) + if connector_config is not None: + if isinstance(connector_config, dict): + to_stage = connector_config.get("to_stage") + else: + to_stage = getattr(connector_config, "to_stage", None) + if isinstance(to_stage, int): + return to_stage + if isinstance(to_stage, str) and to_stage.strip(): + return int(to_stage) + return self._stage_id + 1 + + @staticmethod + def _parse_rank_mapping(model_config: Any) -> dict[str, int]: + """Parse rank_mapping from connector config (optional). + + Returns ``{"from_tp": int, "to_tp": int, "local_rank": int}``. + When ``rank_mapping`` is absent, assumes 1:1 homogeneous mapping. 
+ """ + connector_config = getattr(model_config, "stage_connector_config", None) + if connector_config is not None and not isinstance(connector_config, dict): + connector_config = getattr(connector_config, "__dict__", {}) + + rank_mapping: dict = {} + if isinstance(connector_config, dict): + rank_mapping = connector_config.get("rank_mapping", {}) + + from_tp = int(rank_mapping.get("from_tp", 1)) + to_tp = int(rank_mapping.get("to_tp", 1)) + + local_rank = 0 + try: + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + except (ValueError, TypeError): + pass + + return {"from_tp": from_tp, "to_tp": to_tp, "local_rank": local_rank} + + # ------------------------------------------------------------------ # + # Heterogeneous TP rank support + # ------------------------------------------------------------------ # + + def _validate_kv_tp_topology(self) -> None: + """Reject heterogeneous TP mappings that cannot be routed losslessly.""" + if self._from_tp <= 0 or self._to_tp <= 0: + raise ValueError(f"Invalid KV TP mapping: from_tp={self._from_tp}, to_tp={self._to_tp}") + larger = max(self._from_tp, self._to_tp) + smaller = min(self._from_tp, self._to_tp) + if larger % smaller != 0: + raise ValueError( + f"KV TP mapping must be divisible for rank-aware routing: from_tp={self._from_tp}, to_tp={self._to_tp}" + ) + + def get_kv_remote_ranks(self) -> list[int]: + """Determine which remote ranks this local rank exchanges KV with. + + Follows vLLM's ``TpKVTopology.get_target_remote_ranks()`` pattern: + - ``from_tp > to_tp``: each to-rank reads from multiple from-ranks + - ``from_tp < to_tp``: multiple to-ranks read from the same from-rank + - ``from_tp == to_tp``: 1:1 mapping + """ + self._validate_kv_tp_topology() + if self._from_tp == self._to_tp: + return [self._local_rank] + + if self._from_tp > self._to_tp: + tp_ratio = self._from_tp // self._to_tp + return [self._local_rank * tp_ratio + i for i in range(tp_ratio)] + else: + tp_ratio = self._to_tp // self._from_tp + return [self._local_rank // tp_ratio] + + def is_data_transfer_rank(self) -> bool: + """Whether this rank should participate in data (non-KV) transfer. + + Ordinary stage payloads are TP-identical, so exactly one TP rank + should talk to the connector. When TP is initialized, use TP rank 0 + so the connector leader matches TP-local broadcast source rank. + Otherwise fall back to LOCAL_RANK==0 for the single-rank case. 
+ """ + tp_group = self._get_local_tp_group() + if tp_group is not None and getattr(tp_group, "world_size", 1) > 1: + return getattr(tp_group, "rank_in_group", 0) == 0 + return self._local_rank == 0 + + def get_kv_connector_key( + self, + req_id: str, + from_stage: int, + chunk_id: int, + from_rank: int, + to_rank: int, + ) -> str: + """Build connector key that includes rank info for KV transfers.""" + return f"{req_id}_{from_stage}_{chunk_id}_{from_rank}_{to_rank}" diff --git a/vllm_omni/worker/payload_span.py b/vllm_omni/worker/payload_span.py new file mode 100644 index 0000000000..994392343a --- /dev/null +++ b/vllm_omni/worker/payload_span.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Helpers for explicit thinker decode span metadata.""" + +from collections.abc import Mapping +from typing import Any + +import torch + +THINKER_DECODE_EMBEDDINGS_KEY = "thinker_decode_embeddings" +THINKER_OUTPUT_TOKEN_IDS_KEY = "thinker_output_token_ids" +THINKER_DECODE_TOKEN_START_KEY = "thinker_decode_embeddings_token_start" +THINKER_DECODE_TOKEN_END_KEY = "thinker_decode_embeddings_token_end" + +CACHED_THINKER_DECODE_EMBEDDINGS_KEY = "cached_thinker_decode_embeddings" +CACHED_THINKER_DECODE_TOKEN_START_KEY = "cached_thinker_decode_embeddings_token_start" +CACHED_THINKER_DECODE_TOKEN_END_KEY = "cached_thinker_decode_embeddings_token_end" + +TensorSpan = tuple[torch.Tensor, int, int] + + +def get_tensor_span(payload: Mapping[str, Any], *, tensor_key: str, start_key: str, end_key: str) -> TensorSpan | None: + tensor = payload.get(tensor_key) + start = payload.get(start_key) + end = payload.get(end_key) + if not isinstance(tensor, torch.Tensor): + return None + if not isinstance(start, int) or not isinstance(end, int): + return None + if start < 0 or end < start or (end - start) != int(tensor.shape[0]): + return None + return tensor, start, end + + +def merge_tensor_spans(existing_span: TensorSpan | None, incoming_span: TensorSpan | None) -> TensorSpan | None: + if existing_span is None or incoming_span is None: + return None + + existing_tensor, existing_start, existing_end = existing_span + incoming_tensor, incoming_start, incoming_end = incoming_span + if incoming_tensor.device != existing_tensor.device or incoming_tensor.dtype != existing_tensor.dtype: + incoming_tensor = incoming_tensor.to(device=existing_tensor.device, dtype=existing_tensor.dtype) + if incoming_start == existing_end: + return torch.cat([existing_tensor, incoming_tensor], dim=0), existing_start, incoming_end + if incoming_start < existing_end: + overlap = existing_end - incoming_start + if overlap >= int(incoming_tensor.shape[0]): + return existing_tensor, existing_start, existing_end + trimmed_tensor = incoming_tensor[overlap:] + return ( + torch.cat([existing_tensor, trimmed_tensor], dim=0), + existing_start, + existing_end + int(trimmed_tensor.shape[0]), + ) + return None + + +def get_tensor_span_row(span: TensorSpan | None, index: int) -> torch.Tensor | None: + if span is None: + return None + tensor, start, end = span + if index < start or index >= end: + return None + return tensor[index - start] From cd2761e15c8e49ea7c53cd551f820318155b4988 Mon Sep 17 00:00:00 2001 From: JohnJan Date: Mon, 13 Apr 2026 17:51:48 +0800 Subject: [PATCH 145/204] [Feature]: support Flux.2-dev tea_cache (#1871) Co-authored-by: wuzhongjian --- docs/user_guide/diffusion_features.md | 2 +- .../cache/test_teacache_extractors.py | 105 ++++++++++++- 
.../cache/teacache/coefficient_estimator.py | 27 ++++ vllm_omni/diffusion/cache/teacache/config.py | 9 ++ .../diffusion/cache/teacache/extractors.py | 140 ++++++++++++++++++ 5 files changed, 281 insertions(+), 2 deletions(-) diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 2f28131ee5..ac140ff84a 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -110,7 +110,7 @@ The following tables show which models support each feature: | **FLUX.1-dev** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | | **FLUX.2-klein** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | | **FLUX.1-Kontext-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **FLUX.2-dev** | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | +| **FLUX.2-dev** | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **GLM-Image** | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **HunyuanImage3** | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | | **LongCat-Image** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | diff --git a/tests/diffusion/cache/test_teacache_extractors.py b/tests/diffusion/cache/test_teacache_extractors.py index a52e11b3d4..c22a60e227 100644 --- a/tests/diffusion/cache/test_teacache_extractors.py +++ b/tests/diffusion/cache/test_teacache_extractors.py @@ -22,7 +22,7 @@ import torch from tests.utils import hardware_test -from vllm_omni.diffusion.cache.teacache.extractors import extract_flux2_klein_context +from vllm_omni.diffusion.cache.teacache.extractors import extract_flux2_context, extract_flux2_klein_context from vllm_omni.diffusion.models.flux2_klein.flux2_klein_transformer import ( Flux2Transformer2DModel, ) @@ -174,3 +174,106 @@ def test_invalid_module_raises_error(self): img_ids=torch.randint(0, 64, (1, 1024, 4)), txt_ids=torch.randint(0, 64, (1, 512, 4)), ) + + +class TestFlux2Extractor(BaseExtractorTest): + """Test extract_flux2_context function.""" + + def get_extractor(self): + return extract_flux2_context + + @pytest.fixture + def flux2_module(self): + """Create a minimal Flux2Transformer2DModel for testing.""" + from vllm_omni.diffusion.models.flux2.flux2_transformer import Flux2Transformer2DModel + + model = Flux2Transformer2DModel( + num_layers=2, + num_single_layers=2, + num_attention_heads=48, + attention_head_dim=128, + joint_attention_dim=15360, + ) + return model + + def get_module(self, flux2_module): + return flux2_module + + @pytest.fixture + def sample_inputs(self): + """Create sample input tensors for Flux2. + + Note: hidden_states uses in_channels=128 (default for Flux2), + not inner_dim=6144. The x_embedder projects from 128 -> 6144. + encoder_hidden_states uses joint_attention_dim=15360 (model default), + which then gets projected to inner_dim=6144 by context_embedder. + """ + batch_size = 1 + img_seq_len = 1024 + txt_seq_len = 512 + in_channels = 128 # Model default in_channels + txt_dim = 15360 # Model default joint_attention_dim + + return { + "hidden_states": torch.randn(batch_size, img_seq_len, in_channels), + "encoder_hidden_states": torch.randn(batch_size, txt_seq_len, txt_dim), + "timestep": torch.tensor([500]), + "img_ids": torch.randint(0, 64, (batch_size, img_seq_len, 4)), + "txt_ids": torch.randint(0, 64, (batch_size, txt_seq_len, 4)), + "guidance": torch.tensor([3.5]), + } + + def get_sample_inputs(self, sample_inputs): + return sample_inputs + + @hardware_test(res={"cuda": "L4"}, num_cards=1) + def test_modulated_input_shape(self, flux2_module, sample_inputs): + """Test that modulated_input has correct shape matching the model's inner_dim. 
+
+        Note: After x_embedder projection, hidden_states are projected from
+        in_channels (128) to inner_dim (6144), so modulated_input should match
+        the projected shape, not the input shape.
+        """
+        context = extract_flux2_context(flux2_module, **sample_inputs)
+
+        batch_size, img_seq_len, _ = sample_inputs["hidden_states"].shape
+        inner_dim = flux2_module.inner_dim
+        assert context.modulated_input.shape == (batch_size, img_seq_len, inner_dim)
+
+    @hardware_test(res={"cuda": "L4"}, num_cards=1)
+    def test_run_transformer_blocks_callable(self, flux2_module, sample_inputs):
+        """Test that run_transformer_blocks is callable."""
+        context = extract_flux2_context(flux2_module, **sample_inputs)
+        assert callable(context.run_transformer_blocks)
+
+    @hardware_test(res={"cuda": "L4"}, num_cards=1)
+    def test_postprocess_callable(self, flux2_module, sample_inputs):
+        """Test that postprocess is callable."""
+        context = extract_flux2_context(flux2_module, **sample_inputs)
+        assert callable(context.postprocess)
+
+    def test_without_guidance(self, flux2_module, sample_inputs):
+        """Test context extraction works without guidance (no CFG)."""
+        inputs = sample_inputs.copy()
+        inputs["guidance"] = None
+
+        context = extract_flux2_context(flux2_module, **inputs)
+
+        assert context is not None
+        assert context.temb is not None
+
+    @pytest.mark.cpu
+    def test_invalid_module_raises_error(self):
+        """Test that invalid module without transformer_blocks raises ValueError."""
+        invalid_module = Mock()
+        invalid_module.transformer_blocks = []
+
+        with pytest.raises(ValueError, match="Module must have transformer_blocks"):
+            extract_flux2_context(
+                invalid_module,
+                hidden_states=torch.randn(1, 1024, 6144),
+                encoder_hidden_states=torch.randn(1, 512, 15360),
+                timestep=torch.tensor([500]),
+                img_ids=torch.randint(0, 64, (1, 1024, 4)),
+                txt_ids=torch.randint(0, 64, (1, 512, 4)),
+            )
diff --git a/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py b/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py
index 5dd80718d1..baec21c276 100644
--- a/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py
+++ b/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py
@@ -13,6 +13,7 @@
 from vllm_omni.diffusion.hooks import HookRegistry, ModelHook
 from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader
 from vllm_omni.diffusion.models.bagel.pipeline_bagel import BagelPipeline
+from vllm_omni.diffusion.models.flux2.pipeline_flux2 import Flux2Pipeline
 from vllm_omni.diffusion.models.stable_audio.pipeline_stable_audio import StableAudioPipeline
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.inputs.data import OmniDiffusionSamplingParams
@@ -103,6 +104,31 @@ def install_hook(transformer: Any, hook: DataCollectionHook) -> None:
         registry.register_hook(hook._HOOK_NAME, hook)
 
 
+class Flux2Adapter:
+    """Adapter for Flux2 model coefficient estimation."""
+
+    @staticmethod
+    def load_pipeline(model_path: str, device: str = "cuda", dtype: torch.dtype = torch.bfloat16) -> Flux2Pipeline:
+        """Load Flux2 pipeline for coefficient estimation."""
+        od_config = OmniDiffusionConfig.from_kwargs(model=model_path, dtype=dtype)
+        od_config.model_class_name = "Flux2Pipeline"
+
+        pipeline = Flux2Pipeline(od_config=od_config)
+        loader = DiffusersPipelineLoader(LoadConfig())
+        loader.load_weights(pipeline)
+        pipeline.to(device)
+        return pipeline
+
+    @staticmethod
+    def get_transformer(pipeline: Any) -> tuple[Any, str]:
+        return pipeline.transformer, 
pipeline.transformer.__class__.__name__ + + @staticmethod + def install_hook(transformer: Any, hook: DataCollectionHook) -> None: + registry = HookRegistry.get_or_create(transformer) + registry.register_hook(hook._HOOK_NAME, hook) + + class DefaultAdapter: """Default adapter for standard diffusers pipelines.""" @@ -123,6 +149,7 @@ def install_hook(transformer: Any, hook: DataCollectionHook) -> None: _MODEL_ADAPTERS: dict[str, type] = { "Bagel": BagelAdapter, "StableAudio": StableAudioAdapter, + "Flux2": Flux2Adapter, } _EPSILON = 1e-6 diff --git a/vllm_omni/diffusion/cache/teacache/config.py b/vllm_omni/diffusion/cache/teacache/config.py index 96cf3f03ee..ecf3bfc1d3 100644 --- a/vllm_omni/diffusion/cache/teacache/config.py +++ b/vllm_omni/diffusion/cache/teacache/config.py @@ -64,6 +64,15 @@ -1.04182570e01, 6.78098549e-01, ], + # Flux2 transformer coefficients + # Copied from Qwen-Image, need to be tuned specifically for Flux2 in future + "Flux2Transformer2DModel": [ + -4.50000000e02, + 2.80000000e02, + -4.50000000e01, + 3.20000000e00, + -2.00000000e-02, + ], } diff --git a/vllm_omni/diffusion/cache/teacache/extractors.py b/vllm_omni/diffusion/cache/teacache/extractors.py index bdb3f6a786..3d247e3187 100644 --- a/vllm_omni/diffusion/cache/teacache/extractors.py +++ b/vllm_omni/diffusion/cache/teacache/extractors.py @@ -21,6 +21,7 @@ import torch.nn as nn from vllm_omni.diffusion.forward_context import get_forward_context +from vllm_omni.platforms import current_omni_platform @dataclass @@ -827,6 +828,144 @@ def postprocess(h: torch.Tensor) -> Any: ) +def extract_flux2_context( + module: nn.Module, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor = None, + timestep: torch.LongTensor = None, + img_ids: torch.Tensor = None, + txt_ids: torch.Tensor = None, + guidance: torch.Tensor | None = None, + joint_attention_kwargs: dict[str, Any] | None = None, + return_dict: bool = True, + **kwargs: Any, +) -> CacheContext: + """ + Extract cache context for Flux2Transformer2DModel. + + This is the ONLY Flux2-specific code needed for TeaCache support. + It encapsulates preprocessing, modulated input extraction, transformer execution, + and postprocessing logic. 
+ + Args: + module: Flux2Transformer2DModel instance + hidden_states: Input hidden states tensor + encoder_hidden_states: Text encoder outputs + timestep: Current diffusion timestep + img_ids: Image inputs for position embedding + txt_ids: Text inputs for position embedding + guidance: Optional guidance scale for CFG + joint_attention_kwargs: Additional attention arguments + return_dict: Whether to return a Transformer2DModelOutput instead of a plain tensor + **kwargs: Additional keyword arguments ignored by this extractor + + Returns: + CacheContext with all information needed for generic caching + """ + + from diffusers.models.modeling_outputs import Transformer2DModelOutput + + if not hasattr(module, "transformer_blocks") or len(module.transformer_blocks) == 0: + raise ValueError("Module must have transformer_blocks") + + # ============================================================================ + # PREPROCESSING (Flux2-specific) + # ============================================================================ + num_txt_tokens = encoder_hidden_states.shape[1] + + timestep = timestep.to(hidden_states.dtype) * 1000 + if guidance is not None: + guidance = guidance.to(hidden_states.dtype) * 1000 + + temb = module.time_guidance_embed(timestep, guidance) + + double_stream_mod_img = module.double_stream_modulation_img(temb) + double_stream_mod_txt = module.double_stream_modulation_txt(temb) + single_stream_mod = module.single_stream_modulation(temb)[0] + + hidden_states = module.x_embedder(hidden_states) + encoder_hidden_states = module.context_embedder(encoder_hidden_states) + + if img_ids.ndim == 3: + img_ids = img_ids[0] + if txt_ids.ndim == 3: + txt_ids = txt_ids[0] + + if current_omni_platform.is_npu(): + freqs_cos_image, freqs_sin_image = module.pos_embed(img_ids.cpu()) + image_rotary_emb = (freqs_cos_image.npu(), freqs_sin_image.npu()) + freqs_cos_text, freqs_sin_text = module.pos_embed(txt_ids.cpu()) + text_rotary_emb = (freqs_cos_text.npu(), freqs_sin_text.npu()) + else: + image_rotary_emb = module.pos_embed(img_ids) + text_rotary_emb = module.pos_embed(txt_ids) + concat_rotary_emb = ( + torch.cat([text_rotary_emb[0], image_rotary_emb[0]], dim=0), + torch.cat([text_rotary_emb[1], image_rotary_emb[1]], dim=0), + ) + + # ============================================================================ + # EXTRACT MODULATED INPUT (for cache decision) + # ============================================================================ + block = module.transformer_blocks[0] + (shift_msa, scale_msa, gate_msa), _ = double_stream_mod_img + modulated_input = block.norm1(hidden_states) + modulated_input = (1 + scale_msa) * modulated_input + shift_msa + + # ============================================================================ + # DEFINE TRANSFORMER EXECUTION (Flux2-specific) + # ============================================================================ + def run_transformer_blocks(): + """Execute all Flux2 transformer blocks.""" + h = hidden_states + e = encoder_hidden_states + + for transformer_block in module.transformer_blocks: + e, h = transformer_block( + hidden_states=h, + encoder_hidden_states=e, + temb_mod_params_img=double_stream_mod_img, + temb_mod_params_txt=double_stream_mod_txt, + image_rotary_emb=concat_rotary_emb, + joint_attention_kwargs=joint_attention_kwargs, + ) + h = torch.cat([e, h], dim=1) + + for single_transformer_block in module.single_transformer_blocks: + h = single_transformer_block( + hidden_states=h, + encoder_hidden_states=None, + 
temb_mod_params=single_stream_mod, + image_rotary_emb=concat_rotary_emb, + joint_attention_kwargs=joint_attention_kwargs, + ) + + h = h[:, num_txt_tokens:, ...] + return (h,) + + # ============================================================================ + # DEFINE POSTPROCESSING + # ============================================================================ + def postprocess(h): + h = module.norm_out(h, temb) + output = module.proj_out(h) + if not return_dict: + return (output,) + return Transformer2DModelOutput(sample=output) + + # ============================================================================ + # RETURN CONTEXT + # ============================================================================ + return CacheContext( + modulated_input=modulated_input, + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + temb=temb, + run_transformer_blocks=run_transformer_blocks, + postprocess=postprocess, + ) + + # Registry for model-specific extractors # Key: Transformer class name # Value: extractor function with signature (module, *args, **kwargs) -> CacheContext @@ -839,6 +978,7 @@ def postprocess(h: torch.Tensor) -> Any: "ZImageTransformer2DModel": extract_zimage_context, "Flux2Klein": extract_flux2_klein_context, "StableAudioDiTModel": extract_stable_audio_context, + "Flux2Transformer2DModel": extract_flux2_context, # Future models: # "FluxTransformer2DModel": extract_flux_context, # "CogVideoXTransformer3DModel": extract_cogvideox_context, From 155583f49f9a20477ea95a0119a7abfddbf0c646 Mon Sep 17 00:00:00 2001 From: Chenguang Zheng <645327136@qq.com> Date: Mon, 13 Apr 2026 18:35:59 +0800 Subject: [PATCH 146/204] [Bugfix] Release stage launch lock before handshake (#2717) Signed-off-by: Chenguang ZHENG <645327136@qq.com> --- .../test_async_omni_engine_stage_init.py | 89 +++++++++++++++++++ vllm_omni/engine/async_omni_engine.py | 23 ++--- 2 files changed, 101 insertions(+), 11 deletions(-) diff --git a/tests/engine/test_async_omni_engine_stage_init.py b/tests/engine/test_async_omni_engine_stage_init.py index 6993f391eb..7b995fe70d 100644 --- a/tests/engine/test_async_omni_engine_stage_init.py +++ b/tests/engine/test_async_omni_engine_stage_init.py @@ -227,6 +227,95 @@ def _capture_stage_timeout(_proc, _handshake_addr, _addresses, _vllm_cfg, handsh assert captured_timeout == 302 +def test_launch_llm_stage_releases_launch_lock_before_complete_stage_handshake(monkeypatch): + """Regression test for parallel LLM stage startup during handshake wait.""" + import vllm_omni.engine.async_omni_engine as engine_mod + from vllm_omni.platforms import current_omni_platform + + engine = object.__new__(AsyncOmniEngine) + engine.log_stats = False + engine.model = "dummy-model" + engine.single_stage_mode = False + engine._omni_master_server = None + + fake_vllm_config = types.SimpleNamespace() + fake_addresses = types.SimpleNamespace() + shared_launch_lock = threading.Lock() + counter_lock = threading.Lock() + first_handshake_started = threading.Event() + second_stage_spawned = threading.Event() + allow_first_handshake_to_finish = threading.Event() + launch_errors: list[BaseException] = [] + spawn_count = 0 + + device_env_var = current_omni_platform.device_control_env_var + prev_device_env = os.environ.get(device_env_var) + os.environ[device_env_var] = "0" + + monkeypatch.setattr(engine_mod, "setup_stage_devices", lambda *_: None) + monkeypatch.setattr(engine_mod, "build_engine_args_dict", lambda *_, **__: {}) + monkeypatch.setattr(engine_mod, "build_vllm_config", lambda *_, 
**__: (fake_vllm_config, object)) + monkeypatch.setattr(engine_mod, "acquire_device_locks", lambda *_: []) + + def _spawn_stage_core(**_): + nonlocal spawn_count + with counter_lock: + spawn_count += 1 + call_idx = spawn_count + if call_idx == 2: + second_stage_spawned.set() + return fake_addresses, types.SimpleNamespace(), f"ipc://handshake-{call_idx}" + + def _complete_stage_handshake(_proc, handshake_address, _addresses, _vllm_cfg, _timeout): + if handshake_address == "ipc://handshake-1": + first_handshake_started.set() + assert second_stage_spawned.wait(timeout=1), ( + "second stage did not reach spawn_stage_core while first stage waited in handshake" + ) + assert allow_first_handshake_to_finish.wait(timeout=1), ( + "second stage did not enter handshake while first stage was still waiting" + ) + else: + allow_first_handshake_to_finish.set() + + monkeypatch.setattr(engine_mod, "spawn_stage_core", _spawn_stage_core) + monkeypatch.setattr(engine_mod, "complete_stage_handshake", _complete_stage_handshake) + + def _launch_stage(stage_id: int) -> None: + metadata = types.SimpleNamespace(stage_id=stage_id, runtime_cfg={"devices": str(stage_id)}) + try: + engine._launch_llm_stage( + stage_cfg=types.SimpleNamespace(engine_args={}), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=302, + llm_stage_launch_lock=shared_launch_lock, + ) + except BaseException as exc: # pragma: no cover - surfaced through assertion below + launch_errors.append(exc) + + try: + first_thread = threading.Thread(target=_launch_stage, args=(0,)) + first_thread.start() + assert first_handshake_started.wait(timeout=1), "first stage never entered handshake" + + second_thread = threading.Thread(target=_launch_stage, args=(1,)) + second_thread.start() + + first_thread.join(timeout=3) + second_thread.join(timeout=3) + finally: + if prev_device_env is None: + os.environ.pop(device_env_var, None) + else: + os.environ[device_env_var] = prev_device_env + + assert not first_thread.is_alive() + assert not second_thread.is_alive() + assert second_stage_spawned.is_set() + assert not launch_errors + + def test_attach_llm_stage_uses_omni_input_preprocessor(monkeypatch): """Regression test for GLM-Image t2i preprocessing path. diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 0a2e02d66e..9609cf6e26 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -424,23 +424,24 @@ def _launch_llm_stage( proc=proc, ) logger.info("[AsyncOmniEngine] Stage %s engine launch started", metadata.stage_id) - # Keep the stage-specific device visibility until vLLM - # finishes starting all child processes. - if self.single_stage_mode and self._omni_master_server is not None: - launch_stack.close() - else: - assert proc is not None - assert handshake_address is not None - complete_stage_handshake( - proc, handshake_address, addresses, vllm_config, stage_init_timeout - ) - logger.info("[AsyncOmniEngine] Stage %s engine startup completed", metadata.stage_id) finally: if previous_visible_devices is None: current_omni_platform.unset_device_control_env_var() else: current_omni_platform.set_device_control_env_var(previous_visible_devices) + # After StageEngineCoreProc has been spawned it carries its + # stage-specific device visibility into descendants, so the + # slow HELLO/READY handshake can run without holding the + # process-wide launch lock. 
+ if self.single_stage_mode and self._omni_master_server is not None: + launch_stack.close() + else: + assert proc is not None + assert handshake_address is not None + complete_stage_handshake(proc, handshake_address, addresses, vllm_config, stage_init_timeout) + logger.info("[AsyncOmniEngine] Stage %s engine startup completed", metadata.stage_id) + assert started_stage is not None return started_stage except Exception: From ef3f72b9ae0bee0baf45258abde55bec3ae6752d Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Mon, 13 Apr 2026 19:03:13 +0800 Subject: [PATCH 147/204] [Tests][Qwen3-Omni]Modify Qwen3-Omni performance test cases (#2600) Signed-off-by: amy-why-3459 --- tests/dfx/perf/scripts/run_benchmark.py | 2 + tests/dfx/perf/tests/test.json | 305 +++++++++++++++++------- 2 files changed, 219 insertions(+), 88 deletions(-) diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index c566c2e0a0..b64cc0d950 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -72,6 +72,8 @@ def run_benchmark( ["vllm", "bench", "serve", "--omni"] + args + [ + "--num-warmups", + "2", "--save-result", "--result-dir", os.environ.get("BENCHMARK_DIR", "tests"), diff --git a/tests/dfx/perf/tests/test.json b/tests/dfx/perf/tests/test.json index fe7e380469..159e27a064 100644 --- a/tests/dfx/perf/tests/test.json +++ b/tests/dfx/perf/tests/test.json @@ -10,83 +10,97 @@ "dataset_name": "random", "backend": "openai-chat-omni", "endpoint": "/v1/chat/completions", - "num_prompts": [ - 10, - 40, - 100 - ], - "max_concurrency": [ - 1, - 4, - 10 - ], + "num_prompts": [4, 16, 40], + "max_concurrency": [1, 4, 10], + "random_input_len": 2500, + "random_output_len": 900, + "ignore_eos": true, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [1000, 3000, 5000], + "mean_audio_ttfp_ms": [30000, 60000, 90000], + "mean_audio_rtf": [0.35, 0.45, 0.55] + } + }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [10], + "request_rate": [0.1], "random_input_len": 100, "random_output_len": 100, + "random_range_ratio": 0.0, "ignore_eos": true, + "random_mm_base_items_per_request": 1, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "audio": 1 + }, + "random_mm_bucket_config": { + "(0, 60, 3)": 1.0 + }, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_ttft_ms": [1000, 3000, 5000], - "mean_audio_ttfp_ms": [8000, 10000, 13000], - "mean_audio_rtf": [0.2, 0.25, 0.45] + "mean_ttft_ms": [2000], + "mean_audio_ttfp_ms": [10000], + "mean_audio_rtf": [0.25] } }, { "dataset_name": "random-mm", "backend": "openai-chat-omni", "endpoint": "/v1/chat/completions", - "num_prompts": [ - 10, - 40, - 100 - ], - "request_rate": [ - 0.1, - 0.3, - 0.5 - ], + "num_prompts": [40], + "request_rate": [0.3], "random_input_len": 100, "random_output_len": 100, "random_range_ratio": 0.0, "ignore_eos": true, - "random_mm_base_items_per_request": 3, - "random_mm_num_mm_items_range_ratio": 0, + "random_mm_base_items_per_request": 2, + "random_mm_num_mm_items_range_ratio": 0.5, "random_mm_limit_mm_per_prompt": { "image": 1, - "video": 1, - "audio": 1 + "video": 1 }, "random_mm_bucket_config": { - "(32, 32, 1)": 0.5, - "(0, 1, 1)": 0.1, - "(32, 32, 2)": 0.4 + "(256, 256, 1)": 0.5, + "(720, 1280, 2)": 0.5 }, "percentile-metrics": 
"ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_ttft_ms": [2000, 4000, 6000], - "mean_audio_ttfp_ms": [10000, 13000, 15000], - "mean_audio_rtf": [0.25, 0.35, 0.45] + "mean_ttft_ms": [4000], + "mean_audio_ttfp_ms": [13000], + "mean_audio_rtf": [0.35] } }, { - "dataset_name": "random", + "dataset_name": "random-mm", "backend": "openai-chat-omni", "endpoint": "/v1/chat/completions", - "num_prompts": [ - 4, - 16 - ], - "max_concurrency": [ - 1, - 4 - ], - "random_input_len": 2500, - "random_output_len": 900, + "num_prompts": [100], + "request_rate": [0.5], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, "ignore_eos": true, + "random_mm_base_items_per_request": 3, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "image": 1, + "video": 1, + "audio": 1 + }, + "random_mm_bucket_config": { + "(256, 256, 1)": 0.34, + "(720, 1280, 2)": 0.33, + "(0, 60, 3)": 0.33 + }, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_ttft_ms": [1000, 3000], - "mean_audio_ttfp_ms": [30000, 60000], - "mean_audio_rtf": [0.35, 0.45] + "mean_ttft_ms": [6000], + "mean_audio_ttfp_ms": [15000], + "mean_audio_rtf": [0.45] } } ] @@ -120,18 +134,10 @@ "dataset_name": "random", "backend": "openai-chat-omni", "endpoint": "/v1/chat/completions", - "num_prompts": [ - 10, - 40, - 100 - ], - "max_concurrency": [ - 1, - 4, - 10 - ], - "random_input_len": 100, - "random_output_len": 100, + "num_prompts": [4, 16, 40], + "max_concurrency": [1, 4, 10], + "random_input_len": 2500, + "random_output_len": 900, "ignore_eos": true, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { @@ -144,59 +150,182 @@ "dataset_name": "random-mm", "backend": "openai-chat-omni", "endpoint": "/v1/chat/completions", - "num_prompts": [ - 10, - 40, - 100 - ], - "request_rate": [ - 0.1, - 0.3, - 0.5 - ], + "num_prompts": [10], + "request_rate": [0.1], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, + "ignore_eos": true, + "random_mm_base_items_per_request": 1, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "audio": 1 + }, + "random_mm_bucket_config": { + "(0, 60, 3)": 1.0 + }, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [2000], + "mean_audio_ttfp_ms": [2000], + "mean_audio_rtf": [0.25] + } + }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [40], + "request_rate": [0.3], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, + "ignore_eos": true, + "random_mm_base_items_per_request": 2, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "image": 1, + "video": 1 + }, + "random_mm_bucket_config": { + "(256, 256, 1)": 0.5, + "(720, 1280, 2)": 0.5 + }, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [4000], + "mean_audio_ttfp_ms": [4000], + "mean_audio_rtf": [0.4] + } + }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [100], + "request_rate": [0.5], "random_input_len": 100, "random_output_len": 100, "random_range_ratio": 0.0, "ignore_eos": true, "random_mm_base_items_per_request": 3, - "random_mm_num_mm_items_range_ratio": 0, + 
"random_mm_num_mm_items_range_ratio": 0.5, "random_mm_limit_mm_per_prompt": { "image": 1, "video": 1, "audio": 1 }, "random_mm_bucket_config": { - "(32, 32, 1)": 0.5, - "(0, 1, 1)": 0.1, - "(32, 32, 2)": 0.4 + "(256, 256, 1)": 0.34, + "(720, 1280, 2)": 0.33, + "(0, 60, 3)": 0.33 }, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_ttft_ms": [2000, 4000, 6000], - "mean_audio_ttfp_ms": [2000, 4000, 6000], - "mean_audio_rtf": [0.25, 0.4, 0.7] + "mean_ttft_ms": [6000], + "mean_audio_ttfp_ms": [6000], + "mean_audio_rtf": [0.7] } }, { "dataset_name": "random", "backend": "openai-chat-omni", "endpoint": "/v1/chat/completions", - "num_prompts": [ - 4, - 16 - ], - "max_concurrency": [ - 1, - 4 - ], + "num_prompts": [4, 16, 40], + "max_concurrency": [1, 4, 10], "random_input_len": 2500, "random_output_len": 900, "ignore_eos": true, - "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "extra_body": { + "modalities": ["text"] + }, + "percentile-metrics": "ttft,tpot,itl,e2el", + "baseline": { + "mean_ttft_ms": [1000, 3000, 5000] + } + }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [10], + "request_rate": [0.1], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, + "ignore_eos": true, + "extra_body": { + "modalities": ["text"] + }, + "random_mm_base_items_per_request": 1, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "audio": 1 + }, + "random_mm_bucket_config": { + "(0, 60, 3)": 1.0 + }, + "percentile-metrics": "ttft,tpot,itl,e2el", + "baseline": { + "mean_ttft_ms": [2000] + } + }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [40], + "request_rate": [0.3], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, + "ignore_eos": true, + "extra_body": { + "modalities": ["text"] + }, + "random_mm_base_items_per_request": 2, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "image": 1, + "video": 1 + }, + "random_mm_bucket_config": { + "(256, 256, 1)": 0.5, + "(720, 1280, 2)": 0.5 + }, + "percentile-metrics": "ttft,tpot,itl,e2el", + "baseline": { + "mean_ttft_ms": [4000] + } + }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [100], + "request_rate": [0.5], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, + "ignore_eos": true, + "extra_body": { + "modalities": ["text"] + }, + "random_mm_base_items_per_request": 3, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "image": 1, + "video": 1, + "audio": 1 + }, + "random_mm_bucket_config": { + "(256, 256, 1)": 0.34, + "(720, 1280, 2)": 0.33, + "(0, 60, 3)": 0.33 + }, + "percentile-metrics": "ttft,tpot,itl,e2el", "baseline": { - "mean_ttft_ms": [1000, 3000], - "mean_audio_ttfp_ms": [1000, 3000], - "mean_audio_rtf": [0.35, 0.45] + "mean_ttft_ms": [6000] } } ] From 2c67c30550ad91e62a5919b0008caba459a09049 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Mon, 13 Apr 2026 19:15:49 +0800 Subject: [PATCH 148/204] [Bagel]: Support `think mode` in single stage deployment of Bagel (#2650) Signed-off-by: princepride --- examples/offline_inference/bagel/end2end.py | 98 ++++++++---- .../models/bagel/bagel_transformer.py | 113 +++++++++++++- 
.../diffusion/models/bagel/pipeline_bagel.py | 146 +++++++++++++++--- 3 files changed, 301 insertions(+), 56 deletions(-) diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index 472d748d1e..ed5fa57e8d 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -97,6 +97,24 @@ def parse_args(): default=False, help="Enable thinking mode: AR stage decodes ... planning tokens before image generation.", ) + parser.add_argument( + "--max-think-tokens", + type=int, + default=1000, + help="Maximum number of tokens for thinking text generation (default: 1000).", + ) + parser.add_argument( + "--do-sample", + action="store_true", + default=False, + help="Enable sampling for text generation (default: greedy).", + ) + parser.add_argument( + "--text-temperature", + type=float, + default=0.3, + help="Temperature for text generation sampling (default: 0.3).", + ) args = parser.parse_args() return args @@ -108,7 +126,6 @@ def main(): model_name = args.model prompts: list[OmniPromptType] = [] try: - # Preferred: load from txt file (one prompt per line) if getattr(args, "txt_prompts", None) and args.prompt_type == "text": with open(args.txt_prompts, encoding="utf-8") as f: lines = [ln.strip() for ln in f.readlines()] @@ -121,10 +138,8 @@ def main(): raise if not prompts: - # Default prompt for text2img test if none provided prompts = ["A cute cat"] print(f"[Info] No prompts provided, using default: {prompts}") - omni_outputs = [] from PIL import Image @@ -132,11 +147,13 @@ def main(): omni_kwargs = {} stage_configs_path = args.stage_configs_path + is_single_stage = stage_configs_path and "single_stage" in stage_configs_path if args.think and stage_configs_path is None: stage_configs_path = "vllm_omni/model_executor/stage_configs/bagel_think.yaml" print(f"[Info] Think mode enabled, using stage config: {stage_configs_path}") if stage_configs_path: omni_kwargs["stage_configs_path"] = stage_configs_path + is_single_stage = "single_stage" in stage_configs_path omni_kwargs.update( { @@ -198,40 +215,61 @@ def main(): formatted_prompts.append(prompt_dict) params_list = omni.default_sampling_params_list + + # For single-stage DiT, think/text params go into the diffusion sampling params extra_args. + # For 2-stage, diffusion params are at index 1. 
+ diffusion_params_idx = 0 if is_single_stage else (1 if len(params_list) > 1 else 0) + diffusion_params = params_list[diffusion_params_idx] + if args.modality in ("text2img", "img2img"): - if len(params_list) > 1: - diffusion_params = params_list[1] - diffusion_params.num_inference_steps = args.steps # type: ignore - diffusion_params.cfg_parallel_size = args.cfg_parallel_size # type: ignore - if args.seed is not None: - diffusion_params.seed = args.seed # type: ignore - extra = { - "cfg_text_scale": args.cfg_text_scale, - "cfg_img_scale": args.cfg_img_scale, - } - if args.cfg_interval is not None: - extra["cfg_interval"] = tuple(args.cfg_interval) - if args.cfg_renorm_type is not None: - extra["cfg_renorm_type"] = args.cfg_renorm_type - if args.cfg_renorm_min is not None: - extra["cfg_renorm_min"] = args.cfg_renorm_min - if args.negative_prompt is not None: - extra["negative_prompt"] = args.negative_prompt - diffusion_params.extra_args = extra # type: ignore + diffusion_params.num_inference_steps = args.steps # type: ignore + diffusion_params.cfg_parallel_size = args.cfg_parallel_size # type: ignore + if args.seed is not None: + diffusion_params.seed = args.seed # type: ignore + + extra = getattr(diffusion_params, "extra_args", {}) or {} + extra["cfg_text_scale"] = args.cfg_text_scale + extra["cfg_img_scale"] = args.cfg_img_scale + if args.cfg_interval is not None: + extra["cfg_interval"] = tuple(args.cfg_interval) + if args.cfg_renorm_type is not None: + extra["cfg_renorm_type"] = args.cfg_renorm_type + if args.cfg_renorm_min is not None: + extra["cfg_renorm_min"] = args.cfg_renorm_min + if args.negative_prompt is not None: + extra["negative_prompt"] = args.negative_prompt + + needs_text_gen = is_single_stage and (args.think or args.modality in ("text2text", "img2text")) + if needs_text_gen: + if args.think: + extra["think"] = True + extra["max_think_tokens"] = args.max_think_tokens + extra["do_sample"] = args.do_sample + extra["text_temperature"] = args.text_temperature + diffusion_params.extra_args = extra # type: ignore omni_outputs = list(omni.generate(prompts=formatted_prompts, sampling_params_list=params_list)) img_idx = 0 for req_output in omni_outputs: - if args.think: - ro = getattr(req_output, "request_output", None) - if ro and getattr(ro, "outputs", None): - txt = "".join(getattr(o, "text", "") or "" for o in ro.outputs) - if txt: - print(txt) + # 2-stage think mode: text output from thinker stage + ro = getattr(req_output, "request_output", None) + if ro and getattr(ro, "outputs", None): + txt = "".join(getattr(o, "text", "") or "" for o in ro.outputs) + if txt: + if args.think: + print(f"[Think]\n{txt}") + else: + print(f"[Output] Text:\n{txt}") - images = getattr(req_output, "images", None) + # Single-stage DiT: text from custom_output + custom = getattr(req_output, "_custom_output", {}) or {} + if custom.get("think_text"): + print(f"[Think]\n{custom['think_text']}") + if custom.get("text_output"): + print(f"[Output] Text:\n{custom['text_output']}") + images = getattr(req_output, "images", None) if not images: continue @@ -241,8 +279,6 @@ def main(): print(f"[Output] Saved image to {save_path}") img_idx += 1 - print(omni_outputs) - if __name__ == "__main__": main() diff --git a/vllm_omni/diffusion/models/bagel/bagel_transformer.py b/vllm_omni/diffusion/models/bagel/bagel_transformer.py index f848077568..d1254f8456 100644 --- a/vllm_omni/diffusion/models/bagel/bagel_transformer.py +++ b/vllm_omni/diffusion/models/bagel/bagel_transformer.py @@ -854,6 +854,7 @@ def __init__( 
config, parallel_config=parallel_config, quant_config=quant_config, prefix=f"{prefix}.model" ) self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) # Initialize weights and apply final processing self.post_init() @@ -864,6 +865,12 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.model.embed_tokens = value + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + def set_decoder(self, decoder): self.model = decoder @@ -1207,7 +1214,7 @@ def prepare_prompts(self, curr_kvlens, curr_rope, prompts, tokenizer, new_token_ packed_key_value_indexes.extend(range(curr, curr + curr_kvlen)) curr += curr_kvlen - text_ids = tokenizer.encode(prompt) + text_ids = tokenizer.encode(prompt, add_special_tokens=False) text_ids = [new_token_ids["bos_token_id"]] + text_ids + [new_token_ids["eos_token_id"]] text_token_lens.append(len(text_ids)) packed_text_ids.extend(text_ids) @@ -1619,10 +1626,110 @@ def _merge_naive_caches(caches: list) -> NaiveCache: num_layers = len(caches[0].key_cache) merged = NaiveCache(num_layers) for layer_idx in range(num_layers): - merged.key_cache[layer_idx] = torch.cat([c.key_cache[layer_idx] for c in caches], dim=0) - merged.value_cache[layer_idx] = torch.cat([c.value_cache[layer_idx] for c in caches], dim=0) + key_parts = [c.key_cache[layer_idx] for c in caches if c.key_cache[layer_idx] is not None] + val_parts = [c.value_cache[layer_idx] for c in caches if c.value_cache[layer_idx] is not None] + merged.key_cache[layer_idx] = torch.cat(key_parts, dim=0) if key_parts else None + merged.value_cache[layer_idx] = torch.cat(val_parts, dim=0) if val_parts else None return merged + def prepare_start_tokens(self, curr_kvlens, curr_rope, new_token_ids): + """Prepare start tokens for autoregressive text generation. + + Ported from the original BAGEL ``Bagel.prepare_start_tokens``. + """ + packed_start_tokens, packed_key_value_indexes = list(), list() + packed_query_position_ids = list() + + curr = 0 + for curr_kvlen, curr_position_id in zip(curr_kvlens, curr_rope): + packed_key_value_indexes.extend(range(curr, curr + curr_kvlen)) + packed_start_tokens.append(new_token_ids["bos_token_id"]) + packed_query_position_ids.append(curr_position_id) + curr += curr_kvlen + + generation_input = { + "packed_start_tokens": torch.tensor(packed_start_tokens, dtype=torch.long), + "packed_query_position_ids": torch.tensor(packed_query_position_ids, dtype=torch.long), + "key_values_lens": torch.tensor(curr_kvlens, dtype=torch.int), + "packed_key_value_indexes": torch.tensor(packed_key_value_indexes, dtype=torch.long), + } + return generation_input + + @torch.no_grad() + def generate_text( + self, + past_key_values: NaiveCache, + packed_key_value_indexes: torch.LongTensor, + key_values_lens: torch.IntTensor, + packed_start_tokens: torch.LongTensor, + packed_query_position_ids: torch.LongTensor, + max_length: int, + do_sample: bool = False, + temperature: float = 1.0, + end_token_id: int | None = None, + ): + """Autoregressive text generation (ported from original BAGEL). + + Decodes tokens one at a time, appending to ``past_key_values`` + until ``max_length`` is reached or ``end_token_id`` is generated. 
+ """ + step = 0 + generated_sequence = [] + curr_tokens = packed_start_tokens + while step < max_length: + generated_sequence.append(curr_tokens) + packed_text_embedding = self.language_model.model.embed_tokens(curr_tokens) + query_lens = torch.ones_like(curr_tokens) + packed_query_indexes = torch.cumsum(key_values_lens, dim=0) + torch.arange( + 0, + len(key_values_lens), + device=key_values_lens.device, + dtype=key_values_lens.dtype, + ) + + uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0)) + for i in range(len(uppacked)): + uppacked[i] += i + packed_key_value_indexes = torch.cat(uppacked, dim=0) + + output = self.language_model( + packed_query_sequence=packed_text_embedding, + query_lens=query_lens, + packed_query_position_ids=packed_query_position_ids, + packed_query_indexes=packed_query_indexes, + past_key_values=past_key_values, + key_values_lens=key_values_lens, + packed_key_value_indexes=packed_key_value_indexes, + update_past_key_values=True, + is_causal=True, + mode="und", + ) + past_key_values = output.past_key_values + packed_query_sequence = output.packed_query_sequence + pred_logits = self.language_model.lm_head(packed_query_sequence) + + if do_sample: + probs = nn.functional.softmax(pred_logits / temperature, dim=-1) + curr_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + curr_tokens = torch.argmax(pred_logits, dim=-1) + + uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0)) + for i in range(len(uppacked)): + uppacked[i] = torch.cat( + [uppacked[i], torch.tensor([uppacked[i][-1] + 1], device=uppacked[i].device)], dim=0 + ) + packed_key_value_indexes = torch.cat(uppacked, dim=0) + key_values_lens = key_values_lens + 1 + packed_query_position_ids = packed_query_position_ids + 1 + step += 1 + + if end_token_id is not None and curr_tokens[0] == end_token_id: + break + + output_device = generated_sequence[0].device + return torch.stack([i.to(output_device) for i in generated_sequence], dim=0) + def generate_image( self, packed_text_ids: torch.LongTensor, diff --git a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py index 13d0cc2093..72e53e7f48 100644 --- a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py +++ b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py @@ -495,11 +495,15 @@ def vae_transforms(img): cfg_text_context = deepcopy(gen_context) + # Strip <|im_start|>/<|im_end|> wrappers that end2end.py may have + # already added, so prepare_prompts doesn't double-add bos/eos. + clean_prompt = prompt.removeprefix("<|im_start|>").removesuffix("<|im_end|>") + # Update gen_context with text prompt generation_input, newlens, new_rope = self.bagel.prepare_prompts( curr_kvlens=gen_context["kv_lens"], curr_rope=gen_context["ropes"], - prompts=[prompt], + prompts=[clean_prompt], tokenizer=self.tokenizer, new_token_ids=self.new_token_ids, ) @@ -527,34 +531,37 @@ def vae_transforms(img): gen_context["kv_lens"] = newlens gen_context["ropes"] = new_rope - # cfg_text_context: update with negative prompt (no text condition) + # cfg_text_context: update with negative prompt (no text condition). + # When empty, keep cfg_text_context as-is (kv_lens=0) to match + # original BAGEL; _merge_naive_caches handles None KV entries. 
neg_prompt = extra_args.get("negative_prompt", "") - neg_input, neg_newlens, neg_rope = self.bagel.prepare_prompts( - curr_kvlens=cfg_text_context["kv_lens"], - curr_rope=cfg_text_context["ropes"], - prompts=[neg_prompt], - tokenizer=self.tokenizer, - new_token_ids=self.new_token_ids, - ) - for k, v in neg_input.items(): - if torch.is_tensor(v): - neg_input[k] = v.to(self.device) - with torch.autocast( - device_type=self.device.type, - enabled=self.device.type != "cpu", - dtype=self.od_config.dtype, - ): - cfg_text_context["past_key_values"] = self.bagel.forward_cache_update_text( - cfg_text_context["past_key_values"], **neg_input + if neg_prompt: + neg_input, neg_newlens, neg_rope = self.bagel.prepare_prompts( + curr_kvlens=cfg_text_context["kv_lens"], + curr_rope=cfg_text_context["ropes"], + prompts=[neg_prompt], + tokenizer=self.tokenizer, + new_token_ids=self.new_token_ids, ) - cfg_text_context["kv_lens"] = neg_newlens - cfg_text_context["ropes"] = neg_rope + for k, v in neg_input.items(): + if torch.is_tensor(v): + neg_input[k] = v.to(self.device) + with torch.autocast( + device_type=self.device.type, + enabled=self.device.type != "cpu", + dtype=self.od_config.dtype, + ): + cfg_text_context["past_key_values"] = self.bagel.forward_cache_update_text( + cfg_text_context["past_key_values"], **neg_input + ) + cfg_text_context["kv_lens"] = neg_newlens + cfg_text_context["ropes"] = neg_rope # cfg_img_context: update with text prompt (no image condition) cfg_img_generation_input, cfg_img_newlens, cfg_img_new_rope = self.bagel.prepare_prompts( curr_kvlens=cfg_img_context["kv_lens"], curr_rope=cfg_img_context["ropes"], - prompts=[prompt], + prompts=[clean_prompt], tokenizer=self.tokenizer, new_token_ids=self.new_token_ids, ) @@ -572,6 +579,96 @@ def vae_transforms(img): cfg_img_context["kv_lens"] = cfg_img_newlens cfg_img_context["ropes"] = cfg_img_new_rope + # ---- Detect output modality and think mode ---- + modalities = first_prompt.get("modalities", []) if isinstance(first_prompt, dict) else [] + is_text_output = "text" in modalities + think_enabled = extra_args.get("think", False) + think_text = None + + if think_enabled and injected_kv is None: + max_think_tokens = int(extra_args.get("max_think_tokens", 1000)) + do_sample = bool(extra_args.get("do_sample", False)) + text_temperature = float(extra_args.get("text_temperature", 0.3)) + + with torch.autocast( + device_type=self.device.type, + enabled=self.device.type != "cpu", + dtype=self.od_config.dtype, + ): + start_input = self.bagel.prepare_start_tokens( + gen_context["kv_lens"], gen_context["ropes"], self.new_token_ids + ) + for k, v in start_input.items(): + if torch.is_tensor(v): + start_input[k] = v.to(self.device) + + gen_ctx_copy = deepcopy(gen_context) + token_ids = self.bagel.generate_text( + past_key_values=gen_ctx_copy["past_key_values"], + max_length=max_think_tokens, + do_sample=do_sample, + temperature=text_temperature, + end_token_id=self.new_token_ids["eos_token_id"], + **start_input, + ) + # token_ids shape: (seq_len, batch=1) + decoded = self.tokenizer.decode(token_ids[:, 0].tolist()) + # Strip chat markers to get clean text + think_text = decoded.split("<|im_end|>")[0] + if "<|im_start|>" in think_text: + think_text = think_text.split("<|im_start|>")[-1] + logger.info("Think mode generated %d tokens", token_ids.shape[0]) + + if not is_text_output: + # Use the autoregressive KV cache from think generation + # directly, instead of decode→re-encode which adds extra + # bos/eos and may alter tokenization. 
+ num_think_tokens = token_ids.shape[0] + gen_context["past_key_values"] = gen_ctx_copy["past_key_values"] + gen_context["kv_lens"] = [kl + num_think_tokens for kl in gen_context["kv_lens"]] + gen_context["ropes"] = [r + num_think_tokens for r in gen_context["ropes"]] + + # ---- Text-only output (text2text / img2text) ---- + if is_text_output and injected_kv is None: + if think_text is not None: + # Think mode already generated the text (including reasoning) + text_output = think_text + else: + max_text_tokens = int(extra_args.get("max_think_tokens", 500)) + do_sample = bool(extra_args.get("do_sample", False)) + text_temperature = float(extra_args.get("text_temperature", 0.3)) + + with torch.autocast( + device_type=self.device.type, + enabled=self.device.type != "cpu", + dtype=self.od_config.dtype, + ): + start_input = self.bagel.prepare_start_tokens( + gen_context["kv_lens"], gen_context["ropes"], self.new_token_ids + ) + for k, v in start_input.items(): + if torch.is_tensor(v): + start_input[k] = v.to(self.device) + token_ids = self.bagel.generate_text( + past_key_values=gen_context["past_key_values"], + max_length=max_text_tokens, + do_sample=do_sample, + temperature=text_temperature, + end_token_id=self.new_token_ids["eos_token_id"], + **start_input, + ) + decoded = self.tokenizer.decode(token_ids[:, 0].tolist()) + text_output = decoded.split("<|im_end|>")[0] + if "<|im_start|>" in text_output: + text_output = text_output.split("<|im_start|>")[-1] + + return DiffusionOutput( + output=text_output, + custom_output={"text_output": text_output}, + stage_durations=self.stage_durations if hasattr(self, "stage_durations") else None, + ) + + # ---- Image generation (text2img / img2img) ---- if req.sampling_params.seed is not None: torch.manual_seed(req.sampling_params.seed) if self.device.type == "cuda": @@ -676,12 +773,17 @@ def vae_transforms(img): if trajectory_log_probs: trajectory_log_probs_stacked = torch.stack(trajectory_log_probs) + custom = {} + if think_text is not None: + custom["think_text"] = think_text + return DiffusionOutput( output=img, trajectory_latents=trajectory_latents_stacked, trajectory_timesteps=trajectory_timesteps_stacked, trajectory_log_probs=trajectory_log_probs_stacked, trajectory_decoded=trajectory_decoded, + custom_output=custom, stage_durations=self.stage_durations if hasattr(self, "stage_durations") else None, ) From e0cdbe9a5d7ec654bbbe26c2fb6e76abe41446d2 Mon Sep 17 00:00:00 2001 From: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com> Date: Mon, 13 Apr 2026 19:21:42 +0800 Subject: [PATCH 149/204] [Misc] Cleanup: use consistent pytest-mock in unit tests (#2698) Signed-off-by: yuanheng --- tests/comfyui/conftest.py | 18 +- tests/comfyui/test_comfyui_integration.py | 95 +- .../test_generation_scheduler_restore.py | 27 +- .../test_distributed_vae_executor.py | 41 +- .../models/bagel/test_trajectory_recording.py | 34 +- .../models/flux2/test_flux2_transformer_tp.py | 20 +- .../offloader/test_sequential_backend.py | 120 +- .../quantization/test_int8_config.py | 32 +- tests/diffusion/test_diffusion_scheduler.py | 103 +- .../diffusion/test_diffusion_step_pipeline.py | 26 +- .../test_diffusion_worker_cuda_profiler.py | 6 +- .../test_multiproc_engine_concurrency.py | 28 +- tests/engine/test_arg_utils.py | 9 +- tests/engine/test_async_omni_engine_input.py | 15 +- .../engine/test_async_omni_engine_outputs.py | 20 +- tests/engine/test_single_stage_mode.py | 1533 ++++++++++------- .../openai_api/test_serving_chat_speaker.py | 40 +- 
.../openai_api/test_serving_speech.py | 215 ++- .../openai_api/test_serving_speech_stream.py | 117 +- tests/entrypoints/test_omni_base_profiler.py | 27 +- tests/entrypoints/test_serve.py | 188 +- .../test_mimo_audio_code2wav_batch_decode.py | 40 +- .../qwen2_5_omni/test_qwen2_5_omni_embed.py | 37 +- .../qwen3_tts/test_code_predictor_dtype.py | 131 +- .../models/test_fish_speech_voice_cache.py | 30 +- tests/test_fish_speech_voice_cache.py | 39 +- 26 files changed, 1610 insertions(+), 1381 deletions(-) diff --git a/tests/comfyui/conftest.py b/tests/comfyui/conftest.py index 0b4565e946..4280d3506f 100644 --- a/tests/comfyui/conftest.py +++ b/tests/comfyui/conftest.py @@ -9,8 +9,8 @@ import os import sys +from types import ModuleType, SimpleNamespace from typing import BinaryIO, TypedDict -from unittest.mock import MagicMock def pytest_configure(config): @@ -58,15 +58,15 @@ def save_to(self, file: str | BinaryIO): else: file.write(self._data) - mock_comfy_api = MagicMock() - mock_comfy_api_input = MagicMock() + mock_comfy_api = ModuleType("comfy_api") + mock_comfy_api_input = ModuleType("comfy_api.input") mock_comfy_api_input.AudioInput = AudioInput mock_comfy_api_input.VideoInput = VideoInput mock_comfy_api.input = mock_comfy_api_input - mock_comfy_api_latest = MagicMock() - mock_comfy_api_latest.Types.VideoComponents = MagicMock(side_effect=lambda **kwargs: kwargs) - mock_comfy_api_latest.InputImpl.VideoFromComponents = MagicMock( - side_effect=lambda _: VideoInput(b"mock_video_from_components") + mock_comfy_api_latest = ModuleType("comfy_api.latest") + mock_comfy_api_latest.Types = SimpleNamespace(VideoComponents=lambda **kwargs: kwargs) + mock_comfy_api_latest.InputImpl = SimpleNamespace( + VideoFromComponents=lambda _: VideoInput(b"mock_video_from_components") ) mock_comfy_api.latest = mock_comfy_api_latest @@ -76,8 +76,8 @@ def mock_load(_: str | BinaryIO): sample_rate = 24000 return waveform, sample_rate - mock_comfy_extras = MagicMock() - mock_nodes_audio = MagicMock() + mock_comfy_extras = ModuleType("comfy_extras") + mock_nodes_audio = ModuleType("comfy_extras.nodes_audio") mock_nodes_audio.load = mock_load mock_comfy_extras.nodes_audio = mock_nodes_audio diff --git a/tests/comfyui/test_comfyui_integration.py b/tests/comfyui/test_comfyui_integration.py index f6ce82f9b2..80e86d8241 100644 --- a/tests/comfyui/test_comfyui_integration.py +++ b/tests/comfyui/test_comfyui_integration.py @@ -13,7 +13,6 @@ from enum import StrEnum, auto from types import SimpleNamespace from typing import Any, NamedTuple -from unittest.mock import AsyncMock, MagicMock, patch import pytest import requests @@ -28,6 +27,7 @@ ) from comfyui_vllm_omni.utils.types import AutoregressionSamplingParams, DiffusionSamplingParams, WanModelSpecificParams from PIL import Image +from pytest_mock import MockerFixture from vllm import SamplingParams from vllm.outputs import CompletionOutput, RequestOutput from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -217,9 +217,10 @@ def _build_diffusion_video_output() -> OmniRequestOutput: def _build_diffusion_image_output_for_chat_endpoint() -> OmniRequestOutput: - request_output = MagicMock() - request_output.images = [_build_image_output(color="blue")] - request_output.finished = True + request_output = SimpleNamespace( + images=[_build_image_output(color="blue")], + finished=True, + ) return OmniRequestOutput( request_id="test_req_img_chat", finished=True, @@ -389,51 +390,55 @@ def sampling_case(request) -> SamplingCase: @pytest.fixture -def 
mock_async_omni(server_case: ServerCase, sampling_case: SamplingCase): +def mock_async_omni( + server_case: ServerCase, + sampling_case: SamplingCase, + monkeypatch: pytest.MonkeyPatch, + mocker: MockerFixture, +): async def _mock_preprocess_chat(self, *args, **kwargs): return ([{"role": "user", "content": "test"}], [{"prompt": "test prompt"}]) # Need to mock AsyncOmni itself (not only its generate method) because # 1. The API layer uses its stage_list and stage_configs attributes # 2. Its __init__ method has slow side effects (model & config loading). - with ( - patch("vllm_omni.entrypoints.openai.api_server.AsyncOmni") as MockAsyncOmni, - patch( - "vllm_omni.entrypoints.openai.serving_chat.OmniOpenAIServingChat._preprocess_chat", - new=_mock_preprocess_chat, - ), - ): - mock_instance = AsyncMock(spec=RealAsyncOmni) - mock_instance.generate = _build_mock_outputs(server_case.outputs, sampling_case, server_case) - - mock_instance.stage_list = server_case.stage_list - mock_instance.stage_configs = server_case.stage_configs - mock_instance.output_modalities = _build_output_modalities(server_case.stage_configs) - mock_instance.default_sampling_params_list = [ - SamplingParams() if _stage_type(stage) != "diffusion" else MagicMock() - for stage in server_case.stage_configs - ] - mock_instance.errored = False - mock_instance.dead_error = RuntimeError("Mock engine error") - mock_instance.model_config = MagicMock( - max_model_len=4096, - io_processor_plugin=None, - allowed_local_media_path=None, - allowed_media_domains=None, - ) - # Mimic Qwen3-TTS talker speaker config so CustomVoice validation passes. - mock_instance.model_config.hf_config = MagicMock() - mock_instance.model_config.hf_config.talker_config = MagicMock() - mock_instance.model_config.hf_config.talker_config.speaker_id = {"Vivian": 0} - mock_instance.io_processor = MagicMock() - mock_instance.input_processor = MagicMock() - mock_instance.shutdown = MagicMock() - mock_instance.get_vllm_config = AsyncMock(return_value=None) - mock_instance.get_supported_tasks = AsyncMock(return_value=["generate"]) - mock_instance.get_tokenizer = AsyncMock(return_value=None) + mock_async_omni_cls = mocker.patch("vllm_omni.entrypoints.openai.api_server.AsyncOmni") + monkeypatch.setattr( + "vllm_omni.entrypoints.openai.serving_chat.OmniOpenAIServingChat._preprocess_chat", + _mock_preprocess_chat, + ) + + mock_instance = mocker.AsyncMock(spec=RealAsyncOmni) + mock_instance.generate = _build_mock_outputs(server_case.outputs, sampling_case, server_case) + + mock_instance.stage_list = server_case.stage_list + mock_instance.stage_configs = server_case.stage_configs + mock_instance.output_modalities = _build_output_modalities(server_case.stage_configs) + mock_instance.default_sampling_params_list = [ + SamplingParams() if _stage_type(stage) != "diffusion" else mocker.MagicMock() + for stage in server_case.stage_configs + ] + mock_instance.errored = False + mock_instance.dead_error = RuntimeError("Mock engine error") + mock_instance.model_config = mocker.MagicMock( + max_model_len=4096, + io_processor_plugin=None, + allowed_local_media_path=None, + allowed_media_domains=None, + ) + # Mimic Qwen3-TTS talker speaker config so CustomVoice validation passes. 
+ mock_instance.model_config.hf_config = mocker.MagicMock() + mock_instance.model_config.hf_config.talker_config = mocker.MagicMock() + mock_instance.model_config.hf_config.talker_config.speaker_id = {"Vivian": 0} + mock_instance.io_processor = mocker.MagicMock() + mock_instance.input_processor = mocker.MagicMock() + mock_instance.shutdown = mocker.MagicMock() + mock_instance.get_vllm_config = mocker.AsyncMock(return_value=None) + mock_instance.get_supported_tasks = mocker.AsyncMock(return_value=["generate"]) + mock_instance.get_tokenizer = mocker.AsyncMock(return_value=None) - MockAsyncOmni.return_value = mock_instance - yield MockAsyncOmni + mock_async_omni_cls.return_value = mock_instance + yield mock_async_omni_cls @pytest.fixture @@ -583,9 +588,9 @@ async def test_image_generation_node(api_server: str, model: str, image_input: b ServerCase( served_model="Qwen/Qwen2.5-Omni-7B", stage_list=[ - MagicMock(is_comprehension=True, model_stage="llm"), - MagicMock(is_comprehension=False, model_stage="llm"), - MagicMock(is_comprehension=False, model_stage="llm"), + SimpleNamespace(is_comprehension=True, model_stage="llm"), + SimpleNamespace(is_comprehension=False, model_stage="llm"), + SimpleNamespace(is_comprehension=False, model_stage="llm"), ], stage_configs=[ _make_stage_config("llm", is_comprehension=True, model_stage="thinker"), diff --git a/tests/core/sched/test_generation_scheduler_restore.py b/tests/core/sched/test_generation_scheduler_restore.py index 154f40b399..5cc1cab702 100644 --- a/tests/core/sched/test_generation_scheduler_restore.py +++ b/tests/core/sched/test_generation_scheduler_restore.py @@ -6,7 +6,6 @@ those requests are permanently orphaned. """ -import unittest from collections import deque import pytest @@ -39,7 +38,7 @@ def postprocess_scheduler_output(self, output): pass -class TestRestoreQueuesOnError(unittest.TestCase): +class TestRestoreQueuesOnError: """Verify that restore_queues is called even when rewrapping raises.""" def test_requests_not_lost_on_exception(self): @@ -52,8 +51,8 @@ def test_requests_not_lost_on_exception(self): # Step 1: process_pending_chunks moves req-B out adapter.process_pending_chunks(waiting=[], running=running) - self.assertEqual(running, ["req-A"]) - self.assertEqual(len(adapter.waiting_for_chunk_running_requests), 1) + assert running == ["req-A"] + assert len(adapter.waiting_for_chunk_running_requests) == 1 # Step 2: simulate the try/except/finally pattern try: @@ -65,9 +64,9 @@ def test_requests_not_lost_on_exception(self): adapter.restore_queues(waiting=[], running=running) # Step 3: verify request is restored - self.assertTrue(adapter.restore_called) - self.assertIn("req-B", running) - self.assertEqual(len(adapter.waiting_for_chunk_running_requests), 0) + assert adapter.restore_called is True + assert "req-B" in running + assert len(adapter.waiting_for_chunk_running_requests) == 0 def test_requests_lost_without_fix(self): """Demonstrate the bug: without restore in except, request is lost.""" @@ -76,7 +75,7 @@ def test_requests_lost_without_fix(self): running = ["req-A", "req-B"] adapter.process_pending_chunks(waiting=[], running=running) - self.assertEqual(running, ["req-A"]) + assert running == ["req-A"] # Simulate the BUGGY code: except without restore try: @@ -85,8 +84,8 @@ def test_requests_lost_without_fix(self): pass # Bug: no restore_queues call # Request is lost! 
- self.assertNotIn("req-B", running) - self.assertEqual(len(adapter.waiting_for_chunk_running_requests), 1) + assert "req-B" not in running + assert len(adapter.waiting_for_chunk_running_requests) == 1 def test_happy_path_restores_via_finally(self): """When no exception, restore_queues is still called via finally.""" @@ -102,9 +101,5 @@ def test_happy_path_restores_via_finally(self): finally: adapter.restore_queues(waiting=[], running=running) - self.assertTrue(adapter.restore_called) - self.assertIn("req-B", running) - - -if __name__ == "__main__": - unittest.main() + assert adapter.restore_called is True + assert "req-B" in running diff --git a/tests/diffusion/distributed/test_distributed_vae_executor.py b/tests/diffusion/distributed/test_distributed_vae_executor.py index dc491dcdaf..b2ee7c10d3 100644 --- a/tests/diffusion/distributed/test_distributed_vae_executor.py +++ b/tests/diffusion/distributed/test_distributed_vae_executor.py @@ -1,4 +1,4 @@ -from unittest.mock import MagicMock, patch +from types import SimpleNamespace import pytest import torch @@ -61,40 +61,31 @@ def merge(self, coord_tensor_map, grid_spec): class DummyMixin(DistributedVaeMixin): def __init__(self): self.use_tiling = True - self.distributed_executor = MagicMock() - self.distributed_executor.parallel_size = 2 - self.distributed_executor.group = None + self.distributed_executor = SimpleNamespace(parallel_size=2, group=None) @pytest.fixture(autouse=True) -def mock_dist(): - with ( - patch.object(dist, "get_world_size", return_value=2), - patch.object(dist, "get_rank", return_value=0), - patch.object(dist, "is_initialized", return_value=True), - patch.object(dist, "all_reduce", return_value=None), - patch.object(dist, "gather", return_value=None), - patch.object(dist, "broadcast", return_value=None), - ): - yield +def mock_dist(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setattr(dist, "get_world_size", lambda *args, **kwargs: 2) + monkeypatch.setattr(dist, "get_rank", lambda *args, **kwargs: 0) + monkeypatch.setattr(dist, "is_initialized", lambda: True) + monkeypatch.setattr(dist, "all_reduce", lambda *args, **kwargs: None) + monkeypatch.setattr(dist, "gather", lambda *args, **kwargs: None) + monkeypatch.setattr(dist, "broadcast", lambda *args, **kwargs: None) @pytest.fixture(autouse=True) -def mock_dit_group(): - with patch( +def mock_dit_group(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setattr( "vllm_omni.diffusion.distributed.autoencoders.distributed_vae_executor.get_dit_group", - new=MagicMock(return_value=None), - ): - yield + lambda: None, + ) @pytest.fixture(autouse=True) -def mock_dist_vae_executor(): - with ( - patch.object(DistributedVaeExecutor, "gather_tensors", side_effect=lambda x: [x]), - patch.object(DistributedVaeExecutor, "broadcast_tensor", side_effect=lambda x: x), - ): - yield +def mock_dist_vae_executor(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setattr(DistributedVaeExecutor, "gather_tensors", lambda self, x: [x]) + monkeypatch.setattr(DistributedVaeExecutor, "broadcast_tensor", lambda self, x: x) # ============================ diff --git a/tests/diffusion/models/bagel/test_trajectory_recording.py b/tests/diffusion/models/bagel/test_trajectory_recording.py index 80b3f9d9ba..345eac1078 100644 --- a/tests/diffusion/models/bagel/test_trajectory_recording.py +++ b/tests/diffusion/models/bagel/test_trajectory_recording.py @@ -4,10 +4,10 @@ import types from dataclasses import dataclass -from unittest.mock import MagicMock, patch import pytest import torch +from pytest_mock import 
MockerFixture from vllm_omni.diffusion.models.bagel.bagel_transformer import ( Bagel, @@ -23,9 +23,9 @@ EXPECTED_STEPS = NUM_TIMESTEPS - 1 -def _make_mock_bagel(): +def _make_mock_bagel(mocker: MockerFixture): """Create a mock Bagel with forward returning constant velocity.""" - mock = MagicMock(spec=Bagel) + mock = mocker.MagicMock(spec=Bagel) mock._sp_size = 1 # forward returns a small constant velocity so x_t changes each step @@ -78,18 +78,22 @@ def _make_generate_args(num_tokens=NUM_TOKENS, hidden_dim=HIDDEN_DIM, cfg=False) @pytest.fixture(params=[False, True], ids=["no_cfg", "batched_cfg"]) -def bagel_and_args(request): +def bagel_and_args( + request, + monkeypatch: pytest.MonkeyPatch, + mocker: MockerFixture, +): """Mock Bagel instance and generate_image arguments. Parametrized over CFG mode so every test runs on both the no-CFG and batched-CFG code paths. """ cfg = request.param - with patch( + monkeypatch.setattr( "vllm_omni.diffusion.models.bagel.bagel_transformer.get_classifier_free_guidance_world_size", - return_value=1, - ): - yield _make_mock_bagel(), _make_generate_args(cfg=cfg) + lambda: 1, + ) + yield _make_mock_bagel(mocker), _make_generate_args(cfg=cfg) class TestTrajectoryRecording: @@ -188,12 +192,16 @@ class TestTrajectoryLogProbs: """Tests for log-prob recording when a scheduler is provided.""" @pytest.fixture() - def bagel_scheduler_args(self): - with patch( + def bagel_scheduler_args( + self, + monkeypatch: pytest.MonkeyPatch, + mocker: MockerFixture, + ): + monkeypatch.setattr( "vllm_omni.diffusion.models.bagel.bagel_transformer.get_classifier_free_guidance_world_size", - return_value=1, - ): - yield _make_mock_bagel(), _make_generate_args(), _MockScheduler() + lambda: 1, + ) + yield _make_mock_bagel(mocker), _make_generate_args(), _MockScheduler() def test_log_probs_recorded_with_scheduler(self, bagel_scheduler_args): bagel, args, scheduler = bagel_scheduler_args diff --git a/tests/diffusion/models/flux2/test_flux2_transformer_tp.py b/tests/diffusion/models/flux2/test_flux2_transformer_tp.py index faad08afd1..54dda1dd07 100644 --- a/tests/diffusion/models/flux2/test_flux2_transformer_tp.py +++ b/tests/diffusion/models/flux2/test_flux2_transformer_tp.py @@ -1,7 +1,6 @@ -from unittest.mock import MagicMock, patch - import pytest import torch +from pytest_mock import MockerFixture from tests.utils import hardware_test from vllm_omni.diffusion.models.flux2.flux2_transformer import ( @@ -12,14 +11,17 @@ # Initialize TP group before tests @pytest.fixture(scope="function", autouse=True) -def setup_tp_group(): +def setup_tp_group(mocker: MockerFixture): """Set up TP group for each test function""" - with patch("vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", return_value=2): - with patch("vllm.distributed.parallel_state.get_tp_group") as mock_get_tp_group: - mock_tp_group = MagicMock() - mock_tp_group.world_size = 2 - mock_get_tp_group.return_value = mock_tp_group - yield + mocker.patch( + "vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", + return_value=2, + ) + mock_get_tp_group = mocker.patch("vllm.distributed.parallel_state.get_tp_group") + mock_tp_group = mocker.MagicMock() + mock_tp_group.world_size = 2 + mock_get_tp_group.return_value = mock_tp_group + yield class TestFlux2TransformerWeightLoading: diff --git a/tests/diffusion/offloader/test_sequential_backend.py b/tests/diffusion/offloader/test_sequential_backend.py index d18637a780..2539cc0689 100644 --- a/tests/diffusion/offloader/test_sequential_backend.py +++ 
b/tests/diffusion/offloader/test_sequential_backend.py @@ -3,8 +3,6 @@ """Unit tests for SequentialOffloadBackend.""" -from unittest.mock import patch - import pytest import torch from torch import nn @@ -44,7 +42,7 @@ def mock(self): class TestMoveParamsPinMemory: - def test_dtensor_skips_pin_memory(self, accelerator_device): + def test_dtensor_skips_pin_memory(self, accelerator_device, monkeypatch: pytest.MonkeyPatch): """DTensor should skip pin_memory to avoid RuntimeError.""" module = _create_simple_module().to(accelerator_device) tracker, mock_pin = _track_pin_memory_calls() @@ -56,73 +54,73 @@ def fake_isinstance(obj, cls): return True return original_isinstance(obj, cls) - with patch.object(torch.Tensor, "pin_memory", mock_pin): - with patch("builtins.isinstance", fake_isinstance): - hook = SequentialOffloadHook( - offload_targets=[], - device=accelerator_device, - pin_memory=True, - use_hsdp=False, - ) - hook._move_params( - module, - torch.device("cpu"), - non_blocking=False, - pin_memory=True, - ) - assert not tracker["called"], "pin_memory should not be called for DTensor" - - def test_regular_tensor_calls_pin_memory(self, accelerator_device): + monkeypatch.setattr(torch.Tensor, "pin_memory", mock_pin) + monkeypatch.setattr("builtins.isinstance", fake_isinstance) + hook = SequentialOffloadHook( + offload_targets=[], + device=accelerator_device, + pin_memory=True, + use_hsdp=False, + ) + hook._move_params( + module, + torch.device("cpu"), + non_blocking=False, + pin_memory=True, + ) + assert not tracker["called"], "pin_memory should not be called for DTensor" + + def test_regular_tensor_calls_pin_memory(self, accelerator_device, monkeypatch: pytest.MonkeyPatch): """Regular tensor should call pin_memory when moving to CPU.""" module = _create_simple_module().to(accelerator_device) tracker, mock_pin = _track_pin_memory_calls() - with patch.object(torch.Tensor, "pin_memory", mock_pin): - hook = SequentialOffloadHook( - offload_targets=[], - device=accelerator_device, - pin_memory=True, - use_hsdp=False, - ) - hook._move_params( - module, - torch.device("cpu"), - non_blocking=False, - pin_memory=True, - ) - assert tracker["called"], "pin_memory should be called for regular tensors" - - def test_pin_memory_skipped_when_disabled(self, accelerator_device): + monkeypatch.setattr(torch.Tensor, "pin_memory", mock_pin) + hook = SequentialOffloadHook( + offload_targets=[], + device=accelerator_device, + pin_memory=True, + use_hsdp=False, + ) + hook._move_params( + module, + torch.device("cpu"), + non_blocking=False, + pin_memory=True, + ) + assert tracker["called"], "pin_memory should be called for regular tensors" + + def test_pin_memory_skipped_when_disabled(self, accelerator_device, monkeypatch: pytest.MonkeyPatch): """pin_memory should not be called when pin_memory=False.""" module = _create_simple_module().to(accelerator_device) tracker, mock_pin = _track_pin_memory_calls() - with patch.object(torch.Tensor, "pin_memory", mock_pin): - hook = SequentialOffloadHook( - offload_targets=[], - device=accelerator_device, - pin_memory=False, - use_hsdp=False, - ) - hook._move_params( - module, - torch.device("cpu"), - non_blocking=False, - pin_memory=False, - ) - assert not tracker["called"], "pin_memory should not be called when disabled" - - def test_pin_memory_skipped_for_non_cpu_target(self, accelerator_device): + monkeypatch.setattr(torch.Tensor, "pin_memory", mock_pin) + hook = SequentialOffloadHook( + offload_targets=[], + device=accelerator_device, + pin_memory=False, + use_hsdp=False, + 
) + hook._move_params( + module, + torch.device("cpu"), + non_blocking=False, + pin_memory=False, + ) + assert not tracker["called"], "pin_memory should not be called when disabled" + + def test_pin_memory_skipped_for_non_cpu_target(self, accelerator_device, monkeypatch: pytest.MonkeyPatch): """pin_memory should not be called for non-CPU targets.""" module = _create_simple_module().to("cpu") tracker, mock_pin = _track_pin_memory_calls() - with patch.object(torch.Tensor, "pin_memory", mock_pin): - hook = SequentialOffloadHook( - offload_targets=[], - device=torch.device("cpu"), - pin_memory=True, - use_hsdp=False, - ) - hook._move_params(module, accelerator_device, non_blocking=False, pin_memory=True) - assert not tracker["called"], "pin_memory should not be called for non-CPU target" + monkeypatch.setattr(torch.Tensor, "pin_memory", mock_pin) + hook = SequentialOffloadHook( + offload_targets=[], + device=torch.device("cpu"), + pin_memory=True, + use_hsdp=False, + ) + hook._move_params(module, accelerator_device, non_blocking=False, pin_memory=True) + assert not tracker["called"], "pin_memory should not be called for non-CPU target" diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py index d4d5aa5a7f..875277ece4 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/diffusion/quantization/test_int8_config.py @@ -2,8 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Unit tests for Int8 quantization config.""" -from unittest.mock import MagicMock, patch - import pytest import torch from pytest_mock import MockerFixture @@ -102,7 +100,7 @@ def test_quantization_config_string_and_dict_equivalent(): assert config_str.quantization_config.activation_scheme == config_dict.quantization_config.activation_scheme -def test_get_quant_method(mocker: MockerFixture): +def test_get_quant_method(mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch): """Test for get_quant_method method for GPU""" from vllm_omni.quantization.int8_config import Int8OnlineLinearMethod @@ -111,18 +109,16 @@ def test_get_quant_method(mocker: MockerFixture): def _fake_init(self, quant_config): pass - layer = MagicMock(spec=LinearBase) + layer = mocker.Mock(spec=LinearBase) mocker.patch.object(Int8OnlineLinearMethod, "__init__", _fake_init) prefix = "test_layer" # Mock the platform to be GPU - with ( - patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=True), - patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=False), - ): - method = config.get_quant_method(layer, prefix) - assert isinstance(method, Int8OnlineLinearMethod) + monkeypatch.setattr(current_omni_platform, "is_cuda", lambda: True) + monkeypatch.setattr(current_omni_platform, "is_npu", lambda: False) + method = config.get_quant_method(layer, prefix) + assert isinstance(method, Int8OnlineLinearMethod) # Test skipping quantization for a layer config.ignored_layers = [prefix] @@ -130,22 +126,20 @@ def _fake_init(self, quant_config): assert isinstance(method, UnquantizedLinearMethod) -def test_get_npu_quant_method(): +def test_get_npu_quant_method(mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch): """Test for get_quant_method method for NPU""" from vllm_omni.quantization.int8_config import NPUInt8OnlineLinearMethod config = build_quant_config("int8") - layer = MagicMock(spec=LinearBase) + layer = mocker.Mock(spec=LinearBase) prefix = "test_layer" # Mock the platform to be NPU - with ( - 
patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=False), - patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=True), - ): - method = config.get_quant_method(layer, prefix) - assert isinstance(method, NPUInt8OnlineLinearMethod) + monkeypatch.setattr(current_omni_platform, "is_cuda", lambda: False) + monkeypatch.setattr(current_omni_platform, "is_npu", lambda: True) + method = config.get_quant_method(layer, prefix) + assert isinstance(method, NPUInt8OnlineLinearMethod) # Test skipping quantization for a layer config.ignored_layers = [prefix] @@ -245,7 +239,7 @@ class TestNPUInt8LinearMethod: @pytest.fixture def mock_torch_npu(self, mocker): - torch_npu = MagicMock() + torch_npu = mocker.MagicMock() mocker.patch("vllm_omni.quantization.int8_config.torch_npu", return_value=torch_npu) mocker.patch( diff --git a/tests/diffusion/test_diffusion_scheduler.py b/tests/diffusion/test_diffusion_scheduler.py index 4324ba1e63..a64d9920e0 100644 --- a/tests/diffusion/test_diffusion_scheduler.py +++ b/tests/diffusion/test_diffusion_scheduler.py @@ -4,10 +4,10 @@ import queue import threading from types import SimpleNamespace -from unittest.mock import Mock, patch import pytest import torch +from pytest_mock import MockerFixture from vllm_omni.diffusion.data import DiffusionOutput, DiffusionRequestAbortedError from vllm_omni.diffusion.diffusion_engine import DiffusionEngine @@ -97,19 +97,19 @@ def initialize(self, od_config) -> None: def add_request(self, request: OmniDiffusionRequest) -> str: assert request is self._request - self._state = Mock(sched_req_id=self._sched_req_id, req=request) + self._state = SimpleNamespace(sched_req_id=self._sched_req_id, req=request) return self._sched_req_id def schedule(self): if self._scheduled or self._state is None: - return Mock( + return SimpleNamespace( scheduled_new_reqs=[], scheduled_cached_reqs=CachedRequestData.make_empty(), scheduled_req_ids=[], is_empty=True, ) self._scheduled = True - return Mock( + return SimpleNamespace( scheduled_new_reqs=[NewRequestData.from_state(self._state)], scheduled_cached_reqs=CachedRequestData.make_empty(), scheduled_req_ids=[self._state.sched_req_id], @@ -153,7 +153,7 @@ def close(self) -> None: class TestRequestScheduler: def setup_method(self) -> None: self.scheduler: RequestScheduler = RequestScheduler() - self.scheduler.initialize(Mock()) + self.scheduler.initialize(SimpleNamespace()) def test_single_request_success_lifecycle(self) -> None: req_id = self.scheduler.add_request(_make_request("a")) @@ -276,23 +276,23 @@ def test_request_id_mapping_lifecycle(self) -> None: class TestDiffusionEngine: - def test_add_req_and_wait_for_response_single_path(self) -> None: + def test_add_req_and_wait_for_response_single_path(self, mocker: MockerFixture) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.scheduler = RequestScheduler() - engine.scheduler.initialize(Mock()) + engine.scheduler.initialize(SimpleNamespace()) engine._rpc_lock = threading.RLock() engine.abort_queue = queue.Queue() request = _make_request("engine") runner_output = _make_request_output("engine") - engine.execute_fn = Mock(return_value=runner_output) + engine.execute_fn = mocker.Mock(return_value=runner_output) output = engine.add_req_and_wait_for_response(request) assert output is runner_output.result engine.execute_fn.assert_called_once() - def test_supports_scheduler_interface_injection(self) -> None: + def test_supports_scheduler_interface_injection(self, mocker: MockerFixture) -> None: request = 
_make_request("engine_iface") runner_output = _make_request_output("engine_iface") scheduler = _StubScheduler(request, runner_output) @@ -301,33 +301,45 @@ def test_supports_scheduler_interface_injection(self) -> None: engine.scheduler = scheduler engine._rpc_lock = threading.RLock() engine.abort_queue = queue.Queue() - engine.execute_fn = Mock(return_value=runner_output) + engine.execute_fn = mocker.Mock(return_value=runner_output) output = engine.add_req_and_wait_for_response(request) assert output is runner_output.result engine.execute_fn.assert_called_once() - def test_initializes_injected_scheduler(self) -> None: + def test_initializes_injected_scheduler( + self, + monkeypatch: pytest.MonkeyPatch, + mocker: MockerFixture, + ) -> None: request = _make_request("init") scheduler = _StubScheduler(request, DiffusionOutput(output=None)) - od_config = Mock(model_class_name="mock_model") - fake_executor_cls = Mock(return_value=Mock()) + od_config = SimpleNamespace(model_class_name="mock_model") + fake_executor_cls = mocker.Mock(return_value=mocker.Mock()) - with ( - patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_post_process_func", return_value=None), - patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_pre_process_func", return_value=None), - patch("vllm_omni.diffusion.diffusion_engine.DiffusionExecutor.get_class", return_value=fake_executor_cls), - patch.object(DiffusionEngine, "_dummy_run", return_value=None), - ): - DiffusionEngine(od_config, scheduler=scheduler) + monkeypatch.setattr( + "vllm_omni.diffusion.diffusion_engine.get_diffusion_post_process_func", + lambda *args, **kwargs: None, + ) + monkeypatch.setattr( + "vllm_omni.diffusion.diffusion_engine.get_diffusion_pre_process_func", + lambda *args, **kwargs: None, + ) + monkeypatch.setattr( + "vllm_omni.diffusion.diffusion_engine.DiffusionExecutor.get_class", + lambda *args, **kwargs: fake_executor_cls, + ) + monkeypatch.setattr(DiffusionEngine, "_dummy_run", lambda self: None) + + DiffusionEngine(od_config, scheduler=scheduler) assert scheduler.initialized_with is od_config fake_executor_cls.assert_called_once_with(od_config) def test_scheduler_alias_keeps_default_request_scheduler(self) -> None: scheduler = Scheduler() - scheduler.initialize(Mock()) + scheduler.initialize(SimpleNamespace()) req_id = scheduler.add_request(_make_request("alias")) sched_output = scheduler.schedule() @@ -336,10 +348,10 @@ def test_scheduler_alias_keeps_default_request_scheduler(self) -> None: assert req_id in finished assert scheduler.get_request_state(req_id).status == DiffusionRequestStatus.FINISHED_COMPLETED - def test_step_raises_aborted_error(self) -> None: + def test_step_raises_aborted_error(self, mocker: MockerFixture) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.pre_process_func = None - engine.add_req_and_wait_for_response = Mock( + engine.add_req_and_wait_for_response = mocker.Mock( return_value=DiffusionOutput(aborted=True, abort_message="Request req-abort aborted.") ) @@ -349,7 +361,7 @@ def test_step_raises_aborted_error(self) -> None: def test_abort_queue_marks_request_finished_aborted(self) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.scheduler = RequestScheduler() - engine.scheduler.initialize(Mock()) + engine.scheduler.initialize(SimpleNamespace()) engine.abort_queue = queue.Queue() req_id = engine.scheduler.add_request(_make_request("req-abort")) @@ -361,7 +373,7 @@ def test_abort_queue_marks_request_finished_aborted(self) -> None: def 
test_finalize_finished_request_returns_aborted_output(self) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.scheduler = RequestScheduler() - engine.scheduler.initialize(Mock()) + engine.scheduler.initialize(SimpleNamespace()) req_id = engine.scheduler.add_request(_make_request("req-finalize")) engine.scheduler.finish_requests(req_id, DiffusionRequestStatus.FINISHED_ABORTED) @@ -371,29 +383,40 @@ def test_finalize_finished_request_returns_aborted_output(self) -> None: assert output.aborted is True assert output.abort_message == "Request req-finalize aborted." - def test_initializes_step_scheduler_when_step_execution_enabled(self) -> None: - od_config = Mock(model_class_name="mock_model") + def test_initializes_step_scheduler_when_step_execution_enabled( + self, + monkeypatch: pytest.MonkeyPatch, + mocker: MockerFixture, + ) -> None: + od_config = SimpleNamespace(model_class_name="mock_model") od_config.step_execution = True - fake_executor = Mock() - fake_executor_cls = Mock(return_value=fake_executor) + fake_executor = mocker.Mock() + fake_executor_cls = mocker.Mock(return_value=fake_executor) - with ( - patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_post_process_func", return_value=None), - patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_pre_process_func", return_value=None), - patch("vllm_omni.diffusion.diffusion_engine.DiffusionExecutor.get_class", return_value=fake_executor_cls), - patch.object(DiffusionEngine, "_dummy_run", return_value=None), - ): - engine = DiffusionEngine(od_config) + monkeypatch.setattr( + "vllm_omni.diffusion.diffusion_engine.get_diffusion_post_process_func", + lambda *args, **kwargs: None, + ) + monkeypatch.setattr( + "vllm_omni.diffusion.diffusion_engine.get_diffusion_pre_process_func", + lambda *args, **kwargs: None, + ) + monkeypatch.setattr( + "vllm_omni.diffusion.diffusion_engine.DiffusionExecutor.get_class", + lambda *args, **kwargs: fake_executor_cls, + ) + monkeypatch.setattr(DiffusionEngine, "_dummy_run", lambda self: None) + engine = DiffusionEngine(od_config) assert isinstance(engine.scheduler, StepScheduler) assert engine.execute_fn is fake_executor.execute_step fake_executor_cls.assert_called_once_with(od_config) - def test_dummy_run_raises_on_output_error(self) -> None: + def test_dummy_run_raises_on_output_error(self, mocker: MockerFixture) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) - engine.od_config = Mock(model_class_name="mock_model") + engine.od_config = SimpleNamespace(model_class_name="mock_model") engine.pre_process_func = None - engine.add_req_and_wait_for_response = Mock(return_value=DiffusionOutput(error="boom")) + engine.add_req_and_wait_for_response = mocker.Mock(return_value=DiffusionOutput(error="boom")) with pytest.raises(RuntimeError, match="Dummy run failed: boom"): engine._dummy_run() @@ -402,7 +425,7 @@ def test_dummy_run_raises_on_output_error(self) -> None: class TestStepScheduler: def setup_method(self) -> None: self.scheduler: StepScheduler = StepScheduler() - self.scheduler.initialize(Mock()) + self.scheduler.initialize(SimpleNamespace()) def test_single_request_step_lifecycle(self) -> None: request = _make_step_request("step", num_inference_steps=3) diff --git a/tests/diffusion/test_diffusion_step_pipeline.py b/tests/diffusion/test_diffusion_step_pipeline.py index 68aba9ba3b..42687d4a1e 100644 --- a/tests/diffusion/test_diffusion_step_pipeline.py +++ b/tests/diffusion/test_diffusion_step_pipeline.py @@ -7,10 +7,10 @@ import threading from contextlib import 
contextmanager from types import SimpleNamespace -from unittest.mock import Mock import pytest import torch +from pytest_mock import MockerFixture import vllm_omni.diffusion.worker.diffusion_model_runner as model_runner_module from tests.utils import hardware_test @@ -542,11 +542,11 @@ def test_rejects_lora_requests_in_step_mode(self): class TestExecutor: """MultiprocDiffusionExecutor.execute_step""" - def test_execute_step_passes_through_runner_output(self): + def test_execute_step_passes_through_runner_output(self, mocker: MockerFixture): executor = object.__new__(MultiprocDiffusionExecutor) executor._ensure_open = lambda: None expected = RunnerOutput(req_id="req-step", step_index=1, finished=False, result=None) - executor.collective_rpc = Mock(return_value=expected) + executor.collective_rpc = mocker.Mock(return_value=expected) request = _make_engine_request("req-step", num_inference_steps=2) scheduler_output = _make_scheduler_output(request, sched_req_id="req-step") @@ -578,9 +578,9 @@ class TestEngine: ), ], ) - def test_step_engine_returns_error(self, execute_fn, expected_error): + def test_step_engine_returns_error(self, execute_fn, expected_error, mocker: MockerFixture): scheduler = StepScheduler() - scheduler.initialize(Mock()) + scheduler.initialize(mocker.Mock()) engine = _make_engine(scheduler, execute_fn=execute_fn) output = engine.add_req_and_wait_for_response(_make_engine_request("req-error", num_inference_steps=2)) @@ -588,9 +588,9 @@ def test_step_engine_returns_error(self, execute_fn, expected_error): assert output.output is None assert expected_error in output.error - def test_step_execution_completes(self): + def test_step_execution_completes(self, mocker: MockerFixture): scheduler = StepScheduler() - scheduler.initialize(Mock()) + scheduler.initialize(mocker.Mock()) engine = _make_engine(scheduler) request = _make_engine_request("req-step", num_inference_steps=2) @@ -614,9 +614,9 @@ def execute_fn(_): assert output.error is None assert torch.equal(output.output, torch.tensor([2.0])) - def test_step_abort_stops_rescheduling_after_first_step(self): + def test_step_abort_stops_rescheduling_after_first_step(self, mocker: MockerFixture): scheduler = StepScheduler() - scheduler.initialize(Mock()) + scheduler.initialize(mocker.Mock()) engine = _make_engine(scheduler) request = _make_engine_request("req-stop", num_inference_steps=4) @@ -639,9 +639,9 @@ def execute_fn(_): assert step["n"] == 1 _assert_aborted_output(output, "req-stop") - def test_step_abort_after_reschedule_returns_aborted_output(self): + def test_step_abort_after_reschedule_returns_aborted_output(self, mocker: MockerFixture): scheduler = StepScheduler() - scheduler.initialize(Mock()) + scheduler.initialize(mocker.Mock()) engine = _make_engine(scheduler) request = _make_engine_request("req-mid", num_inference_steps=4) @@ -666,9 +666,9 @@ def execute_fn(sched_output): assert step["n"] == 2 _assert_aborted_output(output, "req-mid") - def test_finished_step_without_result_returns_error(self): + def test_finished_step_without_result_returns_error(self, mocker: MockerFixture): scheduler = StepScheduler() - scheduler.initialize(Mock()) + scheduler.initialize(mocker.Mock()) engine = _make_engine( scheduler, execute_fn=lambda _: RunnerOutput( diff --git a/tests/diffusion/test_diffusion_worker_cuda_profiler.py b/tests/diffusion/test_diffusion_worker_cuda_profiler.py index ddc2aed2fc..4a3b22c212 100644 --- a/tests/diffusion/test_diffusion_worker_cuda_profiler.py +++ 
b/tests/diffusion/test_diffusion_worker_cuda_profiler.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from unittest.mock import MagicMock - import pytest from pytest_mock import MockerFixture @@ -55,8 +53,8 @@ def test_profile_start_stop_delegates_to_cuda_profiler( mock_diffusion_worker_dependencies, ): fake_profiler = mocker.Mock() - fake_profiler.start = MagicMock() - fake_profiler.stop = MagicMock() + fake_profiler.start = mocker.Mock() + fake_profiler.stop = mocker.Mock() mocker.patch( "vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper", return_value=fake_profiler, diff --git a/tests/diffusion/test_multiproc_engine_concurrency.py b/tests/diffusion/test_multiproc_engine_concurrency.py index 517f98ddaa..4bc3e05fe9 100644 --- a/tests/diffusion/test_multiproc_engine_concurrency.py +++ b/tests/diffusion/test_multiproc_engine_concurrency.py @@ -3,7 +3,7 @@ import queue import threading -from unittest.mock import Mock, patch +from types import SimpleNamespace import pytest import torch @@ -24,11 +24,9 @@ def _tagged_output(tag: str) -> DiffusionOutput: return DiffusionOutput(output=torch.tensor([0]), error=tag) -def _mock_request(tag: str) -> Mock: - """Return a mock ``OmniDiffusionRequest`` identifiable by *tag*.""" - req = Mock() - req.request_ids = [tag] - return req +def _mock_request(tag: str): + """Return a lightweight request object identifiable by *tag*.""" + return SimpleNamespace(request_ids=[tag]) def _make_executor(num_gpus: int = 1): @@ -36,20 +34,18 @@ def _make_executor(num_gpus: int = 1): Returns ``(executor, request_queue, result_queue)``. """ - od_cfg = Mock() - od_cfg.num_gpus = num_gpus - - with patch.object(MultiprocDiffusionExecutor, "_init_executor"): - executor = MultiprocDiffusionExecutor(od_cfg) + od_cfg = SimpleNamespace(num_gpus=num_gpus) + monkeypatch = pytest.MonkeyPatch() + monkeypatch.setattr(MultiprocDiffusionExecutor, "_init_executor", lambda self: None) + executor = MultiprocDiffusionExecutor(od_cfg) + monkeypatch.undo() req_q: queue.Queue = queue.Queue() res_q: queue.Queue = queue.Queue() - mock_broadcast_mq = Mock() - mock_broadcast_mq.enqueue = req_q.put + mock_broadcast_mq = SimpleNamespace(enqueue=req_q.put) - mock_rmq = Mock() - mock_rmq.dequeue = lambda timeout=None: res_q.get(timeout=timeout if timeout is not None else 10) + mock_rmq = SimpleNamespace(dequeue=lambda timeout=None: res_q.get(timeout=timeout if timeout is not None else 10)) executor._broadcast_mq = mock_broadcast_mq executor._result_mq = mock_rmq @@ -63,7 +59,7 @@ def _make_engine(num_gpus: int = 1): executor, req_q, res_q = _make_executor(num_gpus) engine = DiffusionEngine.__new__(DiffusionEngine) sched = RequestScheduler() - sched.initialize(Mock()) + sched.initialize(SimpleNamespace()) engine.scheduler = sched engine.executor = executor engine._rpc_lock = threading.RLock() diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index cb1f31164c..a1fc18f845 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -6,7 +6,7 @@ import argparse import inspect -from unittest.mock import Mock +from types import SimpleNamespace import pytest from pydantic import ValidationError @@ -102,7 +102,7 @@ def test_qwen3_tts_codec_frame_rate_patching(): vllm_config = EngineArgs().create_model_config() # Create a mock talking config with a dummy value for position_id_per_seconds - mock_talker_config = Mock() + mock_talker_config = SimpleNamespace() 
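+    # NOTE: unlike Mock, SimpleNamespace only carries the attributes that are set
+    # explicitly, so a typo in the attribute name below fails loudly instead of
+    # silently materializing a new mock attribute.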
mock_talker_config.position_id_per_seconds = 12.3 vllm_config.hf_config.talker_config = mock_talker_config @@ -146,13 +146,12 @@ def test_stage_specific_text_config_override(): # Switch the created hf text config with a mock whose # values we want to pull through the text config helper stage_text_config = vllm_config.hf_text_config - vllm_config.hf_text_config = Mock() + vllm_config.hf_text_config = SimpleNamespace() stage_text_config.sliding_window = 4096 stage_text_config.attention_chunk_size = 2048 # Move the stage config's text config getter & thinker config - mock_stage_config = Mock() - mock_stage_config.get_text_config.return_value = stage_text_config + mock_stage_config = SimpleNamespace(get_text_config=lambda: stage_text_config) vllm_config.hf_config.thinker_config = mock_stage_config # Ensure that create from a vLLM config correctly pulls the diff --git a/tests/engine/test_async_omni_engine_input.py b/tests/engine/test_async_omni_engine_input.py index ed6a7277b4..3700e426d4 100644 --- a/tests/engine/test_async_omni_engine_input.py +++ b/tests/engine/test_async_omni_engine_input.py @@ -1,6 +1,5 @@ -from unittest.mock import Mock - import pytest +from pytest_mock import MockerFixture from vllm.sampling_params import SamplingParams from vllm.v1.engine import EngineCoreRequest @@ -24,18 +23,18 @@ def _make_engine_core_request() -> EngineCoreRequest: ) -def test_build_add_request_message_preserves_additional_information(): +def test_build_add_request_message_preserves_additional_information(mocker: MockerFixture): engine = object.__new__(AsyncOmniEngine) params = SamplingParams(max_tokens=8) engine.default_sampling_params_list = [params] engine.stage_metadata = [{"stage_type": "llm"}] engine.supported_tasks = ("speech",) - input_processor = Mock() + input_processor = mocker.Mock() input_processor.process_inputs.return_value = _make_engine_core_request() engine.input_processor = input_processor - output_processor = Mock() + output_processor = mocker.Mock() engine.output_processors = [output_processor] prompt = { @@ -63,18 +62,18 @@ def test_build_add_request_message_preserves_additional_information(): output_processor.add_request.assert_called_once() -def test_build_add_request_message_with_resumable_streaming(): +def test_build_add_request_message_with_resumable_streaming(mocker: MockerFixture): engine = object.__new__(AsyncOmniEngine) params = SamplingParams(max_tokens=8) engine.default_sampling_params_list = [params] engine.stage_metadata = [{"stage_type": "llm"}] engine.supported_tasks = ("generate",) - input_processor = Mock() + input_processor = mocker.Mock() input_processor.process_inputs.return_value = _make_engine_core_request() engine.input_processor = input_processor - output_processor = Mock() + output_processor = mocker.Mock() engine.output_processors = [output_processor] msg = engine._build_add_request_message( diff --git a/tests/engine/test_async_omni_engine_outputs.py b/tests/engine/test_async_omni_engine_outputs.py index ccf9e8cb6b..ef3cfab3bf 100644 --- a/tests/engine/test_async_omni_engine_outputs.py +++ b/tests/engine/test_async_omni_engine_outputs.py @@ -5,36 +5,36 @@ """ import queue -from unittest.mock import MagicMock import pytest +from pytest_mock import MockerFixture from vllm_omni.engine.async_omni_engine import AsyncOmniEngine pytestmark = [pytest.mark.core_model, pytest.mark.cpu] -def _make_engine(output_queue, *, thread_alive: bool = True) -> AsyncOmniEngine: +def _make_engine(output_queue, mocker: MockerFixture, *, thread_alive: bool = True) -> 
AsyncOmniEngine: """Create an AsyncOmniEngine bypassing __init__.""" engine = object.__new__(AsyncOmniEngine) engine.output_queue = output_queue - engine.orchestrator_thread = MagicMock( - is_alive=MagicMock(return_value=thread_alive), + engine.orchestrator_thread = mocker.MagicMock( + is_alive=mocker.MagicMock(return_value=thread_alive), ) return engine -def test_try_get_output_raises_after_orchestrator_dies(): +def test_try_get_output_raises_after_orchestrator_dies(mocker: MockerFixture): """Draining remaining results then hitting an empty queue with a dead orchestrator must raise RuntimeError so callers know the pipeline is gone.""" - mock_queue = MagicMock() + mock_queue = mocker.MagicMock() # First call succeeds; second call finds the queue empty. mock_queue.sync_q.get.side_effect = [ {"type": "output", "request_id": "r1"}, queue.Empty, ] - engine = _make_engine(mock_queue, thread_alive=True) + engine = _make_engine(mock_queue, mocker, thread_alive=True) # Collect the one buffered result. assert engine.try_get_output()["request_id"] == "r1" @@ -47,15 +47,15 @@ def test_try_get_output_raises_after_orchestrator_dies(): @pytest.mark.asyncio -async def test_try_get_output_async_raises_after_orchestrator_dies(): +async def test_try_get_output_async_raises_after_orchestrator_dies(mocker: MockerFixture): """Same scenario as above but for the async variant.""" - mock_queue = MagicMock() + mock_queue = mocker.MagicMock() mock_queue.sync_q.get_nowait.side_effect = [ {"type": "output", "request_id": "r1"}, queue.Empty, ] - engine = _make_engine(mock_queue, thread_alive=True) + engine = _make_engine(mock_queue, mocker, thread_alive=True) assert (await engine.try_get_output_async())["request_id"] == "r1" diff --git a/tests/engine/test_single_stage_mode.py b/tests/engine/test_single_stage_mode.py index 2c5bf6cc79..608e92ac49 100644 --- a/tests/engine/test_single_stage_mode.py +++ b/tests/engine/test_single_stage_mode.py @@ -17,10 +17,11 @@ import threading from contextlib import contextmanager +from types import SimpleNamespace from typing import Any -from unittest.mock import MagicMock, Mock, patch import pytest +from pytest_mock import MockerFixture from vllm.v1.engine.utils import EngineZmqAddresses from vllm_omni.engine.async_omni_engine import AsyncOmniEngine @@ -41,31 +42,33 @@ # --------------------------------------------------------------------------- -def _make_stage_cfg(stage_id: int, stage_type: str = "llm") -> Mock: +def _make_stage_cfg(stage_id: int, stage_type: str = "llm"): """Return a lightweight stage config mock.""" - cfg = Mock() - cfg.stage_id = stage_id - cfg.stage_type = stage_type - cfg.engine_args = MagicMock() - cfg.engine_args.async_chunk = False - cfg.engine_args.model_stage = None - cfg.engine_args.engine_output_type = None - return cfg + return SimpleNamespace( + stage_id=stage_id, + stage_type=stage_type, + engine_args=SimpleNamespace( + async_chunk=False, + model_stage=None, + engine_output_type=None, + ), + ) def _make_started_llm_stage(stage_id: int) -> StartedLlmStage: """Return a minimal StartedLlmStage for mocking.""" - addresses = Mock() - addresses.inputs = ["tcp://127.0.0.1:5000"] - addresses.outputs = ["tcp://127.0.0.1:5001"] - addresses.frontend_stats_publish_address = None + addresses = SimpleNamespace( + inputs=["tcp://127.0.0.1:5000"], + outputs=["tcp://127.0.0.1:5001"], + frontend_stats_publish_address=None, + ) return StartedLlmStage( stage_id=stage_id, - metadata=Mock(stage_id=stage_id), - vllm_config=Mock(), - executor_class=Mock(), - 
engine_manager=Mock(), - coordinator=Mock(), + metadata=SimpleNamespace(stage_id=stage_id), + vllm_config=SimpleNamespace(), + executor_class=SimpleNamespace(), + engine_manager=SimpleNamespace(), + coordinator=SimpleNamespace(), addresses=addresses, ) @@ -348,74 +351,80 @@ class TestSingleStageModeDetection: the orchestrator thread, so no actual engines are started. """ - def _make_engine_no_thread(self, **kwargs: Any) -> AsyncOmniEngine: + def _make_engine_no_thread(self, mocker: MockerFixture, **kwargs: Any) -> AsyncOmniEngine: """Create an AsyncOmniEngine without starting the orchestrator thread.""" stage_cfg = _make_stage_cfg(0) mock_stage_configs = [stage_cfg] - with ( - patch.object( - AsyncOmniEngine, - "_resolve_stage_configs", - return_value=("/fake/path", mock_stage_configs), - ), - patch.object( - AsyncOmniEngine, - "_bootstrap_orchestrator", - ), - patch("threading.Thread") as mock_thread_cls, - patch("concurrent.futures.Future") as mock_future_cls, - ): - mock_future = Mock() - mock_future.result.return_value = Mock() # simulates a loop - mock_future_cls.return_value = mock_future + mocker.patch.object( + AsyncOmniEngine, + "_resolve_stage_configs", + return_value=("/fake/path", mock_stage_configs), + ) + mocker.patch.object( + AsyncOmniEngine, + "_bootstrap_orchestrator", + ) + mock_thread_cls = mocker.patch("threading.Thread") + mock_future_cls = mocker.patch("concurrent.futures.Future") + + mock_future = mocker.Mock() + mock_future.result.return_value = mocker.Mock() # simulates a loop + mock_future_cls.return_value = mock_future - mock_thread = Mock() - mock_thread.is_alive.return_value = False - mock_thread_cls.return_value = mock_thread + mock_thread = mocker.Mock() + mock_thread.is_alive.return_value = False + mock_thread_cls.return_value = mock_thread - engine = AsyncOmniEngine(model="fake-model", **kwargs) + engine = AsyncOmniEngine(model="fake-model", **kwargs) return engine - def test_explicit_single_stage_mode_true(self): + def test_explicit_single_stage_mode_true(self, mocker: MockerFixture): engine = self._make_engine_no_thread( + mocker, single_stage_mode=True, omni_master_address="127.0.0.1", omni_master_port=20000, ) assert engine.single_stage_mode is True - def test_stage_id_kwarg_promotes_to_single_stage_mode(self): + def test_stage_id_kwarg_promotes_to_single_stage_mode(self, mocker: MockerFixture): engine = self._make_engine_no_thread( + mocker, stage_id=0, omni_master_address="127.0.0.1", omni_master_port=20001, ) assert engine.single_stage_mode is True - def test_stage_id_kwarg_sets_filter(self): + def test_stage_id_kwarg_sets_filter(self, mocker: MockerFixture): engine = self._make_engine_no_thread( + mocker, stage_id=1, omni_master_address="127.0.0.1", omni_master_port=20002, ) assert engine._single_stage_id_filter == 1 - def test_no_stage_id_no_single_stage_mode(self): - engine = self._make_engine_no_thread() + def test_no_stage_id_no_single_stage_mode(self, mocker: MockerFixture): + engine = self._make_engine_no_thread( + mocker, + ) assert engine.single_stage_mode is False assert engine._single_stage_id_filter is None - def test_single_stage_mode_without_stage_id_has_no_filter(self): + def test_single_stage_mode_without_stage_id_has_no_filter(self, mocker: MockerFixture): engine = self._make_engine_no_thread( + mocker, single_stage_mode=True, omni_master_address="127.0.0.1", omni_master_port=20003, ) assert engine._single_stage_id_filter is None - def test_master_address_and_port_stored(self): + def test_master_address_and_port_stored(self, mocker: 
MockerFixture): engine = self._make_engine_no_thread( + mocker, stage_id=0, omni_master_address="10.0.0.1", omni_master_port=12345, @@ -423,8 +432,10 @@ def test_master_address_and_port_stored(self): assert engine._omni_master_address == "10.0.0.1" assert engine._omni_master_port == 12345 - def test_omni_master_server_starts_as_none(self): - engine = self._make_engine_no_thread() + def test_omni_master_server_starts_as_none(self, mocker: MockerFixture): + engine = self._make_engine_no_thread( + mocker, + ) assert engine._omni_master_server is None @@ -448,7 +459,7 @@ class TestInitializeStagesRouting: def _build_engine_skeleton( self, - stage_cfgs: list[Mock], + stage_cfgs: list[Any], single_stage_mode: bool, stage_id_filter: int | None, omni_master_address: str = "127.0.0.1", @@ -478,8 +489,8 @@ def _build_engine_skeleton( engine.prompt_expand_func = None return engine - def _fake_metadata(self, stage_id: int, stage_type: str = "llm") -> Mock: - meta = Mock() + def _fake_metadata(self, mocker: MockerFixture, stage_id: int, stage_type: str = "llm") -> Any: + meta = mocker.Mock() meta.stage_id = stage_id meta.stage_type = stage_type meta.runtime_cfg = {} @@ -492,13 +503,14 @@ def _fake_metadata(self, stage_id: int, stage_type: str = "llm") -> Mock: def _run_initialize_stages_mocked( self, + mocker: MockerFixture, engine: AsyncOmniEngine, - stage_cfgs: list[Mock], + stage_cfgs: list[Any], *, launch_side_effect: Any = None, remote_side_effect: Any = None, attach_result: Any = None, - ) -> tuple[Mock, Mock]: + ) -> tuple[Any, Any]: """Execute _initialize_stages with all heavy helpers mocked. Returns (mock_launch_llm_stage, mock_create_remote_llm_stage). @@ -509,167 +521,217 @@ def _run_initialize_stages_mocked( if getattr(cfg, "stage_type", "llm") != "diffusion" } - default_attach = (Mock(), Mock(), Mock(), Mock()) + default_attach = (mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()) - mock_launch = Mock( + mock_launch = mocker.Mock( side_effect=launch_side_effect or (lambda cfg, meta, spec, timeout, llm_stage_launch_lock, kv: started_by_stage[meta.stage_id]) ) - mock_remote = Mock( + mock_remote = mocker.Mock( side_effect=remote_side_effect or (lambda cfg, meta, spec, timeout, srv: started_by_stage[meta.stage_id]) ) - mock_attach = Mock(return_value=attach_result or default_attach) + mock_attach = mocker.Mock(return_value=attach_result or default_attach) - mock_oms = Mock(spec=OmniMasterServer) - mock_oms.get_zmq_addresses.side_effect = lambda sid: Mock() + mock_oms = mocker.Mock(spec=OmniMasterServer) + mock_oms.get_zmq_addresses.side_effect = lambda sid: mocker.Mock() finalized = ( - [Mock() for _ in stage_cfgs], - [Mock() for _ in stage_cfgs], + [mocker.Mock() for _ in stage_cfgs], + [mocker.Mock() for _ in stage_cfgs], [{"final_output": True, "final_output_type": None, "stage_type": "llm"} for _ in stage_cfgs], ) - with ( - patch.object(engine, "_launch_llm_stage", mock_launch), - patch.object(engine, "_create_remote_llm_stage", mock_remote), - patch.object(engine, "_attach_llm_stage", mock_attach), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch( - "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", - return_value=None, - ), - patch( - "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", - return_value={}, - ), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", - return_value=(None, None, None), - 
), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id, getattr(cfg, "stage_type", "llm")), - ), - patch( - "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", - return_value=finalized, + mocker.patch.object(engine, "_launch_llm_stage", mock_launch) + mocker.patch.object(engine, "_create_remote_llm_stage", mock_remote) + mocker.patch.object(engine, "_attach_llm_stage", mock_attach) + mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.prepare_engine_environment", + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata( + mocker, + cfg.stage_id, + getattr(cfg, "stage_type", "llm"), ), - ): - engine._initialize_stages(stage_init_timeout=60) + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) return mock_launch, mock_remote # -- single-stage mode: stage matches filter → local launch --------------- - def test_matching_stage_uses_launch_llm_stage(self): + def test_matching_stage_uses_launch_llm_stage(self, mocker: MockerFixture): """stage_id == _single_stage_id_filter → _launch_llm_stage is called.""" stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) - mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + mock_launch, mock_remote = self._run_initialize_stages_mocked(mocker, engine, stage_cfgs) launched_ids = [c.args[1].stage_id for c in mock_launch.call_args_list] assert 0 in launched_ids, "_launch_llm_stage should be called for stage 0" - def test_non_matching_stage_uses_create_remote_llm_stage(self): + def test_non_matching_stage_uses_create_remote_llm_stage(self, mocker: MockerFixture): """stage_id != _single_stage_id_filter → _create_remote_llm_stage is called.""" stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) - mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + mock_launch, mock_remote = self._run_initialize_stages_mocked(mocker, engine, stage_cfgs) remote_ids = [c.args[1].stage_id for c in mock_remote.call_args_list] assert 1 in remote_ids, "_create_remote_llm_stage should be called for stage 1" - def test_filter_1_routes_correctly(self): + def test_filter_1_routes_correctly(self, mocker: MockerFixture): """With filter=1, stage 0 is remote and stage 1 is local.""" stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=1) - mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + mock_launch, mock_remote = self._run_initialize_stages_mocked(mocker, engine, stage_cfgs) launched_ids = [c.args[1].stage_id for c in mock_launch.call_args_list] remote_ids = [c.args[1].stage_id for c in 
mock_remote.call_args_list] assert 1 in launched_ids, "stage 1 should be launched locally with filter=1" assert 0 in remote_ids, "stage 0 should use remote path with filter=1" - def test_no_filter_all_stages_use_launch_path(self): + def test_no_filter_all_stages_use_launch_path(self, mocker: MockerFixture): """single_stage_mode=True but no filter → all stages use _launch_llm_stage.""" stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=None) - mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + mock_launch, mock_remote = self._run_initialize_stages_mocked(mocker, engine, stage_cfgs) assert mock_remote.call_count == 0, "No remote launches without a filter" launched_ids = [c.args[1].stage_id for c in mock_launch.call_args_list] assert set(launched_ids) == {0, 1} - def test_non_single_stage_mode_never_calls_create_remote(self): + def test_non_single_stage_mode_never_calls_create_remote(self, mocker: MockerFixture): """Outside single_stage_mode, _create_remote_llm_stage must not be called.""" stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=False, stage_id_filter=None) - mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + mock_launch, mock_remote = self._run_initialize_stages_mocked(mocker, engine, stage_cfgs) assert mock_remote.call_count == 0 - def test_omni_master_server_started_in_single_stage_mode(self): + def test_omni_master_server_started_in_single_stage_mode(self, mocker: MockerFixture): """OmniMasterServer.start() must be called when single_stage_mode=True.""" stage_cfgs = [_make_stage_cfg(0)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) - mock_oms = Mock(spec=OmniMasterServer) - mock_oms.get_zmq_addresses.return_value = Mock() - finalized = ([Mock()], [Mock()], [{"final_output": True, "final_output_type": None, "stage_type": "llm"}]) - - with ( - patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)), - patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(0)), - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id), - ), - patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + mock_oms = mocker.Mock(spec=OmniMasterServer) + mock_oms.get_zmq_addresses.return_value = mocker.Mock() + finalized = ( + [mocker.Mock()], + [mocker.Mock()], + [{"final_output": True, "final_output_type": None, "stage_type": "llm"}], + ) + + mocker.patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)) + mocker.patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(0)) + mocker.patch.object( + 
engine, + "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(mocker, cfg.stage_id), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) mock_oms.start.assert_called_once() - def test_omni_master_server_uses_configured_stage_ids(self): + def test_omni_master_server_uses_configured_stage_ids(self, mocker: MockerFixture): """Configured stage IDs, not list indexes, should drive pre-allocation.""" stage_cfgs = [_make_stage_cfg(7), _make_stage_cfg(11)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=7) - mock_oms = Mock(spec=OmniMasterServer) - mock_oms.get_zmq_addresses.return_value = Mock() + mock_oms = mocker.Mock(spec=OmniMasterServer) + mock_oms.get_zmq_addresses.return_value = mocker.Mock() finalized = ( - [Mock(), Mock()], - [Mock(), Mock()], + [mocker.Mock(), mocker.Mock()], + [mocker.Mock(), mocker.Mock()], [{"final_output": False, "final_output_type": None, "stage_type": "llm"} for _ in stage_cfgs], ) - with ( - patch.object( - engine, "_launch_llm_stage", side_effect=[_make_started_llm_stage(7), _make_started_llm_stage(11)] - ), - patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(11)), - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms) as mock_oms_cls, - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id), - ), - patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + mocker.patch.object( + engine, + "_launch_llm_stage", + side_effect=[_make_started_llm_stage(7), _make_started_llm_stage(11)], + ) + mocker.patch.object( + engine, + "_create_remote_llm_stage", + return_value=_make_started_llm_stage(11), + ) + mocker.patch.object( + engine, + "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()), + ) + mock_oms_cls = mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + 
return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(mocker, cfg.stage_id), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) mock_oms_cls.assert_called_once_with( master_address=engine._omni_master_address, @@ -677,73 +739,121 @@ def test_omni_master_server_uses_configured_stage_ids(self): stage_ids=[7, 11], ) - def test_single_stage_filter_uses_configured_stage_ids(self): + def test_single_stage_filter_uses_configured_stage_ids(self, mocker: MockerFixture): """Local/remote dispatch should compare against configured stage IDs.""" stage_cfgs = [_make_stage_cfg(7), _make_stage_cfg(11)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=7) - mock_oms = Mock(spec=OmniMasterServer) + mock_oms = mocker.Mock(spec=OmniMasterServer) finalized = ( - [Mock(), Mock()], - [Mock(), Mock()], + [mocker.Mock(), mocker.Mock()], + [mocker.Mock(), mocker.Mock()], [{"final_output": False, "final_output_type": None, "stage_type": "llm"} for _ in stage_cfgs], ) - with ( - patch.object(engine, "_launch_llm_stage", side_effect=[_make_started_llm_stage(7)]) as mock_launch, - patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(11)) as mock_remote, - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id), - ), - patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + mock_launch = mocker.patch.object( + engine, + "_launch_llm_stage", + side_effect=[_make_started_llm_stage(7)], + ) + mock_remote = mocker.patch.object( + engine, + "_create_remote_llm_stage", + return_value=_make_started_llm_stage(11), + ) + mocker.patch.object( + engine, + "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: 
self._fake_metadata(mocker, cfg.stage_id), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) assert [call.args[1].stage_id for call in mock_launch.call_args_list] == [7] assert [call.args[1].stage_id for call in mock_remote.call_args_list] == [11] - def test_omni_master_server_preallocates_diffusion_stage_ids(self): + def test_omni_master_server_preallocates_diffusion_stage_ids(self, mocker: MockerFixture): """Diffusion stages should also receive OmniMasterServer allocations.""" stage_cfgs = [_make_stage_cfg(7), _make_stage_cfg(11, stage_type="diffusion")] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=7) - mock_oms = Mock(spec=OmniMasterServer) + mock_oms = mocker.Mock(spec=OmniMasterServer) finalized = ( - [Mock(), Mock()], - [Mock(), Mock()], + [mocker.Mock(), mocker.Mock()], + [mocker.Mock(), mocker.Mock()], [ {"final_output": False, "final_output_type": None, "stage_type": "llm"}, {"final_output": False, "final_output_type": None, "stage_type": "diffusion"}, ], ) - with ( - patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(7)), - patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(7)), - patch.object(engine, "_launch_diffusion_stage", return_value=Mock()), - patch.object(engine, "_create_remote_diffusion_stage", return_value=Mock()), - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms) as mock_oms_cls, - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id, getattr(cfg, "stage_type", "llm")), + mocker.patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(7)) + mocker.patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(7)) + mocker.patch.object(engine, "_launch_diffusion_stage", return_value=mocker.Mock()) + mocker.patch.object( + engine, + "_create_remote_diffusion_stage", + return_value=mocker.Mock(), + ) + mocker.patch.object( + engine, + "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()), + ) + mock_oms_cls = mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata( + mocker, + cfg.stage_id, + getattr(cfg, "stage_type", "llm"), ), - 
patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) mock_oms_cls.assert_called_once_with( master_address=engine._omni_master_address, @@ -751,135 +861,200 @@ def test_omni_master_server_preallocates_diffusion_stage_ids(self): stage_ids=[7, 11], ) - def test_duplicate_llm_stage_ids_raise(self): + def test_duplicate_llm_stage_ids_raise(self, mocker: MockerFixture): """Duplicate configured LLM stage IDs should fail fast.""" stage_cfgs = [_make_stage_cfg(3), _make_stage_cfg(3)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=3) - with ( - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - pytest.raises(ValueError, match="Duplicate stage_id"), - ): + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + with pytest.raises(ValueError, match="Duplicate stage_id"): engine._initialize_stages(stage_init_timeout=60) - def test_omni_master_server_not_started_in_normal_mode(self): + def test_omni_master_server_not_started_in_normal_mode(self, mocker: MockerFixture): """OmniMasterServer must NOT be instantiated outside single_stage_mode.""" stage_cfgs = [_make_stage_cfg(0)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=False, stage_id_filter=None) - finalized = ([Mock()], [Mock()], [{"final_output": True, "final_output_type": None, "stage_type": "llm"}]) - - with ( - patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)), - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer") as mock_oms_cls, - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id), - ), - patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + finalized = ( + [mocker.Mock()], + [mocker.Mock()], + [{"final_output": True, "final_output_type": None, "stage_type": "llm"}], + ) + + mocker.patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)) + mocker.patch.object( + engine, + "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()), + ) + mock_oms_cls = mocker.patch("vllm_omni.engine.async_omni_engine.OmniMasterServer") + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + 
return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(mocker, cfg.stage_id), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) mock_oms_cls.assert_not_called() - def test_single_stage_mode_missing_master_address_raises(self): + def test_single_stage_mode_missing_master_address_raises(self, mocker: MockerFixture): """single_stage_mode without master address/port raises ValueError.""" stage_cfgs = [_make_stage_cfg(0)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) engine._omni_master_address = None # missing engine._omni_master_port = None - with ( - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - pytest.raises(ValueError, match="omni_master_address"), - ): + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + with pytest.raises(ValueError, match="omni_master_address"): engine._initialize_stages(stage_init_timeout=60) - def test_matching_diffusion_stage_uses_local_registered_launch(self): + def test_matching_diffusion_stage_uses_local_registered_launch(self, mocker: MockerFixture): """A local diffusion stage should use the registered single-stage launch path.""" stage_cfgs = [_make_stage_cfg(0, stage_type="diffusion"), _make_stage_cfg(1)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) - mock_oms = Mock(spec=OmniMasterServer) - diffusion_client = Mock(stage_type="diffusion") + mock_oms = mocker.Mock(spec=OmniMasterServer) + diffusion_client = mocker.Mock(stage_type="diffusion") finalized = ( - [diffusion_client, Mock()], - [Mock(), Mock()], + [diffusion_client, mocker.Mock()], + [mocker.Mock(), mocker.Mock()], [ {"final_output": False, "final_output_type": None, "stage_type": "diffusion"}, {"final_output": False, "final_output_type": None, "stage_type": "llm"}, ], ) - with ( - patch.object(engine, "_launch_diffusion_stage", return_value=diffusion_client) as mock_local_diff, - patch.object(engine, "_create_remote_diffusion_stage") as mock_remote_diff, - patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(1)), - patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(1)), - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id, getattr(cfg, "stage_type", "llm")), + mock_local_diff = mocker.patch.object( + engine, + 
"_launch_diffusion_stage", + return_value=diffusion_client, + ) + mock_remote_diff = mocker.patch.object(engine, "_create_remote_diffusion_stage") + mocker.patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(1)) + mocker.patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(1)) + mocker.patch.object( + engine, + "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata( + mocker, + cfg.stage_id, + getattr(cfg, "stage_type", "llm"), ), - patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) assert mock_local_diff.call_count == 1 assert mock_local_diff.call_args.args[1].stage_id == 0 mock_remote_diff.assert_not_called() - def test_non_matching_diffusion_stage_uses_remote_diffusion_client(self): + def test_non_matching_diffusion_stage_uses_remote_diffusion_client(self, mocker: MockerFixture): """A non-local diffusion stage should attach via the remote diffusion path.""" stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1, stage_type="diffusion")] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) - mock_oms = Mock(spec=OmniMasterServer) - remote_diffusion_client = Mock(stage_type="diffusion") + mock_oms = mocker.Mock(spec=OmniMasterServer) + remote_diffusion_client = mocker.Mock(stage_type="diffusion") finalized = ( - [Mock(), remote_diffusion_client], - [Mock(), Mock()], + [mocker.Mock(), remote_diffusion_client], + [mocker.Mock(), mocker.Mock()], [ {"final_output": False, "final_output_type": None, "stage_type": "llm"}, {"final_output": False, "final_output_type": None, "stage_type": "diffusion"}, ], ) - with ( - patch.object(engine, "_launch_diffusion_stage") as mock_local_diff, - patch.object( - engine, "_create_remote_diffusion_stage", return_value=remote_diffusion_client - ) as mock_remote_diff, - patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)), - patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(0)), - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - 
"vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id, getattr(cfg, "stage_type", "llm")), + mock_local_diff = mocker.patch.object(engine, "_launch_diffusion_stage") + mock_remote_diff = mocker.patch.object( + engine, + "_create_remote_diffusion_stage", + return_value=remote_diffusion_client, + ) + mocker.patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)) + mocker.patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(0)) + mocker.patch.object( + engine, + "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata( + mocker, + cfg.stage_id, + getattr(cfg, "stage_type", "llm"), ), - patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) mock_local_diff.assert_not_called() assert mock_remote_diff.call_count == 1 @@ -894,45 +1069,47 @@ def test_non_matching_diffusion_stage_uses_remote_diffusion_client(self): class TestLaunchDiffusionStage: """Test local diffusion stage launch wiring.""" - def test_registers_stage_with_public_master_properties(self): + def test_registers_stage_with_public_master_properties(self, mocker: MockerFixture): engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" engine.diffusion_batch_size = 4 stage_cfg = _make_stage_cfg(5, stage_type="diffusion") - metadata = Mock(stage_id=5) - omni_master_server = Mock(spec=OmniMasterServer) + metadata = mocker.Mock(stage_id=5) + omni_master_server = mocker.Mock(spec=OmniMasterServer) omni_master_server.address = "127.0.0.1" omni_master_server.port = 25000 - proc = Mock() - diffusion_client = Mock() - - with ( - patch("vllm_omni.engine.async_omni_engine.build_diffusion_config", return_value="diffusion-config"), - patch( - "vllm_omni.engine.async_omni_engine.register_stage_with_omni_master", - return_value=( - "tcp://127.0.0.1:25001", - "tcp://127.0.0.1:25002", - "tcp://127.0.0.1:25003", - ), - ) as mock_register, - patch( - "vllm_omni.engine.async_omni_engine.spawn_diffusion_proc", - return_value=(proc, None, None, None), - ) as mock_spawn, - patch("vllm_omni.engine.async_omni_engine.complete_diffusion_handshake") as mock_handshake, - patch( - "vllm_omni.engine.async_omni_engine.StageDiffusionClient.from_addresses", - return_value=diffusion_client, - ) as mock_from_addresses, - ): - result = engine._launch_diffusion_stage( - stage_cfg=stage_cfg, - metadata=metadata, - omni_master_server=omni_master_server, - ) + proc = mocker.Mock() + diffusion_client = mocker.Mock() + + mocker.patch( + 
"vllm_omni.engine.async_omni_engine.build_diffusion_config", + return_value="diffusion-config", + ) + mock_register = mocker.patch( + "vllm_omni.engine.async_omni_engine.register_stage_with_omni_master", + return_value=( + "tcp://127.0.0.1:25001", + "tcp://127.0.0.1:25002", + "tcp://127.0.0.1:25003", + ), + ) + mock_spawn = mocker.patch( + "vllm_omni.engine.async_omni_engine.spawn_diffusion_proc", + return_value=(proc, None, None, None), + ) + mock_handshake = mocker.patch("vllm_omni.engine.async_omni_engine.complete_diffusion_handshake") + mock_from_addresses = mocker.patch( + "vllm_omni.engine.async_omni_engine.StageDiffusionClient.from_addresses", + return_value=diffusion_client, + ) + + result = engine._launch_diffusion_stage( + stage_cfg=stage_cfg, + metadata=metadata, + omni_master_server=omni_master_server, + ) mock_register.assert_called_once_with( omni_master_address="127.0.0.1", @@ -967,14 +1144,14 @@ def test_registers_stage_with_public_master_properties(self): class TestCreateRemoteLlmStage: """Test _create_remote_llm_stage delegates correctly.""" - def _engine(self) -> AsyncOmniEngine: + def _engine(self, mocker: MockerFixture) -> AsyncOmniEngine: engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" engine.single_stage_mode = True engine._single_stage_id_filter = 0 - engine._omni_master_server = Mock(spec=OmniMasterServer) - engine._omni_master_server.get_zmq_addresses.return_value = Mock() - engine._omni_master_server.get_allocation.return_value = Mock() + engine._omni_master_server = mocker.Mock(spec=OmniMasterServer) + engine._omni_master_server.get_zmq_addresses.return_value = mocker.Mock() + engine._omni_master_server.get_allocation.return_value = mocker.Mock() engine._omni_master_server.get_stage_config.return_value = { "stage_id": 0, "stage_type": "llm", @@ -982,42 +1159,40 @@ def _engine(self) -> AsyncOmniEngine: } return engine - @contextmanager - def _patch_build_and_connect(self, stage_id: int): - fake_vllm_config = Mock() - fake_executor_cls = Mock() - fake_addresses = Mock() + def _mock_build_and_connect(self, mocker: MockerFixture, stage_id: int): + fake_vllm_config = mocker.Mock() + fake_executor_cls = mocker.Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None - eng_mgr = Mock() - coordinator = Mock() + eng_mgr = mocker.Mock() + coordinator = mocker.Mock() @contextmanager def fake_connect_cm(*args, **kwargs): yield eng_mgr, coordinator, fake_addresses - with ( - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": stage_id}, - ), - patch( - "vllm_omni.engine.async_omni_engine.build_vllm_config", - return_value=(fake_vllm_config, fake_executor_cls), - ), - patch( - "vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", - return_value=fake_connect_cm(), - ) as mock_connect, - ): - yield mock_connect, fake_vllm_config, fake_executor_cls, fake_addresses + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": stage_id}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(fake_vllm_config, fake_executor_cls), + ) + mock_connect = mocker.patch( + "vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", + return_value=fake_connect_cm(), + ) + + return mock_connect, fake_vllm_config, fake_executor_cls, fake_addresses - def 
test_returns_started_llm_stage_with_correct_stage_id(self): - engine = self._engine() + def test_returns_started_llm_stage_with_correct_stage_id(self, mocker: MockerFixture): + engine = self._engine(mocker) stage_cfg = _make_stage_cfg(1) - metadata = Mock(stage_id=1) + metadata = mocker.Mock(stage_id=1) omni_ms = engine._omni_master_server omni_ms.get_stage_config.return_value = { "stage_id": 1, @@ -1025,93 +1200,93 @@ def test_returns_started_llm_stage_with_correct_stage_id(self): "engine_args": {}, } - with self._patch_build_and_connect(1): - result = engine._create_remote_llm_stage( - stage_cfg=stage_cfg, - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - omni_master_server=omni_ms, - ) + self._mock_build_and_connect(mocker, 1) + result = engine._create_remote_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + omni_master_server=omni_ms, + ) assert isinstance(result, StartedLlmStage) assert result.stage_id == 1 - def test_connect_remote_engine_cores_called_with_stage_id(self): - engine = self._engine() + def test_connect_remote_engine_cores_called_with_stage_id(self, mocker: MockerFixture): + engine = self._engine(mocker) stage_cfg = _make_stage_cfg(2) - metadata = Mock(stage_id=2) + metadata = mocker.Mock(stage_id=2) omni_ms = engine._omni_master_server - omni_ms.get_zmq_addresses.return_value = Mock(inputs=["x"], outputs=["y"]) + omni_ms.get_zmq_addresses.return_value = mocker.Mock(inputs=["x"], outputs=["y"]) omni_ms.get_stage_config.return_value = { "stage_id": 2, "stage_type": "llm", "engine_args": {}, } - fake_vllm_config = Mock() - fake_executor_cls = Mock() - fake_addresses = Mock() + fake_vllm_config = mocker.Mock() + fake_executor_cls = mocker.Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None @contextmanager def fake_connect_cm(*args, **kwargs): - yield Mock(), Mock(), fake_addresses + yield mocker.Mock(), mocker.Mock(), fake_addresses - with ( - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": 2}, - ), - patch( - "vllm_omni.engine.async_omni_engine.build_vllm_config", - return_value=(fake_vllm_config, fake_executor_cls), - ), - patch( - "vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", return_value=fake_connect_cm() - ) as mock_connect, - ): - engine._create_remote_llm_stage( - stage_cfg=stage_cfg, - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - omni_master_server=omni_ms, - ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 2}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(fake_vllm_config, fake_executor_cls), + ) + mock_connect = mocker.patch( + "vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", + return_value=fake_connect_cm(), + ) + + engine._create_remote_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + omni_master_server=omni_ms, + ) mock_connect.assert_called_once() _, kwargs = mock_connect.call_args assert kwargs.get("stage_id") == 2 or mock_connect.call_args.args[-1] == 2 omni_ms.get_stage_config.assert_called_once_with(2, timeout_s=60) - def test_missing_registered_stage_config_raises_value_error(self): - engine = self._engine() + def 
test_missing_registered_stage_config_raises_value_error(self, mocker: MockerFixture): + engine = self._engine(mocker) stage_cfg = _make_stage_cfg(3) - metadata = Mock(stage_id=3) + metadata = mocker.Mock(stage_id=3) omni_ms = engine._omni_master_server omni_ms.get_stage_config.return_value = None - with patch("vllm_omni.engine.async_omni_engine.build_engine_args_dict") as mock_build_args: - with pytest.raises( - ValueError, - match="Remote stage 3 registered without stage config", - ): - engine._create_remote_llm_stage( - stage_cfg=stage_cfg, - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - omni_master_server=omni_ms, - ) + mock_build_args = mocker.patch("vllm_omni.engine.async_omni_engine.build_engine_args_dict") + with pytest.raises( + ValueError, + match="Remote stage 3 registered without stage config", + ): + engine._create_remote_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + omni_master_server=omni_ms, + ) mock_build_args.assert_not_called() - def test_exception_during_connect_closes_started_stage(self): + def test_exception_during_connect_closes_started_stage(self, mocker: MockerFixture): """If an error occurs after StartedLlmStage creation, close_started_llm_stage is called.""" - engine = self._engine() + engine = self._engine(mocker) stage_cfg = _make_stage_cfg(1) - metadata = Mock(stage_id=1) + metadata = mocker.Mock(stage_id=1) omni_ms = engine._omni_master_server omni_ms.get_stage_config.return_value = { "stage_id": 1, @@ -1121,26 +1296,30 @@ def test_exception_during_connect_closes_started_stage(self): @contextmanager def boom(*args, **kwargs): - yield Mock(), Mock(), Mock() + yield mocker.Mock(), mocker.Mock(), mocker.Mock() raise RuntimeError("handshake failed") - with ( - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": 1}, - ), - patch("vllm_omni.engine.async_omni_engine.build_vllm_config", return_value=(Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", return_value=boom()), - patch("vllm_omni.engine.async_omni_engine.close_started_llm_stage") as mock_close, - ): - with pytest.raises(RuntimeError, match="handshake failed"): - engine._create_remote_llm_stage( - stage_cfg=stage_cfg, - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - omni_master_server=omni_ms, - ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 1}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", + return_value=boom(), + ) + mock_close = mocker.patch("vllm_omni.engine.async_omni_engine.close_started_llm_stage") + with pytest.raises(RuntimeError, match="handshake failed"): + engine._create_remote_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + omni_master_server=omni_ms, + ) mock_close.assert_called_once() @@ -1148,27 +1327,29 @@ class TestConnectRemoteEngineCoresCoordinator: """Test coordinator launch parity with launch_core_engines.""" @staticmethod - def _build_vllm_config(*, dp_rank: int = 0, offline_mode: bool = False, needs_dp_coordinator: bool = True) -> Mock: - parallel_config = Mock() + def _build_vllm_config( + mocker: MockerFixture, *, dp_rank: int = 0, offline_mode: bool = False, 
needs_dp_coordinator: bool = True + ) -> Any: + parallel_config = mocker.Mock() parallel_config.data_parallel_size_local = 1 parallel_config.data_parallel_size = 2 parallel_config.data_parallel_rank = dp_rank parallel_config.data_parallel_rank_local = 0 if offline_mode else None - vllm_config = Mock() + vllm_config = mocker.Mock() vllm_config.parallel_config = parallel_config vllm_config.needs_dp_coordinator = needs_dp_coordinator - vllm_config.model_config = Mock(is_moe=False) + vllm_config.model_config = mocker.Mock(is_moe=False) return vllm_config - def test_uses_registered_coordinator_addresses(self): - vllm_config = self._build_vllm_config(dp_rank=0, offline_mode=False, needs_dp_coordinator=True) + def test_uses_registered_coordinator_addresses(self, mocker: MockerFixture): + vllm_config = self._build_vllm_config(mocker, dp_rank=0, offline_mode=False, needs_dp_coordinator=True) - omni_master_server = Mock(spec=OmniMasterServer) + omni_master_server = mocker.Mock(spec=OmniMasterServer) omni_master_server.get_zmq_addresses.return_value = EngineZmqAddresses( inputs=["tcp://client-in"], outputs=["tcp://client-out"] ) - omni_master_server.get_allocation.return_value = Mock(handshake_bind_address="tcp://127.0.0.1:26001") + omni_master_server.get_allocation.return_value = mocker.Mock(handshake_bind_address="tcp://127.0.0.1:26001") omni_master_server.get_stage_coordinator_addresses.return_value = StageCoordinatorAddresses( coordinator_input="tcp://coord-in", coordinator_output="tcp://coord-out", @@ -1177,103 +1358,107 @@ def test_uses_registered_coordinator_addresses(self): @contextmanager def fake_socket_ctx(*args, **kwargs): - yield Mock() + yield mocker.Mock() - with ( - patch("vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", return_value=fake_socket_ctx()), - patch("vllm_omni.engine.stage_engine_startup._wait_for_omni_engine_startup") as mock_wait, - ): - with connect_remote_engine_cores( - vllm_config=vllm_config, - omni_master_server=omni_master_server, - stage_id=7, - ) as (_, yielded_coordinator, yielded_addresses): - assert yielded_coordinator is None - assert yielded_addresses.coordinator_input == "tcp://coord-in" - assert yielded_addresses.coordinator_output == "tcp://coord-out" - assert yielded_addresses.frontend_stats_publish_address == "tcp://stats" + mocker.patch( + "vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", + return_value=fake_socket_ctx(), + ) + mock_wait = mocker.patch("vllm_omni.engine.stage_engine_startup._wait_for_omni_engine_startup") + with connect_remote_engine_cores( + vllm_config=vllm_config, + omni_master_server=omni_master_server, + stage_id=7, + ) as (_, yielded_coordinator, yielded_addresses): + assert yielded_coordinator is None + assert yielded_addresses.coordinator_input == "tcp://coord-in" + assert yielded_addresses.coordinator_output == "tcp://coord-out" + assert yielded_addresses.frontend_stats_publish_address == "tcp://stats" omni_master_server.get_stage_coordinator_addresses.assert_called_once_with(7) mock_wait.assert_called_once() - def test_defaults_to_no_coordinator_addresses_when_none_registered(self): + def test_defaults_to_no_coordinator_addresses_when_none_registered(self, mocker: MockerFixture): vllm_config = self._build_vllm_config( + mocker, dp_rank=0, offline_mode=False, needs_dp_coordinator=True, ) - omni_master_server = Mock(spec=OmniMasterServer) + omni_master_server = mocker.Mock(spec=OmniMasterServer) omni_master_server.get_zmq_addresses.return_value = EngineZmqAddresses( inputs=["tcp://client-in"], 
outputs=["tcp://client-out"] ) - omni_master_server.get_allocation.return_value = Mock(handshake_bind_address="tcp://127.0.0.1:26001") + omni_master_server.get_allocation.return_value = mocker.Mock(handshake_bind_address="tcp://127.0.0.1:26001") omni_master_server.get_stage_coordinator_addresses.return_value = StageCoordinatorAddresses() @contextmanager def fake_socket_ctx(*args, **kwargs): - yield Mock() + yield mocker.Mock() - with ( - patch("vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", return_value=fake_socket_ctx()), - patch("vllm_omni.engine.stage_engine_startup._wait_for_omni_engine_startup"), - ): - with connect_remote_engine_cores( - vllm_config=vllm_config, - omni_master_server=omni_master_server, - stage_id=7, - ) as (_, yielded_coordinator, yielded_addresses): - assert yielded_coordinator is None - assert yielded_addresses.coordinator_input is None - assert yielded_addresses.coordinator_output is None - assert yielded_addresses.frontend_stats_publish_address is None + mocker.patch( + "vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", + return_value=fake_socket_ctx(), + ) + mocker.patch("vllm_omni.engine.stage_engine_startup._wait_for_omni_engine_startup") + with connect_remote_engine_cores( + vllm_config=vllm_config, + omni_master_server=omni_master_server, + stage_id=7, + ) as (_, yielded_coordinator, yielded_addresses): + assert yielded_coordinator is None + assert yielded_addresses.coordinator_input is None + assert yielded_addresses.coordinator_output is None + assert yielded_addresses.frontend_stats_publish_address is None class TestLaunchOmniCoreEngines: """Tests for local omni engine launch wiring.""" - def test_registers_stage_once_and_reuses_handshake_for_all_local_engines(self): - parallel_config = Mock( + def test_registers_stage_once_and_reuses_handshake_for_all_local_engines(self, mocker: MockerFixture): + parallel_config = mocker.Mock( data_parallel_size_local=2, data_parallel_size=4, data_parallel_rank=3, ) - vllm_config = Mock(parallel_config=parallel_config) + vllm_config = mocker.Mock(parallel_config=parallel_config) - omni_master_server = Mock(spec=OmniMasterServer) + omni_master_server = mocker.Mock(spec=OmniMasterServer) omni_master_server.address = "127.0.0.1" omni_master_server.port = 26000 - omni_master_server.get_allocation.return_value = Mock(handshake_bind_address="tcp://127.0.0.1:26001") + omni_master_server.get_allocation.return_value = mocker.Mock(handshake_bind_address="tcp://127.0.0.1:26001") stage_config = {"stage_id": 7, "stage_type": "llm"} - local_engine_manager = Mock() + local_engine_manager = mocker.Mock() @contextmanager def fake_socket_ctx(*args, **kwargs): - yield Mock() - - with ( - patch( - "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", - return_value="tcp://127.0.0.1:26001", - ) as mock_register, - patch("vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", return_value=fake_socket_ctx()), - patch( - "vllm_omni.engine.stage_engine_startup.CoreEngineProcManager", - return_value=local_engine_manager, - ) as mock_manager_cls, - patch("vllm_omni.engine.stage_engine_startup.wait_for_engine_startup"), - ): - with launch_omni_core_engines( - vllm_config=vllm_config, - executor_class=Mock(), - log_stats=False, - omni_master_server=omni_master_server, - stage_id=7, - stage_config=stage_config, - ) as (yielded_manager, yielded_coordinator, yielded_addresses): - assert yielded_manager is local_engine_manager - assert yielded_coordinator is None + yield mocker.Mock() + + mock_register = mocker.patch( + 
"vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value="tcp://127.0.0.1:26001", + ) + mocker.patch( + "vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", + return_value=fake_socket_ctx(), + ) + mock_manager_cls = mocker.patch( + "vllm_omni.engine.stage_engine_startup.CoreEngineProcManager", + return_value=local_engine_manager, + ) + mocker.patch("vllm_omni.engine.stage_engine_startup.wait_for_engine_startup") + with launch_omni_core_engines( + vllm_config=vllm_config, + executor_class=mocker.Mock(), + log_stats=False, + omni_master_server=omni_master_server, + stage_id=7, + stage_config=stage_config, + ) as (yielded_manager, yielded_coordinator, yielded_addresses): + assert yielded_manager is local_engine_manager + assert yielded_coordinator is None mock_register.assert_called_once_with( omni_master_address="127.0.0.1", @@ -1292,55 +1477,56 @@ def fake_socket_ctx(*args, **kwargs): assert manager_kwargs["handshake_address"] == "tcp://127.0.0.1:26001" assert manager_kwargs["executor_class"] is not None - def test_registers_stage_with_coordinator_when_started(self): - parallel_config = Mock( + def test_registers_stage_with_coordinator_when_started(self, mocker: MockerFixture): + parallel_config = mocker.Mock( data_parallel_size_local=1, data_parallel_size=2, data_parallel_rank=0, ) - vllm_config = Mock(parallel_config=parallel_config) + vllm_config = mocker.Mock(parallel_config=parallel_config) vllm_config.needs_dp_coordinator = True - vllm_config.model_config = Mock(is_moe=False) + vllm_config.model_config = mocker.Mock(is_moe=False) - omni_master_server = Mock(spec=OmniMasterServer) + omni_master_server = mocker.Mock(spec=OmniMasterServer) omni_master_server.address = "127.0.0.1" omni_master_server.port = 26000 omni_master_server.get_zmq_addresses.return_value = EngineZmqAddresses( inputs=["tcp://client-in"], outputs=["tcp://client-out"] ) - omni_master_server.get_allocation.return_value = Mock(handshake_bind_address="tcp://127.0.0.1:26001") + omni_master_server.get_allocation.return_value = mocker.Mock(handshake_bind_address="tcp://127.0.0.1:26001") - coordinator = Mock() + coordinator = mocker.Mock() coordinator.proc.pid = 1234 coordinator.get_engine_socket_addresses.return_value = ("tcp://coord-in", "tcp://coord-out") coordinator.get_stats_publish_address.return_value = "tcp://stats" @contextmanager def fake_socket_ctx(*args, **kwargs): - yield Mock() - - with ( - patch("vllm_omni.engine.stage_engine_startup.DPCoordinator", return_value=coordinator), - patch( - "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", - return_value="tcp://127.0.0.1:26001", - ) as mock_register, - patch("vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", return_value=fake_socket_ctx()), - patch( - "vllm_omni.engine.stage_engine_startup.CoreEngineProcManager", - return_value=Mock(), - ) as mock_manager_cls, - patch("vllm_omni.engine.stage_engine_startup.wait_for_engine_startup") as mock_wait, + yield mocker.Mock() + + mocker.patch("vllm_omni.engine.stage_engine_startup.DPCoordinator", return_value=coordinator) + mock_register = mocker.patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value="tcp://127.0.0.1:26001", + ) + mocker.patch( + "vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", + return_value=fake_socket_ctx(), + ) + mock_manager_cls = mocker.patch( + "vllm_omni.engine.stage_engine_startup.CoreEngineProcManager", + return_value=mocker.Mock(), + ) + mock_wait = 
mocker.patch("vllm_omni.engine.stage_engine_startup.wait_for_engine_startup") + with launch_omni_core_engines( + vllm_config=vllm_config, + executor_class=mocker.Mock(), + log_stats=False, + omni_master_server=omni_master_server, + stage_id=7, + stage_config={"stage_id": 7}, ): - with launch_omni_core_engines( - vllm_config=vllm_config, - executor_class=Mock(), - log_stats=False, - omni_master_server=omni_master_server, - stage_id=7, - stage_config={"stage_id": 7}, - ): - pass + pass mock_register.assert_called_once_with( omni_master_address="127.0.0.1", @@ -1363,19 +1549,19 @@ class TestLaunchLlmStageSingleStageMode: """Test that _launch_llm_stage selects launch_omni_core_engines when single_stage_mode=True and _omni_master_server is set.""" - def _build_engine_with_oms(self) -> AsyncOmniEngine: + def _build_engine_with_oms(self, mocker: MockerFixture) -> AsyncOmniEngine: engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" engine.single_stage_mode = True engine._single_stage_id_filter = 0 engine._llm_stage_launch_lock = threading.Lock() - mock_oms = Mock(spec=OmniMasterServer) + mock_oms = mocker.Mock(spec=OmniMasterServer) mock_oms.address = "127.0.0.1" mock_oms.port = 25000 - alloc = Mock() + alloc = mocker.Mock() alloc.handshake_bind_address = "tcp://127.0.0.1:25001" mock_oms.get_allocation.return_value = alloc - fake_addresses = Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None @@ -1383,66 +1569,60 @@ def _build_engine_with_oms(self) -> AsyncOmniEngine: engine._omni_master_server = mock_oms return engine - @contextmanager - def _patch_launch_omni_cm(self, stage_id: int): - fake_vllm_config = Mock() - fake_executor_cls = Mock() - fake_addresses = Mock() + def _mock_launch_omni(self, mocker: MockerFixture, stage_id: int): + fake_vllm_config = mocker.Mock() + fake_executor_cls = mocker.Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None - eng_mgr = Mock() + eng_mgr = mocker.Mock() @contextmanager def fake_launch_omni(*args, **kwargs): yield eng_mgr, None, fake_addresses - with ( - patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": stage_id}, - ), - patch( - "vllm_omni.engine.async_omni_engine.build_vllm_config", - return_value=(fake_vllm_config, fake_executor_cls), - ), - patch( - "vllm_omni.engine.async_omni_engine.acquire_device_locks", - return_value=[], - ), - patch( - "vllm_omni.engine.async_omni_engine.release_device_locks", - ), - patch( - "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", - return_value=fake_launch_omni(), - ) as mock_launch_omni, - ): - yield mock_launch_omni + mocker.patch("vllm_omni.engine.async_omni_engine.setup_stage_devices") + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": stage_id}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(fake_vllm_config, fake_executor_cls), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.acquire_device_locks", + return_value=[], + ) + mocker.patch("vllm_omni.engine.async_omni_engine.release_device_locks") + return mocker.patch( + 
"vllm_omni.engine.async_omni_engine.launch_omni_core_engines", + return_value=fake_launch_omni(), + ) - def test_launch_omni_core_engines_used_in_single_stage_mode(self): + def test_launch_omni_core_engines_used_in_single_stage_mode(self, mocker: MockerFixture): """single_stage_mode + _omni_master_server → launch_omni_core_engines.""" - engine = self._build_engine_with_oms() - metadata = Mock(stage_id=0, runtime_cfg={}) + engine = self._build_engine_with_oms(mocker) + metadata = mocker.Mock(stage_id=0, runtime_cfg={}) stage_cfg = _make_stage_cfg(0) - with self._patch_launch_omni_cm(0) as mock_launch_omni: - result = engine._launch_llm_stage( - stage_cfg=stage_cfg, - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - llm_stage_launch_lock=threading.Lock(), - ) + mock_launch_omni = self._mock_launch_omni(mocker, 0) + result = engine._launch_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + llm_stage_launch_lock=threading.Lock(), + ) mock_launch_omni.assert_called_once() assert mock_launch_omni.call_args.kwargs["stage_config"] is stage_cfg assert isinstance(result, StartedLlmStage) assert result.stage_id == 0 - def test_spawn_stage_core_used_in_normal_mode(self): + def test_spawn_stage_core_used_in_normal_mode(self, mocker: MockerFixture): """~single_stage_mode → spawn_stage_core + complete_stage_handshake.""" engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" @@ -1450,44 +1630,45 @@ def test_spawn_stage_core_used_in_normal_mode(self): engine._omni_master_server = None engine._llm_stage_launch_lock = threading.Lock() - fake_vllm_config = Mock() - fake_executor_cls = Mock() - fake_addresses = Mock() + fake_vllm_config = mocker.Mock() + fake_executor_cls = mocker.Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None - fake_proc = Mock() + fake_proc = mocker.Mock() fake_handshake_address = "ipc:///tmp/fake-handshake" stage_init_timeout = 60 - with ( - patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": 0}, - ), - patch( - "vllm_omni.engine.async_omni_engine.build_vllm_config", - return_value=(fake_vllm_config, fake_executor_cls), - ), - patch("vllm_omni.engine.async_omni_engine.acquire_device_locks", return_value=[]), - patch("vllm_omni.engine.async_omni_engine.release_device_locks"), - patch( - "vllm_omni.engine.async_omni_engine.spawn_stage_core", - return_value=(fake_addresses, fake_proc, fake_handshake_address), - ) as mock_spawn, - patch("vllm_omni.engine.async_omni_engine.complete_stage_handshake") as mock_handshake, - patch("vllm_omni.engine.async_omni_engine.launch_omni_core_engines") as mock_omni, - ): - metadata = Mock(stage_id=0, runtime_cfg={}) - result = engine._launch_llm_stage( - stage_cfg=_make_stage_cfg(0), - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=stage_init_timeout, - llm_stage_launch_lock=threading.Lock(), - ) + mocker.patch("vllm_omni.engine.async_omni_engine.setup_stage_devices") + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 0}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(fake_vllm_config, fake_executor_cls), + ) + mocker.patch( + 
"vllm_omni.engine.async_omni_engine.acquire_device_locks", + return_value=[], + ) + mocker.patch("vllm_omni.engine.async_omni_engine.release_device_locks") + mock_spawn = mocker.patch( + "vllm_omni.engine.async_omni_engine.spawn_stage_core", + return_value=(fake_addresses, fake_proc, fake_handshake_address), + ) + mock_handshake = mocker.patch("vllm_omni.engine.async_omni_engine.complete_stage_handshake") + mock_omni = mocker.patch("vllm_omni.engine.async_omni_engine.launch_omni_core_engines") + metadata = mocker.Mock(stage_id=0, runtime_cfg={}) + result = engine._launch_llm_stage( + stage_cfg=_make_stage_cfg(0), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=stage_init_timeout, + llm_stage_launch_lock=threading.Lock(), + ) mock_spawn.assert_called_once_with( vllm_config=fake_vllm_config, @@ -1505,50 +1686,58 @@ def test_spawn_stage_core_used_in_normal_mode(self): assert isinstance(result, StartedLlmStage) assert result.proc is fake_proc - def test_launch_omni_passes_stage_id_and_master_server(self): + def test_launch_omni_passes_stage_id_and_master_server(self, mocker: MockerFixture): """launch_omni_core_engines receives the correct stage_id and omni_master_server.""" - engine = self._build_engine_with_oms() - metadata = Mock(stage_id=0, runtime_cfg={}) + engine = self._build_engine_with_oms(mocker) + metadata = mocker.Mock(stage_id=0, runtime_cfg={}) captured_kwargs: dict[str, Any] = {} @contextmanager def capturing_launch(*args, **kwargs): captured_kwargs.update(kwargs) - fake_addresses = Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None - yield Mock(), None, fake_addresses + yield mocker.Mock(), None, fake_addresses - with ( - patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": 0}, - ), - patch("vllm_omni.engine.async_omni_engine.build_vllm_config", return_value=(Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.acquire_device_locks", return_value=[]), - patch("vllm_omni.engine.async_omni_engine.release_device_locks"), - patch("vllm_omni.engine.async_omni_engine.launch_omni_core_engines", side_effect=capturing_launch), - ): - engine._launch_llm_stage( - stage_cfg=_make_stage_cfg(0), - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - llm_stage_launch_lock=threading.Lock(), - ) + mocker.patch("vllm_omni.engine.async_omni_engine.setup_stage_devices") + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 0}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.acquire_device_locks", + return_value=[], + ) + mocker.patch("vllm_omni.engine.async_omni_engine.release_device_locks") + mocker.patch( + "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", + side_effect=capturing_launch, + ) + + engine._launch_llm_stage( + stage_cfg=_make_stage_cfg(0), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + llm_stage_launch_lock=threading.Lock(), + ) assert captured_kwargs.get("stage_id") == 0 assert captured_kwargs.get("omni_master_server") is engine._omni_master_server - def test_launch_omni_context_exits_before_stage_cleanup_on_error(self): + def 
test_launch_omni_context_exits_before_stage_cleanup_on_error(self, mocker: MockerFixture): """Errors after entering the omni launch context still unwind it first.""" - engine = self._build_engine_with_oms() - metadata = Mock(stage_id=0, runtime_cfg={}) + engine = self._build_engine_with_oms(mocker) + metadata = mocker.Mock(stage_id=0, runtime_cfg={}) - fake_addresses = Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None @@ -1558,47 +1747,51 @@ def test_launch_omni_context_exits_before_stage_cleanup_on_error(self): @contextmanager def fake_launch_omni(*args, **kwargs): try: - yield Mock(), None, fake_addresses + yield mocker.Mock(), None, fake_addresses finally: events.append("launch_exit") - with ( - patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": 0}, - ), - patch("vllm_omni.engine.async_omni_engine.build_vllm_config", return_value=(Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.acquire_device_locks", return_value=[]), - patch("vllm_omni.engine.async_omni_engine.release_device_locks"), - patch( - "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", - return_value=fake_launch_omni(), - ), - patch("vllm_omni.engine.async_omni_engine.logger.info", side_effect=RuntimeError("boom")), - patch( - "vllm_omni.engine.async_omni_engine.close_started_llm_stage", - side_effect=lambda _started: events.append("stage_close"), - ) as mock_close_stage, - ): - with pytest.raises(RuntimeError, match="boom"): - engine._launch_llm_stage( - stage_cfg=_make_stage_cfg(0), - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - llm_stage_launch_lock=threading.Lock(), - ) + mocker.patch("vllm_omni.engine.async_omni_engine.setup_stage_devices") + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 0}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.acquire_device_locks", + return_value=[], + ) + mocker.patch("vllm_omni.engine.async_omni_engine.release_device_locks") + mocker.patch( + "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", + return_value=fake_launch_omni(), + ) + mocker.patch("vllm_omni.engine.async_omni_engine.logger.info", side_effect=RuntimeError("boom")) + mock_close_stage = mocker.patch( + "vllm_omni.engine.async_omni_engine.close_started_llm_stage", + side_effect=lambda _started: events.append("stage_close"), + ) + with pytest.raises(RuntimeError, match="boom"): + engine._launch_llm_stage( + stage_cfg=_make_stage_cfg(0), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + llm_stage_launch_lock=threading.Lock(), + ) mock_close_stage.assert_called_once() assert events == ["launch_exit", "stage_close"] - def test_base_exception_propagates_without_started_stage_cleanup(self): + def test_base_exception_propagates_without_started_stage_cleanup(self, mocker: MockerFixture): """BaseException subclasses should bypass the Exception cleanup path.""" - engine = self._build_engine_with_oms() - metadata = Mock(stage_id=0, runtime_cfg={}) + engine = self._build_engine_with_oms(mocker) + metadata = mocker.Mock(stage_id=0, runtime_cfg={}) - fake_addresses = Mock() + 
fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None @@ -1611,37 +1804,41 @@ class FatalLaunchInterrupt(BaseException): @contextmanager def fake_launch_omni(*args, **kwargs): try: - yield Mock(), None, fake_addresses + yield mocker.Mock(), None, fake_addresses finally: events.append("launch_exit") - with ( - patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": 0}, - ), - patch("vllm_omni.engine.async_omni_engine.build_vllm_config", return_value=(Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.acquire_device_locks", return_value=[]), - patch("vllm_omni.engine.async_omni_engine.release_device_locks"), - patch( - "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", - return_value=fake_launch_omni(), - ), - patch( - "vllm_omni.engine.async_omni_engine.logger.info", - side_effect=FatalLaunchInterrupt("stop"), - ), - patch("vllm_omni.engine.async_omni_engine.close_started_llm_stage") as mock_close_stage, - ): - with pytest.raises(FatalLaunchInterrupt, match="stop"): - engine._launch_llm_stage( - stage_cfg=_make_stage_cfg(0), - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - llm_stage_launch_lock=threading.Lock(), - ) + mocker.patch("vllm_omni.engine.async_omni_engine.setup_stage_devices") + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 0}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.acquire_device_locks", + return_value=[], + ) + mocker.patch("vllm_omni.engine.async_omni_engine.release_device_locks") + mocker.patch( + "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", + return_value=fake_launch_omni(), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.logger.info", + side_effect=FatalLaunchInterrupt("stop"), + ) + mock_close_stage = mocker.patch("vllm_omni.engine.async_omni_engine.close_started_llm_stage") + with pytest.raises(FatalLaunchInterrupt, match="stop"): + engine._launch_llm_stage( + stage_cfg=_make_stage_cfg(0), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + llm_stage_launch_lock=threading.Lock(), + ) mock_close_stage.assert_not_called() assert events == ["launch_exit"] diff --git a/tests/entrypoints/openai_api/test_serving_chat_speaker.py b/tests/entrypoints/openai_api/test_serving_chat_speaker.py index 3b9151120e..97c05e45b4 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_speaker.py +++ b/tests/entrypoints/openai_api/test_serving_chat_speaker.py @@ -4,9 +4,9 @@ import asyncio from types import SimpleNamespace -from unittest.mock import AsyncMock, MagicMock import pytest +from pytest_mock import MockerFixture from vllm_omni.entrypoints.openai.utils import ( get_supported_speakers_from_hf_config, @@ -25,9 +25,9 @@ def serving_chat(): return instance -def _make_hf_config(*, speaker_id: dict | None = None, spk_id: dict | None = None): - hf_config = MagicMock() - talker_config = MagicMock() +def _make_hf_config(mocker: MockerFixture, *, speaker_id: dict | None = None, spk_id: dict | None = None): + hf_config = mocker.MagicMock() + talker_config = mocker.MagicMock() talker_config.speaker_id = speaker_id 
talker_config.spk_id = spk_id hf_config.talker_config = talker_config @@ -51,14 +51,14 @@ def test_validate_requested_speaker_skips_validation_when_supported_empty(): assert validate_requested_speaker(" ", {"vivian"}) is None -def test_get_supported_speakers_from_hf_config_uses_spk_id_fallback(): - hf_config = _make_hf_config(speaker_id=None, spk_id={"Serena": 0}) +def test_get_supported_speakers_from_hf_config_uses_spk_id_fallback(mocker: MockerFixture): + hf_config = _make_hf_config(mocker, speaker_id=None, spk_id={"Serena": 0}) assert get_supported_speakers_from_hf_config(hf_config) == {"serena"} -def test_get_supported_speakers_caches_normalized_keys(serving_chat): - serving_chat.model_config = MagicMock() - serving_chat.model_config.hf_config = _make_hf_config(speaker_id={"Vivian": 0, "Ethan": 1}) +def test_get_supported_speakers_caches_normalized_keys(mocker: MockerFixture, serving_chat): + serving_chat.model_config = mocker.MagicMock() + serving_chat.model_config.hf_config = _make_hf_config(mocker, speaker_id={"Vivian": 0, "Ethan": 1}) assert serving_chat._get_supported_speakers() == {"vivian", "ethan"} @@ -67,15 +67,15 @@ def test_get_supported_speakers_caches_normalized_keys(serving_chat): assert serving_chat._get_supported_speakers() == {"vivian", "ethan"} -def test_create_chat_completion_converts_value_error_to_error_response(serving_chat): +def test_create_chat_completion_converts_value_error_to_error_response(mocker: MockerFixture, serving_chat): serving_chat._diffusion_mode = False - serving_chat._check_model = AsyncMock(return_value=None) - serving_chat.engine_client = MagicMock(errored=False) - serving_chat._maybe_get_adapters = MagicMock(return_value=None) - serving_chat.models = MagicMock() + serving_chat._check_model = mocker.AsyncMock(return_value=None) + serving_chat.engine_client = mocker.MagicMock(errored=False) + serving_chat._maybe_get_adapters = mocker.MagicMock(return_value=None) + serving_chat.models = mocker.MagicMock() serving_chat.models.model_name.return_value = "test-model" - serving_chat.renderer = MagicMock() - serving_chat.renderer.get_tokenizer.return_value = MagicMock() + serving_chat.renderer = mocker.MagicMock() + serving_chat.renderer.get_tokenizer.return_value = mocker.MagicMock() serving_chat.reasoning_parser_cls = None serving_chat.tool_parser = None serving_chat.use_harmony = False @@ -85,12 +85,12 @@ def test_create_chat_completion_converts_value_error_to_error_response(serving_c serving_chat.chat_template = None serving_chat.chat_template_content_format = "string" serving_chat.default_chat_template_kwargs = {} - serving_chat._validate_chat_template = MagicMock(return_value=None) - serving_chat._prepare_extra_chat_template_kwargs = MagicMock(return_value={}) - serving_chat._preprocess_chat = AsyncMock( + serving_chat._validate_chat_template = mocker.MagicMock(return_value=None) + serving_chat._prepare_extra_chat_template_kwargs = mocker.MagicMock(return_value={}) + serving_chat._preprocess_chat = mocker.AsyncMock( side_effect=ValueError("Invalid speaker 'uncle_fu'. 
Supported: ethan, vivian") ) - serving_chat.create_error_response = MagicMock(return_value="error-response") + serving_chat.create_error_response = mocker.MagicMock(return_value="error-response") request = SimpleNamespace( tool_choice=None, diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 06b6f5c16c..c884120620 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -6,7 +6,6 @@ from inspect import Signature, signature from pathlib import Path from types import SimpleNamespace -from unittest.mock import AsyncMock, MagicMock, patch import numpy as np import pytest @@ -901,7 +900,7 @@ def test_load_supported_speakers(self, mocker: MockerFixture): # Verify speakers are normalized to lowercase assert server.supported_speakers == {"ryan", "vivian", "aiden"} - def test_build_tts_params_with_uploaded_voice(self, speech_server): + def test_build_tts_params_with_uploaded_voice(self, speech_server, mocker: MockerFixture): """Test _build_tts_params auto-sets ref_audio for uploaded voices (x_vector only).""" speech_server.uploaded_speakers = { "custom_voice": { @@ -914,18 +913,18 @@ def test_build_tts_params_with_uploaded_voice(self, speech_server): } speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"} - with patch.object(speech_server, "_get_uploaded_audio_data") as mock_get_audio: - mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv" - req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice") - params = speech_server._build_tts_params(req) + mock_get_audio = mocker.patch.object(speech_server, "_get_uploaded_audio_data") + mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv" + req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice") + params = speech_server._build_tts_params(req) - assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] - assert params["x_vector_only_mode"] == [True] - assert params["task_type"] == ["Base"] - assert params["voice_created_at"] == [1711234567.89] - assert "ref_text" not in params + assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] + assert params["x_vector_only_mode"] == [True] + assert params["task_type"] == ["Base"] + assert params["voice_created_at"] == [1711234567.89] + assert "ref_text" not in params - def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server): + def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server, mocker: MockerFixture): """Test _build_tts_params enables in-context cloning when ref_text is stored.""" speech_server.uploaded_speakers = { "custom_voice": { @@ -938,16 +937,16 @@ def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server): } speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"} - with patch.object(speech_server, "_get_uploaded_audio_data") as mock_get_audio: - mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv" - req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice") - params = speech_server._build_tts_params(req) + mock_get_audio = mocker.patch.object(speech_server, "_get_uploaded_audio_data") + mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv" + req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice") + params = speech_server._build_tts_params(req) - assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] - assert params["x_vector_only_mode"] == 
[False] - assert params["task_type"] == ["Base"] - assert params["ref_text"] == ["Hello world transcript"] - assert params["voice_created_at"] == [1711234567.89] + assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] + assert params["x_vector_only_mode"] == [False] + assert params["task_type"] == ["Base"] + assert params["ref_text"] == ["Hello world transcript"] + assert params["voice_created_at"] == [1711234567.89] def test_build_tts_params_without_uploaded_voice(self, speech_server): """Test _build_tts_params does not auto-set ref_audio for non-uploaded voices.""" @@ -989,45 +988,43 @@ def test_build_tts_params_with_explicit_ref_audio(self, speech_server): # x_vector_only_mode should not be set when explicit ref_audio is provided assert "x_vector_only_mode" not in params - def test_get_uploaded_audio_data(self, speech_server): + def test_get_uploaded_audio_data(self, speech_server, mocker: MockerFixture): """Test _get_uploaded_audio_data function.""" # Mock file operations - with ( - patch("builtins.open", create=True) as mock_open, - patch("base64.b64encode") as mock_b64encode, - patch("pathlib.Path.exists") as mock_exists, - ): - mock_exists.return_value = True - mock_b64encode.return_value = b"ZmFrZWF1ZGlv" - - # Setup mock file - mock_file = MagicMock() - mock_file.read.return_value = b"fakeaudio" - mock_open.return_value.__enter__.return_value = mock_file - - # Setup uploaded speaker - speech_server.uploaded_speakers = { - "test_voice": {"name": "test_voice", "file_path": "/tmp/test.wav", "mime_type": "audio/wav"} - } - result = speech_server._get_uploaded_audio_data("test_voice") + mock_open = mocker.patch("builtins.open", create=True) + mock_b64encode = mocker.patch("base64.b64encode") + mock_exists = mocker.patch("pathlib.Path.exists") + mock_exists.return_value = True + mock_b64encode.return_value = b"ZmFrZWF1ZGlv" + + # Setup mock file + mock_file = mocker.MagicMock() + mock_file.read.return_value = b"fakeaudio" + mock_open.return_value.__enter__.return_value = mock_file + + # Setup uploaded speaker + speech_server.uploaded_speakers = { + "test_voice": {"name": "test_voice", "file_path": "/tmp/test.wav", "mime_type": "audio/wav"} + } + result = speech_server._get_uploaded_audio_data("test_voice") - assert result == "data:audio/wav;base64,ZmFrZWF1ZGlv" - mock_open.assert_called_once_with(Path("/tmp/test.wav"), "rb") - mock_b64encode.assert_called_once_with(b"fakeaudio") + assert result == "data:audio/wav;base64,ZmFrZWF1ZGlv" + mock_open.assert_called_once_with(Path("/tmp/test.wav"), "rb") + mock_b64encode.assert_called_once_with(b"fakeaudio") - def test_get_uploaded_audio_data_missing_file(self, speech_server): + def test_get_uploaded_audio_data_missing_file(self, speech_server, mocker: MockerFixture): """Test _get_uploaded_audio_data when file is missing.""" - with patch("pathlib.Path.exists") as mock_exists: - mock_exists.return_value = False + mock_exists = mocker.patch("pathlib.Path.exists") + mock_exists.return_value = False - # Setup uploaded speaker - speech_server.uploaded_speakers = { - "test_voice": {"name": "test_voice", "file_path": "/tmp/test.wav", "mime_type": "audio/wav"} - } + # Setup uploaded speaker + speech_server.uploaded_speakers = { + "test_voice": {"name": "test_voice", "file_path": "/tmp/test.wav", "mime_type": "audio/wav"} + } - result = speech_server._get_uploaded_audio_data("test_voice") + result = speech_server._get_uploaded_audio_data("test_voice") - assert result is None + assert result is None def 
test_get_uploaded_audio_data_voice_not_found(self, speech_server): """Test _get_uploaded_audio_data when voice is not in uploaded_speakers.""" @@ -1049,7 +1046,7 @@ def test_voice_field_still_accepted(self): req = OpenAICreateSpeechRequest.model_validate({"input": "Hello", "voice": "custom_voice"}) assert req.voice == "custom_voice" - def test_speaker_alias_in_base_task_with_uploaded_voice(self, speech_server): + def test_speaker_alias_in_base_task_with_uploaded_voice(self, speech_server, mocker: MockerFixture): """Using 'speaker' key with an uploaded voice should work for Base task.""" speech_server.uploaded_speakers = { "utesf": { @@ -1061,13 +1058,13 @@ def test_speaker_alias_in_base_task_with_uploaded_voice(self, speech_server): } req = OpenAICreateSpeechRequest.model_validate({"input": "Hello", "speaker": "UTESF", "task_type": "Base"}) assert req.voice == "UTESF" - with patch("pathlib.Path.exists", return_value=True): - result = speech_server._validate_qwen_tts_request(req) + mocker.patch("pathlib.Path.exists", return_value=True) + result = speech_server._validate_qwen_tts_request(req) assert result is None # ── uploaded voice with embedding ── - def test_build_tts_params_with_uploaded_voice_embedding(self, speech_server): + def test_build_tts_params_with_uploaded_voice_embedding(self, speech_server, mocker: MockerFixture): """Test _build_tts_params loads embedding for embedding-uploaded voices.""" speech_server.uploaded_speakers = { "emb_voice": { @@ -1083,20 +1080,20 @@ def test_build_tts_params_with_uploaded_voice_embedding(self, speech_server): speech_server.supported_speakers = {"ryan", "vivian", "emb_voice"} fake_embedding = [0.1] * 1024 - with patch.object(speech_server, "_get_uploaded_speaker_embedding") as mock_get_emb: - mock_get_emb.return_value = fake_embedding - req = OpenAICreateSpeechRequest(input="Hello", voice="emb_voice") - params = speech_server._build_tts_params(req) + mock_get_emb = mocker.patch.object(speech_server, "_get_uploaded_speaker_embedding") + mock_get_emb.return_value = fake_embedding + req = OpenAICreateSpeechRequest(input="Hello", voice="emb_voice") + params = speech_server._build_tts_params(req) - assert "voice_clone_prompt" in params - assert params["voice_clone_prompt"][0]["ref_spk_embedding"] == fake_embedding - assert params["task_type"] == ["Base"] - assert params["x_vector_only_mode"] == [True] - assert "ref_audio" not in params + assert "voice_clone_prompt" in params + assert params["voice_clone_prompt"][0]["ref_spk_embedding"] == fake_embedding + assert params["task_type"] == ["Base"] + assert params["x_vector_only_mode"] == [True] + assert "ref_audio" not in params # ── regression: full flow from issue #1603 ── - def test_regression_1603_speaker_key_with_uploaded_audio_voice(self, speech_server): + def test_regression_1603_speaker_key_with_uploaded_audio_voice(self, speech_server, mocker: MockerFixture): """Regression test for #1603: upload audio voice, then invoke TTS with 'speaker' key. Verifies the full validate → build_params pipeline works end-to-end. 
@@ -1116,14 +1113,14 @@ def test_regression_1603_speaker_key_with_uploaded_audio_voice(self, speech_serv assert req.voice == "UTESF" # Validation should pass (file exists) - with patch("pathlib.Path.exists", return_value=True): - err = speech_server._validate_qwen_tts_request(req) + mocker.patch("pathlib.Path.exists", return_value=True) + err = speech_server._validate_qwen_tts_request(req) assert err is None, f"Validation failed: {err}" # Build params should auto-set ref_audio from stored file - with patch.object(speech_server, "_get_uploaded_audio_data") as mock_audio: - mock_audio.return_value = "data:audio/wav;base64,ZmFrZQ==" - params = speech_server._build_tts_params(req) + mock_audio = mocker.patch.object(speech_server, "_get_uploaded_audio_data") + mock_audio.return_value = "data:audio/wav;base64,ZmFrZQ==" + params = speech_server._build_tts_params(req) assert params["task_type"] == ["Base"] assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZQ=="] @@ -1131,7 +1128,7 @@ def test_regression_1603_speaker_key_with_uploaded_audio_voice(self, speech_serv assert params["x_vector_only_mode"] == [False] assert params["speaker"] == ["utesf"] - def test_regression_1603_speaker_key_with_uploaded_embedding_voice(self, speech_server): + def test_regression_1603_speaker_key_with_uploaded_embedding_voice(self, speech_server, mocker: MockerFixture): """Regression test for #1603: upload embedding voice, then invoke TTS with 'speaker' key. Verifies embedding-uploaded voices are loaded as voice_clone_prompt, not as audio. @@ -1154,15 +1151,15 @@ def test_regression_1603_speaker_key_with_uploaded_embedding_voice(self, speech_ assert req.voice == "myvoice" # Validation should pass - with patch("pathlib.Path.exists", return_value=True): - err = speech_server._validate_qwen_tts_request(req) + mocker.patch("pathlib.Path.exists", return_value=True) + err = speech_server._validate_qwen_tts_request(req) assert err is None, f"Validation failed: {err}" # Build params should use embedding, NOT audio fake_emb = [0.1] * 1024 - with patch.object(speech_server, "_get_uploaded_speaker_embedding") as mock_emb: - mock_emb.return_value = fake_emb - params = speech_server._build_tts_params(req) + mock_emb = mocker.patch.object(speech_server, "_get_uploaded_speaker_embedding") + mock_emb.return_value = fake_emb + params = speech_server._build_tts_params(req) assert params["task_type"] == ["Base"] assert params["x_vector_only_mode"] == [True] @@ -1171,7 +1168,7 @@ def test_regression_1603_speaker_key_with_uploaded_embedding_voice(self, speech_ # Must NOT have ref_audio — that would fail for safetensors files assert "ref_audio" not in params - def test_validate_rejects_embedding_voice_with_pending_cache(self, speech_server): + def test_validate_rejects_embedding_voice_with_pending_cache(self, speech_server, mocker: MockerFixture): """Validation should reject embedding voices whose cache is not yet ready.""" speech_server.uploaded_speakers = { "myvoice": { @@ -1184,12 +1181,12 @@ def test_validate_rejects_embedding_voice_with_pending_cache(self, speech_server } } req = OpenAICreateSpeechRequest.model_validate({"input": "Hello", "speaker": "myvoice", "task_type": "Base"}) - with patch("pathlib.Path.exists", return_value=True): - err = speech_server._validate_qwen_tts_request(req) + mocker.patch("pathlib.Path.exists", return_value=True) + err = speech_server._validate_qwen_tts_request(req) assert err is not None assert "not yet ready" in err - def test_x_vector_only_mode_not_overwritten_for_uploaded_embedding(self, 
speech_server): + def test_x_vector_only_mode_not_overwritten_for_uploaded_embedding(self, speech_server, mocker: MockerFixture): """x_vector_only_mode set by uploaded embedding must not be overwritten by request field.""" speech_server.uploaded_speakers = { "emb_voice": { @@ -1203,11 +1200,11 @@ def test_x_vector_only_mode_not_overwritten_for_uploaded_embedding(self, speech_ } } fake_emb = [0.1] * 1024 - with patch.object(speech_server, "_get_uploaded_speaker_embedding") as mock_emb: - mock_emb.return_value = fake_emb - # Client explicitly sends x_vector_only_mode=False, but embedding requires True - req = OpenAICreateSpeechRequest(input="Hello", voice="emb_voice", x_vector_only_mode=False) - params = speech_server._build_tts_params(req) + mock_emb = mocker.patch.object(speech_server, "_get_uploaded_speaker_embedding") + mock_emb.return_value = fake_emb + # Client explicitly sends x_vector_only_mode=False, but embedding requires True + req = OpenAICreateSpeechRequest(input="Hello", voice="emb_voice", x_vector_only_mode=False) + params = speech_server._build_tts_params(req) assert params["x_vector_only_mode"] == [True] assert "voice_clone_prompt" in params @@ -1654,9 +1651,9 @@ async def test_omni_model_includes_generate(self): assert "generate" in tasks -def test_api_server_create_speech_wraps_error_response_status(): - handler = MagicMock() - handler.create_speech = AsyncMock( +def test_api_server_create_speech_wraps_error_response_status(mocker: MockerFixture): + handler = mocker.MagicMock() + handler.create_speech = mocker.AsyncMock( return_value=ErrorResponse( error=ErrorInfo(message="bad request", type="BadRequestError", param=None, code=400), ) @@ -1851,9 +1848,9 @@ def test_build_fish_prompt_normalizes_legacy_speaker_tags(self, fish_speech_serv assert "<|speaker:0|>你好,[laughing]欢迎回来。<|speaker:1|>我也来了。" in encoded_texts assert all(allowed_special is None for _, _, allowed_special in tokenizer.calls) - def test_build_fish_clone_prompt_normalizes_text_fields(self, fish_speech_server): + def test_build_fish_clone_prompt_normalizes_text_fields(self, fish_speech_server, mocker: MockerFixture): fish_speech_server._fish_speech_tokenizer = _FakeFishTokenizer() - fish_speech_server._estimate_fish_prompt_len = MagicMock(return_value=123) + fish_speech_server._estimate_fish_prompt_len = mocker.MagicMock(return_value=123) request = OpenAICreateSpeechRequest( input="你好,欢迎回来。", @@ -1904,8 +1901,10 @@ def test_build_fish_prompt_rejects_unsafe_control_tokens(self, fish_speech_serve with pytest.raises(ValueError, match="unsupported control token"): fish_speech_server._build_fish_speech_prompt(request) - def test_prepare_speech_generation_overrides_fish_default_max_tokens(self, fish_speech_server): - fish_speech_server._build_fish_speech_prompt_async = AsyncMock( + def test_prepare_speech_generation_overrides_fish_default_max_tokens( + self, fish_speech_server, mocker: MockerFixture + ): + fish_speech_server._build_fish_speech_prompt_async = mocker.AsyncMock( return_value={ "prompt_token_ids": [1, 2, 3], "additional_information": {}, @@ -1924,8 +1923,8 @@ def test_prepare_speech_generation_overrides_fish_default_max_tokens(self, fish_ assert sampling_params_list[0].max_tokens == 4096 assert fish_speech_server.engine_client.default_sampling_params_list[0].max_tokens == 2048 - def test_prepare_speech_generation_uses_stage_default_max_tokens(self, fish_speech_server): - fish_speech_server._build_fish_speech_prompt_async = AsyncMock( + def 
test_prepare_speech_generation_uses_stage_default_max_tokens(self, fish_speech_server, mocker: MockerFixture): + fish_speech_server._build_fish_speech_prompt_async = mocker.AsyncMock( return_value={ "prompt_token_ids": [1, 2, 3], "additional_information": {}, @@ -1956,9 +1955,9 @@ def test_prepare_speech_generation_rejects_invalid_fish_max_new_tokens(self, fis fish_speech_server.engine_client.generate.assert_not_called() - def test_create_speech_batch_allows_fish_text_only_items(self, fish_speech_server): - fish_speech_server._check_model = AsyncMock(return_value=None) - fish_speech_server._generate_audio_bytes = AsyncMock(return_value=("YWJj", "audio/wav")) + def test_create_speech_batch_allows_fish_text_only_items(self, fish_speech_server, mocker: MockerFixture): + fish_speech_server._check_model = mocker.AsyncMock(return_value=None) + fish_speech_server._generate_audio_bytes = mocker.AsyncMock(return_value=("YWJj", "audio/wav")) batch = BatchSpeechRequest(items=[SpeechBatchItem(input="hello fish")]) response = asyncio.run(fish_speech_server.create_speech_batch(batch)) @@ -2154,8 +2153,8 @@ def test_validate_cosyvoice3_max_new_tokens_range(self, cosyvoice3_server): assert error is not None assert "max_new_tokens" in error - def test_prepare_speech_generation_cosyvoice3(self, cosyvoice3_server): - cosyvoice3_server._build_cosyvoice3_prompt = AsyncMock( + def test_prepare_speech_generation_cosyvoice3(self, cosyvoice3_server, mocker: MockerFixture): + cosyvoice3_server._build_cosyvoice3_prompt = mocker.AsyncMock( return_value={ "prompt": "Hello", "multi_modal_data": {"audio": (np.zeros(24000), 24000)}, @@ -2236,9 +2235,9 @@ def qwen3_tts_server(self, mocker: MockerFixture): yield server server.shutdown() - def test_prepare_speech_generation_awaits_voxtral_async(self, voxtral_server): + def test_prepare_speech_generation_awaits_voxtral_async(self, voxtral_server, mocker: MockerFixture): """Voxtral path in _prepare_speech_generation should call the async wrapper.""" - voxtral_server._build_voxtral_prompt_async = AsyncMock( + voxtral_server._build_voxtral_prompt_async = mocker.AsyncMock( return_value={ "prompt_token_ids": [1, 2, 3], "additional_information": {"voice": ["test"]}, @@ -2248,13 +2247,13 @@ def test_prepare_speech_generation_awaits_voxtral_async(self, voxtral_server): asyncio.run(voxtral_server._prepare_speech_generation(request)) voxtral_server._build_voxtral_prompt_async.assert_awaited_once() - def test_prepare_speech_generation_awaits_qwen3_tts_async(self, qwen3_tts_server): + def test_prepare_speech_generation_awaits_qwen3_tts_async(self, qwen3_tts_server, mocker: MockerFixture): """Qwen3 TTS path should call _estimate_prompt_len_async.""" - qwen3_tts_server._validate_tts_request = MagicMock(return_value=None) - qwen3_tts_server._build_tts_params = MagicMock( + qwen3_tts_server._validate_tts_request = mocker.MagicMock(return_value=None) + qwen3_tts_server._build_tts_params = mocker.MagicMock( return_value={"text": ["hello"], "task_type": ["CustomVoice"], "speaker": ["Vivian"]} ) - qwen3_tts_server._estimate_prompt_len_async = AsyncMock(return_value=512) + qwen3_tts_server._estimate_prompt_len_async = mocker.AsyncMock(return_value=512) request = OpenAICreateSpeechRequest(input="hello") asyncio.run(qwen3_tts_server._prepare_speech_generation(request)) qwen3_tts_server._build_tts_params.assert_called_once() @@ -2281,8 +2280,8 @@ def test_shutdown_is_idempotent(self, mocker: MockerFixture): server.shutdown() # Should not raise assert server._tts_executor is None - def 
test_diffusion_instance_shutdown_safe(self): + def test_diffusion_instance_shutdown_safe(self, mocker: MockerFixture): """Diffusion instances (created via for_diffusion) should have safe shutdown.""" - server = OmniOpenAIServingSpeech.for_diffusion(diffusion_engine=MagicMock(), model_name="test-model") + server = OmniOpenAIServingSpeech.for_diffusion(diffusion_engine=mocker.MagicMock(), model_name="test-model") assert server._tts_executor is None server.shutdown() # Should not raise diff --git a/tests/entrypoints/openai_api/test_serving_speech_stream.py b/tests/entrypoints/openai_api/test_serving_speech_stream.py index 1d26b5855f..1b93ef58e2 100644 --- a/tests/entrypoints/openai_api/test_serving_speech_stream.py +++ b/tests/entrypoints/openai_api/test_serving_speech_stream.py @@ -1,8 +1,8 @@ import asyncio -from unittest.mock import AsyncMock, MagicMock import pytest from fastapi import FastAPI, WebSocket +from pytest_mock import MockerFixture from starlette.testclient import TestClient from starlette.websockets import WebSocketDisconnect @@ -13,19 +13,26 @@ pytestmark = [pytest.mark.core_model, pytest.mark.cpu] -def _build_test_app(speech_service=None, *, idle_timeout=30.0, config_timeout=10.0): +def _build_test_app( + speech_service=None, + *, + idle_timeout=30.0, + config_timeout=10.0, + mocker: MockerFixture | None = None, +): if speech_service is None: - speech_service = MagicMock(spec=OmniOpenAIServingSpeech) - speech_service._generate_audio_bytes = AsyncMock(return_value=(b"RIFF" + b"\x00" * 32, "audio/wav")) - speech_service._prepare_speech_generation = AsyncMock(return_value=("req-1", object(), {})) + assert mocker is not None + speech_service = mocker.MagicMock(spec=OmniOpenAIServingSpeech) + speech_service._generate_audio_bytes = mocker.AsyncMock(return_value=(b"RIFF" + b"\x00" * 32, "audio/wav")) + speech_service._prepare_speech_generation = mocker.AsyncMock(return_value=("req-1", object(), {})) async def mock_generate_pcm_chunks(_generator, _request_id): for chunk in (b"\x01\x02", b"\x03\x04\x05"): yield chunk speech_service._generate_pcm_chunks = mock_generate_pcm_chunks - speech_service.engine_client = MagicMock() - speech_service.engine_client.abort = AsyncMock() + speech_service.engine_client = mocker.MagicMock() + speech_service.engine_client.abort = mocker.AsyncMock() handler = OmniStreamingSpeechHandler( speech_service=speech_service, @@ -42,8 +49,8 @@ async def ws_endpoint(websocket: WebSocket): class TestStreamingSpeechWebSocket: - def test_non_streaming_single_frame(self): - app, speech_service = _build_test_app() + def test_non_streaming_single_frame(self, mocker: MockerFixture): + app, speech_service = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -68,13 +75,13 @@ def test_non_streaming_single_frame(self): assert speech_service._generate_audio_bytes.await_count == 1 - def test_streaming_multiple_binary_frames(self): + def test_streaming_multiple_binary_frames(self, mocker: MockerFixture): captured_requests = [] - speech_service = MagicMock(spec=OmniOpenAIServingSpeech) - speech_service._generate_audio_bytes = AsyncMock(return_value=(b"", "audio/wav")) - speech_service.engine_client = MagicMock() - speech_service.engine_client.abort = AsyncMock() + speech_service = mocker.MagicMock(spec=OmniOpenAIServingSpeech) + speech_service._generate_audio_bytes = mocker.AsyncMock(return_value=(b"", "audio/wav")) + speech_service.engine_client = mocker.MagicMock() + 
speech_service.engine_client.abort = mocker.AsyncMock() async def mock_prepare_speech_generation(request): captured_requests.append(request) @@ -123,8 +130,8 @@ async def mock_generate_pcm_chunks(_generator, _request_id): assert captured_requests[0].initial_codec_chunk_frames == 12 assert speech_service._generate_audio_bytes.await_count == 0 - def test_flush_on_input_done(self): - app, _ = _build_test_app() + def test_flush_on_input_done(self, mocker: MockerFixture): + app, _ = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -142,8 +149,8 @@ def test_flush_on_input_done(self): } assert ws.receive_json() == {"type": "session.done", "total_sentences": 1} - def test_invalid_streaming_config(self): - app, _ = _build_test_app() + def test_invalid_streaming_config(self, mocker: MockerFixture): + app, _ = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -159,8 +166,8 @@ def test_invalid_streaming_config(self): assert error["type"] == "error" assert "response_format='pcm'" in error["message"] - def test_empty_input_text_emits_no_audio(self): - app, speech_service = _build_test_app() + def test_empty_input_text_emits_no_audio(self, mocker: MockerFixture): + app, speech_service = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -172,8 +179,8 @@ def test_empty_input_text_emits_no_audio(self): assert speech_service._generate_audio_bytes.await_count == 0 - def test_multiple_sentences_increment_indices(self): - app, _ = _build_test_app() + def test_multiple_sentences_increment_indices(self, mocker: MockerFixture): + app, _ = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -203,8 +210,8 @@ def test_multiple_sentences_increment_indices(self): ws.send_json({"type": "input.done"}) assert ws.receive_json() == {"type": "session.done", "total_sentences": 2} - def test_unknown_message_type_keeps_session_open(self): - app, _ = _build_test_app() + def test_unknown_message_type_keeps_session_open(self, mocker: MockerFixture): + app, _ = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -227,21 +234,21 @@ def test_unknown_message_type_keeps_session_open(self): ws.send_json({"type": "input.done"}) assert ws.receive_json() == {"type": "session.done", "total_sentences": 1} - def test_config_timeout_closes_session(self): - app, _ = _build_test_app(config_timeout=0.01) + def test_config_timeout_closes_session(self, mocker: MockerFixture): + app, _ = _build_test_app(config_timeout=0.01, mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: error = ws.receive_json() assert error == {"type": "error", "message": "Timeout waiting for session.config"} - def test_generation_error_marks_audio_done(self): - speech_service = MagicMock(spec=OmniOpenAIServingSpeech) - speech_service._generate_audio_bytes = AsyncMock(side_effect=RuntimeError("boom")) - speech_service._prepare_speech_generation = AsyncMock(return_value=("req-err", object(), {})) - speech_service._generate_pcm_chunks = AsyncMock() - speech_service.engine_client = MagicMock() - speech_service.engine_client.abort = AsyncMock() + def test_generation_error_marks_audio_done(self, mocker: MockerFixture): + 
speech_service = mocker.MagicMock(spec=OmniOpenAIServingSpeech) + speech_service._generate_audio_bytes = mocker.AsyncMock(side_effect=RuntimeError("boom")) + speech_service._prepare_speech_generation = mocker.AsyncMock(return_value=("req-err", object(), {})) + speech_service._generate_pcm_chunks = mocker.AsyncMock() + speech_service.engine_client = mocker.MagicMock() + speech_service.engine_client.abort = mocker.AsyncMock() app, _ = _build_test_app(speech_service) with TestClient(app) as client: @@ -256,12 +263,12 @@ def test_generation_error_marks_audio_done(self): ws.send_json({"type": "input.done"}) assert ws.receive_json() == {"type": "session.done", "total_sentences": 1} - def test_streaming_generation_error_marks_audio_done(self): - speech_service = MagicMock(spec=OmniOpenAIServingSpeech) - speech_service._generate_audio_bytes = AsyncMock(return_value=(b"", "audio/wav")) - speech_service._prepare_speech_generation = AsyncMock(return_value=("req-stream-err", object(), {})) - speech_service.engine_client = MagicMock() - speech_service.engine_client.abort = AsyncMock() + def test_streaming_generation_error_marks_audio_done(self, mocker: MockerFixture): + speech_service = mocker.MagicMock(spec=OmniOpenAIServingSpeech) + speech_service._generate_audio_bytes = mocker.AsyncMock(return_value=(b"", "audio/wav")) + speech_service._prepare_speech_generation = mocker.AsyncMock(return_value=("req-stream-err", object(), {})) + speech_service.engine_client = mocker.MagicMock() + speech_service.engine_client.abort = mocker.AsyncMock() async def mock_generate_pcm_chunks(_generator, _request_id): yield b"\x01\x02" @@ -298,8 +305,8 @@ async def mock_generate_pcm_chunks(_generator, _request_id): ws.send_json({"type": "input.done"}) assert ws.receive_json() == {"type": "session.done", "total_sentences": 1} - def test_invalid_input_text_type_returns_validation_error(self): - app, speech_service = _build_test_app() + def test_invalid_input_text_type_returns_validation_error(self, mocker: MockerFixture): + app, speech_service = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -316,9 +323,9 @@ def test_invalid_input_text_type_returns_validation_error(self): assert speech_service._generate_audio_bytes.await_count == 0 - def test_input_text_message_too_large(self, monkeypatch): + def test_input_text_message_too_large(self, monkeypatch, mocker: MockerFixture): monkeypatch.setattr(streaming_speech_module, "_MAX_INPUT_TEXT_MESSAGE_SIZE", 32) - app, speech_service = _build_test_app() + app, speech_service = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -335,9 +342,9 @@ def test_input_text_message_too_large(self, monkeypatch): assert speech_service._generate_audio_bytes.await_count == 0 - def test_session_config_message_too_large(self, monkeypatch): + def test_session_config_message_too_large(self, monkeypatch, mocker: MockerFixture): monkeypatch.setattr(streaming_speech_module, "_MAX_CONFIG_MESSAGE_SIZE", 64) - app, _ = _build_test_app() + app, _ = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -348,12 +355,12 @@ def test_session_config_message_too_large(self, monkeypatch): "message": "session.config message too large", } - def test_disconnect_aborts_streaming_request(self): - speech_service = MagicMock(spec=OmniOpenAIServingSpeech) - speech_service._generate_audio_bytes 
= AsyncMock(return_value=(b"", "audio/wav")) - speech_service._prepare_speech_generation = AsyncMock(return_value=("req-abort", object(), {})) - speech_service.engine_client = MagicMock() - speech_service.engine_client.abort = AsyncMock() + def test_disconnect_aborts_streaming_request(self, mocker: MockerFixture): + speech_service = mocker.MagicMock(spec=OmniOpenAIServingSpeech) + speech_service._generate_audio_bytes = mocker.AsyncMock(return_value=(b"", "audio/wav")) + speech_service._prepare_speech_generation = mocker.AsyncMock(return_value=("req-abort", object(), {})) + speech_service.engine_client = mocker.MagicMock() + speech_service.engine_client.abort = mocker.AsyncMock() async def mock_generate_pcm_chunks(_generator, _request_id): yield b"\x01\x02" @@ -361,11 +368,11 @@ async def mock_generate_pcm_chunks(_generator, _request_id): speech_service._generate_pcm_chunks = mock_generate_pcm_chunks handler = OmniStreamingSpeechHandler(speech_service=speech_service) - websocket = MagicMock() - websocket.send_json = AsyncMock(side_effect=[None, WebSocketDisconnect()]) - websocket.send_bytes = AsyncMock(side_effect=WebSocketDisconnect()) + websocket = mocker.MagicMock() + websocket.send_json = mocker.AsyncMock(side_effect=[None, WebSocketDisconnect()]) + websocket.send_bytes = mocker.AsyncMock(side_effect=WebSocketDisconnect()) - config = MagicMock() + config = mocker.MagicMock() config.model = None config.voice = "Vivian" config.task_type = None diff --git a/tests/entrypoints/test_omni_base_profiler.py b/tests/entrypoints/test_omni_base_profiler.py index 0c1ddc6a5d..ca10eed91f 100644 --- a/tests/entrypoints/test_omni_base_profiler.py +++ b/tests/entrypoints/test_omni_base_profiler.py @@ -1,8 +1,7 @@ """Unit tests for OmniBase and AsyncOmni profiler methods.""" -from unittest.mock import MagicMock, patch - import pytest +from pytest_mock import MockerFixture pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -11,12 +10,12 @@ class TestOmniBaseProfiler: """Test suite for OmniBase profiler methods (start_profile, stop_profile).""" @pytest.fixture - def mock_engine(self): + def mock_engine(self, mocker: MockerFixture): """Create a mock AsyncOmniEngine for testing.""" - engine = MagicMock() + engine = mocker.MagicMock() engine.num_stages = 3 engine.is_alive.return_value = True - engine.default_sampling_params_list = [MagicMock() for _ in range(3)] + engine.default_sampling_params_list = [mocker.MagicMock() for _ in range(3)] engine.get_stage_metadata.side_effect = lambda i: { "final_output_type": "text" if i == 0 else "audio", "final_output": True, @@ -25,17 +24,15 @@ def mock_engine(self): return engine @pytest.fixture - def omni_base_instance(self, mock_engine): + def omni_base_instance(self, mock_engine, mocker: MockerFixture): """Create an OmniBase instance with mocked dependencies.""" - with ( - patch("vllm_omni.entrypoints.omni_base.AsyncOmniEngine", return_value=mock_engine), - patch("vllm_omni.entrypoints.omni_base.omni_snapshot_download", side_effect=lambda x: x), - patch("vllm_omni.entrypoints.omni_base.weakref.finalize"), - ): - from vllm_omni.entrypoints.omni_base import OmniBase - - instance = OmniBase(model="test-model") - return instance + mocker.patch("vllm_omni.entrypoints.omni_base.AsyncOmniEngine", return_value=mock_engine) + mocker.patch("vllm_omni.entrypoints.omni_base.omni_snapshot_download", side_effect=lambda x: x) + mocker.patch("vllm_omni.entrypoints.omni_base.weakref.finalize") + from vllm_omni.entrypoints.omni_base import OmniBase + + instance = 
OmniBase(model="test-model") + return instance def test_start_profile_calls_collective_rpc(self, omni_base_instance, mock_engine): """Test that start_profile calls collective_rpc with correct arguments.""" diff --git a/tests/entrypoints/test_serve.py b/tests/entrypoints/test_serve.py index 916db3cc22..afa7fa82e4 100644 --- a/tests/entrypoints/test_serve.py +++ b/tests/entrypoints/test_serve.py @@ -3,9 +3,9 @@ from __future__ import annotations import argparse -from unittest.mock import Mock, patch import pytest +from pytest_mock import MockerFixture from vllm_omni.entrypoints.cli.serve import run_headless @@ -26,45 +26,43 @@ def _make_headless_args() -> argparse.Namespace: ) -def test_run_headless_registers_stage_once_and_launches_all_local_engines() -> None: +def test_run_headless_registers_stage_once_and_launches_all_local_engines(mocker: MockerFixture) -> None: args = _make_headless_args() - stage_cfg = Mock(stage_id=3) + stage_cfg = mocker.Mock(stage_id=3) stage_cfgs = [stage_cfg] - parallel_config = Mock( + parallel_config = mocker.Mock( data_parallel_size_local=2, data_parallel_rank=4, data_parallel_rank_local=1, node_rank_within_dp=0, ) - vllm_config = Mock(parallel_config=parallel_config) - executor_class = Mock() - engine_manager = Mock() - - with ( - patch( - "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", - return_value=("/fake/stages.yaml", stage_cfgs), - ), - patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment"), - patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=Mock()), - patch("vllm_omni.engine.stage_init_utils.get_stage_connector_spec", return_value={}), - patch("vllm_omni.engine.stage_init_utils.build_engine_args_dict", return_value={}), - patch( - "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", - return_value=(None, None, None), - ), - patch( - "vllm_omni.engine.stage_init_utils.build_vllm_config", - return_value=(vllm_config, executor_class), - ) as mock_build_vllm_config, - patch( - "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", - return_value="tcp://127.0.0.1:26001", - ) as mock_register, - patch("vllm.v1.engine.utils.CoreEngineProcManager", return_value=engine_manager) as mock_manager_cls, - patch("signal.signal"), - ): - run_headless(args) + vllm_config = mocker.Mock(parallel_config=parallel_config) + executor_class = mocker.Mock() + engine_manager = mocker.Mock() + + mocker.patch( + "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", + return_value=("/fake/stages.yaml", stage_cfgs), + ) + mocker.patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment") + mocker.patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=mocker.Mock()) + mocker.patch("vllm_omni.engine.stage_init_utils.get_stage_connector_spec", return_value={}) + mocker.patch("vllm_omni.engine.stage_init_utils.build_engine_args_dict", return_value={}) + mocker.patch( + "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mock_build_vllm_config = mocker.patch( + "vllm_omni.engine.stage_init_utils.build_vllm_config", + return_value=(vllm_config, executor_class), + ) + mock_register = mocker.patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value="tcp://127.0.0.1:26001", + ) + mock_manager_cls = mocker.patch("vllm.v1.engine.utils.CoreEngineProcManager", return_value=engine_manager) + 
mocker.patch("signal.signal") + run_headless(args) mock_build_vllm_config.assert_called_once_with( stage_cfg, @@ -92,89 +90,85 @@ def test_run_headless_registers_stage_once_and_launches_all_local_engines() -> N engine_manager.shutdown.assert_called_once_with() -def test_run_headless_honors_explicit_log_stats_flag() -> None: +def test_run_headless_honors_explicit_log_stats_flag(mocker: MockerFixture) -> None: args = _make_headless_args() args.log_stats = True - stage_cfg = Mock(stage_id=3) + stage_cfg = mocker.Mock(stage_id=3) stage_cfgs = [stage_cfg] - parallel_config = Mock( + parallel_config = mocker.Mock( data_parallel_size_local=2, data_parallel_rank=4, data_parallel_rank_local=1, node_rank_within_dp=0, ) - vllm_config = Mock(parallel_config=parallel_config) - executor_class = Mock() - engine_manager = Mock() - - with ( - patch( - "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", - return_value=("/fake/stages.yaml", stage_cfgs), - ), - patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment"), - patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=Mock()), - patch("vllm_omni.engine.stage_init_utils.get_stage_connector_spec", return_value={}), - patch("vllm_omni.engine.stage_init_utils.build_engine_args_dict", return_value={}), - patch( - "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", - return_value=(None, None, None), - ), - patch( - "vllm_omni.engine.stage_init_utils.build_vllm_config", - return_value=(vllm_config, executor_class), - ), - patch( - "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", - return_value="tcp://127.0.0.1:26001", - ), - patch("vllm.v1.engine.utils.CoreEngineProcManager", return_value=engine_manager) as mock_manager_cls, - patch("signal.signal"), - ): - run_headless(args) + vllm_config = mocker.Mock(parallel_config=parallel_config) + executor_class = mocker.Mock() + engine_manager = mocker.Mock() + + mocker.patch( + "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", + return_value=("/fake/stages.yaml", stage_cfgs), + ) + mocker.patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment") + mocker.patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=mocker.Mock()) + mocker.patch("vllm_omni.engine.stage_init_utils.get_stage_connector_spec", return_value={}) + mocker.patch("vllm_omni.engine.stage_init_utils.build_engine_args_dict", return_value={}) + mocker.patch( + "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.stage_init_utils.build_vllm_config", + return_value=(vllm_config, executor_class), + ) + mocker.patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value="tcp://127.0.0.1:26001", + ) + mock_manager_cls = mocker.patch("vllm.v1.engine.utils.CoreEngineProcManager", return_value=engine_manager) + mocker.patch("signal.signal") + run_headless(args) manager_kwargs = mock_manager_cls.call_args.kwargs assert manager_kwargs["log_stats"] is True -def test_run_headless_launches_diffusion_stage_via_omni_master() -> None: +def test_run_headless_launches_diffusion_stage_via_omni_master(mocker: MockerFixture) -> None: args = _make_headless_args() - stage_cfg = Mock(stage_id=3, stage_type="diffusion") - stage_cfg.engine_args = Mock() + stage_cfg = mocker.Mock(stage_id=3, stage_type="diffusion") + stage_cfg.engine_args = 
mocker.Mock() stage_cfg.engine_input_source = [] stage_cfgs = [stage_cfg] - metadata = Mock(stage_id=3) - od_config = Mock() - proc = Mock() + metadata = mocker.Mock(stage_id=3) + od_config = mocker.Mock() + proc = mocker.Mock() proc.exitcode = 0 proc.is_alive.return_value = False - with ( - patch( - "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", - return_value=("/fake/stages.yaml", stage_cfgs), - ), - patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment"), - patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=Mock()), - patch( - "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", - return_value=(None, None, None), - ), - patch("vllm_omni.engine.stage_init_utils.extract_stage_metadata", return_value=metadata), - patch("vllm_omni.engine.stage_init_utils.inject_kv_stage_info") as mock_inject_stage_info, - patch("vllm_omni.engine.stage_init_utils.build_diffusion_config", return_value=od_config), - patch( - "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", - return_value=("tcp://127.0.0.1:26001", "tcp://127.0.0.1:26002", "tcp://127.0.0.1:26003"), - ) as mock_register, - patch( - "vllm_omni.diffusion.stage_diffusion_proc.spawn_diffusion_proc", - return_value=(proc, "tcp://127.0.0.1:26001", "tcp://127.0.0.1:26002", "tcp://127.0.0.1:26003"), - ) as mock_spawn, - patch("vllm_omni.diffusion.stage_diffusion_proc.complete_diffusion_handshake") as mock_handshake, - patch("signal.signal"), - ): - run_headless(args) + mocker.patch( + "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", + return_value=("/fake/stages.yaml", stage_cfgs), + ) + mocker.patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment") + mocker.patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=mocker.Mock()) + mocker.patch( + "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch("vllm_omni.engine.stage_init_utils.extract_stage_metadata", return_value=metadata) + mock_inject_stage_info = mocker.patch("vllm_omni.engine.stage_init_utils.inject_kv_stage_info") + mocker.patch("vllm_omni.engine.stage_init_utils.build_diffusion_config", return_value=od_config) + mock_register = mocker.patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value=("tcp://127.0.0.1:26001", "tcp://127.0.0.1:26002", "tcp://127.0.0.1:26003"), + ) + mock_spawn = mocker.patch( + "vllm_omni.diffusion.stage_diffusion_proc.spawn_diffusion_proc", + return_value=(proc, "tcp://127.0.0.1:26001", "tcp://127.0.0.1:26002", "tcp://127.0.0.1:26003"), + ) + mock_handshake = mocker.patch("vllm_omni.diffusion.stage_diffusion_proc.complete_diffusion_handshake") + mocker.patch("signal.signal") + run_headless(args) mock_inject_stage_info.assert_called_once_with(stage_cfg, 3) mock_register.assert_called_once_with( diff --git a/tests/model_executor/models/mimo_audio/test_mimo_audio_code2wav_batch_decode.py b/tests/model_executor/models/mimo_audio/test_mimo_audio_code2wav_batch_decode.py index 85c0e8b56e..8858d1f8f1 100644 --- a/tests/model_executor/models/mimo_audio/test_mimo_audio_code2wav_batch_decode.py +++ b/tests/model_executor/models/mimo_audio/test_mimo_audio_code2wav_batch_decode.py @@ -2,10 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from types import SimpleNamespace -from unittest.mock import Mock import pytest import 
torch +from pytest_mock import MockerFixture from vllm_omni.model_executor.models.mimo_audio.config_mimo_audio import TALKER_CODEC_PAD_TOKEN_ID from vllm_omni.model_executor.models.mimo_audio.mimo_audio_code2wav import ( @@ -51,7 +51,7 @@ def _make_invalid_flat_immediate_eostm(eostm_id: int = 666) -> torch.Tensor: return g.reshape(-1) -def _minimal_model(): +def _minimal_model(mocker: MockerFixture): """Avoid __init__ (HF tokenizer paths); only fields used by _batch_decode_waveforms.""" model = object.__new__(MiMoAudioToken2WavForConditionalGenerationVLLM) model.device = torch.device("cpu") @@ -59,7 +59,7 @@ def _minimal_model(): model.streamer_config = AudioStreamerConfig(group_size=_GROUP, audio_channels=_AC) model.codes = _codes_ns() - decode_vq = Mock( + decode_vq = mocker.Mock( side_effect=lambda audio_codes: torch.ones( audio_codes.shape[1], 7, @@ -67,7 +67,7 @@ def _minimal_model(): device=audio_codes.device, ) ) - decoder = Mock() + decoder = mocker.Mock() audio_tok = SimpleNamespace( encoder=SimpleNamespace(decode_vq=decode_vq), @@ -78,9 +78,9 @@ def _minimal_model(): return model, audio_tok -def test_batch_decode_waveforms_empty_input_list(): +def test_batch_decode_waveforms_empty_input_list(mocker: MockerFixture): """Empty input list returns a single zero-length float32 tensor on model device.""" - model, _ = _minimal_model() + model, _ = _minimal_model(mocker) out = MiMoAudioToken2WavForConditionalGenerationVLLM._batch_decode_waveforms(model, []) assert len(out) == 1 assert out[0].dtype == torch.float32 @@ -88,9 +88,9 @@ def test_batch_decode_waveforms_empty_input_list(): assert out[0].device == model.device -def test_batch_decode_waveforms_single_vs_multiple_decoder_shapes(): +def test_batch_decode_waveforms_single_vs_multiple_decoder_shapes(mocker: MockerFixture): """Single and multi-request batches produce correctly shaped packed hidden states and trimmed waveforms.""" - model, audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) decoder = audio_tok.decoder # Single valid request: decoder output rank-3 for double squeeze path @@ -118,9 +118,9 @@ def test_batch_decode_waveforms_single_vs_multiple_decoder_shapes(): assert out2[1].shape == (8 * _FTP,) -def test_batch_decode_waveforms_mixed_valid_invalid_requests(): +def test_batch_decode_waveforms_mixed_valid_invalid_requests(mocker: MockerFixture): """Mixed valid and invalid requests: invalid slots get empty tensors, valid slots get decoded waveforms.""" - model, audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) valid_a = _make_valid_flat_codes(1) valid_b = _make_valid_flat_codes(1) dummy = _make_dummy_code_tensor() @@ -151,9 +151,9 @@ def test_batch_decode_waveforms_mixed_valid_invalid_requests(): assert input_lengths.tolist() == [4, 4] -def test_batch_decode_waveforms_all_invalid_returns_per_request_empty(): +def test_batch_decode_waveforms_all_invalid_returns_per_request_empty(mocker: MockerFixture): """All-invalid batch skips decoder entirely and returns empty tensors for every slot.""" - model, audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) out = MiMoAudioToken2WavForConditionalGenerationVLLM._batch_decode_waveforms( model, [None, _make_dummy_code_tensor(), torch.tensor([], dtype=torch.long)], @@ -163,9 +163,9 @@ def test_batch_decode_waveforms_all_invalid_returns_per_request_empty(): audio_tok.decoder.assert_not_called() -def test_batch_decode_waveforms_output_shape_trim_when_decoder_returns_extra_samples(): +def 
test_batch_decode_waveforms_output_shape_trim_when_decoder_returns_extra_samples(mocker: MockerFixture): """Decoder output longer than valid_len is trimmed to the exact expected waveform length.""" - model, audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) flat = _make_valid_flat_codes(1) # Longer than valid_len so branch wav = wav[:valid_len] runs audio_tok.decoder.return_value = torch.ones(1, 1, 10_000, dtype=torch.float32) @@ -175,9 +175,9 @@ def test_batch_decode_waveforms_output_shape_trim_when_decoder_returns_extra_sam assert out[0].dtype == torch.float32 -def test_batch_decode_waveforms_multi_request_trims_each_row_when_decoder_returns_extra(): +def test_batch_decode_waveforms_multi_request_trims_each_row_when_decoder_returns_extra(mocker: MockerFixture): """Else-branch split: per-request wav[:valid_len] when decoder pads each batch row.""" - model, audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) a = _make_valid_flat_codes(1) b = _make_valid_flat_codes(2) audio_tok.decoder.return_value = torch.ones(2, 1, 10_000, dtype=torch.float32) @@ -189,9 +189,9 @@ def test_batch_decode_waveforms_multi_request_trims_each_row_when_decoder_return assert out[1].dtype == torch.float32 -def test_batch_decode_waveforms_valid_only_at_edges_maps_to_correct_indices(): +def test_batch_decode_waveforms_valid_only_at_edges_maps_to_correct_indices(mocker: MockerFixture): """Tensor packing order must match valid_indices when invalid requests are in the middle.""" - model, audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) first = _make_valid_flat_codes(1) last = _make_valid_flat_codes(2) inputs = [ @@ -212,9 +212,9 @@ def test_batch_decode_waveforms_valid_only_at_edges_maps_to_correct_indices(): assert input_lengths.tolist() == [4, 8] -def test_batch_decode_waveforms_output_shapes_1d_float32_for_all_slots(): +def test_batch_decode_waveforms_output_shapes_1d_float32_for_all_slots(mocker: MockerFixture): """Every slot is a 1-D float32 vector (empty or waveform), matching downstream expectations.""" - model, audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) inputs = [_make_valid_flat_codes(1), None, _make_valid_flat_codes(1)] audio_tok.decoder.return_value = torch.ones(2, 1, 5000, dtype=torch.float32) out = MiMoAudioToken2WavForConditionalGenerationVLLM._batch_decode_waveforms(model, inputs) diff --git a/tests/model_executor/models/qwen2_5_omni/test_qwen2_5_omni_embed.py b/tests/model_executor/models/qwen2_5_omni/test_qwen2_5_omni_embed.py index 8e04b04966..587e7f7f8b 100644 --- a/tests/model_executor/models/qwen2_5_omni/test_qwen2_5_omni_embed.py +++ b/tests/model_executor/models/qwen2_5_omni/test_qwen2_5_omni_embed.py @@ -10,10 +10,9 @@ - Interleaved (use_audio_in_video) should also work correctly. """ -from unittest.mock import Mock - import pytest import torch +from pytest_mock import MockerFixture from vllm.model_executor.models.qwen2_5_omni_thinker import ( check_interleaved_audio_video, merge_interleaved_embeddings, @@ -107,7 +106,7 @@ def test_interleaved(self): # --------------------------------------------------------------------------- -def make_mock_model(hidden: int = 8): +def make_mock_model(mocker: MockerFixture, hidden: int = 8): """ Return a minimal mock of Qwen2_5OmniThinkerForConditionalGeneration that has enough structure to run embed_input_ids. 
@@ -116,10 +115,10 @@ def make_mock_model(hidden: int = 8): Qwen2_5OmniThinkerForConditionalGeneration, ) - model = Mock(spec=Qwen2_5OmniThinkerForConditionalGeneration) + model = mocker.Mock(spec=Qwen2_5OmniThinkerForConditionalGeneration) # Config with token IDs - cfg = Mock() + cfg = mocker.Mock() cfg.video_token_index = VIDEO_TOKEN_ID cfg.audio_token_index = AUDIO_TOKEN_ID model.config = cfg @@ -130,9 +129,9 @@ def fake_lm_embed(ids: torch.Tensor) -> torch.Tensor: # view with shared memory, which masked_scatter_ cannot handle). return ids.float().unsqueeze(-1).expand(-1, hidden).clone() - lang_model = Mock() + lang_model = mocker.Mock() lang_model.embed_input_ids = fake_lm_embed - model.get_language_model = Mock(return_value=lang_model) + model.get_language_model = mocker.Mock(return_value=lang_model) from vllm.model_executor.models.interfaces import SupportsMultiModal @@ -169,7 +168,7 @@ def build_mm_embeds(audio_n, image_n, video_n, hidden, audio_val=10.0, image_val class TestEmbedInputIds: - def _run(self, audio_n, image_n, video_n, hidden=8): + def _run(self, mocker: MockerFixture, audio_n, image_n, video_n, hidden=8): """ Run embed_input_ids for a non-interleaved mixed-modality sequence. Returns (result_embeds, input_ids, is_multimodal). @@ -177,33 +176,33 @@ def _run(self, audio_n, image_n, video_n, hidden=8): input_ids, is_multimodal = make_token_seq(audio_n, image_n, video_n) mm_embeds = build_mm_embeds(audio_n, image_n, video_n, hidden) - model, _ = make_mock_model(hidden) + model, _ = make_mock_model(mocker, hidden) result = model.embed_input_ids(input_ids, mm_embeds, is_multimodal=is_multimodal) return result, input_ids, is_multimodal - def test_audio_only(self): + def test_audio_only(self, mocker: MockerFixture): """Audio-only: audio positions get audio embeddings.""" audio_n, hidden = 5, 8 audio_val = 10.0 - result, input_ids, is_multimodal = self._run(audio_n, 0, 0, hidden) + result, input_ids, is_multimodal = self._run(mocker, audio_n, 0, 0, hidden) audio_pos = (input_ids == AUDIO_TOKEN_ID).nonzero(as_tuple=True)[0] assert result[audio_pos].allclose(torch.full((audio_n, hidden), audio_val)), ( "Audio positions should get audio embeddings" ) - def test_video_only(self): + def test_video_only(self, mocker: MockerFixture): """Video-only: video positions get video embeddings.""" video_n, hidden = 6, 8 video_val = 30.0 - result, input_ids, is_multimodal = self._run(0, 0, video_n, hidden) + result, input_ids, is_multimodal = self._run(mocker, 0, 0, video_n, hidden) video_pos = (input_ids == VIDEO_TOKEN_ID).nonzero(as_tuple=True)[0] assert result[video_pos].allclose(torch.full((video_n, hidden), video_val)), ( "Video positions should get video embeddings" ) - def test_mixed_modalities_audio_goes_to_audio_pos(self): + def test_mixed_modalities_audio_goes_to_audio_pos(self, mocker: MockerFixture): """ Regression test for GitHub issue #34506: With audio + image + video (non-interleaved), audio positions must @@ -212,7 +211,7 @@ def test_mixed_modalities_audio_goes_to_audio_pos(self): audio_n, image_n, video_n, hidden = 5, 4, 6, 8 audio_val, image_val, video_val = 10.0, 20.0, 30.0 - result, input_ids, is_multimodal = self._run(audio_n, image_n, video_n, hidden) + result, input_ids, is_multimodal = self._run(mocker, audio_n, image_n, video_n, hidden) audio_pos = (input_ids == AUDIO_TOKEN_ID).nonzero(as_tuple=True)[0] image_pos = (input_ids == IMAGE_TOKEN_ID).nonzero(as_tuple=True)[0] @@ -233,10 +232,10 @@ def test_mixed_modalities_audio_goes_to_audio_pos(self): f"Video emb wrong: 
expected {video_val}, got mean={mean_v:.1f}" ) - def test_text_positions_unchanged(self): + def test_text_positions_unchanged(self, mocker: MockerFixture): """Text positions should keep their text embeddings.""" audio_n, image_n, video_n, hidden = 3, 2, 4, 8 - result, input_ids, is_multimodal = self._run(audio_n, image_n, video_n, hidden) + result, input_ids, is_multimodal = self._run(mocker, audio_n, image_n, video_n, hidden) text_pos = (~is_multimodal).nonzero(as_tuple=True)[0] # Text tokens have value TEXT_TOKEN_ID=0, so embed -> 0.0 @@ -244,7 +243,7 @@ def test_text_positions_unchanged(self): "Text positions should keep text embeddings" ) - def test_interleaved_use_audio_in_video(self): + def test_interleaved_use_audio_in_video(self, mocker: MockerFixture): """ Interleaved (use_audio_in_video): video chunks interleaved with audio. Video embeddings must go to video positions, audio to audio positions. @@ -263,7 +262,7 @@ def test_interleaved_use_audio_in_video(self): torch.full((audio_n, hidden), audio_val), ] - model, _ = make_mock_model(hidden) + model, _ = make_mock_model(mocker, hidden) result = model.embed_input_ids(input_ids, mm_embeds, is_multimodal=is_multimodal) video_pos = (input_ids == VIDEO_TOKEN_ID).nonzero(as_tuple=True)[0] diff --git a/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py b/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py index e2970dcb2d..b0ce10a8d5 100644 --- a/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py +++ b/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py @@ -15,9 +15,10 @@ import os import sys import types -from unittest.mock import MagicMock, patch +import pytest import torch +from pytest_mock import MockerFixture # Direct file import to avoid vllm_omni.__init__ patch dependencies. 
_BASE = os.path.join( @@ -41,28 +42,31 @@ def _load_module(name: str, filename: str): return mod -def _build_mock_modules() -> dict[str, object]: +def _build_mock_modules(mocker: MockerFixture) -> dict[str, object]: """Build the dict of modules to inject into sys.modules.""" - platforms_mock = MagicMock() + platforms_mock = mocker.MagicMock() platforms_mock.current_omni_platform.supports_torch_inductor.return_value = False - logger_mock = MagicMock() - logger_mock.init_logger = lambda name: MagicMock() + logger_mock = mocker.MagicMock() + logger_mock.init_logger = lambda name: mocker.MagicMock() - vllm_config_mod = MagicMock() - vllm_config_mod.set_current_vllm_config = lambda cfg: MagicMock(__enter__=MagicMock(), __exit__=MagicMock()) + vllm_config_mod = mocker.MagicMock() + vllm_config_mod.set_current_vllm_config = lambda cfg: mocker.MagicMock( + __enter__=mocker.MagicMock(), + __exit__=mocker.MagicMock(), + ) - weight_utils_mock = MagicMock() + weight_utils_mock = mocker.MagicMock() weight_utils_mock.default_weight_loader = lambda p, w: None pkg = types.ModuleType("vllm_omni.model_executor.models.qwen3_tts") pkg.__path__ = [os.path.abspath(_BASE)] return { - "vllm_omni": MagicMock(), + "vllm_omni": mocker.MagicMock(), "vllm_omni.platforms": platforms_mock, "vllm.logger": logger_mock, - "vllm.config": MagicMock(), + "vllm.config": mocker.MagicMock(), "vllm.config.vllm": vllm_config_mod, "vllm.model_executor.model_loader.weight_utils": weight_utils_mock, "vllm_omni.model_executor": types.ModuleType("vllm_omni.model_executor"), @@ -71,38 +75,47 @@ def _build_mock_modules() -> dict[str, object]: } -def _load_target_classes(): +def _load_target_classes(mocker: MockerFixture): """Load config and code predictor modules with mocked dependencies. - Uses patch.dict to ensure sys.modules is always restored, even on failure. + Uses mocker.patch.dict to ensure sys.modules is always restored, even on failure. 
""" - mocks = _build_mock_modules() - with patch.dict(sys.modules, mocks): - config_mod = _load_module( - "vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts", - "configuration_qwen3_tts.py", - ) - sys.modules["vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts"] = config_mod + mocks = _build_mock_modules(mocker) + mocker.patch.dict(sys.modules, mocks) + config_mod = _load_module( + "vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts", + "configuration_qwen3_tts.py", + ) + sys.modules["vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts"] = config_mod - cp_mod = _load_module( - "vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_code_predictor_vllm", - "qwen3_tts_code_predictor_vllm.py", - ) + cp_mod = _load_module( + "vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_code_predictor_vllm", + "qwen3_tts_code_predictor_vllm.py", + ) return config_mod, cp_mod -_config_mod, _cp_mod = _load_target_classes() - -Qwen3TTSTalkerCodePredictorConfig = _config_mod.Qwen3TTSTalkerCodePredictorConfig -Qwen3TTSTalkerConfig = _config_mod.Qwen3TTSTalkerConfig -CodePredictorWrapper = _cp_mod.Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM -CodePredictorModel = _cp_mod.Qwen3TTSTalkerCodePredictorModelVLLM +@pytest.fixture +def loaded_target_classes(mocker: MockerFixture): + config_mod, cp_mod = _load_target_classes(mocker) + return ( + config_mod.Qwen3TTSTalkerCodePredictorConfig, + config_mod.Qwen3TTSTalkerConfig, + cp_mod.Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM, + cp_mod.Qwen3TTSTalkerCodePredictorModelVLLM, + ) -def _make_tiny_config() -> tuple: +def _make_tiny_config(loaded_target_classes) -> tuple: """Create minimal configs for a tiny code predictor model.""" - cp_config = Qwen3TTSTalkerCodePredictorConfig( + ( + qwen3_tts_talker_code_predictor_config, + qwen3_tts_talker_config, + _, + _, + ) = loaded_target_classes + cp_config = qwen3_tts_talker_code_predictor_config( vocab_size=64, hidden_size=32, intermediate_size=64, @@ -113,16 +126,16 @@ def _make_tiny_config() -> tuple: num_code_groups=4, rms_norm_eps=1e-6, ) - talker_config = Qwen3TTSTalkerConfig( + talker_config = qwen3_tts_talker_config( hidden_size=32, num_code_groups=4, ) return cp_config, talker_config -def _make_vllm_config(max_num_seqs: int = 4) -> MagicMock: +def _make_vllm_config(mocker: MockerFixture, max_num_seqs: int = 4): """Create a mock VllmConfig with scheduler_config.""" - vllm_config = MagicMock() + vllm_config = mocker.MagicMock() vllm_config.scheduler_config.max_num_seqs = max_num_seqs return vllm_config @@ -130,12 +143,13 @@ def _make_vllm_config(max_num_seqs: int = 4) -> MagicMock: class TestCodePredictorDtypeAlignment: """Test that code predictor buffers match model parameter dtype.""" - def test_ensure_buffers_uses_given_dtype(self) -> None: + def test_ensure_buffers_uses_given_dtype(self, mocker: MockerFixture, loaded_target_classes) -> None: """_ensure_buffers should create proj_buf with the given dtype.""" - cp_config, talker_config = _make_tiny_config() - vllm_config = _make_vllm_config() + _, _, code_predictor_wrapper, _ = loaded_target_classes + cp_config, talker_config = _make_tiny_config(loaded_target_classes) + vllm_config = _make_vllm_config(mocker) - predictor = CodePredictorWrapper( + predictor = code_predictor_wrapper( vllm_config=vllm_config, config=cp_config, talker_config=talker_config, @@ -150,12 +164,13 @@ def test_ensure_buffers_uses_given_dtype(self) -> None: predictor._ensure_buffers(torch.device("cpu"), torch.float32) 
assert predictor._proj_buf.dtype == torch.float32 - def test_warmup_aligns_buffer_to_model_params(self) -> None: + def test_warmup_aligns_buffer_to_model_params(self, mocker: MockerFixture, loaded_target_classes) -> None: """_warmup_buckets should align proj_buf dtype to model parameters.""" - cp_config, talker_config = _make_tiny_config() - vllm_config = _make_vllm_config(max_num_seqs=2) + _, _, code_predictor_wrapper, _ = loaded_target_classes + cp_config, talker_config = _make_tiny_config(loaded_target_classes) + vllm_config = _make_vllm_config(mocker, max_num_seqs=2) - predictor = CodePredictorWrapper( + predictor = code_predictor_wrapper( vllm_config=vllm_config, config=cp_config, talker_config=talker_config, @@ -177,12 +192,13 @@ def test_warmup_aligns_buffer_to_model_params(self) -> None: assert predictor._proj_buf.dtype == torch.float16 - def test_setup_compile_caches_model_dtype(self) -> None: + def test_setup_compile_caches_model_dtype(self, mocker: MockerFixture, loaded_target_classes) -> None: """_setup_compile should cache model parameter dtype.""" - cp_config, talker_config = _make_tiny_config() - vllm_config = _make_vllm_config(max_num_seqs=2) + _, _, code_predictor_wrapper, _ = loaded_target_classes + cp_config, talker_config = _make_tiny_config(loaded_target_classes) + vllm_config = _make_vllm_config(mocker, max_num_seqs=2) - predictor = CodePredictorWrapper( + predictor = code_predictor_wrapper( vllm_config=vllm_config, config=cp_config, talker_config=talker_config, @@ -193,12 +209,13 @@ def test_setup_compile_caches_model_dtype(self) -> None: predictor._setup_compile() assert predictor._model_dtype == torch.float16 - def test_forward_with_mismatched_input_dtype(self) -> None: + def test_forward_with_mismatched_input_dtype(self, mocker: MockerFixture, loaded_target_classes) -> None: """forward() should not crash when inputs are float32 but model is float16.""" - cp_config, talker_config = _make_tiny_config() - vllm_config = _make_vllm_config(max_num_seqs=2) + _, _, code_predictor_wrapper, _ = loaded_target_classes + cp_config, talker_config = _make_tiny_config(loaded_target_classes) + vllm_config = _make_vllm_config(mocker, max_num_seqs=2) - predictor = CodePredictorWrapper( + predictor = code_predictor_wrapper( vllm_config=vllm_config, config=cp_config, talker_config=talker_config, @@ -231,10 +248,11 @@ def test_forward_with_mismatched_input_dtype(self) -> None: class TestCodePredictorModelDtype: """Test the inner model forward with different dtypes.""" - def test_model_forward_float16(self) -> None: + def test_model_forward_float16(self, loaded_target_classes) -> None: """Inner model forward should work in float16.""" - cp_config, _ = _make_tiny_config() - model = CodePredictorModel(cp_config, talker_hidden_size=32).to(torch.float16) + _, _, _, code_predictor_model = loaded_target_classes + cp_config, _ = _make_tiny_config(loaded_target_classes) + model = code_predictor_model(cp_config, talker_hidden_size=32).to(torch.float16) bsz, seq_len = 1, 4 inputs = torch.randn(bsz, seq_len, 32, dtype=torch.float16) @@ -244,10 +262,11 @@ def test_model_forward_float16(self) -> None: assert output.dtype == torch.float16 assert output.shape == (bsz, seq_len, 32) - def test_model_forward_float32(self) -> None: + def test_model_forward_float32(self, loaded_target_classes) -> None: """Inner model forward should work in float32.""" - cp_config, _ = _make_tiny_config() - model = CodePredictorModel(cp_config, talker_hidden_size=32).to(torch.float32) + _, _, _, code_predictor_model = 
loaded_target_classes + cp_config, _ = _make_tiny_config(loaded_target_classes) + model = code_predictor_model(cp_config, talker_hidden_size=32).to(torch.float32) bsz, seq_len = 1, 4 inputs = torch.randn(bsz, seq_len, 32, dtype=torch.float32) diff --git a/tests/model_executor/models/test_fish_speech_voice_cache.py b/tests/model_executor/models/test_fish_speech_voice_cache.py index 8fe7a4a4d1..fef4b551ab 100644 --- a/tests/model_executor/models/test_fish_speech_voice_cache.py +++ b/tests/model_executor/models/test_fish_speech_voice_cache.py @@ -10,11 +10,11 @@ import os import tempfile -from unittest.mock import MagicMock, patch import numpy as np import pytest import torch +from pytest_mock import MockerFixture pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -61,18 +61,18 @@ class TestFishSpeechVoiceCacheIntegration: """Test the cache-hit / cache-miss / no-cache paths in the model.""" @pytest.fixture - def mock_model(self): + def mock_model(self, mocker: MockerFixture): """Create a mock FishSpeechSlowARForConditionalGeneration with cache.""" from vllm_omni.utils.voice_cache import VoiceEmbeddingCache - model = MagicMock() + model = mocker.MagicMock() model._voice_cache = VoiceEmbeddingCache(max_entries=4) model._semantic_begin_id = 151678 model._num_codebooks = 10 model._codebook_size = 4096 model.model_path = "/fake/model" - model.codebook_embeddings = MagicMock() - model.codebook_embeddings.weight = MagicMock() + model.codebook_embeddings = mocker.MagicMock() + model.codebook_embeddings.weight = mocker.MagicMock() model.codebook_embeddings.weight.device = torch.device("cpu") return model @@ -166,9 +166,9 @@ def test_created_at_zero_disables_cache(self, mock_model): class TestFishSpeechValidatorUploadedVoice: """Test _validate_fish_tts_request uploaded voice resolution.""" - def test_uploaded_voice_resolves_ref_audio(self): + def test_uploaded_voice_resolves_ref_audio(self, mocker: MockerFixture): """When voice matches an uploaded speaker, ref_audio should be auto-set.""" - request = MagicMock() + request = mocker.MagicMock() request.input = "Hello" request.voice = "alice" request.ref_audio = None @@ -185,17 +185,17 @@ def test_uploaded_voice_resolves_ref_audio(self): } # Simulate: voice in uploaded_speakers, file exists, get_audio returns data URL. 
- with patch("pathlib.Path.exists", return_value=True): - voice_lower = request.voice.lower() - assert voice_lower in uploaded_speakers + mocker.patch("pathlib.Path.exists", return_value=True) + voice_lower = request.voice.lower() + assert voice_lower in uploaded_speakers - speaker_info = uploaded_speakers[voice_lower] - ref_text_from_upload = speaker_info.get("ref_text") - assert ref_text_from_upload == "Hi this is Alice" + speaker_info = uploaded_speakers[voice_lower] + ref_text_from_upload = speaker_info.get("ref_text") + assert ref_text_from_upload == "Hi this is Alice" - def test_uploaded_voice_without_ref_text_uses_request_ref_text(self): + def test_uploaded_voice_without_ref_text_uses_request_ref_text(self, mocker: MockerFixture): """If upload has no ref_text but request provides it, use request's.""" - request = MagicMock() + request = mocker.MagicMock() request.input = "Hello" request.voice = "bob" request.ref_audio = None diff --git a/tests/test_fish_speech_voice_cache.py b/tests/test_fish_speech_voice_cache.py index 8fe7a4a4d1..1c299d8014 100644 --- a/tests/test_fish_speech_voice_cache.py +++ b/tests/test_fish_speech_voice_cache.py @@ -10,11 +10,12 @@ import os import tempfile -from unittest.mock import MagicMock, patch +from pathlib import Path import numpy as np import pytest import torch +from pytest_mock import MockerFixture pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -61,18 +62,18 @@ class TestFishSpeechVoiceCacheIntegration: """Test the cache-hit / cache-miss / no-cache paths in the model.""" @pytest.fixture - def mock_model(self): + def mock_model(self, mocker: MockerFixture): """Create a mock FishSpeechSlowARForConditionalGeneration with cache.""" from vllm_omni.utils.voice_cache import VoiceEmbeddingCache - model = MagicMock() + model = mocker.MagicMock() model._voice_cache = VoiceEmbeddingCache(max_entries=4) model._semantic_begin_id = 151678 model._num_codebooks = 10 model._codebook_size = 4096 model.model_path = "/fake/model" - model.codebook_embeddings = MagicMock() - model.codebook_embeddings.weight = MagicMock() + model.codebook_embeddings = mocker.MagicMock() + model.codebook_embeddings.weight = mocker.MagicMock() model.codebook_embeddings.weight.device = torch.device("cpu") return model @@ -166,9 +167,13 @@ def test_created_at_zero_disables_cache(self, mock_model): class TestFishSpeechValidatorUploadedVoice: """Test _validate_fish_tts_request uploaded voice resolution.""" - def test_uploaded_voice_resolves_ref_audio(self): + def test_uploaded_voice_resolves_ref_audio( + self, + monkeypatch: pytest.MonkeyPatch, + mocker: MockerFixture, + ): """When voice matches an uploaded speaker, ref_audio should be auto-set.""" - request = MagicMock() + request = mocker.MagicMock() request.input = "Hello" request.voice = "alice" request.ref_audio = None @@ -185,17 +190,21 @@ def test_uploaded_voice_resolves_ref_audio(self): } # Simulate: voice in uploaded_speakers, file exists, get_audio returns data URL. 
- with patch("pathlib.Path.exists", return_value=True): - voice_lower = request.voice.lower() - assert voice_lower in uploaded_speakers + monkeypatch.setattr(Path, "exists", lambda self: True) - speaker_info = uploaded_speakers[voice_lower] - ref_text_from_upload = speaker_info.get("ref_text") - assert ref_text_from_upload == "Hi this is Alice" + voice_lower = request.voice.lower() + assert voice_lower in uploaded_speakers + + speaker_info = uploaded_speakers[voice_lower] + ref_text_from_upload = speaker_info.get("ref_text") + assert ref_text_from_upload == "Hi this is Alice" - def test_uploaded_voice_without_ref_text_uses_request_ref_text(self): + def test_uploaded_voice_without_ref_text_uses_request_ref_text( + self, + mocker: MockerFixture, + ): """If upload has no ref_text but request provides it, use request's.""" - request = MagicMock() + request = mocker.MagicMock() request.input = "Hello" request.voice = "bob" request.ref_audio = None From 2a1d5060abbae97648d86f57d70fe5af57d41467 Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Mon, 13 Apr 2026 20:43:40 +0800 Subject: [PATCH 150/204] [skip ci][doc]Update async_chunk design diagram (#2420) Signed-off-by: amy-why-3459 --- docs/design/feature/async_chunk_design.md | 76 +++++++++++++++--- .../architecture/qwen3-omni-async-chunk.png | Bin 198564 -> 68497 bytes .../qwen3-omni-non-async-chunk.png | Bin 263596 -> 49242 bytes 3 files changed, 67 insertions(+), 9 deletions(-) diff --git a/docs/design/feature/async_chunk_design.md b/docs/design/feature/async_chunk_design.md index 202ef0e18e..45314a0aec 100644 --- a/docs/design/feature/async_chunk_design.md +++ b/docs/design/feature/async_chunk_design.md @@ -19,7 +19,7 @@ The `async_chunk` feature enables asynchronous, chunked processing of data acros For qwen3-omni: - **Thinker → Talker**: Per decode step (typically chunk_size=1) -- **Talker → Code2Wav**: Accumulated to `codec_chunk_frames` (default=25) before sending. During the initial phase, a dynamic initial chunk size (IC) is automatically selected based on server load to reduce TTFA. Use the per-request `initial_codec_chunk_frames` API field to override. +- **Talker → Code2Wav**: Accumulated to `codec_chunk_frames` (default=25) before sending. During the initial phase, a dynamic initial chunk size (IC) is automatically selected based on server load to reduce TTFP. Use the per-request `initial_codec_chunk_frames` API field to override. - **Code2Wav**: Streaming decode with code2wav chunk_size With `async_chunk`: @@ -75,26 +75,84 @@ Enabling **async_chunk** (False→True) sharply reduces time-to-first-audio (TTF

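As a rough, standalone illustration of the accumulation rule above, the sketch below groups a stream of codec frames into one smaller initial chunk (to cut TTFP under light load) followed by fixed `codec_chunk_frames`-sized chunks. The function and parameter names are illustrative only and are not part of the vllm-omni API.

```python
# Illustrative sketch only (not vllm-omni code): group codec frames into an
# initial chunk of `initial_chunk` frames, then fixed chunks of `chunk_frames`
# (mirroring the codec_chunk_frames=25 default described above).
from typing import Iterable, Iterator


def chunk_codec_frames(
    frames: Iterable[int],
    chunk_frames: int = 25,
    initial_chunk: int | None = None,
) -> Iterator[list[int]]:
    buf: list[int] = []
    target = initial_chunk or chunk_frames
    for frame in frames:
        buf.append(frame)
        if len(buf) >= target:
            yield buf                 # hand a full chunk to the next stage
            buf = []
            target = chunk_frames     # only the first chunk uses the smaller size
    if buf:                           # flush the tail at end-of-stream
        yield buf


if __name__ == "__main__":
    # 60 fake RVQ frames with the first chunk shrunk to 10 -> sizes 10, 25, 25
    print([len(c) for c in chunk_codec_frames(range(60), initial_chunk=10)])
```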
## Architecture -### Data Flow -#### Sequential Flow +### Async Chunk Pipeline Overview + +The following diagram illustrates the **Async Chunk Architecture** for multi-stage models (e.g., Qwen3-Omni with Thinker → Talker → Code2Wav), showing how data flows through the 4-stage pipeline with parallel processing and dual-stream output: +

- - Data Flow between stages + + Async Chunk Pipeline Architecture

-#### Async Chunk Flow +**Diagram Legend:** +| Step | Stage Type | Description | +|:------:|:-----------:|:------------| +| `prefill` | Initialization | Context processing, KV cache initialization | +| `decode` | Autoregressive | Token-by-token generation in AR stages | +| `codes` | Audio Encoding | RVQ codec codes from Talker stage | +| `output` | Final Output | Text chunks or audio waveforms | +### Data Flow + +#### Stage 0: Thinker (Multimodal Understanding + Text Generation) +- **Prefill**: Processes multimodal input (text/image/audio/video), initializes KV cache +- **Decode Loop**: Generates text tokens autoregressively +- **Chunk Triggers**: Each decode step (typically `chunk_size=1`) can trigger downstream processing +- **Dual Output**: + - **Text Stream**: `text_0`, `text_1`, `text_2`... `text_n` streamed to output + - **Hidden States**: Passed to Talker stage for audio synthesis + +#### Stage 1: Talker (Text → RVQ Audio Codes) +- **Prefill**: Receives hidden states from Thinker as semantic condition +- **Decode Loop**: Generates RVQ codec codes autoregressively +- **Accumulation**: Codes accumulate to `codec_chunk_frames` (default=25) before forwarding +- **Dynamic IC**: Initial chunk size auto-selected based on server load to optimize TTFP +- **Output**: `codes` blocks (chunk 0, 1, ... n) sent to Code2Wav + +#### Stage 2: Code2Wav (Vocoder Decoder) +- **Non-Autoregressive**: Processes RVQ codes in parallel batches +- **Streaming Decode**: Converts codes to audio waveforms chunk-by-chunk +- **Batching**: Supports batched inference for multiple concurrent requests +- **Output**: Audio segments `audio_0`, `audio_1`, ... `audio_n` + +#### Stage 3: Output (Dual Stream) +- **Text Streaming**: `text_0` → `text_1` → `text_2` → ... (user sees response in real-time) +- **Audio Streaming**: `audio_0` → `audio_1` → ... (user hears audio progressively) + +### Execution Timeline + +``` +Timeline: Parallel vs Sequential + +Sequential (async_chunk=false): +[Thinker: ████████████████████] (2.0s) + [Talker: ████████████████████] (3.0s) + [Code2Wav: ████] (1.0s) +Total: 6.0s, TTFP: 6.0s + +Async Chunk (async_chunk=true): +[Thinker: ████░░░░████░░░░████] (2.0s, streaming) + [Talker: ░░████░░░░████░░] (3.0s, parallel) + [Code2Wav: ░░░░████░░] (1.0s, batched) +Total: ~3.5s, TTFP: ~0.5s + +█ = Active computation ░ = Waiting/idle +``` + +#### Sequential Flow (for comparison)

- - Data Flow between stages + + Sequential Data Flow

-### Async Chunk architecture +In sequential mode, each stage must wait for the previous stage to complete entirely before starting. + +### Async Chunk System Architecture

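To make the stage overlap described in the Data Flow section above concrete, the following self-contained toy pipeline runs a "thinker" thread that streams tokens, a "talker" thread that accumulates them into chunks for a "code2wav" thread, and emits text and audio as two independent streams. It is a minimal sketch of the design, not vllm-omni code; all names, sizes, and timings are illustrative.

```python
# Toy model of the Thinker -> Talker -> Code2Wav flow described above.
# Each stage runs in its own thread and forwards work through a queue as soon
# as a chunk is ready, so downstream stages start before upstream ones finish.
import queue
import threading
import time

CHUNK = 3          # stand-in for codec_chunk_frames
END = object()     # sentinel marking end-of-stream


def thinker(token_q: queue.Queue, text_out: list) -> None:
    for i in range(10):                  # decode loop: one token per step
        time.sleep(0.01)
        text_out.append(f"text_{i}")     # text stream to the user
        token_q.put(i)                   # hidden state / token to the Talker
    token_q.put(END)


def talker(token_q: queue.Queue, codes_q: queue.Queue) -> None:
    buf = []
    while (tok := token_q.get()) is not END:
        buf.append(tok)                  # accumulate codec codes
        if len(buf) == CHUNK:
            codes_q.put(buf)             # forward a full chunk immediately
            buf = []
    if buf:
        codes_q.put(buf)
    codes_q.put(END)


def code2wav(codes_q: queue.Queue, audio_out: list) -> None:
    idx = 0
    while (codes := codes_q.get()) is not END:
        audio_out.append(f"audio_{idx} ({len(codes)} frames)")
        idx += 1


if __name__ == "__main__":
    token_q, codes_q = queue.Queue(), queue.Queue()
    text_out, audio_out = [], []
    threads = [
        threading.Thread(target=thinker, args=(token_q, text_out)),
        threading.Thread(target=talker, args=(token_q, codes_q)),
        threading.Thread(target=code2wav, args=(codes_q, audio_out)),
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print(text_out)   # 10 text chunks streamed while audio is being produced
    print(audio_out)  # 4 audio chunks: 3 + 3 + 3 + 1 frames
```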
diff --git a/docs/source/architecture/qwen3-omni-async-chunk.png b/docs/source/architecture/qwen3-omni-async-chunk.png
index b2d98b80f3329f2e55084cd79529f485ab1f58b5..e73ca84b283ed767494ff748e1aec189ec63c676 100644
GIT binary patch (literal 68497)
[binary image data omitted: updated qwen3-omni async-chunk architecture diagram]
z>&w1b`NbEMBwjB~DM3Sas$*J9%PK^Ic1tW!4uot6=wq&v?M}Dqy}tgVlHJQKBIAiv zHaeT*phK|po9~P^N1h;e$RyxHn;x51eL`W+)93N|30e%!7@>yrs@-Z=yJMBDmgC!3 zus;}hv`=`5F>iciv;CiyT_}074Mv}tsTVsnOQGdbnqkuuy2pDI#`@JJ7xDD5#rE?k z2I5fZ+)1(${O8tIz;P1t~eqS9?__m~fN+&~kP)GTSBf}zx6 z{2VBE^})H4K@&a6k==Q*d; zG|8hveIGakrpg=W0+T`Teb&AVqbSU|j}7|R2wTBBrpjTl`r33*2g<9hYz%ca`N#_H1D7UQesEADY2g^u7LzQkCu) zJ=Yc-ZCf0jox44v>yqS9fp2gOn1-O|KAoPpn%iUCKI#mip?%#a6ZA&X1kRntnF$WIQ^@>#MJF^j#d>N-u9Ww$I-tm8o2IdeIFaQJ}VMmdT^DeEm1n6Jh$*$ zq0A`xMSX^-YQ-$O2lR@U$?cu`1b&B&jTJEWq+*xjkr8@je?^XyuRk_C#+7Cz(O^*1 zqh;B(*Z0>M7yUgpXSkRcXzlcstNR|59S?}O|2_Vhh-)hm`FUB#3NyH7e-zQDNpTe` zSXlBXz^QFPI(@>>O{#Mh#M7u#Qs9}#lynBFvOPjt~pD|QGfLUfJ^@i*Q> zwhid#Y`5NWHH3B9+|(Fm5>{I8^0CV4yIz6Zh;K@Anwh*n*r`?eG7?fyw*SE|SoGnz zdPT*-Ybf}>*ax-$b2=;WR?v;2wbV*^39p}|$gZYaU$^vSg6{w|*iR%M8DuB|(kR;yo*u%CS}JE&OH;@V1UfDJq73Tvq0 zw!>#mc=PDCwFO>X_t$DPkZ(MU@l6ecmIOAd;G?t|C(Ic5#4YzEgB1szOd>)<00$A%2(`Z=gV6Q|!o_o2RPQ{%q!i3`MSMhqN*7 z(1yKGtcS#_xt?T7D1WMnyvPHpv68Y(7b&X7HEdqN8v_E~=bWlL_b-d~%xMpsuL@6_ zoS+>O{)mjsZysldaa(LK-pQo9_Ub=5x5>#MV&Bzq9ch0#JA#PA+)Q%I_P)t_R*tMM zE7g_6*R_v(x6kH1q5+5j&_4Un-_lY{><+h-VRNTYq9M!I!1upP{r&xY|N9K# d|819@+?omIa=zg#V@&$x(h{=pych32{tqUKs*wNy literal 198564 zcmeFZbyQW`8vuw1N=T`+2uLZ7h;)e(k^-0RR4#Do4iN#R1?d(MX`~yZLrUqC?(V#^ zuRig<-@KVWXU&>5u7w=WuCMmj$F~Y{5;$07SSTncI8u_26;V*o3Q$n4qGDVDZ{7-Z z5TKx3lX)Q~rXVFIMx|f_GkIZdjDjNhHbV8PnvyEHkfYHf4E#4Q1M{P(?$Epp#8*KX z)>M9Uohp#z#w&mFdpUaJcV6Add39GU=jBaeGwqvA)d80Y(}Nf`WCo?pppKYWe#cgoL=s zI7x8JygX6Z^C*h<8t_$k-Y2@7ZRjXe&+yZ+=UxfC@kB*|5lR{#q2Q;5awkQwL8goG zS6cO6;t$@n{)AP^XkCG&du#V?z|^Bt43q-CB(@GTkyO0xY*bPDzLW-YukgorBSluP zev#LixJKO=Kvm);WF=Rjn3QlHAvZty;D^M6qtkB$+3m(Xt(OeSA4CZ+z?oa%B~@BY zo3{q?Gfdka8NBChs1R(AA^6m|j9(k^QR4BVY#bE-03Yx3r{BIQzHZDqeMasTj}5&N zjV~Xz(7f$?NJ1@4bd-H}M8BDz=1odP{)XQr z&SfehlL2GX>zWZNffb2?uLLdbzT{61qMNTKsjpu;YUP1{)gie1@-+QfcG!&&tIX8P ztt87znX@lj@E5$;L$UJ+y`P4?!?<2Y)A|NZKtTEM(l=`2hsN;Q1;1goYgg}2i3J)s zi4(aGg_B5Lqw>Y~ep6C*Y2Dj(YHD-wY8+=*v!L(J;k3My%p$D-l@^^Hfu$TVahv!P zUQ#>Jv*eWHkDl6;V_0St)8}Q}%6{!UH@H!7%dq49yie+5w5dBN{591H2kx%b8gKSf zkZq^(JX2^V@uzGu*_FiM#~CCbG|s8b4PhMEedl5;;UeDMeU^Fr$O)Hx?X9~XMV*{s z%a=kmrU5jyp=a5o&=y>#tyX{62W8W~rU9gZ3|E%}bvSj`vkSs(()rSFr-x+Lxb?2O z%{8d}S5=MD)gU^p>w|KLyG# za$k*F?b&%RB;!?Gp~Od{rhf3+YmtWj4V^&<4~==_Rm{ir??`A#rNr(NV15vpry|-_Q^`i2F8wqcT zJ~S=b6rtvbA0!7YTJH+D5UxE+Ntm-k@#obu4-9x>a8K1OZP@H%rJVH{mQ!aTbC zB-}u1>*Hg^*YA1X6TYW-Pt=b7lzZ+nB*08|?^E-$!i<0n!akSfd&?BdOk7-CoLqRt z&$+&g+$qK?b~iKaThFRi8dND&TF6Y#Wa^8`+E5h!;EZ&UNtn zPUyUSrw5Dg09uY@jzEsFMrCoAnP5Nt3(Xf-&9I9Dic6tMJsz!fB=gr;#p=}RhraSO zEe=pGtF3I>71+1i?JwIc?k<_FR4mdjb`9iw7t9`QYK&Hqxy>0>{;E95Iq1kEu$QEZ zq&GUvaJfZ`Y)2fHotTs;nWvm5x{*2I>K^Hl>=AsQ(CR)V+tbZ8LdC1sdFP zoa3C6eBL%01vnGg6O}UGi!|?G?%0b=XPHz}*QC>MhxQxo^=Wj+nrc7SHg#|DHJO2O z{`fR9@6Xa3X%S~w*|z_I!fe4T%2?Id@$2nk$3o&!Si#0E36cd4x%;^;WA<0=uLg^G zi6u0+OMEgcGkj+lJ`=D-G^5mH^~e z(~TE5yy;ovhXl6L0;&UAjSD+)RDD;B24UKxTEgY!m2G3o<1S+?V?V~YD|Ag)@>7R8 zhD-%8ZlB-A7Ss@&aISEkcU?a?ZRq&Q)B-IX{2?_E{y5x0#%0lBzVl#WetfQBTWV)` z`Nte|x#J*b-~FfQI>YYh(!p!Pj<)&Lsj5xK?WiNlUC)NS*{+4#*mbC{Ed+3!{x^aYv9${c&s}tHJm^R+nG;?-OSx?y!llRT=Yx0Ho3SyLE zNBNY>h%@-f>2My&y*7SL`{Ca8$J=)XA5R1o1jmx@**@UlW2|62w*K}y+LETqj*gguwGp=B^7>5djmRWf5^dFRl%8((f|lEf@iq7>Yk-X6DO|1-p+ zvR-^EXxhU|UKH|mR2{Pr_uD`$OU$@Xi)Gh$UU@8UEYIkGZo+RCai)B*t*Vn9-knkw z5u%T5&o!UdRWT;(E|^^|n1& zbh;u#4Otxd2i?fT@bC&b5%Lz2=gi7D4?g{d1#lN3lzUC*I}H?BcCb znmLlKGLIg;h%!=Dm6hpi(>NFz8TYMJ?Mt!uuSgWkaAjS`85-5vP^iuE8dfO5TUAKb6J$$v5?c^k5gQ)IkLPpS47z2;dssNrU8y0f znT5M@TgUpf%g4jky59QKvzCu7E+MA3J%=7vukFA6ctcfAJZY7O&|&!-n`mu3PbaVY 
zZs0-8Kw+XOUV>o~lyAeK=F52T+0VS(+S{vEE6=NbdZZkEXlaV;>>bB610*rSZRp?6R3`H-I>dIrAcYYRv}iVQJ%MY)i$M>XI_|}8R z#px7}e+ujO-?r)bnItWEnjW(5Yv$Y1<9BHJYNO-prD8SSm&tZ181sTlX1*Vmxx-AL{5gp>^ zl9VFtCfI-Ig|eERnyd_;AX4{f-b86&00$jgbkT;$!iDmVd46ovoU4k;N|6IW53UK|2_+t!D8!dX{YbRVrfhBdy_x=JT|s9w0U7= z_X1`~h1gf$0A_C|c;^n{po>4h;WT!7@%JQ4+kemk2(lqY*f?0(*)H}5O9c>b`4nC_ z8Jnv=eqjM<2F?)T;O2cG@N2>UY3T2fNK4g_mfmN-kGvFV=zo_&Y>jQiU>4v|JE6Y} z`)4uo;6Dom*bulQ(c(9uf4v2i7Qzx>yWpA-*2H_oY{1A{FCNP)gWtcz0|n)>7kIw= z`!{&LWQpeG+N_3xB7!3ISX9~R((3rtc-kjlh1V-ut&M1HCEv%3qDlFqT~0>D^7qF$ zM5V`nC(Ib_Lwxh=YqTkd$OnH~LgiPp@$cMbX-rus&nv}EJ^1+rD+>z+ImbEWPk2o~ zRa8_gXO#2ryN>s#9*40EVc=7VyhOQ#iuU(mR!i?a71ILeM^~hm|4gI1BpS(v`mgDT z5fN%MG4eUp1VCD~At=+^oDpndT zN0O{o@0V0ec!kM$fGy9#_3THB6+;@nTlA7DtLI1h4jRU80eFluogDfL+3qLsihE)F zGba8bDgs8C57d9brebFXG|4YhrUx|f05my@!4{-Of?E*avd_bh4UoQ13}_;kDT+iB z8^G)KBebR4V0al5qhPu1?q6<*Bx?e+#`y}HpS%$~wNUbn%_y+h)inV z#7zI3i{Js^M;1=2oajpwFBJ@o0*#bR*}s6HZ8HO+?5r-518}Yo;2Yt*G9*TVO#2>H z>vH=GuDhscE~0>$*oEeS7gzxnBA^}IOay45Uk5lmFR+V1qL~cfQ4G5oLXg(L%;L#0 z(!XfoFJ=SC3eUUb0A!y4@nwJTA`|=-7DfRUpLK;SAlm;Z^J;nkRWT{I4k}uOIDm>v z=>-)MR1fY_wVL5SMcjv3S&`6)Qqo>ZB$PE+z;JQ?J^-cX4j@>79+&hVJ_5nA1fY~B zkWqjOQHRhyU)@*fF8C~xd;)M9=S~-Y{8A(YkkiHJ15P*+%KiYSM^iX}g#L6uwG6l| zu^^F?22derOaU7JyZ8poq*;}FigffXP0UK)lZ1tjG!si3pI8VK_*0cKL@J1Qa_%?Eh7OjVxZm0I+YzJa0g9YLf3zwPq?fi20(f-T*BJz}{PdkDZxiJ^q|NX)_oNAtbe=mWr#yan8#bqPkkv~K{gce-MQ0N_;M=p!;@ z*@uEbojB_mI+0{35I}9th%1kfE~PgZzLKI0)PcwZ7{Df$dl`9L2Mx6o0L=4u0IMhYu}I9IDg~&F$pr-iU8H9PQ1~gdkAzi^I~ZP%Q=+*e zl8n#?g7WN#ASe z5K@FtMZx*}_b#@+gh~N;X^%gd0qDSf-Roq%l1F>>MY6HpGCGGS4_X0rh|FF})cm01F8IfSLa}c<^j^Q-@wEui}u~+{sT_i>abD@ly z`In~p&(Bg7SMJgF;CiX*IMc;D&&N!aj9OyL3mq9b?R0WCe3S~Wf6A0|Q)uh0{A)xo zLosfJi-fH&aMsA^qnFVz`iPxIM<pg&5Ps{C*Y92utVc)q;+CZ#*HjVqu+kgfQX2lt_eW z|8s}Bz_sENzUqQd^2b-z4Vwcox3HqOi3Yn|_J>Uy-rrNIZ@?4Q!eR!SS8T6;U#Z7t zyJZP3f#PR{;s(E5qBm2D*J7skD<{ub-Hv}Gb-7iU##@UY_N5C!UU29362T|}SM-}S zwL3;>LIkcm3>!x&Izd=Rty?G>lHjKc{|;0ylyi>->SgcjvfI6SqlBtj&no(vblxdV zlmN`}^8&x@1=n@)Z`?XRY!QyUGL}gNT`L~MbsV=>Mc>n$JXSe8Kd99`8iRKlh6oZp z9J6WwRAy85pMj{aq=A!$~I&Y7QW)4@b=lt>z+BeGh3nT{PWL zP2pUlLPu@#zD|&+e~}ttj8`^R(JG!M$51h$R-LWE@Z<%y9yd(`Y_36bidE)pQ(^Pp zVs1=B^v+;}(A}Yw<<4l1RJt`>`w5lR5JOUaht*_JInIS3`AhNa`t|v^f@?)T>P>}q zV%j-!Fbbx_IXnRqRJsU~)DC^v+)}oA*JO2db=BOqni_Vxg>^>+;io%(#JX-@W+PQ} zNiXC8L?XZeC~g^6$Jl)>QYbobRcO;|bFlWj*kJc$5?)i}@GMO)lzYo8j>PwLh7yj^ zEwCe#F~6LZcl+Uxp7-}WC%Vo@*enkuv6m`!fTFaTk1_Qnw=7H9T{%D5zysG*SIy&S zP8xc54P-Xbl4)+A&4$bSisABp>{Gb*#^Ypk@)LLLdvMisud&sh+Pr{Q?R1gL@l3t0 zus_)-JDd&>v%3caY8W=;w$s@`dDT5=B|O37cqz@N3sycC`JKhq`rXdnfM%Z2%_Op; ztq@OvM*CE<@lWzeIvU-K6W2*#O#!vjEFXILu7r26R+>8vR!$H)fb_@r6R$O)^TU}b zKjBmT7YUg|wc90a^l|S4JCZu`;3u_ka(dZV<#V3wL7uL%eGTWzg9+CLv#cDIbF~95 z565Z8hC!70{bi~Z`mc%XV z#?Th}DT&ETS8plUYaNZ9>4uA>Z5*%UQ5IC66RuZo)YLdu%}{ukB)ryeUr2D&S;`ka zox+Q#kP7Xu&4@jaW-T{Lv%G%Xx0d2k%2xs(~awxQ%v zyH+x~PZzMIN};(SX49{%3%7nhD(Na)|D}fZbV>SE)>GR_p!$x(vgUnPmEikfdpV3P zcnh6zu$p1w69r4A{mR}o?%E^SxiY6yJdaIpaXd%}k`^JtEW5*WwA3j6Qsg0OC(tu_ zSNIObY?%^xPF4%0RWTq~SE`)X%NL%W%n;rWGPfSjNevO=bDLw*B`t8#BEKipw%)I* zOHp8%jSus2T}<(NbiM2Satl+zv9A7u{C8=Q8ksh8YE?jQN9mBM(e2r;!hvk=wbuZ` z-*#Pnb`-x)FGP4>$ZKOMJxOj$3k%zLKVlI(uQn0SX6mgU3THQ5QmRdZ{-KIUZh1Z zy6%s{YB#fV&oZ}%5#IP84Ke~!N@lg?oYxmMyiB~a?FZBu8tuQdkLp2EtK|~G_P@a`i#W`r@Nltz}R$9}cp?(!1$H;2MN^aUH zrlH>9Pj5V5Px#5?R4g?2-cbuzN3g(lYkN-ARvC}u)<9=j(&+UD|Q;f0fpTF=8dHqYUnEo>PT^Q1T(i`H@hqy#rDosukw3}JOs;s{(N54sx z1xE)9CTG@2$mH88^DUy`ZAZ>>MGGFkOys}~8|@MHNFOb5J{j4^sF>8*t6;zm<)Wc; zu)kSh=pPNsay?c7%JAvgY=(_{Tf~#S%&7M8tRlue!xLg{yJu!+vqZHuW$O^$BFWsM zW$DC^(Hf;%(PN5gg!N5^JNQOTTs6m6+4(oBY|A?`^4bjcATw%uUg*aqh 
zT&Q(By}WN0@wNJYRdbe`fmVm+>JZQHnas_oYFRhA^uGs@-Jc+#haR{namfk3B5WAI zMQIXYbpEafxl7paYW87G4R9p}()jlJl$cT;Whaf@Q7N@_me9Iced}@K;Os!fc5de2 zET#)ie-_`}qfKC6YnRIe(A3*3OTo+8}BXCUU}O zyWcBxB3@BXH10Kqjp{NLhow_JELKrXyM#ys{UHMh2o9l--Eiqel50iaR9T)!dK-p6 z35k1!mOh3*a`T_m)3hNsFn)J1yPStltVwAW5RlM@z*{YHN-~JtW8M z(xFYcDzZ)l!LeCLVIlq^rq|RJ^$m}M$_7k^cm*s8vP=Hd%2jvhEfL=RVLfq8PpkeE z|KyR}BAm9~$Ra(IxlhM_^%XWNi4O3Q71ayGmFF^CAQ6f}jSeTmQ+Yo`2Y}ws%Wopm zRCYP8IjCkcFYJ66(Y-ez0BksQ`C4v)YfkY%rc0QSvxE({$>Bh04dh(SC1~e%UYGyQ zyKGIBV%TTA9NdCn!TlktpI%p0DhTofMja>JnUefmeh3RBKy%A8Yb3-dprH71q2`By zn_R%%f5$74kc7m@0(cw{B97hiDPM9%pULqYcdJ3zBby+DThZX5Aug zp&zYvpP|O@xZZy*({!ucHw6RYUhSh)%@u-b9hhmjh4W zI6~ZI2EJtuM)YkT8htC!w>f!XM&WZoWsp(?9Xsl6@^>GC@(sbF=aYeG4#Ug=MaHBl zV1Dt)&GSXysO~pfjFnkaN9NQVFV8V%MpZU#Sd+ZR-H-*H)4&a8!OnH7ka z@YHP0w5Zl(X~L@QB~h!yaUsYl(~Hz`_1zxHP(tl#vM1IS+1vYx2UQGOz$=~$$;i75 zW4UEH^nPT~6MGVbj+y9Hey-4|I|0m!>P|Hf(E-)k6bsDzqF=4S}UCt+Uo-%j<6H?iys11x~f6o<6GFpm$mJj&uz_8 zEEP@<$wWkr)DxM{kxN{*AQRFx%+KkSd*k_oR{yDb{)w&NO!R(vpu6&c`;BLIomf9a zJ}5v_i;YPrg)J+`b8;r)F~VgRlP!gQIgoc>R)WY=W)mA}gJ@U(hb(u_uAr|*dc4ik zIoqts(>0hUw(CZnC0p}v+mHx`&e1S#aM&uSfcRJd+F3tQ5i{C&E0t>({ngsDe#P}Ab z?+pgIY`5EOaeITh!eb7j77_FYB80?jlxD88?<6uU$H|xdv*#i;Sq1io4AV?ze&BWl zG~(GOT9l5LESHqHhz)sRdR8qBc>ecfzl%47C4?|@#rG{X{vC%CIH=SHk$B{~Em ze#WE@mCRdbxkw4 zBL-AJ9K6JdT-nIK#q$=jsS{@vggR`e>r6(~?51t6_YKZ2*+TZhH3t+|xRegxg}8mp zD1n2_fHDPlpTP#;KF`2*9wJ@Px=!zP<1@cf5yv11)E}>C7~d*PI?K*nJM60# zF9B>ngS6lf8o7gDH8;}}QR8-wC0T#(owvQBCpxeobWomo+(xD@mPoULWqrHDJt%K4 zk;rK|BVDDRe?HoGiKS~d$ z9{OKV*t1k|Nbp;)P6g3{(%Q4N%wvurGV_2le=lvg!rdB5=2m}I+$N?k?JbQ5VnN3F zLJtaIdWZWh!si^tF|(1iPlXLV4<MrB0N)Hrey2=!o zC`}@4ZH*DZZViZ5sx7O%N|RZN`xM;~-m%#kFm0b;TTR4yyQyRFlvmlF=Q~M|md31} z=?Ie-qaZLJJ;-&e&dT&S5)eaYlDO?`1PsGKqmk%Y(GTTzgCYAy0*T!wb`A?okgPbC z<6HVzgGk9)IHobH%T&Wabe78pzacO1YoCt_~0qx(P#LbGj^iP@GQUZM}VC&9Q$j&m0)g&G1!RvOas$R|)Y+%-Ff=JzbCjYap&YY{~~YKVKvFyzh?saTX=_Iz`hIZ@_T*7L!BA zD%9+ab`R^W3Msh6>R&ha@jP46O><^Z^~Yl=JKgROv!T-~Z)HOAd@d5Y zOG!}PH>9RC$7rG7K!i7$n?`xn`&fvskL8rM9KR{xce4sUWA&A47`}pCV8G9zFbuMN zV%HpfN2Zl<*lG@^@ssMJnFVw+CnQ57wB`s6)P8*K1|Inle$sl>*sbgBh-zUA#>(b4(*pDpX1{|shaj@y5f#HD}25cqNt`OM8CF|UD$;M(v#4UT;Gll zjy6otd#}xBlMgA*kGef|?wfzC;X~F5qRD5(Uc5AyP{$d2zUBBB2RN@JfL(V;@6tj7B{FqX8k z9}a(1bo{J|-LW8*dp*l_RL}X^R%AzNFkdEYe|0#mAyUhxt=-wTqB!3z@zAB2)O44n z?B}Zp!;ZD|YI;@GJ@*aQ$rVqhxieGYli?cYg@n=+hcKq$V}yTK^L)6~0oav@0b=PD zC1&YY+2XOJ1#(3Kz?Pg$AEZo_8#q^S-NzjAHJ3-!cmBi?BTPU2vu7EIKn7|Uqmd~* ztT^*&5S^&<;qhSYgjGWur8~8=jb1dLz1dte$Q0+;q06RG&B;~;ZOq5PSjUA(VAO#i zVdewOCbRa}qUFr2NK0(#L{<_ajpr%viM|I+_w!4I5?Pi_`GyL7&9w_KxZW3Gc@_ZD z6q=W2`cs=5N*J>K???pmg<8E%?eaT4VgrN za-(->zG<3R@CDEFbh{425$n$Ph^#2JPJd*!JyDo_#iJ@lc8!)%QeYn!@L_P~ZW!kp zKccku4|Tl;Kv_{^CKCb*`;t_-Hy}CzW;~)Qcm$-37jwYFeUMQ#uK!E`W~l=+)KLgq z{mUMVA|>!n&SC-7^O!&(NSin3%9D#Y_@xLgc*h%mQyUEbM~zbfbU~?DFUWOc;{D&_tqKOogOC`&KzlRgweMZ;pU0%Z#y zgeUk3&X8cII0QV{|80RA1q>S(YG{^tDWVS^eG>R6kf@5>7eE85nnB7vTS!#316>7K zqb7Y$fU5sdMGXZR0VW#BPH?xWpn&KDA4ZT+L~4`kDFy_{P0+ZEJS+uPgj?Z)S0Y&e zvK*oTAtW(<0*a`F7R|e$8zd3Ye(+71{4UZkQa8vNqJFJ8c67lAV3{icU>lqtJwc%! 
z(Y9ooK2e8M0sRa(m3cWe8jv#qEVg&K5|325^r`}#58DrEjuHJHa8Mj^LylUh6hJ>x z=Kkheu(%umHfO?Uhv=C4hiYE&2+6rmP=RRR@&Z_U-9fZ5{9$3w3V5Xhzd{0nc;qVb zpVheFXg;)!BJgjLr=Sr0KJ*NU9Y{s(yMX{LcBQmbq~RhdaCPaQrvvxB&I(Ypflvt< zElW_OwLiH=P6_%15glv}%XaZdfL}y|pqoh<9IYtYf($sKfzdMV*$nl6w_S*X;Xo=b z1j(|&(dq$a$a(k=AWh*f>Kw3}C@504yRdqX#I}3o*0-wg2|E z$BS3MMpppVSNsn_-_swgMe+bmT+Yb#PQhxVwv#`0l8+YLbNs3wpw~k0E@-<@c(jkq z4x}y^c7S2MOhhnpSH_>5DmbnI@ES)|2GJPxXYUG7d+%T4G;z54@-KRwghVQ@Xy9VYLfXijfk|~I^H&O?$ zHX!KoNCp=>(r_%IRRwba@d~MX>CfQ`4}i+(HIeZIDSA@y*ez4+g^cyzrj8y!$P6zDP&kwvEVlggNaBLpKpi33LK4s+M|O9b6KQcZLZ$D)UT)o>Lhe)hqcj#n zftZPH%7GleR|>F@Vwl6o>_BRps|UIVixa|!T-1mID=c0)kO5e)se)^*#j%Hk)}jM| zwzNBci-1Z&4Ls_n%e5d4BlQE-NrFpCLue;}1o$|jv9pAf>-K*)b{YXWYR6FI1cKKJ zE>}eH=Y1ri+^qm;EpmQ_gR7SV9--+2bx6a{6@iS>#y5ahJ%8IB{~RvT3($HJ$8iR_ zz)ZoTAZPy-WUP^Vg-QVT8j&NgmlwNl1A@1oTw4N~P!J)6!x`LGNW;s3jLUU*5pB!q zU~xj?r3ECDiQJI+vlh{@hwR>BC<7_W6m+`-7Fz{bKL`?>h=P*LfD=ZR-O`IqN~zmXb)z>TCN8GMoZLXG0fg ztzd@^Waooyoi8YB=6&}?ZpXl1L}*m*&QD)JTkKsg5LDi;f}d^RA@Ua)lL5>Hk|1!x zuBVSPS3Dk*4CQj(o>e3VnHA5aXu3$XJ6a&)so#LZ&HUb{3rS&#BqQ8Xny1S^0Fu#B zPu5Cx5BKsswfna_usk#TDg`5e+&feGNlg5^MRk1{O43mEap0k5%30Zo{;#;uyAE4L*7m?WU2y-*QqmtCqt@}{P_V|N15R`~a zq5D?qMB0DB5}v#A$0PuKQO6FZG}I{~8u-`npW9V4uNV{?K8*MNbD_ch0tiR;0r@eA zV!R$FzuPOw4w%bA3TZ^x5O&adASebzJP63aF*d?Jd`Q@dV1r{=MEXHF88n}LE=5c! z=;=rJG=J#zw>1@b>WIWrJtEoH?o>KzS^29~=iiM97`tIW$wirrssXw9SO@diiL+YC zDfLBUliZFvbXjLf>=3=u7yE+ksVf%hhPI${TED)ba7B?FO1gU!CjegeRDa zGhp!f@0(F8=Lr?CGM|;~B4wT>_$b|8eszcd+t^XhkBk#v&BL#L6s zoe05M+Ypl^O9I^@OC|-WQfQ zqaNF@Q~E_!uOL;N$#K`wzJ?Ov!}ptG9k7q<;8Q_R+yHQ5#ReOKOgwJg z7WMma1LyASM=?Ln@O$RSmzw4ms9RbDMI8 z;tDS$IliWZpSQ0j)X0e~N!j955NU>VS6H0Iy`dVAX zqvbr>fPk-TL*gNB(x^x3)a_6IgqJ;{UWr%*=N>=2sm~<_grwBAr#c5e8xR{otd-sI z!QT%~MML}9A!#4Lw;>$a+^@3rts#uqptT#PmcWQh1_F5&3Tf)?@pC{|uRUCrXb{h0 z`=OAe3`{;kY`3XW|sP0IP_$G^4CtVHq( zsC*kVX@=$2E_`9!qd7Ry8#42bU^E7u#ZNlH7C2I3=&!PUqca&X3o?1g&VfT8otX&` z@=r~{LkyUK0W(QoW^e*w6zQ7|oR1dW45e+X6$)z>u`lc+K({yIurywVjt4%8r#Nx_muf0`0Re- zLg%b7YvIWJ(?mt0q>2YtapBjDmWyI82gIGK1?7l zGd@ZPw&><`Ja*(euX(%BZa}`uPm#3A?dtHb*0Gs(CuzT^g2$KHbE9#?7ET#LM7@$?v;Bh>!laQDft9_+C}Z^`D*=U z-Hqt0=$b{3&~<&+6ZJ9%ct^LcJ=JNYLiD-psj7oP|$KJ)TR za^Y~B9yR?K?4fL9L(g@%LlY1cJqKs+HxLT7`pq1_?2!rhoA*s#6UA7PSAlN2Z6Zqf3%9x+5N4zV_5F!T@E5_SD=Ja)`BMn3ATkKtoeC31oQteivN{+p zvKz4>bpfMHNoUIe@qxx8iUfP#pTF zWgHnNH6e*=sZ;%`<1Pvr%FEyMC{M>3+8`+QnU1P<=y|4WJ z@0k#r1AO^Fc)1){S$KidEcoOuBDJlD@Hb(>$5BxHTa_1NaS;Rilkb?nM-WKC4X|)) zp$Fq)SR9cPp-yi2#R!q#84EdnSzX1$=G~gj_Feo>-OJdVrhe>(a@$Y`-ROv_`4{AZ zjIX!a;6dc!RgoS+SUbUZ`YD*H_0he~%P(9{?pvr}u*t7rE1B-o?GQ2A@^Hp9@YFgF zhb>9ZTPol8+u9Shgf zC25Pm!y}#Zh03EOVZ-^SHK{w@T$##=X;SGUvEeT#;bZYuE8&F=hHTxKc`|F*Y_1yWS^z(>!YH4u(_26~y z!-KKIFRAsT^ubu+M7Ic4w9RFPtYSFyQ)UZu&QO1Z*LJVFm^hTN93{#*yURK8#-tu< zQ*fFoFS(M{X2EoyE2{|~C4zR>=Gqf<^)>heqL2uv{Tz`o4IIG;UXu1mcBVU;+D<$4 z{?<3^*M4|I&a3orrWv&jFCLo??p0~E&E%^!UQ^9W6I1&l<$)iJ?vhGx>07DfVyo+t ztPdO|>!pN6h@6GOvl*+x(6Y9=qYh(BMcjBww&HknPYdaeESrTj{YlDdxM*HuZ^V%1 zIytEPc!pms;0=x!TR|4^*yL4A4L$>5w$mzRriO2a1Z~o-Ee@@GZxxyky@8;BL zt?yI1w^!2_C6SD^|O3gu5 zI!0>dm(U{;+{MUR%+m=IfTNdz}7P*Bj*3A;5hF;`dwTpBQJJFSjI zdpBOuq_Dk)-I%pjv2ZoqG9l}H#^3X!z?p`w({T`Q%FkgZxIUy>mmRYxQd`e1TJ2?4 z#I&&5>DsyaRJUWdIZs_CnfW$#`OEv}<==-kzu31BZXXz)3MJZ&?{4;BcWUe0f}{D1 zwyl!~;@J}GbzSY!B@mLA&K(fg2qvAP#OpHS+f9r<)h1}j#>;-l+Q?KIP=onTnjBoR zYLGrQYmjMSqPZ=P8p>Vyb2&l0Y#L8b8n|i9ypx}E5AjBwDw}57a@g7EMU4)`)%rOr zC2gSH_jf{AXCdtUG)wF99PBaMCb~|Hm^l6(_NFwPgr0J&?%d%bOcNf_Ei4o5&Zk^I zq)U5^duyA*g{ASvY~0lNNNt~$kcTt5%2ip4FGU``+vg32XKlnVF+(=9euyuJ?rn9` zsp_BYwl7V!sm5ZHUEMd$nT;G`vwWx;9nH+EFe@7hnH~P&h!DR;@Lz|hV4L#mkZxHO 
z+-}E~LirvrswMTIxcyFQy+B(I^vQUl?4wt0ZJvvcD-c%iPU+Jx&co;uesuFJ>!&ym zYd&naGGWTBds@pyM`q^@LP2NT%$ihwp;1>&9LMVUmTqN8em5k*<+q2{*uSp6_ikmcL@Z2Z0^kaPdZ_FEW?W+{+=oQGRU;8pm=Av#XO%FfZ z+RK^83izfpR?7)$7alZ4ze}DPzzprx`POV@x!d)U zX8Q=f>Lw=cTF<2--@#X8mYxi>0Us!9w6D+8&UCIgwEQS+;82x;No`UN>YuKkFsNa8 zybh<#Ydr4KO*g)65K+l>@5sZ$brjRbUC)auOCVh5x8AG)xk{GkB<`@{_v@b2&i>kL zI?YNgoD-kGM^lXTM8vHgROR2(;s)!<)&&}i3svNZbk>`TyM3X(OULr`r36ATW6O*N zXWcT5>Ez8DCR()BN}BNjNxEEuW4n*H*+^K%h-7m`0`%po9^Hd!x?#<5v&IbU#_9zH zG6>z4m>?tDH+PV2@>rPNSt|Jv)lW@6$MVXF>tD+5bR3jf`PVzwnx*>6Vp}WXJhe+>fnCa5+qrc%^Rdj2Pohz@YomLcvD(l z@ic#cQzxfLWqW*8x#u`f!X#qgb>^D-KHY|nrTGx-$0^I>-GD69*W|aQS*4*{C4-gC zeh0JxKSzDptVTLH>(koAFh;%*+9!gn@Sy|G)n8ijpRd{@h^E(GJJM0N2I|yKrr%LK zF-e;xDt=;H!%O~bu+e-f3S z^T-zd6Fc3PIQ-~E3lS{!)WZfcP|(IQ>O4%(P+ER_Kk9|Wl3Z2OW^~B6@h?3EEZ*i{ zr9eM}AMNB1rzUIu5*~Ts^UX1RxAQ3btZ76zdidMs{`#u_x!H7tz}C}jrzfDzR7C3| z?|DA2?sn^Xm4*E2L1a$V9uw^Bt^7wf*RH(mj<{jTlO2tMyF1lwNo?nKq+0iLV~QWe z5u?F7+zabBUb^p|>r%BZREKB~2Q+4Y{GK8snIrht!&g zmpRy+%-`K#n^_6Q=FjG^zDE33xLZqqW4_an_Eb z|9`Oe)=^Q0d%Li$fV6A-^D+mnTT_PRB&>i11 z#y;=f+xMLP&spF1t#j6z#ab@sndg4)`~KB+UH9)P4rAeNa3)E4>%7`psAp&L{mTNf ze!(%G|69H;%5Vk``K`@jXWO!k+p>Ac!_z5?hwPpVpJdZPd^aJpT2#HN{8gFZ6&zPi zerO1RCI3D;I59`sonOP$LgkxgL9-AAyy9qob;Ai_A`O-rTZ2|^JwC-^Ck~2gkIuv= zGpuO`{$`!+458`T)&T=ly86cEg)i=F81|b1;}&9cXrOERXk%HO$v7+T{N+TMWJ9B^ z;&9r;eSDoPsflTAV3EQG%X-Mt$EdlwMGBecW$~KmuvyQRVfYlr zf+%_*3~>2Rl}~@3nMM&oo!?1t##(J<=C>&qAfrVM$FWa1?mCeP9JuN^pR0;2Zohr< zv5Mhpve8~7S&VPGbSd+4qoc5MXZ~j8JQ@r1)>2m$)nn@wYUIt^qaZWc<4HGuh}(Qc zgt6a3u_;8#c5(~gq~`)$Qxv4EF6wK8t2_^p?Xx7I*jh#7CtoTcDRv!zKuGSH#(9+lGlWTuwue{1{%wYczw%ICIzO_0l`z^?AtKQ~qfK5b0o!(S zpVY{%Y}Z9#C^XH+U420g$Ce2o!}mh4x>(0+PP%MaT+F*_Eol;<)xKKVU4>Coj6B`< zVuXihZ!0?FXgGNJj^j2(kbGmlqgAaoYNIZP z%_BUpOi+Fn%#W?#TT|6{A3&Ah>pZMAZf6smbLeIK`$(=01!8=xY1m%Xmt`v@d;edR z&_m$B?;Q}NZvIjw)^fJS_F>Wh`bdP{8b)9JK-$%FSu&j+^acF;JQ`;9lHNEXVD7n? 
zk%;_t!qxAp4ilfK^$NeOU`?M7Z+=&Zf%eF_r;53kh~-zNU-Ah~AAF*e2^-0?*5F2X zo_?x=ndStj3FyPnp85Tzu|&~IWLnox+$= zG=73FaVR%3nV23v>LpOwZCz-PpG~jv~!U>r_Jfb)jQd* z8uMD{Lr4flLG=wHQtq1batavK`Sb?2D7YIvh z?y|9Uyb2n9X5*~pE42(BJY~(n3T&Sgo>;Wf!|}isKir}m5`eFYS4jRUAAbrK7Vis0 zPM_}8YeC|N^93y;7CR&m)3dPyL|d-CbnKwG;+pZ*Cv(T6A8&&WN~I9MX_17&rHF&6 zXsJ;f^MCr`Ht{Rxr5XKQPWBPfC#I#ho7<~2v;Wb5s~^R;M{FH6JYlGR%IZ=c%Zl&i zco$@|^@DU0hO6NK9iU<>u)XcZm{L&w)pu+K6AfTs*doj#PIc%gl=TCPf09{}JG9tP4xzp~%Cmxl+A~eRg zOp<{GkARH$*hn2bA}bbgo0;X=a~7e{U;m3jD&>g2z2~}9_`m#F3|Rcfe7jgcJ|Zg; zcpEe!546wYR}fKij5cLWay#pO_f?Q{$g!e)vwVi3vi!ozOgn7k*a96!T&47@1Cc$7 z;m&9T?p*!c55O(IJjehZ3n7LEgY7*48yUxqhEp=j(dhx8Zrh3|@Mux-KkdtCa+raLS|vUmFi2%!c8oNEUHB}@Jqis*oPrHptY-ET z!SIp?aHRNQB0zegzVbvwjV#vt0R{|~^cYypWMFn!hBWcOnYoMkvcM9$18s=@UtB^I zxK?P&sieao|1d>UX!<3p?cAMWAZAZ9Al-mv><#-L`|!V;UI5lq%U1-xj1D~JfrXPf zuzy;CfBye?b{G7I*s!aFw^e$s=d(9CA0Mt%=KeFjl~MPfn3{W!AF6{7%Ei(Up(Z;r z$Vzr(V&MacFGsp$_7;hJ8l^d}rsx;}7}GLxg&3GQW66KFyd`s?{om5uhc&2JOoF&5 zQ%TAXy?Xgj*h&`Q{2rjgeABt?bwoz{C0vTAfc@?Gm;K$iEc*KlrWdMNPW-b6|G_4W zDn9A|eHQ%9BFb4{{wL1jpN5RGc+tOFJSdWK<0~W??`>bACh_r8l1-dt&>}*VIBtGVQ=dX%$yhXvehLj zch3&d(%VCd-R0fNmctF3skC~cDrNq3fJjA2u_EmMYSCqnMMhoD*h#V)4quvn*YdbH zFBzVkx*rhc_kfpLt(A1f+z45{6UohAIY)}60cmF`ZX%JeP-~PUo{gH^gk+1&X2fqGSqtW{^ z#b=k>AxB$}de53|b($nbyQ_3Dt{V5%=Y9OF?iUNV=*3!>F0~lsTM?^W^OA!VWe)uFykuAiBGP4UFCKmuec$CkHL)2_8?otB!YppNpoy= zDzJ`RfMWfN7j{8|kvFZKzncj-^Nmy1D}FQ`sO3p7;Ih%Gf4adQedw@YfMgf;RIgsI zo)Oi4;{LsIdw0WU2cPJk@>b2~@-749O&^KFngPoEVr%*84FyU1rjUC(7hf!c5!m;S zwD*UkZFtxY+2DDy#lLhvlw&Bv$(HutSvGsQ>68p`Ot-@ za&6|5Ps9YtkZs?02keLUtm&(WqpPeiU|YOMxA~&ItEy-9is|hi4;(LaH5}?LA}Foh zO{d4s&5F0DFUBJM09u+LP^092vcy6L7AAM-RR1 zZN2Imfav^cod}fH{cllr`u*>A3LK6$R}jWwJ3e)dx^8*dI_GP|Gn=`~*XUxpi^kh49q2eIGFIMIR$=QqR>6Q*Gh=>@DG4!0fqOy1i%_6WLp~ z9HNJ@;dx${yHbBlLgH8b4l)gNnic!4+4ABBXuWu`|7NzZVIi-)3tFUau6F<(CC=6d zk=9zo0P$XjJb0&R_89(rc_EX*xD8N5M3_5|{gp2=8`Sx$Ejd1iyZ(sAXo(V@DSVdd zY(C%HB7DouN&H)i5f4b8xfmJ#n(*?Ifg29HKg@f77%RMQQpTQ096hzT z5Y@NS6BIg4e7TEYt#vFtvO7w8kkzzfYucS}QD*JDLVkr-(0Oksz-5J^+BNp_G$@l= zL=8lrFa}JuU-5HRZKofNnfPt3vrYo*m0;E~g1qPNu8f+ncrFf8rQ@_Pn18khL>adw zA&^{{Pu;vK8->5<3kto|e#qn4{pE)~gb<9GV9|dQ5IXEGdIAak6eB9po>-$~4q2p- zw4z9^IolmfZIeqBwm21L@W~vzdh;;q99eU>2vhxP-EO{+D5c_VwyKiORHN= z_0?bM1;AAYrU#fC>9{qov;e^?|Izd$zX#{Hsmji(Q3tIsg1hIP_(vb}L(ag?mYt$p z3n}Ga4>MC2Y8yHR(CpqtoZEx%7%_Zj9g**~=(lCx`X)bKK zlE?%M?pK$?pOb;|+3)`Dvtxi&Ts<(RK*QGAfdDboTH1=vor$Hon^=Fs$)mjd?dU<% zi9u|X_6c4Umm_bEy^Q)=3+0v5l?4$IK6b9|4P!# ztT3al`N4E&r1GA*1>{XNvbgCUyP&I9ZjurAG>6Tjb2J;5hi;-DdPOpQKNMUY9@H$p ziU`%Y@>_IswTlJf&UWR}7zWaLPDp>YH5#KT(NoGuEMe2m-T(U7o2hu25HvpvuCX_& zwiU?thV1voW(8}q6p}SH{sby};KaxXZPQ;9>~5g+n#s8wQX+3ooCoNY5MgY?K3jo8 z3-f9bwP+7quu*O<>u>Z7Xm89UbAL}Yj7zqJDh~tZd#&@t3RS|<{;h-yMQH)g-FEGO z{OZ}7rT$gYtj<5HAznHCbs7GpYCVa^Zul+ImS96kK8~)C$lT|Cp04>%9J^PCJz5wa zfH5V_S8Gwah+S<_piJKcpp<_t)Bm-0Fi&rQMCaKZ?C9NFXb%RoLW!tqt=zYW8BRR; zxQ`3S_j^PBy;|!Wb6-WZBnILts%fZNgzsPD>c3R9yId5m!0{W@b*LVaBN=IbU5$T1 z7XE3-fVVV#{8w)Y)Q0~B9$?nnOZ%;k{@MIiju`!a0*3sTA~b{-vHPz5gTfeT2}(iH zax`80uL#qB@~Ffqq5sge-2A&b;lGPE|EI$T{|6&LoYub*r~2*fp7~ZuD@@K=cRrSq z{XG3NqcOCurn3e8{qda^;$zb~1yZ#OH5GE8fLV_?@-ap>(LcIkoey4N@H-*YN&}ZP zzfHMs*K<_qTNmqB$}>HIrA}%U#^ttL{$y#Zt0gFOR`WDDgjkS1E?u?RfVqjErD682 zFIPMlpaH>}Fe}n08|vuvE|6d6=lN;oCMbKYR#lY)q@nHp!az@4+3|rpfOpsh;F-N{ z89@EKzA;~me}ox2XYaZGeOb#ObGui8f-G9ocxw2<)W_wX9dM4}^mO@S-K9Nvcf%iN z4tu&yL7TFmVXvw=Z)%GO=vI2uY0SO1W!(IE)Q~sehl_>Jq^j4A>-=IU>Np*q1r=9{ zkDG)VHs6R)lbI1Nc^$^}`Wvb#H2{WCVxYWG;9pIX6Ts2#m$LC$lx?3z`Z3*1aPw+y z`FNWUvzS)>@j*|V4pXRlqlMG)Tnja`rAjW(c!I!3zTi6`b+RfqRev6<=6Wh$r((g2 
z7qMImg`H1$#xLL{lXn|cPPCD7dN|uNPK8Pb^%%fb@4pxY<3D`4GYHX3$a;UM3622(^FzE-GBh_9@9cnF@CoXBnuJ z1U=d_=Tjsh<-ti?#d)%qt$gRU5%rk!)c`9MPu?$ueY~RoF-w$zybiFN^_|xgpI^1>zV@zSkfaDfC-d3D2Rmj z>ir0lI$6V!IpiLys?WRLriEa;Y-@`Od$Y%W$<0!?!=~ymD6~k`ZAuHTJx)kau~OAC z^!w24TIX5(mX{}SH*@>DumWRLF|4O?KFRGoBTN3uTi{az1hRYH2&8IvG&taaTH5)D z^!8bzW!-lHdHSu$-IPX{w7SF|&JNHbC0B>B`d3jf`^?nQ+Xz)b^F3vr7MDcZZ}<*O>)T|mBWt-P5i zO1^>vBA3FscK!D!?8P$P{>Y@6f^Ik;e(qp3_R!5|z}dvIyyMcenw1kZkda zov>mWV#`4Z=tvINP$D;0HCZE3bXFMfRsPXXr zcjIxepSiEexAh1ZPtN`QZ{G~R5k8Bt`8gd0Q|i6_6vgyN_x8#2ZMQB)$({A7&)Fuq1H!u(xi@q^EGW9++=M7cAHyptB~n^60me)?p3$pB)N#u1Y9=~X{; zOP;(0GaF9ojsDKua?L5h2r>(v4!=1qk%^$z+Gk6nt@!2h*_V-#vG4`el=Jt%y4b&3 z@Wi#C?rONKzVl~7Uq`%%CjhZSo{CDjPj6U0rWTNO?n|K*ixw_-&-Q`u_DC%~zyaC1 z-KCZxSEv&wH18P1EnM3_%&b?W(w|p38llL2Q=z8)@Y_uNOA?4@0#?y6bPgLsLDI6_ zgul(Gx2L$_;0Y~sA^t+mM}$Jv@?G+i zD``8Qjc!S#Ci^w@5ww+XDP>AH+Db=WH{lcO@vQR~?M}^{8`c9oLG7kXJVL=sVRsbkD#WZgiY6sdtgG1 zoUm^ly89P=Ij&jph1)V1Txi~gIxL*y#`B~;Z*89$Bnf9zb`0!L_GU8)N@Jk=wy`rV zPoY#p=&^5Z&Ah-tDS%Q1UHa-!;i;yRi7?n{&bm18dver0U6gsPk+fhvtbC~664$#y28~P)G}hX zIsO=+*9){4U=Xc8E{O2!Bk%|K(`)#&`u;36WVfMoq1ouvK^4oJTT$i9`7r^nSm)D% zEf|kh&A*97Pl(=*Gn4?A9+o&3U+&U9uQHC(-`iC)5yPB4?%AM5hIBKP&nOa&2=@fC zjE3BVC@j}m%+}e7jWK3E zSw^Y3cTHC6`&Q2zx!ga>Oc&-yav0-Uf|ZOc<#HO&hf<>RDebxgS!RneXeWyU!Xpw^ z_73Oj?Y%#jEoOF~biK~an#uisvrragmkC7$J!luWpn^OL;`!d1lz~hdt<}z*q{(;< zv`9t;n@-g{?o!jVwX$Wd8ESu)p`0+BKjZv@4UE(Ci>*95bV-(G+YVUeN0Xo%V$s^C zIG|nUJX+YaR%_M+7*|;VIk1vVV$oWn`1o5pre;?9$}zyysE89tEf&LVi)@vYcAK{> zt~)c|6^kY?gWtSCdF|)td(btpX!e(Pj;LVwzi8YS1*RlhiW{{w*HKH;NzL4^mPcuK z0~h2~j&d?eSHs;T{&4j2WWcGMhNMVtIFj#~I7Wq^m`syc{-;g_AFx6h#aB2WY9_N* zN?7-iWXgb@$w3GW`6B}vqSRLxlWyN||KYq#{D3ivZvjj8@0*7Kh#k?N!zp26D#L*y z;5a0y!A zfR@j0Qg;b3XZ-Qe{HQIPfekgnO%wqp8UxGUf8JW<4JuY7OP%Qf7 z@;&@%A3CGRukH0k{^#~W&E>zxIpA`QKxuC{o}wc29jH;GF!1D+do+g^O zASdK>mR+{OJFK*{w8*A+HeIqx&lei1>+F{oIW9PE2SpF}T?l%?cVl9Y_8IA58Y%oC z{&4#cWjf)c4PtWKS5#C~(d%eXBeaAySsYN7;zTqKuuEP4Ws?KZVf1@3A$koKVluM1 z<)zQjIa*dEoR}+zsOQ**x>&j^q;KepqTZQup^Dtu5}azkr8@MOlSE)J#20c2K`2pfnLp6%fvHvnuDi~Xmuneo=-UfDZ z!{y_pZo4bZrnE0;5gG?U#{K3`9oD#+H1iTY5q)T=7<{gP0bLR<8}7XhHA?0ci*`|y zLi_E49Py;esF^d>b;5ixLJ{RXryT=VR?`ID0X_?&oxey6Zk&h8O(l71j%IoUu>jZ$J5l;X05W?E`+?t0Ug6yZ)B83&!i2til}dNhceZa6ao~!%~x~KpUZM z@l-)Y*F{DKt>!cHc=G(2Rou=a`EkwWHR-<*C)3=3tJ>y2^sF(|05~4elZvRR$_qKW;wI zs`HxVZV!+GCkc6e96fJ%-*hvs+@Xv~V4G5Kxcat_Dbjnd^xbmbX{Oy3uea7{VMbYY z$X=I0W|wk@Z3pt2TT=-6bl1WO2EoD-=V8KFQU5VTT4K%KfT*07nL~)fgH~aWLTGDp zYw{W|Y^7UtWc`_~cCEWXFbfgsb-h>u(9X`$D*STB{)HRQMDv9U#)B+_OlIeE&o=~k zh0rKIb4LSY=awbOUDeqqdph#+18Xp8x}(3u%4PX z2?s_#Zb9k^Kz!v%g}(F+Ng8J;5#-@a^?P5)^TQxSM!oT(XGufv(PJJXl#+|NU>bE! zR+`x>XqcGkfg@&XW4#P{g;0m1Hlx~i--T;!-S5uuLOy;jBS=L>Y_D-i#%C44Bw zYTfP6LjOW&{R;2msLvoY`=hUq;+~_zzy>lj>r>p*jiVWHwOMbUwGBnlgR_tb9>3ts z_T-KE>I|d3tfBAf$%|tr&*C^P<~&XX9m8Y8UN{6!U!`X$|! z_AhH<;!p>qE~$s`Tiy%adBX-WdDSdfRQ^ChOPh_ziho^Df#O_WTBDBUP_Vf=>8oSo zp3@%x89O)&JuwB7{guq-(H_q>`$ccu?E2xui?Vmz*bZxgVYNqYDJhW4t&RONr-kDa zno!{uy5?`8clnH$RvLUiWysIW^)Jkd5xP0K2@PlvK`j)wZJOB9UbcSYCwU28)5uH& zvT)gukcZnawy;DB?aa42>wG4+qKaK191thyK}&P+zGBnXH?f>O#Tp4R;=R2afwDxI ze*9boFC?)y=f^gkZk^36x%0{(PO+;pLmQKOt@}<+NKQ5KY51kp3#5yicqEQj=1<1% z3hp@jzQ?N5j;g4+P#Wb9Ozfv(Hgupq~==lLo4mJkxT0X}!R@wbQfrBmAO= z-Vdfs9};+O+bD8=CD7BZ`g{`lwNgQcO@0w z(y1u4bLG;%0H~=Zh0JduV)+BZ@dS9Tb?9zh&)bn;MnR{eGYZBB?QSTm2&e-)Rmt1 zO8ONw|zTL9TrlNRTFHn z=no$g74)dh2mQb9FC2$92(d7EJo6B85K=9=#8FsbgN>)G>>10WcbN?4-Mr%5sAd!e zm*S?{I3XWix;)&Us2cteuyRZjEJk={*Kvyve`)g#q-~Z$IEuK^`IBCdXAU)N^lF

L*%%s+?`k?l0EhN*sn}Zn^Qg9i3nKZ6yZuR5hPIZ-DopPQH zPu|XX?3wBPUI2R#8HvP5AOOi~v!16iQgEye-mCHqlu8MR$4XR~2vl=;MPGZ58%k$( z)8KdfxOQV57l?EDs#hH^CpmqEWpL;9o82yNH3SrV`zGd^Ye^==6&lr%HNQ_HM`GtDYa_P-$|AgOXEXMQV&dYdcOg6VP$=nLM; zZT>83PZG#am;Dwh7=2t!mp<1#n{TOF9rP(XkFyx=Jj2i0wS&S94AR>df>##@J_yLo z-Visbr(fSCWN9uA_Iv>paHWJt^svf8J-K3&@DZxk31n0+&_P$=1bv`w1hOYw9AfHe z5x-q<6|y~`xKc-ln>IA=a1-o)2pG!I z-QyN$8o(wH8=aV8Hr?5lv^Q0)e(RPu?1^L44T|kW=*T5N$X>>TSS-oi!wk(hG(1#N zc{{e2@cA$os2u$Z_{Z4#r!Q6gp`OSXwwU*C5 zSLM>x#Uhy8%5;^UdCClP?7jHZ6jZQFMU|$KCI?{}D8UfV2u&NBis^?$oSAqy2UM~C zgb@rP`UdQ9K++B9Iro1jN*RxCn5uzCR=L&2U**#uX2AU;2|yNw=|`@@T`7IE_Y#_w z>t)4bPD>aCH`G{ulmgLru*z=CHf^2-Ha?sT+Q3qd-sC%|k7&O?luZn)sPxo|6PS^V zh8rk{H^wHP8;$Pt7qL=g%xLj!cV+9kM(C0Wc&7j{LXuxV@}{2q%NUKGNypDl5|F;B zrN>>X%5+eNi*+M|AyzV)+}g&SqC+2uzs$%)PFdgFT6(*$+__J&+!7ni*Ig)X++Mid z#(9oyeY!e)Av}v^wtEf}L<1X-Ia|v=!Y8p@;Ps3@M)Tk}xAg4zHcVk2KE`581hV6% zRbUYCbQV@f05WeOyr5wCuEPuP3H9Qfhc)%a#9P%9SGR(1#10Al#=;k^M)?b9vXO`D z;~!&R8@ErTxQiDiyMHwdE~r#+%D$xVn1@S@6>0xK3X*WOhF~*X;96}wT+F*4~ zy`{IqIdifxOnSn0VbLKLq86G&{7mIrGMVxAPOoTce*sRt=>vnty>ga9Bcnl|q{)4{ zC!^jWu~;9?dXLx2Zo;+Fs7BT=>4bSs(_T&muV-ma0}#!S&ef`_H{Or%u`clE3Z=6} zNViS-%CRqB`DJ@!h)mv;Ir<3-hw{!JH~{A{D$Tvs5DY)He`kHV;J3)` zZ&CiGeQx?H=B-|+9U+K1cmF5_(w`a=G3MP7O~cbOl_T;1Hl6naj7`7iSZg=F*(13ltbzwh7%fOu z=YO`$W%Lvp94}qWnrglLjtI0@l{7+F-dkjBq@2D^AX6Ngbz%z~NS3N#fvO5s(t8M= z>0P*Nf)56pj%&p>1?XkCHA^l?`RSaXs;^0QY|s0JJVTSp);{*W{51FE7$`liwHb^{ zfjK%R>zh4FFAfq%*f!qqk*_=~Y%Q8tWpb?Eyv(*$@CovOKu2_>7ZZgfSV=CI62j=!W zHZfTk8UUzVDDn9dmq`~5u?K=2JF!6)<(fd1lfM{-{SSDE1XwI<`}*lUdYG%g?CUv1 z;Z%a?nvc+jhR8+NE)clXL2o?g-bxE7vPAp`5uz4u-Gv82hMaiCS0OjgrHGG2gyRoK zRs`B^jePHGTv+L)?&7!uVM;RwZsLeqU2(pOHVy%E0^j*ND4XH^VMm-jQ;SlG4(* z&WJ}Z2%WhjW)yM_tHu&Vb^8#6AR=tbj1%PW(8lp%)yiXmnTr9bQ%npL2&zXNR2Xm| zGp2nimC|-|`SmQ&Df#uAv<_E7g2^Ivi5+}SYIEf0fK!uvmfKy*U7?o|gG~M`j1Lv> z{Xx;+qZWgQ@+~_M(T(HP(muF3Zht<%7+hk3Y1*$gkEz=h;cK;IjYY`0z$u_hm#zIK zW6ij8J+zfAse2q-A1>qJr=R>ji4l@~ZE>u%la;ZRU(t>pr~y*Ux`CF}1O>m{JheRC z1o|E+-J#B$tzH{r=+d-suW5W#mpo4qLl-SXOs#2Us=rC*a#5VGIDfg`e7+@AuqWH@ zxo~JO*rl{bRA5mK3}pR^DX{3`H;SikXNe8WS>pJBTKVF{Lf{+XwEm@|%R9>?-!mOd8ES zCSPCi*V#z+oddu6k%>(~AY*2S59&wrrj>{yu78*)%d+_!5da5&naQ+g`DeIun?oiAL(sxzp0zWCk$y3m7vbIZ_RslQhd-maa>d+ z*ZFqDP=Y!LZBF0XaS4`4$EH|>=aDJb<*<2}M?}e6ss)~@wGV&0-ze+>n~4>Sq^jdX zV=xJkk-8nXAOkK%@7q+4w$kNo+tAH!2Qw+veURZV(9Y?1WIm{*34H{0*ldd1?;Fo{ zIT$qV5Dg6$ew>+y0*^l>i{?9#?wGh)tlfB)tLOID#WLW>^E~LNs|Rx@EssUj31(fD z+>RS#Nk}=jx5P}avaWB*{sAHYZK$(ljwMU$a{97VG6@U0UHXuE}jtn@}tYq#yfVbk~PV>UEcj-xmJBsM)xV_=wQpb@=gEa5!ga z;hUolh8UPHbl8?C&|-e8k}D7gKYUXP4O(&PEHJ0IQz74TaQiIDsco2qnb!-B_X3R) zjH+ugeKQRnb5OZ_d%x$s*&CVw6yB$+aJ)$@uPAd9^9R3SvyO9I5PM}&A+ouX+wDz) z5HE$b&{Q}Ir}0yI36|5Tl}oUCC~5I+JZY-x*lrvd1=Oiqc4R^Tca^_sJ+r4cUAU+i zi*0`y+bI+-Fc{49@j^=aGonmV={#ve0H_xom{b~qJ*QU1eo`nGg6ka5Yv@ic-XQXh zCi}?co|l^u>j;)cv`>Tt8&Rwwply7qFQ<|=(syfsB|{7g#Sb;*&3;-&vh8WWlMou! z=DNG97SScIgAXGcGzyX>?QfvKwugWG--}cLbz{NXr-zLfS@%6F%!5ET&u#VAIkss$ z3Te5MeC0;?oK9~+TrCPupIugEo=@ii`&gMIcW3(oN}LYS{o?e@?pY`+PM-uZUq~b# zXcNP6j;+uXvFmfLXu&j<5qI^v!#0X6l+Y94J;q;7SnNj_+L+Sk2#q=V+Ub^GEL^e= zB%03cR(|yH!%xU+CCtSV0}CoGy00Psj#`V~O*~NV=RCD^$<-l><@Kc$0BUTZ;9VYF0RtCkaQ?Pf!n)5}W0&sn>KG--QTAAU0bov)FUxzD}eT=2rly6S>p56eF~?v&ja5?qI2^HFxnz z&f(l~R+DX?sxo14t2%nb>@6YGY!%tMlt6r{O0r0`>S{JAJabjlznjco8ja>7tCwm zW@0{};4O!Yq`e%OG)Z?MPnD6Z8q6)kSer#G4wyg(;&NNJDI;d^g}8+HVr{ z+AS7TM7$Pk1c)29W`fccZ|c!;x3*6WT&1(;AckfRXw}3CncoKwK0H>stJApZW5!@1 z=4-f)9h98`H&~+JuMVxP$jf9m(*<&IxqgWxj<593RRCZp1jAT%$pDHdcxzuWz-STk zWqbQxGA$0YH4C$Gy779_b4U1s4WI4Gki^Vl>dm^!9XCc+SnRWn3oxh0kz6I#P|PI! 
zIk%jgJ*qa*2z3E|Ql(O6ZQGN5mn-X2a`WOw#v|tHRRdFim9${83~eV~-2n3eyDnJ+ zs9-0X>@cvi%i#tz^m7L3gMF_Oxaa1T4EDt*-QhlZ%A#glaV&RIoy#mQ}6}^a3n(zi+&U?~30$`b`I3yH$Xi=$CR`K(yow)TuJS z^$=bboosSTxXg^`NQ^&77{KuG#nHMpaq6zW_tKE(pWE(N*GAq5? zKQTL@=iSibE~u!<=_1-W3{)pSZ0)?uH#K8D8Pl<0G;BH~8yoP5nP4elrGovknujCo(MiGq*a7P7V-#z^Z3 z9X*%iELBD+ll_7tZ9XlUO&-27upK@bZ*aI7qr^fOB^1AGC7nFjnU#L%T%VR~zq2KC z%Ew=0yCcw0(1Fojt8MYEPt1~7E2|r%Ggyk@&w@>R{WhQ&QsX6w<1%~4Zm_Yy=ZImn zW3RZDJHyF7FjTy6_vrQ`e_;2*j=o!3ilQ9~@r|w;H0j-eT#_LsDxsbak8ow8KPzLL z+$^W4veI7Rw;`Xc3i5-2`(DkNkaO~^l)C(87J&1Thxmu2eXdB! zKrcMQGt^)H!zVxh-fA{vy%1 z)Ikd^*ncDz*E>-0_XZa;jI^~(@m6*yMb;C`MXagaWY&+#;q5d0RVT#SCm2LWH4km2 zM_Vi=@&Z{_PRkS-)!B4Y-F*Uav|bC-VIMY@9KdW%>geMq;#lc646-czSvVuRvw<)~ zn#qxCFRK-U><&ne%zmNYsh5+peE~*B?ybJ0Pn(8dmof&R1)$ZN(0>!-SDUSIh&bFD zmzB*RV%G7x^3esdL7{0B<@7(R0Cfiq^^9Xe8LP%H18&KZgNB1Kw)w(Y0j_Qu&KM^=zITRmtb7LE~N(b zphi0!SDX{xBH)WYdtV=4K%w@2nJ!3}x=V(mIurM`D!5~n6evtop#&ypswQr!mQsc1 zUj79yHtspF zp+)}asj;j=l+5CRKg}=g5b*a6w%0Re7#<$JkC@S>1{_^Y2RzeYmw>f7D_>;Ul1e1}LzO#FUi6JvnJ<#e~(@ZUHhJ>lgnn!!bL!YxWOy5)Low=gsU%m@~@E z3={%C-NT>VW5~&aKL=3VhPiAE#B8k>9zIZYEcNR7yJDYw1ZW=F&|Fk)vK9PDSpKu( z2i+0tkYmR?BbK9@O=?T1+baC8Z;L`f@2frlna8kaIJpXl^AzcES&WnwH;S`kKYZ&v zS40BH6c($jHADqdOvC`Ddp`jO^gQsH2XNzJw|`zZnZnB#mj6h)ZZM#->ohZJ=B+I< zGp(}60E44ztq1@kj2`xPp1Zzd=TlH15p)n5$r)2o$ZiJY6Xxts0=Tv_vFvxIO=Ex? zv+)jY-WH3lFY@{4y;X5~v*P+6S9C8Cr6lsr3n-XSZpy;7ZfMqO8C7&OUyK=E?;tup zNJb>A0u}7q=}6ak{3{mvG8F({@b7Aa5cX!#*?DFH{Bj*!Qc_O@%qNwBKy9cCjDHqP z{mV-InlgM)60urp`rX6r(P7iUa0&V3r9oJna$|6Vb+P4t~Elsu4VqS;ifbq46zpNwNk``3wsFjO;GCLutUe9^|8X;(hy0yf@O2h-8f_g4&^ zRUBHUYsDxiN1q#29IB_T7Mz^t#iLsut%@f*(qV#;%d&$w=f^_ z1(=zouV|_>XE*_pl@MyV6aY%HnyHpY;WI7@I<>*55?gOn-i! z&8CiB1sC+{RY1A%SLeyov%3$9(O6^KE4r>{bWq_7aimOgO^%Eb4+cf^|-xVYPO`s~7o}oKKjYe}N`ps-TXk#DMqOg!k zYs-RY&co%GVsOpHUtzw`4Fd zPt@UcL#OUKtl~UiPKCjl0Az}6yh84tPlpYChw5{nhdeBq6rBJP$j$QwP!$ znlDRcwmYhgju~p-v}A(`UGn;)1O!qnG}N=aRCxuq)NDx9MU0DCcWfR=G8!kJqaJ4I z@W5p1bdg=jqI2?nD8qBBiAz9SxaEBXY=)Cvqsbb(?7RLsIgj(y3gacGAHQ20q?@cT zk7-A&u9^e}2Os#8-0n>gR&YC_57JwC_K5Rs29s(YuU|$=J_}64Frj=)Dp*rM!dRX> zT^MqfjI}k9#u-~S|8NP8n0v+{XosGdLkUBrqLqE88rEd^Ts{=;pPGlnht?2Xt6u^s z_6OKqn7Ok6Tzl0N04RSFkN{;k2q>TrvcLmbIZ+nFQBT3_QbgF+pJyheDvtSiPA>LA zyRY-13GVV0_cIVgY}%>(PFt%7Hi_V#c;L%VgsDR=LaN=y?1IZ_hp86jU2$w5*(-GUJ)f1E_*&$~$(#@|@m8S?dRCC7I@)v92TbaJnOF{3N2 zG#?+jHYUN~1t(2^dJ%l~05rK@4ziA9;GXm&tC1edlDPR6fU1^xi;~CX_U3Z46>3;! zHVN`BHy^iK&Kpx&S`TKRvFe;&HMiyN18=rg?n^!z3%9SO6DU;9#=1B|Hjy6zFHIp? zpRC9S_SF}$hE=Cd;1$ud=VV}&5Ji9pe1145duqHn7jFoRm^DWhIuam{WcgP-dKcpMV$#%T>britn8X4MHsvy5+V7C{ zB7P38Q<==&I$~$M;8s;U)GcJ>&w8sbM018gamTf9+b$i*ha|u_|H#|#eMALISEhgh zIdirCd@|wdZxjpFU&(g>CjxaZh0^-M1N%lQXDv0GI=XTOWP2wE$-gdqUVhy+1 zK7O~_zzigGMQAxFq87+4;-L-E~pFIsrUSqt(+YD!nVgrbse0uvd8gtK;LRQ@? 
zP~O20G5xM1w)h;mne{Fta46;lqTRWuZ#kI*CvHJEdc+)H-sHr!u3AXyk7EVsat?9| znYR+D3jxQV`^5|dJxen zd|mIPp_yw+c}7?H;~J@mKg(S-_kSFpfd5p1X8}~&R?^gP37Ect4jFO*MY?C-EsvPu zvF@jd+Iu}cvv}L;3^U~gBH5*bFQ5=OL1b`_1Hu9Q;GsGA{Qt1`)lpTp+rElLNGT!h z0x2m00cmL@B?M`OMW=vt2-3M|B&0!F7AdJ9u;}hmVbP&etUo0z0W>p+&jiS z_ntq1F@Uw+_kEuE%sGF*Ie&AlQ?HD8I;ea|r`5Urb>6Pfqzir^a5)0$?N`xtqv&K0 zf;-QeBR@tO-`U1R7_n94;mmY5v&Fj0PVvh#ih#~<44wM#;}H58j-w2e4nDp)8T9me zsW8~X@Ei&);lxvtErRgUol-szSGD-=2zFS77?hV``V5-!nLPNNy2(^ro;}`~*hu3Z zY2U%gijyR_S#ap|ro(+uKz-@@%W_<& z@iCMjraLi)5hCQA(^6f0nz~khP$f&c2e=(a>Oa*!FcFk{k{A*nE=Z|QkFehRREPo6 z#HV_4-`kuO~&1MLA*PO z7kuLM1IlL^?>)~>q-Z3aiY6k3Jw3oDr4}f0hwrz?Fqv*I&apXk^8^zzNv61M2HHs< zzH6@`(ia~-SqJ*F_30{cyF<0~8XnfIkW2bnc-PN8jaZ7p>UVfKaXi6aX18wa@}cW8 zTc)<*^5WkF13-D#`fhLyOMWXrN|Y}7%1@t0jcz|^eWpJvpvhhxRl%-Hja$-p-$E5u zNdv2B*lcI|=%H9eb>Z^WR2+&Vhh;m(PYzuUbdzwyV6A8q1Pn4*VY%s|N6B-^D<{Q6 zvnu2Ekvqe9+oOj*Bd}LGSQ=+c+8bFW zEOG-9B`pX4_e~+7=g-3BY0Z`3zL?+z=ivCx+d65t4cn7q94s_tEmVDHove=)2g|ym z!enWMAn=b!Mzut{j)eKZv-g3fYN9jCyHWe)oNpaPxh`;809LT zSD8eo8#f7c2R-$zKP53JJtNxPSP6-U=H=(0Gm7-MJ?c10hd&Hque7Z{lQwu@xG}&* zzNB&l0KFVJwQMJCp%lQSJ8y3TwbkNa1#Q{4oKyRqZ6WuI*Mg-CyPI?RW2|D^gE4TqvM zR~;r&X3=%wWOQ)MOn90gz_{vTB>-E;rOH&uOoAp+uFjouzd)AIL7wZ8zt=dc?D zGx?yc!Eo!VJQmP)1&r7NtqlhsEeDW$tzG^{M((e~&L0|017CYJ`PByz_}}|`|1Bm` z^TccXherAw`b7m)Y!myf822ya+~6|q;N_pzaMK30e);XKfCdL1uNw*dtQV$1zgiE++N=5_Z1ZQ}}OuQx___X!PT+M{Vu zn@>5`Ep3Mx)>daF){BFF3S{~6k9foglA~Ii5F%VpRcnoo&npZ9w_kd&@a4jILN4Ex zp|8$@UZs|8vl7YrK@q-x39!hOYj$8s?2exWA^Mv!>9F2Ajf0}j8?~+v)Wp@?CRwWOov>&> zPGwt2aBfxNB?3s)qD(sCGY@arz zmm+hj64e<_*X2EgSy*?$usjGV#cbU5_mjG2U;{`A531lgEOreX5F5gDFepwwo!;km zSs%}JcLCB@DX;54D~4XGbbyM+2a>wqh+}iT5#yy_v4pO*_2r@TQEAJT0T@Y9iNw9W zFIOlk_l-5L|7fQirk9cVvKAcF_CUhydGFjKA-+oMP@!ub=g(7jh=_LQ03y!z*v{U5 z&Hg0qUB^&{8)2eJS*wmJW6If78j(dhTvk_c$xw1^%8cz$Vp zC%9JW;5Jv*p?kK{W2lQqa&)xxnf$ugQJi}|4_{$m0c{gi7H5sx*ImQTN6`t`x(KN! zyrgblQy5V?bBef&U^WHJR$med(5CMxsj^gDD52(F#%&6}<(>w|6;hU?Op4^02DNaJ ze}A{Y?Jbn!OJU7eH*eM-+MfVV9*>ZX(978AD_%O^av0^_MjQ>SXUFc1BHzo-jlpV%k}*s z)1ZK`zG4dcnuYL;P?ec4aR8&;wiW6MDEQtlH12)0%FvgJB{*t{ChU0jO{(~s{n4f= zB^Wo5^YZ6uI_EV=jjC+i{jE8fRwBj^TknhuZP85Mc zkP#ECsmjl5C=pF@{w=e!lAY4U6!zo;7NF0DtgwS+S!!hgF1F!7?6$CwkKzV_;Ij|C ze)gZ=dmaM_8-S<(E1>0ypi(XnaL^B$W4jg8Adr7v6C7uO@3I|*4ZDv7^DvmF1r6%& zpK$plX}vQOCFXWS6ngSA+d1zk(XoXBZHyPKf`B~`-bZ>}2G4LVizf2tSB1kx7OdWt%N0s}K zug0^0BApfoCAkMI8EO{7ev`lR%f`k5r3v;LM)&ziR&2SD@C=j#za#>-4`Xk=myK<1 zkegX<&eCdh54T;9_LV=U4`Bv>-#2VKCv;Hhmix3|Bt9I*o6;LQyct>_&r6{wP=krc;JSETau2Msv~nbmD) z&7+I%NWyddEilt(rGasmo1XITEPhFwEB^o|D#~2btO^*tE7tnVrIkaIFTI>q`X-zl z`hCF`b_>!ani5uU3;GQ2#+YrUdvp=48N##IzJ+dWOH5N>^d}&`q%Y5^h0QC;)hny!DX9l_$EI|zY00`0EA~5bK?A{= zQdL#}opZZ4>8>kU>*WcTI-M9Z>NU661Hq`!+E>sq1QeENfbh9M{HLLPpW*^qBz{vz zRk;MJSOzRNG*gO_Mp-N#g9vmGz`~6kPGJh*C3j)M*V>B9qt>f~v*@4|?~NSq;%*G* zF(nB7=`YZXB#5+nEWW)p($)^7ypddYZmQbJ^8~<6eA_gYL#bllSDB+a3nhc57(XPMUy=rq(DkSkEbYes~0?tE4*GI6?0Yt({z7 zJ3Hd%OAA#n07dRFsrEoXG% z#|V7jLPkMv5tWJvxYJ7eC*5d4hpNygP)PZjn8&jD=wSImSFuH*V;#c1IbzUDGQC{F z;;~HO_lrOgD~$WCl9WXd5!qN|){{%^x_4t6GdR9-WnR1f#hIgG)B&&*QbbKXjL8|; z8(G7lGnJDWy!;!lJSOY3Mi#v*bcctcY8X`3zeLxKULYWhiYiJTmvXe$okv1@v&rFo z!qX@%RiVf2sVyn!Hy88%ZwVD|kD6bSRlVp^%e-wkFw8A07m_0@Rk1AJp;Sxi>*Er~ zPq^vV_d;@MTi{Qhd+t05>Hz2USr)*0@&X03j`K++p%~vZaX{`QnIkRPIO?ELwvpXtbVh6t+`~;jJ3o`0eTOOj?ap5uwVuEQ*qMsv9E5!h3G#4Lfc!?Tr zmBcRxNqZU%aTQ62;WgxIK&)?In?!eSxH%XC)E_MH8taDxIz`EoSGJp1M6Wr+4dI(- z`jm!X9>+}#hI0{wctR7LsJ0|D6bS;FwMP`zGFTcKq=<2fa%3V(QhgS( zCTHOj-iIGmOu*%APAOsBVI3_Vp5N<|aGav! 
zz{qP~oHLjzToZ2#bB_|70}Y2jiLtT!JmCb4JVZrp;_Htjs9YjB7MSGK!!|NOsd;&C zGurySVQg{Q0BYP5JtGWJfEceap)NctDb_WhQ`>rE(OC=<8cBNYv#c5OQOC>3;(qN0 zZUFr`s9)aurQW!S0$u$te+#+*1`YP(%CU1*`zA@~mP$CZwnO9VC$$_ZP;QCAa<+;g z6B0hVf?rNvAI3q!$0rVn1~-$d5eqUUP`}lS3b|m;h0z%bU(v> z>&r7r*S8bIs_7r>4&D0#!ML;`+2(kX2qY`nD>*~up@UK0J(ckSd2BU=OxSP^kGz&b{=14u)JUrYUo>%aC=j@$J-z{ zVc{~kcY#syVOTM7zu#1w=+Ha%7@4^&e8FR_rfy#0KO0l$P$gQ#_1{`inHH2nbcC~?6^oG4Kl zkxg`HH@GYm7&PMAVjlbCx`r;<%fw$PZ95)t1LlV_6F#tskz(nzv{P%ynl6}e#UJs$ z2%v>oV@E5H-3WfqZli_Jn@lg6|C zh6ovpG)p~yKwqp&y+kGDFp*B@a55VLUc9MOuGt zpF|o=m;#LbF%y@6xo($}FmKu~#q|hyPM{che6oO>D+@Y?zQ_l5>%O2Qsh%_(5v3~` zeN(t%jPvy_;6StFHICv_$WB0KyX12f7Ar|85Wxfyd5&rqyBx$8s|?ZmFO96up(d!f zC~ry^!rt1L9gX39Zn@O`>hDJ7+`mtRTI^fDCy2jKb1V{;8pt+%xh;t->lkTC1*9(JmuCmO39;87 z&?~cU8>xn|sO4NA@;D$9EbIf#oA@Lje+36L2#O&A81MIloBqHuk>~!M&U4a z7C^BXtI4$5HqP>Irmk{Fa6SJ_$7i#dE>#*d*Cx{=sM+=RH4tFRw*QobEZ};8@$EZY zbRTYwzCpFn_a4-+{DU5y@D@MJCRJ!yPl<`LqjE?L=$#3sl zVC`u&niGz*7Q}mt06D~?WO?7IJ6UWgl8b!=@^^_`7as(z-JM(@H43Z!QyThARqrh1 z501ZBZ}#G^lc$SfybgZPc!BASQo*Apal!uF*R+U(F_PnJX&KBvw@2sOTS{FUNC9#T zAh0mMp@kbPPhab1K{j>I(TB|~h1Qd#U8{t>L}I#&5=2r9oi&ApaD?sP4#->Vmk>s$ zujy!SeO2lMT?9DV_vbPUjY8>JX#qqKR|Cdp!>X4 zg$r~A8=A=#Z>dAu%mCbVj#L9v$@=SZfHUj9;`qxz3Q&TCmeAij6_9)VeNT71rpbS+ zPV)a;(|l8QE!WRk0ROEU=>Kz%{eS3DQGN83ZqJfnE5dr)D2nE@pr<#U#%b?nG7)tL z4^4~;h@xv0|5)VwN7(`Dt&~Y`gF15~NVc7`9ZbQGKmZT^zn-XRux-s`+WLW$l0=5E z{lN)8&)5H2fm1nO_#xS^63u&zOpOSx-MvSme1cT@^8Xom%U_DhC`z1kss(fDkMY5c zK43(oDD*(A!%;JUc_u_v0nB@?bo`BC<-c0<%^jk8gZ&F%d_v(14xiuRi~m?I0r?|j zDr-TL8)C$-u+owaUViF{LC`}g^Nz!3KM{Kg8(;vE7bSS0Y;>+GoEfFOf-?f;mHek~ zkAJNu0S&6)wYF;)Xivl1rf&tcqlb-Cuk^3o8g~Vdt&XZdH?EyhW^4e&7UaTfpcYFG z*jW)E5qo12`<`bXZ|KP!yr%5L-g{9k?_;)om_p2glPkJ-EDxXABuZt2Ymt#k;ot29 zWzb=uuN%v)mm>e!Pr`I=w~-=MF-W8_vQF36y#tMiwbSg&wwj`d4cdo$OP`mIJ^Sdp+M2!48si9lTE)qa#NJ~i(MNL38^UV zob0PyVOCt*>MAnDvZWAr74M|HNIHf5d5nxVU+?x2(2@cpD<)I69cMzd9oF)2A;BbE zjvnBF{$V<2cyl9*_jPFS17cC6U3L*druXtdq42BwH&o~s@vzcJ4;mSmpV#z?vb?14 z@KXu2XT!J738BnS4JGP~7R|T>0TV!>wHb^$mqD}ze(E14hMu^NKY5MK&AMq`GA?7p z6wMpGX)FV7f10#bOzZ6#AJOWbA7W6DI}0Yy@(~h#ZbB)l&6eB?zxT4M@^g7OyeIk| zYRahcm%2zDI&h8jf4BzIxx>+Scg{H}Xyls=^!HEPVObm8xy9!~I3Si>-PoV2L*1`z z`100uhoR^JrLnT)yRzo|X4!-1BQ8b6C~w4e+Ct!+17NrRGFbS0uW)W4%m5E^9`Ba? 
z6W@n10*reS16s&+lNG3C3!la`-m7YL+DbZV8aY}C>ZpsOgCp2E*U(F`#~_9I;?z>!$HEI8Zf7L}mD3qx{OeP=nfB!!9xRZ?!+_ugO_<0aFX9y^=w4aWi>9nTaM_Th*rKN3e6(Wi2li#*@B$!5@IWM z_iSULTYrzA>Bg36MhWJ`J<{NzSD}t$*`e*#h64?1TA#CJD7jh6;M}BGq)bQ`=*WS1 zwF_|u{Ft_r_8m8pO9!*Ku2qt{1pk2p`GLNusdzpEKjqq{9NlJouk6|d8hGdRGn;QZ z<8}%YX)S)UB{KMf75ZO01wpd@nf3$CkL>Y18o)?cy_2O+{Mj9;>x;+iQ}V`53Zj#C zXKRSs7)Q5RxI?l0Jui+@k6W-?k~2zu@~gz3t|ef{i=nE=zATMacYh5?ep20DppU zwe9D<(4paJTzfw=bea}b7~|{vs;h}P37jxPnFF>2Bl9v9vIARe8okah>2eVeRfu}K z!UTuTtGlml&YH!mZOAgnc@dM@vpd$ay=zA(bt$Ya&=LSv*pPbG+drb)lz6=Z&@|KJJrh4yh&*NmFU%ImO zwz+(aKx{kp*%|xZW63C(Mmhl;)-oi|={QOt#%+ej+Tj{EKafp8q~Ocw+g(2D#^EKT z^}1SRB_ReU$2c#S#@1uTHhg|nt}fU7LZY~m#@Yrx`8-DFAo$>di1y&kd=9f-0onj& z&&Lp!A0d780mvwr`BOR0!^wg<$~49`(~npvI5t?01DC`_5|J8TOR4)@wdlZ0Lp#dZE+a5r~=Z+KwTm7Tpd zNMzi?>QwY(j8#9YtocLf?QLX7x!+##9r(Hs)Mv>zStwnTwZ7VR1 z^~nsbcPbEL2H&x57s~!&c311q$*LsmBWW@h%WoG?H(c)Qmpnd>Xr%FoCm`u(4w5l* z4{h>9>%jduP`RS);s2n*;{B&=CDq&LM;O!{XM32uKfAebz)4`dpUiSxqesU3!J(;1 zWc+R;-tA->A2vKnW&s6LtQ%A{L8g8M4Wn_b-s4}64+u4rgdb05M8BaY<8o3Z;|iEz zZL=SfE>Ip@jd~m=r2Cfsb+(n9#)F4ko;T`rg_D(6u=vJ%PVpDq<>x6BJ}A8mj}A@E z(I1L(!nyq6E3fHu@)Os`WUphqTcaM)ypec!X0RdmDT{RG-f5Y2K)*uGHO)*ok;LuW zvW%h**ZRCIVyIQ2jNVnerxWcR^!w+~@Ls$KzK93c_IXvwcoLbLPd^MjSPoQ9i@Ek( zOzUcp^YvI}HrtV6E~D7jH!Asq2`^0HxWp@}^}jnk2eoIgqoLb7AN%*cdsgo zZ3i2^i@cO6X~L9X#k6hSKXCO?=Xfe58M4KYK)ZohpNsJ8&tB{o~w8EJBg9rJ9>ZZ6_+0WZoXLlC4cuau}Okb z9NcELRYw_i#@8RHUsD9jm0ONQ&}0~`MQ`W`S#Hf~`Xc*{B-~lJPdDl=!7X+qgIDs@ z`J`F-OLLpU&V!{rck$*quWh{h2$}PaC$?MWCi&yzEMw1>N4=UcrH6Xkp5FGkNNb6X z5!v9QHhL1}{iJ2dD4SlEQnUI6L_FJ-d7CQnMSK2=SpE}_1Acm1uL^_kmLqLQPI$~-*~GU4*-8{cEu=Y=$5cvr0`+H_xT-ZcNguXv zdfIjdY- zV$>qdy=SBaadT8?haTxArS?pAc$H+!vybV@NOR3hTy>tMxT&9V5g1s1iL06G&=a#h z#<(>lGfGQfEzwM#HT{ga+ZV)6BC3eE=*|5JlQSJ);qFw_v z^e*WlRb{=UBX$K(6?qe{p(^Q_D9WbKT|^VP`Bqe|l!>bRd2a!IQB=C=si}H{y;(@2 zu;?@K(_<;Onze@w+LU;j?#~}CVr6@@Z5xm?PxycIde)(^lX637a((uuCN1W|p`sgM z$c$Go93BuH%v`m6JG}p_zG2ZcN5VbNm>DBk|47Ls|FgRJE~29MNU|Wa zp=2=sp|rvqUFo~Su#`%}g>E~Q#Zs&B$f?bxps7u%MX97?<T!JmpQN7U`J$hF19f$4b*{_HxUX-EFvdw_?J<-4jDFkw0`V)_sHXu$gPD%y#S=qk0KEdffU9 zn#eJ&zP)!z@UyDk&O6lz0d!FXR`QecVzW{wl$yMiT(?YEl7tm2tS6{jlLwV$eAP-R z34$0Ovy|m!0WXt;?Cw~~7qV>tzdt3_f6EU3T?hWyL+hw?P!L|Cn(Op-5wjpKt;#sd z9q&r#Qqsv|1>qk1=dnRyT4L$@#3@}c{pO$!_O39_*8{d`gf+`KcUdgxWt2Ja>Kp+;LLrrhHojjg(A-$BEpm<#55P&R3kR&{;Z83}&qg zIUN4gI~||4gC*)*c9S% zGYt)fE?7j&u$E9Vt<@X-VySKin{ba47nOyvn^icZdtS_sYsc?e*X8QvM^27pB>&X` z<2ow3N+31~iWfp@pw40OIM;4{oedY3nW}!yQhw#?jhof|0nx(eeVBa;j#DDH;}B|) z{JkdYwuPCZvU{n|ro^C~T??A4H^2sb_mtA8gSTz7SEla6toY4k`krX)Lcr2Z5j1oS z4DSvVYzP*Fsb>~e5{Ir~G;$OA%Gu`1j~DWQ7uvhNe|d_JGQ37Qe7G`tV$m1s7cfF% z)`>=R4RnnPTuDC?e0V302!dnl6Ua?akpyN1-g{~H#g!BA0Oz=T<*A`rc-%V866YXH ztlAa6_;5V91y#sZPHN~Y3U8R<{09mUe!xX&*V!X? zV$^jbJWmg#T0%%@z-I)!%j!~1(yX#Y${cJDC^otCnD&b=59JfrFHn1edp_Vy7k5Fw z38n3!K&{F_4&sOi58fF99#Ej_B`QV&14;xpgDKcspuzcy$6B|lH7P0SYdIM)vOR_q z!!W`!TNIIS=c%{0&4){$i=@KH8Ql-JF=p%{z>XChlPFOOf=7&82!r{;Vvbej+_Au_ zw}3CoNs&LoLBIS>bKe3zj{^xl$MsdlGnDjhd8C+X{Q`t3#5?!~c)9+22vZ^IZCbBf{rOQC<+RM6e*@#A8i>-8$Iy^Wg0mDt#y>+hV zx$FgMg_^Vtr-%8glkZf_hVp2K@*YKr`hFp^hHGK*SWPkrpBM z-ZZ+;_V8?VthA}C?pq5PZ_aBMGX-gA>EKAol*>vXZyHPN=S>IV+17BLAqrmGOda+p z%O*c;UipN(?4c{bR(N6pYY*n46m5Y=PO$rkFEvNNsy#ZmaQ+H8%%#Wx7$^NL zD+#soI^m_vsKyh=^~o`YQsbW2Nj2-$O!UQPF)v4!%13lLyXf~c&;xyR9p#Vk0v$EMtXQ}o(TOwhw%-0${A)Bq9SU%kLGA<=~l(or@YNvxLB7yIDUT!A{TDGchl^`sY#gB`*%mxt$ zyA#$KVKllBrm}Iyg;9fWuAQkG$Bj2@{%U&2NrwqbUT}kVhXT3hF$r(qunQai)IhSc zGJ;#_IQguVWY~569lb?|B^888ICAdFE8w>_rzH@g2$@sH&H76|%%$4ZKb-BOM=Evr#|s)>rMa^vlcG znIOWpRMj6YGa1wvREaL8%O`SgPP=X(xqORf$26;4_pC77bE9BYL`9y>gzAXV5(9}! 
zUCC@<(;Z{RDJ*Gv>rEOiE}5GTtss3eqIzXWU}X`89kIS0CR#jG_EA@zOj`JgAqnOn}!GKmX_a9vK`z?IH!dbMc+9rT0um2M&LYyriIlN zGNx(rXGB`N_(iD>US2@V1h+R-2_|9*Ar!90wrUO{B&roY{`#_Hie($-qkj4MV($lK z){#$8iQ!f%PjV~SiM75Asj#BmewGkiaDWx0tJ*ACkNNaJqN`98@))#)*mz81+!3qg|+il#B{|2CD7mJKLQ1rW`Z4J|`p>`WB^SLgFyqQZqTH79jK`)HW3(_E_Fh z2NP;c3mb0#Obe30vs1j?5=FEV%2z*MG5S~(#2wNC%)31zQzsi`noI#6(@i@e_ zkabuGFU0A2n2vipZMM-%nF~p6*)9+t((*}F-Wmqslb&E($c&(s zCki;CM8Hl_cMQ#k?Q|~)&j}o~Sah6OyB{X+^)CGctLIyRKO;}wltJ{T&NZB!y4KT} zj1=iShANd99&92jo+qJeq<9?W=CW9BjGGsiTe1~D>r0o&b~!ZcWaY(&+auMBOfrje zl5Az_Mdr8=7SntO7}oC$YL@fz<#y5wZ}G2{4agzO*yzvi+8sUHB7}6vCzK)SP!Z0Q z8ss6gD0>mDz|@$I%DVCsl1|>ldkk4=tkLZ*Ir3a!T^W9NeBXr@x)kW75P6xWhv|#> z1a({hEGvh8IXCct78_l!;Bm>Q7Ws4Z(7OFE!Pzm7fTQieXhtu!WEy#sqk&9r zD4MoSuWTkR7qHlT+2+Jswa+0p$*Yu#1?vev=cd<|}2 zUmeO`8Tt^mn-ogMGtOlI&)tvFlG59``@uj4AdSnW&n~oq-2v zU`wvOAX2)73XESO5C|=wf)IPGm)?J5PWM0Qfw+l7d8t?pI(EsalB|2NFGF6mTSZaO zP@S2+_(_!dep?+*vS$4CcjaG0IJ@7X?uk9ac48cawAG+F@$2xrEiuDwg~i_eVBV4C zM8I7?1=Xt~1BceN-z1?C@p~Xya;d5>x;=1zD}qrO;j)nINFAKHN(o^ye6z%$051Ow zt&|ZJ1P!+yLz6HdOtadJFMejq2(i=`x&T+d&oWXMGULzDP}HWE2qu7c1sgFx%8Ry` zs9;H=z~o?sJY#a(*rqNERS@-7sB&8LxsQ>2<3@eB!?%yu6{UM#3pR~GAhMz*DUEyY zh|`1x3_&;pZgB(#qt$c!=6lzAS|G`B>iNh6X$GvB*6Mp8wnN9@hLRC52HHd(>k5LD z9|`1BR};giGV2*#1%Yqk8Vw$GPm8tR!QD;|7OwH(-_d3rZbruGv8Jh@H%V;oA95JB z`)q90#0$DsFzeRd(`LSv`T+Q;rac$@Yc)*_|gZP?$@<^$Bh2hMslZPAp@Fc;L0{o*ODg@2&iB zsaQ{r_9{DXCvUuq{qO|Vi@EyM*nu_dVS&smA$J#m*?vaE`!_AWpVU1Wgl!%~=yZQJ z-CM=VRt2#Ppj^{ndQ z>)?lXoL=$T8BxmG#sPSsqf(l8LF6P<9A~#c&MT6VQV*de3W&KnjF9 zxHB_HcXXlX@asBP+A@=gFX7&ou%y$xFR=iy`QwCS`kxbxy|9k1adeZG8cKNelT;kzA*5Ut(*e#X#bNoUm>Py-V+Q*3MKcz0jR354u(hKmsQ#6US3#M@!Xj43E`oF+hp5sQa^Q4#qX!+w<+(bbF zavLec_!UNc2orI!)ujU6x@uHfHF8T`&%Im#WIvyeOo)Nkj{=!DTT&rYKMwC4?5le% zF_=0+Ff|>|1A!v5K5$)J1s5N)3v3MnCQjJ16%jiR;L;-P+aukx^ybidEb3AF+(?%PoeXZP$ zngrzd4>J@h@Ifq?p8Xq#_D@oCbE-^nA7;Vww~8r3bnPzsF+nQt9#kGKnFjTJF$4)@ zo4w`?;5p`sYS!Be*V~yyW0`K4-ueXAJJRvauRsGpFA2?GfG_hE%KKnL+E^gwd+SsE z=@MA41YI5CPq{|4X+ZGCm|3XXU$5Didy^4{nTX2XWn-BmM13)RE!%VffRu$Dj@?Ij zWi}LM{T?gOL}OpEgRd)&KhiCG6kYrYET=nc@*2FX{87GITb~pa3&5+!qZdFv9JkdY zEbNxDmu}D+1`c=hTSm1po3vivAS2+CiB2-KU$!_d_KK`luv3HGZ~0H|J~)E?L|hKT z?-ouSlge8lN>uE&)x{&3x?ktIN9tUCd*vmr5G;YqT(fi zz}uygR?6$xlvJAE}^udpPmKtu(6Z0ju#ok9|WPPIL-C|v-Oq}ChGJV*qb-mZ?9GmVgM zm3y2VO1c#S3zJ~+hY=EDli;BPyTQzm-bX`)+0Xv;zJ!3Ec>7n5BK89AV7|H{h!rU| zZ77<~VMVLzel*qzTB%82y3IPuzt)F6Pn%e}eDww%tL=>Y-q>Y{k<@uo0e9f+0BRn# zIhe#E^sq;&=z1*zpBK%)UPV(W-8rJ7z}t-8U*xMgc?19o3zF+*jTLdHyyLWUj`p)= z!grNQdR$INmX%XP-E2D5W%w6l)s@F+L$kPCwoMGKE{7uyh?vymx&^l6-mStrLT+vx zB-^~1ATsVsAY7L@$z-qTkDyb0&792l9Bwc8wDIB+7)OU<$#Mbn6eD2zujqvzypx`N zpM>P<8$|Xa#BJ>=o6NDjJziek7KcVcCN*ggYO`#S?aD>kyo?W1U$8Cs!H}|*hRvz? 
zz$;5IF0cr%*AF@k50VUV3SZ@vfpQPWL z2j$5YP|p6JkxAg*rYOIrdipTqre?nd1Q0=Ec77G$xSD-CDo`#EFcc9_B>@-|15DT7 zvs~4JpG3~kH7M`PKaE1yVDyn!e+C?L5LTS!Q?Lij6IW=OLO^Ko|1Xh-rH>G#yq~YX zIzVEMy*15W9j~KbjWOE6iRnA(27KB1KRC!2_gyfcp|EludMP3nEvBK{l((*|=~34W z3a(Q?It<=Z@`S^X>`xLD;{m#DTxz3yH^FM#cg#IE_->~w8DhnpN|&bstW*V(S|wssgAbIa4%Zq!I3HeZ*- z3EqS{KZvV{f&c@&TIc>8b(KiZpjzwmkPg?DpOEUs{Vx~Mn4sm02R_H8c5%LPV;5xF z&G#JQBAbIXH@)@mgT1E5haays+zbaPsq@C4ID#8fW8!p-WU&Fe&eLr12grf(p)z+X zD$YkjNezRsbEfh*QLe6KrCnIwDBIEfz3q(ALFMFwtj$GmB=%T; z9SLYKEbIJKirW^JGvt;&GP^W^$`oCbHi)o|>l^0J4f8`?AS=l=B%T>7t{^P1*10L6 zKh=Sj*QVi%Bro=5DwfoRBfD6i3a0h2vDX8gE<^3I$$3cTX0EW2iIkHkC9MWdhf6ug zV7B@@c^0~rat_CJAF8Iod{aRCIHkBx;2fBIfIB6GYE(!B%!*uj{Bp7Q$e^_IV0|I} z`D~kotF{iK8gzyn(DSD&k_QgW)wS6^#aP6*8H`^)?5Q~ANpmu18a{rSNL>8wg>rM2 z;jCt?S)Wfq;=0Ry1+$@HWg%g)X82J@iIi$NOLouu`QFk`l6jA^3dkRordz5U%B_^H zNt!*(j~slX-O?X@Ix*;(VU;x-RzOGCzG`f0HJfe8Vcw@me&kBnJe^Y{cvAsGPYjtL(ZVo1nTbrm%v~VePv<{OySal5+R1%x5_D7^4Wwr8FTwf%n zTD zsOp<1BBq|{CktL4^NYseWQ^krUKN51M=6F=EnQajG$lf~Lb_Gjci+FumgVD)llMF^ zT`m=wH84ap&B|4n@PNNh9~@s9t7}c}d3jiVd;Ij~JA)A`=Z)aSlq(4h+i}<}cGi0& zj}Jaw$W%0WKHL^+CYea>@DpnHOW*7j9d0l8l-{eM|41ve6jygXF4<@$vcf$jR#^Mc z+rCSqwT+M>LE~_&-XJc?ZR$E|O9(kn>B-`Hc=8qq;jx9&3{&eajD1EdJFn|^1jO({ zI^z$9FyA?EPejc7Y~%ma!o}2d2p^WvXJuJ5KT|JM1nMm#~KVgb>E4L5T7F-H! zbNzz)ojOBC-T5=-nxD0!F0TQ?ITt10cB&9x$QeN&1q z=*i`+)yAYO&y4L}fe@0gRpnI{yxvor{?f7&VLZc3+d-*c(#lbL-DSAl_w_r(-DijD zo1gX-sk<jHW$bh z!6tDQyE0)$eAR`Kxzqiip8|fA!jf%!MYV|L`Gk%!T%u;l#Y}9;@k3k17>kP#<;aX- zMd-2l>Oi@#cJKJ1bE@#YvXwpig1oTJ*Hs7FymYtDMg8Icn9;n5-~>B^Y(yQYIl?w_ zAk47eRuS=GzK$csgRT*FM*|ZA)qvgOxC!NLgx&^qlE%;bm%%^f*@%GhN>CMREdg9% z$SH56EDCMdx8PZ5MB`7SS*3XMt7t5>D>6f3^Zov-$f&3&Ht-@h{4t{Y2zc|gfdClc zb;UtBpMp0g`bHo|e>6x8gc%0Y>d}a*)dw(N!hm52EOzXe>p{ zD7R0+s9QRc8Y)3^eJTt_m@HpFUg=wamWEWHnxLc+?B-RtD@2O%GiY|+f^mt)emS$- za&W$F&)20@S|3hYdF?8Hnb{Tx1#Djk^&gPmV zfN$UNA?xN>3VWjqDf#da>*pIs<0EcEnBwv-TcmJrNWD5)>q)0~Kw^IigT=@))~ovz z>^qs7g72?Pv3&|YRzaZ6J=oxaMw*~5_VcoB=o%mxu`38z*zQC6tCyZu&_AH(zHRoS zw083omsbG+yw)#aeV46zk=l8Qb|uiqS&>C<@5YiapO;AvkwW|(5m06zt)u3}`x^f~ zT=ihskCt%rvTAlWMHX_14M%Dy#Jw_K*J4c1)!|cfqNXg$sX^^Y*#8dt> zXz3)78GTvhsGt*7YpHJOdk?sVr|Az#QDsitzb$jt8q57C1e`~|r2z8AXAUG<*C9*> zIF>fQ?Ew{r|RxG|id)i~PJs53-|sH6ew8u$YpfXLs-TFiXWVL|+Dv(3w7@vvu(XadVe@E$ye?v9= zg*!!69Xm!7&^1U_NTn_S!qq2G)bu6_h}i!P5ZQdqF8H(41%{$XK8_h$$OHQGd1!_g z1b7PZw7aP8DoVaz$<+UB(?3a}|7V*}C;h+kX&PgN3hsY;X*(d#qjA3*6cH1+EkQ5M zk4}^v^`ALh5`dGoEp6&qcyKo)YsV9O5Z>-U(NSlgB4SN+_`0G1M9_7&?8Wh7MtCNu z@DcI*eQ#34b#(Fl!JfV+0o0RmlG`IpNQe45{3;Qo!~3F`AL_78WZ0=yX%!PmD+?NFI8f~; zMh7qZj-}2)R)mQHbq#A^JD@Nvo3B>Lyf67jk4MQUJBmnFbEz6&`sv}jfs^H(oy%7S z-A$389H`Y&>#(Mo`?w?>1cy@aN5fd*8(p5jtDYJ}EIz8h(&&g|ivy(%SueCpjp%l^ zwsX}hMWrW`n8aVGv%tS82PJm@bnF39;A%+MDY2&)&Vg z5KupG^AXnTjZun18b+P*PvOr%!xUgm3>15Q#+Nxo;QT!$>zTfRqPt8ur6BX{YYxL8 zP=}b-Z-r@q+u-P+3YixMFFPxd-o}K;W%wA8|6jiC>CuKPXqjS} z9X9AW+Fb)wID#3x*()Tjg0>+o$jRQgL#i+)e0UzUDiduFXQ|5$^3)D(s5y~v=p31MPr(ZD2m2^u3E z11=C7fnLH47J`6p*2ARslf$16$u$bJ2e*>jTK`*FfFwT-VFmGonsUX;>k_w5QQvuo zzhqB<@4awzdt5RQ&>jMx=<=v{$%<*br2Kh+qqg~H1!Q120USI9Qc4xF04&0npcP26 zp7=K{UGQ~676tm=wOu!u;;lX?7nuYss9Yly9<)C~hJQ85nvw+JUN1Zt5)AAWOh4LP zrUiyij)H^vrug@x)ei|R6dvFDn)@DSCZyuk7> z3shJQ%TGN!&||JFI1;4Fgs;3^4*q zr@h5!YSEq8Id)QHBDF5zFmBczOnC=-!!C!M0dDsZCXq85s0dlIlU-hCitdFS1)=@L zE-s+;NyFSv-WAw5ZYHyfm85(&(Dx6Ws8+ObEDAMa`ow309pp_e(tehOt2fl`k6HCq zJ1(AIIoFG0(bZNjd_<2<;rjL#j;JNZ2@oF+JtF8@d{$Gn)t*3s|33}b%qLTr+@Q|# z%N&-8V`u}Q9RgIMh;XIt8_;jWf0ld0qg3OxZ1_it2z8GFDG1-TARxwn_;l26ePwbP zhYd&Qr0Dm1k#gHwqfJMx!GerX&XP-%x_9vJvKFQ?K_W;Xp8$Dobf%xWai2jAscJ&3 z8`^Hh9RW55_PVVv|2S+n5lgcc2)4Jz>Q)zT*^_#aQ7pV7gL>JFpz6mVRS 
zvj8^zS4!9be${eT(%ffs$}%TJ9?+hdpohpN|3>Eve?VJ?K8uyb>cy0p4wjGK`aEEA z8Ur(_Tw*LOOA)w>|BB5hu*`b2k3ATIWQX&xF02tmc!|8N_H@+VGhTL^sah?L9yhMr zlZ9MXLrfh5&gDa20{#ISFPD1jM-Zh9gaqbfd-L#1nE#oT%5v~An;QCibpgp2ulJG2 z0x-SpBF=qWiv__>G~}ccBwCtWyNHf`Ey7O1Pq^myEd|FI5+K<~7igEi+-wM979)^z zVt_mpi%kEZQmp1^MvX!ytKq_OIn(W#rZjJz=znLak1!uxBGcbJ^!NsJ{D1UpZ?K|n zWhE8D?I;(Hkvj$=R0SoF+42bzmUP=5(+8uVQXqU6MZWFo5G5W>ObY(+d$l|%$DxTl zgFRrBN4gF`TJ?g=w(2t%PI@I82b!;LK709UsocNB`UTWwq$0&$u~_;EDxo--lG^UJ z1352>ozsPVQRDAlS*CrP?QAnv*2dv88_D0rHIU^A81zjNosBgan89Z)4}-T8*_$HVrrB3E#?e1PZWSKN!fs>_bz z$Nq0G0N+?{yb({z2s~PeR_4lym&6gv?kB&t9e_E3=;l;V;^4-Du_^@Hi1z%N&9}Yn~ z^x1D{?wNI!aR7l*FAahf=6&mX2pPo=Njax~Xhe=QJtUS7m9%~xw7(Tj2qHROI8<`2 z@vnRb7TBYk@cJ}zLL8ai!Od8APX4aGm7rWS_izR-3Qz{d^R6f@)tm;R{f_+(K}X3? zr|3n4u**62fpra`SP{lw?#m3Tg|i@(l$N4cY6wf8{P?2O5&eV07rsPDsJe@^M(?u4 zoB#IK3`nG>NftQ`C*@>Vz*LM9Jj&T!8D#XAc#|xetCAgw&~@x?0$6n*2uDJm>qB6K z$-lU3Tf-D!j!1sCKe_|SL}zW(Qjx;$)1m3WGNs$xi{>`BUbOfe!ZxX3VW3>+z0Z$R zZFEmx$Z3$i{IBK3fGjURhs*o#>Qh#@%FUM-lI-H)I3~%C<`h)ant&dbXqItH6e8UjeCG+_>$>&bmm{o<2wtcAi8{ zuS0Fw1}F`jIPF2pU`TlDE+#5O@Gw%@pAZ0IA-pSw65mY`!%_XW?7e_`2s`KYRHe7F zk6cDg((JX;GC>D{L^mpTj{Ed`Xpwv?cMgG~W+GQexv$m5yLbahr?tqWq0qg$Z(B*2 z2drX=vvzgTU8IA3(Lz#-TDX**e? z1@igtgubw*rK%m~nL70$p1{hy4yBAWD*P&8#OAd`21o*o>iLLqL)_Hlv@Y}x@TR2^ zl&>h?fb=%xaRKl5J4pMXMuWPBF{Bg~{K4`&r}6=N}_HOx|= zI)J13YPOy@S+7t&1PNyXv7W044AZr)Mby6t8Hh_kK;Oaq62)Bd;>t?Gdyq}e^Cy8> zLxQDt>j5I>SyI3L@+lg)KGeqtS<0=3;~^j!uO+Z<2KPAdVU9d|F=SI%GZ}YH2qi+~ zSHFUg7K6k0z)K-*TR;+beE;k}4^%(vsK00W`95OI5Do9|Hqa$>mRZUWCampy!4a8$ zmQ!xjf-P#$L0Ih2lJ;N?v|G2_$Eg2>2jLl^fC8l)in*)?%EXheS9iMjH+fPzVYg&w z&3pgHa{{sRsv-SI=?%OdHCCjOvl*|@A{~smYLqN!k?VtkDvjTxM(ysdH8oIWZ=_XZ zAfiVwjE%V@!*%tlDINcV+53;;cRn@j;)iQIXUVkkyA1&-)b{YE z2o)jP?#)iAUg2|H>(P>1gp=qcQMVty9>MahVYd45C4i(fnJMOnLK4@*U9mcl#hUK0 z3Q#5$di5C`)U4lC`z^I`>=xi&u(Un>D6G+A|3vP6?E|4d<(sboLi$&D$Gn~d<*f-E z&cp7$yOc48={^s8i?GW!L+LFF0(+{xF_-J4kW?fdMx^z6=(;u{IHS(jVuuK1f?9K` zX~Zv{g?M*3F5}UAp@Lr`?p_zq*NU`4Q#VL4Ha}UlF;5^`LPGcujCI0bYhu;suLT&pVZS5PxV8Z$C|7f4w#RQ z>2)m-F6$h|Z7O-KLkz3YoZ`NRliV}jUmI->v`K3?bUy_@H*J@s_@S;9QW zn}_-u=DN;G~aITp3Y@sMCy?8b0@tMMJi6$C;Xy90KPjKUwx( zKI=qonwH11$HDV)7lPeb$`f-9A{{4G2HVq}j=O28X5OcJ33m>liWK`0-TVg8))eEN z|C+^tq!GcS|LKv_{47*`GZcdlxhIVqb3;EGq7ouo7DQDs`46tQF?yj3QeWFeSB(1 zCmzF+-u-j(>9oy>K(vz)$y5VuW|$;z*grSFQDhV)psvcA51BPE>h+JmJi-YQ<5InT zcIG&f1=&uvwzX|{zDmEU8bv%^>sGpre%LK^Zf%Wn1d+jbW1`x=MYkI2S@HU4A|_2% z?Wb((?lQ5qI^prV*UBIv$w%JbX*Cxw5~X^KdSpG?^tER0rEK^j%a6cZbKIUdCR&N` zU>Q+y3_@Bd+*Vm(p#+HtQbOlrx2Y0v@#(f>JM`A0ILV%XGKz+b%es|6yg`4}%pR;Y z8L+xNDbT0J<#2ze5M^X<8&|f>ScJx5`s$3~4E$^2ll#QHZUU2AA#^FxG4g!$>>a&z zDc=S&WfO8tAO=zc15)+KeK`|qLKI&y4C}EnX(x|&x_xne^BoD59bbshfi@9c;OZ?g zk?n0bese8R_WAH5R8mn{jWbKR4oztdb@|sE^y+vTKb->h5vgzRS@Ju$s{`;ESGv9tn{>`jaN`5;+lH609N zsgXi-QWZRn8DgfjXuH*=VJ$8&93Kvr5(8hHo`Z(*9I2FX`UuyHY;e?R|BX9LktxdV8}JH`8wp z#-@}gllUKmVjJXsg~4N%!y2Db)>gDT)Ei zp`BLjy7~|-n=6E3*dn-VUb}>|$mIq8xUhUE!hSx%NVPEexlg!~NQOr5YMWtM9nC zo6!cnKh%`$6n({(rM0gK{DTzy0+2Pn{-nwLnVfKx%C9L6|yJ z=Y#ZfwcgyvLO?tM4doWA{frnd#O(AhRfvbJV%#?DwSvSMnv+S{Sy2XhNiEoB+3luk zPZ$Ga!1XkM@##bAkN<3bb`XY3h*n0pffMy8ufz+EbUj>5uSi_8P?p!Qo5TJN4lR7$ zws#_nNt{-RPog$k684H z-r`_A!K8ab1>(+lFv0Q^$W&jze+9^^)7l8z%C{-EMRMNR@Oxc;p7fXAbRKM{Y2z3*;$ia<(oFSUDLpVXd%kUilR_Od)i&cpW}>sIc(YwZ3d#}! 
zCc6-eV*$#gH{}O3h8?@Fr<)%CUrcZxUUIG4Ai!r4ZTGsv=^m#x;=FWPALG=npgjit z(489Hn?t#3k)&9z)VjH8jOx3tyIeM7*Y$%h3YutSXScMimS)jwG`6pQ2PBfnFM6@> z$)@wqg-!=RC&-GOC#*PKnRAN<%PdrknwE33Qy6OeqHm^U&#$cLe{=$GQgCSIO*@PU zNR7=nY99M*A!EL_m_sV$YT>v3<(v&*Et;74T|%X&-9mQg0F&E}PeX}y_a9;w4)v>} zM8RKk^kpQXC~QvpC?w1CncwT!0pLdiJAlVS?7?O;q_J9+MHb&@TQl;!1P|v3Xoc9L zj^{9!Fkc{uDYcyWmW8r4YE7T2!g9Z+(kP2%6CYj-b6|05y0IgC+1rREs`hXpNhzss z@cEJG1^bOO8k&p0CrSw-nBG9R*fL|s{8vJZ!AlF3;bCl!P+WFam5bcs( zEgj>nfp^`E%R@2(4@&@zZ&nveRcPt|vFbQj=kau0kmFZXa|a(}RGXh{w-tmn%yU@_ zw_O*yl9N3BwR-$gYI52B*^7>_e%LEdVbhlpGAlek+TIo6CbZRX(E-%PI@WG4FUuv z7Ai6K@ps)i8pXmIEjc-HHqtnj!4>QeE9)IYQw|iKdtoDD(h~bwiS*J(c#M^cP!bGj z4!)-&`%m3F0=!Td9SSPR17U^G5}jvY%cnnt9ghH^B1hm57!KwD6Q`t4b zPyw;?3U%z+<0aw{a43e+t{7F#l=aKUEca)R$18p|m}v}g*zJ`ZUodh?ZGeqTxXP`1 z@XH;-v_afWP{h@@!aiUgDtV$#x^>#O#bvv1uj*nZJ(RxWF0jtUZsC~)85EG*10Nr+ zeV3I1>39}h%NwzHqXEqw167It32FNPJ z^({nl_Uk)iBMx`3J-s^j^6daUjlvwC190LsaY|0Qq-QIXoA=W6XD9D$FH5^K>r{lh zH~3(cLJd^Aoyk1!pDy|j%IQiZL(?RDvyPwLFsAhR^6%<#R0fI}9f9uJ zdoLS+BZ^pOu0~a#5?M(ofpJ;nv|qrgsO}(}^!d~%liW>^XqqPob zs&yPH2qdk~fERb`)bg$~Zu^Z87B?{xg#+UCC7Azmy=H8yW0DU=#=GKo925rm1C@6A zMvV6P>>{ln;dM-wdfvt7nqMh^RN8IW@|_%oP-m}&@neSl1Sri75(2LN9VSj>Z8s0f z4Je@PcAPVQs5KKLJYx5>rj3lJMEhv_vpg&$6W!>BeIuU8t2DcXB7j$T zcUoy9{H1Bole;b&(6oFJ_*)*6!+`aPBnwi1_wi|N?a?H8AwA25r*h||%*sZp#|E!a zhh$mI6p)_uP*md8$B$S2w!StB!x2y%$xpsuHmmDIWM}40#sa*khgWs9k{9vIwRdb- zjUCE-*D~wr@jZ^AB-@%c_9%o>M6k(XqpkRc4C$CFO4EhJlQjiF`2TVpqf)Q}$?Ce^ zJQERC__i`ynr)I_!yu*biY7Cv0lXa4P%+-Ccsdd>Q#BhI1KV8bcMk&AN=nBnz>wej zia%X4Nd@DKWCw!U0;U2%X ze)QFQpXWFt7KvhVHcJ#rGsLc9Sedr#_YVv1q~Svu0pEMw_xtzDI-w3!;^-p#3rh+Rz96yU6|}T&?oy5*LoLJEnQ~OUEu=$z3PGk50gWbs*lzgaBqS;id+_ zA2fhULyhvrG+!grUDn9gWaHf*F^Y&Y2QNDVBWwTGbMyO50?(fMW-<2pjqH3!0X!M!ZQo6_37WV1hR#Xa z#z%n0MX1rgaM@6i{SJgn9B{26g@#44H4gF`JulJ-dKPsmnGuSl_a75!`NC*IUR}a6 ze&>~&H4Ir`DS0tE;UV@NE|MU7w{D#Z>|`tJG6g=v_|yGy zWP!qXb@7O1$tPPbFTq^DHi<3!m)|ia1FpvUC(qA4LKNL%sC&e-0JsWGv6hk%Lb>{B z8uolzkP_AnU}KAvtreYLe^DZ>VWu=cXEEx|guL6+|7X-5sh@gfCT!(T)21v~RdMGO7p-P@6CT(H@0N8|anVk0p zHaTCbj$Iug5RMUOX_pp^8mN|g=TSpTBpXwRYW6TZ>L;V1hRLVby06IMSO z6i#$CgkiB-XdDel`D{)S>(!=)l7(0x?_U{xXaHi39Vlf;x`VKs-CY~r@H&S7lHHE3 zK08{CAo#{Ze~-elt7D~JU!YTAR!lr=weTa$4;3ET@%HB)lMh_Kv9)LhrserKJCo82 zpb4Jts?1&pZk>XL1}voAusQpUXtmx(^3!=_g4F^sok7cu{(*OABbmjd(tm+hYa|mqoH8x^bn^+= z6t9aE(V~ACe-B(e@=-g{oXFl4_OvEB{nO`*PH_iXC_PqF*JFLePxH1r!j`hHK-XQ_ z)GH+B+YW84>lsiOrj528<9=tyLou_eVpksY5jHSHirv17`T;#Ux=#|(mX=&)eEX#` z#a8CuRrn9G3b#3a?lSvM)p>LP(ql+c@Aa+#yud*EpQ1VGMA~+TT$vJZn3IAEQsD@_ z6~j~ZJFwzh*zP@p-o-CdT-*UjXk>Vxhl+-%er|j>&``V70xvOYa@bDvu(bwa0g|{s zy_W5{oVUHTroO-O`8sU5u*RGcasK8X26&Df^Hv{WQ{0}rI4*S8SOB+#bdyNpZ+z`G zD&klKY&_B0Jzb|BA=eSUX@^fqhi3 z#M@sT?q+X&oBYHykt#42f27(m`w_&E3j19LbTCL(wgNKbVCUxSD_m;YX*^Zy79THo ze&YCB72_gA0>$C54(XEa#ZQu1NVgkk6h&;YK=G$4!qtTN@9WG=@DCOw25)y|qwBE3 zKePW5#Su{T0%s^Ps4uCZALA70B$SeRMzp!7T#W`i*=K0q+W*7p_1L&(Z%D_c(oZtJ zW6tAwzTAE!Fn8j{K&fVlthfvV%=KTQAU{MQ(K=;f>kImUdFYY>UULLz=s;WOPQ6Qt zn}UYIjS0DJ((*bp$Y?zJK(=H)Ix<add&w_nL?`MUwLajH1G1G+N`F z2W3ScNDELZG2TFp$L_IjE$aw&Zd`%AGR1cfAaZwM=Iht%ffU^?w+CpAL;y*eHi$1G zH^Ta#ZzL|z1cH^v@x@HqWm3?E3d%trA00HLn%GlB5%}>0q=&Z7apmi(?^UH?aLiuU zQ_Q_Z=ljJ;4I{P)Gkshr{7+XfWl@*|oNtydp<+Uwl)0`ep^9jtG9{xZ;)@a3tz!Ho z(UGE>ps_s+xrRJ7_+I~~;Ss-j;t3pqR`bqZ zYdIpG@)<`%TG`Z~va&CGbbv?tiEiOHXP=()`kB9e+stGM|K`EbQ=&N z2F1spA+L<;LDaK8XXX2y`=qYI_Z@hWMT~J1S(lHUNR$xb7^W*hhbGc;fqUAs6fwf; z;qpMMiFtr_D>({iz9EnjP#zM(O|X@g)WJFjEU&`%JM*P`J#Y_S`L=sEpXc9at$L0? 
zt-kpHNX3;Gn$37o!&B8-LU>wK4Yhk$%yvo_7)%v9k49SEf7~a`?DjbFmnHU zs1Rw=msZ+aGf=*Bs>f6Qo~mHN_#eLoXyBh*^vA&CfgOIth0PbNh)51rco>J{rjIZp z>j807ajaNt3DxKC_^^Z$g!``iPB_0_!{Yrg+48*Y$TcITHYc56@-E6XIZ z{_apHEf%Jb8X7(N&I}|dCJp<(>wkVd7Jf^CyrrlV>F-egEaLC412P?lU^wiV$SOh_ zRDu3970RKr@H;nazpade>E46TAPp+oHr#~+YL5aB^oTG-as zkg0B`|7QqqoFYmRsFS!K7RV*CBhX$JHC2qCZDZx_xN z_WKP+|GvTWJ5h8ukf%IUVMH3kK(!Cr-@hFMpMQjLo0ylj`aO!@6_BsLOP1b6^LrX_ z{hkJmz2lf#q!+wEv?%hXFxwH@9ACQ|F%CMF5?{YR`_+<2m=r_RZ!nJ|U__?xn^N5U z-PB;2t_Q?ui9icjP+@4@z7c5$hMx4ax3X~8N2t>G{@fK&8+~%2^zTO8H)s3ofA30b zT)n9FoasQb@8uuKyAsy#vcnG~T8txSCN3yQ9biXC^Lwt{5O@&pp(ozz5Os^07ny|N z1et$Q(g-rQ*Y!<_FZ_Nfn*Y80{|?Uo&Yb^UR{t9m{x@O*fc`g~?47PLaIepe-G~%8 z!f-py{b)13>e@uk?%n&_gRVPP85@MIYo}A7G|o7G5aB%r_U_tzM=O%C`k>6QeQy`3 zf`v(@eX*-i*q=oBQteAJk7nkDWb=L=BL?{?Pxl-S8`rALBlOg7F|x;#cExm-zZ_Qj zm4AHqXBupY^wzl?;W=ce(5P6e$)?4&YLafrL2Y#|8u{{+VMsdT@JVtFXI^^?0VNhR zb6T1@5Io%KKXiMDsb6V3C6+?zmSl4puY273l2B+Id9a<6efA7+k`G#@3_Ls=E&d9< z7iOcq5<21*Q!R4$#{-4tms~WE4iB-3C1G+uKj|@<>AFc~@cuZCq7!6twxY1iJzx2F zZ$3~YN_)+lC5id`q!k zt!GtPrC4Ee-eN4fYpS>@JOBD0e$DqE&S&phj5L4pmVJyKXeP0nP?6118vAIg6My;1 z;X2ar0Y{&LUh;-|r}DEMkJQ$^i)*Pk$j@e`*51sNxmFqLX+FWoZWw<0)0X~C)Q|7% znI^r*-6QcU_vY9VdFE-6W(g6i!!IgRtGg?-v#l<-uLVqxB7F$5Z5-X!Hl`{DUzgQO z44qVbMtX55w|$nP)x%rj+){G0S&`Ft4ULJffaCvP(>&;oW&@)Mg`l16(l^`^G2D_g zAxfm{Rb#tIYp8gOAP#j$imaD9G?>PCL3s^IGLda#;YHtQ!pY+N*3hoo>e!4WhP}nz zM&j`T2`Bmat?g0KoT-Fb>bK>Y@>?Aqv}p$-e>v2`E!m+pPKDlG-M8Yw&QFxQK`+b=qflKR;5IgvQZ;3_x&tpDNXX4w3Tkb#FDTSV9lC7`dSxjac{i5fe zzbwD3JPRWM9no_py4ZXZTzKG}QmFhTTLs}Tj}%!ZM&ucKf5I4-QqyT7 zb?$t3L+3(N{aJhgOR}6wzPS>;{=Tq|gu8ZP%Ep{TpUR{58?HV!D(*CG&}7gC`aDYt z@+eD0P?)r(2tAGDuzV8RloAs|n>SogIYPj>^k_{!tG8casPO4xBQtqY%1zBODZCEW zy=%=^y=_b2CsYfc6kTJ`>`{L9b7*~McshxBU+l`I44)gyxo?y>$8rNIC+)KzdFXx+ zdyO+pubfk~9-^ad@Rc&j_@iw0NgPJDj7PIJ-s@gKw?wT%3PFw8?v=pf3KGLFpR27}c z`)qUIpr_U8@D_1LP`@B+BBBeFjsL(gN@Itet0@mXt#K{l>$&aX9_my;ytocmJR>WY zwe_M{@`#vR&K%p^Bs+ge&fl5+Wi&$4=IPa1l@vGk{97JBhNs&<&$p9oCd1Ea@*NE; zy4Nc^G@NBJjfQwqoYY#zy>Y7Db1Gl8TVh7&c^N^R_5=Z`&;a@4FuC)!5qkI6Z*=Gf zQ_)|{o~>>&m$u$DyWH}_=S+AB6j;bq{Rbxy@2)e9mmq5rf4(`gi6pxMK zZ&rW?DSvda0qcM%9(0G91eyz>Fu##9rMF@I=d6MSZ7{=))(n!fwGTXLuya4yU@x{=!WV-!snC&;D0$9(j9!Xs`rFe88(w3;Quz`WGN|b zhin5OP`G0KNA+oo8kgi!QlL_)u6F49MYO_A1p4nEj;H_K(n24t6b-zR2_tFQKelBt zdgvXC=*X12?x+pl{@IjfyWYPA=`hDjTbmdQhwZD>R<`@O+ z(b_Z0Z%Aj04)iFCZ`3r330$4$WKy;>+g3VMl#WJ{I^DWyI^e_pP*A5-67a zPdE!mbBlTZ7LEm6yb{A1>U&7biwXZocve|$2MKsr5;_@x26E|GZNGfnX)XOQ=2E+K zYG%EDojUpsJiAz%(%oC|J1}fJZ{0>PAH)YevEZYI(ZGCMS^(W*!Y-u;!T;ZXF5CZS zt4;?0_sL$(1)Brf<$~6s9fYD8(Cryqw!|@|$tl`Zo`()d>kt5ZCE!c|YfFE!O-#fV z%4Ijj1RZZ{R135@Cm!E_F!XDC)&!I~eHAu3;JA~iy2?U@d)F5PeXSGKj)mE@6L*s)i1fKvu5YXJr#{Bc~%#6FESWvIum|v zrx6jmbVaI*7yF965B1f@moL9+_@NOQ{(KOgOc!B?<7?eB{q`1` zRVxDRne%jW>ah*gMKqraxED}`8xAmHX;4sbXrqUI)+pDVU!L4D!;vLVGKZ5sSO^KE zm+A$d9mOszE(Vdh0ZsUzyZuu_Vj_vVWb~cKn~z+71Y`yT1iV|Uvd8uX+fSSeoPYA% zTUl9o89veja;yi`yZQO2dZ3T);YY`zU=jd&;lXBRMn)_a-Ob$_+`?2uXn+XE^=Qtp z(Y0#M^$4UJ&f~B6oGtTbew{rlPAj-U1fs~NQXfKt^Es`CVsF#YC0NaNa!+3= z|4~kctF@tJ49?K`W-RX(l`J_-Of|e0Kv~?G{H#z1D1~aw1Ehzc{UJS{nwZY7}nOa&}GNPby8c_(!|3U z)+q17AYVX(iMItuI+mOm=K4Tt6C+A0_dY+v^LB+s7?m8PFZRGUBax4E&O~<_XjMz9 z#3+e*dqDZYFGRxlVXB~RB}L_}8RIrJw7RBCsB)2EJZ?3!GP?BFBU*vKcx9_g5mSG% zBm2nUS{Oc!gp$~Ov6G@cbFm>r@l?D4@-Y0c>&nxYUfd$wDC&A;-FIC1yd&@mNm?yg zNGl%@#JRy{**Zd;oaL&G`d3!}A^G#f7f`(n$@olL3}IqoJp~?3KjWE zH$=W?A2Kd|)CzMRr#P)dcj{2WK>J1mCsHyw&EWfwxCh{>DJXw|yR`rRWMx>Rb@}C>Fl? 
zN9>7t4Y;nIQs3cXTo%4 zVB}t2nXGm+)2{b=R&7SZLPSVN7{_QgT9U~w6sE>GpVG#Jt3!+QSJqg_vMQ_&1gDh1 z3|m9nofo$%KI0L%K+Dsc6`BN!pmiDn$;-7nRSc|f7%9;J90S%5|4aMWvsXz1m1stYuXSUj$Cf~X=i zCBHr#oP2eLHriRf-oQOdqv3GtQW6$WZza^9h-K3gb?+YFtns^5L!$ORcLW61$HuN7w+~8Fn{bW&tlEkosMXo21`H*Ic z@^tTmcfk{JbqR?;KXETFFHM44tb(E<*8D3ZKQY27g;H#5mT&i8W&P1~vn=?hY0Xtt zG*k$AjeZw`^EG?Ed{oDSgQW>Y2A*E?rNV=WIMWms(stz@rhW0Zx2E37=G!G{d(HO6 zBg@|lGNgnXSkQR5x}e{#>HF%p%}t?b(8;)T3Hl3!?DAOh+SMg-w2_T7P`J(E;nOLu zv@^=rr2|*10s^FV0q>a*x}_#!eO1Fi`!!Y@{GcnbVUrop&<{s&gNg3urNtwrd}u=R zk-`0Hw=a@YE2v?SaNX&RMnQukvM*qMs0tG931*Vu)2_{ziWh#AFV)(5H;U*k<7hms6NBN?>VU6XRv>;s zES?B&P9UJ+qC)*UMg{G!?z;p2L$i3xTpPY$G-pkyS>n;d)g_+){-Zj{){|8X)Z+YW zVvq6-ugu_FxWF7MEheJ6bUT<+ZwNU0rBKGg;HjLsY*11vH4gHL243kE*o#-_IR#Og z)2;beI@4aq1@KJTqVP;ib@6z_vN(jXC@54^7h&qLD@sr@Kn_s7`Q#ER#RIB%SFPZ% zdFromcuh+T*BhG{E|S`ki&DkM`N0rPUMX(um??0rec2Ya42d*qnd74r9UII&O)dK& zAb;<^Cn07hiAV^j(zJhUdj|#0GYqog^y{t!KmXR9%uy;QTwGTCyZ_i1*9zb>XkYTL z(wwT$ua!14&ZL?#Bq^4Qe;0we9+d5x!bm(*S-W_($Y*3=QI8r1QjW`zNKt3(VuBVW?N6R=Qezt&|^i9MRTTFU(i}W@${~i z($+8{Auc{X>m(^QC1nGBZ$?JKrZyj%y!63a{#2{xG(Dl7IaSR1ZEWu5Xf6GAD z&D+_O@%iDi0^i;h4Q4L|o0xFpg%^alqQ`2TDH@aMjP6N9%e$$?-AjUw>05I4nB=X3 z5~Cln`_BX^}Iga(*~e5}Q1GEuXgCyCo_) zLphaRxm?@Ca-uA<=&^w1E4?s$0>(qRi@`cXzF+(d$10E4T<_mmh)<(&WD9Svp#S;l z#{it%wrkK2cMjC zo_e!=;3{~3oT3wuY3Qd@zjM7aHBNL@{YLr;f!UJO6w#6G8M&&qHoh9nvrw$4bsUA{ z`=bAtlBYE};`Hx#(%n(!DUFSlagtAXJ+L`*G$0#Z-6+O}A&?@|726luBpKkvTW0@| zh49S~rAJ(&kXgZ)TDBnZVES3YK=#0jG{tau<5RIR7hZ3pAoJ7EM||{aISCAy_c*Q8 zBHVFuBrmeqP}!xj8h;;oQr*tVM!* z5&8vD(5g1A~JBsmmH1Y+@$S5Vk?@uW7L>o)wcZC>}^ot0b_jL~vs$z8cs{) zq_N~A`8~41aI(4?T|5_07ds&_es&ziuCx0;FqY4K_Gq!oEj7|j3&<45=66SGj>)vET#Tz7 z%Dnz?of)sXYg_H4eb4tOUQT&~hE z`bZqawCmj}i6n2!cJ6q}fEJ&+6%JC)!TwKtT$j z$_mrFrp~#J-9)M?xR_E?;y!IpQdC@RLl$F|Zzb|2Plszeo39<=*rxMP3aUyO>9=Rt z7;=WkHOjB-RdS?L4`*swG&@ax@xwnlIZftR3=&lu*Q`CGEHfFNbJ$dUm@BEVPbp~; z(Z7^(ZY8?tlBc#oer?3RAtjk^IoaDfA`zo^e@bg)$@X!cc6nWCdyod37mqtxfwsFG zqpjSbgay6Rx&F9PM@25%pySTGU%#381V4Wqp{e>&t_pYJhFb=+^L-re;1a)d{S^v6 zMd8iR&B>K|wu13&hG{W>3HriZY?=Lm7|wRc}0oHC1;Oh+YnO!1AC=!&lNmEKa5 z-#&{Ecl|sWOyOoowz_hQKe!U-GQHXdt+=k$LFLh|HIq6wtCm+U64}lJ?!T5N#x7L& zdOi~R@|Nu--V`0N4?-UkE6hk~WwP(qCSjB7l4Ui2Px+wSFL4^jewKdDd{CjPe&KoD z^KcV1*Sy77N{M)tqfKKdAG+MG-q`KQiFOBZqw&I6O9Pi~;@l@D| zdJSqQY&bSrb~}2MZO!dxU(Iu{*mX>zZNAgn-(pVkq0GR6n!fGOhAO+UqU=SK-N2;G zb!l+1PL_$~e;Z3E4Oh=&yiYxd!Gx9M}EMn+nUJHuWme) zccV|`d`awT|3sMDQ?Yz&&mX;4op@TNY)<5Uoh-LuJ9%x675m!EjvpGvjfQzon!8;} zR=)`mH7H1%vADFXJNiHf^<%|+e-7T7#>g?*Tg!4WcTZJGOZpj(cKQFFb5n+=7{!J* z$a%{=^-L%1MDMXvSq?wU^h$8K8JRwx;6$^@!F*ytJ3rq{^x@0hqb3@<7P^&tMFN>2 z%t2Bc)I}DZwqsT7jEao>wC_5{^*zGamNQ4Rjt^3@7SHVJ@X-WbPN;XF>-I(m3XQZ| z-cwq2w7z?0bxw*`+9Jn5&zvQL`9j0q0e7QCZz-5r(X?T3fcK{1htqY1Rhr9}hHEJt zx$3kC-CdlBWpZiGag=Usu2xG9^%vUg=d1TIbn^-D*OW5mb^Rjqm|uQa_SiW^ok?d$ zZXn|Npfu*1jjX?UBLT)1!)SKXf!FR6>+Pa&Ed6Xfc7bzs;`d6j8x@AQS`y6957wa2 ziYLF5cC`g}3-v|0#h&e05km(X`RT|L!s5ZLFCBbJ$wwPZx{;Sk-&}O_YaF|lQ}CSd z$5$)inqAAxMxneV-TRfqDA-Fi)*D2I4mXxH(0HzUo_BN;qC`&O1a=i?obC7Uja4r0 zen91OTHFoIO0DM?PW|+;pL)5Qs{^}VcGEeNK@IQC7*mg2VC}Dw%k2}!%-A~Or;e;k z;~Q35y;c=M=iMz@yQdk?Dk^mrzr#FsYGZ?;?wU&Oj=?A8K))CD@Z_d*$f_Er$ zG12lXgmV=w;S&!!h$)<}`B4xL;N=mH^N$5pY=8Ka^lWLg2aDP<%slJ1KZt&n& z8~RWv;rhO9!hrv;{o^L};c2cuk2xulRF;ERznbM!9nJ<;9&>ZPvpTQLrFDO+RE@D! 
zvFCk1nL4vyGF2$(R}cD4D%>b2>Ov*+nqSD<;%f!%FZR`vkmWo-R}IYwyI^AorAo5a#Y){XtKx_vYYg-+l>$q1CVz<6%A04-%f2n+ZA&uCvtPL@dTUeX@`dLc_T{e7-R7h8h>T0( zvE_bv%%Fo6KPWblKHS8u&Pm8fnR#KufR0xDet9QZB>%|Q4-YFhRsB=0ZEr_IPZa=9 z7%2)!CHbnwjpr#B+)3hr!k@w$_jkciR>S&m- z@$PJGsW~++G>pCf=02>mVIh)m)h7P{!$b-9`-RQBLiA?Nm%~1@CAikM1t)(M$v=1M zkSS&uJ0z54X!Q1~5pZ5rk?iDiVa8+pMp5(Z);xLXVi_;3-;PDLzgK@xYo2RMI)<@X zavI)2P%6f1Sxzl8XI=Gic06AJ-53r9%vnD(M}3Pmq8Cl$y^+BxSTDk5^r8&~XtQ=nfg`#@p4SKPl$n`4VRc~+DJ$MSRDMKd{V=}Xxn;bffM4b;iGB1VVf z)G4^w&3Zrn5BA^Wz=g`wssw_He9q-*e4*%{i}Yu6tSZDqg($9nASyeWBW} zqB>I@5o31Zn}4wIho2Hkjh|X>DEwG=6C6!Gjuda9b@ZnV!B}vgJbs2t;ULVB?)#~` zIezPdfYF|zVC}29TZhy#Zf5RTZpF|?=M-rc!p5nMGLbdmbXQ~ucTyPlZPI4-iZ*YX zUfU?hAR-IvOs{dxV^ae=&lp2J)>3sbK};%qZRcBT;2^E0!h z<%>_6Yis%W8TtBNP$SCl)@g?Td@I7Y|27a5j(&bO{I)4g6K78(v=ypk_dUuwO=yWb zZi;HS=8*f`@R{u&tPPH_Mtzl_D1ANx1ncqHsLP;(O;BK9T|ghDTj(6u*j;;@eN*=Z5iWRB+{xH3!*Ch2f_511tagvrQmQHTi1(xhnU_pUeG zo#o~4r(HL4)h6dl*el@48Pu@AU&BD9t>yh&FbNCYaYUVTL>bhfC02tRl z<8~IqY5Yq275m&N3*6#H7PO9Bm6ezt-94MlZ8;sl!2gCsm&kP+Uw8PBiZGsl@ii3C172z{H?%ZG-21LQ}^~nm7(slr4o6Rh@fT58D^ZoxUtn$`SW&Tx!ZS9YQs&TD7jE*v;fiWp0RK@-2ju-JVu z6xVynR7-;oibRZK2S~U+N%GsTlHyFUK~r+M@6><(X4Jx>|B>NEzdCZKm%>9O4HgRQ z)Lu-`Hh&Sm*Ei|@^0-!-`|~4v5VeQ1y_}rW9c@_HitA`)DP$1Kx;!^M8a0$XqB}-= z4yT7yI?)9wuon_-gsL=`I+UlkIvYw}5T=|9Ycf*e(Mql)!fFzonsF4e zOeqDOiHdn;6b`@M!bh}7pG}42QKgRt*#N81LW=S5@IezVu#K_oybdBOitC<8YvRjR zc?t&tO0egt%_9Vm^u!TNO)ka(iGC>o6F*;HAc3=K{RVR$a({Fh#uggb!~l|A?5FUk zBd=VjN8PU}v)+-=;pZClmRJ@KAhjh|EsdBF`iT+BzJm{SN-UP&PA7#q*hqjuHDbVT zk5S7}1e!l~gC=0u^lYKoGotAd;JOe|4}QN^v3a=aMai{`)vt7c8_kR4oaMg@_W!J-#4!j{ou%+T03#}Yq_oyZigYnIG_a4?m>j!P$N zo6%4yA#$wgoF*iVaVO3r4@H1d7ufn99z9ezSi+_)>LA@QkxwA77jrq^RmmKX(TqN4 zh<<8~=0LOP3MiqD>s|PMV|N+!n*^YbX;vZCPSecg7dkrJo;-W=u`8l&Ue8q4;-A)E zn{6Pe0Z;XnAXrF#g7Hid>mP=mr2!r{cHmS15DzQt)^@0|B%LaI8AG%XSO>MJC%S)R zl_x-H=>NG9fJZgA^%vGsB_h1%HH4hl%S4?l(fM?Wey_6^i*rh@3mW|un*J5P;`EtH*ue*MID=xw$gStb9&J)SyV zinBPB;UN6OM(c5F?4YDE5suoWseoRHo$r6`?EcHkJlFP&_>4zfFH#9Z*;D;)?0LA; z6iYg3A(f$DhXZ@LKd0*eG*S2`QLTU6q59vv!>PgKPV;z{g7dZMjm4a~8Kwj{S%Slf z3b0xA$A8!i_@zg&1cP2)rahx5QV0d`8{NKtD{=r_MWS^-Bmpj}Y5RXK3i$f}pKW^6 z0W;-2>p`i{rGnpNy8J)IR)BI=eP7)~^xz5>|1DQ|;_Qqst9!Ep!1JB$t~HJX@Y0SK z5ELBT|2ilFDu_jb|KOf+&6jE*<#EvWQb@D{7`xUEv*{Fe>#={Rxl+3jnYcpPeaZEL zM+;<7A7rKzcYM%#xteuAuWD@B2CIo8!UJYIBpUaOJRp(guIulKN3l6F$nFe(N+P|l zsZrW!2bVa&Mtv%@TUSwH*OUUG4#HKve!aixjb8!y{?gS-0UyLlc&=yq2X|&Mhg5eT z*FQi)_?+Sl`NKYF?Sa>En?ELEIad(uEK;>R2yynLSYI1~VFOEt2Sj`FudUYx3a66# zlTXTL-|8a$k;|fJAF2r_BRhU$s>T%opp>{8G)DMBCY>mT!wH};JrccqMCjGaQ1+9l z0zL_F!J8@Pgco^$SYyW~C|_=}`GD$dt|FG!5BmscBtV8;&Vx7y245Eke!KzN3icKd z`}^a7-s8MBQ^9l{usGO~-~}+QXG$$n#T3d;g5mj-dz}~nJX;iV7FX=Khn(AG$83x9 zu#Xsl06!oRK^d*>?kooK=ns@V8NI; z|9Ziop*WYfO``!6(HuZ#ItLa()j?bRU;sgAN@~wPRqx5lX(-cHVm3bd*9Ub9w5qIx z33PlvT5UX;A5eDWUpv&lhq8b#po$+avOQdM=Ks8l`X4O}{Pr%=98@Rg&i}m6!xI$Y zJN?enb^U8WxUevKFAyAa0sk^M;+3{*pDhT-;{clOy_J8or4#$Zz&K4E0k z#96Bn@L_=Da@9sCl2NIR)qaB#jnx`_ERi#Ef3h;dNpC#?k$ooMn@8_^fqS7AQQ2lx zZ^6U;Ve~w#(Hnj0sEI67NMUq+j*xI0GH$@kV$!P zZ2iBvcyVw-4M}M6@z*JRi5F2-BfL$EhoW`Fz)zzNhq>cRgA1OMuO>!HP}#)NbSopyy9*(Br2%kfW@ zi;?lO!FeZC@CixJ(=s(Q&)3lxx?$txqRzQ~(W|y}(O-bETQs@1KMNw;+@^H2b#x?q z6nDSfsKE;qqpIU|s$z{>FM;3X>X;#$mv3Kasc+PM5BpA0P3mz^QKi z$~oADB_bxGeiYw2=U1M-%fB^?LRC(x?jSi<=Q!1kWM?Xy0QG3pw6l=0HwAEe}z&xcIk-#zTweo(SRs^m)$l8wR+*N%+*XkPji! 
zW0DNf9Hq+FMn_1-e8UEF)m9qzESm6LY|#^<*sdS2VJLO(u9UpwaZ*!(W)2+*24a5x z#}q zA0ChJG#|0Xu6+VF(f-c4mr^OV8$AC2%6t*8TBrHuS5@-;_i^33 zC;grYz`H5{7eH4*(Ol45E}E7qu`o!(Im4zgv-|rTcY9)KUK!q4b(WXoOZsUzSUK4+ zls)={&wZlmL;Ybsy{AXuE8hGbXnef3Oy5bO6jNO_SOH^eos2%hG*fuvHKWfHtYL6x zPSe6urt-1Q6XansD?|WA2wEn?n?D+O3(43}tWIgC&G1vORue{l`}9dpE~AEW$UsPv z&OHI~NbI40t3`dN@+2y1hfjejk0z*Id&s)l*#3 z6ImuZme&Cd#97lSEoO_p`DRuX&*uco&Q?K_LA{(U_#kG*J+(K&8p(9GufCzW)aQfN z`e#{(Nr)-y5Lf(|LF)Mi1A z)gYyUpk!l0&G$$8zc6=sO}h~C+<4f0Gk(Us2XdSVkKn+Lj{^-C~EF?-B)Gk zp)8`};H{|OO;Zw~YOKlTH@Y8{)W|ItydIzJ9q(vXK{%2`r-!+uNqdr41Z~sY%$AeQnF+E!bPxz0ho)wK`N+FBFt@AE9)&CjO+2nXIWLi7bFftONHkBIN=X4b z&Rv&(cODJCwR9-eayh$qowv<;tOEo-oum;k))q_hhyf0g0f4kJ1IQ&2*niC>#-Dm; z>fyRNwdW`@1s2gCyN58^CA{_Bha;pD)jR;P$GcH$b?S~b;8l_1WW7tRtWcJ6Y0?21 z>`6HD8sKM2ta=q|QZiKYn!UOQr@zA$Oe{Lix#Ayo;U==W?83#yJy|fF7>s+Vr^FX@ zXttb66w0n2x?EWH{UBY-gu;_l5#8*S{`C7=ozXwcRQ@UzTm|IsV zLM_y)-s+^G(r!mEr}v+`U=_iPoaFRaZlUH?t`2!y@smJe0q>a%uu$%b+~40_ zvWb&-#w?_yxP;|36UHMBHcEY73tpHj-JM_oSmh^3Y0wHfY_(yW%saNoklB$L4C+;t@WP$rnmn|BYYIg zh+;M?5)0HmL4U7(BJn{pwE|i=m+^v&4g8KUn&|tEzJG4&cl4+56Trn92SaNp9EIi2 z5BoEa8B6aD-0r9u#Vg72=YbINXDG5*{|+bSyNs8NMpNg@X)a83S`!pk^SRRB1iuDi z$o&d*dbYa|@}bUbG07MXE>bpVx&j7@IuUM#{CibZ!WOJ$7DVhB*7rKoE_G||4#9H8 z>O$Bt0qrKdmJ`pHI|v1VePR+3KcQWG4m1{&sE-GArGVmt?*>-L0kkmNsDEu?9@9Yt zV?;#onU!eNiRFtHBBsjCnDrD)1J9h2$->z4PJxCq8gXSZoN^X&G!TFE)EMko%+B}X z7mw%FN+pmys@{anb-#eo=X6Sc_f^;<+A6nQ!<(y)i>4CtyiPI54w7NEST4>|*YlEd zht>Dt*O%>#W)aQQI?PQdS&Xk=NRHo1f*olu!o&;}9&#H*z|sLhAGVWFCVp%B7Rv`i zTC;b+(G!4>3{T@c!AE{toav%DmDUXn`PT4;{zVnm(4VncweB}SLU`ilK=BOd-%$Et zcWMB|d~efD_+PZcuS3}j-oUxpWyZPwc9ju-M`mk_R!-AIE;GKFL)4>qQb_Uh|j zNDS!aPlau)GSf%2lqW~*=LE`5Zf|!Ntpm9pbDBgtk@KCShQ+GquvnF$7tzPCaWA3z zdg~UT!mfXw_52Y)ePH^<-WJmX)Q67JAL@hcZ~S8TFA`8Of&GR~`4;ebF%z$^0W5ud zU5;C&zaUZBQp~;X|IFD&9c{|pp(aoqDzUnx_2S18vd7h*s%C2ql(v5zApj$Jhst%C zq3on7eZ?}?vSH&)VxK-Nrj8XxC49j)=WPD0NJgoWaTV5U>@P1D-syG7WINBN5^H#C z7>@pExZa`6lZo{q{(glm6#Y6#?}5WKloH^97(HOhDeoVlVHXYfZ)hQs%8N|zA9nV& z?KYS5ft}bbZv59J2~agA7LISVXJZ<5`KJrT@YwT!xg)4X>6g-+3sp+O&}Vd&N(hgv zm@#gTHQtzMN(TKiSjqIn4r>G{x*Z)ePdr~h$ z@(7QE4uAH-C$V)%sM;SB)4Aa1<-|^yeY2{h>XhrU2cJl2NiB=YS{$F9oSa;jm-=@$ zdDS@dp%7$b8)*QSqTYXJfj^9+i{*0b!~@uS_CRg@zx&Pse9*bY$7&K0swnL+_J)Tp zr~!QJAyO_}?Yg<5K@Y{9tHpV;@?4W1s#HJC$Tq|B`GMV$r&~!~^!evI2Lblb9N?vc z{)DVY=-LvC_ehT(-QRzmRDSvBk@%yB`Rv_vG%w<=*&Z65{I{E;r}Lh&If3E%v{Rn; zjikHU%F^ZE=z)Wx1ds{TQ%+f%l14nSvcANK&%3XG9h{$^pPSzYyK-Gb(o~25|4weq zTzgLj2W&FfN&OI67FAGKtM~17Du6=)S_d*^lMO1q)LN`FO;xOfrWeQeNc$sv{J{Hm zaQa8N;o$HzTBq_B-;Rdg`kr-XOxLl(ZTgw)b&p2IPuA}E@`c^tu$H|qT3hJy*Ji4- z=+4M3-lsdMMq2@6ivp`7B+2}%$|Ry%yS7@Kng)4ZSC5+0XiHmAc-|kDK$oU|MZQmR zX4jVy^RnK2t=+Kf7Cnt${GOy5#gjirM5{IGg37w2a3S)}gEI2*Nnn98Jn_aZMKSD$ zpFYYlelLWLYE3Qckv}^LE7Ug_GjCMe8WHqbX150eJUywn`ZR_yKE_%7MJ|4?)4XP3 z8si4vnIZ;2gCaxHYQ+56H^{qCwlKmDsE+S9G0CyRex(=Hmg7z(yl&Zf0OU0shKO`kl%==~YzxPsG159_`9k8m_((+m> zLr+uErDt~MqOICa@*RYul$if#EOH)$hzwcf|MTcf{)g;=$0ANEZ)ftbOuql22ls#Q zGO;iR^lIBre;UhRO6}yHy(^u5`!vQ>wrbH`T`;{nkEoZM)mTiC7TWAe&+N~h^(ieo z{6&y>;z_mY8oN+w!ACiVj&jDBwZ>Zvu+ZdlQhA3JZ`pK>s6k|iak*B5Igda4MP(kd zuJSkIox9wR<0T*zv%8o55AKEta#hF5kPRBO^b+=G&!#_LL5A#FGdmIX63X=o6+Tu& z2MgJ=X$-}kqwfNCqEFDN}e7N6cltY z2){3|=D|74{>RqtC!UBB`_e4}03Zwg|GU(`ctjUQuW*th=1?w;|0R6_t^GV=bz~wy zIs16?u6ZItEm0ey_X;ZDba<;l--Dps&=YBJ)1-vvOr5%5 zl4D$plKb&$L@$3MNJgv@3o`PvVoNsJmc%xSgw-ZO+oAtG2%+Hso?I#Auq4Ukxd4^Q zc{5mzw1&|059W>5l>VhfIA|B&Z~%oFu)j$ ztCSj%EU^LX$XZYVAM6k{t&Xr!&#ze^SlgRtIw<7hjVM-{&1pFxG;D6Fs5jyh)2GB{ zJ+i91F_wX887sG{^o}+cR0ha9HTT#oJp$s_bt|yin=34-IzojknWm5|X-N)wR8InM zx?mH9rp+~SG^H+D-Z1r|D=jf6wINcn%9Vsp&wM|O(s1Pt#4mx 
zmc{xb)OQ+dOlvz*jOCAg8K0Ta&T8j}N{&Vmu;KT?eFR-_il>ipd>0(kx?{KuC7z{E z`86HjNhT9Uaflgew16@ji8Sq}taM6uO1G{4nN!ih(;CsoEGmFU6DH?~Ndsmn=KZTj z(<6Y;R1-U3BvR7@n3>K-W~Rx@QQtggFS5k%zMCZ>{RRt@Yf4PC#gjZY@4}+nd#2w= zce?0GjFn&ULLwKZ8{%vY3pYtka6!;y^leLpk#uXS5L&;<}%S zfTf8S5;-t>ZFQZl#W{;nU+M7$iM9ociGbvGgtd3Z7mPu}_Irqmr=WnVz@{&Eq-oGI zM$B;O+eIiAs}0fc_iQhe;>Lr^F`QXne^ZLr9YMtd-xBh5T^?21o`OequCd1KPpwAW z?SL(7+FADLS)utCZ`x=B30mdJUlWox3aGZz24U{9LfcB-@%4J4W0m}^s!pmq9&O8dX-Ac;d31sAWIHf>b;_>!rXZe#rD|KtIGi&CrQ=7> z0-Z)l#SpiiFuUmppuS-ZLU_t;JHi?VRcaTi7L>Py4cytVWZAz~st~>?OV;{$RRBpD z9Mh@WQp?c%p=?=O4ZVzkbGc)H74%Q>orjW+>a{p0F*t3Hgjb;ydD9_K7ISv0b|d^D z;hotlk!o03uIt>cGypk>koBK#1l0$r{YBc<(nOmRX7LNy3VN@FypP@s_MGQ-j8x1$ zer8z6D28_~6=d6%p|CYXf_~Awf?bSJf!fvw&=eipntQ&E6B_5gM_1>3+uO%WPj&=X zE@n7azP;Bo?(J7;!C=ADk5mN`mW*Xbms}MFSjJqw2uNG~_P>@wTDv4vv|L+02KBXsx-#9lT5E_iAEwB_AS!L&c_hq}1INR4Gr z(vrDrDkXd66;uliUx!xFYvg49P3Yj@#uDYMf=Ex>auq`za_z5^@uXW!cN~guhVOY2 z9o9cj>2CKAEr|Dhs+TZ5!R(b2ts`L?stcPWq8lGiy6ZPX)LfZeuoWB{eBu;9leZRvCV;&Y5EmjH{etc>7t&~a3uzm@tITD*8%6blWaA%&z z;qxnrAlvJWUv+vY?M(*_xB~$oXvPv=6dOL?Z9}wxzSH%^O~chZ$c+KTU;tmIB_%GEnvk zd)aW@bWmo!kqr7>GHn3LmNJXAl04;IS_5vZB z(z-X2dq0Z4d4JiEG*kJ7*Wu3&leORejwUrR>PB#ZurqUQYXULU3$pdW`w^=Cu&L|m z1Wg*)_j#!sB!;a0JRc%Cs(R8Apa}_|NQ{W}AbL1PN)u>B66pR`c!!h31j!#cyft`= zkmP~0xvtwz#+vzO1MS`+4q@P@Sq^Qg4*C}@lK)}(x_a|r6v48D?SqJ?Kzt?K2o=xV zxIfKB@R80^>KSoYKkaa!wLbZIRAw=lN-(Wi?0{06IM|9oqCQ2sef4258F6X;!rkV4 zsk-S4gU6dFxu$9AV6KpX?7{^~5l2i_Rd498D3U@@)%O=+%Rl@W09O`@`^>#j2_#=9 zouxDu!-wo5^e>)A@f_Ml>p+f@WA^cSAKo6h7|A)C6>x|^$To>*5Q*d;FWwUp{JbY} z`m|Jyy82y-_ex&JOl-)sZnmn02zUlx6F^ca!5&fp@9sfBUNM5Rj~k^1gkWLVs5v6P zdU#k9udBmueaQsMxrnu3zX|p9>=M-XaRV^Bm|7X5(~PPdMKjh{#uAWL^O_DPeoLX%;xx=Np+37 zGr@RtEn32*6PX#hPyNMLZLSHbH*`b z|5XZb@P0a@0&OASkGSmYZB@V?uT%e?FMurCJ=mgffQ63pJpN5SO4F?G_5~w^$5QtF zZr!H)eI&K9Jh z-{_=`h!Ff`;Hq+{0HA3&((uD-$<0N=9U^Vf?*SC5^yIXCL0PD|$;6pTt?>{+Y-~G- z6l|R4D}D#oUl@sYAmz(MVvByMCrT+FDd6;Lj&>nh{15?>1j3wW`1{^su1*v;ePVu# zfxoM!G90L|J=^!XxZFA2_2AmFwVy@acWE>dT~Z=~@)1!q0IR9y@QU^%0wHD7x9-98 zz$?7@d)a~ndUVYwWuXtG&uOR9A_S`OKo++5CqKm&FAMfy4r%)S74Hs-w*CO$%U)ww z%C8s|Nuos>SS|VM71`93FY5{8lBqV;jqnYJkK5lpE@`h@eF~V%Y)I#vcUSniZpGtS zmV%ocWxv7pslmjsWwS#>)bV03p$|6K-(5?m!vJucu)VFOL1=+718{2lP4hQNjCLjY zt_bAoG`}L8+(h!%)6I`tTsC0_296C|D(em4>qm)z&lxq_dgiq@CU9BG#IyYR+G4^; zXjM^eEL36pGEK1Qi(yYrf}p0Gsace!rEqMaU&4hq0bFj&Y~nzsbuD+C%{8PQMIMP0 zYoQki)BsLYFGG*YK_6df>J^nP5&vPJomJ0R7AgQvi%(Ne>US*$dV(IpdR4!NcJA@t z;VFQ<(qbr_?0rx37F`q+dfv75YaQvC7OTTn7U*?ybTEL&E?*m#Wu}_#3qdZGyp>2T z^%m8q$_-n~Vecp+O*bZLNqR`lhi>@}T=|KfGycFvf8)i29Pz1fSe_QkoKpdG`iN4v zo)w%3!fa}BYO-_k&3d!nQAoaeMy6l6JoUW}DAX2Pst%qX3U+l~zw~18 z(d)MADmWwTl{A4&8Hia|dm$BlX#vCUz^GGN)nK)GJb9)uj9RnAZ{KNjP(YW_wy+}# z_Jzw>errWT^Gb|`7nQ%{l#NmoJVr#%iOa5=SEwVP{9e80PmO7~QEwI{c7i`A@3WXU zlC2b8F06mh!ahf-1>BNv@boWljBz0ue;Ohc&mW4AkbhC_i@Vq<}x1W*?f~ z;un8)@LfN4p@}QG*ewfbObB=ySUSk%*h45*xSD(Sh_yG)h$1lyXh|_4-~0GME;&w! zBml3|qm%E(d?Xtr;z&B%m4S8 zp%7rbj5RMF(((V!%S1tGrWTn#{TUZ_36I(KsSDPb=pTj%g4YntN9D8}C!ElGfv z)-7;KOq(1NVFc#x2>~+v4q(8oA4f3O9yMnV6?Oysboo@#Tt%{YqUX%$7mjDjKuo+!|0_@Oz^Ny&n)kUYG^stzodWFmo)RszF1TDf zfZ{-8yVwfg)=46oW<&v2ye>~vbUFY)ANJ7IHrM#H#b2i|D&B&yh+?kxCnjf17jq{|f8m?se^5+qwn7o$FenvS->Y&_LD` zT!HtR1538b<$8ptTd!Ix_R1}xytgi8&#;!Lv)GgECQH6E};PtmXVOI~zGDN*wn zwF}e*Pf|}$f@44D%jqfarO$4UP6DdFHyJTo%C{~CA-WinFrD>_b>iNXruW=7)6M>3 zl`2i7Bh_m;>m-Nb+wF`zMz`w;jZ9V6yngaC7Z826Ka$bxV7BQ~^S58xkHh{133QO? 
z^fe+#A}0%rj`#sYw@0QZzpHGjjVg0jUtN!HoZ+G#vwKKnbsaXzPLwIUT5 z-aN--x`&ID_L)*~g@o*T47rx5N3s9~r@-gL*fd53OepgoNsV1VX@mj;jFJ${`APXf zxhs+~aSnZ*C9hHhlveq+^;7vkP2sW!aPrE4Oy{4X9L@){K?s1*Hlfw24^XJTN`Yff zuvBG!^+$8tDr;IM(X9SMR?Gg6vfAhrWkeIl#Vl~0iyB9D-8Z!I`X>avm}4p8n%|h> zjTM0GfD9=_(4`j$tu;)&+Oe`1rfj7;Mof)9O+bY%8+NGJJOXjM>QDaOe`FH7Aud|3^)?8+t zmaNyGe1W+wXfy#`z>V?JL^}N4+5Svcoux2i9HJ|*6;myt{yb@fQvc!(7PR-UANEG@ zp}{ii1O8fvyO;4S`UvC`*0_^;K0NVkx_wGy480z%hi@*AdoJFu&2+^$o#GRl!8!P> z$50)J9NA?*iNal5zj7ID`~p_odblA*l2o~pO7jw@CmNm@4dXtPuu!gf(h>~U+ncHi z%DmX)LoZ|+4=DBVqKT_139PU*|K?`QvI%QuhPmxE!7NEpf)PRJZOwOANK=ibR4r7~ zg9=Ch}hh=xF2?hU-Scx4&gKaW^%>(z-derO{ zi8U%@og)KP&HN6b0R-PL#jns&wSDX`zBdA~L~CiZ_c_G2&dh$fx4wf|b!|OU`rJO& z-WizUokg@8ibfEC-CJ*$-ZXlQ&p7np1TGb2&W9velOmnBvZLK~ZtsHML_F{9iI_B#%VuC1YObn;`u2-6G8)&&soxk-!Jcudu3@wMbk4EoOizL3iT}3U>LSl zr>8zLB8~R+u0O}S{3`P-CWwal@yVKlpjYQb@_uctpxa`!`OS5u<`dWN#_!52WGAYd z?pvd8?U1`T^7n6@EwwHg$tKhh#kvZcpDpRH-P+obRuUucX)?ctKkljL!#?XE9UZ>Ps-pkW0#o&juYHRRAmmK;U zG{&;X7{li>L323AWkSnwTU_u+kY{IKH^Xl~$sk96GO(C@d>f$WmcMq?y>8%CWKAq> ze?QJVqqRO#=Y2lzd1;zSE|NC1e#w-}Ie$_K?Rx5dM}bRGh?u%v0eVtY^1;Q&Pvt#mBanzWwMey*}3uY>C9 zH2W=Nso!AI`fZ!I=5-zK3yw#*2U z`M)M)Qj8W0SHp3+f>DY>I+A%8THt0nV@0l1m6n&ErWsp!0URaD``C7q(6M+}UgdK( zCflEAgE=?b*kdM1Ho=^lQM4HX2wlmQ9mR77NH#6+0cTV=kB(YeO;5$CKz7r3XwBMmB7Qqd0!d3Ki#35o0io27J#EmB$42^@ zN+Yo|r|u)pHPE2hMVhE*ilOA@=~E5dIDTMZuO8OouPg%l8}hSTZqs6HgLj2S zBrR5EpZQ|>w7uu~-Y1UPN$c^}S4UeSz78WHVcg=0&l(j6gO$s@G&Co3yqGfI9Q@wG zbUPd}cjtZFt5fgvDo~s2(a`G-Kk>aIv|!7!Xl8OBK-oYyW*`S3_tg;vE=2klNxw$$ zwM&ZAWu4IpYimx?V|+6l-zBwt)dJMdI})T!i(f}8Rjfm~QqZAVT<^uViCge|VV*nq zn~oqc>Le6MS(&j6(G<^Y>((gql^~NiWi%d56eU)Ue8S2wFA`{DD5oRzA~T?hBJs!* zlaw8K7}p1iH*tg&OvBnuyhF=|mCNphk4t^#FIa3fO{nnA%?n4+$wo9)A#OkwQj4SnRiuhWdb)la3^0Lmy?!N84%(u`+Q`k5}|Nvf%4 zlthDz4eHZql4fP@EL;kcTPea~B)iZ@ik8iSXe5YI84y%(L*-=p)mpJ=v65gf&u<22 zcA521&o6H~E+OKT#B>uxTysC$VxF!$G{Xz+F+LN3^?MSX!q)UIv4&&~qy0c9_e%i@ zrt(-lk*9;_u_%y{a<9F*&smBJ3rS6+^Ogp5-p;Rwu7yx`xWG#NfExt({ z*%ZSEU51=AjVDiPi&xfOQg4R<9Np!OGVRVv>Fd_WI^QGqFbu|}x}N!9a*Z7enMJ2} zw~-YprQUc6tm-HXy@mSO2#u#>XuGAkL2mQ+iNirPuTK4+zH0?>3eKVgvd4^<4lDS{ z6TP6-zFFN7NapzB)(8&c#?H8!>A?ms9rx6{@-hh2-qZc)-*(YdJotMl=<5woanZ|t zdYdIaafj5f6h>x0{$AtY=Vbey0Pe($>4Lxh;TYRT$eP!yO;~|G-ea!=`9{>&F+q!s zwov&izE25KvM+xiXr>umjxjJ?@__)e3ODoYQrJ!}-359szTnSY#OA@qgB`Jd*z-aW z69_2woQl-YBaAwvLE`l8{8(jz>2&6xcw!&A8H`Yrft1#y(x9z7z z!-iY&F{n~;P31$pQbKtvvj0e`GZ46vrqTkTs~s8~`syyo5LpkC&v(3t9+N7i@ljRW z&#`4QW^UW(Y+?erAV(YT)?aA8j=_306WM~BM_aKdqojE2;#ez!CYItgVK%64<=>nD zF|`DC%2GVPoJyxd>r1ftdz%9Yx2P6l_ma&s160I%T@L#h0EM4 z@;Z&(iEj*YHN5|+H|aDwpBO1O87pzvxqffwuh>vcp?nbXoi(w8sNlIrXrwIar1e!# zx)O;d##o=*uR+Uf55Uwt&4{59z{7 zQdR{>Vrx=gJI}+~N4~%HbUVMFU*7PD(P)yR%GYgp@$T;8qYsXNbGK<|E%K0@@dl;H~kpkJOmp+vn#dmn-%Q)P%Le)1DnIcwgy4RxqO39v_$c zIJjRRcvWXogMoJ$zDO~8v1t{fqMN?xMFdUiIby-4fU~I?mJ(`Et$B)nHXUo$ z`tP-bNZ>()E4NVd6f;M<-@<4PgL)Qz(KZ0)Nf^&ro ztb9lAKOYEVM6HUu=2kvy{K?;})hNls;`>t5Nl`n~2N5*f95O z+4KHARqA*hS;|=RXlebW{Y5ubOIuJec7_3KIdw%#!h&dbD8R3xfNzC1UJC;13vh~l z;cb>}HeGiiVmmtwjxA;U%SyRUFVD~O6ob`fRP6K68ta2n0}M3zWTf1pq43m(-I@YQ ziguS*ONHjMadJ7P2^^5hgUqGbYxXT7?tudZDYGj6M0~r4nnJ2wV@CMyeu-+-+Le%O z5QgmOO8H{&x?@V|>*$s&yP(OLsUOYy90^IpodsFx+ z!iAK_X2DM7k5aL;fvf@o@YXDTlKiU=8z+ghq@?d?<-bgxzXh5PxDl)^DLQ;qo~kHzL67M=S2jKJmyeN2og) zlMfetrwK*^k#T}s@oa$aLkSP>jOBHhE9XkDANY%jlqE@f+e_&Tx|*$W@zl&BSL9J&;9RUeIUjiJBlNhpeOg;d z?3yXFDTj-BtnN={drxb99?*BVv{8CAdnSJrc{RBFI_3-|l`04j&qJ6E>(hN5aYs*~ zgw-Abr7tx-NAeKy31!r8DZDlu)IFcCbLy_w`hjAcfS+RL zjO?x_>EN1#nEs)wM1(l3%!!$qex63EzwK+L*Wi~PEHRAx^Q@Sa4*ZWQP%RR67S8a_ z&l-+(VEcghwL3vKa~$Ms;|QYSEbZ`vvz^)9hnWyHAwte+K<$Y#*R29RRR`<9;mh5Y 
zcxaI5bQq>D#CuTs@ze^INDubTN!Mb@})afIbR&Mm(3;K!T0}!4IItZu_>NHycjx5i_3eBkyL}3-_6h{PvVE z#s$vceQ5^OeqUBfIzRMnmIS+bs{@P*X@2i%XoL=gPG;Wk0SwYdcu5F2zWYRP3f1)u z)kmBkO6H|Dpjz{LPKdPMcd{p|76l6Ij@mmJ{L113y?MU!g=-#FAW1tyVZO4E}bWq83hSc`+0KyR%BY1(De}BpHPC#@OYqIw5AvtKxja$Og+34F8Qpv z=96^S*{o<`t2f4tA_S;EK*(J_`Kb4sv0WqSrj@^Fcs@|(hpb1urPPn%=m~g-P&GyM zJ)-=?=jHW;H~Bc;BEB0_>OcddHxn+eCsSK~@lM{5k@x~lr0~zUk57riZi{yntAX>h z_PI)Q|AR+h&hAb`9S3RA!r;v&3m?8#ETb6U`y9Exzi8mf>FX*ycoNTgH(9^J@m%0Pei71hSi3)lMa!jxW$-A!b$=rs(-FV(}u zZS&{GG}G1e5LYgpjrgV?9kcj^tbNp)XV$H#H@h$DP;p0T$5IbwI|?Fbi5SyYl(_eI z$1_+d&V_9S`Kdbt0?3(6AqekIAX2Q`rf>UPKXs`fsp29x76)?3Kma^z)Zs=CyahHa zw%=t2&aF=jE@j3YB!Ia%6tIyf0hn&2-QjJ-k`G1GiFB$MKx6{bQkq9KTUr)|4GqWw z#idgnW`h+KeU)?#V=!;hq4M!zo%Sdg+F2Um`A+N-F))nPo>L+QVf<-~2Yp2)7Q)>{ z5me+ewF6Ktz+gnejk^_IwbuMwg9Xl#`N~Qy1HlzwuIDH2hq>m5ffkq=&V?qCk@Dxh zT_mYHHiXWqH_;E}RPh^~pnaD@0QmASRqyn&0N|@~U2$?V1)mqArO^fQn4T^T_ldO) zuXy`m`L}Hnh@F&rOrwBgRdJ6+6FYeh*^Fw-E6^{uJYerZpVA(I?3+&+DFrJ-L^AS} z;64N$Yq5ew>|(QC!s-w8QYH;hFBuqq#AzmU*rGzcjU-S(9U^v56q-sVYiSIw(DN{G zb!jy_h#lx{KXwRv7L1@ZhY+-#*HJ`04Kn0|@zhHk6v$o5Ifd8l7kgKx{mJtsiWXEaaYXX~PHpZ$u_kF_mb+Mt zSJhtUtOFpypgdVbNMec3Bm2gC;^C0t8;?^lOW zuJ)yaSGZ)Hm#>;f_Triy?)RFXe9nOp8C9=m-DF2JSw+^U!UDrkFZUjUcMWnkVr9?^*i2+E5beD8@3rGvn-Jx_hBHb<0 zC0*wm%(c(m=Wbp5uC>>>_nvd_{AbQF=a`P)@B7~O{oZ(<=S$%BeJ^kE1=xd}uw&@n zLwhQQn54NcEw|6EHzzJ%c$S)WM$=YSG8ajTcChbM)*iHj19B^C0jI&_oY~5_57&K6 zPDZ#l{$~*8Fv)m5KnZw6{PzrLsr-c*35J0Tr`J5$&AQ4MK)UKICnv!gS?V;GBvqS| zz883-lcYsMN3G;#wa4GM-Xy!z}PY>hz18U=+qJW39@JN736ABh`gE z0e!){E(leU7qhf7fFbBV%d7qQhVovwXo!E)B|@Rb*N-IeA79}iEnMPp<}Y~rL@1fw z>LoEWA#7|IUwv^GbNUK0b&a`$Czqz@9VJUuCgV|-pF*{G`IBv6B*Wd6)f0WO2k0XG z3`#VcK1`+UI4$?1*-42ZLWB1?2;TDneW_OG_()f84vr^gF>C3#p8h&BsAX|?iub_8 zDfw)BJC%!S27m1IoKi;EHrpwqWM~oN1GrEO-ktBYD=uDIf+A&sY~G>k3HqRS%ap-= zD{ibodW@>f5D~)o-r0HT$7e)dfr=Cr#l@=YGn)xtH47=c4Oi}s+p0H)C8iTTd? 
z1?JN6H9cNr)$Jm=)2}JBb)qKkK&|`j2WDI0VWc*wrNP*P(0AB+M&4;{>*PuSo*$Z9 z`)M(@uf5VGM1>>2dH`vGL&eY})4n8jgH?C9j8n^yQ|9rF-*~4A3;Bs5x9o7n{kd4V zjK9*ndpl63{#{pmf4Fd!VOcp(nRX9^(lkP}YsbL?FMv$xXijzlyLLK(1 zSH0Eh4pwAKLNPn=zM#=78YmX8``ioQ!b=A5uU z+M4Sn3RHO?KICRDHaFd}WA$+%^TSXWt0-kZ_2c=&*GMx1;P|C!$iT`EmD=#CBc zzIR@ISn56^ZOyMW_@?k(eRfW*XJ5Tl9T5_(3_7lBqtNDy6ZFEptq4`kvJ6x!?i~yt zNWvAmcgD31p8k@p1Eke#H6rzZsHqsD=m6bOIGl(ABp18Xf4$vas(Fddp8prF>H3>*ME$mMDM#j zE5K#d+tqUFCEGPZ3b^^?d)E|beaf=Hi!^R=jrEenU%+hS^tG8`2X0hzL{-5rpOdh( zV;OWpaMlAOQH@w55!Q8ZEhvbtBLfWp-HKk(TUdDXC}W@E*L4@8!FpRsWR4X`1+Q=r z)XtBHkY@K}ee3&&ccwRsA}h+A=3Qa5QXhC@g`)DO%xWSpYto%Y{DQtNgJIra?fw&KB863 z$GC&>yR-90_Q1V#4El2_>wmNfdbhyf0=KiB3^&wX z*>edRO2nH;_AFeX4R8I%L;n550^eZy#sz!sT%i8a`lP%JJZ6HKt4jjYlq4qWV!-`< z^pqB%C+WJfK?wRfDLi`@J_Cg3BIoqE&EjW1ys-so>YnuL*J|zcd*;+nXwTB?w2bHyF|mgRIB~7miV@ z0jIhwiu8BF4gZ4m~1svSQL_vol=2U=iY${+pL1|)i?U?8vZ=TLGY7}pGiHbhGak= zHe2Pp@8p}KX6-{A*vV04W@E189a%=T&oPlSG%NKxG~B~o4u5SXUL$hIikEUPrBU3+ zIlPVz3SU=huw3Neb9y|74UB5n`ihgZ8}D>)wt-;WLZdV8o`u(E41_qHq)C-L4frpm zG%>XSAB22ODe8%u;GDOe(1ymdJF+}U*0$QdgfVD~9nw$sp7ga6L+nkt{FyPRi$JH# zb{W^b+H}m%NUY1fBT?dtlxLEtqD?%3mtjAmsfkFT*}w>K1+A>9H>c&SsoW^=*N(jp zcgXWu0m}nBA%2-R+Wm{rm6f|Zvp zWA8C(Xuetat^gqAsNHxD+k%xi;WqpZ?EKQ~{P~h*0kR~FL2>97<(rdeir87j{$kHn z%7khVzjK5?W2^2jw$l~N)y%6|lUr;+D)%)%!`cx(q}U6n{e>0(f-s_FFy&a(nx@@c zCjyM{?IYI;n3?(KU7VIW%5-@s81O=r{2s(I2LO_rU)77*Pp#Ysj8A)2D8HXxKT(%< zqBP*kOqKoNl1ISs8dti!B!1c51jVTc>8>UkBTT>4e*$VE|KK;0;dF~!k+FLIKgzwK zl$)MHZ$>?#U}PwU;W3&()Qs0=&KsG(0 zja<`cQ)O%cs1ti-EENn+T&(&BDuj2*mbI=*Fc|>6_Xbg67t+JI*u@Ur}RBe9?9skeR z^BiFVWs@^DO<=igJV>5b`9Yaq=TK0e!EcfLckr{QihFfXMN$AivZU3+g8uLm3kg`j z+btvt8&FmY*bU+a&nBTB(xK`GB|rK6nf!DIA*D8=v%v(Sn&7+h@|~p6U4Ati3sZh8j0iR5iDbOVa-#v6- z{{dX*jqkGZ9g6II-h-qGvsomW2{Z24z=p~j zF#OYg7V$S-2IG-k^~10KX9N-)L|Q-Y1_zv9W#9jWPKQ|Pvge@sRfpJYrjA%XS=Ns} z)>QO3-~N-q2o}JYL?RN5EkMJE8UZFH0|0Nyrd`u&boVZgS*q3$zbSY<^DcM`kbDP| zK(T3yc}$ZFWs+|Y)HoLRLdsS6Eq~bj#aL$ncK`RZ97GCz3h6jHTU3PIVAh9>kaCr{ z@(2ZvyHTE~e-Ulws= zbWPb{1s<%~M{*@R>tSj1m?)V>D8E|mR>B3)=_JLDk5DH}`Ug?L-|}uB0*-#}YuHAz z{i4UFS)A&$N1+6hNb8-$2LuliC=igq61ubW3Pvs--FC6ks+q!6s zWA8TD2NIsP20_UcIV#TxtJj8Bgag1jiYbpHHksgeAI3aXkwDS!F_-_F#76c%BC!FK z8Z_Y{E4Me~?xdHfDo>b;DeKvO*+Lt(irXtC^Pa@EKuk^8*@d*7NM^M7s~gl(rhiIA za&5-M^V;?I_b-kyy`se07uZ@R2Qb3WqK_`iK%$~L>n~XYZpD8@)&L;Y0es}v<>v9^ zR}Wg1T)7Y#Z2;4`%(LI$7jin=@KT#j;klszuvC$d|Km~YPj8R?R%l;4E8#DPw2eeG zqOvp|L{_g8dok)U{79{x5Fzy5XwAFC8fV zW~bMVCVtqM*2jyO^>ua zL*xZcRB)g>B4CtTG&b?uK6dRxqo`dW{yBZ#3lkMk?~1%V8QVWnvdxEL@payn*ib|g z5fz#$5NjerXkU^AJjPUOo2wCkUF&`0yR}yrDiT+AV52i@O1U*+DS60Zm@!H=ZRexc z^;#PseqnoSEawo(zk1xi^y7Qk%*BHbdVNu#Ov|ZmyHYA?iPGDWHRwqz!Qm{qgn^bi zjzY4Q?2mfwI6;!OlH$a?EMA z7T|p;4buT2<>$9mM69a6G3K#yioFmEw0u@_Ts@jcCJ2Vm38g&8F)0Eb#J(qCN$6Di zC00NQnu^Ld;ea`=tu_IbBMA9|U0w!vd!N{SR!JwxSb|O=7*2WN6c8|5!F;|of%C}x zWiTQ0(ERE8C(#Y=xXrA))g2|Zi+KcHUA*YvAPdBLgmSBwSu+&7pHVMb@%KbW{Jm~p zg^^w}GD6tm7beo}Rh+iHbkxXeUf#vU29$Ec{%dqKZgcD))G3b_K&i#7ZkG6z9SAh6 z5WTd%ZCEuYxjM0I6Fjxr6J+_T+ZF4;Oz^n>0RmtWaR%X>wX+#r%1%=;VZKb{KpCwa zCv!J$p}w0Uhs>~Dz88v-9i`K1^u~Db2x)TouH|r&vRwV?c>*_rn-ud`?T#{uKq%nB zJ&uT3HF!YnDLQe2QROdJGfCwInZ^L$45cd03NS}z^v#R;F%P%Sau&C!=Z3z@(rcJ> z4=&j&8#^Vjp4oiGmrE4ofYzmJ&>;1D{yGW6@)W;=s!Ur<+%bi}KbOUc z3`YnUF>cp=c>EleVehiBEkx(S!{?e@6<{M_lBRNQ9g$m%_OQ}K0ov$lmn$R}+SBSE z&8bX&4UjkTbTVs_v2dDjkgq3ZezcnPawB_TM8W}yW1Gg-)`vY4maR>;j}PB9OouIv z83_}{YY69q*{|e#Tz%rLz=N&5#t+x6?x|Y6MKb8Z7e60<#=y$>`HZC4N87oO z0o7ip)P<*WfuN7hx!o>P?O0EWT zq|%fR+fz=HB)WG}+hw>vC8!Atsa>@=Y`r{><{!mPIR@$uqFL-a=NQd;X51sQ1T5uQ z{*7g*Mux{X1~zsMlKCe78x%zc-dAAU*W8PILZFaAuP?`gy6l^jC{%z 
z@{w|5f867e9}~FLT4U$Pfh-R$7)I&M(2c%M9S;dzV%%I$@Gx(dXjyjWeWW}CeXx4Iab;%Nm^Db5?U;tc>ZW0q|oMmV%A)4X2n^x{$v{$|F;V118L zYrZzJO5u+9XNL6U2?XSU(TG{g8H%s~Cc(88-%?sraf`=s5tf$jI@PRY)_A$|@a!m! zqHeKPf2WPd<2dvT8WGF>%Ht*qRbUfqN_4(@s#`V?G+yq0^}HGqJSD}T5E7siRE2{p z05?W6EgqKYv}ynRmJZ#IVsK&cN8Vv%x8u`KG5q>cgRTc7eV{Y$elgr#X6BKyeQlTB ze;+x`nKVj;_j#Eh!a@ba_-Tkul z>%~>Q!e_}+!+}K4UsCu0e^L-F9H5jWyxu{LQJ3Af2TN%*lJus#W#7I zD4Cdo$TQ`**F2iA;#~PY_9^S(8cPSwM8tn+3v3ufjuk4XJKb1&bm>BK!p=(}WVCn^ zcka!XI>EGnV+wZKbixb5YC6TCfuBsqUfQKE^tJWL_f5dJSTjshRADb;)|lELSz&^; zzTx`S!;BxI4Xjvz_Mi%Y$+ zbLJBO!hg&8jIDNv5JN$4#^G!>`3Mf8L$k~SX6iuC8WVuy*G`q`JczfpEI$$o7!1Qt z_b-4mLRUtSg011@qq}~l@3P&Szx3@!ILsITEAK?B@z`(lP9=+!cTCIU<)d}5DYblO zcf6k44%&r)PE5Gl@~4|CL^>_e#8-`Ra{SF4KJf9c-uXQdf|h%7)c;;Wg2ks$=IJl* z=es9q!_^imXim?PlR~Qy+_qs9L1MEK^6-OM9ah|G2U;$pFme0GWbMH<<8XpONoF*i>4 z_tjO}6(dZ~n4HsnTa85JoL;UQ9*xkA_%gpF!F7VjD(zz{q}<0g==mz!9dQ5vXfkuL zBi!=+c<;!xb`;Gh%MT4i)!f4aHGrl-egk~W8z6x=nQSf_^IXp#X|{(gSZ@0@O(4Rt zL_wg~>nKuGeTX6hX?J3IBy9d?1}7aklLf?q%~?IQeURO-Ev~s>TM<4SG{hjdBg&?$ zc@_6S=a6O+c>;N8?bO|(u$J{T=rhe+KBjzU^YNb)zcw2XvNRl&o{W{H#=pfpoi({p zQOm>BEn|h2y7d3P)P>so_4k6s#+j@5lES%KQcdFXCnZ8&jKjfRUBFJf2$EbcahFqR z;vKK3_aDb)q=o5p@O6mZ&HKlvixyfA!}OfyWQZ)Aw^`GMLtMO;Qnl|}#cYy%j0sEg z#3t0A1h^ipIn)YEZLOo3LHrp`S8os0ky|V+Gcf4Y9!ftKc%9QbW+wc2q5oNh3qaWr z7LlPQIfkz&6{@9^XiV$YLhPR{Xj#Af1h5~v@6)6{3}Z_`TEdq^;)H)$I9M-~H-E3* zJx?-U6%N&Eo{E|{O(t}TVV3YS|I!5!qKPz+%LghU)637bZ=-1HOFOG9RMI3 zmf6KIV*4Dti;EVxzskudLJuF~=#d~ekTcTZB#`T9c>ygYY+()*Eb{3-zpnt%yZ`O& z(0P#4k8R^x)BV^;XIzK39s`2knk-L{4%~UZ;sDZ}#xK>$?Kx8yjc?=-m^60|yzHS9 zrU9Uea^tR+5S6tM@`Txg?f#@K`|3c9@F$ULv(Q(~GHKA}E>OKeTDoO+P8KF_!r{qX zzo*D=1z9qwuE!C3ve;n>G$6oQ%H(eF4?+&$;6INzKjQt=>%Cy4(I(98$>R^F=>!Fn z8GTsn$HPd6A+QK;1c+02t-t-hzr+-jOZ9+JrVRupO=54~)X*VlR_kyU&YZw|$WBGK zc*W-ipv|;bQ1b!CZniQA9vs-#NYwtXJU7$M9os<4f9{s zn{1;=Wl8`RIBjynz`r@T)>W*0ClIPbQb)vuSF&+7)YQ}#FRaf}jLeIhxBW!laM{(t}=UQl15gWCum_oLEmi^ z2MwfpX8p&6T3P|RxWl7u`-_U(L^VeV8eM8*TWw766yC3CdYgl* zLYB7DguWGX8Gl3b7c#z9Tz>uFu`Cr>Vf1_Gl%{R|m{yC;UNE@%%0*~gCZi}Iizv02 zd&+C~^HuZO=2(GPmB|>~cORO?_>OSW1Gi`L9xn$HxzeR$=`S7`uUA&SO8ihy`4x*+4)7E;}mH?@p;vOXDvoRwp;n8f`jHZA+Jw_$^3k@`GKs6)6f4+<~ zT^pf7g0!|@VDb)mx5w8Zu<2~Y-s1UiFV#S#j+luFeW6Aj`ywAAI-xHX*;9l}WErU! zFTBPIRFE~8HELNaO(&Q^($>1v7ND~;T?HE-Ldb*$^o>!aA~>mlz(?dt#FP)9Nj+it z0xdWD8(%9^r*rEYyj{HfsFNn{8B8_mE_;P^(gsD6eAbfoli7O8?dCNgzR_>QLMGT4 zWd8An+NH$dFrP@5BkAJp3BzIlp(rBAcHW#jF@bEyM@Pdn-812G+BrooIIe!L zReO4UqERlb5N8K&!4*gPVJ+3;qfvV5(o+E#)rSzaQHWfGQNk zj-V-?hu;eXGtUgSt1*<1Y&=)~ePU_%5#=6KJ7iXD_k(msFU2_GPKQ$dw7dLvbv6cQ znax%Md07y~Kt-zE%7{cbm}MeK{qq^^mD&5?IDguKEum(^2vzkHraSyjs3*c!2+5Z& zHYuEU&(9#NAGNYERW%uX0qnxWQwH5SC|!thv_~Lq@J6_RMo)O#r3X9m73T{X4*ml? 
zkQK^!4AwCAE?-$p76f~m0XdqpIs1+qf4x|(Lq2_NsiLhac z2cj2k3Zr3~%s$C@76Iyt3#%`Ss-X7dw>S|pSn}D1G!j>wy}CN}i(=%Bca|=IADrBr zyHR|im{Ar-xGk>KM0_;h_FHjL93_bhuVOZL(U47Ig zl8N9!GxHjoCU+R%!-CAC5JA)zbTmeURxAoX4uvB#TdOt4hR*Fx{g?LEh6?aVT|5T|&hwXjWtC(5R!p1#74nvy#MixJw=%*_| zFBL8#`-V~*WXg)WF_J)(?ZbGgzHpx)~9Cca0B zrY|#4QrZ-W=(~6pD?J`>9CY}G6gsHm&p_07+i%jb@?t`6v4R8$eJiw9|4dg56pZ;S z;Xw1yV_nu4kgu%UpuN61=8Y|*)l@fr?d|pwXSH;1zona*w3Dy94&ogj@ld-DH=;VY zKc&J2gctjMrj4c&T;cto?UL{zuw)?qXZmN{7hvuOjCIOE*1(}~zv8o#E0aY)tU*+( zJm4v@+g=Xm>xDdB#Gtr(O?<-RD+CZ8-qB`3IvedsuNd-JRt-2d3lU1dx*quz$^tPy zYOT1ClN8~!&iO1gS5%r^CZ#9Ez62v@f?(7MJc*!{=2I=?I(<~Ba4!0MUbS{lA;?$l zRl#AbPCWi5rixC|jk6uIikB3C8|JV&Pf(bhZ#s7bl{;x})C@70^NHD0%`_fj-VCo08u8 zCODi8{>*o#W%mm}!TrxJDjJs*9b{u8XO0A*q!h75g`do>xJK$&Aa zq|=b0NbFQR2{?5|QHO3}BXoe*@@MJB|7?I4pI_!YAI-(8*J7hcr%CFX^gm-Vbo8^N zt0j<4_#^jDBTdu)l=0U87Gvc9!sz%v4CiY%h9_kA@-@;i>d&{`pz1TsiFK$a+q}B8 z3~Ou(B2X4HRvSbAYF*; zw=8{M?l1(S0(d-eT?po<|CvwV0j@}SYk;H{_Ib3)Tt zdX;kG)gcwt-=@}}R}EEnkB34Fey{G{jR5f&YQhJ?$3tP36F=#46_2M&MZ4`d^#Rg+ zC(>9f)Xj87~=O38?(_dA8A_Xu(g~fd5@}VhT_d z1rOg|S0l0+q;DY!j%qP5uQGgWYz&B7Naqgh;VeO5z4N;MWiq;9kqkBae9Ru=r z$)Mq;qMAH(0H5SvHLJyPKg-!(k0ZK~cEb^%8U0bT)87Kiwz8EP=hENe)O{i!~f%qSQ z177A50K|hoMaK?}-Zw=c`f6bZJ3ogd-sAUT+^eJ=Ykvz$>be49LBUWs0`*g#hR+rQ z^sGng3jY`Np%_Ywz{{9W^Gn>xtu!ftqHgOM+ujq(6*qvaNjb?(tOXh_jE=`SIRQ|i zlImsk#O849;Rxv(b)0irx`b9L@6;O~Rn#BNzRi zN-)&2c^`f%6VzTN==>ggTZoq^oKy=rVSNOKN2o-z44H%d<>+3NJ@d+=Z2?yN@XTWH zo5+A)&s0sM>;eRK3B)hSA(P7dRb{%^U_OZjbdvph4ftVqqSme*-xy*(bY2+BB=dfY zYaHDX7(-hnCQ?(P(KJy!Z7yanLK!J6Jnx@L!gN@w4X59p|HdhnFwEXWQ6j6C^tKzcC0ihEtU zJh?l`L-wQ4spVC%vqxJ}2C6<)eIo1SdtQ&F=MFd#zbMmw?r@xt&mct zRG(=v^;;axzBgQR6Lzvgl!csq8h;+D_HzNbUJ{r%PKTZZaILcM-xW+~p{Wq>pHm@_ ztM>7PdsV47jbNC*Ia1D5KUNE5h=c*vTsefpvK0uf?|e~trYt}Cj8ewbp^p}b@}d*5 zFM@)#a!T57Iaus#=TK>uvH2otQc+aBDO}I1Hlz#8X+hhj5U@ zdv92ht1MrS7-dR$-T@IIxbgYgYGDLm7@G1@$iF>i1SzOn$UlR9UNXrBhM%n}Imov9c zuEKdCl!-4bdas%;C+Z5 zVFW-Q2uv2hP(x=kW83|E)iJ*rOI5hLjoCCG7NLus@si_8c59BX?5~27!2V0O zO5Uu8bd=CNf32S>Y6Fd{d|76LanRRQ1KrF2sILg-0O+T6=eC1ki3;#8SMcCudS`9z zeR;h-E9~>Uf%hRp&Ng7wzV|%V1Ny#3THv}nCYDm&gleGvehKSmq@U`gg_Ed}oQ2Co zX$rJK_V+v@tX-DT+S}~sZkIXALXkFwQ*oFmw|)l6u^R81&)(BO;|9|MZhVIMcMtkC zp85_6v;pu9+-i?KDtOf}es_ZkoP{-q_`niWH^wAXfy08 z)UMEHb;A8?Pi4P~b+?1|fW^J=9lvyi0rknk)PS?PgD!;XtNpXQ6ahXppv(ESZO!q1 z?>$Qdx2NudLt(;JF_!~hZx(xo0Y9_u5pz=eE2wyL^_G*n#Z{Tjkx}W>bM#aIxy+T{ z4GC~rAS!Vq;oc$b!z@wAgJk0==wbkvDFf02zC^i4pzSxh0!hq8z{XE{Q0spAved4@@<%8U@YN|$9#!)D%3`cf%HQtYYExW# z6Z>aZ?fGZh?{YI?$LEq6X!W&&?J>oinKA9ie(99b7jJ?pI1{$C-br!MkPS*^^C>}G z!b0*vxfhJem=|y9PkIk)uuf;%t~Xh6x(_EF^K=4DZFJ=(BtV0?5qbSpTKL_<8AOrz z-w0GCxZUirfp~h*`i|{e_pli?G!hB^ok#?xzpyOKwymBdiVLGIgsTJUl+9AJ`))_f z3m6zI-i68KJ|fB@Dkw=fgJm59gFL%uVeyI$x-36uuLxq;D;9I;<|ea$8l6?W1Ao#L zVf>Qrlh?V7NlRz6!aO(4YXP+wxMG!_g#cXK0$9ffPuKPI+QS-*D%c*fL~MEdY(ey5 zY+$cD5&$#bzCdd}HvhvcHP+fKex7_Utu_Jc_dGA5cijXmmt_DGR5jjsQWXQvGw$R+ zpJ%}SLi-gl?%Pd-wl>ABXbl`MX>`)v=Q(d|rXBr@S*7l>sljwA( zm9VO3a5HObao=JteKwL*-s~x2jdHvH%y-v(-Y3{#I_QaTsz3!1fN5mz+!8PWPiDEW zSECq+gAU9GdR5+83%n3^Ob#QMFwru26uCzGvN#m5t&jT}aa~Wo{-Om}YmB`_U-93E z;?>fnFrW#U{{#L9a-irBD?0pG673z zmUhUw{ne60q~CEbDnE*TdU=$eRrn4#MD7dQ!Yr!nvo7M*%czILY`fgI`hmUsnTFfF zv+`X+l%EzuGUuLy`3wEw*0prgf*R<}9h~q`K>77AV=2ByudajVAZ`{GxIc!p)GmL^ zT1x@~{EGGMRO8!gL#CIcIuE=U-%vRQ+_B=$8dqE*(!pg7KYF}dYiN9ykVVl`YV5UE;7e=k&K z70da@Ij8XF2DEaUR)DZ|m$Rh91J)_3Y<@$kOk-r}n+m-bBvv(|B@*Av=ag(aoUvXG zeHgyn(}#4nRt+K9?=^KJp;o!euXe*)D=IOK<*s*d5Z)CtptpV18N7TjMg@i~QaV~( z0v%V^7J1%{(c1abV+1;-z!Z6NtVq-QQB2x;;7D-N);`aQUHocAYkr zvFYB$A}O>+l!HRI_y)YLJVE9aAE1^NlE(6Mp+V8P3_w8iH^)N(B|6h|wIiA>r(`LF 
z{pK55iPDp~#DPbAuDGmDj*@rj0J>5bqpGzTt`npFH73snHiq*ji?N&O_`&`6;6jta zy$f^UmhvzPcI0aT$!oY>^y-jz zUg!gFcglQQVZ6n7*`{EI{}2a(6vs1i4F)gz>^Y2RoX2*WL#4IGWCQt)dc$3-Hhb#P8@PqTS8B*H$Np-8O$L+>^sVcrCH(IJ{`4D6W9}} ze0;z4luSb+WPHCyH;-DwT(OaWAXlTO8gR!1d+BnA=tRdQx!gWK;8FB@ZA5&SD9CQc zF@f5|3l~(sSFbgmFSn34G>{ki@J+3VQa&UdS&7Nx*(xQh1u+X`&7iI5d2PPsU~gtI8B2*92xs_C#Jr5Nf_>L&!%o{Dwb49WMQQpr4|`a!>v6rCHHRkJN? zh-5Gj{g&dxenYpHB(-QLS&8NsC(D|Qu#S6Q%bipcin?jnzSx&H4&ao}p5PqTQNd94VREfTNrD~b92I4V zX8Xy>b6+$rA<#wGm+^GFG8w0w6Hm98y^&7Sml0PvnDkvOL3#Z&)pw*4^B1vK7Rc3_ z+vr~+*E+Zd8CLl081%Ee%{6BNuZB_24jOGu_3QC9uzeY`VQ52Od>J8`3K0z8pjaB={}rSDMyl-SAUR>Lqi`AB_H6dhA~8wIe{(=>cRSEvwla$___5NPyJTVK zF~i{`P9%6O6?CdT+4cqs(Kr_eHF*@s15`E@;)Q!CkoimLaqS)#86QP&3V3zsXYd_I ze;vk_h|?6lz6)-#ZnPD}oiwm!S9%eN@Zw;tCI_K^WXafmHv4bITvyuSe&vHDyVz0{nouG^_}+^AU-sE zOqlYh?iglN;IjvKRAT=!1mStdAB-0^139nAwT2RX%((g*PF|Wbwo*Uv z`4&8?(fv-mE2o8oib6{c-$GWQv70Fjsy^Y9Ts{#5O8uZ63}PR7+YqYXIRG40pNm${ z14d^{V=L)>J#DawF1O$n2U$aDfQ4}wK^6>AAUdx3*A>LjAf;gIi2}T8h8XzN4??kS zc6{r-0C2-%N*umuoTO;KQYR0hXxwRIG?(h({P)EVi(Sox;PVamPKwTbnf(KNGDl zW0>#9f>28|;HhpjrlqS5SuJ!e5nfE-^ELPme;aGuV%OzhvVj|LTfc+6lObSvH)YQU zuzUU@HQ=$!h@> z76cp%qQ(LF_sGVt!(e<1-)xmdRuPFRn>f@0OHIJ|tx?UQKxpN6bLNoaMpEJXe*Qdo z`{s$31ii+m2eEYX8f*eGqMae6?=apw(+s3G4q0E*%9Fp^RwXR^tlfzACR4;9xSJ zfw6j-XcYlJxs6$X6bWP4ixHR{_?ejCQV#nT+e+uIH|VH@B&Kg_KPHZT@v&KBGz;QE zMv&|7?H3R`Qo%}>Cv1;F-CsI8S`lHYoUuCwA&nQlxga2lO!gD;yK zTUbkcSad9OTKW>l`xS<`dTK0$l?@@eagR7BVni!MD~Bt|qq3;hW~w63ImLy}ayPpA z;D?9+k0JGb4m>zvHok=zo5A}Qj&5s!7|ucb+aUFbTK}4Dk65k7-n*rCd+O+wWM`Sw z{Sfr^YWH|)LUU5o5c`KVpO)TqCqBJ}sC`|d6|30Z=FJ(?rdKKqqDo15o{8zOmZSGE z#%pIuZ|3JZu_OM_;5Uu&LcUS;2AH0T@-H>FAIc5-vS=zAkY02zLmoF>*$bO5aHvm= zGk$nm=7oHe*q6%0W536$k*=#1-jW?0 zgKRlBv8VhI7&F_O5+Cow$Brn-|xJoZil-5$DpN%SYnKpK5m%qZ&0`w3y(!z-YcQm+Kuq3!Tldqzpl<0kSE>Skz3AFTV|C((aoCWk`Bx7)pHzUKjF@MMemvIM?h>E=#WaJ$(W0iSGP6(@mof}X`5MMjdh57+-vVj%%g7e4-7(2{>ov`# z+ctUVyk~d(4XqQ?Zt8s;cGPqIc}iviHkIq9BRjWojeY92?T))hCd&hlRw{3Ee;hFi z6W&8#E75JXUff5@5-w4WD^5{pt@uV@beoG7?JxfHdiWDnU@^m-kFCN9YfFLD6719KMo8&4XZuKssUJLWXsqCH;_w<=@ zO|FZPwNt@G#OosI_>CCVU4u!(ynC>%DD5r_kWUv5hp|yKTfn(ksMSov76r!}6Gl1M z^`dT7UyDyAvBkw4&$=GWS^ir3Nnw_B*w@Wt-iC_;X90>48yT$gOmITdAXe;4Ll3jC z-=2`c_*5at!~&k*G_Fl}Cee`iPX=jH588|0`|b9^T)$wF87%;13V3`8SVA05m8R+A zU@{3V%hfM>djcKd3rZeUheY`t@YGSOX-2864F4M$`9 zl3W()l3bQ$nZwB4PFO3b5Jo%?{J##4;rqKTjYNgrb&xdoeUwu}|#O#b7nIN*% zHm#vh7kDtRHT&GYKog$}0?dklq%)sH?YViboOs7~vOL9#2{YXr%;*EsPW*Cii!|Ao zb%7Jh&!n>lz|+PdO>WP);%kBQr<+O*afx&!Os z$r;u<16rfX-CIKisff%Id1JpG@J3-?e|RI)Y zYo+Zrgn`{G1eGW%73vYktz3JET><3nwgl2C5&{eiPzMm{4jw}f#Fqnk)w_%*FQ0&; zOw?MMK^B@6|2}_7MuG63GQTYjt#|9f(Mf^fzZC@sxUru-vf6xLw}dH@IP@NsAc`eGi-O zBvKE9K~k^qNJ0KJ$V8>l#xsfNp@}%$9|9u?LIE2`sB_ekTqU2)JT=F@z+Uu*Er@gH zFl*Lo&JN_&iA3d$(L?@C3*gQsEz7UZeT$`w67req8U5mOKF;-q-70QRW*`xPT!ub& zLswpRKcuqdmDK|m@Xjjq)Z-Wu{kmNo<;**rl2Z%?m0sOFiUGz44>V8C9X$?13*nsm zh`;|7e1;2j7}Y+Tt={a}PLyr*aSy3x>&~D&b>94B0$6E9xprkg-rF*svZuBI*8r&L z_sdO}gjI#7I0r$|hBdlv6LzbU4qUb3QAq6-rn-E94ma`oFB{?bHerpcM zz#9*5zWmvrplD)6u=Zhy8qxG8G;|8`;FL4|#Ele(x-1HjgbVHde4wQ{I-OcA<=d%- z3iIToX5tv-Z$FN|&gn~#lfXd>OOxN$s>|Ztx~A?)*R1CAY)V|AMrwBE=W*p_4t;^? 
zrrfu=XhrXAb z_BR!Y@yq6vgQybIIC;DSmS9;U$QO#t9=Ow_^zvpzFXE#+8q`3xjH(1iQ3uoW>LvA3X-NK+k$xWgu(Nbi9OHM^M2JZ( z#*8jE%wHaMF1UW0vf$$2+T3<-E}U6kCzI>$ZgivZi~fM!(E+3_it(;TT>sJSSZJWZ zWybu^rFutRr2Xn7mH}bUSa7Kor!MI&SN+HGyvuOGqrjKkqZ;SYJ9#o8Je-#O;KcFA zr`iH##b%yKqDX5P!|T#7y3fr?_^0U{>WM3^N$I}(*T!bO@OA?FIU}_Tcxg{RhDZ~c zi%!M1Y$+S8xL;RHruW#s$=a@C+dz=$UY6Vyto3&P_#XcNCWS0SN#L^N)lF}u`KeYE ztYFtz{%DNan1`>W-c#bCbnwIbW#+%;J`~Wc{5tTMPLjagFnJ==X+nRQIuO=Bz1kg| z#ZQ+s+4Nw9IsAi;`m7|Smnk%_JbDJp=w05NK&xVX>?V9Lv@LESe)OfcaFKo-(((hT z_q_$^3ih<6$wH0u1(87x2DGd5Rr=3e9bSU+Not{E%V}?-pJzc03Ye?--Gdf_Qd5OI zH~#w?fp^OkvNfC0Pnb|VX}#S}elq=K*>4MPh~8Eg)EZ_aG5%iiNY(ZFRww1pD$VAI za9g^9snNOe?D@HJ=Vu-mprJ-bcF3uJHK@0FzeExRG0OgtKaZC6G&hH!{2Rd#h{h=> zkXiZEh8|=FQkA#|lD0G<+!+<>SZ@$}-x*h05my`AFVUxZR-C4MNC<2*Zpp`f*oVBM z_B!W~U^HXi$3Y>E6*7eB%ZUCS;ZYPwr609Q4CjeKn{JKJ_T(R$Zb2vzv$3EUea}Q@xCZMbc8)Pm_F4A45CH`3mU)n~u^VAb-w_%$iQWviF!jbc~&1sVH=K;<2%Q z9hV%4tPv+gl58V$Y$uOgZANSPUD#s{$DS^go59D(=)f%OantQ#W{x z&iUJs=}83}6}f!k>j{}(N#L$qWb_(1z2q^H@3j5-rhA!?LbxvC2mRh8NLxNYRA)Zm z?5HbvZK`~Y%a7d)ckG8uWV_~z&!4*WlG|LF&o_AFU0{?QxSAL%7*gFLoO zCx6TFv+nPTU2f&TWu&=vKBVE@b-Te^D6gVacy!?eJO9H1k|v|!=N%2$8If6*oFR6!_YP>AsL_su z?1F%;>p;8D*P2bBN*<{pI@Lw3bz5*AXJk?Ksy>*){=d*QKt10zH)J^-QGl<`ZO~pW%(!K)CXN$alm$>{h9iI>8 z!z0{EMJ5Es--)ixLBCnF+eVD(JvnqgcK$WZ(J5Hq>{yhDtU;iHoHF}uY5h;M%K!iV zAI$$d`+qS;{W++LhEonZ3~rWH|GV}7?j{%*ad<{Qp&fVScX2ezG=nxkw{VcL`#p9-{U(^>o9jaO*qU;ZG4{_vusXfv1t z&G=bX#LPcSeDe^7iecNef9eju*Uee_d- zjsPq;f7=57hlg1IpPwsS@ouopyg@u%OUGg&neMDZO=dGjG}mmJkyBnRqoM>$T@0pI zzZus4!_yYzz~(|>5)GOAI;O z%a;-k%$>ojIL7^B)70i*3H5iw+sCjb3MM=f&>kVm-70xhIE>{<54_$*7m-gq9MPSw z+JTWm`ETsKbyU?|w>C^E8v#L(kVaBc0qImiKtLpF#c%MM_e-yPHin z@7mz~oacGZdGGT+-x%LFzCX@AlyNiM`?qSYIj=ddYpowyn8=gV6{#(xuMcY=pz3kT z=^yc2$+ACEUVPz}9n3`hdqVjAWllVs(|QhEIc<3?0y%y8d7()Zw|vFhzCre;mEqpF zl5qBdsU<7={?xO{?^bV)b5*IQ@Bw}Mp!45jHjO9BqB1T^7{KgJW$1K=doyiJDOSUO z`%HD;ArqD04fO4u799Mh@FF#3wM;ywn9%A&1BQY|S}LLT00e4}_sX8>NHGsj(ry`K zHwLgH|Bu}47`%sr#s)wxZ~tPB_inKtoOl8 zh%x+oq+k2P65QS~!A?9=+uwAQ{vxxr2~tnoe()BDQ&O4EZ2vsisJbKmgy*&QeDe#7 zIzE&6x8oHKiB$+MmEMLBjSW8fq+GhE_G6J7*uiPXzhMWZhzKm+d381S`^;vTtzk`z zCcRJQweHcCN)@5ui;MEzW0B=NM(U|6dxAsDpy_8lGe5hmR;h*D9bL~|h5p(jQX%VY zoQh2jReHL%m~n<#!|3C5rWy(1`)qgXYQyzpqL@Q=!Rd~xerJqG{FS=@F7i;i z2o?}ja9QdlW3*9UQ~pMF>P_XVk&)P;dDJ@>+slyYZFN5_-K0PGo~$>JiUhrbWlnvF{>(=W9kwV8TQN~WoF$eemJe0* z>zwTPP>&T1o z5~x!D#13tY}C5ejTZ1kGBkPAA&ft1f+-U!x!K1g|Rf3217SW4usg;Qr^5<=6|{ zYxC2?n$t43od?ObUdctGjJa=u9d%teY{o96%5M5on|;4;}mIL(yGAjlM+ za9Y>78%)aTD^2&Vsu>bzqv1lQobg60nm0{@cLxq?NiLu^m%?Sxm^mM*>vlOteP()~ zxR%k5iJ3+iv3Sp{UWZy&r=56AG5>RD!kVVQugC|d&y2)}lUx?V&=2 z>*v9}8Mg(s@yeC2gQ9q{uJyHyyDO4;tuNaHM3Dcft_+rAzIA7`I`QgTk#H9Anub#S z#rL(z&X_mD#>y@n&nXtrF4MrR&aSVoEbq<)AOCjm!DCO;qC&N&g;!2}p!D4y6Z@~tyr~6l54Bg*+BVC`bMl4MoeN9`|BF}SCispA7Uwl)Kq_tRp=6S5gpeMJR|fI zxp3rN{K0wcX-S_`YAj71iF_C@f*D`UC)}>-#M*nt5~icIn0yDuCb9TXAuGJXl!j&I za>kRM4aZR*ps1U~0*bmwqCIi)S>Lc0pSiN)=ITnD6&3k3#9viqh-{R{8aLOj;IlWC?)dL*5@DW+Z&;AKc=FjrgpSX#dCej$a z9T*nziY_tPw{YCN|2?yc-8cg+GT{rW64qThxH~S-j46VzMGHo)+I|jYNo6ixcWFE| z5~w?zyqEp8CQdWlw6;c~e`B(0o0iXJCnaqpwJ<4qFSttPRIxFg3Ep+&G(Kvr;7mq@ z`?`dMC0AITjO-Tk$@;4zxT1i2>vVpbE#SVNT}xW9=OSmOM~jJNHF=(|`klg{`<|N9 zJ)9oN_!^G1O|FZH2?p~w;lGnS&qMnum*fZKMjIHJrk1yi-YgMNEo4ZFeu8R82(^k? 
z0+y-sQxwi}7nQB(hIS=$l8Ndfv4&;mr9wosE>(CRqoMk8wSw>381iB_LT@^+KB5>~ z`lLhnb+>@c9@>=Mlp43A{B(W9wM+W_A=5Wc*Lu;+M4-Ej`-aE&9cZX#PG8x4oRaOC z!xKlw@4t7n$t{j?txppFxh&>e)Oh`~yk|Jf(e!D}A8fg?-NPb!>SFI!oKopL<3{6M z5D-l7z)_64)|pHE3U8(Vbq34z+OT*}a`}gQIRj_+9Qs*}+F;loPCId&Xu5@Dg~_<& zfSa;txT~9#c=&Ui54FjmO(*^f`m22t_KCs{PT$n}Y7o>%B`>Z%VMkm#iPD~*w42vD zoWznof56oG%fJb9-F_QhmwJ|tixr-So;A`@@>_A5#ijG;-Lo}9E+_Gb+;6@b1e0Mp zml92#)DEX`KbzFJ=dt#Zhm-SRXudH(xw(u1DYa5QlpJj`8Hd$W}l&1}PO9%?G zgGxh*e`>uKQaj^k6g}VtQq4?r{;R61s4?A9}e~sn^j`uhd{{F`$IWz~o zsn0+6uC&&Zls#UBE1XI-{B$_X5;k;*HAHSQPbB<7gTE*4W3Gr$%dA*qYV%bns{yh6 zm%vt9=!~+y$$P990XV(lh4&4GxvT=>gIlEd9OluWd3k&_Um7&YA`9%DIPH6hrg><_ zn8bP*4mzBIvxJ{Gl(HV}@QJ&asS38?eC8b6-fsK&rt@e|dYaF5v+S`ma}wftIJ2{+ zof8%Jc}o8;QNKMZOr?f1NU?qVCU}*01Zlc+*2|^!Q%WUoIzd5xFzUVN)K(3o}k5^ z{;JLDy}^YX%l`ax@iDyBeaanF`37-v$HhCRPGy%|4jgsYtubjV`SOg+eCLWTOM=9; z+TwQ@i+5#aRAm~j!X~$vtJRV=@$BB=_$wyUVTQr7Hnr(2Mj-?t%#x~KPIkm^_4F9q z#O?0|vfu=~%?)`j_fdvwdf&Kp5vNR!>odzHlUcpmAS@oLp}8gR?<(Ii-s!Hm!?A(r4kzy+qei zl88b|H|;B((FOfz-}`#MS&tbd#?h+B4^iLDM6I7Z_SsE4s!ULqMqMF#b#MNKw^f!F z9dpC}PjnX}bg4@dySF(#@ig>&6AkbYPeK{t!VyL@UpTZfkTR7xytKS(SE+wX;wbt+ zKxC&5A7ccHO|!Q8?7Z$#Ag=V)v<-CJ_vGL)$l~kHShw;}08R2f6u??HI<4V)KHq_m zQaR6$&uL{2V->xs*;WJfcP5O#8{j!OYKAL*h-g? zdF#&0p)_Yn8G6>PGkTgf#u$1<;r5KK`)m91QXdCvi}HgZuj3k*(Z$x5>LnwVcC|Rf zR+Ozm8x*7V@OI7U-L=J}2k5WWB?yN}q=;Sz5iuNG97j~wGbMI#Mu?m7mc~UgKCGdk z>9us;9aE7nd{zPH2Zf{=KQz0znx~L7D4boI5s9GMkU7c~Ymr#|+N1ZIMtd#d2ZhPT z6!(VFggMeDcU4f0nGAo){{-n1rB3k=UMH>elD6)E7FAjb;mlAFgk<8M{pfUiJ7qFe z^=>NFFN(;gWl8?%S?*8~?@*Si=&4cdNu7OD)3Zmmd;F%7D<)|+^tQC;x?cK(C*iuN zxiepzL_GOe%~vWGnhEz!$7q+GeoVNjJ8sRe7VDVaY0Ye_8!>y{eJa=0|2)CpCbWc9 z_NAXvlK?71Qhm1Hj`g{}_BvHD#?*OK7>|Crzbstt;hlT8 zn_iC2f4(gwA!E*Sq%UaZm=@Vh_&K8~60=kFTu;YfUT|%9)?x5`#r4Ud;+fu}9(Q^d znZ*X3;x9=%ewFwR#rA~?$YS8jS@)8%!=G11yo@c{$zNq?YnLM>I+7WFoFa;fic$xk z>QG!>FkW9hL>3NOv*{VGsf(&`9L3~$zYRb3qv#9HRMWIJB#im^53Kx-nM$&eLC zt6ZV=EJvk=ZAr=l{dC~))KVL7r>)X;NqW56&}`f<&$&u4+fpR!MZ|oEUAlco!2E_H zaYi4hnS}i*W^Z2AYzd3?HkGcQKFjIW3G=n4OSS(pEeJ!8DIdyuRiQdXq!-{t?&&5{ zHGDsnz3vh?`mH2lKUp#6+8b?Pl9RsaYjYZmlO-CD&=-C}Qg1r+;Oy;#vPh;^##|cN zEZ^P1Twjia2BrYtPlHAiH~JSM2-QKK`7z z{7z4tnAi=LuX9i!%KisL{f;m-C%w?344p?3sSL1QLuvu}ng&>>Hoa&gG+Txw?@IaMKbE-$;+SWjWCHdaZC!#>}W0tu~MQMmf# zkY#=PxI^4aWXS2V)c<^tqS0+kUp7$0l0VJ;cy&Ra4TmU^w|(Yo3KeQjF9kEjYZ{@r zDA#TU>8RNA6%CsP(i&wIhlmD04*9z{QYv_|$6;E}=nYKUh8Nge`b}d5j08;7oV+&g zFE8;$*e^?V9*JZdZlrXOlj5s&Zi~6rNJZ)J^lunF0kg9yk^(;!}=i;&J$`N*jPb5Qr-nTc>>&64coWD*H^wLCw)O+}Zwhftn!jAd z_hYgD6;8!w@SAMB!i3OfV>o!b?pQ-sM3PZ|tv{U&T(R}F@+BRc{9ObxI0SL(-6^}7 zzLi9`f?>svARH3-vxs#vAIR+2LSIMt{Nq=u*Q7fud|nI! 
zYrci15aqcIDF>Rz0|aTl~lQFA?AIv9GV`3UYIqsrH#P4OLUZ1lURd%S0>sm-NhU8W-i>hiVr z7FOo(FZ(t}zgy8n(8@k8+7kT<8xC1K3Y*z<+;Ug6_S4V$@UcJ9P@E3{ul2wck_(PW zG)epZf(-@^VUDWt+$e~UqX&-6Y`WHZ@s$=0)uVfmY~QE{_wk=LYF!>}O=Qbwzm42Y zx+`4jd3;HyUTys*HrUn!)=d>`gWpN;j2gA0@3re34;eoAAn`kr>6}6YycECzj}MtY zbui)JG2uLRfj6Tn*hxhZ1g9OIiHhIMq#_7}P2>AT6DR;PO4E{~-;5m&k_P*Io*U~$ zBxfWo3T$lz8-YYI@-wnCSrQG9OvxY~dr<6d^~#w{$dkG8ulHB@n-C!g-7I4YNGfpR zrBa?@mS9o&9Ndml8W6me68F}cU`j`PCq8duriqkH6C)#q+*e=fU*#`!o1hDk!J2Q+8-&h!VqgU-$Ha2m(3KmFNbidla9J$}L6n0D zIWipfmqq5y5n-+P!y*Q4@#-2m>^&R!x2>4LT3PU&T!XdZcx=|83R;QTT{HSh|E?fy zoYQjgRrPX=QTSVAue`MPBFW%67NDpTfJp0M08vJ1eDU^n0li?94+Ia(Hwf&?FdO@v zXjJk*2y4mPt!xn|A+k{zalS$N_tOTZ52EbhMB7J(`N{&UeX|Xj7lQ?ZEY{7sy8~GQ z0|my3pr8y`>x8@8h+<@sKyHJ(JyF>!8#_^E_>x{R&oAwAzgNJYLppS2Z>cl0$1fnN zM~XKRw>sH}7xY~NEcBkl?#u;d$(61L&+Qx(BfXVsS1J8KpRZK&3%BV`yz1D$)(tRa z+AYWZz=Zepr%O`&iegZH0rOoamsQ~K4YJK33YcDU$w^k2X-IxWf=xErd&;nnFCvQ% zcRd6xVe$bc&9B?A5il?woIuEVAnxw|;5Tec48XaO%#zdJ?rWIlL~CA1ryYFd-YFuc z^}Z*yPIo*{REBiCgldJc>@~Q%FRR2||jftwe{HyJHPo zgC6Kbj0ULQw}#q-d#s9sjDPX zt@B0wES+Y{EIiLlbWI_3J@FVWP?z++hm(?-;25wzn4L=1f&<*fFW5RFqW#M_L4a*X zQ6OYbloArC^#FxYVt&MsKbgm2s%9D*q5XAti!B0L}PAdUo87)0<5 z-g%WXuCXNcvmOw<1PhyAR8hYp-f4(0bw*`2d82|G5n~5~7ia7j&~3gaM}CXyguZPy z=sXohRnC--PeEm_4WpKh%X;py{e6XmfX(1db72FQ`K){O+~sh-D#ef5*L%snBq`u2 zIh~gkW-|rjt)*E?=KVm{uWYzET0EF4hQZiwPRwgxq*95?X)?|hL5Qlq32w1WPaycU z)BvskA1pU|$M~cGn7H_jD}4Vid)h-DyTo>LT1?e$lbrw>S}4w^uUa^&CrSnxebDZO z=mCdKJ8)6*y2dTOpjIGRk)GzE9$l~7Vj{~+T2ogM{0;_26;eJ&>p|xZZ{kedMR3Svir>tXIwZhP9vHgMgNR@^51e<UZkTE0#Z%1QYpP7#N~6ANOZI3^A}Z2EIFeQ&1ZgMabeIyF23-tRmN; zZc+Dh2v49R6$|Vnoj*L>4G_M4PIrDZ`$@8~_mT2KTPVlolr>llEHinMPAFO)*r8wc zQgVKc19vr$DO1_~*isa?mtbK`J7#07l;%kaAry^;r>(Ro=Ut(6?GK)s037;$@HeH0 zWT4~e2&-NfKDcc(3YYl%eNz57$K!2c=t^&5qCMEpZWqa>WTgI$AWXVjgKkjW=hD;h z*Y*#;#V~=f=}8iZH7nwH@+GaqM)tYwXe?cer`+g_Cq<|*6U45sR7{$eH zJldYhU0xzBLd9iBVo)v-?wF5#Kq313a75D#LI`l-<%+=dvzzTomI(ZyMHl(Mbd^l! 
zkwYUgf+N@=2W;k~oCvP>lubXx*$gYMcTq52M>47oj=hS}tO>v@`MTt~{={D;dTXLG z69tgzILNa?ZBc+1{4?NoYS08ap?C*|r zGB+5%Lc{>ZEdUEb3-|vn2$fOIDuRlXGZ$3c1u9J+bVoWZOdyD>4D()?U9$>=IF6XzuizPCu2&P_n8g-6+Bbsi+ADXl7H1TAH&N}@AaZ^xSt}xTkY#+^TD<(RK2lhyo1DPs)c;P?1qT`k zpcd7q#~pAUi#H}o|92)i5Gk1QBGTdI0SL{pu2-k}V$L52vxIWr6^5kYJke3eRSX6m zM6}jR)(W;4>aMTOW1_(OMZ~_)$YdCt>k@NWdhM((Fb6R4xvT{TI79-jaVq5>MNsjWf{?6KzjybkfxOueq;wyxe|)+# z^aX!^sS}UUp=l0eF`_J_+- zv){>8Vq#_eVy~GG(o}+QMwPPmqOUT6&bvR5v)!)UOaLaz;f|FE9D!vRX>ys;*&00L zffV**+NFBke)_7#3zsGJ*H=!}v!6h%{L~GEWC|C6vOdoWOXH}0x1=%|E%NB$-eBke z`yvhP5(th#!sIDf1hNOL5z@b(RQLw3ybnKp`T+jInuB(+_TuXqh&;af{^7w!2+wvB z`AeT~sRkgf4px|`N>;rh4@A}f5O9aC8;4m#Ext|xydVaku_eC7k3%^kG%^YDKRMBv zeS@f9i?GmfsMpx0!!R50GY%{9nWn~xjH>J|SU|bj2#*0lf*tj+n1cVWJkC?hH+N^v z$uN}=bYnCl4UY>4B93VWmw`Leqr-4rOe~pm=(=#bfQnHvuFwO}z6{=~ld}AMqo8K+ zz#A2>NB$^}Wl=$zyR17i4%gsSW>EAj24gc-Azj3LwmCa6oR|82iX_+#-4oU@9WBw7qn1 zPlC%)za(=X2%&<@M==9?#-;Rywc*nscc6Ql=Euc7?|Yk>K!{)fq#Eh(0SK8yI|08l z=hjpWhZd<>Ko7!K?6%HB`+YlIO~!fbb9-@(Xck>Rj#n#k@J`@;J2m3msA9ZEh9F(d zS(O8k7vzv~I=`mBK9Munp8Woqx=2vI!gvf6MIJqU4;Rm6C3?8G*9>-rbYOtP5<4lU zIa#<*6sqNA-z0#{L+M|QGmn^!hVwqgR;kwx?gh{nyopi}rGJU4Sz-L>@?bzFCW_l` zo2Zxv%MWJwnlOjrOZ+cGA{+9`Q+GNZ=$SeX7QI-=eOFpy!igZ%$OwjS`ingAAwmjJ zQwEDr!P56I1Q^KJp!`qSO;Els22wZ(e~Ol;Db{}#X>Kxpz;AUTHGAT{h(xw;zbWjH zVSWMSkZ<;wk1PR_eQ^bcH02CY7TO^%!vI+XOeTvpqW#}^Jx~(az+!`d&$j?DiZ$Z# z|Ly0qsde&IDx)I{RLt1sNi{>Q#wY%@I0A|u_w?iuK;`o>Ebq1f*IWD(SQN}bQT|K7 z3cf!xtgQKOeIGd9f35j|iJz@TLI-*M|6Psf|LxBwU5#;!Rb)FZSspYbGmp2DW8M@b z$+&@4^^H*yAd=I|YP|wL5*XF&S1_E+v@j6V$#lGnN95XOi89{SijOC)u?Vl1Oq`33TQ;)-BN zwxSo&;qncNI1|PVuj5~4valF0V4>$Jlp`qQD-DM;QY^4Z#c@wyZL=qdPZ7vh>&QwT zdHRO@F5?PhoHIonXU~=KzPDJavvRevBKUP2%{kz?O7t^L zw_?Zi)JrV+OeU*;!ut2=6dWO6J~S!qRw;_J-Qjr;>tFUfASx~i0K;=_1W(LmI|=(F z9Ztuat&cb+rI&0D)Vg%n{^o}t6_W#WB~=2rZ7-0d5}wUq^}zS?L7!eecgZ1VdRH)c z&@J-?{ez=Spig`QIEdZpX(HCZ~JP<5X3{Z;=^naK*t zM3pdNAP&uaFz_`LZ-cPcaT9?mQoT*rwuwUJkjp26s%-=pxmzRjFKJh#~j>}SMyxMiZR;5DE zlWI~*@W^;=rfgZ=#^!TQ?4dJ)74M?GdWp_-%ZO0b$wL9+sSOld#G zLa9{JOwN=9FsNTgi|DiTjxK|>HVy8O&sKXPS+PJP-Jd3~qTDofm(5HmP~st@@vnNj zYoRkE*F859=~2B@A~CwdCK_$7>ZB8Uz**0GR4d9R!eC;CNZPb{?CWM z?+c9l_OTIFrB^7B*m2?QsK`t4zV|kl`Us3OuEy(#^dT!ht!6dS^DOnD{=0E1211|Q zDPh%Q;VRm39uKEPl2@9ciSu=2>0SnH_4Ro~*ykaZYI%^h2EmKd20N3$>Pu*)!DO>x zg@ylJTZ6GMQ4H5s_K8!Dqx3r+Wa3^m+x?|$hiyAWj5($w(;XPiFJT|#cRjYTmf4YH z98-FqGqrRVw(!TAu3=34HStvEDN9CclN+SPUNz_^7AHo{o|NE1hY^9bini&e$TdgI zr%Tv$Z?oK~u4iSg#hB60y0EsXliVLa&Hr|gZ8F@ZLHr`omG1}4I?u{;`g@@`>_^1&LWyI+XEV$PyOKN6p6cfum(P8`(fW_SuGec)4QAVVLLmZGhUCO!A1Z|@q~Hv@*4$|x0)F5%`lN0 zYT&!$RS7eGhbR+1^qy0Utp}9s$94Mz>-&ibk&a68Sz}Oc*reKNI=eG*Rn}4Qpt4LA zIJ5&#F$->7D#z(y2e2y+O4UbJsLo8z)L`4JcV{xdr|v4tq~d#TI_ErRWz!f-ZD1&IKpXkx{LU3RxwgZv_rFUmd{vt<532~dERuvlP$o4rt| zj0j+zg@VquQ0R~XHZ?&q?+wu4uL_5H!nmh9YWOgT+I6~NuM=OnbJKI$sO6@2( zjE}CEFXVx->X8r{)Ug>`}5p1F4%*)!3}qu%l&%dh_+=3O*acNk`@B93F9si%s{&UU!Uklm)GqZ@t|6{YT zvhvNw{htl?{}0#2Lr$+*?`(atJI7Y3R_g$=SXL4L+a+=X?$VnE9_dy#g06MTr3sR4 zK@K*@zO;rA8x_%ug%DZqEsEc0i-0tu7xLuxQax@YcsSO9ug~V;WDhSH3&*c>X+X6IIv9N!Gz!WJTHf=`#D3VkVF!)c%4tHi| zw5~y2GRtzQBhnm6m?n(rh2l)T%Nii%D)Fmrw^VZ8rUy_F!QYPj7mYcLeTES-S+tCM(B`_Hf>Es&Y2!QVb zmQR-0sBSj}gu=%O>GMFP@)igU80H{w9#0P=5DZ4p3?Jq@gm^3zEm(t=6TBsmdMuwO+{IRN%H6P2lo zOv@mWL3!}9#C|3Z5a7&MA+QP)_~U?NuT0L&%vJ-++@2(ktK?QsFUTf`ZTx18^Btes@K)qJP+Jrp7c8pWX!b`ze>|6|*AK+79I?$d~F#Ne0?S z4kME~zef@K6R!7AcBdl%VNovMT$M0i9tEUEhmLC;wJG_3p4eli-=b| z^l9VX3)Jv$@U_j_Fis_=tLyb)hf_?Raw)Z5eUgaux2GOGJUhISrEg`wmg0NB!M^@V z76Axwl7Xs7RYemNH0K2}{;L+MPM2fXQbYMmo;Di3!litM$W(u#MZjSiN7W6)L)p^t z+>yIv$cuophhb9#EpYLqe-z#xiHiBH&F`UD18I@#<>5QY7Ql)n0}!fwGdBP{)1%d) 
z5&oH*Mq#q?y(XXqz)sXcUI}W9HLDlL5V$C!g60<1O8~l{+zXj&)`!qE3Fr>H`kMjK z<-1*fZVI!X8zEe{%4TEW-a!jfJ15m7t1Q*J3eCnbZdj|=pH`k zg#ADo!|;sMc^7`lcESW~G9?rs*7ycQChz@UwP0W0&_1gvfeaNzE?`TQ!JY(bi0l*z zj9H{55HHgc8AFARP{6fztHfOL1rorbV7o`?5#O`A?U7*P6Zix}E8rGX!rx`T^#&3o z;0WFMJOC{=0JwaH&wwXO0Z&tI@+5i#w#&pLheX%2C+cpOheD>@<2R~NL&;g78A^A( zJX79Wz}C-v9erbVX*^)xuTB3f0)^L`j%i`izAGRynW`QNpU=)rNC7g_X|QMLJ>F?J zc$2tYSV%Slzay>bdYTS1=2|g=L?2z-+8Ml^^E?0WVn9ffrPUHpfuoIE&WMYY0fX;` z(?6d6qmc*p=UbI)9>^p`QY==>uRr#scmpNkH5KNT{1fSCHtX_0iZIpSkLYT5ablrm z@rA>lqXtjY`7_LE!o0~1Ou&Nh9{hfJFsy2({1KGyA&kl;UO)n-?f(*JA?ab`8ZRwE z{WC5itXaM=`$C#hqk#-`sYL!MBD4l{4}(m>1gGuqAjSxcEkaB&&E&6VuCGQaJocBn zDKsQtIs}+sj%oyiYZhQ0t!EqIC0qEMj?xP2&pN=oV7__>{91px{s+82xe6!cZa2rw zZ`~+$z1W!J-vpfINjF1lbkZ3^;RP3VCLhnZ2`sVqt_;wRj#cd>|!7s@mX@s!x45t^ruoT?m zeRtaRe2Wc4gds=VA{JKGp>yzVAIx4IAc#}!2s;H-&gyz@VTH*K5D-f_zk&PXg4{4el&(Wj{mT%kJirdhW({CBoitEw0^brHQMZeS zjgPv~Nf(;!2Bf>?WNW{Z5ihUKcVIhO&1tKg?c*Tr1a&=-r8Q?7{J2$+v{N;z!tbKZ z0p0HOAl81oSv+jmUtOno3t)}d^@-qRYrO+dB3#y` zBZ})o6B74M?S9I*#aen)PA3{rBH=%m3VRHbfjjPZbELAb0Yjik$bXmi>T{MMCj9Ld zh=kkuV$;L5^4qN861RLd*0frjFUd2CsL4aH8YmAUnD|h5{#c3NQ}}6&K$*h8Wu>y! zkwVfb=l$-!a4+r);Qj-CWdKmI-j!!MRh^EMS+iXoIPJ7kmt~?WTk3Xw)y#@Cd;63{ zOrWv#J8Me^ODd9I1omFDDzcjGn6B&q83NK;>(-uK#5(o6sc-3TJ+(W z(Nb!ogTt7zemoAVIzENA$yEV#R@bRko>eA+SLMz7!dlW{AcDg#bzyMl2Nan+&|7|6 z!-R=d!B+Qa%$Ca>6nHE=ktmzH1o*Thngsh#KqHiO2Z)lNh=WCxuq7lwzxgyA!XAY8 za8M3LlD6iD0omb|3eb7`$Fjdhu@}e#j4w~dFBl`sF}Bj}8pJ>U0IYdy!&> zl@(wfUeCZpU!bVOZ<>R@ffBFod+=;my;NT&E<6B&du+E<2LUlo%vU17&Acrh`K@;w z&oRSZzi;+?ur50TaAlw>>$R41_w$iAg&K8P&Zm181HR-xK->}syOS6kh1dT6HeI4G zzj9ilR!liepUq|O7tMOfFIN4YW)`vF1-a4_hgm9L0J?%>Nl$3wLa`$XghLpeC{!08SaYm0rHuI0L8qHSN{sULHBx{~;8t544e zD*Ce)o*yIbM+VrNw7 zJv^qod-?$4J)TzLqb>cpU9-av6N)x1vs8kz7-sJ{jL8SesCdi8b|Iimp;_)YQYh4j z2S_8+hepk#6Ac?}}Bh+e=@BCP8g&s$_x$coB-NT_19`7~c(7i*Bac$b@_GT`T>cUF?dBkE(r|A|>fE>~GP`Q+jr22W}!B&t~mE(Wg<{g+WSJ(q!+y zoTw54n1Ez>#Jj(N1QKzg)c}1#`9rM`U*ddo=kbO4OCPjk4)EF(#R9dpsxx6^{EMgq zAZ1gVN~IsO1+sy{^?l5pMTWN4)>rK*xDzes{!q=4xm(*z!T!eY!VML#P8ACcNq{_C z@5P>;`0U2fIQRUA3mKWtol3LcEPwXNt49c`m8R*T0$1NeLx{wIF-b<-8lf*;3fg}; z1a3eMHQW)#lN$opaR9;sO}WNZAnOjewJqWub$P;AseW!RA}N;n?(C1W;TCToF%Diw zQ3Gnp*N*C0H?HSxG~ABeFJ#*wg6FWf_^l&@vTh5VfbESN$LuAUg_3x`1&utsSDu$S z`LjPu=Pt;LP$ zNpxme>%8xvoUDf-fWcr-Jpk4GP3pAyOzQ*I{TnAp#txq#(kW1Qiiwx-A^*Fu{?*6R z{=<%QPX3GCp9k#}p=r7PPxSG;^1g1cu;Ksln{Rz#(8d7hKG@u@oabG*f#UgPEW1ex zQ2z7*Pl#ajDqp31kS{Tu&*}*-+_Y~{)Tg4P7DP0?N)Ttc&1VJkj8R$+x2IxZ0*ZOQ z7do63+Tb8Y+4~;|^UCl*=9UTO3Rr-zmtwD2FUZt7*O!}Lpo)Hg*-2Oi(ur!)`V#=Ha|v@x?Zf@F$$?;H`mIc_B&8o)ComKFvBJ<4Isl8f!_3#~!IP({uzf+IUJ4f}37=U}DV*I70U%uBK)| z=3^n8^!~@E^_{!UvyeDv;ItQ6#e)BAP&fp5vLaCQpO+dIIGIJXK@q8?=KCY~X_dsS zve#g0Vy_1$z1B5TwJ$|3mt)N`KqB?joPG4UL4TU~x?wa-)~sGeo(HN6&4j)CC?S7x zH8RX#xZVqJR)7yK~r3jNs*{iJ@`^$cybRQ}J z{#Wwjqntm%Qy0ORfED;o$w}?+o!7n7@r|9bLkPndW4gOIfQOk`-vDAEDNEo2dU38@!c+& zHNK7*`fH+cEq#@{OA;b5Tx&O`8|9(7I<6hc*E%l31CM|N!Ne!W1PZGVyIO6}UI@>u zJL25}^omifzy~gMw4#Dx3PHKDC^zkU#N2DvXO6d@_b0z%I92N0PdKjKjo)#mc@1`Y zo;uwOiJK_zmDZe6=d(5cp1q z5;_iiP`h#?4;uD3o$zXz;$!f$$ z(7f7<`sw4EzdKaude|8HE2vN13J(cMSzV9~>1N`|iq`Hm7h<;l)}E@i`EJHCU5?Yx zqqsuFkFl}D?B=koI_h#f_0l=Xk+?zhG*jU636bsShq;Vc(+(H29oOR;juYmM@9}I{ zuo?2g^9v>Q8@UUPm35W-*_pIh1*PM|3n@M;5T&{?9VB*g5Bnbkq}qi`Tvpqux;(q z@-#~1#zd^icD;lv>A^-U$n3kZM>r2DwOMd#2_q5oU#zwh z16so zakFl49_T2whxumcq?fB=Vw+fn=mf!{RY0lv6BIQ#wPNuIX~^2gIR)ze>_8OS-LgzR z#*6iL4FrxWo_M*?7tb|{hF$!xONg$$9S$rPKa2UC(EHk`&%3?h1;fVx;K-^|b+>St zk&*z`c`~6{iJ!(8hW~`sY7mLwUSgx8uUhyW`N?=~6j!(#mny z^%t$3wiLkvV!z##&B)}~@MTZBQX)kN=i}qL%Q*};NT|!EwpFDe<&CKkVx1&jpC)>& 
zJX_w{2bRorF(7a$H%)D_fQhLSj3|Y7yI=siDk?hb4X;wUmhbusT+FR5kJq{WB%H?V z0UI~cq2ai02YttJq()_YW6*r`gReI@Bvz)|Vdl`O+Vn_2uCKqqhV_B5 zW*Y>EIeme1Wyu;JC<>&cbtMDucDFtKaR(63us!5Eso50R8%?kgl3!iaf(i~vPuJ*H z%Q$q%i8|+(==u_{H77xLuCI1f^?>Se(j1h`d_e8J(s}9SAt|=NQ;ZuK^9fbfYTVmP9MrySl^0 zul@0O%C7Km!U_0o()#nMor+ptP!NCLrO9DDri^&gYkaC*;ALszY{v~#RbFxe2=;TN z;p#Bb7JUtPENfQ2+eL#%KtzBR^Is={UyB=g>G7j16F(R-2_d*Qp?X0D~_>bFg<3X@Goly6l`EKe7^fZgN;jyBq` z1i7%20aJtvk-GKCa2~b@*DKc-iRM5-TfS*~G^X!+B57} zPQ&64C^E!F67#jCc?;i+ixLuai0T79jfQf?LX^hOPXG1jBH&hh8ZMTYDJa2F zPy&Enu|e^XJ*Cc+S>vP$&j+LwwYuZrFeoZ0G?^I|%KZc~vaqO4n<$a6Z5xm{oWJgz^k>+9CIak3sEcc%9u zE4_frF4et;1=%@(`%Mf-et&l-%eA6+g2e^c{;<3=Ux-QUu=tsAVj4Wa+pc~;o^i|O zHnf(%*mJm=8hRg8hFv==Km~`sGJ}!l<87z85I|Dn_Km|#qcid(;8_VP^2W~ zzg&~4+0y^)CGNaYkf?-3^U+Vty5QGm;8j<-irTGpABV7srdf5fRw=TVo2|d;^>`>r( zU*H1>zeZ2mZUU9>WaVbot$z*rU+legTvc1wHhfeR6e*EzQB;sp0cjAGbEGUv1Ozss zbT@1f2_+4>L_~=#v5`(eN@Al(mvonOe`9U%+@5;g^M2obzwh(?p7Res<;PlUuDRwM zbBt?T;~H$_=mnw8)#;JJO36)&R=XY-Ss|ycSXgEYfseNvrPjGntE^YgQ!0t2|1fl@2MJjdt6ui;oP9zN{9B7Hb^DM}vbwyO)rpVDoNm#4PlXnb#=3~BZFH6KA~74DCyr-&uEBJGyyGHKMC z5a(|grfw|xt%_ZvUBH?Hj>p<+t?#yi5(4uw4FmQrjAw)bfB#(A)5MN9tRF z%DB8R0l01qtTpHB%P^)}ewVj5j65GaUYwYxD#@U$HXN?Af%BN^`mrCqM2=8+eDq!# z7`N#ih5J9bi4YNCA16L#O{P_i+p#E=u&|VI4~SvhLKz%V_obL%eC0QO`7u=350jQ; z@P4oQ0wzKo?9~fWWO|a0{xdzuc5(Y>dj45G|7@;*Y|lTYnf%XBu7t}Z#M`Hz)Qi`< z+Xk=3X`m6=j$0n+jTZD|J3r8Uo;&%rE<+@oXH@9`uC(}&#Udo6Q<#Rqtz@1%2cUyy zi$A4NU^Wm>UK>@?ZV0cA$V!S>S236sq$gLdfMa#`?BYz298d16c2p=^?>p|v2v!%>^l~A-uWCRB zNZ<69k-d$9AP;T^LL^=lOofNG(BmUpF=xA5&`Wk`A;;y|7Mw%9Ri166YT=l&OuxTY z8)WJ8Z6oc9*Sg}oSnLZS7n;JveiJ?Gqg^Xr55drvrhWwesuHh9Ce3Ey92y){4Sy?$ z*!&!PgQK{b$vFjC8rinv1S0EGeBH!r1+EV;^H;6}2ANVzIzfmjB(0LCzK+=sku*ou zWjjh#d_?({-vO*h(Evkw9IFw57tOZ2BNavPZFk~q)_&|~-qOA5qjzTKPJX`t7tU(r zZ2IfndJ^Ejs!-ex1Ez4Qnw3Ym9-?LRaP(NuyS>sa$O`)A4M+7SXQS*`Qa1- zgQj2rLOxpzR4Bh`-D`mY5<)x9t=j>jfxXxIO<-`^4~ZcH?;F1U{m)|rq??yJf4x`n zRroIt$6ob*t45z~-znjKiM>oR{=I#`5rem#fOqJEOYY2Oa8%L9M8J69@ut*ULY!6f zJ>>(hTDkYULy$@228PFHe6-hJ8d}WGV6_|)vh`xP{psmU_0Q$ z&!$13Lh?>2LXcZ*^cN+=w&Q}G!1d7qk(=C(YYYI?fEJf&{-Es|D6FukyuBHe&z(98 zxZk~VQ#t)N1rgl$K&n&Mcq!Clw_>>kQN=vez>ACRG)eSp<7M?+Hkb#GHQ=H4#+Yhc z#4|0}6$=ZQ^VP1Ie390lf7vssUPTA6VNd47nQl+D2(`FRH9)s2LqM!gb^WePl18&< zgdB5$@=(TP*U6vfa4Qvwb)r|lyct^btMslMt9ZI0qFYsp>5P4;{%*Vi3-Oj`rPvUE zu8e-M3N-?{=5uFSM3`jlZ{y7!wDpM@U2dDII21q;AmI3DtHU>D!<75esdhgttkj84 z%Np*uh{_E|C{8A7OBKnQ57;U`v$@q^$~}9ANwOm@L=i_Aw>DVV-LUd?vBQiE3bW0$ zvInvNBKCrPE}pGDw!&%`xMdP4iFR$h_a@5HuG)1)Ky3M|J2Ib@A}k-&`a_D>^~cQh zrdV($z)-l`7VRjJGnSNGT75jG^K`wJ#C)vzS2se^$mF?tLeRG&La+C;Xsr5WvrRiU z^z_6yZTeIuILaTbqMtr4ExGNozp!>o-;9&gl2-~l9sg!7AByoyjkM;%gY~p_!iszM zQsra{h$Kf|xZixemWy-YDcD>{1=v8N4A3bCfJ$P5?~##VZwl^&yG%sRlSbOavP?EN zYNFFO;3FE}o7Q({O%7@I-`OAJ%i3-Qq6vMkms5uztv8YQj5Ya9r>1BqYqHVveMb-_ze{CS}MpLW0aE1;fz==56R(3E6a}L&nK#qE+4CP-`=A?Qh`6-afcD~WlZ?eKmtjjz3Ut1B>rJY`6QQ8W7I z)+RD!oO#OM-D})uI*Kmm(&B&j788V|x$qzjf~+B{AF*_vuRq@lq|yY2Xu%91S{|^4 z2YW;H{ORc9CW4TJ7cgOkb%4^_J5xqR2DF;KhUY1Ti|uK^#5`EYd}{>`d3r^W(O2(7 zdJ$Bk4789|(m;9A90A9L3P%-WIxyQ4CI-_HeVL{mJ1uc@dxm!H&OBVjcykj7 zm^tIq!vLHb3M*1~9%LE@;)7Wm^KkH9b(uA1uoPx(!*bfIpBolWt6+U5f`wfM*jebp zKZ2xK8&l?)S!=EI%nHf1galk>?Ib|Q4B-g9w-NrX_*4~r`*GsZ zn~N&|(D1$MI)L|lswi`S@OiVIWtNc=SE$cBOV3e%Y+9+!%E)fL!IZFLJ0-Adw=z{# z{_aRu@2H;M#tEffMB_2qsSl9NGw>R1d^2pyVAkIVN6bu(_9ixd4Om)C zeJzcG^G&705=*(gbU6D<2&Y$LmBqYdmkws>6V_@M39sWPpHc*;41U5g7h3rc04>w6 zIB7zrFIe$>b}q!4l6_i5oT-{c?a7pIaLyp`f3Jy|lg3oO z5AL~Rb4Ikl&>3tIfZ7iyxE|)Z5blZX^b@~ZJ9U&;mc_+->WMDVyr)nBn=cir3&`U38wo25sGZZbmg(SZLCm3a97mxd=t!K>vU$Si^;J8fAPiE&fH32e}S!YOYfH? 
zR;Ou-m&=3fwcVfD*JyN~9=fHyl-Dw7IzB#LV-hgEP_l=Nk+Ho8nHET&^a-COj-gj4 zYO30tHF$XGjW)OMaC<8Vj#Q49UmG?+<+5|-)2HLN7QBK8g<6)edMnWW%bIC<#9P|0t z*R(5B?e5@Cj@y%F7~n`jXvnk;4gAYJ)4WcXRB8lC@Z(D-=-!yJ^Mb6A3wyWqV!-iU z-!t{JcgF|(G1=gNvu$3Rni|L#>=LN8fM*E^a(H6g!bbPc+)i%VAmP`I|f5q?(>Rex& z3H#a!|Fk1yg`f0@v4Y_ZKkY2S3zW;?ywfT|_THj+QVG4<$R?qhY*>>!94$H{kD09> z7oT=e^4=2BY#M~kWobXir7b))8s+@0nytC}Jl;2bSwgJ6Txt#}_UzlWq5OKR-m6{H zt~3<~u*^$2l_cwXYM^2T%!37Q{scC7(NvHKo0oX+)V8B@U&?Fu>{LR*AhH+oH=H2av?~_520nEuo@Wzh21S8-JU{4(ocDxr}R6Z#Z_bO>;3u)v`J9 zsL+njHUENMLl#EK>*L#CvROA4m@1x>isy7{S;s+c=%6H+pl^Ak57qc+c)g7IbT}r2s`#-Tka30-*s{4LXt-RJZ-W3zE;tRLs|A=O2^ynQ zxG%%{bup6Y7p8EI%&Ey39!FfH{hZ`6PXwB~s`z~$>$HsMV-|C?8K;56Fg(JeiRP{v z6G_}z3%hnt?)`WW+c$A7CPX=NSl91Z(UwI=}t@;lr=>q;kZlc|aHzv*Y*xl=~sATHraF~x{5jS6& z7VI!SGsS;ROuysT9=%%(6{&K+5_Pt*B0yem^d-r6vStr$7o(3uQ3jd5o+ z%JH=MzPtcOeDM|rl=F*LhRD(C zoSsk0XyB*DJ}LdE-MZ$)QET7Ep~P#1JB;!9QV$J&Yu-#@_q z-O2~BeeWa}5|uobZ8E)>9Zwjims9sXi&;mV1GzBD?yCZJez<2Mpsrm83~Uxqie{`o z*Y9iI_vpBa6}CUL30dmq6X}(nV7CokED=h~xLsNBS+*!p&_tf{f^hdqT+P^mRl4KM zqLU?&w2D$F)5C4keB%6zL((dXW)H@jF$c$0-X0`NMe|{979Cli#i0*4iA`DPWQ1ju ze!16Jb|)j7s4+v*Mnji%sllDhQ7_IDV zfwMgeZEH?5u@f8O>*kBhEc+otoLc5DIzijla^Sd9nAny4Q2|%9jM*2wXEA@{1uVRU z)m%04izjAoKfvs^AOWZ!In@tyUAYsu(9qu0yP3avsJXNFtQgkSr9()>q5U17Q@4sv z&7|$Y#yhpf`Dk@fu}BY*WR(=6ab|ng4-2+(1u{k%bZ1I|;q_#%WMliJ*6QMNw2DD& z{~n_j+s#eE2#Pyq=?>oSqrMY>8m1i7^~GqCpEdRIVVrf`jcUoGc%!$%f2zdLK=9bw$M&pziIc`zE0MPO)J5X19y3Kw zA-2|D*Yc}Bh`t-D*CvPkm|>g0XuP(%FuT#k&}?Tud@KJKh583oZc8Cu9(T5}7KTIK zxEe@_6_#pmt+vE=iP0E&>vnQIz*sbn{=EGqo^vmx{fh5ozSX@NKBu|5%Jz7J$yu(avm522 z^?8{e@^M+A&kBnt+%~s8wKO(2aH@DB9mf*!I~G~b-jXK{ZSgoc@Hy=pYM-EIHETO9 z>ZR&Hfl|&D8M?GREaNLLRHd7f;^j)k(RugWt=dBUI;J)U_>86=_|fU!a%p+%#M+r{ zJ(^R*)>gf1_zsyufHo*sI`FO0k6@QK^JpbV(Y#h!ks6QEOqU}uh~<%5**&W`Y@`13 zI{>iUSPna!Tl@6fp6783_jL}XYS$~Mk0ntx5p|Ja;^_3qVY$VnC9k5QqFb1k^%1#* z46*=0G2YGFaXOmFM=l0tnnGvw<=5H|E7-g8DUIHBznXZkcvY$OjC@R6e#3Fsx~Xob zsj0E3%iPmk`L|<=pS}>#{H_Jiq@uGr-;J*0llZdHoxerRJk=Ma#y!oOKWE&nGHdj_ z=+3$J3M2<7f5A;4S=FnlZH}WwvU5}R_CA-_cW|1%cFXnQUHL}$?Tw|IZ|_~-*;nKv zu~MI;RoIE8Hyz6|dwVD9qhE&HYjgjQjsAQ_W)YWpk;U-#dc1lSHicV4w8RsX=^uN{25OFg7;^M(#F!Pg! 
[... base85-encoded git binary-patch data (remainder of the preceding literal) omitted ...]
literal 263596
[... 263,596-byte base85-encoded binary literal omitted ...]
z*RCcD4$2ZcZ`kQYpT?f8ja3hB&Er21+P#TMr|0*%Y)fBq?LwC}u$%g@Nqu{(@w zcgN5_=KVTP;b_5%h-nexbS`Sy(^El&+uiVRsnbuZqw=KGFzHye?qiP+Z*{3>(&_`> zD8%me`cy@`H*J6lx@IG`%5Z71=FG0BH452%+-@P-oEhDo$?kidDX3Kp|G;ZoxBMjY z*vofK?(}(_!T^_jUXp24%xT)2AO?4y$4h*QdPh2Ys-5er?m~87RZd68?0Sh}PN#qO zNu22M3HDu%iHX)*_s%QJs-2mxvggaMt~eB)c#b~e`P|le*nE3%;?Vd|aT{GzMdPoO z9_H(Y&ZBZ~DS%EBF>xGA5nO+CUOtJ`TjQD#nnrFZJBPW2SD#7oo(otdt)i;JM~i}e zt#=4>M=Vhle@hI0MIr%qx3u#cRP0hkw!X(Z9fs}qEybLK?xgJvr`U{)mLnaT*6mr% z^}XbX=;nk+%pCeomUl+${nSWps?#bUecJ~=E)Mp)ne*^{oa?YIWM$+FRD*8ACEQr- zt?YD5!cVUJAL`yZD(bcEA2mQlL8K9e97&Oo8XBpgVF(dv>FzE;X&4wJXXtLErInTj z>6C6zQu=(^dq3~*eYWi9{NCSM=d5$qnm=5_@;i52_jP^ZntKk=*o?KD%&Si-@0z2l zZa8}$CwYq}y}_9)KET1)PVx;53>m^GATc|jxh?78Ie5$)KglH(+^5~uJ& zZuR2L`es_Wh)?!-{e_L2Yok+c-p4Bx+KSs27Rg>lGo|gPC(REvtW*vytyUY0fb3Z} zN5S}$x&C5%_jo5!Er0n$S^eWiWlxe=trZOJvGlOZiZ@$fh5P1~P?Cewo*QZwt!1lz zboZ>@*h1-HLbWPS9ftBA(wrva+qt(;Pb`mMQebfOJzMdF{bbhbb!-9?A%P=g z+uRu~@dm#sR61KT-oEj(9=uP%2uNUH@N4=lYF#4G_NzN+O?V_Yn%||muYda%` zjkIy=k3pV$yEs?+JeDFsr?-0k(5s@hwcOEMw{+zY74rsp6iBQ?TDh)hcMbcf8Ff`*nF3qo?+pH9b*w zO*pl&WL&d3FO9OofU`DqCK`BYqK(ZxeIV(LEZmuMt}_ZJ>9qXeF>HQ8^?A6*cw9~4 z-1D4iL!tR@?^J2sUWb6$$C(og$=2beo>?3^uAPv( zxcA9Q+eaMLyO}uS!KyjxkUidb6g~CvqL_8T$IXz_FU{YGu}g?;PLl`t?LAY=Or4 z7)XbWAD2v{GQ(a&MJhvzs>^2TY8uCX)J}V)Zu=VrwHvPw>J$Rm z#B(7le3A=z_&{CTuA4d8c*t6LNRjoe>FS=W-7OY|Dp&hGC%3Ye;qEOi=JCrs$Ep6h z0B@&@3Ql6}ekQe%?UIAGjnQ)pC(Fam`2Nk&Vcw#erH0+T2gSvww1)8=PT^}kUTW;f zQOg*9(n%HWN;t4>&c}vjeXt3Oqxz8or(vc5NTxw!Hd$qy(ysadF95ROMl)6xhqR1_ zQhSX~aF3X+x`QDj>!Pp25CUvds^s_;j?>SH!*whGmbDVJH?m@{ICPwB1E2TB5pfO^ z7;8J(SEsDmd&7T*tyuMJ=mxYocNyuRKtH>^#4yghyhSf#OuANF4_7)PBT#~kJ8ChX zQIqdC#!>5?Mr&mkpbah=%I#sMkoY6<4S_n22&9ge<4>#}ZGP;7?D5jMj0$8U9hZ5` zzNYuMJH$KO5TkCE<8*ULU^H3(bs4w%VP@w_?@1rad5pjf3ac1zzJ(+1(LVt64WrBD zouJLfL-*h8x_VuL6-L%)Pf<%Er}hSCcRAdH> zTg-aQfX2#eUo1?WSab}!sh92~EbpXfruRYj{7Gg#8VnRFHj@cLJp6S=-%G4;o-!F> zw=r$S*CVsIX8oX^F^+%#R(gNtEi!C)Y52f&5ADpX1=nORxhS?^BlZgspefy*^R&i5 z;R9Hkt-cx|R9yY_P}ViC+O;AFBX)wP^sA%Q6zx>ph(1efIl^1~%*Fm4!>L?`&=yF& zi1u8zRvAg^QV8zaF`V_cpcTv>ug#rC^`;M6;q5LiPx#AC7J`A8Wj`oKq25*hoa#k* zlG^F>Z&g}7Up`i6WL4$I_*sgtPzycOddzL)faN-p_$%NB^$RsC4%)XaMLi!`Jc*~f z$A8gA1zTRAyKHn^K#n-K1P?6mEI5R!+zon1%&aiw0kVj-I)b?Itc!H1B7$^si(DTi zP)y6EH@etX<+%6U(%`bl9qAOyY(L~xTYa2V{|1-DaJ+2FyX-V89B%9cl_N$!4`GckFQ6J( zB#Ns!WaU_6y_SKhU3vkEFE*8{Jw7cw7?051&RyO~LOa|-`|j$Q6;@t#_}=!Q9`*5P ziuNsU*?5)BKpIWYg&Jv86-9~0`CVdxqks*CS%KdgZGgLZdC5UEIUR#6mj`7sW(Sf_ z(O!8E{wf_FIzaaHIWw_WO@AtwopR|8`-$Gub2in|6_$7i$LYZuQL=s;wr2Z|-|`8i zEA9A947Dp{WxDMs;Itn{|m$k{N>7$83tc_6< zt5sSK6LeZrKJN0BB9&S>9nye0I_-X|J#u*SptH6)zUO7q;ovMMX*lq~oH3)#o#m)X z_lAIP`E`V^lT$r+3^%$nll5H_4=lL%k{gzny}aR}wZo%HI~6`A+{t0Ci4}mHAwKMc z8WqttKj#^o^@zzX?wYW~+(pX$N>t1ef(nuj&Q5Ft@wZ=bpdNFi=?$+7&;aRH5G^g<19k&}SW_e1aZSU;L;bUOmB(YJbdT7en@zA@~3AhWO z*plxzE2cl}D(!98U6$CHmZ=Ol%O|UnyEIJD+wfU&Rx>NdW*P%|{k!eIz!onY14?Jt;?#e_+?N9 zW7y?p_#uNek42Td^Vi?aS2BWC7n>;0+oe>8$7<8#o5k!^p(Zi$kA<9i?DiY@qOcom zlQU(OIAkUu|57Cr2|G#yh_vjXJlPm=7?gL`)EqT9bu z2Wa<#-*@1L@U8@FbUlup(PXZ^n5hXm=zxLOvliPytmzo=!>!ftTK~GvuwR+-^of@1 zp6Ak(dQ~($%t^t4m~yc>KO{*bPdsfAqkvsPVG9BRSlE@UkYmti5+SPC*dqD zVAsmF3ou<4RjwVc_hjHyVQOU^XAq1ihl0?YtBF~#MMPK1LLDHej!9G zq-)UGIvt+5K!KR^@!uM}eNqAEo+P-;O!m*w3OAye**I*RFD0l~#Zn}HLxYC6-GQFQ zRFMIVoJ0k5f7jEs!2LhJo51#1oXfE>Qf{>GmEbsm5Tt!4l47l7qxqh75wL4R>ME1wT5&FU<<&k zYEI|uTIG(STSG1%fdEp*Wtf~;wW?5-B>Gb?4Grmue;{ z=!rBPp~vIs+)?5^$r|l<;jZF5+x_u{LMOl75KJ5i%$JI%@Q(POPu+(C4-sEl2b8Ck z?wZ20ADiYnj{w`}(|IQbbaNot=ywRv&v~mE__E?W`*;mye*e(UL7z-)hWg3u7q)OP z(kis$5=kK;cp06%=Tb%f6K>ljbeY5_v@-DJP2&*vZkXdW|hLWwv8b7VT?2wWji1OP1PB++S(O@#$DB 
zKR|-?+k=!XTI*Y43}s3U4rXYAqa;g?W)y$*KE98Ot}POE@9m@GWa3-f=r(LesA^rW zy}1N^PUG$=1oOIQ4(C$t7ktZJCy3@-(FV-Kau;CgfYBINe zY3$&Y%>12qab$B*!5 zDzz}h1qrEoQi3qDUCG8<6}hhl_J|W6`Rhq>g-VuCv9n=fz*3#K$j*no9hq}5pXS+v ztCac*$q4L{unJf9=d10%l}H6+G)IV$N@P#4Dp}d?l94jc7F)#9rg*bEUMv!~Sw#(3 zr5C9dOWpK?XZH~I*{3(R7`mj_xIQ8EZUs}$Y85t zv?wDZ|DYsM8lPA63>8anv1_8nuk|tp$_=xc@~p=0hq0|1=dG2}56)W~5zt-8goX2& zbVJ|Ccr*6PB1V7aH^oizzswEZVg?1>`@!_;=*P#K?(r+JyNowEa9>oLtN0>ru57DP z`nfHNf4+gf_F3Xq$Jy#9Z9cS}WAm3un-O7T8$+JF&IU<23PT&J)+DL=%c0$*1W@vy z`KKqZ`PY5D-!c7I31Q`Z+9YYcL>;PfYp5!r_F-RGaj!Q40|WJ|!G&g4Nvme@8<+|R zcO0ptdy$lK0Ww~pom9~}t}DHTiW!I`4XqGKv-Vc&(UQOrw8425i!rfKzsIgo@w)7% z%wC*T{Aen_)b4-toq$)Tzmb1D+HnSJbCYvJUTh6k7njnDF83-fB`_U5cUBQzKSMAO zD#@9H$hw&g@@gVT2%P-UB;Wq_q69icLm^6cGeSB^;uYy0gjCq&TaBO>Iqhxi>x%4b z8Rz!*tH7jS!gl2syO$!tnKR^kk9O1!Pd4nM?R!oh92lu5SeZjVFwZ0DZvV{n;mcz7 zHB%AEV|u?{-^r)-Ysgk_kK$$`31gTFqZz(v7}MECy>^)=V_?yDe8-=fe(9aDV%6G? zGdY^~4-?pA<)RRifzfh2%!?|Y!o=3sGIPH1v*W}eP1`Hy?87-H~@&|@kWDG-YyHvd7jELojRr-gDd|8I$VIKiyh3Fn+$z2ot?P2Vv zk1GW-)HS@sM+O)4(n~yC0+{UYcg_$_@j_hSS1cYPvqEW*^;0%o>}!_lE!s)U=eIb} zlLxDJ5Cn|R_)XW!Y2O{|IVilWJ8?>2zHm@?`POkK>{ehTD(j|nWI0TV8o|L!ITkWN zCHH-#_wU9d7{e&}J zYjlHGeQ^eYL&_d7#%3EKmu9d(86$_W%IR9y?FXj8MJO*m$1w}DBopdf{YtYIPtLf? z;btk~kHxM!`1NdHMP$jEs1;|0@3%Y%WX8|Jbp1Q6h@e*yt;|{~L+Gn!)oUzSj^5rT zHT_v@F}UI&(|rW9Wd7%S%WoWxScchX(BC@H*ruLn@`DEs%4^j6-?0h3;s%M-Fwvfl@(#gprM`OuUVu2s zlA6w%x+A74QnCc5w6jVQlry*&9J@2z04WZkm8Y;aajP+t60Z`cu2$h^T$ni{8=ioY z6mmJok%dT%%iV8L%yk#D9cup&Mxk))iSTc){uQ*7ddM!5aI9n}bu?7XJgO_xD46aR zt&CS|;Q$mVUs#M{-dF1osgQ70?J8_!I&-%lMmN-A9px~HZxSNL{alNEZmI#krMgSl z#hE`~YK4$EFG|E~_-a>ZBqcyb5aCl4oyqAXFGEsAgw4~P))<^I!^MLY)bA72UG46M zb_Z~sPWTH>GzaK&zljmoQ?9&x${^_u=Ggz`rtL=ll+IuO;vSzwh}e3_UMcTr!-Se< zHp7kSF_%t;wHr&PQ`ls%-gG6twAz@^N))fM7sR3%`-Sb0`FsA~YvEN4sLH3z27{yV zNitK`22`{;z7}UXrj`qJJs=?bS^6lE8UbBX^3G;K_lJ(DcCZYoj;&Sdyd)9*>5I3G zYcho6`eawqT)`9{V{!?nztsz9%b;nE)2<@PHKKlVRwCl*s)f;+&&jCxjm#{plmpLm z2zErTNKwRT>LeqF^o&d^YQwf`{iYD+(k+eIjJ43rCZ$x1C3WA+tcuK4elyaZPjQiJ zZS9QBaQ{9zm|=s1+NkxtjhHz{^h(Cv{`!j^hYr7-+_f3OHL(|AVmB*)VGGxFdnXE2 zDFm_RZtPpX(%}wHZy>+vC3xKd5Z1xFR|A=FeS)oly)xIaro_Qf^|km@rl71Qei2H5 z(A+9viHN+$BnKgW5IHZMuBDjD#LF}0Q9LdFca5M(5<8@}WeR6>^{|}P+(wf* zG-3HJap#nd98bG6vQL9HKtE=Pk0c(~B%$3BI)%9s4yhM^$!EJEUfevLZroyi7`pWc`2Y-n=_z45S@Tf9fRx^`1P+lPj|(Ma=U1n z@?Sp@PGov{>go5Bk=TzoNU5J!xl>4K?3B-Ukx|%2U2CZILIS5w^|^896B3E^$9F<4 z_@SMFv648@=f>sQF+xj-(q{#Z`t=!)C!Hi@t!^OK@dW}M3yR%;OrwN9ogWwFLO9cq8b>T5m`89T=IP*Gjl@VRv%f zGZ@>13MJAm;n6ie8pJ|C^q8TVKlk^uFr21n=Ow}R)I)dBKCOAc7!FTh*}2yEf#IYv zAb-R3tvI$8_}tgq3Lw?4GuOG4+W{e3IT0#veN1?jsSPh!2zn_rC|x&BK0UT|$h(I+ zT>ef$irqC^`e3w&Xj{iOK&@wMfqZvC>UaDFyhHgB?IaqaWSVh6Ed50Mb*>om>Vgkp zb-!zkm{fg~f%^NGJ~&hxi9T@NL7mId-~#2to=8D6-LimtZ&`CSz?^kw^f!6Kt-W0e zN@2?12m&pJs@W9{)8Uz&(t6;(O{%SOkGKyvD=Ys&EI$-6SD(*clqUIYDECu83EypX&DFC(d{dHYR#`$peFGfse!po> z0WCkK=Xp2!qa1HnyFPh4QtmmzA|7=`ku?~9V&V(Pop zxJfAc*e8A=;12HX@N3hY@GVby%GLRthFpS!uxBP_j&UT=6TSr-DbpxoqOkqNU)nbZ z#7zh!kgwZ^M%RW1PTEoWhCg4X6Gc8bq2c8_>DpQ5z|Gqw_XrI-4*b4`+k-|5IXQS!hj6p`=TDQ_%X5+L42_a8w7k|T#R|K zC%mP^(@o+g%}N^*PeM4P!`C99^|z*aPzUW}H3k|?4~G0A?Yfy}?|6lkpl06pTd8yR z;ooGHY=M=BiawHVc;El^sNgB1n<|8>vw|<;+mK^cCd@b2u0HFl+y?nUWpo~dcdO}w zI#z52?XYQxg!3sq+>OtG>c9&OuMCDTVsB0je8_^_&rLNpz82))Vf9G~Nyl<8EwNgW z+T?xtwx;dnm?5VC57nLgScNa({MdandXgtgs@ih3Qffr2QM~o*CXZ$da2_C^ODkt( z%5;_sbA&P51wK=u25*6Yw=@=7Pp$zwQ$hz#&E`>4(OVifi0_m7%ZD16B2E*#s8SF|%dEC9O zY~8>>3l?s~g}!E4Qi(=YDDn0Ys?QCI7a^p$@V@jPGk6P(tQJY!RivHI5ix^*)S^r= zZI4X%35AfR(ZiYF@*GLF!jb~PHWLZP>7{DvXzdveLp;2fEd+uokQx!2l^F?Jjvp-u z4Fi~^Cwr$kiK2>jz@hxEN%u#N&~DkS>5Tc>e@Hbg7>XdjUpJ2P-N7_&dXm2HpQEMZ 
zLh@oce3o!6RitbkaONOE#4uZqSFH$&^mlDp;64`_AAyV;WDWKYZs(XIul|}W$F76z5Po!F4F&P-44|nR(R;|$(-~~_>;0_;^V6Zta(c#OW zoZeg;{c|Zgcueq@hr;Q9jDg`;4SNY+Z|~xp%g=hSB4W`rx!RqLyw0@J_`}il!P3tH zEX6^W-j+^}^Ob9G#XoY`z02w^EM} zO*BPYwyn;f^z<+%m5;f50lnGPHuke)zT&paIz}%pS*UCh? zdCdHrs=WZJj_x{5htkJO>g{}v&O`Ij5BJLg%FVI0r8Ua{mBlSNW12eulRD99z_O zk(9U&bryqiLD4eH5cJVs8_%T}%*1Gu6 z5f6HJLAA?HCS;d({?alS91OnfV)^%g3t-L|i-zR>^Bvr*z>*3AGt#U(QUbZ+ z{k-IA#c4%c8WV&|UWf1>t)tPyMLgf14G9H#lQrL^(Vy>l zQ9mzkJ0u_&oW4Ddw);33JnI0Q+hfK5Nvs0A(9Fy>rMd#E@FCsQw$mKmomw%nRtYFI zgJ6wX9X-6J9SVE~N>~sGV-)lMIE;aU?1Lr(JVEO}jq>(OP$FYnmM4#k*SFUB)xD!< zt#@Dc{uDB~J}Td$i3Hwgh26uFpsW?K#h0v^ra;Z-xO1HV5&rMT%w0Da%>8f$U%%_i zHy6`A*eA{HJ>AQLZNFFc3!8hB9={7Cf4p;!0QML$!f6MYwy80s%FF=?8Hw|Lfb;-?;p9g)Rj#c`Au{U#gOP9~RNm2H3 z;+l8=^||8)#`{NCb$`8vR%Mhs9YAQz2OUcxW+8>BtyfRILq3;Cv6z*2a~tLj(!0fZ;aPr45XcJnZKu< z<79^GPnhtn)GKgl(wt;bFzJe03nxe^Dj{%kCyvB>^U>`XpTE>is@Mevatsf@+AtrF zt5d9Xb&6A;z%?yF@HlyQu(p>aJknR_zO$&xp|WTLp1fniZLgEy*X zg3~Mg&B{4PWhPX(sr5fRdYn9%+5Qf%iPbm`>w(03ea)J}uDx?lsC6r%7d7H9{bJK! zfdBqL(3;W3D_ae&HaL_J`dXmX^V-A-y!iqsmiS_0p2skNB#I^jgLM9DkVtcBn2y&S zECOR3nn8NceI_xn)-QxO!s=9) zK4p0+JX19K-GM{DJI$TVX${G|0;q(1wi$z~W0MLg6(x5Vn(6c>o+abm9Oo>R`4SKe zQBnsuA;25XxdZPyH8q;MuajS$A~fHVNUN&|SZwnN)asnbhh>b>MqxPx2>B86w${v1Et7(f8@OSr1elG zaV$7AcT`7pzGHYKG^XZ0K1eehm#3;zxpuLf8Yeo+XHoA%sI1wefdS^{rn_W126?<8 zYc3478JiPg%rS{m@TO)`7}Z_})j{Ycf=v>)&B~ZGuMhM#zXIp6a*Pf^E`ZaRXm2?q zuEQFb^Z;_YZSaW!eqxlu0Qa!Z{&UzFcA15_5AW)nGwfCtMX#dFS{@=P2TkZFNb(0> zD>G8(w%mJLQ0)wiI;UZuqf?A9aqI`jdhaH|*p?)SQ_bKNNQJm!c*C_Kcu}%tXjlhW zjW~ELF4PihIH#NMeFAW>zQDcjz2SZ{*iLQRfi@(Qp(eAW4tbrYiybbpyg3Psnb`hX zrxwWiBv$~56mjDqV4hxBm&F!{!`Zjog3qx3br%4D*pAVZL7Y;ICHI5w@l;&C4Hpi1Lb51dtMNz8u;hAcUs0Rz5V3kjC&gr=R17au3!HM;T#+Q|5o5&D0NJ;>s`K8>Pp}@d_|k|2(+K`{N`qWX z?(n=+&j>}jItGFf5)(&wq!;f!y^l$n7VhZn@7`rQ-7fxO^x$sOqPlC3l4%`lUoH>{8OBCrPXP8Ru z-C2-Q-EOVx>)0c+E4*tdc58?tIQ@oCeTyD&B5tQ$od`|aa?gNNha#m`L`GiM^qKRMaHK6k5IAl^!ae!;KJ^=R*5!!+wv`5Jha1><_D{B2 zYQ0;&hIs>SWy+i59Uy@Gg41Wc2@8j5Byriwf83rG-|d&8Ug0H<4Tek{&4>4k*&2L} z<-0yxQsQ4xjrkn36a{?f&9@WpM3q3t{=om}p^KZBY*jpT_kJJ4UuX&y$1DSLKhat^LUT}3DR9oofln@4aIamQ!WGjLoJHDeRrNp|%B9ygdnq%B z{*qdMR|(*O|7RhS_w?(FfjKP5!Hmv;rt14td+X-F@%#whHzX6t4Mmq9WQ!ITq$D!o z`aeQ7iWzOA8dtDD7c0ezERHUb5AcmM_yI zzC$JUp^qMf2a9{IJ&sBm3-(WIrj7&*TF>bDjyJs8(jZP%B2u2Hs#8^!#3|af^<^qK z>XFFOw$xO91v3LO;nslz>J>l?p8`nTm$=&N0Se{NmGm2}%AHbBL(X~7V^cGgg(>o0 z02FVELCnrks0I+^-)K)Vz$Csd&|X&C?r+Uf9i6WzdOI~rEiC84@W>P2|M;Ymds~GP z2V8YkLF8K0Jl9M?9tYX<0_6_@Fk`Zm$m))^iG#75GL0!5_w1JP_Q~^BM5{wx6vR#n z*$7lg18cWCbff3;#A=BsAub#{wnzfj&J6ycV-A<)`@!G`ZBydcQCa|SOI)eBnLR!+ zTy&s^L{nR2Ax>Nx#BZp{uSpac99@7XqBtBX;HV6fP5#?v$&)Xjd+xJfbS)fRre%;2a@-oEy4@0|Rgkmtz+0 zov^kXbZ07RiHtYjh8&6Xe`1|Vh)^9OQPWViAgP>F5l;hr{i^=VqS68%QoF8cugOa zE_wwwFEjzT`Nnp?`ZVTyDr)(aWAf?sQJpgT2tb?MqHCvO09@O{7y>Fu!`CZjx<+dI z4 zHo&bnGEdaD9!}y$3y;e0ML6kscwGc?46dc;K8m(|6=$P1=VB$VB};OT z#@1oc0i%+fKTK^_=uWK0iX!xMi0pyQEEfBJQsS;6xMxlbm62-?(K4cv%AGJ^V_gnnI5XJCYdWkjo+LFV*V$z4u~O)#iqipjJSNNbIXD}tN0`bn?#oq^>s22j_zs{>ymX)BD5_=Sfgpq+C4ChAZIgPq;^7&FNA zVXkEIM5?J;l>(L%W3sZuY#n7^-LFSAi_V=3j`MZb#^*1Gso+zq5FUiE0EFegq@hH^w1YZ<#wx89h1&01z>h_o z(!=?mxLn)01^BlzfOBDee0S>`1KijDBn>+aQgaU)i}i<90dIH&?Txv^F&veewF&{s z`kaMnq~9egz?-OOU0DIF1mdSwjU7!8XqJj8Pv^|;*fPoIM+=1Ykx>7^XR6$Y(lC@%iL(z*VXG2z3 z3=u1{+9vQ`L_eq{BV2elaE-YByL?6%GH5KS04}e=bP<3AniUpJa-m;;ZqW-?J_Zz# zF9VA2oYbCL%neAAHQrGF5UV#<5FPkouN2zq^!QP5x=bLrij-N6G;jA|>f+c-d~&zD zSgqM&{3D-@!X*;!wZB*(def4CLI9YYI&I;mhT4nl??8T(723XbzbSnLuOuY8v3HlL z_|PxLMae+2Lh%8~>Iy1Wm0|Uc(1LU>pX?si`NwqLYQc^V{1u90Q^5&9VfAv!CsMns z>cl)pT%j5@61VI*(A7sv{_q7=%#xGVc)K3Z?bv(~jA5B&!p(d|F7Pe<-y6UdQtKn{rMS 
z{aGhbGr5y9{xkB#Jj3ecQb&WHT~e%!JH$k93txF0`mVA zC&`rX{8kln{NU4jj)hwg)g_7iGPQuUSnDjvI!X^<`aOU)@CVbE(W+)TOKzGQ^$rSN z&3L`e7l$7`AW%s9Zm{==~hQ!KyB!p3@;(@VKL+F`S zF|yqK9W4vA^9O?jDEHA*foiNUT+?V)i|4eOd#n;HHQD;YyyD7$8AJ#uV15#iB zzmjrbc~BRd<5D(`QFw$dL+#q>xTkfh?yS(j>%{P{Ro>|AJ7-mJNygyxJ$;Rt?;4=o zS5vizwK=#_#vy@iQGDc_g9Ee)6{?8>zEelZ;>uFYN^L#ug9>M$5+(L7MlG_|0oToc z@;p4HuTa=9fWn&51HREXavB&aAms?x(x{tpNXf>Z5(8*?0r2Jj4J`wbj`+4#FK`j~EEVB(8XbY)y{b$y zI&Q&t^?)cc1vh%%oG(aZ5-grVl*WkqCg)`~cqpk|CgSEX2%uN-XwcjBUcMWJjfe zAP8@J_4F2Q+rwsvxY8~8x(jfi45|N*)qX-~fL-uMS8rx*kDZ(#_@OMaZ>EeGdxm53 z{{R93;TvuIkn*(NSkXbZUh;!j_ot06Ns{L`2QW;t>;VBb!Sb1nGzp)(LHVWv^gYwb zWN%PrukjB*Jl?AQr~@9<&!d{t6uHzwdj)CZ_b0iPc4ap%==0=7=PL)x&??ObYO7EobWsWq=eZ*33LnX~-LGEYbZE_5)rOeuD4RijaS zdc0XOYA@bqIzZMlOA^PRFjAzD!=277M1e}1TKA63Nj+xn7k!degt4S;E(x@PIuu@a zjDiOOS7_k9^iX=mERvRyZf=<)Avb`5Pv+V;)GbXWh%TC~GmFS7!qe{y@C%06$=q*1 zp3p|UXhTj9mRxDIw7$+^N!-VZufGrc+E1r%^#jB4T1Af^WMZ@# zZx(@ibDX8nOimue>h2QT3zX|TUi(MJ*af09y=vdhEJb!Z);bbC5yR}5<%V17Bk1$n zB0-pm?kJo$>dHlHcJiT+T^!pd<;Z*)Mf!u1t0LO@VG_Y%=Z|A!8rl1?Ity)yn(Tp7 zQS0}|Rl2`6Q8YB)$!N)RiTX>yDlG`r*-zou^)tLhyMue>uoedN>UQ~Pg7{T^PnKVf zS#mvT6Q9dEUbxZ?>fb0_A%?egrCy^4dUDN%w91Uk-kp-UUKaZD8~dZ5?;!VId{11UEr=dopgowsmkn7brvP@@;@0t1 zZ%EBmdx9?moL?a41NtSh2`|e?&R*^@NGVnV9j4-Of4(=svN4ZIc+m; zH8UV2qO*_EYrfN)Jt;&c#cCMDkBkrCJAM_VqqZN!<>}+Vk;c*L4Vv$$f(#~R?Hz7sKJoL()z96r(#$@_}!xz79`lv0M6&L}L_gj27;F@p&>6zKTd|6oxGQUcKH zZvW{#7|GAb3jTh6U`^`V@Bn@WUNRYFg-;xHNb;WZ}Q|=a$c!ES$h! zc^hZ;;^tRz=R0moZ*w+DD&OE5jftnv2+V=BJi!+6XuD@Cj$>CwD3K)kJ`!RVjA;*< zHnE2YOtT*iAaDYMH<4LC(}--}T_2J~b^z;DJHByPeu@tjpjnz71yIeh6i|Dzi(7D} za0TX_nrLpNOs_qPzcsdrptWmZI&sEhnCA&mB!nU(`<5&Ss^=mmbDEvn;L zk_3B^teAjM;>P9r58UHC5?I(q&`8mh9uqCadU#bPWv_G;R3d zLu0Anbz@rFMmd4t*sd^TKY}su(nqMnm8PXZGaa(6nT;&Hr6#HWV^Up}mrKS&U{xTr z)-s?}b@a_^JnPT4wE0zCC#t`i!NfsYaT?)8>cK$!^Fi0t{EO?w5=xaT^I^T?SG5k% z=X7VHxf$4kJL15+{;e~J**fKq-k`dF>J35xt78%#KI;Y$WbUX~M?k5V{d5gM2D00K zXfOgE^8am?{r}f4G;lGF8I*5|nFwTAXkuz5B4(h{{)`$|Pft&~_~DEZYCk)-gq#aS zhBFwO6GuGOWFfHqd6*im$xJu#T4TC0;knV|7?G!R#cNSzz0pZhq?i5`F*YZTARoUechixx|8QApbDX0=VK!ulWw%c@t6llAdJ9xT_%1D}Vb00OdVBhU6QflO31TIA9h^|7zm8rjTG6OEkQ39IN~7GEW5=nrY#%Kb6HA#r>O4k$PRzG zF=^6C>k$IMgZ6EispX(*=A8>dQovFz1j5)U6gI8FO>9a;h8IvP_yYrn6a0vjX!e`( z!N!IPHR1Boe7$3FS_UgcNO9xi560bFC1@#-T>;R=Bd8qK0sS-P&oNaxqIzD;Ml3aV zJ{b2l2M#QFdb`|6wzRfx7t(RLGs#`&M+I)z{jy6C2R>gn1(?%gKzK~DUGK@K2Ra?y z*u8&gU&ls#@;_X`RXI=gybV(S~BEW56}a~$wnwT=*}!sU~{AbGUa${>PAO) zkG+0%Y{`LF(T4UeYQ_Vj@qEDh|03)wqoQoXbw9g6q#Kl>5$P_GfuUiBl9rMZaOg%w zVt^SKx)Fx%?ob*eq(ee_0O^i_^Sbxi=j?Nq-#&j>{F?Q~^FGggUv>Xly^svEbudE8 zPR!g^(a7dhXWMixt6RGDVmoTv>pEp(4+$B6GZ~MleY-;I{)^c7B}`GTYtw7jffWIA zN;R61$cgaUzdz@Dru)w5x<0rJ?7e;MOAqQ&0ehSB_LaIENDf|<9M1JU2t89b)D6_= zdc1Tdg}W=#MPPQ+md48qzg(u2IA;a+zWB>$DE?knt~JI~pZz#F;W^jqARwnlAH3Ti z#fF*ZIp-wqV@_kA*x&KJiwlm=QIr`+4OVDFcmVjlI^TxF*+O7yY z1$2%}j5AVYlOiA}$XIitD^7UB`}xk`@gWazD({bYO1p|k6pUz5lbDqD?+Th;rktOU^bv4<-{M+GcYde~78h}(Rr}{bI^G8IF z*<1oN?R?thPkgQ}THiKU4LGx)m=(<9$2ms!suWV5`3V&jW#)0(4b$_54t$S6H5>IW zc5~irGA!15@F30MBz230AGIg?cd;d{r_yaXSdtYFT`uAsx!1V)~xEw>Zffg`2{wn z)4}D8&qM@ht^Ik?PobQIXf!!+mb+~>MdsV*WinIyYv*MxItynGs59^e;XzhQ34>8c z87DJPZ^$F4mJvH(7temIs{F38JMBJ5Y(TLov%*sK84sJRO$lDx8xIHHZ3WF}GK}sy zno+=8Q#)Fro-=2}8ih`j>~3`Z5n=Ln;in?ay5+?8y5#U#h*i`yo*L;!F$JN1g&sgn)gv!qOLX1TacGEw)$IxPm9*2L)KhmBsxksRTKxE|~E)xoF&4OPMb`m?m@wdTUIShhKkD&0LQb>nq z{nUyiNe8r{FKhf<@ei(@m;c1BUo_Aj-0L9=pxW%u@b|3Sz>Et0!k`;ml7M0#K&q>= z^(sfTzCbCwR@Kav8{ezGGB+Sv91iTgR3Ox0A*!K9lPE6Ak8IeBM1;l(()%mjF3Z7f 
zZUnrpF#R9h3wZb8na~>oS{A~!UercD$JQYH{O_R_TfKoM7#4ZDO~E(w!L#FekmOdPeGEV9IXav?EiyG8cqqdNdFKi?eLoGqj0P7sTyyd{g|ihwIH{bb?$2?uuo;^L*mUgaH_$j-4obMTV_f=_xzj znG7Spy3_~V-SIsgp%8JsIgl)h+VNQ#kss`9eM9KMem~^VzPa}!r1)!R9Ovu0bLkPG z7#;1rjX*s507Lz`4mkzu+J&mBctN)Z#SIn$+uplo!A~eHM&$2WlR)mu)LaAGndaQ+ z#Q2+opC!F6d+aFMnNV%?%OOWQ1`^SXb8;~`uw$$eXZ=AK8Mr# z?Orem1YtlWVo$VvjkV-i7)4`Xauy$zWg42kvwy1Ihrc+55%la zcR9SDBD@mr#A9=nB2O)HtBm2FdfRkX6#stKb1#;M$0cp+Dt;deiaNp1ra>!^TNq1# z6L1XE^)4Ya6Qm^Gk%N!&!8Lu3?}vojOK*>_{ZMC}Kp3!8M!p_bAs)y4$qgdC2#oq51ru#`kmBU5$!J^edm5}siZL)Bq8d=Lcx2aj< za|7FXt@d2Jy-~CM(%+BZnHf3t<*lPB9$Qm6>M+fxn*k;iEJIIS5ye)q!T6-I>`s45 z(1X~SHyOU-i@BO)xSRnat9ToJEj0LXOdLsqe6*L;W$^RS`KO_WBfCH8{WQ!+oaE$u z7U!>)yg3sZlZhu;7P0hd*PY#;yrd3;3GY*l9j0*kC=cJN=XRg8Qhs8Ri&mzJadjVb z-E%e}5)iyYe|i`f!Ssps{ptZq9IJ#O)Ijd7muh9^Wb_+JpBaAtY5{ z!rzMno;f7viH{ZVa-V-3O|l#s(a(=|9wPG5j!5&*&82Ws$sZkJZEkHe&{(qiULS`R zFbxBfBc&D3-#Bf>VFj3mUVYb|c~KQ^FEDW?NElB&?@77SbXI;pYfuqCBOJtJ7vW8R zLyH0Bd%RT=4^OLlUl*Jeu4Jz9{OYiv>%C^P6xFh4nxJL6x1SZVj2r8nUs51|{zQK| zO5We7jK*SEe9u!31W@+%n>Wyg6Q(2jCGjF28G}^jQvHh}gQ=cX(uaZ}gLp{O@N;K~HzPq>hzXyLX|hl|CUYyYchq$qp+Vftv+k{ck?1YP*(j?HCNRO86`9f09>aI@MR(ta;&3!{^IVw zYZG%A2VqKbKxQHQeTcYC9*2y7*tcB|w>Q{BYHs(|SA>d0vlc^u5Z$`{jh)ORcAFK* zofsG9s`Q)&O!+Xkcj@Si8R%a=ALT!v&zHixo*X6C+*K$|8d<=;V6Ocz8`-l#Y6HBo z;ar1ZL)>26wLj3X^n7xPMwljuo23#$x(K#nI(`{4TO`Y|TYV1CP97P3&yIremWMt0 zA^(X2VZh482ILBj%vy^U`r)SC!mNJCZht3*u*(kwN56KT?T>Fb6eVQmQx+}s`6*=9 zwOYgD_06&H>Q@doeEJ=YSSe6kd}H9v?n@4udQ|f5qoID8TbCCF%f5M?JTs|rqTFD^ z-H2CPz^8E(rfF~Idm(Tsn~~9}oM=CM)oF8faWmOT&Jt~Qv zd0%k||9vJ88U(*>M+bdAY61Yr%1Cz8lbxAQ^ODCZ3eR5KwnyxbVI^ZYysGU;>h#~- zo=pt*A#0Oej_EnBP`lz1Y&l|p-NPr9c7F6$F+`!r%=OL5#p&8~4wBIT-i15ewQLEr zBVwyuK$4d!EYByZdEapJ+Z|y@6m3V#b>o&>nps~RMD% zLEuDlSMWG@A_0IN%$$#v5eQxb&{zlfC`{)CO%0WZj;}lIFtZ^?SR7?`Vsml!FX4xe z3yIl;W=rXLPTqcobUa#8jTkI@ObTD7^!gV+>erV{fIKNv;MO0Y?&`PcsF1?mJH$(c zknqm_M=Xn=nt`1$;nxTZERWjx)$)&d-)pBRX2lT&>-t)S5g@y)0$+fL5VMZqk7R|~ zOyfmHLj|;ACXctJkiv;I?*il##FV2WLNFN>IeVjA%?`_-#<6GT;dDal!=512)yn3z ze+&^iA|l>&W6ugaGH?D(7sSR-o0`2M>hk*wA*5`d%Ft?jHB!^XKkKUZR@!N!2z^mM z=Ftx-o~2pf$hJl%3K?H=2=?B@0Z0q*7bDk)jFdgNpZ9Hadib4rgxq^8&KNjDxMnx2 z5-CKGP?11sbkLm8LeF2Z3ybs<_qX)L-FhDF?3j3NyRs2f{F;VI-t#4L?)+;`3oF~h zpafCTYws({gQ+grQQSqiFGr+(GveilvzyqwJ`2&C#~U{6P<=x-v|X8hN=B>l6sQ z5vvrc1ffBRBaSG^_vo;^W}^}pyCjPjIB8T?RYY-vhk)cYy;S?Lii2U9FihkE>nHJ+ zVT`|gXL$j~fHeL#&EogbOY)MEaHK$1kc&(g=W}^fBZbbb;$fyzu;sVzFTBG*1yp|; z*>Z{F5+n60bDZ&ux#{AAfI0xMSm@=4a61moQb^j80%imLd6sN@m8|i*`B8cTJ@Nxo zT#v5drLxW{n`AbNxxc>t6h1yC&IJNB@`%Yy#y3jji&7RMo4I>3n+rK4I0ZaH!Wva+ z_^A*HW@3REBW|pJ7OkdEBq+=;Ji~wIE87v~?d&x^UTQ4`JWmFHDR$a%lq7}Uph{c2 zH#je{7?4ef@f`R0g@6bi{L8X^IZp)eX49oC)SIrU?Zswya&1ZA<^(d1ukf_F(mp}7 zH%ppI8g&sM6`g9G;l`%JIF@Cd{e{}2IRcY!`_w9mJUGUptl{aIFfud+RsRz@!jI*P zuil}j*{R^pIJ=Xi`%Eg-04QY(i;}>#NPV5N+et9a^3%wGRhIg=E#_8L-5FG1{G8JX zjni}?f*+3UTvG`t-A0DoQ;BkOpI1d|%*!|`*5_4WtY;j<=>-?%H5vm7oYC_6&S+Y` zRjzL`k#tHUO|o*XX&WV6?db9`Cmq3@%+8^;s{y_j@P;T9xXAe3BgLB86S+!erlY~nq6COyqaZod}*t42FUyya1!Dr zc^ixYU)=KRl@YwdqZYQA$0~`V*}!Sfa}*<}@UQ~+Q!o)eH0rmK)!_~plLQg3lm@!N zm`2q*v%L*cZ-yVdcp!ZgInTi=3v{Av|w0RecmkBK12X^ zyt-fZpojU$yy4KN)*wf3>0MPY2q(FooT@7Ia|9sGY#xS#98PS7D{!(0dm0Y7VgAZl zaide{RjYh|OxCyis6WkcCu6S>;m&?q&VOiK1nxk}PR{7RC~n9HguXU79HIvq+CTyz zU6h}TtKU`%j3R7_%?JzZa=hIS3U^Sxc zSW%&owdcInCC>cL&-85bt@uC4ZCDwmW?b^Qt1W*F!Dulg@-s~@Zfa9XXAV6cyj<;? 
z#9iGLPI>ITNv@W3=a-u-$_f|hTm)xa-wX_=vfWt4E*|S4KD+CIf4oD`Y&=KJ5lTDC zou3?a`3C84pM6Q7v50)QgZ((`zIctRA!MUMRI1ysS*N5ph?mRx=Fxs+)va|=^5f$T zl4q}~W9}bk&8s7!j{C87dlM)ApwN&`7PfA|Laf@rDCB^zp4f3MYvG>uhyfk zkJA`(==f=zrHw}tn@-o}5$4pxvNT8?g z)%rwuf+(c^`ujDGj}Wp6K5l$te`1?2ssM>&C-maQXG+YYkiw0(BqVAlGM72!ivr0X z6Q@GUiDMpl^Eb}wrY3g>$@hZNyoue!PahD&Gj}d9_gAH^&x0aYRl%WsaqkJ+Fct%b z6&tE`bM%5}m^N_0uDuDtY+Qo6XOG1PwmoQFb$_Q-ofpVcGr6=dTBHa%`D_T}H?D}m zG)5D0<>~^P^m75X=}B7KRnY+DM4w0p>tUz z1QQxq)%dOEKkT!He%H)4gESG-uVu=Xqonw(*TwVjT&}G8B@#fCdH=IX>J=*~AY6n+}5Y}Vdmji_s0#O{LKfi?jV4#7G z++7$kK8r_=d~P=fQTznUR|iMf+1rq?5rjvv$NAk(&IXzsjV)jnaCju-Oy za(^U+RAsI&H_T$_{ft+^BxJsaJmz63mcvx+W;uXuV5;gXtb2Jh>@me6<``H?v#yB zIkVRsA@}2cU`2Vnx1LRY?sFx6_gft(XDDgVe z<%4dcbF&d6mP(&B%AZTp^AYmbznqn&QHX~59z!*h)>a$n1$UaFg{cJJW@P@-`xG#o z<|9@Uug6WrRtC1YGT=CJtt3-dR)&Op8K|fR3WJdJlX(vr26v;*pIkUnA^GKQ0254* ze}eo0)pdKbc9DIo{K~OB^NE28&{Xhc85-P~PWVDm+*P0>V*LDv zWiz|unk4a!Od^vCaIsbUlrW{61Ds!Pd4U%l>?ZzY(dq{i2&D<0J^Iz9avL$;F?^t? zs?wIRscqVWUt_vVp+HVS%enWEoGgV6B|6wz91c_i*&Lb+C&1t{;xOrAB~_I;QHw0t64p;;t8BX++HA1-9jFT0hnlyNnDTc?8_!z# zB2h3K3x()JqicMZDq*;L34*9FpS1?C0j!_w0A$1Hq!NTi0-&apMyFr(MdwzUU^=_k zaAV#rSdXImi)YF)n!yUgen}Gyo`4$C$>l-*09_|q$j^Vev5kWNh$<}Jr{ek5pa9$K zhN2@h!R<$;vr;crQ2IzQu>Q-+PwCGEP>SIWvM4Yu>GlPSDW9C_;t_p}RlGGr@bE%3 zi4AF;7gXQ-;n0J1$7ZJp$+y+YusUC{EGUrb!il2lJzz3bxr$Y2*TVoIhkheD5dt1{ z!CepJz~eC~u~d3HAYMUAKzIvQe8DGBF3NZ8&G#A5jMdF_OX{N}SRU?f&(RQu+N+V5 zt#&5VTI?&<2 z^#w{^k0(Zq$({P zVIu(gy17NLsRP}tR{|vsX}MbP3pUStHNJ~}?Xh-?s6%LOv~yU*&-wZVVYtBQj8HRI zq$>(s91|*ht`*)fgpj%D6S!*SYiR>oSZ?FYmBYGFW&SBT1(`ZSb4+&JfvT3<bOI%NHx8!JSka<#J4IeuAT2`z22`hf=BII4kL{joQic6kg@oAW!0`)h z1cO#qlEKh9ww4ggy{s2XwzdUe$nQ2OMttME%!y%=_b0xm#(g(y<>)+?0|v$cABA3x z{KyY$y&5+rT=IAt8>gu4d7h)}$fv3vkzA{WtJpCLG3)xwIgXAMAX)Z)BI?~1y&uww z@+S$Qhn|?6gXmSx07AQC@cMo5at(9An)5u^0rr+6~R!6ouw~VRp+*Kpqy2xN@jIUSfI(wiwUQO(Pe5k zTo8}*IsWPO@05GV29_Qr*J2i4CDVpSMn(4evF3Fr4DZaivv1#|cm@9u@zPP@S-{MU zfxV+f@y92xUcpP*?z+8GdJ#FsRA5`BS5$psls)J(UqKZ!#Rn=nRvy~E_8;mKv0N2f z6JHanxbhin+$n(Cz8$g=jb}y4&B6?%|E{Ne@Qiz*0}aEMhq|TE6pO%bQ<-E6*xbE$ zj`z#sP6xw{Y$e7%o4)vpuMt!!cKve8t(%dG_GeX_Oo^79t6aJ!!S}b;R21M@M{4{T za3VCa-Ji#AK2_OB^@fZdCdqA8(^iMEav6TC+H@fN7RTk{bQj55 zYrok*7u!y}A9ryv8AbWBE42%gHo zo{EOu+y!zOc%H0JDbuWbX<@6Y_<#M{-V$bB%GS3rvEGm&b!2yc+i%Qx@|&v8a*g^V zE8E?Bt!izEb;2_M9Jd~y_+tH6C2dqaSl{?O21@YmYH`JV9wKQslcsW%gY?QNUd@Da zYE{O@s>OJ|9$^fys5;!_-=HvLIckHA42MnYpHr$x=biZIcJL#>yFGBh9$|O#5{YZP zam9^Brh?)QaY|7A<@UG^RKzD6jF}CLnXrrAc|eY$pAHf-It|F|*yPE`6P!K!5;nNr z-NV6+(z!UL_~&5(bK7zno!3WrSj$l+0zb z`IVS@N^ZHt}2JwAl2!O ztC5OwJnIao%HxE-akHL&F5cmxyhf_nt040h(W%tLSbJ|~p_U(5u0u734k(twpsiK9 zcGCkZV&(HxVkg3U=w!Y9{H0&oGdO>~G}A|Pr;^dVtT%PzvKxeg)-WacspVyqa`naM zCjIjrD?7r+K5~_D+cFp9KGTdYX~2jEW4dsNjb7FL@0v@(j{E!g&(fc}%(t~dOQb&< zkT<>68#Nrs@K|xut*hHlc$fLW(yq5r&1TEJ$==L(S|>5=9@2HVGZ=JBrrnPLf%$RI zS^Q9GHa)geRAwOmYXWnIHt6i|>IYf9i#T5FA0 ze8=TkXWi^S2FE6gW9DC=L#d`-t80ebL?jNcIF>fBi z_#EG;k)jE+pfrZ5!m!W1V=Q4o7ouUB9}>(t83BOMkShLO63RE_@n^0e!HeOOWhA-89f<{`N6|_Z)H33;5}9( zxP8cM?7@XP-{g0%!zRxwcF)7)S8MqhDlCUpfl=jnDVFNB>jF?Y&n=jS2L2U$+9BV$ z2dV5du!;htLw>t`w%9nk!bjEnwLDvJmI_pk-pra%4A%)aycl{LMw}3#4b8<_M~A2s zHyE)1Q(W!_245Qha-*vKjd)5b{!Ge3VB0I?nywo zsKwBFXf(c=kK4p;#5%`L8|H&$a4N^_x>}D>;=aRJKf*wUs3s1jIx881?XgLAx zh^pDO>Jlz71lkobx0II$D(7*g#|n>|xyctjUAr@X!?W)JF{HKq*SZTOT2wUKZr{+% z4Mo#Hr(YS{w?5Bova)?^Km7u1Rkxa>g+s2^5e)(+i&n|< zQMxpS>v>Qbq%@5o+j2!3kU7fo(l`0 zXq|AyzXQ()eVEAi7M`ZMKmV?W_fJaXyr1toi8HLHxbK>cQYV?%v%eFx=1ec)(S#gLVCdd{$RsnX>n}r zaYjAIKQ#^pv&!8B8Z=C+PqELg8ruGeOXQ% z+yFSCozr#bWL^;5yrFU(c{6A>u<^HZG}&@kD0v6e9LP{aPqjNf>VP@}=95(*ccriH 
zb4ix)Ll#rqf4rn4Oo+r^1Kb$JKGueU(ncTdp5b78Pw(T5Lx>a=K58sO^GA4I0L};W z9+XD*E%A3w637r>=}R~$)1$Ey+v@trET!spx8KcbX!@w)cdcDBEoq&gc{bJjHrdg; zIeg%ikaJAH{@j>I>f~SZy1tG{KiytrjeX6EVgTYpLi(6v^?~IJ{L8xr2;&Esyg56d z!i1Y~ECs@%u#qskd$xNsVh|*K{pW7Uy6hvln3fUBibtEN6T$yG6ZlIlRi zA23`2*MR~w+yCcPsmZ~Y<67voj1ZCW)eh__N{Q>XL%Atl8aoI*-kSYbe5@@)#i{RftCu~S zLBhv-X;X0$I5Bd7=$`+3nE}_HDfQ1fy(6Zmz+wsD@Zt zuwz!3%VfNn8vAFG@N1W)j5t>K=*TQ0aN0&3*W}lJJYoFAWEtXW^ej2g8oO@wv48t( z;hECzddXG5vvB@>);YV3P>{v;QEl15*2L~OM@#3ikdd69+P(eI-rX#zYeDUz2Ewxd zroGO`AsB2o8>$aTa!7D`V5|fuQEfj5(lIs@{_TS0UH>c${o~a3f`RcHj{83vmIHx?!^5;vwd4?j-*R3` z#FD#+!KHeY8nbW1T1%pX9A~>=GDy=bZEk@GLgHujmyPZlfDblspqi{U=pH&ID!k-! zOv>~KUnm{OC|3+rRhEik1!O_^xAbbjFj$rqj)zTZb=!m6@LjFr5X@}Pv? z8gRnNrBwgpT`Yat^dd9(&{cTEM6ExfjZ{94b)?!imbRw92E#BB*f*-wC_?q&`-~tN zoP?x~X4hlEMPY-d?rNY?c{0V?5j8Xt6uXl7W=xa1IA#0mjA>q+V9#`XJ9N^QQ1XaXK_;B#>rDV>h*UzicTax>2G^XeriG$g(9A#HdNEvo454$ z4o|KhNG4DK6DtxWOB} zX_#bI_3S?kg{o_s24_YqQ4)vdF}}R7bIqVb?7Q`n( zEr{ks>kT77q$W4$XKu~D$9L}WaQOx*5qQNTz(Kw~6(Ik8+=(N4=b;D}I(PxO$oe6t z+Dm=Rw|QVtHtcg5#cTJ#^8v^0Xv+oh(eTkegG>uj3hxnfy!BHB1;S1n8x_6sV2}hT zyer70M#qNN(|rB1ZtbnbQ-}?3`Co${Bff%}fg0f3n9o7X+OCe@E?Xv|bkhB2llx-9 zJpaZzwq*L=x!2yo_2uKM&9s&P-AA#Db0>bvHrw9g|Md|0d$CQ1KTB6uj7|4KY=H(Y zb~JJNBj~~nQ&aHY9^)|ft6ija`9`-_e8rmwz3e$=*Qgpik7eg<(}C8k+N*4__ZPS5 zBZ42Wyc35XT8SdBk4CyRzK zqTemDvJP<>=}2u5G=^k@95S{h1#jbw+HPe|Zgr=@@q1T|!zpdIW!{?90o`G^C@}R0 zh_yfrkFa9Kqr`r=?+Q@L?DU>mORF;IThk#_#l?694fq}vkgHM)sMdb^dx6DF1f-k54OGrJ#V#f3jr^{3FUR7f zo<$Av!M`^VEG!vc))RI-i8eO9See+Cx?a#Y-JbWkga1D2l@vvH zKq0oodP!72fCIyVbc7L?e(O*7eB5|&S5z;lL=T?1#PMq)+UBDZx#38zwT(cMB;<6g zxmmX>mFMd6$|-G(To6}i2&<4qe0kubeNs>&qjF*y*K=%jf!}oEm`2QmO1+*U*b4#Q zq?@k52NaM%a@u#igfCD?KYmrGhplJZiO=asJWK>%r0!0fmtt%qS1O>GQVa@#V z6g-->3M3m8d{wt3ZY(#{c&V!IMb92L_j%Z3B2~pl_@O|gEX~=9*9q8=`k1Dm%i$zB zx@BptC;Yz^UjIB$ehL^$mulmwpvzQrf{Y#WzDF8F&`Ra@p*Hv5 z8ZBIlteXadEth*IyD8U4mYw@D^k@8%xELv^8criF=5pcT%`g4(9~lRW`j3cKjz|E- z`AUHv&6(_ZWp%ab`}e!a;)+qxHg$D%-Mzh^no*;TMqGFA-u*E&1mCVWpZoObCJC9M zUNP%U)ozdKY{Sc(iGMDf?`W%-(;4E)mnYxs}c59CW(l*P9Efo6<;<|cNUr+-oFnz#YQb&4cgshD- zZ*w}|HDje|zIVBWd}Ko66Y0){jDmEmrRZ6j=A!)Sxjw218MPzM*>#-=Z?sSnOgG=Q zL3s9nb#R3I^m8HKR+qYys;CYe&U3+961cG=47$t<1+&sZ3c}I=kP@tYW~yxHLl8T6 zq^m%{$}rBj!K&MiR%;*n7KNOwza~kYa(w+dHSR=BvYsw{zRvUc{jKtv)w;b`(n2dQb99HX&1t3zd4YezMnejZ>-3O6`@Qzrd5c zfcOcZnvA9(ES18d`hE=jgksQksIf8q)mhp#N&ESlFS< zR@a<#b0(F(D6+aoOr4 z@&&{Wm;dCLxU(}d=|iN#!he{J@;2n)~uJ$A9-YVA^Pn9C1UU;&ZrpDADex{B}u&j#FX)jpKd%uD3Mk1`BD!w~Dz1hmDoF3Pj`wp^}oElpXbNwD}i z55HdN=d79eBpcdi5s3O;UoYBtx&OU$O$I8{yB>ah8ot-TnUdI(~ z9?p%hN>+@|&Dqa*4N9EQ7%U%+!PHO ztQbB#Kw^vHt#Dkfxd6GmO~r-#b!hlqu&_R7iyPHx2@SuT)9I?4DDz8R#4?kR#D>w% z2uq>2|8X3FsiTuV;RSiv zGY!d^J9DU{_rMZ)`>k(U#yRSq2dbYY-_8>bd)q7t-=5;{(&^^iB8{(I{ZMC@LpPZx+-|$CvEN&)0 z-6YK}xige)C=8vh<03d*GR*_v-t2S*)mJQuSy#|}Fk_MLA+=n)p651p2MD(xQRoDMER^8tg`Jv9+Y#*6~em)P}<597k;JsN)DBB8NM zKwU~g;IoW?@3<7wssmWWdb~;dPkS)~yrE5AoKPI{p8W()T^57P_vyaxsF5K z(}>LcB$$Yy@B5Tbd}Cc@D;+?=xjDGh*+9>0^K{m|>lWE>mv@HhG=@Hl^0p~f?X3kR z%^`DhMoXV;eV!rdUbpy4toA2air(jcs5)Y1H(O6Se;x~tlrPFuzi1_Ielz;I=X-zD z{Umg7f7RMh#HXEFE;{BMCXx+3zY=Lq^I%ZTT-|Iw| zP}UmIvmCtk?#e;hEfv8vH8lL)<~;6Q?-E?U$qak+@%*=?RA`Bh3T#bQL2F`a3Ne4V zDkPU!KVj@Qdurw3QHM^1?HD(mU<7b|X}GDh^sKCvnWhr!jqSPFhE!i5s96cDXDi zF+N#xYuFdQ$H|I zBPa3kO;QLgEp6vos_(fq9sxm`*Zi#$si(xL{5Aj(iGC1taHBDC8l0T*u zHE#q736aOd#Qe}>V~;4b0boeRrG)%}*?B~BtcLLJrUM4ITLA0Iy}9yz=jB2r6VU1^ zu$Rwb#hYb6D>A>GJAGE@iM1vA7G)y3m-Y_mByz{Ly&tYP&j=OHpAa=ah#IW1jk|#}N z@Flr-lchIo#|wG3aaL&r^FVFEKRNY*&np$ORcx$8i=pjKXntt%Xgu9&RY1c@9K(or)T ztmd?`wFdi|@FWjt@(Yb-m@8&=XA0C_ED)F-e`4zH>8}AM$!~sI#BLBnj7rGI#>Ujq 
zOXPdGoE#jFfK!x}m6h8FLUrdw+#{+wb$xv=pq(&^Q8`z)HV@^7Z@D%g2RA;~sG8b3 zDe|kz$=!VmR8OnHcQF3ReM|4arfY5@4B`?JrfT zuQWiUCe|I_APLp%P2sg1y;Da7=NA^Xnvt;_Nd7pdmY#i*D)(@sDTgrcZU539#n7v* zt*yYzp%bgmk#iS(mIGiY6g|~7P&6{ArKR<4bFX#ngmP0!QSpmQB)_Bj&PrLO5(1QF z`wwo*rE}7H@;aZp*q@VA=C(Q8Osf=Y*RLl~{)75^(B@M)8TsjC>+B!^

Gj_$xgi z|8yG&xqbeUF4~hyGL68i#sRQx{d|A;)gF%kUml5k3d_2G-fF9Db~b%f`?epyn~Obm zz9LDDLhn5*VMDy~Vr~Hj-=X=EgT}{!;M~h=x4TQy|IT-9Ch%mIlDe9`-a~drjfjQd zYnO20r;+9|8SO)2B~4GEpGwlRH(0m54l%Vy%_pJCPNr6+^mYs%{f?q`zE7?TZvWKU z6f~LctAjI_ZT$k0ei^cM-iKl{R-Y$kWXSY&4%YQBrzDwr+o#mPkKu^8T6%cn(wC-y*fr{8S8v->Hja%$U*t={iR+Mtt zd#1m4((NgL8Bn%cKqB7Oy*u80JLl#zwNdY!WLXBM~D&1rKY zNN-b3gU0gg7c@g@X=a~1wN*`0&fCzH6`+Dr6@o72Rv0os>&3!&U@+~Ei)^HFw}Z2z zof`s&rZoxDz}_p}O-VZHpTqhJu`H&0Q4KARZV)P3>z=)TyZkjF)goCxM5UQtIpYH`iNoEd z;hxr3>u@2P88FTC?R)J`e+k%p?x4IvTq4_$ZR->|@j(N4;i0+4!1O&p=2 zFc0(nDXtPeUUL`>c8lqio`L)0C}3Ps`+<@tV-s0~h)TodhJ>C??jsRhTp|FU4}lS& z63B2xk59fCqksUuDgtj*{w1wg{lJndC857-jtm1jlTIZ$z=5G14bQ@R@Fk-o;KQI< zW~PQN30u|c(=io*yk5AWXpWV?JzQ$A_8~Y7Pv#S)^~now<4M~;xEbb=N6#s=(>IOU zx324V-g@VWLx{@olA+vJrjDTi?z6}WYI7inB2!t zvVz>v!Sy^v5xKEpc070y7%byrLkw^?J7C~{NX%&wfIu&aA7zq3blmNM-t7rQr(5`t zCj|54f#knzv!)LKUf>LO1JRB)KH4m4>7b^RdKG*4(iI`@DB2m<31F-HZ!-QyL)lHJ zKgkij>0ZB_n4f7h7DIB64x5&zy z^x8fhSi?X>eljrmPb6-)iUhI%E;P2Mj|W-@E0ssp^!XhQ)%1M6zlj(w@-8M87n0(F z$j<(S0w#_GI7l>91}uY=yK;CrmWv@lhz@%~mH<~jdj0=l?$QBbji@Hw((|2cg&?*I1h1T13&`1axp)Ol7X^g7~E+l!Jt z4x>F;pe&jxbS7D|41a|G|DhxI%}F5Vx8kJZ?)lt$;e15!zu0@rsH)fQZB!Jcq*1y- zX`~k*9g7f<6r_}HS<)>^!&;PpAR!^$9Rf;82nZ~?L8U=jK;XUCezyDF&-38^pZA;( z=X^M0tPg827{B$qYu?wq=A73gmvF|4=49`d>Vll>Gb2gUeq)gJP|G*W@A{^`jJTKD z$I;7MA(|yKLJQ;OJ?{32_MwmMp}Gz;<|2HV z()oi!O*-9eYJ5B02sMdX2ZKEjsRYukzmHS`6=uusizChobmOQI*pGV=m4L3cBXmF= zW`X7Ekudq#I~Y&un0IR}=;tQa0m9;mDjYTUK>!b^rv7^X59na`zq~k(8wJ9|Zy-ts z>}CTXV5^7zG-4V&0^`CMpa8-kM#;@lca=&}MIOE1iStu0zwi1}UtHtgLQsp)p+`y}$QAORL9VY|pzWmprB!{e``V)jA8j~h=hNKjcb;4$oGIiFzwljr zG2BxCULEmHIYJzWIt|=gCXUR8;5h`>+&Cv?Yf>@q}S1`4?@idq2h#c7;1thYiFA77Cny*Sb$&d zEJoApR5d>DME#qYCqvErHw}3SLa?0PCRHf=s`uy2PZS)!aoEDio!z~RZNd3*2c!9U z;d5*VpA(e&=P*i<_rOQX{)ip~06E_64gn{MI1@-yT6r(zejO8B_c|z<-?W96$x;@b z&Iv`Q>fa5kD1HOsZEyJ}-yQ}{eMPMSTE>ckGzOsi{>__+8Sxy%0f~GTcdyegAGWfp z%cWT2WjM?#=T086ll1AqwekIM69eNEdOuORd{<;eL`Wt+i)QOxqmY#B8?bPr6JR9$ zs72Lyw#9%rE?b`AHd&cYqjXUP3j`$JrO=tbrtuzliS3BcnKF^0_=+t*Kg~9E_ApX| z$TbPj|B)&#VxbT;o&INJqdpaV+aefse`P# zI9Q;5`kWwo1#R6l&UDrGQmNEwEg+DE>_*L>9>D zd8w7a=!zi&BN440@XUsU$zy=!Sc(M^HMXa@U)vh2;({%69~ek}4q+=7)+GGyF<{tu zg<095F27Cv{gS`e8-0z9#*pgO>1Erzcm=X%q-8f@9*<_c+T;79pj9=^MdMhApLpDWFM_Rkl{04 ztPp_!J>29UaSwnb5NFU#;+yvZE8=HNlB(Q5SB-C&r=p_JXJ;yFjaC-1=1(CM+sYFn zX|hqZBDU()3p1c;8?K9hVt%7cuIFy^XQyK^#$y&h>m-v42T4+_=Dq;#BkKV_NBo4! zJ#8V+yWqV&f=@nA8o-BQx=VdWoiIZblzO>jJI1#cX_ng@*I4P=gUY|w0M=0clfcFN zRFpGt8DchL+P(ppXwtf`(Fk&VZi8f|zT2q_R<$`druU(+<#s>m(NLji^}D1)B|ewh zD|D1RYXteJJ#czYWHA-?RF75<4MFFluRjEnCl2`dH@x(|L|u6sE|_7`bt@Fo&2O{C zPdmYov0lotWZHFd2J(~sn04IB(Rt*7<^ATvt9wFJvh&YXT7Jlr^c#gDG`$8mq803e zjr_#~sJ|A_fOUWTN&7cLutN=DBdl%N9Si~VoSOzi!1!|rxixv%`zOZIy$6=_lr-x!*ARN;JU6`q!9{a zucxux{76;pj_6mbC3=B)`D3d>P)xOI4GFr^d~Cd&J3%ichi9$JNw)thKAh@x1fkV4mI37u? 
z?-z12v`Mj-^Fu!L4E^d*0v%%s5>i{8p8{#LWx~~)PZoF(48dMa$57uNi7Y+;XdKIL zt@>%=2V_M2V2MF!y_~ZhQj=M7H;@>xdfvro9QB9c5fI*7gP;ANYunPwm9euI+@=bv za^?@X+V*KrL`hWjO=R7z)q0^bZJTMnS5li?Nw{@ChGd3sKMlPY7RR8JVKt0yuHlD) z)|A-^LyX`6fDWL49DoV6&6endb6OGAcj4<{M;&c`K*I(y$P0fl#P6!Y5SL&5^OA=q zsWFJW^{NwmG(LA*KN)xstxFmL9js16ehxOjso93?X*i^tn-8y8MI&l>{l^Uo*K>+Mr;_53%vkWBTOa*QvhKgG zo96+AkwR~*S2Z;3Ft1&K9Iv|F-=8=nhnb0AhgX6Ak||YN^M-H;(NQ)bbX5FIsN+Z3 zxaQYgG=6Fy9Iv!9r&_N`@SC1tnJk5Mzgw6A}!)(X`j_ z2!z5;*guB3#v~9qtJuSJ=S%d6lgQ5GR>ncm`Lo_=IP>fCXN1sOt*D-cIr9*_ySD`p z2z$Q4p(6J^HTAn*%!9nU^Dg)qE8lOfpRc5W!}F?qK{f0jO)V<_jsTN^fXc%vpDt5T zp0YufW_+U>whf?5MC8A_D{6P!qYFx|_-LekYc%$wq=p*HJp6F|o}MQ$HXG@q+GgU7 zoP+eWw%3^MhAhUXO$ChF(|U#WF+n9Gx&3>#!oEV-OeXnSG_ID{>iejqgY9THbA!Y< zX_Y}lvGK$Sbcfed#nw@&nx9y~`@~5zJDaPXVtkdIiy3iEl+w|nf4Ajmbh&6ebV|<_ z#3u!?v$H%f(v)SFg)!!irzl6DDx<3XTu4MA$wf8`nytHHjI(tjae zZOyZnkU2ej@cDro=E~Kt3PV~h3`ai2QtrnMbF1Qvgp7w_kYf>E9QLTBM4{EW(f;zq z>LXXJFB4ITA{vVp$k!YO0|F#%d5B9!1v~Rr1@fgvQ$h7fvPM)=a9K?kMNXQ?kW!A& zETy>I%r~=<9}Yy^JzqyJD6|sg`Beshs^Mql&aAL6)SEI-c3H_qxzl~Ol)s0BIAhO# zTCgV4UJurPgt-7r=mm?CO+9}aC|;aT zO^iC=>RZon8eg=MQ5;C#wyfOrm(f64`(&243=l_~qVup;#9O#qrfoA^&h zOwuUqFi*Z7SWseUGju)V(YCUddU2MEazc_Mp3Hm5P(`%17T|=IvpVa)$2%`5Fl6(u zwhzyu9ZJ^R^_0Er4yS;w2nTiu=PPI^#?$jEGUl~Vwhz2*i&l!8>l`B_%t=0_NkA_+ zdYh`1{y}8!knCw!Vrt-e47rA8_uYJu-k^TJqqlT2Tlbwft3Yh(6?Zx zaogC3RE7t720!J)GI_uiI{^0pKFO!f9~Wk%qY;H*fn2Otk5rmeSrJ4|Dj@gRDd(@; z<9{Q4l8%wb@_!_z=l&)!ohfPxg`G~0bzPlFpz-NcnxM#a0ER*z|3Q^e>Y>S-joQmtfp#Gp!!u%@Q-Q9 zBv3_X9up515@=Z#W!)1L_2Z25-?ab$zn=bX*}7Cw-bP83+(%GPK-kIS3S1;=vj6`X z|DTNyqTXSJnU4h{s+Z@q5HIU<@ z*FI-9#%|*aEMx(QGy>D+e)|Y=)1O(EI7W|2{b3;kLF2A>%eVw8UzJ*>Nj!UTo3hO1 zob_%6z<-ZPpNn4Tg`?u|nE~mMTL@OfLXT)A$Tuax+O-5nm@AYB$`)bZLru19cPCX- zmc;z#OU4|tYvBGY70yswy`2LgN~hOoV*1mjx|=>KT^{#hwASfNXQ) zS-+SW`a+h)H}cr?ui-)&juR$kH=t*!02q+$dCgPPNbYXiKXuqj&nZ*-(n0XtH;w*7 zd3ZL1-ztL^SEC}jhA^nf%;dssA7E}%r@PebxOtb;bm@!g9=|sEpgLpZIs?Vf)37MR zyM|=55zno|ucMJG(t&Cx3g6#XAQQNOPMTgk>)RBA0RB=W+G^35XXP*m@!1;?TtM*x zfJpM)1CDggaXTY&IWd-hu+IhI-HA%lrjue4cgOHz>U~WbSFrg3d7~!t>3A{4NvxU-K+WNGT6=y$9>^wP?$OxZ^BnIGih0LK-C zDCF+oxz@aKJ#@;Q-o_#4ZY<3)XAo$(G?Om=WBLWA`_wD1o-MM)oJSKxV=3SK)>skp zxDv&`X{{H4x=;sSN}&}662$L!+}i21;49&Q)tP}|#{NgJ1BHV(x1^12aS`u+mRKcp zCa>>!ylUEU06ctF5P5zgmjEIGY7 zOGYRk*Yo7#%RnK?%v?_jwKtpZ*_VYIY3msHzS(kn6?z1KB8-eR_d*`Rhe?}u4tXDog7&lfsLa2{M( z%M>L!g`(KocdlQI^0NqqRf20Wj*HSBl^&t*e7O}7|4R#ic_-WKz~;-#K3PePsC|AK zsQ$3(_3C8Fl^A^(JcMyNKHd=W$4G}m`9)BkaPua?AU2zv!IXb;d7>kPFKa~owF2Dj zvt|W8t1HIM>D{Xwb-P%sF3zwGG+|8VY>jQF-)bKurG%gj_dcpy_?tCBdVsr*7UxP@pNq3 z_WK-Mxu0|Q>Tk$hS=1>OL0gPX!c5A>OV4G*;(t13?8?I)+0C%$`7jiFBXUW*<}qTi|9$EjKM;Pius?ieCv-%|ngDwdYOZ?i(G z2pVjHv`C1pVc~|v58i8(_~Fl#p#v?r3eA#i_1Eg}eD=hqA#@4cEpBy*JQpZu5_{9j zWM~L}>FCMJ;}M!<$WTz@ZHfq%mCx_%T=wr4>I?E!c3NJX0QD*g6|Sm33Hm%N5#Uc> z^E1pXt0USlOI_XQCY$n3$JN9O@FeisH-;NvdvRM7)~i1^(MbA?Z|6-teO!@4fsx|^ zw4*z@U2LBAEGbZw!&#D>u9IKY1XHmCgzO$m6MUTg;UAJ&LPjV@-tMH%6s<)nSOn}x z-PfN#LDDV_kY*V7|rY zsrF8y9`MlaI(QpoM6=B)^kX-EopLG$^# z?u$F~Ji3fB1tLb1VVOISp+Ve)j~GSvFB3Q01e|)xtiMjd=r6+icR>Ibh)%0#7uU@a z$dU^)cYN>&0(srJS5ni(%}CW`@|B`QS)+u_#|9cO9YhFO6a9!|v{ zlU*X;rzYLn-%EbFE_GeWC-re?fWK^GJg08()4blyXq}{bF618t_I(qF-#7iduU>&e z=RdKvtW!!b<2T^V#V3Ci62Ya-pFI)9}5ig)9UF+tBMb(cbrF%)rQ3 znr|t-t1rNAh!lA9keLVbJ0!1edwLBajPsc%{q`nV4CHOBXOKX2n8M4Tt`bBzt=r6t=%B$F0LbHr#%xH zbi$rj=#jlM2elo>b50dzT8asa5w^u;z3B^L5bABw?@`p}Y4cA}XG~=;8xb9e1C_^h z6xMoy4B8g%|B0^KUc02Y?n zW&nDTFN4;wfB5coZ?CENO_Ec2p3x4ty5oT4oJ_lcYb5s!&16IO_-PVE1TmtcAB1(A z*F4$`!Y3mXX009&Scw6NBm8mn?bYY^G7#@r;&$1Ro3sW8zA9y@vx}HDSe9aFCd?vs zn1z%!vg1(Kw7N5lA2}T%pBJfJu}Hd{@2PJ?HEo7`-_4$eks#}|&z~aOpz!u7q5KVH 
zhnU=AT*s_@2t^ls%Ah^6;|_Nd9gWU0T;*$)r8 zf7;i7-)~NPPe_}dvqW|!~)_hO}a-EZqw$$j@tCz_)sk+!l>1+Wp;%bs7oZJ3hs zclUh3-5sqhBEw+H`)29uVQm!azUwy!?@v+GNzb|8{c%Z&kJ4QeS-mdxpm~d#5H=lC z&PAJe_3QWR&aJ2zlMPkx_z!wi?#$7R-Frhw)Az*o76Wp7fy&)ewF z&|e*?tzV1SGf6bZxzBt4@KfD9AtSb8rpq6S7A8$|3H2%Q)sTJBO1-QQa~oTlK^3aL5kEQmL#@$? z27#77#2@4X2lv)}S`d4?X4)TxpsnH2(cp7}FIRDdt`BZAM#0a69}C`$1*Jhh(Wu0x zpS$6weZZNF42NJRi%!ipKH;RPUrGAEv1A~o<~$Hhrp{D?s1A zYe5t#045VONl)BWdw=rMG4 zp!pZ`;eOfrGRW|g5zYqxoB=cm{-frc6@m8x)c(VMSNo4@Q1{fBwg-V&=;VEAx{8uY z;QCKS=6});0M95lrVmF%E_5HFe1~QLF9FSE9Z;HGw8I{no3epX) zyf565HsGr_H9cP9-&J zQf@#F{rC^!_CG4LL5()_h8NRCT;ab;wAsS18}cti^lcNmq2EyWpEImbUD`$>*$xyg zBK4xyQMl9uD*(Vx!~Y>^saaN*w#2q|Gt@4be5F_Tpa3Y>+w(gLf( zv-ReVq~=R2Z>6Y+8B7R`#2M|I&C=SS~v0CaA*gSjpxxmA=&; z|GaQdt1#I(baJGRc69uI%Bupx2xek(7cZ3L7-^< zTC9IBCFx>luTVrdsO0z`L^z`N?~Pxm?{Nu>Nfa1hvu5d=)U+T;mhR&iE;wF~P!ZpM zKVJWr9${SAu3jPjUF zDS-O6ZaIuMEc4MFE9B$#RZ3~w(Pv1A$hn)nryd&ckvJars9t=n)9ImX=~`Ubk6IH} zgg2Vx5PEeuWH8jAdKaBHIVOyU`c;ey%sI9D=m80INzdR%7zhbaL>mKj3JcTWcvQmJ z*(DGwE^W>y)IS`hBZ!iCYB|{|#NC=Q%6a`Caw@@mVR`7asw!PJwD@9;4Cl0aArQ`n zxOP{n68GAGBFvc(S%@TrX~mnr=)P1P4&Ku6Q>$*wiMA$HA7?2&G-J49^5MyqG1rO6 zD=NOpn-N80h7vu9$#0YD(Y7+#5vRI*batzZ7pUk?@>FXq$`7ipm*LMDqe zX{I!t`_WX12x3fr5DC1Ihk>&7R`9}YL3t{8Bpu_gJngb(DWN|!Bk~p-9^LeQs5t~w zfi6d_Us#<#m(P4By=|ZrGxH`PX`_Rram9z+0!@|{1zHhu>VoXpv)(Ic{0CRlOUR`z zTXmPQGOSDQ!}*WHH%u-YwlW?TDjCN`+TB^DSUG*jG&m5p7eov)R-MtmPoiK>hs7mO zCFcT!aQm2mKB-2Vdf|D#2GurY)I})e8G6i%_W?vuno!==_|RolkG*;mvGzDB((fA3 zXjVd;uS9iks$6-g5nGJt%9;lvllsPdxgB1af3)AFX6aY<_dRT@x6ZFm zQYD1mHepBTq!q#V_1XvAX5vCJ1;ZGr`wL+tYYj=K7v@@eV?Z)9q zw)99YsZp7_;OzdJYW8Oj5GRgV%z}i_uREu8u1VCkUw8^wI*c}# zUK|5)S|8+-n@yWS^Sv{neGDF^yvw(X(4)G}ofG#)kL9n%JmnCL8(|qtd25ZUO*4FW zjj25=tKfB8RhQOYcKDq3crGDS2Tl=Jhe6>p=5dRl!r38x&1=^{bkpdIWP0{^tW&7V z+6|c2Eb;sBkKPa@6(vlIbwG2?VxlZ#5J7$w_JaW~P|0&pvbgloORQszJc_@x%t}N< zD>)x>jNAw7h10Pfd+|Q&kVXwp3>J4o{!92h78`7iEvD2EuIR2M20~H~hRDJYgIeT! 
zUZ!WV-U4BeLEAfenTVHF+1HZUT2wO;Ce$k~d22DBCI!yq7_Uw;=cns44Bv7J>*7~r z){8Nh@vV=b#W6Qcb1bsKpWC3IJrHP}B$CQc4RI2w+zvKpL;CQ!MVFT_x>9k)Fp=sO6DNwJ*GSbGNR@bzuEHRj=YD2P_J zKj%5Q5Vg_yml3>jTu@LmvR&Wvl4k%yKIY}alLMVs;dj{j%g70#Njq#td4t}(ZUrr> zv|_xE?eFn*SQNuJ7Vr9Q-LQViT+U)_HcbNBs;v=lA8d;%+8$=L!0vBqs)%L}JSSp- ztYi#paoLz9jW7;vo4$1>zgf-6;8cb8SybRG+G}t^*(${PG__7S1HNgvE}8#jDm=Jw zWb0U*T+g&Ryb@%jK!9j|(jFHU`!x^1Mi5$>o$0BMYi#Mw&xe*7l4lZf~Qx zx9z%T7kazu+7VDbX zCo^LfN=^iFK=?@AoX_yGeLT=qR^MAVef)@@cTNRZe#+y;fIVogm+) zl!zV!T*B{k5o7-4xNwSHDhiG)PY* z!}$1?cQ3U$YxC3y(wU9B(kLu08YjUE=FIDM--dcWq*C)thIbo6-e#B&j(HeQk$u*J z>T@nKd*{R#^0zapXp&}Eoj>siBaXdx-677h%w}``J{<$V*JdTpm%)3I{QLKnt zlPpU|7e+_|P(?>ON-X~3&L;A=e^##WM+2p|`o4Gi$jT=vsIn~X8_+Af>Z^qQJnEUo z&O5#7(eeyjJ8z4lM+D0~FMDz%$jhaiX&Bm$?kP#R_}Ck4eU_tDwp2Vbic4c#)}pw9 z?WQ6e?=XE|b*G|c^ZVWmQqkrHl)@)#;lZei*1T;jcaw9pyoj|?R=Iiy9Pw7SDt5#7 z-RPH(eSB!nn5K=;A}jJ3J-dg#*bZr1@27|;!``ZDzMD+9XhuF6blRmfk&zpBPcpi8 zdS;;Ez!xxfEw3?i=b6NKMp2*H#j6BBE1Ed2&d_DLMGE%U*F|V@2G|OBrO39Y~G7CmQD~oBG?=g(YD83I`pafLOw^2RWU0lLwTwbPC)D}r~8gjZ%{>~%uif#6UI@|sN z5zJ^&7^6U9l34;|9Z|o>fi^~kO5hxs{^fP7b5y%!OK{L z(CvV%3~4`~;>(%F491>j&tWad`_Wx0r-a@?NN87_r-(-kjf$-!M2@1J_vvGf(SG&} z2i*+#F;LF)v&I(HtK3{#H5DdfpJksK?j>JmHj&jFGFSQ%2%z{j;eGWifo}OhYD89% z=Da9;06*@nX_m`dMs$%!t^rNdc(dR z3xg6}+s4_ zu52HDwaCl2{g}GAFRvmGNqXh3wJa1m?wStn*}RS#!Mb-tJ}HT7!!DmaPCki^ll-=jOP`9Z ziL-KJ=1f#=>{ln^2Gf*j_yAd5p+j7QxY>?`wh>t#n%pI;`eL(?o%%wjTI`vu!>pN# zX(ErT0jNxpjmGt01o_eV7+mss|9GvX`KMfdjq%(=nMl{d{_q_5ok zJ?>!-ADm-Za_y_T7Z#uVuYs!Kh#lFOq1*@kA6+7!g}duyxdaTS9yw zNM54H5c;yATci=M+vv6|v9Pu8aKtPRRW5?ENb_5Cusm&^^ZJeLwK$%v!<}Iye_R$K z?sDd}k&rusCjv8vb|EXzO6_7XQ&L=fqc5Q$4-Jec6vd2Eei$>Em?viC=(7$mcrfVj zYcLHg8ZRB}pENq%dG=5-ullr-w#pFASy3#t$6Al5N{@lpE+tDvpM}BBCbc(ece_bG z!KpCRJMye?og9zsvbEc3i0mrO$kvWBZJh4|3v5ej(H{1!TRLS~wx3EBPji+11XH&@ zpR{oce#!)C?UD)!w6(f=kvyqO?LNOL`T9x(rdULGI^a-MFulI)N~^wU;#RykY zP($%KN>l!s*6@A@e&Zn?wt_mEl;lebG*CNAxGrE~5;~x(5pk@>7ixWLKo;H+L)I`L zyH0_F&4SIK5Jef8iA&qNtz=9>bk3ucN1ReZG{~9v*0TAuvW!iv#-F7o zeZ*vbFwo39&Xx7vxYSgHV8r5~^I$Um&ZTg0$7PCeO070wAB$d&bR8B>!hCQd0u8hm z9YJ5a@*zQ>bzg}i;jqPj?`!>s56e$(=q`#(yEJYpi6!_p#vcYsQ!}_HXllu|9n2Tb>mPuS-;9>1L@WmjO4I~7bGC|+m>t? 
z=q_LQ2@IEP!hyeeG~Zj#ulg?!qR;+>KQ+FVhE++gOR^sXk^N3fZymmiKaY2CM)Pyt zh+v%&gkbToE}ua~ zL!03U&H~pc_gd~9L3E~MgAfj7vBt@6xG3X~$O@cb>I_1M?n-ykEZo7{>`R;trv!T4-h-nNwsJ=7aK5fo z8_wpEsV`6O6)P7{oAr_$xLi?dt0^+h-e`+{VTCuUdwzChpddL7Q{WlF{l(;%Sgtl4 zEKBWpncftIdBNkSusagTq+HT%?tRLe3GdWytp!)T7{bY+i16SZiR7~=n!7743l9#L zFFo`dzIY6h=-90OMBG$220aE!|N0l~XP2zGACyd*_2`t;1on*9V;;PnST?mD$U+F% z>NZ@Ge=8IA3^U}vKayvJQI0SD3SF283DExd_)Duk3f3Ra?$ft0z13gg)^RV=)3_^& zd)cWnWhB#nN2CA3-N zCOwq6<&kIk>(zh#?!QLD5|7SAb)(06STXe1xXq*`ThwnMDExh+acrCLz#}`x}!rH7Grw!D*vIsoaOnzKw8NiJ4n!sdn1=T4k;q6MEof z59><=|6Fx6!Z9&0?G2WhYJY92zkgA><}b}kHJ_0v>Q>D8u(ngD$f#yVk9Q| zB}%4HF8rpT-zx>@Y-nYXdQJS~<@cgQvNgF1mm}jL55FKltPhfx0C*r*PNX&DuG*(SqST+xc z^o=^yeZ9HJH*<2-@8`g&U;SxWw-$a&CCRK`{GZo#@emY0)ia|Hvp z++)*1B&vn53Qq7;9zjKJYBN<^=CU@@i-0VI>^-I`on@w?zVmWb;m<==PN+bNNcG#| zQ%h{%TigA0TGpz33IF|={5tjv3UBUXL)jk`U(X@DJUIAPxw*v$wV|ySZ+U$ix3p^s zz}h}~?|)@u%rp{CA)l44{3Jv%%gJ^)UmJNj3COvi-9Y5FR)SZ`6>Zz9eFaX1O&puH z5^V?CiDAnCnoJ(z&kaRK33l4J<8t7y>HBp&QT9m^i|IZ~AftgSkm*5prD!)ARhdz&h1lWt^$bMXeyQ^|YTR9N=+eS@qHL#o0CmC>O z5F4U)AN!~Wa9fCdz$(w6?169~E7koFy;{T%F&}*q1&uZOkZs|USn){9K4y>as1GqrBJ0@~Nx0}za3^W>G&r*<8euF(AKd(q))*RM;2w{p@U zfA&Hgg13YoySK=6x+A`ncBiu_ElWZy{9h9*=?muIWm=^R$3=$HzeRy=HZb#T!N}yG z_L|*(y^6f!HQ_9SI^K~|R~OrXtUhrb)W&PM)-s^xcRb|C49-N)hMW zGW!(JQ2NJWBt8boW)9G*8vB&UXFuN(NMCy=`x-8V?#aPMChe*aeK`{+cMBmy?>Cj# z_Q+**LD%)M<@4{Y_%)f4;vxutVs`6@*lo2W{w`qMsP>Z_ed*nYiw!4NM~Wy<-p>Pp z!K$zu>=wkS%NW69ZK1^W$9pRuvXm3JKH$NYcl#wGee*Endh=pcMOXvG`yTwl4%k<~ zCJlUilmFURSSFSoUUOej)A`kqoUuU+`(ohhgOz~z8i=RV;j-5Q7K z1?Q0xW7@4*r+%>@tHDA&KGDM|Cz)L7d(v+nzH^5mouqR=B)+&mlr9sz{9>CPIY*MQ zQQ#P$kNLr-EZ1Uk0; zTnZ#DG@PuS4|rUtnWKt3a775u1aFhchP}#afI*|C4A+FffiWN*i}+T+m<&au@9FV2 z3~8E?_S&JIH6WG|0cqi?oqO)MH080BGT-@OARy#LNWhg)3RO(a5U#^%x3m=Rovv-+ zEif~(-4Fx$=9B5I^~FdD1QJGSqgZm#gYb?(kyn=6)(2?iSJ+`oI9v>HOaOZYknj2 zb}FV6qK;&9`GDdI{JVp8!kdV#|65kyT{9!^kJ2@qkRqy8-k2i^||cv=n3{b=F*Lpf!9x( z$Q`&4y(b|~m@1`jw_5Q;Ib`$N6mMJ^#Q*b%u}EI7&B*CJY(CYzIp$>)OBn1t*taqL z;n+=-_n%S*Ny>X0geHsGHiD>np`%g z#*Z6pml)hG(J#!cro$X+5obFI&0|1wh-Q`D$>d(G_l)Nnr+%4< z93)^YtE=qCl+A4?XiQ;!VN|bjVNjfT)NVHXFCCQkX=%c@( zJ^%61bwtlnuJKk!kmO$4`On+3xettehA9N?tU}DFNNaAkQ1-E*-+9OYX~#MJb$4IB zRqABf9q-QB;i0TES?STU&_mW0B|_~RQe``E=YN>Ghwn^^I1QRcbc(0R7NTwCbA-s&aa1D>HEkgG5`$n153@D8UlPp;^?Ok_AOm?Jb z#zwd!zt8a=K{9~PH#0~UhR7|G3HP59#0*9T|JWH+<~9%I&Pnt24B^X4Y)z@SJs+|i z!?gwQO*au7QUW?tvkXqd3S}!FuV;;&EJ>VOp94r}dL6(p-(<>`o0H6oCi+uqkVt|m z#~btz`XKd>m{OSZ)&Px1Z$1fWNCuj-aRgmIlJ|UFR*_vwHBi4_zVm%DtU2C^GDh>s z(=8)U_6OAw+xLlo1B33NZbO(R=~#~D(j?NO$A0l!kxoZ@XntI`evNCdon(LY3232( z4GrN(?i>xJossmBP00&WvQDIWoe6ubv?nmcP1Y@?QcB~w1Y3FBKsavCeU~C&P%*5J zlJUzzFj7x7gMlgNxeY4bCjXdqi$FDZ8Vfpm)aTYhC>tg+B@cO~p!~53O6yZ8U_bG4 zvqGTy6etK2o=YF8U(Ykm`Zm*D~6UEn&nwg8o`nL&S(Y<`cMTJYv*Lx;vlMg>LX)2m7ZEuHb=u|vprgWFq zZ}6Ps!swrgqEeU-(Q<}7-2@}dZH}ER0}#=+r19WktzAbX?Gxkk51Y~A z6cq@AHg#H>Q=jnsppu%`VH{#V=puOeZ&1^p7)1y0eQt1j(y)jSWiU5%;NyKV^Nu6^ z%aED`oW!ZtikA>UVCX0ycDgq-((HF$QVWNOBN!07Q_W}QevRk8qMNmg(y!_cCT!i< zsrEM}g;x3L6;zTTJm`6oPnyNrST)+_#g(40DvzsjUqTdMeZZ0etp=KXPcw`LHf3qr zZn@t}5fQKv2S}IGeJ&{fL5Y#zJZ2fL`bwdc<#^pv3Zpc9r8D3O;XpYwF0PIwjX+rZ;1gfJtO}hC@qFnA9@(Do#pPCfiOM7a+n`0v=vy^G zj{E>mIYIzi=Myc0m(C_Mzg*&XYFqo_VWrh$IHgc^G$S;~{RKqMkiG-uc;>=i`f{MZ z>3uCU3Bc2n1VdN7+DJ#{=m{=M>a64w`>9$UM44GvjDml#RiJGixV6Uv7?af^ygt8? 
z;=!BJ`0Gf%!dl;Jj$|LTt88Zs79tzW6-loc3%;4z$!QhwxEZPrX zxHmDv@silt*y2cy@vH%5MJakU2jZ?;A-5{ zpoBO+3b6}u{`=cI3T=K7Ru+4^X>hW&Y`xhzohUG+eC$@uhj@~(|FHQ2vIXVdz?cA3 z3c-OQEu_Cly|E$VTN*h9Hska=unxOjQt!3TPK;xrB&j zd zO)PZBf+pS{68{$(3=F&^JrH1xv))+7w|;aezRRumQw>r-|1Q-Pc5aH@b6)`OBDjr4 z2w_2yIk5TVvy-hF;2n&q2u+*Kp3EFdlO^5tC~w>wRGzE?DKz8GDE^wyQWdFjyrSl3voTSHGpb&$(r1Z&n}%ODS2kp!Du9tC_g^#-iS}h8 zMA+&l>#4s0?BBm&gaPt~fs1OM1Lc5EyD!4GA3ZMAtGG2dEC%+@K{tEJ*v{9UjrEFt z#6uY|uLIOsvnWaf$B4ouM*PRz3k6}fF6M*)uh5rnbpq&_bxunY-Er)U!_=&+x%RU5 zZwI*;6d~R4q(7}T3ma|J=BgXAicN3&`zCm4i9IK{$i~8q35u1O864)y0DZ+65i>SOgNx&^? zT@TDOgK-$vBZKg19VCvr)a(w{$4B0I>^d^5Z!L!~B0_q>?>y>>=d4Hr4)g(-I120a z;@X8UM;3s*gbfr@U-h|q_69R6$B$(x#zyS9Ltfe-b062tcow91d`na(R%}t1Hu{s6 zl#2uFlOf_nQFD>`fA7x#jvpBg6fiDWq=X1YjS=AJh*L?UKt_ld;Ko9aW$g~1H^dtG z>}z`gd|YTflC5fjY5izQ=ihd0=mQ3(TAz%3pOQszSWX z-;^{=T7~}W4ANl&RupqpB>SS*k$lW>-z+e*%64?ZzGnL7a}3F?+C@CkwZe*)t1ouV z)$C{9dCa#RO>g;<=8bvoEkB+;KRaxVj-B)kVcu#!Z$fSRgQKN1C`Ao0+H2dN4!p%5 zx1gV#%$_>{pQ;K$Ejlsx;YAU!5d+hJWwReC)SLf#^d(lUP&Gxwb=1(!+=qtp?dhWv z04F#Qhs&9f4|7yg)+6}(N_)7Qr@HGtw=ad?fsW_9-E{!*1T?0i%kiKC5i5Lu_*(X4?3eGqRf=m-)B*?~H@4Da_nQI~ ziY;ox%#$jEFUJ4sHb!>SAb=1SPnwRh8&5VHT*1j5W6en>DvYUg>#F``ru znh^f4nf;IZfvy>gL~-8gBVeq$+G9V`czpuo1Mw#U8G>hy15pn4YH*|-rX&PLkod61 z!Tt&cUL~+1U+O6xCYyl(`Sz(>+uo2+>fkYU4;(}B3(I)h7Ols)f*##w zt!QmzfcvuTS`v=zItYP;s9*}$K8Mgq<7SYhIeh|_E@(H``)CJtw=H>E;8?Hr5eQP% z2x|04+u#v~G{wp{*M^P7o0jO;aP?t~Ao-pXpF}qyoZ6ya#tKZ9bMKaeAAJODws4IK zqTm)kq+Ug;;v2gYjN;=s!8v0jskd06&my6-5Y}Ql+Wtg!a^EoFrO)9yjqBs)z9XLN)G`;DbP#=tko3oa zLLvq2XDr?}j;O!I{)&A)0FUF>d(r_Ho|MbjbZCjY1~zna?ArO0m%aMc+)KxS?gVIJn#Md?q{7PYWgw#>*POM3LLb(L%%BB3hRn+nt+cZ z{Xspt^GEz^g3nVZp5+bGALQA$*=rV9C7Iysv;LVBP!+l{>o;BG`X%3>qBFight#is zDIW!&*g%;bTr@meCF%7=JpK1y`PVVh2lA^1S!%y={r%ThZ$0;aQoS!Xs_HQ;ACb>^ zGE&wO5y88!xPOKqIX=yR4WAOU3a>>Z|J`;e~SH=TZ3-_=%%C;ur6bF zNEn|93M*ybi=SwJGGQ|L?C#qNx3bULR|`72!SUhw(kYgj1&_(^>6Vv1W78VB8_#9F z*nPnjNIipPP-E*0&7Z^qDuDS^ZoMgGN#?TcPb_CDX8?{8_8+jDv2WQJYB4s^1IGVj~S`8Cni2H*IK5+=Y&^UMEf7SxvC%N#d zYA4%LP??IeSU5FHDQ-_Jd$D2jrkcKH6Vr9$z1Qcb1x9x+-YJf7?lnx|{j?n~;a_ma zUe+J(=zjN2r>4HWd}D0x!X}hIdbGGieC@Mk=1n{N?80Jeo@s0H8HX;HhT*$frS@L= zFY|c}YbMThOlx(I7E*~WPRF0fd2I6OwSYH$;Ql=8D0zTq`bBtCkXKbuE@SLUU6{Mf z!QDIg`w56Juc}D-hI%i)x&*tv5U+;X8Kw*SCssTAFO#ObD|E_L<_A)2Mbz6Nw?w-1pAIm1)BwgL|%SIt8Ko z@p|UE6}EkQb)?Ce9tD*N&vxJgA39GN7O4MblWe=SY>TS-zi1z& zzxzplfY7OX_bk6;hBO9ofBDjl_9K+jN$mqxr#meyqMv{Tgtx z-U;Lw!^>V7@r&hXe$iq6LtoNq2pTTb=Li*VYU__(8mrAjoY&Rjj(v9duo~sY|H;-* zwiI&TRn=lYqpIAcZQO;cQ1Y{DV}XUg&Pb)QD zmK^w;U-IvJ#Xj|dP$EWb%Xwm#e|~nP0pjUITjS^FO`?8|kF*NE7Hg2m8A--^QJ$R8j#NerAJ{QI?uV4}8ZOkKgq!*w zQ}{NuPz)E3^WTLVN#5Y)+%6OGI`s?l@9jzESa?azH~`bjbZS3m{$kx6CND8mTd3b# z_%gP-p2kx5$yNe}n6z0yPF1MZ&y1g;eA3V}s{T^1eck+rfzT(Aaz?sdgVKSl9^H3# z4c3H+-xh3$wu2|Ljy0j&c~j3rHOIbR`=i??yDOZLgy)6yQDd64;*X529)JL^`Hn!x z^A`h&*@v74f(5$betrAWjVaYue04M6a|PxD)(~@eg}vJ3mTBiWvsArFPtb3{L3e`X z>78v^u_FDp!0{3ufe|v_tt1*8!}vLI+?|hmuwZTLhRC;$(r?RqN1F%Uu6ayribUe? 
z7#Mi}FkEINJUDZ2yKfvJ;vA|~IcU1`(2(@~?)XcqzPhYO@3V?YVrC^A%3rm;tK=ke zXUSKaqHxMP3iS!_k@4SXUi@O6YS>ZKn_(yZ+Ir&tqnxZr=3b}Ef#Lf0k$}eNQLn1& zm0Qj9bsReNESH*``^&Z7?9EpKHODi%Q6eM7j_Lc3kN+Y6F#U`-YD*Alqe5RDB%Qz_^!kW~GwZZCXQ1 zs#dLe@X*PAS#zXJfADu?{eK}aK2#Ec$sFNBNb>$xhu``zy+$NQmjn~it!QQ?jE!sYs{y~Nt-5J_(0}k zziM^5K8X}oNa}0u)KSc9eQAG|aP&z3%VVA3c>D{>*`9p)Sdr&5pUr#gk;T7GJ`#Cc z^5V_YILHyv+U`M@j}Ju=n8v2fVS$HfM=b4vt9<0PV3KHefn_1FpdXkL#hq3?D0$cS z!s;*jgp^mu7_gGU_0O52IuE{j^th3-F8};OtK9n{r&y)IRCl7-`h8HPuTL`tqs!UO zvctxPO4iHCtrhX{N^K-+_{esH5&E?q@;h>0$GoMzrz^+Muh%^FN3Z|SaQ*Esx0QXK zpzLYRFOc(AXWu%Kl^CBaH4;UZqoq@HdC{|s zbg|<}I>ZJ;3|;dN+pHt1&6mr`lk}~vt;S6h@8?Fxaf!^US{z&z7Od3^t5asOO(9;D zrmt`&v#U+xT;H#()bd&c^B24%^~%>k&F1k?9PtzwFIkNyVD3)y9VLeX{byXlJOn#j z_{?hQQza)!;kU=7lGV>-Ybw1-TlfJSuB7FQ|H+~dxgSCJTwd4_}uscu#%*3O^J zPRV$)rk>&Xhi57V6rK#*`^ez$ffpj?69*3F+#^?-44TUaG$H+OT<25ya;$i8k!I=493fGvB6+*F7osf}bP^{&pW&&#bEUFR@NuxOQ}H>GMo+@I?BInvp1k@}!M z_iD|%-r6=iH8mRf8YP-ozl8aPNVE*;JXl~WN+iaB-0$|TUG$w4?)QIy7WLSHQDT1upqW}ot!S3sy(PrmEII1kI@9UN}&v*Q{83r#bd zsB~SyHy_AMNawHPm&clMl+Q?>q>2jiI#^SU9zsRa%_ktGb44HMx5M^~W>Zw~yS1Z4 zicK9v7ml4I%<=Ini190i*$9QMR({Tv-<6gAyk(ca{+iZyVrlcWgEJ`tQ>zKJ!?&@y zEK&4?)5dI|_xI2CGi%d)@tknFwu)t#Nl^7Gx4u1Evr;GW4p-KSNTYjL6n?~<#n)&& zyb(jh?%Qo@Nw@e9Pug=+^1VkHWu#i_{G>$tbAl*|r3`2Ay& z+F@8m=*_ql&5*a<@$zB$j03NQ5T&hHVRnr)fg_LO=(zr-A1}jcXp7`tS^Q7&Y^96U z!wWo>UE;Lx5*+r)N1&IM&`)o%8s7i*qLTA;q?p#=wR+`!#WVR9|STuT|0N4|Kn);$yM1Eh3R{DxMuCtpBF{TV!NQxWaSc7gU@mnAk#1^0nILL1b1? zC=(Hs)lua8{LjoXA7mPi)gP^d z9m}+pJac{h+BGd>Tx>5uz-f<}bx>dP?AtY>O<~!NUfvOV0}6{75WSi%A8Zz`0c4AHi9^c^C&Y(I(5(_r2uUXkJg(n?5CXqT=kAYJ>`?Fc7*qT3Nn zRK>RmP{?1OU|7z0S1}4I&b{+p){E>1$bIJdxq@feY5&@~{2%a1>;^Fk8Ta8AWg`GU z4=H%@tap<$Q*|r?M;o7S5ybS&QH?Drr?_q2ouizo6RFq--x8ySH@L1-tE4SPFggeS zDpLcWyW!#3XpP{gtCwjR6St$9np4gqWa;fh2UU1iB%TcmI_^a)MaZOWmWXuuO^oBR zNWsAeS{cS7Z7Fn9V)$wflGi!xs4gx;J_>Z6yc~#dN@%u74O%91cE}WsP^5HR*4T2V z$iGDmhtu39pi##(bD(m4_d!x(BZ83>pU;A|5)X%!$RS=R;`sUnTcNY#XoEt4>HS!* zv+6S|pKJ|8?52>t;5Xa_^Pg#H&K=RBFL*7oMJF)D$92|Z*TBW*?p7YnYzlQ5ZSY1o zxK~tYR8z0`_Ha?d4ftMt6s4V}5Fe&-bCslmUx+@RS+Ez1n1me%iDIN4+b%_riubCeh=|(ba1KP1U{e(wVnTg!gNl=kO%C)6~x?Mn~iLg>M>N zPj(RNi?B_h-Tik-4_57`kTxo!?NgpZCm*AmcO9IM>V7F_!N{i_Difw&U*3qOiLI7< zuzKyu{)14V42hk0CP}55n`UR1fvgG?-K}NGRlq+N5R0>{rPQ= zot#_As7G(s_wM^;G`TZ2Ri(}X9kARPMcw124h?I~ApSyS`|QsAncuNfcE56Gf@&5} z4U$3^yW@HZ+h-?pPVYsNVqSah^Njqg2Lxq}i`nr^bFFa1!zw|7;)U-E6Jvy&n<>_c zLJ_}@Je{fXyG-%f2lnPlgtN8W&w77~e?2(6)TV;-Sp|r`&Ml5lP|y*d6ay<@IzA23 zs$BnS&5)&uTy9myl5jCI5czAgDbtHfbVh@L1z&3kv6=j$Hl5YnnX1V~Yqhq7r?0H8 z0G3NZcyuOk&`WUew75*{+NVEBx_{5l`9R``v98^vVvd4YDbDcsUL$tWk~wwNR`4IF!ZFR5u`<`OCrbZko6+EOe%-+q`*}r`?48`(=J9Zv9bNbUb1^S3mFl3@wGr zL*Zl$ASE~!I|Ra9ye2gCUk-de71Q7JT{oS0P$rUU5J4xoW*L;-DClS3b-%`;{P+Cac z(^tuCzqpPWsF1?C>V@yvAMn6EsdRJW-T1+Bs?qRyebkW9J2Cf=Pzr;z?!d*za>#%; z)2S>6L3=9SS*Yj_?QD;Ynyqv<5sX|Mun5Js?=)t~7rgs&ocVXZR=L*0?RztbnHJ4@ zu@!#il zaIESFAqM!&G(?(GPx}J_0cH-z<-6BTL!arsJ^aCcA|O6No$Yy*R>P{QEI< zoHlR64cyJkak~VUcwQy!KboRWh7Q-U;FxANIo09LF5lN#*^5asQqQ-S|DyHvGp_Ev z7gaYQllglX=TxLM@pWA1vM4LCx+tIZwYB(8ldA^v14VPq<)s zr@CjEdh|Y*528hU!9bxl$T;FRnx#|aa z@5p72)5r(u3r`80rb*e=Dx$_h(*kponmnF@YXPx$dWB-It4uoMMCaPcn1Z88lo~M{l)1BL;ealc=z9}sXOaF~szOOWsS5+&} z%D9OaCRjJu^~-kA(i<86tBgKx=*Z2#Gl3-Lg8UxgQbYr#t3}wI;);*nA?YDCO9PY# ziiN8Iyes*d;!2X9z`#^o@Aa{ocSn0XR+(;21sp``(opw!8Fa= z(Z7@bltKT`UHv2d^Wg^+q!?di$za2LW29#R#n5v*0;DfUn)~HR^@*GNeK4^fUdKtv z{Fl1_?<)B}j)~RLS(BC4SK17!dT0|v0pFLsKTQn(PFeipn2iAa2*J;Xgi%s3bUqRC zBtGcwU&~_Xr&gUu=K$?A3W!L2{g}b|Mki~(|2mfY$M`Q|1vhjk2TudiJ{1xi69}n0 zviq1)&ay5ai=3u8%r&%ya@S4nHAzip#J@BZE|_yJ{%$zU&Vna+y*>6BB-ecPK%;hDdo%qvBLV6w 
zgLc%=CqeLKaUR|TzsCGB%Q5wD-kW?hIz2Chp!fPh+V}Mj*c!&)+z>+)54dge^uY*lQW6*x zaX0uk82@gpoydYmzC@0wf!nBO;(q;Gm6G=b3QdTi(e^KZlF)aV3lw{?QaS z@xfIB&F(&c!q~`hK;qe$tKqx^AiJjJE;}Y)p605f6wFf`<5N5je==kH^{W5(PKoxH zp)dB@qGuYW zEG2(p-+5W1`PPs_@V8@Nsl0ZL z-%pQ?07-J#giAXHOf@!Wt_4_Uv;hgd{^njmQQG-0R{4{m2Tp%CLfCfEed>gL=mtuv zp8*|EQ1Y3DUCiQ2a!`&qj}8*=T;{Ezq=Il-{vhan^{ei_tYQHKOcl*O$@BUr?R8>XaBC#vA7S2O%%z@!4yAJBZ z!wFl;hCyYq9XU!dmm)yGb`yxM7rCfN%6$-mflpz>A(?z;-;02qDA;Jc+7W^H@uA5p z&Ft>m5?~C2OTO%m=fBt1aJfKN0g!`nV9&A!9E;sKc-hYZ9hL8Pu!?)~ZMB{aYNTHx z0oGG9#Arbo_}{j4@MUR|wf*kcSvT#{SJC+ZeA3ItrxXa|f<7(Kt71#eYX7T+sx1ON zj0H1A=wWQMm*ZGI=931D#PrmLH|8bJy3W-g>g)I<|KMI+DH=4br=m31>a4osiVTY~=0R7suw4VFXPf|pNtDU!KA{isact%E7nd zJ6sBYmk;e57%8{bMNC%Om4y5?iMWfN$2RLt{l39E=m->ngoipuIu~-h1|$ zSi0E|P~yPI(QsTY%)UM(_n{KnnBxD#Z^GvHkB{x|vI|C6veW7fvnzp@57O0ey85M^ z2hznjOV>`b9%KrIxg2lL(ZA7m`uWlcpu#`Z;M9&qg!z}D7?=T!n|paZ02mwt5X%ef z>$i$bBgIG1K19dsXapZ^O$cgynypM-GwX+!r(0oTpJoOd;QEVx7S#Z{*CLlKH88$3 z-mv;v!i@DBhZxX2y~w*!W_TZe!zE9nfmh;(h~p|TdnnFhoSUMUeha9N8H)^L$~Oxp z<@)f~1jo*=?QJZ*w=EwvaOcpIj+8ikHFPwd=8!DmSshNs)4W-CG_6P;?7vdIR`3;| zwl?0?Q`w$MHAl1lgyX;&U;)tv@O>RXWh4t`JXb$JY~@y)@^vf5f%~g@>KQOI$ao5@ zjTS~0&np7krA+x#J6y!9==r$Iuesn2z#`*0180f>#abOalIG_Cx%$?o@eF}<89-~C zhM#H4pQ@`1fBwsv_?IM@?N0ji;KaVSRp?$`_VvaAlCpKP{>+9_&_)jh&&Y1cM*<}O zXGQ%%zEy76L$L>#Y;icyP$-neWmQ-X2Wtui$60bH_h+U8)2n$j!Fn{P@-?mmXri($ z@2j)u+xM5r2=>=v?o-WeekvaX=v&^yx1TplEVXhpta!EdmisEes0#cN9EZfMQ4$gv z3m9(riztAK3LU=?XdR&A7ebu&2x?09u+;5$TpC%AL&Vuqr?XXP|1h1b6j1P6mJ*6v z5W(2Uu!in1Tu^NaC&STpMr!+S^zrY{ei_|BBmETsEBXn5S#zNy#b@xl-0gNnx=JJw zpv_6x1%XFVL-c&RC0#<-M)|A1IIjDeH-LNQQ0^y-3HauHAY`|o7I|F+&fc^h1?5|s z`@I;S;{HV!Q>8=CU=Szjr#NyLSdZfG7xzhCO`;tXa>lRK+52_Mt;n06SQ>+2(ALmn z$gD2i7JN#N6^Lq_ta1oHy`MJ+3|!-p-sh&G^C-q!m=DZFwVPoPtOJ=n$-;b8Twn^D z&R=6+PCPtv1h)^|2WDVsxC||xjE~lvc^|jX-dyyw1O{0Jk(OW@w{O5FaHnQ5UUwaE za0I?~-pk&$XOD!&Dd|1`$|D#U3r2dgFc-yJNE`&Kl2Nlj&q64|z-=Wn$G0g&gYkOt z?{P$7xXCIo4nG^OzFo}lW?E@p;N+;`irRvd5UiV6Kv_uWz&5|Zky|Xc+TlO-@G}x; z3%rhLn}!P~;1mf&FZF?%Ld~!vFtJZGTpn;yN_y1>r2BiS<#_+aqQiC_-J-N21vb2Q`ajbrQ@=`?BA(I&WHM-zSX&8HPh2Gt)pP<-w5p-_79NnnB{ z=a0}Sv%FJQrHH8s%td5o=t&>Y&I;m59Pd5T{%tI;(uo7B(cwX%x43!;he7RFJf}Xz z=0~v5253K&%sT^(y?I!gTlUqghXOE_&ojEy+tvNA8PA7$)*q=qVdD!RA^Vxr|reN{+*KQIAz}txDFu+ zrmkBJk5>i^$se6kPL$hxEn6D_W&aIa=C0Lj)t}9qZmIz!I6X7_GvhP5y>|B%2AVZo z8g6r{&XXO9<<@s|O_dH`-oBMcow`t_7QTG>M9mw|&H+rucD3D{HTTuY1#G9qoXiWJ zr&DoxazYvA2p^>iT8%;HnHF00I}Vd+`ydxpW!Ni1^OfI8*Ziv0Yt8ccf~JP=nC5>a z5W@9Z*74;rRUQ#oZm5aRlRkVYQDyV+#G_!_Rkz4Udi#a-K!nI@j`ra9=cgW91-5lu ze-VrZCZRi}m?*`2NE`+hj*#=_m<6LkSdsf?73+W(jF?GEkr5*4vTd|BRa4=(KE{I; zSA%DgfT1VSCL@Kx+ z03MD=nq{aAf8~#AEFY4ZA7?;C;9c8^H)SNCdrikgH!^>?kmT@?^iGV|(KDgux3blx zpzlfsF!~(=qc)1?c5600&bHK#=aop~?GH|aSfp;6l}O)LIHv7`)sAaW56J1VljH|8 z3L9?q8zr8=P$A>thq)KD?N5Y@X%4&wPNyr`!N)$j2$gWp-)A7&4DUc2F$uVqj^i|x zJ6Xu~fmvh8cc&@NVZkMjHh^sDy&bXuLWy!mSwKWk&%-GDp=vCJGB|6nh`bxEPbSrh zjHo(jy*>8RGBlXV=of-5c4=g8e7fNvFtKbDu@#EpzuhcMH%XrXTzQ=Bo$tCiGJpFL zN6fZ}nAE9%2|)y`rGs^t$EKB$BAMu8+<~mf(9f^t!Kn620AYqCbD40Nnp4w{WC@-c z;}?U;cNg)#K^oN#-c1SMQ#;OioWJ+I)iNMAbTUd=a;zVS{4(*l7+6HR-xCb^$V7e#M;2f_+ z8i`@r1kjzzpt^oSX(1VCyC{7G-A#Ruoi@AgFLd=dfDc|3hPfTEAw*j0%oPW!=f*VimyxiIl#kA z!%)09i?BMV%&f1=HPcBHgg^P-#iAADX{D8r;3j!SU8|kHR1eu~F@TK|E6^JOr-Naw z;2PGXisTFCLgVwCnpWsjs0UPiHV~T?rXORS>+A>wqv!Ipe9A1nk!60fjbpd_QFcf# zdH=`AoFlA@8rjX1Dx$b%=e0g8)}Om^pV9 zqCiwD3C!LyNbOB!)cEGS5uh?~>R~aENk+1w%Z8)6EqNcBBbDk#Uk%8|MX)=&)d7|1 ztZ2B{5VG+L{UD0D4?KV(mQ%?t^C5N~(;d0e&=gZ5oaMSCKJy^i8~saQRW9G_73~8U zxE^6tLbua;-3|B`t-^>sx)fRy@`5{);^xVA3J`oMJ`162qv)vFo&wNBnXf35H(6y_iRj9GuB` 
zNHVs|IH!3ODtVWSg6}kBe|?y^u}S_BqDB-K-I%Jv5^@;NFbNGYJ%skm%-<=;p zx5#gEGE?c~EizG4Vcl3py=o^j_PK}eJbXt>54PN_$GjnR1Kv303_Ip@ui5R{j z$#9SN$+ot!3@8i{i?!o0BjF}jtU#PwN@{UJBoGQ#Lag^In3E=9kF(=da0W(+Ojxk+ zdaoxxkUaa@$G|oN4DM%hgx+MBuhi@MA)Ktfi{X?4W;fu4!1S7DKJozb40y*Ru}&d4 zRCqf+YZ$Dzo4g~A(rGW)^xzqH9u<1VYR&xyc50N}+yLG7lQ$fG%NcM+I+a=K?CT7; ze%R7>0XY*kH5^!LE6t6m0gRLE5JMDGGHwA&d&F6AwQ`YCD@}nNjxTViwoW_PU620V zN~By`n09giU4WusvPL@~nq-`OfYOn?SW(#k7P+uEi!eTmm1fwdTbNh58d*9m3ivJh zgVoOvUvvv?Cbbj^J4_8JumnxPC)-zZv)3a;Hz8_>qU^!*MK158Cr|w~)VG387N*-> z=A>zb`jKl$NmH6c=YV41Qt57c<6VR8V3OhMKMYJsL|9KwMJ5Mbh>l1QcX&2IRf(Gk zw~}qESF#dq)#v)P{l8y}l#7ysF8FQ-{|>2P6}P7GgJIpwC3(*0aal8S!70RwNUP@q z#zdDG{*c6$sv?3aGo*)|GDKhUxQTH7ol?QIfz%j++rGDbFmUcnN<%Qtm8B$?x1X>+GKFZ4KNnx-VS{DK|4o zsaO?^l1ic2gh2?tGh(S!lf@%@HppTAc63x|`Hh~!UT;zW!Wy#OT;Ewe=S-!CqJ_*u z9P(g?#sssSp5g5mk**ya0b~9?-zKA!3ls1t!zgXs5uK;Mk`8u&jYpzYN}H9T6YMC- zr`C1SIBVS-lxBo1UFah0y&%rpcXiv_geAo`RjEXQfzw&25&FUgxIQbdL)~NIbCIvX z-Z!u^@B@5YT>XsxI?q)e*ODmgYs}hh=^N4n#yj=4H*&bX6>|!uP1s z2iX4K8iYizK|;&&>7XM%@iC&@4C_w|BQVG0XNGv)R%M33r6LuI-uiH3st$0#UdRbC zd{ZilomFKgjQSEHm4)-!a)V5=%y#v5GoHWc{O!3!oO4*aVq_2_ zu2(XvC;expW{e4jyjiD@hu>42r^!tyE%v>@x`G=kcMJ;dK!u@;U{iO9JVy2@ZpC`- z4UlndvZ~MV%uFJnBA39u`62`&eAY7Or+d8#iiu6y544Nr`3^NXi)nL+Bj|!9+S#r zkpzL~6(S0%Sk$5Rk>!$e#HsJQer#Su_YW+d-uUpRY& znEzlb-gPc9uVTyVsOGt!nerS;0(plaYA*SavnI2tqIY|KU}gL1#FF>m(9L)Z`+Eo< z*B4Vo<&}hX7bWp7NJII!87=xMqdrCWe_9cI{U1p+_ctwmnFf(6FH=+oH3Qj`uKQTQ zT=mB$LXiUvO({5U7IgP4W?EII`?j~opWygm*hIF5OI>_Qw)PHWY$XgJ#2Q_ZMn?E6 z%4{&;ybY%H5*o0FakJ(`97mPbf;uad3{q}Wj-!EzGTi(XYcElFnuKmfDL&$k0nm3j zRYV8x>cw2Dk)_RR59mKdD})){|3p+krzetO*V5zrh8em#JJx#r0lFj`LXWKKYeNeSZ#BrP9hBFlg(sKuK>i_vxVyOEXM{#X}K0 z_>7n*+QDdJTeMfAyi+(b1B=O%MrLt)uEn1efgP;c8-%#kj`H+AU!XOensqwsjP$;J z5L3xw4@wU>WhdJ|%=p&5aV#9!q!&vP73V$lT>TyuPNg{#-)@7R?P~#&V}cu723s8uNGhD_I&;eEM85v=K1Mucj}^n; z(m+)rDjJ&mu?fYDNcE7?5aE>w$SJG%0ABEQ5$E?is(f%P;N82|n(qe%RX0@q zzTk{xls7)dbNjeLV*WhGBXKgr`TGer{oP`qmQpIDk<- zmHshZ03F8MQwF2(vA4R6bu?lwQOe;@eNh%5T5Q)!{8jFK>VC#ZMW5P8IeN=t)`iXw zIg=vk?UlhGH)c|+#~4Ov%!(dEm2g-DQa_wpRG^nOI$W3v6zRxo*nb+{Sqv?N{)X!0 zCJc9{*rMuI2y$LbP)l4sxvaQ07y7^@k5P_h;FyOsuLhK*nrB?@5HWeOk#C2Oq{(2D zqk0BjP9kDc8U=?8R&pdctZFb@AYmP_m>9T$zE|MFWB!}}DfEm<0>SSvGVKtQ+4;tb zh0}k-ASTNaU5D%Os)(flw3Ml`)Bl5wx|UyK+B?WPp)XJ%nfW7V4n$-!Y#F(eG#ZV2 zzK$5~OC3{hPJhh81NDkFUpLVgMlkv|d>%UZ|-P{=ww{S3`0&P9z2YC`Nk(~&n?^TP-Lq(q!_P*Y_C1p;B~(|j{(+j)eX8Z zLf`90w;n^cpzBDZxzjELMgHS}bh-QUmv6=bT~t>2H_OBlv8hCd(qw;h$Sd5Gm#e+F z6QMn^&u{-rk+F@IfX{&OG1?-Gbzap&kJwQ&gcrKw-lwcOv3)+!3I06>^YX5mBIOP+2=RF0;jrDub$qUrs7_lg&S= z{ij^q=yiMhiboS&OT$-2rP05TolrCDd}5QGP&-R3Dv3hE%4IdW;YxgwUsQcFNb#hi zTZ4w}u`A|D9ImrE4GJziAbX3~06Q=HOJThTzey`fuMkH1Og(j)Z4pbV%m0j>luzB@ zie=P#v`{x~SIWK1e~taB-Mvd6Ki53fm8EUGH{VqxKMuECL-MlWDzt39AT%y}_02(A z{RJ@Qsvk|e+rH@&pG`Uig@=;tv(-;o*b7%;c1!Kp3F|xTroy9lPdR^rQe zVPLjcRO&9p<6N|6wAgfF^}fm{);>X@D=;U{+h1~VUbBl@>G`PP)^wf5CdI!-%L>GUMtR+>VC@)+)@2_FC|+YhUtgW`INxhiA`T_~@<4t<*yf zeo)9QukUD?l+;^&vs?+CzT)8u+i9n?4Lj55^%5^L=#ozy(s!PE>nLe&$Fa!p*m5mC zUs$lutw(~>t8Lsg-^Ds5F8gWdnc;}S*)iT)i#{*Avs%VvPpo~mW>;H=_@?;FuDH0- z=7&|^?(AW~Ngt_mu4af8(dbP%#_q-ivy+CKgjc$AjCi|sOkA8_9ac1C)*qG?wmIcJ zA-$vV-PO&(m(3A~+8r0~FU2#Bx~`>Z!@qyUgR?I^@QfHdT6XRYIau{mD>BpLEZK0v z=drbl&0{Rocxx|+=n5?U{cdeQgEedNB{2|ec>2VT^zWVqFY4y1D+zn(o7G>PJu1!s5>1_RHFOw=}qU(shnyCRuq*OoStn-WXM%Ue0 z-EAS#<1De24dk`$C*I{6M81!2sNzAgZ{?1^dtkBq)`s`Vv!m57g$kdUEKg^HI`gUT zcfdFq^yq~Q#iG(CJ}!t8#w~T)1+@6tuEkn@4-8V|Ye%f^Moo8P7tO_YuH?;+UvwK@ z-rl58kKMt)0tR6PN#c?j-IixE8_cUQE=7U@D$Z1ML-WTOjgJcZN-Mmak6aC?;G8$c z1)RynL0N;PdG*OiP#A)$%n@1k>v3BGRqPpA#NB}$wKY^6!aU6(Lf>=v+4Gg-80(|M 
zWYP{8`*6-3xgPda4Yh3Cglh!1)Ys-pjVCis=BGu?z9%M{TVAJT>wxw0M|9TmUgh1e z?5jK`<`kxSQB_r-xBkXaPM@3fZaODvLHcCD#AG5`+s#Ohc*$Olwa)hu&WvDhDP{Bi=e$425 zkus~d&y~}9`nY`f>wjiqhy~Hu!bI4r2Z(q}AaoQ;0$t4?4SDHQSGzs!-%u-QCEbMM zEZ(S({aip7z~~HWo&DmB{PnbcH(6^A679Qh50T#_3po|bh-ZSHg&LkSSf0^I99Nut zrN(O-)n_jjKd@I+QqU=yw4{1DvC~b8ZhnU*hC3ilYOs&qNkTdOA6)QK6|tJ1rb z4~PS;^=gg1VqfcT&c30L&d`_35ms%6Nqe-?3^1rW1f6OPX~;xnC=(wP9*dhZzD%w8 zEwgXpK5$CUwxqehwp?GU0F`AkqgKNXuJRY}Fc)8ZCiF?=0^d)D-a;>wlgdu=lQL;8 zDeMS(*Ss&4dWEYhm_9j%gj9|F1nZ`Gcdt9WSvl`cz@P!_=8G^TkYZpDPIR43TiK1N4}y#OZ=jz^b4Z&DWORtd`~u-s(us zjtVt8^SvE`45uI1Um0W#pz6KKubAfT#g>lN(M7%L)MH%^by>{)6^6Kt*{IPi-@~lZ zooH3kVLqY?iS)9mj{(4cX$SUdiwA2Iq)OCz9b~e}o1ycn3@r3a=q$Aaz)Er|+TShk z9WF?;l)>?V+kJ#|UH~iR{~^=FkFYK!W&cHzl0&!Y99MJFBOabq^lJ=_eXyZu8N3rQMo^7W>H4N(tGonJoIhQzj@-( zSy5FP$To+_p9~2Ix?S==Jq7HGdg}b$#WEH3TBmG<0`WV*TeyS4)7Frm#_&6$aTjC+ zv&9_sQE-E7mNf^zQY+a#H^$s(+K5}yjnuU4I0S{b#PhdIzHd^Tx!1+G{F+=cx(N@v zv4dyaq0hxG*NBcb77u*YleKK{rFQ@sqv-mkEN&F}5WNz!2P90E0|Q*$9Z!a7y2n#q zXB+H=2SoEvZ7#M}>VS%!{YRuwJFNl{LrX`d5%v?f{e4sN^tIfCsiz$BMq=isHb5OZ z5z`+91z^Q`j92G^X(JYzaf`oeh_pK{*<{3tqSSE?vQ^Sj!_!7TE73a{EMaA2@8Ly< z2+Je}tZeF(MA-LU+p0;w8;G%J!K{G>^NNn^A~C*uh9LIQY2TY4Q_4CnLJ$(L=-Zg7 zW#9Ne{=W*qZRncH^AFyOkp2-x`+6|@aL?z~svoS4s?5b51I7FugKC=YJ*8&YvI?CS zsDr{@m=N21;D7H6phDzs-q`Lj5{R9&#?lioVQGjA%BgCXyij@r z@BMJqL>wVRBK>lS;s`SmOf=kq8chhIOhUdj=Sk&M1&(3Bu*8kx-{iq3z1{mM+4Lw5 zF=y5EG2km^Q^(qRz#y|A$#dMFJzO|twm_vZQt_@#*RH54YqcBtAni{~I4Rx|OnB9m zm3osi`4L1HVvajt7sp8jYYgrSxIr}Uf{Lt^)TXU0M=~)#h9=f3XxuaobNpq2p^>V5 zzg%~npm=i9Xrj(GfVtIRiJ?wO=wY_nu9x}LoaI8bS5t}^x?GYvIzbBKfko7o>wE^R z1iQ)>+0g|xr@;TCBc$x;KTyEicriaJKl~%XKqm?DIN#}5YE~R_-LXx6k9(FbK*-OX zb4|96#F_DVI0e5I2;2m`n>=bjkzPOq8}K>vB#N?s*PvMxd3w6%Q&lJQ0yVX>W(e2E zS79_0oO&JW+Q^HbT}7(GQR5GW@Jw!Pa)+E!b}G;qRRDzKTdLQo#ooS)+Shh!VkZ!w zvA*|MbOr28@kS_D{i{@l`OWm7Tg^%mn1wS&%w54=p$Vp{A`?)!h|d}JQ?Pyq-@&*= zkL=B!oo|a?PKaF$@DOfRE3(Nlfc@t$#9^1fsbe$|XpFP@q(+eq=;gFs12Y#rsoWT5T18dRz83Pf#wmK);4U=5B2)%N!g1M{@`rF?~dh12>M1GeevE4QjaqkR0imq$iuV z+pYh9{Mv^cv@rYBTzTOi<$KwXSd|8lr;Jo{3^K=!i&9dl82rADDuX1Pk#FwYjn|%o zXuZi6<0~Y+Z9|E}Ux-Isz<`zi0PCxe)Ukl%%xVXZANKX8YyA)yqut`Hhc6eweJUQ5 zBMMikn{W<-8ED5)GTYb`;*a596SiaLWe$L~GEec@hub2eb`l1H%-XXUvKR||a!fc6 zdhcL{@Pqfca8nu=u5x<9yj;cAhZ0A@xj+fN%8Xh~b);Nei+ljaf@8SMcUdZQE#$nH zphZ=CuaVx_WNmMEon-^0qCV=OP$Bp#Nu584MQMO8(+0SY39(^gl;OuH@xXK+?|D2I9K

UR}Qa*O9EgM4OB*#yd1J*Zppa&4wGv zeU=CyY{${{E8kCgKqVk#uDCD?w1q@$J%J2qQPprBSI%wHB7L`;6HgH^-y6H4$MAk|S3YxySbXCwco@%}Y zr*e?KDT*Ur{?fA~6&4``Zx1dC^nss>w_kT&!XSmRQ@U6B=YHv9^-~|HB+#bC8Gr7* zvVyC$koaK0#}1#z5b{)LAY%8#a$ZJnfQ-~*CG>SD^3e!JFK2t|gK|^xk|1I80$8O3 zH8;x=$}ylZR5r}PoLaycS9?H+1_6>aL>-DFy7OL-%7+~~=^GvU+~y71$CoHR(qER) z*ip|(%!?>Du)d{|5z+X+vYhWcns~QMQc4N_V`nJk3#DrWNFNE|!@|(lI+an7E+Rbz zWRtkxmjEMvX#`eeC9_MoYLO_bQkR%&y(b)J<0MPehVV4^7v|AGz3l)b#&)W@r(%Li z+S~6f4$n^k3xkrW@-rS7hLC`#onHBb{sNUtk@zRf10}?4Si8d5 zS;^gp))Na8C+o6bq`dERnNzZEwLqfTge7DqOESBy2AA~xs3K@+Dv?4Y%Sz$z)h|^B z#+<*V-CO=%WuF=rGDGY?MuSuq91(2w>7KO% zZB~u`I)B}())${;W`VGc&1lW=D$MrGdTTy9G$E09-;f9)ITEo>CTb|!t3N6++C3hf zf5+_Qy99leFToc()(%dSKMTGV=(|nShMRqjhHH|#r&Un@^j!69uq9tIax7WM_i_rW zFsOH|SQs;bH*UJuEL*?M4Hqb7hgp9WFa~1Aglz(-;8)Eb2_ZAh)<dECLAv4M_}+}2)IHL1K<9rb`wFP2y0%e8P!N$6l$J)ypj&Eaq;mkJySoGx z2^kusrMpYIrKE=eh7{@Uk-mrb6~FPj@vplUi=~Hi_I~!$`-#0j4;K6o^8nR(Byf3_ zK+Z6&-W8sH=y~{kpa3#6*Vd@DT#BgGW6mkV>7FsL`lhQLhHvvaTw##?Gw;2lhxz$l zF_p@v23Ao1!bYz)96lJl^aH(fK~ij7UATx>(R4=v+mS>5r9NagIh*gO&DhZGhdw+! zshZ2S%%O6>O315?YTLxM=|q`r+v68+8QMl;Ny(T*xd2cgYJH&Z=2pXJMnY>!4YVCB zvei{qPMtq5RJd7}91u#zPVWuPU(}%B7IPnIua?VDk?kklQs?7ASj+^u-O7AsS&@0y*{I+P?-8Lmwx$^gwJy1I~il z#V*BVsY%20{3)P}nB};YsFAX!rp8*JG$7|`^nZ|a^{?BE!nW*^4EIHmV7-Z2kHX0T zHZVx9Kiliw1ETE8_sMt{8mm*Tidte7yw;S>5HsMcYHk*%$*k_DVjB7ebfn}Cv=ILx2o zKm2|5FCs#2*PSo*zvhGYfD0hH01tfYell=SEO035w?Zd&yZLrZqGknLqVh~|Ucho0LOh$;U>AjT8J?OyS#rqGMaCo zJQ21Pdx1RqO;i*DQ)}y(yu2sJz&$gx*n+>lI66B^dwSM0Ym^9I-Qp6*Ws^)pON*w# zYx6gnTiF0odOHZ>UbF4hz!Cw?K&f74e!fiOZir16w$CroQp)b5zR%A@8-do4ULX`g zcYJ(Iy$9;O@E8dI{cuqf&1FzcPo6DW|WY<+!wN-8QeK(r-x3P>%K z0WAtE{EX%y*DCf3r_TV9A&87kij+X&S6_VqOf==lGGs9IxzW%e{>mu>f4zzO7!Wxs zR%&`*ve*r(zn#k|MxUkr zOMiF+uO4H3`+@u>PAQ<~a6Qa&9wPIOSTMj06*`P-fye+50n$|a8Ak|&wECNY@HYf;LhJ-V&U|ow&)kTJ2uJ}evc+(rwP~`8?4*;dPcB{1-`AwWk z09Hkh)g11>U=0Fb&CtPE`wOX}4*(sKN4bp*tM6wbKph;@-9Vm+TH@`TX}gZJJSsA4 z_{#uKzb+i>AhR}l6);nYI3qUln{VXb&M_>p^3Wr#zd7J)_R`oGd}OKjwZ*_8{U8%R z@RxOXg$X>=`n-p%xb)kBhataYzGEPv59bB2`ezQgJc@t86b-m*RGrVC%d+--ZOabf#!I(e|$T;}|u!*7` zZX+_FH&M?3Q?ZBtsSWu}$~OR{R7bX<^_SpL`K||LO zdpoy6C4BY}`PH;5Sj|}e_zQfSfNa!4am4^*1PFP?bE?cj{H5&3Lq2EJMV4& zg4G;=HSK6v(e5u;2>@6nYw)1AkO%Mw6a~A=%bC9f<=K@X*_XR<1L=E-Pgj(#iAqFA zqBQ!wSm3E`_kPV^^s@wJ(hpW5M}D&%;1tF0V0(F__50!ioWlNwjfLJHS$clpQF7mn zWlwpB{4kXecv$n1~*2LQ51yc_!&U+h2VqclcEhrC%*a!C6NrY8VQuez03kzsnT z4Pbw2-t#X&KXC(Z;<`xjJVsjm8#UlfV>SW$zi8TW1!xS-BC_smwgtot3-MrX|1XlP zU6JGh&mMVt%AM33Ex2+wk?Dc+Fb%*c&P_KyXM9V9jGwo{@2^~~t`A>`kssrbp~%U} z4G#`dYHDiE@4D(_-oYp3;Np@2gOfb;q@|@D6PYWKkblGX^Obj0Bh!!Z7f_TafJwEV zYre_I#1skmHgUX8c~MbOl6H2b{1?A|ZVdp(G=owGy@BD=_^f&-JU8amkoCWwj5zT@4^b0`drl=mMnO4%( zWu-VB?_2;H{bGO}PtVnLdiu8PF8w++m%gWq6DBFqT(h3aK))#6RdCny$~FD;=~K8^ zd(B_uMaKbH?j}*VGP2j^J9u{->gaK_0n|{OHpK!JNYWM-`Le*hULg!8-%SLmfu2N2 z{az2#uuJ<8KnPkE4pr~nLD4O*=;-K(*jW1NY3Bsb)5%%qVxSlE0lY_<7n%H&a~}b7 zMxj9(giQWY##hS9)D)gOzL*}m9-ADcP7q6=3&(c~NQ5Y9Ycri9wqt=tEnQ3LI-m)o z7pVASG&3`MLmds2m&|45l_SRCl2_-@Uxig8Qc~DZL0PST$zeEPgb5hc{oIieAl!Zz z@JDM`SFi3=;L7WgcsLQ0n7AR52Dn@`+g8x<5Q(1terM(J2`$O1-s|yzObFz@%O^Qy zARNp~#GPr9A(ME~_SZz{V*f{Ti_4<_V*rR@E&}a(|G1p2lx4z|ymX z;)6^T-7wJKw*a{1F+g83+N)bLe@Q6%dtE?p9DMUcK_=&?8@|B3d$p8Z5)XhYM-;c3 zF1;seo$~o!V1`(h<4Vr-wNiV8T47*dyitZun5F~nyNse@IO1s|lHmsIATBLH9>v&q zJ%5oW0}~2^Ndtnz57af3y>7cb0ol`omv1 zt-4aj>+TL!$n)T!-qfzkNlO#Cy0WbXDE~168d0`R8^sk;1i@F=OK}aG0`~_*BqdEF zaQ@3YQ%M6jO=GagxErX@ zRhB7}8Ris{wmPh&{y*w>(FdkxX5qZx{T027L%l^H-5xiN;3Q^bWRwE#>thD0ma9+k zDJZsEGg7_D|MCiNqKaI(OXjb8kX;MF-VIJntirh9C9mHV=r!qHZPU%n&Fxc%=!K4@ zu5RKn+MVy9K04$1%guP`1~^32+wt}izj26z;=8Auy$D@hdn;YVm_IzCM9cRAY+wGu 
zBU({ckv2QuA~U~SrhiT7!g4o#o|YQi+b2C%_QJj4@K?8VZeH$4I8grOvq;xYFKO&Q z$|c&Ab@-(Epq(){w~#0)uy@-YA0LlK25UpX-`$lf-(Yn&Qb~Ju&LvJwLg`IfzJ;1` zd`4#0*EfakWMoP9vb}{irjDm)=dmErrv*bNl8h`Gm)Lyz85oRpX>c66VH3XfE6Ol5 zCSmncqyTyKz&$N3?aYFDk@$FRlqX|rpA(WYt*FEcwyK=gx`>#zzs5`s!wFy2tmfuM zBt+&~ad&8HKPDrNVdvxwt}L%QI(pmFC#x`GUw#lz8(fJs3FqTokQI$gvWj${{N{COWGgyLORQm9QEscyPTC zalXfRyy-v2&TM-5lz;8;C#H8~WD={oE~kQ;Sxqd<=h5|XpUdvz#fjLNVMs|oc3Zlx zP~#lehvNzJ65%2pUcIG(c$h;0+Oydjg!K9IIrLROV;QllloMFDAW0dzQ#f&lroA z#~&$w>poOxiKg8*>3MG_Zry~22I`gXX}UZZ+q;-a7wA0Me>3x4%6??D*QjN~tZ08? zX0m{6+XFkM$LDOrj>olPDrMds0)FmOJLP$}_}pcQU~}rb-D#n+<8t!5y?1w#2X`1s zYz-D8cScDYlIefDvJHDYD#PfIA5 zDx8;W;Cqc{E2-Jj4ND4cGX^%b#<^1!?lrQ>Phn0Q+WPi+Wm=PC&d0lKKFh6X-KgH!4i)$~dwzkTX=Ne%pP3)IuI z_(75v$~1d=<|glE7N^J z#-%87F?h*zC3X>o(SvnEKqb$pk(RBY)apyMuC0%*y#LvuiA|*VKHDnBWR`Q%qPa85KY=A2X5sCh?km|d)j+jN6pDR%5((1vr|rWGx$Hj zB<1!j2VUFnytKCHST0L8_dF5=y!<>5*|x$0Pe{u-jVHqm7PSVZ_gcTX7!I^NZ|be! zVov4)Hy(8;NWXqr?wy963I=ti-zuO zy|OOsw6Nf;!e`OknqB%`p@lkYMDg@FBi2S8Rsl}A*Wu=7ncaxABNSX$rvB*i za1MVj5|lC;mM#U-_(eo&w2q#@!Ne9#qRtk{0pH!Io8C{?>$w%$^rb0tdddT_;1bx`al8X;jllzhp*RaUxw{!S6v+R|=G3Lm1bue8lOXBpt?&;=k@nvtr zUi*Hr8Q8-a`{V-}KX6x$4&Qdn*TMD~Q)*YscK_(pDT-M#N6p4N{m{3|RU~RXswre)Y@Gw%s{*|54Drx0|5W9n`zj3l01^b!th|)k595NV0ai z1v!7m+KXs5!Zhh3qX4tWjKWAHCvtKS2=WM)7iK1yk>HtUT%Jbr#CYD~jae+z#_O3)SoD z4tA_MiCH{7fvp#ZDR>^3$2oKxay|bhLW+5+>*kQ}xh~|hDk1udTxfhOMXOHBy2-8b z`)A&vkc09e9T=f5^!}dDi4S(&Q>PMOF2u2d4qNrKYqV?vdw&Re=@3R{r-Ipy-K;5x zZ5vPQ!}UXTL66{q2J1{&V!aKCFB4wQD=QdoCm+TZn!I;E=>bEWRlCeS=CYVmPotKz z%=jFFI^px{Gyn0K1XeBQFLK@$;F^`)J#4*;7nj?8+hwXL~XvnMwU zN!T?YTD*@Bw{>Hdxva^r2q^!AG0Nv+ZP{@5?2$El`HgIbQ(>V>y;^ zpW_qoR`L!cx#ElaNELQsTb7lFSN^k=F_|U#5&KJ#cZYTYsogiNFpC?@MMc{~_GXGi z{~@aC$@k;_%X2@-+LN&N*zeN8r;rtadU5a5)^#{5T z_8=j-_S12m0cH!=fidSg+#z-q%0`Sn=BDB+UD!%#t}AQ&Dx1Re6wu04QSf^=2I=@a+eS)YJi zscZL64cNn%*zdo-{(m9LEi@2XP26+wY`G!7rQW{k| ziyJ8CleEm8mIpV;E6Bx&J7m{!xLUeDpWTdAunxoG{h*d#Xt%JOXs`Jwx0Y^1Rx{7y zlsQpgx?1D7yEu8s>19=|lg3$MuKeC1*M6);93aQEn-q|9Oab@jlkkETayWwK=wy>`hAr*>3BLA{cf5B;elOQ>*FobtEc(OOvK z1Yti(Is*za&&-Wx{t%D|@idDd0`J5@jQiMuhW)oPT=S)2aUhd-1n(hACS)u2gN+KCsT~ev@>-H6j#+DcEb$9pMDvyqGvDY%5>biTC6Z_!2@^WyD9~_wR zPXC_5qx$^Cu+Ae+v}Cyn#PGVdV{JEJ*u;60W|p_UN*S*;#KgqU_XrePSt5MXYi|z_ zPBikBeSSG7iKtCzrRIHsbcx zU1rvjF=sd+N2~7Y+t#z`%7$A}kR9R$_M4vx)hgW=M^2BmH!$*Go zL_Nt}k7Cei>BOVxnQBs#dc@Cbp&|u@>c^|47r#TLlxt`ww?TyX6MkEmCJJ`zN9`R> zF4l)GY0gs`oApUKWc@Q7R`s1$9zMn6a4f>)UnDykpd=Pgn?*RqY#(nqf|P%aH}(8` z$P_Qu&rZTMzvYgi7n9XfW0fOY&EW>aqi$+veXa%N)3BM+575FFlXaRO<5{S@e~n{c zdDeT}x#XZ0P<@y5ChlofSE0e1?TV6S-U>Yrle+aTQctsGNCig*ox;QX+aHMCjcO<~ zG;AeABe`a#Sr;FSF5f@;shWJ-6n6jH>IvCQbNjsl(Oo(4Y_`(aF@_;Y;X0}&x`9Qz zD_{9!C0O%osT5K4c5$2i(-h@`$52}0t_Oq*QtnzVCe9YX74~In-ILlvw&d>^v$vnY zoU<*)jSi2B>fXE@v%}(=akGhKudX=s>iyD~qN0>>bKaE*Xw|LZ$fX0X=c=@Y^ub?R zBD5$cp@9bAr*W}Ki@X7aA)4CUT54M`k20r9lnG;5GPV)An6LV6uzO{kYCaNeA;b6p zAAiSiOvv}0C9rCmh+BmplW+4DS#W&j`(j}>QLFB2jkzp4Qc}+sRsOaUW>_}Zp2aZf zMOorhQLyh->d%LN05{6WtbLm;H>sg&m9)X<_;w*%iZ%OUiLlMPy=iw6DnRsv z1zbhuq7ixrQ+^x~@Q8_}O2wi(UcfQS@g&)JPckt9A&VSvx*e#+q+q*mr zw#DJVFjfL=zTnM(=eQ;iP8&#Zn;#HfmBg5Ht)fvrU|UuvB6>=S8DktAmqmNS169PgEYFh^8v)OE6uTH=e1tzl~4SzM$eYO6@~jfQIcye53|OaU2~ zuT|2|nOQ0Xxe1v*R)}0t`S`5$vLdmX=AEqQ^-Whqj75@*T9K3Wsdf>ItW8p?T2{3( zRJ+J&Mw9-tljpH^QHcCb%=&qu3)9$|>7rI=ev=XWv~bG$R;Y7TvUXy9d3kU2R;5bmjr!DR9wf_Dwe zA;uPZ#uiT1eXWw~Z>#MK-wB1WXv(oX-7~G3`LM<^RdV0e!ej}z%XoRaXLPS2{q9D5 z<;r21J>TIAl!*eCjQn@t7Z$x$nq(|dd878{Sf&k5k}^_uhm{_)DEJonNn}Yy({Qhi zOR^zw85*tgbb;t>@`Iufvl{1=iKw@})^V$5MM;|4CCic3A(}OuYjlGwOu#+&M1%p_ zHA@pkR#m0B;_oW5-Ud|)ceY~HIq#|!#k{flE+U92Pn>YRJy;#|Ir~F-t~<8JJ1h%z 
zX9bNKIhGS`fAi~PM9-Pu(v?RqHdy2qdgqjr+j^DmW%N6H^u&rRfI8* zj;#qr`jdD%T!@PQYzkgqrAUrsgwd`QBrz(WAdg6T^J52M6N+@{){FE;^V=AR`#G2p|whwQBx~==YNV4ZzC@Xkb^l_g6IkDz5Gaq^b$TWP^T} zaQKEZ?abN0dI!AEbp|l#Xlc8K(OxX%En+U#s>;OW6@mP(a~X3?`spE-Nm;6edv|SacB%XM3{&x1nLYlu-S9RxaTW;qTv%C>B^DOF(|cJDb`_KCl0iL-0h?a;g}TDYl8Z+OGtSaLm_D4RykF9_Pof z90(+i{TaUGD!X(_`{mkC{i7}@pDLjdCulFsd`ym5)^*c9B7d^}fYj?0aBT;s?e--F zocb|;<2g+Q&@9qyq@&1n1poZax9cjIeP$ts^_zdbR0E{8!EQuOIsYET56E&7uuI>) zrqR=@L^MWx=5ILs5+ ze7b1MOnHg?_<9$gMv6es?mBW~F7(lRutZw>M8hTxt;|~TB*W+wAAN;_IVCY5T4_I?D1v^^_s|)b?{yQZ%D-t*AQAwqqo-FQ^WSv@ zm;rzO+<2DAe~oG&y=hvaMEdb}e(8Tk1CX{q$9r|nP*1x=0L7i6mtpsy-7|PBfG}29N1U6>wQY@Uu^nwc zg?&S$_fH4UoDNUt3s_~I3LbfZG8(Fu^-MlqtU`qZ21Gj7lV2PalsW{3hDM^EVYjh1KC!qqc|IlRvRIhxoB}E1)dJ(!Zcm5r6x^#WKld8{VTBK8tw*HP zZNSJ0I0SVM1{kcc&9~qluI3ZaDJmhK4Ia1SJ}dhb4Y%T^?8^b7tuUd6Mfi@&dxbwa zkpHT@^BpG)$e8QmZPU_*XXcj55f8A-$S5ax&!#3tB*D#Kv}Bt}TW_3P6h5xknweyK zLm4*1WCriU-@kX2k_{qKkdDYOa;0Gr(2O{&bb`3IW!jE1e@-z-YFwHeA51$c9L~_2 zoaA*(sm=GvcD_IPY`ONsyW`bcu|_8_4C0iBKK+6`ikp$6Gt`W^QgF>Yfvlos&pG>F zQah*{f|aMGHBysO4OOw<)Ldq@PIs$_h|c_Z_{*x^AoNAjoRVFtV;7>tq@+epRVITg zqiGaGb|^(&MNDueu7)WuvMAY=iRUqCXErv9dL5EO=bHOjXpN#WOlTxuDwYut(#s5e zCXj-jAKkmV3@aM&?@?wcD$15;mjPSV2@3HZ)(kEBQ7PEh=bK!ZhJJ#%LmtIa<(gq^ znfxd)sql&6(N5sDE9kBa%#%JpElm}LYt^N6cdsR8Z9eAXQVjZ*C7D_koH#`@rq*h= zb%bcUb9!G&(=9X6%yR8ez1?qO#PAyw<44Zzlt0?(9w5!8gDRu}zon~``RP&loSSg4JRYjbRC)|vbrhpmGczv+xpx#KUK{pR^KBf`exV>Lb(PRLeCPFtMIsP)e8z_pu6(*B3y zkOo#L`{!m9B3 z7AyoX1)gWohzGHZ4}M`{0dN1zP<&atMP2?G29wXtHyhJ0nyjfW5YK})WNG=-%(uPm z-zv{oP-G43{VqTFw*O~q`x~s)yTQsdP0i^D5tm=&Pcm7(I1_nB82SoNT>LMk!VUsYUx>(MidnR`K+tjW5}PVGM@C zi@AIE?u*PEnlbP#%t*_-FC{3k#H$b<4XNR1V)UV7D_&aEmDwa2a-a=vt%VjW$Wkhf46qN?OpCI|9Ax{rC1pN}SG_GtI{@-VYz;}bnP zS?S1v#EbxMi6a;CA8-?^v=hgK4Y(q`_Do(3czZOzK`;SK>G4A=!#s!ETRKmtib{cy zrOzon+)|AAPgL!#FyK$%vizxddHBBB77fHn3kJ)jCo<=c&Z^i9uMTNVNmBJ zvLu8sm6>fY19@h)N1BYpQ9~@Xgh@E?1oq&| zHpgKXy?kVv{{AR##8eL;eT1R@sGKJOR;DPefJ=R)N zqbd4bvF92qg{e&A-Ep5h4O$zo48KPojC;hpc*qz>$zxHEN9avUW`Zp}jG5LD3I;mXvW+gZ2pi zI-Y6bc-0!nm{4rt(hiLmZC*-ihM6jZ;0)=~WZ`PyiMgzhPe)a6#@Zx+8>BQ1=oX|J z{X;Z)%&g@ek}y6j>T6Af$8%0J)<{&@ls^h5gipoGT*O+Xqy9A9nwRAgSyLjzjf}`z|MraP znKdt82jRP9A{htiiPBhL#ARs&xhQWAUdG_YzTu2|E(MSx!L!RR$ z$m|n^Hp2*gbXS24o!JcT6tK|>sKUJmI!dvp*5y6m+DbQbuOQF4sjHEl=p<;en9*ZO z*7xc;qu%qwL)(u;ut!#8o<<*;oS?#$Dv3D`77xWIbHZ+o=;R#y|S=y^T5~n z4ys1^r0`ivHh$%ze&0K&Gj}JC+vx+Cb&{!rW0!j@$-JV%c@FkVO&lZTL5TUwsNQuS z?m}v@$-@a;S5H%ybh%P4iwR>>o|RSObV4*0&)i``BXWItg|caPB7Jk&^qG#NRZ4q1iw4%r?lGZ@_6_j1|2b_a3( z%X__vn)Y%|CDR{Wl|w+aQt^dZE^mmkMEruG$4j1cDxAj44@wP^=|1eD$4~b(xEy5`2%f9ljc*rFY8>Ob*rm>PJ9sZpW&qa9Ms$~`y4UQV0pSAbE*=RJry#6utARp7PhxXA+3JiR? z%^_|+N9d*?R*3FdN~Dr|Nq5K^@fweYX>ua3^b$*dM1B!dROwG;+80b;AA|EDoba8_ z>cs}|>7t5Jh`fvY)^@Z|&F+&Lmc zrjAgXM-2%-C)8gE>9?f@EC6E|4cW9l^_$k3IIoY+CzI|HpdBcX9@JIT->Fg>Yf&?@+xe&6wwQYs&;3mGz^cr4&cE4;JG8X2NS4##ipQ2Z5XB54C zv$r3=yO>F)0AX{O3-lUYZr#Qrc!$_1$%Q_5$`~s2;gl!TOJ2^($HYDUGT`#{euZ8^ zPO()ii;a4(0XC__$OxA4W=;#t=9IOveKA0)62S<1H0f3otMn^|MZx8UjqIq`L_Lw#z{R-eCy zn{vfJ27?}daBxtDtfEtsm^m&qGlF9?WX4jt5hOo;6v%uP>V@sLrFyX%KW%7$MBjw8a150s4Mc7MA z{M9%NUfOh!p!c$C90wOuPPRNUGx`iHL zR$bsu9-~aBqwqPM=RdJx%XTD70Gj7Exi6b>>pNjCuZ-388{P8HaEHQ z3@0Z!%3M0*UU|NERhl=NdT&@`FXyXMnZ+o2O8uKX0qU4_~>cY2r5=F_G*e{hr>ZNFAD`)6kqW$ zUI(&TDFMN{Z77Ixy}{M{(wM49OdQ{z6mV4K5rIcbJ3*bxD7wyF*>8%;1|cho60K9{ zd4v@LpOl! 
zr&XLvpAYA3t4faDXom5{3z`%-C(~iCYbkrJ6Ee=b%F=gIiT4*lXlgw2cUw{4B z#`|@ayd5_@MDT8hqZ7RXd+*6?a##c3zzVMt%VQo;f;*e|SXIqd43*|-U_`T^D5ze= ztgF&XCWL=C*nl@v*ED&$SADrQcWsHeE5;D3{*9f5*;W)$3QKIE&1lyTuaL)&C1r_e zU3|QPn6=`GpnDh)zaFVSxy-9j$WzLF-0R%tIf_Dok9e}HwZT9j4IN!q{i8_3RX@!# z`&AS$KHX04J6=BC%JR}txp3q3hnDaPjeBf~UoLtj8a@pPb^NraA*I&KV;Yb=E;VQ5 zQ(Ba8NAutWgP`&K z`6z!KLQ;YAE;}=GbU9nc)x`<>)#q5D2MW^WmJoDH>$NZwZ)G}du2Iv^59^&WJlXjF z3fIJ3#Yc|U9Ja3GBc{F-QX9^q%S+MCoy}q0*`I6PwhA?@wd5Wnki$bMIYifcMl>azi1jS2 zjnjRNix~-$34(~aY6=JH5v_y9ki|Zc*uYx4iC>KAPHVM}rIlrVHDm%)4A>VtVNqnm zxSwmQqd^B%k+Tn3>;`vt_rDin$HqDRUHW%*g66wuM5+ zA1mkQ+JjW*7YnvjrXOEXlZn`AN@V#eA zh)x$t(qqgRxtLGi2_9vplZcc~5cm?_CLIT});Y%Wp3X>Mx9`PdP=hR?;e~oHbZ^mA zIiRrgSt&W3e5!A%R9VaO$|~e{k^J6Jwz~Gsv`Zt+sw1lybIEx@|J~WbEOt*|I!l_1 z@zY?z;)2xMAfwEXvU4yNgq$sQ2?q`2Kxov{n#ohk^Y_Ep{({CAVB&_*Hxb6x2&JxqpR8)z7_||MQ*q*mq1lG+85vwUd8W23`=(2 zES=$TtF>#RPd!Khmt{ex=yl0Cj~ao7rM^PQX9kWN>N?(2q-D^;{3MtUHCL}$t#)QPb)3j_d z!X-WI#k1qHYR3iI2Ec?*-v5p~Py>-))Aza>^4DIh9}`aATvuyw{0P?GOqM9Gs+RKi zP}f~uj`jD+j#8-vknV6xtZu526<& zyW2ghXA~Lwb2D@0Amjy5r_8)*4)hKaF^f25V}jM3^iu1SQdg|d@%9i|zKZ3H#N2WY z7m;#^Tm_GIhH3h(SREd@GJ8Lg9X!%QYb;q+`dV)SYKVLF-T#EVMd$%-&Z{kb9Tdi; z0dSU6nkqiVnHM`f87f6RqIX#F8HiYw9xBtxlCgBu`b#rZWqniQsmRumUqg?8=Y4zS zDF%%(vmlj=(_~)deGnJ&CR5+{P7!j~Y|%^9`4b<$CT**jki z?Hgs2br&SUH3B;6tbs~a1Z0)h&_px%d-h8l00YPDAHy0A^J>HPl>T(+4FZjHIYf76 z{}jHTV_r>LXdR{8X|XW3Wsk)MKe^m_IF%5F!)Eq~JdV@ilPuJ1M5L4h%2ijHX&I6m zvvl+iNqDT48fD`-we0ssXuBgHRxaay588GwR(U6Z1@Y5dL1zdC*36*O_Zj##?0NsU z)KgN5*oraz$F+HU#SvL~FUQ7nhbo=r*9d?bpBb6(u&2^W!~D;^jvgiz6?N6|<>*<( zHzy^8N~(%CR8%>lR&JQGSptq;9Czg_d(7dENLgFMu@S#?FfnQAArR!I;$EAk540DEh;#OlnKa$E&iZFMM_3)t1|RY63F)luukGFrsn0}!Tf((zCR$IhaTb9`3tUXp4XPIsU6PtT_P4+T7hXqT#+<>@uqWRA(`X z*g7^!BFO!y{nCBm5+w+HQZ((Tz3W7juP1pYD zG99vdf@t>_)|d#T{z*NZxhX1-QUVdVu2cc~%$m}^lxoO|0S3e$v4=;RwB5Acpfpjr zIesh@u=@|Hf7GI-@o?|StB%2DSd(gJ!nc6}9I9MKS=D8W*6S?MLvH}3KbmesuR{uH z!gJF=D6$vQAixn4NPA zCL>fI8vy@=gk&syw5l9V!&AdlCSI;|>pq~UPmBi>Z8Fy~tp5vGz5gZgt6SHUMe!XO zby5Ork0~pYyf$oloR|#wf{#`OPS3qw0<%4AQM_dR0ha-0V{aKcrmOl0h)B+54DMOO z5GiS8MOAmXkE>|a#uWYH163-M0dY`@&M9uqR$gYyopiB3)1m_mCEEP6@EYmOuK+ks zh-kRiW@&XiFd+T0blkkN?MGKi;BfZ2UlUjSyYz2Mgf)oN?1dWjnV5!;J)REK$4BZy zOeu9gm*(CnQvKmpQ~;BY=XTZp%$5Vpc8&=s`E9(S27pmz@JW+w-gsP<;hoq#YMGU> zCGb?1{?o~J8|dsKmh?Zr2JUUBu=<$hIo1yc<=*j7$12|Q^n8+u;2T_i=@&nMAzU!V zbp6uzGXb!swFXP)crIiYo-*Pm0=#*+PAK(Jufr&m}1Rhtgp{6i)d0 z#-BDDPaqZQ`LuN>VvgZn7tt?t00_JP35YYF8#iuTUP@3IVqYJ%w$X6MRcFj;`xHcS zFoSv1uK^)19hJkXR7I0N5yMiGu|kJ{vw3e%AbJtgh&|fb-a|Q0{7$Kb@t9y80*-uT za8x_h5ejFC?@Gh1cdPHJVL)ci$IwmHGs|`P&70Yx^v6-l*nSiC* zva6uGCe^6$ITv)O$1{Pbq8%i1wRS5F&zBr!xk<_gW3OyR>8*Y9Iol4-DJ;<&acj^e znj$}!-xPP%prN8Rkl-IU`slFRg)dIsC`MKa@AD&N>CkWRL>uWM7R zezGcTQ|!%pPjPslqy#7YtLcs_*V}6-Q94(ZH15i_yJ6PDEQEs@4`!+67w4@4NKAdXB3KV0<#m*(8#+@SrjebdW_vdspD?cHk z70#v+$?aSelXjWD9!Qu(Lw~PGD(9=HPbB#k_1~W?{+mJRbN6vO6m8wGn}|DxY^iQ^ zgtOsW#pTU+{aEwsn?Wf?za?YHk2hbIQGJ2t!_W~9%1{hFP+nBj(D-`rf+QTbF(P;# z{c`}fQuvCa<_D+poe&8X6{<9rK&nuGY2F~}%K;v3sXr4^028iqE!7*~QQrLL$Jc^V z*v=7akZuPzC>^62vl+w)@qDaAdi;DSCie-1SX<$T{UIy&w3=ZSYjb@Z;#kt}g70o7 zcNv`6E_dygqo#$N3ba)do03tmsmbD26C;F<4zkhjTv3tIc^s4^DbFCL27hy`i6R3X zl;jCY_hsxP`!lzh{M&icc!e7?f*4^p{`qMhM}I{c5~BPClE!_Ppxc5E8^8xw5D@c& zKUeKg6yMC|);f{-%3sLP1rNfKX}Tq+uw;Xo$!)%!*F5xM;7jKEy|mUy+mSp9N&%x} z1+IUpnTRtkyMb55HvO9OR|5 zgHBp3iDp;c<^<*cAjQS~kkf@);S`g)Fh7E4R}p~=x}{Phdc9C#lnY>`$IGp1*Oa)4 z_fP~N1v*5^=cOI{Y5=I=iVw;E!8_g)VDNR{&YvO{o#O13v1z4;>^iJWR8(oq1chnC-u31H94 z+ZXDST_0@^f36xnKBe|VHZ-+O3oCRia3u#dgNCqeYeiUdlOjC%EAPsDm$A#7klrPZ z2X>N}krlPOxb2~hg4brS>rQgvIool~c7$J=8EX{-Y>(His#rG( 
zeQv(SPnWV9jb8CEj!3F?%xatu%+e*RMd4_-uVdBx%9^N}`ASU6~*O_B|fs?!2 z8vgASRwmJ{8U_LPeF}%<>CV@fxnjJf*K_M5;52_&&VL zzQnq(&N9#3@n>C(%UM+~xNT4L=8hbt$!htQC-vJvpFh25zk&o&I*8?w8Y%a2v8kyLO>=9B z_mEyHrALEUOS|Mk;2o=^MQ*#TYSUF-YO^CeMu9Alt_gV$)OXDiHyv`YI8kN$g z!~|7ak_ei42H6Xch3u8OE8Q2u&8?tX!3CvH5zhVrnbMb03XpW^`bPGyeAi^cNC6;|EksMp!Vm)8c^O?uRRe(NF?jMzh`myv4|r8b-tU zoX#1)0r@lhc;5TLQ0e9@qgjwQ4DRdg?5Hj!loILaL_65^IYI5bS*-gW>CSO~#Hf@; zjo<^4YGW!vLHl2mBlCx*^EnOqC{A)9+MW%~cJbwxnToIbjcM8sNAClgRE(6Y-S@>M zzS(Z?J5~t0Dp=hGd*eR+lry0tAlA=)o^bTgmn9mAbz)j3s(R)9PhcEp|YLIo&mdbyP~T`ML;^m=YaG^ zZxM4+)p&1Cr$iA$4d#w;`8?dJA>d?v^#^P5p=k@lHaH~FA@gV2icv0+VyTPq@!lPy~$-IH}}qjj~nDT4ec?dfSzSRi#>I$0;}1F+MG zZmt3Jtx@@9yI3m?EkW~-jR;~;wq3zN?24UQ(9Y+VfGLfSgAKKZ!z#`3V(tu_C3K1L zJ&%kr__1*(Jcj*FIPU*q?ybY3T)#EoZ4)BWf`rV_CEbF+zznGj-QC>{8x>)Q!J!+0 zp`?_M4rxgTkPaQXr3D1Omwk@xed;^EbAA7O|LS$UJh7g&)_t#gtszG#bSa;-mQQL) z!{=e(C_|rRHEQud@+yLUVwN#6=lo~l?m{2uKH7W}@Vd&eV8(3s-wLPT<+Mm#orIDe z5Q)!2GqBx~iN;kxvJ;c{J1lI%AFh~wJuR>u$4|^BxRtCYOU?PzUny^-q7dpZOh58* zL%L=}JSDl}i8;1=wVVo}VR(w0GI<1M-eg)Z=b*^qdB`S~GBr5wI-~c)yErU5L-}fB zQ)Zz9h#b)O$@<6LHBd9p^BI^jM}jZ>LxGb7lVlgh@d+Xl?IHC{!>VdxXTPG%c)9e3 zu$E>CCHb9?&8M@mKbOqJZX3FV{==(Yzx1lycpEpaylN%jRTqL*SpVy_p)sQAaJrR| z4vOc)yC{YNd>II&VT33=bZVF$f6oyiuol1X{GFwzRtVVIx<}TquGsx+xrlvrOIbPe zr}?VZ^{V7RTfem2BrJ^iE6$WP76$I+AKHL_y+f>1`LJZBE2KhpNv8N$dol|{^n1^G zhnPv{8yy&&$O)z2uIB0WGr(n@$o~-fy2|Lzyr9o^92PMX~Q)1K0!|`yu}R z3qPv!JJvBBMCQL}R?0`a{e@q?P9WYNPmz7#b9Wy7)8B-Y>DOa0r%!1gBG=N8<>Hgx zKhVRp`1b%gYUKy?&#utceRIGrrEU-OUPT@K%$v4rsn5s0@P6?jlSAOTz?4-Wa=JZF zPrbc>(;@k9PDeG!iF!aq+kwD(wFW1@;-LvBbSTEEcdGII#1unzSIEyA4lr&j{>i?cr~KXY=k%K*&soW*({hHUMtR z>&-qp4!_nqylx|-KSE6V%a6zF0!6lJe9}2IZL_m>oPej9tO}&3#KoF`#~A`{Ua69C z1Y4j5QfM%&MhFhA(BiU5p&>M@c^D=tA5vOco3-gAImrv$*TO?htI>GiGKiE-GYCQ& z5(0fYw$8J;v$AawSLb^*7a1YHjw(492Y>_}rh($K|QC6bN>LUaf)u~wP zpMCRoSN?HT0SK0LNb&8fMAKORX5gS89n%^q~uQOI8#Fn{8nPi4KJqy9e*m} zA8ca44zB$-JLnE7>l~xH>%!kedrYhB!td#_py;##APF<84)7I5bF>GHKjXXRDV@G|QFL(}|Rd~+MAXVUtzuiOWC48x> zm}kEFcpeepFuvUr{4aC~Wr40HyT^V#nT z&m{2!>x_Lpx{UyQjVZ1!7zh!ze}xF?zi^S|&8B!P%MT?Vx!eB;1Y#0?C=lNfAD>eL_nAo<>;(LexLOA4?=zOW{L93YAFnXu zo0dnw@?IL>G)cs=;Jks>-UJSY`v1oVQ)cC2UM==xXdwE-#WgTcqku(cIzf`~kA$h3(m<-lxPRk?g#V<^_o43E;| zBch`d!MX_TJ~ShP-m?nr3$+|ytkzo!9b1=z9KA770HT?zY3sj`j1MpOI3JpJJT(HcgE#T0@E2(ur2X9; zRW@M2LI1J(h9*3RUiiiN3xHD*0gMCyBIL*; zD8OQ&nfhZSrNe%lhT@PyTLAuI;Eq#SZ(nF#o*58p|FC-o)~97BHg$W0Dt~(3Kl>*H z1?5RF$Fa^7;n4w*>%;WM=g7&|Ovx33wWSZb2CFUbmyKtOTh9}JDgO?SPp_-J0DMr~ zgWH~0e%$yS$;!sZ>?w}J{R8#HR1SEbs34OrIV1GK?hvyT1thVTiMsxauL?jsN%_lN zy;7jIL8JOHFUr@VpBr3IZj@R=pq5DmF^+yOVT?i?L_0|UUUcQv42w(N)aqV&q68$s zhrY%Y)!C)1EL`eMQM6Wkz^_m4f{ae)@bc?Ph)l>AtV(304sWCL`$8Cdgic;zSJnWM zazi?x*JVVIR?eVTm2!eR`37!Sl!~m@GtLHQFpnUG8{Rh2ter9`9H1*zf`c8(+fn^n zEs9Q?tya)yId!F5jl?X%35>&`sRINEAKO86{bOG#z7v8>}ZBBLa;VWWy%L&MxLr} zDTDC0croJVTc*4KWHN)wxZ!e*1xD_k2%$*%=xkW`-Y{WgpMa2}b!?a+AL0)Q;tLj~ ztqs|mu*kuKRi+LMz(}ooL{PtLOICcGm34r|sTDM5cr4@EdY1@lLmcA%sSmeko_J{? 
z=rc}796DLT<}R}$wT^?Q_}K>gcxhCc#h9K z4J#|9q!T;5y7dY~L>YgkYMpx_L{pYYXk=aZ>x73+H2XJ~-(M_5f8RL_eMT60zrfzSi05bv8HFtjw?r3xR2;miQR`k${FTw+pDwm_Ggr4!Nn%v3lV)A|fIp&wO~f@zs@k>AqA|ob`)yZyXuA zxMJ$$YTk^MWyeCai>VByS=SJ{tYuxN??CbqlthKw#2&bwg0Vkud z@76N0!w3h*@Ed9&s4XR9vaVdpGtb_?E4UBISc&er@Mt@q>I(DwUkBgtq&6!>KG=N1(E}ec{^028*iN=FK`GpPrs}m zgiZIv*NK7ne%ZV5Bx{Q=n*bhAfY|;AJt;INA_rNDH=;mO7@L?uCN;(s^aT}G5T>ps zY0C|rc^1~65L8cL;)rR?JI8!GV*OXrd+#lRUtQaC4}>Kqi9-n3DNLu9G-{SdqaqTke=<=Qyu-E{-=j+@P)TWADKHk=(KY@UySyt1?0W_e=e6@h4u` z*8WLI4lpos#LR+NCkBLFp+}4)hiVhua`Yn96?>dhcuRVZVGec$`}SDy`P&K4UnaL)`XmxbSnF0CX7}qr8teaWfDG_I2JqN;jetLU zUI)O3^9cCL9pzLy@c;t9b;}y+(RMBFYa`=z6Z&x1>IaCoted|aVp)-CTrkRX%9Dq>VqDn1MJ@D_YGfs6+Y9U|BQew9$Rg#m765WOPzp83 zQUz*})4P*(17Y3~tuK-ZlUGU~DB@A+sbn=B3$N)OWw9)CkuJ}f!mrE8b>8c|dzlmb z{5y_P0W4oi=gUF8D}VGr49I6cRN%FvfJE)B_vZ3tc>wCN&6Mq00U24~r?t))s!{;#3A*Q_M{gg|m@VJ>lT+$FchVx~Jfk>JkJ*e^H? z`9=7&(YR<~LA?q{&Ow;^{sZ~A`-v7q zluNeeDLA4*0Cy_MW)J?2!;+AfAz{8?FXQi!pbX@6)7?z_z~$h6PyG=JM0%osD@g*8 z{(RQEkybd3CT4f@1?;69I9k1vfwAvt%4jueB|Yp%!Fk)+d4{^n{xO-KU*iwhJ}NJ7 zy;Ad&rIQo{!jCoORqCl6Us1pNELE_kf)uJn0X=Krr|P$KB1_gr()v|E+z+0vLm4|L z9*hzZiPHJo8LTd|B2QR%uNbQqcL4J6&shDjeT4*!m4HojzG5}Lqe^}2?xH|?7?6ho z8@+-sCn)5U|h)a6G~=8NWwo`IRdnh0g;g??1w6?wQM+ zshVi{EarFJLZ1XUMY~qde!6VoI!pI!$B%$z$^8XKiCtY5n)T*w2@-^2kPErVB^v@yU0tlt@zi=VL|@+ zA1Jn;cLYBD#ZKU=&2dGAk`5LJKazf@^1IS(ou$D4I$ zzT{BPP?ECd>W85!X)+FG6$wjJhk(<=p?WBQf>p_Uv7B; z)aQSy{9SS^{6o_??xzPFNMFYG?nTeG`JGlV;c3_-9T1p^jEvG+U!3mu(E_RECAabF z$4=YTu?&X*bw+1s3V^X2LXD!Z*1|-9S%3lnLJFW)`lFs!8x3MvzHoR;{7y!%5hq=e zQ?bfP%U3S6hW@V=KKP$kDAu9+7P0Gp^>-P#c>&u&P5ihL0szF&qaP`xRe@)XjW@~! zO8T;EWwjgaB?L499%5!(Mi3eBQ11Xg@*h1A0K zePEjt%JaEI-m33ugV%MaZK85U4+om4pfTl+&QUp$Bj%&Rc%uwhk1IIQsRS5cUxD|d z->1~#&g8rWXXHxIB7B5HNGKy`i|1NC#0`Gz_cowGESMB(SazpQI9C!#Dsc#-Yb>(_ zW>0^3`8zoo83Dte_Zj`a-KnWFub*v4%@$2@+&%<}bLjtszD^1#>wDRB{MIbz3?m{& ze&{RSfr3A%y9KANo#EO`d*{3rq#0_+SyQ%8`Sm#4L%UK^)Tpyg^^%J4C{^08x!g=Z zP7EJ~s?OA(NUrt!q;?6MEiuOxumAVDv9_Q6%7=-veGPK|s<$2hJHoT;R-*T|9mVz2 z)4ejgv4V^y-JyG+PsY~vHis&Pp5%dNX#ak0)(3I~?q`_1iVWOzH;FvAOvq)#VH4sI z5Kk)M1r}tw=TjT<`mAao0ueYszo9pkf|-~JHY7qE`^bhxMA4D>H41j1wsCI4sWrap z%jJvbV|YGvEA`5ii$BboPSIPlXsT>++?Mk6Y&t%FU>C%K1iXl-7L)HZV9oZPch2_? 
z2;VP%kw-zJ{5yr>kaFRJ3 zJcq(X3!k{p)z|3zTq|T<)vj|E2#;7gHslWD%paib2vM~9GCdhq#a<1}U-B?(61}8{ zl9+pM5;kOXpft{ZEUa1?eHt*h<6K&-+m0c7@GpbI^_MMgJ!pr&7cuMCp9I{xME$*4 zIezaix|x4&{D6A@#Gh+Xez^72SCuW8TB5 z{n48SJ5OBdTeYrC9d9U%FuM58ys~E7wV{K*({2`IGH!Gr$GrMy-D}&8^NlCe5+5nJ zs!8U4t$A@5SxaE&NxIx+ftRQRJQ2y$eo52qPlV6)g(<|g{7LN~qpw{3oW_cNdJj(3 z=zGu_oegE0bs0n>YXP1d;J!0BNW|Y@tzZ0c?DDstd2k1C5iTv*tShfEocL$k_c&|f zF|t2Qqd>XjFf_WT0ut3o+Z|7`zo|-1z&vyWkROtXA3g%nD}H1=`m#0>js+s|MIVko zdxz?`@4AootR6WS2+nz8B<*{4tBuRH^`gl?xXRkHKL{9f8{UfDn` zQt`0eo1V#j>oKBIPg)gD-)$Vs9?(`bf7s2L`WBYcN?Gxv<*=*YrH4se{JKjkm+G2W zM;Bv{2&li7pEC(rlA&n#v?G8Pkvldds?ozDCy|(*jGy0k!(q(ltvx4s#A@pB>Kl>o zJYx!RwQmw)f}M#{4&YinM(wKh_Gu#3zznqs2iTkQ73!hE*C8RPewYhxfSBuO8MxH> zY;GlFW^k=cV>p>~Gt?-xXv}7!-+UM=d5m2N=u6`ngxns;tRIOiYocHrl$mpjv(9h} z6_3yqv9j~y!~f-n+d0rVu^3qQWfXHg-RkUY$fxY|Os}8=)iwWW*dhLSnr$#%S-6IX$Pu;u zM*G+ik+{sLlTo{JzIAfec=laid>4Y6J1QM5VC{OxULx#LK;YD*eJAzn{1JHaadp?V z)EteBWiHBx`H?2*eNMSf29=Y-xy6$J?1fS6-zCDh?RmQkvs=yS^(s_dYykEt>N{3x z(@FO8d(x`CB!eEfb9QgkYNv;q*8fyB!CCy^Y(l$HBEWg?)lu?0$5tGl>%SjF@S{tC z0mBRv{WW+n(II(I4Cy`MkqztL*;?N6SqNH(OoYn>iZ2LXY$rf=6Y1FXn|H#0SQ5vr5w*Q#a)7S3qtC0RjpG)UQV+Fe%J#R-C*Z z;j`XHK56ydRJ9Yg%72hw(~H)4HA(5##GzAZSiQ#7$7T2@^#q+fwFtGK+Nt%$_vn7C zr<||gN5>|`6Co>OlK{4`92I#nEiMmhl|)*;qiM`Uc-(?JDNd1yYjlUKo2oYVROG%f zpeRcT{tDV$lPynAtmD~n*@4g5Tub24XQZa+kq*q(8Ncuz6k{n8kM?QRoLet*%Mo~o zq@D}KD}06~wmsb>%&IE+964sy${|$rbdAL#r}!5&Rf;V^+plS94x@7N)G-GV1Ze~l z(TnqTlw7yXNy`vEBTM3mTEucmORG=sd*}0AdR77el+b+x9`9Ohn!^LdbBd~ztnqGR0j~}&YsE6ZMAcJvSn*uNTQm-xaUg}ZrPWxop#ra z?j-Yh1h}XKwu_0E8U+vUL*h^kQbj(B89B@AOVYwv~7PFGfuk&_P!p3Nd!52nRh!50fHs5iM z*R67m2&jM8{u~@k1+~+(7j#`I?@r2LmXL9?dr9=zPSa7a$`GzLbcftah7(DZy6(K{ z%6gCbydCx+>X*W^-Z+Z@fcgCV021b`bUwidur@veZH!f%AO(PInR!U`yl`Jxtv18o?_on6ov6}FUnH1W4Q5g_>(SUEvawdvMI z2ya3%(~I1o7l0@rwr1Cl!3C{q9RD@Zd1K+4K8KjyO%=FlIO5(A>}^2-OPf|ELK?+I zRhy4pnQwHGCxyR23E6-i&Nx>4Al1p2BlZp0-LMIqM{f<+G}1|fPbUZF%oc|T@@R)Z zr~sV@=vmIbeSp)|6{bX|A%RTt)TSuhJmIPwZT}|NhGTV6( zzmyzsF+X<4mO5bZlVqEN(DwafcPd8LjM#j_oTp0(14Z$?S^~DJIr%5m&kvqhf1-tSet5 z#9#~<1KePwrmy#zr8_&=Q83R1%i^bjqoQagGQ|ay6OC|qPEiCnvHlL{CdKnHcT$0< zQ01E`augjM=+pv!vI-v6&n#Tyuc!MonDA_z(YXVFZYsCsA5zGLi#oJ+Ywx8Vrfr6 zQCmFaIfc2j%qldzSnJi+Dkh-6pKAa{IJ8V%daHj4$JWIOp`?^@K<#_n(Alz{ zhMzxgfaEa_dNcU+0&IxfM<*99N*bp}NZdeX2g4bsw z*ol(JXXzI%LS57;8_FM_75+yQZ7i9++vfSj^S_LLDpPpqLQ5fMNMiCVfs#&{D4$55 z3X#CVise#-B#)*JjYJzYEcY%wiFap1N=%>C5mD}0Jr){7$?d2kRxrzz_4y=vqjQKZ zx^p+gVIf7Zn>NEX-)`~TE9G^^_~WV`bc>qK>F)X4ClXe%z!{u!eaHoEWhosH}E=ZNRwV1qyXj}PXI$7ZW7 zvKom9r%wf@D|haZNM#3%b)2|SJR=y1!@!DfBiP_B>Qm26&;w(%J_><@+*JZ9=}(y8 zPK3ye>CijXlEJwVn#t}jAAc55{}`o3O&0bCKzo2k5zY$aQfQBY^G9Yyt9@~cW3Ce8<|VYqs;;uX2QJkBlQ1s+34)n(x4#| zvF*dHnnipNa+dl&)GdM{)jhJy!qIl7BA#~)2+nq(2UX2&m#$e^(W?pTpf0o(>m(%v z5a{83m9p?^c4)S@I0%mP(53VS0E&u|QZv0>ACcCc>)0%c>Tj0ZXd#h&Q`H(B$+2q=~?NYHNA{rw3xYzprz5h2*-BMk+ce4ncyl6V2DJK zqT9*8^KRqmOE(SBsEyG^ZH0VL6uYWiw8Rf(`NvueL{hAI+W?(X&N5>5?fz8gJzL4X)o<-wn2LuTigTB!1?kRPV-R2XM)$T_{~ z*wte!(WC`2q|g4~1YSQnidbZSefeAizS|gGGY%ms$vH6Qmwl@>tRkNxQbLbrJ>e*S zQ)<)6hBkZ?4pw#%DzeT%-#wLl6Q9i9C93>{Uimm5qe#`C74cIy2euezZt=%TKn#e~0B%e33K z>GB=lD`*13PDatsV8ZHzRhqp)Q-n4VWR3S2_}b=olejQIjL!%(aou4y{aQ1*zkuD&N$jwn-@;^E|&s!Kg9vOGI&}q%2Tt;rZm-mHRqCLD)#g z@wL2>S2z0IX*+~Ez@{s<#-?qP*e3>zZ}z>QsI0U3$=*dWPFM%NmV#(SsIo!YoWi@h z0h7YD%G*7Q;d>xj295M?w)0p!=&SS@)|*()!V%f6^`XkR;}NhOD?Qgf&l zKVJzzP&?KJWKrWfwdOfG?+y8bYKaUdo4@M;GF~~W!UrcE+voD2CQrq7A-8s z;y?3qxGZQ8_5v_eRW~Ve`6?#mGLCcR)P1WIu4l`!J$RR;s>ft>E)1pxxh0<=$7f5& znZ}>eEQmX*$g1s|rqHN|!Y}*c&nHZ+dta%#R4#uz-HN#M-0zbl_tnh1)fa! 
z<r*=%i)+lR1*?0c>^{$VS?5>h5C3X2`5C%(ktbdjVJ=~r%oWgJ z#TN2<_O}iLeuUKBQPc<|yW&`HQ(xFi#>~oeCY!g4E5nFfOMUjO<#T4s z(mN@I>Ff4}$3YR)J+ui(m2v!EP2XrP+zhr&068ar_R7kH)$`#7t)_gj4?e3GW+3Ei zyojbiTcg+c*GB|r9$eNU%O_KmUQM7Xp7k~hxUlq1MAn8J!Q4@3EOq5UkXJN6Ga&H& zgys63x3*MX3VcgKGlic0RxojEUjIZ6rTVmYc<1qa@yHFfG@hxEyFCLvu^|_bVT-b{ z8)aQ~Amdi`hVHeM@AF!=)KC29gx<>n=J4@>3j-_me3K%`DLal_5wCdD>kyGM*Z5N{ zw$=WVe!SWoPsp|(8aa52NXm~P_p4Q?#4ex$hz?-0DNVp`P0{kBWG3mK7^Exmhzd#gDRANVmOsgXz9|Zqg-?7S_d_n^(x9#_ z{3}h&c5GU)zdXX5WVCqPeyLdG)B6iRRwnQs;+`~0=9Yn}{hNy69)DGONAU0k$AiXhd83Gq3c7^2Kl$zmuciHfDTpZ(70d+$ z6yk@v0{LT0$5jO>_`hu>1w3KAmf?Myie7X{xpub8a>lDVZZaQMm;~bT3dbC zmCU;oa*j55_qk9I7S5 z>Dq%1%%)I~R5o@LMmnUzUoF`E3)TtH0NkkPRU5X<*IbpM3e;7e`t1|tTB>8NFovf$ z6Z(&-0RAa8&Rus~ZhfM%np8trJE4O~f8ududu*mxNEP$aoT`TDDn?``(fxl!nM7K58=-HC5UO$vV1pZXA zy&ok@ZF5FLMoQ76qF55IByV#-AfJ`Mtja4`VXg1F#H~9;bpwRdjQB&R(Vqkl-!d5| zs#?MuGxDU)XE>b+E1kofZBE}1gr6=v^sU6ue}7ev$@kqp)&(V`$FPRAv{p~ZyD?m8 z;uQ2F24SpSZyNz|uz#sB)DfjYU83J4wI<##h<{GEJ;RF9uC4K#y79%WZ~@a@GB$hO zh*E#Z2)Fi*$`P0+g|g)|v0}`>m)DeDH%)}3Su02=>l8>owOidkTMUp47!|IQ#`a`H z3!nVrj2EOjP*c-FR>pB4+Wr1;TM+rm|02cnIXFgPfVMSVZ<|7@a-utTCk(6@*U>#{ zBza`Ur==O@J>oLFg4pd*^wwBXP!13=MGLAvYR70b?=L?uw6jf+pBpq_L#60eKQyUQ zSSlTe|AZ0)x`GWxMMwB)TXWwXeX)2f&0>cxR&+K7j+M_FyU|nLgw>o8&i4SyQLV2 zWFH?$fr3M$D8H9}_4B0i2r3U>?R#uID&pPbE>8ql`BXp; z^i<;K0)bku9Jr~RZ&!(pJF&}R?YV&9S9zi%u41EF0b?wS7@rZSpicQ6Z^`EodYc0P zXi5}omm3ac_Y1O)oY38oyJ9S3jy8^C;azJoU~l&f2Ijn!l&4fSXpm;dO2Co7PWJ7@ zB=Ap;u19&LWy7; zoZg%~k4lQYcpH`y#Y9zjacc2)FLN8vSpzEbZs97#f@m0@MOn+@CNbh+6uQQr2x>TP zK(pc*N6+k$gHt*MvE(4bP2z2cAcab!yVg4QnhRS#obA#~ukD&FID1|y-q_YWb3QJr z0m|+OVWn7{QghTfFP*r-(p*vqA06rE8rkfdYJGHn+?4>z&Ht+k_tIX(@*=|X8B?TVXL8#tfWRPGTE--PyA;gQO29~7(b^8x?89Y+uqzcUo3+fo~2c0NCR6#)c z{H8rz_^TQMi%jteJ@^LJV>qG7A#Mw0&77_u-`|e_CrONsYDlj)jC8m9E9%9*KMK1W z#_P(d4EN;JdEWyuf#6iI)n25-V|y&Cd(H-1o)BM4NMk76d?n?f90Dr<%WbfF#aoG5 z>b9-8uezkwJHtD36}RfgAcgIaEtlY-8Bhkw+xHe2Ri6+GiXGa#O@zriH6}rbC{BG_nSJJiz1+ zimh~F8!XRi7e`D6hlC(W2Oxh<5Qj2+BAfU+cXU1n2toS<7^w0})_LEp6DHw+m6Mlt zL>GJ_bm`l~C_>w$)fWS! 
ziH0myMli6+ghP}I`o65-ZkDOpwiSrmCZ;H=tj3Pw4xO;>l;1Cn@X3l29uixNzvUDC zU4!``Ywe*O!}a3f5x>s~w*nS(Y_1EO$XF*<%<+oW14Yogoro}6yM_Ada{(Hi%?NO-TK8r z>GuSR#$k%Ty$B+$JdDYEoXS{*HkD-&=ss}&LHpXHR70cGjVQkfdX9?3%bTsPTUj<4QtE&F9{MEYoA zWjPA1l@qOGgwhGwG5{CT_iJdXV!~x)fS?GapC712=;7UN-u9EViGo5EbZg@YPafznjm9#fB7r*DA$KBi!Z$ z5>+s~onb8!if0(#c&4|~u4+W6PAZZ<1Fz3hXsTFSJVon`TO{d15iYf@PfqsKBTdc> z4v(L#Vqv8PgL&am_w2~{`!h5^#_E;_U#Bq35{4&~7EKraqAUb|sZ4v6o43Vj?@*Vj zRC;N2H^gsiK8uE2BbP5J()Y<2g7g_2mp&Zu1^QzSi;C)@8D^pWR(_%Sf~-6$49?U4 zLW2~*t%l^7KBWMDz&yy^h~lS)u63oncMaF;o@fq*3E9pS=`sb%C@LVz3q*W9X;Q#b z{*Y(0b&_pK9O?dL&yJ1rNp%k{87|d9&X7!5?(%TR`pSy+v3{jU=2b()6{28kwI|YuNg&9t983 zk-a%P`)Vf5i99)B9Snse)J#7avP$d=GNGlk3alK|ntDNxJURU`_N?efC1r3H5j26t zXDj!1GI9?F5c}ipsNMBMca_9GJrs;>BhE!#J+05LBv*)cS)K&ys&Qs^6QEQw6gRCd z$SEmzj+?)pasIQTEbCJKdctE&cqM!+y>UGY5XoN4_|wb9HZ14Qxls={X+X!81*n{M z&)WB`lku?D+31(7$kM7ZK%zwf9qZAS6w<+3g95NtS4k%95h>t2rKQrxuDWV#w*FFs z0+H-WuX^p@yLnL~dG9K*?OB2i$EGy)n$bbeN$hh01#ZeWQf$}X-vs)r0WDxi#GQZk zC5TB*ho;vb5>5xIt1~POa<%t#xP5T!0Db1>cU?;jCOn?7v0If3P=45>kQ<277}9g$ zyIh$V5%9N<|LPID>=^}L@PU7nJkb=&xKO@@+;oCMi`aiqyx~*P1ZK#99Bgf%R@He= zJW|zj&xGEjCo>&%!cUC;)_nsST3kT=s%H9N9QSts2`*H5TbzA+@vFy;nY)Q~l!dGa zc$x&6&jjU-&G{KX6a6gmKM$F>(lTH6#S-hdqtT=m8XhYvalYR08zn(gm6o`g92sN( ztJKlsx?sHw-T+}ZPRt)*PA(@@H#mZEEu|jlj-#UidTh8p+2#U6f&kG4(9Liuu>3U` z*-~vdh=pA^h0UGq82J0Xy}2lGAk&Ts^4q;8?mFiXe5aG|N;v+e<>sXt`43u2h2PxX z)9WOc(#LH{f}Poq|07n+?ILh z)!pY21{;26=M%1qob?IP$+0VOLIVvMaJ;5V9yZn(Xu>`o`nN$HS7-PyHFZCY{#Erp zFQv9Tgr1G_x*%XkOnP#7YlA&0#iEP?t-&zRzeNYE{C_wpIQ6Xz)!JSxazU_Png)#4@WI`SQ?$- z*-iT0WV1#=`_0C)X^gRiQ-frHSIPp6nHTf7R|5Kvta|?W_+WqolLNJR{`NG0MjrA_ zjs(ia(bJ>2bKX@pzi~`lGvipS`|%T6&P|ktMI@jb(qxXdm=6Z(hqD_UVG)WCAm*-K z`7edxE@NM`BLG0)e2!&J19Z%rG|F<57FML+ri?DwInt`_U8SO(J%NkjR#T>?{2Vbm zIBagH7bP%PPaV{&P7KvbdKJAc_faw}0-C`>HD+}}s5xe6VAFw+aq!;j5FIbhIIosF zp0Q&HNYXvw8ST<57sX{C%Qu}LRch~oB3-v09+lXUKr<@@)5^}%AUdr(BS%(d2PgY~ zzNnns@IIN)dha%NEseb>VW-*kr1=aOb~9=w_Ad+{a5h8a7^?NIXn(%;Yol6Hx~y93 zW(RyaXnQ^{4^@?qza8+43YjS$rb$F;+P!bVE6@+Q8+EwlkH&4G6zTOEaSAKa&07-(SgmYy7`%Ix+ywe7?{1jBb%p3^t#*6`EDvd#LHuk>@dW z|KUTY?1%4cdv*#(Bu^8n(@56X{c3l=52ffGpkn<3&MtlA#AKRn zwllSSL8DP+0YbV0IhD(n(rA}7J(B-m@>nq+LbcgdUvEf+8xvX_sZU~KUgeopRDa`M zA_@L7SN!skCMiX~jDoANn!Q1k)n(N*PTL`>pmZ*JGB0PPQX`x?9Ad70S3iRb!G@4A zwZDhwR{eBHp#A66>X6|5N&?_)=6SiSY+IB5~`k3r!XSK zEv-S4A2|4KtF;foSMrDyVxT7@@5bQNRrA;by5NWMz$8J(I4#5W;G9fsRdrA(AZ8-t zsU%}hbmHG_WyHsDmPCzLSuq<19f@#Id_cojmz2Gt2q$kMGwxx#xeB|0_L}ZG;nkaM@K4b|m$^P)M=-WTRAShiULR1QuR}BHwBxzLhWOS)WJkM(D3QT;9=P zOB&_x`HIxg_kHhT%)W!l-b9nA5I68kQF2hblQM)8!29M>C^-KdH*7e=C=zE8v!ZM| zv>W{^AbGE{YI8r~7^lnhQk2CPd^+W>aNcTn@}#!Hi5FfgAI+-U`GZTdDPkalkJuqT zx-YH+7TGJK*d<%(W%O!BBeV})#llrX=4LtV03ScDuL5}Dl$6qsJKVsSG--=@v7teG zG((?^=OnSa=l{drTgFxWb#3BOq9`D(lu9=u-6(N5Af1vT4N?-)Arglaq*FMD?hXkh z1nF+1yIcCd`8|*5ea}2I^PhP$FXk;DKIfe8{_ef@x~{d>b*aVIX^=(VBhn6fFiOJa zv4}mG6&|0$aD1))GUu zJ!^!ql5kFU`o+HRFldvXLMuAaYTw#<15eBD7xo7lRnzJ#1-&Eg%6)_O7)xoAvN4a} z^an|30V7H5klGElZGV|Ge&76#05< z0?2^DwXsN#@vYpp1-J@oQAma?Iq!7$;vS@yDUDG_PC}A5n;7|%o8R*?78*8MRLT&h zd6&tcgUVN8wLeKp8J7WDSy9$zH4)_WaHH|z#Ws4$$V9zJt^Jy6u1+nnxVX5)qvLqd z#KgqRoSa7VhpwJV8M2t9;kxpk1%ekm81qLafgRM;U_5`HPBvL6cyrUTTT=9^!D3AHI)lWpn!F7e$IXRgcu^HrGer=2&DL5Bx66*0rR_vA z=0B6$7F5r^lqnNlDiykJ&f(@!+A=-;Hc*k3JLT8cwuA`zh{g^Kw+|Yu;!n6P%*Fw) zTg=aIy?gdhezcH;%b1g}(-kN`=0z%-n&RTxwu18rH7B+>p5jqW znYIRaT0Oq$0Cj^2*J4+tG(~5?JU^Erk8XHtDP^VB(ak+Iwm9R)P{AgSv8i&SA>#{poS)3foa23l4qo~+YNVqb-k<8r^vGFeOZx_ z`&X@$jqGNvPoW>X4E?=&Y?pce=iMi5$l zXmoZqdKnxT-A$osM&`DVBHRo*`wKR17VIdc+=FPer%R+d9!8xdwyM!S#Z?QiKE9J> 
z7{Pu|Mfqh#i$8VF+|y#kq4rY`Qwli>O1GtH1dI}=Q5C9GQ9qyDJ7A+urAj))dO?m+ zJPfvPHA$`#GtZO zf9XEqFm4ee=yLec{Gh(aUouj-p#Fl-ecSutK6gCm)ExG(NGe!gU&ryWwzuaz+?))I z)^Wc5#bM)Vvb2m$bGAy(Hn+zfw|e2LZy#-SHk!c^4l~ehRn>vd2R1Oq%NVTuWw>J<$ApWI*^38Y+DV1a0xG!*_!vHpTgV2a0K(4Pfjii z#Ya^l8KW&5zL!-<4fJk2BcTg`ua$i7E$J3A7YmkxU--kf+>gE( zdA%eG?#8bvyp|HO!f8C5BuyldFULN-Lb`vouAfXh(m_g6`mxsNo`uEwzu9^wIvQ?S*fDB;qzJ-LJIEv$s9#(CryGCfIxmjZc;r-S1(?#KI9e5 zQ~&a-Ka)vYpGy4|<;&Or9zOe3eC2%;v1EydE5&+^Hz#ab$SP-iNeUfzUiWkDJ|uut zpU;H~hIG4K*f!SHU0$5-JcHdFXcdyV{lJpfb}Efacao_oMR8+b6{LM#Vh?=egWpVM_>5pUlWf>vUk=#j*L)=N<($)%^eeKnzQwO8J zzsbDfL+pyd)F4IW_gG=zlA|X8t{0Mo5F;h$i2mBu3@d*A>&}vd)}CET@_V1TTgHW)(JA4Zu|i$-i%emWv2PEG?)b;Km> zaHGX|yBNFg6W5lnU@(`$jxEicg_fCJc~?4T3e!U@{Boifa8tJ1*B363I7b1L*~$~C z^=np2Wa4&$+^nQtwg8`_BRBc+WN2^Skbi2bVvYXZ7kNcxUoRv-s8WW#l7aNu273LY z*DDnBONlZl1RF>xJaI|0k_(;Db^FwmA!nQ+Li;J$Gbd{WExQZtkUXFir7W8y0875u z&+tO;;A`O!pB%0` z1$3n#KHh1|7Rcyv$AEoxdyynH#DdC)>kLQ>>*>OkpTHWbEke04D*r&5k( z8kI1u=sQuHh=K(B&iMJj{_=5B~D{GJ@bv<0oH= zr4$7X8|Uoj=2ze}Cnhkl2imp^Ulcfvy#z?_>b6j%COV%2))O4(th3Q7+l(x$dASxu z@7=w{PEi}1B7sV|+x*%Ivb4_5&My+-i;Ih2xrh@B-Oruz2?;}D+n^7FhKGmaO#-~U zkZj4_M6^{@3{e%QJ(bjfkUKW34rsb|s&3v*DRQ#R;_{A8GG%JxCn8I z&dTuK+|2A_Y@|=bd`hXyx2d)2q>8jOG>tXMwV0LEgOB@lLnhB_3uudjs6%d|*ZMB3 ze2;!1Rc*QU*dVQ&z*N~FwW9jeeBoU@foYbHI>#0t>}1SDktCf8o2aiB7owAOI@C7= zOA^XCv-!R~bx^`6rJzI1w}_|LXz_=_ebfhNJPMhJmg{C69@g6foaLWWoH8>jbzUjH z#oe{Fl~;`MD-Qo5yba6X>C=!96iggBT-+ORd40pD{WF#t782F z!R2D$q8Lb5GjMe|(hJjKC5Fx4H}S@{qzqI1R1y}iLVw`M(((qX7ybDB1^Hw4(kUe% zQLlYuXZ}!{bfwH>wcg#OB+DLwqazCnq{4g#sIC-*{U1NEX)2E7HB1Bq)8Nui96+Q= zt0pXKiq;;iNH#T}Xh*%f^MnyXO7x zhlrgFpfY4bLSfiD0G&&9tQ~y*l7T{|Z2?O2?;BU1^eQz2gM?fJE_E}(F45To+;42h z=0INavYcIklvjM7jNj=NmrK3CeAn0atJG{n4(=ED5O-YTskz4?%6$8W4)BsMWSzK= zGmj8;Z{NP{6kK_WDQP#2qi%7&Yuidu7D|}%NVq(l8=M@7gpy_TgsKum(#NZR)Nqv~ z?d4nQI0PzhtSF9Tmf{f|FTHDNJUMOktmG*#T`vpP-O3ZaNcS<)4Vp2@H8w*8K>$zo zM-6_Ta4zO|naPxW1Evq@Lp*I^lnd*7RUa&p&sf&*hw>U~ zbSNW(6?9)@2u)BNZZ+HR9%rVvn@=&6b;uWJ?U1OJggY{UC0pUy*L0y?ZTB{Su0+Kr z67}zUew%w?ga1O`L{Mn<)?yOred{(sv*=R=G4C2haa>bO(&TP_yQ#JbsQYPMnR=fn z|HI}eM^Mc(Ja#3g{HR4v;)?4NL817lGu&cz`0gMEu?}l%ee$K?*leo@x!Z6Ygr!0T zHCj}$CJnaTjt^Ry=mvvKx68k+;X{@PA?`X1__adE`q*I-mIX?ic%iMov_2KfI<>R% zUZy|Z0!N+84-Z@5j4H|9hCgJ8%b$nOW%acF*JIcH!}u;P>%S&DaXo>e4)R^(FV~9R zAAg~`);-%B-e`1Yz&1P3+i0a&Y%J$(@e{iTSz3OrRhI;trDORF;iYd@po?*Y!F!UZ zRHXrK$ijuC@x;B+r4RYe8bZ^qegp>^_JW(zzw1TTLZDp4mn`vV`1$WY-Qz@7jTf6A zA@iCGWDT!3s%X`{%zzqn&0jVo&(e9?jbJqQJ|Shh1gUgD!Yp0Hm8vlkrajmCWN&Z= zxs`9@A||$3K;BhtO)25|#9w&_%~j>=d1WQrH5K%Ny3lqCXI;pe$@*Ah%0$)acJ@+N z!z`LD^-kwkJmtR3<~sMS1$pU*WW{v0Oitm#YOrJXK|F0@e#n3I zXbPl0$q&MSU~_8bh3&V9X*2v)MwC7EP}0F z*8A%5@#niBan$_trHn%hdY!({-;8MmtgjDz+%kUNsN57by!O6-&|AkGrBAUWELO+A zBZz{E7TeeqD*Zc^qYM~Ww4U|yOK$3CwQT7fySZGm!+PYgBl>H3gZo%`_pjrF3nYCN zp5H~`zux8_mqA+an*TXIq-a5K_YIKSg53aeRzT-op;-ZPgp*+EPC$;sE{=Am$a<4M zIOg^{0uWjqAn3B(7CwVN-wzDjWy-I{;eAQ)coM;boK6vPQ_F36UC>JjvzwCJg-Z$} z`g7?vvbti4ZxI-j@b!Lp4V;ahfX)5@4BFcLMvGoMS5{TRIg)RnHu$##J_+9hrHfj4#@0=%)R^)8~#=Tazvv$7eB|;@30kaVu(GK&$RBSzfD<2Nh`2iUFDTn(%aa z8r1MG0O~?hFIByP{$6@5|SWr;unO-iOIz)?wTgP3W8;eZX8;tSkE!AAK zVOH+XMmCQ9x-NQnqRs-vLb{)nU-zz`;Kh}~c+r)@_{c7~6VUJr6N|}aiD;FVu4xX# zk)XIGP={nfMnVA5pv=xocLTVl!Gn9<`5E?5-Qyksc(IvmiZ-hB8}2$10jm(U4SGqcR%xyxWpwLzan zFDV0cI%PQiD;t7ciK?@G_o}_tuamU?=chjU>!*U{hTSb|k|F)k_)Y=UWQ~umKy-PN z|8~W=Wvb(@?b5eJ6Nu)K5RQ~vjHZK%7G;Psv8hybRF_+N@)&n=D@$J4r-X#}LtW?A zm2qKFicS9TUYfYlWNA|Rex!H8V_&dSvMh*zOGTI+i{6`O&$M(EOq@ksTp64NO^3MM1li0Zu3Dmt1xK{uF%JCTdW!x*3CxCe?%ox~wK|g@# zeYMn0z!v8oTaTAgFl%c50w+}5K?3zzdTTT?jHfNq1TS9l17Vo7We1~&@kCQt_mYqP 
zXQkD(^>05;jC+=NDEnHGq|Mm+6w~TLP?LEO$vOoD528k|-UWxLa5T?b7>DBRjJ=ei za{^lg0>J@3^SP??ZGor8&Tmc?!r$3wHTB+){$6l>^duk6fQ}mKiI0V%*MJWOs*iN# z@aH!8nm;MKfgWi1EpYn6QldaE;I4^BtWgmdg+{>2*V$Ja3^2 z$mm7T$Lm&6Th2S*U&FB8sC1rilO!QcN(Zuw+ZrxE-$TT-pqs?=OR7*ZIK=~RvgJsp*_{#jxcabJle=AK z0l%wzo~RKwnsJtpey0JTC&v40U_mXlPu>w~`G7O(;GU7*XzDSjo%JI6JM7+R^!AMe zk5MvPk06B5iA2Gp*5P@9cXOE{XwOsnMQA_p!B1RPn}J?8{L|3On;UK}VN43t?Tb?Z z4QIq4SbXE{@jh7`woYBSea!Meg@F6J3@K*iiFM^6G{N#GM$X=S!p=I(k~Ws-vZ_8= z-}*;w@A}CK)2wZ@O2q?J-|y!fK-$v!0A6x*-+|omo~#w^TlPi0nVL2EkNmpxdT~(G zk%0@sI6htc6g!2EW)Sv8MQur{olgwhvF?6pb1Gqc@+H+yokaLrVeX?y4Tx6FE~t*~ z*YVTz+2^7(vQCRcuR0o9TBut9laqXGV0h|#ky9}>kN^Rf7L0CcRghVw03pZcz{94d zNx}A%rKWuAvSDH6?7AS;laC16j^G&x#O%8&jXP%}8r0EoWQVdVI*x z$wh33*TPdRZ;+_D-nM0Ac?L!p2y36I=`uTzx{4@fl8HM@fi&_LFr#Y$E#!wrr4H+j zoBG0BSu2Z?x&h)U+!Vsb`J)xX8lk>X+=WL?D0FQv$!vjhK-ErLyo=4e0U6&H7eDQV zvS{5PnKjkB=5y%3pAiYypBvm8j-fZY|4e!^->MWOhuX|x?yKB+Y5r_2BErBa4RwsR z8tmqz`^^fDEJdIN4Fj~G;kSut&QgaEYfLpUy41gqK)g^UwD?+W_)t{UsA!+=hh3?H z;wLhJW6k|hIX%@MzLfxol;Hgbn|cJ5fh*!bJp9`F?7iqzWC|Q4Pevz-AkIl*HU<)` zXTtw5s7gKlsorJv5hid3rkGRN4i??e^^O!t@`{#_??Ysc8(-BFB5D~HfhvaeY(4oo zUeZ+g9olY=K=z1PRV&sb_f-ei?b_l0*}eTl=?Y6ebP9lPYrO_~`}k5oJrg9-P12;C zWgAU7F7-4x;w)J&O9)9=&T=K+KgD_aBOPwvIbOK5+&!~jswca$V1ow?6Nq%CjxKwK zN|Y`}9-+juoS}EHXtIVWh?}hSTl0bHNk)iGw#P zoeL!Qp99JLt7{m9Cqk%n`P|ndJ=;XCGB0IPQ;lMLDALki6_v_l;E3%Rq)(J}F|wW= zM5i_84Zbs76B7?KOF8Z#=JWT6?C}qF(jk(pp#uxNutao^TWeqGamY)+VUc6+=ZoSg z<{gtEgI9!{^}NwkD7`72ZTZ|CD5q#oPHHmFE@P7BpHc_d4t*B@oo*&UUJGj1@$@!u zx1)Rfw@IHQ2Xd{O>b{1Bmt_`5o22xo0m+rtuC}|Sa&|s|sAT)^yi5Co#HONtxs)O| zet|ln4*Joc;#q>$AGsoOm!>A!)lr*jFFN7J1e&unAzGG?sb(4-6FfNJ=^^qr6E@&A}xpZ)Eutk)4gFE4K`Gpe{rGkY1M{H|*oUz%MNrYTM z9ugKzLP|>NnMyMz*rqyLp8I?ES2cBj^I3m*J_}i_sUclQj&Hk#cW19Js5imFiJyci zMHigY;uA4pdDXtwCLel3*OcCw*4ibDdF)QZ@ejA=)=*5=vUWjH7F2$xe@=1DBrF!2 z5VonysU)x@7L{2>wC3#{woTU zVMzEIjrR}#2u_SZv^ubK^8uD_7{LAqkX?1p3dB|_ty3>vCYU)sqkx7;@%19%uxC*i zEfVX^@9-j`I&LgRT6`A=1%+P7j8A)%CZo;xq{OkFR#T2g+V?K+!yeYMl0N}X7KZwW6`U==}EE=r_nGZ2cD!Cy`kGYzsK z3^I_}0pxx4>Sx8_FLH6AuS+VV#wt{^9?w6)6{gM6a*m=(-Y5=MOO=sc>Sx&Wds1%9dxYi z(+YQY!6U_uT-ec=mYG?)Kjr<{Oa)Vhpi=EMf}5;B&Y9y+4f4sS zE08kZyHs|a;s?b8kW#52ZXg-}_kWeJ{|sc!z5fAH0J~%CLgO^o@l@+(!nx3y#yT+S zJ;U3g1*oPuEV)=B4U>5;8SD(>6`O1R=#|lxKKhk`ti%6G*mXX3*Z=wgpn5_Kp04*_gk9fX4JCZ@ z|Hja)aQ9KD4F)(5JpaJ$|0_Rb&5jT-mLvsR0-pLAtG$E$+;_WFqH~*12HSnhcK)g# zHGvJNPy_M&BgP#=_z$n};;-c@{C))|t{RQuy^9iv@r)84MM7_H??ujVX+=fc{brxv zJH-U_){!- z<#Y}3q`)7aLHs+~`^R+4#Fnye10NDb(>8bAAN%p^726_V+-P+j zJ0%~9Ob?Q|fI6FpsFV~&_M-ADn@)jTbw$CI6V$&;Y|)q7>=%guwkUzn`Y*8M>PO?} zXujeDQd7f${Rhk@TdQ7;D-jX)##zh>!X$e}Y6%w8#DDH@s;hYi)cdFoe->otc4_&$ zl-d&h9)&#U$HYXRvxztOi%U2Es}ve{=tw~4w-$gnDJIs*^q$i+$^Ii=dT|4HhVV~H z4ey-A*8cPUqSRKT(}wmdNzvr6MpR!T>2a9j>7OV;>C;-RUo<-U+TYUZ`JEH z<<3KX->d-ru!5KOYjW?$J2e1*>5b#X@ zf&ui@v#WoXDjE*`g8xLxd#oT{?@)B_UC;ZxU_>SlB_hvKxK~3Z-J}p8Mq*kE|G@&@ z#Sd_)E#c+>A4YlL#(Bk3>Hrrk+Xu&r!}}Pk8s#Y^m!3^wCa5O9Vt#z{OK(};J~h&} z84RXl)Og(Y`GqryP#;y?tcj%2NbivSjWSHu&$UaBCVt8uPlhp?9^5Q#k(6V8c95I% zq+nNZ#TZRdiT(L2CDKGqQYU5NmuvFFHJBN37Yr*mEPVA96{|aL2a7REi0^19cUF@c z`m{Gn@ddy?X0_b>{kFUwT$y(IVhOPRG3|Vf3ZQN8k)0las!pK@xuox<^x;=5_!Y227ya)5yZoPcJg%$t zy%SYR6w#U3sgs?Mb@B!9FsJK;BUtOhTx2-vV$ zK-8w7HTKJp<#$QX%-)hOMp!&~OGn8(g(_7zlSMN&>bjbjzY|*BO-f(+>uj>DxG5$)jE^< zsslvG|7|w^ifJ1uM=4mZ!qy36qo;q++NsW=T(Y>qCt;cOj(vKg@$dO#woQ2Bt8-HGBL? 
zIzWjh0rgJ^eu8!NG@kh1WI&6^En8ea;Gyax@tZ5t;5&SMda-q7;OD;h`i+)7_HMCp zET6>JM{hzL_9gKHFdpQ&D1&;&qeuxIsB&`{;8g6FA9FaoVGlQN-1XpUpx{)YOb(xR z(CnS`AZBq=i$$C($Fdb$l7o?Az=N%eXf7eG3+#Hzm%Wl*xyh8}9-U=_ND$Pah-zc24Y^J=4mK3>Y=4s<@YBnST1Dg0v*U%I6X)z47@x{LtcdTcDbr z?D0vZKV|f|uM#Hfv|WF@$<`#xK~%n&;fe+x+QQ*Tua)_8rE%F$5(Ua;yB%3024cfk zVN^eml0cM?)`86ZR_&oi7Exp{TFDfB0+GLeJ49^@YIih9Y{TI(WCUoo(DmG>qg_w$ za$Z%#|97iUjxb6;v0b}^gs)lOiKs9q$SHj=O3Xig(!T9ia>uP)0w!Vh?R^^w+ZR{w zRyctT5O*nPzZK{^#BH6r!-{(vPGuj`7Su!hk&ZIHRz|D^8bOpVI+)v@cG$%bCMmiy z_#Ou2(3nH0us_4-6k{_F3QpEr!lKJ0Qv(=Y$j}qO9E`v8$bCl}tazofU?@RHO`CzL zfAjzFf%3HH_ndlq4rx~7bR(6mHD?aSUeKabt7!YCAyXf>7b#PDNswWt`QGus?0un< zVY*k-Cl{$@&s@-EPO^@Yfqec#=ErlLMO>!1cPNXX80AhwjT!x*nytB)w<+vNpR)oM zBtt9(hRC_R^QK{R&i?+(h=_=w^&m80n~7-@sr{j$X0?zaY_|1<%*9{8GS{Yw_qtXn zf;zU3FOm8pH-t-lqmyP~+%rc`?3-tn=BF?n|AdaG zAWxD8qI&Ak>H5^=0C-A?$%#M?ym?>Ae4~=uNBiwthCU`mE7}r#G~T_BjGS@4W+c?d zUkDvDuGj+b5_Vh7tezE={S-?w-JnbECYVl0W>-^0ijpc-u=KRp#%NY}f|-Afe>x11 zgZ`UK38aiR@PJldfabsPtayV9IQ2i!ybS@wiv=1K){UO`08mn4ap8}5>9kje5oqQ2 z&8Z1NBg@6(DmG7~=)qh0%KhE@6+U4eIX&;LQbYL6z)f+AYAOs1k)MeYq$@Rmg(-?s z_lN6e5L1`@eD#^+$YVzISL+5hnvm_^V9)Cc>;)m8UN7#)?SH@?FtYkLuuaX@)GI%w z+_o&e;5mdF9{Hb%HzGr5Ty7(I-=6dw!4sFsqu9k@Y$tmcs}p7`Nl6xe)lJk0L-*Km zIXh0{wz2!bb0X~F!7a4w#q8|N@+S^{d;-1}*NAQe=Q@KEyK-zE?*&}Kzg6@oVDR1( z$NWR}K)CGz?F3*w@Q5HYJV`ms8o>VK@YPZsC6CI~ha{aGl zFH_6UtfKk*{J2{JRhtx)`P#j&bMlWO_y9AD&-j;^*jDU+5tv#4b403kgYa_K9DJnjMXF3sq`HT=?RrPzxVm5NQPg^7y)`f zF8JR8)>VJfjIqSLCSWpl9cy`)GzJRKEGOu93EPs8^6_c{ys0r4bM`fdWtd=E%}T01Cm_u4MfEOTKnIenPimxXd&msju*IX&XnP2M@=tDyAcV@=R z|7ZNdt3?34F@z8QM^6(b)6|QCEey-hod|)KVKz4P{w^7Xp?|0Rvk|}IUx`5=k-z&x z*72fhS#nQU%JrbpJ{DgkO-n~S;^d^xbbFlxzR>C4G2{PFs?9_OInQ=pmxEv1LG|YD z+Y*J3(Q8&o+rUfC(cr&)9#D{TO8@)S2T6c;!0|7V4UdpL+ze@TMh;a!i^GEmyOM(! zq|FK)nrETIyx=82A}fCmJP;urVEWOA0yq%@s`YxufvmGT0&>lxKJw}XixMp1yYbh- z5}O{!7WB0e_@(}tnsV}e4p&}7R1Wv7;II`aNPL8Kfc<5_BYQ{(j?p~eU1LqIcq@fp zu{g&7U$?R^;9N5dlCQ4%!{K3i*PRYGZli*zg7F^5TO|N~Uxa)PAAE}S|NE4yb@7h^ z<5i`pX+`L?p&X5RwrGhqMg`eCZEet&)P)lH+DZzBC%JA|>r+lQ6D{1V`6d*2TkHgv z6ONl>>N81wsCgy33ttSk`)W}WDQTPT+ zhOcJOA|+m-*7;8d1i(eVto8itL-$MUW4XUEv5MQF%QTzGgKoN#^-Z^P-<)AdxT3cwYWyWgniV)QcVqtAU`gQGc4T>TeGVUa>7k}lhnzUCQ-nBMufM`u zAM(;$O8D`xuTL>dEy5>`njl6^iI@Pt+?_UL-Ta=-z#a2sc7~;xeDuVM&jW?sj2-Vo z(1@lpJ0crhNTv@?p72OI%_rmiqp?zTZv!M5CU}cGf41c`6G&r9m@5BjDuUyJs#IDq~FY_b<5(qAnfMO)u>~?M<%8poiCC{01MD%Gfh#Y&y;pE#F%P1{(6J zFK-09X2mGoCt$+T^H_Cc)F7_z9kdK?<4s?CW*!xxKU4fNy-V4Qp*bu>f1{T$AtEB2 zD+iA})T$cGqUluTiUHn%a<(C5eBn6(M53oLxE6MlWVX>_;g<(h*t46OR-_Brqt|Yp zBFH;etx69!1^!$L9F+H%K?{ac&)`sKtyR+dvf*#5^nh?(pRoIsn*Y`^OpJU%!m#|V zYP<}&(jVtzeqKm& zO`*bM*f9y=2CJ{~Y7HLmQs?PWMfcO@{hFze{k-yG-{<5%sp3zobwZa`nI(E#A(jFM zj_jmdTZjYFYY?7GS#UIz#e>$TV$Ow}WK!m;YbvH>Fjdt9S+zj$Ofq~vmz)L#% z?r1yH2MW(-9b7O^tMN8ycIg;sV~x;daLpqCZeKSRZB}^u7FU_d28mDOv`2Z^$5)B0 zNAekqCzSqWRA3}gC&Uu;6^F@_VzxkSP^-57v)F_QWgS}}$D(_l%w+oG*Ff!^LQ7FQ zMTA6VyujVxH}j3#Vu|2X4dCqkf#*voS3mdM{NO@|A8R8Ek|7Y|FrvMLl25Pg9T3nE zPMTX&IVQ%Y-j}*Je88}AlhiUH0^dMGA0z1KeR`Ubf?7!_@?an%UGRIOViiRw7|<%O zM{*Mk`D|shY4*&9tIe3;)we#WPeV__seph_|5d~x7NYzG`<+30qNZNebE{pQ{&Z^Y zeGmbof3v+B{KjVEGGnD(Ll=7=M&?;5+1WAEBAwLBjgka{x}GSoei9yAcEJiL+jBy< zSu}`m^_1Wt%YUHzMZ#R#R3?jekUBGlEOAT`9xD0TV=k&R+)dCyDKuBp?6qmr*b7}F zHG=pmMIpwArYlkYvph=X&7xCTk+)dUYu+YH*v6V=SATc@0#!L`X^BgX%HC`U+$}PK zBeK8LpuS}1QJI{p&uq*ecA?VX;PWu}<_|adX)3_=d#%6PU3rQpN(8!HM=*a9uPc<` zdbCuJoghrv`Cgh;BgCAw{uZ~kZI+nP3!&s_p^uwUcJ^^k^EM5>3<%v%=q+NUUa0B; zZ4X3|@yhB*%L*E%7VBJP=6o7P4o%UrdNSPQCdcvtUnMih1N*j{_X`=ELf6ej%$2Zn z-FgdZ@%jVfzy7exHz96BYd1>K*zC_uutSJd5~`k?l8|4_&>vSV-^(Hqgk}uCg#$Ta 
z^Igu6IFj{|k`fY+Gnr2ar9fF-wY=^Y4IIcCYvSnaGE8?^KN^v}_9j@iy7g+~CpAY~f5FUkBE7G_OhY5WT4xJkUp$f%sgUdrQTZ z0l3X+z0e^+ihC-v5GGNVATaToLboO$=!VlTrnnm*i3T3Hqpt+0=$KukY{09SB*NZ*tm)tfme1zR>RLQ<5Po zUTt285v(7PglDzxS(C<#eb1!gu^I8*JX{uwHc4hSg;vhE5jbvL#t`L`EWPPZCfdl8 zU3beNkvmI8axk}R>TOoTpG}n!S1-%I75kEX@2A~Jt>eiL2^`2I4emyQH#-ayTvGz% ztRV?J-swXQ>J}k-9;9z)`iPnGV<*nyo2cIs{fQsg6eT?HW}@8rl#~XoRg^wkujVF z_Ze6y&Y$p+-f{bE94#5Hk51e>G{A`M4&RGjpXo}xWg+3*CQMD>6hj@Rvqu=wOx4GT zlkb-5whb-y{23o?%qMQb(YRpOUAgZ|Y#Q+?8pcZz#PL*qElgbB)XH`?t-kd2<)%&o zuVv_WdNE>ZFmi>dW<}q4u$9EQV!MA>S!kt(?Y{Sk4<@$v;)F+kqw?5S5yr*vJvFWB ztw$Gd-k#ony|!l&o}EwPcXSAy?^17~xk&8V2lij%nFfi z%O*-oZ{JVO+^vd9JL>$}*XQ?L_89*xpT8hjz7h7N2iLnx@(~k~-=rah4%Sv4cJCd4uiPol7^mjajVr5vXuW);GV$7dYruAZP-= zhev*TPtEwYXr&GJO=Ye@UxqcNW08j(w)@Fsf8n8{*a>9BM6%oRM1-14(hc4=+9u%C z=a$&TU#%syJ~bFz@pHMsy!)NR**io))5Mvw>#veQ^(vNP&Yy3vXvrOa0gK>qo7{4R z6L*J@iOXi}8On+&jzm9Sx9d^>4>o|%iGJQ~(iFe_%wv*JIRq2?ANStP;O}9we>WC? zn!e=oqHvgjrP!7o1i6${d5WVo7sn1-%O?tp0Xnv!3rnDonYe)#(*uA4IUtwcwSqJS8W4WXk^2LZ% zI_M+(6P1>^8h6Os&Fd*HqXtDu3t@15P{sF_QDnyaqu z=`=WsC)_&j@~g(1$0qk-y$9kOLVj(-LGDDI^4*Ivs@Plbqe;$|*1X*Hh)lMJPMcz< z>jYhSomlIxwRbMhb7+>AULmm$UPN_DwKUv$YqKG`;HGND`{@PxkaaOm2 zj2TlqygVO87G`8E&CzjOZDJrDC*aBwpU8@x8b~#kNoU#1a)LfxOM$|%nkmzVw#s8d z{N=OoUEu=XXFl2zMsWk2ZQ_xL3`V*x2_uISC;Q;u>icf$8xE)PwDv#tbvD9VDMS5r z2|g~(rP_*b^)rP9fh$&C=*!m*0+|hhJVSZ&dZ(L3=Yv=^zm~W-hIy&d4CdVl7w6O3 zYPuVuj=CGp;z+}Wl|BtJ%V53nNJDXXcbIyZi8*#D$#m>P2)z>nTeHe)3gARzS0$#G zK7YAxCutWgmU?~>CIRcP7e}N=bZtcp7%){mL64T)AMGP>qN8v3Z`wu+0W(7)-w1vm zKrhqh0C>jT#s31mp3c%mmdPVCiL6Gw5I5hNm$HDrKO@B_T0D)X+|9RFb{2b8I`9tG z1Y}xo1?}1YXpQuhDa%8qmQEYyrih9Md&fsanc-vE0N^1+$APMP-Kf3W^V+B9IC<^bb_iy+HsC)8aUYOqVvd+7x$@PKNrW;b4ZnXvh0;A-k^*VzOF{VX8KVcp}W< zN4Q@F&LU5*JH4)(`eeT;=YhF0#=iYHRo~26rarucgO~aT0!0}~UL%%uoqFs!1$4I4 zJyk#Q*0}kH&)S>)`3^Zy+mX?qR*K|FtYHxPr)nHziCv$H16(6o8AUhyO$IqJxYLK+ zxi#Vv9M2c9NJpoP6~Fn6Th>^t#M1BD(qa0mvEBYDLk2du0^MyJw|BCCS}7B+dCt4y|6n}V5Q|w1pc^9 zls?|x@FQ>JCU<4(nF_MIlRs99p_%{F|46HzmjQe7z)RmF8HG$yH0@d6(5MA&@AQ7F zZZWoCP{PDq>sRU~SMkJ;7?f`J66vv%%kpJMb6q3a8kfaB5o%2Q{3+}?^qsu4Y+p)> zig5xLhFH&l;{(gPte6NliS*`Mp1ga3!6H)K{#x11F>=|cJSjgj_-Ny(pUW>vN4LU} z{KgS61Iyo_&fb;Uu|G4Mg-@2GWOZ&XLK#GXsP!oqd7#pVj6@&)wl{4d>qnvlhlYS}??qF5fDjKYUWv&7A}Nrn4DEm-X&n=(;`gs@9*_Bk$j z9kL6uKREWuQZK8!!8lC#>R1Y@K>O6Y{T`BG*3o2$MWVz28NZ3RN2q2~qSb_dM9ENS zpR#93&QqH~Ov6l_81MHD%N;f#C&9+#zGi?V?*WG9_S1XoH-D$MBCqlM{dg29hThfn2WHPcqiC}RI;cs%;<<9u?+y-fJ=LT#T z9k!OaQrjPw#R@1EtY`%uQd4u@-Xk*BPZJ}~Q`de)k@^(g(`|b|-CI9A&4xKqWKO@tVDx>-V9#q#7WOaYEGy56of=WD~j2qLqd0F z>UVAN4??eG2X7HUSE6aK1rMH;24Dp1$eX70(J^a36r+XYJq+V3#6dnpQZjEFTYEx} z%9$$Ge;dnxV9&Y6$o1wNnKlJ>BK5a$ClSlkKqh+0PbR5%-F|pqlIrD8M0z-X-+%Qm zi!Er-rc@$IX%kv2cVHM@4LKR@Ac|#lPG03=c>ev=lV8J)`>FVr3Jyb@a~Fj|1aWY> zEj1(mBqDs^I6Utl$j(xHtnkHbgED?;1=jRFlyNn!OGX;5wEC?|rW7{0h*vJERaOvH zN(Ie{emyUukmdgb#*v-<nS$;7C)OPr^+w0mS zCIu#9ZNzIjcK*)RfQd6E%HaJJyP8x&if`W@o)6L@Lq|1T5-BzXXpmp{h`1Vh4lQ^h zbQKQ?4FXVm&@Zd30;}yzl~S6hPf+*0$^*25WtC>k7p0Q%uwWvJIJ{J$vm~$xs=lFt zC*jB8^r@0)3P~N&q*ZEVi;);fpqh>M^f z9kOgIKNtP!;#lch30XSDE%T+;M+R>zh8b2kS?@Xu%O)4j*yWK;ZlF*ktL(S?_*eF| zMxP9FKX|!jS z0?!U_f8JO;(FQtd8ii<3&JHH8zgCQp6ay;@<6&6GyACo9IFi6C%|Mxa#oIX8P639< zD%^jJkW0>N1M=wQF`3aR113Lqm(JW_h^L}yK0-Y*O@6&|3u)MCD!1s~g(47rY}=5~ z7!y`O`I=v+qm1wHpiil`Bfjh=Z?3oSOTdZ~yU=U74|EP7#7>+HGK=yPO93K%CM2~) z2WdxHv9uJfoQm?pc*jD6E?v~SlJIw>AVZ2WiRk`Z$y0ki-!~s8mfB{YN!J$93eR0v zJ{GXEYBCF1?tUFisNytV=ps`sxRH_NwC_MWuS)PUocxJkvzVlzw6F?I*1+6TcVzOa zZaeb(L*Ya6!;!U@M=4$JD5ZQX(#0IJV)~uJQc>=q41m^&htihU>D9;kS5-ZI(vd%s zb*Qu->1OG^u@$*ohY`vvzWzo-==C3N%K?gI#?PPZPe}iF<)jzFR~y@^V?_Tyv%8oT 
z@Kr`9cc%}oi@iaCSFPYrY$&f2KpgNt5;tb9TL`^AfEr`OwrCU9bxq~J>5xr-hy)6z z4=;9sl}|gr^_lXVyi7v{>s+17W`p1V$JkpyRkgK^!Use_K|&CuOHewcQ>06}K|xBo zbAx~g(%qp_(jXn14y8-Fk?xL7+_}#Y-^2U;|NZV5XLuNk#az#P=F>H;CRi^R1Ni^s z9JvbNZp}z@XW$<~@y7uMyF~aeo8K|FS4LPbZ4UH81$^u4&T!?DOxtE>$JVd1n*UjF%0?e!nq`X$&tW^pj5#Q<4Tv*GI3RRXg6d)fL&S{`Y3O zSG4mcFE2}-U%mPZOt8J|;9BVo%CcUhwW7xj4ChsP$OR@r>*(lkIjlu->w0(yyai)k zqN5F6zg}2cShR*yiU-ltGBUQ^mS%cK)y?WlH(T)Mxc~jb-<_9O;=YhSZlMGhBA7)& z&^VJ9ZK^aP6V*`dMKBc97Ze&Ew{@jFC8Uo?hb=!RAzU?+6qNzzRT&Nc+4_Q-+ zC?Cl(<-KG-UU~E|M^b{5hUoO8iZfR z{)v)42yFoE=iP6cBkMgM_@NVf|9l(#z13p02qBTf;<=$=M!2{V_g683i^D?CR`}?q zNbju#vw}voy1j?kU}}mYy5B8%zFnlP+I>0faQjY%idp!qL@=lo^giBn}55 z^4sttNBsh{ZjQzbB%wcD$u|DGK;i$Z8XcV||Cw+81+ffPm6EEqKXIY7@DX<2FM>`H$c7Z_NV%~0xoY73@`@^tloNMIjd`PRwUoTpI& z?!2U>^0;ox-+eM0BY2P7Ivw|Zcm7JrIMw})F@b!Y;zymhYN`KwE%$+Bb(M$9=z?BE zL`0V!im_2TB^_Jy4AIflqV3%GaLSdHm3I@H=wo7Lq{IU5kC^ICD9SCT0>D7M+3NLz zX15vGbeP4(ecB-VnxrHmb$TV3w4oT$Y4)E{0u=HM+_O!jscsK>kBLR?w6d~ER|Gq+ zDi<}et2X|Z@ox=5)St39%B^NX^sdfEPP(l;d<;8d=D@gBW`V2I-qgrWvR|GNM`D9h zrnE$m)7ZzsJ`q=DYrGIJU#sGUCXuP$!J^(lZCdh7eg0~Ny-uMx-c);i>K5<#{Ggc! zZ2SVYQbW?q+#e-mJ4Qo8<7E)yNMHV2J_@m;j1tN5?d(K4pS5msX-<&Ur}d;PUNU*0 zkU}pqOYUp`5n*IrNbugozkC`P|FB7yIb!5PaavjJh^2z6D*PvMNz7QXP$;x$VpBp| zTEx+jV{d=I_*EmKvYJ|AFYQY)#&A%B?U9g}U^Cl*z?Cegl$HJM+rSI)hYug#mdn={ zU*|3>FQ5PQBXVV9Ysg+G86E{mPnW{1cO?b9+Prg?AeT&H4c1W~4@Z5Z6Gp1no!m)~U}Te%3p)!lh+p_`6@8 zi=|nzva-HZSNBI{to(~kHIF%5W^r+meDAU1yhLyA1GA9=9p|m8Wtphn^V5ABFeFxF zMp{Y=jbyVXb6Kf4?P8|FiZsfroSO)pGLijuYW=q6w&Y^0$FSKwUFDZhFzU{EA(*qs zq@QA?oVNWS0nASF0rT(-cIR82uV6D))6&w?K2FO#zoocMqRnh%yFqPwhxTw_50~(w3W!M7gmA~Nm9ES7!$0tE8E+aM5 zzP;{CY+`38W-O*NAtd|(spr3*I4?%4z0}hS?dp=jW>Hp9c>Trq{6ek2%yCoG{d|Fx zbNh_j*#%%}H%sv1t{`lz*Shr~7z!E4$l5OlJKpA0l%h;5s9GWzL=NUj;CFM}+0ykjFdQ>&x|GD z#q^|-72oFIe7|?0xSgb#t(VWBC(|JuXb~$4x4peRnS+o_FJ9YU{vW^heKSL5GrVy z9v)BthUDsRRy)Me(X6t6`Mziyzmg4gJJvtmngSzwf2!#PYgty^ru}kW7Ie_0Ld1VK zq;%3B>KT+F9rH*yGkcr8TGBB78<D(DU&+)Y{MY?mQ2MGEeS)w@_z`+M@QVspMG)y3$y+iJQl z8R3s<^^$rJN-HXaDXmB#2MUyLOXd2E1z{TJ3xbzv*_7!HVfJ8z>7FZ;`kSOutLd*^ z_>5S^ZXL%l!^(XhC;jg6XPMIx9V8E*0~^8lWX^9F$U5xmZQki!^uGt4p3Dcbm>tnS z{&7_*r|6_s5iUj>caSl1Fo)s!DD~=mIT%fGEj>E)>$NzgT|%(_znYgEE8a2P>R z7&PC3>3Y^+cH;c7men%_4vn!f30%nq?8aZw2_l;EV0tqN>lZOIGRRI3?`|uTJA&Qg zYH{BHMqr+3#!65Ivp-9|M;^T6a$GYLEbp-w*;i_JrjZfONE=*kG4Y|D1cD~dclutE z?$`^%{Vyk}RCbl*0EVkCBdhj2yT2mk=K=9&gS&*{FORIQhF@TuDh;phrg)s0p3fo+ zGApQRF0YVY&XHRGpo$fuiV#yd)oTowy4?RWit-D5+{yTR7xzE=m4NdGy8=n*pZ(T8 zhVM5mc+!%yCyCqOeeb&$yItYc{CL{dULQ9kB_;39T?Oi$BhFkNRoJZbkx!m)CGM{F zx)UUf7gSb`n)7Oa%NLeTb{9^jwmtkt^gM|Z3bw#x(fW@E{r;rECik{NJ+44FIF3gm zp=Yb0W~-^2bH=>u*4gqo@FHV z@;;7Wzn>&U6+G1MBx{<*q{GfCx!7NzQxlZl|1~)kkcjKd|{ML2__^^#Oed`&-F+_!!bp;o^T76AL&uR83EdF#Z6F4X>5O(wvOBK9yH( z4>kqtmylklkY-34$@x!NX!0Z zAODrBdMUwdr>qciznA|wDLWOYT4TjUQ>6X~D(RQX%Ix$5-vv#T5h16YdTpId^=LIy zmPZ&;yGgWoEiX7o!Fa$2@PkEhKBCRZIB=Oqf4legWU0R2S*RzxQK zu>Ko%OipkNHX^;)e_Sk?2;gEpZnWY2Lv!(>Wc@7)($c74===R>7TRFt>90Y%;w0H` z`o{&TSDb*M@b2f^)?m(Q|4{c|lwfLlb2_-_DNBhv{aG}~|45Npz((5zKFGVzW5+Q$ z_@0+ViuGX1MjtzQ9y=KcMr(p=;4>6dRfoUJ-by`8VCRusR1*`sqgtT-(GynT*_@}DjwRvSaIKT?FSUnEJz zT1=G3KP3uo$u2D9jF2!p1GUkQ$fHtIt=MtF%ZGTWL zt-k_;T~o@<$GmH{>j>?Zy6~%bubVc7v;hUR-sYA6bKw*7ZEy@q@7i_#q!EqK;k2KS zu+v{&;)6gAfJ~1NGWb$Y!+I(-yQ1<-k`t#2ESL?(1)Ez6idKa`DR?|U1;)Ls< z!^S-616v>XD-zIzSXQr6o8(i85&4RK|L58fP%aE|uP6P5m;aOu4a)YdyuRA27dRH4 zgscbuZ-W*e^T5wQvrs~`Pf4Etw(P}G-7&PcmR)p0LLp7&aBk?h{t7bbGy_@L+zgh; z;fWTTIbY)Zj3~KIZCcG2t^PL)d#2N@!}%^fU{=zlrYGV^)Qsfof!MnDAl^V}SGN~k_zCbH-{io8 zNe=JB=bE7sy~jSJ=YMH-!VMk*X?cOu#~#3nB;YPH9i)aUe`<&14yvm!WpN&t`>BaT zj>s4!?Cr3tFyB6l#SJ1s@jP 
z7scTvlL)6!sg8Il%JVrEQxg$k;K${(@f=AHS|qbE>kk>$-loXGBVD(Ah|vOXJnU)ApZR-8Nyv0#Czg$NOvVb2;uKPMnL43<`$ET zeA6zDX9PWf@Q#6g%t7-|Vatv1*vc#Eq1}A0J#O2{5GYi3x|G=dN!TY~R3rhqUQol> z!@Zm|KvKs$`|j2}I4|q4y%1>Rnzz#0AzCT_L7&L(tPe&TU16=F;;uz$znG{Hg$}Xk z+*j(Q5~Rlf{%gSDKYlVpc}nDPU)(GlpaK5i6~dD~rT1xV$_`GGHsa^<_%o*mi1Z2PjhPif^~-*$(q(mhv{u&JZ^hF?J7PWqmr7ud=(<_h z&_Z?t_0SYoEQ%WMc=h?1`In!cjnp%?hiir^yO9D7AQez!7^9I{P_PlpPLnKW4BX+x zljRh*g3?o9U(jzB9NQ{#y%k> zc6{R`EKfNF!!g5HhWUOu-c1}7rb-C4OdKYPjK4US!%3~xOr6cqm@ys#aux;*$D(}I zsJJb`ENa0T(Scn>CvD*?$?@u77Vb0C) zM5hNM+WH@<0^)olO#lJVZ#^EDA(bd`8kFU{jX2guXr-Cx&-9NDc61%n{+i-5*G?9iUg%tkUzq$ zJ1vzq3jzLY_A7l6ZI;2$hM&R+FMd4{{0MGN$W4FWoLvg23l|S#L_FvWoRe^ z`YTv{V2NLp&4!KroG)J6oFA#~dzu!)H3B|o>)fNcrxazARI%C*X09%e1J_N!aQ46B za;*w4+@5kJSpq*d;v2dG3f_RGS^h)1yrSn=RhaFV!NBHSosXi(m=;DRR(}5##16}= zm%b{bb$H0vpZzS<2PQwG_oMb<(-TeN4Z?rUUx*WP)@n8Vhm!tU>FVxTSA4!^(dd`J zmIinAKZGqLPXW*wDb`$mjp06nyFo6$emI}(W@(Q5l^E{~NVn~YAh!oJ$5@=!vN2K1 zC8;AN1JjAGpzdr<-QlN-bU+1=k~<7PLBR#~RtjvpUJz5(dVA7s8!GDFs8M=1|GDt> zEP(U9PCSG=#U>YrBN&sSflPJIwcwG>ZYy4_odk{FijR=}cXB@@{h=wR8RD#DM@_8H z98>m#Tx(y`0&e-g!kZZ^D1>a9iE=$hjq@9BJf=?%?{@7YHX|AOTI+k} zV;}gBCrc_na5;8F(PZ$a_lO8Jxx8;iK?rnYgboc9cl8n5DM&2==d`fqS^ABBxTKsy!U?H%L#nO27A5<)y+){I8S8N^&FSS*5iVe@ zUjorg&MQ|(P}sQ4t=Gf!Lk=wbsnV?yRfJzPYP>gLcib2ia$lRN_mn*R@b3MJcVCka zy8~Q4KbK2x3h7nVby=iXE4spBq(&kY5VsfRd;aJ-+B{60#O1LixGQ5mk0OI;ULl#W zz=CiUwF$SkGE9P@T|nGl$YBm|q|ivJ`bnxgD~bC~BgLbBKQ%o_*@2hYAO;Cn^hz%X zHfW|!@5%58oV}NLX43bG!ZE+3hewp&jQcy@Kb+}h`Iy;ot(5C(V10hca|%zMANKro zLX&g3=y)3owoz?yF>L%t8ud{E@putBcd!3Nx|xTcLc> zP5p_`^EP9QKuW>V-bB(9sXtof;4h$7_+Md~dtYPrUrVA=BlxD>w$t8>{w<}2sXtkl zl7$vPB7e*6Pn{^WXXC%ffai^?`Lx4BC=hK$uswTijRz8QE!rC?!5_^AXY)t=^Mrl{ z9477oR=MZ(r|j=~iKjdw3>7?;k=B8n9em7)CBhN$qkq^Oqm{X6r+3X#WL{zb+F*8f zhHL42E+jN(;_--T;_2@vLKWCt+g2lORk1oV10k2FOhkl1>I~(Nsw~YbTEDBKphjHF zJA}KXa0z(F--_zDc3Rm5fVrMyjtO03+7;1@^x|yeWQ(cC(zJu()>*5ZR%j5^X_|Jr zq#ga9IKvxv^6L*%0$hm^)cx7E2iYfUcEO+si>PZ{P22@a4~9AV*FVcQxJv@y>xeiA ze{W~yC;a<|I}`G?`o0{WoXv4|pz-w1OJM1_8XLLovxpxa znVMQn6~(ht@>cxj5?d@GR&J9~+Mn(}WGfq$Sjf`5l;Lr1SLb zP>bu%!S=}LB&*eo$^JT4xBmBX#;B{qURPK}w$r%%qhATHLL9zK3*7ypSMQOrRo$q} z1_*`KlecX68+2$waNSo_XI5i8aoySI$uP4)`OAXaT+C0__Qar8r4?*u-lrW*iAB6{Tg^ zhfPtt-XaUopn85KQzq2>lEbBQ{jKsG&JkvJ>FDIIvE0&bE|z&tngw&&+`T-5s_HSW z*@;KP*8B0?s%sV}d1*{fWK$kwU9L7&&VSHa=zr2)MsRH9epJM@CjzrCr47C2Kq18a z_wSne`5HGwua!~U%-zz5A$NX}h)-CQ>u!{E4)1zFoM?pUyj8IETo_W^Y@W+FpGM0$ z#J^ivqbtOeW-NA=ZisFe@5G+@dc3HnyQvHfM0he4o-ikZi-Vl1F7*Kai2Yte{#Sji^%MZk*Cxgl~75~$xD(po{U=wtljNmN6DB5J8mwU^&+ z_4=dVIy(-X>tw?2WMr#cM0-C|+*|i@sVO%yK@S_L9ShTsNsv7TzgAfqI>@$jc~H6B zslC#bKo%AnR;@o=W8X7ZjY;IW7Qeh#xBpfqg{SPYvblG%{SITbM0Tv3h7R>If7Vdq zvE-InH|{8-fOUV97kH|h$WE2=h?lWaoV|XS0=FBs@W2O`&hB+_UOi*suSwVQRhz|> z;d9NvN0r|xij|K%Pm}7c3sAL5-PYo5>rEZ=6_C*?V~eK=d=0mrg$l5gPvRx$m<@G8 zoOTp_Y_}M7TDS$$to57^J{osGK97~Pol2u?@~!Jod_dR8iQYNgKwVfUev(Ra?AwE4 zC(Ts#qU!h?HGyx7v28wo%)$q*u?>PHT@Gi{H67Lu^w2kSuS2R}ErY$6;-PQlRulLP zHP2Vi>GX%{w?aQ+S5?$%;Dk)SnWMXwbgt+?e5SDBmcO_bbtE#M@~wyJubPTaKbW=F z)mUQt)mnu<818R`Ap}{e6xw*>rCU^a#5bnWv7Uu92#TA$1EWm24BR@1BU` z`E~AzoG8@Ni<%Q2xxohO-5e&|31pYD71Jt9h-Qe(vTV029(o-USByXAm@s+KnT-+w z@{2PzPCYYE+66}TT*URrc*rQWN)I!O=c>(0la4e4UXBExsZAdD8SHl1EocdBub?9N zFDgDBHBFQ?Xi3N|P5Mw|(=A&OTeUM8$W-P&-H#_tU#Ff|G&yU&ZM3MXn(Fr#7#38~ zMj09!yz8Q8R*t4?`!cVz$+oyzm9i6>CqRm|ySFZu#NNOp!F@j#}RE8=rS+q)yn`_XuZr>l)VBi>gwMp@G zczMQl8DgVg+t59P|Je*$rpj;_qHUG0D%b8gdA42oYn$`o@~aZFM~x}4(>v>VDZOSm z!nKy$Kd&ko7;7Bu2aj*}i2s5zTu}?0cI}^5KNImyODOu|haEhI+g+yY=7HCC7k^BH z*84OMh(R;jd#GE5#r*eM#|%yV`}jCee$g~L#ww0W$IblYwa$P}jE`2Hb1S2^gNITL 
zNm3_=2P>XaT5HgyOi!nr=`uA!k|&VWG#1;bXIdVomoM&7g99x%>C6~um93bW$#&(` zbvitmzMJY_rlyxo^=za$lwZYrY+1)qWYVLzIXTBEv3odpJJnhykKMfr*KY`FKoVGQ z+;`eUcZGi(Z$AE{fgE~yad!W${qS}xnQ5;8LGN-J-_m|ozWvRNy3w!Z9lKh)$SZ7+Lh07 z5JY(Yc)jhZM&$VmA%6|J(?Q58aoJnxc%ZY2V#;_TKz4h4@dG2VIK;Y;i*`gq#~LdM z(*M)98MGQG4S4*VarosjOJP;%EpRjkS@?9}WX65Fq62?f4?k9$jaF>N##3@}bk7i3 z7fsW=U)`bAZ~+^+p7&_R02ae$-E@9ADQ%ZB6@j=D2j8^*DivPnGc|nNcA{#}+Faqd z=CR{kHYPcirSi&kucg*CLqB@_^>>mOxZ?lx6&h-M>`nCj+Tc1auRDV~ zh|KIyY^+X}A5;K37*W z;kYeP>bgnem

From: Didan Deng <33117903+wtomin@users.noreply.github.com>
Date: Mon, 13 Apr 2026 20:50:38 +0800
Subject: [PATCH 151/204] [Bugfix] Update Flux2-dev & Dynin_omni L4 e2e test
 (#2723)

Signed-off-by: Didan Deng <33117903+wtomin@users.noreply.github.com>
---
 tests/dfx/perf/tests/test_qwen_image_vllm_omni.json   | 2 +-
 tests/e2e/online_serving/test_dynin_omni_expansion.py | 6 +++---
 tests/e2e/online_serving/test_flux_2_dev_expansion.py | 4 +---
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json
index 387e874ad5..97c1bbfb3c 100644
--- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json
+++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json
@@ -72,7 +72,7 @@
     "enable-negative-prompt": true,
     "baseline": {
       "throughput_qps": 0.1,
-      "latency_mean": 2.34,
+      "latency_mean": 2.7,
       "peak_memory_mb_mean": 61000
     }
   },
diff --git a/tests/e2e/online_serving/test_dynin_omni_expansion.py b/tests/e2e/online_serving/test_dynin_omni_expansion.py
index 39b6dc8e21..710c480f08 100644
--- a/tests/e2e/online_serving/test_dynin_omni_expansion.py
+++ b/tests/e2e/online_serving/test_dynin_omni_expansion.py
@@ -120,7 +120,7 @@ def _build_i2i_messages(prompt: str) -> list[dict]:

 @pytest.mark.advanced_model
 @pytest.mark.omni
-@hardware_test(res={"cuda": "L4", "rocm": "MI325"})
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"})
 @pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True)
 def test_send_i2i_request_001(omni_server, openai_client) -> None:
     request_config = {
@@ -136,7 +136,7 @@ def test_send_i2i_request_001(omni_server, openai_client) -> None:

 @pytest.mark.advanced_model
 @pytest.mark.omni
-@hardware_test(res={"cuda": "L4", "rocm": "MI325"})
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"})
 @pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True)
 def test_send_t2i_request_001(omni_server, openai_client) -> None:
     request_config = {
@@ -149,7 +149,7 @@ def test_send_t2i_request_001(omni_server, openai_client) -> None:

 @pytest.mark.core_model
 @pytest.mark.omni
-@hardware_test(res={"cuda": "L4", "rocm": "MI325"})
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"})
 @pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True)
 def test_send_t2s_request_001(omni_server, 
dynin_t2s_openai_client) -> None: request_config = { diff --git a/tests/e2e/online_serving/test_flux_2_dev_expansion.py b/tests/e2e/online_serving/test_flux_2_dev_expansion.py index 9d96a48c0c..f7477ed803 100644 --- a/tests/e2e/online_serving/test_flux_2_dev_expansion.py +++ b/tests/e2e/online_serving/test_flux_2_dev_expansion.py @@ -27,7 +27,7 @@ NEGATIVE_PROMPT = "low quality, blurry, distorted, deformed, watermark" SINGLE_CARD_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}) -PARALLEL_FEATURE_MARKS = hardware_marks(res={"cuda": "L4"}, num_cards=2) +PARALLEL_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}, num_cards=2) def _get_flux_2_dev_feature_cases(model: str): @@ -48,8 +48,6 @@ def _get_flux_2_dev_feature_cases(model: str): OmniServerParams( model=model, server_args=[ - "--cache-backend", - "cache_dit", "--enable-cpu-offload", "--cfg-parallel-size", "2", From c9e2e3e8d764875764ab89c1bfbb294314959e44 Mon Sep 17 00:00:00 2001 From: Chen-Yo Sun Date: Mon, 13 Apr 2026 10:53:35 -0700 Subject: [PATCH 152/204] [Voxtral TTS] Correct decode steps param in Voxtral TTS (#2524) Signed-off-by: Chen-Yo Sun --- .../voxtral_tts/test_cuda_graph_acoustic_transformer.py | 8 ++++++++ .../models/voxtral_tts/configuration_voxtral_tts.py | 9 +++++++++ .../cuda_graph_acoustic_transformer_wrapper.py | 4 ++-- .../models/voxtral_tts/voxtral_tts_audio_generation.py | 6 +++--- 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py b/tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py index 6f072944d9..847adae06f 100644 --- a/tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py +++ b/tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py @@ -78,6 +78,13 @@ AudioSpecialTokens = _mod2.AudioSpecialTokens +class SyntheticAcousticTransformerArgs: + """Mimics AcousticTransformerArgs interface.""" + + def __init__(self): + self.n_decoding_steps = 7 + + class SyntheticModelArgs: """Mimics MultimodalAudioModelArgs interface.""" @@ -96,6 +103,7 @@ class SyntheticAcousticTransformer(nn.Module): def __init__(self): super().__init__() self.model_args = SyntheticModelArgs() + self.acoustic_transformer_args = SyntheticAcousticTransformerArgs() self.acoustic_embeddings_levels = ACOUSTIC_EMBEDDINGS_LEVELS # semantic_codebook_output: hidden_dim -> padded_codebook_size diff --git a/vllm_omni/model_executor/models/voxtral_tts/configuration_voxtral_tts.py b/vllm_omni/model_executor/models/voxtral_tts/configuration_voxtral_tts.py index d32a882e78..0f22c764a0 100644 --- a/vllm_omni/model_executor/models/voxtral_tts/configuration_voxtral_tts.py +++ b/vllm_omni/model_executor/models/voxtral_tts/configuration_voxtral_tts.py @@ -48,6 +48,15 @@ def _remap_mistral_audio_args(self, config_dict: dict) -> dict: audio_tokenizer_args = config_dict["multimodal"].pop("audio_tokenizer_args", None) audio_config = {} if encoder_args is not None: + # Default n_decoding_steps if not provided + acoustic_args = encoder_args.get("acoustic_transformer_args", {}) + if acoustic_args.get("n_decoding_steps") is None: + logger.warning( + "n_decoding_steps not provided in acoustic_transformer_args, defaulting to 7. " + "Please add 'n_decoding_steps' to params.json under acoustic_transformer_args." 
+ ) + acoustic_args["n_decoding_steps"] = 7 + audio_config = { "sampling_rate": encoder_args["audio_encoding_args"]["sampling_rate"], "codec_args": audio_tokenizer_args, diff --git a/vllm_omni/model_executor/models/voxtral_tts/cuda_graph_acoustic_transformer_wrapper.py b/vllm_omni/model_executor/models/voxtral_tts/cuda_graph_acoustic_transformer_wrapper.py index a4d58df5b1..ff053342db 100644 --- a/vllm_omni/model_executor/models/voxtral_tts/cuda_graph_acoustic_transformer_wrapper.py +++ b/vllm_omni/model_executor/models/voxtral_tts/cuda_graph_acoustic_transformer_wrapper.py @@ -49,7 +49,7 @@ def __init__( self.acoustic_embeddings_levels = self.acoustic_transformer.acoustic_embeddings_levels self.cfg_alpha = 1.2 - self.n_steps = 8 + self.n_steps = self.acoustic_transformer.acoustic_transformer_args.n_decoding_steps # Graph storage self.graphs: dict[int, CUDAGraph] = {} @@ -73,7 +73,7 @@ def _warmup_and_capture(self, device: torch.device, dtype: torch.dtype, hidden_d ) # Pre-create persistent buffers - self.timesteps = torch.linspace(0, 1, self.n_steps, device=device, dtype=dtype) + self.timesteps = torch.linspace(0, 1, self.n_steps + 1, device=device, dtype=dtype) self.fake_eos_one = torch.tensor(1.0, dtype=dtype, device=device) self.fake_eos_zero = torch.tensor(0.0, dtype=dtype, device=device) diff --git a/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py b/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py index b5d1161733..4041a53e55 100644 --- a/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py +++ b/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py @@ -108,6 +108,7 @@ class AcousticTransformerArgs: use_biases: bool = False norm_eps: float = 1e-5 sigma: float = 1e-5 # was 0.01 in beta version + n_decoding_steps: int | None = None # Number of Euler ODE steps for flow matching @dataclass @@ -436,14 +437,13 @@ def __init__( self._empty_audio_token_id = AudioSpecialTokens.id(AudioSpecialTokens.empty_audio) # Flow matching constants - # TODO(chenyo): hardcoded, need to fix - self._acoustic_decode_iters = 8 + self._n_steps = args.n_decoding_steps # TODO(chenyo): hardcoded, need to fix self._cfg_alpha = 1.2 self._noise_scale = 1.0 self.register_buffer( "_timesteps", - torch.linspace(0, 1, self._acoustic_decode_iters), + torch.linspace(0, 1, self._n_steps + 1), persistent=False, ) From 14f79109000f64f61ca78045abdf5518c0b4fceb Mon Sep 17 00:00:00 2001 From: Sy03 <1370724210@qq.com> Date: Tue, 14 Apr 2026 05:16:47 +0800 Subject: [PATCH 153/204] [Perf]: Speedup VoxCPM2 TTS performance and Support PagedAttention (#2690) Signed-off-by: Sy03 <1370724210@qq.com> Signed-off-by: Yueqian Lin Co-authored-by: Yueqian Lin Co-authored-by: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> --- examples/offline_inference/voxcpm2/README.md | 6 +- examples/offline_inference/voxcpm2/end2end.py | 6 +- .../entrypoints/openai/serving_speech.py | 17 + .../models/voxcpm2/minicpm4_hf_compat.py | 114 ++ .../models/voxcpm2/minicpm4_paged.py | 448 +++++++ .../models/voxcpm2/voxcpm2_talker.py | 1162 +++++++++++------ .../model_executor/stage_configs/voxcpm2.yaml | 8 +- vllm_omni/worker/gpu_ar_model_runner.py | 17 +- vllm_omni/worker/gpu_model_runner.py | 1 + 9 files changed, 1332 insertions(+), 447 deletions(-) create mode 100644 vllm_omni/model_executor/models/voxcpm2/minicpm4_hf_compat.py create mode 100644 vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py diff --git 
a/examples/offline_inference/voxcpm2/README.md b/examples/offline_inference/voxcpm2/README.md index df48a85f56..e982730799 100644 --- a/examples/offline_inference/voxcpm2/README.md +++ b/examples/offline_inference/voxcpm2/README.md @@ -58,12 +58,12 @@ The script accepts the following arguments: ## Performance -Measured on a single H20 GPU (80 GB), voxcpm 0.0.0, PyTorch 2.10.0+cu128: +Measured on a single H20 GPU (80 GB): | Input length | RTF | Sample rate | |---|---|---| -| Short (~6 words) | ~0.81 | 48 kHz | -| Long (~50 words) | ~0.72 | 48 kHz | +| Short (~10 tokens) | ~0.28 | 48 kHz | +| Long (~100 tokens) | ~0.34 | 48 kHz | RTF < 1.0 means faster than real time. diff --git a/examples/offline_inference/voxcpm2/end2end.py b/examples/offline_inference/voxcpm2/end2end.py index 2dce750897..ce404bf962 100644 --- a/examples/offline_inference/voxcpm2/end2end.py +++ b/examples/offline_inference/voxcpm2/end2end.py @@ -71,10 +71,10 @@ def parse_args(): def extract_audio(multimodal_output: dict) -> torch.Tensor: """Extract the final complete audio tensor from multimodal output. - The output processor accumulates per-step full audio under ``audio`` - as a list. The last element is the complete waveform. + The output processor concatenates per-step delta tensors under + ``model_outputs``. Falls back to ``audio`` for backwards compat. """ - audio = multimodal_output.get("audio") or multimodal_output.get("model_outputs") + audio = multimodal_output.get("model_outputs") or multimodal_output.get("audio") if audio is None: raise ValueError(f"No audio key in multimodal_output: {list(multimodal_output.keys())}") diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index a95fa69515..3dc5f595d0 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -49,12 +49,14 @@ _FISH_TTS_MODEL_STAGES = {"fish_speech_slow_ar"} _COSYVOICE3_TTS_MODEL_STAGES = {"cosyvoice3_talker"} _OMNIVOICE_TTS_MODEL_STAGES = {"omnivoice_generator"} +_VOXCPM2_TTS_MODEL_STAGES = {"latent_generator"} _TTS_MODEL_STAGES: set[str] = ( _VOXTRAL_TTS_MODEL_STAGES | _QWEN3_TTS_MODEL_STAGES | _FISH_TTS_MODEL_STAGES | _COSYVOICE3_TTS_MODEL_STAGES | _OMNIVOICE_TTS_MODEL_STAGES + | _VOXCPM2_TTS_MODEL_STAGES ) _TTS_LANGUAGES: set[str] = { "Auto", @@ -290,6 +292,8 @@ def _detect_tts_model_type(self) -> str | None: return "cosyvoice3" if model_stage in _OMNIVOICE_TTS_MODEL_STAGES: return "omnivoice" + if model_stage in _VOXCPM2_TTS_MODEL_STAGES: + return "voxcpm2" return None def _compute_max_instructions_length(self) -> int: @@ -787,6 +791,8 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non return self._validate_fish_tts_request(request) if self._tts_model_type == "cosyvoice3": return self._validate_cosyvoice3_request(request) + if self._tts_model_type == "voxcpm2": + return None # VoxCPM2 accepts any text input return self._validate_qwen_tts_request(request) def _validate_ref_audio_format(self, ref_audio: str) -> str | None: @@ -1430,6 +1436,15 @@ async def _prepare_speech_generation( prompt["lang"] = request.language if request.instructions: prompt["instruct"] = request.instructions + elif self._tts_model_type == "voxcpm2": + tts_params = {} + additional: dict[str, Any] = {} + if request.ref_audio is not None: + wav_list, sr = await self._resolve_ref_audio(request.ref_audio) + additional["reference_audio"] = [[wav_list, sr]] + prompt = {"prompt": request.input} + if additional: + 
prompt["additional_information"] = additional elif self._is_tts: validation_error = self._validate_tts_request(request) if validation_error: @@ -1466,6 +1481,8 @@ async def _prepare_speech_generation( model_type = "voxtral_tts" elif self._tts_model_type == "cosyvoice3": model_type = "cosyvoice3" + elif self._tts_model_type == "voxcpm2": + model_type = "voxcpm2" elif self._is_tts: model_type = tts_params.get("task_type", ["unknown"])[0] else: diff --git a/vllm_omni/model_executor/models/voxcpm2/minicpm4_hf_compat.py b/vllm_omni/model_executor/models/voxcpm2/minicpm4_hf_compat.py new file mode 100644 index 0000000000..cb3101b16a --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm2/minicpm4_hf_compat.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""fp32 RoPE + MLP matching native VoxCPM2 numerics. + +Exports: _MiniCPMLongRoPE, _MiniCPMMLP, _apply_rotary_pos_emb +""" + +from __future__ import annotations + +import math +from typing import Any + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# =================================================================== +# Primitives +# =================================================================== + + +def _rotate_half(x: torch.Tensor) -> torch.Tensor: + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + + +def _apply_rotary_pos_emb( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + """Apply rotary embeddings in float32.""" + orig_dtype = q.dtype + q, k = q.to(torch.float32), k.to(torch.float32) + q_embed = (q * cos) + (_rotate_half(q) * sin) + k_embed = (k * cos) + (_rotate_half(k) * sin) + return q_embed.to(orig_dtype), k_embed.to(orig_dtype) + + +# =================================================================== +# LongRoPE — must match native computation order exactly +# =================================================================== + + +class _MiniCPMLongRoPE(nn.Module): + """LongRoPE matching native computation order.""" + + def __init__( + self, + hidden_size: int, + num_attention_heads: int, + kv_channels: int | None, + rope_theta: float, + max_position_embeddings: int, + rope_scaling: dict[str, Any], + ) -> None: + super().__init__() + self.dim = kv_channels if kv_channels else hidden_size // num_attention_heads + self.base = rope_theta + self.max_position_embeddings = max_position_embeddings + self.short_factor = rope_scaling["short_factor"] + self.long_factor = rope_scaling["long_factor"] + self.original_max_position_embeddings = rope_scaling["original_max_position_embeddings"] + + scale = self.max_position_embeddings / self.original_max_position_embeddings + self.scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) + + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + self.max_seq_len_cached = 0 + self.register_buffer("cos_cached", torch.empty(0), persistent=False) + self.register_buffer("sin_cached", torch.empty(0), persistent=False) + self._set_cos_sin_cache(self.max_position_embeddings, self.inv_freq.device, torch.float32) + + def _set_cos_sin_cache(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> None: + self.max_seq_len_cached = seq_len + t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype) + + ext_factors = torch.tensor( + self.long_factor if 
seq_len > self.original_max_position_embeddings else self.short_factor, + dtype=torch.float32, + device=device, + ) + + freqs = torch.mul( + torch.outer(t, 1.0 / ext_factors).to(device=device), + self.inv_freq.to(device=device).to(dtype), + ) + emb = torch.cat((freqs, freqs), dim=-1) + self.cos_cached = emb.cos().to(dtype) * self.scaling_factor + self.sin_cached = emb.sin().to(dtype) * self.scaling_factor + + def forward(self, position_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + return self.cos_cached[position_ids], self.sin_cached[position_ids] + + +# =================================================================== +# MLP +# =================================================================== + + +class _MiniCPMMLP(nn.Module): + """SiLU-gated MLP matching native MiniCPMMLP.""" + + def __init__(self, hidden_size: int, intermediate_size: int) -> None: + super().__init__() + self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x)) diff --git a/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py b/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py new file mode 100644 index 0000000000..7ea5bc229d --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py @@ -0,0 +1,448 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""MiniCPM4 with PagedAttention + fp32 RoPE/RMSNorm for VoxCPM2. + +Uses vllm Attention for KV cache, keeps fp32 precision ops from +minicpm4_hf_compat.py to match native VoxCPM2 numerics. 
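+
+Illustrative sketch of the two modules defined below (a sketch only: the FSQ and
+fusion-projection steps the talker applies between them are omitted, and the real
+call sites live in voxcpm2_talker.py, where the vllm runner provides the
+attention/KV-cache context):
+
+    base_lm = MiniCPM4PagedForVoxCPM2(vllm_config=vllm_config, prefix="model")
+    hidden = base_lm(input_ids, positions)            # [n_tokens, hidden_size]
+    residual_lm = MiniCPM4PagedResidualLM(vllm_config=vllm_config, prefix="residual_model")
+    res_out = residual_lm(positions, inputs_embeds)   # 8 layers, no RoPE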
+""" + +from __future__ import annotations + +import math +from collections.abc import Iterable +from typing import Any + +import torch +import torch.nn as nn +from vllm.config import CacheConfig, VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.utils import make_empty_intermediate_tensors_factory +from vllm.sequence import IntermediateTensors + +from .minicpm4_hf_compat import ( + _apply_rotary_pos_emb, + _MiniCPMLongRoPE, + _MiniCPMMLP, +) + +logger = init_logger(__name__) + + +def _resolve_lm_cfg(config: Any) -> Any: + """Extract lm_config from VoxCPM2Config, converting dict to namespace if needed.""" + lm_cfg = getattr(config, "lm_config", config) + if isinstance(lm_cfg, dict): + + class _Cfg: + pass + + c = _Cfg() + for k, v in lm_cfg.items(): + setattr(c, k, v) + return c + return lm_cfg + + +# =================================================================== +# Attention with vllm PagedAttention backend +# =================================================================== + + +class _PagedMiniCPM4Attention(nn.Module): + """PagedAttention + fp32 RoPE with separate q/k/v projections.""" + + def __init__( + self, + hidden_size: int, + num_attention_heads: int, + num_key_value_heads: int, + kv_channels: int | None, + layer_idx: int, + cache_config: CacheConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.layer_idx = layer_idx + self.hidden_size = hidden_size + self.num_heads = num_attention_heads + self.head_dim = kv_channels if kv_channels else hidden_size // num_attention_heads + self.num_kv_heads = num_key_value_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + + self.q_proj = nn.Linear(hidden_size, self.q_size, bias=False) + self.k_proj = nn.Linear(hidden_size, self.kv_size, bias=False) + self.v_proj = nn.Linear(hidden_size, self.kv_size, bias=False) + self.o_proj = nn.Linear(self.q_size, hidden_size, bias=False) + self._fused_qkv_weight: torch.Tensor | None = None + + self.attn = Attention( + self.num_heads, + self.head_dim, + scale=self.head_dim**-0.5, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + prefix=f"{prefix}.attn", + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + rope_emb: _MiniCPMLongRoPE | None = None, + ) -> torch.Tensor: + """Forward: fused QKV → fp32 RoPE → PagedAttention → o_proj.""" + if self._fused_qkv_weight is None: + self._fused_qkv_weight = torch.cat( + [ + self.q_proj.weight, + self.k_proj.weight, + self.v_proj.weight, + ], + dim=0, + ).detach() + qkv = nn.functional.linear(hidden_states, self._fused_qkv_weight) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + if rope_emb is not None: + cos, sin = rope_emb(positions) + bsz = q.shape[0] + q_r = q.view(bsz, self.num_heads, self.head_dim) + k_r = k.view(bsz, self.num_kv_heads, self.head_dim) + q_r = q_r.unsqueeze(0).transpose(1, 2) # [1, heads, n_tokens, dim] + k_r = k_r.unsqueeze(0).transpose(1, 2) # [1, kv_heads, n_tokens, dim] + q_r, k_r = _apply_rotary_pos_emb(q_r, k_r, cos, sin) + q = q_r.transpose(1, 2).squeeze(0).reshape(bsz, -1) # [n_tokens, q_size] + k = k_r.transpose(1, 2).squeeze(0).reshape(bsz, -1) # [n_tokens, kv_size] + + attn_output = self.attn(q, k, v) + + output = self.o_proj(attn_output) + return 
output + + +# =================================================================== +# Decoder Layer +# =================================================================== + + +class _PagedMiniCPM4DecoderLayer(nn.Module): + """Decoder layer: PagedAttention + fp32 RMSNorm + muP scale_depth.""" + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + num_attention_heads: int, + num_key_value_heads: int, + kv_channels: int | None, + rms_norm_eps: float, + layer_idx: int, + num_hidden_layers: int, + use_mup: bool, + scale_depth: float, + cache_config: CacheConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.self_attn = _PagedMiniCPM4Attention( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + kv_channels=kv_channels, + layer_idx=layer_idx, + cache_config=cache_config, + prefix=f"{prefix}.self_attn", + ) + self.mlp = _MiniCPMMLP(hidden_size, intermediate_size) + self.input_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps) + self.post_attention_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps) + + self.use_mup = use_mup + self.scale_depth = scale_depth + self.num_hidden_layers = num_hidden_layers + + def _residual_scale(self) -> float: + if self.use_mup: + return self.scale_depth / math.sqrt(self.num_hidden_layers) + return 1.0 + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + rope_emb: _MiniCPMLongRoPE | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + # Pre-norm + attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn(positions, hidden_states, rope_emb) + + scale = self._residual_scale() + if scale != 1.0: + hidden_states = residual + hidden_states * scale + else: + hidden_states = residual + hidden_states + + # Pre-norm + FFN + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + + if scale != 1.0: + hidden_states = residual + hidden_states * scale + else: + hidden_states = residual + hidden_states + + return hidden_states, None + + +# =================================================================== +# Full Model +# =================================================================== + + +class MiniCPM4PagedForVoxCPM2(nn.Module): + """PagedAttention base_lm (28 layers) for VoxCPM2 scaffold.""" + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + self.config = config + + lm_cfg = _resolve_lm_cfg(config) + + hidden_size = lm_cfg.hidden_size + num_hidden_layers = lm_cfg.num_hidden_layers + kv_channels = getattr(lm_cfg, "kv_channels", None) + + self.vocab_size = lm_cfg.vocab_size + self.embed_tokens = nn.Embedding(self.vocab_size, hidden_size) + + rope_scaling = getattr(lm_cfg, "rope_scaling", None) + if isinstance(rope_scaling, dict): + rope_scaling_dict = rope_scaling + elif hasattr(rope_scaling, "__dict__"): + rope_scaling_dict = { + "short_factor": rope_scaling.short_factor, + "long_factor": rope_scaling.long_factor, + "original_max_position_embeddings": rope_scaling.original_max_position_embeddings, + } + else: + rope_scaling_dict = {} + + no_rope = getattr(lm_cfg, "no_rope", False) + if not no_rope: + self.rope_emb = _MiniCPMLongRoPE( + hidden_size=hidden_size, + num_attention_heads=lm_cfg.num_attention_heads, + 
kv_channels=kv_channels, + rope_theta=getattr(lm_cfg, "rope_theta", 10000.0), + max_position_embeddings=getattr(lm_cfg, "max_position_embeddings", 32768), + rope_scaling=rope_scaling_dict, + ) + else: + self.rope_emb = None + + self.layers = nn.ModuleList( + [ + _PagedMiniCPM4DecoderLayer( + hidden_size=hidden_size, + intermediate_size=lm_cfg.intermediate_size, + num_attention_heads=lm_cfg.num_attention_heads, + num_key_value_heads=lm_cfg.num_key_value_heads, + kv_channels=kv_channels, + rms_norm_eps=lm_cfg.rms_norm_eps, + layer_idx=i, + num_hidden_layers=num_hidden_layers, + use_mup=getattr(lm_cfg, "use_mup", False), + scale_depth=getattr(lm_cfg, "scale_depth", 1.0), + cache_config=cache_config, + prefix=f"{prefix}.layers.{i}", + ) + for i in range(num_hidden_layers) + ] + ) + + self.norm = RMSNorm(hidden_size, eps=lm_cfg.rms_norm_eps) + + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], hidden_size + ) + + use_mup = getattr(lm_cfg, "use_mup", False) + self._scale_emb = getattr(lm_cfg, "scale_emb", 1.0) if use_mup else 1.0 + self._compiled_layers: set[int] = set() + + def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor: + return self.embed_tokens(input_ids) * self._scale_emb + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: Any, + ) -> torch.Tensor | IntermediateTensors: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + + residual = None + for layer in self.layers: + hidden_states, residual = layer( + positions, + hidden_states, + residual, + self.rope_emb, + ) + + hidden_states = self.norm(hidden_states) + return hidden_states + + def compile_selective(self) -> list[str]: + """Compile MLP + o_proj; keep RMSNorm/RoPE eager for precision.""" + compiled: list[str] = [] + for i, layer in enumerate(self.layers): + if i in self._compiled_layers: + continue + try: + layer.mlp = torch.compile( + layer.mlp, + mode="default", + fullgraph=True, + ) + layer.self_attn.o_proj = torch.compile( + layer.self_attn.o_proj, + mode="default", + fullgraph=True, + ) + layer.self_attn._fused_qkv_weight = None + self._compiled_layers.add(i) + if i == 0: + compiled.append(f"layers.*.mlp (×{len(self.layers)})") + compiled.append(f"layers.*.self_attn.o_proj (×{len(self.layers)})") + except Exception as e: + logger.warning("compile_selective: layer %d failed: %s", i, e) + break + return compiled + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load weights from native checkpoint (base_lm. 
prefix pre-stripped).""" + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded: set[str] = set() + + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + param = params_dict.get(name) + if param is None: + continue + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded.add(name) + + return loaded + + +# =================================================================== +# Residual LM with PagedAttention (no RoPE, 8 layers) +# =================================================================== + + +class MiniCPM4PagedResidualLM(nn.Module): + """PagedAttention residual LM (8 layers, no RoPE) for VoxCPM2.""" + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + self.config = config + + lm_cfg = _resolve_lm_cfg(config) + + hidden_size = lm_cfg.hidden_size + num_hidden_layers = getattr(config, "residual_lm_num_layers", 8) + kv_channels = getattr(lm_cfg, "kv_channels", None) + + self.rope_emb = None + + self.layers = nn.ModuleList( + [ + _PagedMiniCPM4DecoderLayer( + hidden_size=hidden_size, + intermediate_size=lm_cfg.intermediate_size, + num_attention_heads=lm_cfg.num_attention_heads, + num_key_value_heads=lm_cfg.num_key_value_heads, + kv_channels=kv_channels, + rms_norm_eps=lm_cfg.rms_norm_eps, + layer_idx=i, + num_hidden_layers=num_hidden_layers, + use_mup=getattr(lm_cfg, "use_mup", False), + scale_depth=getattr(lm_cfg, "scale_depth", 1.0), + cache_config=cache_config, + prefix=f"{prefix}.layers.{i}", + ) + for i in range(num_hidden_layers) + ] + ) + + self.norm = RMSNorm(hidden_size, eps=lm_cfg.rms_norm_eps) + self._compiled_layers: set[int] = set() + + def forward( + self, + positions: torch.Tensor, + inputs_embeds: torch.Tensor, + ) -> torch.Tensor: + hidden_states = inputs_embeds + residual = None + for layer in self.layers: + hidden_states, residual = layer( + positions, + hidden_states, + residual, + self.rope_emb, + ) + hidden_states = self.norm(hidden_states) + return hidden_states + + def compile_selective(self) -> list[str]: + """Compile MLP + o_proj (same as base_lm).""" + compiled: list[str] = [] + for i, layer in enumerate(self.layers): + if i in self._compiled_layers: + continue + try: + layer.mlp = torch.compile(layer.mlp, mode="default", fullgraph=True) + layer.self_attn.o_proj = torch.compile(layer.self_attn.o_proj, mode="default", fullgraph=True) + layer.self_attn._fused_qkv_weight = None + self._compiled_layers.add(i) + if i == 0: + compiled.append(f"layers.*.mlp (×{len(self.layers)})") + compiled.append(f"layers.*.self_attn.o_proj (×{len(self.layers)})") + except Exception as e: + logger.warning("compile_selective: residual layer %d failed: %s", i, e) + return compiled + + def load_weights_from_native(self, native_residual_lm: nn.Module) -> int: + """Load weights from native residual_lm. 
Returns param count.""" + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded = 0 + for name, param in native_residual_lm.named_parameters(): + if "rotary_emb" in name: + continue + target = params_dict.get(name) + if target is None: + continue + weight_loader = getattr(target, "weight_loader", default_weight_loader) + weight_loader(target, param.data) + loaded += 1 + return loaded diff --git a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py index b9faf9fa3b..0898ca59ae 100644 --- a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py +++ b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py @@ -1,33 +1,27 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""VoxCPM2 native AR talker — uses native MiniCPM4 base_lm directly. - -Uses native VoxCPM2 modules (no PagedAttention, manual KV cache). -Each AR decode step: - feat_encoder → base_lm → FSQ → residual_lm → LocDiT → stop - -TODO(PagedAttention): The base_lm is a MiniCPM4 variant (GQA + LongRoPE, -use_mup=False). vllm's MiniCPMModel already supports the architecture -(LongRoPE via Phi3LongRoPEScaledRotaryEmbedding, muP via config), but -two issues block replacing the native base_lm with a vllm MiniCPM4Model: - 1. Per-request state isolation — residual_lm and LocDiT diffusion use - shared native KV caches; concurrent requests clobber each other. - Fix: save/restore residual_lm cache per request, or pool N instances. - 2. Streaming audio — make_omni_output re-decodes all patches each step. - Fix: sliding-window VAE decode (decode_pad pattern from nanovllm). +"""VoxCPM2 AR talker — PagedAttention pipeline with per-request state. + +Architecture: + MiniCPM4PagedForVoxCPM2 (base_lm, 28 layers, PagedAttention + fp32 RoPE) + → FSQ → MiniCPM4PagedResidualLM (8 layers, PagedAttention, no RoPE) + → LocDiT (CFM solver) → AudioVAE → 48kHz waveform """ from __future__ import annotations +import dataclasses +import os +import time from collections.abc import Iterable from typing import Any import librosa import torch import torch.nn as nn +from einops import rearrange from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.model_executor.models.minicpm import MiniCPMModel from vllm.model_executor.models.utils import ( AutoWeightsLoader, WeightsMapper, @@ -37,10 +31,13 @@ from vllm_omni.model_executor.models.output_templates import OmniOutput +from .minicpm4_paged import MiniCPM4PagedForVoxCPM2, MiniCPM4PagedResidualLM from .voxcpm2_import_utils import import_voxcpm2_core logger = init_logger(__name__) +_ENABLE_PROFILING = os.environ.get("VOXCPM2_PROFILE", "0") == "1" + def _encode_raw_audio( tts: nn.Module, @@ -51,34 +48,21 @@ def _encode_raw_audio( """Encode raw audio samples using the native VoxCPM2 AudioVAE. Mirrors ``VoxCPM2Model._encode_wav`` but accepts in-memory samples - instead of a file path. This is needed for the OpenAI speech API - where ``_resolve_ref_audio`` returns decoded audio data. - - Args: - tts: Native VoxCPM2 tts_model instance. - samples: Audio samples (mono, float32). - sr: Sample rate of the input audio. - padding_mode: "right" (default) or "left" padding. - - Returns: - audio_feat: (T, P, D) tensor of latent patches. + instead of a file path (needed for the OpenAI speech API). 
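+
+    Returns a (T, P, D) tensor of latent patches, where P is ``tts.patch_size``
+    and D is ``tts.audio_vae.latent_dim``.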
""" if isinstance(samples, list): audio = torch.tensor(samples, dtype=torch.float32) else: audio = samples.float() - if audio.ndim == 1: audio = audio.unsqueeze(0) - # Resample to the model's expected encoding sample rate encode_sr = tts._encode_sample_rate if sr != encode_sr: audio_np = audio.squeeze(0).numpy() audio_np = librosa.resample(audio_np, orig_sr=sr, target_sr=encode_sr) audio = torch.from_numpy(audio_np).unsqueeze(0) - # Pad to patch boundary patch_len = tts.patch_size * tts.chunk_size if audio.size(1) % patch_len != 0: padding_size = patch_len - audio.size(1) % patch_len @@ -89,48 +73,301 @@ def _encode_raw_audio( return feat.view(tts.audio_vae.latent_dim, -1, tts.patch_size).permute(1, 2, 0) -class VoxCPM2TalkerForConditionalGeneration(nn.Module): - """VoxCPM2 talker using native MiniCPM4 base_lm. +# =================================================================== +# Per-request state +# =================================================================== + + +@dataclasses.dataclass +class _RequestState: + request_id: str + curr_embed_for_next: torch.Tensor | None = None + prev_feat_embed: torch.Tensor | None = None + curr_prefix_feat_cond: torch.Tensor | None = None + last_audio_patch_gpu: torch.Tensor | None = None + precomputed_stop_logits: torch.Tensor | None = None + accumulated_patches: list[torch.Tensor] = dataclasses.field(default_factory=list) + decode_step_count: int = 0 + request_start_time: float = 0.0 + prefill_completed: bool = False + prefill_text: str = "" + prompt_cache: dict | None = None + prefill_masks: tuple | None = None + is_stopping: bool = False + last_decoded_audio: torch.Tensor | None = None + + +# =================================================================== +# Profiling timer +# =================================================================== + + +class _PerfTimer: + __slots__ = ("_enabled", "_timers", "_counts", "_starts", "_pairs") + + def __init__(self, enabled: bool = False): + self._enabled = enabled + self._timers: dict[str, float] = {} + self._counts: dict[str, int] = {} + self._starts: dict[str, torch.cuda.Event] = {} + self._pairs: list[tuple[str, torch.cuda.Event, torch.cuda.Event]] = [] + + def start(self, name: str) -> None: + if not self._enabled: + return + evt = torch.cuda.Event(enable_timing=True) + evt.record() + self._starts[name] = evt + + def stop(self, name: str) -> None: + if not self._enabled or name not in self._starts: + return + start_evt = self._starts.pop(name) + end_evt = torch.cuda.Event(enable_timing=True) + end_evt.record() + self._pairs.append((name, start_evt, end_evt)) + + def _resolve(self) -> None: + if not self._pairs: + return + torch.cuda.synchronize() + for name, s, e in self._pairs: + self._timers[name] = self._timers.get(name, 0.0) + s.elapsed_time(e) + self._counts[name] = self._counts.get(name, 0) + 1 + self._pairs.clear() + + def breakdown(self) -> str: + if not self._enabled: + return "" + self._resolve() + if not self._timers: + return "" + total = self._timers.get("decode_step", sum(self._timers.values())) + lines = [ + "=== VoxCPM2 Decode Step Breakdown ===", + f"{'Component':<30} | {'ms':>10} | {'%':>6} | {'N':>5} | {'avg':>8}", + "-" * 70, + ] + for name in sorted(self._timers): + t, c = self._timers[name], self._counts[name] + lines.append(f"{name:<30} | {t:>10.2f} | {t / total * 100:>5.1f}% | {c:>5} | {t / c:>8.3f}") + lines.append(f"{'TOTAL':<30} | {total:>10.2f} |") + return "\n".join(lines) + + def reset(self) -> None: + self._timers.clear() + self._counts.clear() + 
self._starts.clear() + self._pairs.clear() + + +# =================================================================== +# CFM pre-allocated buffers + optimized Euler solver +# =================================================================== + + +class _CFMBufferManager: + def __init__( + self, + device: torch.device, + dtype: torch.dtype, + feat_dim: int, + patch_size: int, + dit_hidden_size: int, + max_batch_size: int = 1, + sway_sampling_coef: float = 1.0, + ): + n = 2 * max_batch_size # CFG doubles the batch + self.x_in = torch.zeros(n, feat_dim, patch_size, device=device, dtype=dtype) + self.mu_in = torch.zeros(n, dit_hidden_size, device=device, dtype=dtype) + self.t_in = torch.zeros(n, device=device, dtype=dtype) + self.dt_in = torch.zeros(n, device=device, dtype=dtype) + self.cond_in = torch.zeros(n, feat_dim, patch_size, device=device, dtype=dtype) + self.noise = torch.zeros(max_batch_size, feat_dim, patch_size, device=device, dtype=dtype) + self._sway_coef = sway_sampling_coef + self._device = device + self._dtype = dtype + self.t_span_10 = self._make_t_span(10) + + def _make_t_span(self, n: int) -> torch.Tensor: + t = torch.linspace(1, 0, n + 1, device=self._device, dtype=self._dtype) + return t + self._sway_coef * (torch.cos(torch.pi / 2 * t) - 1 + t) + + def get_t_span(self, n: int) -> torch.Tensor: + return self.t_span_10 if n == 10 else self._make_t_span(n) + + +def _optimized_solve_euler( + cfm_module: nn.Module, + mu: torch.Tensor, + patch_size: int, + cond: torch.Tensor, + n_timesteps: int, + cfg_value: float, + buffers: _CFMBufferManager, + use_cfg_zero_star: bool = True, + cfg_cutoff_ratio: float = 1.0, + perf: _PerfTimer | None = None, +) -> torch.Tensor: + estimator = cfm_module.estimator + mean_mode = getattr(cfm_module, "mean_mode", False) + b = mu.size(0) + + buffers.noise[:b].normal_() + x = buffers.noise[:b].clone() + + t_span = buffers.get_t_span(n_timesteps) + t, dt = t_span[0], t_span[0] - t_span[1] + zero_init_steps = max(1, int(len(t_span) * 0.04)) + cfg_cutoff_step = max(zero_init_steps + 1, int(len(t_span) * cfg_cutoff_ratio)) + + for step in range(1, len(t_span)): + if use_cfg_zero_star and step <= zero_init_steps: + dphi_dt = torch.zeros_like(x) + elif step <= cfg_cutoff_step: + buffers.x_in[:b].copy_(x) + buffers.x_in[b : 2 * b].copy_(x) + buffers.mu_in[:b].copy_(mu) + buffers.mu_in[b : 2 * b].zero_() + buffers.t_in[:b].fill_(t.item()) + buffers.t_in[b : 2 * b].fill_(t.item()) + if mean_mode: + buffers.dt_in[:b].fill_(dt.item()) + buffers.dt_in[b : 2 * b].fill_(dt.item()) + else: + buffers.dt_in.zero_() + buffers.cond_in[:b].copy_(cond[:b]) + buffers.cond_in[b : 2 * b].copy_(cond[:b]) + + if perf: + perf.start(" cfm.estimator_cfg") + raw_out = estimator( + buffers.x_in[: 2 * b], + buffers.mu_in[: 2 * b], + buffers.t_in[: 2 * b], + buffers.cond_in[: 2 * b], + buffers.dt_in[: 2 * b], + ) + if perf: + perf.stop(" cfm.estimator_cfg") + + dphi_dt, cfg_dphi_dt = raw_out[:b], raw_out[b : 2 * b] + if use_cfg_zero_star: + pos = dphi_dt.reshape(b, -1) + neg = cfg_dphi_dt.reshape(b, -1) + st = torch.sum(pos * neg, 1, keepdim=True) / (torch.sum(neg**2, 1, keepdim=True) + 1e-8) + st = st.view(b, *([1] * (len(dphi_dt.shape) - 1))) + else: + st = 1.0 + dphi_dt = cfg_dphi_dt * st + cfg_value * (dphi_dt - cfg_dphi_dt * st) + else: + buffers.x_in[:b].copy_(x) + buffers.mu_in[:b].copy_(mu) + buffers.t_in[:b].fill_(t.item()) + if mean_mode: + buffers.dt_in[:b].fill_(dt.item()) + else: + buffers.dt_in[:b].zero_() + buffers.cond_in[:b].copy_(cond[:b]) + if perf: + 
perf.start(" cfm.estimator_nocfg") + dphi_dt = estimator( + buffers.x_in[:b], buffers.mu_in[:b], buffers.t_in[:b], buffers.cond_in[:b], buffers.dt_in[:b] + ) + if perf: + perf.stop(" cfm.estimator_nocfg") - Loads the full VoxCPM2 model natively and decomposes the AR loop: - each vllm decode step runs one iteration of the native generate loop. - """ + x = x - dt * dphi_dt + t = t - dt + if step < len(t_span) - 1: + dt = t - t_span[step + 1] + return x + +# =================================================================== +# Main talker model +# =================================================================== + + +class VoxCPM2TalkerForConditionalGeneration(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.vllm_config = vllm_config self.config = vllm_config.model_config.hf_config - # Flags for OmniGPUModelRunner self.have_multimodal_outputs = True self.has_preprocess = True self.has_postprocess = True - self._accumulated_patches: list[torch.Tensor] = [] - # vllm MiniCPMModel scaffold — needed for warmup/profiling/KV cache - # sizing. Not used for actual computation (native modules are used). - self.model = MiniCPMModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + self.model = MiniCPM4PagedForVoxCPM2( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), + ) + self.residual_model = MiniCPM4PagedResidualLM( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "residual_model"), + ) self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors - # Placeholder — actual native model loaded in load_weights self._tts: nn.Module | None = None self._device = "cuda" self._side_dtype = torch.bfloat16 - # Config values self._patch_size = getattr(self.config, "patch_size", 4) self._feat_dim = getattr(self.config, "feat_dim", 64) + self._sample_rate = getattr(self.config, "sample_rate", 48000) + self._inference_timesteps = 10 self._cfg_value = 2.0 - - # TODO: implement sliding-window VAE decode (nanovllm pattern) - # for O(1) per-step streaming. Current impl re-decodes all patches. 
+ self._cfg_cutoff_ratio = 1.0 + self._vae_decode_interval = 5 + self._enable_torch_compile = True + self._compile_vae = True + self._max_decode_steps = 2000 + self._max_batch_size = getattr(vllm_config.scheduler_config, "max_num_seqs", 4) + + self._perf = _PerfTimer(enabled=_ENABLE_PROFILING) + self._cfm_buffers: _CFMBufferManager | None = None + + self._active_states: dict[str, _RequestState] = {} + self._current_request_id: str | None = None + self._pending_requests: list[tuple[str, bool, torch.Tensor | None, int]] = [] + self._results_queue: list[tuple[str, torch.Tensor | None]] = [] + self._audio_queue: list[tuple[str, Any]] = [] + self._deferred_cleanup_ids: set[str] = set() @property def tts(self) -> nn.Module: assert self._tts is not None, "Model not loaded yet" return self._tts + # -------------------- request state management -------------------- + + def _get_or_create_state(self, request_id: str) -> _RequestState: + if request_id not in self._active_states: + self._active_states[request_id] = _RequestState(request_id=request_id) + return self._active_states[request_id] + + def _switch_to_request(self, request_id: str) -> _RequestState: + if request_id != self._current_request_id: + self._current_request_id = request_id + return self._get_or_create_state(request_id) + + def _cleanup_request(self, request_id: str) -> None: + self._active_states.pop(request_id, None) + if self._current_request_id == request_id: + self._current_request_id = None + + def on_requests_finished(self, finished_req_ids: set[str] | list[str]) -> None: + # Defer cleanup: on_requests_finished is called before forward(), + # so we must not delete state that the current step may still need. + self._deferred_cleanup_ids.update(finished_req_ids) + + def _flush_deferred_cleanup(self) -> None: + for req_id in self._deferred_cleanup_ids: + self._cleanup_request(req_id) + self._deferred_cleanup_ids.clear() + def _build_prompt_cache( self, ref_audio: Any = None, @@ -141,20 +378,19 @@ def _build_prompt_cache( The OpenAI speech API sends decoded audio as [samples_list, sr] via ``_resolve_ref_audio``, while offline usage sends file paths. - This method detects the format and routes accordingly. 
""" tts = self.tts def _is_raw_audio(v: Any) -> bool: - """Check if value is [samples, sr] from serving_speech.""" + import numbers + return ( isinstance(v, (list, tuple)) and len(v) == 2 - and isinstance(v[1], int) + and isinstance(v[1], numbers.Integral) and isinstance(v[0], (list, torch.Tensor)) ) - # If all inputs are file paths (or None), use native build_prompt_cache if not _is_raw_audio(ref_audio) and not _is_raw_audio(prompt_audio): return tts.build_prompt_cache( prompt_text=prompt_text, @@ -162,39 +398,21 @@ def _is_raw_audio(v: Any) -> bool: reference_wav_path=ref_audio, ) - # Raw audio path: encode directly cache: dict[str, Any] = {} - if ref_audio is not None: if _is_raw_audio(ref_audio): samples, sr = ref_audio - cache["ref_audio_feat"] = _encode_raw_audio( - tts, - samples, - sr, - padding_mode="right", - ) + cache["ref_audio_feat"] = _encode_raw_audio(tts, samples, sr) else: - cache["ref_audio_feat"] = tts._encode_wav( - ref_audio, - padding_mode="right", - ) + cache["ref_audio_feat"] = tts._encode_wav(ref_audio, padding_mode="right") if prompt_audio is not None and prompt_text is not None: cache["prompt_text"] = prompt_text if _is_raw_audio(prompt_audio): samples, sr = prompt_audio - cache["audio_feat"] = _encode_raw_audio( - tts, - samples, - sr, - padding_mode="left", - ) + cache["audio_feat"] = _encode_raw_audio(tts, samples, sr, padding_mode="left") else: - cache["audio_feat"] = tts._encode_wav( - prompt_audio, - padding_mode="left", - ) + cache["audio_feat"] = tts._encode_wav(prompt_audio, padding_mode="left") has_ref = "ref_audio_feat" in cache has_prompt = "audio_feat" in cache @@ -207,12 +425,95 @@ def _is_raw_audio(v: Any) -> bool: return cache + # -------------------- compile setup -------------------- + + def _setup_cfm_buffers(self) -> None: + if self._cfm_buffers is not None: + return + tts = self.tts + dit_hidden = tts.lm_to_dit_proj.out_features + tts.res_to_dit_proj.out_features + self._cfm_buffers = _CFMBufferManager( + device=torch.device(self._device), + dtype=self._side_dtype, + feat_dim=self._feat_dim, + patch_size=self._patch_size, + dit_hidden_size=dit_hidden, + max_batch_size=self._max_batch_size, + ) + + def _setup_torch_compile(self) -> None: + if not self._enable_torch_compile: + return + tts = self.tts + estimator = tts.feat_decoder.estimator + if hasattr(estimator, "_compiled"): + return + + targets: list[str] = [] + + try: + tts.feat_decoder.estimator = torch.compile(estimator, mode="reduce-overhead", fullgraph=False) + tts.feat_decoder.estimator._compiled = True + targets.append("LocDiT") + except Exception as e: + logger.warning("torch.compile LocDiT failed: %s", e) + + try: + if not hasattr(tts.feat_encoder, "_compiled"): + tts.feat_encoder = torch.compile(tts.feat_encoder, mode="reduce-overhead", fullgraph=False) + tts.feat_encoder._compiled = True + targets.append("feat_encoder") + except Exception as e: + logger.warning("torch.compile feat_encoder failed: %s", e) + + if self._compile_vae: + try: + if not hasattr(tts.audio_vae, "_compiled"): + tts.audio_vae.decode = torch.compile(tts.audio_vae.decode, mode="reduce-overhead", fullgraph=False) + tts.audio_vae._compiled = True + targets.append("AudioVAE") + except Exception as e: + logger.warning("torch.compile AudioVAE failed: %s", e) + + if not getattr(self.model, "_selective_compiled", False): + try: + targets.extend(f"scaffold.{t}" for t in self.model.compile_selective()) + self.model._selective_compiled = True + except Exception as e: + logger.warning("scaffold compile failed: %s", e) + 
+ if not getattr(self.residual_model, "_selective_compiled", False): + try: + targets.extend(f"residual.{t}" for t in self.residual_model.compile_selective()) + self.residual_model._selective_compiled = True + except Exception as e: + logger.warning("residual compile failed: %s", e) + + if not getattr(self, "_projections_compiled", False): + try: + self._compiled_dit_proj = torch.compile(self._dit_proj_fn, mode="default", fullgraph=True) + self._compiled_stop_fn = torch.compile(self._stop_fn, mode="default", fullgraph=True) + self._projections_compiled = True + targets.append("projections") + except Exception as e: + self._compiled_dit_proj = self._compiled_stop_fn = None + logger.warning("projections compile failed: %s", e) + + if targets: + logger.info("VoxCPM2: torch.compile applied to: %s", ", ".join(targets)) + + def _dit_proj_fn(self, lm_h: torch.Tensor, res_h: torch.Tensor) -> torch.Tensor: + tts = self.tts + return torch.cat([tts.lm_to_dit_proj(lm_h), tts.res_to_dit_proj(res_h)], dim=-1) + + def _stop_fn(self, lm_h: torch.Tensor) -> torch.Tensor: + tts = self.tts + return tts.stop_head(tts.stop_actn(tts.stop_proj(lm_h))) + # -------------------- vllm hooks -------------------- def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor: - """Embed input IDs using native base_lm with scale_emb.""" - embeds = self.tts.base_lm.embed_tokens(input_ids) - return embeds * self.tts.config.lm_config.scale_emb + return self.model.embed_input_ids(input_ids) def forward( self, @@ -222,8 +523,9 @@ def forward( inputs_embeds: torch.Tensor | None = None, **kwargs: Any, ) -> torch.Tensor | IntermediateTensors: - """Full VoxCPM2 AR step: base_lm → FSQ → residual_lm → diffusion.""" - # Always run scaffold model to keep FlashInfer/attention happy + self._perf.start("forward_total") + dev = input_ids.device + model_output = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) if isinstance(model_output, IntermediateTensors): return model_output @@ -231,368 +533,315 @@ def forward( if isinstance(scaffold_hidden, tuple): scaffold_hidden = scaffold_hidden[0] - # Real computation: use native modules - has_infos = bool(getattr(self, "_current_step_infos", None)) - is_prefill = scaffold_hidden.shape[0] > 1 - - if is_prefill and has_infos: - self._forward_prefill(inputs_embeds, scaffold_hidden.device) - # Return scaffold output (right shape for engine) — our side - # computation results are stored in instance state - return scaffold_hidden - - if not is_prefill and hasattr(self, "_prev_feat_embed"): - self._forward_decode(inputs_embeds, scaffold_hidden.device) - return scaffold_hidden - + # Phase 1: per-request FSQ + residual input + token_offset = 0 + residual_inputs: list[torch.Tensor] = [] + residual_positions: list[torch.Tensor] = [] + req_metas: list[tuple] = [] + + for req_id, is_prefill, _req_embeds, n in self._pending_requests: + state = self._switch_to_request(req_id) + req_hidden = scaffold_hidden[token_offset : token_offset + n] + req_pos = positions[token_offset : token_offset + n] + + if is_prefill: + res_input, meta = self._prepare_residual_prefill(state, req_hidden, dev) + elif state.prefill_completed: + res_input, meta = self._prepare_residual_decode(state, req_hidden, dev) + else: + token_offset += n + self._results_queue.append((req_id, None)) + self._audio_queue.append((req_id, None)) + continue + + residual_inputs.append(res_input) + residual_positions.append(req_pos) + req_metas.append((state, is_prefill, meta)) + token_offset += n + + # Phase 2: batch 
residual_model forward + if residual_inputs: + batch_in = torch.cat(residual_inputs, dim=0) + batch_pos = torch.cat(residual_positions, dim=0) + batch_out = self.residual_model(batch_pos, batch_in) + + # Phase 3: per-request LocDiT + update + offset = 0 + for idx, (state, is_prefill, meta) in enumerate(req_metas): + n = residual_inputs[idx].shape[0] + res_out = batch_out[offset : offset + n] + offset += n + + if is_prefill: + self._finish_prefill(state, meta, res_out, dev) + else: + self._finish_decode(state, meta, res_out, dev) + + self._results_queue.append((state.request_id, state.precomputed_stop_logits)) + self._audio_queue.append((state.request_id, self._collect_audio(state))) + + self._pending_requests.clear() + self._flush_deferred_cleanup() + self._perf.stop("forward_total") return scaffold_hidden - def _build_prefill_inputs(self, text: str, dev: Any): - """Build text_token / audio_feat / masks like native _generate_with_prompt_cache. + # -------------------- prefill / decode helpers -------------------- - Returns a dict with keys: text_token, audio_feat, text_mask, audio_mask, - prefix_feat_cond. Handles zero-shot, reference (voice clone), continuation, - and ref_continuation modes. - """ + def _prepare_residual_prefill(self, state: _RequestState, base_lm_out: torch.Tensor, dev: Any): tts = self.tts - dtype = self._side_dtype - cache = getattr(self, "_prompt_cache", None) - mode = cache.get("mode", "continuation") if cache else "zero_shot" - - if cache is not None and mode in ("continuation", "ref_continuation"): - full_text = cache.get("prompt_text", "") + text - else: - full_text = text - - text_token = torch.LongTensor(tts.text_tokenizer(full_text)) - text_token = torch.cat( - [ - text_token, - torch.tensor([tts.audio_start_token], dtype=torch.int32, device=text_token.device), - ], - dim=-1, - ) - text_length = text_token.shape[0] - latent_dim = tts.audio_vae.latent_dim - patch_size = tts.patch_size - - if mode in ("zero_shot", "continuation"): - prompt_audio_feat = ( - cache["audio_feat"] if cache else torch.empty((0, patch_size, latent_dim), dtype=torch.float32) - ) - audio_length = prompt_audio_feat.size(0) - text_pad_token = torch.zeros(audio_length, dtype=torch.int32) - text_pad_feat = torch.zeros((text_length, patch_size, latent_dim), dtype=torch.float32) - text_token = torch.cat([text_token, text_pad_token]) - audio_feat = torch.cat([text_pad_feat, prompt_audio_feat], dim=0) - text_mask = torch.cat( - [ - torch.ones(text_length, dtype=torch.int32), - torch.zeros(audio_length, dtype=torch.int32), - ] + text_mask, feat_mask, feat, feat_embed = state.prefill_masks + state.prefill_masks = None + + tts_len = text_mask.shape[1] + scaffold_len = base_lm_out.shape[0] + + if scaffold_len < tts_len: + # Voice clone / continuation: scaffold only processed vllm tokens. + # Pad to match TTS sequence length (extra positions are masked out). 
+ pad = torch.zeros( + tts_len - scaffold_len, + base_lm_out.shape[-1], + device=base_lm_out.device, + dtype=base_lm_out.dtype, ) - audio_mask = torch.cat( - [ - torch.zeros(text_length, dtype=torch.int32), - torch.ones(audio_length, dtype=torch.int32), - ] - ) - elif mode == "reference": - ref_audio_feat = cache["ref_audio_feat"] - ref_tokens, ref_feats, ref_t_mask, ref_a_mask = tts._make_ref_prefix(ref_audio_feat, text_token.device) - text_pad_feat = torch.zeros((text_length, patch_size, latent_dim), dtype=torch.float32) - text_token = torch.cat([ref_tokens.cpu(), text_token]) - audio_feat = torch.cat([ref_feats.cpu(), text_pad_feat], dim=0) - text_mask = torch.cat([ref_t_mask.cpu(), torch.ones(text_length, dtype=torch.int32)]) - audio_mask = torch.cat([ref_a_mask.cpu(), torch.zeros(text_length, dtype=torch.int32)]) + enc_out = torch.cat([base_lm_out, pad], dim=0).unsqueeze(0) else: - # ref_continuation - ref_audio_feat = cache["ref_audio_feat"] - prompt_audio_feat = cache["audio_feat"] - prompt_audio_length = prompt_audio_feat.size(0) - ref_tokens, ref_feats, ref_t_mask, ref_a_mask = tts._make_ref_prefix(ref_audio_feat, text_token.device) - prompt_pad_token = torch.zeros(prompt_audio_length, dtype=torch.int32) - text_pad_feat = torch.zeros((text_length, patch_size, latent_dim), dtype=torch.float32) - text_token = torch.cat([ref_tokens.cpu(), text_token, prompt_pad_token]) - audio_feat = torch.cat([ref_feats.cpu(), text_pad_feat, prompt_audio_feat], dim=0) - text_mask = torch.cat( - [ - ref_t_mask.cpu(), - torch.ones(text_length, dtype=torch.int32), - torch.zeros(prompt_audio_length, dtype=torch.int32), - ] - ) - audio_mask = torch.cat( - [ - ref_a_mask.cpu(), - torch.zeros(text_length, dtype=torch.int32), - torch.ones(prompt_audio_length, dtype=torch.int32), - ] - ) - - return { - "text_token": text_token.unsqueeze(0).to(dev), - "audio_feat": audio_feat.unsqueeze(0).to(dev).to(dtype), - "text_mask": text_mask.unsqueeze(0).to(dev), - "audio_mask": audio_mask.unsqueeze(0).to(dev), - } + enc_out = base_lm_out.unsqueeze(0) - def _forward_prefill(self, inputs_embeds: torch.Tensor, dev: Any) -> torch.Tensor: - """Prefill: build combined embeds, run base_lm + residual_lm + first diffusion. - - Uses the same path as native ``VoxCPM2Model._inference`` so zero-shot, - voice cloning (reference), continuation, and ref_continuation modes - all share the same code. - """ - tts = self.tts - dtype = self._side_dtype - text = getattr(self, "_prefill_text", None) - if text is None: - # Fallback (should not hit at runtime; preprocess sets this) - text = "" - - inputs = self._build_prefill_inputs(text, dev) - text_token = inputs["text_token"] - feat = inputs["audio_feat"] - text_mask = inputs["text_mask"] - feat_mask = inputs["audio_mask"] - - # Compose combined_embed exactly like native _inference - feat_embed = tts.feat_encoder(feat) - feat_embed = tts.enc_to_lm_proj(feat_embed) - scale_emb = tts.config.lm_config.scale_emb if tts.config.lm_config.use_mup else 1.0 - text_embed = tts.base_lm.embed_tokens(text_token) * scale_emb - combined_embed = text_mask.unsqueeze(-1) * text_embed + feat_mask.unsqueeze(-1) * feat_embed - - # last audio patch becomes initial prefix_feat_cond (zeros for zero-shot, - # last reference/prompt patch for voice clone / continuation) prefix_feat_cond = ( feat[:, -1, ...] 
if feat.shape[1] > 0 - else torch.zeros(1, tts.patch_size, tts.feat_dim, device=dev, dtype=dtype) + else torch.zeros(1, self._patch_size, self._feat_dim, device=dev, dtype=self._side_dtype) ) - - # Base LM prefill - tts.base_lm.setup_cache(1, 4096, dev, dtype) - enc_out, enc_kv = tts.base_lm(inputs_embeds=combined_embed, is_causal=True) - tts.base_lm.kv_cache.fill_caches(enc_kv) - - # FSQ: identity on text positions, quantized on audio positions enc_outputs = tts.fsq_layer(enc_out) * feat_mask.unsqueeze(-1) + enc_out * text_mask.unsqueeze(-1) - lm_hidden = enc_outputs[:, -1, :] # [1, H] - - logger.info( - "PREFILL: enc shape=%s last_norm=%.4f", - enc_outputs.shape, - lm_hidden.norm().item(), - ) + lm_hidden = enc_outputs[:, -1, :] - # Residual LM prefill - tts.residual_lm.setup_cache(1, 4096, dev, dtype) residual_input = tts.fusion_concat_proj(torch.cat([enc_outputs, feat_mask.unsqueeze(-1) * feat_embed], dim=-1)) - res_out, res_kv = tts.residual_lm(inputs_embeds=residual_input, is_causal=True) - tts.residual_lm.kv_cache.fill_caches(res_kv) - residual_hidden = res_out[:, -1, :] # [1, H] - - # Precompute stop logits for first compute_logits call - stop_logits = tts.stop_head(tts.stop_actn(tts.stop_proj(lm_hidden))) - self._precomputed_stop_logits = stop_logits.detach() - logger.info("PREFILL stop: %s", stop_logits[0].tolist()) - - # First diffusion step - dit_h = torch.cat( - [ - tts.lm_to_dit_proj(lm_hidden), - tts.res_to_dit_proj(residual_hidden), - ], - dim=-1, - ) - pred_feat = tts.feat_decoder( + meta = {"lm_hidden": lm_hidden, "prefix_feat_cond": prefix_feat_cond} + return residual_input.squeeze(0), meta + + def _prepare_residual_decode(self, state: _RequestState, base_lm_out: torch.Tensor, dev: Any): + tts = self.tts + state.decode_step_count += 1 + + if state.decode_step_count >= self._max_decode_steps: + logger.warning("MAX_DECODE_STEPS for %s (%d), forcing stop", state.request_id, state.decode_step_count) + state.is_stopping = True + + h = base_lm_out.unsqueeze(0) if base_lm_out.ndim == 1 else base_lm_out + lm_h = tts.fsq_layer(h) + if lm_h.ndim == 1: + lm_h = lm_h.unsqueeze(0) + + prev = state.prev_feat_embed.to(self._side_dtype) + if prev.ndim == 1: + prev = prev.unsqueeze(0) + res_input = tts.fusion_concat_proj(torch.cat([lm_h, prev], dim=-1)) + return res_input, {"new_lm_hidden": lm_h} + + def _run_cfm(self, dit_h: torch.Tensor, cond: torch.Tensor) -> torch.Tensor: + if self._cfm_buffers is not None: + return _optimized_solve_euler( + self.tts.feat_decoder, + dit_h, + self._patch_size, + cond, + self._inference_timesteps, + self._cfg_value, + self._cfm_buffers, + cfg_cutoff_ratio=self._cfg_cutoff_ratio, + perf=self._perf, + ).transpose(1, 2) + return self.tts.feat_decoder( mu=dit_h, - patch_size=tts.patch_size, - cond=prefix_feat_cond.transpose(1, 2).contiguous(), + patch_size=self._patch_size, + cond=cond, n_timesteps=self._inference_timesteps, cfg_value=self._cfg_value, - ).transpose(1, 2) # [1, P, D] + ).transpose(1, 2) + + def _finish_prefill(self, state: _RequestState, meta: dict, res_out: torch.Tensor, dev: Any): + tts = self.tts + lm_hidden = meta["lm_hidden"] + prefix_feat_cond = meta["prefix_feat_cond"] + residual_hidden = res_out[-1:, :] + + state.precomputed_stop_logits = tts.stop_head(tts.stop_actn(tts.stop_proj(lm_hidden))).detach() + dit_h = torch.cat([tts.lm_to_dit_proj(lm_hidden), tts.res_to_dit_proj(residual_hidden)], dim=-1) + + self._setup_cfm_buffers() + if self._enable_torch_compile: + self._setup_torch_compile() + + pred_feat = self._run_cfm(dit_h, 
prefix_feat_cond.transpose(1, 2).contiguous()) with torch.no_grad(): curr_embed = tts.enc_to_lm_proj(tts.feat_encoder(pred_feat.unsqueeze(1))).squeeze(1) - # Store state for decode steps - self._curr_embed_for_next = curr_embed.detach() - self._prev_feat_embed = curr_embed.detach() - self._curr_prefix_feat_cond = pred_feat[0].detach() - self._last_audio_patch = pred_feat.reshape(1, -1).detach().cpu().float() + state.curr_embed_for_next = curr_embed.detach() + state.prev_feat_embed = curr_embed.detach() + state.curr_prefix_feat_cond = pred_feat[0].detach() + state.last_audio_patch_gpu = pred_feat.detach() + state.decode_step_count = 0 + state.request_start_time = time.perf_counter() + state.prefill_completed = True - logger.info( - "PREFILL patch: norm=%.4f first3=%s", - pred_feat.norm().item(), - pred_feat[0, 0, :3].tolist(), - ) + logger.info("PREFILL[%s]: patch norm=%.4f", state.request_id, pred_feat.norm().item()) + self._perf.reset() - return lm_hidden.to(dtype) - - def _forward_decode(self, inputs_embeds: torch.Tensor | None, dev: Any) -> torch.Tensor: - """Decode step: base_lm → FSQ → residual_lm → diffusion.""" + def _finish_decode(self, state: _RequestState, meta: dict, res_out: torch.Tensor, dev: Any): + self._perf.start("decode_step") tts = self.tts - dtype = self._side_dtype - # 1. Base LM step with curr_embed from previous diffusion - curr_embed = self._curr_embed_for_next.to(dev, dtype=dtype) - if curr_embed.ndim == 2: - curr_embed_3d = curr_embed.unsqueeze(0) # [1, 1, H] - else: - curr_embed_3d = curr_embed - - step_pos = torch.tensor([tts.base_lm.kv_cache.step()], device=dev) - new_hidden = tts.base_lm.forward_step(curr_embed_3d[:, 0, :], step_pos).clone() - - # 2. FSQ - new_lm_hidden = tts.fsq_layer(new_hidden) - if new_lm_hidden.ndim == 1: - new_lm_hidden = new_lm_hidden.unsqueeze(0) - - # 3. Residual LM step - prev_fe = self._prev_feat_embed.to(dtype) - if prev_fe.ndim == 1: - prev_fe = prev_fe.unsqueeze(0) - res_input = tts.fusion_concat_proj(torch.cat([new_lm_hidden, prev_fe], dim=-1)) - res_step_pos = torch.tensor([tts.residual_lm.kv_cache.step()], device=dev) - new_res_hidden = tts.residual_lm.forward_step(res_input, res_step_pos).clone() - if new_res_hidden.ndim == 1: - new_res_hidden = new_res_hidden.unsqueeze(0) - - # 4. Diffusion - p = self._patch_size - pfc = self._curr_prefix_feat_cond.to(dtype).unsqueeze(0) - - dit_h = torch.cat( - [ - tts.lm_to_dit_proj(new_lm_hidden), - tts.res_to_dit_proj(new_res_hidden), - ], - dim=-1, - ) - pred_feat = tts.feat_decoder( - mu=dit_h, - patch_size=p, - cond=pfc.transpose(1, 2).contiguous(), - n_timesteps=self._inference_timesteps, - cfg_value=self._cfg_value, - ).transpose(1, 2) # [1, P, D] + lm_h = meta["new_lm_hidden"] + res_h = res_out.unsqueeze(0) if res_out.ndim == 1 else res_out - # 5. 
feat_encoder → curr_embed - with torch.no_grad(): - curr_embed = tts.enc_to_lm_proj(tts.feat_encoder(pred_feat.unsqueeze(1))).squeeze(1) + dit_proj = getattr(self, "_compiled_dit_proj", None) or self._dit_proj_fn + stop_fn = getattr(self, "_compiled_stop_fn", None) or self._stop_fn + + dit_h = dit_proj(lm_h, res_h) + pfc = state.curr_prefix_feat_cond.to(self._side_dtype) + if pfc.ndim == 2: + pfc = pfc.unsqueeze(0) + + pred_feat = self._run_cfm(dit_h, pfc.transpose(1, 2).contiguous()) + next_embed = tts.enc_to_lm_proj(tts.feat_encoder(pred_feat.unsqueeze(1))).squeeze(1) + + state.precomputed_stop_logits = stop_fn(lm_h).detach() + state.curr_embed_for_next = next_embed.detach() + state.prev_feat_embed = next_embed.detach() + state.curr_prefix_feat_cond = pred_feat[0].detach() + state.last_audio_patch_gpu = pred_feat.detach() + + self._perf.stop("decode_step") + if _ENABLE_PROFILING and state.decode_step_count % 20 == 0: + logger.info("Step %d[%s]:\n%s", state.decode_step_count, state.request_id, self._perf.breakdown()) - # 6. Stop logits - stop_logits = tts.stop_head(tts.stop_actn(tts.stop_proj(new_lm_hidden))) - self._precomputed_stop_logits = stop_logits.detach() + # -------------------- audio collection -------------------- - # 7. Store state - self._curr_embed_for_next = curr_embed.detach() - self._prev_feat_embed = curr_embed.detach() - self._curr_prefix_feat_cond = pred_feat[0].detach() - self._last_audio_patch = pred_feat.reshape(1, -1).detach().cpu().float() + def _collect_audio(self, state: _RequestState) -> torch.Tensor | None: + patch = state.last_audio_patch_gpu + if patch is not None: + state.last_audio_patch_gpu = None + state.accumulated_patches.append(patch.reshape(1, -1).float()) + + if not state.accumulated_patches: + return None + + n = len(state.accumulated_patches) + if n <= 1 or n % self._vae_decode_interval == 0 or state.is_stopping: + self._perf.start("vae_decode") + all_p = torch.cat(state.accumulated_patches, dim=0) + state.accumulated_patches = [all_p] + feat = rearrange(all_p.reshape(1, -1, self._feat_dim), "b t d -> b d t") + with torch.no_grad(): + audio = self.tts.audio_vae.decode(feat.to(self._device)).reshape(-1).cpu().float() + self._perf.stop("vae_decode") + state.last_decoded_audio = audio + return audio + return state.last_decoded_audio - return new_lm_hidden[-1:].detach() + # -------------------- compute_logits -------------------- def compute_logits( - self, - hidden_states: torch.Tensor | OmniOutput, - sampling_metadata: Any = None, + self, hidden_states: torch.Tensor | OmniOutput, sampling_metadata: Any = None ) -> torch.Tensor | None: if isinstance(hidden_states, OmniOutput): hidden_states = hidden_states.text_hidden_states if hidden_states is None: return None - precomputed = getattr(self, "_precomputed_stop_logits", None) - if precomputed is not None: - self._precomputed_stop_logits = None - raw_logits = precomputed[: hidden_states.shape[0]] - else: - # Fallback for warmup - bsz = hidden_states.shape[0] - raw_logits = torch.zeros(bsz, 2, device=hidden_states.device) - raw_logits[:, 0] = 1.0 # continue - - bsz = raw_logits.shape[0] - full_logits = torch.full( - (bsz, self.config.vocab_size), - float("-inf"), - device=raw_logits.device, - dtype=raw_logits.dtype, + bsz = hidden_states.shape[0] + logits = torch.full( + (bsz, self.config.vocab_size), float("-inf"), device=hidden_states.device, dtype=hidden_states.dtype ) - full_logits[:, 0] = raw_logits[:, 0] # continue - full_logits[:, 1] = raw_logits[:, 1] # stop - return full_logits - # 
-------------------- Omni output -------------------- + if self._results_queue: + for i, (req_id, stop_logits) in enumerate(self._results_queue): + if i >= bsz: + break + state = self._active_states.get(req_id) + if stop_logits is not None: + if state is not None and state.is_stopping: + logits[i, 0] = 0.0 + logits[i, 1] = 1.0 + state.precomputed_stop_logits = None + else: + logits[i, 0] = stop_logits[0, 0] + logits[i, 1] = stop_logits[0, 1] + if state is not None: + state.is_stopping = bool(stop_logits[0, 1] > stop_logits[0, 0]) + state.precomputed_stop_logits = None + elif state and state.prefill_completed: + logits[i, 1] = 1.0 + else: + logits[i, 0] = 1.0 + self._results_queue.clear() + else: + logits[:, 0] = 1.0 + return logits + + # -------------------- omni output -------------------- def make_omni_output(self, model_outputs: torch.Tensor | OmniOutput, **kwargs: Any) -> OmniOutput: if isinstance(model_outputs, OmniOutput): return model_outputs - hidden = model_outputs - patch = getattr(self, "_last_audio_patch", None) mm: dict[str, Any] = {} + if self._audio_queue: + audio_by_req = {rid: audio for rid, audio in self._audio_queue} + order = [r for r, _ in self._audio_queue] + mm["model_outputs"] = [audio_by_req.get(r) for r in order] + mm["sr"] = [torch.tensor(self._sample_rate, dtype=torch.int32) for _ in order] + self._audio_queue.clear() - if patch is not None: - self._last_audio_patch = None - self._accumulated_patches.append(patch.clone()) - - # Decode all accumulated patches → full audio waveform. - # TODO: implement sliding-window VAE decode (nanovllm pattern) - # for O(1) per-step streaming instead of O(N) re-decode. - if self._accumulated_patches: - all_p = torch.cat(self._accumulated_patches, dim=0) - d = self._feat_dim - from einops import rearrange - - feat = rearrange(all_p.float().reshape(1, -1, d), "b t d -> b d t") - with torch.no_grad(): - audio = self.tts.audio_vae.decode(feat.to(self._device)).reshape(-1).detach().cpu().float() - - mm["model_outputs"] = [audio] - mm["sr"] = [torch.tensor(48000, dtype=torch.int32)] - - return OmniOutput( - text_hidden_states=hidden, - multimodal_outputs=mm, - ) + return OmniOutput(text_hidden_states=model_outputs, multimodal_outputs=mm) # -------------------- preprocess / postprocess -------------------- def preprocess( - self, - input_ids: torch.Tensor, - input_embeds: torch.Tensor | None, - **info_dict: Any, + self, input_ids: torch.Tensor, input_embeds: torch.Tensor | None, **info_dict: Any ) -> tuple[torch.Tensor, torch.Tensor, dict[str, Any]]: - additional_information = info_dict.get("additional_information") - if isinstance(additional_information, dict): + additional = info_dict.get("additional_information") + if isinstance(additional, dict): merged = {k: v for k, v in info_dict.items() if k != "additional_information"} - for k, v in additional_information.items(): + for k, v in additional.items(): merged.setdefault(k, v) info_dict = merged span_len = int(input_ids.shape[0]) dev = input_ids.device - - if span_len > 1: - # ---- Prefill ---- - # Decode the text from input_ids for native-matching tokenization. - # Speech API tokenizes with BOS; we use the detokenized string so - # native's ``text_tokenizer`` produces the exact same tokens as - # ``generate()``. - ids = input_ids.tolist() - if ids and ids[0] == self.config.bos_token_id: - ids = ids[1:] - text = self.tts.text_tokenizer.tokenizer.decode(ids, skip_special_tokens=True) - self._prefill_text = text - - # Voice clone / continuation: build prompt cache from info_dict. 
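+        # Per-request AR state is keyed by request_id ("default" when the caller
+        # does not provide one).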
+ req_id = info_dict.get("request_id", "default") + is_prefill = span_len > 1 + + if is_prefill: + # Evict stale states + pending_ids = {rid for rid, *_ in self._pending_requests} + pending_ids.add(req_id) + if self._current_request_id: + pending_ids.add(self._current_request_id) + for rid in [r for r, s in self._active_states.items() if r not in pending_ids and s.prefill_completed]: + self._cleanup_request(rid) + + # VoxCPM2Tokenizer does char-level Chinese splitting, so use input_ids directly + token_ids = input_ids.tolist() + if token_ids and token_ids[0] == self.config.bos_token_id: + token_ids = token_ids[1:] + + state = self._get_or_create_state(req_id) + state.prefill_text = "" + state.accumulated_patches = [] + state.prefill_completed = False + state.decode_step_count = 0 + state.precomputed_stop_logits = None + state.last_audio_patch_gpu = None + state.curr_embed_for_next = None + state.prev_feat_embed = None + state.curr_prefix_feat_cond = None + state.is_stopping = False + state.last_decoded_audio = None + + # Voice clone / continuation ref_audio = info_dict.get("reference_audio") or info_dict.get("ref_audio") prompt_audio = info_dict.get("prompt_audio") prompt_text = info_dict.get("prompt_text") @@ -603,68 +852,111 @@ def preprocess( if isinstance(prompt_text, list): prompt_text = prompt_text[0] if prompt_text else None - self._prompt_cache = None + state.prompt_cache = None if ref_audio or (prompt_audio and prompt_text): try: - self._prompt_cache = self._build_prompt_cache( + state.prompt_cache = self._build_prompt_cache( ref_audio=ref_audio, prompt_audio=prompt_audio, prompt_text=prompt_text, ) except Exception as e: - logger.warning("build_prompt_cache failed: %s; falling back to zero-shot", e) - self._prompt_cache = None - - # Reset per-request state (fresh generation) - self._accumulated_patches = [] - if hasattr(self, "_prev_feat_embed"): - del self._prev_feat_embed - if hasattr(self, "_curr_embed_for_next"): - del self._curr_embed_for_next - - # Store info for forward - self._current_step_infos = [{"is_prefill": True}] - - # The scaffold model still needs embeddings sized to span_len for - # its warmup/attention bookkeeping. Native modules use the full - # (potentially longer) sequence internally. Pass zeros — scaffold - # output is discarded. 
- embeds = torch.zeros( - span_len, - self.config.hidden_size, - device=dev, - dtype=self._side_dtype, - ) - - return input_ids, embeds, {} - - # ---- Decode ---- - curr_embed = getattr(self, "_curr_embed_for_next", None) - if curr_embed is not None: - inputs_embeds = curr_embed.to(dev, dtype=self._side_dtype).reshape(1, -1) + logger.warning("build_prompt_cache failed: %s", e) + + inputs = self._build_prefill_inputs(token_ids, dev, req_id) + tts = self.tts + feat_embed = tts.enc_to_lm_proj(tts.feat_encoder(inputs["audio_feat"])) + text_embed = self.model.embed_input_ids(inputs["text_token"].to(dev)) + text_mask, feat_mask = inputs["text_mask"], inputs["audio_mask"] + embeds = (text_mask.unsqueeze(-1) * text_embed + feat_mask.unsqueeze(-1) * feat_embed).squeeze(0) + state.prefill_masks = (text_mask, feat_mask, inputs["audio_feat"], feat_embed) else: - inputs_embeds = torch.zeros( - 1, - self.config.hidden_size, - device=dev, - dtype=self._side_dtype, - ) + state = self._active_states.get(req_id) + curr = state.curr_embed_for_next if state else None + if curr is not None: + embeds = curr.to(dev, dtype=self._side_dtype).reshape(1, -1) + else: + embeds = torch.zeros(1, self.config.hidden_size, device=dev, dtype=self._side_dtype) - self._current_step_infos = [{}] - return input_ids, inputs_embeds, {} + self._pending_requests.append((req_id, is_prefill, embeds, span_len)) + return input_ids, embeds, {} def postprocess(self, hidden_states: torch.Tensor, **info: Any) -> dict[str, Any]: + req_id = info.get("request_id", self._current_request_id or "default") + if _ENABLE_PROFILING: + state = self._active_states.get(req_id) + if state and state.decode_step_count > 0: + logger.info( + "REQUEST DONE[%s]: %d steps, %.2fs\n%s", + req_id, + state.decode_step_count, + time.perf_counter() - state.request_start_time, + self._perf.breakdown(), + ) return {} - # -------------------- Weight loading -------------------- + # -------------------- build prefill inputs -------------------- + + def _build_prefill_inputs(self, token_ids: list[int], dev: Any, req_id: str = "default") -> dict: + tts = self.tts + dtype = self._side_dtype + state = self._active_states.get(req_id) + cache = state.prompt_cache if state else None + mode = cache.get("mode", "continuation") if cache else "zero_shot" + + if cache and mode in ("continuation", "ref_continuation"): + prompt_text = cache.get("prompt_text", "") + prompt_ids = list(tts.text_tokenizer(prompt_text)) if prompt_text else [] + all_ids = prompt_ids + token_ids + else: + all_ids = token_ids + + text_token = torch.tensor(all_ids, dtype=torch.int32) + text_token = torch.cat([text_token, torch.tensor([tts.audio_start_token], dtype=torch.int32)], dim=-1) + text_len = text_token.shape[0] + latent_dim = tts.audio_vae.latent_dim + ps = self._patch_size + + if mode in ("zero_shot", "continuation"): + audio_feat = cache["audio_feat"] if cache else torch.empty((0, ps, latent_dim), dtype=torch.float32) + a_len = audio_feat.size(0) + text_token = torch.cat([text_token, torch.zeros(a_len, dtype=torch.int32)]) + audio_feat = torch.cat([torch.zeros((text_len, ps, latent_dim), dtype=torch.float32), audio_feat]) + text_mask = torch.cat([torch.ones(text_len, dtype=torch.int32), torch.zeros(a_len, dtype=torch.int32)]) + audio_mask = torch.cat([torch.zeros(text_len, dtype=torch.int32), torch.ones(a_len, dtype=torch.int32)]) + elif mode == "reference": + ref = cache["ref_audio_feat"] + rt, rf, rtm, ram = tts._make_ref_prefix(ref, text_token.device) + text_token = torch.cat([rt.cpu(), 
text_token]) + audio_feat = torch.cat([rf.cpu(), torch.zeros((text_len, ps, latent_dim), dtype=torch.float32)]) + text_mask = torch.cat([rtm.cpu(), torch.ones(text_len, dtype=torch.int32)]) + audio_mask = torch.cat([ram.cpu(), torch.zeros(text_len, dtype=torch.int32)]) + else: # ref_continuation + ref = cache["ref_audio_feat"] + prompt = cache["audio_feat"] + p_len = prompt.size(0) + rt, rf, rtm, ram = tts._make_ref_prefix(ref, text_token.device) + text_token = torch.cat([rt.cpu(), text_token, torch.zeros(p_len, dtype=torch.int32)]) + audio_feat = torch.cat([rf.cpu(), torch.zeros((text_len, ps, latent_dim), dtype=torch.float32), prompt]) + ones_t = torch.ones(text_len, dtype=torch.int32) + zeros_p = torch.zeros(p_len, dtype=torch.int32) + zeros_t = torch.zeros(text_len, dtype=torch.int32) + ones_p = torch.ones(p_len, dtype=torch.int32) + text_mask = torch.cat([rtm.cpu(), ones_t, zeros_p]) + audio_mask = torch.cat([ram.cpu(), zeros_t, ones_p]) + + return { + "text_token": text_token.unsqueeze(0).to(dev), + "audio_feat": audio_feat.unsqueeze(0).to(dev).to(dtype), + "text_mask": text_mask.unsqueeze(0).to(dev), + "audio_mask": audio_mask.unsqueeze(0).to(dev), + } + + # -------------------- weight loading -------------------- - # Weight mapping for vllm scaffold hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"base_lm.": "model."}) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - """Load scaffold weights via vllm + native model for computation.""" - - # Filter: only pass base_lm weights to the scaffold def _base_lm_only(ws): for name, tensor in ws: if name.startswith("base_lm."): @@ -673,21 +965,27 @@ def _base_lm_only(ws): loader = AutoWeightsLoader(self) loaded = loader.load_weights(_base_lm_only(weights), mapper=self.hf_to_vllm_mapper) - # Load the full native model for actual computation model_path = self.vllm_config.model_config.model VoxCPM = import_voxcpm2_core() native = VoxCPM.from_pretrained(model_path, load_denoiser=False, optimize=False) self._tts = native.tts_model.to("cuda") self._side_dtype = self._tts.fusion_concat_proj.weight.dtype self._device = "cuda" - self._patch_size = self._tts.patch_size self._feat_dim = self._tts.feat_dim + n = self.residual_model.load_weights_from_native(self._tts.residual_lm) + for name, _ in self.residual_model.named_parameters(): + loaded.add(f"residual_model.{name}") + logger.info("VoxCPM2: loaded %d params into paged residual_model", n) + + del self._tts.base_lm + self._tts.base_lm = None + del self._tts.residual_lm + self._tts.residual_lm = None + torch.cuda.empty_cache() + logger.info( - "Loaded native VoxCPM2 (patch_size=%d, feat_dim=%d, dtype=%s)", - self._patch_size, - self._feat_dim, - self._side_dtype, + "Loaded VoxCPM2 (patch=%d, feat_dim=%d, dtype=%s)", self._patch_size, self._feat_dim, self._side_dtype ) return loaded diff --git a/vllm_omni/model_executor/stage_configs/voxcpm2.yaml b/vllm_omni/model_executor/stage_configs/voxcpm2.yaml index de15c88de4..7cc93d6b26 100644 --- a/vllm_omni/model_executor/stage_configs/voxcpm2.yaml +++ b/vllm_omni/model_executor/stage_configs/voxcpm2.yaml @@ -1,13 +1,13 @@ -# VoxCPM2 native AR single-stage pipeline. -# Uses native MiniCPM4 base_lm + native VAE decode in one stage. -# All computation (base_lm, residual_lm, diffusion, VAE) in forward(). +# VoxCPM2 AR pipeline with per-request state batching. +# Uses native MiniCPM4 base_lm + per-request StaticKVCache. +# max_batch_size > 1 supported via KV cache save/restore. 
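
# A toy, self-contained sketch (not the model's implementation; the build_masks
# helper exists only for illustration) of the prefill layout constructed by
# _build_prefill_inputs above in the zero-shot / continuation case: text
# positions come first, prompt-audio frames follow, and two complementary 0/1
# masks select which embedding stream feeds each position.
import torch

def build_masks(text_len: int, audio_len: int) -> tuple[torch.Tensor, torch.Tensor]:
    text_mask = torch.cat([torch.ones(text_len, dtype=torch.int32),
                           torch.zeros(audio_len, dtype=torch.int32)])
    audio_mask = torch.cat([torch.zeros(text_len, dtype=torch.int32),
                            torch.ones(audio_len, dtype=torch.int32)])
    return text_mask, audio_mask

text_mask, audio_mask = build_masks(text_len=4, audio_len=2)
text_embed = torch.randn(6, 8)   # one embedding per position, text stream
feat_embed = torch.randn(6, 8)   # one embedding per position, audio-feature stream
fused = text_mask.unsqueeze(-1) * text_embed + audio_mask.unsqueeze(-1) * feat_embed
print(fused.shape)  # torch.Size([6, 8])
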
stage_args: - stage_id: 0 stage_type: llm is_comprehension: true runtime: devices: "0" - max_batch_size: 1 + max_batch_size: 4 engine_args: dtype: bfloat16 model_stage: latent_generator diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 868140d265..4f3f843e65 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -262,6 +262,10 @@ def execute_model( # Update persistent batch states. deferred_state_corrections_fn = self._update_states(scheduler_output) + # Notify model of finished requests for state cleanup + if scheduler_output.finished_req_ids and hasattr(self.model, "on_requests_finished"): + self.model.on_requests_finished(scheduler_output.finished_req_ids) + if has_ec_transfer() and not get_ec_transfer().is_consumer: with self.maybe_get_ec_connector_output( scheduler_output, @@ -793,11 +797,14 @@ def propose_draft_token_ids(sampled_token_ids): elif isinstance(v, dict): mm_payload[k] = {sk: sv[start:end].contiguous() for sk, sv in v.items()} elif isinstance(v, list): - element = v[idx] if idx < len(v) else v[0] - # Clone tensors to avoid cross-request aliasing - if isinstance(element, torch.Tensor): - element = element.clone() - mm_payload[k] = element + if idx < len(v): + element = v[idx] + if element is not None: + if isinstance(element, torch.Tensor): + element = element.clone() + mm_payload[k] = element + # Skip None elements: msgspec cannot serialize None + # in dict[str, torch.Tensor] typed fields. elif isinstance(v, torch.Tensor): # List-derived tensor payloads are request-invariant; clone to # avoid accidental cross-request aliasing on downstream mutation. diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 1f678b579f..5ff62c11b4 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -1241,6 +1241,7 @@ def _preprocess( span_len = int(e) - int(s) # call the custom process function + req_infos["request_id"] = req_id embed_slice = inputs_embeds[s:e] if inputs_embeds is not None else None req_input_ids, req_embeds, update_dict = self.model.preprocess( input_ids=input_ids[s:e], input_embeds=embed_slice, **req_infos From dd1389173b4e2893d21cf742979c89ab0255a5d5 Mon Sep 17 00:00:00 2001 From: Chen-Yo Sun Date: Mon, 13 Apr 2026 15:37:45 -0700 Subject: [PATCH 154/204] [Voxtral TTS] Fix Voxtral TTS input with text and ref_audio (#2750) Signed-off-by: Chen-Yo Sun --- .../voxtral_tts_audio_generation.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py b/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py index 4041a53e55..cd67e4f074 100644 --- a/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py +++ b/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py @@ -864,6 +864,29 @@ def get_replacement(item_idx: int): ), ] + def _apply_hf_processor_mm_only( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + tokenization_kwargs: Mapping[str, object], + ) -> BatchFeature: + """ + Apply the HF processor on the multi-modal data only. + + Issue: Voxtral TTS use Mistral Tokenizer with custom audio encoder. It doesn't + inherit Transformers ProcessorMixin and can't use call_hf_processor_mm_only. + + Solution: Override this method to call _apply_hf_processor_text_mm directly. 
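        In practice (see the body below) this pairs the multi-modal items with
        dummy text from dummy_inputs, runs the joint text + multi-modal pass,
        and returns only the multi-modal features.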
+ """ + mm_counts = mm_items.get_all_counts() + _, mm_processed_data, _ = self._apply_hf_processor_text_mm( + prompt_text=self.dummy_inputs.get_dummy_text(mm_counts), + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + tokenization_kwargs=tokenization_kwargs, + ) + return mm_processed_data + def _cached_apply_hf_processor( self, inputs: ProcessorInputs, From 8d23549b29ca408b4c5176bb85a87bfd4dff0b83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zeyu=20Huang=20=7C=20=E9=BB=83=E6=BE=A4=E5=AE=87?= <11222265+fhfuih@users.noreply.github.com> Date: Tue, 14 Apr 2026 11:35:56 +0800 Subject: [PATCH 155/204] [CI] Qwen image edit performance benckmark (#2216) Signed-off-by: Huang, Zeyu <11222265+fhfuih@users.noreply.github.com> --- .buildkite/test-nightly-diffusion.yml | 19 +- .../diffusion/diffusion_benchmark_serving.py | 28 ++- .../perf/scripts/run_diffusion_benchmark.py | 170 ++++++++++++++++-- .../test_qwen_image_edit_2509_vllm_omni.json | 167 +++++++++++++++++ .../tests/test_qwen_image_edit_vllm_omni.json | 161 +++++++++++++++++ .../perf/tests/test_qwen_image_vllm_omni.json | 2 - tools/nightly/generate_nightly_perf_excel.py | 120 +++++++++---- 7 files changed, 608 insertions(+), 59 deletions(-) create mode 100644 tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json create mode 100644 tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json diff --git a/.buildkite/test-nightly-diffusion.yml b/.buildkite/test-nightly-diffusion.yml index 04b99c0a83..a520ca4356 100644 --- a/.buildkite/test-nightly-diffusion.yml +++ b/.buildkite/test-nightly-diffusion.yml @@ -325,10 +325,23 @@ steps: if: *nightly_or_pr_label commands: - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results + - export DIFFUSION_ATTENTION_BACKEND=FLASH_ATTN - export CACHE_DIT_VERSION=1.3.0 - - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json - - buildkite-agent artifact upload "tests/dfx/perf/results/benchmark_results_*.json" - - buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" + # [HACK]: run upload in the same command block as pytest. + # Because `exit` aborts the entire commands list. + - | + set +e + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json + EXIT1=$$? + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json + EXIT2=$$? + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json + EXIT3=$$? 
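        # Upload artifacts when at least one config produced results; the
        # step's final status is the bitwise OR of the three pytest exit
        # codes, so it still fails if any config failed.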
+ if [ $$EXIT1 -eq 0 ] || [ $$EXIT2 -eq 0 ] || [ $$EXIT3 -eq 0 ]; then + buildkite-agent artifact upload "tests/dfx/perf/results/diffusion_result_*.json" + buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" + fi + exit $$((EXIT1 | EXIT2 | EXIT3)) agents: queue: "mithril-h100-pool" plugins: diff --git a/benchmarks/diffusion/diffusion_benchmark_serving.py b/benchmarks/diffusion/diffusion_benchmark_serving.py index aad955b0d1..32ec48a698 100644 --- a/benchmarks/diffusion/diffusion_benchmark_serving.py +++ b/benchmarks/diffusion/diffusion_benchmark_serving.py @@ -558,6 +558,7 @@ def __init__(self, args, api_url: str, model: str, enable_negative_prompt: bool super().__init__(args, api_url, model) self.num_prompts = args.num_prompts self.enable_negative_prompt = enable_negative_prompt + self.num_input_images = max(1, args.num_input_images) self.random_request_config = getattr(args, "random_request_config", None) if self.random_request_config: self.random_request_config = json.loads(self.random_request_config) @@ -580,11 +581,7 @@ def __init__(self, args, api_url: str, model: str, enable_negative_prompt: bool # Random image generate if self.args.task in ["i2v", "ti2v", "ti2i", "i2i"]: - img = Image.new("RGB", (512, 512), (255, 255, 255)) - - image_path = os.path.join(tempfile.gettempdir(), "diffusion_benchmark_random_image.png") - self._random_image_path = [image_path] - img.save(image_path) + self._random_image_path = self._generate_random_image_paths() else: self._random_image_path = None @@ -619,6 +616,18 @@ def __getitem__(self, idx: int) -> RequestFuncInput: def get_requests(self) -> list[RequestFuncInput]: return [self[i] for i in range(len(self))] + def _generate_random_image_paths(self) -> list[str]: + image_paths: list[str] = [] + for image_idx in range(self.num_input_images): + img = Image.new("RGB", (512, 512), (255, 255, 255)) + image_path = os.path.join( + tempfile.gettempdir(), + f"diffusion_benchmark_random_image_{image_idx}.png", + ) + img.save(image_path) + image_paths.append(image_path) + return image_paths + def _compute_expected_latency_ms_from_base(req: RequestFuncInput, args, base_time_ms: float | None) -> float | None: """Compute expected execution time (ms) based on a base per-step-per-frame unit time. @@ -1115,6 +1124,15 @@ async def limited_request_func(req, session, pbar): '{"width":768,"height":768,"num_inference_steps":20,"weight":0.85}]' ), ) + parser.add_argument( + "--num-input-images", + type=int, + default=1, + help=( + "Number of synthetic input images to attach for image-conditioned tasks " + "(i2v, ti2v, ti2i, i2i) when using random dataset." + ), + ) args = parser.parse_args() diff --git a/tests/dfx/perf/scripts/run_diffusion_benchmark.py b/tests/dfx/perf/scripts/run_diffusion_benchmark.py index 1bd9bf1a14..123f21405e 100644 --- a/tests/dfx/perf/scripts/run_diffusion_benchmark.py +++ b/tests/dfx/perf/scripts/run_diffusion_benchmark.py @@ -27,13 +27,14 @@ import time from datetime import datetime from pathlib import Path -from typing import Any +from typing import Any, cast import psutil import pytest os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" +os.environ.setdefault("DIFFUSION_ATTENTION_BACKEND", "FLASH_ATTN") # --------------------------------------------------------------------------- # Paths @@ -50,6 +51,7 @@ # Populated lazily after CONFIG_FILE_PATH is resolved. 
_SESSION_TIMESTAMP = datetime.now().strftime("%Y%m%d-%H%M%S") _RESULT_LOCK = threading.Lock() +_BRANCHPOINT_COMMIT_SHA: str | None = None def _get_config_file_from_argv() -> str | None: @@ -110,7 +112,7 @@ def load_configs(config_path: str) -> list[dict[str, Any]]: BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) _config_stem = Path(CONFIG_FILE_PATH).stem # e.g. "test_qwen_image_vllm_omni" -AGGREGATED_RESULT_FILE = BENCHMARK_RESULT_DIR / f"benchmark_results_{_config_stem}_{_SESSION_TIMESTAMP}.json" +AGGREGATED_RESULT_FILE = BENCHMARK_RESULT_DIR / f"diffusion_result_{_config_stem}_{_SESSION_TIMESTAMP}.json" def _append_to_aggregated_file(record: dict[str, Any]) -> None: @@ -232,13 +234,13 @@ class DiffusionServer: def __init__( self, - model: str, - serve_args: list[str], + server_cfg: dict[str, Any], *, port: int | None = None, ) -> None: - self.model = model - self.serve_args = serve_args + self.server_cfg: dict[str, Any] = server_cfg + self.model = server_cfg["model"] + self.serve_args = server_cfg["serve_args"] self.host = "127.0.0.1" self.port = port if port is not None else _get_open_port() self.proc: subprocess.Popen | None = None @@ -299,6 +301,95 @@ def _build_serve_args(serve_args_dict: dict[str, Any]) -> list[str]: return args +def _get_branchpoint_commit_sha() -> str: + """Return the branch-point commit SHA against main. + + Uses git command: ``git merge-base HEAD origin/main``. + """ + global _BRANCHPOINT_COMMIT_SHA + if _BRANCHPOINT_COMMIT_SHA is not None: + return _BRANCHPOINT_COMMIT_SHA + + repo_root = Path(__file__).parent.parent.parent.parent + try: + sha = ( + subprocess.check_output( + ["git", "merge-base", "HEAD", "origin/main"], + cwd=str(repo_root), + stderr=subprocess.STDOUT, + text=True, + ) + .strip() + .splitlines()[0] + ) + _BRANCHPOINT_COMMIT_SHA = sha + except Exception as e: + print(f"Warning: failed to get branch-point commit SHA: {e}") + _BRANCHPOINT_COMMIT_SHA = "" + return _BRANCHPOINT_COMMIT_SHA + + +def _to_resolution_string(params: dict[str, Any]) -> str: + width = params.get("width", "unknown width") + height = params.get("height", "unknown height") + return f"{width}x{height}" + + +def _to_parallelism_string(framework: str, serve_args_dict: dict[str, Any]) -> str: + parts: list[str] = [] + if framework == "vllm-omni": + keys = [ + "num-gpus", + "usp", + "ulysses-degree", + "ring", + "ring-degree", + "cfg-parallel-size", + "vae-patch-parallel-size", + "vae-use-tiling", + "tensor-parallel-size", + ] + for key in keys: + if key in serve_args_dict: + parts.append(f"{key}={serve_args_dict[key]}") + return ",".join(parts) if parts else "none" + + +def _to_cache_string(framework: str, serve_args_dict: dict[str, Any]) -> str: + if framework == "vllm-omni": + if "cache-backend" in serve_args_dict: + return str(serve_args_dict["cache-backend"]) + return "disabled" + + +def _to_offload_string(framework: str, serve_args_dict: dict[str, Any]) -> str: + selected: list[str] = [] + if framework == "vllm-omni": + offload_keys = [ + "enable-cpu-offload", + "enable-layerwise-offload", + ] + for key in offload_keys: + if key in serve_args_dict: + selected.append(key) + return f"enabled({';'.join(selected)})" if selected else "disabled" + + +def _to_compile_value(framework: str, serve_args_dict: dict[str, Any]) -> str: + if framework == "vllm-omni": + if "enforce-eager" in serve_args_dict: + return "disabled" + return "enabled" + return "disabled" + + +def _to_quantization_value(framework: str, serve_args_dict: dict[str, Any]) -> str: + if framework == "vllm-omni": + 
quant = serve_args_dict.get("quantization") + return str(quant) if quant else "disabled" + return "disabled" + + def _unique_server_params(configs: list[dict[str, Any]]) -> list[dict[str, Any]]: """Return one server-config dict per unique test_name.""" seen: set[str] = set() @@ -310,12 +401,14 @@ def _unique_server_params(configs: list[dict[str, Any]]) -> list[dict[str, Any]] seen.add(test_name) if cfg.get("server_type", "vllm-omni") != "vllm-omni": raise ValueError(f"Unsupported server_type in config: {cfg.get('server_type')}") + serve_args_dict = cfg["server_params"].get("serve_args", {}) result.append( { "test_name": test_name, "server_type": "vllm-omni", "model": cfg["server_params"]["model"], - "serve_args": _build_serve_args(cfg["server_params"].get("serve_args", {})), + "serve_args_dict": serve_args_dict, + "serve_args": _build_serve_args(serve_args_dict), "benchmark_backend": "vllm-omni", "server_params": cfg["server_params"], } @@ -334,9 +427,7 @@ def _test_param_mapping(configs: list[dict[str, Any]]) -> dict[str, list[dict]]: def _make_server(server_cfg: dict[str, Any]) -> DiffusionServer: """Factory: return a vLLM-Omni diffusion server instance for the config.""" - model = server_cfg["model"] - serve_args = server_cfg["serve_args"] - return DiffusionServer(model=model, serve_args=serve_args) + return DiffusionServer(server_cfg=server_cfg) # --------------------------------------------------------------------------- @@ -364,7 +455,6 @@ def diffusion_server(request): print(f"\nStarting {server_type} server for test: {test_name}") with _make_server(server_cfg) as server: server.test_name = test_name - server.server_params = server_cfg["server_params"] print(f"{server_type} server started successfully") yield server print(f"{server_type} server stopping…") @@ -402,16 +492,18 @@ def run_benchmark( params: dict[str, Any], test_name: str, backend: str = "vllm-omni", - server_params: dict[str, Any] | None = None, + server_cfg: dict[str, Any] | None = None, + source_file: str = "", ) -> dict[str, Any]: """Run diffusion_benchmark_serving.py as a subprocess and return parsed metrics. The raw metrics are written to a temporary file by the subprocess. After the run completes the metrics are merged with full metadata (test_name, - backend, benchmark_params, timestamp) and appended to the session-wide - aggregated JSON file (AGGREGATED_RESULT_FILE). The temporary file is - removed afterwards. Subprocess stdout/stderr are tee'd to a .log file - under BENCHMARK_RESULT_DIR/logs/; its path is stored in the record. + backend, benchmark_params, timestamp, flat reporting fields) and appended + to the session-wide aggregated JSON file (AGGREGATED_RESULT_FILE). The + temporary file is removed afterwards. Subprocess stdout/stderr are tee'd + to a .log file under BENCHMARK_RESULT_DIR/logs/; its path is stored in + the record. 
""" timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") @@ -495,14 +587,55 @@ def run_benchmark( finally: tmp_result_file.unlink(missing_ok=True) + server_cfg = server_cfg or {} + serve_args_dict = server_cfg.get("serve_args_dict", {}) + if not isinstance(serve_args_dict, dict): + serve_args_dict = {} + + completed = metrics.get("completed_requests", metrics.get("completed", 0)) + failed = metrics.get("failed_requests", metrics.get("failed", 0)) + record: dict[str, Any] = { "test_name": test_name, "backend": backend, "timestamp": timestamp, - "server_params": server_params, + "server_params": server_cfg.get("server_params"), "benchmark_params": params, "result": metrics, "log_file": str(log_file), + "Model": model, + "Framework": backend, + "Hardware": "", + "Deployment": "", + "Task": params.get("task", "t2i"), + "Dataset": params.get("dataset", "random"), + "resolution": _to_resolution_string(params), + "Parallelism": _to_parallelism_string(backend, serve_args_dict), + "max_concurrency": params.get("max-concurrency", ""), + "Cache": _to_cache_string(backend, serve_args_dict), + "Quantization": _to_quantization_value(backend, serve_args_dict), + "offload": _to_offload_string(backend, serve_args_dict), + "compile": _to_compile_value(backend, serve_args_dict), + "Attn_backend": os.environ.get("DIFFUSION_ATTENTION_BACKEND", ""), + "num_inference_steps": params.get("num-inference-steps", ""), + "completed": completed, + "failed": failed, + "throughput_qps": metrics.get("throughput_qps"), + "latency_mean": metrics.get("latency_mean"), + "latency_median": metrics.get("latency_median"), + "latency_p99": metrics.get("latency_p99"), + "latency_p95": metrics.get("latency_p95"), + "latency_p50": metrics.get("latency_p50"), + "peak_memory_mb_max": metrics.get("peak_memory_mb_max"), + "peak_memory_mb_mean": metrics.get("peak_memory_mb_mean"), + "peak_memory_mb_median": metrics.get("peak_memory_mb_median"), + "stage_durations_mean": metrics.get("stage_durations_mean"), + "stage_durations_p50": metrics.get("stage_durations_p50"), + "stage_durations_p99": metrics.get("stage_durations_p99"), + "commit_sha": _get_branchpoint_commit_sha(), + "build_id": os.environ.get("BUILDKITE_BUILD_ID", ""), + "build_url": os.environ.get("BUILDKITE_BUILD_URL", ""), + "source_file": source_file, } _append_to_aggregated_file(record) print(f"\n Result appended to: {AGGREGATED_RESULT_FILE}") @@ -565,7 +698,8 @@ def test_diffusion_performance_benchmark(diffusion_server, benchmark_params): params=params, test_name=test_name, backend=backend, - server_params=diffusion_server.server_params, + server_cfg=getattr(diffusion_server, "server_cfg", {}), + source_file=cast(str, CONFIG_FILE_PATH), ) print(f"\n{'=' * 60}") diff --git a/tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json new file mode 100644 index 0000000000..7d1fbbfa70 --- /dev/null +++ b/tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json @@ -0,0 +1,167 @@ +[ + { + "test_name": "test_qwen_image_edit_2509_single_device", + "description": "Single-device baseline (two input images)", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Edit-2509", + "serve_args": { + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "512x512_steps20_i2i_2img", + "dataset": "random", + "task": "i2i", + "width": 512, + "height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "num-input-images": 2, + 
"enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.05, + "latency_mean": 18, + "peak_memory_mb_max": 78500, + "peak_memory_mb_mean": 78500 + } + }, + { + "name": "1536x1536_steps35_i2i_2img", + "dataset": "random", + "task": "i2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "num-input-images": 2, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.01, + "latency_mean": 70, + "peak_memory_mb_max": 81000, + "peak_memory_mb_mean": 81000 + } + } + ] + }, + { + "test_name": "test_qwen_image_edit_2509_ulysses2_cfg2_vae_patch4", + "description": "Ulysses SP=2 + CFG=2 + VAE patch parallel=4", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Edit-2509", + "serve_args": { + "ulysses-degree": 2, + "cfg-parallel-size": 2, + "vae-patch-parallel-size": 4, + "vae-use-tiling": true, + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "512x512_steps20_i2i_2img", + "dataset": "random", + "task": "i2i", + "width": 512, + "height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "num-input-images": 2, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.1, + "latency_mean": 12, + "peak_memory_mb_max": 69000, + "peak_memory_mb_mean": 69000 + } + }, + { + "name": "1536x1536_steps35_i2i_2img", + "dataset": "random", + "task": "i2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "num-input-images": 2, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.03, + "latency_mean": 28, + "peak_memory_mb_max": 69000, + "peak_memory_mb_mean": 69000 + } + } + ] + }, + { + "test_name": "test_qwen_image_edit_2509_ulysses2_cfg2_cache_dit", + "description": "Ulysses SP=2 + CFG=2 + CacheDiT", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Edit-2509", + "serve_args": { + "ulysses-degree": 2, + "cfg-parallel-size": 2, + "cache-backend": "cache_dit", + "cache-config": { + "Fn_compute_blocks": 1, + "Bn_compute_blocks": 0, + "max_warmup_steps": 4, + "residual_diff_threshold": 0.24, + "max_continuous_cached_steps": 3, + "enable_taylorseer": false, + "taylorseer_order": 1, + "scm_steps_mask_policy": null, + "scm_steps_policy": "dynamic" + }, + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "512x512_steps20_i2i_2img", + "dataset": "random", + "task": "i2i", + "width": 512, + "height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "num-input-images": 2, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.10, + "latency_mean": 12, + "peak_memory_mb_max": 73000, + "peak_memory_mb_mean": 73000 + } + }, + { + "name": "1536x1536_steps35_i2i_2img", + "dataset": "random", + "task": "i2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "num-input-images": 2, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.05, + "latency_mean": 20, + "peak_memory_mb_max": 81000, + "peak_memory_mb_mean": 81000 + } + } + ] + } +] diff --git a/tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json new file mode 100644 index 0000000000..f68201db5f --- /dev/null +++ b/tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json @@ -0,0 +1,161 @@ +[ + { + "test_name": 
"test_qwen_image_edit_single_device", + "description": "Single-device baseline", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Edit", + "serve_args": { + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "512x512_steps20_i2i", + "dataset": "random", + "task": "i2i", + "width": 512, + "height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.05, + "latency_mean": 15.0, + "peak_memory_mb_max": 72500, + "peak_memory_mb_mean": 72500 + } + }, + { + "name": "1536x1536_steps35_i2i", + "dataset": "random", + "task": "i2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.01, + "latency_mean": 65.6, + "peak_memory_mb_max": 80777, + "peak_memory_mb_mean": 80777 + } + } + ] + }, + { + "test_name": "test_qwen_image_edit_ulysses2_cfg2_vae_patch4", + "description": "Ulysses SP=2 + CFG=2 + VAE patch parallel=4", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Edit", + "serve_args": { + "ulysses-degree": 2, + "cfg-parallel-size": 2, + "vae-patch-parallel-size": 4, + "vae-use-tiling": true, + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "512x512_steps20_i2i", + "dataset": "random", + "task": "i2i", + "width": 512, + "height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.10, + "latency_mean": 7.2, + "peak_memory_mb_max": 68100, + "peak_memory_mb_mean": 68100 + } + }, + { + "name": "1536x1536_steps35_i2i", + "dataset": "random", + "task": "i2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.03, + "latency_mean": 24.0, + "peak_memory_mb_max": 68100, + "peak_memory_mb_mean": 68100 + } + } + ] + }, + { + "test_name": "test_qwen_image_edit_ulysses2_cfg2_cache_dit", + "description": "Ulysses SP=2 + CFG=2 + CacheDiT", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Edit", + "serve_args": { + "ulysses-degree": 2, + "cfg-parallel-size": 2, + "cache-backend": "cache_dit", + "cache-config": { + "Fn_compute_blocks": 1, + "Bn_compute_blocks": 0, + "max_warmup_steps": 4, + "residual_diff_threshold": 0.24, + "max_continuous_cached_steps": 3, + "enable_taylorseer": false, + "taylorseer_order": 1, + "scm_steps_mask_policy": null, + "scm_steps_policy": "dynamic" + }, + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "512x512_steps20_i2i", + "dataset": "random", + "task": "i2i", + "width": 512, + "height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.1, + "latency_mean": 6.5, + "peak_memory_mb_max": 72600, + "peak_memory_mb_mean": 72600 + } + }, + { + "name": "1536x1536_steps35_i2i", + "dataset": "random", + "task": "i2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.05, + "latency_mean": 16.0, + "peak_memory_mb_max": 81000, + "peak_memory_mb_mean": 81000 + } + } + ] + } +] diff --git 
a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index 97c1bbfb3c..1f3a2bbf77 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -44,7 +44,6 @@ } ] }, - { "test_name": "test_qwen_image_ulysses2_cfg2_vae_patch4", "description": "Ulysses SP=2 + CFG-parallel=2 + VAE Patch Parallel=4", @@ -94,7 +93,6 @@ } ] }, - { "test_name": "test_qwen_image_ulysses2_cfg2_cache_dit", "description": "Ulysses SP=2 + CFG-parallel=2 + CacheDiT acceleration", diff --git a/tools/nightly/generate_nightly_perf_excel.py b/tools/nightly/generate_nightly_perf_excel.py index 817f37f664..5f9eb428bc 100644 --- a/tools/nightly/generate_nightly_perf_excel.py +++ b/tools/nightly/generate_nightly_perf_excel.py @@ -23,6 +23,22 @@ GREY_BLOCK_FILL = PatternFill(start_color="D3D3D3", fill_type="solid") # Diffusion sheet columns (Qwen-Image diffusion benchmark). +# Per-stage latency metrics. Unpack from stage_durations_mean/p50/p99 dicts +DIFFUSION_STAGE_LATENCY_COLUMNS: tuple[str, ...] = ( + # "vae.encode_mean", + # "vae.encode_p50", + # "vae.encode_p99", + "vae.decode_mean", + "vae.decode_p50", + "vae.decode_p99", + "diffuse_mean", + "diffuse_p50", + "diffuse_p99", + "text_encoder.forward_mean", + "text_encoder.forward_p50", + "text_encoder.forward_p99", +) + DIFFUSION_BENCHMARK_COLUMNS: tuple[str, ...] = ( "duration", "completed_requests", @@ -36,7 +52,7 @@ "peak_memory_mb_mean", "peak_memory_mb_median", "slo_attainment_rate", -) +) + DIFFUSION_STAGE_LATENCY_COLUMNS DIFFUSION_NUMERIC_FORMAT_COLUMNS: tuple[str, ...] = DIFFUSION_BENCHMARK_COLUMNS @@ -63,7 +79,7 @@ "build_id", "build_url", "source_file", -) +) + DIFFUSION_STAGE_LATENCY_COLUMNS # Benchmark metric columns: grey the latest row's cell when value changed vs previous date. BENCHMARK_COLUMNS: tuple[str, ...] = ( @@ -106,7 +122,7 @@ _COLUMNS_FILENAME = "nightly_perf_summary_columns.txt" _RESULT_JSON_PREFIX = "result_test_" -_DIFFUSION_JSON_PREFIX = "diffusion_perf_" +_DIFFUSION_RESULT_PREFIX = "diffusion_result_" DEFAULT_INPUT_DIR = os.getenv("DEFAULT_INPUT_DIR") if os.getenv("DEFAULT_INPUT_DIR") else "tests" DEFAULT_OUTPUT_DIR = os.getenv("DEFAULT_OUTPUT_DIR") if os.getenv("DEFAULT_OUTPUT_DIR") else "tests" DEFAULT_DIFFUSION_INPUT_DIR = os.getenv("DIFFUSION_BENCHMARK_DIR") @@ -252,7 +268,7 @@ def parse_args() -> argparse.Namespace: type=str, default=None, help=( - "Directory containing diffusion_perf_*.json files; default is " + "Directory containing diffusion_result_*.json files; default is " "DIFFUSION_BENCHMARK_DIR, fallback to --input-dir." 
), ) @@ -286,7 +302,7 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def _load_json_file(path: str) -> dict[str, Any] | None: +def _load_json_file(path: str) -> dict[str, Any] | list[Any] | None: """Safely load a single JSON file; return None and log a warning on failure.""" try: with open(path, encoding="utf-8") as f: @@ -295,8 +311,8 @@ def _load_json_file(path: str) -> dict[str, Any] | None: LOGGER.warning("failed to load json '%s': %s", path, exc) return None - if not isinstance(data, dict): - LOGGER.warning("json root in '%s' is not an object, skip", path) + if not isinstance(data, (dict, list)): + LOGGER.warning("json root in '%s' is not a dict or list, skip", path) return None return data @@ -396,27 +412,29 @@ def _iter_omni_json_records(input_dir: str) -> Iterable[dict[str, Any]]: yield record -def _parse_diffusion_from_filename(filename: str) -> dict[str, Any]: - """Parse diffusion test_name/date from filename: diffusion_perf__.json""" +def _parse_diffusion_result_from_filename(filename: str) -> dict[str, Any]: + """Parse test_name/date from filename: diffusion_result__.json""" name, ext = os.path.splitext(filename) - if ext != ".json" or not name.startswith(_DIFFUSION_JSON_PREFIX): + if ext != ".json" or not name.startswith(_DIFFUSION_RESULT_PREFIX): return {} - core = name[len(_DIFFUSION_JSON_PREFIX) :] + core = name[len(_DIFFUSION_RESULT_PREFIX) :] parts = core.split("_") if len(parts) < 2: return {} timestamp = parts[-1] - test_name = "_".join(parts[:-1]) if parts[:-1] else "" parsed: dict[str, Any] = {} if len(timestamp) >= 15: parsed["date"] = timestamp - if test_name: - parsed["test_name"] = test_name return parsed -def _iter_diffusion_json_records(input_dir: str) -> Iterable[dict[str, Any]]: - """Iterate over diffusion_perf_*.json files and yield normalized diffusion records.""" +def _iter_diffusion_records(input_dir: str) -> Iterable[dict[str, Any]]: + """Iterate over diffusion_result_*.json files and yield normalized records. + + Unlike omni format where each JSON file contains one test case, diffusion format + produces a single JSON file containing a list of all test case records. + Test params (feature toggles) are NOT embedded in the filename. 
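    Example (illustrative only): a session file named
    diffusion_result_test_qwen_image_vllm_omni_20260414-093000.json holds a
    JSON list such as [{"test_name": ..., "result": {...}}, ...]; each list
    element is yielded as one record, with the date falling back to the
    timestamp in the filename when the record carries none.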
+ """ if not os.path.isdir(input_dir): LOGGER.warning("diffusion input dir '%s' does not exist or is not a directory", input_dir) return @@ -424,7 +442,7 @@ def _iter_diffusion_json_records(input_dir: str) -> Iterable[dict[str, Any]]: for entry in sorted(os.listdir(input_dir)): if not entry.endswith(".json"): continue - if not entry.startswith(_DIFFUSION_JSON_PREFIX): + if not entry.startswith(_DIFFUSION_RESULT_PREFIX): continue full_path = os.path.join(input_dir, entry) if not os.path.isfile(full_path): @@ -434,23 +452,63 @@ def _iter_diffusion_json_records(input_dir: str) -> Iterable[dict[str, Any]]: if data is None: continue - record: dict[str, Any] = dict(data) - filename_meta = _parse_diffusion_from_filename(os.path.basename(full_path)) - if "date" not in record or not record.get("date"): - record["date"] = filename_meta.get("date") or datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") - if "test_name" not in record or not record.get("test_name"): - if "test_name" in filename_meta: - record["test_name"] = filename_meta["test_name"] - record["source_file"] = os.path.basename(full_path) - yield record + filename_meta = _parse_diffusion_result_from_filename(os.path.basename(full_path)) + if not isinstance(data, list): + LOGGER.warning("diffusion result file '%s' root is not a list, skip", full_path) + continue -def _collect_records(input_dir: str) -> list[dict[str, Any]]: + for record in data: + if not isinstance(record, dict): + continue + record = dict(record) + if "date" not in record or not record.get("date"): + record["date"] = filename_meta.get("date") or datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") + record["source_file"] = os.path.basename(full_path) + yield record + + +def _collect_omni_records(input_dir: str) -> list[dict[str, Any]]: return list(_iter_omni_json_records(input_dir)) def _collect_diffusion_records(diffusion_input_dir: str) -> list[dict[str, Any]]: - return list(_iter_diffusion_json_records(diffusion_input_dir)) + """Collect diffusion records from diffusion_result_*.json files. + Their format is different from omni JSON files. 
+ """ + return [_process_diffusion_record(r) for r in _iter_diffusion_records(diffusion_input_dir)] + + +def _flatten_stage_durations(record: dict[str, Any]) -> dict[str, Any]: + """Flatten stage_durations dict into individual columns matching DIFFUSION_STAGE_LATENCY_COLUMNS.""" + result = dict(record) + + for prefix in ("stage_durations_mean", "stage_durations_p50", "stage_durations_p99"): + durations = result.pop(prefix, None) + if not isinstance(durations, dict): + continue + + suffix = prefix.replace("stage_durations_", "") # "mean", "p50", "p99" + + for stage_key, value in durations.items(): # e.g., "SomePipeline.vae.decode_mean": 100.0 + stage_key = stage_key.split(".", 1)[-1] # "decode_mean" + col_name = f"{stage_key}_{suffix}" + if col_name not in DIFFUSION_STAGE_LATENCY_COLUMNS: + print(f"skipping stage_key: {col_name}") + continue + result[col_name] = value + + return result + + +def _process_diffusion_record(record: dict[str, Any]) -> dict[str, Any]: + """Normalize a diffusion record by merging `result` and flattening stage metrics.""" + flat = record.copy() + flat.update(flat.pop("result", {})) + flat = _flatten_stage_durations(flat) + flat.pop("benchmark_params", None) + flat.pop("server_params", None) + return flat def _apply_build_metadata_to_latest_only( @@ -493,7 +551,7 @@ def _apply_build_metadata_to_latest_only( def _sort_records_for_summary(records: list[dict[str, Any]]) -> list[dict[str, Any]]: """Sort so that same test configuration is grouped, newest date first within each group.""" - by_date_desc = sorted(records, key=lambda r: (r.get("date") or ""), reverse=True) + by_date_desc = sorted(records, key=lambda r: r.get("date") or "", reverse=True) return sorted( by_date_desc, key=_omni_group_key, @@ -501,7 +559,7 @@ def _sort_records_for_summary(records: list[dict[str, Any]]) -> list[dict[str, A def _sort_diffusion_records_for_summary(records: list[dict[str, Any]]) -> list[dict[str, Any]]: - by_date_desc = sorted(records, key=lambda r: (r.get("date") or ""), reverse=True) + by_date_desc = sorted(records, key=lambda r: r.get("date") or "", reverse=True) return sorted(by_date_desc, key=_diffusion_group_key) @@ -678,7 +736,7 @@ def generate_excel_report( script_dir = os.path.dirname(os.path.abspath(__file__)) omni_summary_columns = _ensure_omni_summary_columns(_load_summary_columns(script_dir)) - omni_records = _collect_records(input_dir) + omni_records = _collect_omni_records(input_dir) diffusion_records = _collect_diffusion_records(diffusion_input_dir) if not omni_records: From a5b38b5d0d612d4be0b452dfd29c552f2dfa94a3 Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Tue, 14 Apr 2026 13:32:00 +0800 Subject: [PATCH 156/204] [BugFix] Remove stage_configs_path validation (#2741) Signed-off-by: amy-why-3459 --- tests/engine/test_arg_utils.py | 7 ------- vllm_omni/engine/arg_utils.py | 5 ----- 2 files changed, 12 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index a1fc18f845..35d55f1cc4 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -118,13 +118,6 @@ def test_qwen3_tts_codec_frame_rate_patching(): assert omni_config.codec_frame_rate_hz == 12.3 -def test_stage_configs_path_blocks_create_model_config(): - """create_model_config() should raise when stage_configs_path is set.""" - args = OmniEngineArgs(stage_configs_path="/some/path.yaml") - with pytest.raises(RuntimeError, match="stage_configs_path"): - args.create_model_config() - - def test_from_cli_args_picks_up_stage_configs_path(): 
"""from_cli_args should pick up stage_configs_path from namespace.""" ns = argparse.Namespace( diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index 4e2ad9b257..d61102c7e1 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -194,11 +194,6 @@ def create_model_config(self) -> OmniModelConfig: Returns: OmniModelConfig instance with all configuration fields set """ - if self.stage_configs_path is not None: - raise RuntimeError( - "create_model_config() should not be called when stage_configs_path is set. " - "Per-stage model configs are resolved from the stage config YAML." - ) # register omni models to avoid model not found error self._ensure_omni_models_registered() From 644edac0b6e29b153380a2a3796c328918c2d614 Mon Sep 17 00:00:00 2001 From: Samit <285365963@qq.com> Date: Tue, 14 Apr 2026 14:20:38 +0800 Subject: [PATCH 157/204] [Perf] Optimize MP4 encoding latency in video generation (#2735) Signed-off-by: samithuang <285365963@qq.com> --- .../openai_api/test_video_server.py | 144 +++++++++++++----- vllm_omni/diffusion/utils/media_utils.py | 7 +- vllm_omni/entrypoints/openai/api_server.py | 27 +--- vllm_omni/entrypoints/openai/serving_video.py | 16 +- .../entrypoints/openai/video_api_utils.py | 29 +++- 5 files changed, 158 insertions(+), 65 deletions(-) diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py index fd7d4df60d..82c34f87e8 100644 --- a/tests/entrypoints/openai_api/test_video_server.py +++ b/tests/entrypoints/openai_api/test_video_server.py @@ -69,7 +69,7 @@ def set_stage_configs_if_missing(self, stage_configs): if self.stage_configs is None: self.stage_configs = stage_configs - async def generate_videos(self, request, reference_id, *, reference_image=None): + async def generate_video_bytes(self, request, reference_id, *, reference_image=None): self.started.set() try: await asyncio.Future() @@ -137,15 +137,81 @@ def _wait_until(predicate, timeout_s: float = 2.0, interval_s: float = 0.02): raise AssertionError("Timed out waiting for condition") +def test_async_video_generation_bypasses_base64(test_client, mocker: MockerFixture): + """Regression test: Ensure async video generation saves raw bytes directly + without bouncing through base64 encoding.""" + # We mock _encode_video_bytes (the correct path) + mocker.patch( + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"raw-mp4-bytes", + ) + + # We assert that encode_video_base64 is never called + mock_base64 = mocker.patch( + "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", + side_effect=RuntimeError("Regression: async video path should not base64 encode"), + ) + + response = test_client.post( + "/v1/videos", + data={"prompt": "A base64 test."}, + ) + assert response.status_code == 200 + video_id = response.json()["id"] + + # Wait for completion. 
If it used base64, the RuntimeError would fail the task + _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value) + mock_base64.assert_not_called() + + +def test_async_video_generation_with_audio_bypasses_base64(test_client, mocker: MockerFixture): + """Regression test: Ensure async video generation passes audio through + generate_video_bytes without bouncing through base64 encoding.""" + mock_encode = mocker.patch( + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"raw-mp4-bytes", + ) + + mock_base64 = mocker.patch( + "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", + side_effect=RuntimeError("Regression: async video path should not base64 encode"), + ) + + engine = test_client.app.state.openai_serving_video._engine_client + + async def _generate(prompt, request_id, sampling_params_list): + engine.captured_prompt = prompt + engine.captured_sampling_params_list = sampling_params_list + yield MockVideoResult([object()], audios=[object()], sample_rate=48000) + + engine.generate = _generate + + response = test_client.post( + "/v1/videos", + data={"prompt": "A base64 test with audio."}, + ) + assert response.status_code == 200 + video_id = response.json()["id"] + + _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value) + mock_base64.assert_not_called() + + mock_encode.assert_called_once() + kwargs = mock_encode.call_args.kwargs + assert "audio" in kwargs + assert kwargs["audio"] is not None + assert kwargs["audio_sample_rate"] == 48000 + + def test_t2v_video_generation_form(test_client, mocker: MockerFixture): fps_values = [] - def _fake_encode(video, fps): + def _fake_encode(video, fps, audio=None, audio_sample_rate=None, **kwargs): fps_values.append(fps) - return "Zg==" + return b"fake-video" mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", side_effect=_fake_encode, ) response = test_client.post( @@ -177,8 +243,8 @@ def test_i2v_video_generation_form(test_client, mocker: MockerFixture): image_bytes = _make_test_image_bytes((48, 32)) mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -203,8 +269,8 @@ def test_i2v_video_generation_resizes_input_to_requested_dimensions(test_client, image_bytes = _make_test_image_bytes((48, 32)) mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -229,8 +295,8 @@ def test_i2v_video_generation_resizes_input_to_requested_dimensions(test_client, def test_i2v_video_generation_with_image_reference_form(test_client, mocker: MockerFixture): mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -254,12 +320,12 @@ def test_i2v_video_generation_with_image_reference_form(test_client, mocker: Moc def test_seconds_defaults_fps_and_frames(test_client, mocker: MockerFixture): fps_values = [] - def _fake_encode(video, fps): + def _fake_encode(video, fps, audio=None, audio_sample_rate=None, 
**kwargs): fps_values.append(fps) - return "Zg==" + return b"fake-video" mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", side_effect=_fake_encode, ) response = test_client.post( @@ -283,8 +349,8 @@ def _fake_encode(video, fps): def test_size_param_sets_width_height(test_client, mocker: MockerFixture): mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -305,8 +371,8 @@ def test_size_param_sets_width_height(test_client, mocker: MockerFixture): def test_sampling_params_pass_through(test_client, mocker: MockerFixture): mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -337,10 +403,10 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture): def test_audio_sample_rate_comes_from_model_config(test_client, mocker: MockerFixture): audio_sample_rates = [] - def _fake_encode(video, fps, audio=None, audio_sample_rate=None): - del video, fps, audio + def _fake_encode(video, fps, audio=None, audio_sample_rate=None, video_codec_options=None): + del video, fps, audio, video_codec_options audio_sample_rates.append(audio_sample_rate) - return "Zg==" + return b"fake-video" engine = test_client.app.state.openai_serving_video._engine_client engine.model_config = SimpleNamespace( @@ -354,12 +420,14 @@ def _fake_encode(video, fps, audio=None, audio_sample_rate=None): async def _generate(prompt, request_id, sampling_params_list): engine.captured_prompt = prompt engine.captured_sampling_params_list = sampling_params_list - yield MockVideoResult([object()], audios=[object()]) + import numpy as np + + yield MockVideoResult([np.zeros((1, 64, 64, 3), dtype=np.uint8)], audios=[object()]) engine.generate = _generate mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", side_effect=_fake_encode, ) response = test_client.post( @@ -387,8 +455,8 @@ async def _generate(prompt, request_id, sampling_params_list): engine.generate = _generate mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post("/v1/videos", data={"prompt": "profile me"}) @@ -457,8 +525,8 @@ def test_invalid_seconds_returns_422(test_client): def test_negative_prompt_and_seed_pass_through(test_client, mocker: MockerFixture): mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -531,8 +599,8 @@ def test_video_request_validation(): def test_list_videos_supports_order_after_and_limit(test_client, mocker: MockerFixture): mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) ids = [] for i in range(3): @@ -600,8 +668,8 @@ def 
test_list_videos_supports_order_after_and_limit(test_client, mocker: MockerF def test_delete_completed_job_removes_file_and_metadata(test_client, mocker: MockerFixture): mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) create_resp = test_client.post("/v1/videos", data={"prompt": "Delete this video"}) assert create_resp.status_code == 200 @@ -672,8 +740,8 @@ def test_video_response_file_extension_is_robust(): def test_extra_params_merged_into_extra_args(test_client, mocker: MockerFixture): """extra_params JSON object is merged into sampling_params.extra_args.""" mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) extra_params = { "is_enable_stage2": True, @@ -703,8 +771,8 @@ def test_extra_params_merged_into_extra_args(test_client, mocker: MockerFixture) def test_extra_params_none_by_default(test_client, mocker: MockerFixture): """When extra_params is omitted, extra_args stays empty.""" mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -744,8 +812,8 @@ def test_extra_params_invalid_json(test_client): def test_extra_params_merged_with_existing_extra_args(test_client, mocker: MockerFixture): """extra_params is merged on top of existing extra_args (e.g. flow_shift).""" mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -769,8 +837,8 @@ def test_extra_params_merged_with_existing_extra_args(test_client, mocker: Mocke def test_sample_solver_forwarded_via_extra_params(test_client, mocker: MockerFixture): """sample_solver can be passed through existing extra_params for Wan2.2 online serving.""" mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", diff --git a/vllm_omni/diffusion/utils/media_utils.py b/vllm_omni/diffusion/utils/media_utils.py index f96a28fbd7..a09cd45953 100644 --- a/vllm_omni/diffusion/utils/media_utils.py +++ b/vllm_omni/diffusion/utils/media_utils.py @@ -20,6 +20,7 @@ def mux_video_audio_bytes( video_codec: str = "h264", audio_codec: str = "aac", crf: str = "18", + video_codec_options: dict[str, str] | None = None, ) -> bytes: """Mux video frames and optional audio waveform into MP4 bytes. 
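
# Illustrative client-side sketch of the video_codec_options knob introduced in
# this patch (the base URL and port are assumptions; the /v1/videos form fields
# and the extra_params JSON pass-through follow the tests above). The server
# defaults to {"preset": "ultrafast", "threads": "0"}, and a per-request
# override can trade encoding speed for output size.
import json
import requests

resp = requests.post(
    "http://localhost:8000/v1/videos",
    data={
        "prompt": "a red fox running through fresh snow",
        "extra_params": json.dumps(
            {"video_codec_options": {"preset": "veryfast", "crf": "20"}}
        ),
    },
)
print(resp.json()["id"])  # then poll the job status until it completes
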
@@ -42,7 +43,11 @@ def mux_video_audio_bytes( v_stream.width = video_frames.shape[2] v_stream.height = video_frames.shape[1] v_stream.pix_fmt = "yuv420p" - v_stream.options = {"crf": crf} + + options = {"crf": str(crf)} + if video_codec_options: + options.update(video_codec_options) + v_stream.options = options a_stream = None if audio_waveform is not None: diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index defaa9822c..6a65f44332 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1955,18 +1955,6 @@ def video_response_from_request(model_name: str, req: VideoGenerationRequest) -> return resp -async def decode_and_save_video_output(output: Any, file_name: str) -> str: - if not output.b64_json: - raise RuntimeError(f"Video output for {file_name} did not include b64_json content.") - - try: - video_bytes = base64.b64decode(output.b64_json) - except Exception as decode_exc: - raise RuntimeError(f"Failed to decode generated video payload for {file_name}") from decode_exc - - return await STORAGE_MANAGER.save(video_bytes, file_name) - - def _cleanup_video(video_id: str, output_path: str | None): try: if output_path is not None: @@ -1990,15 +1978,12 @@ async def _run_video_generation_job( started_at = time.perf_counter() output_path = None try: - response = await handler.generate_videos(request, video_id, reference_image=reference_image) - if not response.data: - raise RuntimeError("Video generation completed but returned no outputs.") - - if (video_count := len(response.data)) > 1: - logger.warning("Video request %s generated %s outputs but we only expected one.", video_id, video_count) + video_bytes, stage_durations, peak_memory_mb = await handler.generate_video_bytes( + request, video_id, reference_image=reference_image + ) file_name = f"{video_id}.{job.file_extension}" - output_path = await decode_and_save_video_output(response.data[0], file_name) + output_path = await STORAGE_MANAGER.save(video_bytes, file_name) logger.info("Video request %s persisted %s output file.", video_id, output_path) await VIDEO_STORE.update_fields( @@ -2009,8 +1994,8 @@ async def _run_video_generation_job( "file_name": file_name, "completed_at": int(time.time()), "inference_time_s": time.perf_counter() - started_at, - "stage_durations": response.stage_durations, - "peak_memory_mb": response.peak_memory_mb, + "stage_durations": stage_durations, + "peak_memory_mb": peak_memory_mb, }, ) except Exception as exc: diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py index 3e05a1eedd..0001fa65f8 100644 --- a/vllm_omni/entrypoints/openai/serving_video.py +++ b/vllm_omni/entrypoints/openai/serving_video.py @@ -178,17 +178,24 @@ async def generate_videos( reference_image: ReferenceImage | None = None, ) -> VideoGenerationResponse: artifacts = await self._run_and_extract(request, reference_id, reference_image=reference_image) + + video_codec_options = {"preset": "ultrafast", "threads": "0"} + if request.extra_params is not None and isinstance(request.extra_params, dict): + if "video_codec_options" in request.extra_params: + video_codec_options = request.extra_params["video_codec_options"] + _t_encode_start = time.perf_counter() video_data = [ VideoData( b64_json=( - encode_video_base64(video, fps=artifacts.output_fps) + encode_video_base64(video, fps=artifacts.output_fps, video_codec_options=video_codec_options) if artifacts.audios[idx] is None else 
encode_video_base64( video, fps=artifacts.output_fps, audio=artifacts.audios[idx], audio_sample_rate=artifacts.audio_sample_rate, + video_codec_options=video_codec_options, ) ) ) @@ -219,11 +226,18 @@ async def generate_video_bytes( len(artifacts.videos), ) audio = artifacts.audios[0] + + video_codec_options = {"preset": "ultrafast", "threads": "0"} + if request.extra_params is not None and isinstance(request.extra_params, dict): + if "video_codec_options" in request.extra_params: + video_codec_options = request.extra_params["video_codec_options"] + _t_encode_start = time.perf_counter() video_bytes = _encode_video_bytes( artifacts.videos[0], fps=artifacts.output_fps, **({"audio": audio, "audio_sample_rate": artifacts.audio_sample_rate} if audio is not None else {}), + video_codec_options=video_codec_options, ) _t_encode_ms = (time.perf_counter() - _t_encode_start) * 1000 logger.info("Video response encoding (MP4 bytes): %.2f ms", _t_encode_ms) diff --git a/vllm_omni/entrypoints/openai/video_api_utils.py b/vllm_omni/entrypoints/openai/video_api_utils.py index 69178fb3d3..1935469792 100644 --- a/vllm_omni/entrypoints/openai/video_api_utils.py +++ b/vllm_omni/entrypoints/openai/video_api_utils.py @@ -202,7 +202,13 @@ def _coerce_audio_to_numpy(audio: Any) -> np.ndarray: return arr.astype(np.float32) -def _encode_video_bytes(video: Any, fps: int, audio: Any | None = None, audio_sample_rate: int | None = None) -> bytes: +def _encode_video_bytes( + video: Any, + fps: int, + audio: Any | None = None, + audio_sample_rate: int | None = None, + video_codec_options: dict[str, str] | None = None, +) -> bytes: """Encode a video payload into MP4 bytes, optionally muxing audio.""" from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes @@ -213,7 +219,13 @@ def _encode_video_bytes(video: Any, fps: int, audio: Any | None = None, audio_sa frames_np = np.stack(frames, axis=0) if frames_np.ndim == 4 and frames_np.shape[-1] == 4: frames_np = frames_np[..., :3] - frames_u8 = (np.clip(frames_np, 0.0, 1.0) * 255).round().clip(0, 255).astype(np.uint8) + + if frames_np.dtype == np.uint8: + frames_u8 = frames_np + else: + frames_np = np.clip(frames_np, 0.0, 1.0) + frames_np *= 255.0 + frames_u8 = np.round(frames_np).astype(np.uint8) audio_np = _coerce_audio_to_numpy(audio) if audio is not None else None @@ -222,10 +234,19 @@ def _encode_video_bytes(video: Any, fps: int, audio: Any | None = None, audio_sa audio_np, fps=float(fps), audio_sample_rate=audio_sample_rate or 24000, + video_codec_options=video_codec_options, ) -def encode_video_base64(video: Any, fps: int, audio: Any | None = None, audio_sample_rate: int | None = None) -> str: +def encode_video_base64( + video: Any, + fps: int, + audio: Any | None = None, + audio_sample_rate: int | None = None, + video_codec_options: dict[str, str] | None = None, +) -> str: """Encode a video (frames/array/tensor) to base64 MP4.""" - video_bytes = _encode_video_bytes(video, fps=fps, audio=audio, audio_sample_rate=audio_sample_rate) + video_bytes = _encode_video_bytes( + video, fps=fps, audio=audio, audio_sample_rate=audio_sample_rate, video_codec_options=video_codec_options + ) return base64.b64encode(video_bytes).decode("utf-8") From 48c30bc399b40cadb550b106f5846f0b3354bddd Mon Sep 17 00:00:00 2001 From: iancarrasco-b10 Date: Tue, 14 Apr 2026 02:49:13 -0400 Subject: [PATCH 158/204] [Qwen3-TTS] Remove hardcoded `distributed_executor_backend` to improve single-GPU performance (#2604) Signed-off-by: Ian Carrasco --- examples/online_serving/qwen3_tts/README.md | 
48 +++++++++ .../stage_configs/qwen3_tts_uniproc.yaml | 97 +++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100644 vllm_omni/model_executor/stage_configs/qwen3_tts_uniproc.yaml diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md index e53fa7392b..b48db9cf45 100644 --- a/examples/online_serving/qwen3_tts/README.md +++ b/examples/online_serving/qwen3_tts/README.md @@ -378,6 +378,54 @@ Server -> Client: {"type": "session.done", "total_sentences": 1} ``` +## Choosing an Execution Backend: Uniproc vs Multiprocessing + +Qwen3-TTS stage configs support two execution backends controlled by the +`distributed_executor_backend` engine arg. The performance tradeoff between +them is **both hardware- and task-dependent**, so there is no single best +default (see [#2603](https://github.com/vllm-project/vllm-omni/issues/2603), +[#2604](https://github.com/vllm-project/vllm-omni/pull/2604) for the full +investigation). + +| Backend | Stage config setting | Behaviour | +| ------- | -------------------- | --------- | +| **Uniproc** (default, world_size=1) | `distributed_executor_backend` omitted | Both stages run inside the orchestrator process. Avoids IPC serialisation, D2H copies, and msgpack overhead between stages. | +| **Multiprocessing** | `distributed_executor_backend: "mp"` | Each stage runs in its own subprocess. The Talker can continue decoding while Code2Wav runs the vocoder in parallel, improving pipeline utilisation under concurrency. | + +> **Note:** When `distributed_executor_backend` is omitted and `world_size=1`, +> vLLM [automatically uses the uniproc executor](https://github.com/vllm-project/vllm/blob/main/vllm/config/parallel.py#L825). +> When `world_size > 1`, it defaults to `mp`. + +### When uniproc wins + +The uniproc path eliminates inter-process data transfer (D2H copies, +msgpack serialisation/deserialisation, tensor detaching). This matters most +when per-request processing is heavy relative to autoregressive decode. + +The Base cloning task involves reference-audio encoding on every request, making IPC +overhead a larger fraction of total cost. Qwen3-Omni shows a similar pattern. + +### When multiprocessing (`mp`) wins + +For lighter per-request workloads, process-level parallelism between the +Talker and Code2Wav stages dominates. + +CustomVoice is lighter per-request (no reference audio encoding), so the +process-level parallelism of `mp` outweighs its serialisation cost at +concurrency ≥ 4. + +### How to switch + +To use the uniproc executor on a single-GPU setup, pass the +`qwen3_tts_uniproc.yaml` stage config: + +```bash +vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-Base \ + --omni \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_uniproc.yaml \ + --port 8091 +``` + ## Limitations - **Single request**: Batch processing is not yet optimized for online serving. 
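For reference, this patch ships only the uniproc YAML; the multiprocessing counterpart is not included. The fragment below is an illustrative sketch only, showing where the `distributed_executor_backend: "mp"` setting from the table above would sit in a stage entry. The file name, the abbreviated key set, and the placement under `engine_args` are assumptions that mirror the layout of `qwen3_tts_uniproc.yaml` added below, not a config that exists in the repository.

```yaml
# Hypothetical qwen3_tts_mp.yaml fragment -- not part of this patch.
# Placement of distributed_executor_backend under engine_args mirrors the
# uniproc config's layout; all other keys are elided for brevity.
stage_args:
  - stage_id: 0
    stage_type: llm
    engine_args:
      model_stage: qwen3_tts
      model_arch: Qwen3TTSTalkerForConditionalGeneration
      distributed_executor_backend: "mp"   # Talker runs in its own subprocess
  - stage_id: 1
    stage_type: llm
    engine_args:
      model_stage: code2wav
      model_arch: Qwen3TTSCode2Wav
      distributed_executor_backend: "mp"   # Code2Wav vocoder runs in parallel
```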
diff --git a/vllm_omni/model_executor/stage_configs/qwen3_tts_uniproc.yaml b/vllm_omni/model_executor/stage_configs/qwen3_tts_uniproc.yaml new file mode 100644 index 0000000000..d2e920806d --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/qwen3_tts_uniproc.yaml @@ -0,0 +1,97 @@ +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + is_comprehension: true + runtime: + devices: "0" + engine_args: + model_stage: qwen3_tts + max_num_seqs: 10 + model_arch: Qwen3TTSTalkerForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: false + trust_remote_code: true + async_scheduling: true + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.3 + max_num_batched_tokens: 512 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk + # Use named connector to apply runtime.connectors.extra. + output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.05 + stop_token_ids: [2150] + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + engine_args: + model_stage: code2wav + max_num_seqs: 1 + model_arch: Qwen3TTSCode2Wav + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: true + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.3 + # Must be divisible by num_code_groups and cover (left_context + chunk). + # Prefill length is Q * num_frames (e.g. 16 * 2148 = 34368); keep headroom past 32k. + max_num_batched_tokens: 65536 + # async_chunk appends windows per step; max_model_len must cover accumulated flat codec stream. + max_model_len: 65536 + engine_input_source: [0] + final_output: true + final_output_type: audio + # Distributed connector configuration + input_connectors: + from_stage_0: connector_of_shared_memory + tts_args: + max_instructions_length: 500 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: true + repetition_penalty: 1.0 + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + # Frame-aligned codec streaming transport. + codec_streaming: true + # Connector polling / timeout (unit: loop count, sleep interval in seconds). + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + # Match the decoder sliding attention window to avoid chunk-boundary noise. 
+ codec_chunk_frames: 25 + codec_left_context_frames: 72 + + edges: + - from: 0 + to: 1 + window_size: -1 From 17acd0589a26a84bd30733496d9ffedee7f8cb67 Mon Sep 17 00:00:00 2001 From: Zhang Jian Date: Tue, 14 Apr 2026 15:05:12 +0800 Subject: [PATCH 159/204] [Test] Add Stable Audio offline e2e TeaCache Test (#2377) Signed-off-by: Zhang Signed-off-by: Zhang Jian Co-authored-by: Claude Opus 4.6 (1M context) --- .buildkite/test-amd-merge.yml | 2 +- .buildkite/test-amd-ready.yaml | 2 +- .buildkite/test-merge.yml | 18 ---- .buildkite/test-ready.yml | 2 +- docs/contributing/ci/CI_5levels.md | 2 +- docs/contributing/ci/tests_style.md | 2 +- docs/user_guide/diffusion_features.md | 2 +- .../offline_inference/text_to_audio/README.md | 2 + .../text_to_audio/text_to_audio.py | 26 +++++ pyproject.toml | 1 + tests/conftest.py | 28 ++++-- .../test_stable_audio_expansion.py | 99 +++++++++++++++++++ .../test_stable_audio_model.py | 63 ------------ 13 files changed, 156 insertions(+), 93 deletions(-) create mode 100644 tests/e2e/offline_inference/test_stable_audio_expansion.py delete mode 100644 tests/e2e/offline_inference/test_stable_audio_model.py diff --git a/.buildkite/test-amd-merge.yml b/.buildkite/test-amd-merge.yml index b6f2037d18..ac52f60b35 100644 --- a/.buildkite/test-amd-merge.yml +++ b/.buildkite/test-amd-merge.yml @@ -54,7 +54,7 @@ steps: # - export GPU_ARCHS=gfx942 # - export VLLM_LOGGING_LEVEL=DEBUG # - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model - label: "Diffusion Cache Backend Test" agent_pool: mi325_1 diff --git a/.buildkite/test-amd-ready.yaml b/.buildkite/test-amd-ready.yaml index ced91635c2..30bbc76941 100644 --- a/.buildkite/test-amd-ready.yaml +++ b/.buildkite/test-amd-ready.yaml @@ -69,7 +69,7 @@ steps: # - export GPU_ARCHS=gfx942 # - export VLLM_LOGGING_LEVEL=DEBUG # - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model - label: "Diffusion Cache Backend Test" agent_pool: mi325_1 diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 24fc6dd3dc..2a6cb6488a 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -76,24 +76,6 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Audio Generation Model Test" - timeout_in_minutes: 20 - depends_on: upload-merge-pipeline - commands: - - pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py - agents: - queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Diffusion Cache Backend Test" timeout_in_minutes: 15 depends_on: upload-merge-pipeline diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 13a812a62f..2f749f0ee9 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -123,7 +123,7 @@ steps: - label: "Audio Generation Model Test" depends_on: upload-ready-pipeline commands: - - timeout 20m pytest -s -v 
tests/e2e/offline_inference/test_stable_audio_model.py + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model agents: queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU plugins: diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index 74ae1a38eb..9306035738 100644 --- a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -242,7 +242,7 @@ vllm_omni/ tests/ ├── test_zimage_tensor_parallel.py ├── test_cache_dit.py ├── test_teacache.py - ├── test_stable_audio_model.py + ├── test_stable_audio_expansion.py ├── test_diffusion_cpu_offload.py ├── test_diffusion_layerwise_offload.py ├── test_diffusion_lora.py diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md index 8b10cf4cc1..69d5b16d7a 100644 --- a/docs/contributing/ci/tests_style.md +++ b/docs/contributing/ci/tests_style.md @@ -147,7 +147,7 @@ vllm_omni/ tests/ ├── test_zimage_tensor_parallel.py ├── test_cache_dit.py ├── test_teacache.py - ├── test_stable_audio_model.py + ├── test_stable_audio_expansion.py ├── test_diffusion_cpu_offload.py ├── test_diffusion_layerwise_offload.py ├── test_diffusion_lora.py diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index ac140ff84a..31cd1500fa 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -147,7 +147,7 @@ The following tables show which models support each feature: | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| -| **Stable-Audio-Open** | ❌ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | +| **Stable-Audio-Open** | ✅ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ## Feature Compatibility diff --git a/examples/offline_inference/text_to_audio/README.md b/examples/offline_inference/text_to_audio/README.md index 7edc38092a..50bab3e2f2 100644 --- a/examples/offline_inference/text_to_audio/README.md +++ b/examples/offline_inference/text_to_audio/README.md @@ -23,6 +23,7 @@ python text_to_audio.py \ --guidance-scale 7.0 \ --audio-length 10.0 \ --num-inference-steps 100 \ + --cache-backend tea_cache \ --output stable_audio_output.wav ``` @@ -34,4 +35,5 @@ Key arguments: - `--guidance-scale`: classifier-free guidance scale. - `--audio-length`: audio duration in seconds. - `--num-inference-steps`: diffusion sampling steps.(more steps = higher quality, slower). +- `--cache-backend`: cache acceleration backend. Stable Audio currently supports `tea_cache`. - `--output`: path to save the generated WAV file. 
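The README above documents the new `--cache-backend` flag for the example script. For readers driving Stable Audio from the offline Python API instead, the following is a minimal sketch of the equivalent call. It mirrors the `Omni(...)` construction and output handling used by `tests/e2e/offline_inference/test_stable_audio_expansion.py` later in this series; the prompt text and clip length are arbitrary, and `quantization` is omitted here to show TeaCache in isolation.

```python
# Minimal sketch: TeaCache-accelerated Stable Audio via the offline Omni API.
# Construction kwargs and output access mirror test_stable_audio_expansion.py.
import torch

from vllm_omni import Omni
from vllm_omni.inputs.data import OmniDiffusionSamplingParams
from vllm_omni.platforms import current_omni_platform

omni = Omni(
    model="stabilityai/stable-audio-open-1.0",
    cache_backend="tea_cache",            # enable TeaCache
    cache_config={"rel_l1_thresh": 0.2},  # accumulated relative-L1 threshold
)
try:
    outputs = omni.generate(
        prompts={"prompt": "A piano playing a gentle melody", "negative_prompt": "Low quality."},
        sampling_params_list=OmniDiffusionSamplingParams(
            num_inference_steps=100,
            guidance_scale=7.0,
            generator=torch.Generator(current_omni_platform.device_type).manual_seed(42),
            num_outputs_per_prompt=1,
            extra_args={"audio_start_in_s": 0.0, "audio_end_in_s": 10.0},
        ),
    )
    # (batch, channels, samples) float array at 44100 Hz for stable-audio-open-1.0
    audio = outputs[0].request_output.multimodal_output.get("audio")
finally:
    omni.close()
```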
diff --git a/examples/offline_inference/text_to_audio/text_to_audio.py b/examples/offline_inference/text_to_audio/text_to_audio.py index a6968c419f..3adb3ad53a 100644 --- a/examples/offline_inference/text_to_audio/text_to_audio.py +++ b/examples/offline_inference/text_to_audio/text_to_audio.py @@ -11,6 +11,7 @@ python text_to_audio.py --prompt "The sound of a dog barking" python text_to_audio.py --prompt "A piano playing a gentle melody" --audio-length 10.0 python text_to_audio.py --prompt "Thunder and rain sounds" --negative-prompt "Low quality" + python text_to_audio.py --prompt "A soft synth pad" --cache-backend tea_cache """ import argparse @@ -90,6 +91,23 @@ def parse_args() -> argparse.Namespace: default=44100, help="Sample rate for output audio (Stable Audio uses 44100 Hz).", ) + parser.add_argument( + "--cache-backend", + type=str, + default=None, + choices=["tea_cache"], + help=( + "Cache backend to use for acceleration. " + "Stable Audio currently supports 'tea_cache'. " + "Default: None (no cache acceleration)." + ), + ) + parser.add_argument( + "--tea-cache-rel-l1-thresh", + type=float, + default=0.2, + help="[tea_cache] Threshold for accumulated relative L1 distance.", + ) parser.add_argument( "--enable-diffusion-pipeline-profiler", action="store_true", @@ -124,6 +142,11 @@ def save_audio(audio_data: np.ndarray, output_path: str, sample_rate: int = 4410 def main(): args = parse_args() generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed) + cache_config = None + if args.cache_backend == "tea_cache": + cache_config = { + "rel_l1_thresh": args.tea_cache_rel_l1_thresh, + } print(f"\n{'=' * 60}") print("Stable Audio Open - Text-to-Audio Generation") @@ -134,12 +157,15 @@ def main(): print(f" Audio length: {args.audio_length}s") print(f" Inference steps: {args.num_inference_steps}") print(f" Guidance scale: {args.guidance_scale}") + print(f" Cache backend: {args.cache_backend if args.cache_backend else 'None (no acceleration)'}") print(f" Seed: {args.seed}") print(f"{'=' * 60}\n") # Initialize Omni with Stable Audio model omni = Omni( model=args.model, + cache_backend=args.cache_backend, + cache_config=cache_config, enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler, ) diff --git a/pyproject.toml b/pyproject.toml index e49aa6e325..57a4b474fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -182,6 +182,7 @@ markers = [ "H100: Tests that require H100 GPU", "L4: Tests that require L4 GPU", "MI325: Tests that require MI325 GPU (AMD/ROCm)", + "B60: Tests that require Intel Arc Pro B60 XPU", "S5000: Tests that require S5000 GPU (Moore Threads/MUSA)", "A2: Tests that require A2 NPU", "A3: Tests that require A3 NPU", diff --git a/tests/conftest.py b/tests/conftest.py index 9c739533b8..e41d15bdf5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -167,7 +167,6 @@ def assert_audio_diffusion_response( Validate audio diffusion response. 
""" raise NotImplementedError("Audio validation is not implemented yet") - # consider using assert_audio_valid defined above def _maybe_int(value: Any) -> int | None: @@ -277,15 +276,32 @@ def assert_video_valid( pass -def assert_audio_valid(path: Path, *, sample_rate: int, channels: int, duration_s: float) -> None: - """Assert the WAV has the expected sample rate, channel count, and duration.""" +def assert_audio_valid( + audio_or_path: Path | np.ndarray, + *, + sample_rate: int, + channels: int, + duration_s: float, +) -> None: + """Assert WAV file or (batch, channels, samples) ndarray matches expected audio format.""" + expected_samples = int(duration_s * sample_rate) + if isinstance(audio_or_path, np.ndarray): + audio = audio_or_path + assert audio.ndim == 3, f"Expected audio ndim=3 (batch, channels, samples), got shape {audio.shape}" + assert audio.shape[0] == 1, f"Expected batch size 1, got {audio.shape[0]}" + assert audio.shape[1] == channels, f"Expected {channels} channels, got {audio.shape[1]}" + assert audio.shape[2] == expected_samples, ( + f"Expected {expected_samples} samples ({duration_s}s @ {sample_rate} Hz), got {audio.shape[2]}" + ) + return + + path = audio_or_path assert path.exists(), f"Audio not found: {path}" info = sf.info(str(path)) assert info.samplerate == sample_rate, f"Expected sample_rate={sample_rate}, got {info.samplerate}" assert info.channels == channels, f"Expected {channels} channel(s), got {info.channels}" - expected_frames = int(duration_s * sample_rate) - assert info.frames == expected_frames, ( - f"Expected {expected_frames} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}" + assert info.frames == expected_samples, ( + f"Expected {expected_samples} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}" ) diff --git a/tests/e2e/offline_inference/test_stable_audio_expansion.py b/tests/e2e/offline_inference/test_stable_audio_expansion.py new file mode 100644 index 0000000000..54c1799e14 --- /dev/null +++ b/tests/e2e/offline_inference/test_stable_audio_expansion.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Stable Audio offline e2e: real weights, FP8 + TeaCache (single job to save GPU). + +NOTE: This test instantiates Omni directly instead of using the omni_runner +fixture (introduced in PR #2711) because the fixture's parametrize interface +only accepts (model, stage_config_path) and does not support extra kwargs like +quantization, cache_backend, or cache_config. 
+""" + +from __future__ import annotations + +import numpy as np +import pytest +import torch + +from tests.conftest import assert_audio_valid +from tests.utils import hardware_test +from vllm_omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput +from vllm_omni.platforms import current_omni_platform + +_SAMPLE_RATE = 44100 +_CLIP_DURATION_S = 2.0 + + +def generate_stable_audio_short_clip( + omni: Omni, + *, + audio_start_in_s: float = 0.0, + audio_end_in_s: float = 2.0, + num_inference_steps: int = 4, + seed: int = 42, +) -> np.ndarray: + """Run a minimal Stable Audio generation and return audio as (batch, channels, samples).""" + outputs = omni.generate( + prompts={ + "prompt": "The sound of a dog barking", + "negative_prompt": "Low quality.", + }, + sampling_params_list=OmniDiffusionSamplingParams( + num_inference_steps=num_inference_steps, + guidance_scale=7.0, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), + num_outputs_per_prompt=1, + extra_args={ + "audio_start_in_s": audio_start_in_s, + "audio_end_in_s": audio_end_in_s, + }, + ), + ) + + assert outputs is not None + first_output = outputs[0] + # Outer OmniRequestOutput.final_output_type comes from get_stage_metadata. + # The nested request_output is the worker OmniRequestOutput + # (e.g. final_output_type="audio") and holds the multimodal payload. + # Follow-up: add StableAudioPipeline stage YAML, and pass model into + # _create_default_diffusion_stage_cfg so default diffusion metadata can set + # final_output_type to "audio" for future audio pipelines without YAML. + assert first_output.final_output_type == "image" + assert hasattr(first_output, "request_output") and first_output.request_output + + req_out = first_output.request_output + assert isinstance(req_out, OmniRequestOutput) + assert req_out.final_output_type == "audio" + assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output + audio = req_out.multimodal_output.get("audio") + assert isinstance(audio, np.ndarray) + return audio + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@pytest.mark.cache +@hardware_test(res={"cuda": "L4", "xpu": "B60"}) +def test_stable_audio_quantization_and_teacache() -> None: + """Stable Audio Open on real Hub weights with FP8 + TeaCache (covers former L2 smoke + L4 features). + + CI should provide ``HF_TOKEN`` if the checkpoint is gated. + """ + m = Omni( + model="stabilityai/stable-audio-open-1.0", + quantization="fp8", + cache_backend="tea_cache", + cache_config={"rel_l1_thresh": 0.2}, + ) + try: + audio = generate_stable_audio_short_clip(m) + assert_audio_valid( + audio, + sample_rate=_SAMPLE_RATE, + channels=2, + duration_s=_CLIP_DURATION_S, + ) + finally: + m.close() diff --git a/tests/e2e/offline_inference/test_stable_audio_model.py b/tests/e2e/offline_inference/test_stable_audio_model.py deleted file mode 100644 index 21d75aad52..0000000000 --- a/tests/e2e/offline_inference/test_stable_audio_model.py +++ /dev/null @@ -1,63 +0,0 @@ -import numpy as np -import pytest -import torch - -from tests.utils import hardware_test -from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from vllm_omni.outputs import OmniRequestOutput -from vllm_omni.platforms import current_omni_platform - -# Use random weights model for CI testing (small, no authentication required) -models = ["linyueqian/stable_audio_random"] - -# omni_runner expects (model, stage_configs_path); single-stage diffusion has no YAML. 
-test_params = [(m, None) for m in models] - - -@pytest.mark.core_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "L4", "xpu": "B60"}) -@pytest.mark.parametrize("omni_runner", test_params, indirect=True) -def test_stable_audio_model(omni_runner): - # Use minimal settings for testing - # Generate a short 2-second audio clip with minimal inference steps - audio_start_in_s = 0.0 - audio_end_in_s = 2.0 # Short duration for fast testing - sample_rate = 44100 # Stable Audio uses 44100 Hz - - outputs = omni_runner.omni.generate( - prompts={ - "prompt": "The sound of a dog barking", - "negative_prompt": "Low quality.", - }, - sampling_params_list=OmniDiffusionSamplingParams( - num_inference_steps=4, # Minimal steps for speed - guidance_scale=7.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), - num_outputs_per_prompt=1, - extra_args={ - "audio_start_in_s": audio_start_in_s, - "audio_end_in_s": audio_end_in_s, - }, - ), - ) - - # Extract audio from OmniRequestOutput - assert outputs is not None - first_output = outputs[0] - assert first_output.final_output_type == "image" - assert hasattr(first_output, "request_output") and first_output.request_output - - req_out = first_output.request_output - assert isinstance(req_out, OmniRequestOutput) - assert req_out.final_output_type == "audio" - assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output - audio = req_out.multimodal_output.get("audio") - assert isinstance(audio, np.ndarray) - # audio shape: (batch, channels, samples) - # For stable-audio-open-1.0: sample_rate=44100, so 2 seconds = 88200 samples - assert audio.ndim == 3 - assert audio.shape[0] == 1 # batch size - assert audio.shape[1] == 2 # stereo channels - expected_samples = int((audio_end_in_s - audio_start_in_s) * sample_rate) - assert audio.shape[2] == expected_samples # 88200 samples for 2 seconds From 6d01a8b506a2a28a7aedc1ffd5c989a407b0bd70 Mon Sep 17 00:00:00 2001 From: NATURE Date: Tue, 14 Apr 2026 16:06:37 +0800 Subject: [PATCH 160/204] [Omni Connector] Omni Transfer Engine Connector: Enable 1-receiver-to-N-senders to support Bagel TP/CFG parallel (#2731) Signed-off-by: natureofnature --- .../omni_connectors/test_shm_connector.py | 184 ++++++++++++++++++ .../omni_connectors/connectors/base.py | 10 +- .../connectors/mooncake_store_connector.py | 19 +- .../mooncake_transfer_engine_connector.py | 178 +++++++++++------ .../connectors/shm_connector.py | 113 ++++++++--- .../omni_connectors/utils/initialization.py | 5 + 6 files changed, 422 insertions(+), 87 deletions(-) create mode 100644 tests/distributed/omni_connectors/test_shm_connector.py diff --git a/tests/distributed/omni_connectors/test_shm_connector.py b/tests/distributed/omni_connectors/test_shm_connector.py new file mode 100644 index 0000000000..e702318e3f --- /dev/null +++ b/tests/distributed/omni_connectors/test_shm_connector.py @@ -0,0 +1,184 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for SharedMemoryConnector focusing on TP / CFG / metadata fallback.""" + +import pytest + +from vllm_omni.distributed.omni_connectors.connectors.shm_connector import ( + SharedMemoryConnector, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +@pytest.fixture() +def connector(): + c = SharedMemoryConnector({"shm_threshold_bytes": 64}) + yield c + c.close() + + +# ── Key-based read (the fundamental SHM path) ──────────────────────── + + +class TestKeyBasedReadWrite: + def 
test_put_then_get_by_key(self, connector): + data = {"hello": "world", "n": 42} + ok, size, meta = connector.put("s0", "s1", "test_key_1", data) + assert ok + assert size > 0 + assert "shm" in meta + assert "test_key_1" in connector._pending_keys + + result = connector.get("s0", "s1", "test_key_1", metadata=None) + assert result is not None + obj, rsize = result + assert obj == data + assert rsize == size + assert "test_key_1" not in connector._pending_keys + + def test_get_nonexistent_key_returns_none(self, connector): + result = connector.get("s0", "s1", "no_such_key_xyz", metadata=None) + assert result is None + + def test_rank_aware_keys_independent(self, connector): + """Each TP rank writes/reads its own key — simulates homogeneous TP.""" + payloads = {} + for rank in range(4): + key = f"req1_s0_0_{rank}_{rank}" + data = {"rank": rank, "values": list(range(rank, rank + 3))} + ok, _, _ = connector.put("s0", "s1", key, data) + assert ok + payloads[rank] = data + + for rank in range(4): + key = f"req1_s0_0_{rank}_{rank}" + result = connector.get("s0", "s1", key, metadata=None) + assert result is not None + obj, _ = result + assert obj == payloads[rank] + + +# ── Metadata fallback behaviour ────────────────────────────────────── + + +class TestMetadataFallback: + def test_rdma_style_metadata_falls_back_to_key(self, connector): + """source_host/source_port metadata should be ignored; key read used.""" + data = {"payload": True} + connector.put("s0", "s1", "fb_key_1", data) + + rdma_meta = {"source_host": "10.0.0.1", "source_port": 12345} + result = connector.get("s0", "s1", "fb_key_1", metadata=rdma_meta) + assert result is not None + obj, _ = result + assert obj == data + + def test_non_dict_metadata_falls_back_to_key(self, connector): + data = {"val": 99} + connector.put("s0", "s1", "fb_key_2", data) + + result = connector.get("s0", "s1", "fb_key_2", metadata="not_a_dict") + assert result is not None + obj, _ = result + assert obj == data + + def test_empty_dict_metadata_falls_back_to_key(self, connector): + data = {"x": 1} + connector.put("s0", "s1", "fb_key_3", data) + + result = connector.get("s0", "s1", "fb_key_3", metadata={}) + assert result is not None + obj, _ = result + assert obj == data + + def test_shm_handle_metadata_still_works(self, connector): + """When metadata contains a proper 'shm' handle, use it directly.""" + data = {"direct": True} + ok, size, meta = connector.put("s0", "s1", "shm_direct_1", data) + assert ok + result = connector.get("s0", "s1", "shm_direct_1", metadata=meta) + assert result is not None + obj, _ = result + assert obj == data + + def test_metadata_keyed_by_request_id(self, connector): + """Metadata wrapped as {get_key: actual_meta} should be unwrapped.""" + data = {"wrapped": True} + ok, size, meta = connector.put("s0", "s1", "wrap_key", data) + assert ok + wrapped = {"wrap_key": meta} + result = connector.get("s0", "s1", "wrap_key", metadata=wrapped) + assert result is not None + obj, _ = result + assert obj == data + + +# ── Heterogeneous TP multi-key read ────────────────────────────────── + + +class TestHeteroTPMultiKey: + def test_receiver_reads_multiple_sender_keys(self, connector): + """Simulates from_tp=2 -> to_tp=1: receiver reads 2 keys and merges.""" + for sender_rank in range(2): + key = f"req1_s0_0_{sender_rank}_0" + data = {"sender": sender_rank, "shard": [sender_rank * 10]} + connector.put("s0", "s1", key, data) + + shards = [] + for sender_rank in range(2): + key = f"req1_s0_0_{sender_rank}_0" + result = connector.get("s0", "s1", 
key, metadata=None) + assert result is not None + obj, _ = result + shards.append(obj) + + assert len(shards) == 2 + assert shards[0]["sender"] == 0 + assert shards[1]["sender"] == 1 + + def test_sender_writes_multiple_receiver_keys(self, connector): + """Simulates from_tp=1 -> to_tp=2: sender writes 2 sliced keys.""" + for recv_rank in range(2): + key = f"req1_s0_0_0_{recv_rank}" + data = {"target": recv_rank, "slice": list(range(recv_rank, recv_rank + 2))} + connector.put("s0", "s1", key, data) + + for recv_rank in range(2): + key = f"req1_s0_0_0_{recv_rank}" + result = connector.get("s0", "s1", key, metadata=None) + assert result is not None + obj, _ = result + assert obj["target"] == recv_rank + + +# ── Cleanup ────────────────────────────────────────────────────────── + + +class TestCleanup: + def test_cleanup_removes_unconsumed_segment(self, connector): + data = {"leak": True} + connector.put("s0", "s1", "cleanup_req_42", data) + assert "cleanup_req_42" in connector._pending_keys + + connector.cleanup("req_42") + assert "cleanup_req_42" not in connector._pending_keys + + result = connector.get("s0", "s1", "cleanup_req_42", metadata=None) + assert result is None + + def test_cleanup_noop_for_consumed_segment(self, connector): + data = {"consumed": True} + connector.put("s0", "s1", "consumed_req_99", data) + connector.get("s0", "s1", "consumed_req_99", metadata=None) + + connector.cleanup("req_99") + assert "consumed_req_99" not in connector._pending_keys + + def test_close_cleans_all_pending(self, connector): + for i in range(3): + connector.put("s0", "s1", f"close_test_{i}", {"i": i}) + + assert len(connector._pending_keys) == 3 + connector.close() + assert len(connector._pending_keys) == 0 diff --git a/vllm_omni/distributed/omni_connectors/connectors/base.py b/vllm_omni/distributed/omni_connectors/connectors/base.py index 83edb2ab0a..0df428f2ff 100644 --- a/vllm_omni/distributed/omni_connectors/connectors/base.py +++ b/vllm_omni/distributed/omni_connectors/connectors/base.py @@ -34,13 +34,21 @@ def put(self, from_stage: str, to_stage: str, put_key: str, data: Any) -> tuple[ pass @abstractmethod - def get(self, from_stage: str, to_stage: str, get_key: str, metadata=None) -> tuple[Any, int] | None: + def get( + self, from_stage: str, to_stage: str, get_key: str, metadata: dict[str, Any] | None = None + ) -> tuple[Any, int] | None: """Retrieve Python object and payload size (bytes). Args: from_stage: Source stage identifier to_stage: Destination stage identifier get_key: Unique request identifier + metadata: Optional transport-specific metadata. When provided, + the connector uses it directly (e.g. source_host, source_port, + data_size) instead of querying the sender. For heterogeneous + TP the manager may supply partial metadata (host/port only); + the connector will query the sender at that address to fill + in data_size. 
Returns: Tuple of (Python object, serialized byte size) if found, None otherwise diff --git a/vllm_omni/distributed/omni_connectors/connectors/mooncake_store_connector.py b/vllm_omni/distributed/omni_connectors/connectors/mooncake_store_connector.py index c672e35f79..fa1fc3286d 100644 --- a/vllm_omni/distributed/omni_connectors/connectors/mooncake_store_connector.py +++ b/vllm_omni/distributed/omni_connectors/connectors/mooncake_store_connector.py @@ -78,7 +78,24 @@ def put(self, from_stage: str, to_stage: str, put_key: str, data: Any) -> tuple[ try: serialized_data = self.serialize_obj(data) key = self._make_key(put_key, from_stage, to_stage) - self.store.put(key, serialized_data, self.pin) + put_rc = self.store.put(key, serialized_data, self.pin) + + if isinstance(put_rc, bool): + put_ok = put_rc + else: + put_ok = put_rc is None or put_rc == 0 + + if not put_ok: + self._metrics["errors"] += 1 + logger.error( + "MooncakeStoreConnector put failed for %s (%s -> %s), rc=%r, %d bytes", + key, + from_stage, + to_stage, + put_rc, + len(serialized_data), + ) + return False, 0, None self._metrics["puts"] += 1 self._metrics["bytes_transferred"] += len(serialized_data) diff --git a/vllm_omni/distributed/omni_connectors/connectors/mooncake_transfer_engine_connector.py b/vllm_omni/distributed/omni_connectors/connectors/mooncake_transfer_engine_connector.py index 96a528963f..bd4160f3e6 100644 --- a/vllm_omni/distributed/omni_connectors/connectors/mooncake_transfer_engine_connector.py +++ b/vllm_omni/distributed/omni_connectors/connectors/mooncake_transfer_engine_connector.py @@ -230,16 +230,19 @@ class MooncakeTransferEngineConnector(OmniConnectorBase): sender immediately cleans up the buffer (``cleanup()``), so only the first receiver to pull a given key will succeed. Broadcast / multicast (1 sender → N receivers sharing the same data) is not yet supported. - - **1 receiver → 1 sender**: ``update_sender_info()`` stores a single - ``(sender_host, sender_zmq_port)`` pair, so a receiver can only query - metadata from one sender at a time. + - **1 receiver → N senders**: Supported via partial metadata. The + manager constructs metadata with the target sender's + ``source_host`` / ``source_port`` (computed from ``from_rank``) + and passes it to ``get(metadata=...)``. The connector detects + that ``data_size`` is missing, queries the specified sender at + the given address to fill it in, then performs the RDMA pull. + This enables heterogeneous TP (sender TP > receiver TP) where a + single receiver must pull KV shards from multiple sender ranks. Future work: - Support 1 sender → N receivers (e.g. reference-counted buffers, or explicit ``retain()`` / ``release()`` semantics so the buffer survives multiple pulls). - - Support 1 receiver → N senders (e.g. a sender registry mapping - ``get_key`` prefixes to different sender endpoints). """ # RDMA connector copies raw bytes/tensor directly to the memory pool @@ -267,6 +270,7 @@ def __init__(self, config: dict[str, Any]): self._req_local = threading.local() self._worker_local = threading.local() self._last_ttl_check: float = _time_mod.monotonic() + self._sender_endpoints: dict[int, tuple[str, int]] = {} self._metrics = { "puts": 0, @@ -408,16 +412,38 @@ def get_connection_info(self) -> dict[str, Any]: "can_put": self.can_put, } - def update_sender_info(self, sender_host: str, sender_zmq_port: int) -> None: - """ - Inject the sender's ZMQ endpoint into the receiver connector. 
- Used for NO METADATA GET calls.(E.g: KV-cache transfer path) - Must be called before using get() without metadata! - Otherwise, get() will raise an error. + def update_sender_info( + self, + sender_host: str, + sender_zmq_port: int, + sender_rank: int | None = None, + ) -> None: + """Inject a sender's ZMQ endpoint into the receiver connector. + + When ``sender_rank`` is ``None`` (default), sets the single default + sender used by ``get()`` when no rank is specified — this preserves + backward-compatible 1:1 semantics. + + When ``sender_rank`` is an integer, the endpoint is stored in a + per-rank registry for internal use (e.g. by + ``_query_metadata_from_sender(sender_rank=R)``). """ - self.sender_host = sender_host - self.sender_zmq_port = sender_zmq_port - logger.info(f"Sender info updated: host={sender_host!r}, zmq_port={sender_zmq_port}") + if sender_rank is not None: + self._sender_endpoints[sender_rank] = (sender_host, sender_zmq_port) + logger.info( + "Sender info updated for rank %s: host=%r, zmq_port=%s", + sender_rank, + sender_host, + sender_zmq_port, + ) + else: + self.sender_host = sender_host + self.sender_zmq_port = sender_zmq_port + logger.info( + "Sender info updated (default): host=%r, zmq_port=%s", + sender_host, + sender_zmq_port, + ) def _get_local_ip(self) -> str: """ @@ -657,56 +683,75 @@ def put(self, from_stage: str, to_stage: str, put_key: str, data: Any) -> tuple[ logger.error(f"RDMA Put failed for {put_key}: {e}", exc_info=True) return False, 0, None - def _query_metadata_from_sender(self, get_key: str) -> dict[str, Any] | None: - """Query metadata from sender via ZMQ (fallback when ``metadata=None``). - - ``get()`` supports two metadata resolution paths:: - - get(metadata=?) - ├── metadata provided (adapter path) - │ → use metadata directly (source_host/port/data_size) - │ → RDMA pull - └── metadata=None (KV-transfer polling path) - → _query_metadata_from_sender(get_key) ← this method - │ - ├── sender_host resolved (via update_sender_info) - │ → ZMQ query → get data_size/is_fast_path - │ → construct metadata → RDMA pull - └── sender_host unresolved ("auto" / None) - → return None → caller retries or times out + def _resolve_sender_endpoint(self, sender_rank: int | None = None) -> tuple[str, int] | None: + """Return ``(host, zmq_port)`` for *sender_rank*. - For the second path, the caller must call - :meth:`update_sender_info` before ``get()`` to resolve the sender's ZMQ endpoint. - Support the two paths in case that the orchestrator pushes the request info - to different stages at the same time knowing metadata or not. + Resolution order: + 1. Per-rank registry (``_sender_endpoints[sender_rank]``) + 2. Default sender (``sender_host`` / ``sender_zmq_port``) + 3. ``None`` if nothing is configured. + """ + if sender_rank is not None and sender_rank in self._sender_endpoints: + return self._sender_endpoints[sender_rank] + host = getattr(self, "sender_host", None) + port = getattr(self, "sender_zmq_port", None) + if host and port and str(host).lower() != "auto": + return (host, int(port)) + return None + + def _query_metadata_at(self, get_key: str, host: str, port: int) -> dict[str, Any] | None: + """Query metadata from a sender endpoint via ZMQ. + + Returns ``{source_host, source_port, data_size, is_fast_path}`` + or ``None`` when the key is not found / the query fails. 
""" - zmq_addr = f"tcp://{self.sender_host}:{self.sender_zmq_port}" + zmq_addr = f"tcp://{host}:{port}" req_socket = self._get_req_socket(zmq_addr, timeout_ms=5000) - try: - # Send query request - query = QueryRequest(request_id=get_key) - req_socket.send(QUERY_INFO + msgspec.msgpack.encode(query)) + req_socket.send(QUERY_INFO + msgspec.msgpack.encode(QueryRequest(request_id=get_key))) resp = req_socket.recv() - if resp == INFO_NOT_FOUND: return None - - # Parse response query_resp = msgspec.msgpack.decode(resp, type=QueryResponse) return { - # source_host/source_port are used for verification - "source_host": self.sender_host, - "source_port": self.sender_zmq_port, + "source_host": host, + "source_port": port, "data_size": query_resp.data_size, "is_fast_path": query_resp.is_fast_path, } except Exception as e: - # Socket may be stuck in bad state after timeout; discard it self._invalidate_req_socket(zmq_addr) - logger.debug(f"Failed to query metadata for {get_key}: {e}") + logger.debug("Failed to query metadata at %s for %s: %s", zmq_addr, get_key, e) return None + def _query_metadata_from_sender(self, get_key: str, sender_rank: int | None = None) -> dict[str, Any] | None: + """Query metadata from sender via ZMQ (fallback when ``metadata=None``). + + ``get()`` supports three metadata resolution paths:: + + get(metadata=?) + ├── Path 1: metadata has data_size (adapter path) + │ → use metadata directly → RDMA pull + ├── Path 2: metadata has source_host/port but no data_size + │ → _query_metadata_at(host, port) → get data_size → RDMA pull + └── Path 3: metadata=None (KV-transfer polling path) + → _query_metadata_from_sender(get_key) ← this method + │ + ├── sender endpoint resolved (via update_sender_info) + │ → ZMQ query → get data_size/is_fast_path + │ → construct metadata → RDMA pull + └── sender endpoint unresolved + → return None → caller retries or times out + + When *sender_rank* is provided, the query is routed to that + rank's endpoint (registered via ``update_sender_info(rank=...)``). + Otherwise the default sender is used. + """ + endpoint = self._resolve_sender_endpoint(sender_rank) + if endpoint is None: + return None + return self._query_metadata_at(get_key, *endpoint) + def get( self, from_stage: str, @@ -714,12 +759,18 @@ def get( get_key: str, metadata: dict[str, Any] | None = None, ) -> tuple[Any, int] | None: - """ - Consumer Side. - Allocates from local pool and pulls data via RDMA. + """Consumer Side. Allocates from local pool and pulls data via RDMA. + + Metadata resolution: - If metadata is not provided, will attempt to query it from sender - using configured sender_host/sender_zmq_port. + 1. ``metadata`` provided **with** ``data_size`` → use directly (RDMA pull). + 2. ``metadata`` provided with ``source_host``/``source_port`` but + **without** ``data_size`` → query that specific sender for + ``data_size`` / ``is_fast_path``, then RDMA pull. This is the + heterogeneous-TP path where the manager knows the target sender + endpoint but not the payload size. + 3. ``metadata=None`` → query the default sender (set via + ``update_sender_info()``) for the full metadata. Returns: ``(data, size)`` on success, ``None`` on failure. @@ -727,9 +778,6 @@ def get( - **is_fast_path=True** (tensor *or* bytes payload): Returns ``(ManagedBuffer, size)``. 
**CALLER MUST call ``ManagedBuffer.release()`` after consuming.** - Note: even if the producer ``put()`` raw ``bytes``, the consumer - receives a ``ManagedBuffer`` — use ``buf.to_bytes()`` to obtain - a ``bytes`` copy, or ``buf.tensor`` for zero-copy access. - **is_fast_path=False** (serialized Python object): Returns ``(DeserializedObject, size)``. Buffer is auto-released internally after deserialization. @@ -741,9 +789,8 @@ def get( _t0 = _time_mod.perf_counter() - # If no metadata provided, try to query from sender if not metadata: - # Must insert sender info before using get() without metadata. + # Path 3: no metadata at all — query default sender if not self.sender_host or not self.sender_zmq_port or str(self.sender_host).lower() == "auto": raise RuntimeError( f"get(metadata=None) requires sender info to be resolved, " @@ -753,6 +800,21 @@ def get( metadata = self._query_metadata_from_sender(get_key) if not metadata: return None + elif "data_size" not in metadata: + # Path 2: partial metadata (host/port only) — query that sender + partial_host = metadata.get("source_host") + partial_port = metadata.get("source_port") + if not partial_host or not partial_port: + logger.warning( + "get(%s): partial metadata missing source_host/source_port, cannot resolve data_size. metadata=%s", + get_key, + metadata, + ) + return None + queried = self._query_metadata_at(get_key, str(partial_host), int(partial_port)) + if not queried: + return None + metadata = queried _t1 = _time_mod.perf_counter() _query_ms = (_t1 - _t0) * 1000 diff --git a/vllm_omni/distributed/omni_connectors/connectors/shm_connector.py b/vllm_omni/distributed/omni_connectors/connectors/shm_connector.py index 5c7384c1f8..6cf5c2f15b 100644 --- a/vllm_omni/distributed/omni_connectors/connectors/shm_connector.py +++ b/vllm_omni/distributed/omni_connectors/connectors/shm_connector.py @@ -15,9 +15,13 @@ class SharedMemoryConnector(OmniConnectorBase): - """ - Connector that uses SharedMemory for large objects and inline data for small objects. - Acts as a unified replacement for the legacy IPC fallback logic. + """Key-addressed local shared-memory connector. + + SHM is a local-only transport: it reads/writes POSIX shared memory + segments identified purely by *key*. It does **not** understand + remote-transport metadata such as ``source_host`` / ``source_port`` + (that is the RDMA connector's job). When such metadata is passed in, + the connector silently falls back to key-based lookup. 
""" def __init__(self, config: dict[str, Any]): @@ -25,6 +29,7 @@ def __init__(self, config: dict[str, Any]): self.stage_id = config.get("stage_id", -1) self.device = config.get("device", "cuda:0") self.threshold = int(config.get("shm_threshold_bytes", 65536)) + self._pending_keys: set[str] = set() self._metrics = { "puts": 0, "gets": 0, @@ -59,6 +64,7 @@ def put( # meta contains {'name': ..., 'size': ...} metadata = {"shm": meta, "size": size} + self._pending_keys.add(put_key) self._metrics["shm_writes"] += 1 else: # Inline - pass bytes directly to avoid double serialization of the object @@ -93,6 +99,28 @@ def _get_data_with_lock(self, lock_file: str, shm_handle: dict): if obj and os.path.exists(lock_file): os.remove(lock_file) + def _get_by_key(self, get_key: str) -> tuple[Any, int] | None: + """Read a SHM segment addressed purely by *get_key*.""" + shm = None + try: + shm = shm_pkg.SharedMemory(name=get_key) + if shm is None or shm.size == 0: + return None + lock_file = f"/dev/shm/shm_{get_key}_lockfile.lock" + shm_handle = {"name": get_key, "size": shm.size} + result = self._get_data_with_lock(lock_file, shm_handle) + if result is not None: + self._pending_keys.discard(get_key) + return result + except FileNotFoundError: + return None + except Exception: + logger.debug("_get_by_key: unexpected error reading SHM segment %s", get_key, exc_info=True) + return None + finally: + if shm: + shm.close() + def get( self, from_stage: str, @@ -101,16 +129,16 @@ def get( metadata=None, ) -> tuple[Any, int] | None: if metadata is not None: - # Some callers may wrap metadata by request id. if isinstance(metadata, dict) and get_key in metadata: metadata = metadata.get(get_key) if not isinstance(metadata, dict): - return None + return self._get_by_key(get_key) if "inline_bytes" in metadata: try: obj = self.deserialize_obj(metadata["inline_bytes"]) + self._pending_keys.discard(get_key) return obj, int(metadata.get("size", 0)) except Exception as e: logger.error(f"SharedMemoryConnector inline get failed for req {get_key}: {e}") @@ -119,33 +147,64 @@ def get( if "shm" in metadata: shm_handle = metadata["shm"] lock_file = f"/dev/shm/shm_{shm_handle['name']}_lockfile.lock" - return self._get_data_with_lock(lock_file, shm_handle) + result = self._get_data_with_lock(lock_file, shm_handle) + if result is not None: + self._pending_keys.discard(get_key) + return result - return None - shm = None - try: - shm = shm_pkg.SharedMemory(name=get_key) - if shm is None or shm.size == 0: - return None - lock_file = f"/dev/shm/shm_{get_key}_lockfile.lock" - shm_handle = {"name": get_key, "size": shm.size} - return self._get_data_with_lock(lock_file, shm_handle) - except Exception: - return None - finally: - if shm: - shm.close() + # Metadata is a dict but has no SHM-specific handle (e.g. RDMA- + # style source_host/source_port). Fall back to key-based read. + return self._get_by_key(get_key) + + return self._get_by_key(get_key) def cleanup(self, request_id: str) -> None: - # SHM segments are automatically unlinked during 'get' (shm_read_bytes). - # If 'get' is never called (e.g. error flow), the SHM segment might leak. - # A robust implementation might track created segments and unlink them here - # if they haven't been consumed. - # For now, we rely on the consumer to read and unlink. - pass + """Best-effort cleanup of unconsumed SHM segments for *request_id*. + + Matches pending keys where *request_id* appears as the full key, + as a ``_``-delimited prefix, or as a ``_``-delimited suffix. 
+ If ``get()`` was never called, we unlink it here so /dev/shm + doesn't leak. + """ + stale = [ + k + for k in self._pending_keys + if k == request_id or k.startswith(request_id + "_") or k.endswith("_" + request_id) + ] + for key in stale: + self._pending_keys.discard(key) + try: + seg = shm_pkg.SharedMemory(name=key) + seg.close() + seg.unlink() + logger.debug("cleanup: unlinked unconsumed SHM segment %s", key) + except FileNotFoundError: + pass + except Exception as e: + logger.debug("cleanup: failed to unlink SHM segment %s: %s", key, e) + lock_file = f"/dev/shm/shm_{key}_lockfile.lock" + if os.path.exists(lock_file): + try: + os.remove(lock_file) + except OSError: + pass def close(self) -> None: - pass + """Unlink all remaining tracked SHM segments.""" + for key in list(self._pending_keys): + try: + seg = shm_pkg.SharedMemory(name=key) + seg.close() + seg.unlink() + except Exception: + pass + lock_file = f"/dev/shm/shm_{key}_lockfile.lock" + if os.path.exists(lock_file): + try: + os.remove(lock_file) + except OSError: + pass + self._pending_keys.clear() def health(self) -> dict[str, Any]: return {"status": "healthy", "threshold": self.threshold, **self._metrics} diff --git a/vllm_omni/distributed/omni_connectors/utils/initialization.py b/vllm_omni/distributed/omni_connectors/utils/initialization.py index 37b7d0d7f8..0497bbb3a2 100644 --- a/vllm_omni/distributed/omni_connectors/utils/initialization.py +++ b/vllm_omni/distributed/omni_connectors/utils/initialization.py @@ -23,6 +23,11 @@ # collide with request-forwarding endpoints that share the same base port. KV_TRANSFER_PORT_OFFSET = 100 +# Port stride between TP ranks so each worker binds a unique ZMQ port +# when TP > 1. Must be larger than the maximum number of pipeline stages. +# Formula: zmq_port = base + KV_TRANSFER_PORT_OFFSET + rank * STRIDE + stage +KV_RANK_PORT_STRIDE = 16 + def initialize_connectors_from_config( config_path: str | Path | None = None, From 3229bae331cb7ad37a71bb19853dae62fff9b4ec Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Tue, 14 Apr 2026 18:33:31 +0800 Subject: [PATCH 161/204] [skip ci] fix docs, gdown remove --id param (#2787) Signed-off-by: rongfu.leng --- benchmarks/build_dataset/download_process_data_seedtts.md | 4 ++-- benchmarks/qwen3-omni/README.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/build_dataset/download_process_data_seedtts.md b/benchmarks/build_dataset/download_process_data_seedtts.md index ec16f64424..faf072303b 100644 --- a/benchmarks/build_dataset/download_process_data_seedtts.md +++ b/benchmarks/build_dataset/download_process_data_seedtts.md @@ -27,7 +27,7 @@ pip install gdown Download the dataset from Google Drive: ```bash -gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP +gdown 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP ``` ### 4. 
Extract the Dataset @@ -74,7 +74,7 @@ rm meta.lst # Full setup and benchmark cd benchmarks/build_dataset pip install gdown -gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP +gdown 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP tar -xf seedtts_testset.tar cp seedtts_testset/en/meta.lst meta.lst python extract_tts_prompts.py -i meta.lst -o top100.txt -n 100 diff --git a/benchmarks/qwen3-omni/README.md b/benchmarks/qwen3-omni/README.md index de27c05c2c..dc282d0525 100644 --- a/benchmarks/qwen3-omni/README.md +++ b/benchmarks/qwen3-omni/README.md @@ -9,7 +9,7 @@ cd benchmarks/build_dataset pip install gdown # Download SeedTTS test set from Google Drive -gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP +gdown 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP # Extract tar -xf seedtts_testset.tar From 159d6558ea55ef59b3c57cf512e8114b62cd881e Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Tue, 14 Apr 2026 19:36:02 +0800 Subject: [PATCH 162/204] [Tests][Qwen3-Omni]Add test cases for long videos and long audios. (#2598) Signed-off-by: amy-why-3459 --- .../test_qwen3_omni_expansion.py | 159 ++++++------------ 1 file changed, 54 insertions(+), 105 deletions(-) diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index 1637627695..3065439084 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -29,6 +29,16 @@ IMAGE_KEY = ["square", "quadrate", "rectangle"] VIDEO_KEY = ["sphere", "globe", "circle", "round", "ball"] +# Heavier synthetic inputs than the default expansion cases (longer timeline / more pixels). +# Long video: 120s @ 30fps => 3600 frames (generate_synthetic_video in tests/conftest.py). +# Use 224² spatial size to bound RAM (~W*H*num_frames*3) vs. 288² at this frame count. 
+LONG_VIDEO_WIDTH = 224 +LONG_VIDEO_HEIGHT = 224 +LONG_VIDEO_FRAMES = 3600 +LARGE_IMAGE_WIDTH = 1920 +LARGE_IMAGE_HEIGHT = 1080 +LONG_AUDIO_DURATION_SEC = 120 + def get_chunk_config(default_path): path = modify_stage_config( @@ -37,7 +47,8 @@ def get_chunk_config(default_path): "async_chunk": True, "stage_args": { 0: { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk" + "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk", + "default_sampling_params.max_tokens": 2048, }, 1: { "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk" @@ -167,88 +178,17 @@ def test_text_to_text_audio_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_image_to_text_001(omni_server, openai_client) -> None: - """ - Input Modal: image - Output Modal: text - Input Setting: stream=True - Datasets: single request - """ - image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" - messages = dummy_messages_from_mix_data(image_data_url=image_data_url) - - request_config = { - "model": omni_server.model, - "messages": messages, - "modalities": ["text"], - "stream": True, - "key_words": {"image": IMAGE_KEY}, - } - - openai_client.send_omni_request(request_config) - - -@pytest.mark.advanced_model -@pytest.mark.omni -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_image_to_audio_001(omni_server, openai_client) -> None: - """ - Input Modal: image - Output Modal: audio - Input Setting: stream=False - Datasets: single request - """ - image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" - messages = dummy_messages_from_mix_data(image_data_url=image_data_url) - - request_config = { - "model": omni_server.model, - "messages": messages, - "modalities": ["audio"], - "key_words": {"image": IMAGE_KEY}, - } - - openai_client.send_omni_request(request_config) - - -@pytest.mark.advanced_model -@pytest.mark.omni -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_image_to_text_audio_001(omni_server, openai_client) -> None: - """ - Input Modal: image - Output Modal: text, audio - Input Setting: stream=False - Datasets: few requests - """ - image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(1280, 720)['base64']}" - - messages = dummy_messages_from_mix_data(image_data_url=image_data_url) - - request_config = { - "model": omni_server.model, - "messages": messages, - "key_words": {"image": IMAGE_KEY}, - } - - openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) - - -@pytest.mark.advanced_model -@pytest.mark.omni -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_video_to_text_001(omni_server, openai_client) -> None: +def test_text_video_to_text_001(omni_server, openai_client) -> None: """ - Input Modal: video + Input Modal: long synthetic video (120s @ 30fps, LONG_VIDEO_FRAMES frames) Output Modal: text Input Setting: stream=False 
Datasets: single request """ - video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}" - messages = dummy_messages_from_mix_data(video_data_url=video_data_url) + video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(LONG_VIDEO_WIDTH, LONG_VIDEO_HEIGHT, LONG_VIDEO_FRAMES)['base64']}" + messages = dummy_messages_from_mix_data( + video_data_url=video_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_video") + ) request_config = { "model": omni_server.model, @@ -257,28 +197,29 @@ def test_video_to_text_001(omni_server, openai_client) -> None: "key_words": {"video": VIDEO_KEY}, } - openai_client.send_omni_request(request_config) + openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) @pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_video_to_audio_001(omni_server, openai_client) -> None: +@pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True) +def test_text_audio_to_text_audio_001(omni_server, openai_client) -> None: """ - Input Modal: video - Output Modal: audio + Input Modal: text, audio + Output Modal: text, audio Input Setting: stream=False Datasets: single request """ - video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}" - messages = dummy_messages_from_mix_data(video_data_url=video_data_url) + audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(5, 1)['base64']}" + messages = dummy_messages_from_mix_data( + audio_data_url=audio_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_audio") + ) request_config = { "model": omni_server.model, "messages": messages, - "modalities": ["audio"], - "key_words": {"video": VIDEO_KEY}, + "key_words": {"audio": AUDIO_KEY}, } openai_client.send_omni_request(request_config) @@ -287,22 +228,25 @@ def test_video_to_audio_001(omni_server, openai_client) -> None: @pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_video_to_text_audio_001(omni_server, openai_client) -> None: +@pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True) +def test_text_audio_to_text_audio_002(omni_server, openai_client) -> None: """ - Input Modal: video + Input Modal: text, long-duration audio (~LONG_AUDIO_DURATION_SEC s WAV) Output Modal: text, audio Input Setting: stream=False - Datasets: few requests + Datasets: single request """ - video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}" - - messages = dummy_messages_from_mix_data(video_data_url=video_data_url) + audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(LONG_AUDIO_DURATION_SEC, 1)['base64']}" + messages = dummy_messages_from_mix_data( + audio_data_url=audio_data_url, + system_prompt=get_system_prompt(), + content_text=get_prompt("text_audio"), + ) request_config = { "model": omni_server.model, "messages": messages, - "key_words": {"video": VIDEO_KEY}, + "key_words": {"audio": AUDIO_KEY}, } openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) @@ -312,22 +256,23 @@ def test_video_to_text_audio_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) 
@pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True) -def test_text_audio_to_text_audio_001(omni_server, openai_client) -> None: +def test_text_image_to_text_audio_001(omni_server, openai_client) -> None: """ - Input Modal: text, audio + Input Modal: text, image Output Modal: text, audio Input Setting: stream=False Datasets: single request """ - audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(5, 1)['base64']}" + image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" + messages = dummy_messages_from_mix_data( - audio_data_url=audio_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_audio") + image_data_url=image_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_image") ) request_config = { "model": omni_server.model, "messages": messages, - "key_words": {"audio": AUDIO_KEY}, + "key_words": {"image": IMAGE_KEY}, } openai_client.send_omni_request(request_config) @@ -337,17 +282,21 @@ def test_text_audio_to_text_audio_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True) -def test_text_image_to_text_audio_001(omni_server, openai_client) -> None: +def test_large_image_to_text_audio_001(omni_server, openai_client) -> None: """ - Input Modal: text, image + Input Modal: text, high-resolution image (1080p-class JPEG) Output Modal: text, audio Input Setting: stream=False Datasets: single request """ - image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" + image_data_url = ( + f"data:image/jpeg;base64,{generate_synthetic_image(LARGE_IMAGE_WIDTH, LARGE_IMAGE_HEIGHT)['base64']}" + ) messages = dummy_messages_from_mix_data( - image_data_url=image_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_image") + image_data_url=image_data_url, + system_prompt=get_system_prompt(), + content_text=get_prompt("text_image"), ) request_config = { @@ -356,7 +305,7 @@ def test_text_image_to_text_audio_001(omni_server, openai_client) -> None: "key_words": {"image": IMAGE_KEY}, } - openai_client.send_omni_request(request_config) + openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) @pytest.mark.advanced_model From f87674aa447b24fb305f3eafcab1e51b30e0d9a6 Mon Sep 17 00:00:00 2001 From: Hongsheng Liu Date: Tue, 14 Apr 2026 20:26:27 +0800 Subject: [PATCH 163/204] [skip ci]add skills (#2710) Signed-off-by: hsliuustc0106 --- .claude/skills/add-diffusion-model/SKILL.md | 534 ++++++++++++++++ .../references/cache-dit-patterns.md | 254 ++++++++ .../references/custom-model-patterns.md | 273 +++++++++ .../references/parallelism-patterns.md | 571 ++++++++++++++++++ .../references/transformer-adaptation.md | 218 +++++++ .../references/troubleshooting.md | 178 ++++++ .claude/skills/add-tts-model/SKILL.md | 284 +++++++++ .claude/skills/readme.md | 34 ++ .gitignore | 14 +- 9 files changed, 2359 insertions(+), 1 deletion(-) create mode 100644 .claude/skills/add-diffusion-model/SKILL.md create mode 100644 .claude/skills/add-diffusion-model/references/cache-dit-patterns.md create mode 100644 .claude/skills/add-diffusion-model/references/custom-model-patterns.md create mode 100644 .claude/skills/add-diffusion-model/references/parallelism-patterns.md create mode 100644 .claude/skills/add-diffusion-model/references/transformer-adaptation.md create mode 100644 
.claude/skills/add-diffusion-model/references/troubleshooting.md create mode 100644 .claude/skills/add-tts-model/SKILL.md create mode 100644 .claude/skills/readme.md diff --git a/.claude/skills/add-diffusion-model/SKILL.md b/.claude/skills/add-diffusion-model/SKILL.md new file mode 100644 index 0000000000..a7e0bbf9a5 --- /dev/null +++ b/.claude/skills/add-diffusion-model/SKILL.md @@ -0,0 +1,534 @@ +--- +name: add-diffusion-model +description: Add a new diffusion model (text-to-image, text-to-video, image-to-video, text-to-audio, image editing) to vLLM-Omni, including Cache-DiT acceleration and parallelism support (TP, SP/USP, CFG-Parallel, HSDP). Use when integrating a new diffusion model, porting a diffusers pipeline or a custom model repo to vllm-omni, creating a new DiT transformer adapter, adding diffusion model support, or enabling multi-GPU parallelism and cache acceleration for an existing model. +--- + +# Adding a Diffusion Model to vLLM-Omni + +## Overview + +This skill guides you through adding a new diffusion model to vLLM-Omni. The model may come from HuggingFace Diffusers (structured pipeline) or from a private/custom repo. The workflow differs significantly depending on the source. + +## Prerequisites + +Before starting, determine: + +1. **Model category**: Text-to-Image, Text-to-Video, Image-to-Video, Image Editing, Text-to-Audio, or Omni +2. **Reference source**: Diffusers pipeline, custom repo, or a combination +3. **Model HuggingFace ID** or local checkpoint path +4. **Architecture**: Scheduler, text encoder, VAE, transformer/backbone + +## Step 0: Classify the Migration Path + +Check the model's HF repo for `model_index.json`. This determines your path: + +| Scenario | How to identify | Migration path | +|----------|----------------|----------------| +| **Already supported** | `_class_name` in `model_index.json` matches a key in `_DIFFUSION_MODELS` in `registry.py` | Skip to Step 5 (test) and Step 7 (docs) | +| **Diffusers-based** | Has standard `model_index.json` with `_diffusers_version`, subfolders for `transformer/`, `vae/`, etc. | Follow **Path A** below | +| **Custom/private repo** | No diffusers `model_index.json`, weights in non-standard format, custom model code in a separate git repo | Follow **Path B** below | +| **Hybrid** | Has some diffusers components (VAE) but custom transformer/fusion | Mix of Path A and Path B | + +## Path A: Diffusers-Based Model + +For models with a standard diffusers layout. See [references/transformer-adaptation.md](references/transformer-adaptation.md) for detailed code patterns. + +### A1. Analyze `model_index.json` + +Identify components: `transformer`, `scheduler`, `vae`, `text_encoder`, `tokenizer`. + +### A2. Create model directory + +``` +vllm_omni/diffusion/models/your_model_name/ +├── __init__.py +├── pipeline_your_model.py +└── your_model_transformer.py +``` + +### A3. Adapt transformer + +1. Copy from diffusers source. Remove mixins (`ModelMixin`, `ConfigMixin`, `AttentionModuleMixin`). +2. Replace attention with `vllm_omni.diffusion.attention.layer.Attention` (QKV shape: `[B, seq, heads, head_dim]`). +3. Add `od_config: OmniDiffusionConfig | None = None` to `__init__`. +4. Add `load_weights()` method mapping diffusers weight names to vllm-omni names. +5. Add class attributes: `_repeated_blocks`, `_layerwise_offload_blocks_attr`. + +### A4. Adapt pipeline + +Inherit from `nn.Module`. 
The key contract: + +```python +class YourPipeline(nn.Module): + def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = ""): + # Load VAE, text encoder, tokenizer via from_pretrained() + # Instantiate transformer (weights loaded later via weights_sources) + self.weights_sources = [ + DiffusersPipelineLoader.ComponentSource( + model_or_path=od_config.model, subfolder="transformer", + prefix="transformer.", fall_back_to_pt=True)] + + def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: + # Encode prompt → prepare latents → denoise loop → VAE decode + return DiffusionOutput(output=output) + + def load_weights(self, weights): + return AutoWeightsLoader(self).load_weights(weights) +``` + +Add post/pre-process functions in the same pipeline file. Register them in `registry.py`. + +### A5. Register, test, docs → continue at Step 4 below. + +--- + +## Path B: Custom/Private Repo Model + +For models without a diffusers pipeline — weights in custom format, model code in a private repo. Real examples: DreamID-Omni, BAGEL, HunyuanImage3. + +### B1. Understand the reference repo + +Study the original model's code to identify: +- **Model architecture files** (transformers, fusion modules, embeddings) +- **Weight format** (safetensors, `.pth`, custom checkpoint structure) +- **Weight loading helpers** (custom init functions, checkpoint loaders) +- **Pre/post-processing** (image/audio transforms, tokenization, VAE encode/decode) +- **External dependencies** (packages not on PyPI) +- **Config format** (JSON config files, hardcoded dicts) + +### B2. Decide what lives WHERE + +This is the key design decision for custom models. Follow these placement rules: + +| Code type | Where to place | Example | +|-----------|---------------|---------| +| **Pipeline orchestration** (init, forward, denoise loop) | `vllm_omni/diffusion/models//pipeline_.py` | Always required | +| **Custom transformer/backbone** (ported and adapted to vllm-omni) | `vllm_omni/diffusion/models//_transformer.py` or similar | `wan2_2.py`, `fusion.py`, `bagel_transformer.py` | +| **Custom sub-models** (VAE, fusion, autoencoder) | `vllm_omni/diffusion/models//` as separate files | `autoencoder.py`, `fusion.py` | +| **External dependency code** (original repo utilities) | **External repo**, installed via download script or pip | `dreamid_omni` package via git clone | +| **Hardcoded model configs** | Module-level dicts in pipeline file | `VIDEO_CONFIG`, `AUDIO_CONFIG` dicts | +| **Download/setup script** | `examples/offline_inference//download_.py` | `download_dreamid_omni.py` | +| **Custom `model_index.json`** | Generated by download script, placed at model root | Minimal: `{"_class_name": "YourPipeline", ...}` | + +### B3. Handle external dependencies + +If the model's code lives in a separate git repo: + +**Option 1: Import with graceful fallback** (recommended for models with external utils) + +```python +try: + from external_model.utils import init_vae, load_checkpoint +except ImportError: + raise ImportError( + "Failed to import from dependency 'external_model'. " + "Please run the download script first." + ) +``` + +**Option 2: Port the code directly** (preferred when feasible) + +Copy the essential model files into `vllm_omni/diffusion/models//` and adapt them. This avoids external dependencies. BAGEL does this — `autoencoder.py` and `bagel_transformer.py` are ported directly. + +**Decision criteria**: Port if the code is self-contained and won't diverge. 
Use external deps if the model repo is actively maintained and the code is complex. + +### B4. Handle custom weight loading + +Custom models have two patterns for weight loading: + +**Pattern 1: Bypass standard loader** (DreamID-Omni style) + +When the original model has complex custom init functions that load weights in `__init__`: + +```python +class CustomPipeline(nn.Module): + def __init__(self, *, od_config, prefix=""): + super().__init__() + model = od_config.model + # Load everything eagerly in __init__ using custom helpers + self.vae = custom_init_vae(model, device=self.device) + self.text_encoder = custom_init_text_encoder(model, device=self.device) + self.transformer = CustomFusionModel(CONFIG) + load_custom_checkpoint(self.transformer, + checkpoint_path=os.path.join(model, "model.safetensors")) + # NO weights_sources defined — bypasses standard loader + + def load_weights(self, weights): + pass # No-op — all weights loaded in __init__ +``` + +**Pattern 2: Use standard loader with custom `load_weights`** (BAGEL style) + +When weights are in safetensors format but need name remapping: + +```python +class CustomPipeline(nn.Module): + def __init__(self, *, od_config, prefix=""): + super().__init__() + # Instantiate model architecture without weights + self.bagel = BagelModel(config) + self.vae = AutoEncoder(ae_params) + + # Point loader at the safetensors in the model root + self.weights_sources = [ + DiffusersPipelineLoader.ComponentSource( + model_or_path=od_config.model, + subfolder=None, # weights at root, not in subfolder + prefix="", + fall_back_to_pt=False, + ) + ] + + def load_weights(self, weights): + # Custom name remapping for non-diffusers weight names + params = dict(self.named_parameters()) + loaded = set() + for name, tensor in weights: + # Remap original weight names to vllm-omni module names + name = self._remap_weight_name(name) + if name in params: + default_weight_loader(params[name], tensor) + loaded.add(name) + return loaded +``` + +### B5. Create the `model_index.json` + +Custom models need a `model_index.json` at the model root for vllm-omni to discover them. For custom models, this is minimal: + +```json +{ + "_class_name": "YourModelPipeline", + "custom_key": "path/to/custom_weights.safetensors" +} +``` + +The `_class_name` must match a key in `_DIFFUSION_MODELS` in `registry.py`. Additional keys are model-specific (accessed via `od_config.model_config`). + +If the model's weights come from multiple HF repos, write a **download script** that: +1. Downloads from each repo +2. Assembles into a single directory +3. Generates `model_index.json` +4. Installs any external dependencies (git clone + `.pth` file) + +Place at: `examples/offline_inference//download_.py` + +### B6. Handle multi-modal inputs + +If the model accepts images, audio, or other multi-modal inputs, implement the protocol classes from `vllm_omni/diffusion/models/interface.py`: + +```python +from vllm_omni.diffusion.models.interface import SupportImageInput, SupportAudioInput + +class MyPipeline(nn.Module, SupportImageInput, SupportAudioInput): + # Protocol markers — the engine uses these to enable proper input routing + pass +``` + +Preprocessing for custom models is typically done **inside `forward()`** rather than via registered pre-process functions, since the logic is often tightly coupled to the model. + +### B7. Continue at Step 4 below. 
+ +--- + +## Common Steps (Both Paths) + +### Step 4: Register Model in registry.py + +Edit `vllm_omni/diffusion/registry.py`: + +```python +_DIFFUSION_MODELS = { + "YourModelPipeline": ("your_model_name", "pipeline_your_model", "YourModelPipeline"), +} +_DIFFUSION_POST_PROCESS_FUNCS = { + "YourModelPipeline": "get_your_model_post_process_func", # if applicable +} +_DIFFUSION_PRE_PROCESS_FUNCS = { + "YourModelPipeline": "get_your_model_pre_process_func", # if applicable +} +``` + +The registry key is the `_class_name` from `model_index.json`. The tuple is `(folder_name, module_file, class_name)`. + +Create `__init__.py` exporting the pipeline class and any factory functions. + +### Step 5: Run, Test, Debug + +Use the appropriate existing example script: + +| Category | Script | +|----------|--------| +| Text-to-Image | `examples/offline_inference/text_to_image/text_to_image.py` | +| Text-to-Video | `examples/offline_inference/text_to_video/text_to_video.py` | +| Image-to-Video | `examples/offline_inference/image_to_video/image_to_video.py` | +| Image-to-Image | `examples/offline_inference/image_to_image/image_edit.py` | +| Text-to-Audio | `examples/offline_inference/text_to_audio/text_to_audio.py` | + +For custom/Omni models that don't fit these categories, create a dedicated example script. + +**Validation**: No errors, output is meaningful, quality matches reference implementation. + +See [references/troubleshooting.md](references/troubleshooting.md) for common errors. + +### Step 6: Add Example Scripts + +For Omni or custom models, create: +- `examples/offline_inference/your_model_name/` — offline script + README +- `examples/online_serving/your_model_name/` — server script + client +- Download script if weights require assembly from multiple sources + +### Step 7: Update Documentation + +Required updates: +1. `docs/user_guide/diffusion/parallelism_acceleration.md` — parallelism support table +2. `docs/user_guide/diffusion/teacache.md` — if TeaCache supported +3. `docs/user_guide/diffusion/cache_dit_acceleration.md` — if Cache-DiT supported +4. `examples/offline_inference/xxx/README.md` — offline example docs +5. `examples/online_serve/xxx/README.md` — online serve docs + +### Step 8: Add E2E Tests (Recommended) + +Create `tests/e2e/online_serving/test_your_model_expansion.py`. + +### Step 9: Add Cache-DiT Acceleration + +Cache-DiT accelerates inference by caching intermediate computation results across denoising steps. After your model is working correctly on a single GPU, add cache-dit support. + +See [references/cache-dit-patterns.md](references/cache-dit-patterns.md) for detailed code patterns. + +#### 9a. Determine your model type + +| Model Type | Description | Action | +|------------|-------------|--------| +| **Standard single-transformer** | One transformer with one `ModuleList` of blocks | No code needed — `CacheDiTBackend` auto-detects via `enable_cache_for_dit()` | +| **Multi-block-list** | One transformer with multiple block lists (e.g., `transformer_blocks` + `single_transformer_blocks`) | Write custom enabler with `BlockAdapter` | +| **Dual-transformer** | Two transformers (e.g., high-noise + low-noise) | Write custom enabler with `BlockAdapter` wrapping both | + +#### 9b. 
Standard models — verify automatic support + +For standard single-transformer models, test directly: + +```python +omni = Omni( + model="your-model-name", + cache_backend="cache_dit", + cache_config={ + "Fn_compute_blocks": 1, + "Bn_compute_blocks": 0, + "max_warmup_steps": 4, + } +) +``` + +Check logs for "Cache-dit enabled successfully on xxx". If it works, skip to Step 9e. + +#### 9c. Custom architectures — write a custom enabler + +For multi-block-list or dual-transformer models, write a custom enabler function: + +```python +from cache_dit import BlockAdapter, ForwardPattern, ParamsModifier, DBCacheConfig + +def enable_cache_for_your_model(pipeline, cache_config): + db_cache_config = DBCacheConfig( + num_inference_steps=None, + Fn_compute_blocks=cache_config.Fn_compute_blocks, + Bn_compute_blocks=cache_config.Bn_compute_blocks, + max_warmup_steps=cache_config.max_warmup_steps, + max_cached_steps=cache_config.max_cached_steps, + max_continuous_cached_steps=cache_config.max_continuous_cached_steps, + residual_diff_threshold=cache_config.residual_diff_threshold, + ) + + cache_dit.enable_cache( + BlockAdapter( + transformer=pipeline.transformer, + blocks=[ + pipeline.transformer.transformer_blocks, + pipeline.transformer.single_transformer_blocks, + ], + forward_pattern=[ForwardPattern.Pattern_1, ForwardPattern.Pattern_1], + params_modifiers=[ParamsModifier(...)], + ), + cache_config=db_cache_config, + ) + + def refresh_cache_context(pipeline, num_inference_steps, verbose=True): + cache_dit.refresh_context( + pipeline.transformer, num_inference_steps=num_inference_steps, verbose=verbose + ) + return refresh_cache_context +``` + +#### 9d. Register the custom enabler + +Add your enabler to `CUSTOM_DIT_ENABLERS` in `vllm_omni/diffusion/cache/cache_dit_backend.py`: + +```python +CUSTOM_DIT_ENABLERS = { + "Wan22Pipeline": enable_cache_for_wan22, + "LongCatImagePipeline": enable_cache_for_longcat_image, + "YourModelPipeline": enable_cache_for_your_model, # Add here +} +``` + +#### 9e. Test Cache-DiT + +```python +omni = Omni( + model="your-model-name", + cache_backend="cache_dit", + cache_config={ + "Fn_compute_blocks": 1, "Bn_compute_blocks": 0, + "max_warmup_steps": 4, "residual_diff_threshold": 0.24, + } +) +images = omni.generate("a beautiful landscape", + OmniDiffusionSamplingParams(num_inference_steps=50)) +``` + +**Verify**: 1) logs show cache enabled, 2) 1.5-2x speedup, 3) output quality acceptable vs baseline. + +If quality degrades, lower `residual_diff_threshold` (try 0.12-0.18) or increase `max_warmup_steps` (try 6-8). + +--- + +### Step 10: Add Parallelism Support + +After the model works on a single GPU, add multi-GPU parallelism. Add each type incrementally, testing after each addition. + +See [references/parallelism-patterns.md](references/parallelism-patterns.md) for detailed code patterns and API reference. + +**Recommended order**: TP → SP/USP → CFG Parallel → HSDP + +#### 10a. Tensor Parallelism (TP) + +Shards DiT linear layers across GPUs. Requires code changes in the transformer. + +**What to change in the transformer**: +1. Replace `nn.Linear` with `ColumnParallelLinear` / `RowParallelLinear` / `QKVParallelLinear` +2. Update `load_weights()` to handle QKV fusion with `stacked_params_mapping` +3. 
Use `self.to_qkv.num_heads` (local heads) instead of total heads for split sizes + +```python +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, RowParallelLinear, ColumnParallelLinear, +) + +# Attention: QKV → RowParallel output +self.to_qkv = QKVParallelLinear(dim, head_dim, num_heads, num_kv_heads) +self.to_out = RowParallelLinear(dim, dim, input_is_parallel=True) + +# FFN: ColumnParallel → RowParallel +self.w1 = ColumnParallelLinear(dim, ffn_dim) +self.w2 = RowParallelLinear(ffn_dim, dim, input_is_parallel=True) +``` + +**Constraints**: `num_heads % tp_size == 0` and `num_kv_heads % tp_size == 0`. + +**Test**: `--tensor-parallel-size 2` + +#### 10b. Sequence Parallelism (SP / USP) + +Splits sequence tokens across GPUs. Non-intrusive via `_sp_plan` on the transformer class — no changes to `forward()`. + +**What to change in the transformer**: + +Add `_sp_plan` class attribute: + +```python +from vllm_omni.diffusion.distributed.sp_plan import ( + SequenceParallelInput, SequenceParallelOutput, +) + +class YourTransformer(nn.Module): + _sp_plan = { + "blocks.0": { + "hidden_states": SequenceParallelInput(split_dim=1, expected_dims=3), + }, + "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), + } +``` + +If inline tensor ops (e.g., `torch.cat`) exist between shard/gather points, extract them into `nn.Module` submodules so hooks can intercept them. + +For RoPE that needs splitting, add an entry for the RoPE module with `split_output=True`. + +**Test**: `--ulysses-degree 2` (offline) or `--usp 2` (online serving) + +#### 10c. CFG Parallel + +Distributes positive/negative CFG branches across 2 GPUs. Requires the pipeline to inherit `CFGParallelMixin`. + +**What to change in the pipeline**: + +```python +from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin + +class YourPipeline(nn.Module, CFGParallelMixin): + def diffuse(self, ...) -> torch.Tensor: + for i, t in enumerate(timesteps): + positive_kwargs = {...} + negative_kwargs = {...} if do_true_cfg else None + noise_pred = self.predict_noise_maybe_with_cfg( + do_true_cfg=do_true_cfg, true_cfg_scale=cfg_scale, + positive_kwargs=positive_kwargs, negative_kwargs=negative_kwargs, + ) + latents = self.scheduler_step_maybe_with_cfg( + noise_pred, t, latents, do_true_cfg + ) + return latents +``` + +Override `predict_noise()` if your transformer call is non-standard. Override `combine_cfg_noise()` for multi-output models (e.g., video + audio). + +**Constraint**: Exactly 2 GPUs. Only for models using classifier-free guidance. + +**Test**: `--cfg-parallel-size 2` + +#### 10d. HSDP (Hybrid Sharded Data Parallel) + +Shards transformer weights via PyTorch FSDP2 to reduce per-GPU VRAM. No code changes to the forward pass — just add a class attribute. + +**What to change in the transformer**: + +```python +class YourTransformer(nn.Module): + @staticmethod + def _is_transformer_block(name: str, module) -> bool: + return "blocks" in name and name.split(".")[-1].isdigit() + + _hsdp_shard_conditions = [_is_transformer_block] +``` + +**Constraint**: Cannot combine with TP. For standalone HSDP, set `hsdp_shard_size` explicitly. + +**Test**: `--use-hsdp` or `DiffusionParallelConfig(use_hsdp=True)` + +#### 10e. Update parallelism documentation + +After adding parallelism support, update: +1. `docs/user_guide/diffusion/parallelism_acceleration.md` — add your model to the support table +2. 
Record which parallelism methods are supported (USP, Ring, CFG, TP, HSDP, VAE-Patch) + +--- + +## Iterative Development Tips + +1. **Start minimal**: Basic generation first, no parallelism/caching +2. **Use `--enforce-eager`**: Disable torch.compile during debugging +3. **Use small models**: Test with smaller variants first +4. **Check tensor shapes**: Most errors are reshape mismatches in attention +5. **Add features incrementally**: Single GPU → TP → SP → CFG → HSDP → Cache-DiT +6. **For custom models**: Get the model running with the original code first, then progressively replace components with vllm-omni equivalents +7. **Cache-DiT before parallelism tuning**: Cache-DiT is lossy — verify quality at baseline before combining with parallelism +8. **Combine lossless + lossy**: e.g., TP + SP + Cache-DiT for maximum throughput + +## Reference Files + +- [Transformer Adaptation](references/transformer-adaptation.md) — porting transformers from diffusers +- [Custom Model Patterns](references/custom-model-patterns.md) — patterns for non-diffusers models +- [Parallelism Patterns](references/parallelism-patterns.md) — TP, SP/USP, CFG parallel, HSDP implementation details +- [Cache-DiT Patterns](references/cache-dit-patterns.md) — cache-dit acceleration for standard and custom architectures +- [Troubleshooting](references/troubleshooting.md) — common errors and fixes diff --git a/.claude/skills/add-diffusion-model/references/cache-dit-patterns.md b/.claude/skills/add-diffusion-model/references/cache-dit-patterns.md new file mode 100644 index 0000000000..d34ce0e0f4 --- /dev/null +++ b/.claude/skills/add-diffusion-model/references/cache-dit-patterns.md @@ -0,0 +1,254 @@ +# Cache-DiT Patterns Reference + +## Overview + +Cache-DiT accelerates Diffusion Transformers by caching intermediate computation results across denoising steps. Adjacent steps produce similar features, so redundant computations can be skipped. + +Three caching strategies: +- **DBCache**: Dynamic block-level caching — selectively computes or caches transformer blocks based on residual differences +- **TaylorSeer**: Calibration-based prediction using Taylor expansion to estimate block outputs +- **SCM** (Step Computation Masking): Dynamic step skipping based on configurable policies + +**Typical speedup**: 1.5-2.5x depending on model and configuration. 
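To build intuition for the DBCache knobs used throughout this document, here is a minimal, purely illustrative sketch of a residual-difference caching decision. It is **not** cache-dit's actual implementation — only the idea that a block's output is reused when its residual has barely changed since the last fully computed step:

```python
import torch

def should_reuse_cached_block(prev_residual: torch.Tensor,
                              curr_residual: torch.Tensor,
                              residual_diff_threshold: float = 0.24) -> bool:
    """Illustrative only: reuse the cached block output when the relative residual change is small."""
    diff = (curr_residual - prev_residual).abs().mean()
    scale = prev_residual.abs().mean().clamp_min(1e-6)
    return (diff / scale).item() < residual_diff_threshold
```

`Fn_compute_blocks` / `Bn_compute_blocks` then pin the first/last blocks to always compute, and `max_warmup_steps` keeps caching off for the earliest denoising steps, where features change fastest.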
+ +**Official docs**: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/cache_dit + +## Architecture + +vLLM-Omni integrates cache-dit through `CacheDiTBackend`: + +| Component | Purpose | +|-----------|---------| +| `CacheDiTBackend` | Unified backend — auto-selects enabler (standard or custom) | +| `enable_cache_for_dit()` | Default enabler for standard single-transformer models | +| `CUSTOM_DIT_ENABLERS` dict | Registry of custom enablers keyed by pipeline class name | +| `BlockAdapter` | Wraps complex architectures (multi-block-list or multi-transformer) | +| `ForwardPattern` | Specifies block forward signature: `Pattern_0`, `Pattern_1`, `Pattern_2` | +| `ParamsModifier` | Per-transformer or per-block-list config customization | +| `DBCacheConfig` | Configuration for DBCache parameters | +| `cache_dit.refresh_context()` | Updates cache context when `num_inference_steps` changes | + +**Source files**: +- `vllm_omni/diffusion/cache/cache_dit_backend.py` — `CacheDiTBackend`, enablers, `CUSTOM_DIT_ENABLERS` +- `vllm_omni/diffusion/cache/` — cache backend implementations + +## Standard Models: Automatic Support + +Most DiT models follow this pattern: +- Single transformer with one `nn.ModuleList` of blocks +- Standard forward signature +- Compatible with cache-dit's automatic detection + +**Examples**: Qwen-Image, Z-Image, FLUX + +No code changes needed. `CacheDiTBackend` automatically uses `enable_cache_for_dit()`: + +```python +from vllm_omni import Omni + +omni = Omni( + model="Qwen/Qwen-Image", + cache_backend="cache_dit", + cache_config={ + "Fn_compute_blocks": 1, + "Bn_compute_blocks": 0, + "max_warmup_steps": 4, + } +) +``` + +What happens automatically: + +```python +def enable_cache_for_dit(pipeline, cache_config): + db_cache_config = DBCacheConfig( + num_inference_steps=None, + Fn_compute_blocks=cache_config.Fn_compute_blocks, + Bn_compute_blocks=cache_config.Bn_compute_blocks, + max_warmup_steps=cache_config.max_warmup_steps, + max_cached_steps=cache_config.max_cached_steps, + max_continuous_cached_steps=cache_config.max_continuous_cached_steps, + residual_diff_threshold=cache_config.residual_diff_threshold, + ) + + cache_dit.enable_cache(pipeline.transformer, cache_config=db_cache_config) + + def refresh_cache_context(pipeline, num_inference_steps, verbose=True): + cache_dit.refresh_context( + pipeline.transformer, num_inference_steps=num_inference_steps, verbose=verbose + ) + return refresh_cache_context +``` + +## Custom Architectures: Writing Custom Enablers + +### When you need a custom enabler + +- Model has multiple block lists in one transformer (e.g., `transformer_blocks` + `single_transformer_blocks`) +- Model has two transformers (e.g., high-noise + low-noise like Wan2.2) +- Model uses non-standard block forward signature + +### Pattern 1: Multi-Block-List (LongCat-Image style) + +Single transformer with two block lists: + +```python +import cache_dit +from cache_dit import BlockAdapter, ForwardPattern, ParamsModifier, DBCacheConfig + +def enable_cache_for_your_model(pipeline, cache_config): + db_cache_config = DBCacheConfig( + num_inference_steps=None, + Fn_compute_blocks=cache_config.Fn_compute_blocks, + Bn_compute_blocks=cache_config.Bn_compute_blocks, + max_warmup_steps=cache_config.max_warmup_steps, + max_cached_steps=cache_config.max_cached_steps, + max_continuous_cached_steps=cache_config.max_continuous_cached_steps, + residual_diff_threshold=cache_config.residual_diff_threshold, + ) + + cache_dit.enable_cache( + BlockAdapter( + 
transformer=pipeline.transformer, + blocks=[ + pipeline.transformer.transformer_blocks, + pipeline.transformer.single_transformer_blocks, + ], + forward_pattern=[ForwardPattern.Pattern_1, ForwardPattern.Pattern_1], + params_modifiers=[ParamsModifier(...)], + ), + cache_config=db_cache_config, + ) + + def refresh_cache_context(pipeline, num_inference_steps, verbose=True): + cache_dit.refresh_context( + pipeline.transformer, num_inference_steps=num_inference_steps, verbose=verbose + ) + return refresh_cache_context +``` + +For single transformer with multiple block lists, `refresh_context` works the same as standard models — call it once on the transformer. + +### Pattern 2: Dual-Transformer (Wan2.2 style) + +Two transformers with separate configs: + +```python +def enable_cache_for_dual_transformer(pipeline, cache_config): + db_cache_config = DBCacheConfig(...) + + cache_dit.enable_cache( + BlockAdapter( + transformer=[pipeline.transformer, pipeline.transformer_2], + blocks=[pipeline.transformer.blocks, pipeline.transformer_2.blocks], + forward_pattern=[ForwardPattern.Pattern_2, ForwardPattern.Pattern_2], + params_modifiers=[ + ParamsModifier(...), # Config for transformer 1 + ParamsModifier(...), # Config for transformer 2 + ], + ), + cache_config=db_cache_config, + ) + + def refresh_cache_context(pipeline, num_inference_steps, verbose=True): + high_steps, low_steps = _split_inference_steps(num_inference_steps) + cache_dit.refresh_context( + pipeline.transformer, num_inference_steps=high_steps, verbose=verbose + ) + cache_dit.refresh_context( + pipeline.transformer_2, num_inference_steps=low_steps, verbose=verbose + ) + return refresh_cache_context +``` + +Key difference: `refresh_context` must be called on **each transformer separately** with its own step count. + +### Choosing the ForwardPattern + +| Pattern | Block forward signature | Example models | +|---------|------------------------|----------------| +| `Pattern_0` | `block(hidden_states, **kwargs)` → residual added inside block | Default | +| `Pattern_1` | `block(hidden_states, **kwargs)` → returns `(hidden_states, ...)` tuple | FLUX-style single blocks | +| `Pattern_2` | `block(hidden_states, **kwargs)` → `(hidden_states, ...)` with different residual pattern | Wan2.2 blocks | + +Inspect your block's `forward()` return type and residual connection pattern to choose the right one. See [Cache-DiT API Reference](https://cache-dit.readthedocs.io/en/latest/user_guide/CACHE_API/) for details. + +## Registering Custom Enablers + +Add your enabler to `CUSTOM_DIT_ENABLERS` in `vllm_omni/diffusion/cache/cache_dit_backend.py`: + +```python +CUSTOM_DIT_ENABLERS = { + "Wan22Pipeline": enable_cache_for_wan22, + "LongCatImagePipeline": enable_cache_for_longcat_image, + "YourModelPipeline": enable_cache_for_your_model, +} +``` + +The key must match `pipeline.__class__.__name__`. 
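At enable time the backend keys its lookup on that class name and falls back to the default enabler. A minimal sketch of the dispatch, assuming the module-level names described above (see `CacheDiTBackend` in `cache_dit_backend.py` for the actual selection and error handling):

```python
from vllm_omni.diffusion.cache.cache_dit_backend import (
    CUSTOM_DIT_ENABLERS,
    enable_cache_for_dit,
)

def select_enabler(pipeline):
    # Custom enabler if one is registered for this pipeline class, otherwise the standard path.
    return CUSTOM_DIT_ENABLERS.get(pipeline.__class__.__name__, enable_cache_for_dit)

# `pipeline` and `cache_config` come from the engine; the enabler returns the refresh callback.
refresh_cache_context = select_enabler(pipeline)(pipeline, cache_config)
```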
+ +## Configuration Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `Fn_compute_blocks` | 1 | Number of blocks to always compute at the front | +| `Bn_compute_blocks` | 0 | Number of blocks to always compute at the back | +| `max_warmup_steps` | 4 | Steps to run without caching at the beginning | +| `max_cached_steps` | — | Max total cached steps | +| `max_continuous_cached_steps` | — | Max consecutive cached steps | +| `residual_diff_threshold` | 0.24 | Threshold for deciding whether to cache a block | + +### Tuning for quality vs speed + +| Goal | Adjustments | +|------|-------------| +| **More speed, acceptable quality loss** | Higher `residual_diff_threshold` (0.24-0.4), lower `max_warmup_steps` (2-4) | +| **Better quality, less speed** | Lower `residual_diff_threshold` (0.12-0.18), higher `max_warmup_steps` (6-8), lower `max_continuous_cached_steps` (2) | + +## Testing + +```python +from vllm_omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams + +omni = Omni( + model="your-model-name", + cache_backend="cache_dit", + cache_config={ + "Fn_compute_blocks": 1, + "Bn_compute_blocks": 0, + "max_warmup_steps": 4, + "residual_diff_threshold": 0.24, + } +) +images = omni.generate( + "a beautiful landscape", + OmniDiffusionSamplingParams(num_inference_steps=50), +) +``` + +CLI (online serving): + +```bash +vllm serve your-model --omni --port 8098 \ + --cache-backend cache_dit \ + --cache-config '{"Fn_compute_blocks": 1, "Bn_compute_blocks": 0, "max_warmup_steps": 4}' +``` + +**Verification checklist**: +1. Logs show "Cache-dit enabled successfully on xxx" +2. Performance: 1.5-2x speedup vs no cache +3. Quality: compare output with `cache_backend=None` + +## Excluded Models + +Models listed in `_NO_CACHE_ACCELERATION` in `vllm_omni/diffusion/registry.py` do not support cache-dit (e.g., `NextStep11Pipeline`, `StableDiffusionPipeline`). Check this set before attempting to enable cache-dit. + +## Reference Implementations + +| Model | Path | Notes | +|-------|------|-------| +| Standard DiT | `cache_dit_backend.py::enable_cache_for_dit` | Default enabler, automatic | +| Wan2.2 | `cache_dit_backend.py::enable_cache_for_wan22` | Dual-transformer, auto-detects mode | +| LongCat | `cache_dit_backend.py::enable_cache_for_longcat_image` | Multi-block-list | +| BAGEL | `cache_dit_backend.py::enable_cache_for_bagel` | Complex omni model | diff --git a/.claude/skills/add-diffusion-model/references/custom-model-patterns.md b/.claude/skills/add-diffusion-model/references/custom-model-patterns.md new file mode 100644 index 0000000000..2434e0b5da --- /dev/null +++ b/.claude/skills/add-diffusion-model/references/custom-model-patterns.md @@ -0,0 +1,273 @@ +# Custom Model Patterns Reference + +Patterns for adding models that don't come from the standard diffusers pipeline format. + +## Directory Structure Comparison + +### Diffusers-based model (e.g., Wan2.2) + +``` +vllm_omni/diffusion/models/wan2_2/ +├── __init__.py # Exports pipeline + transformer + helpers +├── pipeline_wan2_2.py # Pipeline: loads components via from_pretrained() +├── pipeline_wan2_2_i2v.py # Variant pipeline for image-to-video +└── wan2_2_transformer.py # Transformer: ported from diffusers, uses Attention layer +``` + +The transformer is loaded separately via `weights_sources` + `load_weights()`. Non-transformer components (VAE, text encoder) are loaded in `__init__` via `from_pretrained()`. 
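The `__init__.py` in such a folder is typically just re-exports of the public classes and helpers. A hypothetical sketch (`Wan22Pipeline` matches the registry key used elsewhere in these docs; the other symbols are assumptions, not the actual Wan2.2 exports):

```python
# vllm_omni/diffusion/models/wan2_2/__init__.py -- illustrative re-exports only
from .pipeline_wan2_2 import Wan22Pipeline
from .pipeline_wan2_2_i2v import Wan22ImageToVideoPipeline   # name assumed
from .wan2_2_transformer import WanTransformer3DModel        # name assumed

__all__ = ["Wan22Pipeline", "Wan22ImageToVideoPipeline", "WanTransformer3DModel"]
```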
+ +### Custom model with external deps (e.g., DreamID-Omni) + +``` +vllm_omni/diffusion/models/dreamid_omni/ +├── __init__.py # Exports pipeline only +├── pipeline_dreamid_omni.py # Pipeline: loads ALL weights in __init__ via custom helpers +├── fusion.py # Custom fusion architecture (video + audio cross-attention) +└── wan2_2.py # Re-implemented Wan backbone with split API + +examples/offline_inference/x_to_video_audio/ +└── download_dreamid_omni.py # Downloads weights from 3 HF repos + clones code repo +``` + +All weights loaded eagerly in `__init__`. `load_weights()` is a no-op. External dependency (`dreamid_omni` package) imported with try/except. + +### Custom model with ported code (e.g., BAGEL) + +``` +vllm_omni/diffusion/models/bagel/ +├── __init__.py +├── pipeline_bagel.py # Pipeline: instantiates models, uses weights_sources +├── bagel_transformer.py # Full LLM backbone (Qwen2-MoT) ported into vllm-omni +└── autoencoder.py # Custom VAE ported from original repo +``` + +Model code is fully ported (no external dependency). Uses `weights_sources` and `load_weights()` with custom name remapping to handle non-diffusers safetensors format. + +## Weight Loading Patterns + +### Pattern 1: Standard diffusers flow (Wan2.2, Z-Image, FLUX) + +``` +init → create transformer (empty) → set weights_sources → [loader calls load_weights()] +``` + +- `weights_sources` points to safetensors in HF subfolder (e.g., `transformer/`) +- `load_weights()` receives `(name, tensor)` pairs from the loader +- Name remapping handles diffusers→vllm-omni differences (QKV fusion, Sequential index removal) + +### Pattern 2: Custom safetensors at root (BAGEL) + +``` +init → create all models (empty) → set weights_sources(subfolder=None) → [loader calls load_weights()] +``` + +- `weights_sources` points to **root** of model directory, not a subfolder +- Weights have non-diffusers names (e.g., `bagel.language_model.model.layers.0.self_attn.q_proj.weight`) +- `load_weights()` does heavy name normalization + +```python +self.weights_sources = [ + DiffusersPipelineLoader.ComponentSource( + model_or_path=od_config.model, + subfolder=None, # root directory + prefix="", # no prefix stripping + fall_back_to_pt=False, + ) +] +``` + +### Pattern 3: Fully custom loading (DreamID-Omni) + +``` +init → load ALL weights eagerly via custom helpers → load_weights() = no-op +``` + +- No `weights_sources` attribute — standard loader finds nothing to iterate +- Custom init functions (e.g., `init_wan_vae_2_2()`, `load_fusion_checkpoint()`) handle downloading and loading +- `load_weights()` is `pass` +- Weights may come from multiple HF repos in different formats (`.pth`, `.safetensors`) + +Use this when: +- The original model has complex, well-tested loading code you don't want to rewrite +- Weights span multiple HF repos +- Weight format is non-standard (e.g., a single `.pth` file, not sharded safetensors) + +## model_index.json for Custom Models + +Standard diffusers `model_index.json`: +```json +{ + "_class_name": "WanPipeline", + "_diffusers_version": "0.35.0.dev0", + "scheduler": ["diffusers", "UniPCMultistepScheduler"], + "transformer": ["diffusers", "WanTransformer3DModel"], + "vae": ["diffusers", "AutoencoderKLWan"] +} +``` + +Custom model `model_index.json` (minimal): +```json +{ + "_class_name": "DreamIDOmniPipeline", + "fusion": "DreamID-Omni/dreamid_omni.safetensors" +} +``` + +The only **required** field is `_class_name` — it must match a key in `_DIFFUSION_MODELS` in `registry.py`. 
Other fields are model-specific and accessible via `od_config.model_config` dict. + +## External Dependency Management + +### Git clone + .pth injection (DreamID-Omni pattern) + +```python +def download_dependency(): + CACHE_DIR.mkdir(parents=True, exist_ok=True) + with open(LOCK_FILE, "w") as f: + fcntl.flock(f, fcntl.LOCK_EX) + if not DEPENDENCY_DIR.exists(): + subprocess.run([ + "git", "clone", "--depth", "1", + REPO_URL, "--branch", BRANCH, + str(DEPENDENCY_DIR) + ], check=True) + fcntl.flock(f, fcntl.LOCK_UN) + + # Add to Python path via .pth file + site_packages = Path(site.getsitepackages()[0]) + pth_file = site_packages / "vllm_omni_dependency.pth" + pth_file.write_text(str(DEPENDENCY_DIR)) +``` + +### Direct port (BAGEL pattern) + +Copy essential files from the original repo into `vllm_omni/diffusion/models//`. Adapt imports to use vllm-omni utilities. Benefits: no external dependency, no git clone step. Drawback: must maintain the ported code. + +## Multi-Modal Input/Output Protocols + +Custom models that handle images, audio, or video I/O should implement protocol classes: + +```python +from vllm_omni.diffusion.models.interface import ( + SupportImageInput, # Model accepts image input + SupportAudioInput, # Model accepts audio input + SupportAudioOutput, # Model produces audio output +) + +class MyPipeline(nn.Module, SupportImageInput, SupportAudioInput, SupportAudioOutput): + pass # Protocol markers enable proper engine routing +``` + +The engine checks `isinstance(pipeline, SupportImageInput)` at startup to configure input validation and warmup behavior. + +## Hardcoded Config vs Config Files + +Diffusers models use `config.json` in each subfolder. Custom models often use: + +**Module-level config dicts** (DreamID-Omni): +```python +VIDEO_CONFIG = { + "patch_size": [1, 2, 2], "model_type": "ti2v", + "dim": 3072, "ffn_dim": 14336, "num_heads": 24, "num_layers": 30, ... +} +``` + +**Loaded from custom JSON** (BAGEL): +```python +cfg_path = os.path.join(model_path, "config.json") +with open(cfg_path) as f: + bagel_cfg = json.load(f) +vae_cfg = bagel_cfg.get("vae_config", {}) +``` + +## Custom Architecture Patterns + +### Split forward API (DreamID-Omni) + +When a fusion model needs to interleave blocks from two backbones: + +```python +class WanModel(nn.Module): + def prepare_transformer_block_kwargs(self, x, t, context, ...): + # Patch embed, time embed, text embed, RoPE + return x, e, kwargs + + def post_transformer_block_out(self, x, grid_sizes, e): + # Output projection, unpatchify + return output + + def forward(self, *args, **kwargs): + raise NotImplementedError # Fusion model handles block iteration +``` + +The `FusionModel` then iterates blocks in lock-step: +```python +for video_block, audio_block in zip(self.video_model.blocks, self.audio_model.blocks): + video_out = video_block(video_hidden, ...) + audio_out = audio_block(audio_hidden, ...) + # Cross-attend between modalities + video_out = cross_attention(video_out, audio_out) + audio_out = cross_attention(audio_out, video_out) +``` + +### LLM-as-denoiser (BAGEL) + +When the backbone is a language model that also does diffusion: + +```python +class BagelModel(nn.Module): + def __init__(self): + self.language_model = Qwen2MoTForCausalLM(config) + self.vit_model = SiglipVisionModel(vit_config) +``` + +The LLM processes both text tokens and latent image tokens in a single forward pass, using KV caching for the text portion. 
+ +## Pre/Post Processing for Custom Models + +Custom models typically handle pre/post processing **inside `forward()`** rather than via registered functions, because the logic is tightly coupled: + +```python +def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: + # Inline preprocessing + image = self._load_and_resize_image(req.prompts[0].get("multi_modal_data", {}).get("image")) + image_latent = self._vae_encode(image) + + # ... denoising loop ... + + # Inline postprocessing + pil_image = self._decode_to_pil(latents) + return DiffusionOutput(output=[pil_image]) +``` + +If pre/post functions are not registered in `_DIFFUSION_PRE_PROCESS_FUNCS` / `_DIFFUSION_POST_PROCESS_FUNCS`, the engine simply skips those steps. + +## Download Script Template + +```python +# examples/offline_inference//download_.py +from huggingface_hub import snapshot_download +import json, os + +def main(output_dir): + # Download model weights from HF + snapshot_download(repo_id="org/model-weights", local_dir=os.path.join(output_dir, "weights")) + + # Download additional components if from separate repos + snapshot_download(repo_id="org/vae-weights", local_dir=os.path.join(output_dir, "vae"), + allow_patterns=["*.safetensors"]) + + # Generate model_index.json + config = {"_class_name": "YourPipeline", "custom_key": "weights/model.safetensors"} + with open(os.path.join(output_dir, "model_index.json"), "w") as f: + json.dump(config, f, indent=2) + + # Install external code dependency (if needed) + download_dependency() + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--output-dir", default="./your_model") + args = parser.parse_args() + main(args.output_dir) +``` diff --git a/.claude/skills/add-diffusion-model/references/parallelism-patterns.md b/.claude/skills/add-diffusion-model/references/parallelism-patterns.md new file mode 100644 index 0000000000..933e2d2320 --- /dev/null +++ b/.claude/skills/add-diffusion-model/references/parallelism-patterns.md @@ -0,0 +1,571 @@ +# Parallelism Patterns Reference + +## Overview + +vLLM-Omni supports multiple parallelism strategies for diffusion models. Each targets a different bottleneck: + +| Strategy | Splits | Best For | Constraint | +|----------|--------|----------|------------| +| Tensor Parallel (TP) | Model layers across GPUs | Latency reduction, large models | Requires fast GPU interconnect, `num_heads % tp == 0` | +| Sequence Parallel (SP/USP) | Sequence tokens across GPUs | Long sequences (video, high-res) | Near-linear scaling | +| CFG Parallel | Positive/negative CFG branches | Models using classifier-free guidance | Exactly 2 GPUs | +| HSDP | Weight shards via FSDP2 | VRAM reduction | Cannot combine with TP | +| VAE Patch Parallel | VAE decode spatial tiles | Large VAE outputs | Auto-enables tiling | + +**Recommended integration order**: TP → SP → CFG Parallel → HSDP + +**Official design docs**: +- TP: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/tensor_parallel +- SP: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/sequence_parallel +- CFG: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/cfg_parallel +- HSDP: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/hsdp + +--- + +## Tensor Parallelism (TP) + +Replace standard `nn.Linear` with vLLM's parallel linear layers. This is the most invasive change but provides direct VRAM savings and compute speedup. 
+ +### Layer replacement rules + +| Pattern | vLLM Layer | When to Use | +|---------|-----------|-------------| +| Fan-out (first in FFN) | `ColumnParallelLinear` | Projection that splits output across ranks | +| Fan-in (second in FFN) | `RowParallelLinear` | Projection that gathers across ranks | +| QKV projection | `QKVParallelLinear` | Fused Q/K/V for self-attention | +| Single Q or K or V | `ColumnParallelLinear` | Separate projections (cross-attention) | +| Attention output | `RowParallelLinear` | Output projection after attention | +| Must not shard | `ReplicatedLinear` | Layers that must stay replicated | + +### MLP Block (Up-Down Pattern) + +```python +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, RowParallelLinear, +) + +class TPFeedForward(nn.Module): + def __init__(self, dim, ffn_dim): + super().__init__() + self.fc1 = ColumnParallelLinear(dim, ffn_dim, bias=False, return_bias=False) + self.fc2 = RowParallelLinear( + ffn_dim, dim, bias=False, + input_is_parallel=True, # Input already sharded from fc1 + return_bias=False, + ) + + def forward(self, x): + x, _ = self.fc1(x) + x = torch.nn.functional.gelu(x) + x, _ = self.fc2(x) + return x +``` + +### Attention Block (QKV-Out Pattern) + +```python +from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear +from vllm_omni.diffusion.attention.layer import Attention + +class TPSelfAttention(nn.Module): + def __init__(self, dim, num_heads, num_kv_heads=None): + super().__init__() + num_kv_heads = num_kv_heads or num_heads + self.head_dim = dim // num_heads + + self.to_qkv = QKVParallelLinear( + hidden_size=dim, + head_size=self.head_dim, + total_num_heads=num_heads, + total_num_kv_heads=num_kv_heads, + bias=False, + return_bias=False, + ) + self.to_out = RowParallelLinear( + dim, dim, bias=False, + input_is_parallel=True, + return_bias=False, + ) + self.attn = Attention( + num_heads=self.to_qkv.num_heads, # Local heads per GPU + head_size=self.head_dim, + softmax_scale=1.0 / (self.head_dim ** 0.5), + causal=False, + num_kv_heads=self.to_qkv.num_kv_heads, # Local KV heads per GPU + ) + + def forward(self, x): + qkv, _ = self.to_qkv(x) + q, k, v = qkv.split( + [self.to_qkv.num_heads * self.head_dim, + self.to_qkv.num_kv_heads * self.head_dim, + self.to_qkv.num_kv_heads * self.head_dim], + dim=-1, + ) + B, S, _ = x.shape + q = q.view(B, S, self.to_qkv.num_heads, self.head_dim) + k = k.view(B, S, self.to_qkv.num_kv_heads, self.head_dim) + v = v.view(B, S, self.to_qkv.num_kv_heads, self.head_dim) + out = self.attn(q, k, v) + out = out.reshape(B, S, -1) + out, _ = self.to_out(out) + return out +``` + +### QKV Fusion in load_weights + +When you fuse separate Q/K/V into `QKVParallelLinear`, map diffusers' separate weight names: + +```python +stacked_params_mapping = [ + ("to_qkv", "to_q", "q"), + ("to_qkv", "to_k", "k"), + ("to_qkv", "to_v", "v"), +] + +def load_weights(self, weights): + params = dict(self.named_parameters()) + loaded = set() + for name, tensor in weights: + for fused_name, orig_name, shard_id in stacked_params_mapping: + if orig_name in name: + name = name.replace(orig_name, fused_name) + param = params[name] + param.weight_loader(param, tensor, shard_id) + loaded.add(name) + break + else: + if name in params: + param = params[name] + if hasattr(param, "weight_loader"): + param.weight_loader(param, tensor) + else: + default_weight_loader(param, tensor) + loaded.add(name) + return loaded +``` + +### RMSNorm with TP + +When RMSNorm sits between TP-sharded dimensions, use 
`DistributedRMSNorm` — it computes global RMS via all-reduce across TP ranks. See the Wan2.2 implementation for the pattern. + +### TP Constraints + +- `num_heads % tp_size == 0` +- `num_kv_heads % tp_size == 0` +- Use `self.to_qkv.num_heads` (local per-GPU count), not total heads, for split sizes + +### Testing TP + +```bash +python text_to_image.py --model Your-org/your-model \ + --tensor-parallel-size 2 --output "tp_test.png" +``` + +**Verify**: speedup, memory reduction proportional to TP size, quality matches single-GPU. + +### Reference implementations + +| Model | Path | +|-------|------| +| Z-Image | `vllm_omni/diffusion/models/z_image/z_image_transformer.py` | +| FLUX | `vllm_omni/diffusion/models/flux/flux_transformer.py` | +| Qwen-Image | `vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py` | + +--- + +## Sequence Parallelism (SP / USP) + +SP splits sequence tokens across GPUs using Ulysses (all-to-all) or Ring (P2P) communication. It is applied non-intrusively via the `_sp_plan` dict — no changes to `forward()` logic. + +### Approach 1: Non-Intrusive `_sp_plan` (Recommended) + +The framework automatically registers hooks to shard inputs and gather outputs at `nn.Module` boundaries. + +#### Step 1: Identify module boundaries + +Find where tensors need sharding/gathering: + +```python +class MyTransformer(nn.Module): + def __init__(self): + self.patch_embed = PatchEmbed() # Before blocks + self.pos_embed = RoPE() # RoPE may need splitting + self.blocks = nn.ModuleList([...]) # Blocks process sharded x + self.norm_out = LayerNorm() + self.proj_out = Linear() # Gather after this + + def forward(self, x): + x = self.patch_embed(x) + pos = self.pos_embed(x) + for block in self.blocks: + x = block(x, pos) + x = self.norm_out(x) + return self.proj_out(x) +``` + +#### Step 2: Handle inline operations + +`_sp_plan` hooks only work at `nn.Module` boundaries. Inline ops like `torch.cat()` must be extracted into submodules: + +```python +# BAD: Inline — hooks can't intercept +unified = torch.cat([x, cap_feats], dim=1) + +# GOOD: Extract into submodule +class UnifiedPrepare(nn.Module): + def forward(self, x, cap_feats): + return torch.cat([x, cap_feats], dim=1) + +self.unified_prepare = UnifiedPrepare() +unified = self.unified_prepare(x, cap_feats) +``` + +Common cases: `torch.cat()`, `pad_sequence()`, `tensor.reshape()`, complex preprocessing. 
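+
+The same extraction applies to the other cases above, for example inline padding. A minimal sketch (the names are illustrative, and `sp_world_size` stands in for however you obtain the SP degree):
+
+```python
+# BAD: Inline padding; hooks can't intercept it
+x = torch.nn.functional.pad(x, (0, 0, 0, pad_len))
+
+# GOOD: Extract into a submodule so a hook boundary exists
+class PadToMultiple(nn.Module):
+    def __init__(self, multiple: int):
+        super().__init__()
+        self.multiple = multiple
+
+    def forward(self, x):  # x: [B, S, D]
+        pad_len = (-x.shape[1]) % self.multiple
+        return torch.nn.functional.pad(x, (0, 0, 0, pad_len))
+
+self.pad_to_multiple = PadToMultiple(multiple=sp_world_size)
+x = self.pad_to_multiple(x)
+```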
+ +#### Step 3: Write `_sp_plan` + +**Pattern 1: Shard at first block, gather at output** (most common) + +```python +from vllm_omni.diffusion.distributed.sp_plan import ( + SequenceParallelInput, SequenceParallelOutput, +) + +class StandardTransformer(nn.Module): + _sp_plan = { + "blocks.0": { + "hidden_states": SequenceParallelInput(split_dim=1, expected_dims=3), + }, + "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), + } +``` + +**Pattern 2: Shard RoPE outputs separately** + +```python +class TransformerWithRoPE(nn.Module): + _sp_plan = { + "rope": { + 0: SequenceParallelInput(split_dim=1, expected_dims=4, split_output=True), + 1: SequenceParallelInput(split_dim=1, expected_dims=4, split_output=True), + }, + "blocks.0": { + "hidden_states": SequenceParallelInput(split_dim=1, expected_dims=3), + }, + "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), + } +``` + +**Pattern 3: Dual-stream (shard image, replicate text)** + +```python +class DualStreamTransformer(nn.Module): + _sp_plan = { + "rope_preparer": { + 2: SequenceParallelInput(split_dim=0, expected_dims=2, split_output=True), + 3: SequenceParallelInput(split_dim=0, expected_dims=2, split_output=True), + }, + "transformer_blocks.0": { + "hidden_states": SequenceParallelInput(split_dim=1, expected_dims=3), + }, + "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), + } +``` + +### API Reference + +**SequenceParallelInput**: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `split_dim` | int | Dimension to split (usually 1 for sequence) | +| `expected_dims` | int/None | Expected tensor rank for validation | +| `split_output` | bool | `False`: shard input params; `True`: shard output tensors | +| `auto_pad` | bool | Auto-pad if sequence not divisible by world_size | + +**SequenceParallelOutput**: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `gather_dim` | int | Dimension to gather (usually 1 for sequence) | +| `expected_dims` | int/None | Expected tensor rank for validation | + +**Module naming**: + +| Key | Meaning | +|-----|---------| +| `"blocks.0"` | First element of ModuleList | +| `"blocks.*"` | All elements of ModuleList | +| `"rope"` | Named submodule | + +**Dictionary value types**: + +| Key type | split_output | Description | +|----------|-------------|-------------| +| `"param_name"` (str) | False | Shard input parameter by name | +| `0, 1, ...` (int) | True | Shard output tuple by index | + +### Approach 2: Intrusive Modification (Complex Cases) + +For dynamic sharding logic that can't be expressed via `_sp_plan`: + +```python +from vllm_omni.diffusion.distributed.sp_sharding import sp_shard, sp_gather + +def forward(self, hidden_states, ...): + if self.parallel_config.sequence_parallel_size > 1: + hidden_states = sp_shard(hidden_states, dim=1) + for block in self.blocks: + hidden_states = block(hidden_states) + if self.parallel_config.sequence_parallel_size > 1: + hidden_states = sp_gather(hidden_states, dim=1) + return hidden_states +``` + +Use intrusive modification as a last resort — `_sp_plan` is preferred for maintainability. + +### UAA Mode (Experimental) + +`ulysses_mode="advanced_uaa"` handles arbitrary sequence lengths and head counts that aren't divisible by `ulysses_degree`. Uses variable all-to-all split sizes and temporary head padding. + +### Combining SP methods + +Ulysses and Ring can be combined: `ulysses_degree × ring_degree = total SP GPUs`. 
+ +```python +DiffusionParallelConfig(ulysses_degree=2, ring_degree=2) # 4 GPUs total +``` + +### Testing SP + +```bash +# Offline +python text_to_image.py --model Your-model --ulysses-degree 2 + +# Online serving +vllm serve Your-model --omni --usp 2 +``` + +### Reference implementations + +| Model | Path | +|-------|------| +| Qwen-Image | `vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py` | +| Wan2.2 | `vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py` | +| Z-Image | `vllm_omni/diffusion/models/z_image/z_image_transformer.py` | + +--- + +## CFG Parallelism + +Distributes positive/negative Classifier-Free Guidance branches across 2 GPUs. + +### Implementation + +Inherit `CFGParallelMixin` and implement `diffuse()`: + +```python +from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin + +class YourPipeline(nn.Module, CFGParallelMixin): + def diffuse(self, latents, timesteps, prompt_embeds, negative_embeds, + do_true_cfg, true_cfg_scale, **kwargs): + for i, t in enumerate(timesteps): + positive_kwargs = { + "hidden_states": latents, + "encoder_hidden_states": prompt_embeds, + "timestep": t, + } + negative_kwargs = { + "hidden_states": latents, + "encoder_hidden_states": negative_embeds, + "timestep": t, + } if do_true_cfg else None + + noise_pred = self.predict_noise_maybe_with_cfg( + do_true_cfg=do_true_cfg, + true_cfg_scale=true_cfg_scale, + positive_kwargs=positive_kwargs, + negative_kwargs=negative_kwargs, + ) + latents = self.scheduler_step_maybe_with_cfg( + noise_pred, t, latents, do_true_cfg + ) + return latents +``` + +### Customization hooks + +| Method | Override when | +|--------|-------------| +| `predict_noise()` | Non-standard transformer call (e.g., dual-transformer like Wan2.2) | +| `cfg_normalize_function()` | Custom normalization (e.g., LongCat with clamping) | +| `combine_cfg_noise()` | Multi-output models (e.g., video + audio: CFG on video, positive-only on audio) | + +**Custom predict_noise** (Wan2.2 — selects active transformer): + +```python +def predict_noise(self, current_model=None, **kwargs): + if current_model is None: + current_model = self.transformer + return current_model(**kwargs)[0] +``` + +**Custom combine_cfg_noise** (multi-output): + +```python +def combine_cfg_noise(self, positive_pred, negative_pred, scale, normalize): + video_pos, audio_pos = positive_pred + video_neg, audio_neg = negative_pred + video_combined = super().combine_cfg_noise(video_pos, video_neg, scale, normalize) + return (video_combined, audio_pos) +``` + +### Composite scheduler for multi-output + +When each output has its own schedule: + +```python +class VideoAudioScheduler: + def __init__(self, video_scheduler, audio_scheduler): + self.video_scheduler = video_scheduler + self.audio_scheduler = audio_scheduler + + def step(self, noise_pred, t, latents, return_dict=False, generator=None): + video_out = self.video_scheduler.step( + noise_pred[0], t[0], latents[0], return_dict=False, generator=generator + )[0] + audio_out = self.audio_scheduler.step( + noise_pred[1], t[1], latents[1], return_dict=False, generator=generator + )[0] + return ((video_out, audio_out),) +``` + +### Testing CFG Parallel + +```bash +python text_to_image.py --model Your-model \ + --cfg-parallel-size 2 --cfg-scale 4.0 \ + --negative-prompt "ugly, unclear" +``` + +**Constraint**: `guidance_scale > 1.0` and negative prompt must be provided. 
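+
+For offline (Python API) runs, CFG parallel can also be configured through `DiffusionParallelConfig`. A sketch, assuming the field mirrors the CLI flag and is named `cfg_parallel_size` (check `DiffusionParallelConfig` for the exact field name):
+
+```python
+from vllm_omni.diffusion.data import DiffusionParallelConfig
+from vllm_omni.entrypoints.omni import Omni
+
+# Assumed field name, mirroring the --cfg-parallel-size CLI flag
+parallel_config = DiffusionParallelConfig(cfg_parallel_size=2)
+omni = Omni(model="your-model", parallel_config=parallel_config)
+```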
+ +### Reference implementations + +| Model | Path | +|-------|------| +| Qwen-Image | `vllm_omni/diffusion/models/qwen_image/cfg_parallel.py` | +| Wan2.2 | `vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py` | +| Mixin base | `vllm_omni/diffusion/distributed/cfg_parallel.py` | + +--- + +## HSDP (Hybrid Sharded Data Parallel) + +Shards model weights across GPUs using PyTorch FSDP2. Reduces per-GPU VRAM without changing computation. + +### Implementation + +Add `_hsdp_shard_conditions` to the transformer class: + +```python +class YourTransformer(nn.Module): + @staticmethod + def _is_transformer_block(name: str, module) -> bool: + return "blocks" in name and name.split(".")[-1].isdigit() + + _hsdp_shard_conditions = [_is_transformer_block] +``` + +For MoE models, add additional conditions: + +```python +class MoETransformer(nn.Module): + @staticmethod + def _is_transformer_block(name, module): + return "blocks" in name and name.split(".")[-1].isdigit() + + @staticmethod + def _is_moe_expert(name, module): + return "experts" in name and name.split(".")[-1].isdigit() + + _hsdp_shard_conditions = [_is_transformer_block, _is_moe_expert] +``` + +A module is sharded if **any** condition returns `True`. + +### Constraints + +- Cannot combine with Tensor Parallelism +- For standalone HSDP (no other parallelism), `hsdp_shard_size` must be specified explicitly +- Can combine with SP: HSDP reduces memory while SP distributes sequence + +### Testing HSDP + +```python +from vllm_omni.diffusion.data import DiffusionParallelConfig + +parallel_config = DiffusionParallelConfig(use_hsdp=True, hsdp_shard_size=8) +omni = Omni(model="your-model", parallel_config=parallel_config) +``` + +Or CLI: + +```bash +vllm serve Your-model --omni --use-hsdp +``` + +**Verify**: logs show "HSDP Inference: replicate_size=..., shard_size=..." and "Sharded N modules + root". Check VRAM reduction. + +### Reference implementations + +| Model | Path | +|-------|------| +| Wan2.2 | `vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py` | +| HSDP Core | `vllm_omni/diffusion/distributed/hsdp.py` | + +--- + +## VAE Patch Parallelism + +Shards VAE decode spatially across ranks using tiling: + +```bash +python text_to_image.py --model Your-model --vae-patch-parallel-size 4 +``` + +Auto-enables `--vae-use-tiling`. Uses `DistributedAutoencoderKLWan` or similar distributed VAE. Set `vae_patch_parallel_size` in `DiffusionParallelConfig`. + +--- + +## Combining Parallelism Methods + +Common multi-GPU recipes: + +```bash +# 4 GPUs: CFG (2) × Ulysses (2) +python text_to_image.py --model Qwen/Qwen-Image \ + --cfg-parallel-size 2 --ulysses-degree 2 + +# 8 GPUs: Ulysses (4) × Ring (2) + VAE patch (8) +python text_to_video.py --model Wan-AI/Wan2.2-T2V-A14B-Diffusers \ + --ulysses-degree 4 --ring-degree 2 --vae-patch-parallel-size 8 + +# 2 GPUs: HSDP + Ulysses (cannot combine HSDP with TP) +vllm serve Your-model --omni --use-hsdp --usp 2 +``` + +## Discovering Parallelism Support + +Check which parallelism methods a model supports: + +| Check | How | +|-------|-----| +| **Ulysses / Ring SP** | Transformer defines `_sp_plan`. Search: `grep -r '_sp_plan' vllm_omni/diffusion/models/` | +| **CFG Parallel** | Pipeline inherits `CFGParallelMixin`. Search: `grep -r 'CFGParallelMixin' vllm_omni/diffusion/models/` | +| **TP** | Uses `ColumnParallelLinear` / `QKVParallelLinear`. Search: `grep -r 'ParallelLinear\|QKVParallel' vllm_omni/diffusion/models//` | +| **HSDP** | Transformer defines `_hsdp_shard_conditions`. 
Search: `grep -r '_hsdp_shard_conditions' vllm_omni/diffusion/models/` | + +The canonical per-model support table is in `docs/user_guide/diffusion/parallelism_acceleration.md`. diff --git a/.claude/skills/add-diffusion-model/references/transformer-adaptation.md b/.claude/skills/add-diffusion-model/references/transformer-adaptation.md new file mode 100644 index 0000000000..6e344b6a66 --- /dev/null +++ b/.claude/skills/add-diffusion-model/references/transformer-adaptation.md @@ -0,0 +1,218 @@ +# Transformer Adaptation Reference + +## Adapting a Diffusers Transformer to vLLM-Omni + +### Step-by-step Checklist + +1. Copy the transformer class from diffusers source +2. Remove all mixin classes — inherit only from `nn.Module` +3. Replace attention dispatch with `vllm_omni.diffusion.attention.layer.Attention` +4. Replace logger with `vllm.logger.init_logger` +5. Add `od_config: OmniDiffusionConfig | None = None` to `__init__` +6. Remove training-only code (gradient checkpointing, dropout) +7. Add `load_weights()` method for weight loading from safetensors +8. Add class-level attributes for acceleration features + +### Mixin Removal + +Remove these diffusers mixins (and their imports): + +```python +# Remove all of these: +from diffusers.models.modeling_utils import ModelMixin +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models.attention_processor import AttentionModuleMixin +from diffusers.loaders import PeftAdapterMixin, FromOriginalModelMixin + +# Replace: +class MyTransformer(ModelMixin, ConfigMixin, AttentionModuleMixin): +# With: +class MyTransformer(nn.Module): +``` + +Also remove `@register_to_config` decorators from `__init__`. + +### Attention Replacement + +The vLLM-Omni `Attention` layer wraps backend selection (FlashAttention, SDPA, SageAttn, etc.) and supports sequence parallelism hooks. 
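+
+The active backend can be pinned with the `DIFFUSION_ATTENTION_BACKEND` environment variable (see the troubleshooting reference), which is useful when comparing outputs against diffusers during bring-up:
+
+```bash
+# Match diffusers' attention numerics while validating the port
+export DIFFUSION_ATTENTION_BACKEND=TORCH_SDPA
+```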
+ +**QKV tensor shape must be `[batch, seq_len, num_heads, head_dim]`.** + +#### Self-Attention Pattern + +```python +from vllm_omni.diffusion.attention.layer import Attention +from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata + +class SelfAttentionBlock(nn.Module): + def __init__(self, dim, num_heads): + super().__init__() + self.num_heads = num_heads + self.head_dim = dim // num_heads + + self.to_q = nn.Linear(dim, dim) + self.to_k = nn.Linear(dim, dim) + self.to_v = nn.Linear(dim, dim) + self.to_out = nn.Linear(dim, dim) + + self.attn = Attention( + num_heads=num_heads, + head_size=self.head_dim, + softmax_scale=1.0 / (self.head_dim ** 0.5), + causal=False, + num_kv_heads=num_heads, + ) + + def forward(self, x, attn_mask=None): + B, S, _ = x.shape + q = self.to_q(x).view(B, S, self.num_heads, self.head_dim) + k = self.to_k(x).view(B, S, self.num_heads, self.head_dim) + v = self.to_v(x).view(B, S, self.num_heads, self.head_dim) + + attn_metadata = AttentionMetadata(attn_mask=attn_mask) + out = self.attn(q, k, v, attn_metadata=attn_metadata) + out = out.reshape(B, S, -1) + return self.to_out(out) +``` + +#### Fused QKV with TP (Advanced) + +For tensor parallelism, use vLLM's parallel linear layers: + +```python +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, RowParallelLinear +) + +class TPSelfAttention(nn.Module): + def __init__(self, dim, num_heads): + super().__init__() + self.num_heads = num_heads + self.head_dim = dim // num_heads + + self.to_qkv = QKVParallelLinear( + hidden_size=dim, + head_size=self.head_dim, + total_num_heads=num_heads, + total_num_kv_heads=num_heads, + ) + self.to_out = RowParallelLinear(dim, dim) + + self.attn = Attention( + num_heads=num_heads, + head_size=self.head_dim, + softmax_scale=1.0 / (self.head_dim ** 0.5), + causal=False, + num_kv_heads=num_heads, + ) +``` + +### Logger Replacement + +```python +# Replace: +from diffusers.utils import logging +logger = logging.get_logger(__name__) + +# With: +from vllm.logger import init_logger +logger = init_logger(__name__) +``` + +### Custom Layers from vLLM-Omni + +Available utility layers: + +```python +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm_omni.diffusion.layers.rope import RotaryEmbedding +from vllm_omni.diffusion.layers.adalayernorm import AdaLayerNorm +``` + +### Config Support + +```python +from vllm_omni.diffusion.data import OmniDiffusionConfig + +class MyTransformer(nn.Module): + def __init__(self, *, od_config=None, num_layers=28, hidden_size=3072, **kwargs): + super().__init__() + self.od_config = od_config + self.parallel_config = od_config.parallel_config if od_config else None + # ... build layers +``` + +The transformer config values come from `model_index.json` → `config.json` in the transformer subfolder. The pipeline uses `get_transformer_config_kwargs(od_config.tf_model_config, TransformerClass)` to filter config keys to match the `__init__` signature. + +### Weight Loading + +The `load_weights` method receives an iterable of `(name, tensor)` from safetensors files, with the prefix (e.g., `"transformer."`) already stripped by the loader. 
+ +```python +from vllm.model_executor.model_loader.weight_utils import default_weight_loader + +class MyTransformer(nn.Module): + def load_weights(self, weights): + params = dict(self.named_parameters()) + loaded = set() + for name, tensor in weights: + # Optional: remap names from diffusers to vllm-omni naming + # e.g., "ff.net.0.proj" -> "ff.net_0.proj" + + if name in params: + param = params[name] + if hasattr(param, "weight_loader"): + param.weight_loader(param, tensor) + else: + default_weight_loader(param, tensor) + loaded.add(name) + return loaded +``` + +#### QKV Fusion in load_weights + +If you fused separate Q/K/V into a `QKVParallelLinear`, you need to map diffusers' separate weight names: + +```python +stacked_params_mapping = [ + ("to_qkv", "to_q", "q"), + ("to_qkv", "to_k", "k"), + ("to_qkv", "to_v", "v"), +] + +def load_weights(self, weights): + params = dict(self.named_parameters()) + loaded = set() + for name, tensor in weights: + for fused_name, orig_name, shard_id in stacked_params_mapping: + if orig_name in name: + name = name.replace(orig_name, fused_name) + param = params[name] + param.weight_loader(param, tensor, shard_id) + loaded.add(name) + break + else: + # Normal loading + ... + return loaded +``` + +### Class-Level Attributes for Features + +```python +class MyTransformer(nn.Module): + # torch.compile: list block class names that repeat and can be compiled + _repeated_blocks = ["MyTransformerBlock"] + + # CPU offload: attribute name of the nn.ModuleList containing blocks + _layerwise_offload_blocks_attr = "blocks" + + # LoRA: mapping of fused param names to original param names + packed_modules_mapping = {"to_qkv": ["to_q", "to_k", "to_v"]} + + # Sequence parallelism plan (advanced — add after basic impl works) + _sp_plan = { + "blocks.0": SequenceParallelInput(split_dim=1), + "proj_out": SequenceParallelOutput(gather_dim=1), + } +``` diff --git a/.claude/skills/add-diffusion-model/references/troubleshooting.md b/.claude/skills/add-diffusion-model/references/troubleshooting.md new file mode 100644 index 0000000000..27acdd8d15 --- /dev/null +++ b/.claude/skills/add-diffusion-model/references/troubleshooting.md @@ -0,0 +1,178 @@ +# Troubleshooting Reference + +## Common Errors When Adding a Diffusion Model + +### ImportError / ModuleNotFoundError + +**Cause**: Missing or incorrect registration. + +**Fix checklist**: +1. Model registered in `vllm_omni/diffusion/registry.py` `_DIFFUSION_MODELS` dict +2. `__init__.py` exports the pipeline class +3. Pipeline file exists at the correct path: `vllm_omni/diffusion/models/{folder}/{file}.py` +4. Class name in registry matches the actual class name in the file + +### Shape Mismatch in Attention + +**Symptom**: `RuntimeError: shape mismatch` or `expected 4D tensor` + +**Cause**: QKV tensors not reshaped to `[batch, seq_len, num_heads, head_dim]`. + +**Fix**: Before calling `self.attn(q, k, v, ...)`, ensure: +```python +q = q.view(batch, seq_len, self.num_heads, self.head_dim) +k = k.view(batch, kv_seq_len, self.num_kv_heads, self.head_dim) +v = v.view(batch, kv_seq_len, self.num_kv_heads, self.head_dim) +``` + +After attention, reshape back: +```python +out = out.reshape(batch, seq_len, -1) +``` + +### Weight Loading Failures + +**Symptom**: `RuntimeError: size mismatch for parameter ...` or missing keys + +**Debugging**: +1. Print diffusers weight names: `safetensors.safe_open(path, "pt").keys()` +2. Print model parameter names: `dict(model.named_parameters()).keys()` +3. 
Compare and add name remappings in `load_weights()` + +**Common remappings needed**: +- `ff.net.0.proj` → `ff.net_0.proj` (PyTorch Sequential indexing) +- `.to_out.0.` → `.to_out.` (Sequential unwrapping) +- `scale_shift_table` → moved to a wrapper module + +### Black/Blank/Noisy Output + +**Possible causes**: +1. **Wrong latent normalization**: Check VAE expects latents scaled by `vae.config.scaling_factor` +2. **Wrong scheduler**: Using the wrong scheduler class or wrong `flow_shift` +3. **Missing CFG**: Some models require `guidance_scale > 1.0` with negative prompt +4. **Wrong timestep format**: Some schedulers expect float, others expect int/long +5. **Missing post-processing**: Raw VAE output may need denormalization + +**Quick test**: Run with diffusers directly using the same seed and compare latents at each step. + +### OOM (Out of Memory) + +**Solutions** (in order of preference): +1. `--enforce-eager` to disable torch.compile (saves compile memory) +2. `--enable-cpu-offload` for model-level offload +3. `--enable-layerwise-offload` for block-level offload (better for large models) +4. `--vae-use-slicing --vae-use-tiling` for VAE memory reduction +5. Reduce resolution: `--height 480 --width 832` +6. Use TP: `--tensor-parallel-size 2` + +### Different Output vs Diffusers Reference + +**Common causes**: +1. **Attention backend difference**: FlashAttention vs SDPA may produce slightly different results. Set `DIFFUSION_ATTENTION_BACKEND=TORCH_SDPA` to match diffusers +2. **Float precision**: vLLM-Omni may use bfloat16 where diffusers uses float32 for some operations +3. **Missing normalization**: Check all LayerNorm/RMSNorm are preserved +4. **Scheduler rounding**: Some schedulers have numerical sensitivity + +### Tensor Parallel Errors + +**Symptom**: `AssertionError: not divisible` or incorrect output with TP>1 + +**Fix**: +1. Verify `num_heads % tp_size == 0` and `num_kv_heads % tp_size == 0` +2. Ensure `ColumnParallelLinear` / `RowParallelLinear` are used correctly +3. Check that norms between parallel layers use distributed norm if needed +4. Verify `load_weights` handles TP sharding for norm weights +5. Use `self.to_qkv.num_heads` (local heads per GPU) for QKV split sizes, not total heads + +**Missing `input_is_parallel=True`**: + +`RowParallelLinear` expects sharded input from `ColumnParallelLinear`: +```python +self.w1 = ColumnParallelLinear(dim, hidden_dim, return_bias=False) +self.w2 = RowParallelLinear(hidden_dim, dim, input_is_parallel=True, return_bias=False) +``` + +### Sequence Parallel Errors + +**Symptom**: Incorrect output or crashes with `--ulysses-degree N` or `--usp N` + +**Possible causes**: +1. **Inline operations between shard/gather points**: `torch.cat()`, `pad_sequence()` etc. not at `nn.Module` boundaries. Fix: extract into submodule. +2. **Wrong `split_dim`**: Check the tensor shape at the shard point. Sequence dimension is typically `dim=1` for `[B, S, D]` tensors. +3. **RoPE not sharded**: If RoPE is computed separately, add it to `_sp_plan` with `split_output=True`. +4. **Sequence not divisible by SP degree**: Use `auto_pad=True` in `SequenceParallelInput` or switch to `ulysses_mode="advanced_uaa"`. + +**Debugging**: Add `expected_dims=N` to `SequenceParallelInput`/`Output` for shape validation at runtime. + +### CFG Parallel Errors + +**Symptom**: CFG parallel not activating, no speedup + +**Fix checklist**: +1. Pipeline inherits `CFGParallelMixin` +2. `guidance_scale > 1.0` +3. Negative prompt provided (even if empty string) +4. 
`--cfg-parallel-size 2` specified +5. `diffuse()` method calls `predict_noise_maybe_with_cfg()` and `scheduler_step_maybe_with_cfg()` + +**Symptom**: Different output with CFG parallel vs sequential + +**Possible cause**: Non-deterministic scheduler. Fix: pass `generator=torch.Generator(device).manual_seed(seed)` to `scheduler_step_maybe_with_cfg()`. + +### HSDP Errors + +**Symptom**: HSDP not activating or errors during weight loading + +**Fix checklist**: +1. Transformer defines `_hsdp_shard_conditions` class attribute +2. Shard condition functions return `True` for correct modules (test with `model.named_modules()`) +3. Not combining with TP (HSDP and TP are incompatible) +4. For standalone HSDP, `hsdp_shard_size` is specified explicitly + +**Verify**: Check logs for "HSDP Inference: replicate_size=..., shard_size=..." and "Sharded N modules + root". + +### Cache-DiT Not Applied + +**Symptom**: No speedup, no cache-related log messages + +**Fix checklist**: +1. Model not in `_NO_CACHE_ACCELERATION` in `registry.py` +2. Pipeline class name matches `CUSTOM_DIT_ENABLERS` key (if using custom enabler) +3. `cache_backend="cache_dit"` specified +4. Check logs for "Cache-dit enabled successfully on xxx" + +**Verify pipeline name**: `print(pipeline.__class__.__name__)` — must match registry key. + +### Cache-DiT Quality Degradation + +**Symptom**: Artifacts or lower quality with cache-dit + +**Fix**: Reduce aggressiveness: +```python +cache_config={ + "residual_diff_threshold": 0.12, # Lower from 0.24 + "max_warmup_steps": 6, # Increase from 4 + "max_continuous_cached_steps": 2, # Reduce if higher +} +``` + +If quality is still poor, the model may need a custom enabler with per-block-list `ParamsModifier` tuning. + +### Model Not Detected / Wrong Pipeline Class + +**Symptom**: `ValueError: Model class ... not found in diffusion model registry` + +**Cause**: The model's `model_index.json` has a `_class_name` for the pipeline that doesn't match registry keys. + +**Fix**: The registry key must match the diffusers pipeline class name from `model_index.json`. If using a different name, map it in the registry: +```python +"DiffusersPipelineClassName": ("your_folder", "your_file", "YourVllmClassName"), +``` + +## Debugging Workflow + +1. **Add verbose logging**: Use `logger.info()` to print tensor shapes at each stage +2. **Compare step-by-step**: Run diffusers and vllm-omni side by side, comparing tensors after each major operation +3. **Use small configs**: Reduce `num_inference_steps=2`, small resolution for fast iteration +4. **Test transformer isolation**: Feed the same input to both diffusers and vllm-omni transformers, compare outputs +5. **Binary search for bugs**: Comment out blocks/layers to isolate where divergence starts diff --git a/.claude/skills/add-tts-model/SKILL.md b/.claude/skills/add-tts-model/SKILL.md new file mode 100644 index 0000000000..e64e7e763e --- /dev/null +++ b/.claude/skills/add-tts-model/SKILL.md @@ -0,0 +1,284 @@ +--- +name: add-tts-model +description: "Integrate a new text-to-speech model into vLLM-Omni from HuggingFace reference implementation through production-ready serving with streaming and CUDA graph acceleration. Use when adding a new TTS model, wiring stage separation for speech synthesis, enabling online voice generation serving, debugging TTS integration behavior, or building audio output pipelines." 
+--- + +# TTS Model Integration Workflow + +## Overview + +``` +HF Reference -> Stage Separation -> Online Serving -> Async Chunk -> CUDA Graph + (Phase 1) (Phase 2) (Phase 3) (Phase 4) (Phase 5) +``` + +## Phase 1: HuggingFace Reference + +**Goal**: Understand the reference implementation and verify it produces correct audio. + +### Steps + +1. **Run the reference model** end-to-end using the official HuggingFace / GitHub code +2. **Document the architecture**: + - What are the sub-models? (AR decoder, codec decoder, vocoder, etc.) + - What is the token vocabulary? (semantic codes, RVQ codebooks, special tokens) + - What is the output format? (sample rate, channels, codec type) +3. **Capture reference outputs** for comparison during integration +4. **Identify the config structure**: `config.json` fields, `model_type`, sub-model configs + +### Key Questions + +- How many codebooks? What are the codebook sizes? +- What special tokens exist? (`<|voice|>`, `<|audio_start|>`, `<|im_end|>`, etc.) +- What is the token-to-ID mapping for codec codes? +- What is the hop length / frame rate of the codec? +- Does the model support voice cloning? How? (reference audio encoding, speaker embeddings, etc.) + +### Deliverables + +- Working reference script that produces audio +- Architecture diagram / notes +- Token vocabulary mapping +- Reference audio samples for regression testing + +## Phase 2: Stage Separation (Offline Inference) + +**Goal**: Split the model into vLLM-Omni stages and get offline inference working. + +### Steps + +1. **Register the model** in `vllm_omni/model_executor/models/registry.py` +2. **Create config classes** (`configuration_.py`) with `model_type` registration +3. **Implement Stage 0** (AR model): + - Subclass appropriate base (e.g., wrap Qwen3 decoder layers) + - Implement `forward()` for autoregressive token generation + - Handle special token logic (start/stop tokens, codec token mapping) + - If dual-AR (like Fish Speech), implement Fast AR as a nested module +4. **Implement Stage 1** (Decoder): + - Load codec weights (may need lazy loading from separate checkpoint) + - Implement `forward()`: codec codes -> audio waveform + - Return `OmniOutput` with `multimodal_outputs` +5. **Create stage config YAML** defining both stages, memory allocation, and model paths +6. **Create stage input processor** for prompt building +7. **Write end2end.py** test script + +### Critical Parameters to Get Right + +| Parameter | Impact if Wrong | +|-----------|----------------| +| Hop length | Audio duration wrong, streaming noise | +| Token ID mapping | Garbage codes -> noise output | +| Codebook count/size | Shape mismatch crashes | +| Stop token | Generation never stops or stops too early | +| dtype / autocast | Numerical issues, silent quality degradation | +| Repetition penalty | Must match reference (often 1.0 for TTS) | + +### Debugging Priority (from experience) + +When audio output is wrong, check in this order: + +1. **RoPE / attention**: Are position encodings correct? Is the attention mask right? +2. **Normalization**: RMSNorm epsilon, layer norm placement (pre vs post) +3. **Hop length**: Product of all upsample rates in the codec decoder +4. **Token mapping**: Are codec IDs correctly offset from the vocabulary base? +5. **Sampling parameters**: Temperature, top_k, top_p, repetition_penalty +6. **Tensor layout**: Codebook-major vs frame-major ordering +7. 
**dtype**: Float32 for codec decoders (autocast can corrupt audio) + +### Deliverables + +- Model files in `vllm_omni/model_executor/models//` +- Stage config YAML +- Working `end2end.py` with correct audio output +- README.md in the example directory + +## Phase 3: Online Serving + +**Goal**: Expose the model via `/v1/audio/speech` API endpoint. + +### Steps + +1. **Register in `serving_speech.py`**: + - Add model stage name to `_TTS_MODEL_STAGES` set + - Add model detection flag (e.g., `_is_fish_speech`) + - Implement prompt builder method (e.g., `_build_fish_speech_prompt()`) +2. **Handle model-specific parameters**: + - Voice cloning: `ref_audio` encoding and prompt injection + - `max_new_tokens` override in sampling params + - Model-specific default values +3. **Create client scripts**: `speech_client.py`, `run_server.sh` +4. **Test all response formats**: wav, mp3, flac, pcm +5. **Add Gradio demo**: Interactive web UI with streaming support + +### Voice Cloning Pattern + +```python +import base64 +from pathlib import Path + +def build_voice_clone_prompt(ref_audio_path: str, text: str, codec) -> list: + """Build prompt with reference audio for voice cloning in serving_speech.py.""" + audio_bytes = Path(ref_audio_path).read_bytes() + codes = codec.encode(audio_bytes) # Encode on CPU using model's codec (e.g., DAC) + token_ids = [code + codec.vocab_offset for code in codes.flatten().tolist()] + return [ + {"role": "system", "content": f"<|voice|>{''.join(chr(t) for t in token_ids)}"}, + {"role": "user", "content": text}, + ] +``` + +### Deliverables + +- Updated `serving_speech.py` with model-specific prompt builder +- Client scripts and server launcher +- Gradio demo with streaming and voice cloning UI +- Documentation (offline + online serving docs) + +## Phase 4: Async Chunk (Streaming) + +**Goal**: Enable inter-stage streaming so audio chunks are produced while AR generation continues. + +### Steps + +1. **Update stage config YAML**: + ```yaml + async_chunk: true + codec_chunk_frames: 25 # frames per chunk + codec_left_context_frames: 25 # overlap for smooth boundaries + ``` +2. **Implement chunk handling in Stage 1**: + - Accept partial input (chunk of codec codes) + - Handle left context for smooth audio boundaries + - Return partial audio in `OmniOutput` +3. **Test streaming**: + - Verify audio quality matches non-streaming output + - Check for artifacts at chunk boundaries + - Measure TTFA (time to first audio) +4. **Update online serving** to support `stream=true` with PCM output + +### Streaming Architecture + +``` +Stage 0 (AR) Stage 1 (Decoder) + | | + |-- chunk 0 (25 frames) ------> decode -> audio chunk 0 -> client + |-- chunk 1 (25 frames) ------> decode -> audio chunk 1 -> client + |-- chunk 2 (25 frames) ------> decode -> audio chunk 2 -> client + ... +``` + +### Key Considerations + +- **Left context overlap**: Prevents audible artifacts at chunk boundaries +- **Hop length matters**: `context_audio_samples = context_frames * hop_length` +- **First chunk latency**: Can use larger initial chunk for better quality, then smaller chunks + +### Deliverables + +- Updated stage config with async_chunk enabled +- Smooth streaming audio without boundary artifacts +- TTFA metrics + +## Phase 5: CUDA Graph Acceleration + +**Goal**: Capture the AR loop as a CUDA graph for significant speedup. + +### Steps + +1. **Identify the hot loop**: The AR decoding loop that runs N steps per token +2. 
**Create static buffers**: + - KV caches with fixed max sequence length + - Pre-built causal masks and position tensors per step + - Static input/output tensors +3. **Implement graph capture**: + - Warm up with real data + - Capture the forward pass + - Replay with updated inputs +4. **Handle constraints**: + - Use `torch.argmax` instead of `torch.multinomial` (graph-safe) + - Fixed batch size (fall back to eager for other sizes) + - No dynamic control flow inside the graph + +### Example: Code Predictor CUDA Graph (Qwen3-TTS) + +```python +import torch + +class CodePredictorGraph: + """Captures the 16-step code predictor AR loop as a single CUDA graph.""" + + def setup_graph(self, device: torch.device, kv_heads: int = 4, head_dim: int = 64): + self.num_steps = 16 + self.kv_cache = torch.zeros(1, kv_heads, self.num_steps, head_dim, device=device) + self.positions = torch.arange(self.num_steps, device=device) + self.causal_mask = torch.tril(torch.ones(self.num_steps, self.num_steps, device=device)) + self.input_buf = torch.zeros(1, 1, kv_heads * head_dim, device=device) + self.output_buf = torch.zeros(1, self.num_steps, device=device, dtype=torch.long) + # Warm up, then: self.graph = torch.cuda.CUDAGraph(); self.graph.capture(...) + + def run_graph(self, initial_input: torch.Tensor) -> torch.Tensor: + self.input_buf.copy_(initial_input) + self.graph.replay() + return self.output_buf.clone() +``` + +### Performance Expectations + +Based on Qwen3-TTS code predictor experience: +- **3-5x speedup** for the graphed component +- Only effective for fixed batch sizes (typically batch_size=1) +- Falls back to eager mode for unsupported configurations + +### Deliverables + +- CUDA graph implementation for the AR hot loop +- Benchmark script comparing eager vs graph performance +- Documentation of constraints and fallback behavior + +## Integration Checklist + +Use this checklist when integrating a new TTS model: + +### Phase 1: HF Reference +- [ ] Reference model runs and produces correct audio +- [ ] Architecture documented (stages, codebooks, tokens, sample rate) +- [ ] Reference audio samples saved for comparison + +### Phase 2: Stage Separation +- [ ] Model registered in `registry.py` +- [ ] Config classes created with `model_type` registration +- [ ] Stage 0 (AR) implemented and generates correct tokens +- [ ] Stage 1 (Decoder) produces correct audio from tokens +- [ ] Stage config YAML created +- [ ] `end2end.py` produces audio matching reference quality +- [ ] README.md written + +### Phase 3: Online Serving +- [ ] Model added to `serving_speech.py` +- [ ] Prompt builder handles text input correctly +- [ ] Voice cloning works (if supported) +- [ ] All response formats work (wav, mp3, flac, pcm) +- [ ] Client scripts and server launcher created +- [ ] Gradio demo working +- [ ] Documentation added (offline + online docs, nav, supported models) + +### Phase 4: Async Chunk +- [ ] Stage config updated with `async_chunk: true` +- [ ] Stage 1 handles partial chunks correctly +- [ ] No audio artifacts at chunk boundaries +- [ ] Streaming via API (`stream=true`) works +- [ ] TTFA measured and acceptable + +### Phase 5: CUDA Graph +- [ ] Hot loop identified and profiled +- [ ] Static buffers allocated +- [ ] Graph captured and replays correctly +- [ ] Benchmark shows meaningful speedup +- [ ] Fallback to eager works for unsupported configs + +## References + +- [TTS audio skill](../vllm-omni-audio-tts/SKILL.md) -- supported models and usage +- [Fish Speech 
integration](../vllm-omni-audio-tts/references/fish-speech.md) -- complete example of Phases 1-3 +- [Qwen3-TTS reference](../vllm-omni-audio-tts/references/qwen-tts.md) -- complete example of all 5 phases +- [Adding a TTS model (developer guide)](https://github.com/vllm-project/vllm-omni/blob/main/docs/contributing/model/adding_tts_model.md) diff --git a/.claude/skills/readme.md b/.claude/skills/readme.md new file mode 100644 index 0000000000..b66f2ecd13 --- /dev/null +++ b/.claude/skills/readme.md @@ -0,0 +1,34 @@ +# Claude Skills for vLLM-Omni + +This directory contains Claude Code skills maintained for the `vllm-omni` +repository. These skills capture repeatable workflows for common contributor +tasks such as model integration, pull request review, and release note +generation. + +## Directory Structure + +Each skill lives in its own directory under `.claude/skills/`. A skill may +include: + +- `SKILL.md`: the main workflow and operating instructions +- `references/`: focused reference material used by the skill +- `scripts/`: small helper scripts used by the skill + +## Available Skills + +- `add-diffusion-model`: guides integration of a new diffusion model into + `vllm-omni` +- `add-omni-model`: covers addition of new omni-modality model support +- `add-tts-model`: covers integration of new TTS models and related serving + workflows +- `generate-release-note`: helps prepare release notes for repository changes +- `review-pr`: provides a structured workflow for reviewing pull requests + +## Maintenance Guidelines + +- Keep skill names short and task-oriented. +- Prefer repository-local paths, commands, and examples. +- Avoid hardcoding fast-changing support matrices unless the skill is actively + maintained alongside those changes. +- Treat skills as contributor tooling: optimize for clarity, actionability, and + low maintenance overhead. diff --git a/.gitignore b/.gitignore index 7f101a784c..c0ee968064 100644 --- a/.gitignore +++ b/.gitignore @@ -158,7 +158,19 @@ cython_debug/ # Claude CLAUDE.md -.claude/ +/.claude/* +!.claude/skills/ +!.claude/skills/readme.md +!.claude/skills/add-diffusion-model/ +!.claude/skills/add-diffusion-model/SKILL.md +!.claude/skills/add-diffusion-model/references/ +!.claude/skills/add-diffusion-model/references/*.md +!.claude/skills/add-tts-model/ +!.claude/skills/add-tts-model/SKILL.md +!.claude/skills/review-pr/ +!.claude/skills/review-pr/SKILL.md +!.claude/skills/review-pr/references/ +!.claude/skills/review-pr/references/*.md # Codex AGENTS.md From bcd5f16321df6bbc6f997a3906d16a23c8bb489e Mon Sep 17 00:00:00 2001 From: n1ptune Date: Tue, 14 Apr 2026 20:41:23 +0800 Subject: [PATCH 164/204] [Misc] clean Temporary CI Configs (#2784) Signed-off-by: neptune Co-authored-by: neptune --- tests/conftest.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e41d15bdf5..adb87cbd72 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,4 @@ +import atexit import base64 import datetime import io @@ -1362,9 +1363,10 @@ def delete_by_path(config_dict: dict, path: str) -> None: continue # Delete specified paths in this stage - for path in delete_paths: - if path: # Skip empty paths - delete_by_path(target_stage, path) + # Avoid shadowing the original YAML Path used for the output filename below. + for delete_path in delete_paths: + if delete_path: # Skip empty paths + delete_by_path(target_stage, delete_path) elif "." 
in key: # Delete using dot-separated path delete_by_path(config, key) @@ -1394,15 +1396,15 @@ def delete_by_path(config_dict: dict, path: str) -> None: raise KeyError(f"Stage ID {stage_id} not found, available: {available_ids}") # Apply updates to this stage - for path, val in stage_updates.items(): + for update_path, val in stage_updates.items(): # Check if this is a simple key (not dot-separated) # Example: 'engine_input_source' vs 'engine_args.max_model_len' - if "." not in path: + if "." not in update_path: # Direct key assignment (e.g., updating a list value) - target_stage[path] = val + target_stage[update_path] = val else: # Dot-separated path (e.g., nested dict access) - apply_update(target_stage, path, val) + apply_update(target_stage, update_path, val) elif "." in key: # Apply using dot-separated path apply_update(config, key, value) @@ -1414,13 +1416,14 @@ def delete_by_path(config_dict: dict, path: str) -> None: # within the same second (e.g. test_qwen3_omni_expansion imports both # get_chunk_config and get_batch_token_config). int(time.time()) would collide # and the later write would overwrite the earlier YAML on disk. - base_name = yaml_path.rsplit(".", 1)[0] if "." in yaml_path else yaml_path - output_path = f"{base_name}_{time.time_ns()}.yaml" + # Keep generated configs outside the repo and delete them when pytest exits. + output_fd, output_path = tempfile.mkstemp(prefix=f"{path.stem}_", suffix=".yaml") + atexit.register(Path(output_path).unlink, missing_ok=True) - with open(output_path, "w", encoding="utf-8") as f: + with os.fdopen(output_fd, "w", encoding="utf-8") as f: yaml.dump(config, f, default_flow_style=None, sort_keys=False, allow_unicode=True, indent=2) - return output_path + return str(output_path) class OmniServer: From 5ce0a434920590e090d7080f9f67e03c4c300d82 Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Tue, 14 Apr 2026 20:48:04 +0800 Subject: [PATCH 165/204] [CI][Bugfix] Update thresholds for accuracy tests (#2725) Signed-off-by: wangyu <410167048@qq.com> --- tests/e2e/accuracy/test_gedit_bench_h100_smoke.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py b/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py index ac5f2cb3cf..960ea57960 100644 --- a/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py +++ b/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py @@ -106,9 +106,9 @@ def test_gedit_bench_h100_smoke( group_summary = language_summary["by_group"][group] assert set(group_summary) == {"count", "Q_SC", "Q_PQ", "Q_O"} - assert summary["languages"]["en"]["overall"]["Q_SC"] >= 7.0 + assert summary["languages"]["en"]["overall"]["Q_SC"] >= 6.95 assert summary["languages"]["en"]["overall"]["Q_PQ"] >= 5.8 - assert summary["languages"]["en"]["overall"]["Q_O"] >= 6.2 + assert summary["languages"]["en"]["overall"]["Q_O"] >= 6.15 assert summary["languages"]["cn"]["overall"]["Q_SC"] >= 6.9 assert summary["languages"]["cn"]["overall"]["Q_PQ"] >= 5.7 assert summary["languages"]["cn"]["overall"]["Q_O"] >= 6.1 From cf1fcd5acf9ec0c7d74daf550a922f6fd3d716ca Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Tue, 14 Apr 2026 06:49:57 -0600 Subject: [PATCH 166/204] [CI/BugFix] Fix Flaky Test for Qwen Omni Perf (#2754) Signed-off-by: Alex Brooks --- vllm_omni/benchmarks/patch/patch.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm_omni/benchmarks/patch/patch.py b/vllm_omni/benchmarks/patch/patch.py index 343655df20..17d7498ba2 100644 --- 
a/vllm_omni/benchmarks/patch/patch.py +++ b/vllm_omni/benchmarks/patch/patch.py @@ -143,7 +143,11 @@ async def async_request_openai_chat_omni_completions( if response.status == 200: handler = StreamedResponseHandler() async for chunk_bytes in response.content.iter_any(): - chunk_bytes = chunk_bytes.strip() + # NOTE: Do NOT strip() here; TCP may fragment the SSE messages, + # so stripping here can cause problems depending on how it is split. + # + # Simple example: [b'data: ', b'{json}\n\n'] <- stripping the first + # chunk will break SSE parsing because the space after 'data:' is required. if not chunk_bytes: continue From 4fb078a03166fc749e889a1934b6a59b483d5e18 Mon Sep 17 00:00:00 2001 From: Bvicii <98971614+scyyh11@users.noreply.github.com> Date: Tue, 14 Apr 2026 05:53:06 -0700 Subject: [PATCH 167/204] [Bugfix] Reject /v1/audio/speech for Qwen omni models (#2763) Signed-off-by: Bvicii --- .../openai_api/test_serving_speech.py | 26 +++++++++++++++++++ .../entrypoints/openai/serving_speech.py | 18 +++++++++++++ 2 files changed, 44 insertions(+) diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index c884120620..b388b18606 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -684,6 +684,32 @@ def test_is_tts_detection_with_tts_stage(self, mocker: MockerFixture): assert server._is_tts is True assert server._tts_stage is mock_stage + def test_prepare_speech_rejects_non_tts_omni_model(self, mocker: MockerFixture): + """Multi-stage omni models (e.g. Qwen3-Omni) must not use /v1/audio/speech.""" + mock_engine_client = mocker.MagicMock() + mock_engine_client.errored = False + mock_engine_client.tts_max_instructions_length = None + + # Simulate Qwen3-Omni: multiple stages, none in _TTS_MODEL_STAGES + thinker = SimpleNamespace(engine_args=SimpleNamespace(model_stage="thinker"), tts_args={}) + talker = SimpleNamespace(engine_args=SimpleNamespace(model_stage="talker"), tts_args={}) + code2wav = SimpleNamespace(engine_args=SimpleNamespace(model_stage="code2wav"), tts_args={}) + mock_engine_client.stage_configs = [thinker, talker, code2wav] + + mock_models = mocker.MagicMock() + mock_models.is_base_model.return_value = True + server = OmniOpenAIServingSpeech( + engine_client=mock_engine_client, + models=mock_models, + request_logger=mocker.MagicMock(), + ) + assert server._is_tts is False + + request = OpenAICreateSpeechRequest(input="Hello world") + with pytest.raises(ValueError, match="only supported for dedicated TTS models"): + asyncio.run(server._prepare_speech_generation(request)) + server.shutdown() + def test_estimate_prompt_len_fallback(self, speech_server): """Test prompt length estimation falls back to 2048 when model is unavailable.""" tts_params = {"text": ["Hello"], "task_type": ["CustomVoice"]} diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 3dc5f595d0..1d9754853f 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -1471,6 +1471,24 @@ async def _prepare_speech_generation( ph_len = await self._estimate_prompt_len_async(tts_params) prompt = {"prompt_token_ids": [1] * ph_len, "additional_information": tts_params} else: + # Qwen omni models (Qwen3-Omni, Qwen2.5-Omni) use a "talker" + # stage whose preprocess requires chat-templated tokens. 
The + # async-chunk orchestrator prewarms the talker via + # compute_talker_prompt_ids_length(), which scans for Qwen + # chat-template markers (im_start_token_id 151644). A raw-text + # prompt produces a 1-token placeholder that crashes the talker's + # prefill/decode handoff. Reject early with an actionable message. + stage_names = { + getattr(getattr(s, "engine_args", None), "model_stage", None) for s in self.engine_client.stage_configs + } + if "talker" in stage_names: + raise ValueError( + "The /v1/audio/speech endpoint is only supported for " + "dedicated TTS models (e.g., Qwen3-TTS, Voxtral, Fish " + "Speech, CosyVoice3, OmniVoice, VoxCPM2). For omni " + "models like Qwen3-Omni, use /v1/chat/completions with " + '\'"modalities": ["audio"]\' instead.' + ) tts_params = {} prompt = {"prompt": request.input} From 53a9cf49a6a2ee8dbacb7985458390ffb804ddbe Mon Sep 17 00:00:00 2001 From: "Yiyang \"Ian\" Liu" Date: Tue, 14 Apr 2026 06:52:32 -0700 Subject: [PATCH 168/204] fix: do not apply FP8 quant config to vision/audio encoders for pre-quantized checkpoints (#2702) Signed-off-by: Yiyang Liu <37043548+ianliuy@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../models/test_encoder_quant_config.py | 77 +++++++++++++++++++ .../qwen2_5_omni/qwen2_5_omni_thinker.py | 12 ++- .../qwen3_omni/qwen3_omni_moe_thinker.py | 26 ++++--- vllm_omni/quantization/component_config.py | 25 ++++++ 4 files changed, 129 insertions(+), 11 deletions(-) create mode 100644 tests/model_executor/models/test_encoder_quant_config.py diff --git a/tests/model_executor/models/test_encoder_quant_config.py b/tests/model_executor/models/test_encoder_quant_config.py new file mode 100644 index 0000000000..8020184986 --- /dev/null +++ b/tests/model_executor/models/test_encoder_quant_config.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Regression test for #2686: pre-quantized methods must not apply +quant config to vision / audio encoders. + +For modelopt FP8/FP4/MXFP8 checkpoints the Thinker LM is the only +quantized component. Vision and audio encoder weights are BF16 with no +FP8 scale tensors — passing quant_config to them causes FP8 kernels to +run on BF16 weights, producing garbage embeddings. 
+""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +import pytest + +from vllm_omni.quantization.component_config import ( + PRE_QUANTIZED_METHODS, + ComponentQuantizationConfig, + resolve_encoder_quant_config, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + +# --------------------------------------------------------------------------- +# resolve_encoder_quant_config — the core routing logic for encoder quant +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("method", sorted(PRE_QUANTIZED_METHODS)) +def test_pre_quantized_returns_none(method: str) -> None: + """visual_quant_config and audio_quant_config must be None for + pre-quantized methods (modelopt, modelopt_fp4, modelopt_mxfp8).""" + mock_config = MagicMock() + mock_config.get_name.return_value = method + + assert resolve_encoder_quant_config(mock_config) is None + + +@pytest.mark.parametrize("method", ["fp8", "awq", "gptq", "bitsandbytes"]) +def test_non_pre_quantized_preserves_config(method: str) -> None: + """Non-pre-quantized methods should pass through the original config.""" + mock_config = MagicMock() + mock_config.get_name.return_value = method + + assert resolve_encoder_quant_config(mock_config) is mock_config + + +def test_none_input_returns_none() -> None: + """No quantization → None for encoders.""" + assert resolve_encoder_quant_config(None) is None + + +def test_component_config_passed_through() -> None: + """ComponentQuantizationConfig should be returned as-is so the caller + can call .resolve() with the appropriate prefix.""" + inner = MagicMock() + inner.get_name.return_value = "modelopt" # would be None if not Component + component = ComponentQuantizationConfig( + component_configs={"language_model": inner}, + default_config=None, + ) + + result = resolve_encoder_quant_config(component) + assert result is component + + +# --------------------------------------------------------------------------- +# PRE_QUANTIZED_METHODS constant — exhaustiveness check +# --------------------------------------------------------------------------- + + +def test_pre_quantized_methods_contains_expected() -> None: + """Guard against accidental removal of a known pre-quantized method.""" + expected = {"modelopt", "modelopt_fp4", "modelopt_mxfp8"} + assert PRE_QUANTIZED_METHODS == expected diff --git a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py index 0307034089..617f0f9e32 100644 --- a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py +++ b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py @@ -64,6 +64,10 @@ ) from vllm.sequence import IntermediateTensors +from vllm_omni.quantization.component_config import ( + resolve_encoder_quant_config, +) + try: import flash_attn except (ImportError, ModuleNotFoundError): @@ -359,6 +363,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.quant_config = quant_config + # Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) only quantize + # the Thinker LM. Vision encoder weights remain in BF16 with no FP8 + # scale tensors; passing quant_config causes FP8 kernels to run on + # BF16 weights, producing garbage embeddings. Keep None for encoders. 
+ visual_quant_config = resolve_encoder_quant_config(quant_config) + with self._mark_tower_model(vllm_config, "audio"): if multimodal_config.get_limit_per_prompt("audio"): self.audio_tower = Qwen2_5OmniAudioEncoder(thinker_config.audio_config) @@ -370,7 +380,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.visual = Qwen2_5_VisionTransformer( vision_config=thinker_config.vision_config, norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6), - quant_config=quant_config, + quant_config=visual_quant_config, prefix=maybe_prefix(prefix, "visual"), ) else: diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py index 671ffb6cb1..d03a96fd85 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py @@ -119,7 +119,10 @@ from vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_thinker import ( Qwen2_5OmniConditionalGenerationMixin, ) -from vllm_omni.quantization.component_config import ComponentQuantizationConfig +from vllm_omni.quantization.component_config import ( + PRE_QUANTIZED_METHODS, + ComponentQuantizationConfig, +) try: import flash_attn @@ -1114,21 +1117,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.multimodal_config = multimodal_config self.quant_config = quant_config - # Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) quantize the - # entire thinker — audio tower, visual encoder, and language model - # all share the same quant method. Dynamic quantization methods - # (e.g. --quantization fp8) should only target the language model. - _PRE_QUANTIZED_METHODS = {"modelopt", "modelopt_fp4", "modelopt_mxfp8"} + # Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) only quantize + # the Thinker LM (language model). Vision and audio encoder weights + # remain in BF16 and have no corresponding scale tensors in the + # checkpoint. Dynamic quantization methods (e.g. --quantization fp8) + # should also only target the language model. if isinstance(quant_config, ComponentQuantizationConfig): audio_quant_config = quant_config.resolve("audio_tower") visual_quant_config = quant_config.resolve("visual") language_quant_config = quant_config.resolve("language_model") elif quant_config is not None: - if quant_config.get_name() in _PRE_QUANTIZED_METHODS: - # Pre-quantized: pass quant_config to all subcomponents. - audio_quant_config = quant_config - visual_quant_config = quant_config + if quant_config.get_name() in PRE_QUANTIZED_METHODS: + # Pre-quantized: only the Thinker LM is quantized. + # Vision/audio encoder weights are BF16 with no FP8 scales; + # passing quant_config to them causes FP8 kernels to run on + # BF16 weights (producing garbage embeddings). Keep None. + audio_quant_config = None + visual_quant_config = None language_quant_config = quant_config else: # Dynamic quantization: scope to language_model only. diff --git a/vllm_omni/quantization/component_config.py b/vllm_omni/quantization/component_config.py index 7986da8850..f9286079be 100644 --- a/vllm_omni/quantization/component_config.py +++ b/vllm_omni/quantization/component_config.py @@ -23,6 +23,31 @@ ) +# Pre-quantized checkpoints (modelopt FP8/FP4/MXFP8) only quantize the +# Thinker LM. Vision and audio encoder weights remain in BF16 with no +# corresponding scale tensors in the checkpoint. 
+PRE_QUANTIZED_METHODS: frozenset[str] = frozenset({"modelopt", "modelopt_fp4", "modelopt_mxfp8"}) + + +def resolve_encoder_quant_config( + quant_config: QuantizationConfig | None, +) -> QuantizationConfig | None: + """Resolve quantization config for vision / audio encoders. + + Returns *None* for pre-quantized methods so that FP8 kernels are never + applied to BF16 encoder weights (which lack scale tensors). All other + configs — including ``ComponentQuantizationConfig`` and ``None`` — are + returned as-is so the caller can handle them. + """ + if ( + quant_config is not None + and not isinstance(quant_config, ComponentQuantizationConfig) + and quant_config.get_name() in PRE_QUANTIZED_METHODS + ): + return None + return quant_config + + class ComponentQuantizationConfig(QuantizationConfig): """Routes quantization to different configs by layer prefix.""" From f03ab38783cb6ed5f110540966aae54fec06828d Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Tue, 14 Apr 2026 22:26:55 +0800 Subject: [PATCH 169/204] [BugFix] Fix NoneType' object has no attribute 'detach' (#2797) Signed-off-by: amy-why-3459 --- tests/e2e/online_serving/test_qwen3_omni.py | 2 +- vllm_omni/worker/gpu_ar_model_runner.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py index fcda20ba38..f4aabb8b95 100644 --- a/tests/e2e/online_serving/test_qwen3_omni.py +++ b/tests/e2e/online_serving/test_qwen3_omni.py @@ -120,7 +120,7 @@ def test_mix_to_text_audio_001(omni_server, openai_client) -> None: } # Test single completion - openai_client.send_omni_request(request_config) + openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) @pytest.mark.advanced_model diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 4f3f843e65..62a0c85716 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -797,12 +797,11 @@ def propose_draft_token_ids(sampled_token_ids): elif isinstance(v, dict): mm_payload[k] = {sk: sv[start:end].contiguous() for sk, sv in v.items()} elif isinstance(v, list): - if idx < len(v): - element = v[idx] - if element is not None: - if isinstance(element, torch.Tensor): - element = element.clone() - mm_payload[k] = element + element = v[idx] if idx < len(v) else v[0] + if element is not None: + if isinstance(element, torch.Tensor): + element = element.clone() + mm_payload[k] = element # Skip None elements: msgspec cannot serialize None # in dict[str, torch.Tensor] typed fields. 
elif isinstance(v, torch.Tensor): From bc4a659f03f7d28892fa1a52a1cceaa55ddac0ba Mon Sep 17 00:00:00 2001 From: "Yiyang \"Ian\" Liu" Date: Tue, 14 Apr 2026 07:41:28 -0700 Subject: [PATCH 170/204] [Bugfix] Make mrope kwargs optional in HunyuanImage3 get_mrope_input_positions (#2654) Signed-off-by: Yiyang Liu Co-authored-by: SYLAR <125541396+lishunyang12@users.noreply.github.com> --- .../model_executor/models/hunyuan_image3/hunyuan_image3.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index 6d25274f90..5c280ddcf4 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -1507,9 +1507,9 @@ def get_mrope_input_positions( input_tokens: list[int], mm_features: list[MultiModalFeatureSpec] | None = None, *, - hf_config: PretrainedConfig, - image_grid_thw: list[list[int]] | torch.Tensor, - video_grid_thw: list[list[int]] | torch.Tensor, + hf_config: PretrainedConfig | None = None, + image_grid_thw: list[list[int]] | torch.Tensor | None = None, + video_grid_thw: list[list[int]] | torch.Tensor | None = None, second_per_grid_ts: list[float] | None = None, context_len: int = 0, seq_len: int | None = None, From 9e46a79c17d4f0153f8347a17fc18710e10a8298 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Wed, 15 Apr 2026 08:52:32 +0800 Subject: [PATCH 171/204] [Bugfix] Handle numpy array outputs when generate image (#1680) Signed-off-by: rongfu.leng --- .../openai_api/test_image_server.py | 88 +++++++++++++++++++ vllm_omni/entrypoints/openai/api_server.py | 35 +++++++- 2 files changed, 122 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py index c91c5a5c75..4b38692da3 100644 --- a/tests/entrypoints/openai_api/test_image_server.py +++ b/tests/entrypoints/openai_api/test_image_server.py @@ -1165,3 +1165,91 @@ def test_image_edit_with_seed_zero_single_stage(test_client): f"Expected seed=0, but got seed={captured_sampling_params.seed}. " "This indicates the bug where seed=0 is treated as falsy." 
     )
+
+
+def test_normalize_image():
+    """Test _normalize_image with various input types"""
+    import numpy as np
+
+    from vllm_omni.entrypoints.openai.api_server import _normalize_image
+
+    # Test PIL Image input
+    img = Image.new("RGB", (64, 64), color="red")
+    result = _normalize_image(img)
+    assert isinstance(result, Image.Image)
+    assert result.size == (64, 64)
+
+    # Test uint8 numpy array
+    arr = np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8)
+    result = _normalize_image(arr)
+    assert isinstance(result, Image.Image)
+    assert result.size == (64, 64)
+
+    # Test float [0, 1] numpy array
+    arr = np.random.rand(64, 64, 3).astype(np.float32)
+    result = _normalize_image(arr)
+    assert isinstance(result, Image.Image)
+    assert result.size == (64, 64)
+
+    # Test float [-1, 1] numpy array
+    arr = np.random.rand(64, 64, 3).astype(np.float32) * 2 - 1
+    result = _normalize_image(arr)
+    assert isinstance(result, Image.Image)
+    assert result.size == (64, 64)
+
+    # Test batch dimensions (1, 1, H, W, C)
+    arr = np.random.randint(0, 255, (1, 1, 64, 64, 3), dtype=np.uint8)
+    result = _normalize_image(arr)
+    assert isinstance(result, Image.Image)
+    assert result.size == (64, 64)
+
+
+def test_extract_images_from_result():
+    """Test _extract_images_from_result with various result formats"""
+    import numpy as np
+
+    from vllm_omni.entrypoints.openai.api_server import _extract_images_from_result
+
+    # Test empty result
+    class EmptyResult:
+        pass
+
+    result = EmptyResult()
+    images = _extract_images_from_result(result)
+    assert images == []
+
+    # Test nested batch: [np.array(shape=(3, 1, 64, 64, 3))]
+    batch = np.random.randint(0, 255, (3, 1, 64, 64, 3), dtype=np.uint8)
+
+    class BatchResult:
+        def __init__(self):
+            self.images = [batch]
+
+    result = BatchResult()
+    images = _extract_images_from_result(result)
+    assert len(images) == 3
+    assert all(isinstance(img, Image.Image) for img in images)
+    assert all(img.size == (64, 64) for img in images)
+
+    # Test dict path: result.request_output["images"]
+    class DictRequestOutput:
+        def __init__(self):
+            self.request_output = {"images": [np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8)]}
+
+    result = DictRequestOutput()
+    images = _extract_images_from_result(result)
+    assert len(images) == 1
+    assert isinstance(images[0], Image.Image)
+
+    # Test attribute path: result.request_output.images
+    class AttrRequestOutput:
+        def __init__(self):
+            self.request_output = type(
+                "obj", (), {"images": [np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)]}
+            )()
+
+    result = AttrRequestOutput()
+    images = _extract_images_from_result(result)
+    assert len(images) == 1
+    assert isinstance(images[0], Image.Image)
+    assert images[0].size == (32, 32)
diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py
index 6a65f44332..d847a96db6 100644
--- a/vllm_omni/entrypoints/openai/api_server.py
+++ b/vllm_omni/entrypoints/openai/api_server.py
@@ -18,6 +18,7 @@
 from typing import Annotated, Any, Literal, cast
 
 import httpx
+import numpy as np
 import vllm.envs as envs
 from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, Request, UploadFile, WebSocket
 from fastapi.responses import FileResponse, JSONResponse, Response, StreamingResponse
@@ -1767,6 +1768,34 @@ def _update_if_not_none(object: Any, key: str, val: Any) -> None:
         setattr(object, key, val)
 
 
+def _normalize_image(image: Any) -> Any:
+    """Normalize a single image output to a PIL-compatible format."""
+    if isinstance(image, Image.Image):
+        return image
+    if not isinstance(image, np.ndarray):
+        raise ValueError(f"Unsupported image type: {type(image)}")
+    if not np.issubdtype(image.dtype, np.integer) and not np.issubdtype(image.dtype, np.floating):
+        raise ValueError(f"Unsupported dtype: {image.dtype}")
+    if isinstance(image, np.ndarray):
+        while image.ndim > 3:
+            image = image[0]
+        if image.min() < 0:
+            if image.min() < -1.01 or image.max() > 1.01:
+                logger.warning(
+                    f"Image float range [{image.min():.2f}, {image.max():.2f}] outside expected [-1, 1]. "
+                    f"Clipping to [-1, 1] before normalization."
+                )
+            image = np.clip(image, -1.0, 1.0) * 0.5 + 0.5
+        elif image.max() > 1.01:
+            logger.warning(
+                f"Image float range [{image.min():.2f}, {image.max():.2f}] outside expected [0, 1]. "
+                f"Clipping to [0, 1] before normalization."
+            )
+        image = (np.clip(image, 0.0, 1.0) * 255).astype(np.uint8)
+        image = Image.fromarray(image)
+    return image
+
+
 def _extract_images_from_result(result: Any) -> list[Any]:
     images = []
     if hasattr(result, "images") and result.images:
@@ -1777,6 +1806,10 @@ def _extract_images_from_result(result: Any) -> list[Any]:
                 images = request_output["images"]
             elif hasattr(request_output, "images") and request_output.images:
                 images = request_output.images
+    # Handle the case where more than one image is generated
+    if images and isinstance(images[0], np.ndarray) and images[0].shape[0] > 1 and images[0].ndim == 5:
+        # Unwrap batch: (N, T, H, W, C) -> [img1, img2, ...]
+        images = list(images[0])
     # Flatten nested lists (e.g., from layered models like Qwen-Image-Layered).
     # Note: This only flattens one level deep. Deeper nesting is not supported.
     flattened = []
@@ -1785,7 +1818,7 @@ def _extract_images_from_result(result: Any) -> list[Any]:
             flattened.extend(img)
         else:
             flattened.append(img)
-    return flattened
+    return [_normalize_image(img) for img in flattened]
 
 
 async def _load_input_images(

From 02e5dc747d028ab75a136988985b32dc83d33557 Mon Sep 17 00:00:00 2001
From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com>
Date: Tue, 14 Apr 2026 21:58:25 -0400
Subject: [PATCH 172/204] [Perf] VoxCPM2: streaming VAE + compile optimization (45% RTF reduction) (#2758)

Signed-off-by: Yueqian Lin
---
 examples/offline_inference/voxcpm2/end2end.py |  10 +-
 .../online_serving/voxcpm2/gradio_demo.py     | 602 ++++++++++++++++++
 tests/e2e/offline_inference/test_voxcpm2.py   |  10 +-
 vllm_omni/engine/output_processor.py          |   7 +-
 .../models/voxcpm2/minicpm4_paged.py          |  71 +--
 .../models/voxcpm2/voxcpm2_talker.py          |  95 ++-
 6 files changed, 716 insertions(+), 79 deletions(-)
 create mode 100644 examples/online_serving/voxcpm2/gradio_demo.py

diff --git a/examples/offline_inference/voxcpm2/end2end.py b/examples/offline_inference/voxcpm2/end2end.py
index ce404bf962..687e596018 100644
--- a/examples/offline_inference/voxcpm2/end2end.py
+++ b/examples/offline_inference/voxcpm2/end2end.py
@@ -74,16 +74,20 @@ def extract_audio(multimodal_output: dict) -> torch.Tensor:
 
     The output processor concatenates per-step delta tensors under
     ``model_outputs``. Falls back to ``audio`` for backwards compat.
""" - audio = multimodal_output.get("model_outputs") or multimodal_output.get("audio") + audio = multimodal_output.get("model_outputs") + if audio is None: + audio = multimodal_output.get("audio") if audio is None: raise ValueError(f"No audio key in multimodal_output: {list(multimodal_output.keys())}") if isinstance(audio, list): - # Take the last valid tensor (most complete audio) + # Defensive: usually the output processor consolidates into a single + # tensor at request completion, but concatenate here too in case the + # caller consumes intermediate (pre-consolidation) outputs. valid = [torch.as_tensor(a).float().cpu().reshape(-1) for a in audio if a is not None] if not valid: raise ValueError("Audio list is empty or all elements are None.") - return valid[-1] + return torch.cat(valid, dim=0) if len(valid) > 1 else valid[0] return torch.as_tensor(audio).float().cpu().reshape(-1) diff --git a/examples/online_serving/voxcpm2/gradio_demo.py b/examples/online_serving/voxcpm2/gradio_demo.py new file mode 100644 index 0000000000..a33a2d9245 --- /dev/null +++ b/examples/online_serving/voxcpm2/gradio_demo.py @@ -0,0 +1,602 @@ +"""Gradio demo for VoxCPM2 TTS with gapless streaming audio playback. + +Uses a custom AudioWorklet-based player for gap-free streaming +(adapted from the Qwen3-TTS demo). Audio is streamed from the vLLM +server through a same-origin proxy and played via the Web Audio API's +AudioWorklet, which maintains a FIFO buffer queue and plays samples at +the audio clock rate. + +Usage: + # Start the vLLM server first: + python -m vllm_omni.entrypoints.openai.api_server \ + --model openbmb/VoxCPM2 \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm2.yaml \ + --host 0.0.0.0 --port 8000 + + # Then launch the demo: + python gradio_demo.py --api-base http://localhost:8000 +""" + +from __future__ import annotations + +import argparse +import base64 +import io +import json +import logging + +import gradio as gr +import httpx +import numpy as np +import soundfile as sf +from fastapi import FastAPI, Request +from fastapi.responses import Response, StreamingResponse + +logger = logging.getLogger(__name__) + +SAMPLE_RATE = 48000 + +# ── AudioWorklet processor (loaded in browser via Blob URL) ────────── +WORKLET_JS = r""" +class TTSPlaybackProcessor extends AudioWorkletProcessor { + constructor() { + super(); + this.queue = []; + this.buf = null; + this.pos = 0; + this.playing = false; + this.played = 0; + this.port.onmessage = (e) => { + if (e.data && e.data.type === 'clear') { + this.queue = []; this.buf = null; this.pos = 0; this.played = 0; + if (this.playing) { this.playing = false; this.port.postMessage({type:'stopped'}); } + return; + } + this.queue.push(e.data); + }; + } + process(inputs, outputs) { + const out = outputs[0][0]; + for (let i = 0; i < out.length; i++) { + if (!this.buf || this.pos >= this.buf.length) { + if (this.queue.length > 0) { + this.buf = this.queue.shift(); this.pos = 0; + } else { + for (let j = i; j < out.length; j++) out[j] = 0; + if (this.playing) { this.playing = false; this.port.postMessage({type:'stopped', played:this.played}); } + return true; + } + } + out[i] = this.buf[this.pos++] / 32768; + this.played++; + } + if (!this.playing) { this.playing = true; this.port.postMessage({type:'started'}); } + return true; + } +} +registerProcessor('tts-playback-processor', TTSPlaybackProcessor); +""" + +PLAYER_HTML = """ +

+""" + + +def _build_player_js() -> str: + return f""" + +""" + + +def _encode_audio(audio_data: tuple) -> str: + sr, audio_np = audio_data + if audio_np.dtype in (np.float32, np.float64): + audio_np = np.clip(audio_np, -1.0, 1.0) + audio_np = (audio_np * 32767).astype(np.int16) + elif audio_np.dtype != np.int16: + audio_np = audio_np.astype(np.int16) + buf = io.BytesIO() + sf.write(buf, audio_np, sr, format="WAV") + return f"data:audio/wav;base64,{base64.b64encode(buf.getvalue()).decode()}" + + +def create_app(api_base: str): + app = FastAPI() + _pending: dict[str, dict] = {} + + @app.post("/proxy/v1/audio/speech") + async def proxy_speech(request: Request): + body = await request.json() + req_id = body.get("_req_id") + if req_id and req_id in _pending: + body = _pending.pop(req_id) + logger.info("Proxy: %s", {k: (f"<{len(str(v))} chars>" if k == "ref_audio" else v) for k, v in body.items()}) + try: + client = httpx.AsyncClient(timeout=300) + resp = await client.send( + client.build_request( + "POST", + f"{api_base}/v1/audio/speech", + json=body, + headers={"Authorization": "Bearer EMPTY", "Content-Type": "application/json"}, + ), + stream=True, + ) + except Exception as exc: + logger.exception("Proxy connection error") + await client.aclose() + return Response(content=str(exc), status_code=502) + if resp.status_code != 200: + content = await resp.aread() + await resp.aclose() + await client.aclose() + return Response(content=content, status_code=resp.status_code) + + async def relay(): + try: + async for chunk in resp.aiter_bytes(): + yield chunk + finally: + await resp.aclose() + await client.aclose() + + return StreamingResponse(relay(), media_type="application/octet-stream") + + css = """ + #generate-btn button { width: 100%; } + #streaming-player { border: 1px solid var(--border-color-primary) !important; border-radius: var(--block-radius) !important; padding: var(--block-padding) !important; } + """ + theme = gr.themes.Default( + primary_hue=gr.themes.Color( + c50="#f0f5ff", + c100="#dce6f9", + c200="#b8cef3", + c300="#8eb2eb", + c400="#6496e0", + c500="#4A90D9", + c600="#3a7bc8", + c700="#2d66b0", + c800="#1f4f8f", + c900="#163a6e", + c950="#0e2650", + ), + ) + + with gr.Blocks(title="VoxCPM2 TTS Demo") as demo: + gr.HTML(f""" + + """) + + gr.Markdown( + "**Three modes:** " + "**Voice Design** (control instruction only) · " + "**Controllable Cloning** (ref audio + optional style control) · " + "**Ultimate Cloning** (ref audio + transcript for audio continuation)" + ) + + with gr.Row(): + with gr.Column(scale=3): + text_input = gr.Textbox( + label="Target Text", + placeholder="Enter text to synthesize...", + lines=4, + ) + control_instruction = gr.Textbox( + label="Control Instruction (optional)", + placeholder="e.g. A warm young woman / Excited and fast-paced", + lines=2, + info="Describe voice style, emotion, pace. 
Works for both Voice Design and Controllable Cloning.", + ) + + with gr.Accordion("Voice Cloning", open=False): + ref_audio = gr.Audio( + label="Reference Audio (upload for cloning)", + type="numpy", + sources=["upload", "microphone"], + ) + ref_audio_url = gr.Textbox( + label="or Reference Audio URL", + placeholder="https://example.com/reference.wav", + ) + ultimate_clone = gr.Checkbox( + label="Ultimate Cloning Mode", + value=False, + info="Provide transcript of ref audio for audio continuation (disables control instruction)", + ) + prompt_text = gr.Textbox( + label="Reference Audio Transcript", + placeholder="Transcript of your reference audio (for ultimate cloning)", + lines=2, + visible=False, + ) + + with gr.Row(): + stream_checkbox = gr.Checkbox( + label="Stream (gapless)", + value=True, + info="AudioWorklet streaming", + ) + with gr.Row(): + generate_btn = gr.Button( + "Generate Speech", + variant="primary", + size="lg", + elem_id="generate-btn", + scale=3, + ) + reset_btn = gr.Button("Reset", variant="secondary", size="lg", scale=1) + + with gr.Column(scale=2): + player_html = gr.HTML( + value=PLAYER_HTML, + visible=True, + label="streaming player", + elem_id="streaming-player", + ) + audio_output = gr.Audio( + label="generated audio", + interactive=False, + autoplay=True, + visible=False, + ) + gr.Examples( + examples=[ + ["Hello, this is a VoxCPM2 demo running on vLLM-Omni.", ""], + [ + "I have a dream that my four little children will one day live in a nation " + "where they will not be judged by the color of their skin but by the content " + "of their character.", + "", + ], + [ + "I never asked you to stay. It's not like I care or anything. " + "But why does it still hurt so much now that you're gone?", + "A young girl with a soft, sweet voice. Speaks slowly with a melancholic tone.", + ], + ], + inputs=[text_input, control_instruction], + label="examples", + ) + gr.HTML(""" +
+ + vLLM-Omni + +
+ """) + + hidden_payload = gr.Textbox(visible=False, elem_id="tts-payload") + + def on_ultimate_toggle(checked): + return ( + gr.update(visible=checked), # prompt_text + gr.update(interactive=not checked), # control_instruction + ) + + ultimate_clone.change( + fn=on_ultimate_toggle, + inputs=[ultimate_clone], + outputs=[prompt_text, control_instruction], + ) + + def on_stream_change(stream: bool): + if stream: + return gr.update(visible=True), gr.update(visible=False) + return gr.update(visible=False), gr.update(visible=True) + + stream_checkbox.change( + fn=on_stream_change, + inputs=[stream_checkbox], + outputs=[player_html, audio_output], + ) + + def on_reset(): + return "", "", None, "", False, "", PLAYER_HTML + + reset_btn.click( + fn=on_reset, + outputs=[ + text_input, + control_instruction, + audio_output, + hidden_payload, + ultimate_clone, + prompt_text, + player_html, + ], + js="() => { if (window.ttsStop) window.ttsStop(); }", + ) + + def on_generate(stream_enabled, text, ctrl_instr, ref_a, ref_url, ult_clone, p_text): + import time as _time + + if not text or not text.strip(): + raise gr.Error("Please enter text to synthesize.") + + # VoxCPM2 uses "(instruction)text" format for control + ctrl = ctrl_instr.strip() if ctrl_instr and not ult_clone else "" + final_text = f"({ctrl}){text.strip()}" if ctrl else text.strip() + + payload: dict = { + "input": final_text, + "voice": "default", + "response_format": "pcm" if stream_enabled else "wav", + "stream": stream_enabled, + } + + # Reference audio for cloning + ref_url_s = ref_url.strip() if ref_url else "" + if ref_url_s: + payload["ref_audio"] = ref_url_s + elif ref_a is not None: + payload["ref_audio"] = _encode_audio(ref_a) + + # Ultimate cloning: prompt_audio + prompt_text for continuation + if ult_clone and p_text and p_text.strip(): + if ref_url_s: + payload["prompt_audio"] = ref_url_s + elif ref_a is not None: + payload["prompt_audio"] = payload.get("ref_audio", "") + payload["prompt_text"] = p_text.strip() + + if stream_enabled: + if ref_a is not None and not ref_url_s: + req_id = f"req-{int(_time.time() * 1000)}" + _pending[req_id] = payload + browser_payload = {"_req_id": req_id, "_nonce": int(_time.time() * 1000)} + return json.dumps(browser_payload), gr.update() + payload["_nonce"] = int(_time.time() * 1000) + return json.dumps(payload), gr.update() + else: + try: + with httpx.Client(timeout=300.0) as client: + resp = client.post( + f"{api_base}/v1/audio/speech", + json=payload, + headers={"Content-Type": "application/json", "Authorization": "Bearer EMPTY"}, + ) + except httpx.ConnectError: + raise gr.Error(f"Cannot connect to server at {api_base}.") + if resp.status_code != 200: + raise gr.Error(f"Server error ({resp.status_code}): {resp.text[:200]}") + audio_np, sr = sf.read(io.BytesIO(resp.content)) + if audio_np.ndim > 1: + audio_np = audio_np[:, 0] + return "", (sr, audio_np.astype(np.float32)) + + generate_btn.click( + fn=on_generate, + inputs=[ + stream_checkbox, + text_input, + control_instruction, + ref_audio, + ref_audio_url, + ultimate_clone, + prompt_text, + ], + outputs=[hidden_payload, audio_output], + ).then( + fn=lambda p: p, + inputs=[hidden_payload], + outputs=[hidden_payload], + js="(p) => { if (p && p.trim()) { const d = JSON.parse(p); delete d._nonce; window.ttsGenerate(d); } return p; }", + ) + + demo.queue() + + return gr.mount_gradio_app(app, demo, path="/", css=css, theme=theme, head=_build_player_js()) + + +def main(): + parser = argparse.ArgumentParser(description="VoxCPM2 streaming Gradio 
demo") + parser.add_argument("--api-base", default="http://localhost:8000", help="vLLM API server URL") + parser.add_argument("--host", default="0.0.0.0", help="Gradio server host") + parser.add_argument("--port", type=int, default=7860, help="Gradio server port") + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO) + print(f"Connecting to vLLM server at: {args.api_base}") + + import uvicorn + + uvicorn.run(create_app(args.api_base), host=args.host, port=args.port) + + +if __name__ == "__main__": + main() diff --git a/tests/e2e/offline_inference/test_voxcpm2.py b/tests/e2e/offline_inference/test_voxcpm2.py index 4e4f635d5c..6ec4630a45 100644 --- a/tests/e2e/offline_inference/test_voxcpm2.py +++ b/tests/e2e/offline_inference/test_voxcpm2.py @@ -33,14 +33,16 @@ def _extract_audio(multimodal_output: dict) -> torch.Tensor: """Extract the final complete audio tensor from multimodal output.""" assert isinstance(multimodal_output, dict), f"Expected dict, got {type(multimodal_output)}" - # Output processor accumulates per-step full audio under "audio". - audio = multimodal_output.get("audio") or multimodal_output.get("model_outputs") + # Output processor accumulates per-step audio chunks under "audio". + audio = multimodal_output.get("audio") + if audio is None: + audio = multimodal_output.get("model_outputs") assert audio is not None, f"No audio key, got {list(multimodal_output.keys())}" if isinstance(audio, list): - valid = [x for x in audio if isinstance(x, torch.Tensor) and x.numel() > 100] + valid = [torch.as_tensor(x).float().cpu().reshape(-1) for x in audio if x is not None] assert valid, "No valid audio tensors in output list" - audio = valid[-1] + audio = torch.cat(valid, dim=0) if len(valid) > 1 else valid[0] assert isinstance(audio, torch.Tensor), f"Expected Tensor, got {type(audio)}" return audio diff --git a/vllm_omni/engine/output_processor.py b/vllm_omni/engine/output_processor.py index 43d02e85b8..badd799fc9 100644 --- a/vllm_omni/engine/output_processor.py +++ b/vllm_omni/engine/output_processor.py @@ -118,9 +118,10 @@ def _consolidate_multimodal_tensors(self) -> None: if isinstance(v, list) and v and isinstance(v[0], torch.Tensor): try: if k == "audio": - # When the audio tensor shape is inconsistent, torch.cat will fail. - # We need to use torch.cat in -1 dimension. - continue + # Concatenate delta audio chunks (1-D) into the full waveform. + # Each entry is a per-step slice; flatten to -1 so chunks with + # inconsistent leading dims can still be joined on the sample axis. + self.mm_accumulated[k] = torch.cat([t.reshape(-1) for t in v], dim=0) elif k == "sr": # Sample rate is a constant scalar, keep last value. 
self.mm_accumulated[k] = v[-1] diff --git a/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py b/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py index 7ea5bc229d..40bacfff6c 100644 --- a/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py +++ b/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py @@ -308,31 +308,28 @@ def forward( return hidden_states def compile_selective(self) -> list[str]: - """Compile MLP + o_proj; keep RMSNorm/RoPE eager for precision.""" - compiled: list[str] = [] - for i, layer in enumerate(self.layers): - if i in self._compiled_layers: - continue - try: - layer.mlp = torch.compile( - layer.mlp, - mode="default", - fullgraph=True, - ) - layer.self_attn.o_proj = torch.compile( - layer.self_attn.o_proj, - mode="default", - fullgraph=True, - ) - layer.self_attn._fused_qkv_weight = None - self._compiled_layers.add(i) - if i == 0: - compiled.append(f"layers.*.mlp (×{len(self.layers)})") - compiled.append(f"layers.*.self_attn.o_proj (×{len(self.layers)})") - except Exception as e: - logger.warning("compile_selective: layer %d failed: %s", i, e) - break - return compiled + """Compile the full model forward as one graph. + + Earlier versions compiled ``layer.mlp`` + ``layer.self_attn.o_proj`` + (PR #2690) and then the whole ``layer`` (perf/voxcpm2-streaming-vae). + Both still paid one Dynamo dispatch per layer per decode step. + V3 profiling showed 1,332 per-layer dispatches (~28 layers × ~47 + decode steps) costing ~726 ms of CPU self-time for a long prompt. + + Compiling ``forward`` at the model level lets Dynamo unroll the + 28-layer Python loop inside the graph. Graph breaks at + PagedAttention produce sub-graphs but Dynamo memoises the whole + trace once, so the per-step dispatch drops from 28 to just a few. + """ + if self._compiled_layers: + return [] + # Null the fused-qkv caches so the compile sees the real weight layout. + for layer in self.layers: + layer.self_attn._fused_qkv_weight = None + self.forward = torch.compile(self.forward, mode="default", fullgraph=False) + # Mark every layer as compiled so idempotent callers don't double-wrap. + self._compiled_layers.update(range(len(self.layers))) + return ["forward (whole model)"] def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: """Load weights from native checkpoint (base_lm. 
prefix pre-stripped).""" @@ -415,22 +412,14 @@ def forward( return hidden_states def compile_selective(self) -> list[str]: - """Compile MLP + o_proj (same as base_lm).""" - compiled: list[str] = [] - for i, layer in enumerate(self.layers): - if i in self._compiled_layers: - continue - try: - layer.mlp = torch.compile(layer.mlp, mode="default", fullgraph=True) - layer.self_attn.o_proj = torch.compile(layer.self_attn.o_proj, mode="default", fullgraph=True) - layer.self_attn._fused_qkv_weight = None - self._compiled_layers.add(i) - if i == 0: - compiled.append(f"layers.*.mlp (×{len(self.layers)})") - compiled.append(f"layers.*.self_attn.o_proj (×{len(self.layers)})") - except Exception as e: - logger.warning("compile_selective: residual layer %d failed: %s", i, e) - return compiled + """Compile the full residual model forward as one graph (same strategy as base_lm).""" + if self._compiled_layers: + return [] + for layer in self.layers: + layer.self_attn._fused_qkv_weight = None + self.forward = torch.compile(self.forward, mode="default", fullgraph=False) + self._compiled_layers.update(range(len(self.layers))) + return ["forward (whole residual)"] def load_weights_from_native(self, native_residual_lm: nn.Module) -> int: """Load weights from native residual_lm. Returns param count.""" diff --git a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py index 0898ca59ae..94f0658904 100644 --- a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py +++ b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py @@ -11,6 +11,7 @@ from __future__ import annotations import dataclasses +import logging import os import time from collections.abc import Iterable @@ -19,7 +20,6 @@ import librosa import torch import torch.nn as nn -from einops import rearrange from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.models.utils import ( @@ -86,7 +86,11 @@ class _RequestState: curr_prefix_feat_cond: torch.Tensor | None = None last_audio_patch_gpu: torch.Tensor | None = None precomputed_stop_logits: torch.Tensor | None = None - accumulated_patches: list[torch.Tensor] = dataclasses.field(default_factory=list) + # Rolling tail of previously-decoded latents used as VAE receptive-field context. + # Shape (n_pad_frames, feat_dim) on GPU. None before first decode. + decode_pad: torch.Tensor | None = None + # Audio chunks already emitted (CPU float32), concatenated for cumulative output. + audio_chunks: list[torch.Tensor] = dataclasses.field(default_factory=list) decode_step_count: int = 0 request_start_time: float = 0.0 prefill_completed: bool = False @@ -229,11 +233,11 @@ def _optimized_solve_euler( buffers.x_in[b : 2 * b].copy_(x) buffers.mu_in[:b].copy_(mu) buffers.mu_in[b : 2 * b].zero_() - buffers.t_in[:b].fill_(t.item()) - buffers.t_in[b : 2 * b].fill_(t.item()) + # Broadcast the 0-dim GPU scalar directly instead of + # ``.fill_(t.item())`` — ``.item()`` forces a GPU->CPU sync. + buffers.t_in[: 2 * b].copy_(t) if mean_mode: - buffers.dt_in[:b].fill_(dt.item()) - buffers.dt_in[b : 2 * b].fill_(dt.item()) + buffers.dt_in[: 2 * b].copy_(dt) else: buffers.dt_in.zero_() buffers.cond_in[:b].copy_(cond[:b]) @@ -263,9 +267,10 @@ def _optimized_solve_euler( else: buffers.x_in[:b].copy_(x) buffers.mu_in[:b].copy_(mu) - buffers.t_in[:b].fill_(t.item()) + # Broadcast the 0-dim GPU scalar; ``.fill_(t.item())`` would sync. 
+ buffers.t_in[:b].copy_(t) if mean_mode: - buffers.dt_in[:b].fill_(dt.item()) + buffers.dt_in[:b].copy_(dt) else: buffers.dt_in[:b].zero_() buffers.cond_in[:b].copy_(cond[:b]) @@ -320,7 +325,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._inference_timesteps = 10 self._cfg_value = 2.0 self._cfg_cutoff_ratio = 1.0 - self._vae_decode_interval = 5 + # Number of trailing latent frames to keep as VAE receptive-field context + # for sliding-window streaming decode. 12 matches the nanovllm reference + # implementation and covers the longest VAE decoder receptive field. + self._n_decode_pad_frames = 12 self._enable_torch_compile = True self._compile_vae = True self._max_decode_steps = 2000 @@ -686,7 +694,9 @@ def _finish_prefill(self, state: _RequestState, meta: dict, res_out: torch.Tenso state.request_start_time = time.perf_counter() state.prefill_completed = True - logger.info("PREFILL[%s]: patch norm=%.4f", state.request_id, pred_feat.norm().item()) + if logger.isEnabledFor(logging.DEBUG): + # Only compute the norm (which forces a GPU->CPU sync) if we will log it. + logger.debug("PREFILL[%s]: patch norm=%.4f", state.request_id, pred_feat.norm().item()) self._perf.reset() def _finish_decode(self, state: _RequestState, meta: dict, res_out: torch.Tensor, dev: Any): @@ -720,26 +730,54 @@ def _finish_decode(self, state: _RequestState, meta: dict, res_out: torch.Tensor # -------------------- audio collection -------------------- def _collect_audio(self, state: _RequestState) -> torch.Tensor | None: - patch = state.last_audio_patch_gpu - if patch is not None: - state.last_audio_patch_gpu = None - state.accumulated_patches.append(patch.reshape(1, -1).float()) + """Per-step sliding-window VAE decode (nanovllm pattern). - if not state.accumulated_patches: + Each decode step feeds ``[decode_pad, new_patch]`` through the VAE + and slices out only the audio region corresponding to the new patch. + The pad buffer (last ``_n_decode_pad_frames`` latent frames) provides + the receptive-field context needed by the VAE's transposed convolutions, + eliminating boundary artifacts between chunks. + + Returns the delta audio chunk (not cumulative) so the output processor + can stream each chunk to the client independently. + """ + patch = state.last_audio_patch_gpu + if patch is None: return None + state.last_audio_patch_gpu = None + + # patch shape: (patch_size, feat_dim) or (1, patch_size, feat_dim) + new_latent = patch.reshape(-1, self._feat_dim).to(torch.float32) + n_new = new_latent.shape[0] # = patch_size (typically 4) + + self._perf.start("vae_decode") + + # Build VAE input: [pad_frames | new_latent] + if state.decode_pad is not None: + vae_input = torch.cat([state.decode_pad, new_latent], dim=0) + pad_frames = state.decode_pad.shape[0] + else: + vae_input = new_latent + pad_frames = 0 + + # VAE decode: (1, feat_dim, T_frames) -> (1, 1, T_samples) + feat = vae_input.unsqueeze(0).transpose(1, 2).contiguous() + with torch.no_grad(): + audio = self.tts.audio_vae.decode(feat.to(self._device)).reshape(-1) + + # Slice out only the new audio (after the pad region). + # Each latent frame maps to decoder_chunk_size audio samples. + dcs = int(getattr(self.tts.audio_vae, "decode_chunk_size", audio.numel() // vae_input.shape[0])) + new_audio = audio[pad_frames * dcs : (pad_frames + n_new) * dcs].detach().cpu().float() + + # Roll the pad buffer: keep last N latent frames as context for next step. 
+ all_latents = vae_input # [pad + new] + state.decode_pad = all_latents[-self._n_decode_pad_frames :].detach() - n = len(state.accumulated_patches) - if n <= 1 or n % self._vae_decode_interval == 0 or state.is_stopping: - self._perf.start("vae_decode") - all_p = torch.cat(state.accumulated_patches, dim=0) - state.accumulated_patches = [all_p] - feat = rearrange(all_p.reshape(1, -1, self._feat_dim), "b t d -> b d t") - with torch.no_grad(): - audio = self.tts.audio_vae.decode(feat.to(self._device)).reshape(-1).cpu().float() - self._perf.stop("vae_decode") - state.last_decoded_audio = audio - return audio - return state.last_decoded_audio + state.audio_chunks.append(new_audio) + state.last_decoded_audio = new_audio + self._perf.stop("vae_decode") + return new_audio # -------------------- compute_logits -------------------- @@ -830,7 +868,8 @@ def preprocess( state = self._get_or_create_state(req_id) state.prefill_text = "" - state.accumulated_patches = [] + state.decode_pad = None + state.audio_chunks = [] state.prefill_completed = False state.decode_step_count = 0 state.precomputed_stop_logits = None From a782ae47805d9761f446e4e715530af0f54859ab Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Wed, 15 Apr 2026 10:28:28 +0800 Subject: [PATCH 173/204] [Perf] Enhance benchmark script to support baseline thresholds and proved result handling (#2789) --- tests/dfx/perf/scripts/run_benchmark.py | 98 +++++++++++++++++-- .../scripts/test_benchmark_stability.py | 2 + tools/nightly/generate_nightly_perf_excel.py | 49 +++++++--- tools/nightly/generate_nightly_perf_html.py | 45 +++++++-- 4 files changed, 162 insertions(+), 32 deletions(-) diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index b64cc0d950..67dedcd048 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -56,16 +56,41 @@ def omni_server(request): print("OmniServer stopped") +def _safe_filename_token(value: Any | None, *, default: str = "na") -> str: + """Make a single path segment safe for result filenames on common filesystems.""" + if value is None: + return default + s = str(value).strip() + for bad in ("/", "\\", ":", "*", "?", '"', "<", ">", "|"): + s = s.replace(bad, "_") + return s if s else default + + def run_benchmark( args: list, test_name: str, flow, dataset_name: str, num_prompt, + *, + baseline_config: dict[str, Any] | None = None, + sweep_index: int | None = None, + request_rate: Any | None = None, + max_concurrency: Any | None = None, + random_input_len: Any | None = None, + random_output_len: Any | None = None, ) -> Any: - """Run a single benchmark iteration and return the parsed result JSON.""" + """Run a single benchmark iteration and return the parsed result JSON. + + After ``vllm bench`` writes the JSON, ``result["baseline"]`` holds the same + per-metric resolved thresholds as ``assert_result`` (via ``_baseline_thresholds_for_step``). + When ``random_input_len`` / ``random_output_len`` are set, they are also written into the result JSON; + omitted keys when not configured. 
+ """ current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") - result_filename = f"result_{test_name}_{dataset_name}_{flow}_{num_prompt}_{current_dt}.json" + ri = _safe_filename_token(random_input_len) + ro = _safe_filename_token(random_output_len) + result_filename = f"result_{test_name}_{dataset_name}_{flow}_{num_prompt}_in{ri}_out{ro}_{current_dt}.json" if "--result-filename" in args: print(f"The result file will be overwritten by {result_filename}") command = ( @@ -97,8 +122,26 @@ def run_benchmark( else: result_dir = "./" - with open(os.path.join(result_dir, result_filename), encoding="utf-8") as f: + result_path = os.path.join(result_dir, result_filename) + with open(result_path, encoding="utf-8") as f: result = json.load(f) + + if baseline_config: + result["baseline"] = _baseline_thresholds_for_step( + baseline_config, + sweep_index=sweep_index, + request_rate=request_rate, + max_concurrency=max_concurrency, + ) + else: + result["baseline"] = {} + if random_input_len is not None: + result["random_input_len"] = random_input_len + if random_output_len is not None: + result["random_output_len"] = random_output_len + with open(result_path, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + return result @@ -164,10 +207,33 @@ def _resolve_baseline_value( f"or request_rate={request_rate!r}; keys={list(baseline_raw.keys())!r}" ) if isinstance(baseline_raw, (list, tuple)): + if sweep_index is None: + raise ValueError("list baseline requires sweep_index") + if not (0 <= sweep_index < len(baseline_raw)): + raise IndexError(f"baseline list len={len(baseline_raw)} has no index {sweep_index}") return baseline_raw[sweep_index] return baseline_raw +def _baseline_thresholds_for_step( + baseline_data: dict[str, Any], + *, + sweep_index: int | None = None, + max_concurrency: Any = None, + request_rate: Any = None, +) -> dict[str, Any]: + """Resolve ``test.json`` ``baseline`` block to one threshold per metric (same as ``assert_result``).""" + return { + metric_name: _resolve_baseline_value( + baseline_raw, + sweep_index=sweep_index, + max_concurrency=max_concurrency, + request_rate=request_rate, + ) + for metric_name, baseline_raw in baseline_data.items() + } + + def assert_result( result, params, @@ -179,14 +245,14 @@ def assert_result( ) -> None: assert result["completed"] == num_prompt, "Request failures exist" baseline_data = params.get("baseline", {}) - for metric_name, baseline_raw in baseline_data.items(): + thresholds = _baseline_thresholds_for_step( + baseline_data, + sweep_index=sweep_index, + max_concurrency=max_concurrency, + request_rate=request_rate, + ) + for metric_name, baseline_value in thresholds.items(): current_value = result[metric_name] - baseline_value = _resolve_baseline_value( - baseline_raw, - sweep_index=sweep_index, - max_concurrency=max_concurrency, - request_rate=request_rate, - ) if "throughput" in metric_name: if current_value <= baseline_value: print( @@ -258,6 +324,12 @@ def to_list(value, default=None): flow=qps, dataset_name=dataset_name, num_prompt=num_prompt, + baseline_config=params.get("baseline"), + sweep_index=i, + request_rate=qps, + max_concurrency=None, + random_input_len=params.get("random_input_len"), + random_output_len=params.get("random_output_len"), ) assert_result( result, @@ -276,6 +348,12 @@ def to_list(value, default=None): flow=concurrency, dataset_name=dataset_name, num_prompt=num_prompt, + baseline_config=params.get("baseline"), + sweep_index=i, + request_rate=None, + max_concurrency=concurrency, + 
random_input_len=params.get("random_input_len"), + random_output_len=params.get("random_output_len"), ) assert_result( result, diff --git a/tests/dfx/stability/scripts/test_benchmark_stability.py b/tests/dfx/stability/scripts/test_benchmark_stability.py index e8568652d1..a9faae8ab8 100644 --- a/tests/dfx/stability/scripts/test_benchmark_stability.py +++ b/tests/dfx/stability/scripts/test_benchmark_stability.py @@ -112,6 +112,8 @@ def _run_one_benchmark_batch( flow=flow, dataset_name=dataset_name, num_prompt=num_prompts, + random_input_len=params.get("random_input_len"), + random_output_len=params.get("random_output_len"), ) return result except (FileNotFoundError, OSError) as e: diff --git a/tools/nightly/generate_nightly_perf_excel.py b/tools/nightly/generate_nightly_perf_excel.py index 5f9eb428bc..4bb7785317 100644 --- a/tools/nightly/generate_nightly_perf_excel.py +++ b/tools/nightly/generate_nightly_perf_excel.py @@ -319,10 +319,10 @@ def _load_json_file(path: str) -> dict[str, Any] | list[Any] | None: def _parse_from_filename(filename: str) -> dict[str, Any]: - """Parse test-related metadata from a result JSON filename. + """Parse test-related metadata from a ``result_test_*.json`` filename. - Expected pattern (after prefix/suffix stripped): - ____ + Matches ``tests/dfx/perf/scripts/run_benchmark.py`` naming, including optional + ``_in{X}_out{Y}_`` before the timestamp (``na`` when unset). """ name, ext = os.path.splitext(filename) if ext != ".json" or not name.startswith(_RESULT_JSON_PREFIX): @@ -331,22 +331,42 @@ def _parse_from_filename(filename: str) -> dict[str, Any]: core = name[len(_RESULT_JSON_PREFIX) :] parts = core.split("_") if len(parts) < 5: - LOGGER.warning("filename '%s' does not match expected pattern, skip parsing test metadata", filename) + LOGGER.warning( + "filename '%s' does not match expected pattern (need >= 5 segments), skip parsing", + filename, + ) return {} - timestamp = parts[-1] - num_prompts_str = parts[-2] - max_concurrency_str = parts[-3] - dataset_name = parts[-4] - test_name = "_".join(parts[:-4]) if parts[:-4] else "" + idx = len(parts) - 1 + timestamp = parts[idx] + idx -= 1 parsed: dict[str, Any] = {} - if len(timestamp) >= 15: parsed["date"] = timestamp - if dataset_name in DATASET_NAME_ALLOWED: - parsed["dataset_name"] = dataset_name + if idx >= 0 and parts[idx].startswith("out"): + parsed["random_output_len"] = parts[idx][3:] + idx -= 1 + if idx >= 0 and parts[idx].startswith("in"): + parsed["random_input_len"] = parts[idx][2:] + idx -= 1 + + if idx < 3: + LOGGER.warning( + "filename '%s' has too few segments after timestamp / optional in-out (idx=%s)", + filename, + idx, + ) + return parsed + + num_prompts_str = parts[idx] + idx -= 1 + flow_str = parts[idx] + idx -= 1 + dataset_name = parts[idx] + idx -= 1 + test_name = "_".join(parts[: idx + 1]) if idx >= 0 else "" try: parsed["num_prompts"] = int(num_prompts_str) @@ -354,13 +374,16 @@ def _parse_from_filename(filename: str) -> dict[str, Any]: pass try: - parsed["max_concurrency"] = int(max_concurrency_str) + parsed["max_concurrency"] = int(flow_str) except (TypeError, ValueError): pass if test_name: parsed["test_name"] = test_name + if dataset_name in DATASET_NAME_ALLOWED: + parsed["dataset_name"] = dataset_name + return parsed diff --git a/tools/nightly/generate_nightly_perf_html.py b/tools/nightly/generate_nightly_perf_html.py index 05dc48d717..dd5ece7907 100644 --- a/tools/nightly/generate_nightly_perf_html.py +++ b/tools/nightly/generate_nightly_perf_html.py @@ -67,6 +67,7 @@ def 
_load_json_file(path: str) -> dict[str, Any] | None: def _parse_from_filename(filename: str) -> dict[str, Any]: + """Parse ``result_test_*.json`` filenames; same rules as ``generate_nightly_perf_excel``.""" name, ext = os.path.splitext(filename) if ext != ".json" or not name.startswith(_RESULT_JSON_PREFIX): return {} @@ -75,32 +76,58 @@ def _parse_from_filename(filename: str) -> dict[str, Any]: parts = core.split("_") if len(parts) < 5: LOGGER.warning( - "filename '%s' does not match expected pattern, skip parsing test metadata", + "filename '%s' does not match expected pattern (need >= 5 segments), skip parsing", filename, ) return {} - timestamp = parts[-1] - num_prompts_str = parts[-2] - max_concurrency_str = parts[-3] - dataset_name = parts[-4] - test_name = "_".join(parts[:-4]) if parts[:-4] else "" + idx = len(parts) - 1 + timestamp = parts[idx] + idx -= 1 parsed: dict[str, Any] = {} if len(timestamp) >= 15: parsed["date"] = timestamp - if dataset_name in ("random", "random-mm"): - parsed["dataset_name"] = dataset_name + + if idx >= 0 and parts[idx].startswith("out"): + parsed["random_output_len"] = parts[idx][3:] + idx -= 1 + if idx >= 0 and parts[idx].startswith("in"): + parsed["random_input_len"] = parts[idx][2:] + idx -= 1 + + if idx < 3: + LOGGER.warning( + "filename '%s' has too few segments after timestamp / optional in-out (idx=%s)", + filename, + idx, + ) + return parsed + + num_prompts_str = parts[idx] + idx -= 1 + flow_str = parts[idx] + idx -= 1 + dataset_name = parts[idx] + idx -= 1 + test_name = "_".join(parts[: idx + 1]) if idx >= 0 else "" + try: parsed["num_prompts"] = int(num_prompts_str) except (TypeError, ValueError): pass + try: - parsed["max_concurrency"] = int(max_concurrency_str) + parsed["max_concurrency"] = int(flow_str) except (TypeError, ValueError): pass + if test_name: parsed["test_name"] = test_name + + if dataset_name in ("random", "random-mm"): + parsed["dataset_name"] = dataset_name + return parsed From 227bab3038a10ba1bde4c2c9154be428b496a7e3 Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Wed, 15 Apr 2026 11:07:35 +0800 Subject: [PATCH 174/204] [Benchmark]Omni-modality model accuracy benchmark(Daily-Omni & seed-tts-eval) (#2558) Signed-off-by: amy-why-3459 --- pyproject.toml | 11 + .../data_modules/daily_omni_dataset.py | 887 ++++++++++++++++++ .../data_modules/daily_omni_eval.py | 406 ++++++++ .../data_modules/daily_omni_text_audio.py | 255 +++++ .../data_modules/seed_tts_dataset.py | 272 ++++++ .../benchmarks/data_modules/seed_tts_eval.py | 729 ++++++++++++++ vllm_omni/benchmarks/patch/__init__.py | 3 + vllm_omni/benchmarks/patch/patch.py | 332 ++++++- vllm_omni/benchmarks/serve.py | 12 + vllm_omni/entrypoints/cli/benchmark/serve.py | 143 ++- 10 files changed, 3041 insertions(+), 9 deletions(-) create mode 100644 vllm_omni/benchmarks/data_modules/daily_omni_dataset.py create mode 100644 vllm_omni/benchmarks/data_modules/daily_omni_eval.py create mode 100644 vllm_omni/benchmarks/data_modules/daily_omni_text_audio.py create mode 100644 vllm_omni/benchmarks/data_modules/seed_tts_dataset.py create mode 100644 vllm_omni/benchmarks/data_modules/seed_tts_eval.py diff --git a/pyproject.toml b/pyproject.toml index 57a4b474fd..753e0e3981 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,17 @@ demo = [ "gradio>=6.7.0", ] +# Seed-TTS serve benchmark WER (BytedanceSpeech/seed-tts-eval run_wer.py protocol). 
+seed-tts-eval = [ + "jiwer>=3.0.0", + "zhon>=2.0.0", + "zhconv>=1.4.2", + "scipy>=1.10.0", + "soundfile>=0.12.0", + "transformers>=4.36.0", + "funasr>=1.0.0", +] + docs = [ "mkdocs>=1.5.0", "mkdocs-api-autonav", diff --git a/vllm_omni/benchmarks/data_modules/daily_omni_dataset.py b/vllm_omni/benchmarks/data_modules/daily_omni_dataset.py new file mode 100644 index 0000000000..01b86d0fd1 --- /dev/null +++ b/vllm_omni/benchmarks/data_modules/daily_omni_dataset.py @@ -0,0 +1,887 @@ +"""Daily-Omni Dataset loader for benchmark. + +Daily-Omni is an audio-visual reasoning benchmark with 684 videos +and 1,197 multiple-choice QA pairs across 6 major task types. + +Dataset source: https://huggingface.co/datasets/liarliar/Daily-Omni + +Supports loading QA metadata from: +- Local JSON file (``qa_json_path``): recommended for offline/air-gapped environments +- HuggingFace datasets (``dataset_path``): legacy online mode + +The videos must be separately downloaded and extracted from Videos.tar. + +Why ``BenchmarkDataset`` instead of ``HuggingFaceDataset``? + vLLM's ``HuggingFaceDataset`` is a thin wrapper whose ``__init__`` always ends by calling + ``load_data()`` → ``datasets.load_dataset(...)`` with a required Hub id and split. That + contract fits "Hub-only" benches, but Daily-Omni also needs **offline QA metadata** from a + local ``qa.json`` without touching the network. Subclassing ``HuggingFaceDataset`` would + mean fighting the parent constructor (fake ``dataset_path``, reordering ``load_data``, or + duplicating half the parent) and would still imply ``datasets`` is always relevant. + + This class therefore inherits only ``BenchmarkDataset`` (minimal: ``dataset_path``, + ``random_seed``, ``self.data``) and implements **two explicit loaders**: + ``_load_from_local_json`` (default path for air-gapped runs) and ``_load_from_huggingface`` + (optional legacy path for users who prefer ``datasets`` + Hub cache). The latter is **not** + inheritance; it is the same Hub rows as before, factored into a helper so one class can + serve both deployment modes without mandatory ``datasets`` when using ``qa_json_path``. 
+ +Usage: + from vllm_omni.benchmarks.data_modules.daily_omni_dataset import DailyOmniDataset + + # Local JSON mode (recommended) + dataset = DailyOmniDataset( + qa_json_path="/path/to/qa.json", + video_dir="/path/to/Videos", + random_seed=42, + ) + + # HuggingFace mode (legacy, requires network) + dataset = DailyOmniDataset( + dataset_path="liarliar/Daily-Omni", + dataset_split="train", + random_seed=42, + ) + requests = dataset.sample( + tokenizer=tokenizer, + num_requests=100, + output_len=256, + ) +""" + +import base64 +import json +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Literal + +try: + from vllm.benchmarks.datasets import BenchmarkDataset, SampleRequest +except ImportError: + # Fallback: if BenchmarkDataset not available, use base class from same module + from vllm.benchmarks.datasets import HuggingFaceDataset as BenchmarkDataset + from vllm.benchmarks.datasets import SampleRequest +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.hf import get_cached_tokenizer + +try: + from datasets import load_dataset +except ImportError: + load_dataset = None + +logger = logging.getLogger(__name__) + + +class _ListDatasetIterator: + """Simple iterator wrapper around a list to mimic HuggingFace streaming dataset behavior.""" + + def __init__(self, data: list[dict[str, Any]]) -> None: + self._data = data + self._index = 0 + + def __iter__(self): + self._index = 0 + return self + + def __next__(self) -> dict[str, Any]: + if self._index >= len(self._data): + raise StopIteration + item = self._data[self._index] + self._index += 1 + return item + + def __len__(self) -> int: + return len(self._data) + + def __getitem__(self, idx: int | slice) -> dict[str, Any] | list[dict[str, Any]]: + return self._data[idx] + + +# Aligns with Lliar-liar/Daily-Omni CLI ``--input_mode`` (test_model/*/testmodel.py). +DailyOmniInputMode = Literal["all", "visual", "audio"] + +# ``build_conversation()`` in Daily-Omni ``test_model/Qwen2.5-Omni/testmodel.py`` (verbatim). +DAILY_OMNI_SYSTEM_TEXT = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, " + "capable of perceiving auditory and visual inputs, as well as generating text and speech." +) + + +@dataclass +class DailyOmniSampleRequest(SampleRequest): + """``SampleRequest`` with Daily-Omni gold labels for post-run accuracy scoring.""" + + daily_omni_gold_answer: str = "" + daily_omni_video_id: str = "" + daily_omni_task_type: str = "" + #: Official qa.json ``video_duration`` (e.g. ``30s``, ``60s``) for leaderboard-style breakdown. + daily_omni_video_duration: str = "" + #: Official ``video_category`` (YouTube-style category string) for per-category accuracy. + daily_omni_video_category: str = "" + #: Extra JSON fields merged into chat-completions ``extra_body`` (e.g. ``mm_processor_kwargs``). + omni_extra_body: dict[str, Any] | None = None + #: Full OpenAI ``messages`` (system + user) mirroring upstream Daily-Omni conversation. + omni_chat_messages: list[dict[str, Any]] | None = None + #: Used only when ``omni_chat_messages`` is None (non-Daily-Omni-style requests). + omni_chat_mm_position: Literal["first", "last"] = "last" + + +class DailyOmniDataset(BenchmarkDataset): + """Daily-Omni audio-visual QA dataset for benchmarking. + + Inherits ``BenchmarkDataset`` only (not ``HuggingFaceDataset``): see module docstring for why + Hub loading lives in ``_load_from_huggingface`` instead of subclassing the HF base class. 
+ + The dataset includes: + - 684 videos from daily life scenarios (available in Videos.tar) + - 1,197 multiple-choice QA pairs in qa.json + - 6 major task categories + + QA metadata can be loaded from: + - Local JSON file (``qa_json_path``): recommended for offline/air-gapped environments + - HuggingFace datasets (``dataset_path``): legacy online mode + + The videos must be separately downloaded and extracted from Videos.tar. + + Args: + qa_json_path: Path to local qa.json file (offline mode, preferred). When provided, + ``dataset_path`` and ``dataset_split`` are ignored. + dataset_path: HuggingFace dataset path (e.g., "liarliar/Daily-Omni"). Used only if + ``qa_json_path`` is not provided (legacy online mode). + dataset_split: Dataset split to use (default: "train"). Used only in online mode. + random_seed: Random seed for shuffling + video_dir: Directory containing extracted video files (default: None) + input_mode: Which modalities to send, matching upstream Daily-Omni ``--input_mode``: + ``all`` — video + WAV (default; official audio-visual protocol); + ``visual`` — video only; + ``audio`` — extracted WAV only (requires ``{video_id}/{video_id}_audio.wav`` under ``video_dir``). + max_duration_seconds: Reserved for future ffprobe-based filtering; currently **not applied** + when building requests (metadata ``video_duration`` is still passed through for eval). + dataset_subset: Optional HuggingFace subset name (``load_dataset(..., name=...)``); used by bench + ``--hf-subset`` / patch. + no_stream: If True, load the Hub split non-streaming (matches bench ``--no-stream``). + inline_local_video: If True, embed local MP4 as ``data:video/mp4;base64,...`` in requests so + the API server does not need ``--allowed-local-media-path`` (large JSON; use for small runs). + When ``input_mode`` is ``audio`` or ``all``, local WAV is embedded the same way + (``data:audio/wav;base64,...``). + trust_remote_code: Whether to trust remote code when loading HuggingFace dataset + (online mode only). + """ + + SUPPORTED_DATASET_PATHS: set[str] = { + "liarliar/Daily-Omni", + } + #: Default Hub id for synthetic video URLs when ``qa_json_path`` is used (``dataset_path`` None). + DEFAULT_HF_DATASET_ID = "liarliar/Daily-Omni" + IS_MULTIMODAL = True + DEFAULT_OUTPUT_LEN = 256 + + def __init__( + self, + qa_json_path: str | None = None, + dataset_path: str | None = None, + dataset_split: str = "train", + random_seed: int = 0, + video_dir: str | None = None, + input_mode: DailyOmniInputMode = "all", + inline_local_video: bool = False, + trust_remote_code: bool = False, + max_duration_seconds: float | None = None, + dataset_subset: str | None = None, + no_stream: bool = False, + **kwargs, + ) -> None: + if input_mode not in ("all", "visual", "audio"): + raise ValueError(f"input_mode must be 'all', 'visual', or 'audio', got {input_mode!r}") + + # Validate arguments: need either local JSON or HF path + if qa_json_path is None and dataset_path is None: + raise ValueError( + "Either 'qa_json_path' (local JSON) or 'dataset_path' (HuggingFace) must be provided. " + "For offline/air-gapped environments, download qa.json and use qa_json_path." + ) + + # Store configuration + self.qa_json_path = Path(qa_json_path) if qa_json_path else None + self.dataset_path = dataset_path + self.dataset_split = dataset_split + self.dataset_subset = dataset_subset + #: Match vLLM ``HuggingFaceDataset`` / bench CLI ``--no-stream``. 
+ self._hf_streaming = not no_stream + self.video_dir = Path(video_dir) if video_dir else None + self.inline_local_video = inline_local_video + self.input_mode: DailyOmniInputMode = input_mode + self.max_duration_seconds = max_duration_seconds + self.trust_remote_code = trust_remote_code + + #: In-process cache of ffprobe durations only (no disk persistence). + self._video_durations: dict[str, float] = {} + + # Initialize parent BenchmarkDataset + super().__init__( + dataset_path=dataset_path if qa_json_path is None else None, + random_seed=random_seed, + **kwargs, + ) + + # Load data based on mode + self.load_data() + + # Verify dataset info + logger.info( + "Loaded Daily-Omni dataset: mode=%s, source=%s, random_seed=%d, input_mode=%s, max_duration=%s", + "local_json" if self.qa_json_path else "huggingface", + str(self.qa_json_path) if self.qa_json_path else f"{dataset_path}/{dataset_split}", + random_seed, + input_mode, + f"{max_duration_seconds}s" if max_duration_seconds else "unlimited", + ) + + def load_data(self) -> None: + """Populate ``self.data`` from either local JSON or the Hub. + + See module docstring: we do not subclass ``HuggingFaceDataset`` because Daily-Omni needs + a first-class offline path; Hub loading is an optional branch implemented below. + """ + if self.qa_json_path is not None: + self._load_from_local_json() + else: + self._load_from_huggingface() + + def _load_from_local_json(self) -> None: + """Load QA data from local JSON file.""" + if not self.qa_json_path.exists(): + raise FileNotFoundError(f"QA JSON file not found: {self.qa_json_path}") + + with open(self.qa_json_path, encoding="utf-8") as f: + data = json.load(f) + + # Support both list format and dict with "train"/"test" splits + if isinstance(data, dict): + # Try to get the requested split, fallback to first available + split_data = data.get(self.dataset_split) + if split_data is None: + available = list(data.keys()) + if available: + logger.warning( + "Split '%s' not found in %s, using '%s' instead", + self.dataset_split, + self.qa_json_path, + available[0], + ) + split_data = data[available[0]] + else: + split_data = [] + data = split_data + + if not isinstance(data, list): + raise ValueError(f"Expected list of QA items in JSON, got {type(data).__name__}") + + # Shuffle if requested + if not getattr(self, "disable_shuffle", False) and self.random_seed is not None: + import random + + rng = random.Random(self.random_seed) + shuffled = data[:] + rng.shuffle(shuffled) + data = shuffled + + # Create an iterator-like wrapper for compatibility + self.data = _ListDatasetIterator(data) + + def _load_from_huggingface(self) -> None: + """Load QA rows via ``datasets.load_dataset`` (legacy / convenience path). + + Kept for backward compatibility: callers can still pass ``dataset_path=liarliar/Daily-Omni`` + and get the same parquet-backed rows as the Hub dataset card, with streaming (or + non-streaming if ``no_stream=True``) and shuffle. + + This is intentionally **not** implemented by subclassing ``HuggingFaceDataset``: that base + always runs Hub ``load_dataset`` from its constructor and expects a Hub id as the primary + API; Daily-Omni instead chooses the source in ``load_data()`` (JSON vs Hub) while sharing + one ``sample()`` / request-building implementation for both. + """ + if load_dataset is None: + raise ImportError( + "datasets library is required for HuggingFace mode. " + "Install with: pip install datasets, or use local JSON mode instead." 
+ ) + + ds = load_dataset( + self.dataset_path, + name=self.dataset_subset, + split=self.dataset_split, + streaming=self._hf_streaming, + trust_remote_code=self.trust_remote_code, + ) + if not getattr(self, "disable_shuffle", False): + ds = ds.shuffle(seed=self.random_seed) + self.data = ds + + def get_task_statistics(self) -> dict[str, int]: + """Get distribution of task types in the dataset. + + Returns: + Dict mapping task type to count + """ + stats: dict[str, int] = {} + for item in self.data: + row = self._coerce_row(item) + fields = self._normalize_qa_fields(row) + task_type = fields["task_type"] or "unknown" + stats[task_type] = stats.get(task_type, 0) + 1 + return stats + + @staticmethod + def _coerce_row(item: Any) -> dict[str, Any]: + """Turn a dataset row into a plain dict (Arrow / Mapping).""" + if isinstance(item, dict): + return item + if hasattr(item, "as_py"): + return dict(item.as_py()) # pyarrow Row + try: + return dict(item) + except (TypeError, ValueError): + return {k: item[k] for k in item} # type: ignore[misc] + + @staticmethod + def _normalize_qa_fields(row: dict[str, Any]) -> dict[str, Any]: + """Map official Daily-Omni qa.json / Hub schema to internal fields. + + Official fields (see liarliar/Daily-Omni ``qa.json``): ``Question``, ``Choice`` (list), + ``Answer``, ``video_id``, ``Type``, ``video_duration`` (``30s`` / ``60s``), ``video_category``, + plus other category columns. Legacy aliases (lowercase / older loaders) are still accepted. + """ + out: dict[str, Any] = {} + + out["question"] = str(row.get("Question") or row.get("question") or "").strip() + vid = row.get("video_id") if row.get("video_id") is not None else row.get("video") + out["video_id"] = str(vid).strip() if vid is not None else "" + out["task_type"] = str(row.get("Type") or row.get("task_type") or row.get("type") or "").strip() + vc = row.get("video_category") if row.get("video_category") is not None else row.get("videoCategory") + out["video_category"] = str(vc).strip() if vc is not None else "" + vd = row.get("video_duration") if row.get("video_duration") is not None else row.get("videoDuration") + out["video_duration"] = str(vd).strip() if vd is not None else "" + out["answer"] = str(row.get("Answer") or row.get("answer") or "").strip() + vu = row.get("video_url") if row.get("video_url") is not None else row.get("Video_URL") + out["video_url"] = str(vu).strip() if vu is not None and str(vu).strip() else None + + choice = row.get("Choice") + if choice is None: + choice = row.get("options") or row.get("choice") + out["choice"] = choice + + return out + + def sample( + self, + tokenizer: TokenizerLike, + num_requests: int, + output_len: int | None = None, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list[SampleRequest]: + """Sample requests from Daily-Omni dataset. 
+ + Args: + tokenizer: Tokenizer for computing prompt length + num_requests: Number of requests to sample + output_len: Target output length in tokens (default: 256) + request_id_prefix: Prefix for request IDs + no_oversample: If True, do not oversample if fewer examples available + **kwargs: Additional arguments (ignored) + + Returns: + List of SampleRequest objects with video URLs and prompts + """ + if output_len is None: + output_len = self.DEFAULT_OUTPUT_LEN + + sampled_requests: list[SampleRequest] = [] + ind = 0 + cached_tokenizer = get_cached_tokenizer(tokenizer) + + # Iterate over shuffled dataset + for item in self.data: + if len(sampled_requests) >= num_requests: + break + + request = self._create_sample_request( + self._coerce_row(item), cached_tokenizer, output_len, request_id_prefix, ind + ) + if request: + sampled_requests.append(request) + ind += 1 + + logger.info("Created %d sample requests from Daily-Omni dataset", len(sampled_requests)) + + # Handle oversampling if needed + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix, no_oversample) + + return sampled_requests + + def _create_sample_request( + self, + qa_item: dict[str, Any], + tokenizer: TokenizerLike, + output_len: int, + request_id_prefix: str, + index: int, + ) -> SampleRequest | None: + """Create a SampleRequest from a QA item. + + Args: + qa_item: QA pair from the dataset + tokenizer: Tokenizer + output_len: Target output length + request_id_prefix: Prefix for request ID + index: Request index + + Returns: + SampleRequest or None if invalid + """ + fields = self._normalize_qa_fields(qa_item) + video_id = fields["video_id"] + question = fields["question"] + choice = fields["choice"] + task_type = fields["task_type"] + video_url = fields["video_url"] + video_duration = fields.get("video_duration") or "" + video_category = fields.get("video_category") or "" + + if not video_id and not video_url: + logger.warning("Skipping item: no video_id / video_url") + return None + + if not question: + logger.warning("Skipping item: no question found") + return None + + # Official layout after extracting Videos.tar (see Lliar-liar/Daily-Omni test_model): + # {video_base_dir}/{video_id}/{video_id}_video.mp4 + mm_payload, omni_extra, mm_pos = self._compose_daily_omni_multimodal(video_id, video_url) + if not mm_payload: + return None + + messages = self._build_daily_omni_openai_messages(mm_payload, question, choice) + user_text = self._official_daily_omni_user_prompt(question, choice) + # Text-only length estimate (same as before: no MM token count in bench). 
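+        # The media itself travels in ``omni_chat_messages`` below, so ``multi_modal_data`` stays
+        # ``None`` and only the system + user text contributes to ``prompt_len``.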
+ prompt_len = len(tokenizer.encode(f"{DAILY_OMNI_SYSTEM_TEXT}\n{user_text}")) + + return DailyOmniSampleRequest( + prompt=user_text, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=None, + request_id=f"{request_id_prefix}{index}", + daily_omni_gold_answer=fields["answer"], + daily_omni_video_id=video_id, + daily_omni_task_type=task_type, + daily_omni_video_duration=video_duration, + daily_omni_video_category=video_category, + omni_extra_body=omni_extra, + omni_chat_messages=messages, + omni_chat_mm_position=mm_pos, + ) + + @staticmethod + def _official_video_relpath(video_id: str) -> str: + """Relative path inside extracted ``Videos/`` per upstream Daily-Omni scripts.""" + return f"{video_id}/{video_id}_video.mp4" + + @staticmethod + def _official_audio_relpath(video_id: str) -> str: + """Relative path for extracted WAV per upstream ``get_audio_path``.""" + return f"{video_id}/{video_id}_audio.wav" + + def _resolve_local_video_path(self, video_id: str) -> Path | None: + """Pick an existing file under ``video_dir`` (official layout + flat fallback).""" + if not self.video_dir or not video_id: + return None + + candidates = [ + self.video_dir / self._official_video_relpath(video_id), + self.video_dir / f"{video_id}.mp4", # flat layout (custom mirrors / outdated docs) + ] + seen: set[Path] = set() + for p in candidates: + rp = p.resolve() + if rp in seen: + continue + seen.add(rp) + if p.exists(): + return p + return None + + def _resolve_local_audio_path(self, video_id: str) -> Path | None: + """Pick an existing WAV under ``video_dir`` (official layout + flat fallback).""" + if not self.video_dir or not video_id: + return None + candidates = [ + self.video_dir / self._official_audio_relpath(video_id), + self.video_dir / f"{video_id}.wav", + ] + seen: set[Path] = set() + for p in candidates: + rp = p.resolve() + if rp in seen: + continue + seen.add(rp) + if p.exists(): + return p + return None + + def _local_file_to_video_url_payload(self, video_path: Path) -> dict[str, Any]: + """Build OpenAI-style video_url part for a resolved local file. + + vLLM rejects ``file://`` unless the server was started with + ``--allowed-local-media-path`` set to a directory that **contains** the file + (typically the extracted ``Videos`` root). Use ``inline_local_video=True`` to + send base64 data URLs instead (no server path allowlist; larger requests). + """ + path = video_path.expanduser().resolve() + if self.inline_local_video: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + return { + "type": "video_url", + "video_url": {"url": f"data:video/mp4;base64,{b64}"}, + } + return { + "type": "video_url", + "video_url": {"url": path.as_uri()}, + } + + def _local_file_to_audio_url_payload(self, audio_path: Path) -> dict[str, Any]: + """Build OpenAI-style ``audio_url`` part for a resolved local WAV file.""" + path = audio_path.expanduser().resolve() + if self.inline_local_video: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + return { + "type": "audio_url", + "audio_url": {"url": f"data:audio/wav;base64,{b64}"}, + } + return { + "type": "audio_url", + "audio_url": {"url": path.as_uri()}, + } + + def _get_video_content( + self, + video_id: str, + video_url: str | None, + ) -> dict[str, Any] | None: + """Resolve video for OpenAI-style ``video_url`` content. + + Upstream uses ``get_video_path(video_id, base) -> base/video_id/video_id_video.mp4``. 
+ The Hub repo only publishes ``Videos.tar``; use ``--daily-omni-video-dir`` pointing + at the extracted ``Videos`` folder (parent of per-``video_id`` subdirs). + + For ``file://`` URLs, start ``vllm serve`` with e.g. + ``--allowed-local-media-path /same/path/as/daily-omni-video-dir``. + """ + if video_url: + url = video_url + if not url.startswith(("http://", "https://", "file://")): + url = f"https://{url.lstrip('/')}" + return {"type": "video_url", "video_url": {"url": url}} + + if self.video_dir and video_id: + video_path = self._resolve_local_video_path(video_id) + if video_path is not None: + return self._local_file_to_video_url_payload(video_path) + logger.warning( + "Video not found under video_dir=%s for video_id=%r (expected %s or %s)", + self.video_dir, + video_id, + self._official_video_relpath(video_id), + f"{video_id}.mp4", + ) + + if video_id: + repo = self.dataset_path or self.DEFAULT_HF_DATASET_ID + rel = self._official_video_relpath(video_id) + hf_video_url = f"https://huggingface.co/datasets/{repo}/resolve/main/Videos/{rel}" + logger.debug( + "Using HF video URL (likely 404 — Hub ships Videos.tar only): %s", + hf_video_url, + ) + return {"type": "video_url", "video_url": {"url": hf_video_url}} + + logger.error("Could not determine video source for video_id=%r", video_id) + return None + + def _get_audio_content(self, video_id: str) -> dict[str, Any] | None: + """Resolve extracted WAV for OpenAI-style ``audio_url`` (local files only).""" + if not self.video_dir or not video_id: + logger.warning( + "Daily-Omni input_mode %r requires --daily-omni-video-dir with %s", + self.input_mode, + self._official_audio_relpath(video_id), + ) + return None + audio_path = self._resolve_local_audio_path(video_id) + if audio_path is not None: + return self._local_file_to_audio_url_payload(audio_path) + logger.warning( + "Audio not found under video_dir=%s for video_id=%r (expected %s or %s)", + self.video_dir, + video_id, + self._official_audio_relpath(video_id), + f"{video_id}.wav", + ) + return None + + def _compose_daily_omni_multimodal( + self, + video_id: str, + video_url: str | None, + ) -> tuple[dict[str, Any] | list[dict[str, Any]] | None, dict[str, Any] | None, Literal["first", "last"]]: + """Build ``multi_modal_data`` and request extras for the active ``input_mode``. + + Mirrors upstream Daily-Omni: separate video + WAV with ``use_audio_in_video=False``. + """ + extra: dict[str, Any] = {"mm_processor_kwargs": {"use_audio_in_video": False}} + mode = self.input_mode + + if mode == "visual": + v = self._get_video_content(video_id, video_url) + return v, extra, "last" + + if mode == "audio": + a = self._get_audio_content(video_id) + return a, extra, "first" + + v = self._get_video_content(video_id, video_url) + a = self._get_audio_content(video_id) + if not v or not a: + return None, None, "first" + return [v, a], extra, "first" + + @staticmethod + def _media_desc_for_official_prompt(mode: DailyOmniInputMode) -> str: + """``media_desc`` in upstream ``build_conversation``.""" + if mode == "audio": + return "given audio" + if mode == "all": + return "given video and audio together" + return "given video" + + @staticmethod + def _choices_repr_for_official_prompt(choice: Any) -> str: + """Format ``Choice`` from qa.json for the model (one option per line when possible). + + Using ``str(list)`` embeds Python list brackets and quotes, which is poor for MCQ + reading; lists/tuples are joined with newlines instead. 
Other shapes fall back to + ``str(choice)`` for parity with exotic upstream payloads. + """ + if choice is None: + return "" + if isinstance(choice, (list, tuple)): + lines = [str(x).strip() for x in choice if str(x).strip()] + return "\n".join(lines) + if isinstance(choice, dict): + return "\n".join(f"{k}. {v}" for k, v in choice.items()) + return str(choice) + + def _official_daily_omni_user_prompt(self, question: str, choice: Any) -> str: + """User text block from Daily-Omni ``build_conversation`` (after media parts).""" + task_prompt = self._media_desc_for_official_prompt(self.input_mode) + choices = self._choices_repr_for_official_prompt(choice) + # Single f-string with explicit newlines avoids accidental implicit concatenation + # gluing sentences (e.g. ``...media_desc.Select...``) when editing. + return ( + "Your task is to accurately answer multiple-choice questions " + f"based on the {task_prompt}.\n" + "Select the single most accurate answer from the given choices.\n" + f"Question: {question}\n" + f"Choices: {choices}\n" + "Your answer should be a capital letter representing your choice: " + "A, B, C, or D. Don't generate any other text.\n" + ) + + def _build_daily_omni_openai_messages( + self, + mm_payload: dict[str, Any] | list[dict[str, Any]], + question: str, + choice: Any, + ) -> list[dict[str, Any]]: + """Map upstream conversation to OpenAI Chat Completions ``messages`` (video_url / audio_url parts).""" + user_text = self._official_daily_omni_user_prompt(question, choice) + mm_list: list[dict[str, Any]] = mm_payload if isinstance(mm_payload, list) else [mm_payload] + user_content: list[dict[str, Any]] = [*mm_list, {"type": "text", "text": user_text}] + return [ + {"role": "system", "content": [{"type": "text", "text": DAILY_OMNI_SYSTEM_TEXT}]}, + {"role": "user", "content": user_content}, + ] + + def sample_by_task_type( + self, + tokenizer: TokenizerLike, + task_type: str, + num_samples: int, + output_len: int | None = None, + request_id_prefix: str = "", + **kwargs, + ) -> list[SampleRequest]: + """Sample requests filtered by task type. 
+ + Args: + tokenizer: Tokenizer + task_type: Task type to filter by + num_samples: Number of samples + output_len: Target output length + request_id_prefix: Prefix for request IDs + **kwargs: Additional sampling arguments + + Returns: + List of SampleRequest objects matching the task type + """ + if output_len is None: + output_len = self.DEFAULT_OUTPUT_LEN + + filtered = [ + item for item in self.data if self._normalize_qa_fields(self._coerce_row(item))["task_type"] == task_type + ] + + available = len(filtered) + if available < num_samples: + logger.warning( + "Only %d samples available for task type '%s', requested %d", + available, + task_type, + num_samples, + ) + num_samples = available + + sampled_requests: list[SampleRequest] = [] + cached_tokenizer = get_cached_tokenizer(tokenizer) + + for i, item in enumerate(filtered[:num_samples]): + request = self._create_sample_request(item, cached_tokenizer, output_len, request_id_prefix, i) + if request: + sampled_requests.append(request) + + return sampled_requests + + def __repr__(self) -> str: + return ( + f"DailyOmniDataset(" + f"dataset_path={self.dataset_path!r}, " + f"dataset_split={self.dataset_split!r}, " + f"video_dir={self.video_dir!r}, " + f"input_mode={self.input_mode!r}, " + f"inline_local_video={self.inline_local_video!r}, " + f"max_duration_seconds={self.max_duration_seconds}, " + f"random_seed={self.random_seed}" + f")" + ) + + +def load_daily_omni_dataset( + qa_json_path: str | None = None, + dataset_path: str | None = None, + dataset_split: str = "train", + random_seed: int = 0, + video_dir: str | None = None, + input_mode: DailyOmniInputMode = "all", + max_duration_seconds: float | None = None, + dataset_subset: str | None = None, + no_stream: bool = False, + **kwargs, +) -> DailyOmniDataset: + """Convenience function to load Daily-Omni dataset. + + Args: + qa_json_path: Path to local qa.json file (recommended for offline/air-gapped environments). + When provided, ``dataset_path`` is ignored. + dataset_path: HuggingFace dataset path (default: liarliar/Daily-Omni). Used only if + ``qa_json_path`` is not provided (legacy online mode). + dataset_split: Dataset split to use (default: "train") + random_seed: Random seed for shuffling + video_dir: Directory containing extracted ``Videos/`` tree (MP4 and, for ``all``/``audio``, WAV) + input_mode: ``visual`` | ``audio`` | ``all`` (same semantics as upstream Daily-Omni) + max_duration_seconds: Maximum video duration in seconds (e.g., 30 for 30s subset, 60 for 60s subset); + uses ffprobe on local files under ``video_dir`` (in-memory cache only for this process). + **kwargs: Additional arguments passed to DailyOmniDataset + + Returns: + DailyOmniDataset instance + + Example: + >>> from vllm_omni.benchmarks.data_modules.daily_omni_dataset import load_daily_omni_dataset + + # Local JSON mode (recommended for offline) + >>> dataset = load_daily_omni_dataset( + ... qa_json_path="/path/to/qa.json", + ... video_dir="/path/to/Daily-Omni/Videos", + ... random_seed=42, + ... max_duration_seconds=30, + ... ) + + # HuggingFace mode (legacy online) + >>> dataset = load_daily_omni_dataset( + ... dataset_path="liarliar/Daily-Omni", + ... video_dir="/path/to/Daily-Omni/Videos", + ... random_seed=42, + ... 
) + >>> requests = dataset.sample(tokenizer, num_requests=100) + """ + return DailyOmniDataset( + qa_json_path=qa_json_path, + dataset_path=dataset_path, + dataset_split=dataset_split, + random_seed=random_seed, + video_dir=video_dir, + input_mode=input_mode, + max_duration_seconds=max_duration_seconds, + dataset_subset=dataset_subset, + no_stream=no_stream, + **kwargs, + ) + + +def get_daily_omni_statistics( + qa_json_path: str | None = None, + dataset_path: str | None = DailyOmniDataset.DEFAULT_HF_DATASET_ID, + dataset_split: str = "train", +) -> dict[str, Any]: + """Get statistics about the Daily-Omni dataset. + + Args: + qa_json_path: Path to local qa.json file (recommended for offline/air-gapped environments). + When provided, ``dataset_path`` is ignored. + dataset_path: HuggingFace dataset path. Defaults to ``DailyOmniDataset.DEFAULT_HF_DATASET_ID`` + when ``qa_json_path`` is omitted. Pass ``None`` only together with ``qa_json_path``. + dataset_split: Dataset split to use (default: "train") + + Returns: + Statistics dict with task type distribution and other info + + Example: + >>> from vllm_omni.benchmarks.data_modules.daily_omni_dataset import get_daily_omni_statistics + + # Local JSON mode + >>> stats = get_daily_omni_statistics(qa_json_path="/path/to/qa.json") + + # HuggingFace mode + >>> stats = get_daily_omni_statistics(dataset_path="liarliar/Daily-Omni") + >>> print(f"Total QA pairs: {stats['total_qa_pairs']}") + >>> print(f"Task distribution: {stats['task_distribution']}") + """ + dataset = DailyOmniDataset( + qa_json_path=qa_json_path, + dataset_path=dataset_path, + dataset_split=dataset_split, + ) + task_stats = dataset.get_task_statistics() + + source = str(qa_json_path) if qa_json_path else f"{dataset_path}/{dataset_split}" + return { + "source": source, + "total_qa_pairs": len(list(dataset.data)), + "task_distribution": task_stats, + } diff --git a/vllm_omni/benchmarks/data_modules/daily_omni_eval.py b/vllm_omni/benchmarks/data_modules/daily_omni_eval.py new file mode 100644 index 0000000000..ecc9edc844 --- /dev/null +++ b/vllm_omni/benchmarks/data_modules/daily_omni_eval.py @@ -0,0 +1,406 @@ +"""Daily-Omni multiple-choice accuracy scoring for vLLM-Omni bench serve. + +Compares model ``generated_text`` to dataset ``Answer`` (A/B/C/D). + +**Alignment with open-source** (`Lliar-liar/Daily-Omni` ``test_model/.../testmodel.py``): + +- Answer extraction defaults to the same rules as ``extract_choice_letter`` (strip after an + ``assistant`` marker, then leading ``A``–``D``, else first ``\\b[A-D]\\b``). Set env + ``DAILY_OMNI_EXTRACT_MODE=relaxed`` to use the older vLLM-Omni heuristics (last ``answer:``, + tail scan, etc.). +- Overall accuracy comparable to the official script uses **successful HTTP responses only** as + the denominator (their ``valid_questions = total - failed`` excludes inference / I/O skips). + We also report ``daily_omni_accuracy_incl_http_fail`` where each failed request counts as a + wrong answer in the denominator (stricter throughput-bench view). +- **By video length:** mirrors upstream ``--- Accuracy by Video Duration ---`` for ``30s`` / + ``60s`` (``qa.json`` ``video_duration``): ``daily_omni_per_duration*`` metrics and a printed block. +- **By video category:** mirrors ``--- Accuracy by Video Category ---`` using ``video_category`` + from ``qa.json`` (``daily_omni_per_category*``; empty category is bucketed as ``unknown``). 
+- **Correctness:** uses the same ``evaluate_answer`` rule as upstream (truthy extracted letter vs + raw ``Answer`` string, both ``strip().upper()``). Rows with empty ``Answer`` are skipped + (``no_gold``), matching missing-field skips in the official loop. +""" + +from __future__ import annotations + +import os +import re +from typing import Any + +from vllm.benchmarks.lib.endpoint_request_func import RequestFuncOutput + +from vllm_omni.benchmarks.data_modules.daily_omni_dataset import DailyOmniSampleRequest + +_VALID = frozenset("ABCD") + +# Official ``testmodel.py`` buckets (``qa.json`` ``video_duration``). +DAILY_OMNI_DURATION_KEYS: tuple[str, ...] = ("30s", "60s") + + +def extract_choice_letter_official(text: str | None) -> str | None: + """Port of Daily-Omni ``extract_choice_letter`` (first A–D, assistant-tail semantics).""" + if not text: + return None + raw = str(text).strip() + if not raw: + return None + match = re.search(r"assistant\s*([\s\S]*)$", raw, flags=re.IGNORECASE) + candidate = match.group(1).strip() if match else raw + direct = re.match(r"(?i)^\s*([A-D])(?:[\s\.\)::]|$)", candidate) + if direct: + return direct.group(1).upper() + fallback = re.search(r"\b([A-D])\b", candidate.upper()) + if fallback: + return fallback.group(1) + return None + + +def evaluate_answer_official(model_answer: str | None, correct_answer: str) -> bool: + """Port of Daily-Omni ``evaluate_answer`` (strict string match after strip/upper).""" + if not model_answer: + return False + return model_answer.strip().upper() == (correct_answer or "").strip().upper() + + +def normalize_gold_answer(gold: str) -> str | None: + """Best-effort single letter from ``Answer`` (for ``gold_normalized`` in saved items only).""" + g = (gold or "").strip().upper() + if len(g) == 1 and g in _VALID: + return g + m = re.search(r"([ABCD])\b", g) + if m: + return m.group(1).upper() + return None + + +def _extract_predicted_choice_relaxed(text: str) -> str | None: + """Legacy vLLM-Omni heuristics (last ``answer:`` patterns, tail scan).""" + if not text or not str(text).strip(): + return None + t = str(text).strip() + + strong_patterns = [ + r"(?i)\*\*answer\*\*\s*[::]?\s*\(?([ABCD])\)?", + r"(?i)\banswer\s*[::]?\s*\(?([ABCD])\)?", + r"(?i)\bfinal\s+answer\s*[::]?\s*\(?([ABCD])\)?", + r"(?i)\bcorrect\s+(?:answer|option)\s*[::]?\s*\(?([ABCD])\)?", + r"(?i)\bthe\s+(?:correct\s+)?option\s+(?:is|would\s+be)\s*\(?([ABCD])\)?", + r"(?i)\bI\s+(?:would\s+)?(?:choose|select|pick)\s*\(?([ABCD])\)?", + ] + last_letter: str | None = None + for pat in strong_patterns: + for m in re.finditer(pat, t): + last_letter = m.group(1).upper() + if last_letter: + return last_letter + + # Weaker phrases: first match can be spurious; still prefer last occurrence. + weak_patterns = [ + r"(?i)\boption\s*[::]?\s*\(?([ABCD])\)?", + r"(?i)\bchoice\s*[::]?\s*\(?([ABCD])\)?", + ] + for pat in weak_patterns: + for m in re.finditer(pat, t): + last_letter = m.group(1).upper() + if last_letter: + return last_letter + + paren = list(re.finditer(r"\(([ABCD])\)", t)) + if paren: + return paren[-1].group(1).upper() + + # First line sometimes is just "B" or "B." — allow if whole output is short + one_line = t.split("\n", 1)[0].strip() + if len(t) < 120 and len(one_line) <= 6: + m0 = re.match(r"^([ABCD])\s*[.:\)]?\s*$", one_line, re.I) + if m0: + return m0.group(1).upper() + + # Tail-only: avoids matching echoed "A. ..." 
option blocks at the start + tail_len = min(500, len(t)) + tail = t[-tail_len:] + # ``\b`` after the letter avoids "Because"/"Definitely" false positives + m = re.search(r"(?:^|[^\w])([ABCD])\b", tail, re.I) + if m: + return m.group(1).upper() + + return None + + +def extract_predicted_choice(text: str | None) -> str | None: + """Parse model output to A–D (official Daily-Omni rules by default).""" + if not text or not str(text).strip(): + return None + mode = os.environ.get("DAILY_OMNI_EXTRACT_MODE", "official").strip().lower() + if mode in ("relaxed", "heuristic", "legacy"): + return _extract_predicted_choice_relaxed(str(text)) + return extract_choice_letter_official(text) + + +def compute_daily_omni_accuracy_metrics( + input_requests: list[Any], + outputs: list[RequestFuncOutput], + *, + include_per_item: bool = False, +) -> dict[str, Any] | None: + """If all requests are :class:`DailyOmniSampleRequest`, compute accuracy stats. + + Rows with empty ``Answer`` (after strip) are skipped as ``no_gold``, like upstream missing + ``correct_answer``. + + **Denominators:** The open-source script excludes items that hit inference / I/O failures + from ``valid_questions``; we mirror that with ``daily_omni_accuracy`` (= correct / + successful responses). Failed HTTP requests are also tracked and used in + ``daily_omni_accuracy_incl_http_fail`` (each failure counts as incorrect in the + denominator). + """ + if not input_requests or len(input_requests) != len(outputs): + return None + if not all(isinstance(r, DailyOmniSampleRequest) for r in input_requests): + return None + + # total / correct: all rows with gold (incl. HTTP fail in total) + # total_ok / correct_ok: successful HTTP only (GitHub-style per-type denominator) + per_task: dict[str, dict[str, int]] = {} + per_category: dict[str, dict[str, int]] = {} + per_duration: dict[str, dict[str, int]] = { + k: {"correct": 0, "total": 0, "correct_ok": 0, "total_ok": 0} for k in DAILY_OMNI_DURATION_KEYS + } + items: list[dict[str, Any]] = [] + correct = 0 + evaluated = 0 + no_gold = 0 + request_failed = 0 + parse_failed = 0 # success but could not extract A–D + + for req, out in zip(input_requests, outputs, strict=True): + assert isinstance(req, DailyOmniSampleRequest) + gold_raw = (req.daily_omni_gold_answer or "").strip() + gold_norm = normalize_gold_answer(req.daily_omni_gold_answer) + tt = (req.daily_omni_task_type or "unknown").strip() or "unknown" + dur_key = (req.daily_omni_video_duration or "").strip() + dur_active = dur_key in per_duration + cat_key = (req.daily_omni_video_category or "").strip() or "unknown" + if tt not in per_task: + per_task[tt] = {"correct": 0, "total": 0, "correct_ok": 0, "total_ok": 0} + if cat_key not in per_category: + per_category[cat_key] = {"correct": 0, "total": 0, "correct_ok": 0, "total_ok": 0} + + if not gold_raw: + no_gold += 1 + items.append( + { + "request_id": req.request_id, + "skipped": True, + "reason": "no_gold", + "task_type": tt, + "video_id": req.daily_omni_video_id, + "video_duration": dur_key or None, + "video_category": cat_key if cat_key != "unknown" else None, + } + ) + continue + + if not out.success: + request_failed += 1 + evaluated += 1 + per_task[tt]["total"] += 1 + per_category[cat_key]["total"] += 1 + if dur_active: + per_duration[dur_key]["total"] += 1 + # GitHub: failed inference not in valid_questions — do not increment total_ok + items.append( + { + "request_id": req.request_id, + "gold": gold_raw, + "gold_normalized": gold_norm, + "predicted": None, + "correct": False, + "task_type": 
tt, + "video_id": req.daily_omni_video_id, + "video_duration": dur_key or None, + "video_category": cat_key if cat_key != "unknown" else None, + "error": (out.error or "")[:500], + } + ) + continue + + pred = extract_predicted_choice(out.generated_text) + evaluated += 1 + per_task[tt]["total"] += 1 + per_task[tt]["total_ok"] += 1 + per_category[cat_key]["total"] += 1 + per_category[cat_key]["total_ok"] += 1 + if dur_active: + per_duration[dur_key]["total"] += 1 + per_duration[dur_key]["total_ok"] += 1 + if pred is None: + parse_failed += 1 + is_correct = evaluate_answer_official(pred, req.daily_omni_gold_answer) + if is_correct: + correct += 1 + per_task[tt]["correct"] += 1 + per_task[tt]["correct_ok"] += 1 + per_category[cat_key]["correct"] += 1 + per_category[cat_key]["correct_ok"] += 1 + if dur_active: + per_duration[dur_key]["correct"] += 1 + per_duration[dur_key]["correct_ok"] += 1 + + items.append( + { + "request_id": req.request_id, + "gold": gold_raw, + "gold_normalized": gold_norm, + "predicted": pred, + "correct": is_correct, + "parse_failed": pred is None, + "task_type": tt, + "video_id": req.daily_omni_video_id, + "video_duration": dur_key or None, + "video_category": cat_key if cat_key != "unknown" else None, + } + ) + + evaluated_ok = evaluated - request_failed + accuracy_github = (correct / evaluated_ok) if evaluated_ok else None + accuracy_incl_fail = (correct / evaluated) if evaluated else None + + per_task_accuracy: dict[str, float | None] = {} + per_task_accuracy_github: dict[str, float | None] = {} + for name, st in per_task.items(): + tot = st["total"] + per_task_accuracy[name] = (st["correct"] / tot) if tot else None + tok = st["total_ok"] + per_task_accuracy_github[name] = (st["correct_ok"] / tok) if tok else None + + per_category_accuracy: dict[str, float | None] = {} + per_category_accuracy_github: dict[str, float | None] = {} + for name, st in per_category.items(): + tot = st["total"] + per_category_accuracy[name] = (st["correct"] / tot) if tot else None + tok = st["total_ok"] + per_category_accuracy_github[name] = (st["correct_ok"] / tok) if tok else None + + per_duration_accuracy: dict[str, float | None] = {} + per_duration_accuracy_github: dict[str, float | None] = {} + for name, st in per_duration.items(): + tot = st["total"] + per_duration_accuracy[name] = (st["correct"] / tot) if tot else None + tok = st["total_ok"] + per_duration_accuracy_github[name] = (st["correct_ok"] / tok) if tok else None + + out: dict[str, Any] = { + # Comparable to GitHub testmodel.py: correct / successful inferences + "daily_omni_accuracy": accuracy_github, + "daily_omni_accuracy_incl_http_fail": accuracy_incl_fail, + "daily_omni_correct": correct, + "daily_omni_evaluated": evaluated, + "daily_omni_evaluated_ok": evaluated_ok, + "daily_omni_no_gold": no_gold, + "daily_omni_request_failed": request_failed, + "daily_omni_parse_failed": parse_failed, + "daily_omni_per_task": {k: dict(v) for k, v in per_task.items()}, + "daily_omni_per_task_accuracy": per_task_accuracy, + "daily_omni_per_task_accuracy_github_style": per_task_accuracy_github, + "daily_omni_per_category": {k: dict(v) for k, v in per_category.items()}, + "daily_omni_per_category_accuracy": per_category_accuracy, + "daily_omni_per_category_accuracy_github_style": per_category_accuracy_github, + "daily_omni_per_duration": {k: dict(v) for k, v in per_duration.items()}, + "daily_omni_per_duration_accuracy": per_duration_accuracy, + "daily_omni_per_duration_accuracy_github_style": per_duration_accuracy_github, + } + if 
include_per_item: + out["daily_omni_eval_items"] = items + return out + + +def print_daily_omni_accuracy_summary(metrics: dict[str, Any]) -> None: + """Pretty-print accuracy block (stdout).""" + acc = metrics.get("daily_omni_accuracy") + acc_fail = metrics.get("daily_omni_accuracy_incl_http_fail") + if acc is None and acc_fail is None and metrics.get("daily_omni_evaluated", 0) == 0: + return + print("{s:{c}^{n}}".format(s=" Daily-Omni accuracy (MCQ) ", n=50, c="=")) + ok = int(metrics.get("daily_omni_evaluated_ok", 0) or 0) + cor = int(metrics.get("daily_omni_correct", 0) or 0) + if ok > 0 and acc is not None: + print(f"Overall Accuracy: {cor}/{ok} = {acc:.2%}") + elif int(metrics.get("daily_omni_evaluated", 0) or 0) > 0: + print("Overall Accuracy: 0/0 = N/A (no successful HTTP responses)") + print( + "{:<40} {:<10}".format( + "Submitted (gold present):", + metrics.get("daily_omni_evaluated", 0), + ) + ) + print( + "{:<40} {:<10}".format( + "Successful HTTP (GitHub denom.):", + metrics.get("daily_omni_evaluated_ok", 0), + ) + ) + print("{:<40} {:<10}".format("Correct:", metrics.get("daily_omni_correct", 0))) + if acc is not None: + print("{:<40} {:<10.4f}".format("Accuracy (ratio, same as above):", acc)) + if acc_fail is not None and metrics.get("daily_omni_request_failed", 0): + print( + "{:<40} {:<10.4f}".format( + "Accuracy (incl. HTTP as wrong):", + acc_fail, + ) + ) + print("{:<40} {:<10}".format("Skipped (no gold):", metrics.get("daily_omni_no_gold", 0))) + print( + "{:<40} {:<10}".format( + "HTTP failed (excl. from GitHub acc.):", + metrics.get("daily_omni_request_failed", 0), + ) + ) + print( + "{:<40} {:<10}".format( + "Parsed OK but no A–D found:", + metrics.get("daily_omni_parse_failed", 0), + ) + ) + pt = metrics.get("daily_omni_per_task") or {} + pta = metrics.get("daily_omni_per_task_accuracy_github_style") or {} + if pta: + print("\n--- Accuracy by QA Type ---") + for name in sorted(pta.keys()): + a = pta[name] + st = pt.get(name) or {} + tok = int(st.get("total_ok", 0) or 0) + cok = int(st.get("correct_ok", 0) or 0) + if tok and a is not None: + print(f"{name}: {cok}/{tok} = {a:.2%}") + else: + print(f"{name}: 0/0 = N/A") + + pc = metrics.get("daily_omni_per_category") or {} + ptc = metrics.get("daily_omni_per_category_accuracy_github_style") or {} + if ptc: + print("\n--- Accuracy by Video Category ---") + for name in sorted(ptc.keys()): + a = ptc[name] + st = pc.get(name) or {} + tok = int(st.get("total_ok", 0) or 0) + cok = int(st.get("correct_ok", 0) or 0) + if tok and a is not None: + print(f"{name}: {cok}/{tok} = {a:.2%}") + else: + print(f"{name}: 0/0 = N/A") + + pdf = metrics.get("daily_omni_per_duration_accuracy_github_style") or {} + if pdf: + print("\n--- Accuracy by Video Duration ---") + for name in DAILY_OMNI_DURATION_KEYS: + a = pdf.get(name) + st = (metrics.get("daily_omni_per_duration") or {}).get(name) or {} + tok = int(st.get("total_ok", 0) or 0) + cor = int(st.get("correct_ok", 0) or 0) + if tok and a is not None: + print(f"{name} Duration: {cor}/{tok} = {a:.2%}") + else: + print(f"{name} Duration: 0/0 = N/A") + print("=" * 50) diff --git a/vllm_omni/benchmarks/data_modules/daily_omni_text_audio.py b/vllm_omni/benchmarks/data_modules/daily_omni_text_audio.py new file mode 100644 index 0000000000..69fbe026bd --- /dev/null +++ b/vllm_omni/benchmarks/data_modules/daily_omni_text_audio.py @@ -0,0 +1,255 @@ +"""Daily-Omni: optional consistency check between text stream and generated speech. + +The benchmark MCQ accuracy uses ``generated_text`` only. 
When the omni server also +streams ``modality=audio`` (TTS), this module can transcribe the concatenated WAV +with Whisper and compare the inferred option letter to the one parsed from text. + +Requires ``openai-whisper`` (``pip install openai-whisper``). Enable via env +``DAILY_OMNI_TEXT_AUDIO_CONSISTENCY=1`` or CLI ``--daily-omni-text-audio-consistency``. + +Whisper model name defaults to ``tiny`` (override with ``DAILY_OMNI_WHISPER_MODEL``). +""" + +from __future__ import annotations + +import logging +import os +import re +import threading +from typing import Any + +from vllm_omni.benchmarks.data_modules.daily_omni_dataset import DailyOmniSampleRequest +from vllm_omni.benchmarks.data_modules.daily_omni_eval import extract_predicted_choice + +logger = logging.getLogger(__name__) + +_whisper_model = None +_whisper_model_name: str | None = None +_whisper_lock = threading.Lock() + + +def env_text_audio_check_enabled() -> bool: + return os.environ.get("DAILY_OMNI_TEXT_AUDIO_CONSISTENCY", "").lower() in ( + "1", + "true", + "yes", + ) + + +def extract_choice_from_asr_transcript(transcript: str) -> str | None: + """Parse A–D from ASR text; extends :func:`extract_predicted_choice` with spoken Chinese phrases.""" + c = extract_predicted_choice(transcript) + if c: + return c + t = transcript or "" + for pat in ( + r"(?i)选项\s*([ABCD])\b", + r"(?i)选\s*([ABCD])\b", + r"(?i)答案\s*是\s*([ABCD])\b", + r"(?i)答案\s*([ABCD])\b", + ): + m = re.search(pat, t) + if m: + return m.group(1).upper() + return None + + +def _get_whisper_model(model_name: str): + global _whisper_model, _whisper_model_name + with _whisper_lock: + if _whisper_model is None or _whisper_model_name != model_name: + import whisper + + logger.warning( + "Loading Whisper model %r for Daily-Omni text/audio consistency (one-time)...", + model_name, + ) + _whisper_model = whisper.load_model(model_name) + _whisper_model_name = model_name + return _whisper_model + + +def transcribe_wav_bytes( + wav_bytes: bytes, + *, + language: str | None = None, + model_name: str | None = None, +) -> tuple[str | None, str | None]: + """Transcribe WAV bytes. Returns ``(transcript, error)`` — one of them is set. + + Args: + wav_bytes: RIFF WAV file bytes. + language: Optional Whisper language code (e.g. ``en``, ``zh``); improves accuracy/latency. + model_name: Override model id; else ``DAILY_OMNI_WHISPER_MODEL`` or ``tiny``. + """ + if not wav_bytes: + return None, "empty_wav" + if model_name is None or not str(model_name).strip(): + model_name = os.environ.get("DAILY_OMNI_WHISPER_MODEL") or "tiny" + model_name = str(model_name).strip() or "tiny" + path: str | None = None + try: + import tempfile + + model = _get_whisper_model(model_name) + fd, path = tempfile.mkstemp(suffix=".wav") + with os.fdopen(fd, "wb") as fp: + fp.write(wav_bytes) + kwargs: dict = {} + if language: + kwargs["language"] = language + result = model.transcribe(path, **kwargs) + text = (result.get("text") or "").strip() + return (text if text else None), None + except ImportError: + return None, "openai-whisper is not installed (pip install openai-whisper)" + except Exception as e: + return None, str(e)[:500] + finally: + if path: + try: + os.unlink(path) + except OSError: + pass + + +def compute_daily_omni_text_audio_consistency_metrics( + input_requests: list[Any], + outputs: list[Any], + *, + include_per_item: bool = False, +) -> dict[str, Any] | None: + """Compare option letter from ``generated_text`` vs Whisper transcript of output audio. 
+ + Only considers requests where ``outputs[i]`` has ``generated_audio_wav_bytes`` set + (populated by the omni benchmark when TA check is enabled). + """ + if not input_requests or len(input_requests) != len(outputs): + return None + if not all(isinstance(r, DailyOmniSampleRequest) for r in input_requests): + return None + + ta_no_wav = 0 + ta_asr_failed = 0 + ta_text_unparsed = 0 + ta_audio_unparsed = 0 + ta_consistent = 0 + ta_mismatch = 0 + ta_both_parsed = 0 + items: list[dict[str, Any]] = [] + + for req, out in zip(input_requests, outputs, strict=True): + assert isinstance(req, DailyOmniSampleRequest) + rid = req.request_id + if not getattr(out, "success", False): + if include_per_item: + items.append( + { + "request_id": rid, + "skipped": True, + "reason": "request_not_success", + } + ) + continue + + wav = getattr(out, "generated_audio_wav_bytes", None) + if not wav: + ta_no_wav += 1 + if include_per_item: + items.append( + { + "request_id": rid, + "skipped": False, + "reason": "no_output_audio", + "text_choice": extract_predicted_choice(getattr(out, "generated_text", "") or ""), + } + ) + continue + + transcript, asr_err = transcribe_wav_bytes(wav) + if asr_err: + ta_asr_failed += 1 + if include_per_item: + items.append( + { + "request_id": rid, + "asr_error": asr_err, + "text_choice": extract_predicted_choice(getattr(out, "generated_text", "") or ""), + } + ) + continue + + text_choice = extract_predicted_choice(getattr(out, "generated_text", "") or "") + audio_choice = extract_choice_from_asr_transcript(transcript or "") + + if text_choice is None: + ta_text_unparsed += 1 + if audio_choice is None: + ta_audio_unparsed += 1 + + if text_choice is not None and audio_choice is not None: + ta_both_parsed += 1 + if text_choice == audio_choice: + ta_consistent += 1 + else: + ta_mismatch += 1 + + if include_per_item: + consistent: bool | None + if text_choice is None or audio_choice is None: + consistent = None + else: + consistent = text_choice == audio_choice + items.append( + { + "request_id": rid, + "text_choice": text_choice, + "audio_choice": audio_choice, + "asr_transcript": (transcript or "")[:500], + "text_audio_consistent": consistent, + } + ) + + comparable = ta_consistent + ta_mismatch + rate = (ta_consistent / comparable) if comparable else None + + out: dict[str, Any] = { + "daily_omni_ta_enabled": True, + "daily_omni_ta_no_output_audio": ta_no_wav, + "daily_omni_ta_asr_failed": ta_asr_failed, + "daily_omni_ta_text_unparsed": ta_text_unparsed, + "daily_omni_ta_audio_unparsed": ta_audio_unparsed, + "daily_omni_ta_both_parsed": ta_both_parsed, + "daily_omni_ta_consistent": ta_consistent, + "daily_omni_ta_mismatch": ta_mismatch, + "daily_omni_ta_consistency_rate": rate, + } + if include_per_item: + out["daily_omni_ta_items"] = items + return out + + +def print_daily_omni_text_audio_summary(metrics: dict[str, Any]) -> None: + if not metrics.get("daily_omni_ta_enabled"): + return + print("{s:{c}^{n}}".format(s=" Daily-Omni text vs audio (ASR) ", n=50, c="=")) + print("{:<40} {:<10}".format("No output audio captured:", metrics.get("daily_omni_ta_no_output_audio", 0))) + print("{:<40} {:<10}".format("ASR failed:", metrics.get("daily_omni_ta_asr_failed", 0))) + print("{:<40} {:<10}".format("Both text+audio letter parsed:", metrics.get("daily_omni_ta_both_parsed", 0))) + print("{:<40} {:<10}".format("Consistent (same letter):", metrics.get("daily_omni_ta_consistent", 0))) + print("{:<40} {:<10}".format("Mismatch:", metrics.get("daily_omni_ta_mismatch", 0))) + r = 
metrics.get("daily_omni_ta_consistency_rate") + if r is not None: + print("{:<40} {:<10.4f}".format("Consistency rate (of both parsed):", r)) + print( + "{:<40} {:<10}".format( + "Text unparsed (among w/ audio):", + metrics.get("daily_omni_ta_text_unparsed", 0), + ) + ) + print( + "{:<40} {:<10}".format( + "Audio unparsed (among w/ audio):", + metrics.get("daily_omni_ta_audio_unparsed", 0), + ) + ) diff --git a/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py b/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py new file mode 100644 index 0000000000..ca6de4cb20 --- /dev/null +++ b/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py @@ -0,0 +1,272 @@ +"""Seed-TTS zero-shot evaluation-style prompts for ``vllm bench serve``. + +Loads rows from the `meta.lst` format used in `BytedanceSpeech/seed-tts-eval`_ (or any +HuggingFace dataset repo with the same layout):: + + utt_id|prompt_transcript|prompt_wav_relative_path|text_to_synthesize + +Each benchmark request supplies target text plus ``ref_text`` / ``ref_audio`` (Qwen3-TTS ``Base`` / +voice clone), merged into the JSON body. By default ``ref_audio`` is an inline ``data:`` URL so +the server does not need ``--allowed-local-media-path``. Use ``--seed-tts-file-ref-audio`` for +``file://`` (smaller bodies; requires that flag). Use ``--backend openai-audio-speech`` +(``/v1/audio/speech``) or ``--backend openai-chat-omni`` (``/v1/chat/completions`` with the same +fields on the body plus a Qwen3-Omni-style ``system`` message and the target text as ``user`` content). + +.. _BytedanceSpeech/seed-tts-eval: https://github.com/BytedanceSpeech/seed-tts-eval +""" + +from __future__ import annotations + +import base64 +import logging +import random +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from vllm.benchmarks.datasets import BenchmarkDataset, SampleRequest +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.hf import get_cached_tokenizer + +logger = logging.getLogger(__name__) + +# Matches Qwen3-Omni serving examples (``openai_chat_completion_client_for_multimodal_generation`` / +# ``qwen3_omni/gradio_demo``) plus explicit TTS / voice-clone instructions for chat completions. +SEED_TTS_DEFAULT_OMNI_SYSTEM_PROMPT = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, " + "capable of perceiving auditory and visual inputs, as well as generating text and speech.\n" + "For this request you act as a text-to-speech engine with zero-shot voice cloning: " + "the API provides reference audio and its transcript (ref_audio, ref_text) and task_type Base. " + "The user message is the exact text you must speak. " + "Synthesize natural speech in the same language as that user text, " + "matching the timbre, prosody, and speaking style of the reference audio while reading the new content clearly." +) + + +@dataclass +class SeedTTSSampleRequest(SampleRequest): + """``SampleRequest`` with per-row fields merged into ``/v1/audio/speech`` JSON.""" + + #: Shallow-merged into ``RequestFuncInput.extra_body`` (ref_audio, ref_text, task_type, …). + seed_tts_speech_extra: dict[str, Any] | None = None + seed_tts_utterance_id: str = "" + seed_tts_locale: str = "" + #: For ``openai-chat-omni``: becomes the chat ``system`` message (Qwen3-Omni + TTS behavior). + seed_tts_system_prompt: str = "" + #: Local path to reference prompt WAV (for SIM vs. synthesized PCM in ``seed_tts_eval``). 
+ seed_tts_ref_wav_path: str = "" + + +@dataclass +class _SeedTTSRow: + utterance_id: str + ref_text: str + prompt_wav_rel: str + target_text: str + + +def _parse_meta_line(line: str) -> _SeedTTSRow | None: + line = line.strip() + if not line or line.startswith("#"): + return None + parts = line.split("|") + if len(parts) < 4: + logger.warning("Skipping malformed meta.lst line (need 4 '|'-fields): %r", line[:120]) + return None + utt_id, ref_text, wav_rel, target = parts[0], parts[1], parts[2], parts[3] + if not target.strip(): + return None + return _SeedTTSRow( + utterance_id=utt_id.strip(), + ref_text=ref_text.strip(), + prompt_wav_rel=wav_rel.strip(), + target_text=target.strip(), + ) + + +def _load_meta_rows(meta_file: Path) -> list[_SeedTTSRow]: + text = meta_file.read_text(encoding="utf-8") + rows: list[_SeedTTSRow] = [] + for line in text.splitlines(): + r = _parse_meta_line(line) + if r is not None: + rows.append(r) + return rows + + +def resolve_seed_tts_root(dataset_path: str | None, *, explicit_root: str | None) -> Path: + """Return directory containing ``{locale}/meta.lst`` and ``{locale}/prompt-wavs/``.""" + if explicit_root: + root = Path(explicit_root).expanduser().resolve() + if not root.is_dir(): + raise FileNotFoundError(f"--seed-tts-root is not a directory: {root}") + return root + + if not dataset_path: + raise ValueError("Seed-TTS requires --dataset-path (HF repo id or local root) or --seed-tts-root.") + + p = Path(dataset_path).expanduser() + if p.exists() and p.is_dir(): + return p.resolve() + + repo_id = dataset_path.strip() + try: + from huggingface_hub import snapshot_download + except ImportError as e: + raise ImportError( + "Install huggingface_hub to download Seed-TTS from the Hub, or clone the dataset " + "locally and pass --dataset-path / --seed-tts-root to that directory." + ) from e + cache = snapshot_download(repo_id=repo_id, repo_type="dataset") + return Path(cache).resolve() + + +def _ref_audio_payload(wav_path: Path, *, inline: bool) -> str: + if inline: + raw = wav_path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + return f"data:audio/wav;base64,{b64}" + return wav_path.expanduser().resolve().as_uri() + + +class SeedTTSDataset(BenchmarkDataset): + """Seed-TTS-style zero-shot TTS rows for throughput/latency benchmarking. + + Args: + dataset_path: HuggingFace dataset repo id (``org/dataset``) or local directory with + ``en/meta.lst`` (and ``zh/meta.lst`` if using zh). + locale: ``en`` or ``zh`` — which subfolder under the root to read. + inline_ref_audio: If True (default), embed prompt WAV as ``data:audio/wav;base64,...`` + so Qwen3-TTS / ``/v1/audio/speech`` works without server + ``--allowed-local-media-path``. If False, use ``file://`` (smaller + requests; server must set ``--allowed-local-media-path`` to the dataset root). + seed_tts_root: Optional override for the root directory (same layout as HF dataset). + system_prompt: Optional override for the chat system message when using + ``--backend openai-chat-omni``; defaults to :data:`SEED_TTS_DEFAULT_OMNI_SYSTEM_PROMPT`. 
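+
+    Example (illustrative sketch; the dataset root below is a placeholder for a local clone
+    containing ``en/meta.lst`` and ``en/prompt-wavs/``):
+        >>> dataset = SeedTTSDataset(
+        ...     dataset_path="/path/to/seed-tts-eval",
+        ...     locale="en",
+        ...     random_seed=42,
+        ... )
+        >>> requests = dataset.sample(tokenizer, num_requests=16)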
+ """ + + IS_MULTIMODAL = False + DEFAULT_OUTPUT_LEN = 2048 + + def __init__( + self, + dataset_path: str, + random_seed: int = 0, + locale: str = "en", + inline_ref_audio: bool = True, + seed_tts_root: str | None = None, + system_prompt: str | None = None, + disable_shuffle: bool = False, + **kwargs: Any, + ) -> None: + if locale not in ("en", "zh"): + raise ValueError("locale must be 'en' or 'zh'") + self.locale = locale + self.inline_ref_audio = inline_ref_audio + self._explicit_root = seed_tts_root + sp = (system_prompt or "").strip() + self._system_prompt = sp if sp else SEED_TTS_DEFAULT_OMNI_SYSTEM_PROMPT + super().__init__( + dataset_path=dataset_path, + random_seed=random_seed, + disable_shuffle=disable_shuffle, + **kwargs, + ) + self._root = resolve_seed_tts_root(self.dataset_path, explicit_root=self._explicit_root) + self._rows: list[_SeedTTSRow] = [] + self.load_data() + + def load_data(self) -> None: + meta = self._root / self.locale / "meta.lst" + if not meta.is_file(): + raise FileNotFoundError( + f"Seed-TTS meta not found: {meta}. " + f"Expected layout from seed-tts-eval (e.g. {self._root}/{self.locale}/meta.lst)." + ) + self._rows = _load_meta_rows(meta) + if not self._rows: + raise ValueError(f"No valid rows in {meta}") + if not self.disable_shuffle: + rng = random.Random(self.random_seed) + rng.shuffle(self._rows) + self.data = self._rows + logger.info( + "Loaded Seed-TTS: root=%s locale=%s rows=%d inline_ref_audio=%s", + self._root, + self.locale, + len(self._rows), + self.inline_ref_audio, + ) + + def sample( + self, + tokenizer: TokenizerLike, + num_requests: int, + output_len: int | None = None, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs: Any, + ) -> list[SampleRequest]: + if output_len is None: + output_len = self.DEFAULT_OUTPUT_LEN + + tok = get_cached_tokenizer(tokenizer) + out: list[SampleRequest] = [] + for i, row in enumerate(self._rows): + if len(out) >= num_requests: + break + wav_path = (self._root / self.locale / row.prompt_wav_rel).resolve() + if not wav_path.is_file(): + logger.warning("Missing prompt wav for %s: %s", row.utterance_id, wav_path) + continue + + target = row.target_text + prompt_len = len(tok.encode(f"{self._system_prompt}\n{target}")) + lang = "English" if self.locale == "en" else "Chinese" + ref_uri = _ref_audio_payload(wav_path, inline=self.inline_ref_audio) + speech_extra: dict[str, Any] = { + "ref_audio": ref_uri, + "ref_text": row.ref_text, + "task_type": "Base", + "language": lang, + "max_new_tokens": output_len, + } + + out.append( + SeedTTSSampleRequest( + prompt=target, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=None, + request_id=f"{request_id_prefix}{i}", + seed_tts_speech_extra=speech_extra, + seed_tts_utterance_id=row.utterance_id, + seed_tts_locale=self.locale, + seed_tts_system_prompt=self._system_prompt, + seed_tts_ref_wav_path=str(wav_path), + ) + ) + + logger.info("Seed-TTS: built %d requests (asked %d)", len(out), num_requests) + self.maybe_oversample_requests(out, num_requests, request_id_prefix, no_oversample) + return out + + +def load_seed_tts_dataset( + dataset_path: str, + random_seed: int = 0, + locale: str = "en", + inline_ref_audio: bool = True, + seed_tts_root: str | None = None, + system_prompt: str | None = None, + **kwargs: Any, +) -> SeedTTSDataset: + return SeedTTSDataset( + dataset_path=dataset_path, + random_seed=random_seed, + locale=locale, + inline_ref_audio=inline_ref_audio, + seed_tts_root=seed_tts_root, + system_prompt=system_prompt, + 
**kwargs, + ) diff --git a/vllm_omni/benchmarks/data_modules/seed_tts_eval.py b/vllm_omni/benchmarks/data_modules/seed_tts_eval.py new file mode 100644 index 0000000000..d5f1b64709 --- /dev/null +++ b/vllm_omni/benchmarks/data_modules/seed_tts_eval.py @@ -0,0 +1,729 @@ +"""Seed-TTS WER aligned with Bytedance ``seed-tts-eval`` / ``run_wer.py``. + +Matches the published protocol (see Hugging Face dataset card and +https://github.com/BytedanceSpeech/seed-tts-eval): + +- **EN**: ``openai/whisper-large-v3`` via ``transformers``, audio resampled to **16 kHz** + (same as ``run_wer.py``). +- **ZH**: ``funasr`` **paraformer-zh**, hypothesis converted with **zhconv** to zh-cn. +- **WER**: ``jiwer`` after punctuation stripping (``zhon.hanzi.punctuation`` + ``string.punctuation``, + preserving ``'``) and EN lowercasing / ZH per-character spacing. Supports jiwer 3.x + (``compute_measures``) and 4.x (``process_words``). + +- **SIM** (speaker similarity proxy): cosine similarity of L2-normalized mean-pooled **WavLM** + embeddings (reference prompt WAV vs. synthesized PCM), 16 kHz. Official ``cal_sim.sh`` uses + UniSpeech ``verification_pair_list_v2.py`` with a **fine-tuned** WavLM SV checkpoint — set + ``SEED_TTS_WAVLM_MODEL`` to another HF id if you need closer parity. Disable with + ``SEED_TTS_SIM_EVAL=0``. Optional: ``SEED_TTS_SIM_DEVICE`` (e.g. ``cpu``) to avoid GPU + issues when Whisper already uses CUDA; ``SEED_TTS_WAVLM_MIN_SAMPLES`` pads very short + waveforms so the WavLM CNN front-end does not fail. + +- **UTMOS** (predicted MOS from TorchScript): default ``balacoon/utmos`` → ``utmos.jit`` + (Sarulab-style demo export). Uses ``torch`` + ``huggingface_hub`` only. Aggregate metrics + are over **all requests with captured PCM** (independent of ASR/WER). Non-finite scores are + dropped and counted as failures. Override repo/file via ``SEED_TTS_UTMOS_HF_REPO`` / + ``SEED_TTS_UTMOS_JIT_FILE``. **Device**: defaults to **CPU** when ``SEED_TTS_UTMOS_DEVICE`` + is unset; set ``SEED_TTS_UTMOS_DEVICE=cuda:0`` (or ``cuda:1`` etc.) to run on GPU. The JIT + model is loaded directly onto the target device via ``map_location`` to avoid cross-device + issues (some PyTorch builds/Windows have problems moving TorchScript modules after load). + Forward uses **float32** waveform in ``[-1, 1]`` (same as the WER resampled array) so + tensor dtypes match JIT weights; using int16 triggers + ``RuntimeError: input type and weight type should be same`` on common exports. Disable + with ``SEED_TTS_UTMOS_EVAL=0``. + +Enable with ``SEED_TTS_WER_EVAL=1`` or ``--seed-tts-wer-eval``. Install optional deps:: + + pip install 'vllm-omni[seed-tts-eval]' + +Env: ``SEED_TTS_EVAL_DEVICE`` (e.g. ``cuda:0``, ``cpu``); ``SEED_TTS_HF_WHISPER_MODEL`` +defaults to ``openai/whisper-large-v3`` (override for debugging only). 
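+
+Normalization example (illustrative; the strings below are made up, not from the eval set)::
+
+    >>> wer, ref, hyp = process_one_official("hello world", "Hello, world!", "en")
+    >>> wer
+    0.0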
+""" + +from __future__ import annotations + +import io +import logging +import math +import os +import statistics +import string +import tempfile +import threading +import wave +from typing import Any + +import numpy as np +from vllm.benchmarks.datasets import SampleRequest + +from vllm_omni.benchmarks.data_modules.seed_tts_dataset import SeedTTSSampleRequest + +logger = logging.getLogger(__name__) + +# Mirrors seed-tts-eval/run_wer.py +OFFICIAL_WHISPER_HF_ID = "openai/whisper-large-v3" +PARAFORMER_MODEL_ID = "paraformer-zh" + +_lock = threading.Lock() +_device: str | None = None +_en_processor = None +_en_model = None +_zh_paraformer = None +_wavlm_model = None +_wavlm_processor = None +_wavlm_device: str | None = None +_utmos_jit_model = None +_utmos_jit_device: str | None = None +_utmos_jit_load_failed = False +_utmos_forward_warned = False + + +def pcm_s16le_mono_to_wav_bytes(pcm: bytes, *, sample_rate: int = 24000) -> bytes: + buf = io.BytesIO() + with wave.open(buf, "wb") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(sample_rate) + wf.writeframes(pcm) + return buf.getvalue() + + +def _get_eval_device() -> str: + explicit = os.environ.get("SEED_TTS_EVAL_DEVICE", "").strip() + if explicit: + return explicit + try: + import torch + + return "cuda:0" if torch.cuda.is_available() else "cpu" + except ImportError: + return "cpu" + + +def _punctuation_all() -> str: + from zhon.hanzi import punctuation + + return punctuation + string.punctuation + + +def _jiwer_wer(reference: str, hypothesis: str) -> float: + """Word-level WER; strings are normalized like ``run_wer.process_one``. + + jiwer 4.x removed ``compute_measures`` (``ImportError``); fall back to ``process_words``. + """ + try: + from jiwer import compute_measures + + return float(compute_measures(reference, hypothesis)["wer"]) + except ImportError: + import jiwer + + out = jiwer.process_words(reference, hypothesis) + return float(out.wer) + + +def process_one_official(hypo: str, truth: str, lang: str) -> tuple[float, str, str]: + """Same normalization + ``jiwer`` call as ``run_wer.process_one`` (hypo=ASR, truth=reference).""" + raw_truth = truth + raw_hypo = hypo + truth_n = truth + hypo_n = hypo + for x in _punctuation_all(): + if x == "'": + continue + truth_n = truth_n.replace(x, "") + hypo_n = hypo_n.replace(x, "") + truth_n = truth_n.replace(" ", " ") + hypo_n = hypo_n.replace(" ", " ") + if lang == "zh": + truth_n = " ".join([x for x in truth_n]) + hypo_n = " ".join([x for x in hypo_n]) + elif lang == "en": + truth_n = truth_n.lower() + hypo_n = hypo_n.lower() + else: + raise ValueError(f"unsupported lang {lang!r}") + wer = _jiwer_wer(truth_n, hypo_n) + return wer, raw_truth, raw_hypo + + +def _pcm_s16le_to_f32_16k(pcm: bytes, pcm_sample_rate: int = 24000) -> np.ndarray: + import scipy.signal + + if not pcm: + return np.zeros(0, dtype=np.float32) + raw = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0 + target_len = int(len(raw) * 16000 / pcm_sample_rate) + if target_len <= 0: + return np.zeros(0, dtype=np.float32) + return scipy.signal.resample(raw, target_len).astype(np.float32) + + +def _eval_submetric_enabled(env_name: str, *, default: bool = True) -> bool: + raw = os.environ.get(env_name, "").strip().lower() + if raw in ("0", "false", "no", "off"): + return False + if raw in ("1", "true", "yes", "on"): + return True + return default + + +def _audio_path_to_f32_16k(path: str) -> np.ndarray: + import scipy.signal + import soundfile as sf + + data, sr = sf.read(path, dtype="float32", 
always_2d=True) + mono = np.mean(data, axis=1).astype(np.float32) + if int(sr) == 16000: + return mono + target_len = max(1, int(len(mono) * 16000 / int(sr))) + return scipy.signal.resample(mono, target_len).astype(np.float32) + + +def _ensure_wavlm_sim() -> None: + global _wavlm_model, _wavlm_processor, _wavlm_device + with _lock: + if _wavlm_model is not None: + return + from transformers import AutoFeatureExtractor, AutoModel + + mid = os.environ.get("SEED_TTS_WAVLM_MODEL", "microsoft/wavlm-base-plus").strip() or "microsoft/wavlm-base-plus" + _wavlm_device = os.environ.get("SEED_TTS_SIM_DEVICE", "").strip() or _get_eval_device() + logger.warning( + "Loading WavLM %r on %s for Seed-TTS SIM (embedding cosine; not identical to " + "seed-tts-eval UniSpeech SV checkpoint).", + mid, + _wavlm_device, + ) + _wavlm_processor = AutoFeatureExtractor.from_pretrained(mid) + _wavlm_model = AutoModel.from_pretrained(mid).to(_wavlm_device) + _wavlm_model.eval() + + +def _wavlm_prepare_waveform(wav: np.ndarray) -> np.ndarray: + """Trim, pad to a minimum length WavLM/Wav2Vec2 CNN stack accepts, float32 mono.""" + max_sec = float(os.environ.get("SEED_TTS_WAVLM_MAX_SECONDS", "30")) + cap = int(max_sec * 16000) + w = np.asarray(wav, dtype=np.float32).reshape(-1) + if len(w) == 0: + return w + if len(w) > cap: + w = w[:cap].copy() + # Very short clips make the strided conv front-end fail (shape / empty time dim). + min_samples = int(os.environ.get("SEED_TTS_WAVLM_MIN_SAMPLES", "4000")) + if len(w) < min_samples: + w = np.pad(w, (0, min_samples - len(w)), mode="constant") + return w + + +def _wavlm_mean_embedding_f32_16k(wav: np.ndarray) -> np.ndarray | None: + import torch + + _ensure_wavlm_sim() + w = _wavlm_prepare_waveform(wav) + if len(w) == 0: + return None + assert _wavlm_processor is not None and _wavlm_model is not None and _wavlm_device is not None + # Single utterance: avoid padding=True (adds zeros that distort mean pooling). Still pass + # attention_mask when the extractor provides it (sample-level; do not mix with hidden length). 
+ try: + inputs = _wavlm_processor( + w, + sampling_rate=16000, + return_tensors="pt", + padding=False, + return_attention_mask=True, + ) + except TypeError: + inputs = _wavlm_processor( + w, + sampling_rate=16000, + return_tensors="pt", + padding=False, + ) + iv = inputs["input_values"].to(_wavlm_device) + am = inputs.get("attention_mask") + if am is not None: + am = am.to(_wavlm_device) + with torch.inference_mode(): + out = _wavlm_model(iv, attention_mask=am) + h = out.last_hidden_state + v = h.mean(dim=1).squeeze(0).float().cpu().numpy() + n = float(np.linalg.norm(v)) + if not np.isfinite(n) or n < 1e-8: + return None + return (v / n).astype(np.float32) + + +def _cosine_similarity_unit_vectors(a: np.ndarray, b: np.ndarray) -> float: + return float(np.dot(a, b)) + + +def _ensure_utmos_jit_model() -> Any | None: + """Load UTMOS as TorchScript (``balacoon/utmos`` style): no ``import utmos`` / fairseq.""" + global _utmos_jit_model, _utmos_jit_device, _utmos_jit_load_failed + with _lock: + if _utmos_jit_load_failed: + return None + if _utmos_jit_model is not None: + return _utmos_jit_model + try: + import torch + from huggingface_hub import hf_hub_download + + repo = os.environ.get("SEED_TTS_UTMOS_HF_REPO", "balacoon/utmos").strip() or "balacoon/utmos" + fname = os.environ.get("SEED_TTS_UTMOS_JIT_FILE", "utmos.jit").strip() or "utmos.jit" + logger.warning( + "Loading UTMOS TorchScript from Hugging Face %r file %r (one-time download/cache)...", + repo, + fname, + ) + path = hf_hub_download(repo_id=repo, filename=fname, repo_type="model") + + # TODO The model weights in UTMOS must be loaded in cuda:0; otherwise, the model execution will fail. + want = "cuda:0" + if want.startswith("cuda") and torch.cuda.is_available(): + idx = want.split(":")[-1] if ":" in want else "0" + target_dev = f"cuda:{idx}" + else: + target_dev = "cpu" + + try: + m = torch.jit.load(path, map_location=target_dev) + m.eval() + _utmos_jit_device = target_dev + except Exception as load_e: + if target_dev.startswith("cuda"): + logger.warning( + "UTMOS JIT load on %s failed (%s), retrying on CPU...", + target_dev, + load_e, + ) + m = torch.jit.load(path, map_location="cpu") + m.eval() + _utmos_jit_device = "cpu" + else: + raise + _utmos_jit_model = m + except Exception as e: + logger.warning( + "UTMOS JIT unavailable (install torch + huggingface_hub; check HF access): %s", + e, + ) + _utmos_jit_load_failed = True + return None + return _utmos_jit_model + + +def _utmos_predict_f32_16k(wav_f32: np.ndarray) -> float | None: + """MOS from JIT model; input is float32 mono @ 16 kHz in ``[-1, 1]`` (WER pipeline). + + ``balacoon/utmos`` demos sometimes use int16 numpy, but the exported ``.jit`` weights are + float32; passing int16 tensors causes: "RuntimeError: ... input type and weight type + should be same". + """ + import torch + + if len(wav_f32) == 0: + return None + model = _ensure_utmos_jit_model() + if model is None: + return None + # Infer model's device from its first parameter/buffer to guarantee input sits with weights. 
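+    # TorchScript modules may expose no parameters at all (constants/buffers only), so fall back
+    # to buffers() and finally assume CPU.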
+ try: + model_dev = next(model.parameters()).device + except StopIteration: + try: + model_dev = next(model.buffers()).device + except StopIteration: + model_dev = torch.device("cpu") + w = np.ascontiguousarray(wav_f32, dtype=np.float32) + x = torch.from_numpy(w).unsqueeze(0).to(device=model_dev, dtype=torch.float32) + with torch.no_grad(): + out = model(x) + val = float(out.reshape(-1)[0].item()) + if not math.isfinite(val): + return None + return val + + +def _ensure_en_asr() -> None: + global _en_processor, _en_model, _device + with _lock: + if _en_processor is not None: + return + from transformers import WhisperForConditionalGeneration, WhisperProcessor + + _device = _get_eval_device() + mid = os.environ.get("SEED_TTS_HF_WHISPER_MODEL", OFFICIAL_WHISPER_HF_ID).strip() or OFFICIAL_WHISPER_HF_ID + logger.warning( + "Loading Seed-TTS eval Whisper HF model %r on %s (one-time, seed-tts-eval protocol)...", + mid, + _device, + ) + _en_processor = WhisperProcessor.from_pretrained(mid) + _en_model = WhisperForConditionalGeneration.from_pretrained(mid).to(_device) + _en_model.eval() + + +def _ensure_zh_asr() -> None: + global _zh_paraformer, _device + with _lock: + if _zh_paraformer is not None: + return + from funasr import AutoModel + + _device = _get_eval_device() + logger.warning( + "Loading Seed-TTS eval Paraformer %r on %s (one-time, seed-tts-eval protocol)...", + PARAFORMER_MODEL_ID, + _device, + ) + try: + _zh_paraformer = AutoModel(model=PARAFORMER_MODEL_ID, device=_device) + except TypeError: + _zh_paraformer = AutoModel(model=PARAFORMER_MODEL_ID) + + +def _transcribe_en_f32_16k(wav_f32: np.ndarray) -> str: + import torch + + _ensure_en_asr() + if len(wav_f32) == 0: + return "" + with _lock: + assert _en_processor is not None and _en_model is not None and _device is not None + inputs = _en_processor(wav_f32, sampling_rate=16000, return_tensors="pt") + input_features = inputs.input_features.to(_device) + with torch.no_grad(): + try: + forced = _en_processor.get_decoder_prompt_ids(language="english", task="transcribe") + predicted_ids = _en_model.generate(input_features, forced_decoder_ids=forced) + except Exception: + predicted_ids = _en_model.generate( + input_features, + language="english", + task="transcribe", + ) + text = _en_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] + return (text or "").strip() + + +def _transcribe_zh_wav_path(wav_path: str) -> str: + import zhconv + + _ensure_zh_asr() + with _lock: + assert _zh_paraformer is not None + res = _zh_paraformer.generate(input=wav_path, batch_size_s=300) + transcription = res[0]["text"] if res else "" + return zhconv.convert(transcription, "zh-cn").strip() + + +def _missing_deps_message(lang: str) -> str | None: + try: + import jiwer # noqa: F401 + from zhon.hanzi import punctuation # noqa: F401 + except ImportError as e: + return f"Seed-TTS WER eval needs jiwer and zhon ({e!s}). Install: pip install 'vllm-omni[seed-tts-eval]'" + try: + import scipy.signal # noqa: F401 + import soundfile # noqa: F401 + except ImportError as e: + return f"Seed-TTS WER eval needs scipy and soundfile ({e!s})." + if lang == "en": + try: + import torch # noqa: F401 + from transformers import WhisperForConditionalGeneration # noqa: F401 + except ImportError as e: + return f"English WER needs torch and transformers ({e!s}). 
Install: pip install 'vllm-omni[seed-tts-eval]'" + else: + try: + import zhconv # noqa: F401 + from funasr import AutoModel # noqa: F401 + except ImportError as e: + return f"Chinese WER needs funasr and zhconv ({e!s}). Install: pip install 'vllm-omni[seed-tts-eval]'" + return None + + +def compute_seed_tts_wer_metrics( + input_requests: list[SampleRequest], + outputs: list[Any], + *, + include_per_item: bool = False, +) -> dict[str, Any] | None: + """If all requests are :class:`SeedTTSSampleRequest`, run seed-tts-eval-style WER.""" + global _utmos_forward_warned + if not input_requests or len(input_requests) != len(outputs): + return None + if not all(isinstance(r, SeedTTSSampleRequest) for r in input_requests): + return None + + first = input_requests[0] + assert isinstance(first, SeedTTSSampleRequest) + lang = "zh" if (first.seed_tts_locale or "en").lower().startswith("zh") else "en" + + setup_err = _missing_deps_message(lang) + if setup_err: + logger.error("%s", setup_err) + return { + "seed_tts_eval_setup_error": setup_err, + "seed_tts_eval_protocol": "seed-tts-eval", + "seed_tts_content_evaluated": 0, + "seed_tts_content_error_mean": None, + "seed_tts_content_error_median": None, + "seed_tts_request_failed": 0, + "seed_tts_no_pcm": 0, + "seed_tts_asr_failed": 0, + "seed_tts_content_metric": "wer", + } + + import soundfile as sf + + errs: list[float] = [] + items: list[dict[str, Any]] = [] + asr_failed = 0 + no_pcm = 0 + request_failed = 0 + sim_values: list[float] = [] + utmos_values: list[float] = [] + sim_failed = 0 + sim_skipped_no_ref = 0 + utmos_failed = 0 + utmos_on = _eval_submetric_enabled("SEED_TTS_UTMOS_EVAL", default=True) + + for req, out in zip(input_requests, outputs, strict=True): + assert isinstance(req, SeedTTSSampleRequest) + ref = req.prompt + locale = req.seed_tts_locale or "en" + row_lang = "zh" if locale.lower().startswith("zh") else "en" + utmos_v: float | None = None + + if not out.success: + request_failed += 1 + if include_per_item: + items.append( + { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "error": "request_failed", + "detail": (out.error or "")[:500], + } + ) + continue + + pcm = getattr(out, "tts_output_pcm_bytes", None) + if not pcm: + no_pcm += 1 + if include_per_item: + items.append( + { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "error": "no_pcm", + } + ) + continue + + wav_16k = _pcm_s16le_to_f32_16k(pcm) + if len(wav_16k) == 0: + asr_failed += 1 + if include_per_item: + items.append( + { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "error": "empty_audio", + } + ) + continue + + # UTMOS scores synthesized audio only; do not gate on ASR/WER (those can fail independently). + if utmos_on: + try: + utmos_v = _utmos_predict_f32_16k(wav_16k) + if utmos_v is not None: + utmos_values.append(utmos_v) + elif not _utmos_jit_load_failed: + utmos_failed += 1 + except Exception: + if not _utmos_forward_warned: + _utmos_forward_warned = True + logger.warning( + "UTMOS JIT forward failed (first utterance=%s; set logging DEBUG for " + "full trace). 
Check sample rate (16 kHz), input shape, or " + "SEED_TTS_UTMOS_DEVICE.", + req.seed_tts_utterance_id, + exc_info=True, + ) + else: + logger.debug( + "UTMOS forward failed for %s", + req.seed_tts_utterance_id, + exc_info=True, + ) + utmos_failed += 1 + + try: + if row_lang == "en": + hyp = _transcribe_en_f32_16k(wav_16k) + else: + fd, tmp_wav = tempfile.mkstemp(suffix=".wav") + os.close(fd) + try: + sf.write(tmp_wav, wav_16k, 16000, subtype="PCM_16") + hyp = _transcribe_zh_wav_path(tmp_wav) + finally: + try: + os.unlink(tmp_wav) + except OSError: + pass + except Exception as e: + logger.exception("Seed-TTS ASR failed for %s", req.seed_tts_utterance_id) + asr_failed += 1 + if include_per_item: + items.append( + { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "error": "asr_exception", + "detail": str(e)[:500], + } + ) + continue + + if not hyp: + asr_failed += 1 + if include_per_item: + items.append( + { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "error": "empty_asr", + } + ) + continue + + try: + wer, raw_truth, raw_hypo = process_one_official(hyp, ref, row_lang) + except Exception as e: + logger.warning("jiwer/normalize failed for %s: %s", req.seed_tts_utterance_id, e) + asr_failed += 1 + if include_per_item: + items.append( + { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "error": "wer_compute_failed", + "detail": str(e)[:500], + } + ) + continue + + errs.append(wer) + sim_v: float | None = None + + if _eval_submetric_enabled("SEED_TTS_SIM_EVAL", default=True): + ref_path = getattr(req, "seed_tts_ref_wav_path", "") or "" + if ref_path and os.path.isfile(ref_path): + try: + ref_wav = _audio_path_to_f32_16k(ref_path) + e_ref = _wavlm_mean_embedding_f32_16k(ref_wav) + e_hyp = _wavlm_mean_embedding_f32_16k(wav_16k) + if e_ref is not None and e_hyp is not None: + sim_v = _cosine_similarity_unit_vectors(e_ref, e_hyp) + sim_values.append(sim_v) + except Exception as e: + logger.warning( + "SIM embedding failed for utterance=%s: %s: %s", + req.seed_tts_utterance_id, + type(e).__name__, + e, + ) + sim_failed += 1 + else: + sim_skipped_no_ref += 1 + + if include_per_item: + row: dict[str, Any] = { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "wer": wer, + "reference_raw": raw_truth, + "asr_raw": raw_hypo, + } + if sim_v is not None: + row["sim"] = sim_v + if utmos_v is not None: + row["utmos"] = utmos_v + items.append(row) + + result: dict[str, Any] = { + "seed_tts_eval_protocol": "seed-tts-eval", + "seed_tts_content_evaluated": len(errs), + "seed_tts_content_error_mean": statistics.fmean(errs) if errs else None, + "seed_tts_content_error_median": statistics.median(errs) if errs else None, + "seed_tts_request_failed": request_failed, + "seed_tts_no_pcm": no_pcm, + "seed_tts_asr_failed": asr_failed, + "seed_tts_content_metric": "wer", + "seed_tts_sim_evaluated": len(sim_values), + "seed_tts_sim_mean": statistics.fmean(sim_values) if sim_values else None, + "seed_tts_sim_median": statistics.median(sim_values) if sim_values else None, + "seed_tts_sim_failed": sim_failed, + "seed_tts_sim_skipped_no_ref": sim_skipped_no_ref, + "seed_tts_utmos_evaluated": len(utmos_values), + "seed_tts_utmos_mean": statistics.fmean(utmos_values) if utmos_values else None, + "seed_tts_utmos_median": statistics.median(utmos_values) if utmos_values else None, + "seed_tts_utmos_failed": utmos_failed, + } + if include_per_item: + result["seed_tts_wer_eval_items"] = items + return result + + +def print_seed_tts_wer_summary(metrics: 
dict[str, Any]) -> None: + setup = metrics.get("seed_tts_eval_setup_error") + if setup: + print("{s:{c}^{n}}".format(s=" Seed-TTS eval (seed-tts-eval protocol) ", n=50, c="=")) + print(setup) + return + + ev = int(metrics.get("seed_tts_content_evaluated", 0) or 0) + rf = int(metrics.get("seed_tts_request_failed", 0) or 0) + npc = int(metrics.get("seed_tts_no_pcm", 0) or 0) + af = int(metrics.get("seed_tts_asr_failed", 0) or 0) + sim_ev = int(metrics.get("seed_tts_sim_evaluated", 0) or 0) + ut_ev = int(metrics.get("seed_tts_utmos_evaluated", 0) or 0) + if ev == 0 and rf == 0 and npc == 0 and af == 0 and sim_ev == 0 and ut_ev == 0: + return + print("{s:{c}^{n}}".format(s=" Seed-TTS eval (seed-tts-eval protocol) ", n=50, c="=")) + print("{:<40} {:<10}".format("Evaluated (WER, lower is better):", ev)) + mean = metrics.get("seed_tts_content_error_mean") + if mean is not None: + print("{:<40} {:<10.4f}".format("Mean WER:", float(mean))) + med = metrics.get("seed_tts_content_error_median") + if med is not None: + print("{:<40} {:<10.4f}".format("Median WER:", float(med))) + print("{:<40} {:<10}".format("Request failed:", metrics.get("seed_tts_request_failed", 0))) + print("{:<40} {:<10}".format("No PCM captured:", metrics.get("seed_tts_no_pcm", 0))) + print("{:<40} {:<10}".format("ASR / WER failed:", metrics.get("seed_tts_asr_failed", 0))) + if sim_ev or metrics.get("seed_tts_sim_skipped_no_ref") or metrics.get("seed_tts_sim_failed"): + print("{:<40} {:<10}".format("SIM evaluated (higher ~ closer):", sim_ev)) + sm = metrics.get("seed_tts_sim_mean") + if sm is not None: + print("{:<40} {:<10.4f}".format("Mean SIM:", float(sm))) + s_med = metrics.get("seed_tts_sim_median") + if s_med is not None: + print("{:<40} {:<10.4f}".format("Median SIM:", float(s_med))) + print("{:<40} {:<10}".format("SIM skipped (no ref path):", metrics.get("seed_tts_sim_skipped_no_ref", 0))) + print("{:<40} {:<10}".format("SIM embedding errors:", metrics.get("seed_tts_sim_failed", 0))) + if ut_ev or metrics.get("seed_tts_utmos_failed"): + print("{:<40} {:<10}".format("UTMOS evaluated (JIT MOS, higher better):", ut_ev)) + um = metrics.get("seed_tts_utmos_mean") + if um is not None: + print("{:<40} {:<10.4f}".format("Mean UTMOS:", float(um))) + u_med = metrics.get("seed_tts_utmos_median") + if u_med is not None: + print("{:<40} {:<10.4f}".format("Median UTMOS:", float(u_med))) + print("{:<40} {:<10}".format("UTMOS errors:", metrics.get("seed_tts_utmos_failed", 0))) + print("=" * 50) diff --git a/vllm_omni/benchmarks/patch/__init__.py b/vllm_omni/benchmarks/patch/__init__.py index e69de29bb2..ca6b41ba8f 100644 --- a/vllm_omni/benchmarks/patch/__init__.py +++ b/vllm_omni/benchmarks/patch/__init__.py @@ -0,0 +1,3 @@ +"""Omni benchmark monkey-patches (side effects in ``patch.patch``).""" + +from . 
import patch as _patch_module # noqa: F401 diff --git a/vllm_omni/benchmarks/patch/patch.py b/vllm_omni/benchmarks/patch/patch.py index 17d7498ba2..41aed09423 100644 --- a/vllm_omni/benchmarks/patch/patch.py +++ b/vllm_omni/benchmarks/patch/patch.py @@ -6,6 +6,7 @@ import os import random import ssl +import sys import time import traceback from collections.abc import Iterable @@ -33,15 +34,245 @@ from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) + +from vllm_omni.benchmarks.data_modules.daily_omni_dataset import DailyOmniDataset, DailyOmniSampleRequest from vllm_omni.benchmarks.data_modules.random_multi_modal_dataset import OmniRandomMultiModalDataset +from vllm_omni.benchmarks.data_modules.seed_tts_dataset import ( + SEED_TTS_DEFAULT_OMNI_SYSTEM_PROMPT, + SeedTTSDataset, + SeedTTSSampleRequest, +) get_samples_old = datasets.get_samples +_DEFAULT_DAILY_OMNI_REPO = "liarliar/Daily-Omni" + + +def _seed_tts_capture_pcm_for_wer() -> bool: + return os.environ.get("SEED_TTS_WER_EVAL", "").lower() in ( + "1", + "true", + "yes", + ) + + +def _merge_extra_body_mm_kwargs(base: dict | None, overlay: dict | None) -> dict | None: + """Shallow-merge ``extra_body`` dicts; deep-merge ``mm_processor_kwargs`` if both set.""" + if not base and not overlay: + return None + out = dict(base or {}) + if not overlay: + return out + for k, v in overlay.items(): + if k == "mm_processor_kwargs" and isinstance(v, dict): + prev = out.get("mm_processor_kwargs") + merged_kw = {**(prev if isinstance(prev, dict) else {}), **v} + out["mm_processor_kwargs"] = merged_kw + else: + out[k] = v + return out + + +def _attach_daily_omni_to_request_func_input(sample: SampleRequest, rfi: RequestFuncInput) -> None: + """Apply per-request OpenAI fields (``mm_processor_kwargs``, messages) for Daily-Omni.""" + if not isinstance(sample, DailyOmniSampleRequest): + return + rfi.extra_body = _merge_extra_body_mm_kwargs(rfi.extra_body, sample.omni_extra_body) + if sample.omni_chat_messages is not None: + setattr(rfi, "omni_chat_messages", sample.omni_chat_messages) + else: + setattr(rfi, "mm_position", sample.omni_chat_mm_position) + + +def _attach_seed_tts_to_request_func_input(sample: SampleRequest, rfi: RequestFuncInput) -> None: + """Merge Seed-TTS per-row TTS fields (ref_audio, ref_text, task_type, …) into ``extra_body``. + + Used by both ``/v1/audio/speech`` and ``/v1/chat/completions`` (flattened into JSON body). + For ``openai-chat-omni``, also sets ``omni_chat_messages`` (system + user) so Qwen3-Omni + follows the same role layout as official TTS / multimodal demos. ``/v1/audio/speech`` ignores + ``messages`` and only uses ``input`` + body fields. + Flags ``openai-chat-omni`` to request audio output and optionally export PCM for WER. + """ + if not isinstance(sample, SeedTTSSampleRequest): + return + ex = sample.seed_tts_speech_extra + if not ex: + return + base = dict(rfi.extra_body) if rfi.extra_body else {} + base.update(ex) + rfi.extra_body = base + # Used by request funcs to force streaming TTS behavior and to export PCM when WER is on. + setattr(rfi, "seed_tts_row", True) + sys_prompt = (sample.seed_tts_system_prompt or "").strip() or SEED_TTS_DEFAULT_OMNI_SYSTEM_PROMPT + setattr( + rfi, + "omni_chat_messages", + [ + {"role": "system", "content": [{"type": "text", "text": sys_prompt}]}, + {"role": "user", "content": [{"type": "text", "text": sample.prompt}]}, + ], + ) + + +def _daily_omni_repo_from_args(args) -> str | None: + """Resolve HuggingFace repo id for Daily-Omni from CLI args. 
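+
+    Returns ``None`` when neither field matches ``DailyOmniDataset.SUPPORTED_DATASET_PATHS``.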
+ + vLLM allows ``--dataset-path`` to be a local path while the real HF id is + passed via ``--hf-name``. Upstream ``get_samples`` for ``hf`` only matches + a fixed elif-chain and never discovers Omni's loader, so we must detect + Daily-Omni here using either field. + """ + dp = getattr(args, "dataset_path", None) + hn = getattr(args, "hf_name", None) + if dp in DailyOmniDataset.SUPPORTED_DATASET_PATHS: + return dp + if hn in DailyOmniDataset.SUPPORTED_DATASET_PATHS: + return hn + return None + def get_samples(args, tokenizer): - if args.backend not in ["openai-chat-omni", "openai-audio-speech"]: + # Daily-Omni: explicit dataset name, or hf + matching path/hf-name + is_daily_omni = args.dataset_name == "daily-omni" or ( + args.dataset_name == "hf" and _daily_omni_repo_from_args(args) is not None + ) + is_seed_tts = args.dataset_name == "seed-tts" + + # Check if we need to handle omni-related backends/datasets + is_omni_backend = args.backend in ["openai-chat-omni", "openai-audio-speech", "daily-omni"] + is_omni_dataset = is_daily_omni or is_seed_tts or args.dataset_name == "random-mm" + + if not is_omni_backend and not is_omni_dataset: + # Not an omni-related request, delegate to original implementation return get_samples_old(args, tokenizer) - elif args.dataset_name == "random-mm": + + # Handle Daily-Omni dataset + if is_daily_omni: + # Support: + # --dataset-name daily-omni [--dataset-path liarliar/Daily-Omni] + # --dataset-name daily-omni --daily-omni-qa-json /path/to/qa.json (offline QA) + # --dataset-name hf --dataset-path liarliar/Daily-Omni + # --dataset-name hf --hf-name liarliar/Daily-Omni (dataset-path may be local) + + # Validate backend supports multimodal (video) + if args.backend not in ["openai-chat-omni", "daily-omni"]: + raise ValueError( + f"Daily-Omni dataset requires a multimodal backend that supports video. " + f"Got backend='{args.backend}'. Please use '--backend openai-chat-omni'" + ) + + # Determine video directory if specified (for local video files) + video_dir = getattr(args, "daily_omni_video_dir", None) + + # Get HF split (default to "train"; unused when loading from local qa.json) + dataset_split = getattr(args, "hf_split", None) or "train" + + qa_json = getattr(args, "daily_omni_qa_json", None) + if isinstance(qa_json, str): + qa_json = qa_json.strip() or None + + if qa_json is not None: + logger.info( + "Loading Daily-Omni dataset: qa_json=%s, video_dir=%s (Hub not used for QA)", + qa_json, + video_dir, + ) + dataset = DailyOmniDataset( + qa_json_path=qa_json, + dataset_path=None, + dataset_split=dataset_split, + random_seed=args.seed, + video_dir=video_dir, + input_mode=getattr(args, "daily_omni_input_mode", "all"), + inline_local_video=getattr(args, "daily_omni_inline_local_video", False), + trust_remote_code=getattr(args, "trust_remote_code", False), + disable_shuffle=getattr(args, "disable_shuffle", False), + ) + else: + repo_id = _daily_omni_repo_from_args(args) + if args.dataset_name == "daily-omni": + if repo_id is None: + repo_id = _DEFAULT_DAILY_OMNI_REPO + elif repo_id is None: + raise ValueError( + "Daily-Omni with --dataset-name hf requires " + f"--dataset-path {_DEFAULT_DAILY_OMNI_REPO} or " + f"--hf-name {_DEFAULT_DAILY_OMNI_REPO}." 
+ ) + + logger.info( + "Loading Daily-Omni dataset: hf_repo=%s, split=%s, video_dir=%s", + repo_id, + dataset_split, + video_dir, + ) + + dataset = DailyOmniDataset( + dataset_path=repo_id, + dataset_split=dataset_split, + dataset_subset=getattr(args, "hf_subset", None), + random_seed=args.seed, + video_dir=video_dir, + input_mode=getattr(args, "daily_omni_input_mode", "all"), + inline_local_video=getattr(args, "daily_omni_inline_local_video", False), + trust_remote_code=getattr(args, "trust_remote_code", False), + no_stream=getattr(args, "no_stream", False), + disable_shuffle=getattr(args, "disable_shuffle", False), + ) + + out_len = getattr(args, "output_len", None) + if out_len is None: + out_len = getattr(args, "hf_output_len", None) + if out_len is None: + out_len = DailyOmniDataset.DEFAULT_OUTPUT_LEN + + input_requests = dataset.sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=out_len, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ) + return input_requests + + if is_seed_tts: + if args.backend not in ("openai-audio-speech", "openai-chat-omni"): + raise ValueError( + "Seed-TTS requires --backend openai-audio-speech (POST /v1/audio/speech) or " + "--backend openai-chat-omni (POST /v1/chat/completions with ref_audio/ref_text). " + f"Got backend={args.backend!r}." + ) + repo_id = getattr(args, "dataset_path", None) or getattr(args, "hf_name", None) + if not repo_id: + raise ValueError( + "Seed-TTS requires --dataset-path (HF dataset repo id or local directory) or " + "--hf-name for the Hub dataset id." + ) + + dataset = SeedTTSDataset( + dataset_path=repo_id, + random_seed=args.seed, + locale=getattr(args, "seed_tts_locale", "en"), + inline_ref_audio=not getattr(args, "seed_tts_file_ref_audio", False), + seed_tts_root=getattr(args, "seed_tts_root", None), + system_prompt=getattr(args, "seed_tts_system_prompt", None), + disable_shuffle=getattr(args, "disable_shuffle", False), + ) + out_len = getattr(args, "output_len", None) + if out_len is None: + out_len = getattr(args, "hf_output_len", None) + if out_len is None: + out_len = SeedTTSDataset.DEFAULT_OUTPUT_LEN + return dataset.sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=out_len, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ) + + # Handle random-mm dataset (Omni's synthetic multimodal dataset) + if args.dataset_name == "random-mm": dataset = OmniRandomMultiModalDataset(random_seed=args.seed, dataset_path=args.dataset_path) input_requests = dataset.sample( tokenizer=tokenizer, @@ -64,6 +295,10 @@ def get_samples(args, tokenizer): datasets.get_samples = get_samples +_serve_mod = sys.modules.get("vllm.benchmarks.serve") +if _serve_mod is not None: + _serve_mod.get_samples = get_samples + @dataclass class MixRequestFuncOutput(RequestFuncOutput): @@ -72,6 +307,9 @@ class MixRequestFuncOutput(RequestFuncOutput): audio_frames: int = 0 audio_rtf: float = 0.0 text_latency: float = 0.0 + #: Raw PCM s16le mono at 24 kHz for Seed-TTS WER: from ``/v1/audio/speech`` stream or + #: resampled export after ``openai-chat-omni`` audio deltas. 
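+    #: ``None`` unless WER capture is enabled for a Seed-TTS row; ``compute_seed_tts_wer_metrics``
+    #: counts requests without it under ``seed_tts_no_pcm``.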
+ tts_output_pcm_bytes: bytes | None = None async def async_request_openai_chat_omni_completions( @@ -83,13 +321,17 @@ async def async_request_openai_chat_omni_completions( api_url = request_func_input.api_url _validate_api_url(api_url, "OpenAI Chat Completions API", "chat/completions") - content = _get_chat_content(request_func_input, mm_position=mm_position) + omni_messages = getattr(request_func_input, "omni_chat_messages", None) + if omni_messages is not None: + messages_payload = omni_messages + else: + effective_mm_position = getattr(request_func_input, "mm_position", mm_position) + content = _get_chat_content(request_func_input, mm_position=effective_mm_position) + messages_payload = [{"role": "user", "content": content}] payload = { "model": request_func_input.model_name if request_func_input.model_name else request_func_input.model, - "messages": [ - {"role": "user", "content": content}, - ], + "messages": messages_payload, "temperature": 0.0, "max_tokens": request_func_input.output_len, "stream": True, @@ -98,6 +340,10 @@ async def async_request_openai_chat_omni_completions( }, } _update_payload_common(payload, request_func_input) + # Seed-TTS via chat: voice-clone fields live on the body; ensure audio is streamed. + if getattr(request_func_input, "seed_tts_row", False): + if payload.get("modalities") is None: + payload["modalities"] = ["text", "audio"] response_format = payload.get("response_format", "wav") if response_format == "pcm": @@ -167,7 +413,10 @@ async def async_request_openai_chat_omni_completions( data = json.loads(chunk) if choices := data.get("choices"): modality = data.get("modality") - content = choices[0]["delta"].get("content") + delta = choices[0].get("delta") or {} + content = delta.get("content") + if not content and isinstance(delta.get("audio"), dict): + content = delta["audio"].get("data") if modality == "text": # First token if ttft == 0.0: @@ -182,7 +431,7 @@ async def async_request_openai_chat_omni_completions( if output.audio_ttfp == 0.0: output.audio_ttfp = timestamp - st audio_generate_time = timestamp - st - if content != "": + if content: audio_bytes = base64.b64decode(content) seg = AudioSegment.from_file(io.BytesIO(audio_bytes)) if seg is not None: @@ -214,6 +463,12 @@ async def async_request_openai_chat_omni_completions( else: output.audio_rtf = 0 logger.warning("Audio duration is zero") + if _seed_tts_capture_pcm_for_wer() and getattr(request_func_input, "seed_tts_row", False): + try: + seg = generated_audio.set_frame_rate(24000).set_channels(1).set_sample_width(2) + output.tts_output_pcm_bytes = bytes(seg.raw_data) + except Exception as ex: + logger.warning("seed_tts WER PCM export failed: %s", ex) output.success = True else: output.error = response.reason or "" @@ -268,6 +523,10 @@ async def async_request_openai_audio_speech( "response_format": "pcm", } _update_payload_common(payload, request_func_input) + # Seed-TTS + WER: ``--extra-body`` may set stream=false / other formats; speech must stream PCM. 
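+    # Forcing both fields here keeps the captured bytes as raw s16le PCM that the WER pipeline
+    # can decode, regardless of what the caller put in --extra-body.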
+ if getattr(request_func_input, "seed_tts_row", False) and _seed_tts_capture_pcm_for_wer(): + payload["stream"] = True + payload["response_format"] = "pcm" headers = { "Content-Type": "application/json", @@ -286,6 +545,8 @@ async def async_request_openai_audio_speech( st = time.perf_counter() output.start_time = st total_pcm_bytes = 0 + capture_wer_pcm = _seed_tts_capture_pcm_for_wer() and getattr(request_func_input, "seed_tts_row", False) + pcm_capture = bytearray() if capture_wer_pcm else None try: async with session.post(url=api_url, json=payload, headers=headers) as response: if response.status == 200: @@ -297,6 +558,8 @@ async def async_request_openai_audio_speech( output.audio_ttfp = timestamp - st output.ttft = output.audio_ttfp total_pcm_bytes += len(chunk) + if pcm_capture is not None: + pcm_capture.extend(chunk) end_time = time.perf_counter() output.latency = end_time - st @@ -309,6 +572,16 @@ async def async_request_openai_audio_speech( else: output.audio_rtf = 0 logger.warning("Audio duration is zero") + if pcm_capture is not None and pcm_capture: + output.tts_output_pcm_bytes = bytes(pcm_capture) + elif capture_wer_pcm: + ct = response.headers.get("Content-Type", "") + logger.warning( + "Seed-TTS WER: HTTP 200 but no PCM bytes (Content-Type=%r, url=%s). " + "Check stream=true and response_format=pcm on the server.", + ct, + api_url, + ) output.success = True else: output.error = response.reason or "" @@ -331,6 +604,12 @@ async def async_request_openai_audio_speech( if "openai-audio-speech" not in OPENAI_COMPATIBLE_BACKENDS: OPENAI_COMPATIBLE_BACKENDS.append("openai-audio-speech") +# Daily-Omni backend for audio-visual reasoning benchmark +# Reuses openai-chat-omni completions for video+text understanding +ASYNC_REQUEST_FUNCS["daily-omni"] = async_request_openai_chat_omni_completions +if "daily-omni" not in OPENAI_COMPATIBLE_BACKENDS: + OPENAI_COMPATIBLE_BACKENDS.append("daily-omni") + # ruff: noqa: E402 # Prevent import order from causing patch failures from vllm.benchmarks import serve @@ -422,6 +701,8 @@ async def benchmark( extra_headers=extra_headers, extra_body=extra_body, ) + _attach_daily_omni_to_request_func_input(input_requests[0], test_input) + _attach_seed_tts_to_request_func_input(input_requests[0], test_input) if ready_check_timeout_sec > 0: test_output = await wait_for_endpoint( @@ -484,6 +765,8 @@ async def warmup_limited_request_func(): extra_headers=extra_headers, extra_body=extra_body, ) + _attach_daily_omni_to_request_func_input(input_requests[0], profile_input) + _attach_seed_tts_to_request_func_input(input_requests[0], profile_input) profile_output = await request_func(request_func_input=profile_input, session=session) if profile_output.success: print("Profiler started") @@ -564,6 +847,8 @@ async def limited_request_func(request_func_input, session, pbar): extra_body=extra_body, request_id=request_id, ) + _attach_daily_omni_to_request_func_input(request, request_func_input) + _attach_seed_tts_to_request_func_input(request, request_func_input) tasks.append( asyncio.create_task(limited_request_func(request_func_input=request_func_input, session=session, pbar=pbar)) ) @@ -631,6 +916,37 @@ async def limited_request_func(request_func_input, session, pbar): "errors": [output.error for output in outputs], } + from vllm_omni.benchmarks.data_modules.daily_omni_eval import ( + compute_daily_omni_accuracy_metrics, + print_daily_omni_accuracy_summary, + ) + + _save_items = os.environ.get("DAILY_OMNI_SAVE_EVAL_ITEMS", "").lower() in ( + "1", + "true", + "yes", + ) + 
_daily_acc = compute_daily_omni_accuracy_metrics(input_requests, outputs, include_per_item=_save_items) + if _daily_acc is not None: + result.update(_daily_acc) + print_daily_omni_accuracy_summary(_daily_acc) + + if _seed_tts_capture_pcm_for_wer(): + from vllm_omni.benchmarks.data_modules.seed_tts_eval import ( + compute_seed_tts_wer_metrics, + print_seed_tts_wer_summary, + ) + + _save_wer = os.environ.get("SEED_TTS_WER_SAVE_ITEMS", "").lower() in ( + "1", + "true", + "yes", + ) + _wer_m = compute_seed_tts_wer_metrics(input_requests, outputs, include_per_item=_save_wer) + if _wer_m is not None: + result.update(_wer_m) + print_seed_tts_wer_summary(_wer_m) + if rps_change_events: result["rps_change_events"] = rps_change_events diff --git a/vllm_omni/benchmarks/serve.py b/vllm_omni/benchmarks/serve.py index fe94603693..d3f3510c56 100644 --- a/vllm_omni/benchmarks/serve.py +++ b/vllm_omni/benchmarks/serve.py @@ -1,9 +1,21 @@ import argparse import asyncio +import os from typing import Any from vllm.benchmarks.serve import main_async +# Import patch to register daily-omni dataset and omni backends +# This monkey-patches vllm.benchmarks.datasets.get_samples before it's used +# Must be imported before any vllm.benchmarks module usage +import vllm_omni.benchmarks.patch.patch # noqa: F401 + def main(args: argparse.Namespace) -> dict[str, Any]: + if getattr(args, "seed_tts_wer_eval", False): + os.environ["SEED_TTS_WER_EVAL"] = "1" + if getattr(args, "seed_tts_wer_save_items", False): + os.environ["SEED_TTS_WER_SAVE_ITEMS"] = "1" + if getattr(args, "daily_omni_save_eval_items", False): + os.environ["DAILY_OMNI_SAVE_EVAL_ITEMS"] = "1" return asyncio.run(main_async(args)) diff --git a/vllm_omni/entrypoints/cli/benchmark/serve.py b/vllm_omni/entrypoints/cli/benchmark/serve.py index 906e8851a4..d281432e59 100644 --- a/vllm_omni/entrypoints/cli/benchmark/serve.py +++ b/vllm_omni/entrypoints/cli/benchmark/serve.py @@ -1,4 +1,5 @@ import argparse +import os from vllm.benchmarks.serve import add_cli_args @@ -6,15 +7,149 @@ from vllm_omni.entrypoints.cli.benchmark.base import OmniBenchmarkSubcommandBase +def add_daily_omni_cli_args(parser: argparse.ArgumentParser) -> None: + """Add CLI arguments specific to Daily-Omni dataset. + + This function should be called by the CLI entrypoint to add additional + arguments for daily-omni benchmark support. + + Args: + parser: The ArgumentParser instance to extend + """ + # Daily-Omni specific arguments + daily_omni_group = parser.add_argument_group("Daily-Omni Dataset Options") + + daily_omni_group.add_argument( + "--daily-omni-qa-json", + type=str, + default=None, + help="Path to local upstream qa.json. When set, QA rows are read from this file and " + "the HuggingFace dataset is not loaded (no network). Use with --daily-omni-video-dir " + "for fully offline runs. --dataset-path / Hub split flags are then ignored for QA loading.", + ) + daily_omni_group.add_argument( + "--daily-omni-video-dir", + type=str, + default=None, + help="Root directory of extracted Daily-Omni videos (contents of Videos.tar: " + "each video_id in its own subdir with {video_id}_video.mp4). 
" + "When using file URLs, you MUST start the vLLM server with " + "--allowed-local-media-path set to this same directory (or a parent), " + "otherwise requests fail with 'Cannot load local files without " + "--allowed-local-media-path'.", + ) + daily_omni_group.add_argument( + "--daily-omni-inline-local-video", + action="store_true", + default=False, + help="For local videos only: embed MP4 as base64 data URLs in benchmark " + "requests so the server does not need --allowed-local-media-path. " + "Increases request size and client memory; use for small --num-prompts. " + "When using --daily-omni-input-mode audio or all, local WAV files are " + "embedded the same way.", + ) + daily_omni_group.add_argument( + "--daily-omni-input-mode", + type=str, + choices=["all", "visual", "audio"], + default="all", + help="Daily-Omni input protocol (mirrors upstream Lliar-liar/Daily-Omni " + "--input_mode). 'visual': video only (default). 'audio': WAV only, " + "requires {video_id}/{video_id}_audio.wav under --daily-omni-video-dir. " + "'all': video + WAV together. Sets mm_processor_kwargs.use_audio_in_video=false " + "and matches official separate video/audio streams.", + ) + daily_omni_group.add_argument( + "--daily-omni-save-eval-items", + action="store_true", + default=False, + help="Include per-request Daily-Omni accuracy rows (gold/predicted/correct) " + "in the saved JSON under key daily_omni_eval_items. " + "Alternatively set env DAILY_OMNI_SAVE_EVAL_ITEMS=1.", + ) + + # Note: --dataset-name daily-omni via get_samples patch; use either Hub (--dataset-path + # liarliar/Daily-Omni) or local --daily-omni-qa-json (offline). + + +def add_seed_tts_cli_args(parser: argparse.ArgumentParser) -> None: + """CLI for Seed-TTS zero-shot TTS benchmark (``--dataset-name seed-tts``).""" + g = parser.add_argument_group("Seed-TTS Dataset Options") + g.add_argument( + "--seed-tts-locale", + type=str, + choices=["en", "zh"], + default="en", + help="Which Seed-TTS split to load: en/meta.lst or zh/meta.lst under the dataset root.", + ) + g.add_argument( + "--seed-tts-root", + type=str, + default=None, + help="Override root directory that contains en/ and zh/ (meta.lst + prompt-wavs). " + "If set, --dataset-path can still name the HF repo for logging; this path is used for files.", + ) + g.add_argument( + "--seed-tts-file-ref-audio", + action="store_true", + default=False, + help="Send ref_audio as file:// URIs (smaller HTTP bodies). Requires the API server " + "to be started with --allowed-local-media-path covering the Seed-TTS dataset root. " + "Default is inline data:audio/wav;base64 so Qwen3-TTS works without that flag.", + ) + g.add_argument( + "--seed-tts-inline-ref-audio", + action="store_true", + default=False, + help=argparse.SUPPRESS, + ) + g.add_argument( + "--seed-tts-system-prompt", + type=str, + default=None, + help="Override chat system message for --backend openai-chat-omni (Qwen3-Omni TTS). " + "Default follows official Qwen3-Omni identity + zero-shot voice-clone instructions.", + ) + g.add_argument( + "--seed-tts-wer-eval", + action="store_true", + default=False, + help="Keep synthesized audio as 24 kHz mono PCM for WER (works with " + "--backend openai-audio-speech or openai-chat-omni). Scoring follows " + "BytedanceSpeech/seed-tts-eval (Whisper-large-v3 / Paraformer-zh + jiwer). " + "Sets SEED_TTS_WER_EVAL=1. Install: pip install 'vllm-omni[seed-tts-eval]'. 
" + "Optional: SEED_TTS_EVAL_DEVICE, SEED_TTS_HF_WHISPER_MODEL.", + ) + g.add_argument( + "--seed-tts-wer-save-items", + action="store_true", + default=False, + help="Include per-utterance ASR rows in the saved JSON under key seed_tts_wer_eval_items. " + "Or set SEED_TTS_WER_SAVE_ITEMS=1.", + ) + + class OmniBenchmarkServingSubcommand(OmniBenchmarkSubcommandBase): """The `serve` subcommand for vllm bench.""" name = "serve" - help = "Benchmark the online serving throughput." + help = "Benchmark the online serving throughput. Supports Daily-Omni and Seed-TTS datasets." @classmethod def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: add_cli_args(parser) + + # Add Daily-Omni specific arguments + add_daily_omni_cli_args(parser) + add_seed_tts_cli_args(parser) + + for action in parser._actions: + if action.dest == "dataset_name" and action.choices is not None: + extra = [c for c in ("daily-omni", "seed-tts") if c not in action.choices] + if extra: + action.choices = list(action.choices) + extra + + # Update help messages for omni-specific features for action in parser._actions: if action.dest == "percentile_metrics": action.help = ( @@ -48,4 +183,10 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: @staticmethod def cmd(args: argparse.Namespace) -> None: + if getattr(args, "daily_omni_save_eval_items", False): + os.environ["DAILY_OMNI_SAVE_EVAL_ITEMS"] = "1" + if getattr(args, "seed_tts_wer_eval", False): + os.environ["SEED_TTS_WER_EVAL"] = "1" + if getattr(args, "seed_tts_wer_save_items", False): + os.environ["SEED_TTS_WER_SAVE_ITEMS"] = "1" main(args) From 0d020739a7d85e2b2ec2d30f26d0d741b4f4fb98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zeyu=20Huang=20=7C=20=E9=BB=83=E6=BE=A4=E5=AE=87?= <11222265+fhfuih@users.noreply.github.com> Date: Wed, 15 Apr 2026 11:13:11 +0800 Subject: [PATCH 175/204] [CI] qwen image edit L4 accuracy test (#2761) --- .buildkite/test-nightly-diffusion.yml | 40 ++++ pyproject.toml | 1 + tests/conftest.py | 10 +- tests/e2e/accuracy/conftest.py | 25 +++ tests/e2e/accuracy/test_qwen_image_edit.py | 232 +++++++++++++++++++++ tests/e2e/accuracy/utils.py | 74 +++++++ 6 files changed, 377 insertions(+), 5 deletions(-) create mode 100644 tests/e2e/accuracy/test_qwen_image_edit.py create mode 100644 tests/e2e/accuracy/utils.py diff --git a/.buildkite/test-nightly-diffusion.yml b/.buildkite/test-nightly-diffusion.yml index a520ca4356..b5ba8a117c 100644 --- a/.buildkite/test-nightly-diffusion.yml +++ b/.buildkite/test-nightly-diffusion.yml @@ -375,3 +375,43 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate + + - label: ":full_moon: Diffusion · Qwen-Image · Accuracy Test" + key: nightly-qwen-image-accuracy + timeout_in_minutes: 180 + if: *nightly_or_pr_label + commands: + - pytest -s -v tests/e2e/accuracy/test_qwen_image*.py --run-level advanced_model + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: 
DirectoryOrCreate diff --git a/pyproject.toml b/pyproject.toml index 753e0e3981..9b034a7c8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ dev = [ "pyttsx3>=2.99", "opencc>=1.2.0", "mistune>=3.2.0", # for example tests + "torchmetrics>=1.4.0", # for accuracy similarity metrics ] demo = [ diff --git a/tests/conftest.py b/tests/conftest.py index adb87cbd72..4ad4706fc1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2397,7 +2397,7 @@ def _process_diffusion_response(self, chat_completion) -> DiffusionResponse: image_url = item.get("image_url", {}).get("url") else: image_url_obj = getattr(item, "image_url", None) - image_url = hasattr(image_url_obj, "url", None) if image_url_obj else None + image_url = getattr(image_url_obj, "url", None) if image_url_obj else None if image_url and image_url.startswith("data:image"): b64_data = image_url.split(",", 1)[1] img = decode_b64_image(b64_data) @@ -2703,7 +2703,7 @@ def _stream_task(): return responses - def send_diffusion_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]: + def send_diffusion_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[DiffusionResponse]: """ Send OpenAI requests for diffusion models. @@ -2711,9 +2711,9 @@ def send_diffusion_request(self, request_config: dict[str, Any], request_num: in request_config: Request configuration dictionary containing parameters like model, messages request_num: Number of requests to send concurrently, defaults to 1 (single request) Returns: - List[OmniResponse]: List of response objects + List[DiffusionResponse]: List of response objects """ - responses = [] + responses: list[DiffusionResponse] = [] stream = request_config.get("stream", False) modalities = request_config.get("modalities", omit) # Most diffusion models don't require modalities param extra_body = request_config.get("extra_body", None) @@ -2876,7 +2876,7 @@ def _build_url(self, path: str) -> str: return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}" -@pytest.fixture +@pytest.fixture(scope="module") def openai_client(request: pytest.FixtureRequest, run_level: str): """Create OpenAIClientHandler fixture to facilitate communication with OmniServer with encapsulated request sending, concurrent requests, response handling, and validation.""" diff --git a/tests/e2e/accuracy/conftest.py b/tests/e2e/accuracy/conftest.py index 062750b3cd..3d614b8cdc 100644 --- a/tests/e2e/accuracy/conftest.py +++ b/tests/e2e/accuracy/conftest.py @@ -5,10 +5,13 @@ import subprocess from contextlib import contextmanager from dataclasses import dataclass +from io import BytesIO from pathlib import Path import pytest +import requests import torch +from PIL import Image from tests.conftest import OmniServer, OmniServerParams @@ -183,6 +186,28 @@ def accuracy_artifact_root() -> Path: return root +@pytest.fixture(scope="session") +def qwen_bear_image(accuracy_artifact_root: Path) -> Image.Image: + """Download the Qwen bear image from the URL and save it to the accuracy artifact root.""" + QWEN_BEAR_IMAGE_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/omni-assets/qwen-bear.png" + response = requests.get(QWEN_BEAR_IMAGE_URL, timeout=60) + response.raise_for_status() + image = Image.open(BytesIO(response.content)).convert("RGB") + image.save(accuracy_artifact_root / "qwen_bear.png") + return image + + +@pytest.fixture(scope="session") +def rabbit_image(accuracy_artifact_root: Path) -> Image.Image: + """Download the rabbit image from the URL and save it to the 
accuracy artifact root.""" + RABBIT_IMAGE_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/omni-assets/rabbit.png" + response = requests.get(RABBIT_IMAGE_URL, timeout=60) + response.raise_for_status() + image = Image.open(BytesIO(response.content)).convert("RGB") + image.save(accuracy_artifact_root / "rabbit.png") + return image + + def reset_artifact_dir(path: Path) -> Path: if path.exists(): shutil.rmtree(path) diff --git a/tests/e2e/accuracy/test_qwen_image_edit.py b/tests/e2e/accuracy/test_qwen_image_edit.py new file mode 100644 index 0000000000..9a97010343 --- /dev/null +++ b/tests/e2e/accuracy/test_qwen_image_edit.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +import gc +from pathlib import Path + +import pytest +import requests +import torch +from diffusers import QwenImageEditPipeline, QwenImageEditPlusPipeline +from PIL import Image + +from benchmarks.accuracy.common import decode_base64_image, pil_to_png_bytes +from tests.conftest import ( + OmniServer, + _run_post_test_cleanup, + _run_pre_test_cleanup, +) +from tests.e2e.accuracy.utils import assert_similarity, model_output_dir +from tests.utils import hardware_test + +SINGLE_MODEL = "Qwen/Qwen-Image-Edit" +MULTIPLE_MODEL = "Qwen/Qwen-Image-Edit-2509" +WIDTH = 512 +HEIGHT = 512 +NUM_INFERENCE_STEPS = 20 +TRUE_CFG_SCALE = 4.0 +SEED = 42 +SSIM_THRESHOLD = 0.94 +PSNR_THRESHOLD = 28.0 + +PROMPT_SINGLE_IMAGE = "The input is a 2D cartoon bear mascot. Restyle it into a painterly oil artwork with warm colors while preserving the main structure." +PROMPT_MULTIPLE_IMAGE = "Put the cartoon bear mascot and the furry rabbit into one coherent scene with a painterly oil artwork style and consistent lighting." +NEGATIVE_PROMPT = "low quality, blurry, artifacts, distortion" +SERVER_ARGS = ["--num-gpus", "1", "--stage-init-timeout", "300", "--init-timeout", "900"] + + +def _run_vllm_omni_image_edit( + *, + omni_server: OmniServer, + prompt: str, + input_images: list[Image.Image], + output_path: Path, +) -> Image.Image: + response = requests.post( + f"http://{omni_server.host}:{omni_server.port}/v1/images/edits", + data={ + "model": omni_server.model, + "prompt": prompt, + "size": f"{WIDTH}x{HEIGHT}", + "n": 1, + "response_format": "b64_json", + "negative_prompt": NEGATIVE_PROMPT, + "num_inference_steps": NUM_INFERENCE_STEPS, + "true_cfg_scale": TRUE_CFG_SCALE, + "seed": SEED, + }, + files=[ + ("image", (f"image_{index}.png", pil_to_png_bytes(image), "image/png")) + for index, image in enumerate(input_images) + ], + timeout=600, + ) + response.raise_for_status() + payload = response.json() + assert len(payload["data"]) == 1 + image = decode_base64_image(payload["data"][0]["b64_json"]) + image.load() + image.save(output_path) + return image + + +def _run_diffusers_image_edit( + *, + model: str, + pipeline_class: type[QwenImageEditPipeline] | type[QwenImageEditPlusPipeline], + prompt: str, + input_images: list[Image.Image], + output_path: Path, +) -> Image.Image: + _run_pre_test_cleanup(enable_force=True) + pipe: QwenImageEditPipeline | QwenImageEditPlusPipeline | None = None + device = torch.device("cuda:0") + torch.cuda.set_device(device) + try: + images = input_images[0] if len(input_images) == 1 else input_images + pipe = pipeline_class.from_pretrained( + model, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + ).to(device) + pipe.set_progress_bar_config(disable=False) + generator = torch.Generator(device=device).manual_seed(SEED) + result = pipe( # pyright: ignore[reportCallIssue] + prompt=prompt, + 
image=images, + negative_prompt=NEGATIVE_PROMPT, + num_inference_steps=NUM_INFERENCE_STEPS, + true_cfg_scale=TRUE_CFG_SCALE, + width=WIDTH, + height=HEIGHT, + generator=generator, + ) + output_image = result.images[0].convert("RGB") # pyright: ignore[reportAttributeAccessIssue] + output_image.save(output_path) + return output_image + finally: + if pipe is not None and hasattr(pipe, "maybe_free_model_hooks"): + pipe.maybe_free_model_hooks() + del pipe + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + _run_post_test_cleanup(enable_force=True) + + +def _vllm_omni_output_single_image( + accuracy_artifact_root: Path, + qwen_bear_image: Image.Image, +) -> Image.Image: + output_dir = model_output_dir(accuracy_artifact_root, SINGLE_MODEL) + output_path = output_dir / "vllm_omni_single.png" + with OmniServer(model=SINGLE_MODEL, serve_args=SERVER_ARGS) as server: + output = _run_vllm_omni_image_edit( + omni_server=server, + prompt=PROMPT_SINGLE_IMAGE, + input_images=[qwen_bear_image], + output_path=output_path, + ) + return output + + +def _diffusers_output_single_image(accuracy_artifact_root: Path, qwen_bear_image: Image.Image) -> Image.Image: + output_dir = model_output_dir(accuracy_artifact_root, SINGLE_MODEL) + output_path = output_dir / "diffusers_single.png" + return _run_diffusers_image_edit( + model=SINGLE_MODEL, + pipeline_class=QwenImageEditPipeline, + prompt=PROMPT_SINGLE_IMAGE, + input_images=[qwen_bear_image], + output_path=output_path, + ) + + +def _vllm_omni_output_multiple_image( + accuracy_artifact_root: Path, + qwen_bear_image: Image.Image, + rabbit_image: Image.Image, +) -> Image.Image: + output_dir = model_output_dir(accuracy_artifact_root, MULTIPLE_MODEL) + output_path = output_dir / "vllm_omni_multiple.png" + with OmniServer(model=MULTIPLE_MODEL, serve_args=SERVER_ARGS) as server: + output = _run_vllm_omni_image_edit( + omni_server=server, + prompt=PROMPT_MULTIPLE_IMAGE, + input_images=[qwen_bear_image, rabbit_image], + output_path=output_path, + ) + return output + + +def _diffusers_output_multiple_image( + accuracy_artifact_root: Path, qwen_bear_image: Image.Image, rabbit_image: Image.Image +) -> Image.Image: + output_dir = model_output_dir(accuracy_artifact_root, MULTIPLE_MODEL) + output_path = output_dir / "diffusers_multiple.png" + return _run_diffusers_image_edit( + model=MULTIPLE_MODEL, + pipeline_class=QwenImageEditPlusPipeline, + prompt=PROMPT_MULTIPLE_IMAGE, + input_images=[qwen_bear_image, rabbit_image], + output_path=output_path, + ) + + +@pytest.mark.advanced_model +@pytest.mark.benchmark +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}, num_cards=1) +def test_qwen_image_edit_single_matches_diffusers( + accuracy_artifact_root: Path, + qwen_bear_image: Image.Image, +) -> None: + vllm_image = _vllm_omni_output_single_image( + accuracy_artifact_root=accuracy_artifact_root, + qwen_bear_image=qwen_bear_image, + ) + diffusers_image = _diffusers_output_single_image( + accuracy_artifact_root=accuracy_artifact_root, + qwen_bear_image=qwen_bear_image, + ) + assert_similarity( + model_name=SINGLE_MODEL, + vllm_image=vllm_image, + diffusers_image=diffusers_image, + width=WIDTH, + height=HEIGHT, + ssim_threshold=SSIM_THRESHOLD, + psnr_threshold=PSNR_THRESHOLD, + ) + + +@pytest.mark.advanced_model +@pytest.mark.benchmark +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}, num_cards=1) +@pytest.mark.skip( + reason="Skipping as the second image seems to be ignored by the API. Will come back to this later after #2772 is merged." 
+) +def test_qwen_image_edit_multiple_matches_diffusers( + accuracy_artifact_root: Path, + qwen_bear_image: Image.Image, + rabbit_image: Image.Image, +) -> None: + vllm_image = _vllm_omni_output_multiple_image( + accuracy_artifact_root=accuracy_artifact_root, + qwen_bear_image=qwen_bear_image, + rabbit_image=rabbit_image, + ) + diffusers_image = _diffusers_output_multiple_image( + accuracy_artifact_root=accuracy_artifact_root, + qwen_bear_image=qwen_bear_image, + rabbit_image=rabbit_image, + ) + assert_similarity( + model_name=MULTIPLE_MODEL, + vllm_image=vllm_image, + diffusers_image=diffusers_image, + width=WIDTH, + height=HEIGHT, + ssim_threshold=SSIM_THRESHOLD, + psnr_threshold=PSNR_THRESHOLD, + ) diff --git a/tests/e2e/accuracy/utils.py b/tests/e2e/accuracy/utils.py new file mode 100644 index 0000000000..eb0eea757e --- /dev/null +++ b/tests/e2e/accuracy/utils.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pytest +import torch +from PIL import Image +from torchmetrics.image import PeakSignalNoiseRatio, StructuralSimilarityIndexMeasure + + +def model_output_dir(parent_dir: Path, model: str) -> Path: + safe_model_name = model.split("/")[-1].replace(".", "_") + path = parent_dir / safe_model_name + path.mkdir(parents=True, exist_ok=True) + return path + + +def assert_similarity( + *, + model_name: str, + vllm_image: Image.Image, + diffusers_image: Image.Image, + width: int, + height: int, + ssim_threshold: float, + psnr_threshold: float, +) -> None: + requested_size = (width, height) + if diffusers_image.size != requested_size: + pytest.skip( + "Skipping as diffusers baseline output is corrupt and not comparable: " + f"dimensions do not match requested size; requested={requested_size}, got={diffusers_image.size}." + ) + + assert vllm_image.size == diffusers_image.size, ( + f"Online and diffusers output sizes mismatch: online={vllm_image.size}, diffusers={diffusers_image.size}" + ) + + ssim_score, psnr_score = compute_image_ssim_psnr(prediction=vllm_image, reference=diffusers_image) + print(f"{model_name} similarity metrics:") + print(f" SSIM: value={ssim_score:.6f}, threshold>={ssim_threshold:.6f}, range=[-1, 1], higher_is_better=True") + print( + f" PSNR: value={psnr_score:.6f} dB, threshold>={psnr_threshold:.6f} dB, range=[0, +inf), higher_is_better=True" + ) + + assert ssim_score >= ssim_threshold, ( + f"SSIM below threshold for {model_name}: got {ssim_score:.6f}, expected >= {ssim_threshold:.6f}." + ) + assert psnr_score >= psnr_threshold, ( + f"PSNR below threshold for {model_name}: got {psnr_score:.6f}, expected >= {psnr_threshold:.6f}." 
+ ) + + +def compute_image_ssim_psnr( + *, + prediction: Image.Image, + reference: Image.Image, +) -> tuple[float, float]: + pred_tensor = _pil_to_batched_tensor(prediction) + ref_tensor = _pil_to_batched_tensor(reference) + + ssim_metric = StructuralSimilarityIndexMeasure(data_range=1.0) + psnr_metric = PeakSignalNoiseRatio(data_range=1.0) + + ssim_value = float(ssim_metric(pred_tensor, ref_tensor).item()) + psnr_value = float(psnr_metric(pred_tensor, ref_tensor).item()) + return ssim_value, psnr_value + + +def _pil_to_batched_tensor(image: Image.Image) -> torch.Tensor: + array = np.asarray(image.convert("RGB"), dtype=np.float32) / 255.0 + tensor = torch.from_numpy(array).permute(2, 0, 1).unsqueeze(0) + return tensor From 61a3cbdff5785290501d711717e2b2e526ffe34f Mon Sep 17 00:00:00 2001 From: Samit <285365963@qq.com> Date: Wed, 15 Apr 2026 11:46:06 +0800 Subject: [PATCH 176/204] [Perf] Eliminate Hop 3 IPC overhead for single-stage diffusion via inline execution (#2736) Signed-off-by: samithuang <285365963@qq.com> Signed-off-by: Samit <285365963@qq.com> --- .../test_inline_stage_diffusion_client.py | 96 +++++ .../test_async_omni_engine_stage_init.py | 3 +- vllm_omni/diffusion/data.py | 43 +++ .../inline_stage_diffusion_client.py | 348 ++++++++++++++++++ vllm_omni/diffusion/stage_diffusion_client.py | 25 ++ vllm_omni/diffusion/stage_diffusion_proc.py | 46 +-- vllm_omni/engine/async_omni_engine.py | 2 + vllm_omni/engine/orchestrator.py | 17 + vllm_omni/engine/stage_init_utils.py | 8 +- .../entrypoints/openai/video_api_utils.py | 3 + vllm_omni/outputs.py | 3 + 11 files changed, 546 insertions(+), 48 deletions(-) create mode 100644 tests/diffusion/test_inline_stage_diffusion_client.py create mode 100644 vllm_omni/diffusion/inline_stage_diffusion_client.py diff --git a/tests/diffusion/test_inline_stage_diffusion_client.py b/tests/diffusion/test_inline_stage_diffusion_client.py new file mode 100644 index 0000000000..385f39b124 --- /dev/null +++ b/tests/diffusion/test_inline_stage_diffusion_client.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import asyncio +from unittest.mock import MagicMock, patch + +import pytest + +from vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.inline_stage_diffusion_client import InlineStageDiffusionClient +from vllm_omni.engine.stage_init_utils import StageMetadata +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +@pytest.fixture +def mock_engine(): + with patch("vllm_omni.diffusion.inline_stage_diffusion_client.DiffusionEngine") as mock: + engine_instance = MagicMock() + mock.make_engine.return_value = engine_instance + yield engine_instance + + +@pytest.fixture +def client(mock_engine): + metadata = StageMetadata( + stage_id=0, + stage_type="diffusion", + engine_output_type="image", + is_comprehension=False, + requires_multimodal_data=False, + engine_input_source="prompt", + final_output=True, + final_output_type="image", + default_sampling_params={}, + custom_process_input_func=None, + model_stage=None, + runtime_cfg=None, + ) + with patch.object(InlineStageDiffusionClient, "_enrich_config"): + od_config = MagicMock(spec=OmniDiffusionConfig) + c = InlineStageDiffusionClient(model="test_model", od_config=od_config, metadata=metadata, batch_size=1) + yield c + c.shutdown() + + +@pytest.mark.asyncio +async def test_inline_dispatch_request_success(client, mock_engine): + # Setup mock engine step to 
return a successful result + mock_result = OmniRequestOutput.from_diffusion(request_id="req-1", images=[MagicMock()]) + mock_engine.step.return_value = [mock_result] + + sampling_params = OmniDiffusionSamplingParams() + await client.add_request_async("req-1", "A test prompt", sampling_params) + + # Wait for the task to be processed + for _ in range(10): + output = client.get_diffusion_output_nowait() + if output is not None: + break + await asyncio.sleep(0.01) + + assert output is not None + assert output.request_id == "req-1" + mock_engine.step.assert_called_once() + + +@pytest.mark.asyncio +async def test_inline_dispatch_request_error(client, mock_engine): + # Setup mock engine step to raise an exception + mock_engine.step.side_effect = RuntimeError("Engine failure") + + sampling_params = OmniDiffusionSamplingParams() + await client.add_request_async("req-err", "A test prompt", sampling_params) + + for _ in range(10): + output = client.get_diffusion_output_nowait() + if output is not None: + break + await asyncio.sleep(0.01) + + assert output is not None + assert output.request_id == "req-err" + assert output.error == "Engine failure" + assert not output.images + + +def test_inline_shutdown(client, mock_engine): + assert not client._shutting_down + + # Shutting down should cleanly cancel anything queued and close engine + client.shutdown() + + assert client._shutting_down + mock_engine.close.assert_called_once() diff --git a/tests/engine/test_async_omni_engine_stage_init.py b/tests/engine/test_async_omni_engine_stage_init.py index 7b995fe70d..84b0cb0bed 100644 --- a/tests/engine/test_async_omni_engine_stage_init.py +++ b/tests/engine/test_async_omni_engine_stage_init.py @@ -100,10 +100,11 @@ def test_initialize_stages_passes_stage_init_timeout_to_diffusion_handshake(monk engine.log_stats = False engine.model = "dummy-model" engine.config_path = "dummy-config" - engine.num_stages = 1 + engine.num_stages = 2 engine.async_chunk = False engine.diffusion_batch_size = 1 engine.single_stage_mode = False + engine._omni_master_server = None engine.stage_configs = [types.SimpleNamespace(stage_id=0, stage_type="diffusion", engine_args={})] metadata = types.SimpleNamespace( diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py index 56a891aa5c..fca0a5bad0 100644 --- a/vllm_omni/diffusion/data.py +++ b/vllm_omni/diffusion/data.py @@ -666,6 +666,49 @@ def set_tf_model_config(self, tf_config: "TransformerConfig") -> None: def update_multimodal_support(self) -> None: self.supports_multimodal_inputs = self.model_class_name in {"QwenImageEditPlusPipeline"} + def enrich_config(self) -> None: + """Load model metadata from HuggingFace and populate config fields. + + Diffusers-style models expose ``model_index.json`` with ``_class_name``. + Non-diffusers models (e.g. Bagel, NextStep) only have ``config.json``, + so we fall back to reading that and mapping model_type manually. 
+ """ + from vllm.transformers_utils.config import get_hf_file_to_dict + + try: + config_dict = get_hf_file_to_dict("model_index.json", self.model) + if config_dict is not None: + if self.model_class_name is None: + self.model_class_name = config_dict.get("_class_name", None) + self.update_multimodal_support() + + tf_config_dict = get_hf_file_to_dict("transformer/config.json", self.model) + self.tf_model_config = TransformerConfig.from_dict(tf_config_dict) + else: + raise FileNotFoundError("model_index.json not found") + except (AttributeError, OSError, ValueError, FileNotFoundError): + cfg = get_hf_file_to_dict("config.json", self.model) + if cfg is None: + raise ValueError(f"Could not find config.json or model_index.json for model {self.model}") + + self.tf_model_config = TransformerConfig.from_dict(cfg) + model_type = cfg.get("model_type") + architectures = cfg.get("architectures") or [] + + if model_type == "bagel" or "BagelForConditionalGeneration" in architectures: + self.model_class_name = "BagelPipeline" + self.tf_model_config = TransformerConfig() + self.update_multimodal_support() + elif model_type == "nextstep": + if self.model_class_name is None: + self.model_class_name = "NextStep11Pipeline" + self.tf_model_config = TransformerConfig() + self.update_multimodal_support() + elif architectures and len(architectures) == 1: + self.model_class_name = architectures[0] + else: + raise + @classmethod def from_kwargs(cls, **kwargs: Any) -> "OmniDiffusionConfig": # Backwards-compatibility: older callers may use a diffusion-specific diff --git a/vllm_omni/diffusion/inline_stage_diffusion_client.py b/vllm_omni/diffusion/inline_stage_diffusion_client.py new file mode 100644 index 0000000000..a33a3e9561 --- /dev/null +++ b/vllm_omni/diffusion/inline_stage_diffusion_client.py @@ -0,0 +1,348 @@ +"""Inline Stage Diffusion Client for vLLM-Omni multi-stage runtime. + +Runs DiffusionEngine in a ThreadPoolExecutor inside the Orchestrator process +instead of spawning a separate StageDiffusionProc subprocess, eliminating ZMQ +IPC overhead. Used when there is only a single diffusion stage. 
+""" + +from __future__ import annotations + +import asyncio +import time +from concurrent.futures import ThreadPoolExecutor +from typing import TYPE_CHECKING, Any + +import torch +from PIL import Image +from vllm.logger import init_logger + +from vllm_omni.diffusion.data import DiffusionRequestAbortedError +from vllm_omni.diffusion.diffusion_engine import DiffusionEngine +from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.engine.stage_init_utils import StageMetadata +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput + +if TYPE_CHECKING: + from vllm_omni.diffusion.data import OmniDiffusionConfig + from vllm_omni.inputs.data import OmniPromptType + +logger = init_logger(__name__) + + +class InlineStageDiffusionClient: + """Runs DiffusionEngine in a thread executor inside the Orchestrator.""" + + stage_type: str = "diffusion" + + def __init__( + self, + model: str, + od_config: OmniDiffusionConfig, + metadata: StageMetadata, + batch_size: int = 1, + ) -> None: + self.model = model + self.od_config = od_config + self.stage_id = metadata.stage_id + self.final_output = metadata.final_output + self.final_output_type = metadata.final_output_type + self.default_sampling_params = metadata.default_sampling_params + self.custom_process_input_func = metadata.custom_process_input_func + self.engine_input_source = metadata.engine_input_source + self.batch_size = batch_size + + self._enrich_config() + self._engine = DiffusionEngine.make_engine(self.od_config) + self._executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="inline-diffusion") + + self._output_queue: asyncio.Queue[OmniRequestOutput] = asyncio.Queue() + self._tasks: dict[str, asyncio.Task] = {} + self._shutting_down = False + + logger.info( + "[InlineStageDiffusionClient] Stage-%s initialized inline (batch_size=%d)", + self.stage_id, + self.batch_size, + ) + + def _enrich_config(self) -> None: + """Load model metadata from HuggingFace and populate od_config fields.""" + self.od_config.enrich_config() + + # ------------------------------------------------------------------ + # Request processing + # ------------------------------------------------------------------ + + async def add_request_async( + self, + request_id: str, + prompt: OmniPromptType, + sampling_params: OmniDiffusionSamplingParams, + kv_sender_info: dict[int, dict[str, Any]] | None = None, + ) -> None: + task = asyncio.create_task( + self._dispatch_request( + request_id, + prompt, + sampling_params, + kv_sender_info, + ) + ) + self._tasks[request_id] = task + + async def _dispatch_request( + self, + request_id: str, + prompt: Any, + sampling_params: OmniDiffusionSamplingParams, + kv_sender_info: dict[str, Any] | None = None, + ) -> None: + try: + request = OmniDiffusionRequest( + prompts=[prompt], + sampling_params=sampling_params, + request_ids=[request_id], + request_id=request_id, + kv_sender_info=kv_sender_info, + ) + + loop = asyncio.get_running_loop() + results = await loop.run_in_executor(self._executor, self._engine.step, request) + result = results[0] + if not result.request_id: + result.request_id = request_id + + self._output_queue.put_nowait(result) + except DiffusionRequestAbortedError as e: + logger.info("request_id: %s aborted: %s", request_id, str(e)) + except Exception as e: + logger.exception("Diffusion request %s failed: %s", request_id, e) + error_output = OmniRequestOutput.from_diffusion( + request_id=request_id, + images=[], + ) + error_output.error = str(e) + 
self._output_queue.put_nowait(error_output) + finally: + self._tasks.pop(request_id, None) + + async def add_batch_request_async( + self, + request_id: str, + prompts: list[OmniPromptType], + sampling_params: OmniDiffusionSamplingParams, + kv_sender_info: dict[int, dict[str, Any]] | None = None, + ) -> None: + task = asyncio.create_task( + self._dispatch_batch( + request_id, + prompts, + sampling_params, + kv_sender_info, + ) + ) + self._tasks[request_id] = task + + async def _dispatch_batch( + self, + request_id: str, + prompts: list[Any], + sampling_params: OmniDiffusionSamplingParams, + kv_sender_info: dict[str, Any] | None = None, + ) -> None: + try: + request = OmniDiffusionRequest( + prompts=prompts, + sampling_params=sampling_params, + request_ids=[f"{request_id}-{i}" for i in range(len(prompts))], + request_id=request_id, + kv_sender_info=kv_sender_info, + ) + + loop = asyncio.get_running_loop() + results = await loop.run_in_executor(self._executor, self._engine.step, request) + + all_images: list = [] + merged_mm: dict[str, Any] = {} + merged_metrics: dict[str, Any] = {} + merged_durations: dict[str, float] = {} + merged_custom: dict[str, Any] = {} + peak_mem = 0.0 + latents = None + trajectory_latents: list[torch.Tensor] | None = None + trajectory_timesteps: list[torch.Tensor] | None = None + trajectory_log_probs: torch.Tensor | None = None + trajectory_decoded: list[Image.Image] | None = None + final_output_type = "image" + + for r in results: + all_images.extend(r.images) + merged_mm.update(r._multimodal_output) + merged_metrics.update(r.metrics) + merged_durations.update(r.stage_durations) + merged_custom.update(r._custom_output) + peak_mem = max(peak_mem, r.peak_memory_mb) + if latents is None and r.latents is not None: + latents = r.latents + if trajectory_latents is None: + trajectory_latents = r.trajectory_latents + if trajectory_timesteps is None: + trajectory_timesteps = r.trajectory_timesteps + if trajectory_log_probs is None: + trajectory_log_probs = r.trajectory_log_probs + if trajectory_decoded is None: + trajectory_decoded = r.trajectory_decoded + if r.final_output_type != "image": + final_output_type = r.final_output_type + + result = OmniRequestOutput.from_diffusion( + request_id=request_id, + images=all_images, + prompt=prompts[0] if len(prompts) == 1 else None, + metrics=merged_metrics, + latents=latents, + trajectory_latents=trajectory_latents, + trajectory_timesteps=trajectory_timesteps, + trajectory_log_probs=trajectory_log_probs, + trajectory_decoded=trajectory_decoded, + custom_output=merged_custom or None, + multimodal_output=merged_mm or None, + final_output_type=final_output_type, + stage_durations=merged_durations, + peak_memory_mb=peak_mem, + ) + + self._output_queue.put_nowait(result) + except DiffusionRequestAbortedError as e: + logger.info("request_id: %s aborted: %s", request_id, str(e)) + except Exception as e: + logger.exception("Batch diffusion request %s failed: %s", request_id, e) + error_output = OmniRequestOutput.from_diffusion( + request_id=request_id, + images=[], + ) + error_output.error = str(e) + self._output_queue.put_nowait(error_output) + finally: + self._tasks.pop(request_id, None) + + def get_diffusion_output_nowait(self) -> OmniRequestOutput | None: + try: + return self._output_queue.get_nowait() + except asyncio.QueueEmpty: + return None + + async def abort_requests_async(self, request_ids: list[str]) -> None: + for rid in request_ids: + task = self._tasks.pop(rid, None) + if task: + task.cancel() + self._engine.abort(rid) + + 
async def collective_rpc_async( + self, + method: str, + timeout: float | None = None, + args: tuple[Any, ...] = (), + kwargs: dict[str, Any] | None = None, + ) -> Any: + loop = asyncio.get_running_loop() + + if method == "profile": + is_start = args[0] if args else True + profile_prefix = args[1] if len(args) > 1 else None + if is_start and profile_prefix is None: + profile_prefix = f"stage_{self.stage_id}_diffusion_{int(time.time())}" + return await loop.run_in_executor( + self._executor, + self._engine.profile, + is_start, + profile_prefix, + ) + + kwargs = kwargs or {} + + # LoRA methods + if method == "add_lora": + lora_request = args[0] if args else kwargs.get("lora_request") + results = await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + "add_lora", + timeout, + (), + {"lora_request": lora_request}, + None, + ) + return all(results) if isinstance(results, list) else results + + if method == "remove_lora": + results = await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + "remove_lora", + timeout, + args, + kwargs, + None, + ) + return all(results) if isinstance(results, list) else results + + if method == "list_loras": + results = await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + "list_loras", + timeout, + (), + {}, + None, + ) + if not isinstance(results, list): + return results or [] + merged: set[int] = set() + for part in results: + merged.update(part or []) + return sorted(merged) + + if method == "pin_lora": + lora_id = args[0] if args else kwargs.get("adapter_id") + results = await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + "pin_lora", + timeout, + (), + {"adapter_id": lora_id}, + None, + ) + return all(results) if isinstance(results, list) else results + + return await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + method, + timeout, + args, + kwargs, + None, + ) + + def shutdown(self) -> None: + self._shutting_down = True + + # Cancel all pending tasks + for task in self._tasks.values(): + task.cancel() + + try: + # Cancel queued futures and wait for the running one to complete deterministically + self._executor.shutdown(wait=True, cancel_futures=True) + except Exception: + pass + + try: + self._engine.close() + except Exception: + pass diff --git a/vllm_omni/diffusion/stage_diffusion_client.py b/vllm_omni/diffusion/stage_diffusion_client.py index 7e740dc893..480d113d19 100644 --- a/vllm_omni/diffusion/stage_diffusion_client.py +++ b/vllm_omni/diffusion/stage_diffusion_client.py @@ -34,6 +34,24 @@ logger = init_logger(__name__) +def create_diffusion_client( + model: str, + od_config: OmniDiffusionConfig, + metadata: StageMetadata, + stage_init_timeout: int, + batch_size: int = 1, + use_inline: bool = False, +) -> Any: + """Factory to create either an inline or out-of-process diffusion client.""" + if use_inline: + from vllm_omni.diffusion.inline_stage_diffusion_client import InlineStageDiffusionClient + + return InlineStageDiffusionClient(model, od_config, metadata, batch_size=batch_size) + return StageDiffusionClient( + model, od_config, metadata, stage_init_timeout=stage_init_timeout, batch_size=batch_size + ) + + class StageDiffusionClient: """Communicates with StageDiffusionProc via ZMQ for use inside the Orchestrator. 
@@ -154,6 +172,13 @@ def _drain_responses(self) -> None: "error": True, "reason": error_msg, } + elif req_id is not None: + error_output = OmniRequestOutput.from_diffusion( + request_id=req_id, + images=[], + ) + error_output.error = error_msg + self._output_queue.put_nowait(error_output) # Fields that are subprocess-local and cannot be serialized across # process boundaries. They are recreated in the subprocess with diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py index cef697630f..eced444fd3 100644 --- a/vllm_omni/diffusion/stage_diffusion_proc.py +++ b/vllm_omni/diffusion/stage_diffusion_proc.py @@ -19,12 +19,11 @@ import zmq.asyncio from PIL import Image from vllm.logger import init_logger -from vllm.transformers_utils.config import get_hf_file_to_dict from vllm.utils.network_utils import get_open_zmq_ipc_path, zmq_socket_ctx from vllm.utils.system_utils import get_mp_context from vllm.v1.utils import shutdown -from vllm_omni.diffusion.data import DiffusionRequestAbortedError, TransformerConfig +from vllm_omni.diffusion.data import DiffusionRequestAbortedError from vllm_omni.diffusion.diffusion_engine import DiffusionEngine from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.distributed.omni_connectors.utils.serialization import ( @@ -66,47 +65,8 @@ def initialize(self) -> None: logger.info("StageDiffusionProc initialized with model: %s", self._model) def _enrich_config(self) -> None: - """Load model metadata from HuggingFace and populate od_config fields. - - Diffusers-style models expose ``model_index.json`` with ``_class_name``. - Non-diffusers models (e.g. Bagel, NextStep) only have ``config.json``, - so we fall back to reading that and mapping model_type manually. 
- """ - od_config = self._od_config - - try: - config_dict = get_hf_file_to_dict("model_index.json", od_config.model) - if config_dict is not None: - if od_config.model_class_name is None: - od_config.model_class_name = config_dict.get("_class_name", None) - od_config.update_multimodal_support() - - tf_config_dict = get_hf_file_to_dict("transformer/config.json", od_config.model) - od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict) - else: - raise FileNotFoundError("model_index.json not found") - except (AttributeError, OSError, ValueError, FileNotFoundError): - cfg = get_hf_file_to_dict("config.json", od_config.model) - if cfg is None: - raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}") - - od_config.tf_model_config = TransformerConfig.from_dict(cfg) - model_type = cfg.get("model_type") - architectures = cfg.get("architectures") or [] - - if model_type == "bagel" or "BagelForConditionalGeneration" in architectures: - od_config.model_class_name = "BagelPipeline" - od_config.tf_model_config = TransformerConfig() - od_config.update_multimodal_support() - elif model_type == "nextstep": - if od_config.model_class_name is None: - od_config.model_class_name = "NextStep11Pipeline" - od_config.tf_model_config = TransformerConfig() - od_config.update_multimodal_support() - elif architectures and len(architectures) == 1: - od_config.model_class_name = architectures[0] - else: - raise + """Load model metadata from HuggingFace and populate od_config fields.""" + self._od_config.enrich_config() # ------------------------------------------------------------------ # Request processing diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 9609cf6e26..054d5342d9 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -759,12 +759,14 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: self._omni_master_server, ) else: + use_inline = True if self.num_stages == 1 else False stage_clients[stage_idx] = initialize_diffusion_stage( self.model, stage_cfg, metadata, stage_init_timeout=stage_init_timeout, batch_size=self.diffusion_batch_size, + use_inline=use_inline, ) logger.info( "[AsyncOmniEngine] Stage %s initialized (diffusion, batch_size=%d)", diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index 386b545eb7..0fdab9c0d2 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -246,6 +246,23 @@ async def _orchestration_loop(self) -> None: idle = False req_state = self.request_states.get(output.request_id) if req_state is not None: + if getattr(output, "error", None) is not None: + parent_id = self._companion_to_parent.get(output.request_id, output.request_id) + await self.output_async_queue.put( + { + "type": "error", + "request_id": parent_id, + "stage_id": stage_id, + "error": output.error, + } + ) + role_map = self._companion_map.get(parent_id, {}) + for cid in role_map.values(): + self.request_states.pop(cid, None) + self._cleanup_companion_state(parent_id) + self.request_states.pop(parent_id, None) + continue + stage_metrics = self._build_stage_metrics(stage_id, output.request_id, [output], req_state) await self._route_output(stage_id, output, req_state, stage_metrics) continue diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 158b4c5477..bf40aa77cd 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ 
b/vllm_omni/engine/stage_init_utils.py @@ -530,6 +530,7 @@ def initialize_diffusion_stage( metadata: StageMetadata, stage_init_timeout: int, batch_size: int = 1, + use_inline: bool = False, ) -> Any: """Build a diffusion stage client. @@ -541,13 +542,12 @@ def initialize_diffusion_stage( batch_size: Maximum number of requests to batch together in the diffusion engine. Passed through to ``StageDiffusionClient`` and ultimately to ``AsyncOmni``. + use_inline: If True, uses the inline diffusion client instead of subprocess. """ - from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient + from vllm_omni.diffusion.stage_diffusion_client import create_diffusion_client od_config = build_diffusion_config(model, stage_cfg, metadata) - return StageDiffusionClient( - model, od_config, metadata, stage_init_timeout=stage_init_timeout, batch_size=batch_size - ) + return create_diffusion_client(model, od_config, metadata, stage_init_timeout, batch_size, use_inline) def _shutdown_or_close_resource(resource: Any, resource_name: str, stage_id: int) -> None: diff --git a/vllm_omni/entrypoints/openai/video_api_utils.py b/vllm_omni/entrypoints/openai/video_api_utils.py index 1935469792..3fb991225c 100644 --- a/vllm_omni/entrypoints/openai/video_api_utils.py +++ b/vllm_omni/entrypoints/openai/video_api_utils.py @@ -227,6 +227,9 @@ def _encode_video_bytes( frames_np *= 255.0 frames_u8 = np.round(frames_np).astype(np.uint8) + # Ensure contiguous memory layout for faster PyAV muxing + frames_u8 = np.ascontiguousarray(frames_u8) + audio_np = _coerce_audio_to_numpy(audio) if audio is not None else None return mux_video_audio_bytes( diff --git a/vllm_omni/outputs.py b/vllm_omni/outputs.py index 2c2c1d21c1..c02c0c1427 100644 --- a/vllm_omni/outputs.py +++ b/vllm_omni/outputs.py @@ -100,6 +100,9 @@ class OmniRequestOutput: # memory usage info peak_memory_mb: float = 0.0 + # error handling + error: str | None = None + @classmethod def from_pipeline( cls, From 6c6551dff8856e8e936cf29b5886174e4b149e4a Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Wed, 15 Apr 2026 11:52:20 +0800 Subject: [PATCH 177/204] [Feature] feat: add video frame interpolation postprocess (#2555) Signed-off-by: David Chen <530634352@qq.com> --- docs/.nav.yml | 1 + .../diffusion/frame_interpolation.md | 92 ++++ docs/user_guide/diffusion_features.md | 9 +- .../examples/online_serving/image_to_video.md | 32 ++ .../examples/online_serving/text_to_video.md | 32 ++ .../openai_api/test_video_api_utils.py | 92 ++++ .../openai_api/test_video_server.py | 134 +++++- vllm_omni/diffusion/diffusion_engine.py | 23 +- .../models/wan2_2/pipeline_wan2_2.py | 16 +- .../models/wan2_2/pipeline_wan2_2_i2v.py | 16 +- .../models/wan2_2/pipeline_wan2_2_ti2v.py | 16 +- vllm_omni/diffusion/postprocess/__init__.py | 10 + .../postprocess/rife_interpolator.py | 440 ++++++++++++++++++ vllm_omni/entrypoints/openai/api_server.py | 8 + .../entrypoints/openai/protocol/videos.py | 23 + vllm_omni/entrypoints/openai/serving_video.py | 22 +- vllm_omni/inputs/data.py | 4 + 17 files changed, 961 insertions(+), 9 deletions(-) create mode 100644 docs/user_guide/diffusion/frame_interpolation.md create mode 100644 tests/entrypoints/openai_api/test_video_api_utils.py create mode 100644 vllm_omni/diffusion/postprocess/__init__.py create mode 100644 vllm_omni/diffusion/postprocess/rife_interpolator.py diff --git a/docs/.nav.yml b/docs/.nav.yml index 86ce4a3b0c..441ef9f521 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -64,6 
+64,7 @@ nav: - FP8: user_guide/diffusion/quantization/fp8.md - Int8: user_guide/diffusion/quantization/int8.md - GGUF: user_guide/diffusion/quantization/gguf.md + - Frame Interpolation: user_guide/diffusion/frame_interpolation.md - Parallelism: - Overview: user_guide/diffusion/parallelism/overview.md - CFG Parallel: user_guide/diffusion/parallelism/cfg_parallel.md diff --git a/docs/user_guide/diffusion/frame_interpolation.md b/docs/user_guide/diffusion/frame_interpolation.md new file mode 100644 index 0000000000..349af50c51 --- /dev/null +++ b/docs/user_guide/diffusion/frame_interpolation.md @@ -0,0 +1,92 @@ +# Frame Interpolation + +## Overview + +vLLM-Omni supports post-generation frame interpolation for supported video +diffusion pipelines. This feature inserts synthesized intermediate frames +between adjacent generated frames to improve temporal smoothness without +rerunning the diffusion denoising loop. + +Frame interpolation runs in the diffusion worker post-processing path instead +of the API server encoding path. This allows the interpolation step to reuse +the worker's current accelerator device and keeps the FastAPI event loop free +from heavy synchronous PyTorch work. + +For an input video with `N` generated frames and interpolation exponent `exp`, +the output frame count is: + +```text +(N - 1) * 2**exp + 1 +``` + +The output FPS is multiplied by `2**exp` so the clip duration remains close to +the original generated video. + +## Supported Pipelines + +Frame interpolation is currently supported for: + +- `WanPipeline` (Wan2.2 text-to-video) +- `WanImageToVideoPipeline` +- `Wan22TI2VPipeline` + +## Request Parameters + +The video APIs `/v1/videos` and `/v1/videos/sync` accept: + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `enable_frame_interpolation` | bool | `false` | Enable post-generation frame interpolation | +| `frame_interpolation_exp` | int | `1` | Interpolation exponent. `1=2x`, `2=4x`, etc. | +| `frame_interpolation_scale` | float | `1.0` | RIFE inference scale | +| `frame_interpolation_model_path` | str | `None` | Local directory or Hugging Face repo ID containing `flownet.pkl` | + +## Execution Flow + +For supported Wan2.2 pipelines, the execution order is: + +1. Diffusion worker finishes denoising and decodes the raw video tensor. +2. Worker-side model-specific post-processing runs. +3. If frame interpolation is enabled, RIFE interpolates the decoded video + tensor on the worker side and records a FPS multiplier in `custom_output`. +4. The API server receives the already-interpolated video and only performs + MP4 export. + +This design keeps interpolation close to the generated tensor and avoids +introducing another heavyweight GPU context in the API server process. + +## Example + +Start the server: + +```bash +vllm serve Wan-AI/Wan2.2-T2V-A14B-Diffusers --omni --port 8091 +``` + +Run a sync request with interpolation enabled: + +```bash +curl -X POST http://localhost:8091/v1/videos/sync \ + -F "prompt=A dog running through a park" \ + -F "num_frames=81" \ + -F "width=832" \ + -F "height=480" \ + -F "fps=16" \ + -F "num_inference_steps=40" \ + -F "guidance_scale=1.0" \ + -F "guidance_scale_2=1.0" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ + -F "seed=42" \ + -o sync_t2v_interpolated.mp4 +``` + +## Notes + +- This is a post-processing feature. It does not modify the diffusion denoising + schedule. 
+- Higher interpolation exponents increase post-processing time and memory usage. +- If the interpolation model weights are not available locally, + `frame_interpolation_model_path` may point to a Hugging Face repo containing + `flownet.pkl`. diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 31cd1500fa..45953b8529 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -14,7 +14,7 @@ vLLM-Omni supports various advanced features for diffusion models: - Acceleration: **cache methods**, **parallelism methods**, **startup optimizations** - Memory optimization: **cpu offloading**, **quantization** -- Extensions: **LoRA inference** +- Extensions: **LoRA inference**, **frame interpolation** - Execution modes: **step execution** ## Supported Features @@ -69,6 +69,7 @@ Extension methods add specialized capabilities to diffusion models beyond standa | Method | Description | Best For | |--------|-------------|----------| | **[LoRA Inference](diffusion/lora.md)** | Enables inference with Low-Rank Adaptation (LoRA) adapters weights | Reinforcement learning extensions | +| **[Frame Interpolation](diffusion/frame_interpolation.md)** | Inserts intermediate video frames after generation for smoother motion | Video generation pipelines that need higher temporal smoothness | ### Execution Modes @@ -143,6 +144,11 @@ The following tables show which models support each feature: | **HunyuanVideo-1.5 T2V I2V** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ❌ | | **DreamID-Omni** | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +**Frame Interpolation Support** + +- **Supported**: Wan2.2 text-to-video, image-to-video, and TI2V pipelines +- **Not supported**: Wan2.1-VACE, LTX-2, Helios, HunyuanVideo-1.5, DreamID-Omni + ### AudioGen | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | @@ -258,6 +264,7 @@ Measured on NVIDIA H800: **Extensions:** - **[LoRA Inference Guide](diffusion/lora.md)** - Low-Rank Adaptation for style customization and fine-tuning +- **[Frame Interpolation Guide](diffusion/frame_interpolation.md)** - Worker-side post-generation video frame interpolation for smoother motion **Execution Modes:** diff --git a/docs/user_guide/examples/online_serving/image_to_video.md b/docs/user_guide/examples/online_serving/image_to_video.md index 00b67d74e2..781f0c2a5e 100644 --- a/docs/user_guide/examples/online_serving/image_to_video.md +++ b/docs/user_guide/examples/online_serving/image_to_video.md @@ -72,6 +72,9 @@ curl -X POST http://localhost:8091/v1/videos/sync \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ -F "seed=42" \ -o sync_i2v_output.mp4 ``` @@ -114,6 +117,9 @@ create_response=$(curl -s http://localhost:8091/v1/videos \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ -F "seed=42") video_id=$(echo "$create_response" | jq -r '.id') @@ -172,9 +178,35 @@ curl -X POST http://localhost:8091/v1/videos \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ -F "seed=42" 
``` +Frame interpolation is also available for supported Wan2.2 I2V requests. See +[Frame Interpolation](../../diffusion/frame_interpolation.md) for worker-side +execution details and feature constraints. + +### Frame Interpolation Example + +```bash +curl -X POST http://localhost:8091/v1/videos/sync \ + -F "prompt=A bear playing with yarn, smooth motion" \ + -F "input_reference=@/path/to/qwen-bear.png" \ + -F "width=832" \ + -F "height=480" \ + -F "num_frames=33" \ + -F "fps=16" \ + -F "num_inference_steps=40" \ + -F "guidance_scale=1.0" \ + -F "guidance_scale_2=1.0" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ + -o sync_i2v_interpolated.mp4 +``` + ## Create Response Format `POST /v1/videos` returns a job record, not inline base64 video data. diff --git a/docs/user_guide/examples/online_serving/text_to_video.md b/docs/user_guide/examples/online_serving/text_to_video.md index 01e6d9d464..00a9c16723 100644 --- a/docs/user_guide/examples/online_serving/text_to_video.md +++ b/docs/user_guide/examples/online_serving/text_to_video.md @@ -165,6 +165,9 @@ curl -X POST http://localhost:8091/v1/videos \ -F "guidance_scale_2=4.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=5.0" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ -F "seed=42" ``` @@ -187,6 +190,35 @@ curl -X POST http://localhost:8091/v1/videos \ | `flow_shift` | float | None | Scheduler flow shift (Wan2.2) | | `seed` | int | None | Random seed (reproducible) | | `lora` | object | None | LoRA configuration | +| `enable_frame_interpolation` | bool | false | Enable RIFE frame interpolation before MP4 encoding | +| `frame_interpolation_exp` | int | 1 | Interpolation exponent; 1=2x temporal resolution, 2=4x | +| `frame_interpolation_scale` | float | 1.0 | RIFE inference scale; use 0.5 for high-resolution inputs | +| `frame_interpolation_model_path` | str | None | Local directory or Hugging Face repo ID with `flownet.pkl`; defaults to `elfgum/RIFE-4.22.lite` | + +## Frame Interpolation + +Frame interpolation is an optional post-processing step for `/v1/videos` and +`/v1/videos/sync`. It synthesizes intermediate frames between generated frames +without rerunning the diffusion model. If the generated video has `N` frames, +the interpolated output frame count is `(N - 1) * 2**exp + 1`. The encoder FPS +is multiplied by `2**exp` so the output duration remains close to the original. + +Frame interpolation runs in the diffusion worker post-processing path instead of +the API server encoding path, so it can reuse the worker's current accelerator +device without blocking the FastAPI event loop. 
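+
+The frame count arithmetic can be sanity-checked independently of the server.
+The helper below is a minimal illustrative sketch (the function name is
+hypothetical and not part of vLLM-Omni's API):
+
+```python
+def interpolated_frame_count(num_frames: int, exp: int) -> int:
+    # Each interpolation pass doubles the number of inter-frame intervals,
+    # so N frames become (N - 1) * 2**exp + 1 frames after `exp` passes.
+    return (num_frames - 1) * 2**exp + 1
+
+
+assert interpolated_frame_count(5, 1) == 9  # 2x, matches the example below
+assert interpolated_frame_count(81, 2) == 321  # 4x interpolation of 81 frames
+```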
+ +Example: generate 5 frames and interpolate to 9 frames: + +```bash +curl -X POST http://localhost:8091/v1/videos/sync \ + -F "prompt=A dog running through a park" \ + -F "num_frames=5" \ + -F "fps=8" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ + -o sync_t2v_interpolated.mp4 +``` ## Create Response Format diff --git a/tests/entrypoints/openai_api/test_video_api_utils.py b/tests/entrypoints/openai_api/test_video_api_utils.py new file mode 100644 index 0000000000..5012c9b982 --- /dev/null +++ b/tests/entrypoints/openai_api/test_video_api_utils.py @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for OpenAI-compatible video API encoding helpers.""" + +import numpy as np +import pytest +import torch + +from vllm_omni.diffusion.postprocess import rife_interpolator +from vllm_omni.entrypoints.openai import video_api_utils + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def _install_fake_video_mux(monkeypatch, mux_calls): + def _fake_mux_video_audio_bytes(frames, audio, fps, audio_sample_rate, video_codec_options=None): + mux_calls.append( + { + "frames": frames, + "audio": audio, + "fps": fps, + "audio_sample_rate": audio_sample_rate, + "video_codec_options": video_codec_options, + } + ) + return b"fake-video" + + monkeypatch.setattr( + "vllm_omni.diffusion.utils.media_utils.mux_video_audio_bytes", + _fake_mux_video_audio_bytes, + ) + + +def test_encode_video_bytes_exports_frames_without_interpolation(monkeypatch): + mux_calls = [] + _install_fake_video_mux(monkeypatch, mux_calls) + + frames = [np.full((2, 2, 3), fill_value=i / 5, dtype=np.float32) for i in range(5)] + video_bytes = video_api_utils._encode_video_bytes( + frames, + fps=8, + ) + + assert video_bytes == b"fake-video" + assert mux_calls[0]["frames"].shape == (5, 2, 2, 3) + assert mux_calls[0]["frames"].dtype == np.uint8 + assert mux_calls[0]["fps"] == 8.0 + assert mux_calls[0]["audio"] is None + + +def test_rife_model_inference_runs_on_dummy_tensors(): + model = rife_interpolator.Model().eval() + img0 = torch.rand(1, 3, 32, 32) + img1 = torch.rand(1, 3, 32, 32) + + output = model.inference(img0, img1, scale=1.0) + + assert output.shape == (1, 3, 32, 32) + assert torch.isfinite(output).all() + + +def test_frame_interpolator_runs_actual_torch_tensor_path(monkeypatch): + model = rife_interpolator.Model().eval() + interpolator = rife_interpolator.FrameInterpolator() + monkeypatch.setattr(interpolator, "_ensure_model_loaded", lambda preferred_device=None: model) + + video = torch.zeros(1, 3, 2, 32, 32) + output_video, multiplier = interpolator.interpolate_tensor(video, exp=1, scale=1.0) + + assert multiplier == 2 + assert output_video.shape == (1, 3, 3, 32, 32) + assert torch.isfinite(output_video).all() + + +def test_frame_interpolator_prefers_input_tensor_device(monkeypatch): + chosen_devices = [] + model = rife_interpolator.Model().eval() + + def _fake_ensure_model_loaded(*, preferred_device=None): + chosen_devices.append(preferred_device) + return model + + interpolator = rife_interpolator.FrameInterpolator() + monkeypatch.setattr(interpolator, "_ensure_model_loaded", _fake_ensure_model_loaded) + monkeypatch.setattr(model.flownet, "to", lambda device: model.flownet) + + video = torch.zeros(1, 3, 2, 32, 32) + output_video, multiplier = interpolator.interpolate_tensor(video, exp=1, scale=1.0) + + assert chosen_devices == [video.device] + assert multiplier == 
2 + assert output_video.shape == (1, 3, 3, 32, 32) diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py index 82c34f87e8..7a395bab5b 100644 --- a/tests/entrypoints/openai_api/test_video_server.py +++ b/tests/entrypoints/openai_api/test_video_server.py @@ -34,15 +34,28 @@ class MockVideoResult: - def __init__(self, videos, audios=None, sample_rate=None, stage_durations=None, peak_memory_mb=0.0): + def __init__( + self, + videos, + audios=None, + sample_rate=None, + custom_output=None, + stage_durations=None, + peak_memory_mb=0.0, + ): self.multimodal_output = {"video": videos} if audios is not None: self.multimodal_output["audio"] = audios if sample_rate is not None: self.multimodal_output["audio_sample_rate"] = sample_rate + self._custom_output = custom_output or {} self.stage_durations = stage_durations or {} self.peak_memory_mb = peak_memory_mb + @property + def custom_output(self): + return self._custom_output + class FakeAsyncOmni: def __init__(self): @@ -400,6 +413,67 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture): assert captured.extra_args["flow_shift"] == 0.25 +def test_frame_interpolation_params_pass_to_diffusion_sampling_params(test_client, mocker: MockerFixture): + """Frame interpolation parameters should be forwarded to diffusion worker sampling params.""" + mocker.patch( + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", + ) + response = test_client.post( + "/v1/videos", + data={ + "prompt": "smooth motion", + "fps": "8", + "enable_frame_interpolation": "true", + "frame_interpolation_exp": "2", + "frame_interpolation_scale": "0.5", + "frame_interpolation_model_path": "local-rife", + }, + ) + + assert response.status_code == 200 + video_id = response.json()["id"] + _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value) + + engine = test_client.app.state.openai_serving_video._engine_client + captured = engine.captured_sampling_params_list[0] + assert captured.enable_frame_interpolation is True + assert captured.frame_interpolation_exp == 2 + assert captured.frame_interpolation_scale == 0.5 + assert captured.frame_interpolation_model_path == "local-rife" + + +def test_worker_fps_multiplier_is_applied_to_async_encoding(test_client, mocker: MockerFixture): + fps_values = [] + engine = test_client.app.state.openai_serving_video._engine_client + + async def _generate(prompt, request_id, sampling_params_list): + engine.captured_prompt = prompt + engine.captured_sampling_params_list = sampling_params_list + import numpy as np + + yield MockVideoResult([np.zeros((1, 64, 64, 3), dtype=np.uint8)], custom_output={"video_fps_multiplier": 2}) + + engine.generate = _generate + + def _fake_encode(video, fps, **kwargs): + del video, kwargs + fps_values.append(fps) + return b"fake-video" + + mocker.patch( + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + side_effect=_fake_encode, + ) + + response = test_client.post("/v1/videos", data={"prompt": "fps multiplier", "fps": "8"}) + + assert response.status_code == 200 + video_id = response.json()["id"] + _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value) + assert fps_values == [16] + + def test_audio_sample_rate_comes_from_model_config(test_client, mocker: MockerFixture): audio_sample_rates = [] @@ -595,6 +669,10 @@ def test_video_request_validation(): with pytest.raises(ValueError): VideoGenerationRequest(prompt="test", 
image_reference={"file_id": "file-1", "image_url": "https://example.com"}) + with pytest.raises(ValueError): + VideoGenerationRequest(prompt="test", frame_interpolation_exp=0) + with pytest.raises(ValueError): + VideoGenerationRequest(prompt="test", frame_interpolation_scale=0) def test_list_videos_supports_order_after_and_limit(test_client, mocker: MockerFixture): @@ -1032,3 +1110,57 @@ def test_sync_sampling_params_pass_through(test_client, mocker: MockerFixture): assert captured.num_inference_steps == 30 assert captured.guidance_scale == 6.5 assert captured.seed == 42 + + +def test_sync_frame_interpolation_params_pass_to_sampling_params(test_client, mocker: MockerFixture): + """Frame interpolation parameters should be forwarded on the sync path.""" + encode_mock = _mock_encode_video_bytes(mocker) + response = test_client.post( + "/v1/videos/sync", + data={ + "prompt": "smooth sync", + "fps": "8", + "enable_frame_interpolation": "true", + "frame_interpolation_exp": "2", + "frame_interpolation_scale": "0.5", + "frame_interpolation_model_path": "local-rife", + }, + ) + + assert response.status_code == 200 + engine = test_client.app.state.openai_serving_video._engine_client + captured = engine.captured_sampling_params_list[0] + assert captured.enable_frame_interpolation is True + assert captured.frame_interpolation_exp == 2 + assert captured.frame_interpolation_scale == 0.5 + assert captured.frame_interpolation_model_path == "local-rife" + _, kwargs = encode_mock.call_args + assert kwargs["fps"] == 8 + + +def test_worker_fps_multiplier_is_applied_to_sync_encoding(test_client, mocker: MockerFixture): + engine = test_client.app.state.openai_serving_video._engine_client + fps_values = [] + + async def _generate(prompt, request_id, sampling_params_list): + engine.captured_prompt = prompt + engine.captured_sampling_params_list = sampling_params_list + yield MockVideoResult([object()], custom_output={"video_fps_multiplier": 2}) + + engine.generate = _generate + + def _fake_encode(video, fps, **kwargs): + del video, kwargs + fps_values.append(fps) + return b"fps-multiplied" + + mocker.patch( + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + side_effect=_fake_encode, + ) + + response = test_client.post("/v1/videos/sync", data={"prompt": "fps multiplier", "fps": "8"}) + + assert response.status_code == 200 + assert response.content == b"fps-multiplied" + assert fps_values == [16] diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index 52a8f38547..fe940d623e 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -3,6 +3,7 @@ from __future__ import annotations +import inspect import queue import threading import time @@ -78,6 +79,12 @@ def __init__( self.post_process_func = get_diffusion_post_process_func(od_config) self.pre_process_func = get_diffusion_pre_process_func(od_config) + # Cache whether the model-specific postprocess accepts request-level + # sampling params so step() can support both legacy and extended hooks. 
+ self._post_process_accepts_sampling_params = bool( + self.post_process_func is not None + and "sampling_params" in inspect.signature(self.post_process_func).parameters + ) executor_class = DiffusionExecutor.get_class(od_config) self.executor = executor_class(od_config) @@ -143,12 +150,22 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: output_data = output_data.cpu() postprocess_start_time = time.perf_counter() - outputs = self.post_process_func(output_data) if self.post_process_func is not None else output_data + if self.post_process_func is not None: + # Some video pipelines need request-level controls during + # postprocess (for example worker-side frame interpolation). + if self._post_process_accepts_sampling_params: + outputs = self.post_process_func(output_data, sampling_params=request.sampling_params) + else: + outputs = self.post_process_func(output_data) + else: + outputs = output_data audio_payload = None + custom_output = output.custom_output or {} model_audio_sample_rate = None model_fps = None if isinstance(outputs, dict): audio_payload = outputs.get("audio") + custom_output.update(outputs.get("custom_output") or {}) model_audio_sample_rate = outputs.get("audio_sample_rate") model_fps = outputs.get("fps") outputs = outputs.get("video", outputs) @@ -225,7 +242,7 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: trajectory_timesteps=output.trajectory_timesteps, trajectory_log_probs=output.trajectory_log_probs, trajectory_decoded=output.trajectory_decoded, - custom_output=output.custom_output or {}, + custom_output=custom_output, multimodal_output=mm_output, stage_durations=output.stage_durations, peak_memory_mb=output.peak_memory_mb, @@ -295,7 +312,7 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: trajectory_timesteps=output.trajectory_timesteps, trajectory_log_probs=output.trajectory_log_probs, trajectory_decoded=output.trajectory_decoded, - custom_output=output.custom_output or {}, + custom_output=custom_output, multimodal_output=mm_output, stage_durations=output.stage_durations, peak_memory_mb=output.peak_memory_mb, diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py index 84d89619e8..a1b10439c8 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py @@ -26,6 +26,7 @@ from vllm_omni.diffusion.models.schedulers import FlowUniPCMultistepScheduler from vllm_omni.diffusion.models.wan2_2.scheduling_wan_euler import WanEulerScheduler from vllm_omni.diffusion.models.wan2_2.wan2_2_transformer import WanTransformer3DModel +from vllm_omni.diffusion.postprocess import interpolate_video_tensor from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.inputs.data import OmniTextPrompt @@ -162,10 +163,23 @@ def get_wan22_post_process_func( def post_process_func( video: torch.Tensor, output_type: str = "np", + sampling_params=None, ): if output_type == "latent": return video - return video_processor.postprocess_video(video, output_type=output_type) + custom_output = {} + if sampling_params is not None and getattr(sampling_params, "enable_frame_interpolation", False): + video, multiplier = interpolate_video_tensor( + video, + exp=sampling_params.frame_interpolation_exp, + scale=sampling_params.frame_interpolation_scale, + 
model_path=sampling_params.frame_interpolation_model_path, + ) + custom_output["video_fps_multiplier"] = multiplier + return { + "video": video_processor.postprocess_video(video, output_type=output_type), + "custom_output": custom_output, + } return post_process_func diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py index 46484cd789..ddc6e0bc2b 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py @@ -32,6 +32,7 @@ resolve_wan_sample_solver, retrieve_latents, ) +from vllm_omni.diffusion.postprocess import interpolate_video_tensor from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.inputs.data import OmniTextPrompt @@ -74,10 +75,23 @@ def get_wan22_i2v_post_process_func( def post_process_func( video: torch.Tensor, output_type: str = "np", + sampling_params=None, ): if output_type == "latent": return video - return video_processor.postprocess_video(video, output_type=output_type) + custom_output = {} + if sampling_params is not None and getattr(sampling_params, "enable_frame_interpolation", False): + video, multiplier = interpolate_video_tensor( + video, + exp=sampling_params.frame_interpolation_exp, + scale=sampling_params.frame_interpolation_scale, + model_path=sampling_params.frame_interpolation_model_path, + ) + custom_output["video_fps_multiplier"] = multiplier + return { + "video": video_processor.postprocess_video(video, output_type=output_type), + "custom_output": custom_output, + } return post_process_func diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py index 939fe294a3..62df13cbde 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py @@ -44,6 +44,7 @@ resolve_wan_sample_solver, retrieve_latents, ) +from vllm_omni.diffusion.postprocess import interpolate_video_tensor from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.inputs.data import OmniTextPrompt from vllm_omni.platforms import current_omni_platform @@ -61,10 +62,23 @@ def get_wan22_ti2v_post_process_func( def post_process_func( video: torch.Tensor, output_type: str = "np", + sampling_params=None, ): if output_type == "latent": return video - return video_processor.postprocess_video(video, output_type=output_type) + custom_output = {} + if sampling_params is not None and getattr(sampling_params, "enable_frame_interpolation", False): + video, multiplier = interpolate_video_tensor( + video, + exp=sampling_params.frame_interpolation_exp, + scale=sampling_params.frame_interpolation_scale, + model_path=sampling_params.frame_interpolation_model_path, + ) + custom_output["video_fps_multiplier"] = multiplier + return { + "video": video_processor.postprocess_video(video, output_type=output_type), + "custom_output": custom_output, + } return post_process_func diff --git a/vllm_omni/diffusion/postprocess/__init__.py b/vllm_omni/diffusion/postprocess/__init__.py new file mode 100644 index 0000000000..e6fe5b2d22 --- /dev/null +++ b/vllm_omni/diffusion/postprocess/__init__.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Diffusion post-processing helpers.""" + +from 
vllm_omni.diffusion.postprocess.rife_interpolator import ( + FrameInterpolator, + interpolate_video_tensor, +) + +__all__ = ["FrameInterpolator", "interpolate_video_tensor"] diff --git a/vllm_omni/diffusion/postprocess/rife_interpolator.py b/vllm_omni/diffusion/postprocess/rife_interpolator.py new file mode 100644 index 0000000000..b2b4a93191 --- /dev/null +++ b/vllm_omni/diffusion/postprocess/rife_interpolator.py @@ -0,0 +1,440 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +RIFE 4.22.lite frame interpolation for vLLM-Omni video generation. + +RIFE model code is vendored and adapted from: + - https://github.com/hzwer/ECCV2022-RIFE (MIT License) + - https://github.com/hzwer/Practical-RIFE (MIT License) + Copyright (c) 2021 Zhewei Huang + +The FrameInterpolator wrapper and vLLM-Omni integration code are original work. +""" + +from __future__ import annotations + +import os +import threading +from typing import Any + +import torch +import torch.nn as nn +import torch.nn.functional as F +from vllm.logger import init_logger + +logger = init_logger(__name__) + +_DEFAULT_RIFE_HF_REPO = "elfgum/RIFE-4.22.lite" +_MODEL_CACHE: dict[tuple[str, str], Model] = {} +_MODEL_CACHE_LOCK = threading.Lock() + + +def warp(ten_input: torch.Tensor, ten_flow: torch.Tensor) -> torch.Tensor: + """Warp input tensor by optical flow using grid_sample.""" + ten_horizontal = ( + torch.linspace(-1.0, 1.0, ten_flow.shape[3], device=ten_flow.device) + .view(1, 1, 1, ten_flow.shape[3]) + .expand(ten_flow.shape[0], -1, ten_flow.shape[2], -1) + ) + ten_vertical = ( + torch.linspace(-1.0, 1.0, ten_flow.shape[2], device=ten_flow.device) + .view(1, 1, ten_flow.shape[2], 1) + .expand(ten_flow.shape[0], -1, -1, ten_flow.shape[3]) + ) + ten_grid = torch.cat([ten_horizontal, ten_vertical], dim=1) + + ten_flow = torch.cat( + [ + ten_flow[:, 0:1, :, :] / ((ten_input.shape[3] - 1.0) / 2.0), + ten_flow[:, 1:2, :, :] / ((ten_input.shape[2] - 1.0) / 2.0), + ], + dim=1, + ) + grid = (ten_grid + ten_flow).permute(0, 2, 3, 1) + return F.grid_sample( + input=ten_input, + grid=grid, + mode="bilinear", + padding_mode="border", + align_corners=True, + ) + + +def _conv( + in_planes: int, + out_planes: int, + kernel_size: int = 3, + stride: int = 1, + padding: int = 1, + dilation: int = 1, +) -> nn.Sequential: + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=True, + ), + nn.LeakyReLU(0.2, True), + ) + + +class ResConv(nn.Module): + """Residual convolution block with learnable beta scaling.""" + + def __init__(self, c: int, dilation: int = 1): + super().__init__() + self.conv = nn.Conv2d(c, c, 3, 1, dilation, dilation=dilation, groups=1) + self.beta = nn.Parameter(torch.ones((1, c, 1, 1)), requires_grad=True) + self.relu = nn.LeakyReLU(0.2, True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.relu(self.conv(x) * self.beta + x) + + +class IFBlock(nn.Module): + """Single-scale optical flow, mask, and feature block.""" + + def __init__(self, in_planes: int, c: int = 64): + super().__init__() + self.conv0 = nn.Sequential( + _conv(in_planes, c // 2, 3, 2, 1), + _conv(c // 2, c, 3, 2, 1), + ) + self.convblock = nn.Sequential( + ResConv(c), + ResConv(c), + ResConv(c), + ResConv(c), + ResConv(c), + ResConv(c), + ResConv(c), + ResConv(c), + ) + self.lastconv = nn.Sequential( + nn.ConvTranspose2d(c, 4 * 13, 4, 2, 1), + nn.PixelShuffle(2), + ) + + def forward( + 
self, + x: torch.Tensor, + flow: torch.Tensor | None = None, + scale: float = 1.0, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + x = F.interpolate(x, scale_factor=1.0 / scale, mode="bilinear", align_corners=False) + if flow is not None: + flow = ( + F.interpolate( + flow, + scale_factor=1.0 / scale, + mode="bilinear", + align_corners=False, + ) + * 1.0 + / scale + ) + x = torch.cat((x, flow), 1) + feat = self.conv0(x) + feat = self.convblock(feat) + tmp = self.lastconv(feat) + tmp = F.interpolate(tmp, scale_factor=scale, mode="bilinear", align_corners=False) + flow = tmp[:, :4] * scale + mask = tmp[:, 4:5] + feat = tmp[:, 5:] + return flow, mask, feat + + +class Head(nn.Module): + """Feature encoder producing four-channel features at full resolution.""" + + def __init__(self): + super().__init__() + self.cnn0 = nn.Conv2d(3, 16, 3, 2, 1) + self.cnn1 = nn.Conv2d(16, 16, 3, 1, 1) + self.cnn2 = nn.Conv2d(16, 16, 3, 1, 1) + self.cnn3 = nn.ConvTranspose2d(16, 4, 4, 2, 1) + self.relu = nn.LeakyReLU(0.2, True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x0 = self.cnn0(x) + x = self.relu(x0) + x1 = self.cnn1(x) + x = self.relu(x1) + x2 = self.cnn2(x) + x = self.relu(x2) + x3 = self.cnn3(x) + return x3 + + +class IFNet(nn.Module): + """Four-scale IFNet optical flow network.""" + + def __init__(self): + super().__init__() + self.block0 = IFBlock(7 + 8, c=192) + self.block1 = IFBlock(8 + 4 + 8 + 8, c=128) + self.block2 = IFBlock(8 + 4 + 8 + 8, c=64) + self.block3 = IFBlock(8 + 4 + 8 + 8, c=32) + self.encode = Head() + + def forward( + self, + x: torch.Tensor, + timestep: float = 0.5, + scale_list: list[float] | None = None, + ) -> tuple[list[torch.Tensor], torch.Tensor, list[tuple[torch.Tensor, torch.Tensor] | torch.Tensor]]: + if scale_list is None: + scale_list = [8, 4, 2, 1] + + channel = x.shape[1] // 2 + img0 = x[:, :channel] + img1 = x[:, channel:] + + if not torch.is_tensor(timestep): + timestep = (x[:, :1].clone() * 0 + 1) * timestep + else: + timestep = timestep.repeat(1, 1, img0.shape[2], img0.shape[3]) + + f0 = self.encode(img0[:, :3]) + f1 = self.encode(img1[:, :3]) + + flow_list: list[torch.Tensor] = [] + merged: list[tuple[torch.Tensor, torch.Tensor] | torch.Tensor] = [] + mask_list: list[torch.Tensor] = [] + warped_img0 = img0 + warped_img1 = img1 + flow = None + mask = None + + for i, block in enumerate([self.block0, self.block1, self.block2, self.block3]): + if flow is None: + flow, mask, feat = block( + torch.cat((img0[:, :3], img1[:, :3], f0, f1, timestep), 1), + None, + scale=scale_list[i], + ) + else: + wf0 = warp(f0, flow[:, :2]) + wf1 = warp(f1, flow[:, 2:4]) + fd, m0, feat = block( + torch.cat( + ( + warped_img0[:, :3], + warped_img1[:, :3], + wf0, + wf1, + timestep, + mask, + feat, + ), + 1, + ), + flow, + scale=scale_list[i], + ) + mask = m0 + flow = flow + fd + + mask_list.append(mask) + flow_list.append(flow) + warped_img0 = warp(img0, flow[:, :2]) + warped_img1 = warp(img1, flow[:, 2:4]) + merged.append((warped_img0, warped_img1)) + + mask = torch.sigmoid(mask) + merged[3] = warped_img0 * mask + warped_img1 * (1 - mask) + return flow_list, mask_list[3], merged + + +class Model: + """Wraps IFNet and exposes RIFE-compatible load/inference helpers.""" + + def __init__(self): + self.flownet = IFNet() + + def eval(self) -> Model: + self.flownet.eval() + return self + + def device(self) -> torch.device: + return next(self.flownet.parameters()).device + + def load_model(self, path: str) -> None: + flownet_path = os.path.join(path, "flownet.pkl") + if not 
os.path.isfile(flownet_path): + raise FileNotFoundError( + f"RIFE weight file not found: {flownet_path}. Expected layout: /flownet.pkl" + ) + + state = torch.load(flownet_path, map_location="cpu", weights_only=False) + state = {k.removeprefix("module."): v for k, v in state.items()} + self.flownet.load_state_dict(state, strict=False) + logger.info("Loaded RIFE weights from %s", flownet_path) + + def inference( + self, + img0: torch.Tensor, + img1: torch.Tensor, + scale: float = 1.0, + timestep: float = 0.5, + ) -> torch.Tensor: + _n, _c, h, w = img0.shape + ph = ((h - 1) // 32 + 1) * 32 + pw = ((w - 1) // 32 + 1) * 32 + pad = (0, pw - w, 0, ph - h) + img0 = F.pad(img0, pad) + img1 = F.pad(img1, pad) + + imgs = torch.cat((img0, img1), 1) + scale_list = [8 / scale, 4 / scale, 2 / scale, 1 / scale] + with torch.no_grad(): + _flow_list, _mask, merged = self.flownet( + imgs, + timestep=timestep, + scale_list=scale_list, + ) + return merged[3][:, :, :h, :w] + + +def _resolve_rife_model_path(model_path: str | None) -> str: + model_path = model_path or _DEFAULT_RIFE_HF_REPO + if os.path.isdir(model_path): + return model_path + from vllm_omni.model_executor.model_loader.weight_utils import ( + download_weights_from_hf_specific, + ) + + return download_weights_from_hf_specific( + model_path, + cache_dir=None, + allow_patterns=["flownet.pkl"], + require_all=True, + ) + + +def _select_torch_device() -> torch.device: + try: + from vllm_omni.platforms import current_omni_platform + + return current_omni_platform.get_torch_device() + except Exception as exc: + logger.warning("Failed to resolve current vLLM-Omni torch device: %s", exc) + + if torch.cuda.is_available(): + return torch.device("cuda") + return torch.device("cpu") + + +def _normalize_video_tensor_layout(video: torch.Tensor) -> tuple[torch.Tensor, Any]: + if video.ndim == 5: + if video.shape[1] in (3, 4): + return video, lambda out: out + if video.shape[2] in (3, 4): + return video.permute(0, 2, 1, 3, 4), lambda out: out.permute(0, 2, 1, 3, 4) + elif video.ndim == 4: + if video.shape[0] in (3, 4): + return video.unsqueeze(0), lambda out: out.squeeze(0) + if video.shape[1] in (3, 4): + return video.permute(1, 0, 2, 3).unsqueeze(0), lambda out: out.squeeze(0).permute(1, 0, 2, 3) + raise ValueError(f"Unsupported video tensor shape for interpolation: {tuple(video.shape)}") + + +def _normalize_video_tensor_range(video: torch.Tensor) -> tuple[torch.Tensor, Any]: + original_dtype = video.dtype + video = video.detach() + if video.is_floating_point(): + video = video.to(torch.float32) + if torch.amin(video) < 0.0 or torch.amax(video) > 1.0: + return video.clamp(-1.0, 1.0) * 0.5 + 0.5, lambda out: (out * 2.0 - 1.0).to(original_dtype) + return video.clamp(0.0, 1.0), lambda out: out.to(original_dtype) + return video.to(torch.float32) / 255.0, lambda out: (out * 255.0).round().clamp(0, 255).to(original_dtype) + + +class FrameInterpolator: + """Lazy-loaded RIFE 4.22.lite frame interpolator.""" + + def __init__(self, model_path: str | None = None): + self._model_path = model_path + self._resolved_path: str | None = None + + def _ensure_model_loaded(self, preferred_device: torch.device | None = None) -> Model: + resolved_path = _resolve_rife_model_path(self._model_path) + self._resolved_path = resolved_path + device = preferred_device or _select_torch_device() + cache_key = (resolved_path, str(device)) + + with _MODEL_CACHE_LOCK: + if cache_key in _MODEL_CACHE: + return _MODEL_CACHE[cache_key] + + model = Model() + model.load_model(resolved_path) + 
model.eval() + model.flownet = model.flownet.to(device) + _MODEL_CACHE[cache_key] = model + logger.info("RIFE model loaded on device: %s", device) + return model + + def _make_inference( + self, + model: Model, + img0: torch.Tensor, + img1: torch.Tensor, + n: int, + scale: float, + ) -> list[torch.Tensor]: + if n == 1: + return [model.inference(img0, img1, scale=scale)] + mid = model.inference(img0, img1, scale=scale) + return ( + self._make_inference(model, img0, mid, n // 2, scale) + + [mid] + + self._make_inference(model, mid, img1, n // 2, scale) + ) + + def interpolate_tensor( + self, + video: torch.Tensor, + exp: int = 1, + scale: float = 1.0, + ) -> tuple[torch.Tensor, int]: + if exp < 1: + raise ValueError(f"frame interpolation exp must be >= 1, got {exp}") + if scale <= 0: + raise ValueError(f"frame interpolation scale must be > 0, got {scale}") + + video, restore_layout = _normalize_video_tensor_layout(video) + if video.shape[2] < 2: + return restore_layout(video), 1 + + video, restore_range = _normalize_video_tensor_range(video) + # Prefer the decoded video's current device so CPU-offloaded requests do + # not move the tensor back to GPU just for interpolation. + model = self._ensure_model_loaded(preferred_device=video.device) + video = video.to(model.device()) + intermediates_per_pair = 2**exp // 2 + + result_frames: list[torch.Tensor] = [] + for idx in range(video.shape[2] - 1): + img0 = video[:, :, idx, :, :] + img1 = video[:, :, idx + 1, :, :] + result_frames.append(img0) + result_frames.extend(self._make_inference(model, img0, img1, intermediates_per_pair, scale)) + result_frames.append(video[:, :, -1, :, :]) + result = torch.stack(result_frames, dim=2) + return restore_layout(restore_range(result)), 2**exp + + +def interpolate_video_tensor( + video: torch.Tensor, + exp: int = 1, + scale: float = 1.0, + model_path: str | None = None, +) -> tuple[torch.Tensor, int]: + """Interpolate a video tensor and return the FPS multiplier.""" + interpolator = FrameInterpolator(model_path=model_path) + return interpolator.interpolate_tensor(video, exp=exp, scale=scale) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index d847a96db6..11ba59e43a 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -2075,6 +2075,10 @@ async def _parse_video_form( true_cfg_scale: float | None = Form(default=None), seed: int | None = Form(default=None), negative_prompt: str | None = Form(default=None), + enable_frame_interpolation: bool = Form(default=False), + frame_interpolation_exp: int = Form(default=1, ge=1), + frame_interpolation_scale: float = Form(default=1.0, gt=0.0), + frame_interpolation_model_path: str | None = Form(default=None), lora: str | None = Form(default=None), extra_params: str | None = Form(default=None), ) -> tuple[VideoGenerationRequest, "OmniOpenAIServingVideo", str, ReferenceImage | None]: @@ -2111,6 +2115,10 @@ async def _parse_video_form( "true_cfg_scale": true_cfg_scale, "seed": seed, "negative_prompt": negative_prompt, + "enable_frame_interpolation": enable_frame_interpolation, + "frame_interpolation_exp": frame_interpolation_exp, + "frame_interpolation_scale": frame_interpolation_scale, + "frame_interpolation_model_path": frame_interpolation_model_path, "lora": _parse_form_json(lora, expected_type=dict), "extra_params": _parse_form_json(extra_params, expected_type=dict), } diff --git a/vllm_omni/entrypoints/openai/protocol/videos.py 
b/vllm_omni/entrypoints/openai/protocol/videos.py index de5362dd97..7c2c3164d9 100644 --- a/vllm_omni/entrypoints/openai/protocol/videos.py +++ b/vllm_omni/entrypoints/openai/protocol/videos.py @@ -150,6 +150,29 @@ class VideoGenerationRequest(BaseModel): ) seed: int | None = Field(default=None, description="Random seed for reproducibility") + # vllm-omni extensions for post-generation frame interpolation. + enable_frame_interpolation: bool = Field( + default=False, + description="Enable post-generation RIFE frame interpolation before MP4 encoding.", + ) + frame_interpolation_exp: int = Field( + default=1, + ge=1, + description="Interpolation exponent: 1=2x temporal resolution, 2=4x, etc.", + ) + frame_interpolation_scale: float = Field( + default=1.0, + gt=0.0, + description="RIFE inference scale. Use 0.5 for high-resolution inputs to save memory.", + ) + frame_interpolation_model_path: str | None = Field( + default=None, + description=( + "Local directory or Hugging Face repo ID containing RIFE flownet.pkl weights. " + "Defaults to elfgum/RIFE-4.22.lite." + ), + ) + # vllm-omni extension for per-request LoRA. lora: dict[str, Any] | None = Field( default=None, diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py index 0001fa65f8..741295c7c2 100644 --- a/vllm_omni/entrypoints/openai/serving_video.py +++ b/vllm_omni/entrypoints/openai/serving_video.py @@ -113,6 +113,10 @@ async def _run_and_extract( if vp.fps is not None: gen_params.fps = vp.fps gen_params.frame_rate = float(vp.fps) + gen_params.enable_frame_interpolation = request.enable_frame_interpolation + gen_params.frame_interpolation_exp = request.frame_interpolation_exp + gen_params.frame_interpolation_scale = request.frame_interpolation_scale + gen_params.frame_interpolation_model_path = request.frame_interpolation_model_path if request.num_inference_steps is not None: gen_params.num_inference_steps = request.num_inference_steps @@ -160,7 +164,7 @@ async def _run_and_extract( videos = self._extract_video_outputs(result) audios = self._extract_audio_outputs(result, expected_count=len(videos)) audio_sample_rate = self._resolve_audio_sample_rate(result) - output_fps = vp.fps or self._resolve_fps(result) or 24 + output_fps = (vp.fps or self._resolve_fps(result) or 24) * self._resolve_video_fps_multiplier(result) return VideoGenerationArtifacts( videos=videos, audios=audios, @@ -243,6 +247,22 @@ async def generate_video_bytes( logger.info("Video response encoding (MP4 bytes): %.2f ms", _t_encode_ms) return video_bytes, artifacts.stage_durations, artifacts.peak_memory_mb + @staticmethod + def _resolve_video_fps_multiplier(result: Any) -> int: + custom_output = getattr(result, "custom_output", None) + if isinstance(custom_output, dict): + multiplier = custom_output.get("video_fps_multiplier") + if multiplier is not None: + return int(multiplier) + request_output = getattr(result, "request_output", None) + if request_output is not None: + custom_output = getattr(request_output, "custom_output", None) + if isinstance(custom_output, dict): + multiplier = custom_output.get("video_fps_multiplier") + if multiplier is not None: + return int(multiplier) + return 1 + @staticmethod def _apply_lora(lora_body: Any, gen_params: OmniDiffusionSamplingParams) -> None: try: diff --git a/vllm_omni/inputs/data.py b/vllm_omni/inputs/data.py index 9cb6c44335..85faf6b949 100644 --- a/vllm_omni/inputs/data.py +++ b/vllm_omni/inputs/data.py @@ -227,6 +227,10 @@ class OmniDiffusionSamplingParams: frame_rate: 
float | None = None # Floating-point rate used by the diffusion model when it differs from `fps`. height_not_provided: bool = False width_not_provided: bool = False + enable_frame_interpolation: bool = False + frame_interpolation_exp: int = 1 + frame_interpolation_scale: float = 1.0 + frame_interpolation_model_path: str | None = None # Timesteps timesteps: torch.Tensor | None = None From 1ad726f49524be5a4fb96f777ed90722f1276692 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2587297563@qq.com> Date: Wed, 15 Apr 2026 12:53:47 +0800 Subject: [PATCH 178/204] =?UTF-8?q?[Fix]=20HunyuanImage-3.0:=20unify=20nam?= =?UTF-8?q?ing=20hunyuan=5Fimage=5F3=20=E2=86=92=20hunyuan=5Fimage3=20(#27?= =?UTF-8?q?12)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/design/feature/expert_parallel.md | 4 ++-- .../test_hunyuan_fused_moe.py | 22 +++++++++---------- .../test_hunyuanimage3_text2img.py | 2 +- .../__init__.py | 6 ++--- .../autoencoder.py | 0 .../hunyuan_fused_moe.py | 0 .../hunyuan_image3_tokenizer.py} | 2 +- .../hunyuan_image3_transformer.py} | 2 +- .../pipeline_hunyuan_image3.py} | 4 ++-- .../system_prompt.py | 0 vllm_omni/diffusion/registry.py | 4 ++-- ...age_3_moe.yaml => hunyuan_image3_moe.yaml} | 0 ...3_moe_dit.yaml => hunyuan_image3_t2i.yaml} | 0 ...2gpu.yaml => hunyuan_image3_t2i_2gpu.yaml} | 0 vllm_omni/platforms/interface.py | 2 +- vllm_omni/platforms/musa/platform.py | 2 +- ...3_moe_dit.yaml => hunyuan_image3_t2i.yaml} | 0 ...age_3_moe.yaml => hunyuan_image3_t2i.yaml} | 0 18 files changed, 25 insertions(+), 25 deletions(-) rename tests/diffusion/models/{hunyuan_image_3 => hunyuan_image3}/test_hunyuan_fused_moe.py (85%) rename vllm_omni/diffusion/models/{hunyuan_image_3 => hunyuan_image3}/__init__.py (58%) rename vllm_omni/diffusion/models/{hunyuan_image_3 => hunyuan_image3}/autoencoder.py (100%) rename vllm_omni/diffusion/models/{hunyuan_image_3 => hunyuan_image3}/hunyuan_fused_moe.py (100%) rename vllm_omni/diffusion/models/{hunyuan_image_3/hunyuan_image_3_tokenizer.py => hunyuan_image3/hunyuan_image3_tokenizer.py} (99%) rename vllm_omni/diffusion/models/{hunyuan_image_3/hunyuan_image_3_transformer.py => hunyuan_image3/hunyuan_image3_transformer.py} (99%) rename vllm_omni/diffusion/models/{hunyuan_image_3/pipeline_hunyuan_image_3.py => hunyuan_image3/pipeline_hunyuan_image3.py} (99%) rename vllm_omni/diffusion/models/{hunyuan_image_3 => hunyuan_image3}/system_prompt.py (100%) rename vllm_omni/model_executor/stage_configs/{hunyuan_image_3_moe.yaml => hunyuan_image3_moe.yaml} (100%) rename vllm_omni/model_executor/stage_configs/{hunyuan_image3_moe_dit.yaml => hunyuan_image3_t2i.yaml} (100%) rename vllm_omni/model_executor/stage_configs/{hunyuan_image_3_moe_2gpu.yaml => hunyuan_image3_t2i_2gpu.yaml} (100%) rename vllm_omni/platforms/npu/stage_configs/{hunyuan_image3_moe_dit.yaml => hunyuan_image3_t2i.yaml} (100%) rename vllm_omni/platforms/xpu/stage_configs/{hunyuan_image_3_moe.yaml => hunyuan_image3_t2i.yaml} (100%) diff --git a/docs/design/feature/expert_parallel.md b/docs/design/feature/expert_parallel.md index 9a7c4cdbac..e05eec3361 100644 --- a/docs/design/feature/expert_parallel.md +++ b/docs/design/feature/expert_parallel.md @@ -207,9 +207,9 @@ Complete examples in the codebase: | Model | Path | Pattern | Notes | |-------|------|---------|-------| -| **HunyuanImage3.0** | `vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py` | Standard EP | Full implementation with validation | +| **HunyuanImage3.0** | 
`vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py` | Standard EP | Full implementation with validation | | **EP Tests** | `vllm-omni/tests/e2e/offline_inference/test_expert_parallel.py` | E2E testing | EP correctness and performance | -| **Constraint Tests** | `vllm-omni/tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py` | Unit testing | Validation logic | +| **Constraint Tests** | `vllm-omni/tests/diffusion/models/hunyuan_image3/test_hunyuan_fused_moe.py` | Unit testing | Validation logic | --- ## Summary diff --git a/tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_fused_moe.py similarity index 85% rename from tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py rename to tests/diffusion/models/hunyuan_image3/test_hunyuan_fused_moe.py index 2cda9116c7..626f78eed9 100644 --- a/tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py +++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_fused_moe.py @@ -12,7 +12,7 @@ class TestSetForwardContextNumTokens: def test_sets_num_tokens_when_context_available(self, mocker): """num_tokens should be set on ForwardContext when available.""" - import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe mock_ctx = mocker.MagicMock() del mock_ctx.in_profile_run # simulate missing attr @@ -26,7 +26,7 @@ def test_sets_num_tokens_when_context_available(self, mocker): def test_sets_in_profile_run_only_if_missing(self, mocker): """in_profile_run should not be overwritten if already set.""" - import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe mock_ctx = mocker.MagicMock() mock_ctx.in_profile_run = True # already set @@ -40,7 +40,7 @@ def test_sets_in_profile_run_only_if_missing(self, mocker): def test_noop_when_context_unavailable(self, mocker): """Should do nothing when ForwardContext is not available.""" - import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe mocker.patch.object(hunyuan_moe._vllm_fc, "is_forward_context_available", return_value=False) mock_get = mocker.patch.object(hunyuan_moe._vllm_fc, "get_forward_context") @@ -55,11 +55,11 @@ class TestHunyuanFusedMoEPlatformDispatch: def test_default_platform_uses_default_impl_qualname(self, mocker): """HunyuanFusedMoE should resolve the impl class from the platform hook.""" - import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe mock_platform = mocker.MagicMock() mock_platform.get_diffusion_model_impl_qualname.return_value = ( - "vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe.HunyuanFusedMoEDefault" + "vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe.HunyuanFusedMoEDefault" ) mocker.patch.object( @@ -71,7 +71,7 @@ def test_default_platform_uses_default_impl_qualname(self, mocker): mock_impl = mocker.MagicMock() mock_resolve.return_value = mock_impl - from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import ( + from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe import ( HunyuanFusedMoE, ) @@ -80,7 +80,7 @@ def test_default_platform_uses_default_impl_qualname(self, mocker): 
mock_platform.prepare_diffusion_op_runtime.assert_called_once_with("hunyuan_fused_moe") mock_platform.get_diffusion_model_impl_qualname.assert_called_once_with("hunyuan_fused_moe") mock_resolve.assert_called_once_with( - "vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe.HunyuanFusedMoEDefault" + "vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe.HunyuanFusedMoEDefault" ) mock_impl.assert_called_once_with(prefix="") @@ -90,7 +90,7 @@ class TestHunyuanFusedMoEFactory: def test_new_delegates_to_impl_class(self, mocker): """HunyuanFusedMoE(prefix=..., **kwargs) should instantiate and return impl instance.""" - import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe class MockImpl: def __init__(self, *, prefix: str = "", **kwargs): @@ -104,7 +104,7 @@ def __init__(self, *, prefix: str = "", **kwargs): mock_impl_class = mocker.MagicMock(return_value=MockImpl(prefix="test", a=1)) mocker.patch.object(hunyuan_moe, "resolve_obj_by_qualname", return_value=mock_impl_class) - from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import ( + from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe import ( HunyuanFusedMoE, ) @@ -119,7 +119,7 @@ def __init__(self, *, prefix: str = "", **kwargs): def test_make_expert_params_mapping_delegates_to_impl(self, mocker): """make_expert_params_mapping should delegate to impl class method.""" - import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe expected_mapping = [("a", "b", 0, "c")] mock_platform = mocker.MagicMock() @@ -130,7 +130,7 @@ def test_make_expert_params_mapping_delegates_to_impl(self, mocker): mock_impl_class.make_expert_params_mapping = mocker.MagicMock(return_value=expected_mapping) mocker.patch.object(hunyuan_moe, "resolve_obj_by_qualname", return_value=mock_impl_class) - from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import ( + from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe import ( HunyuanFusedMoE, ) diff --git a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py index 79bb64dca1..6898763e40 100644 --- a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py +++ b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py @@ -17,7 +17,7 @@ MODEL_NAME = "tencent/HunyuanImage-3.0" LOCAL_CLIP_PATH = "openai/clip-vit-base-patch32" REPO_ROOT = Path(__file__).resolve().parents[3] -STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image_3_moe.yaml" +STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image3_moe.yaml" pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion] diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/__init__.py b/vllm_omni/diffusion/models/hunyuan_image3/__init__.py similarity index 58% rename from vllm_omni/diffusion/models/hunyuan_image_3/__init__.py rename to vllm_omni/diffusion/models/hunyuan_image3/__init__.py index cbc6a8ad1f..6612bd855b 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/__init__.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/__init__.py @@ -2,12 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Hunyuan Image 3 diffusion model components.""" -from 
vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import HunyuanFusedMoE -from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_image_3_transformer import ( +from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe import HunyuanFusedMoE +from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import ( HunyuanImage3Model, HunyuanImage3Text2ImagePipeline, ) -from vllm_omni.diffusion.models.hunyuan_image_3.pipeline_hunyuan_image_3 import ( +from vllm_omni.diffusion.models.hunyuan_image3.pipeline_hunyuan_image3 import ( HunyuanImage3Pipeline, ) diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/autoencoder.py b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py similarity index 100% rename from vllm_omni/diffusion/models/hunyuan_image_3/autoencoder.py rename to vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_fused_moe.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_fused_moe.py similarity index 100% rename from vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_fused_moe.py rename to vllm_omni/diffusion/models/hunyuan_image3/hunyuan_fused_moe.py diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_tokenizer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py similarity index 99% rename from vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_tokenizer.py rename to vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py index ce563f7115..4a29e9df93 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_tokenizer.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py @@ -13,7 +13,7 @@ from transformers import AutoTokenizer from vllm.logger import init_logger -from .hunyuan_image_3_transformer import ImageInfo, JointImageInfo, default +from .hunyuan_image3_transformer import ImageInfo, JointImageInfo, default logger = init_logger(__name__) diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py similarity index 99% rename from vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py rename to vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py index bc81ca9c3e..327260ee0b 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py @@ -74,7 +74,7 @@ ) from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.layers.rope import RotaryEmbedding -from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import HunyuanFusedMoE +from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe import HunyuanFusedMoE logger = logging.getLogger(__name__) diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py similarity index 99% rename from vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py rename to vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index 7e9e2d2787..2f140b48fc 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -25,8 +25,8 @@ from vllm_omni.diffusion.request import OmniDiffusionRequest from .autoencoder import AutoencoderKLConv3D 
-from .hunyuan_image_3_tokenizer import TokenizerWrapper -from .hunyuan_image_3_transformer import ( +from .hunyuan_image3_tokenizer import TokenizerWrapper +from .hunyuan_image3_transformer import ( CausalMMOutputWithPast, HunyuanImage3ImageProcessor, HunyuanImage3Model, diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/system_prompt.py b/vllm_omni/diffusion/models/hunyuan_image3/system_prompt.py similarity index 100% rename from vllm_omni/diffusion/models/hunyuan_image_3/system_prompt.py rename to vllm_omni/diffusion/models/hunyuan_image3/system_prompt.py diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index 97bc7fa292..517b061ece 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -119,8 +119,8 @@ "FluxKontextPipeline", ), "HunyuanImage3ForCausalMM": ( - "hunyuan_image_3", - "pipeline_hunyuan_image_3", + "hunyuan_image3", + "pipeline_hunyuan_image3", "HunyuanImage3Pipeline", ), "Flux2KleinPipeline": ( diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image_3_moe.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml similarity index 100% rename from vllm_omni/model_executor/stage_configs/hunyuan_image_3_moe.yaml rename to vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i.yaml similarity index 100% rename from vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit.yaml rename to vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i.yaml diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image_3_moe_2gpu.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i_2gpu.yaml similarity index 100% rename from vllm_omni/model_executor/stage_configs/hunyuan_image_3_moe_2gpu.yaml rename to vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i_2gpu.yaml diff --git a/vllm_omni/platforms/interface.py b/vllm_omni/platforms/interface.py index 8f1e66747d..b69731a67d 100644 --- a/vllm_omni/platforms/interface.py +++ b/vllm_omni/platforms/interface.py @@ -64,7 +64,7 @@ def get_default_stage_config_path(cls) -> str: @classmethod def get_diffusion_model_impl_qualname(cls, op_name: str) -> str: if op_name == "hunyuan_fused_moe": - return "vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe.HunyuanFusedMoEDefault" + return "vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe.HunyuanFusedMoEDefault" raise NotImplementedError(f"Unsupported diffusion model op: {op_name}") @classmethod diff --git a/vllm_omni/platforms/musa/platform.py b/vllm_omni/platforms/musa/platform.py index fe1ccc6d0b..64a70a9beb 100644 --- a/vllm_omni/platforms/musa/platform.py +++ b/vllm_omni/platforms/musa/platform.py @@ -39,7 +39,7 @@ def get_default_stage_config_path(cls) -> str: def get_diffusion_model_impl_qualname(cls, op_name: str) -> str: # MUSA uses default implementations for diffusion ops if op_name == "hunyuan_fused_moe": - return "vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe.HunyuanFusedMoEDefault" + return "vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe.HunyuanFusedMoEDefault" return super().get_diffusion_model_impl_qualname(op_name) @classmethod diff --git a/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_moe_dit.yaml b/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_t2i.yaml similarity index 100% rename from vllm_omni/platforms/npu/stage_configs/hunyuan_image3_moe_dit.yaml rename to 
vllm_omni/platforms/npu/stage_configs/hunyuan_image3_t2i.yaml diff --git a/vllm_omni/platforms/xpu/stage_configs/hunyuan_image_3_moe.yaml b/vllm_omni/platforms/xpu/stage_configs/hunyuan_image3_t2i.yaml similarity index 100% rename from vllm_omni/platforms/xpu/stage_configs/hunyuan_image_3_moe.yaml rename to vllm_omni/platforms/xpu/stage_configs/hunyuan_image3_t2i.yaml From 2dff2d7c747864378764195e0e4a6b137c3cf5df Mon Sep 17 00:00:00 2001 From: fan2956 Date: Wed, 15 Apr 2026 14:02:41 +0800 Subject: [PATCH 179/204] [PERF] Wan2.2 support adalayernorm fused op (#2585) Signed-off-by: fan2956 Co-authored-by: Canlin Guo --- .../diffusion/cache/teacache/extractors.py | 3 +- vllm_omni/diffusion/layers/adalayernorm.py | 80 +++++-------------- .../qwen_image/qwen_image_transformer.py | 19 +++-- .../models/wan2_2/wan2_2_transformer.py | 25 +++--- .../models/wan2_2/wan2_2_vace_transformer.py | 2 +- 5 files changed, 45 insertions(+), 84 deletions(-) diff --git a/vllm_omni/diffusion/cache/teacache/extractors.py b/vllm_omni/diffusion/cache/teacache/extractors.py index 3d247e3187..84c237b60d 100644 --- a/vllm_omni/diffusion/cache/teacache/extractors.py +++ b/vllm_omni/diffusion/cache/teacache/extractors.py @@ -222,7 +222,8 @@ def extract_qwen_context( block = module.transformer_blocks[0] img_mod_params = block.img_mod(temb) img_mod1, _ = img_mod_params.chunk(2, dim=-1) - img_modulated, _ = block.img_norm1(hidden_states, img_mod1) + img_scale1, img_shift1, _ = block._modulate(img_mod1) + img_modulated = block.img_norm1(hidden_states, img_scale1, img_shift1) # ============================================================================ # DEFINE TRANSFORMER EXECUTION (Qwen-specific) diff --git a/vllm_omni/diffusion/layers/adalayernorm.py b/vllm_omni/diffusion/layers/adalayernorm.py index 35f63e2fc9..4d70ed52f7 100644 --- a/vllm_omni/diffusion/layers/adalayernorm.py +++ b/vllm_omni/diffusion/layers/adalayernorm.py @@ -29,105 +29,61 @@ def __init__(self, hidden_size: int, elementwise_affine: bool = False, eps: floa self.hidden_size = hidden_size self.layernorm = nn.LayerNorm(self.hidden_size, elementwise_affine=self.elementwise_affine, eps=self.eps) - def preprocess( - self, - mod_params: torch.Tensor, - index: torch.Tensor = None, - ) -> torch.Tensor: - # shift: b d, scale: b d, gate: b d - shift, scale, gate = mod_params.chunk(3, dim=-1) - - if index is not None: - # Assuming mod_params batch dim is 2*actual_batch (chunked into 2 parts) - # So shift, scale, gate have shape [2*actual_batch, d] - actual_batch = shift.size(0) // 2 - shift_0, shift_1 = shift[:actual_batch], shift[actual_batch:] # each: [actual_batch, d] - scale_0, scale_1 = scale[:actual_batch], scale[actual_batch:] - gate_0, gate_1 = gate[:actual_batch], gate[actual_batch:] - - # index: [b, l] where b is actual batch size - # Expand to [b, l, 1] to match feature dimension - index_expanded = index.unsqueeze(-1) # [b, l, 1] - - # Expand chunks to [b, 1, d] then broadcast to [b, l, d] - shift_0_exp = shift_0.unsqueeze(1) # [b, 1, d] - shift_1_exp = shift_1.unsqueeze(1) # [b, 1, d] - scale_0_exp = scale_0.unsqueeze(1) - scale_1_exp = scale_1.unsqueeze(1) - gate_0_exp = gate_0.unsqueeze(1) - gate_1_exp = gate_1.unsqueeze(1) - - # Use torch.where to select based on index - shift_result = torch.where(index_expanded == 0, shift_0_exp, shift_1_exp) - scale_result = torch.where(index_expanded == 0, scale_0_exp, scale_1_exp) - gate_result = torch.where(index_expanded == 0, gate_0_exp, gate_1_exp) - else: - shift_result = shift.unsqueeze(1) - scale_result = 
scale.unsqueeze(1) - gate_result = gate.unsqueeze(1) - - return shift_result, scale_result, gate_result - def forward_cuda( self, x: torch.Tensor, - mod_params: torch.Tensor, - index: torch.Tensor = None, + scale: torch.Tensor, + shift: torch.Tensor, ) -> torch.Tensor: - return self.forward_native(x, mod_params, index) + return self.forward_native(x, scale, shift) def forward_hip( self, x: torch.Tensor, - mod_params: torch.Tensor, - index: torch.Tensor = None, + scale: torch.Tensor, + shift: torch.Tensor, ) -> torch.Tensor: - return self.forward_native(x, mod_params, index) + return self.forward_native(x, scale, shift) def forward_npu( self, x: torch.Tensor, - mod_params: torch.Tensor, - index: torch.Tensor = None, + scale: torch.Tensor, + shift: torch.Tensor, ) -> torch.Tensor: - shift_result, scale_result, gate_result = self.preprocess(mod_params, index) - if _HAS_MINDIESD: try: from mindiesd import layernorm_scale_shift - output = layernorm_scale_shift(self.layernorm, x, scale_result, shift_result, fused=True) + output = layernorm_scale_shift(self.layernorm, x, scale, shift, fused=True) - return output, gate_result + return output except ImportError as e: logger.warning_once(f"mindiesd import failed, falling back to torch_npu: {e}") import torch_npu output = ( - torch_npu.npu_layer_norm_eval(x, normalized_shape=[self.hidden_size], eps=self.eps) * (1 + scale_result) - + shift_result + torch_npu.npu_layer_norm_eval(x, normalized_shape=[self.hidden_size], eps=self.eps) * (1 + scale) + shift ) - return output, gate_result + return output def forward_xpu( self, x: torch.Tensor, - mod_params: torch.Tensor, - index: torch.Tensor = None, + scale: torch.Tensor, + shift: torch.Tensor, ) -> torch.Tensor: - return self.forward_native(x, mod_params, index) + return self.forward_native(x, scale, shift) def forward_native( self, x: torch.Tensor, - mod_params: torch.Tensor, - index: torch.Tensor = None, + scale: torch.Tensor, + shift: torch.Tensor, ) -> torch.Tensor: - shift_result, scale_result, gate_result = self.preprocess(mod_params, index) - - return self.layernorm(x) * (1 + scale_result) + shift_result, gate_result + return self.layernorm(x) * (1 + scale) + shift class AdaLayerNormZero(nn.Module): diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py index b34f19e954..9f16d8808c 100644 --- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py +++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py @@ -744,9 +744,9 @@ def __init__( self.zero_cond_t = zero_cond_t - def _modulate(self, x, mod_params, index=None): + def _modulate(self, mod_params, index=None): """Apply modulation to input tensor""" - # x: b l d, shift: b d, scale: b d, gate: b d + # shift: b d, scale: b d, gate: b d shift, scale, gate = mod_params.chunk(3, dim=-1) if index is not None: @@ -778,7 +778,7 @@ def _modulate(self, x, mod_params, index=None): scale_result = scale.unsqueeze(1) gate_result = gate.unsqueeze(1) - return x * (1 + scale_result) + shift_result, gate_result + return scale_result, shift_result, gate_result def forward( self, @@ -804,10 +804,12 @@ def forward( txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1) # Each [B, 3*dim] # Process image stream - norm1 + modulation - img_modulated, img_gate1 = self.img_norm1(hidden_states, img_mod1, modulate_index) + img_scale1, img_shift1, img_gate1 = self._modulate(img_mod1, modulate_index) + img_modulated = self.img_norm1(hidden_states, img_scale1, img_shift1) 
# Process text stream - norm1 + modulation - txt_modulated, txt_gate1 = self.txt_norm1(encoder_hidden_states, txt_mod1) + txt_scale1, txt_shift1, txt_gate1 = self._modulate(txt_mod1) + txt_modulated = self.txt_norm1(encoder_hidden_states, txt_scale1, txt_shift1) # Use QwenAttnProcessor2_0 for joint attention computation # This directly implements the DoubleStreamLayerMegatron logic: @@ -832,13 +834,16 @@ def forward( encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output # Process image stream - norm2 + MLP - img_modulated2, img_gate2 = self.img_norm2(hidden_states, img_mod2, modulate_index) + img_scale2, img_shift2, img_gate2 = self._modulate(img_mod2, modulate_index) + img_modulated2 = self.img_norm2(hidden_states, img_scale2, img_shift2) img_mlp_output = self.img_mlp(img_modulated2) hidden_states = hidden_states + img_gate2 * img_mlp_output # Process text stream - norm2 + MLP - txt_modulated2, txt_gate2 = self.txt_norm2(encoder_hidden_states, txt_mod2) + txt_scale2, txt_shift2, txt_gate2 = self._modulate(txt_mod2) + txt_modulated2 = self.txt_norm2(encoder_hidden_states, txt_scale2, txt_shift2) + txt_mlp_output = self.txt_mlp(txt_modulated2) encoder_hidden_states = encoder_hidden_states + txt_gate2 * txt_mlp_output diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py index 3b43f3eaf5..b870193a14 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py @@ -29,6 +29,7 @@ SequenceParallelOutput, ) from vllm_omni.diffusion.forward_context import get_forward_context +from vllm_omni.diffusion.layers.adalayernorm import AdaLayerNorm from vllm_omni.platforms import current_omni_platform logger = init_logger(__name__) @@ -620,7 +621,7 @@ def __init__( head_dim = dim // num_heads # 1. Self-attention - self.norm1 = FP32LayerNorm(dim, eps, elementwise_affine=False) + self.norm1 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) self.attn1 = WanSelfAttention( dim=dim, num_heads=num_heads, @@ -640,7 +641,7 @@ def __init__( # 3. Feed-forward self.ffn = WanFeedForward(dim=dim, inner_dim=ffn_dim, dim_out=dim) - self.norm3 = FP32LayerNorm(dim, eps, elementwise_affine=False) + self.norm3 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) # Scale-shift table for modulation self.scale_shift_table = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5) @@ -656,7 +657,7 @@ def forward( if temb.ndim == 4: # temb: batch_size, seq_len, 6, inner_dim (wan2.2 ti2v) shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = ( - self.scale_shift_table.unsqueeze(0) + temb.float() + self.scale_shift_table.unsqueeze(0) + temb ).chunk(6, dim=2) shift_msa = shift_msa.squeeze(2) scale_msa = scale_msa.squeeze(2) @@ -667,25 +668,23 @@ def forward( else: # temb: batch_size, 6, inner_dim (wan2.1/wan2.2 14B) shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = ( - self.scale_shift_table + temb.float() + self.scale_shift_table + temb ).chunk(6, dim=1) # 1. Self-attention - norm_hidden_states = (self.norm1(hidden_states.float()) * (1 + scale_msa) + shift_msa).type_as(hidden_states) + norm_hidden_states = self.norm1(hidden_states, scale_msa, shift_msa).type_as(hidden_states) attn_output = self.attn1(norm_hidden_states, rotary_emb, hidden_states_mask) - hidden_states = (hidden_states.float() + attn_output * gate_msa).type_as(hidden_states) + hidden_states = (hidden_states + attn_output * gate_msa).type_as(hidden_states) # 2. 
Cross-attention - norm_hidden_states = self.norm2(hidden_states.float()).type_as(hidden_states) + norm_hidden_states = self.norm2(hidden_states).type_as(hidden_states) attn_output = self.attn2(norm_hidden_states, encoder_hidden_states) hidden_states = hidden_states + attn_output # 3. Feed-forward - norm_hidden_states = (self.norm3(hidden_states.float()) * (1 + c_scale_msa) + c_shift_msa).type_as( - hidden_states - ) + norm_hidden_states = self.norm3(hidden_states, c_scale_msa, c_shift_msa).type_as(hidden_states) ff_output = self.ffn(norm_hidden_states) - hidden_states = (hidden_states.float() + ff_output.float() * c_gate_msa).type_as(hidden_states) + hidden_states = (hidden_states + ff_output * c_gate_msa).type_as(hidden_states) return hidden_states @@ -854,7 +853,7 @@ def __init__( ) # 4. Output norm & projection - self.norm_out = FP32LayerNorm(inner_dim, eps, elementwise_affine=False) + self.norm_out = AdaLayerNorm(inner_dim, elementwise_affine=False, eps=eps) self.proj_out = nn.Linear(inner_dim, out_channels * math.prod(patch_size)) # SP helper modules @@ -942,7 +941,7 @@ def forward( shift = shift.unsqueeze(1) scale = scale.unsqueeze(1) - hidden_states = (self.norm_out(hidden_states.float()) * (1 + scale) + shift).type_as(hidden_states) + hidden_states = self.norm_out(hidden_states, scale, shift).type_as(hidden_states) hidden_states = self.proj_out(hidden_states) hidden_states = hidden_states.reshape( diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_vace_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_vace_transformer.py index 4f4217dabf..c48938e1ba 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_vace_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_vace_transformer.py @@ -239,7 +239,7 @@ def forward( shift = shift.unsqueeze(1) scale = scale.unsqueeze(1) - hidden_states = (self.norm_out(hidden_states.float()) * (1 + scale) + shift).type_as(hidden_states) + hidden_states = self.norm_out(hidden_states, scale, shift).type_as(hidden_states) hidden_states = self.proj_out(hidden_states) hidden_states = hidden_states.reshape( From 133e2f97068f4ae57fc91d7afd1e405386a0e12e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zeyu=20Huang=20=7C=20=E9=BB=83=E6=BE=A4=E5=AE=87?= <11222265+fhfuih@users.noreply.github.com> Date: Wed, 15 Apr 2026 14:08:00 +0800 Subject: [PATCH 180/204] [hotfix] API connection error in CI (#2810) --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 4ad4706fc1..098fd8d970 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2876,7 +2876,7 @@ def _build_url(self, path: str) -> str: return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}" -@pytest.fixture(scope="module") +@pytest.fixture def openai_client(request: pytest.FixtureRequest, run_level: str): """Create OpenAIClientHandler fixture to facilitate communication with OmniServer with encapsulated request sending, concurrent requests, response handling, and validation.""" From 38d5f2d530c84cdb5462116103944b2b84e44182 Mon Sep 17 00:00:00 2001 From: Sy03 <1370724210@qq.com> Date: Wed, 15 Apr 2026 14:09:22 +0800 Subject: [PATCH 181/204] [Perf] VoxCPM2: Speedup by manual CUDA Graph capture for scaffold/residual forward (#2803) Signed-off-by: Sy03 <1370724210@qq.com> --- .../models/voxcpm2/minicpm4_paged.py | 20 ++ .../models/voxcpm2/voxcpm2_talker.py | 188 ++++++++++++++++-- 2 files changed, 189 insertions(+), 19 deletions(-) diff --git a/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py 
b/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py index 40bacfff6c..b87ec5aafe 100644 --- a/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py +++ b/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py @@ -307,6 +307,16 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states + def precompute_fused_qkv(self) -> None: + """Materialize fused QKV weights before CUDA Graph capture.""" + for layer in self.layers: + attn = layer.self_attn + if attn._fused_qkv_weight is None: + attn._fused_qkv_weight = torch.cat( + [attn.q_proj.weight, attn.k_proj.weight, attn.v_proj.weight], + dim=0, + ).detach() + def compile_selective(self) -> list[str]: """Compile the full model forward as one graph. @@ -411,6 +421,16 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states + def precompute_fused_qkv(self) -> None: + """Materialize fused QKV weights before CUDA Graph capture.""" + for layer in self.layers: + attn = layer.self_attn + if attn._fused_qkv_weight is None: + attn._fused_qkv_weight = torch.cat( + [attn.q_proj.weight, attn.k_proj.weight, attn.v_proj.weight], + dim=0, + ).detach() + def compile_selective(self) -> list[str]: """Compile the full residual model forward as one graph (same strategy as base_lm).""" if self._compiled_layers: diff --git a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py index 94f0658904..02bcae821e 100644 --- a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py +++ b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py @@ -10,6 +10,7 @@ from __future__ import annotations +import copy import dataclasses import logging import os @@ -21,6 +22,7 @@ import torch import torch.nn as nn from vllm.config import VllmConfig +from vllm.forward_context import get_forward_context, override_forward_context from vllm.logger import init_logger from vllm.model_executor.models.utils import ( AutoWeightsLoader, @@ -101,6 +103,14 @@ class _RequestState: last_decoded_audio: torch.Tensor | None = None +@dataclasses.dataclass +class _CapturedGraph: + graph: torch.cuda.CUDAGraph + input_embeds: torch.Tensor + positions: torch.Tensor + output: torch.Tensor + + # =================================================================== # Profiling timer # =================================================================== @@ -336,6 +346,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._perf = _PerfTimer(enabled=_ENABLE_PROFILING) self._cfm_buffers: _CFMBufferManager | None = None + self._enable_cuda_graph = True + self._scaffold_graphs: dict[int, _CapturedGraph] = {} + self._residual_graphs: dict[int, _CapturedGraph] = {} + self._max_cached_graphs = self._max_batch_size + self._cuda_graph_pool: tuple | None = None + self._cuda_graph_warmup_steps = 0 + self._cuda_graph_warmup_threshold = 3 self._active_states: dict[str, _RequestState] = {} self._current_request_id: str | None = None @@ -483,19 +500,24 @@ def _setup_torch_compile(self) -> None: except Exception as e: logger.warning("torch.compile AudioVAE failed: %s", e) - if not getattr(self.model, "_selective_compiled", False): - try: - targets.extend(f"scaffold.{t}" for t in self.model.compile_selective()) - self.model._selective_compiled = True - except Exception as e: - logger.warning("scaffold compile failed: %s", e) + if not self._enable_cuda_graph: + if not getattr(self.model, "_selective_compiled", False): + try: + targets.extend(f"scaffold.{t}" for t in 
self.model.compile_selective()) + self.model._selective_compiled = True + except Exception as e: + logger.warning("scaffold compile failed: %s", e) - if not getattr(self.residual_model, "_selective_compiled", False): - try: - targets.extend(f"residual.{t}" for t in self.residual_model.compile_selective()) - self.residual_model._selective_compiled = True - except Exception as e: - logger.warning("residual compile failed: %s", e) + if not getattr(self.residual_model, "_selective_compiled", False): + try: + targets.extend(f"residual.{t}" for t in self.residual_model.compile_selective()) + self.residual_model._selective_compiled = True + except Exception as e: + logger.warning("residual compile failed: %s", e) + else: + self.model.precompute_fused_qkv() + self.residual_model.precompute_fused_qkv() + targets.append("scaffold+residual (CUDA Graph, skipping compile)") if not getattr(self, "_projections_compiled", False): try: @@ -518,6 +540,90 @@ def _stop_fn(self, lm_h: torch.Tensor) -> torch.Tensor: tts = self.tts return tts.stop_head(tts.stop_actn(tts.stop_proj(lm_h))) + def _get_cuda_graph_pool(self) -> tuple: + if self._cuda_graph_pool is None: + self._cuda_graph_pool = torch.cuda.graph_pool_handle() + return self._cuda_graph_pool + + @staticmethod + def _nullify_volatile_metadata(ctx: Any) -> Any: + """Set ``scheduler_metadata`` to None on all attention layers. + + This is the only tensor FA3 reallocates each step (variable shape). + All other metadata tensors are persistent model-runner buffers. + Setting it to None makes FA3 use default scheduling (~0.1ms cost). + """ + if not isinstance(ctx.attn_metadata, dict): + return ctx + + ctx = copy.copy(ctx) + new_meta: dict[str, Any] = {} + for layer_name, meta in ctx.attn_metadata.items(): + if getattr(meta, "scheduler_metadata", None) is not None: + meta = copy.copy(meta) + meta.scheduler_metadata = None + new_meta[layer_name] = meta + ctx.attn_metadata = new_meta + return ctx + + def _capture_graph( + self, + model: nn.Module, + batch_size: int, + label: str, + is_residual: bool = False, + ) -> _CapturedGraph: + """Capture a CUDA Graph for *model* at *batch_size*.""" + hidden_size = self.config.hidden_size + dtype = self._side_dtype + dev = torch.device(self._device) + pool = self._get_cuda_graph_pool() + + model.precompute_fused_qkv() + + g = _CapturedGraph( + graph=torch.cuda.CUDAGraph(), + input_embeds=torch.zeros(batch_size, hidden_size, device=dev, dtype=dtype), + positions=torch.zeros(batch_size, device=dev, dtype=torch.long), + output=torch.zeros(batch_size, hidden_size, device=dev, dtype=dtype), + ) + + if is_residual: + call_kwargs = dict(positions=g.positions, inputs_embeds=g.input_embeds) + else: + call_kwargs = dict(input_ids=None, positions=g.positions, inputs_embeds=g.input_embeds) + + ctx = get_forward_context() + patched_ctx = self._nullify_volatile_metadata(ctx) + + with override_forward_context(patched_ctx): + for _ in range(3): + _ = model(**call_kwargs) + + with torch.cuda.graph(g.graph, pool=pool): + g.output = model(**call_kwargs) + + logger.info("CUDA Graph captured for %s (batch_size=%d)", label, batch_size) + return g + + def _replay_graph( + self, + g: _CapturedGraph, + inputs_embeds: torch.Tensor, + positions: torch.Tensor, + batch_size: int, + ) -> torch.Tensor: + """Copy fresh inputs into static buffers, then replay. + + No metadata copy needed: persistent buffers (seq_lens, slot_mapping, + etc.) are updated in-place by the model runner. 
scheduler_metadata + was nullified at capture time so no kernel references it. + """ + g.input_embeds[:batch_size].copy_(inputs_embeds[:batch_size]) + g.positions[:batch_size].copy_(positions[:batch_size]) + g.graph.replay() + return g.output[:batch_size].clone() + # -------------------- vllm hooks -------------------- def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor: @@ -534,12 +640,35 @@ def forward( self._perf.start("forward_total") dev = input_ids.device - model_output = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) - if isinstance(model_output, IntermediateTensors): - return model_output - scaffold_hidden = model_output - if isinstance(scaffold_hidden, tuple): - scaffold_hidden = scaffold_hidden[0] + num_reqs = len(self._pending_requests) + num_decode = sum(1 for _, is_p, _, n in self._pending_requests if not is_p and n == 1) + is_all_decode = num_decode == num_reqs and num_reqs > 0 + + tts_compiled = getattr(self.tts.feat_decoder.estimator, "_compiled", False) if self._tts is not None else False + graph_ready = tts_compiled and self._cuda_graph_warmup_steps >= self._cuda_graph_warmup_threshold + if num_decode > 0: + self._cuda_graph_warmup_steps += 1 + + can_use_graph = ( + self._enable_cuda_graph and graph_ready and intermediate_tensors is None and inputs_embeds is not None + ) + + if can_use_graph and is_all_decode and num_reqs <= self._max_cached_graphs: + self._perf.start("scaffold_fwd") + if num_reqs not in self._scaffold_graphs: + self._scaffold_graphs[num_reqs] = self._capture_graph(self.model, num_reqs, "scaffold") + scaffold_hidden = self._replay_graph(self._scaffold_graphs[num_reqs], inputs_embeds, positions, num_reqs) + self._perf.stop("scaffold_fwd") + + else: + self._perf.start("scaffold_fwd") + model_output = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) + self._perf.stop("scaffold_fwd") + if isinstance(model_output, IntermediateTensors): + return model_output + scaffold_hidden = model_output + if isinstance(scaffold_hidden, tuple): + scaffold_hidden = scaffold_hidden[0] # Phase 1: per-request FSQ + residual input token_offset = 0 @@ -571,7 +700,28 @@ def forward( if residual_inputs: batch_in = torch.cat(residual_inputs, dim=0) batch_pos = torch.cat(residual_positions, dim=0) - batch_out = self.residual_model(batch_pos, batch_in) + + residual_batch_size = batch_in.shape[0] + use_residual_graph = ( + self._enable_cuda_graph + and is_all_decode + and graph_ready + and residual_batch_size == num_reqs # 1 token per request + and residual_batch_size <= self._max_cached_graphs + ) + + self._perf.start("residual_fwd") + if use_residual_graph: + if residual_batch_size not in self._residual_graphs: + self._residual_graphs[residual_batch_size] = self._capture_graph( + self.residual_model, residual_batch_size, "residual", is_residual=True + ) + batch_out = self._replay_graph( + self._residual_graphs[residual_batch_size], batch_in, batch_pos, residual_batch_size + ) + else: + batch_out = self.residual_model(batch_pos, batch_in) + self._perf.stop("residual_fwd") # Phase 3: per-request LocDiT + update offset = 0 From 4bf4c6314741da606ff2b99efde5a83713cd8a22 Mon Sep 17 00:00:00 2001 From: IsleOfDawnlight Date: Wed, 15 Apr 2026 15:04:58 +0800 Subject: [PATCH 182/204] Add voxcpm model support. 
(#2467) Signed-off-by: Celeste-jq <591998922@qq.com> Signed-off-by: lyj-jjj Signed-off-by: IsleOfDawnlight Signed-off-by: Yueqian Lin Co-authored-by: Celeste-jq <591998922@qq.com> Co-authored-by: lyj-jjj Co-authored-by: Yueqian Lin --- .buildkite/test-ready.yml | 25 + benchmarks/voxcpm/README.md | 119 +++ .../voxcpm/vllm_omni/bench_tts_offline.py | 890 ++++++++++++++++++ .../voxcpm/vllm_omni/bench_tts_serve.py | 283 ++++++ .../voxcpm/vllm_omni/run_offline_matrix.py | 303 ++++++ examples/offline_inference/voxcpm/README.md | 123 +++ examples/offline_inference/voxcpm/end2end.py | 206 ++++ examples/online_serving/voxcpm/README.md | 166 ++++ .../voxcpm/openai_speech_client.py | 155 +++ examples/online_serving/voxcpm/run_server.sh | 38 + tests/e2e/offline_inference/test_voxcpm.py | 156 +++ tests/engine/test_arg_utils.py | 19 + .../openai_api/test_serving_speech_voxcpm.py | 143 +++ tests/entrypoints/test_utils.py | 33 + .../test_voxcpm_async_chunk.py | 87 ++ vllm_omni/engine/arg_utils.py | 3 + .../entrypoints/openai/serving_speech.py | 72 +- vllm_omni/model_executor/models/registry.py | 6 + .../model_executor/models/voxcpm/__init__.py | 7 + .../models/voxcpm/configuration_voxcpm.py | 3 + .../model_executor/models/voxcpm/voxcpm.py | 886 +++++++++++++++++ .../models/voxcpm/voxcpm_loader.py | 247 +++++ .../models/voxcpm/voxcpm_runtime_utils.py | 44 + .../models/voxcpm/voxcpm_stage_wrappers.py | 185 ++++ .../model_executor/stage_configs/voxcpm.yaml | 69 ++ .../stage_configs/voxcpm_async_chunk.yaml | 102 ++ .../stage_input_processors/voxcpm.py | 128 +++ .../platforms/npu/stage_configs/voxcpm.yaml | 67 ++ .../npu/stage_configs/voxcpm_async_chunk.yaml | 93 ++ .../transformers_utils/configs/__init__.py | 3 + .../transformers_utils/configs/voxcpm.py | 68 ++ 31 files changed, 4727 insertions(+), 2 deletions(-) create mode 100644 benchmarks/voxcpm/README.md create mode 100644 benchmarks/voxcpm/vllm_omni/bench_tts_offline.py create mode 100644 benchmarks/voxcpm/vllm_omni/bench_tts_serve.py create mode 100644 benchmarks/voxcpm/vllm_omni/run_offline_matrix.py create mode 100644 examples/offline_inference/voxcpm/README.md create mode 100644 examples/offline_inference/voxcpm/end2end.py create mode 100644 examples/online_serving/voxcpm/README.md create mode 100644 examples/online_serving/voxcpm/openai_speech_client.py create mode 100755 examples/online_serving/voxcpm/run_server.sh create mode 100644 tests/e2e/offline_inference/test_voxcpm.py create mode 100644 tests/entrypoints/openai_api/test_serving_speech_voxcpm.py create mode 100644 tests/model_executor/stage_input_processors/test_voxcpm_async_chunk.py create mode 100644 vllm_omni/model_executor/models/voxcpm/__init__.py create mode 100644 vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py create mode 100644 vllm_omni/model_executor/models/voxcpm/voxcpm.py create mode 100644 vllm_omni/model_executor/models/voxcpm/voxcpm_loader.py create mode 100644 vllm_omni/model_executor/models/voxcpm/voxcpm_runtime_utils.py create mode 100644 vllm_omni/model_executor/models/voxcpm/voxcpm_stage_wrappers.py create mode 100644 vllm_omni/model_executor/stage_configs/voxcpm.yaml create mode 100644 vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml create mode 100644 vllm_omni/model_executor/stage_input_processors/voxcpm.py create mode 100644 vllm_omni/platforms/npu/stage_configs/voxcpm.yaml create mode 100644 vllm_omni/platforms/npu/stage_configs/voxcpm_async_chunk.yaml create mode 100644 vllm_omni/transformers_utils/configs/voxcpm.py diff --git 
a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 2f749f0ee9..68f8e61528 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -295,6 +295,31 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" + - label: "VoxCPM E2E Test" + timeout_in_minutes: 20 + depends_on: upload-ready-pipeline + commands: + - | + timeout 20m bash -c ' + pip install voxcpm + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_WORKER_MULTIPROC_METHOD=spawn + pytest -s -v tests/e2e/offline_inference/test_voxcpm.py -m "core_model" --run-level "core_model" + ' + agents: + queue: "gpu_1_queue" + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + - label: "VoxCPM2 Native AR E2E Test" timeout_in_minutes: 20 depends_on: upload-ready-pipeline diff --git a/benchmarks/voxcpm/README.md b/benchmarks/voxcpm/README.md new file mode 100644 index 0000000000..17f904101b --- /dev/null +++ b/benchmarks/voxcpm/README.md @@ -0,0 +1,119 @@ +# VoxCPM Benchmark + +This directory contains both: + +- online serving benchmark through the OpenAI-compatible `/v1/audio/speech` API +- offline benchmark for `Omni` / `AsyncOmni` +- full offline smoke-matrix orchestration + +Both benchmark paths report: + +- TTFP: time to first PCM packet +- E2E latency +- RTF: real-time factor (`e2e / audio_duration`) + +## Offline Benchmark + +Single offline benchmark run: + +```bash +python benchmarks/voxcpm/vllm_omni/bench_tts_offline.py \ + --model /path/to/voxcpm-model \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm.yaml \ + --text "This is a split-stage VoxCPM synthesis example running on vLLM Omni." \ + --warmup-runs 1 \ + --output-dir benchmarks/voxcpm/results/offline_single +``` + +Streaming offline benchmark: + +```bash +python benchmarks/voxcpm/vllm_omni/bench_tts_offline.py \ + --model /path/to/voxcpm-model \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \ + --text "This is a split-stage VoxCPM streaming example running on vLLM Omni." \ + --warmup-runs 1 \ + --output-dir benchmarks/voxcpm/results/offline_streaming +``` + +Full fixed offline matrix, equivalent to the old `examples/offline_inference/voxcpm/test.py`: + +```bash +python benchmarks/voxcpm/vllm_omni/run_offline_matrix.py \ + --model /path/to/voxcpm-model \ + --ref-audio /path/to/reference.wav \ + --ref-text "The exact transcript spoken in reference.wav." \ + --output-root benchmarks/voxcpm/results/offline_matrix +``` + +The full matrix covers both routes: + +- streaming: `voxcpm_async_chunk.yaml` +- sync: `voxcpm.yaml` + +And these six scenarios under each route: + +- warmup + single TTS +- warmup + single voice cloning +- warmup + batch TTS +- warmup + batch voice cloning +- cold single TTS +- cold single voice cloning + +`bench_tts_offline.py` itself no longer writes `summary.json` / `results.json`; it prints TTFP / RTF inline and saves generated WAV files only. The matrix runner keeps only per-case `run.log`. 
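+
+The inline metrics are simple functions of wall-clock time and decoded audio length: TTFP is the elapsed time until the first audio packet arrives, and RTF divides end-to-end latency by the duration of the generated audio. A minimal sketch of the arithmetic (the helper names here are illustrative, not part of the benchmark API):
+
+```python
+SAMPLE_RATE = 24000  # default VoxCPM output rate assumed by both benchmark paths
+
+def audio_duration_s(num_samples: int, sample_rate: int = SAMPLE_RATE) -> float:
+    return num_samples / sample_rate
+
+def rtf(e2e_s: float, duration_s: float) -> float:
+    # Real-time factor; values below 1.0 mean audio is produced faster than real time.
+    return e2e_s / duration_s
+
+# Example: 3.0 s end-to-end for 7.5 s of generated audio -> RTF = 0.4
+print(rtf(3.0, audio_duration_s(int(7.5 * SAMPLE_RATE))))
+```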
+ +## Start the Server + +Async-chunk: + +```bash +vllm serve /path/to/voxcpm-model \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \ + --trust-remote-code \ + --enforce-eager \ + --omni \ + --port 8091 +``` + +Non-streaming: + +```bash +vllm serve /path/to/voxcpm-model \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm.yaml \ + --trust-remote-code \ + --enforce-eager \ + --omni \ + --port 8091 +``` + +## Run the Benchmark + +```bash +python benchmarks/voxcpm/vllm_omni/bench_tts_serve.py \ + --host 127.0.0.1 \ + --port 8091 \ + --num-prompts 20 \ + --max-concurrency 1 \ + --result-dir /tmp/voxcpm_bench +``` + +Voice cloning benchmark: + +```bash +python benchmarks/voxcpm/vllm_omni/bench_tts_serve.py \ + --host 127.0.0.1 \ + --port 8091 \ + --num-prompts 10 \ + --max-concurrency 1 \ + --ref-audio https://example.com/reference.wav \ + --ref-text "The exact transcript spoken in the reference audio." \ + --result-dir /tmp/voxcpm_clone_bench +``` + +## Notes + +- The benchmark uses `stream=true` and `response_format=pcm` so TTFP is measured from the first audio packet. +- `RTF < 1.0` means the server generates audio faster than real time. +- For `voxcpm_async_chunk.yaml`, keep concurrency at `1`. This matches native VoxCPM streaming more closely. +- Do not benchmark concurrent online streaming on `voxcpm_async_chunk.yaml`; use `voxcpm.yaml` for multi-request throughput runs. +- For the offline matrix mode, `--ref-audio` and `--ref-text` are required because clone cases are part of the fixed coverage set. diff --git a/benchmarks/voxcpm/vllm_omni/bench_tts_offline.py b/benchmarks/voxcpm/vllm_omni/bench_tts_offline.py new file mode 100644 index 0000000000..a3bad3e692 --- /dev/null +++ b/benchmarks/voxcpm/vllm_omni/bench_tts_offline.py @@ -0,0 +1,890 @@ +"""Offline VoxCPM benchmark for vLLM Omni. + +Supports both: +- sync one-shot (Omni.generate) +- streaming (AsyncOmni.generate with async_chunk config) +- text-only synthesis +- voice cloning +- text/clone batch inputs from txt or jsonl +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import tempfile +import time +import uuid +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import torch +from vllm.utils.argparse_utils import FlexibleArgumentParser + +from vllm_omni import AsyncOmni, Omni + +REPO_ROOT = Path(__file__).resolve().parents[3] +DEFAULT_STAGE_ASYNC = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm_async_chunk.yaml" +DEFAULT_STAGE_SYNC = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm.yaml" + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True, slots=True) +class PromptSpec: + text: str + label: str + ref_audio: str | None = None + ref_text: str | None = None + + +def _require_soundfile(): + try: + import soundfile as sf # type: ignore + except ModuleNotFoundError as exc: + raise RuntimeError( + "soundfile is required to write VoxCPM benchmark WAV outputs. 
Install it with: pip install soundfile" + ) from exc + return sf + + +def _build_prompt( + args, + *, + text: str, + ref_audio: str | None = None, + ref_text: str | None = None, + global_request_id: str | None = None, +) -> dict[str, Any]: + additional_information: dict[str, list[Any]] = { + "text": [text], + "cfg_value": [args.cfg_value], + "inference_timesteps": [args.inference_timesteps], + "min_len": [args.min_len], + "max_new_tokens": [args.max_new_tokens], + } + if args.streaming_prefix_len is not None: + additional_information["streaming_prefix_len"] = [args.streaming_prefix_len] + + if ref_audio: + additional_information["ref_audio"] = [ref_audio] + if ref_text: + additional_information["ref_text"] = [ref_text] + if global_request_id is not None: + additional_information["global_request_id"] = [global_request_id] + + return { + "prompt_token_ids": [1], + "additional_information": additional_information, + } + + +def _extract_audio_tensor(mm: dict[str, Any]) -> torch.Tensor: + audio = mm.get("audio", mm.get("model_outputs")) + if audio is None: + raise ValueError("No audio output found in multimodal output.") + if isinstance(audio, list): + parts = [torch.as_tensor(a).float().cpu().reshape(-1) for a in audio] + audio = torch.cat(parts, dim=-1) if parts else torch.zeros(0) + if not isinstance(audio, torch.Tensor): + audio = torch.as_tensor(audio) + return audio.float().cpu().reshape(-1) + + +def _extract_sample_rate(mm: dict[str, Any]) -> int: + sr_raw = mm.get("sr", 24000) + if isinstance(sr_raw, list) and sr_raw: + sr_raw = sr_raw[-1] + if hasattr(sr_raw, "item"): + return int(sr_raw.item()) + return int(sr_raw) + + +def _emit_offline_metrics( + *, + request_id: str, + elapsed_s: float, + first_audio_elapsed: float | None, + audio_duration_s: float, +) -> None: + metrics = { + "request_id": request_id, + "ttfp_ms": round(first_audio_elapsed * 1000.0, 3) if first_audio_elapsed is not None else None, + "audio_duration_s": round(audio_duration_s, 6), + "rtf": round(elapsed_s / audio_duration_s, 6) if audio_duration_s > 0 else None, + } + print(f"[OfflineMetrics] {metrics}") + + +def _write_audio_tensor(output_path: Path, audio_tensor: Any, sample_rate: int) -> None: + sf = _require_soundfile() + if isinstance(audio_tensor, torch.Tensor): + audio_np = audio_tensor.float().cpu().clamp(-1.0, 1.0).numpy() + else: + audio_np = torch.as_tensor(audio_tensor).float().cpu().clamp(-1.0, 1.0).numpy() + sf.write( + output_path, + audio_np, + sample_rate, + format="WAV", + subtype="PCM_16", + ) + + +def _save_wav(mm: dict[str, Any], output_dir: Path, request_id: str) -> Path: + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / f"output_{request_id}.wav" + _write_audio_tensor(output_path, _extract_audio_tensor(mm), _extract_sample_rate(mm)) + return output_path + + +def _iter_request_multimodal_outputs(request_output: Any): + outputs = getattr(request_output, "outputs", None) + if outputs: + for output in outputs: + mm = getattr(output, "multimodal_output", None) + if isinstance(mm, dict): + yield mm + + mm = getattr(request_output, "multimodal_output", None) + if isinstance(mm, dict): + yield mm + + +def _read_non_empty_lines(path: str) -> list[str]: + with open(path, encoding="utf-8") as f: + return [line.strip() for line in f if line.strip()] + + +def _load_prompt_specs(args) -> list[PromptSpec]: + specs: list[PromptSpec] = [] + + if args.txt_prompts is not None: + texts = _read_non_empty_lines(args.txt_prompts) + if not texts: + raise ValueError(f"No prompts found in 
{args.txt_prompts}") + for idx, text in enumerate(texts, start=1): + specs.append( + PromptSpec( + text=text, + label=f"item{idx:03d}", + ref_audio=args.ref_audio, + ref_text=args.ref_text, + ) + ) + return specs + + if args.jsonl_prompts is not None: + with open(args.jsonl_prompts, encoding="utf-8") as f: + for line_no, raw_line in enumerate(f, start=1): + line = raw_line.strip() + if not line: + continue + try: + item = json.loads(line) + except json.JSONDecodeError as exc: + raise ValueError(f"{args.jsonl_prompts}:{line_no} is not valid JSON: {exc}") from exc + if not isinstance(item, dict): + raise ValueError(f"{args.jsonl_prompts}:{line_no} must be a JSON object") + + text = item.get("text") + if not isinstance(text, str) or not text.strip(): + raise ValueError(f"{args.jsonl_prompts}:{line_no} requires non-empty string field 'text'") + + ref_audio = item.get("ref_audio", args.ref_audio) + ref_text = item.get("ref_text", args.ref_text) + if (ref_audio is None) != (ref_text is None): + raise ValueError( + f"{args.jsonl_prompts}:{line_no} must provide both 'ref_audio' and 'ref_text' together" + ) + + specs.append( + PromptSpec( + text=text.strip(), + label=f"item{len(specs) + 1:03d}", + ref_audio=ref_audio, + ref_text=ref_text, + ) + ) + + if not specs: + raise ValueError(f"No prompts found in {args.jsonl_prompts}") + return specs + + specs.append( + PromptSpec( + text=args.text, + label="item001", + ref_audio=args.ref_audio, + ref_text=args.ref_text, + ) + ) + return specs + + +def _build_prompt_for_spec(args, spec: PromptSpec, *, global_request_id: str | None = None) -> dict[str, Any]: + return _build_prompt( + args, + text=spec.text, + ref_audio=spec.ref_audio, + ref_text=spec.ref_text, + global_request_id=global_request_id, + ) + + +def _count_voice_clone_prompts(prompt_specs: list[PromptSpec]) -> int: + return sum(1 for spec in prompt_specs if spec.ref_audio is not None) + + +def _get_warmup_specs(prompt_specs: list[PromptSpec]) -> list[PromptSpec]: + return prompt_specs[:1] + + +def _extract_stream_finished(stage_output: Any) -> bool: + request_output = getattr(stage_output, "request_output", None) + request_finished = getattr(request_output, "finished", None) + if request_finished is not None: + return bool(request_finished) + return bool(getattr(stage_output, "finished", False)) + + +def _build_profiled_stage_config( + stage_configs_path: str, + profiler_dir: str, +) -> str: + stage_config_path = Path(stage_configs_path) + yaml_text = stage_config_path.read_text(encoding="utf-8") + injected_lines: list[str] = [] + injected_count = 0 + + for line in yaml_text.splitlines(): + injected_lines.append(line) + if line.strip() != "engine_args:": + continue + indent = line[: len(line) - len(line.lstrip())] + child_indent = indent + " " + grandchild_indent = child_indent + " " + injected_lines.extend( + [ + f"{child_indent}profiler_config:", + f'{grandchild_indent}profiler: "torch"', + f'{grandchild_indent}torch_profiler_dir: "{profiler_dir}"', + f"{grandchild_indent}torch_profiler_with_stack: true", + ] + ) + injected_count += 1 + + if injected_count == 0: + raise ValueError(f"No engine_args block found in stage config: {stage_configs_path}") + + tmp = tempfile.NamedTemporaryFile( + mode="w", + encoding="utf-8", + delete=False, + suffix=".yaml", + prefix=f"{stage_config_path.stem}_profile_", + ) + tmp.write("\n".join(injected_lines) + "\n") + tmp.close() + return tmp.name + + +def parse_args(): + parser = FlexibleArgumentParser( + description="Offline split-stage VoxCPM inference with 
vLLM Omni (auto sync/streaming by stage config)" + ) + parser.add_argument( + "--model", + type=str, + default=os.environ.get("VOXCPM_MODEL"), + help="Local VoxCPM model directory. Defaults to $VOXCPM_MODEL.", + ) + parser.add_argument( + "--text", + type=str, + default="This is a split-stage VoxCPM synthesis example running on vLLM Omni.", + help="Text to synthesize. Ignored when --txt-prompts or --jsonl-prompts is used.", + ) + parser.add_argument( + "--txt-prompts", + type=str, + default=None, + help="Path to a .txt file with one synthesis text per line.", + ) + parser.add_argument( + "--jsonl-prompts", + type=str, + default=None, + help=( + "Path to a .jsonl file. Each line must contain at least {'text': ...}; " + "clone rows can also set ref_audio/ref_text, and ref_text must be the " + "real transcript of ref_audio." + ), + ) + parser.add_argument( + "--ref-audio", + type=str, + default=None, + help=( + "Optional reference audio path for voice cloning. With --txt-prompts, " + "the same reference is applied to every line." + ), + ) + parser.add_argument( + "--ref-text", + type=str, + default=None, + help=( + "Real transcript of the reference audio. Placeholder text or mismatched " + "text will usually produce noisy/electronic clone audio." + ), + ) + parser.add_argument( + "--stage-configs-path", + type=str, + default=str(DEFAULT_STAGE_SYNC), + help="Stage config YAML path. Routing is selected only from this path.", + ) + parser.add_argument( + "--cfg-value", + type=float, + default=2.0, + help="Classifier-free guidance value for VoxCPM.", + ) + parser.add_argument( + "--inference-timesteps", + type=int, + default=10, + help="Number of inference timesteps.", + ) + parser.add_argument( + "--min-len", + type=int, + default=2, + help="Minimum generated token length.", + ) + parser.add_argument( + "--max-new-tokens", + type=int, + default=4096, + help="Maximum generated token length.", + ) + parser.add_argument( + "--streaming-prefix-len", + type=int, + default=None, + help="VoxCPM streaming window (optional, streaming mode only).", + ) + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Directory for output WAV files.", + ) + parser.add_argument( + "--stage-init-timeout", + type=int, + default=600, + help="Stage initialization timeout in seconds.", + ) + parser.add_argument( + "--log-stats", + dest="log_stats", + action="store_true", + help="Enable vLLM Omni stats logging.", + ) + parser.add_argument( + "--no-log-stats", + dest="log_stats", + action="store_false", + help="Disable vLLM Omni stats logging.", + ) + parser.set_defaults(log_stats=True) + parser.add_argument( + "--num-runs", + type=int, + default=1, + help="Number of full inference runs (same prompt each time). Default 1.", + ) + parser.add_argument( + "--warmup-runs", + type=int, + default=0, + help=( + "Optional number of warmup passes before measured runs. Warmup uses only " + "the first prompt and does not save outputs." + ), + ) + parser.add_argument( + "--enable-profiler", + action="store_true", + help=( + "Enable torch profiler for the configured stages. A temporary profiled " + "stage config is generated automatically." + ), + ) + parser.add_argument( + "--profiler-dir", + type=str, + default=None, + help="Directory for profiler traces. Defaults to /profiler when profiling is enabled.", + ) + parser.add_argument( + "--profiler-stages", + type=int, + nargs="*", + default=None, + help="Optional stage ids to profile. 
Defaults to all stages that have profiler_config.", + ) + parser.add_argument( + "--profiler-wait-seconds", + type=float, + default=30.0, + help="Seconds to wait after stop_profile for trace files to flush.", + ) + args = parser.parse_args() + + if not args.model: + parser.error("--model is required unless $VOXCPM_MODEL is set") + if args.txt_prompts is not None and args.jsonl_prompts is not None: + parser.error("--txt-prompts and --jsonl-prompts are mutually exclusive") + if (args.ref_audio is None) != (args.ref_text is None): + parser.error("--ref-audio and --ref-text must be provided together") + if args.num_runs < 1: + parser.error("--num-runs must be >= 1") + if args.warmup_runs < 0: + parser.error("--warmup-runs must be >= 0") + if args.output_dir is None: + args.output_dir = ( + "output_audio_streaming" if _is_streaming_stage_config(args.stage_configs_path) else "output_audio" + ) + if args.enable_profiler and args.profiler_dir is None: + args.profiler_dir = str(Path(args.output_dir) / "profiler") + try: + args.prompt_specs = _load_prompt_specs(args) + except ValueError as exc: + parser.error(str(exc)) + + return args + + +def _is_streaming_stage_config(stage_configs_path: str) -> bool: + cfg_name = Path(stage_configs_path).name.lower() + # Keep routing purely config-path based: + # - voxcpm.yaml => sync + # - voxcpm_async_chunk.yaml => streaming + return "async_chunk" in cfg_name + + +async def _collect_streaming_audio( + omni: AsyncOmni, + args: Any, + spec: PromptSpec, + request_id: str, + *, + phase_label: str, + prompt_index: int, + prompt_count: int, + print_prompt: bool = False, +) -> tuple[torch.Tensor, int, float, float | None]: + prompt = _build_prompt_for_spec(args, spec, global_request_id=request_id) + delta_chunks: list[torch.Tensor] = [] + sample_rate = 24000 + chunk_i = 0 + prev_total_samples = 0 + t_start = time.perf_counter() + first_audio_elapsed: float | None = None + + if print_prompt: + print(f"---prompt---:{prompt}") + + async for stage_output in omni.generate(prompt, request_id=request_id): + mm = getattr(stage_output, "multimodal_output", None) + if not isinstance(mm, dict): + ro = getattr(stage_output, "request_output", None) + if ro is None: + continue + mm = getattr(ro, "multimodal_output", None) + if not isinstance(mm, dict) and getattr(ro, "outputs", None): + seq = ro.outputs[0] + mm = getattr(seq, "multimodal_output", None) + if not isinstance(mm, dict): + continue + sample_rate = _extract_sample_rate(mm) + try: + w = _extract_audio_tensor(mm) + n = int(w.numel()) + if n == 0: + continue + finished = _extract_stream_finished(stage_output) + if n > prev_total_samples: + delta = w.reshape(-1)[prev_total_samples:] + prev_total_samples = n + elif finished and n == prev_total_samples: + delta = w.reshape(-1)[:0] + else: + delta = w.reshape(-1) + prev_total_samples += int(delta.numel()) + if int(delta.numel()) > 0: + delta_chunks.append(delta) + if first_audio_elapsed is None and int(delta.numel()) > 0: + first_audio_elapsed = time.perf_counter() - t_start + logger.info( + "%s prompt=%d/%d chunk=%d delta_samples=%d buf_len=%d finished=%s", + phase_label, + prompt_index + 1, + prompt_count, + chunk_i, + int(delta.numel()), + n, + finished, + ) + chunk_i += 1 + except ValueError: + if not _extract_stream_finished(stage_output): + logger.debug("skip non-audio partial output chunk=%d", chunk_i) + + if not delta_chunks: + raise RuntimeError("No audio chunks received; check stage config and logs.") + + audio_cat = torch.cat([c.reshape(-1) for c in delta_chunks], 
dim=0) + elapsed = time.perf_counter() - t_start + return audio_cat, sample_rate, elapsed, first_audio_elapsed + + +async def _abort_streaming_residual_work( + omni: AsyncOmni, + request_id: str, + *, + settle_seconds: float = 0.1, +) -> None: + """Stop any late stage-0 work once the final audio has been collected.""" + await omni.engine.abort_async([request_id]) + if settle_seconds > 0: + await asyncio.sleep(settle_seconds) + + +async def _run_streaming_single( + omni: AsyncOmni, + args: Any, + spec: PromptSpec, + output_dir: Path, + request_id: str, + *, + run_index: int, + num_runs: int, + prompt_index: int, + prompt_count: int, +) -> Path: + audio_cat, sample_rate, elapsed, first_audio_elapsed = await _collect_streaming_audio( + omni, + args, + spec, + request_id, + phase_label=f"run={run_index + 1}/{num_runs}", + prompt_index=prompt_index, + prompt_count=prompt_count, + print_prompt=(run_index == 0 and prompt_index == 0), + ) + await _abort_streaming_residual_work(omni, request_id) + output_path = output_dir / f"output_run{run_index + 1}_{spec.label}.wav" + _write_audio_tensor(output_path, audio_cat, sample_rate) + audio_duration_s = float(audio_cat.numel()) / float(sample_rate) if sample_rate > 0 else 0.0 + ttfp_text = f", ttfp={first_audio_elapsed:.2f}s" if first_audio_elapsed is not None else "" + rtf_text = f", rtf={elapsed / audio_duration_s:.3f}" if audio_duration_s > 0 else "" + print( + f"Saved (streaming) run {run_index + 1}/{num_runs}, " + f"prompt {prompt_index + 1}/{prompt_count}: {output_path} ({elapsed:.2f}s{ttfp_text}{rtf_text})" + ) + _emit_offline_metrics( + request_id=request_id, + elapsed_s=elapsed, + first_audio_elapsed=first_audio_elapsed, + audio_duration_s=audio_duration_s, + ) + return output_path + + +async def _run_streaming_warmup(args, omni: AsyncOmni) -> None: + if args.warmup_runs == 0: + return + + warmup_specs = _get_warmup_specs(args.prompt_specs) + print( + f"Warmup: {args.warmup_runs} run(s) using the first prompt " + f"({len(warmup_specs)} prompt(s)); outputs will be discarded." 
+ ) + for warmup_index in range(args.warmup_runs): + t_warmup = time.perf_counter() + tasks = [] + request_ids: list[str] = [] + for prompt_index, spec in enumerate(warmup_specs): + request_id = f"warmup_stream_{warmup_index + 1}_{spec.label}_{uuid.uuid4().hex[:8]}" + request_ids.append(request_id) + tasks.append( + _collect_streaming_audio( + omni, + args, + spec, + request_id, + phase_label=f"warmup={warmup_index + 1}/{args.warmup_runs}", + prompt_index=prompt_index, + prompt_count=len(warmup_specs), + ) + ) + results = await asyncio.gather(*tasks) + for request_id in request_ids: + await _abort_streaming_residual_work(omni, request_id) + total_samples = sum(int(audio.numel()) for audio, _, _, _ in results) + warmup_ttfps = [ttfp for _, _, _, ttfp in results if ttfp is not None] + ttfp_text = f", ttfp={min(warmup_ttfps):.2f}s" if warmup_ttfps else "" + print( + f"Warmup (streaming) {warmup_index + 1}/{args.warmup_runs} finished: " + f"{len(results)} prompt(s), {total_samples} sample(s) " + f"({time.perf_counter() - t_warmup:.2f}s{ttfp_text})" + ) + + +async def _run_streaming(args) -> list[Path]: + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + omni = AsyncOmni( + model=args.model, + stage_configs_path=args.stage_configs_path, + log_stats=args.log_stats, + stage_init_timeout=args.stage_init_timeout, + ) + + await _run_streaming_warmup(args, omni) + profiler_started = False + if args.enable_profiler: + profile_prefix = f"voxcpm_streaming_{int(time.time())}" + stages_text = args.profiler_stages if args.profiler_stages is not None else "all-configured" + print(f"Starting profiler (streaming): stages={stages_text}, dir={args.profiler_dir}") + await omni.start_profile(profile_prefix=profile_prefix, stages=args.profiler_stages) + profiler_started = True + t_total = time.perf_counter() + total_elapsed = 0.0 + paths: list[Path] = [] + prompt_specs: list[PromptSpec] = args.prompt_specs + try: + for run in range(args.num_runs): + for prompt_index, spec in enumerate(prompt_specs): + request_id = f"stream_{run + 1}_{spec.label}_{uuid.uuid4().hex[:8]}" + paths.append( + await _run_streaming_single( + omni, + args, + spec, + output_dir, + request_id, + run_index=run, + num_runs=args.num_runs, + prompt_index=prompt_index, + prompt_count=len(prompt_specs), + ) + ) + total_elapsed = time.perf_counter() - t_total + finally: + if profiler_started: + print("Stopping profiler (streaming)...") + await omni.stop_profile(stages=args.profiler_stages) + if args.profiler_wait_seconds > 0: + print(f"Waiting {args.profiler_wait_seconds:.1f}s for profiler traces to flush...") + await asyncio.sleep(args.profiler_wait_seconds) + + print( + f"All streaming runs finished: {args.num_runs} run(s), " + f"{len(prompt_specs)} prompt(s), {len(paths)} file(s) in {total_elapsed:.2f}s total" + ) + return paths + + +def _run_sync(args) -> list[Path]: + output_dir = Path(args.output_dir) + + omni = Omni( + model=args.model, + stage_configs_path=args.stage_configs_path, + log_stats=args.log_stats, + stage_init_timeout=args.stage_init_timeout, + ) + + def _run_sync_single( + spec: PromptSpec, + *, + request_prefix: str, + save_outputs: bool, + run_index: int | None = None, + ) -> tuple[list[Path], int, float | None, float, float, str]: + global_request_id = f"{request_prefix}_{spec.label}" + prompt = _build_prompt_for_spec(args, spec, global_request_id=global_request_id) + if save_outputs and run_index == 0 and spec.label == "item001": + print(f"---prompt---:{prompt}") + + saved_paths: 
list[Path] = [] + output_count = 0 + first_audio_elapsed: float | None = None + total_audio_duration_s = 0.0 + metrics_request_id = global_request_id + t_start = time.perf_counter() + for stage_outputs in omni.generate(prompt): + request_output = stage_outputs.request_output + if request_output is None: + continue + request_output_id = getattr(request_output, "request_id", None) + if isinstance(request_output_id, str) and request_output_id: + metrics_request_id = request_output_id + for j, mm in enumerate(_iter_request_multimodal_outputs(request_output)): + output_count += 1 + if first_audio_elapsed is None: + try: + audio_tensor = _extract_audio_tensor(mm) + if int(audio_tensor.numel()) > 0: + first_audio_elapsed = time.perf_counter() - t_start + total_audio_duration_s += float(audio_tensor.numel()) / float(_extract_sample_rate(mm)) + except ValueError: + pass + else: + try: + audio_tensor = _extract_audio_tensor(mm) + total_audio_duration_s += float(audio_tensor.numel()) / float(_extract_sample_rate(mm)) + except ValueError: + pass + if not save_outputs: + continue + save_stem = f"run{run_index + 1}_{spec.label}" if j == 0 else f"run{run_index + 1}_{spec.label}_{j}" + saved_paths.append(_save_wav(mm, output_dir, save_stem)) + + if output_count == 0: + raise RuntimeError("No output from Omni.generate") + elapsed_s = time.perf_counter() - t_start + return saved_paths, output_count, first_audio_elapsed, elapsed_s, total_audio_duration_s, metrics_request_id + + if args.warmup_runs: + warmup_specs = _get_warmup_specs(args.prompt_specs) + print( + f"Warmup: {args.warmup_runs} run(s) using the first prompt " + f"({len(warmup_specs)} prompt(s)); outputs will be discarded." + ) + for warmup_index in range(args.warmup_runs): + t_warmup = time.perf_counter() + _, output_count, first_audio_elapsed, elapsed_s, audio_duration_s, _ = _run_sync_single( + warmup_specs[0], + request_prefix=f"warmup_sync{warmup_index + 1}", + save_outputs=False, + ) + ttfp_text = f", ttfp={first_audio_elapsed:.2f}s" if first_audio_elapsed is not None else "" + rtf_text = f", rtf={elapsed_s / audio_duration_s:.3f}" if audio_duration_s > 0 else "" + print( + f"Warmup (sync) {warmup_index + 1}/{args.warmup_runs} finished: " + f"{output_count} output(s) ({time.perf_counter() - t_warmup:.2f}s{ttfp_text}{rtf_text})" + ) + + profiler_started = False + if args.enable_profiler: + profile_prefix = f"voxcpm_sync_{int(time.time())}" + stages_text = args.profiler_stages if args.profiler_stages is not None else "all-configured" + print(f"Starting profiler (sync): stages={stages_text}, dir={args.profiler_dir}") + omni.start_profile(profile_prefix=profile_prefix, stages=args.profiler_stages) + profiler_started = True + + t_total = time.perf_counter() + total_elapsed = 0.0 + saved_paths: list[Path] = [] + prompt_specs: list[PromptSpec] = args.prompt_specs + try: + for run in range(args.num_runs): + t_run = time.perf_counter() + run_paths: list[Path] = [] + for prompt_index, spec in enumerate(prompt_specs): + prompt_paths, _, first_audio_elapsed, elapsed_s, audio_duration_s, metrics_request_id = ( + _run_sync_single( + spec, + request_prefix=f"sync_run{run + 1}_{prompt_index + 1:03d}", + save_outputs=True, + run_index=run, + ) + ) + run_paths.extend(prompt_paths) + ttfp_text = f", ttfp={first_audio_elapsed:.2f}s" if first_audio_elapsed is not None else "" + rtf_text = f", rtf={elapsed_s / audio_duration_s:.3f}" if audio_duration_s > 0 else "" + print( + f"Saved (sync) run {run + 1}/{args.num_runs}, " + f"prompt {prompt_index + 
1}/{len(prompt_specs)}: {len(prompt_paths)} file(s){ttfp_text}{rtf_text}" + ) + _emit_offline_metrics( + request_id=metrics_request_id, + elapsed_s=elapsed_s, + first_audio_elapsed=first_audio_elapsed, + audio_duration_s=audio_duration_s, + ) + + saved_paths.extend(run_paths) + print( + f"Run {run + 1}/{args.num_runs} finished: {len(run_paths)} file(s) ({time.perf_counter() - t_run:.2f}s)" + ) + for path in run_paths: + print(f" {path}") + + total_elapsed = time.perf_counter() - t_total + finally: + if profiler_started: + print("Stopping profiler (sync)...") + omni.stop_profile(stages=args.profiler_stages) + if args.profiler_wait_seconds > 0: + print(f"Waiting {args.profiler_wait_seconds:.1f}s for profiler traces to flush...") + time.sleep(args.profiler_wait_seconds) + + print( + f"All sync runs finished: {args.num_runs} run(s), " + f"{len(prompt_specs)} prompt(s), {len(saved_paths)} file(s) in {total_elapsed:.2f}s total" + ) + return saved_paths + + +def main(args) -> int: + logging.basicConfig(level=logging.INFO) + profiled_stage_config_path: str | None = None + original_stage_config_path = args.stage_configs_path + if args.enable_profiler: + Path(args.profiler_dir).mkdir(parents=True, exist_ok=True) + profiled_stage_config_path = _build_profiled_stage_config( + args.stage_configs_path, + str(Path(args.profiler_dir).resolve()), + ) + args.stage_configs_path = profiled_stage_config_path + + is_streaming = _is_streaming_stage_config(args.stage_configs_path) + voice_clone_count = _count_voice_clone_prompts(args.prompt_specs) + print(f"Model: {args.model}") + print(f"Stage config: {original_stage_config_path}") + print(f"Route: {'streaming' if is_streaming else 'sync'} (from stage-configs-path)") + print(f"Prompt count: {len(args.prompt_specs)}") + print("Batch mode: sequential (aligned with native VoxCPM)") + print(f"Warmup runs: {args.warmup_runs}") + print(f"Voice cloning prompts: {voice_clone_count}/{len(args.prompt_specs)}") + if args.enable_profiler: + print(f"Profiler: enabled (dir={args.profiler_dir}, stages={args.profiler_stages or 'all-configured'})") + print(f"Profiled stage config: {args.stage_configs_path}") + if voice_clone_count: + print("Voice cloning note: --ref-text/ref_text must match the spoken content of the reference audio.") + print(f"Num runs: {args.num_runs}") + try: + if is_streaming: + asyncio.run(_run_streaming(args)) + else: + _run_sync(args) + finally: + if profiled_stage_config_path is not None and os.path.exists(profiled_stage_config_path): + os.unlink(profiled_stage_config_path) + return 0 + + +if __name__ == "__main__": + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + raise SystemExit(main(parse_args())) diff --git a/benchmarks/voxcpm/vllm_omni/bench_tts_serve.py b/benchmarks/voxcpm/vllm_omni/bench_tts_serve.py new file mode 100644 index 0000000000..816df32796 --- /dev/null +++ b/benchmarks/voxcpm/vllm_omni/bench_tts_serve.py @@ -0,0 +1,283 @@ +"""Benchmark VoxCPM via /v1/audio/speech. + +Reports TTFP (time to first packet), E2E latency, and RTF (real-time factor). 
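+
+Each request posts a streaming PCM payload of roughly this shape (illustrative
+values; ref_audio/ref_text are only included for voice-cloning runs):
+
+    {"model": "<model name or path>", "input": "<text to synthesize>",
+     "stream": True, "response_format": "pcm",
+     "ref_audio": "<audio URL or data URL>", "ref_text": "<reference transcript>"}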
+""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import time +from dataclasses import asdict, dataclass, field +from datetime import datetime +from pathlib import Path + +import aiohttp +import numpy as np +from tqdm.asyncio import tqdm + +DEFAULT_MODEL = "OpenBMB/VoxCPM1.5" +DEFAULT_SAMPLE_RATE = 24000 +PROMPTS = [ + "Hello, welcome to the VoxCPM speech benchmark.", + "This is a short benchmark prompt for online text-to-speech generation.", + "The quick brown fox jumps over the lazy dog near the riverbank.", + "Please remember to bring your identification documents tomorrow morning.", + "Learning a new language takes patience, practice, and curiosity.", + "This benchmark reports TTFP and RTF for the VoxCPM online serving path.", +] + + +@dataclass +class RequestResult: + success: bool = False + ttfp: float = 0.0 + e2e: float = 0.0 + audio_bytes: int = 0 + audio_duration: float = 0.0 + rtf: float = 0.0 + prompt: str = "" + error: str = "" + + +@dataclass +class BenchmarkResult: + concurrency: int = 0 + num_prompts: int = 0 + completed: int = 0 + failed: int = 0 + duration_s: float = 0.0 + mean_ttfp_ms: float = 0.0 + median_ttfp_ms: float = 0.0 + p95_ttfp_ms: float = 0.0 + mean_e2e_ms: float = 0.0 + median_e2e_ms: float = 0.0 + p95_e2e_ms: float = 0.0 + mean_rtf: float = 0.0 + median_rtf: float = 0.0 + p95_rtf: float = 0.0 + total_audio_duration_s: float = 0.0 + request_throughput: float = 0.0 + per_request: list[dict[str, float | str]] = field(default_factory=list) + + +def pcm_bytes_to_duration(num_bytes: int, sample_rate: int = DEFAULT_SAMPLE_RATE, sample_width: int = 2) -> float: + num_samples = num_bytes / sample_width + return num_samples / sample_rate + + +async def send_tts_request( + session: aiohttp.ClientSession, + api_url: str, + *, + model: str, + prompt: str, + ref_audio: str | None, + ref_text: str | None, + pbar: tqdm | None = None, +) -> RequestResult: + payload: dict[str, object] = { + "model": model, + "input": prompt, + "stream": True, + "response_format": "pcm", + } + if ref_audio is not None: + payload["ref_audio"] = ref_audio + if ref_text is not None: + payload["ref_text"] = ref_text + + result = RequestResult(prompt=prompt) + started_at = time.perf_counter() + + try: + async with session.post(api_url, json=payload) as response: + if response.status != 200: + result.error = f"HTTP {response.status}: {await response.text()}" + return result + + first_chunk = True + total_bytes = 0 + async for chunk in response.content.iter_any(): + if not chunk: + continue + if first_chunk: + result.ttfp = time.perf_counter() - started_at + first_chunk = False + total_bytes += len(chunk) + + result.e2e = time.perf_counter() - started_at + result.audio_bytes = total_bytes + result.audio_duration = pcm_bytes_to_duration(total_bytes) + if result.audio_duration > 0: + result.rtf = result.e2e / result.audio_duration + result.success = True + except Exception as e: + result.error = str(e) + result.e2e = time.perf_counter() - started_at + + if pbar is not None: + pbar.update(1) + return result + + +async def run_benchmark( + *, + host: str, + port: int, + model: str, + num_prompts: int, + max_concurrency: int, + num_warmups: int, + ref_audio: str | None, + ref_text: str | None, +) -> BenchmarkResult: + api_url = f"http://{host}:{port}/v1/audio/speech" + connector = aiohttp.TCPConnector(limit=max_concurrency, limit_per_host=max_concurrency, keepalive_timeout=60) + timeout = aiohttp.ClientTimeout(total=600) + + async with 
aiohttp.ClientSession(connector=connector, timeout=timeout) as session: + if num_warmups > 0: + print(f" Warming up with {num_warmups} requests...") + warmup_tasks = [ + send_tts_request( + session, + api_url, + model=model, + prompt=PROMPTS[i % len(PROMPTS)], + ref_audio=ref_audio, + ref_text=ref_text, + ) + for i in range(num_warmups) + ] + await asyncio.gather(*warmup_tasks) + print(" Warmup done.") + + request_prompts = [PROMPTS[i % len(PROMPTS)] for i in range(num_prompts)] + semaphore = asyncio.Semaphore(max_concurrency) + pbar = tqdm(total=num_prompts, desc=f" concurrency={max_concurrency}") + + async def limited_request(prompt: str) -> RequestResult: + async with semaphore: + return await send_tts_request( + session, + api_url, + model=model, + prompt=prompt, + ref_audio=ref_audio, + ref_text=ref_text, + pbar=pbar, + ) + + started_at = time.perf_counter() + results = await asyncio.gather(*[asyncio.create_task(limited_request(prompt)) for prompt in request_prompts]) + duration = time.perf_counter() - started_at + pbar.close() + + succeeded = [result for result in results if result.success] + bench = BenchmarkResult( + concurrency=max_concurrency, + num_prompts=num_prompts, + completed=len(succeeded), + failed=len(results) - len(succeeded), + duration_s=duration, + ) + + if not succeeded: + return bench + + ttfps = np.array([result.ttfp * 1000 for result in succeeded], dtype=np.float64) + e2es = np.array([result.e2e * 1000 for result in succeeded], dtype=np.float64) + rtfs = np.array([result.rtf for result in succeeded], dtype=np.float64) + audio_durations = np.array([result.audio_duration for result in succeeded], dtype=np.float64) + + bench.mean_ttfp_ms = float(np.mean(ttfps)) + bench.median_ttfp_ms = float(np.median(ttfps)) + bench.p95_ttfp_ms = float(np.percentile(ttfps, 95)) + bench.mean_e2e_ms = float(np.mean(e2es)) + bench.median_e2e_ms = float(np.median(e2es)) + bench.p95_e2e_ms = float(np.percentile(e2es, 95)) + bench.mean_rtf = float(np.mean(rtfs)) + bench.median_rtf = float(np.median(rtfs)) + bench.p95_rtf = float(np.percentile(rtfs, 95)) + bench.total_audio_duration_s = float(np.sum(audio_durations)) + bench.request_throughput = len(succeeded) / duration if duration > 0 else 0.0 + bench.per_request = [ + { + "prompt": result.prompt, + "ttfp_ms": result.ttfp * 1000, + "e2e_ms": result.e2e * 1000, + "rtf": result.rtf, + "audio_duration_s": result.audio_duration, + } + for result in succeeded + ] + + return bench + + +def print_summary(result: BenchmarkResult) -> None: + width = 54 + print("") + print("=" * width) + print(f"{'VoxCPM Serving Benchmark':^{width}}") + print("=" * width) + print(f"concurrency : {result.concurrency}") + print(f"requests : {result.completed}/{result.num_prompts} succeeded") + print(f"wall time (s) : {result.duration_s:.3f}") + print(f"mean TTFP (ms) : {result.mean_ttfp_ms:.2f}") + print(f"p95 TTFP (ms) : {result.p95_ttfp_ms:.2f}") + print(f"mean E2E (ms) : {result.mean_e2e_ms:.2f}") + print(f"p95 E2E (ms) : {result.p95_e2e_ms:.2f}") + print(f"mean RTF : {result.mean_rtf:.3f}") + print(f"p95 RTF : {result.p95_rtf:.3f}") + print(f"request throughput : {result.request_throughput:.2f} req/s") + print("=" * width) + + +async def main_async(args) -> None: + result_dir = Path(args.result_dir) + result_dir.mkdir(parents=True, exist_ok=True) + + all_results: list[BenchmarkResult] = [] + for concurrency in args.max_concurrency: + result = await run_benchmark( + host=args.host, + port=args.port, + model=args.model, + num_prompts=args.num_prompts, + 
max_concurrency=concurrency, + num_warmups=args.num_warmups, + ref_audio=args.ref_audio, + ref_text=args.ref_text, + ) + print_summary(result) + all_results.append(result) + + payload = { + "model": args.model, + "created_at": datetime.utcnow().isoformat() + "Z", + "results": [asdict(result) for result in all_results], + } + result_path = result_dir / "bench_tts_serve.json" + result_path.write_text(json.dumps(payload, indent=2), encoding="utf-8") + print(f"Saved results to: {result_path}") + + +def parse_args(): + parser = argparse.ArgumentParser(description="Benchmark VoxCPM via /v1/audio/speech") + parser.add_argument("--host", default="127.0.0.1", help="Server host") + parser.add_argument("--port", type=int, default=8091, help="Server port") + parser.add_argument("--model", default=DEFAULT_MODEL, help="Model name or path") + parser.add_argument("--num-prompts", type=int, default=20, help="Number of prompts to send") + parser.add_argument("--max-concurrency", type=int, nargs="+", default=[1], help="Concurrency levels to benchmark") + parser.add_argument("--num-warmups", type=int, default=3, help="Warmup request count") + parser.add_argument("--ref-audio", default=None, help="Reference audio URL or data URL for voice cloning") + parser.add_argument("--ref-text", default=None, help="Reference audio transcript for voice cloning") + parser.add_argument("--result-dir", default="results", help="Directory to save benchmark JSON") + return parser.parse_args() + + +if __name__ == "__main__": + asyncio.run(main_async(parse_args())) diff --git a/benchmarks/voxcpm/vllm_omni/run_offline_matrix.py b/benchmarks/voxcpm/vllm_omni/run_offline_matrix.py new file mode 100644 index 0000000000..cee46c0f86 --- /dev/null +++ b/benchmarks/voxcpm/vllm_omni/run_offline_matrix.py @@ -0,0 +1,303 @@ +"""Run the full offline VoxCPM smoke matrix. + +This script keeps the old `test.py` coverage, but delegates each case to +`bench_tts_offline.py` so the benchmark runner itself stays focused on a +single execution path. +""" + +from __future__ import annotations + +import shlex +import subprocess +import sys +import time +from dataclasses import dataclass +from pathlib import Path + +from vllm.utils.argparse_utils import FlexibleArgumentParser + +REPO_ROOT = Path(__file__).resolve().parents[3] +BENCH_SCRIPT = Path(__file__).with_name("bench_tts_offline.py") +DEFAULT_STAGE_ASYNC = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm_async_chunk.yaml" +DEFAULT_STAGE_SYNC = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm.yaml" +DEFAULT_OUTPUT_ROOT = BENCH_SCRIPT.parents[1] / "results" / "offline_matrix" + +SINGLE_TTS_TEXT = "This is a single text-to-speech smoke test for VoxCPM on vLLM Omni." +SINGLE_CLONE_TEXT = "This sentence is synthesized with the cloned voice for validation." 
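+
+# Illustrative expansion of a single matrix case (paths shortened; the actual
+# argument list is assembled by _base_command/_build_case_command below):
+#   python benchmarks/voxcpm/vllm_omni/bench_tts_offline.py \
+#       --model /path/to/voxcpm-model \
+#       --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm.yaml \
+#       --output-dir <output-root>/sync/warmup_single_clone \
+#       --num-runs 1 --log-stats --cfg-value 2.0 --inference-timesteps 10 \
+#       --min-len 2 --max-new-tokens 4096 --warmup-runs 1 \
+#       --text "This sentence is synthesized with the cloned voice for validation." \
+#       --ref-audio /path/to/reference.wav --ref-text "<exact reference transcript>"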
+BATCH_TTS_TEXTS = [ + "The first batch text-to-speech sample validates sequential batch execution.", + "The second batch text-to-speech sample checks another prompt in the same file.", + "The third batch text-to-speech sample completes the sequential batch path.", +] +BATCH_CLONE_TEXTS = [ + "The first cloned sample validates sequential batch voice cloning.", + "The second cloned sample checks the same reference voice on another prompt.", + "The third cloned sample finishes the shared-reference clone batch path.", +] + + +@dataclass(frozen=True, slots=True) +class ModeSpec: + name: str + stage_config: Path + + +@dataclass(frozen=True, slots=True) +class CaseSpec: + name: str + warmup_runs: int + prompt_kind: str + voice_clone: bool + + +@dataclass(frozen=True, slots=True) +class CaseResult: + mode: str + case: str + returncode: int + elapsed_s: float + output_dir: Path + log_path: Path + + @property + def ok(self) -> bool: + return self.returncode == 0 + + +MODE_SPECS = [ + ModeSpec(name="streaming", stage_config=DEFAULT_STAGE_ASYNC), + ModeSpec(name="sync", stage_config=DEFAULT_STAGE_SYNC), +] + +CASE_SPECS = [ + CaseSpec(name="warmup_single_tts", warmup_runs=1, prompt_kind="single", voice_clone=False), + CaseSpec(name="warmup_single_clone", warmup_runs=1, prompt_kind="single", voice_clone=True), + CaseSpec(name="warmup_batch_tts", warmup_runs=1, prompt_kind="batch", voice_clone=False), + CaseSpec(name="warmup_batch_clone", warmup_runs=1, prompt_kind="batch", voice_clone=True), + CaseSpec(name="cold_single_tts", warmup_runs=0, prompt_kind="single", voice_clone=False), + CaseSpec(name="cold_single_clone", warmup_runs=0, prompt_kind="single", voice_clone=True), +] + + +def _write_lines(path: Path, lines: list[str]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(lines) + "\n", encoding="utf-8") + + +def _prepare_batch_inputs(output_root: Path) -> tuple[Path, Path]: + input_dir = output_root / "inputs" + batch_tts_path = input_dir / "batch_tts_prompts.txt" + batch_clone_path = input_dir / "batch_clone_prompts.txt" + _write_lines(batch_tts_path, BATCH_TTS_TEXTS) + _write_lines(batch_clone_path, BATCH_CLONE_TEXTS) + return batch_tts_path, batch_clone_path + + +def _base_command(args, mode: ModeSpec, output_dir: Path) -> list[str]: + cmd = [ + args.python, + str(BENCH_SCRIPT), + "--model", + args.model, + "--stage-configs-path", + str(mode.stage_config), + "--output-dir", + str(output_dir), + "--num-runs", + str(args.num_runs), + "--stage-init-timeout", + str(args.stage_init_timeout), + ] + cmd.append("--log-stats" if args.log_stats else "--no-log-stats") + cmd.extend(["--cfg-value", str(args.cfg_value)]) + cmd.extend(["--inference-timesteps", str(args.inference_timesteps)]) + cmd.extend(["--min-len", str(args.min_len)]) + cmd.extend(["--max-new-tokens", str(args.max_new_tokens)]) + if args.streaming_prefix_len is not None: + cmd.extend(["--streaming-prefix-len", str(args.streaming_prefix_len)]) + if args.enable_profiler: + profiler_dir = Path(args.profiler_dir) if args.profiler_dir is not None else (output_dir / "profiler") + cmd.append("--enable-profiler") + cmd.extend(["--profiler-dir", str(profiler_dir)]) + cmd.extend(["--profiler-wait-seconds", str(args.profiler_wait_seconds)]) + if args.profiler_stages is not None: + cmd.append("--profiler-stages") + cmd.extend(str(stage_id) for stage_id in args.profiler_stages) + return cmd + + +def _build_case_command( + args, + mode: ModeSpec, + case: CaseSpec, + *, + batch_tts_path: Path, + batch_clone_path: 
Path, + output_dir: Path, +) -> list[str]: + cmd = _base_command(args, mode, output_dir) + cmd.extend(["--warmup-runs", str(case.warmup_runs)]) + if case.prompt_kind == "single": + cmd.extend(["--text", SINGLE_CLONE_TEXT if case.voice_clone else SINGLE_TTS_TEXT]) + else: + cmd.extend(["--txt-prompts", str(batch_clone_path if case.voice_clone else batch_tts_path)]) + if case.voice_clone: + cmd.extend(["--ref-audio", args.ref_audio, "--ref-text", args.ref_text]) + return cmd + + +def _run_case( + args, + mode: ModeSpec, + case: CaseSpec, + *, + batch_tts_path: Path, + batch_clone_path: Path, + output_root: Path, +) -> CaseResult: + case_output_dir = output_root / mode.name / case.name + case_output_dir.mkdir(parents=True, exist_ok=True) + case_log_path = case_output_dir / "run.log" + cmd = _build_case_command( + args, + mode, + case, + batch_tts_path=batch_tts_path, + batch_clone_path=batch_clone_path, + output_dir=case_output_dir, + ) + + print() + print("=" * 80) + print(f"[{mode.name}] {case.name}") + print(f"Output directory: {case_output_dir}") + print(shlex.join(cmd)) + + start = time.perf_counter() + with case_log_path.open("w", encoding="utf-8") as log_fp: + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + assert process.stdout is not None + for line in process.stdout: + print(line, end="") + log_fp.write(line) + process.wait() + + elapsed_s = time.perf_counter() - start + status = "PASS" if (process.returncode or 0) == 0 else f"FAIL({process.returncode})" + print(f"[{mode.name}] {case.name} -> {status} ({elapsed_s:.2f}s)") + return CaseResult( + mode=mode.name, + case=case.name, + returncode=int(process.returncode or 0), + elapsed_s=elapsed_s, + output_dir=case_output_dir, + log_path=case_log_path, + ) + + +def parse_args(): + parser = FlexibleArgumentParser(description="Run the full offline VoxCPM smoke matrix.") + parser.add_argument("--model", type=str, required=True, help="Local VoxCPM model directory.") + parser.add_argument("--ref-audio", type=str, required=True, help="Reference audio path for clone cases.") + parser.add_argument("--ref-text", type=str, required=True, help="Exact transcript spoken in --ref-audio.") + parser.add_argument("--output-root", type=str, default=str(DEFAULT_OUTPUT_ROOT), help="Root directory for outputs.") + parser.add_argument("--python", type=str, default=sys.executable, help="Python executable used to launch cases.") + parser.add_argument("--stage-init-timeout", type=int, default=600, help="Stage initialization timeout in seconds.") + parser.add_argument("--log-stats", dest="log_stats", action="store_true", help="Enable vLLM Omni stats logging.") + parser.add_argument( + "--no-log-stats", + dest="log_stats", + action="store_false", + help="Disable vLLM Omni stats logging.", + ) + parser.set_defaults(log_stats=True) + parser.add_argument("--num-runs", type=int, default=1, help="Number of measured runs per case.") + parser.add_argument("--cfg-value", type=float, default=2.0, help="Classifier-free guidance value for VoxCPM.") + parser.add_argument("--inference-timesteps", type=int, default=10, help="Number of inference timesteps.") + parser.add_argument("--min-len", type=int, default=2, help="Minimum generated token length.") + parser.add_argument("--max-new-tokens", type=int, default=4096, help="Maximum generated token length.") + parser.add_argument( + "--streaming-prefix-len", + type=int, + default=None, + help="Optional VoxCPM streaming window passed to streaming cases.", + ) 
+ parser.add_argument("--enable-profiler", action="store_true", help="Enable torch profiler for each case.") + parser.add_argument( + "--profiler-dir", + type=str, + default=None, + help="Profiler output root. Defaults to /profiler.", + ) + parser.add_argument( + "--profiler-stages", + type=int, + nargs="*", + default=None, + help="Optional stage ids to profile. Defaults to all configured stages.", + ) + parser.add_argument( + "--profiler-wait-seconds", + type=float, + default=30.0, + help="Seconds to wait after stopping profiler for traces to flush.", + ) + args = parser.parse_args() + if args.num_runs < 1: + parser.error("--num-runs must be >= 1") + return args + + +def main(args) -> int: + output_root = Path(args.output_root) + output_root.mkdir(parents=True, exist_ok=True) + batch_tts_path, batch_clone_path = _prepare_batch_inputs(output_root) + + print(f"Model: {args.model}") + print(f"Reference audio: {args.ref_audio}") + print(f"Reference text: {args.ref_text}") + print(f"Python: {args.python}") + print(f"Output root: {output_root}") + print(f"Cases: {len(MODE_SPECS) * len(CASE_SPECS)}") + + results: list[CaseResult] = [] + for mode in MODE_SPECS: + for case in CASE_SPECS: + results.append( + _run_case( + args, + mode, + case, + batch_tts_path=batch_tts_path, + batch_clone_path=batch_clone_path, + output_root=output_root, + ) + ) + + failed = [result for result in results if not result.ok] + print() + print("=" * 80) + print("Summary:") + for result in results: + status = "PASS" if result.ok else f"FAIL({result.returncode})" + print(f"- [{result.mode}] {result.case}: {status} ({result.elapsed_s:.2f}s)") + print(f" output_dir={result.output_dir}") + print(f" log={result.log_path}") + + print(f"Passed: {len(results) - len(failed)}/{len(results)}") + if failed: + print("Failed cases:") + for result in failed: + print(f"- [{result.mode}] {result.case}: see {result.log_path}") + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(parse_args())) diff --git a/examples/offline_inference/voxcpm/README.md b/examples/offline_inference/voxcpm/README.md new file mode 100644 index 0000000000..1eaea9b0db --- /dev/null +++ b/examples/offline_inference/voxcpm/README.md @@ -0,0 +1,123 @@ +# VoxCPM Offline Example + +This directory contains the minimal offline VoxCPM example for vLLM Omni. 
+ +`end2end.py` is intentionally small and only covers: + +- single text-to-speech +- single voice cloning with `ref_audio` + `ref_text` +- non-streaming with `vllm_omni/model_executor/stage_configs/voxcpm.yaml` +- streaming with `vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml` + +Advanced workflows were moved out of the getting-started example: + +- `benchmarks/voxcpm/vllm_omni/bench_tts_offline.py`: warmup, batch prompts, profiler, offline TTFP / RTF +- `benchmarks/voxcpm/vllm_omni/run_offline_matrix.py`: fixed offline smoke matrix +- `benchmarks/voxcpm/`: benchmark scripts and benchmark docs + +## Prerequisites + +Install VoxCPM in one of these ways: + +```bash +pip install voxcpm +``` + +or point vLLM Omni to the local VoxCPM source tree: + +```bash +export VLLM_OMNI_VOXCPM_CODE_PATH=/path/to/VoxCPM/src +``` + +The example writes WAV files with `soundfile`: + +```bash +pip install soundfile +``` + +## Model Path + +Pass the native VoxCPM model directory directly: + +```bash +export VOXCPM_MODEL=/path/to/voxcpm-model +``` + +If the native VoxCPM `config.json` does not contain HuggingFace metadata such as +`model_type`, prepare a persistent HF-compatible config directory and point the +stage configs to it with `VLLM_OMNI_VOXCPM_HF_CONFIG_PATH`: + +```bash +export VLLM_OMNI_VOXCPM_HF_CONFIG_PATH=/tmp/voxcpm_hf_config +mkdir -p "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH" +cp "$VOXCPM_MODEL/config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/config.json" +cp "$VOXCPM_MODEL/generation_config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/generation_config.json" 2>/dev/null || true +python3 -c 'import json, os; p=os.path.join(os.environ["VLLM_OMNI_VOXCPM_HF_CONFIG_PATH"], "config.json"); cfg=json.load(open(p, "r", encoding="utf-8")); cfg["model_type"]="voxcpm"; cfg.setdefault("architectures", ["VoxCPMForConditionalGeneration"]); json.dump(cfg, open(p, "w", encoding="utf-8"), indent=2, ensure_ascii=False)' +``` + +If the model directory itself already has `model_type`, this extra directory is +not required. + +## Quick Start + +Single text-to-speech, non-streaming: + +```bash +python examples/offline_inference/voxcpm/end2end.py \ + --model "$VOXCPM_MODEL" \ + --text "This is a split-stage VoxCPM synthesis example running on vLLM Omni." +``` + +Single voice cloning, non-streaming: + +```bash +python examples/offline_inference/voxcpm/end2end.py \ + --model "$VOXCPM_MODEL" \ + --text "This sentence is synthesized with a cloned voice." \ + --ref-audio /path/to/reference.wav \ + --ref-text "The exact transcript spoken in reference.wav." +``` + +Streaming: + +```bash +python examples/offline_inference/voxcpm/end2end.py \ + --model "$VOXCPM_MODEL" \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \ + --text "This is a split-stage VoxCPM streaming example running on vLLM Omni." +``` + +By default, `end2end.py` writes to `output_audio/` for non-streaming and +`output_audio_streaming/` for streaming. + +## Advanced Workflows + +Use `benchmarks/voxcpm/vllm_omni/bench_tts_offline.py` when you need: + +- warmup runs +- prompt files +- batch JSONL inputs +- profiler injection +- offline TTFP / RTF emission + +Use `benchmarks/voxcpm/vllm_omni/run_offline_matrix.py` when you need the fixed offline smoke matrix that previously lived in `test.py`. + +Full matrix benchmark example: + +```bash +python benchmarks/voxcpm/vllm_omni/run_offline_matrix.py \ + --model "$VOXCPM_MODEL" \ + --ref-audio /path/to/reference.wav \ + --ref-text "The exact transcript spoken in reference.wav." 
+``` + +For online serving examples, see [examples/online_serving/voxcpm](../../online_serving/voxcpm/README.md). + +For benchmark reporting, see [benchmarks/voxcpm](../../../benchmarks/voxcpm/README.md). + +## Notes + +- `voxcpm.yaml` is the default non-streaming stage config. +- `voxcpm_async_chunk.yaml` is the streaming stage config. +- Streaming is currently single-request oriented; the fixed smoke matrix now lives in `benchmarks/voxcpm/vllm_omni/run_offline_matrix.py`. +- `ref_text` must be the real transcript of the reference audio. Mismatched text usually causes obvious quality degradation. diff --git a/examples/offline_inference/voxcpm/end2end.py b/examples/offline_inference/voxcpm/end2end.py new file mode 100644 index 0000000000..980410feae --- /dev/null +++ b/examples/offline_inference/voxcpm/end2end.py @@ -0,0 +1,206 @@ +"""Minimal offline VoxCPM example for vLLM Omni.""" + +from __future__ import annotations + +import asyncio +import time +from pathlib import Path +from typing import Any + +import soundfile as sf +import torch +from vllm.utils.argparse_utils import FlexibleArgumentParser + +from vllm_omni import AsyncOmni, Omni + +REPO_ROOT = Path(__file__).resolve().parents[3] +DEFAULT_SYNC_STAGE_CONFIG = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm.yaml" + + +def _build_prompt(args) -> dict[str, Any]: + additional_information: dict[str, list[Any]] = { + "text": [args.text], + "cfg_value": [args.cfg_value], + "inference_timesteps": [args.inference_timesteps], + "min_len": [args.min_len], + "max_new_tokens": [args.max_new_tokens], + } + if args.streaming_prefix_len is not None: + additional_information["streaming_prefix_len"] = [args.streaming_prefix_len] + if args.ref_audio is not None: + additional_information["ref_audio"] = [args.ref_audio] + if args.ref_text is not None: + additional_information["ref_text"] = [args.ref_text] + return { + "prompt_token_ids": [1], + "additional_information": additional_information, + } + + +def _extract_audio_tensor(mm: dict[str, Any]) -> torch.Tensor: + audio = mm.get("audio", mm.get("model_outputs")) + if audio is None: + raise ValueError("No audio output found in multimodal output.") + if isinstance(audio, list): + parts = [torch.as_tensor(item).float().cpu().reshape(-1) for item in audio] + audio = torch.cat(parts, dim=-1) if parts else torch.zeros(0) + if not isinstance(audio, torch.Tensor): + audio = torch.as_tensor(audio) + return audio.float().cpu().reshape(-1) + + +def _extract_sample_rate(mm: dict[str, Any]) -> int: + sr_raw = mm.get("sr", 24000) + if isinstance(sr_raw, list) and sr_raw: + sr_raw = sr_raw[-1] + if hasattr(sr_raw, "item"): + return int(sr_raw.item()) + return int(sr_raw) + + +def _is_streaming_stage_config(stage_config_path: str) -> bool: + return "async_chunk" in Path(stage_config_path).stem + + +def _save_audio(audio: torch.Tensor, sample_rate: int, output_dir: Path, request_id: str) -> Path: + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / f"output_{request_id}.wav" + sf.write( + output_path, + audio.float().cpu().clamp(-1.0, 1.0).numpy(), + sample_rate, + format="WAV", + subtype="PCM_16", + ) + return output_path + + +async def _run_streaming(args) -> Path: + prompt = _build_prompt(args) + output_dir = Path(args.output_dir) if args.output_dir is not None else Path("output_audio_streaming") + request_id = "streaming_example" + sample_rate = 24000 + buffered_samples = 0 + chunks: list[torch.Tensor] = [] + started = time.perf_counter() + omni = AsyncOmni( + 
model=args.model, + stage_configs_path=args.stage_configs_path, + log_stats=args.log_stats, + stage_init_timeout=args.stage_init_timeout, + ) + try: + async for stage_output in omni.generate(prompt, request_id=request_id): + mm = getattr(stage_output, "multimodal_output", None) + if not isinstance(mm, dict): + request_output = getattr(stage_output, "request_output", None) + if request_output is None: + continue + mm = getattr(request_output, "multimodal_output", None) + if not isinstance(mm, dict) and getattr(request_output, "outputs", None): + mm = getattr(request_output.outputs[0], "multimodal_output", None) + if not isinstance(mm, dict): + continue + audio = _extract_audio_tensor(mm) + if audio.numel() == 0: + continue + sample_rate = _extract_sample_rate(mm) + if audio.numel() > buffered_samples: + delta = audio[buffered_samples:] + buffered_samples = int(audio.numel()) + else: + delta = audio + buffered_samples += int(delta.numel()) + if delta.numel() > 0: + chunks.append(delta) + if not chunks: + raise RuntimeError("No streaming audio chunks received from VoxCPM.") + output_audio = torch.cat(chunks, dim=0) + output_path = _save_audio(output_audio, sample_rate, output_dir, request_id) + print(f"Saved streaming audio to: {output_path} ({time.perf_counter() - started:.2f}s)") + return output_path + finally: + omni.shutdown() + + +def _run_sync(args) -> Path: + prompt = _build_prompt(args) + output_dir = Path(args.output_dir) if args.output_dir is not None else Path("output_audio") + request_id = "sync_example" + started = time.perf_counter() + last_mm: dict[str, Any] | None = None + omni = Omni( + model=args.model, + stage_configs_path=args.stage_configs_path, + log_stats=args.log_stats, + stage_init_timeout=args.stage_init_timeout, + ) + for stage_outputs in omni.generate(prompt): + request_output = getattr(stage_outputs, "request_output", None) + if request_output is None: + continue + outputs = getattr(request_output, "outputs", None) + if outputs: + for output in outputs: + mm = getattr(output, "multimodal_output", None) + if isinstance(mm, dict): + last_mm = mm + mm = getattr(request_output, "multimodal_output", None) + if isinstance(mm, dict): + last_mm = mm + if last_mm is None: + raise RuntimeError("No audio output received from VoxCPM.") + output_path = _save_audio( + _extract_audio_tensor(last_mm), + _extract_sample_rate(last_mm), + output_dir, + request_id, + ) + print(f"Saved audio to: {output_path} ({time.perf_counter() - started:.2f}s)") + return output_path + + +def parse_args(): + parser = FlexibleArgumentParser(description="Minimal offline VoxCPM example for vLLM Omni.") + parser.add_argument("--model", type=str, required=True, help="Local VoxCPM model directory.") + parser.add_argument( + "--stage-configs-path", + type=str, + default=str(DEFAULT_SYNC_STAGE_CONFIG), + help=("Stage config path. 
Use voxcpm.yaml for non-streaming or voxcpm_async_chunk.yaml for streaming."), + ) + parser.add_argument("--text", type=str, required=True, help="Input text for synthesis.") + parser.add_argument("--ref-audio", type=str, default=None, help="Reference audio path for voice cloning.") + parser.add_argument("--ref-text", type=str, default=None, help="Transcript of the reference audio.") + parser.add_argument("--output-dir", type=str, default=None, help="Output directory for generated wav files.") + parser.add_argument("--cfg-value", type=float, default=2.0, help="Guidance value passed to VoxCPM.") + parser.add_argument("--inference-timesteps", type=int, default=10, help="Number of diffusion timesteps.") + parser.add_argument("--min-len", type=int, default=2, help="Minimum latent length.") + parser.add_argument("--max-new-tokens", type=int, default=4096, help="Maximum latent length.") + parser.add_argument( + "--streaming-prefix-len", + type=int, + default=3, + help="Streaming prefix length used by voxcpm_async_chunk.yaml.", + ) + parser.add_argument("--stage-init-timeout", type=int, default=600, help="Stage initialization timeout in seconds.") + parser.add_argument("--log-stats", action="store_true", help="Enable vLLM Omni stats logging.") + args = parser.parse_args() + if (args.ref_audio is None) != (args.ref_text is None): + raise ValueError("Voice cloning requires --ref-audio and --ref-text together.") + return args + + +def main(args) -> None: + route = "streaming" if _is_streaming_stage_config(args.stage_configs_path) else "sync" + print(f"Model: {args.model}") + print(f"Stage config: {args.stage_configs_path}") + print(f"Route: {route}") + if route == "streaming": + asyncio.run(_run_streaming(args)) + else: + _run_sync(args) + + +if __name__ == "__main__": + main(parse_args()) diff --git a/examples/online_serving/voxcpm/README.md b/examples/online_serving/voxcpm/README.md new file mode 100644 index 0000000000..78e1bf4aaa --- /dev/null +++ b/examples/online_serving/voxcpm/README.md @@ -0,0 +1,166 @@ +# VoxCPM + +## Prerequisites + +Install VoxCPM in one of these ways: + +```bash +pip install voxcpm +``` + +or point vLLM-Omni to a local VoxCPM source tree: + +```bash +export VLLM_OMNI_VOXCPM_CODE_PATH=/path/to/VoxCPM/src +``` + +If the native VoxCPM `config.json` lacks HF metadata such as `model_type`, +prepare a persistent HF-compatible config directory and export: + +```bash +export VLLM_OMNI_VOXCPM_HF_CONFIG_PATH=/tmp/voxcpm_hf_config +mkdir -p "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH" +cp "$VOXCPM_MODEL/config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/config.json" +cp "$VOXCPM_MODEL/generation_config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/generation_config.json" 2>/dev/null || true +python3 -c 'import json, os; p=os.path.join(os.environ["VLLM_OMNI_VOXCPM_HF_CONFIG_PATH"], "config.json"); cfg=json.load(open(p, "r", encoding="utf-8")); cfg["model_type"]="voxcpm"; cfg.setdefault("architectures", ["VoxCPMForConditionalGeneration"]); json.dump(cfg, open(p, "w", encoding="utf-8"), indent=2, ensure_ascii=False)' +``` + +The VoxCPM stage configs read `VLLM_OMNI_VOXCPM_HF_CONFIG_PATH` directly. The `python3 -c` form above avoids heredoc/indentation issues in interactive shells. 
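+
+If you prefer a readable script over the one-liner, the same patch can be written as a
+short Python snippet (a sketch that assumes `VLLM_OMNI_VOXCPM_HF_CONFIG_PATH` is already
+exported and already contains the copied `config.json`):
+
+```python
+import json
+import os
+from pathlib import Path
+
+# Patch the copied config.json so vLLM-Omni can resolve the VoxCPM model type.
+config_path = Path(os.environ["VLLM_OMNI_VOXCPM_HF_CONFIG_PATH"]) / "config.json"
+cfg = json.loads(config_path.read_text(encoding="utf-8"))
+cfg["model_type"] = "voxcpm"
+cfg.setdefault("architectures", ["VoxCPMForConditionalGeneration"])
+config_path.write_text(json.dumps(cfg, indent=2, ensure_ascii=False), encoding="utf-8")
+```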
+ +## Launch the Server + +Use the async-chunk stage config by default: + +```bash +export VOXCPM_MODEL=/path/to/voxcpm-model +cd examples/online_serving/voxcpm +./run_server.sh +``` + +Use the non-streaming stage config: + +```bash +./run_server.sh sync +``` + +You can also launch the server directly: + +```bash +vllm serve "$VOXCPM_MODEL" \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \ + --trust-remote-code \ + --enforce-eager \ + --omni \ + --port 8091 +``` + +## Send Requests + +### Basic text-to-speech + +```bash +python openai_speech_client.py \ + --model "$VOXCPM_MODEL" \ + --text "This is a VoxCPM online text-to-speech example." +``` + +### Voice cloning + +```bash +python openai_speech_client.py \ + --model "$VOXCPM_MODEL" \ + --text "This sentence is synthesized with a cloned voice." \ + --ref-audio /path/to/reference.wav \ + --ref-text "The exact transcript spoken in reference.wav." +``` + +`ref_text` must be the real transcript of the reference audio. Placeholder text or mismatched text will usually degrade quality badly. + +### Streaming PCM output + +```bash +python openai_speech_client.py \ + --model "$VOXCPM_MODEL" \ + --text "This is a streaming VoxCPM request." \ + --stream \ + --output voxcpm_stream.pcm +``` + +### Using curl + +```bash +curl -X POST http://localhost:8091/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{ + "model": "OpenBMB/VoxCPM1.5", + "input": "Hello from VoxCPM online serving.", + "response_format": "wav" + }' --output output.wav +``` + +Voice cloning: + +```bash +curl -X POST http://localhost:8091/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{ + "model": "OpenBMB/VoxCPM1.5", + "input": "This sentence uses a cloned voice.", + "ref_audio": "https://example.com/reference.wav", + "ref_text": "The exact transcript spoken in the reference audio.", + "response_format": "wav" + }' --output cloned.wav +``` + +Streaming PCM: + +```bash +curl -X POST http://localhost:8091/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{ + "model": "OpenBMB/VoxCPM1.5", + "input": "This is a streaming VoxCPM request.", + "stream": true, + "response_format": "pcm" + }' --output output.pcm +``` + +## Supported Request Shape + +VoxCPM online serving currently supports: + +- plain text-to-speech +- voice cloning with `ref_audio` + `ref_text` +- `stream=true` with `response_format=pcm` or `wav` + +VoxCPM online serving does not use these generic TTS fields: + +- `voice` +- `instructions` +- `language` +- `speaker_embedding` +- `x_vector_only_mode` + +## Streaming vs Non-Streaming + +- `voxcpm_async_chunk.yaml` enables async-chunk streaming and is best for single-request streaming latency. +- `voxcpm.yaml` performs one-shot latent generation then VAE decode. + +Like native VoxCPM, the async streaming path should be treated as single-request. If you need stable throughput benchmarking, prefer `voxcpm.yaml`. + +Do not use `voxcpm_async_chunk.yaml` for concurrent online streaming or `/v1/audio/speech/batch`. For multiple requests, prefer `voxcpm.yaml`. + +## Benchmark + +The serving benchmark reports TTFP and RTF: + +```bash +python benchmarks/voxcpm/vllm_omni/bench_tts_serve.py \ + --host 127.0.0.1 \ + --port 8091 \ + --num-prompts 10 \ + --max-concurrency 1 \ + --result-dir /tmp/voxcpm_bench +``` + +For the async-chunk server, keep `--max-concurrency 1`. 
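+
+## Converting Streamed PCM to WAV
+
+The streaming examples above save raw PCM bytes. The sketch below wraps a captured
+stream into a playable WAV file; it assumes 16-bit little-endian mono PCM at 24 kHz,
+so adjust the parameters if your deployment emits a different format:
+
+```python
+import wave
+
+# Read the raw PCM bytes captured from the streaming /v1/audio/speech endpoint.
+with open("voxcpm_stream.pcm", "rb") as pcm_file:
+    pcm_bytes = pcm_file.read()
+
+# Wrap the samples in a WAV container so standard players can open them.
+with wave.open("voxcpm_stream.wav", "wb") as wav_file:
+    wav_file.setnchannels(1)      # mono
+    wav_file.setsampwidth(2)      # 16-bit samples
+    wav_file.setframerate(24000)  # assumed VoxCPM output sample rate
+    wav_file.writeframes(pcm_bytes)
+```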
diff --git a/examples/online_serving/voxcpm/openai_speech_client.py b/examples/online_serving/voxcpm/openai_speech_client.py new file mode 100644 index 0000000000..c400114e8b --- /dev/null +++ b/examples/online_serving/voxcpm/openai_speech_client.py @@ -0,0 +1,155 @@ +"""OpenAI-compatible client for VoxCPM via /v1/audio/speech. + +Examples: + # Basic text-to-speech + python openai_speech_client.py --text "Hello from VoxCPM" + + # Voice cloning + python openai_speech_client.py \ + --text "This sentence uses the cloned voice." \ + --ref-audio /path/to/reference.wav \ + --ref-text "The exact transcript spoken in the reference audio." + + # Streaming PCM output + python openai_speech_client.py \ + --text "This is a streaming VoxCPM request." \ + --stream \ + --output output.pcm +""" + +import argparse +import base64 +import os + +import httpx + +DEFAULT_API_BASE = "http://localhost:8091" +DEFAULT_API_KEY = "EMPTY" +DEFAULT_MODEL = "OpenBMB/VoxCPM1.5" + + +def encode_audio_to_base64(audio_path: str) -> str: + """Encode a local audio file to base64 data URL.""" + if not os.path.exists(audio_path): + raise FileNotFoundError(f"Audio file not found: {audio_path}") + + ext = audio_path.lower().rsplit(".", 1)[-1] + mime_map = { + "wav": "audio/wav", + "mp3": "audio/mpeg", + "flac": "audio/flac", + "ogg": "audio/ogg", + } + mime_type = mime_map.get(ext, "audio/wav") + + with open(audio_path, "rb") as f: + audio_b64 = base64.b64encode(f.read()).decode("utf-8") + return f"data:{mime_type};base64,{audio_b64}" + + +def build_payload(args) -> dict[str, object]: + payload: dict[str, object] = { + "model": args.model, + "input": args.text, + "response_format": "pcm" if args.stream else args.response_format, + } + + if args.ref_audio: + if args.ref_audio.startswith(("http://", "https://", "data:")): + payload["ref_audio"] = args.ref_audio + else: + payload["ref_audio"] = encode_audio_to_base64(args.ref_audio) + if args.ref_text: + payload["ref_text"] = args.ref_text + if args.max_new_tokens is not None: + payload["max_new_tokens"] = args.max_new_tokens + if args.stream: + payload["stream"] = True + + return payload + + +def run_tts(args) -> None: + payload = build_payload(args) + api_url = f"{args.api_base}/v1/audio/speech" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {args.api_key}", + } + + print(f"Model: {args.model}") + print(f"Text: {args.text}") + if args.ref_audio: + print("Mode: voice cloning") + print(f"Reference audio: {args.ref_audio}") + else: + print("Mode: text-to-speech") + + if args.stream: + output_path = args.output or "voxcpm_output.pcm" + with httpx.Client(timeout=300.0) as client: + with client.stream("POST", api_url, json=payload, headers=headers) as response: + if response.status_code != 200: + print(f"Error: {response.status_code}") + print(response.read().decode("utf-8", errors="ignore")) + return + + total_bytes = 0 + with open(output_path, "wb") as f: + for chunk in response.iter_bytes(): + if not chunk: + continue + f.write(chunk) + total_bytes += len(chunk) + print(f"Streamed {total_bytes} bytes to: {output_path}") + return + + with httpx.Client(timeout=300.0) as client: + response = client.post(api_url, json=payload, headers=headers) + + if response.status_code != 200: + print(f"Error: {response.status_code}") + print(response.text) + return + + try: + text = response.content.decode("utf-8") + if text.startswith('{"error"'): + print(f"Error: {text}") + return + except UnicodeDecodeError: + pass + + output_path = args.output or 
"voxcpm_output.wav" + with open(output_path, "wb") as f: + f.write(response.content) + print(f"Audio saved to: {output_path}") + + +def main(): + parser = argparse.ArgumentParser(description="VoxCPM OpenAI-compatible speech client") + parser.add_argument("--api-base", default=DEFAULT_API_BASE, help="API base URL") + parser.add_argument("--api-key", default=DEFAULT_API_KEY, help="API key") + parser.add_argument("--model", "-m", default=DEFAULT_MODEL, help="Model name or path") + parser.add_argument("--text", required=True, help="Text to synthesize") + parser.add_argument("--ref-audio", default=None, help="Reference audio path, URL, or data URL") + parser.add_argument( + "--ref-text", + default=None, + help="The exact transcript spoken in the reference audio", + ) + parser.add_argument("--stream", action="store_true", help="Enable streaming PCM output") + parser.add_argument( + "--response-format", + default="wav", + choices=["wav", "pcm", "flac", "mp3", "aac", "opus"], + help="Audio format for non-streaming mode (default: wav)", + ) + parser.add_argument("--max-new-tokens", type=int, default=None, help="Maximum tokens to generate") + parser.add_argument("--output", "-o", default=None, help="Output file path") + args = parser.parse_args() + run_tts(args) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/voxcpm/run_server.sh b/examples/online_serving/voxcpm/run_server.sh new file mode 100755 index 0000000000..ab4b6fe854 --- /dev/null +++ b/examples/online_serving/voxcpm/run_server.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Launch vLLM-Omni server for VoxCPM online speech serving. +# +# Usage: +# ./run_server.sh # default: async_chunk stage config +# ./run_server.sh async # async_chunk stage config +# ./run_server.sh sync # no-async-chunk stage config +# VOXCPM_MODEL=/path/to/model ./run_server.sh + +set -e + +MODE="${1:-async}" +MODEL="${VOXCPM_MODEL:-OpenBMB/VoxCPM1.5}" + +case "$MODE" in + async) + STAGE_CONFIG="vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml" + ;; + sync) + STAGE_CONFIG="vllm_omni/model_executor/stage_configs/voxcpm.yaml" + ;; + *) + echo "Unknown mode: $MODE" + echo "Supported: async, sync" + exit 1 + ;; +esac + +echo "Starting VoxCPM server with model: $MODEL" +echo "Stage config: $STAGE_CONFIG" + +vllm serve "$MODEL" \ + --stage-configs-path "$STAGE_CONFIG" \ + --host 0.0.0.0 \ + --port 8091 \ + --trust-remote-code \ + --enforce-eager \ + --omni diff --git a/tests/e2e/offline_inference/test_voxcpm.py b/tests/e2e/offline_inference/test_voxcpm.py new file mode 100644 index 0000000000..d7f65525e9 --- /dev/null +++ b/tests/e2e/offline_inference/test_voxcpm.py @@ -0,0 +1,156 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""E2E test for VoxCPM offline inference.""" + +import json +import os +from pathlib import Path +from typing import Any + +import numpy as np +import pytest +import torch + +import tests.conftest as omni_test_conftest +from tests.conftest import OmniRunner +from tests.utils import hardware_test +from vllm_omni.model_executor.models.voxcpm.voxcpm_runtime_utils import ( + prepare_voxcpm_hf_config_dir, + resolve_voxcpm_model_dir, +) + +VOXCPM_MODEL = os.environ.get("VOXCPM_MODEL", "OpenBMB/VoxCPM1.5") +STAGE_CONFIG = str( + Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm.yaml" +) +SAMPLE_RATE = 24000 + + +@pytest.fixture(autouse=True) +def _patch_npu_cleanup_for_voxcpm(monkeypatch: pytest.MonkeyPatch): 
+ """Limit the NPU cleanup workaround to this VoxCPM test module only.""" + original_cleanup = omni_test_conftest.cleanup_dist_env_and_memory + + def _safe_cleanup() -> None: + try: + original_cleanup() + except RuntimeError as exc: + if "Allocator for npu is not a DeviceAllocator" in str(exc): + return + raise + + monkeypatch.setattr(omni_test_conftest, "cleanup_dist_env_and_memory", _safe_cleanup) + + +def _build_prompt(text: str) -> dict[str, Any]: + return { + "prompt_token_ids": [1], + "additional_information": { + "text": [text], + "cfg_value": [2.0], + "inference_timesteps": [10], + "min_len": [2], + "max_new_tokens": [1024], + }, + } + + +def _extract_audio_tensor(multimodal_output: dict[str, Any]) -> torch.Tensor: + audio = multimodal_output.get("audio", multimodal_output.get("model_outputs")) + assert audio is not None, f"No audio output found, keys={list(multimodal_output.keys())}" + + if isinstance(audio, list): + parts: list[torch.Tensor] = [] + for item in audio: + if item is None: + continue + tensor = torch.as_tensor(item) + if tensor.numel() == 0: + continue + parts.append(tensor.float().cpu().reshape(-1)) + return torch.cat(parts, dim=-1) if parts else torch.zeros((0,), dtype=torch.float32) + + return torch.as_tensor(audio).float().cpu().reshape(-1) + + +def _extract_final_multimodal_output(outputs) -> dict[str, Any]: + for item in reversed(outputs): + request_output = getattr(item, "request_output", None) + if request_output is not None: + multimodal_output = getattr(request_output, "multimodal_output", None) + if isinstance(multimodal_output, dict): + return multimodal_output + completions = getattr(request_output, "outputs", None) or [] + for completion in completions: + multimodal_output = getattr(completion, "multimodal_output", None) + if isinstance(multimodal_output, dict): + return multimodal_output + + multimodal_output = getattr(item, "multimodal_output", None) + if isinstance(multimodal_output, dict): + return multimodal_output + + raise AssertionError("No multimodal audio output found in VoxCPM generate results") + + +@pytest.fixture +def voxcpm_model_path(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> str: + model_dir = resolve_voxcpm_model_dir(VOXCPM_MODEL) + + hf_config_env = os.environ.get("VLLM_OMNI_VOXCPM_HF_CONFIG_PATH") + if hf_config_env: + hf_config_dir = Path(hf_config_env).expanduser() + else: + hf_config_dir = tmp_path / "voxcpm_hf_config" + + if not (hf_config_dir / "config.json").exists(): + prepare_voxcpm_hf_config_dir(model_dir, hf_config_dir) + + monkeypatch.setenv("VLLM_OMNI_VOXCPM_HF_CONFIG_PATH", str(hf_config_dir)) + return str(model_dir) + + +def test_prepare_voxcpm_hf_config_dir(tmp_path: Path): + model_dir = tmp_path / "model" + model_dir.mkdir() + (model_dir / "config.json").write_text(json.dumps({"hidden_size": 1024}), encoding="utf-8") + (model_dir / "generation_config.json").write_text(json.dumps({"do_sample": False}), encoding="utf-8") + + hf_config_dir = prepare_voxcpm_hf_config_dir(model_dir, tmp_path / "voxcpm_hf_config") + + prepared_config = json.loads((hf_config_dir / "config.json").read_text(encoding="utf-8")) + assert prepared_config["model_type"] == "voxcpm" + assert prepared_config["architectures"] == ["VoxCPMForConditionalGeneration"] + assert (hf_config_dir / "generation_config.json").exists() + + +def test_resolve_voxcpm_model_dir_local_path(tmp_path: Path): + model_dir = tmp_path / "OpenBMB" / "VoxCPM1.5" + model_dir.mkdir(parents=True) + + assert resolve_voxcpm_model_dir(str(model_dir)) == model_dir + + 
+@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "L4"}, num_cards=1) +def test_voxcpm_zero_shot_001(voxcpm_model_path: str): + with OmniRunner(voxcpm_model_path, stage_configs_path=STAGE_CONFIG) as runner: + outputs = list(runner.omni.generate(_build_prompt("Hello, this is a VoxCPM offline inference test."))) + + assert outputs, "No outputs returned" + + multimodal_output = _extract_final_multimodal_output(outputs) + audio = _extract_audio_tensor(multimodal_output) + assert audio.numel() > SAMPLE_RATE // 2, f"Audio too short: {audio.numel()} samples" + + duration_s = audio.shape[0] / SAMPLE_RATE + assert 0.5 < duration_s < 30.0, f"Audio duration out of range: {duration_s:.2f}s" + + peak = float(torch.max(torch.abs(audio)).item()) if audio.numel() > 0 else 0.0 + assert peak > 0.01, "Generated audio appears to be silence" + + audio_np = audio.numpy() + rms = float(np.sqrt(np.mean(np.square(audio_np)))) if audio_np.size else 0.0 + assert rms > 1e-4, "Generated audio RMS too low" diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 35d55f1cc4..565c83c1ad 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -7,6 +7,7 @@ import argparse import inspect from types import SimpleNamespace +from unittest.mock import Mock import pytest from pydantic import ValidationError @@ -166,6 +167,24 @@ def test_stage_configs_path_field(): assert args.stage_configs_path == "/some/path.yaml" +def test_voxcpm_model_arch_injects_model_type_override(mocker): + """Ensure VoxCPM model_arch injects hf_overrides for config resolution.""" + mocker.patch.object(OmniEngineArgs, "_ensure_omni_models_registered", return_value=True) + mocker.patch.object(OmniEngineArgs, "_patch_empty_hf_config") + mocker.patch.object(EngineArgs, "create_model_config", return_value=Mock()) + mocker.patch.object(OmniModelConfig, "from_vllm_model_config", return_value=Mock()) + + args = OmniEngineArgs( + model="OpenBMB/VoxCPM1.5", + model_arch="VoxCPMForConditionalGeneration", + ) + args.create_model_config() + + assert args.hf_overrides["architectures"] == ["VoxCPMForConditionalGeneration"] + assert args.hf_overrides["model_type"] == "voxcpm" + args._patch_empty_hf_config.assert_called_once_with("voxcpm") + + def test_strip_single_engine_args(): """_strip_single_engine_args should remove EngineArgs fields but keep omni fields.""" kwargs = { diff --git a/tests/entrypoints/openai_api/test_serving_speech_voxcpm.py b/tests/entrypoints/openai_api/test_serving_speech_voxcpm.py new file mode 100644 index 0000000000..48660b6d1c --- /dev/null +++ b/tests/entrypoints/openai_api/test_serving_speech_voxcpm.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""UTs for VoxCPM OpenAI speech serving behavior.""" + +import asyncio +from types import SimpleNamespace +from unittest.mock import AsyncMock + +import pytest +from pytest_mock import MockerFixture + +from vllm_omni.entrypoints.openai.protocol.audio import OpenAICreateSpeechRequest +from vllm_omni.entrypoints.openai.serving_speech import OmniOpenAIServingSpeech + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +@pytest.fixture +def voxcpm_server(mocker: MockerFixture): + mocker.patch.object(OmniOpenAIServingSpeech, "_load_supported_speakers", return_value=set()) + mocker.patch.object(OmniOpenAIServingSpeech, "_load_codec_frame_rate", return_value=None) + + mock_engine_client = mocker.MagicMock() + mock_engine_client.errored = 
False + mock_engine_client.model_config = mocker.MagicMock(model="OpenBMB/VoxCPM1.5") + mock_engine_client.default_sampling_params_list = [SimpleNamespace(max_tokens=2048)] + mock_engine_client.tts_batch_max_items = 32 + mock_engine_client.generate = mocker.MagicMock(return_value="generator") + mock_engine_client.stage_configs = [ + SimpleNamespace( + engine_args=SimpleNamespace( + model_stage="latent_generator", + model_arch="VoxCPMForConditionalGeneration", + ), + tts_args={}, + ), + SimpleNamespace( + engine_args=SimpleNamespace(model_stage="vae"), + tts_args={}, + ), + ] + + mock_models = mocker.MagicMock() + mock_models.is_base_model.return_value = True + + return OmniOpenAIServingSpeech( + engine_client=mock_engine_client, + models=mock_models, + request_logger=mocker.MagicMock(), + ) + + +class TestVoxCPMServing: + def test_voxcpm_model_type_detection(self, voxcpm_server): + assert voxcpm_server._tts_model_type == "voxcpm" + assert voxcpm_server._is_tts is True + assert voxcpm_server.supported_speakers == set() + + @pytest.mark.parametrize( + ("request_kwargs", "expected_substring"), + [ + ({"voice": "alice"}, "voice"), + ({"instructions": "whisper"}, "instructions"), + ({"language": "en"}, "language"), + ({"task_type": "CustomVoice"}, "plain tts"), + ({"x_vector_only_mode": True}, "x_vector_only_mode"), + ({"speaker_embedding": [0.1, 0.2]}, "speaker_embedding"), + ({"initial_codec_chunk_frames": 4}, "initial_codec_chunk_frames"), + ({"ref_text": "reference"}, "ref_audio"), + ], + ) + def test_validate_voxcpm_rejects_unsupported_fields(self, voxcpm_server, request_kwargs, expected_substring): + request = OpenAICreateSpeechRequest(input="hello voxcpm", **request_kwargs) + error = voxcpm_server._validate_voxcpm_request(request) + assert error is not None + assert expected_substring in error.lower() + + def test_validate_voxcpm_accepts_plain_tts_request(self, voxcpm_server): + request = OpenAICreateSpeechRequest(input="hello voxcpm", max_new_tokens=256) + assert voxcpm_server._validate_voxcpm_request(request) is None + + def test_validate_voxcpm_accepts_voice_clone_request(self, voxcpm_server): + request = OpenAICreateSpeechRequest( + input="clone this voice", + ref_audio="data:audio/wav;base64,QUJD", + ref_text="reference transcript", + max_new_tokens=256, + ) + assert voxcpm_server._validate_voxcpm_request(request) is None + + def test_prepare_speech_generation_voxcpm_text_only(self, voxcpm_server): + request = OpenAICreateSpeechRequest(input="hello voxcpm", max_new_tokens=321) + + request_id, generator, tts_params = asyncio.run(voxcpm_server._prepare_speech_generation(request)) + + assert request_id.startswith("speech-") + assert generator == "generator" + assert tts_params == { + "text": ["hello voxcpm"], + "cfg_value": [2.0], + "inference_timesteps": [10], + "min_len": [2], + "max_new_tokens": [321], + } + + voxcpm_server.engine_client.generate.assert_called_once() + call = voxcpm_server.engine_client.generate.call_args + assert call.kwargs["prompt"] == { + "prompt_token_ids": [1], + "additional_information": tts_params, + } + assert call.kwargs["output_modalities"] == ["audio"] + + def test_prepare_speech_generation_voxcpm_voice_clone_resolves_ref_audio(self, voxcpm_server): + voxcpm_server._resolve_ref_audio = AsyncMock(return_value=([0.1, -0.1, 0.2], 16000)) + request = OpenAICreateSpeechRequest( + input="clone this voice", + ref_audio="data:audio/wav;base64,QUJD", + ref_text="reference transcript", + max_new_tokens=512, + ) + + request_id, generator, tts_params = 
asyncio.run(voxcpm_server._prepare_speech_generation(request)) + + assert request_id.startswith("speech-") + assert generator == "generator" + assert tts_params == { + "text": ["clone this voice"], + "cfg_value": [2.0], + "inference_timesteps": [10], + "min_len": [2], + "max_new_tokens": [512], + "ref_text": ["reference transcript"], + "ref_audio": [[[0.1, -0.1, 0.2], 16000]], + } + + voxcpm_server._resolve_ref_audio.assert_awaited_once_with("data:audio/wav;base64,QUJD") + call = voxcpm_server.engine_client.generate.call_args + assert call.kwargs["prompt"] == { + "prompt_token_ids": [1], + "additional_information": tts_params, + } diff --git a/tests/entrypoints/test_utils.py b/tests/entrypoints/test_utils.py index 94e254c250..248629d51d 100644 --- a/tests/entrypoints/test_utils.py +++ b/tests/entrypoints/test_utils.py @@ -310,6 +310,39 @@ def mock_exists(path): assert result is not None assert "glm_image.yaml" in result + def test_voxcpm_transformers_format_resolution(self, mocker: MockerFixture): + """Test VoxCPM transformers config resolves to the voxcpm stage config.""" + mocker.patch( + "vllm_omni.entrypoints.utils.get_config", + side_effect=ValueError("missing transformers config"), + ) + mocker.patch( + "vllm_omni.entrypoints.utils.file_or_path_exists", + side_effect=lambda _model, filename, revision=None: filename == "config.json", + ) + mocker.patch( + "vllm_omni.entrypoints.utils.get_hf_file_to_dict", + return_value={"model_type": "voxcpm"}, + ) + mocker.patch( + "vllm_omni.entrypoints.utils.current_omni_platform.get_default_stage_config_path", + return_value="vllm_omni/model_executor/stage_configs", + ) + + original_exists = os.path.exists + + def mock_exists(path): + if "voxcpm.yaml" in str(path): + return True + return original_exists(path) + + mocker.patch("os.path.exists", side_effect=mock_exists) + + result = resolve_model_config_path("OpenBMB/VoxCPM1.5") + + assert result is not None + assert "voxcpm.yaml" in result + class TestLoadAndResolveStageConfigs: def test_load_and_resolve_with_kwargs(self): diff --git a/tests/model_executor/stage_input_processors/test_voxcpm_async_chunk.py b/tests/model_executor/stage_input_processors/test_voxcpm_async_chunk.py new file mode 100644 index 0000000000..7d6fc6e74c --- /dev/null +++ b/tests/model_executor/stage_input_processors/test_voxcpm_async_chunk.py @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""UTs for VoxCPM async-chunk stage input processing.""" + +from types import SimpleNamespace + +import pytest +import torch + +from vllm_omni.model_executor.stage_input_processors.voxcpm import ( + _VOXCPM_LATENT_MAGIC, + _coerce_finished_flag, + latent2vae_async_chunk, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def _request(*, finished): + return SimpleNamespace(is_finished=lambda: finished) + + +def _decode_serialized_latent(codes: list[int]) -> torch.Tensor: + assert codes[0] == _VOXCPM_LATENT_MAGIC + latent_dim = codes[1] + time_dim = codes[2] + payload = torch.tensor(codes[3:], dtype=torch.int32).to(torch.uint16) + return payload.view(torch.bfloat16).to(torch.float32).reshape(1, latent_dim, time_dim) + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + (None, False), + (False, False), + (True, True), + (torch.tensor(False), False), + (torch.tensor(True), True), + ([torch.tensor(True)], True), + (([True],), True), + ([], False), + ], +) +def test_coerce_finished_flag(value, expected): + assert _coerce_finished_flag(value) 
is expected + + +def test_latent2vae_async_chunk_serializes_latent_payload(): + latent = torch.arange(6, dtype=torch.float32).reshape(2, 3) + + payload = latent2vae_async_chunk( + transfer_manager=None, + pooling_output={"latent_audio_feat": latent}, + request=_request(finished=False), + is_finished=torch.tensor(False), + ) + + assert payload is not None + assert torch.equal(payload["finished"], torch.tensor(False, dtype=torch.bool)) + recovered = _decode_serialized_latent(payload["code_predictor_codes"]) + torch.testing.assert_close(recovered, latent.to(torch.bfloat16).to(torch.float32).unsqueeze(0)) + + +def test_latent2vae_async_chunk_returns_terminal_marker_without_latent(): + payload = latent2vae_async_chunk( + transfer_manager=None, + pooling_output=None, + request=_request(finished=[torch.tensor(True)]), + is_finished=False, + ) + + assert payload == { + "code_predictor_codes": [], + "finished": torch.tensor(True, dtype=torch.bool), + } + + +def test_latent2vae_async_chunk_returns_none_for_nonterminal_empty_chunk(): + payload = latent2vae_async_chunk( + transfer_manager=None, + pooling_output={"latent_audio_feat": torch.zeros((0,), dtype=torch.float32)}, + request=_request(finished=False), + is_finished=False, + ) + + assert payload is None diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index d61102c7e1..5b69d6b1f0 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -21,6 +21,7 @@ "CosyVoice3Model": "cosyvoice3", "OmniVoiceModel": "omnivoice", "VoxCPM2TalkerForConditionalGeneration": "voxcpm2", + "VoxCPMForConditionalGeneration": "voxcpm", } # Maps model architecture names to tokenizer subfolder paths within HF repos. @@ -41,6 +42,7 @@ def _register_omni_hf_configs() -> None: from vllm_omni.model_executor.models.voxtral_tts.configuration_voxtral_tts import ( VoxtralTTSConfig, ) + from vllm_omni.transformers_utils.configs.voxcpm import VoxCPMConfig from vllm_omni.transformers_utils.configs.voxcpm2 import VoxCPM2Config except Exception as exc: # pragma: no cover - best-effort optional registration logger.warning("Skipping omni HF config registration due to import error: %s", exc) @@ -59,6 +61,7 @@ def _register_omni_hf_configs() -> None: ("cosyvoice3", CosyVoice3Config), ("omnivoice", OmniVoiceConfig), ("voxtral_tts", VoxtralTTSConfig), + ("voxcpm", VoxCPMConfig), ("voxcpm2", VoxCPM2Config), ]: try: diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 1d9754853f..1f78f5691b 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -49,6 +49,7 @@ _FISH_TTS_MODEL_STAGES = {"fish_speech_slow_ar"} _COSYVOICE3_TTS_MODEL_STAGES = {"cosyvoice3_talker"} _OMNIVOICE_TTS_MODEL_STAGES = {"omnivoice_generator"} +_VOXCPM_TTS_MODEL_STAGES = {"latent_generator", "vae"} _VOXCPM2_TTS_MODEL_STAGES = {"latent_generator"} _TTS_MODEL_STAGES: set[str] = ( _VOXTRAL_TTS_MODEL_STAGES @@ -56,6 +57,7 @@ | _FISH_TTS_MODEL_STAGES | _COSYVOICE3_TTS_MODEL_STAGES | _OMNIVOICE_TTS_MODEL_STAGES + | _VOXCPM_TTS_MODEL_STAGES | _VOXCPM2_TTS_MODEL_STAGES ) _TTS_LANGUAGES: set[str] = { @@ -282,6 +284,11 @@ def _detect_tts_model_type(self) -> str | None: if self._tts_stage is None: return None model_stage = getattr(self._tts_stage.engine_args, "model_stage", None) + model_arch = getattr(self._tts_stage.engine_args, "model_arch", None) + if model_arch == "VoxCPM2TalkerForConditionalGeneration": + return "voxcpm2" + if model_arch == 
"VoxCPMForConditionalGeneration": + return "voxcpm" if model_stage in _QWEN3_TTS_MODEL_STAGES: return "qwen3_tts" if model_stage in _VOXTRAL_TTS_MODEL_STAGES: @@ -292,8 +299,12 @@ def _detect_tts_model_type(self) -> str | None: return "cosyvoice3" if model_stage in _OMNIVOICE_TTS_MODEL_STAGES: return "omnivoice" - if model_stage in _VOXCPM2_TTS_MODEL_STAGES: - return "voxcpm2" + if model_stage in (_VOXCPM_TTS_MODEL_STAGES | _VOXCPM2_TTS_MODEL_STAGES): + has_vae_stage = any( + getattr(getattr(stage, "engine_args", None), "model_stage", None) == "vae" + for stage in self.engine_client.stage_configs + ) + return "voxcpm" if has_vae_stage or model_stage == "vae" else "voxcpm2" return None def _compute_max_instructions_length(self) -> int: @@ -318,6 +329,8 @@ def _compute_max_instructions_length(self) -> int: def _load_supported_speakers(self) -> set[str]: """Load supported speakers (case-insensitive) from the model configuration.""" try: + if self._tts_model_type == "voxcpm": + return set() if self._tts_model_type == "voxtral_tts": config = self.engine_client.model_config.hf_config.audio_config else: @@ -377,6 +390,8 @@ def _estimate_ref_code_len(self, ref_audio: object) -> int | None: def _estimate_prompt_len(self, tts_params: dict[str, Any]) -> int: """Estimate prompt length so the placeholder matches model-side embeddings.""" try: + if self._tts_model_type == "voxcpm": + return 1 from vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_talker import ( Qwen3TTSTalkerForConditionalGeneration, ) @@ -791,6 +806,8 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non return self._validate_fish_tts_request(request) if self._tts_model_type == "cosyvoice3": return self._validate_cosyvoice3_request(request) + if self._tts_model_type == "voxcpm": + return self._validate_voxcpm_request(request) if self._tts_model_type == "voxcpm2": return None # VoxCPM2 accepts any text input return self._validate_qwen_tts_request(request) @@ -832,6 +849,43 @@ def _validate_voxtral_tts_request(self, request: OpenAICreateSpeechRequest) -> s return None + def _validate_voxcpm_request(self, request: OpenAICreateSpeechRequest) -> str | None: + """Validate VoxCPM request parameters. 
Returns error message or None.""" + if not request.input or not request.input.strip(): + return "Input text cannot be empty" + + if request.voice is not None: + return "'voice' is not supported for VoxCPM" + if request.instructions is not None: + return "'instructions' is not supported for VoxCPM" + if request.language is not None: + return "'language' is not supported for VoxCPM" + if request.task_type not in (None, "Base"): + return "VoxCPM only supports plain TTS or voice cloning with ref_audio/ref_text" + if request.x_vector_only_mode is not None: + return "'x_vector_only_mode' is not supported for VoxCPM" + if request.speaker_embedding is not None: + return "'speaker_embedding' is not supported for VoxCPM" + if request.initial_codec_chunk_frames is not None: + return "'initial_codec_chunk_frames' is not supported for VoxCPM" + + if request.ref_audio is not None: + fmt_err = self._validate_ref_audio_format(request.ref_audio) + if fmt_err: + return fmt_err + if not request.ref_text or not request.ref_text.strip(): + return "Voice cloning requires 'ref_text' (transcript of the reference audio)" + elif request.ref_text is not None: + return "'ref_text' requires 'ref_audio' for VoxCPM voice cloning" + + if request.max_new_tokens is not None: + if request.max_new_tokens < _TTS_MAX_NEW_TOKENS_MIN: + return f"max_new_tokens must be at least {_TTS_MAX_NEW_TOKENS_MIN}" + if request.max_new_tokens > _TTS_MAX_NEW_TOKENS_MAX: + return f"max_new_tokens cannot exceed {_TTS_MAX_NEW_TOKENS_MAX}" + + return None + def _validate_qwen_tts_request(self, request: OpenAICreateSpeechRequest) -> str | None: """Validate Qwen TTS request parameters. Returns error message or None.""" # Infer Base task when ref_audio or ref_text is provided without explicit task_type. @@ -1169,6 +1223,18 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any Processes each parameter if present, skips if not. Values are wrapped in lists as required by the model. 
""" + if self._tts_model_type == "voxcpm": + params: dict[str, Any] = { + "text": [request.input], + "cfg_value": [2.0], + "inference_timesteps": [10], + "min_len": [2], + "max_new_tokens": [request.max_new_tokens or 4096], + } + if request.ref_text is not None: + params["ref_text"] = [request.ref_text] + return params + params: dict[str, Any] = {} # Text content (always required) @@ -1499,6 +1565,8 @@ async def _prepare_speech_generation( model_type = "voxtral_tts" elif self._tts_model_type == "cosyvoice3": model_type = "cosyvoice3" + elif self._tts_model_type == "voxcpm": + model_type = "voxcpm" elif self._tts_model_type == "voxcpm2": model_type = "voxcpm2" elif self._is_tts: diff --git a/vllm_omni/model_executor/models/registry.py b/vllm_omni/model_executor/models/registry.py index 0894088005..3407b42869 100644 --- a/vllm_omni/model_executor/models/registry.py +++ b/vllm_omni/model_executor/models/registry.py @@ -145,6 +145,12 @@ "fish_speech_dac_decoder", "FishSpeechDACDecoder", ), + ## VoxCPM + "VoxCPMForConditionalGeneration": ( + "voxcpm", + "voxcpm", + "VoxCPMForConditionalGeneration", + ), ## VoxCPM2 "VoxCPM2TalkerForConditionalGeneration": ( "voxcpm2", diff --git a/vllm_omni/model_executor/models/voxcpm/__init__.py b/vllm_omni/model_executor/models/voxcpm/__init__.py new file mode 100644 index 0000000000..3b064c0f68 --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm/__init__.py @@ -0,0 +1,7 @@ +from .configuration_voxcpm import VoxCPMConfig +from .voxcpm import VoxCPMForConditionalGeneration + +__all__ = [ + "VoxCPMConfig", + "VoxCPMForConditionalGeneration", +] diff --git a/vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py b/vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py new file mode 100644 index 0000000000..ce1d809bd3 --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py @@ -0,0 +1,3 @@ +from vllm_omni.transformers_utils.configs.voxcpm import VoxCPMConfig + +__all__ = ["VoxCPMConfig"] diff --git a/vllm_omni/model_executor/models/voxcpm/voxcpm.py b/vllm_omni/model_executor/models/voxcpm/voxcpm.py new file mode 100644 index 0000000000..6fa36fc420 --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm/voxcpm.py @@ -0,0 +1,886 @@ +from __future__ import annotations + +import json +import os +import sys +import tempfile +import warnings +import wave +from collections.abc import Callable, Generator, Iterable +from pathlib import Path +from typing import Any + +import numpy as np +import torch +import torch.nn as nn +from einops import rearrange +from tqdm import tqdm +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.sequence import IntermediateTensors + +from vllm_omni.model_executor.models.output_templates import OmniOutput + +from .voxcpm_loader import ( + _build_prompt_cache_with_soundfile, + _device_to_string, + _force_cuda_available_for_npu, + _import_voxcpm_audio_vae_classes, + _import_voxcpm_base_model_class, + _is_torchcodec_load_error, + _normalize_dtype_name, + _prepare_runtime_model_dir, + _resolve_runtime_device, +) +from .voxcpm_runtime_utils import resolve_voxcpm_model_dir +from .voxcpm_stage_wrappers import _DirectVoxCPMAudioVAE, _DirectVoxCPMLatentGenerator + +logger = init_logger(__name__) +_VOXCPM_LATENT_MAGIC = 131071 + + +def _make_voxcpm_model_for_omni(base: type[Any]) -> type[Any]: + """Subclass upstream VoxCPMModel: local ``_inference`` + ``latents_only`` prompt-cache generation.""" + + from voxcpm.model.utils import get_dtype + + class 
VoxCPMModelForOmni(base): + @torch.inference_mode() + def build_prompt_cache(self, *args: Any, **kwargs: Any): + try: + return super().build_prompt_cache(*args, **kwargs) + except (ImportError, ModuleNotFoundError, RuntimeError) as exc: + if not _is_torchcodec_load_error(exc): + raise + return _build_prompt_cache_with_soundfile(self, *args, **kwargs) + + @torch.inference_mode() + def _inference( + self, + text: torch.Tensor, + text_mask: torch.Tensor, + feat: torch.Tensor, + feat_mask: torch.Tensor, + min_len: int = 2, + max_len: int = 2000, + inference_timesteps: int = 10, + cfg_value: float = 2.0, + streaming: bool = False, + streaming_prefix_len: int = 3, + ) -> Generator[tuple[torch.Tensor, torch.Tensor | list[torch.Tensor]], None, None]: + B, _, _, _ = feat.shape + + feat_embed = self.feat_encoder(feat) + feat_embed = self.enc_to_lm_proj(feat_embed) + + scale_emb = self.config.lm_config.scale_emb if self.config.lm_config.use_mup else 1.0 + text_embed = self.base_lm.embed_tokens(text) * scale_emb + combined_embed = text_mask.unsqueeze(-1) * text_embed + feat_mask.unsqueeze(-1) * feat_embed + + prefix_feat_cond = feat[:, -1, ...] + pred_feat_seq: list[torch.Tensor] = [] + + audio_patch_count = int(feat_mask.sum().item()) + if audio_patch_count > 0: + context_len = min(streaming_prefix_len - 1, audio_patch_count) + prompt_context_patches = list(feat[:, -context_len:, :, :].split(1, dim=1)) + pred_feat_seq = prompt_context_patches + pred_feat_seq + + enc_outputs, kv_cache_tuple = self.base_lm( + inputs_embeds=combined_embed, + is_causal=True, + ) + self.base_lm.kv_cache.fill_caches(kv_cache_tuple) + + enc_outputs = self.fsq_layer(enc_outputs) * feat_mask.unsqueeze(-1) + enc_outputs * text_mask.unsqueeze(-1) + lm_hidden = enc_outputs[:, -1, :] + + residual_enc_outputs, residual_kv_cache_tuple = self.residual_lm( + inputs_embeds=enc_outputs + feat_mask.unsqueeze(-1) * feat_embed, + is_causal=True, + ) + self.residual_lm.kv_cache.fill_caches(residual_kv_cache_tuple) + residual_hidden = residual_enc_outputs[:, -1, :] + + for step_idx in tqdm(range(max_len)): + dit_hidden = self.lm_to_dit_proj(lm_hidden) + self.res_to_dit_proj(residual_hidden) + pred_feat = self.feat_decoder( + mu=dit_hidden, + patch_size=self.patch_size, + cond=prefix_feat_cond.transpose(1, 2).contiguous(), + n_timesteps=inference_timesteps, + cfg_value=cfg_value, + ).transpose(1, 2) + + curr_embed = self.enc_to_lm_proj(self.feat_encoder(pred_feat.unsqueeze(1))) + pred_feat_seq.append(pred_feat.unsqueeze(1)) + prefix_feat_cond = pred_feat + + if streaming: + pred_feat_chunk = torch.cat(pred_feat_seq[-streaming_prefix_len:], dim=1) + feat_pred = rearrange(pred_feat_chunk, "b t p d -> b d (t p)", b=B, p=self.patch_size) + yield feat_pred, pred_feat_seq + + stop_flag = self.stop_head(self.stop_actn(self.stop_proj(lm_hidden))).argmax(dim=-1)[0].cpu().item() + if step_idx > min_len and stop_flag == 1: + break + + lm_hidden = self.base_lm.forward_step( + curr_embed[:, 0, :], + torch.tensor([self.base_lm.kv_cache.step()], device=curr_embed.device), + ).clone() + lm_hidden = self.fsq_layer(lm_hidden) + residual_hidden = self.residual_lm.forward_step( + lm_hidden + curr_embed[:, 0, :], + torch.tensor([self.residual_lm.kv_cache.step()], device=curr_embed.device), + ).clone() + + if not streaming: + pred_feat_seq_cat = torch.cat(pred_feat_seq, dim=1) + feat_pred = rearrange(pred_feat_seq_cat, "b t p d -> b d (t p)", b=B, p=self.patch_size) + yield feat_pred, pred_feat_seq_cat.squeeze(0).cpu() + + @torch.inference_mode() + def 
generate_latents_with_prompt_cache( + self, + target_text: str, + prompt_cache: dict, + min_len: int = 2, + max_len: int = 2000, + inference_timesteps: int = 10, + cfg_value: float = 2.0, + retry_badcase: bool = False, + retry_badcase_max_times: int = 3, + retry_badcase_ratio_threshold: float = 6.0, + streaming_prefix_len: int = 3, + ) -> tuple[None, torch.Tensor, torch.Tensor]: + return next( + self._generate_with_prompt_cache( + target_text=target_text, + prompt_cache=prompt_cache, + min_len=min_len, + max_len=max_len, + inference_timesteps=inference_timesteps, + cfg_value=cfg_value, + retry_badcase=retry_badcase, + retry_badcase_max_times=retry_badcase_max_times, + retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, + streaming=False, + streaming_prefix_len=streaming_prefix_len, + latents_only=True, + ) + ) + + @torch.inference_mode() + def generate_latents_with_prompt_cache_streaming( + self, + target_text: str, + prompt_cache: dict, + min_len: int = 2, + max_len: int = 2000, + inference_timesteps: int = 10, + cfg_value: float = 2.0, + retry_badcase: bool = False, + retry_badcase_max_times: int = 3, + retry_badcase_ratio_threshold: float = 6.0, + streaming_prefix_len: int = 3, + ) -> Generator[tuple[None, torch.Tensor, torch.Tensor], None, None]: + return self._generate_with_prompt_cache( + target_text=target_text, + prompt_cache=prompt_cache, + min_len=min_len, + max_len=max_len, + inference_timesteps=inference_timesteps, + cfg_value=cfg_value, + retry_badcase=retry_badcase, + retry_badcase_max_times=retry_badcase_max_times, + retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, + streaming=True, + streaming_prefix_len=streaming_prefix_len, + latents_only=True, + ) + + @torch.inference_mode() + def _generate_with_prompt_cache( + self, + target_text: str, + prompt_cache: dict, + min_len: int = 2, + max_len: int = 2000, + inference_timesteps: int = 10, + cfg_value: float = 2.0, + retry_badcase: bool = False, + retry_badcase_max_times: int = 3, + retry_badcase_ratio_threshold: float = 6.0, + streaming: bool = False, + streaming_prefix_len: int = 3, + latents_only: bool = False, + ) -> Generator[tuple[torch.Tensor | None, torch.Tensor, torch.Tensor | list[torch.Tensor]], None, None]: + if retry_badcase and streaming: + warnings.warn("Retry on bad cases is not supported in streaming mode, setting retry_badcase=False.") + retry_badcase = False + if prompt_cache is None: + prompt_audio_feat = torch.empty((0, self.patch_size, self.audio_vae.latent_dim), dtype=torch.float32) + text = target_text + else: + prompt_audio_feat = prompt_cache["audio_feat"] + prompt_text = prompt_cache["prompt_text"] + text = prompt_text + target_text + + text_token = torch.LongTensor(self.text_tokenizer(text)) + text_token = torch.cat( + [ + text_token, + torch.tensor([self.audio_start_token], dtype=torch.int32, device=text_token.device), + ], + dim=-1, + ) + target_text_token = torch.LongTensor(self.text_tokenizer(target_text)) + + audio_length = prompt_audio_feat.size(0) + text_length = text_token.shape[0] + text_pad_token = torch.zeros(audio_length, dtype=torch.int32, device=text_token.device) + audio_pad_feat = torch.zeros( + (text_token.shape[0], self.patch_size, self.audio_vae.latent_dim), + dtype=torch.float32, + device=text_token.device, + ) + text_token = torch.cat([text_token, text_pad_token]) + audio_feat = torch.cat([audio_pad_feat, prompt_audio_feat], dim=0) + text_mask = ( + torch.cat([torch.ones(text_length), torch.zeros(audio_length)]).type(torch.int32).to(text_token.device) + ) + 
audio_mask = ( + torch.cat([torch.zeros(text_length), torch.ones(audio_length)]).type(torch.int32).to(text_token.device) + ) + + text_token = text_token.unsqueeze(0).to(self.device) + text_mask = text_mask.unsqueeze(0).to(self.device) + audio_feat = audio_feat.unsqueeze(0).to(self.device).to(get_dtype(self.config.dtype)) + audio_mask = audio_mask.unsqueeze(0).to(self.device) + + target_text_length = len(self.text_tokenizer(target_text)) + retry_badcase_times = 0 + while retry_badcase_times < retry_badcase_max_times: + inference_result = self._inference( + text_token, + text_mask, + audio_feat, + audio_mask, + min_len=min_len, + max_len=min(int(target_text_length * retry_badcase_ratio_threshold + 10), max_len), + inference_timesteps=inference_timesteps, + cfg_value=cfg_value, + streaming=streaming, + streaming_prefix_len=streaming_prefix_len, + ) + if streaming: + patch_len = self.patch_size * self.chunk_size + for latent_pred, pred_audio_feat in inference_result: + if latents_only: + decode_audio = None + yield (decode_audio, target_text_token, latent_pred) + else: + decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)) + decode_audio = decode_audio[..., -patch_len:].squeeze(1).cpu() + yield (decode_audio, target_text_token, pred_audio_feat) + break + + latent_pred, pred_audio_feat = next(inference_result) + if retry_badcase and pred_audio_feat.shape[0] >= target_text_length * retry_badcase_ratio_threshold: + ratio = pred_audio_feat.shape[0] / target_text_length + print(f" Badcase detected, audio_text_ratio={ratio}, retrying...", file=sys.stderr) + retry_badcase_times += 1 + continue + break + + if not streaming: + if latents_only: + decode_audio = None + else: + decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32)) + patch_len = self.patch_size * self.chunk_size + if audio_mask.sum().item() > 0: + decode_audio = decode_audio[..., patch_len * (streaming_prefix_len - 1) :].squeeze(1).cpu() + else: + decode_audio = decode_audio[..., :].squeeze(1).cpu() + yield (decode_audio, target_text_token, pred_audio_feat) + + VoxCPMModelForOmni.__name__ = "VoxCPMModelForOmni" + VoxCPMModelForOmni.__qualname__ = "VoxCPMModelForOmni" + return VoxCPMModelForOmni + + +def _import_voxcpm_model_class() -> type[Any]: + base = _import_voxcpm_base_model_class() + return _make_voxcpm_model_for_omni(base) + + +def _load_native_voxcpm_model( + model_path: str, + *, + device: torch.device, + dtype: str | None, +): + VoxCPMModel = _import_voxcpm_model_class() + model_dir = resolve_voxcpm_model_dir(model_path) + runtime_model_path = _prepare_runtime_model_dir(model_dir, target_device=device, target_dtype=dtype) + + if device.type == "npu" and hasattr(torch, "npu"): + torch.npu.set_device(device) + + with _force_cuda_available_for_npu(device): + return VoxCPMModel.from_local( + runtime_model_path, + optimize=device.type == "cuda", + ) + + +def _load_native_voxcpm_latent_generator( + model_path: str, + *, + device: torch.device, + dtype: str | None, +) -> _DirectVoxCPMLatentGenerator: + return _DirectVoxCPMLatentGenerator(_load_native_voxcpm_model(model_path, device=device, dtype=dtype)) + + +def _load_native_voxcpm_audio_vae( + model_path: str, + *, + device: torch.device, +) -> _DirectVoxCPMAudioVAE: + AudioVAE, AudioVAEConfig = _import_voxcpm_audio_vae_classes() + model_dir = resolve_voxcpm_model_dir(model_path) + runtime_model_path = _prepare_runtime_model_dir(model_dir, target_device=device, target_dtype="float32") + config_dict = json.loads((Path(runtime_model_path) / 
"config.json").read_text()) + audio_vae_config = config_dict.get("audio_vae_config") + audio_vae = AudioVAE(config=AudioVAEConfig(**audio_vae_config)) if audio_vae_config is not None else AudioVAE() + + state_dict = torch.load( + Path(runtime_model_path) / "audiovae.pth", + map_location="cpu", + weights_only=True, + )["state_dict"] + audio_vae.load_state_dict(state_dict, strict=True) + audio_vae = audio_vae.to(device=device, dtype=torch.float32).eval() + if device.type == "npu" and hasattr(torch, "npu"): + torch.npu.set_device(device) + patch_size = int(config_dict.get("patch_size", 2)) + return _DirectVoxCPMAudioVAE(audio_vae, patch_size=patch_size) + + +class VoxCPMForConditionalGeneration(nn.Module): + input_modalities = "audio" + _LATENT_STAGES = {"latent_generator", "latent", "ar_dit"} + _VAE_STAGES = {"vae", "audio_vae"} + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + del prefix + self.vllm_config = vllm_config + self.model_path = vllm_config.model_config.model + self.model_stage = getattr(vllm_config.model_config, "model_stage", "latent_generator") + self.have_multimodal_outputs = True + self.has_preprocess = False + self.has_postprocess = False + self.enable_update_additional_information = True + self.requires_raw_input_tokens = True + self.inject_omni_request_id_into_runtime_info = True + self._pipeline = None + self._latent_stream_gens: dict[str, Any] = {} + self._latent_stream_terminal_pending: dict[str, int] = {} + self._latent_stream_completed: set[str] = set() + self._next_local_stream_key = 0 + self._ar_emit_stop_token = True + + def _runner_hidden_device_dtype(self) -> tuple[torch.device, torch.dtype]: + device = _resolve_runtime_device(self.vllm_config) + model_config = getattr(self.vllm_config, "model_config", None) + dtype = getattr(model_config, "dtype", torch.float32) if model_config is not None else torch.float32 + return device, dtype + + def _ensure_model_loaded(self): + if self._pipeline is not None: + return + + target_device = _resolve_runtime_device(self.vllm_config) + model_dtype = getattr(self.vllm_config.model_config, "dtype", None) + normalized_dtype = _normalize_dtype_name(model_dtype) + if self.model_stage in self._LATENT_STAGES: + self._pipeline = _load_native_voxcpm_latent_generator( + self.model_path, + device=target_device, + dtype=normalized_dtype, + ) + elif self.model_stage in self._VAE_STAGES: + self._pipeline = _load_native_voxcpm_audio_vae( + self.model_path, + device=target_device, + ) + else: + raise ValueError( + f"Unsupported VoxCPM model_stage: {self.model_stage}. " + "pure_voxcpm only supports split-stage latent_generator/vae inference." 
+ ) + + logger.info("Loaded VoxCPM stage '%s' on %s", self.model_stage, _device_to_string(target_device)) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + del weights + self._ensure_model_loaded() + return set() + + @staticmethod + def _extract_val(info: dict[str, Any], key: str, default: Any) -> Any: + value = info.get(key, default) + if isinstance(value, list): + return value[0] if value else default + return value + + def _resolve_stream_request_key(self, info: dict[str, Any]) -> str: + request_key = info.get("__voxcpm_stream_key") + if request_key is not None: + return str(request_key) + + request_key = info.get("_omni_req_id") + if request_key is not None: + request_key = str(request_key) + info["__voxcpm_stream_key"] = request_key + return request_key + + request_key = f"voxcpm-local-{self._next_local_stream_key}" + self._next_local_stream_key += 1 + info["__voxcpm_stream_key"] = request_key + return str(request_key) + + def _recover_latent_from_input_ids(self, input_ids: torch.Tensor | None) -> torch.Tensor | None: + if input_ids is None or input_ids.numel() == 0: + return None + flat_ids = input_ids.detach().reshape(-1).to("cpu") + if flat_ids.numel() < 4 or int(flat_ids[0].item()) != _VOXCPM_LATENT_MAGIC: + return None + latent_dim = int(flat_ids[1].item()) + time_dim = int(flat_ids[2].item()) + payload = flat_ids[3:] + expected = latent_dim * time_dim + if latent_dim <= 0 or time_dim <= 0: + raise ValueError(f"Invalid VoxCPM latent header: latent_dim={latent_dim}, time_dim={time_dim}") + if int(payload.numel()) != expected: + raise ValueError( + "Invalid VoxCPM latent payload size: " + f"expected={expected}, actual={int(payload.numel())}, " + f"latent_dim={latent_dim}, time_dim={time_dim}" + ) + packed = payload.to(dtype=torch.int32).to(torch.uint16) + return packed.view(torch.bfloat16).to(torch.float32).reshape(1, latent_dim, time_dim) + + def _maybe_recover_vae_infos( + self, + infos: list[dict[str, Any]], + input_ids: torch.Tensor | None, + *, + async_chunk: bool, + ) -> list[dict[str, Any]]: + if not async_chunk: + return infos + if any(self._extract_val(info, "latent_audio_feat", None) is not None for info in infos): + return infos + recovered = self._recover_latent_from_input_ids(input_ids) + if recovered is None: + return infos + return [{"latent_audio_feat": recovered}] + + @staticmethod + def _normalize_audio_samples(samples: Any) -> np.ndarray: + if isinstance(samples, torch.Tensor): + return samples.detach().cpu().float().reshape(-1).numpy() + return np.asarray(samples, dtype=np.float32).reshape(-1) + + @classmethod + def _normalize_ref_audio(cls, ref_audio: Any) -> tuple[np.ndarray, int]: + if isinstance(ref_audio, str): + raise TypeError("String ref_audio should be handled as a path before waveform normalization.") + + if isinstance(ref_audio, dict): + sample_rate = ref_audio.get("sample_rate") or ref_audio.get("sampling_rate") or ref_audio.get("sr") + samples = None + for key in ("audio", "wav", "samples", "array", "waveform"): + if key in ref_audio and ref_audio[key] is not None: + samples = ref_audio[key] + break + if sample_rate is None or samples is None: + raise ValueError("ref_audio dict must contain waveform data and sample rate.") + return cls._normalize_audio_samples(samples), int(sample_rate) + + if isinstance(ref_audio, (list, tuple)): + if len(ref_audio) == 1: + return cls._normalize_ref_audio(ref_audio[0]) + if len(ref_audio) == 2 and np.isscalar(ref_audio[1]): + return cls._normalize_audio_samples(ref_audio[0]), 
int(ref_audio[1]) + + raise TypeError(f"Unsupported ref_audio format: {type(ref_audio)!r}") + + @staticmethod + def _write_temp_prompt_wav(waveform: np.ndarray, sample_rate: int) -> str: + prompt_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav") + prompt_file.close() + + wav = np.asarray(waveform, dtype=np.float32).reshape(-1) + wav = np.clip(wav, -1.0, 1.0) + pcm16 = (wav * 32767.0).astype(np.int16) + with wave.open(prompt_file.name, "wb") as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(int(sample_rate)) + wav_file.writeframes(pcm16.tobytes()) + + return prompt_file.name + + @classmethod + def _resolve_prompt_inputs(cls, info: dict[str, Any]) -> tuple[str | None, str | None, str | None]: + prompt_text = cls._extract_val(info, "prompt_text", None) + prompt_wav_path = cls._extract_val(info, "prompt_wav_path", None) + if prompt_wav_path: + if prompt_text is None: + prompt_text = cls._extract_val(info, "ref_text", None) + return prompt_wav_path, prompt_text, None + + ref_audio = cls._extract_val(info, "ref_audio", None) + ref_text = cls._extract_val(info, "ref_text", None) + if ref_audio is None or ref_text is None: + return None, None, None + if isinstance(ref_audio, str): + return ref_audio, ref_text, None + + waveform, sample_rate = cls._normalize_ref_audio(ref_audio) + temp_prompt_wav = cls._write_temp_prompt_wav(waveform, sample_rate) + return temp_prompt_wav, ref_text, temp_prompt_wav + + def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor: + if input_ids.numel() == 0: + return torch.empty((0, 1), device=input_ids.device, dtype=torch.float32) + return torch.zeros((input_ids.shape[0], 1), device=input_ids.device, dtype=torch.float32) + + def _get_vocab_size(self) -> int: + model_config = getattr(self.vllm_config, "model_config", None) + if model_config is not None: + getter = getattr(model_config, "get_vocab_size", None) + if callable(getter): + try: + return int(getter()) + except Exception: + pass + hf_config = getattr(model_config, "hf_text_config", None) + if hf_config is not None and hasattr(hf_config, "vocab_size"): + return int(hf_config.vocab_size) + return 32000 + + def _make_empty_output( + self, + *, + output_key: str, + payload_factory: Callable[[], torch.Tensor], + infos: list[dict[str, Any]], + sample_rate: int, + out_device: torch.device, + out_dtype: torch.dtype, + hidden_rows: int | None = None, + ) -> OmniOutput: + if hidden_rows is None: + hidden_rows = len(infos) + return OmniOutput( + text_hidden_states=torch.zeros((hidden_rows, 1), device=out_device, dtype=out_dtype), + multimodal_outputs={ + output_key: [payload_factory() for _ in infos], + "sr": [torch.tensor(sample_rate, dtype=torch.int32) for _ in infos], + }, + ) + + def _finalize_stage_output( + self, + *, + output_key: str, + outputs: list[torch.Tensor], + sample_rates: list[torch.Tensor], + out_device: torch.device, + out_dtype: torch.dtype, + hidden_rows: int | None = None, + ) -> OmniOutput: + multimodal_outputs: dict[str, Any] = {output_key: outputs, "sr": sample_rates} + if hidden_rows is not None: + text_hidden_states = torch.zeros((hidden_rows, 1), device=out_device, dtype=out_dtype) + elif outputs: + outputs_tensor = torch.stack(outputs) + text_hidden_states = ( + outputs_tensor.unsqueeze(-1) + if outputs_tensor.ndim == 1 + else outputs_tensor.reshape(-1, outputs_tensor.shape[-1]) + ) + else: + text_hidden_states = torch.zeros((0, 1), device=out_device, dtype=out_dtype) + text_hidden_states = 
text_hidden_states.to(device=out_device, dtype=out_dtype) + return OmniOutput( + text_hidden_states=text_hidden_states, + multimodal_outputs=multimodal_outputs, + ) + + def _forward_vae_stage( + self, + infos: list[dict[str, Any]], + *, + sample_rate: int, + async_chunk: bool, + out_device: torch.device, + out_dtype: torch.dtype, + ) -> OmniOutput: + if all(self._extract_val(info, "latent_audio_feat", None) is None for info in infos): + self._ar_emit_stop_token = True + return self._make_empty_output( + output_key="model_outputs", + payload_factory=lambda: torch.zeros((0,), dtype=torch.float32), + infos=infos, + sample_rate=sample_rate, + out_device=out_device, + out_dtype=out_dtype, + ) + + outputs: list[torch.Tensor] = [] + sample_rates: list[torch.Tensor] = [] + for info in infos: + latent_audio_feat = self._extract_val(info, "latent_audio_feat", None) + audio_tensor = self._pipeline.decode(latent_audio_feat, trim_streaming_patch=async_chunk) + outputs.append(audio_tensor.float().cpu()) + sample_rates.append(torch.tensor(sample_rate, dtype=torch.int32)) + + self._ar_emit_stop_token = True + return self._finalize_stage_output( + output_key="model_outputs", + outputs=outputs, + sample_rates=sample_rates, + out_device=out_device, + out_dtype=out_dtype, + ) + + def _forward_latent_stage( + self, + infos: list[dict[str, Any]], + *, + sample_rate: int, + async_chunk: bool, + out_device: torch.device, + out_dtype: torch.dtype, + hidden_rows: int, + ) -> OmniOutput: + texts = [self._extract_val(info, "text", "") for info in infos] + if all(not text for text in texts): + self._ar_emit_stop_token = True + return self._make_empty_output( + output_key="latent_audio_feat", + payload_factory=lambda: torch.zeros((0,), dtype=torch.float32), + infos=infos, + sample_rate=sample_rate, + out_device=out_device, + out_dtype=out_dtype, + hidden_rows=hidden_rows, + ) + + outputs: list[torch.Tensor] = [] + sample_rates: list[torch.Tensor] = [] + last_chunk_flags: list[bool] | None = [] if async_chunk else None + payload_finished_flags: list[bool] | None = [] if async_chunk else None + for info in infos: + text = self._extract_val(info, "text", "") + cfg_value = float(self._extract_val(info, "cfg_value", 2.0)) + inference_timesteps = int(self._extract_val(info, "inference_timesteps", 10)) + min_len = int(self._extract_val(info, "min_len", 2)) + max_len = int(self._extract_val(info, "max_len", self._extract_val(info, "max_new_tokens", 4096))) + retry_badcase = bool(self._extract_val(info, "retry_badcase", True)) + retry_badcase_max_times = int(self._extract_val(info, "retry_badcase_max_times", 3)) + retry_badcase_ratio_threshold = float(self._extract_val(info, "retry_badcase_ratio_threshold", 6.0)) + streaming_prefix_len = int(self._extract_val(info, "streaming_prefix_len", 3)) + + request_key = self._resolve_stream_request_key(info) + created_temp: str | None = None + + if async_chunk: + terminal_pending = self._latent_stream_terminal_pending.get(request_key, 0) + if terminal_pending > 0: + outputs.append(torch.zeros((0,), dtype=torch.float32)) + assert last_chunk_flags is not None + last_chunk_flags.append(True) + assert payload_finished_flags is not None + payload_finished_flags.append(terminal_pending == 1) + if terminal_pending == 1: + self._latent_stream_terminal_pending.pop(request_key, None) + else: + self._latent_stream_terminal_pending[request_key] = terminal_pending - 1 + sample_rates.append(torch.tensor(sample_rate, dtype=torch.int32)) + continue + + if request_key in self._latent_stream_completed: + 
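+                    # Stream already exhausted for this request: emit an empty placeholder
+                    # chunk flagged as last; the terminal "finished" marker was already
+                    # handled via _latent_stream_terminal_pending above.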
outputs.append(torch.zeros((0,), dtype=torch.float32)) + assert last_chunk_flags is not None + last_chunk_flags.append(True) + assert payload_finished_flags is not None + payload_finished_flags.append(False) + sample_rates.append(torch.tensor(sample_rate, dtype=torch.int32)) + continue + + if request_key not in self._latent_stream_gens: + prompt_wav_path, prompt_text, temp_prompt_wav = self._resolve_prompt_inputs(info) + created_temp = temp_prompt_wav + self._latent_stream_gens[request_key] = self._pipeline.iter_latent_chunks_streaming( + text=text, + prompt_wav_path=prompt_wav_path, + prompt_text=prompt_text, + cfg_value=cfg_value, + inference_timesteps=inference_timesteps, + min_len=min_len, + max_len=max_len, + streaming_prefix_len=streaming_prefix_len, + retry_badcase=False, + retry_badcase_max_times=retry_badcase_max_times, + retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, + ) + generator = self._latent_stream_gens[request_key] + try: + chunk_latent, is_last = next(generator) + except StopIteration: + self._latent_stream_gens.pop(request_key, None) + self._latent_stream_terminal_pending[request_key] = 1 + self._latent_stream_completed.add(request_key) + outputs.append(torch.zeros((0,), dtype=torch.float32)) + assert last_chunk_flags is not None + last_chunk_flags.append(True) + assert payload_finished_flags is not None + payload_finished_flags.append(True) + else: + if is_last: + self._latent_stream_gens.pop(request_key, None) + self._latent_stream_terminal_pending[request_key] = 1 + self._latent_stream_completed.add(request_key) + outputs.append(chunk_latent.detach().float().cpu()) + assert last_chunk_flags is not None + last_chunk_flags.append(bool(is_last)) + assert payload_finished_flags is not None + payload_finished_flags.append(False) + finally: + if created_temp is not None and os.path.exists(created_temp): + os.unlink(created_temp) + sample_rates.append(torch.tensor(sample_rate, dtype=torch.int32)) + continue + + prompt_wav_path, prompt_text, temp_prompt_wav = self._resolve_prompt_inputs(info) + try: + latent_audio_feat = self._pipeline.generate_latents( + text=text, + prompt_wav_path=prompt_wav_path, + prompt_text=prompt_text, + cfg_value=cfg_value, + inference_timesteps=inference_timesteps, + min_len=min_len, + max_len=max_len, + retry_badcase=retry_badcase, + retry_badcase_max_times=retry_badcase_max_times, + retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, + ) + outputs.append(latent_audio_feat.float().cpu()) + finally: + if temp_prompt_wav is not None and os.path.exists(temp_prompt_wav): + os.unlink(temp_prompt_wav) + + sample_rates.append(torch.tensor(sample_rate, dtype=torch.int32)) + + self._ar_emit_stop_token = all(last_chunk_flags) if async_chunk and last_chunk_flags else True + output = self._finalize_stage_output( + output_key="latent_audio_feat", + outputs=outputs, + sample_rates=sample_rates, + out_device=out_device, + out_dtype=out_dtype, + hidden_rows=hidden_rows, + ) + if async_chunk and payload_finished_flags is not None: + output.multimodal_outputs["finished"] = [ + torch.tensor(flag, dtype=torch.bool) for flag in payload_finished_flags + ] + return output + + def compute_logits(self, hidden_states: torch.Tensor | OmniOutput, sampling_metadata: Any = None) -> torch.Tensor: + del sampling_metadata + if isinstance(hidden_states, OmniOutput): + hidden_states = hidden_states.text_hidden_states + if hidden_states is None: + device, dtype = self._runner_hidden_device_dtype() + hidden_states = torch.zeros((0, 1), device=device, 
dtype=dtype) + if hidden_states.ndim == 1: + hidden_states = hidden_states.unsqueeze(-1) + elif hidden_states.ndim > 2: + hidden_states = hidden_states.reshape(-1, hidden_states.shape[-1]) + + vocab_size = self._get_vocab_size() + num_rows = int(hidden_states.shape[0]) + logits = torch.zeros((num_rows, vocab_size), dtype=torch.float32, device=hidden_states.device) + eos_id = 2 if vocab_size > 2 else 0 + safe_id = 1 if vocab_size > 1 and 1 != eos_id else 0 + emit_stop = getattr(self, "_ar_emit_stop_token", True) + if num_rows > 0: + if emit_stop: + logits[:, eos_id] = 1.0e6 + else: + logits[:, eos_id] = -1.0e9 + logits[:, safe_id] = 1.0e6 + return logits + + @torch.no_grad() + def forward( + self, + input_ids: torch.Tensor | None = None, + positions: torch.Tensor | None = None, + intermediate_tensors: Any = None, + inputs_embeds: torch.Tensor | None = None, + runtime_additional_information: list[dict[str, Any]] | None = None, + model_intermediate_buffer: list[dict[str, Any]] | None = None, + **kwargs: Any, + ) -> OmniOutput: + del positions, intermediate_tensors, inputs_embeds, kwargs + self._ensure_model_loaded() + out_device, out_dtype = self._runner_hidden_device_dtype() + if input_ids is not None and input_ids.device.type == out_device.type: + out_device = input_ids.device + + infos = model_intermediate_buffer or runtime_additional_information or [{}] + hidden_rows = len(infos) + if input_ids is not None and len(input_ids.shape) > 0: + hidden_rows = max(hidden_rows, int(input_ids.shape[0])) + sample_rate = int(getattr(self._pipeline, "sample_rate", 24000)) + async_chunk = bool(getattr(self.vllm_config.model_config, "async_chunk", False)) + if self.model_stage in self._VAE_STAGES: + infos = self._maybe_recover_vae_infos(infos, input_ids, async_chunk=async_chunk) + return self._forward_vae_stage( + infos, + sample_rate=sample_rate, + async_chunk=async_chunk, + out_device=out_device, + out_dtype=out_dtype, + ) + if self.model_stage in self._LATENT_STAGES: + return self._forward_latent_stage( + infos, + sample_rate=sample_rate, + async_chunk=async_chunk, + out_device=out_device, + out_dtype=out_dtype, + hidden_rows=hidden_rows, + ) + raise ValueError(f"Unsupported VoxCPM model_stage at runtime: {self.model_stage}") + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, device: torch.device + ) -> IntermediateTensors: + del batch_size, dtype, device + return {} + + +__all__ = ["VoxCPMForConditionalGeneration"] diff --git a/vllm_omni/model_executor/models/voxcpm/voxcpm_loader.py b/vllm_omni/model_executor/models/voxcpm/voxcpm_loader.py new file mode 100644 index 0000000000..dac7117cad --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm/voxcpm_loader.py @@ -0,0 +1,247 @@ +from __future__ import annotations + +import importlib +import json +import os +import shutil +import sys +import tempfile +from contextlib import contextmanager +from hashlib import sha256 +from pathlib import Path +from typing import Any +from unittest.mock import patch + +import numpy as np +import torch +from vllm.config import VllmConfig +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def _iter_voxcpm_src_candidates() -> list[Path]: + candidates: list[Path] = [] + env_path = os.environ.get("VLLM_OMNI_VOXCPM_CODE_PATH") + if env_path: + candidates.append(Path(env_path).expanduser()) + + repo_root = Path(__file__).resolve().parents[4] + candidates.append(repo_root.parent / "VoxCPM" / "src") + + unique_candidates: list[Path] = [] + seen: set[str] = set() + 
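+    # De-duplicate while preserving priority order: the VLLM_OMNI_VOXCPM_CODE_PATH
+    # override is tried first, then the sibling VoxCPM/src checkout next to the repo.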
for candidate in candidates: + candidate_key = str(candidate) + if candidate_key in seen: + continue + seen.add(candidate_key) + unique_candidates.append(candidate) + return unique_candidates + + +def _prepend_voxcpm_src(candidate: Path) -> None: + candidate_str = str(candidate) + if candidate_str not in sys.path: + sys.path.insert(0, candidate_str) + + +def _import_voxcpm_attrs(module_name: str, *attr_names: str) -> tuple[Any, ...]: + last_exc: ImportError | None = None + for candidate in _iter_voxcpm_src_candidates(): + if not candidate.exists(): + continue + _prepend_voxcpm_src(candidate) + try: + module = importlib.import_module(module_name) + return tuple(getattr(module, attr_name) for attr_name in attr_names) + except ImportError as exc: + last_exc = exc + + try: + module = importlib.import_module(module_name) + return tuple(getattr(module, attr_name) for attr_name in attr_names) + except ImportError as exc: + last_exc = exc + + raise ImportError(f"Failed to import {module_name}.") from last_exc + + +def _import_voxcpm_base_model_class(): + """Import upstream ``VoxCPMModel`` from ``VoxCPM/src/voxcpm`` (env, sibling tree, or pip).""" + try: + (VoxCPMModel,) = _import_voxcpm_attrs("voxcpm.model.voxcpm", "VoxCPMModel") + return VoxCPMModel + except ImportError as exc: + raise ImportError( + "Failed to import VoxCPMModel. Install the `voxcpm` package or set " + "`VLLM_OMNI_VOXCPM_CODE_PATH` to the VoxCPM repository `src` directory " + "(the parent of the `voxcpm` package that contains `model/` and `modules/`)." + ) from exc + + +def _import_voxcpm_audio_vae_classes(): + try: + return _import_voxcpm_attrs("voxcpm.modules.audiovae", "AudioVAE", "AudioVAEConfig") + except ImportError as exc: + raise ImportError( + "Failed to import VoxCPM AudioVAE. Install the `voxcpm` package or set " + "`VLLM_OMNI_VOXCPM_CODE_PATH` to the VoxCPM repository `src` directory." 
+ ) from exc + + +def _device_to_string(device: torch.device) -> str: + if device.index is None: + return device.type + return f"{device.type}:{device.index}" + + +def _normalize_dtype_name(dtype: Any) -> str | None: + if dtype is None: + return None + if isinstance(dtype, torch.dtype): + mapping = { + torch.bfloat16: "bfloat16", + torch.float16: "float16", + torch.float32: "float32", + } + return mapping.get(dtype, str(dtype).removeprefix("torch.")) + dtype_str = str(dtype) + return dtype_str.removeprefix("torch.") + + +def _resolve_runtime_device(vllm_config: VllmConfig) -> torch.device: + try: + from vllm_omni.platforms import current_omni_platform + + return current_omni_platform.get_torch_device() + except Exception: + pass + + device = getattr(getattr(vllm_config, "device_config", None), "device", None) + if isinstance(device, torch.device): + return device + if device: + return torch.device(device) + return torch.device("cpu") + + +def _prepare_runtime_model_dir( + model_path: str | Path, + *, + target_device: torch.device, + target_dtype: str | None, +) -> str: + source_dir = Path(model_path) + config_path = source_dir / "config.json" + if not config_path.exists(): + return str(source_dir) + + config_text = config_path.read_text() + config_dict = json.loads(config_text) + desired_device = target_device.type + desired_dtype = target_dtype or config_dict.get("dtype") + + if config_dict.get("device") == desired_device and config_dict.get("dtype") == desired_dtype: + return str(source_dir) + + digest = sha256(f"{source_dir.resolve()}:{config_text}:{desired_device}:{desired_dtype}".encode()).hexdigest()[:16] + runtime_dir = Path(tempfile.gettempdir()) / "vllm_omni_voxcpm_runtime" / digest + runtime_dir.mkdir(parents=True, exist_ok=True) + + for entry in source_dir.iterdir(): + target = runtime_dir / entry.name + if entry.name == "config.json" or target.exists(): + continue + try: + target.symlink_to(entry, target_is_directory=entry.is_dir()) + except OSError as exc: + logger.warning( + "Falling back to copying VoxCPM runtime artifact %s into %s because symlink creation failed: %s", + entry, + runtime_dir, + exc, + ) + if entry.is_dir(): + shutil.copytree(entry, target, dirs_exist_ok=True) + else: + shutil.copy2(entry, target) + + patched_config = dict(config_dict) + patched_config["device"] = desired_device + if desired_dtype is not None: + patched_config["dtype"] = desired_dtype + (runtime_dir / "config.json").write_text(json.dumps(patched_config, indent=2, sort_keys=True)) + return str(runtime_dir) + + +@contextmanager +def _force_cuda_available_for_npu(device: torch.device): + if device.type != "npu": + yield + return + + with patch("torch.cuda.is_available", return_value=True): + yield + + +def _is_torchcodec_load_error(exc: BaseException) -> bool: + message = str(exc).lower() + return "torchcodec" in message or "load_with_torchcodec" in message + + +def _load_audio_with_soundfile( + prompt_wav_path: str, + *, + sample_rate: int, +) -> torch.Tensor: + try: + import soundfile as sf + except ImportError: + raise + + audio_np, source_sr = sf.read(prompt_wav_path, dtype="float32", always_2d=True) + audio = torch.from_numpy(np.ascontiguousarray(audio_np.T)) + + if audio.size(0) > 1: + audio = audio.mean(dim=0, keepdim=True) + + if int(source_sr) != int(sample_rate): + try: + import torchaudio + except ImportError as exc: + raise ImportError("torchaudio is required for resampling prompt audio.") from exc + audio = torchaudio.functional.resample(audio, int(source_sr), int(sample_rate)) + + 
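+    # Mono float32 waveform of shape (1, num_samples), resampled to `sample_rate` if needed.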
return audio + + +def _build_prompt_cache_with_soundfile(model: Any, *args: Any, **kwargs: Any) -> dict[str, Any]: + if args: + prompt_text = args[0] + prompt_wav_path = args[1] if len(args) > 1 else kwargs.get("prompt_wav_path") + else: + prompt_text = kwargs.get("prompt_text") + prompt_wav_path = kwargs.get("prompt_wav_path") + + if not prompt_text or not prompt_wav_path: + raise ValueError("prompt_text and prompt_wav_path are required") + + audio = _load_audio_with_soundfile(prompt_wav_path, sample_rate=int(model.sample_rate)) + + patch_len = model.patch_size * model.chunk_size + if audio.size(1) % patch_len != 0: + padding_size = patch_len - audio.size(1) % patch_len + audio = torch.nn.functional.pad(audio, (padding_size, 0)) + + audio_feat = model.audio_vae.encode(audio.to(model.device), model.sample_rate).cpu() + audio_feat = audio_feat.view( + model.audio_vae.latent_dim, + -1, + model.patch_size, + ).permute(1, 2, 0) + + return { + "prompt_text": prompt_text, + "audio_feat": audio_feat, + } diff --git a/vllm_omni/model_executor/models/voxcpm/voxcpm_runtime_utils.py b/vllm_omni/model_executor/models/voxcpm/voxcpm_runtime_utils.py new file mode 100644 index 0000000000..36b4282c2d --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm/voxcpm_runtime_utils.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +import json +import shutil +from pathlib import Path + + +def resolve_voxcpm_model_dir(model: str) -> Path: + model_path = Path(model).expanduser() + if model_path.exists(): + return model_path + + from huggingface_hub import snapshot_download + + return Path(snapshot_download(repo_id=model)) + + +def prepare_voxcpm_hf_config_dir(model_dir: str | Path, hf_config_dir: str | Path) -> Path: + model_dir = Path(model_dir).expanduser() + hf_config_dir = Path(hf_config_dir).expanduser() + hf_config_dir.mkdir(parents=True, exist_ok=True) + + source_config_path = model_dir / "config.json" + if not source_config_path.exists(): + raise FileNotFoundError(f"VoxCPM config.json not found under {model_dir}") + + config_path = hf_config_dir / "config.json" + shutil.copy2(source_config_path, config_path) + + source_generation_config_path = model_dir / "generation_config.json" + if source_generation_config_path.exists(): + shutil.copy2(source_generation_config_path, hf_config_dir / "generation_config.json") + + config_dict = json.loads(config_path.read_text(encoding="utf-8")) + config_dict["model_type"] = "voxcpm" + config_dict.setdefault("architectures", ["VoxCPMForConditionalGeneration"]) + config_path.write_text(json.dumps(config_dict, indent=2, ensure_ascii=False), encoding="utf-8") + return hf_config_dir + + +__all__ = [ + "prepare_voxcpm_hf_config_dir", + "resolve_voxcpm_model_dir", +] diff --git a/vllm_omni/model_executor/models/voxcpm/voxcpm_stage_wrappers.py b/vllm_omni/model_executor/models/voxcpm/voxcpm_stage_wrappers.py new file mode 100644 index 0000000000..f4446c796e --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm/voxcpm_stage_wrappers.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +import os +from collections.abc import Generator +from typing import Any + +import torch +import torch.nn as nn +from einops import rearrange + + +class _DirectVoxCPMLatentGenerator: + def __init__(self, tts_model: Any): + self.tts_model = tts_model + self.sample_rate = int(getattr(tts_model, "sample_rate", 24000)) + + def generate_latents( + self, + *, + text: str, + prompt_wav_path: str | None = None, + prompt_text: str | None = None, + cfg_value: float = 2.0, + 
inference_timesteps: int = 10, + min_len: int = 2, + max_len: int = 4096, + retry_badcase: bool = True, + retry_badcase_max_times: int = 3, + retry_badcase_ratio_threshold: float = 6.0, + ) -> torch.Tensor: + if not isinstance(text, str) or not text.strip(): + raise ValueError("target text must be a non-empty string") + if (prompt_wav_path is None) != (prompt_text is None): + raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None") + if prompt_wav_path is not None and not os.path.exists(prompt_wav_path): + raise FileNotFoundError(f"prompt_wav_path does not exist: {prompt_wav_path}") + + prompt_cache = None + if prompt_wav_path is not None and prompt_text is not None: + prompt_cache = self.tts_model.build_prompt_cache( + prompt_text=prompt_text, + prompt_wav_path=prompt_wav_path, + ) + + gen_kw = dict( + target_text=" ".join(text.split()), + prompt_cache=prompt_cache, + min_len=min_len, + max_len=max_len, + inference_timesteps=inference_timesteps, + cfg_value=cfg_value, + retry_badcase=retry_badcase, + retry_badcase_max_times=retry_badcase_max_times, + retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, + ) + latent_entry = getattr(self.tts_model, "generate_latents_with_prompt_cache", None) + if latent_entry is not None: + _, _, pred_audio_feat = latent_entry(**gen_kw) + else: + try: + _, _, pred_audio_feat = self.tts_model.generate_with_prompt_cache( + **gen_kw, + latents_only=True, + ) + except TypeError: + _, _, pred_audio_feat = self.tts_model.generate_with_prompt_cache(**gen_kw) + return pred_audio_feat.detach().cpu().to(torch.float32) + + def iter_latent_chunks_streaming( + self, + *, + text: str, + prompt_wav_path: str | None = None, + prompt_text: str | None = None, + cfg_value: float = 2.0, + inference_timesteps: int = 10, + min_len: int = 2, + max_len: int = 4096, + streaming_prefix_len: int = 3, + retry_badcase: bool = False, + retry_badcase_max_times: int = 3, + retry_badcase_ratio_threshold: float = 6.0, + ) -> Generator[tuple[torch.Tensor, bool], None, None]: + """Yield ``(latent_window, is_last_chunk)`` for Omni async_chunk latent to VAE.""" + if not isinstance(text, str) or not text.strip(): + raise ValueError("target text must be a non-empty string") + if (prompt_wav_path is None) != (prompt_text is None): + raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None") + if prompt_wav_path is not None and not os.path.exists(prompt_wav_path): + raise FileNotFoundError(f"prompt_wav_path does not exist: {prompt_wav_path}") + + prompt_cache = None + if prompt_wav_path is not None and prompt_text is not None: + prompt_cache = self.tts_model.build_prompt_cache( + prompt_text=prompt_text, + prompt_wav_path=prompt_wav_path, + ) + + gen_kw = dict( + target_text=" ".join(text.split()), + prompt_cache=prompt_cache, + min_len=min_len, + max_len=max_len, + inference_timesteps=inference_timesteps, + cfg_value=cfg_value, + retry_badcase=retry_badcase, + retry_badcase_max_times=retry_badcase_max_times, + retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, + streaming_prefix_len=streaming_prefix_len, + ) + stream_entry = getattr(self.tts_model, "generate_latents_with_prompt_cache_streaming", None) + if stream_entry is not None: + gen = stream_entry(**gen_kw) + else: + fallback_stream_entry = getattr(self.tts_model, "generate_with_prompt_cache_streaming", None) + if fallback_stream_entry is not None: + gen = fallback_stream_entry(**gen_kw, latents_only=True) + else: + gen = 
self.tts_model._generate_with_prompt_cache(streaming=True, latents_only=True, **gen_kw) + + iterator = iter(gen) + previous = next(iterator, None) + while previous is not None: + current = next(iterator, None) + _, _target_tok, chunk_latent = previous + if not isinstance(chunk_latent, torch.Tensor): + chunk_latent = torch.as_tensor(chunk_latent) + yield chunk_latent, current is None + previous = current + + +class _DirectVoxCPMAudioVAE: + def __init__(self, audio_vae: nn.Module, *, patch_size: int = 2): + self.audio_vae = audio_vae + self.sample_rate = int(getattr(audio_vae, "sample_rate", 24000)) + self.latent_dim = int(getattr(audio_vae, "latent_dim", 64)) + self.patch_size = int(patch_size) + self._chunk_size = int(getattr(audio_vae, "chunk_size", 1)) + self._stream_audio_patch_samples = max(1, self.patch_size * self._chunk_size) + + def _prepare_latents_for_decode(self, latent_audio_feat: Any) -> torch.Tensor: + latents = latent_audio_feat + if not isinstance(latents, torch.Tensor): + latents = torch.tensor(latents, dtype=torch.float32) + latents = latents.detach().to(torch.float32) + + if latents.ndim == 3: + if latents.shape[-1] == self.latent_dim: + latents = rearrange(latents, "t p d -> 1 d (t p)") + elif latents.shape[1] == self.latent_dim: + latents = latents.contiguous() + else: + raise ValueError(f"Unsupported latent_audio_feat shape: {tuple(latents.shape)}") + elif latents.ndim == 2: + if latents.shape[0] == self.latent_dim: + latents = latents.unsqueeze(0) + elif latents.shape[1] == self.latent_dim: + latents = rearrange(latents, "t d -> 1 d t") + else: + raise ValueError(f"Unsupported latent_audio_feat shape: {tuple(latents.shape)}") + else: + raise ValueError(f"Unsupported latent_audio_feat ndim: {latents.ndim}") + + return latents + + @torch.no_grad() + def decode(self, latent_audio_feat: Any, *, trim_streaming_patch: bool = False) -> torch.Tensor: + latents = self._prepare_latents_for_decode(latent_audio_feat) + device = next(self.audio_vae.parameters()).device + raw = self.audio_vae.decode(latents.to(device=device, dtype=torch.float32)) + if isinstance(raw, dict): + audio = raw.get("audio") + if audio is None: + audio = next(v for v in raw.values() if isinstance(v, torch.Tensor)) + else: + audio = raw + if audio.dim() == 3: + stream = audio.squeeze(1) + elif audio.dim() == 2: + stream = audio + else: + stream = audio.reshape(audio.shape[0], -1) + if trim_streaming_patch: + stream = stream[..., -self._stream_audio_patch_samples :] + return stream.reshape(-1).detach().cpu().to(torch.float32) diff --git a/vllm_omni/model_executor/stage_configs/voxcpm.yaml b/vllm_omni/model_executor/stage_configs/voxcpm.yaml new file mode 100644 index 0000000000..a5f324f660 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/voxcpm.yaml @@ -0,0 +1,69 @@ +# VoxCPM two-stage (latent → VAE) without async_chunk: one-shot latent then decode. +stage_args: + - stage_id: 0 + stage_type: llm + is_comprehension: true + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: bfloat16 + model_stage: latent_generator + model_arch: VoxCPMForConditionalGeneration + # Optional persistent HF-compatible config dir for native VoxCPM models. 
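+        # For example (illustrative path), point it at a directory prepared with
+        # voxcpm_runtime_utils.prepare_voxcpm_hf_config_dir():
+        #   export VLLM_OMNI_VOXCPM_HF_CONFIG_PATH=/tmp/voxcpm_hf_config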
+ hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.7 + distributed_executor_backend: "mp" + max_num_batched_tokens: 4096 + max_model_len: 4096 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 4096 + stop_token_ids: [2] + seed: 42 + detokenize: false + repetition_penalty: 1.0 + final_output: false + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: float32 + model_stage: vae + model_arch: VoxCPMForConditionalGeneration + hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.15 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 4096 + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.voxcpm.latent2vae + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 1 + seed: 42 + detokenize: true + repetition_penalty: 1.0 diff --git a/vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml new file mode 100644 index 0000000000..cf78d4e438 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml @@ -0,0 +1,102 @@ +# VoxCPM two-stage streaming (align with qwen3_tts.yaml async_chunk pattern). +# Stage0 (latent_generator) emits latent in time chunks; Stage1 (VAE) decodes as chunks arrive. +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + is_comprehension: true + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: bfloat16 + model_stage: latent_generator + model_arch: VoxCPMForConditionalGeneration + # Optional persistent HF-compatible config dir for native VoxCPM models. 
+ hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: true + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.7 + distributed_executor_backend: "mp" + max_num_batched_tokens: 4096 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.voxcpm.latent2vae_async_chunk + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 4096 + stop_token_ids: [2] + seed: 42 + detokenize: false + repetition_penalty: 1.0 + final_output: false + output_connectors: + to_stage_1: voxcpm_shm + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: float32 + model_stage: vae + model_arch: VoxCPMForConditionalGeneration + hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.15 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 4096 + engine_input_source: [0] + final_output: true + final_output_type: audio + input_connectors: + from_stage_0: voxcpm_shm + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 128 + seed: 42 + detokenize: true + repetition_penalty: 1.0 + + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 + + connectors: + voxcpm_shm: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + # Frame-aligned codec streaming transport. + codec_streaming: true + # Connector polling / timeout (unit: loop count, sleep interval in seconds). + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + # Align with Omni: small chunks with sufficient context overlap. 
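+          # codec_chunk_frames sets how many latent frames go out per streamed payload;
+          # codec_left_context_frames is the prior-frame overlap carried along with each chunk.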
+ codec_chunk_frames: 1 + codec_left_context_frames: 1 + + edges: + - from: 0 + to: 1 + window_size: -1 diff --git a/vllm_omni/model_executor/stage_input_processors/voxcpm.py b/vllm_omni/model_executor/stage_input_processors/voxcpm.py new file mode 100644 index 0000000000..c2fcf521bf --- /dev/null +++ b/vllm_omni/model_executor/stage_input_processors/voxcpm.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +from typing import Any + +import torch +from vllm.inputs import TextPrompt + +from vllm_omni.inputs.data import OmniTokensPrompt + +_VOXCPM_LATENT_MAGIC = 131071 + + +def _serialize_latent_to_codes(latent: Any) -> list[int]: + latent_tensor = latent if isinstance(latent, torch.Tensor) else torch.as_tensor(latent) + latent_tensor = latent_tensor.detach().cpu().contiguous() + if latent_tensor.ndim == 3: + if latent_tensor.shape[0] != 1: + raise ValueError(f"Expected batch=1 latent tensor, got shape={tuple(latent_tensor.shape)}") + latent_tensor = latent_tensor.squeeze(0) + if latent_tensor.ndim != 2: + raise ValueError(f"Unsupported latent_audio_feat shape for async chunk: {tuple(latent_tensor.shape)}") + latent_dim, time_dim = int(latent_tensor.shape[0]), int(latent_tensor.shape[1]) + packed = latent_tensor.to(torch.bfloat16).contiguous().view(torch.uint16).reshape(-1).to(torch.int32) + return [_VOXCPM_LATENT_MAGIC, latent_dim, time_dim, *packed.tolist()] + + +def _coerce_finished_flag(value: Any) -> bool: + """Normalize VoxCPM async-chunk finished markers to a Python bool.""" + if value is None: + return False + if isinstance(value, torch.Tensor): + if value.numel() != 1: + raise ValueError(f"finished tensor must be scalar, got shape={tuple(value.shape)}") + return bool(value.detach().cpu().item()) + if isinstance(value, (list, tuple)): + if not value: + return False + if len(value) != 1: + raise ValueError(f"finished container must have one element, got len={len(value)}") + return _coerce_finished_flag(value[0]) + return bool(value) + + +def latent2vae( + stage_list: list[Any], + engine_input_source: list[int], + prompt: OmniTokensPrompt | TextPrompt | None = None, + requires_multimodal_data: bool = False, +) -> list[OmniTokensPrompt]: + del prompt, requires_multimodal_data + + if not engine_input_source: + raise ValueError("engine_input_source cannot be empty") + + source_stage_id = engine_input_source[0] + if source_stage_id >= len(stage_list): + raise IndexError(f"Invalid stage_id: {source_stage_id}") + + source_outputs = stage_list[source_stage_id].engine_outputs + if source_outputs is None: + raise RuntimeError(f"Stage {source_stage_id} has no outputs yet") + + vae_inputs: list[OmniTokensPrompt] = [] + for source_output in source_outputs: + output = source_output.outputs[0] + multimodal_output = getattr(output, "multimodal_output", None) + if not isinstance(multimodal_output, dict) or "latent_audio_feat" not in multimodal_output: + raise ValueError( + "VoxCPM latent stage output missing 'latent_audio_feat'. 
" + f"request_id={getattr(source_output, 'request_id', None)}" + ) + + additional_information = { + "latent_audio_feat": multimodal_output["latent_audio_feat"], + } + if "sr" in multimodal_output: + additional_information["sample_rate"] = [int(multimodal_output["sr"])] + + vae_inputs.append( + OmniTokensPrompt( + prompt_token_ids=[0], + additional_information=additional_information, + multi_modal_data=None, + mm_processor_kwargs=None, + ) + ) + + return vae_inputs + + +def latent2vae_async_chunk( + transfer_manager: Any, + pooling_output: dict[str, Any] | None, + request: Any, + is_finished: bool = False, +) -> dict[str, Any] | None: + """Stage-0 latent → stage-1 VAE under ``async_chunk`` (connector payload).""" + # Kept for callback signature compatibility with OmniChunkTransferAdapter. + _ = transfer_manager + finished_request = _coerce_finished_flag(is_finished) + if callable(getattr(request, "is_finished", None)): + finished_request = finished_request or _coerce_finished_flag(request.is_finished()) + if not isinstance(pooling_output, dict): + if finished_request: + return { + "code_predictor_codes": [], + "finished": torch.tensor(True, dtype=torch.bool), + } + return None + + latent = pooling_output.get("latent_audio_feat") + if isinstance(latent, torch.Tensor) and latent.numel() == 0: + latent = None + + if latent is None: + if finished_request: + return { + "code_predictor_codes": [], + "finished": torch.tensor(True, dtype=torch.bool), + } + return None + + serialized_codes = _serialize_latent_to_codes(latent) + out: dict[str, Any] = { + "code_predictor_codes": serialized_codes, + "finished": torch.tensor(finished_request, dtype=torch.bool), + } + return out diff --git a/vllm_omni/platforms/npu/stage_configs/voxcpm.yaml b/vllm_omni/platforms/npu/stage_configs/voxcpm.yaml new file mode 100644 index 0000000000..dcd1f40517 --- /dev/null +++ b/vllm_omni/platforms/npu/stage_configs/voxcpm.yaml @@ -0,0 +1,67 @@ +stage_args: + - stage_id: 0 + stage_type: llm + is_comprehension: true + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: bfloat16 + model_stage: latent_generator + model_arch: VoxCPMForConditionalGeneration + # Optional persistent HF-compatible config dir for native VoxCPM models. 
+ hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.75 + distributed_executor_backend: "mp" + max_num_batched_tokens: 4096 + max_model_len: 4096 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.0 + final_output: false + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: float32 + model_stage: vae + model_arch: VoxCPMForConditionalGeneration + hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.1 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 4096 + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.voxcpm.latent2vae + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 1 + seed: 42 + detokenize: true + repetition_penalty: 1.0 diff --git a/vllm_omni/platforms/npu/stage_configs/voxcpm_async_chunk.yaml b/vllm_omni/platforms/npu/stage_configs/voxcpm_async_chunk.yaml new file mode 100644 index 0000000000..0a4ed7497d --- /dev/null +++ b/vllm_omni/platforms/npu/stage_configs/voxcpm_async_chunk.yaml @@ -0,0 +1,93 @@ +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + is_comprehension: true + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: bfloat16 + model_stage: latent_generator + model_arch: VoxCPMForConditionalGeneration + # Optional persistent HF-compatible config dir for native VoxCPM models. 
+ hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.75 + distributed_executor_backend: "mp" + max_num_batched_tokens: 4096 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.voxcpm.latent2vae_async_chunk + output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.0 + final_output: false + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: float32 + model_stage: vae + model_arch: VoxCPMForConditionalGeneration + hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.1 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 4096 + engine_input_source: [0] + input_connectors: + from_stage_0: connector_of_shared_memory + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 1 + seed: 42 + detokenize: true + repetition_penalty: 1.0 + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + codec_streaming: false + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + + edges: + - from: 0 + to: 1 + window_size: -1 diff --git a/vllm_omni/transformers_utils/configs/__init__.py b/vllm_omni/transformers_utils/configs/__init__.py index 5f957c2f6d..0aa3624f80 100644 --- a/vllm_omni/transformers_utils/configs/__init__.py +++ b/vllm_omni/transformers_utils/configs/__init__.py @@ -17,6 +17,7 @@ "FishSpeechConfig": "vllm_omni.transformers_utils.configs.fish_speech", "FishSpeechSlowARConfig": "vllm_omni.transformers_utils.configs.fish_speech", "FishSpeechFastARConfig": "vllm_omni.transformers_utils.configs.fish_speech", + "VoxCPMConfig": "vllm_omni.transformers_utils.configs.voxcpm", "VoxCPM2Config": "vllm_omni.transformers_utils.configs.voxcpm2", } @@ -28,6 +29,7 @@ "FishSpeechConfig", "FishSpeechSlowARConfig", "FishSpeechFastARConfig", + "VoxCPMConfig", "VoxCPM2Config", ] @@ -49,4 +51,5 @@ def __dir__(): # run as soon as `vllm_omni.transformers_utils.configs` is imported. 
from vllm_omni.transformers_utils.configs import fish_speech as _fish_speech # noqa: F401, E402 from vllm_omni.transformers_utils.configs import mammoth_moda2 as _mammoth_moda2 # noqa: F401, E402 +from vllm_omni.transformers_utils.configs import voxcpm as _voxcpm # noqa: F401, E402 from vllm_omni.transformers_utils.configs import voxcpm2 as _voxcpm2 # noqa: F401, E402 diff --git a/vllm_omni/transformers_utils/configs/voxcpm.py b/vllm_omni/transformers_utils/configs/voxcpm.py new file mode 100644 index 0000000000..0267838915 --- /dev/null +++ b/vllm_omni/transformers_utils/configs/voxcpm.py @@ -0,0 +1,68 @@ +from transformers import AutoConfig +from transformers.configuration_utils import PretrainedConfig + + +class VoxCPMConfig(PretrainedConfig): + model_type = "voxcpm" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + bos_token_id: int = 1, + eos_token_id: int = 2, + vocab_size: int = 32000, + hidden_size: int = 1024, + intermediate_size: int = 4096, + max_position_embeddings: int = 4096, + num_attention_heads: int = 16, + num_hidden_layers: int = 24, + num_key_value_heads: int = 16, + rms_norm_eps: float = 1e-6, + rope_theta: float = 10000.0, + rope_scaling: dict | None = None, + lm_config: dict | None = None, + encoder_config: dict | None = None, + dit_config: dict | None = None, + audio_vae_config: dict | None = None, + patch_size: int = 2, + feat_dim: int = 64, + residual_lm_num_layers: int = 6, + scalar_quantization_latent_dim: int = 256, + scalar_quantization_scale: int = 9, + max_length: int = 4096, + device: str = "cuda", + dtype: str = "bfloat16", + dit_mean_mode: bool = False, + **kwargs, + ): + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.max_position_embeddings = max_position_embeddings + self.num_attention_heads = num_attention_heads + self.num_hidden_layers = num_hidden_layers + self.num_key_value_heads = num_key_value_heads + self.rms_norm_eps = rms_norm_eps + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + + self.lm_config = lm_config or {} + self.encoder_config = encoder_config or {} + self.dit_config = dit_config or {} + self.audio_vae_config = audio_vae_config + + self.patch_size = patch_size + self.feat_dim = feat_dim + self.residual_lm_num_layers = residual_lm_num_layers + self.scalar_quantization_latent_dim = scalar_quantization_latent_dim + self.scalar_quantization_scale = scalar_quantization_scale + self.max_length = max_length + self.device = device + self.dtype = dtype + self.dit_mean_mode = dit_mean_mode + + +AutoConfig.register("voxcpm", VoxCPMConfig) + +__all__ = ["VoxCPMConfig"] From 82f8c93343552d81e0e4730d90ce08e072fc3bcb Mon Sep 17 00:00:00 2001 From: Juan Pablo Zuluaga <46724788+JuanPZuluaga@users.noreply.github.com> Date: Wed, 15 Apr 2026 09:14:57 +0200 Subject: [PATCH 183/204] [Feat][Qwen3-Omni] Shared code predictor module for Qwen3-TTS and Qwen3-Omni (#2375) Signed-off-by: JuanPZuluaga Co-authored-by: Hongsheng Liu --- .../qwen3_tts/test_code_predictor_dtype.py | 92 ++- vllm_omni/engine/stage_init_utils.py | 5 +- .../model_executor/models/common/__init__.py | 0 .../models/common/qwen3_code_predictor.py | 654 ++++++++++++++++++ .../qwen3_omni_moe_code_predictor_mtp.py | 520 +------------- .../qwen3_tts_code_predictor_vllm.py | 571 +-------------- 6 files changed, 778 insertions(+), 1064 deletions(-) create mode 100644 
vllm_omni/model_executor/models/common/__init__.py create mode 100644 vllm_omni/model_executor/models/common/qwen3_code_predictor.py diff --git a/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py b/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py index b0ce10a8d5..8798cb3ca9 100644 --- a/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py +++ b/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py @@ -21,7 +21,7 @@ from pytest_mock import MockerFixture # Direct file import to avoid vllm_omni.__init__ patch dependencies. -_BASE = os.path.join( +_MODELS = os.path.join( os.path.dirname(__file__), os.pardir, os.pardir, @@ -30,14 +30,16 @@ "vllm_omni", "model_executor", "models", - "qwen3_tts", ) +_BASE = os.path.join(_MODELS, "qwen3_tts") +_COMMON = os.path.join(_MODELS, "common") def _load_module(name: str, filename: str): path = os.path.abspath(os.path.join(_BASE, filename)) spec = importlib.util.spec_from_file_location(name, path) mod = importlib.util.module_from_spec(spec) + sys.modules[name] = mod # register before exec (needed for dataclasses etc.) spec.loader.exec_module(mod) return mod @@ -59,8 +61,17 @@ def _build_mock_modules(mocker: MockerFixture) -> dict[str, object]: weight_utils_mock = mocker.MagicMock() weight_utils_mock.default_weight_loader = lambda p, w: None - pkg = types.ModuleType("vllm_omni.model_executor.models.qwen3_tts") - pkg.__path__ = [os.path.abspath(_BASE)] + tts_pkg = types.ModuleType("vllm_omni.model_executor.models.qwen3_tts") + tts_pkg.__path__ = [os.path.abspath(_BASE)] + + common_pkg = types.ModuleType("vllm_omni.model_executor.models.common") + common_pkg.__path__ = [os.path.abspath(_COMMON)] + + models_pkg = types.ModuleType("vllm_omni.model_executor.models") + models_pkg.__path__ = [os.path.abspath(_MODELS)] + + vllm_parallel_mock = mocker.MagicMock() + vllm_parallel_mock.VocabParallelEmbedding = torch.nn.Embedding return { "vllm_omni": mocker.MagicMock(), @@ -69,9 +80,11 @@ def _build_mock_modules(mocker: MockerFixture) -> dict[str, object]: "vllm.config": mocker.MagicMock(), "vllm.config.vllm": vllm_config_mod, "vllm.model_executor.model_loader.weight_utils": weight_utils_mock, + "vllm.model_executor.layers.vocab_parallel_embedding": vllm_parallel_mock, "vllm_omni.model_executor": types.ModuleType("vllm_omni.model_executor"), - "vllm_omni.model_executor.models": types.ModuleType("vllm_omni.model_executor.models"), - "vllm_omni.model_executor.models.qwen3_tts": pkg, + "vllm_omni.model_executor.models": models_pkg, + "vllm_omni.model_executor.models.common": common_pkg, + "vllm_omni.model_executor.models.qwen3_tts": tts_pkg, } @@ -88,6 +101,15 @@ def _load_target_classes(mocker: MockerFixture): ) sys.modules["vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts"] = config_mod + # Load the shared common module (thin wrappers import from it) + common_cp_path = os.path.abspath(os.path.join(_COMMON, "qwen3_code_predictor.py")) + common_spec = importlib.util.spec_from_file_location( + "vllm_omni.model_executor.models.common.qwen3_code_predictor", common_cp_path + ) + common_cp_mod = importlib.util.module_from_spec(common_spec) + sys.modules["vllm_omni.model_executor.models.common.qwen3_code_predictor"] = common_cp_mod + common_spec.loader.exec_module(common_cp_mod) + cp_mod = _load_module( "vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_code_predictor_vllm", "qwen3_tts_code_predictor_vllm.py", @@ -104,6 +126,7 @@ def loaded_target_classes(mocker: MockerFixture): 
config_mod.Qwen3TTSTalkerConfig, cp_mod.Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM, cp_mod.Qwen3TTSTalkerCodePredictorModelVLLM, + cp_mod.CodePredictorWrapperConfig, ) @@ -114,6 +137,7 @@ def _make_tiny_config(loaded_target_classes) -> tuple: qwen3_tts_talker_config, _, _, + _, ) = loaded_target_classes cp_config = qwen3_tts_talker_code_predictor_config( vocab_size=64, @@ -145,7 +169,7 @@ class TestCodePredictorDtypeAlignment: def test_ensure_buffers_uses_given_dtype(self, mocker: MockerFixture, loaded_target_classes) -> None: """_ensure_buffers should create proj_buf with the given dtype.""" - _, _, code_predictor_wrapper, _ = loaded_target_classes + _, _, code_predictor_wrapper, _, _ = loaded_target_classes cp_config, talker_config = _make_tiny_config(loaded_target_classes) vllm_config = _make_vllm_config(mocker) @@ -156,17 +180,17 @@ def test_ensure_buffers_uses_given_dtype(self, mocker: MockerFixture, loaded_tar ) # Create buffer in float16 - predictor._ensure_buffers(torch.device("cpu"), torch.float16) + predictor._ensure_buffers(torch.device("cpu"), torch.float16, 4) assert predictor._proj_buf is not None assert predictor._proj_buf.dtype == torch.float16 # Re-create buffer in float32 (different dtype triggers re-allocation) - predictor._ensure_buffers(torch.device("cpu"), torch.float32) + predictor._ensure_buffers(torch.device("cpu"), torch.float32, 4) assert predictor._proj_buf.dtype == torch.float32 def test_warmup_aligns_buffer_to_model_params(self, mocker: MockerFixture, loaded_target_classes) -> None: """_warmup_buckets should align proj_buf dtype to model parameters.""" - _, _, code_predictor_wrapper, _ = loaded_target_classes + _, _, code_predictor_wrapper, _, _ = loaded_target_classes cp_config, talker_config = _make_tiny_config(loaded_target_classes) vllm_config = _make_vllm_config(mocker, max_num_seqs=2) @@ -180,7 +204,7 @@ def test_warmup_aligns_buffer_to_model_params(self, mocker: MockerFixture, loade predictor = predictor.to(torch.float16) # Pre-create proj_buf with WRONG dtype (float32) — simulating the bug - predictor._ensure_buffers(torch.device("cpu"), torch.float32) + predictor._ensure_buffers(torch.device("cpu"), torch.float32, 2) assert predictor._proj_buf.dtype == torch.float32 # Simulate _setup_compile having cached model dtype and compiled forward @@ -194,7 +218,7 @@ def test_warmup_aligns_buffer_to_model_params(self, mocker: MockerFixture, loade def test_setup_compile_caches_model_dtype(self, mocker: MockerFixture, loaded_target_classes) -> None: """_setup_compile should cache model parameter dtype.""" - _, _, code_predictor_wrapper, _ = loaded_target_classes + _, _, code_predictor_wrapper, _, _ = loaded_target_classes cp_config, talker_config = _make_tiny_config(loaded_target_classes) vllm_config = _make_vllm_config(mocker, max_num_seqs=2) @@ -211,7 +235,7 @@ def test_setup_compile_caches_model_dtype(self, mocker: MockerFixture, loaded_ta def test_forward_with_mismatched_input_dtype(self, mocker: MockerFixture, loaded_target_classes) -> None: """forward() should not crash when inputs are float32 but model is float16.""" - _, _, code_predictor_wrapper, _ = loaded_target_classes + _, _, code_predictor_wrapper, _, _ = loaded_target_classes cp_config, talker_config = _make_tiny_config(loaded_target_classes) vllm_config = _make_vllm_config(mocker, max_num_seqs=2) @@ -250,9 +274,9 @@ class TestCodePredictorModelDtype: def test_model_forward_float16(self, loaded_target_classes) -> None: """Inner model forward should work in float16.""" - _, _, _, 
code_predictor_model = loaded_target_classes + _, _, _, code_predictor_model, _ = loaded_target_classes cp_config, _ = _make_tiny_config(loaded_target_classes) - model = code_predictor_model(cp_config, talker_hidden_size=32).to(torch.float16) + model = code_predictor_model(cp_config, embedding_dim=32).to(torch.float16) bsz, seq_len = 1, 4 inputs = torch.randn(bsz, seq_len, 32, dtype=torch.float16) @@ -264,9 +288,9 @@ def test_model_forward_float16(self, loaded_target_classes) -> None: def test_model_forward_float32(self, loaded_target_classes) -> None: """Inner model forward should work in float32.""" - _, _, _, code_predictor_model = loaded_target_classes + _, _, _, code_predictor_model, _ = loaded_target_classes cp_config, _ = _make_tiny_config(loaded_target_classes) - model = code_predictor_model(cp_config, talker_hidden_size=32).to(torch.float32) + model = code_predictor_model(cp_config, embedding_dim=32).to(torch.float32) bsz, seq_len = 1, 4 inputs = torch.randn(bsz, seq_len, 32, dtype=torch.float32) @@ -275,3 +299,37 @@ def test_model_forward_float32(self, loaded_target_classes) -> None: output = model(inputs, pos_ids) assert output.dtype == torch.float32 assert output.shape == (bsz, seq_len, 32) + + +class TestCodePredictorWrapperConfig: + """Test wrapper configuration for different models.""" + + def test_omni_config(self, loaded_target_classes) -> None: + """Qwen3-Omni uses correct wrapper config.""" + _, _, _, _, code_predictor_wrapper_config = loaded_target_classes + config = code_predictor_wrapper_config( + use_cuda_graphs=False, + use_parallel_embedding=True, + use_projection=False, + return_proj_buf=True, + sampling_mode="stored", + ) + assert config.use_cuda_graphs is False + assert config.use_parallel_embedding is True + assert config.return_proj_buf is True + assert config.sampling_mode == "stored" + + def test_tts_config(self, loaded_target_classes) -> None: + """Qwen3-TTS uses correct wrapper config.""" + _, _, _, _, code_predictor_wrapper_config = loaded_target_classes + config = code_predictor_wrapper_config( + use_cuda_graphs=True, + use_parallel_embedding=False, + use_projection=True, + return_proj_buf=False, + sampling_mode="per_call", + ) + assert config.use_cuda_graphs is True + assert config.use_parallel_embedding is False + assert config.return_proj_buf is False + assert config.sampling_mode == "per_call" diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index bf40aa77cd..3a7fe4bad7 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -192,8 +192,9 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata: default_sampling_params: OmniSamplingParams = SPClass(**default_sp) custom_process_input_func: Callable | None = None - if hasattr(stage_config, "custom_process_input_func"): - mod_path, fn_name = stage_config.custom_process_input_func.rsplit(".", 1) + _cpif_path = getattr(stage_config, "custom_process_input_func", None) + if _cpif_path: + mod_path, fn_name = _cpif_path.rsplit(".", 1) custom_process_input_func = getattr(importlib.import_module(mod_path), fn_name) prompt_expand_func: Callable | None = None diff --git a/vllm_omni/model_executor/models/common/__init__.py b/vllm_omni/model_executor/models/common/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/vllm_omni/model_executor/models/common/qwen3_code_predictor.py b/vllm_omni/model_executor/models/common/qwen3_code_predictor.py new file mode 100644 index 0000000000..3a904442fa --- /dev/null 
+++ b/vllm_omni/model_executor/models/common/qwen3_code_predictor.py @@ -0,0 +1,654 @@ +"""Qwen3 Code Predictor -- optimized re-prefill, no KV cache. + +Shared by Qwen3-Omni and Qwen3-TTS talker models. + +* SDPA attention (F.scaled_dot_product_attention) with native GQA support +* HF-compatible numerics (float32 RMSNorm, float32 RoPE, separate linear layers) +* Per-call embedding buffer to avoid cross-request aliasing +* Pre-allocated position_ids (read-only, safe to persist) +* torch.compile (epilogue_fusion=False) on inner transformer by default +* Optional manual CUDA graph capture per batch-size bucket +* Inline sampling (top-k + top-p) -- no custom op overhead +""" + +from __future__ import annotations + +import dataclasses +from collections.abc import Iterable + +import torch +import torch.nn as nn +import torch.nn.functional as F +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.model_loader.weight_utils import default_weight_loader + +from vllm_omni.platforms import current_omni_platform + +logger = init_logger(__name__) + + +# =================================================================== +# HF-numerics-compatible layers for code predictor +# =================================================================== +# +# These use plain PyTorch ops (nn.Linear, manual RMSNorm in float32, +# rotate_half RoPE) to produce outputs numerically identical to the +# HuggingFace reference. vLLM's fused kernels (RMSNorm, QKVParallel, +# get_rope) introduce small precision differences that compound across +# the autoregressive steps of the code predictor, causing severe +# audio quality degradation. +# +# See: https://github.com/vllm-project/vllm-omni/issues/2274 + + +class _RMSNorm(nn.Module): + """RMSNorm matching HuggingFace's implementation exactly. + + Computes variance in float32 to avoid bfloat16 precision loss. + """ + + def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +def _rotate_half(x: torch.Tensor) -> torch.Tensor: + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +class _RotaryEmbedding(nn.Module): + """RoPE matching HuggingFace's implementation exactly. + + Forces float32 computation for cos/sin, matching HF's torch.autocast(enabled=False). 
+ """ + + def __init__(self, config) -> None: + super().__init__() + head_dim = getattr( + config, + "head_dim", + config.hidden_size // config.num_attention_heads, + ) + rope_theta = getattr(config, "rope_theta", 10000.0) + inv_freq = 1.0 / (rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, x: torch.Tensor, position_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + # position_ids: [batch, seq_len] + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + # Force float32 (matching HF) + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +# =================================================================== +# Attention +# =================================================================== + + +class CodePredictorAttention(nn.Module): + """Multi-head self-attention for code predictor. + + Uses ``F.scaled_dot_product_attention`` with HF-compatible RoPE and RMSNorm. + No KV cache -- the code predictor always re-prefills the full (short) + sequence each AR step. + + Input : [B, seq_len, hidden_size] + Output: [B, seq_len, hidden_size] + """ + + def __init__(self, config, *, prefix: str = "") -> None: + super().__init__() + self.num_heads = config.num_attention_heads + self.num_kv_heads = config.num_key_value_heads + self.head_dim = getattr( + config, + "head_dim", + config.hidden_size // config.num_attention_heads, + ) + self.hidden_size = config.hidden_size + self.scaling = self.head_dim**-0.5 + self._use_gqa = self.num_kv_heads != self.num_heads + + # Separate q/k/v projections matching HF (no fused packing) + bias = getattr(config, "attention_bias", False) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.q_norm = _RMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.k_norm = _RMSNorm(self.head_dim, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + ) -> torch.Tensor: + bsz, seq_len, _ = hidden_states.shape + hidden_shape_q = (bsz, seq_len, self.num_heads, self.head_dim) + hidden_shape_kv = (bsz, seq_len, self.num_kv_heads, self.head_dim) + + q = self.q_norm(self.q_proj(hidden_states).view(hidden_shape_q)).transpose(1, 2) + k = self.k_norm(self.k_proj(hidden_states).view(hidden_shape_kv)).transpose(1, 2) + v = self.v_proj(hidden_states).view(hidden_shape_kv).transpose(1, 2) + + cos, sin = position_embeddings + # cos/sin are [batch, seq_len, head_dim], need unsqueeze at dim=1 for heads + cos = cos.unsqueeze(1) # [batch, 1, seq_len, head_dim] + sin = sin.unsqueeze(1) + q = (q * cos) + (_rotate_half(q) * sin) + k = (k * cos) + (_rotate_half(k) * sin) + + attn_out = F.scaled_dot_product_attention( + q, + k, + v, + scale=self.scaling, + 
is_causal=True, + enable_gqa=self._use_gqa, + ) + + attn_out = attn_out.transpose(1, 2).reshape(bsz, seq_len, -1) + return self.o_proj(attn_out) + + +# =================================================================== +# MLP +# =================================================================== + + +class CodePredictorMLP(nn.Module): + """SiLU-gated MLP for code predictor, matching HF's implementation.""" + + def __init__(self, config, *, prefix: str = "") -> None: + super().__init__() + self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) + self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return self.down_proj(F.silu(self.gate_proj(hidden_states)) * self.up_proj(hidden_states)) + + +# =================================================================== +# Decoder Layer +# =================================================================== + + +class CodePredictorDecoderLayer(nn.Module): + """Transformer decoder layer (SDPA, no KV cache).""" + + def __init__(self, config, *, prefix: str = "") -> None: + super().__init__() + self.self_attn = CodePredictorAttention(config, prefix=f"{prefix}.self_attn") + self.mlp = CodePredictorMLP(config, prefix=f"{prefix}.mlp") + self.input_layernorm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn(hidden_states, position_embeddings) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +# =================================================================== +# Base Transformer Model (re-prefill, no KV cache) +# =================================================================== + + +class CodePredictorBaseModel(nn.Module): + """Inner transformer for code predictor. 
+ + Signature: ``forward(inputs_embeds, position_ids) -> hidden_states`` + """ + + def __init__( + self, + config, + *, + embedding_dim: int | None = None, + use_parallel_embedding: bool = False, + prefix: str = "", + ) -> None: + super().__init__() + self.config = config + + emb_dim = int(embedding_dim) if embedding_dim is not None else int(config.hidden_size) + if use_parallel_embedding: + self.codec_embedding = nn.ModuleList( + [VocabParallelEmbedding(config.vocab_size, emb_dim) for _ in range(config.num_code_groups - 1)] + ) + else: + self.codec_embedding = nn.ModuleList( + [nn.Embedding(config.vocab_size, emb_dim) for _ in range(config.num_code_groups - 1)] + ) + + self.layers = nn.ModuleList( + [ + CodePredictorDecoderLayer(config, prefix=f"{prefix}.layers.{idx}") + for idx in range(config.num_hidden_layers) + ] + ) + self.norm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = _RotaryEmbedding(config) + + def get_input_embeddings(self) -> nn.ModuleList: + return self.codec_embedding + + def forward( + self, + inputs_embeds: torch.Tensor, + position_ids: torch.Tensor, + ) -> torch.Tensor: + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + for layer in self.layers: + hidden_states = layer(hidden_states, position_embeddings) + hidden_states = self.norm(hidden_states) + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + param = params_dict.get(name) + if param is None: + continue + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +# =================================================================== +# Wrapper Configuration +# =================================================================== + + +@dataclasses.dataclass +class CodePredictorWrapperConfig: + """Controls behavioral differences between model-specific code predictors.""" + + use_cuda_graphs: bool = False + use_parallel_embedding: bool = False + use_projection: bool = False + return_proj_buf: bool = False + sampling_mode: str = "stored" + + +# =================================================================== +# Code Predictor Wrapper (optimized re-prefill, persistent buffers) +# =================================================================== + + +class CodePredictorWrapper(nn.Module): + """Optimized code predictor -- re-prefill approach, no KV cache. + + Each AR step forwards the full growing sequence (len 2 -> num_code_groups+1) + through the transformer. The extra O(T^2) FLOPs are negligible for + short sequences, and this avoids all KV-cache management overhead. + + Optimizations: + 1. Per-call embedding buffer -- avoids cross-request aliasing. + 2. Pre-allocated position_ids -- no torch.arange per step. + 3. Cached module references -- bypass ModuleList indexing. + 4. torch.compile on inner transformer. + 5. Inline sampling (top-k + top-p) -- no custom op overhead. + 6. Optional manual CUDA graph capture per batch-size bucket. 
+ """ + + def __init__( + self, + *, + vllm_config: VllmConfig, + cp_config, + wrapper_config: CodePredictorWrapperConfig, + talker_hidden_size: int | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self._vllm_config = vllm_config + self.config = cp_config + self._wrapper_config = wrapper_config + self.prefix = prefix + + self._num_groups = int(cp_config.num_code_groups) + self._cp_hidden = int(cp_config.hidden_size) + + # For Omni backward compat (accessed by the talker) + self.num_code_groups = self._num_groups + + # Determine embedding dimension + _talker_hidden = int(talker_hidden_size) if talker_hidden_size is not None else self._cp_hidden + + self.model = CodePredictorBaseModel( + cp_config, + embedding_dim=_talker_hidden, + use_parallel_embedding=wrapper_config.use_parallel_embedding, + prefix=f"{prefix}.model" if prefix else "model", + ) + + self.lm_head = nn.ModuleList( + [nn.Linear(cp_config.hidden_size, cp_config.vocab_size, bias=False) for _ in range(self._num_groups - 1)] + ) + + # Projection: Identity when hidden sizes match or not needed + if wrapper_config.use_projection and _talker_hidden != self._cp_hidden: + self.small_to_mtp_projection = nn.Linear(_talker_hidden, self._cp_hidden, bias=True) + else: + self.small_to_mtp_projection = nn.Identity() + + # Sampling defaults for "stored" mode + self._top_k: int = 50 + self._top_p: float = 0.8 + + # Lazily initialised state + self._proj_buf: torch.Tensor | None = None + self._model_dtype: torch.dtype | None = None + self._compiled_model_fwd = None + self._bucket_sizes: list[int] = [] + self._bucket_pos_ids: dict[int, torch.Tensor] = {} + self._lm_heads_list: list[nn.Module] | None = None + self._codec_embeds_list: list[nn.Module] | None = None + self._cuda_graphs: dict[int, tuple[torch.cuda.CUDAGraph, torch.Tensor]] = {} + + def get_input_embeddings(self) -> nn.ModuleList: + return self.model.get_input_embeddings() + + def set_sampling_params(self, top_k: int = 50, top_p: float = 0.8) -> None: + """Configure sampling parameters to maintain consistency with previous implementation.""" + self._top_k = top_k + self._top_p = top_p + logger.debug("Sampling parameters updated: top_k=%d, top_p=%.2f", top_k, top_p) + + # ------------------------------------------------------------------ + # Lazy-init helpers + # ------------------------------------------------------------------ + + def _ensure_buffers(self, device: torch.device, dtype: torch.dtype, bsz: int) -> None: + """Ensure the projection buffer can hold at least *bsz* rows.""" + max_seq = self._num_groups + 1 + if ( + self._proj_buf is not None + and self._proj_buf.device == device + and self._proj_buf.dtype == dtype + and self._proj_buf.shape[0] >= bsz + ): + return + self._proj_buf = torch.zeros(bsz, max_seq, self._cp_hidden, dtype=dtype, device=device) + + def _setup_compile(self) -> None: + """Lazily set up torch.compile with optional CUDA graph capture.""" + if self._compiled_model_fwd is not None: + return + + # Cache model parameter dtype so forward() doesn't need to query it + # on every call. Also ensures warmup buffers match model precision + # even when upstream modules produce a different dtype (#2385). 
+ self._model_dtype = next(self.model.parameters()).dtype + self._lm_heads_list = list(self.lm_head) + self._codec_embeds_list = list(self.model.codec_embedding) + + if not current_omni_platform.supports_torch_inductor(): + logger.warning_once("code_predictor: torch.compile disabled") + self._compiled_model_fwd = self.model.forward + return + + # torch.compile fuses RMSNorm/RoPE in ways that lose float32 + # precision, compounding across AR steps. Use epilogue_fusion=False + # to disable the problematic fusions while still getting kernel + # fusion benefits for the linear layers and SDPA. + self._compiled_model_fwd = torch.compile( + self.model.forward, + dynamic=False, + options={"epilogue_fusion": False}, + ) + self._warmup_buckets() + + if self._wrapper_config.use_cuda_graphs: + self._capture_cuda_graphs() + logger.info("code_predictor: torch.compile (no epilogue fusion) + CUDA graphs") + else: + logger.info("code_predictor: torch.compile (dynamic=False, no epilogue fusion)") + + def _padded_bsz(self, bsz: int) -> int: + """Round batch size up to nearest power-of-2 bucket.""" + for bucket in self._bucket_sizes: + if bsz <= bucket: + return bucket + return bsz + + def _warmup_buckets(self) -> None: + """Warmup power-of-2 batch-size buckets to front-load Inductor compilation.""" + max_bsz = self._vllm_config.scheduler_config.max_num_seqs + bucket_sizes = [1 << i for i in range(max_bsz.bit_length()) if (1 << i) <= max_bsz] + if max_bsz not in bucket_sizes: + bucket_sizes.append(max_bsz) + self._bucket_sizes = sorted(bucket_sizes) + + max_seq = self._num_groups + 1 + device = next(self.model.parameters()).device + + # Ensure proj_buf matches model parameter dtype to avoid dtype + # mismatch during warmup compilation (see #2385). + self._ensure_buffers(device, self._model_dtype, max(self._bucket_sizes)) + proj_buf = self._proj_buf + + for bsz in self._bucket_sizes: + pos_ids = torch.arange(max_seq, device=device, dtype=torch.long).unsqueeze(0).expand(bsz, -1).contiguous() + self._bucket_pos_ids[bsz] = pos_ids + for _ in range(3): + self._compiled_model_fwd(proj_buf[:bsz, :max_seq, :], pos_ids) + logger.info("code_predictor: warmup done for buckets %s", self._bucket_sizes) + + def _capture_cuda_graphs(self) -> None: + """Capture a CUDA graph per bucket using vLLM's global graph pool.""" + from vllm.platforms import current_platform + + pool = current_platform.get_global_graph_pool() + max_seq = self._num_groups + 1 + proj_buf = self._proj_buf + + for bsz in self._bucket_sizes: + static_input = proj_buf[:bsz, :max_seq, :] + pos_ids = self._bucket_pos_ids[bsz] + + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g, pool=pool): + static_output = self._compiled_model_fwd(static_input, pos_ids) + + self._cuda_graphs[bsz] = (g, static_output) + + logger.info("code_predictor: captured CUDA graphs for buckets %s", self._bucket_sizes) + + # ------------------------------------------------------------------ + # Forward -- re-prefill + inline sampling + # ------------------------------------------------------------------ + + @torch.inference_mode() + def forward( + self, + layer0_code: torch.Tensor, + layer0_embed: torch.Tensor, + last_talker_hidden: torch.Tensor, + do_sample: bool = True, + temperature: float = 0.9, + top_k: int = 50, + top_p: float = 1.0, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """Predict residual codebooks 1..G-1 autoregressively via re-prefill.""" + bsz = int(layer0_code.shape[0]) + num_groups = self._num_groups + device = layer0_code.device + + # _setup_compile 
caches _model_dtype on first call; use it for buffers + # so they always match model weight precision (#2385). + self._setup_compile() + dtype = self._model_dtype + + padded_bsz = self._padded_bsz(bsz) + self._ensure_buffers(device, dtype, padded_bsz) + + proj_buf = self._proj_buf + max_seq = num_groups + 1 + projection = self.small_to_mtp_projection + model_fwd = self._compiled_model_fwd + lm_heads = self._lm_heads_list + codec_embeds = self._codec_embeds_list + + # Zero the padded region of the buffer + proj_buf[:padded_bsz].zero_() + + # Fill buffer positions 0 (talker hidden) & 1 (layer0 embed) + proj_buf[:bsz, 0, :] = projection(last_talker_hidden.reshape(bsz, 1, -1).to(dtype)).reshape(bsz, -1) + proj_buf[:bsz, 1, :] = projection(layer0_embed.reshape(bsz, 1, -1).to(dtype)).reshape(bsz, -1) + + # Get pre-computed pos_ids for this bucket + full_pos_ids = self._bucket_pos_ids.get(padded_bsz) + if full_pos_ids is None: + full_pos_ids = ( + torch.arange(max_seq, device=device, dtype=torch.long).unsqueeze(0).expand(padded_bsz, -1).contiguous() + ) + + # Use captured CUDA graph if available, otherwise call compiled fn. + cuda_graph_entry = self._cuda_graphs.get(padded_bsz) + + # Prepare sampling parameters + stored_mode = self._wrapper_config.sampling_mode == "stored" + if stored_mode: + s_top_k = self._top_k + s_top_p = self._top_p + else: + use_sampling = do_sample and temperature > 0 + inv_temperature = 1.0 / max(temperature, 1e-6) if use_sampling else 0.0 + if use_sampling and top_p != 1.0: + raise NotImplementedError( + "top_p sampling is not implemented for the vLLM-native code predictor; please set top_p=1.0." + ) + + # Output codes -- shape depends on return mode + if self._wrapper_config.return_proj_buf: + all_codes = torch.empty(bsz, num_groups, 1, dtype=torch.int64, device=device) + all_codes[:, 0] = layer0_code.reshape(bsz, -1)[:, :1] + else: + all_codes = torch.empty(bsz, num_groups, dtype=torch.long, device=device) + all_codes[:, 0] = layer0_code.reshape(bsz) + + # Autoregressive loop: predict layers 1..G-1 + for step in range(1, num_groups): + # Run transformer (CUDA graph replay or compiled forward) + if cuda_graph_entry is not None: + cuda_graph_entry[0].replay() + hidden_out = cuda_graph_entry[1] + else: + hidden_out = model_fwd(proj_buf[:padded_bsz, :max_seq, :], full_pos_ids) + + logits = lm_heads[step - 1](hidden_out[:bsz, step, :]) + + # Sample next code + if stored_mode: + # "stored" mode: top-k -> top-p -> softmax -> multinomial + if s_top_k > 0: + topk_vals, _ = logits.topk(s_top_k, dim=-1) + logits = logits.masked_fill(logits < topk_vals[:, -1:], float("-inf")) + if s_top_p < 1.0: + sorted_logits, sorted_idx = logits.sort(dim=-1, descending=True) + sorted_probs = F.softmax(sorted_logits, dim=-1) + cumulative_probs = sorted_probs.cumsum(dim=-1) + remove_mask = (cumulative_probs - sorted_probs) >= s_top_p + sorted_logits[remove_mask] = float("-inf") + logits = sorted_logits.scatter(1, sorted_idx, sorted_logits) + probs = F.softmax(logits, dim=-1) + code = torch.multinomial(probs, num_samples=1) + else: + # "per_call" mode: temperature-scaled + top-k + if use_sampling: + scaled = logits * inv_temperature + if top_k > 0: + topk_vals, _ = scaled.topk(top_k, dim=-1) + scaled = scaled.masked_fill(scaled < topk_vals[:, -1:], float("-inf")) + probs = F.softmax(scaled, dim=-1) + code = torch.multinomial(probs, num_samples=1) + else: + code = logits.argmax(dim=-1, keepdim=True) + + # Store code + if self._wrapper_config.return_proj_buf: + all_codes[:, step] = code + else: + 
all_codes[:, step] = code.reshape(bsz) + + # Embed predicted code -> project -> next buffer position + if step < num_groups - 1 or self._wrapper_config.return_proj_buf: + new_embed = codec_embeds[step - 1](code) + proj_buf[:bsz, step + 1, :] = projection(new_embed.reshape(bsz, 1, -1)).reshape(bsz, -1) + + if self._wrapper_config.return_proj_buf: + return all_codes, proj_buf[:bsz].clone() + return all_codes + + # ------------------------------------------------------------------ + # Weight loading + # ------------------------------------------------------------------ + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load weights directly (no fused projection remapping needed).""" + loaded: set[str] = set() + model_weights: list[tuple[str, torch.Tensor]] = [] + other_weights: list[tuple[str, torch.Tensor]] = [] + + for name, w in weights: + if "rotary_emb.inv_freq" in name: + continue + if name.startswith("model."): + model_weights.append((name[len("model.") :], w)) + else: + other_weights.append((name, w)) + + loaded_model = self.model.load_weights(model_weights) + loaded |= {f"model.{n}" for n in loaded_model} + + params = dict(self.named_parameters(remove_duplicate=False)) + for name, w in other_weights: + param = params.get(name) + if param is None: + continue + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, w) + loaded.add(name) + + return loaded diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py index 2ceaafdb67..819e22e181 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py @@ -1,510 +1,28 @@ -"""Qwen3-Omni Code Predictor -- optimized re-prefill, no KV cache. +"""Qwen3-Omni Code Predictor -- thin wrapper over CodePredictorWrapper.""" -* SDPA attention (F.scaled_dot_product_attention) with native GQA support -* HF-compatible numerics (float32 RMSNorm, float32 RoPE, separate linear layers) -* Per-call embedding buffer to avoid cross-request aliasing -* Pre-allocated position_ids (read-only, safe to persist) -* torch.compile (epilogue_fusion=False) on inner transformer by default -* Inline sampling (top-k + top-p) -- no custom op overhead -""" - -import torch -import torch.nn as nn -import torch.nn.functional as F from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding -from vllm.model_executor.model_loader.weight_utils import default_weight_loader - -from vllm_omni.platforms import current_omni_platform - -logger = init_logger(__name__) - - -# =================================================================== -# HF-numerics-compatible layers for code predictor -# =================================================================== -# -# These use plain PyTorch ops (nn.Linear, manual RMSNorm in float32, -# rotate_half RoPE) to produce outputs numerically identical to the -# HuggingFace reference. vLLM's fused kernels (RMSNorm, QKVParallel, -# get_rope) introduce small precision differences that compound across -# the autoregressive steps of the code predictor, causing severe -# audio quality degradation. 
-# -# See: https://github.com/vllm-project/vllm-omni/issues/2274 - - -class _RMSNorm(nn.Module): - """RMSNorm matching HuggingFace's implementation exactly. - - Computes variance in float32 to avoid bfloat16 precision loss. - """ - - def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -def _rotate_half(x: torch.Tensor) -> torch.Tensor: - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -class _RotaryEmbedding(nn.Module): - """RoPE matching HuggingFace's implementation exactly. - - Forces float32 computation for cos/sin, matching HF's torch.autocast(enabled=False). - """ - - def __init__(self, config) -> None: - super().__init__() - head_dim = getattr( - config, - "head_dim", - config.hidden_size // config.num_attention_heads, - ) - rope_theta = getattr(config, "rope_theta", 10000.0) - inv_freq = 1.0 / (rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - def forward(self, x: torch.Tensor, position_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - # position_ids: [batch, seq_len] - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - - # Force float32 (matching HF) - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -class Qwen3OmniCodePredictorAttention(nn.Module): - """Multi-head self-attention for code predictor. - - Uses ``F.scaled_dot_product_attention`` with HF-compatible RoPE and RMSNorm. - No KV cache -- the code predictor always re-prefills the full (short) - sequence each AR step. 
- - Input : [B, seq_len, hidden_size] - Output: [B, seq_len, hidden_size] - """ - - def __init__( - self, - config, - prefix: str = "", - ): - super().__init__() - cp_cfg = config.code_predictor_config - self.num_heads = cp_cfg.num_attention_heads - self.num_kv_heads = cp_cfg.num_key_value_heads - self.head_dim = getattr( - cp_cfg, - "head_dim", - cp_cfg.hidden_size // cp_cfg.num_attention_heads, - ) - self.hidden_size = cp_cfg.hidden_size - self.scaling = self.head_dim**-0.5 - self._use_gqa = self.num_kv_heads != self.num_heads - - # Separate q/k/v projections matching HF (no fused packing) - self.q_proj = nn.Linear( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - ) - self.k_proj = nn.Linear( - self.hidden_size, - self.num_kv_heads * self.head_dim, - bias=False, - ) - self.v_proj = nn.Linear( - self.hidden_size, - self.num_kv_heads * self.head_dim, - bias=False, - ) - self.o_proj = nn.Linear( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - ) - self.q_norm = _RMSNorm(self.head_dim, eps=cp_cfg.rms_norm_eps) - self.k_norm = _RMSNorm(self.head_dim, eps=cp_cfg.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - ) -> torch.Tensor: - bsz, seq_len, _ = hidden_states.shape - hidden_shape_q = (bsz, seq_len, self.num_heads, self.head_dim) - hidden_shape_kv = (bsz, seq_len, self.num_kv_heads, self.head_dim) - - q = self.q_norm(self.q_proj(hidden_states).view(hidden_shape_q)).transpose(1, 2) - k = self.k_norm(self.k_proj(hidden_states).view(hidden_shape_kv)).transpose(1, 2) - v = self.v_proj(hidden_states).view(hidden_shape_kv).transpose(1, 2) - - cos, sin = position_embeddings - # cos/sin are [batch, seq_len, head_dim], need unsqueeze at dim=1 for heads - cos = cos.unsqueeze(1) # [batch, 1, seq_len, head_dim] - sin = sin.unsqueeze(1) - q = (q * cos) + (_rotate_half(q) * sin) - k = (k * cos) + (_rotate_half(k) * sin) - - attn_out = F.scaled_dot_product_attention( - q, - k, - v, - scale=self.scaling, - is_causal=True, - enable_gqa=self._use_gqa, - ) - - attn_out = attn_out.transpose(1, 2).reshape(bsz, seq_len, -1) - output = self.o_proj(attn_out) - return output - - -# =================================================================== -# MLP -# =================================================================== - - -class Qwen3OmniCodePredictorMLP(nn.Module): - """SiLU-gated MLP for code predictor, matching HF's implementation.""" - - def __init__( - self, - config, - prefix: str = "", - ): - super().__init__() - hidden_size = config.code_predictor_config.hidden_size - intermediate_size = config.code_predictor_config.intermediate_size - - self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - return self.down_proj(F.silu(self.gate_proj(hidden_states)) * self.up_proj(hidden_states)) - - -# =================================================================== -# Decoder Layer -# =================================================================== - - -class Qwen3OmniCodePredictorDecoderLayer(nn.Module): - """Transformer decoder layer (SDPA, no KV cache).""" - - def __init__( - self, - config, - prefix: str = "", - ) -> None: - super().__init__() - self.self_attn = Qwen3OmniCodePredictorAttention( - config, - prefix=f"{prefix}.self_attn", - ) - self.mlp = 
Qwen3OmniCodePredictorMLP( - config, - prefix=f"{prefix}.mlp", - ) - cp_cfg = config.code_predictor_config - self.input_layernorm = _RMSNorm(cp_cfg.hidden_size, eps=cp_cfg.rms_norm_eps) - self.post_attention_layernorm = _RMSNorm(cp_cfg.hidden_size, eps=cp_cfg.rms_norm_eps) - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - ) -> torch.Tensor: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - hidden_states = self.self_attn(hidden_states, position_embeddings) - hidden_states = residual + hidden_states +from vllm_omni.model_executor.models.common.qwen3_code_predictor import ( + CodePredictorWrapper, + CodePredictorWrapperConfig, +) - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - return hidden_states +class Qwen3OmniMoeTalkerCodePredictor(CodePredictorWrapper): + """Qwen3-Omni code predictor (no CUDA graphs, VocabParallelEmbedding).""" -# =================================================================== -# Base Transformer Model (re-prefill, no KV cache) -# =================================================================== - - -class Qwen3OmniCodePredictorBaseModel(nn.Module): - """Inner transformer for code predictor. - - Signature: ``forward(inputs_embeds, position_ids) -> hidden_states`` - -- plain Tensor in, plain Tensor out (no namedtuple). - """ - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config.code_predictor_config - self.config = config - - self.codec_embedding = nn.ModuleList( - [VocabParallelEmbedding(config.vocab_size, config.hidden_size) for _ in range(config.num_code_groups - 1)] - ) - - self.layers = nn.ModuleList( - [ - Qwen3OmniCodePredictorDecoderLayer( - vllm_config.model_config.hf_config, - prefix=f"{prefix}.layers.{idx}", - ) - for idx in range(config.num_hidden_layers) - ] - ) - self.norm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = _RotaryEmbedding(config) - - def forward( - self, - inputs_embeds: torch.Tensor, - position_ids: torch.Tensor, - ) -> torch.Tensor: - hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) - for layer in self.layers: - hidden_states = layer(hidden_states, position_embeddings) - hidden_states = self.norm(hidden_states) - return hidden_states - - -# =================================================================== -# Code Predictor Wrapper (optimized re-prefill, persistent buffers) -# =================================================================== - - -class Qwen3OmniMoeTalkerCodePredictor(nn.Module): - """Optimized code predictor -- re-prefill approach, no KV cache. - - Each AR step forwards the full growing sequence (len 2 -> num_code_groups+1) - through the transformer. The extra O(T^2) FLOPs are negligible for - short sequences, and this avoids all KV-cache management overhead. - - Optimizations: - 1. Per-call embedding buffer -- avoids cross-request aliasing. - 2. Pre-allocated position_ids -- no torch.arange per step. - 3. Cached module references -- bypass ModuleList indexing. - 4. torch.compile on inner transformer. - 5. Inline sampling (top-k + top-p) -- no custom op overhead. 
- """ - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - self.config = config - self.quant_config = vllm_config.quant_config - self.prefix = prefix - - self.num_code_groups = config.code_predictor_config.num_code_groups - self._hidden_size = config.code_predictor_config.hidden_size - - self.model = Qwen3OmniCodePredictorBaseModel( + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + cp_config = vllm_config.model_config.hf_config.code_predictor_config + super().__init__( vllm_config=vllm_config, + cp_config=cp_config, + wrapper_config=CodePredictorWrapperConfig( + use_cuda_graphs=False, + use_parallel_embedding=True, + use_projection=False, + return_proj_buf=True, + sampling_mode="stored", + ), + talker_hidden_size=cp_config.hidden_size, prefix=prefix, ) - - # One lm_head per residual layer (layers 1 .. G-1) - self.lm_head = nn.ModuleList( - [ - nn.Linear( - config.code_predictor_config.hidden_size, - config.code_predictor_config.vocab_size, - bias=False, - ) - for _ in range(self.num_code_groups - 1) - ] - ) - - self.set_sampling_params() - - # Lazily initialised position ids (read-only, safe to persist) - self._pos_ids: torch.Tensor | None = None - - # Cached plain-list refs (set once) - self._lm_heads: list | None = None - self._codec_embeds: list | None = None - - # Model forward (optionally compiled) - self._model_fwd: object | None = None - - def set_sampling_params(self, top_k: int = 50, top_p: float = 0.8): - """Configure sampling parameters to maintain consistency with previous implementation.""" - self._top_k = top_k - self._top_p = top_p - logger.debug(f"Sampling parameters updated: top_k={top_k}, top_p={top_p}s") - - # ------------------------------------------------------------------ - # Lazy-init helpers - # ------------------------------------------------------------------ - - def _ensure_pos_ids(self, device: torch.device) -> None: - if self._pos_ids is not None and self._pos_ids.device == device: - return - max_seq = self.num_code_groups + 1 - # [1, max_seq] for HF-style RoPE (will be expanded to [bsz, seq_len] at use) - self._pos_ids = torch.arange(max_seq, dtype=torch.long, device=device).unsqueeze(0) - - def _ensure_cached_refs(self) -> None: - if self._lm_heads is not None: - return - self._lm_heads = list(self.lm_head) - self._codec_embeds = list(self.model.codec_embedding) - - def _ensure_model_fwd(self) -> None: - if self._model_fwd is not None: - return - if current_omni_platform.supports_torch_inductor(): - # torch.compile fuses RMSNorm/RoPE in ways that lose float32 - # precision, compounding across AR steps. Use epilogue_fusion=False - # to disable the problematic fusions while still getting kernel - # fusion benefits for the linear layers and SDPA. 
- self._model_fwd = torch.compile( - self.model.forward, - dynamic=True, - options={ - "epilogue_fusion": False, - }, - ) - logger.info("code_predictor: torch.compile enabled (no epilogue fusion)") - else: - self._model_fwd = self.model.forward - logger.info("code_predictor: using eager mode (no torch.compile)") - - # ------------------------------------------------------------------ - # Forward -- re-prefill + inline sampling - # ------------------------------------------------------------------ - - @torch.inference_mode() - def forward( - self, - layer0_code: torch.Tensor, - layer0_embed: torch.Tensor, - last_talker_hidden: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - """Predict residual codebooks 1..G-1 autoregressively via re-prefill. - - Args: - layer0_code: [bsz, 1] int64 - layer0_embed: [bsz, 1, hidden_size] - last_talker_hidden: [bsz, 1, hidden_size] - - Returns: - all_codes: [bsz, num_code_groups, 1] - proj_buf: [bsz, num_code_groups + 1, hidden_size] - pos 0 = last_talker_hidden (NOT a codec embed) - pos 1 = layer0_embed - pos 2.. = `codec_embedding[i](predicted_code_i)` - """ - bsz = int(layer0_code.shape[0]) - device = layer0_code.device - dtype = last_talker_hidden.dtype - num_groups = self.num_code_groups - - # Lazy init (read-only caches only) - self._ensure_pos_ids(device) - self._ensure_model_fwd() - self._ensure_cached_refs() - - # Allocate proj_buf locally each call to avoid cross-call aliasing - max_seq = num_groups + 1 - proj_buf = torch.zeros(bsz, max_seq, self._hidden_size, dtype=dtype, device=device) - pos_ids = self._pos_ids - model_fwd = self._model_fwd - lm_heads = self._lm_heads - codec_embeds = self._codec_embeds - - # Output codes - all_codes = torch.empty(bsz, num_groups, 1, dtype=torch.int64, device=device) - all_codes[:, 0] = layer0_code - - # Fill buffer positions 0 & 1 - proj_buf[:bsz, 0:1, :] = last_talker_hidden - proj_buf[:bsz, 1:2, :] = layer0_embed - - # Autoregressive loop: predict layers 1..G-1 - for step in range(1, num_groups): - seq_len = step + 1 - projected = proj_buf[:bsz, :seq_len, :] - # position_ids: [batch, seq_len] for HF-style RoPE - step_pos_ids = pos_ids[:, :seq_len].expand(bsz, -1) - - hidden_out = model_fwd(projected, step_pos_ids) - - # Inline sampling: top-k -> top-p -> softmax -> multinomial - logits = lm_heads[step - 1](hidden_out[:, -1, :]) # [bsz, vocab] - if self._top_k > 0: - topk_vals, _ = logits.topk(self._top_k, dim=-1) - logits = logits.masked_fill(logits < topk_vals[:, -1:], float("-inf")) - if self._top_p < 1.0: - sorted_logits, sorted_idx = logits.sort(dim=-1, descending=True) - cumulative_probs = F.softmax(sorted_logits, dim=-1).cumsum(dim=-1) - # Remove tokens with cumulative probability above top_p - remove_mask = cumulative_probs - F.softmax(sorted_logits, dim=-1) >= self._top_p - sorted_logits[remove_mask] = float("-inf") - logits = sorted_logits.scatter(1, sorted_idx, sorted_logits) - probs = F.softmax(logits, dim=-1) - code = torch.multinomial(probs, num_samples=1) # [bsz, 1] - - all_codes[:, step] = code - - # Embed predicted code -> next buffer position - new_embed = codec_embeds[step - 1](code) # [batch, 1, hidden_size] - proj_buf[:bsz, step + 1 : step + 2, :] = new_embed - - return all_codes, proj_buf[:bsz] - - # ------------------------------------------------------------------ - # Weight loading - # ------------------------------------------------------------------ - - def load_weights(self, weights: list[tuple[str, torch.Tensor]]) -> set[str]: - """Load weights directly (no fused projection 
remapping needed). - - Since we use separate nn.Linear for q/k/v/o and gate/up/down, - weight names match the HF checkpoint directly. - """ - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - - for name, loaded_weight in weights: - # Skip rotary embeddings - if "rotary_emb.inv_freq" in name: - continue - - param = params_dict.get(name) - if param is None: - continue - - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - - return loaded_params diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py index 1e84eaebaa..8d2f0686ae 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py @@ -1,318 +1,27 @@ +"""Qwen3-TTS Code Predictor -- thin wrapper over CodePredictorWrapper.""" + from __future__ import annotations from collections.abc import Iterable import torch -import torch.nn as nn -import torch.nn.functional as F from vllm.config import VllmConfig from vllm.config.vllm import set_current_vllm_config -from vllm.logger import init_logger -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, -) -from vllm_omni.platforms import current_omni_platform +from vllm_omni.model_executor.models.common.qwen3_code_predictor import ( + CodePredictorBaseModel, + CodePredictorWrapper, + CodePredictorWrapperConfig, +) from .configuration_qwen3_tts import Qwen3TTSTalkerCodePredictorConfig, Qwen3TTSTalkerConfig -logger = init_logger(__name__) - - -# =================================================================== -# HF-numerics-compatible layers for code predictor -# =================================================================== -# -# These use plain PyTorch ops (nn.Linear, manual RMSNorm in float32, -# rotate_half RoPE) to produce outputs numerically identical to the -# HuggingFace reference. vLLM's fused kernels (RMSNorm, QKVParallel, -# get_rope) introduce small precision differences that compound across -# the 15 autoregressive steps of the code predictor, causing severe -# audio quality degradation (UTMOS ~4.26 → ~2.66). -# -# See: https://github.com/vllm-project/vllm-omni/issues/2274 - - -class _RMSNorm(nn.Module): - """RMSNorm matching HuggingFace's Qwen3TTSRMSNorm exactly. - - Computes variance in float32 to avoid bfloat16 precision loss. - """ - - def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -def _rotate_half(x: torch.Tensor) -> torch.Tensor: - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -class _RotaryEmbedding(nn.Module): - """RoPE matching HuggingFace's Qwen3TTSRotaryEmbedding exactly. - - Forces float32 computation for cos/sin, matching HF's torch.autocast(enabled=False). 
- """ - - def __init__(self, config: Qwen3TTSTalkerCodePredictorConfig) -> None: - super().__init__() - head_dim = getattr( - config, - "head_dim", - config.hidden_size // config.num_attention_heads, - ) - # Standard default RoPE - rope_theta = getattr(config, "rope_theta", 10000.0) - inv_freq = 1.0 / (rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - def forward(self, x: torch.Tensor, position_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - # position_ids: [batch, seq_len] - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - - # Force float32 (matching HF) - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -class _CodePredictorAttention(nn.Module): - """Standalone multi-head attention for code predictor. - - Uses F.scaled_dot_product_attention with HF-compatible RoPE and RMSNorm. - Input: [B, seq_len, hidden_size], output: [B, seq_len, hidden_size]. - """ - - def __init__( - self, - config: Qwen3TTSTalkerCodePredictorConfig, - *, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.num_kv_heads = config.num_key_value_heads - self.head_dim = getattr( - config, - "head_dim", - config.hidden_size // config.num_attention_heads, - ) - self.scaling = self.head_dim**-0.5 - self._use_gqa = self.num_kv_heads != self.num_heads - - # Separate q/k/v projections matching HF (no fused packing) - self.q_proj = nn.Linear( - self.hidden_size, - self.num_heads * self.head_dim, - bias=getattr(config, "attention_bias", False), - ) - self.k_proj = nn.Linear( - self.hidden_size, - self.num_kv_heads * self.head_dim, - bias=getattr(config, "attention_bias", False), - ) - self.v_proj = nn.Linear( - self.hidden_size, - self.num_kv_heads * self.head_dim, - bias=getattr(config, "attention_bias", False), - ) - self.o_proj = nn.Linear( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - ) - self.q_norm = _RMSNorm(self.head_dim, eps=config.rms_norm_eps) - self.k_norm = _RMSNorm(self.head_dim, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - ) -> torch.Tensor: - bsz, seq_len, _ = hidden_states.shape - hidden_shape_q = (bsz, seq_len, self.num_heads, self.head_dim) - hidden_shape_kv = (bsz, seq_len, self.num_kv_heads, self.head_dim) - - q = self.q_norm(self.q_proj(hidden_states).view(hidden_shape_q)).transpose(1, 2) - k = self.k_norm(self.k_proj(hidden_states).view(hidden_shape_kv)).transpose(1, 2) - v = self.v_proj(hidden_states).view(hidden_shape_kv).transpose(1, 2) - - cos, sin = position_embeddings - # cos/sin are [batch, seq_len, head_dim], need unsqueeze at dim=1 for heads - cos = cos.unsqueeze(1) # [batch, 1, seq_len, head_dim] - sin = sin.unsqueeze(1) - q = (q * cos) + (_rotate_half(q) * sin) - k = (k * cos) + (_rotate_half(k) * sin) - - attn_out = F.scaled_dot_product_attention( - q, - k, - v, - scale=self.scaling, - is_causal=True, - enable_gqa=self._use_gqa, - ) - - 
attn_out = attn_out.transpose(1, 2).reshape(bsz, seq_len, -1) - output = self.o_proj(attn_out) - return output - - -class _CodePredictorMLP(nn.Module): - """SiLU-gated MLP for code predictor, matching HF's Qwen3TTSTalkerTextMLP.""" - - def __init__( - self, - config: Qwen3TTSTalkerCodePredictorConfig, - *, - prefix: str = "", - ) -> None: - super().__init__() - self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) - self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) - self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x)) - - -class _CodePredictorDecoderLayer(nn.Module): - """Transformer decoder layer for code predictor (SDPA, no KV cache).""" - - def __init__( - self, - config: Qwen3TTSTalkerCodePredictorConfig, - *, - prefix: str = "", - ) -> None: - super().__init__() - self.self_attn = _CodePredictorAttention(config, prefix=f"{prefix}.self_attn") - self.mlp = _CodePredictorMLP(config, prefix=f"{prefix}.mlp") - self.input_layernorm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - ) -> torch.Tensor: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - hidden_states = self.self_attn(hidden_states, position_embeddings) - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - return hidden_states - - -# =================================================================== -# Code Predictor Transformer Model -# =================================================================== - - -class Qwen3TTSTalkerCodePredictorModelVLLM(nn.Module): - """Transformer model for the code predictor (re-prefill, no KV cache).""" - - def __init__( - self, - config: Qwen3TTSTalkerCodePredictorConfig, - *, - talker_hidden_size: int | None = None, - prefix: str = "", - ) -> None: - super().__init__() - self.config = config - - self.layers = nn.ModuleList( - [_CodePredictorDecoderLayer(config, prefix=f"{prefix}.layers.{i}") for i in range(config.num_hidden_layers)] - ) - self.norm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = _RotaryEmbedding(config) - - # Codec embeddings: one per residual group. Stored in talker hidden dim - # (some checkpoints use talker_hidden_size != code_predictor hidden_size). 
- emb_dim = int(talker_hidden_size) if talker_hidden_size is not None else int(config.hidden_size) - self.codec_embedding = nn.ModuleList( - [nn.Embedding(config.vocab_size, emb_dim) for _ in range(config.num_code_groups - 1)] - ) - - def get_input_embeddings(self) -> nn.ModuleList: - return self.codec_embedding - - def forward( - self, - inputs_embeds: torch.Tensor, - position_ids: torch.Tensor, - ) -> torch.Tensor: - hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) - for layer in self.layers: - hidden_states = layer(hidden_states, position_embeddings) - hidden_states = self.norm(hidden_states) - return hidden_states - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - param = params_dict.get(name) - if param is None: - continue - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -# =================================================================== -# Code Predictor Wrapper (optimized re-prefill + torch.compile) -# =================================================================== - - -class Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM(nn.Module): - """vLLM-native code_predictor for the AR talker (residual codebooks). +# Backward-compat alias used by tests +Qwen3TTSTalkerCodePredictorModelVLLM = CodePredictorBaseModel - Re-prefill approach: each AR step forwards the full growing sequence - through the 5-layer transformer. No KV cache needed. This trades - ~O(T^2) extra attention FLOPs (negligible for T=16, 5 layers) for - zero KV cache management overhead and a simpler execution model. - Uses HF-compatible layers (plain nn.Linear, float32 RMSNorm, rotate_half - RoPE) to ensure numerical fidelity with the reference implementation. - Precision matters here because small errors compound across 15 AR steps. - - Optimizations preserved: - 1. torch.compile on model forward -- fuses small kernel launches. - 2. Pre-allocated embedding buffer [B, max_seq, H] -- no torch.cat per step. - 3. Projection caching -- each token projected once and cached. - 4. Pre-allocated position_ids -- no torch.arange per step. - 5. Inline sampling -- no custom op / forward_context overhead. - 6. Cached module references -- bypass nn.Module.__call__ overhead. - 7. CUDA graphs per batch-size bucket. 
- """ +class Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM(CodePredictorWrapper): + """Qwen3-TTS code predictor (CUDA graphs, per-call sampling, projection).""" def __init__( self, @@ -322,250 +31,24 @@ def __init__( talker_config: Qwen3TTSTalkerConfig, prefix: str = "code_predictor", ) -> None: - super().__init__() - self._vllm_config = vllm_config - self.config = config - self.talker_config = talker_config - - self.model = Qwen3TTSTalkerCodePredictorModelVLLM( - config, + super().__init__( + vllm_config=vllm_config, + cp_config=config, + wrapper_config=CodePredictorWrapperConfig( + use_cuda_graphs=True, + use_parallel_embedding=False, + use_projection=(config.hidden_size != talker_config.hidden_size), + return_proj_buf=False, + sampling_mode="per_call", + ), talker_hidden_size=int(talker_config.hidden_size), - prefix=f"{prefix}.model", + prefix=prefix, ) - - self.lm_head = nn.ModuleList( - [nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in range(config.num_code_groups - 1)] - ) - - if config.hidden_size != talker_config.hidden_size: - self.small_to_mtp_projection = nn.Linear(talker_config.hidden_size, config.hidden_size, bias=True) - else: - self.small_to_mtp_projection = nn.Identity() - - self._num_groups = int(config.num_code_groups) - self._talker_hidden = int(talker_config.hidden_size) - self._cp_hidden = int(config.hidden_size) - - # Pre-allocated buffers (lazily initialized on first forward). - self._proj_buf: torch.Tensor | None = None - self._model_dtype: torch.dtype | None = None - - # torch.compile + warmup state (lazily initialized in _setup_compile). - self._compiled_model_fwd = None - self._bucket_sizes: list[int] = [] - self._bucket_pos_ids: dict[int, torch.Tensor] = {} - self._lm_heads_list: list[nn.Module] | None = None - self._codec_embeds_list: list[nn.Module] | None = None - self._cuda_graphs: dict[int, tuple[torch.cuda.CUDAGraph, torch.Tensor]] = {} - - def get_input_embeddings(self) -> nn.ModuleList: - return self.model.get_input_embeddings() + # Store talker_config for backward compat (accessed by some callers) + self.talker_config = talker_config + self._vllm_config = vllm_config def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load weights with vllm config context (required for VocabParallelEmbedding).""" with set_current_vllm_config(self._vllm_config): - loaded: set[str] = set() - model_weights: list[tuple[str, torch.Tensor]] = [] - other_weights: list[tuple[str, torch.Tensor]] = [] - for name, w in weights: - if name.startswith("model."): - model_weights.append((name[len("model.") :], w)) - else: - other_weights.append((name, w)) - - loaded_model = self.model.load_weights(model_weights) - loaded |= {f"model.{n}" for n in loaded_model} - - params = dict(self.named_parameters(remove_duplicate=False)) - for name, w in other_weights: - if name not in params: - continue - default_weight_loader(params[name], w) - loaded.add(name) - - return loaded - - # ------------------------------------------------------------------ - # Pre-allocated buffer management - # ------------------------------------------------------------------ - - def _ensure_buffers(self, device: torch.device, dtype: torch.dtype) -> None: - max_seq = self._num_groups + 1 - if self._proj_buf is not None and self._proj_buf.device == device and self._proj_buf.dtype == dtype: - return - max_bsz = self._vllm_config.scheduler_config.max_num_seqs - self._proj_buf = torch.zeros( - max_bsz, - max_seq, - self._cp_hidden, - dtype=dtype, - 
device=device, - ) - - def _setup_compile(self) -> None: - """Lazily set up torch.compile with manual CUDA graph capture.""" - if self._compiled_model_fwd is not None: - return - # Cache model parameter dtype so forward() doesn't need to query it - # on every call. Also ensures warmup buffers match model precision - # even when upstream modules produce a different dtype (#2385). - self._model_dtype = next(self.model.parameters()).dtype - self._lm_heads_list = list(self.lm_head) - self._codec_embeds_list = list(self.model.codec_embedding) - if not current_omni_platform.supports_torch_inductor(): - logger.warning_once("code_predictor: torch.compile disabled") - self._compiled_model_fwd = self.model.forward - return - - # torch.compile fuses RMSNorm/RoPE in ways that lose float32 - # precision, compounding across 15 AR steps. Use torch.compile - # with options that disable the problematic fusions while still - # getting kernel fusion benefits for the linear layers and SDPA. - self._compiled_model_fwd = torch.compile( - self.model.forward, - dynamic=False, - options={ - "epilogue_fusion": False, - }, - ) - self._warmup_buckets() - self._capture_cuda_graphs() - logger.info("code_predictor: torch.compile (no epilogue fusion) + CUDA graphs") - - def _padded_bsz(self, bsz: int) -> int: - for bucket in self._bucket_sizes: - if bsz <= bucket: - return bucket - return bsz - - def _warmup_buckets(self) -> None: - """Warmup power-of-2 batch-size buckets to front-load Inductor compilation.""" - max_bsz = self._vllm_config.scheduler_config.max_num_seqs - bucket_sizes = [1 << i for i in range(max_bsz.bit_length()) if (1 << i) <= max_bsz] - if max_bsz not in bucket_sizes: - bucket_sizes.append(max_bsz) - self._bucket_sizes = sorted(bucket_sizes) - - max_seq = self._num_groups + 1 - device = next(self.model.parameters()).device - - # Ensure proj_buf matches model parameter dtype to avoid dtype - # mismatch during warmup compilation (see #2385). 
- self._ensure_buffers(device, self._model_dtype) - proj_buf = self._proj_buf - for bsz in self._bucket_sizes: - # position_ids: [batch, seq_len] for HF-style RoPE - pos_ids = torch.arange(max_seq, device=device, dtype=torch.long).unsqueeze(0).expand(bsz, -1) - self._bucket_pos_ids[bsz] = pos_ids - for _ in range(3): - self._compiled_model_fwd(proj_buf[:bsz, :max_seq, :], pos_ids) - logger.info("code_predictor: warmup done for buckets %s", self._bucket_sizes) - - def _capture_cuda_graphs(self) -> None: - """Capture a CUDA graph per bucket using vLLM's global graph pool.""" - from vllm.platforms import current_platform - - pool = current_platform.get_global_graph_pool() - - max_seq = self._num_groups + 1 - proj_buf = self._proj_buf - - for bsz in self._bucket_sizes: - static_input = proj_buf[:bsz, :max_seq, :] - pos_ids = self._bucket_pos_ids[bsz] - - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g, pool=pool): - static_output = self._compiled_model_fwd(static_input, pos_ids) - - self._cuda_graphs[bsz] = (g, static_output) - - logger.info("code_predictor: captured CUDA graphs for buckets %s", self._bucket_sizes) - - # ------------------------------------------------------------------ - # Optimized forward: re-prefill + torch.compile + projection cache - # ------------------------------------------------------------------ - - @torch.inference_mode() - def forward( - self, - layer0_code: torch.Tensor, - layer0_embed: torch.Tensor, - last_talker_hidden: torch.Tensor, - do_sample: bool = True, - temperature: float = 0.9, - top_k: int = 50, - top_p: float = 1.0, - ) -> torch.Tensor: - """Predict residual codebooks 1..Q-1 autoregressively via re-prefill. - - torch.compile fuses the ~60 small kernel launches per step into fewer - fused kernels, reducing kernel launch overhead by ~75%. - - Projection caching: each token is projected once via small_to_mtp_projection - and cached in _proj_buf, avoiding redundant re-projection of past tokens. - """ - bsz = int(layer0_code.shape[0]) - num_groups = self._num_groups - device = layer0_code.device - - all_codes = torch.empty(bsz, num_groups, dtype=torch.long, device=device) - all_codes[:, 0] = layer0_code.reshape(bsz) - - # _setup_compile caches _model_dtype on first call; use it for buffers - # so they always match model weight precision (#2385). - self._setup_compile() - dtype = self._model_dtype - self._ensure_buffers(device, dtype) - - proj_buf = self._proj_buf - max_seq = self._num_groups + 1 - - projection = self.small_to_mtp_projection - model_fwd = self._compiled_model_fwd - lm_heads = self._lm_heads_list - codec_embeds = self._codec_embeds_list - - use_sampling = do_sample and temperature > 0 - inv_temperature = 1.0 / max(temperature, 1e-6) if use_sampling else 0.0 - if use_sampling and top_p != 1.0: - raise NotImplementedError( - "top_p sampling is not implemented for the vLLM-native code predictor; please set top_p=1.0." - ) - - padded_bsz = self._padded_bsz(bsz) - proj_buf[:padded_bsz].zero_() - - proj_buf[:bsz, 0, :] = projection(last_talker_hidden.reshape(bsz, 1, -1).to(dtype)).reshape(bsz, -1) - proj_buf[:bsz, 1, :] = projection(layer0_embed.reshape(bsz, 1, -1).to(dtype)).reshape(bsz, -1) - full_pos_ids = self._bucket_pos_ids.get(padded_bsz) - if full_pos_ids is None: - full_pos_ids = torch.arange(max_seq, device=device, dtype=torch.long).unsqueeze(0).expand(padded_bsz, -1) - - # Use captured CUDA graph if available, otherwise call compiled fn. 
- cuda_graph_entry = self._cuda_graphs.get(padded_bsz) - - for step in range(1, num_groups): - if cuda_graph_entry is not None: - cuda_graph_entry[0].replay() - hidden_out = cuda_graph_entry[1] - else: - hidden_out = model_fwd(proj_buf[:padded_bsz, :max_seq, :], full_pos_ids) - logits = lm_heads[step - 1](hidden_out[:bsz, step, :]) - - if use_sampling: - scaled = logits * inv_temperature - if top_k > 0: - topk_vals, _ = scaled.topk(top_k, dim=-1) - scaled = scaled.masked_fill(scaled < topk_vals[:, -1:], float("-inf")) - probs = F.softmax(scaled, dim=-1) - next_ids = torch.multinomial(probs, num_samples=1) - else: - next_ids = logits.argmax(dim=-1, keepdim=True) - - all_codes[:, step] = next_ids.reshape(bsz) - - if step < num_groups - 1: - new_embed = codec_embeds[step - 1](next_ids) - proj_buf[:bsz, step + 1, :] = projection(new_embed.reshape(bsz, 1, -1)).reshape(bsz, -1) - - return all_codes + return super().load_weights(weights) From 50ae1de7da006324942715fd5c03d298290065de Mon Sep 17 00:00:00 2001 From: "Y. Fisher" Date: Wed, 15 Apr 2026 15:54:38 +0800 Subject: [PATCH 184/204] [Feature] HunyuanImage3 allow guidance_scale<=1 in DiT stage (#2762) Signed-off-by: KexiongYu --- .../models/hunyuan_image3/hunyuan_image3_transformer.py | 3 ++- .../models/hunyuan_image3/pipeline_hunyuan_image3.py | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py index 327260ee0b..fbdacddaf3 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py @@ -1684,7 +1684,8 @@ def forward( else: attn_output = self.attn(q, k, v) # For o_proj - attn_output = attn_output.view(q.shape[0], -1) + # image_attn may return a non-contiguous tensor; reshape is safe here. + attn_output = attn_output.reshape(q.shape[0], -1) output, _ = self.o_proj(attn_output) output = output.reshape(bsz, q_len, -1) return output, None, past_key_value diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index 2f140b48fc..3de0ab3101 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -6,7 +6,6 @@ from collections.abc import Iterable from typing import Any -import numpy as np import torch import torch.nn as nn from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler @@ -544,7 +543,7 @@ def prepare_model_inputs( generator = [torch.Generator(self.device).manual_seed(seed) for seed in seeds] # 3. apply chat template - cfg_factor = {"gen_text": 1, "gen_image": 2} + cfg_factor = {"gen_text": 1, "gen_image": 1 + int(guidance_scale > 1.0)} bot_task = kwargs.pop("bot_task", "auto") # If `drop_think` enabled, always drop parts in the context. 
drop_think = kwargs.get("drop_think", self.generation_config.drop_think) @@ -1009,8 +1008,7 @@ def forward( if req.sampling_params.guidance_scale_provided: guidance_scale = req.sampling_params.guidance_scale if guidance_scale <= 1.0: - logger.warning("HunyuanImage3.0 does not support guidance_scale <= 1.0, will set it to 1.0 + epsilon.") - guidance_scale = 1.0 + np.finfo(float).eps + logger.info("HunyuanImage3.0 runs without classifier-free guidance when guidance_scale <= 1.0.") image_size = (height, width) model_inputs = self.prepare_model_inputs( prompt=prompt, From c6d76d081b3e926ea44bece356889f846445440a Mon Sep 17 00:00:00 2001 From: Zhang Jian Date: Wed, 15 Apr 2026 22:25:41 +0800 Subject: [PATCH 185/204] [Bugfix] Fix broken fp8 quantisation on Z-Image-Turbo, Qwen-Image, FLUX.1-dev (#2795) Signed-off-by: Zhang Co-authored-by: pjh4993 --- .../diffusion/models/flux/flux_transformer.py | 12 ++++-- .../qwen_image/qwen_image_transformer.py | 31 ++++++++----- .../models/z_image/z_image_transformer.py | 43 ++++++++++++++++--- 3 files changed, 66 insertions(+), 20 deletions(-) diff --git a/vllm_omni/diffusion/models/flux/flux_transformer.py b/vllm_omni/diffusion/models/flux/flux_transformer.py index 680b8bfbbe..297c626751 100644 --- a/vllm_omni/diffusion/models/flux/flux_transformer.py +++ b/vllm_omni/diffusion/models/flux/flux_transformer.py @@ -381,7 +381,9 @@ def __init__( super().__init__() self.mlp_hidden_dim = int(dim * mlp_ratio) - self.norm = AdaLayerNormZeroSingle(dim, quant_config=quant_config, prefix=f"{prefix}.norm") + # Modulation linear kept full precision; shift/scale/gate outputs + # are multiplied into the residual stream every block (see #2728). + self.norm = AdaLayerNormZeroSingle(dim, quant_config=None, prefix=f"{prefix}.norm") self.proj_mlp = ReplicatedLinear( dim, self.mlp_hidden_dim, @@ -563,13 +565,16 @@ def __init__( self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim) self.x_embedder = nn.Linear(in_channels, self.inner_dim) + # Dual-stream blocks kept full precision — FP8 on their joint + # attention path causes noise on FLUX (#2728). Single-stream + # blocks (38 vs 19) still get FP8 for memory savings. self.transformer_blocks = nn.ModuleList( [ FluxTransformerBlock( dim=self.inner_dim, num_attention_heads=num_attention_heads, attention_head_dim=attention_head_dim, - quant_config=quant_config, + quant_config=None, prefix=f"transformer_blocks.{i}", ) for i in range(num_layers) @@ -589,12 +594,13 @@ def __init__( ] ) + # Final modulation feeds proj_out; keep full precision (see #2728). self.norm_out = AdaLayerNormContinuous( self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6, - quant_config=quant_config, + quant_config=None, prefix="norm_out", ) self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True) diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py index 9f16d8808c..88a66d7f6b 100644 --- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py +++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py @@ -169,12 +169,15 @@ def __init__( self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000) self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) + # Time embedding MLP is kept full precision (quant_config=None) — + # small layers that feed per-block modulation; precision-sensitive + # (see #2728). 
self.timestep_embedder.linear_1 = ReplicatedLinear( 256, embedding_dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="timestep_embedder.linear_1", ) self.timestep_embedder.linear_2 = ReplicatedLinear( @@ -182,7 +185,7 @@ def __init__( embedding_dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="timestep_embedder.linear_2", ) self.use_additional_t_cond = use_additional_t_cond @@ -701,7 +704,10 @@ def __init__( self.num_attention_heads = num_attention_heads self.attention_head_dim = attention_head_dim - # Image processing modules + # Image processing modules. + # Modulation linear is kept full precision (quant_config=None) — it + # produces shift/scale/gate values that are precision-sensitive + # (see #2728). self.img_mod = nn.Sequential( nn.SiLU(), ReplicatedLinear( @@ -709,7 +715,7 @@ def __init__( 6 * dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="img_mod.1", ), ) @@ -725,7 +731,7 @@ def __init__( self.img_norm2 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) self.img_mlp = FeedForward(dim=dim, dim_out=dim, quant_config=quant_config, prefix="img_mlp") - # Text processing modules + # Text processing modules. self.txt_mod = nn.Sequential( nn.SiLU(), ReplicatedLinear( @@ -733,7 +739,7 @@ def __init__( 6 * dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="txt_mod.1", ), ) @@ -963,12 +969,14 @@ def __init__( self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6) + # Entry projections (image/text) are kept full precision — + # small sensitive layers at the network boundary (see #2728). self.img_in = ReplicatedLinear( in_channels, self.inner_dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="img_in", ) self.txt_in = ReplicatedLinear( @@ -976,7 +984,7 @@ def __init__( self.inner_dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="txt_in", ) @@ -993,13 +1001,16 @@ def __init__( ] ) + # Final modulation and output projection are kept full precision — + # they produce the output latent and are precision-sensitive + # (see #2728). self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6) self.norm_out.linear = ReplicatedLinear( self.inner_dim, 2 * self.inner_dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="norm_out.linear", ) self.proj_out = ReplicatedLinear( @@ -1007,7 +1018,7 @@ def __init__( patch_size * patch_size * self.out_channels, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="proj_out", ) diff --git a/vllm_omni/diffusion/models/z_image/z_image_transformer.py b/vllm_omni/diffusion/models/z_image/z_image_transformer.py index 3ffad221ba..c36ea74665 100644 --- a/vllm_omni/diffusion/models/z_image/z_image_transformer.py +++ b/vllm_omni/diffusion/models/z_image/z_image_transformer.py @@ -214,12 +214,14 @@ def __init__( super().__init__() if mid_size is None: mid_size = out_size + # Time embedding MLP is kept full precision (quant_config=None) — + # small layers that feed adaLN; precision-sensitive (see #2728). 
self.mlp = nn.Sequential( ReplicatedLinear( frequency_embedding_size, mid_size, bias=True, - quant_config=quant_config, + quant_config=None, return_bias=False, ), nn.SiLU(), @@ -227,7 +229,7 @@ def __init__( mid_size, out_size, bias=True, - quant_config=quant_config, + quant_config=None, return_bias=False, ), ) @@ -426,9 +428,16 @@ def __init__( self.modulation = modulation if modulation: + # Modulation linear is kept at full precision (quant_config=None) + # — it produces scale/gate values that are precision-sensitive + # (see #2728, mirrors OmniGen2 fix). self.adaLN_modulation = nn.Sequential( ReplicatedLinear( - min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True, return_bias=False, quant_config=quant_config + min(dim, ADALN_EMBED_DIM), + 4 * dim, + bias=True, + quant_config=None, + return_bias=False, ), ) @@ -485,14 +494,24 @@ class FinalLayer(nn.Module): def __init__(self, hidden_size, out_channels, quant_config: "QuantizationConfig | None" = None): super().__init__() self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + # Final output projection and its modulation are precision-sensitive + # (produce the output latent); keep at full precision (see #2728). self.linear = ReplicatedLinear( - hidden_size, out_channels, bias=True, quant_config=quant_config, return_bias=False + hidden_size, + out_channels, + bias=True, + quant_config=None, + return_bias=False, ) self.adaLN_modulation = nn.Sequential( nn.SiLU(), ReplicatedLinear( - min(hidden_size, ADALN_EMBED_DIM), hidden_size, bias=True, quant_config=quant_config, return_bias=False + min(hidden_size, ADALN_EMBED_DIM), + hidden_size, + bias=True, + quant_config=None, + return_bias=False, ), ) @@ -673,11 +692,13 @@ def __init__( all_x_embedder = {} all_final_layer = {} for patch_idx, (patch_size, f_patch_size) in enumerate(zip(all_patch_size, all_f_patch_size)): + # x_embedder (patch embed) is a small precision-sensitive entry + # layer; keep full precision (see #2728). x_embedder = ReplicatedLinear( f_patch_size * patch_size * patch_size * in_channels, dim, bias=True, - quant_config=quant_config, + quant_config=None, return_bias=False, ) all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder @@ -720,9 +741,17 @@ def __init__( ] ) self.t_embedder = TimestepEmbedder(min(dim, ADALN_EMBED_DIM), mid_size=1024, quant_config=quant_config) + # Caption embedder maps text features -> hidden; keep full precision + # (see #2728). 
self.cap_embedder = nn.Sequential( RMSNorm(cap_feat_dim, eps=norm_eps), - ReplicatedLinear(cap_feat_dim, dim, bias=True, return_bias=False, quant_config=quant_config), + ReplicatedLinear( + cap_feat_dim, + dim, + bias=True, + quant_config=None, + return_bias=False, + ), ) self.x_pad_token = nn.Parameter(torch.empty((1, dim))) From f1e3f037265852b952cef654c489182bf7c26686 Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Wed, 15 Apr 2026 11:01:45 -0600 Subject: [PATCH 186/204] [feature] Hidden State Prefix Caching (#2164) Signed-off-by: Alex Brooks --- docs/.nav.yml | 1 + docs/design/feature/prefix_caching.md | 164 +++++++++ tests/conftest.py | 6 + tests/core/test_prefix_cache.py | 347 ++++++++++++++++++++ tests/e2e/online_serving/test_qwen3_omni.py | 75 ++++- vllm_omni/core/prefix_cache.py | 264 +++++++++++++++ vllm_omni/utils/mm_outputs.py | 93 ++++++ vllm_omni/worker/gpu_ar_model_runner.py | 202 +++++++++--- vllm_omni/worker/gpu_model_runner.py | 59 +++- 9 files changed, 1144 insertions(+), 67 deletions(-) create mode 100644 docs/design/feature/prefix_caching.md create mode 100644 tests/core/test_prefix_cache.py create mode 100644 vllm_omni/core/prefix_cache.py create mode 100644 vllm_omni/utils/mm_outputs.py diff --git a/docs/.nav.yml b/docs/.nav.yml index 441ef9f521..79d7c38e27 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -98,6 +98,7 @@ nav: - design/feature/disaggregated_inference.md - design/feature/ray_based_execution.md - design/feature/omni_connectors/ + - design/feature/prefix_caching.md - design/feature/cfg_parallel.md - design/feature/expert_parallel.md - design/feature/sequence_parallel.md diff --git a/docs/design/feature/prefix_caching.md b/docs/design/feature/prefix_caching.md new file mode 100644 index 0000000000..ebad8b6910 --- /dev/null +++ b/docs/design/feature/prefix_caching.md @@ -0,0 +1,164 @@ +# Automatic Prefix Caching in Omni Models + + +--- + +## Table of Contents + +- [Overview](#overview) +- [High-Level Approach](#high-level-approach) +- [Example](#example) +- [What About Multimodal Inputs?](#what-about-multimodal-inputs) + +--- + +### Overview + +Prefix caching in the context of kv-cache management is a useful optimization for avoiding redundant computations. The main idea is that we store portions of the kv-cache from processed requests, so that we can reuse them if incoming requests have the same prefix as previous requests. + +vLLM manages the kv-cache as blocks, which represent a span of tokens of a fixed length. Blocks are hashable by the content that they contain, which typically means the tokens within the span, but also could be influenced by other factors, e.g., LoRA and multimodal data. + +vLLM implements automatic prefix caching for managing its kv-cache, which is best understood by reading the design document [here](https://docs.vllm.ai/en/latest/design/prefix_caching/). vLLM-Omni builds on top of the prefix caching mechanism in a noninvasive way to allow caching between stages in Omni pipelines. This typically means for a given stage we aim to support caching for the following: + +- The last hidden states produced by the stage +- Model / stage specific multimodal data + +!!! note "Note 1" + This document describes vLLM-Omni's mechanism for caching tensor outputs that are meant to be passed between stages, when requests have common prefixes, similar to the way in which vLLM has prefix caching for the kv-cache. This works in conjunction with vLLM's multimodal encoder caching, but is distinct. 
See the final section for a concrete example of how they tie together in practice.
+
+### High-Level Approach
+!!! note "Note 2"
+    Prior to reading this section, it's recommended to take a look at the design documents in vLLM for [Automatic Prefix Caching](https://docs.vllm.ai/en/latest/features/automatic_prefix_caching/), which will make some of the concepts more clear.
+
+The main focus of vLLM-Omni's approach to prefix caching stage outputs is to build on vLLM's prefix caching in the least invasive way possible while minimizing the impact of cache misses and consuming a minimal amount of GPU memory. To understand the implementation, there are a few important things to note:
+
+- Between stages, device tensors are generally moved to CPU; this is important since we're just caching the outputs of stages, so it is okay to keep the entire cache on the CPU.
+
+- For a tensor to be considered cacheable, the first dimension (currently) needs to be the same as the token count, as it allows us to reuse block/slot mappings for our externally maintained tensor caches. This allows us to dynamically discover the tensors to be marked as cacheable outputs in each Omni model without having to explicitly specify cacheable output field names in every model.
+
+With this in mind, consider the set of blocks in a 2D layout, where the rows represent the block indices being considered, and the columns represent the slots corresponding to tokens within each block. Since we know the `num_blocks` and `block_size` from our kv cache config, if we want to cache a tensor with feature size `D`, we can preallocate a CPU tensor of size `(num_blocks, block_size, D)`, and use the same block index and slot mapping to retrieve the corresponding feature vector.
+
+
+### Example
+!!! note "Note 3"
+    Prefix caching in vLLM-Omni is currently only supported on AutoRegressive stages with one kv-cache group. It can be enabled/disabled per-stage via the `enable_prefix_caching` parameter in the model's stage config.
+
+The way in which vLLM-Omni ties into vLLM's prefix caching is best understood by example. Say that we have the following:
+
+- `num_blocks=8`
+- `block_size=4`
+- `hidden_size=2`
+- A stage-specific multimodal output tensor named `mm_feature` with feature dimension `16`
+
+The prefix cache flow is then outlined below.
+
+1. When the model is initialized, we can determine the `hidden_size` from the `ModelConfig`, and allocate a cache of size `(num_blocks, block_size, hidden_size)`.
+
+2. Say we process the request `The quick brown fox was tired and slept beneath the shady tree`, which is 12 tokens and evenly divides into 3 blocks as shown below.
+
+```
+         [ The quick brown fox ] [ was tired and slept ] [beneath the shady tree ]
+Block 1: |<--- block tokens ---->|
+Block 2: |<------- prefix ------>| |<--- block tokens --->|
+Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->|
+```
+
+When the request is processed, we inspect the multimodal outputs and identify the `mm_feature` tensor, which will be of shape `(seq_len, feature_dim)`, i.e., `(12, 16)` in this example. We note that the first axis is dependent on the `seq_len` and add a new cache_tensor of shape `(num_blocks, block_size, feature_dim)` to our multimodal cache for tensors.
+
+
+3. If we lay out the cache as a 2D tensor of shape (`num_blocks`, `block_size`), we'll have something like the following:
+
+```
+0: [ The quick brown fox ]
+1: [ was tired and slept ]
+2: [beneath the shady tree ]
+3: [EMPTY]
+...
+7: [EMPTY]
+```
+
+Or, if we flatten it down to 1D,
+```
+0: The
+1: quick
+2: brown
+3: fox
+...
+11: tree
+12: [EMPTY]
+...
+```
+
+which we can think of as row indices into the hidden states tensor if we view it as the 2D shape `(num_blocks x block_size, feature_dim)`. That is, the analogous flattened (from 3D -> 2D) mapping of the cache for hidden states becomes the following.
+```
+0: <hidden states for 'The'>
+1: <hidden states for 'quick'>
+2: <hidden states for 'brown'>
+3: <hidden states for 'fox'>
+...
+11: <hidden states for 'tree'>
+12: [EMPTY]
+...
+```
+
+Similarly, for the multimodal outputs cache, the flattened coordinates are the same, but the `mm_feature` maps to vectors of length `16` instead of the hidden size of `2`. Note that in practice, we may have multiple multimodal output tensors per forward pass, which may have different names and different feature dimensions.
+
+
+4. Now, say that we receive a new request `The quick brown fox jumped over the dog`.
+
+```
+         [ The quick brown fox ] [ jumped over the dog ]
+Block 1: |<--- block tokens ---->|
+Block 2: |<------- prefix ------>| |<--- block tokens --->|
+```
+
+Here, we will have a cache hit for `Block 1`, which will be detected by vLLM based on the hash of the first block when it handles prefix caching for the kv-cache. As a result, when we get the output from the scheduler, we will see that `num_computed_tokens=4` (corresponding to the cached first block), and we only need to process the remaining 4 new tokens in the new prefill.
+
+Since we have the block indices / slot mappings from the kv cache manager, we can simply mirror the mappings and leverage the same indices for the cached hidden states and multimodal outputs. This allows us to look up the correct tensors from our externally maintained 3D caches.
+
+```
+0: [ The quick brown fox ] < already in the cache
+1: [ was tired and slept ]
+2: [beneath the shady tree ]
+3: [ jumped over the dog ] < added on the second request
+4: [EMPTY]
+...
+7: [EMPTY]
+...
+```
+
+Finally, to pass the full hidden states and multimodal outputs to the next stage, we simply concatenate the cached contents with the corresponding new tensors computed from the current forward call.
+
+
+### What About Multimodal Inputs?
+It's also useful to consider how Omni prefix caching is handled when we have multimodal inputs that don't cleanly end on block boundaries, as well as how this works with multimodal encoder caching in vLLM. For example:
+
+```
+         [ Im0 Im1 Im2 Im3 ] [ Im4 Im5 foo ]
+Block 1: |<--- block tokens ---->|
+Block 2: |<------- prefix ------>| |<--- block tokens --->|
+```
+
+In this case, only `Block 1` will have outputs stored in the prefix tensor cache, because vLLM does not store partial blocks. This may appear to be a problem at first glance, because the multimodal input is fragmented across a new block that wasn't cached.
+
+In reality, this isn't a big problem for correctness, because vLLM also maintains an encoder cache for multimodal inputs. In other words, after the first pass, we'll have the following:
+
+- The Block 1 hash, which is used for prefix caching
+- The hash describing the image data starting at position 0 and with length 6
+- In vLLM's encoder cache, a mapping from the image hash above to the encoder output
+
+
+To understand what happens, say we get the following input as a second request:
+```
+         [ Im0 Im1 Im2 Im3 ] [ Im4 Im5 bar baz ]
+Block 1: |<--- block tokens ---->|
+Block 2: |<------- prefix ------>| |<--- block tokens --->|
+```
+
+First, the scheduler will check for a prefix cache hit, which we will see on `Block 1` (a minimal sketch of this bookkeeping follows).
As a result, we will have 4 tokens marked as precomputed, and only see the remaining 4 tokens in the following prefill. + +Because we have multimodal data in a scheduled span that isn't fully precomputed, we still need to call the visual encoder. However, since we have the image hash and encoder cache, we will retrieve the encoder outputs for `Im4` and `Im5` as we create the multimodal embeddings. + +When we pass our multimodal tensors to the language model component in the same stage, we'll then expect the same outputs, because the prefix caching behaviors in vLLM-Omni / vLLM match, so the LLM will use vLLM's KV cache manager's prefix caching to correctly handle the attention information for `Block 1` while calculating the outputs for `Block 2`, giving us the correct results for processing `Block 2` with the context of `Block 1`. + +Finally, we look up the output hidden states/multimodal tensors corresponding to the prefix cache hit `Block 1` and concatenate it with the forward pass result to get the final result, which is expected to be identical to the full hidden states when prefix caching is disabled. diff --git a/tests/conftest.py b/tests/conftest.py index 098fd8d970..ad1008b726 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1850,6 +1850,7 @@ class OmniResponse: e2e_latency: float | None = None success: bool = False error_message: str | None = None + cached_tokens: int | None = None @dataclass @@ -2345,6 +2346,11 @@ def _process_non_stream_omni_response(self, chat_completion) -> OmniResponse: if hasattr(choice.message, "content") and choice.message.content is not None: text_content = choice.message.content + # Extract cached_tokens for prefix caching tests + usage = getattr(chat_completion, "usage", None) + if usage and (details := getattr(usage, "prompt_tokens_details", None)): + result.cached_tokens = details.cached_tokens + # Calculate end-to-end latency result.e2e_latency = time.perf_counter() - start_time diff --git a/tests/core/test_prefix_cache.py b/tests/core/test_prefix_cache.py new file mode 100644 index 0000000000..c3d8c1ff92 --- /dev/null +++ b/tests/core/test_prefix_cache.py @@ -0,0 +1,347 @@ +from unittest.mock import Mock, patch + +import pytest +import torch + +from vllm_omni.core.prefix_cache import OmniTensorPrefixCache + +DEFAULT_SEQ_LEN = 15 +NUM_BLOCKS = 10 +BLOCK_SIZE = 4 +HIDDEN_SIZE = 2 +DTYPE = torch.float32 +OTHER_DTYPE = torch.float16 +DEFAULT_SHAPE = torch.Size([NUM_BLOCKS, BLOCK_SIZE, HIDDEN_SIZE]) + + +class MockInputBatch: + def __init__(self, num_computed_tokens_cpu): + self.req_ids = ["req1", "req2"] + self.req_id_to_index = {req_id: i for i, req_id in enumerate(self.req_ids)} + self.num_computed_tokens_cpu = num_computed_tokens_cpu + # Block table is only mocked for validation of length; + # we don't actually need to add valid values here since + # we patch the table when testing. 
+ self.block_table = Mock() + self.block_table.block_tables = [None] + + +def get_omni_pcache_with_mm_tensors(feat_dims, seq_len) -> OmniTensorPrefixCache: + """Build an OmniTensorPrefixCache and init mm tensors.""" + cache = get_omni_pcache() + mm_outputs = get_multimodal_outputs(feat_dims, seq_len) + cache.maybe_init_missing_mm_cache_keys(mm_outputs, seq_len) + return cache + + +def get_omni_pcache() -> OmniTensorPrefixCache: + """Build an OmniTensorPrefixCache, but don't init mm tensors.""" + cache = OmniTensorPrefixCache( + num_blocks=NUM_BLOCKS, + block_size=BLOCK_SIZE, + hidden_size=HIDDEN_SIZE, + hs_dtype=DTYPE, + ) + return cache + + +def get_multimodal_outputs(feat_dims: dict[str, int], seq_len: int) -> dict[str, torch.Tensor]: + fake_mm_inputs = {} + for mm_key, feat_dim in feat_dims.items(): + fake_mm_inputs[mm_key] = torch.rand((seq_len, feat_dim), dtype=DTYPE) + return fake_mm_inputs + + +### Tests for initialization +def test_initialization_simple(): + """Check default initialization only creates the hidden states.""" + cache = get_omni_pcache() + assert isinstance(cache.hidden_states_cache, torch.Tensor) + assert cache.hidden_states_cache.shape == DEFAULT_SHAPE + assert len(cache.mm_outputs_cache) == 0 + assert len(cache.mm_cache_keys) == 0 + + +def test_initialization_with_multimodal(): + """Check initialization + registration of multimodal outputs.""" + cache = get_omni_pcache() + feat_dims = {"foo": 100, "bar": 50, "baz": 10} + mm_outputs = get_multimodal_outputs( + feat_dims, + seq_len=DEFAULT_SEQ_LEN, + ) + # Cast one of the keys to a different dtype; the dtype of the tensor + # that is used to initialize the cache dictates the cache dtype. + mm_outputs["foo"] = mm_outputs["foo"].to(OTHER_DTYPE) + + cache.maybe_init_missing_mm_cache_keys(mm_outputs, DEFAULT_SEQ_LEN) + assert len(cache.mm_cache_keys) == 3 + assert set(cache.mm_cache_keys) == set(feat_dims.keys()) + for mm_key in cache.mm_cache_keys: + cache_tensor = cache.mm_outputs_cache[mm_key] + assert isinstance(cache_tensor, torch.Tensor) + assert cache_tensor.shape[-1] == feat_dims[mm_key] + assert mm_outputs[mm_key].dtype == cache_tensor.dtype + + +def test_init_missing_mm_cache_keys_is_idempotent(): + """Ensure that the cache doesn't reinitialize old keys.""" + cache = get_omni_pcache() + mm_key = "foo" + feat_dims = {mm_key: 100} + mm_outputs = get_multimodal_outputs( + feat_dims, + seq_len=DEFAULT_SEQ_LEN, + ) + cache.maybe_init_missing_mm_cache_keys(mm_outputs, DEFAULT_SEQ_LEN) + assert len(cache.mm_cache_keys) == 1 + assert mm_key in cache.mm_cache_keys + + # Cache is initialized to 0 - fill it with 1s + cache.mm_outputs_cache[mm_key].fill_(1) + + # Ensure that running another initialization + # doesn't zero out our cache values + cache.maybe_init_missing_mm_cache_keys(mm_outputs, DEFAULT_SEQ_LEN) + assert len(cache.mm_cache_keys) == 1 + assert mm_key in cache.mm_cache_keys + assert torch.all(cache.mm_outputs_cache[mm_key] == 1) + + +### Tests for Update +def test_update_no_multimodal(): + """Test that slot mappings act as row indices hidden states.""" + cache = get_omni_pcache() + + num_tokens_unpadded = 8 + slot_offset = 8 + slot_mapping = torch.arange(slot_offset, slot_offset + num_tokens_unpadded) + new_hidden_states = torch.rand((num_tokens_unpadded, HIDDEN_SIZE), dtype=DTYPE) + + cache.update_omni_tensor_prefix_cache( + hidden_states=new_hidden_states, + multimodal_outputs=None, + num_tokens_unpadded=num_tokens_unpadded, + slot_mapping=slot_mapping, + ) + + # Ensure that if we reshape our 3D cache back 
to 2D, we can use the + # indices in our slot mappings to access the hidden states as expected + hs_rows = cache.hidden_states_cache.view(NUM_BLOCKS * BLOCK_SIZE, HIDDEN_SIZE) + for slot_idx, new_states in zip(slot_mapping, new_hidden_states): + slot_states = hs_rows[slot_idx] + assert torch.all(slot_states == new_states) + + +@pytest.mark.parametrize( + "feat_dims", + [ + {"foo": 100, "bar": 100}, + {"foo": 100, "bar": 50, "baz": 10}, + ], +) +def test_update_with_multimodal_outputs(feat_dims): + """Test that slot mappings are correct for multimodal tensors.""" + cache = get_omni_pcache_with_mm_tensors(feat_dims, seq_len=DEFAULT_SEQ_LEN) + + num_tokens_unpadded = 8 + slot_offset = 8 + slot_mapping = torch.arange(slot_offset, slot_offset + num_tokens_unpadded) + feature_dims = {key: val.shape[-1] for key, val in cache.mm_outputs_cache.items()} + mm_outputs = {key: torch.rand((num_tokens_unpadded, feature_dims[key]), dtype=DTYPE) for key in cache.mm_cache_keys} + cache.update_omni_tensor_prefix_cache( + hidden_states=None, + multimodal_outputs=mm_outputs, + num_tokens_unpadded=num_tokens_unpadded, + slot_mapping=slot_mapping, + ) + + for mm_key in feat_dims.keys(): + assert mm_key in cache.mm_outputs_cache + key_feat_dim = feature_dims[mm_key] + mm_state_rows = cache.mm_outputs_cache[mm_key].view(NUM_BLOCKS * BLOCK_SIZE, key_feat_dim) + + # Similar to hidden states, but for each key in the dict; + # Different tensors may have different feature dims + new_mm_outputs = mm_outputs[mm_key] + for slot_idx, new_output in zip(slot_mapping, new_mm_outputs): + slot_states = mm_state_rows[slot_idx] + assert torch.all(slot_states == new_output) + + +### Tests for Merging +def fake_get_cached_block_ids(self, req_idx, *args, **kwargs): + """Fake block table lookup. + + Assumption: + req_idx 0 is a cache hit with slots 8, 9, ..., 15 + req_idx 1 is a cache miss + """ + assert req_idx < 2 + if req_idx == 0: + # With the slot offset we provided (8), the corresponding + # blocks IDs are 2 & 3 because the block size is 4. 
+ return torch.tensor([2, 3], dtype=torch.long) + return torch.tensor([], dtype=torch.long) + + +@pytest.mark.parametrize("num_tokens_padded", [None, 16]) +def test_get_merged_hidden_states(num_tokens_padded): + """Ensure that hidden states are merged correctly.""" + cache = get_omni_pcache() + + orig_num_tokens_unpadded = 8 + slot_offset = 8 # We'll put our states in slots 8, 9, 10, ..., 15 + orig_slot_mapping = torch.arange(slot_offset, slot_offset + orig_num_tokens_unpadded) + orig_hidden_states = torch.rand((orig_num_tokens_unpadded, HIDDEN_SIZE), dtype=DTYPE) + + cache.update_omni_tensor_prefix_cache( + hidden_states=orig_hidden_states, + multimodal_outputs=None, + num_tokens_unpadded=orig_num_tokens_unpadded, + slot_mapping=orig_slot_mapping, + num_tokens_padded=num_tokens_padded, + ) + + # Say that we have two requests, but only one of them is a cache hit + num_new_toks_req1 = 3 + num_new_toks_req2 = 2 + cache.add_prefix_cached_new_req_id("req1") + + num_scheduled_tokens = { + "req1": num_new_toks_req1, + "req2": num_new_toks_req2, + } + new_hidden_states = torch.rand( + (num_new_toks_req1 + num_new_toks_req2, HIDDEN_SIZE), + dtype=DTYPE, + ) + req1_new_states = new_hidden_states[:num_new_toks_req1] + req2_new_states = new_hidden_states[-num_new_toks_req2:] + + input_batch = MockInputBatch(num_computed_tokens_cpu=torch.Tensor([orig_num_tokens_unpadded, 0])) + + with patch( + "vllm_omni.core.prefix_cache.OmniTensorPrefixCache._get_cached_block_ids", + new=fake_get_cached_block_ids, + ): + merged_states = cache.get_merged_hidden_states( + query_start_loc=[0, num_new_toks_req1], + input_batch=input_batch, + hidden_states=new_hidden_states, + num_scheduled_tokens=num_scheduled_tokens, + ) + + assert "req1" in merged_states and "req2" in merged_states + req1_merged_states = merged_states["req1"] + req2_merged_states = merged_states["req2"] + + # First, check the cache hit case + assert req1_merged_states.shape == torch.Size([orig_num_tokens_unpadded + num_new_toks_req1, HIDDEN_SIZE]) + # Ensure that the req1 merged states are the cached states + the new req1 states + assert torch.all(req1_merged_states[:orig_num_tokens_unpadded] == orig_hidden_states) + assert torch.all(req1_merged_states[-num_new_toks_req1:] == req1_new_states) + + # Next, ensure that the cache miss case only has the new states + assert req2_merged_states.shape == torch.Size([num_new_toks_req2, HIDDEN_SIZE]) + assert torch.all(req2_merged_states == req2_new_states) + + +@pytest.mark.parametrize("num_tokens_padded", [None, 16]) +@pytest.mark.parametrize( + "feat_dims", + [ + {"foo": 100, "bar": 100}, + {"foo": 100, "bar": 50, "baz": 10}, + ], +) +def test_get_merged_multimodal_outputs(feat_dims, num_tokens_padded): + cache = get_omni_pcache_with_mm_tensors(feat_dims, seq_len=DEFAULT_SEQ_LEN) + + orig_num_tokens_unpadded = 8 + slot_offset = 8 # We'll put our states in slots 8, 9, 10, ..., 15 + orig_slot_mapping = torch.arange(slot_offset, slot_offset + orig_num_tokens_unpadded) + feature_dims = {key: val.shape[-1] for key, val in cache.mm_outputs_cache.items()} + orig_mm_outputs = { + key: torch.rand((orig_num_tokens_unpadded, feature_dims[key]), dtype=DTYPE) for key in cache.mm_cache_keys + } + + cache.update_omni_tensor_prefix_cache( + hidden_states=None, + multimodal_outputs=orig_mm_outputs, + num_tokens_unpadded=orig_num_tokens_unpadded, + slot_mapping=orig_slot_mapping, + num_tokens_padded=num_tokens_padded, + ) + + # Similar to hs test- say that we have two requests, but only one of them is a cache hit + 
num_new_toks_req1 = 3 + num_new_toks_req2 = 2 + cache.add_prefix_cached_new_req_id("req1") + + num_scheduled_tokens = { + "req1": num_new_toks_req1, + "req2": num_new_toks_req2, + } + + new_mm_outputs = {} + for mm_key in cache.mm_cache_keys: + new_mm_outputs[mm_key] = torch.rand( + (num_new_toks_req1 + num_new_toks_req2, feature_dims[mm_key]), + dtype=DTYPE, + ) + # We also want to make sure passthrough data (outside of our keys) isn't dropped + new_mm_outputs["passthrough_data"] = "Something else" + # Lists are a special case because we can't split them yet if we want to match + # the nonprefix cache behavior, because this runs before post process. + new_mm_outputs["passthrough_list"] = ["should", "not", "split"] + + input_batch = MockInputBatch(num_computed_tokens_cpu=torch.Tensor([orig_num_tokens_unpadded, 0])) + + with patch( + "vllm_omni.core.prefix_cache.OmniTensorPrefixCache._get_cached_block_ids", + new=fake_get_cached_block_ids, + ): + merged_mm_outputs = cache.get_merged_multimodal_states( + query_start_loc=[0, num_new_toks_req1], + input_batch=input_batch, + multimodal_outputs=new_mm_outputs, + num_scheduled_tokens=num_scheduled_tokens, + ) + + # Ensure the passthrough data wasn't dropped + assert "passthrough_data" in merged_mm_outputs + assert "passthrough_list" in merged_mm_outputs + + for mm_key, mm_output in merged_mm_outputs.items(): + # Ensure passthrough data is just forwarded normally and not duplicated + assert isinstance(mm_output, dict) + assert "req1" in mm_output and "req2" in mm_output + if mm_key == "passthrough_data": + assert mm_key not in cache.mm_cache_keys + assert new_mm_outputs[mm_key] == mm_output["req1"] + assert new_mm_outputs[mm_key] == mm_output["req2"] + elif mm_key == "passthrough_list": + assert mm_key not in cache.mm_cache_keys + assert new_mm_outputs[mm_key] == mm_output["req1"] + assert new_mm_outputs[mm_key] == mm_output["req2"] + else: + assert mm_key in cache.mm_cache_keys + curr_feat_dim = feature_dims[mm_key] + # Ensure that req1 (cache hit) merged the mm data + req1_merged_mm_outputs = mm_output["req1"] + req1_new_mm_outputs = new_mm_outputs[mm_key][:num_new_toks_req1] + + assert req1_merged_mm_outputs.shape == torch.Size( + [orig_num_tokens_unpadded + num_new_toks_req1, curr_feat_dim] + ) + # Ensure that the req1 merged mm data are the cached data + the new data + assert torch.all(req1_merged_mm_outputs[:orig_num_tokens_unpadded] == orig_mm_outputs[mm_key]) + assert torch.all(req1_merged_mm_outputs[-num_new_toks_req1:] == req1_new_mm_outputs) + + # Ensure that req2 (cache miss) only has the new mm data + req2_merged_mm_outputs = mm_output["req2"] + req2_new_mm_outputs = new_mm_outputs[mm_key][-num_new_toks_req2:] + + assert req2_merged_mm_outputs.shape == torch.Size([num_new_toks_req2, curr_feat_dim]) + assert torch.all(req2_merged_mm_outputs == req2_new_mm_outputs) diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py index f4aabb8b95..c05f8f5067 100644 --- a/tests/e2e/online_serving/test_qwen3_omni.py +++ b/tests/e2e/online_serving/test_qwen3_omni.py @@ -23,11 +23,13 @@ models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] +QWEN3_OMNI_CONFIG_PATH = str(Path(__file__).parent.parent / "stage_configs" / "qwen3_omni_ci.yaml") +QWEN3_OMNI_XPU_CONFIG_PATH = str(Path(__file__).parent.parent / "stage_configs" / "xpu" / "qwen3_omni_ci.yaml") -def get_chunk_config(): +def get_chunk_config(config_path: str): path = modify_stage_config( - str(Path(__file__).parent.parent / "stage_configs" / 
"qwen3_omni_ci.yaml"), + config_path, updates={ "async_chunk": True, "stage_args": { @@ -44,15 +46,41 @@ def get_chunk_config(): return path +def get_prefix_caching_config(config_path: str): + """Create a stage config with prefix caching enabled on the thinker (stage 0).""" + path = modify_stage_config( + config_path, + updates={ + "stage_args": { + 0: {"engine_args.enable_prefix_caching": True}, + }, + }, + ) + return path + + if current_omni_platform.is_xpu(): - stage_configs = [str(Path(__file__).parent.parent / "stage_configs" / "xpu" / "qwen3_omni_ci.yaml")] + stage_configs = [QWEN3_OMNI_XPU_CONFIG_PATH] + prefix_caching_stage_configs = [get_prefix_caching_config(QWEN3_OMNI_XPU_CONFIG_PATH)] else: # MI325 GPU should share the same config as H100 - stage_configs = [get_chunk_config()] + stage_configs = [get_chunk_config(QWEN3_OMNI_CONFIG_PATH)] + prefix_caching_stage_configs = [get_prefix_caching_config(QWEN3_OMNI_CONFIG_PATH)] # Create parameter combinations for model and stage config test_params = [ OmniServerParams(model=model, stage_config_path=stage_config) for model in models for stage_config in stage_configs ] +# For prefix caching, we need to enable prompt token details so that we +# can determine if any tokens were cached. +prefix_test_params = [ + OmniServerParams( + model=model, + stage_config_path=stage_config, + server_args=["--enable-prompt-tokens-details"], # Enable prompt tokens details to get cached_tokens + ) + for model in models + for stage_config in prefix_caching_stage_configs +] def get_system_prompt(): @@ -75,6 +103,7 @@ def get_prompt(prompt_type="text_only"): prompts = { "text_only": "What is the capital of China? Answer in 20 words.", "mix": "What is recited in the audio? What is in this image? Describe the video briefly.", + "text_image": "What color are the squares in this image?", } return prompts.get(prompt_type, prompts["text_only"]) @@ -147,3 +176,41 @@ def test_text_to_text_001(omni_server, openai_client) -> None: } openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) + + +@pytest.mark.advanced_model +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) +@pytest.mark.parametrize("omni_server", prefix_test_params, indirect=True) +def test_thinker_prefix_caching(omni_server, openai_client) -> None: + """ + Test thinker prefix caching by sending identical requests with an image (i.e., + a large shared prefix) and verifying that the second request uses cached tokens + & produces the same output. 
+ """ + image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + image_data_url=image_data_url, + content_text=get_prompt("text_image"), + ) + + request_config = { + "model": omni_server.model, + "messages": messages, + "stream": False, + "modalities": ["text"], + } + + response_1 = openai_client.send_omni_request(request_config, request_num=1)[0] + response_2 = openai_client.send_omni_request(request_config, request_num=1)[0] + + assert response_1.success + assert response_2.success + assert response_2.cached_tokens is not None + # We should cache the vast majority of the prompt (image + up to last full block), + # and set seed in the CI config, so the second request should give an identical + # response for the generated input image, even if we use dummy weights + assert response_2.cached_tokens > 0 + assert response_1.text_content == response_2.text_content diff --git a/vllm_omni/core/prefix_cache.py b/vllm_omni/core/prefix_cache.py new file mode 100644 index 0000000000..69e7346c4c --- /dev/null +++ b/vllm_omni/core/prefix_cache.py @@ -0,0 +1,264 @@ +""" +Utilities for Prefix Caching in Omni models. +""" + +import torch +from vllm.logger import init_logger +from vllm.v1.worker.gpu_input_batch import InputBatch + +from vllm_omni.utils.mm_outputs import build_mm_cpu, to_payload_element + +logger = init_logger(__name__) + + +class OmniTensorPrefixCache: + """Prefix cache for hidden states (model outputs) and model specific + multimodal outputs. + + This class implements prefix caching in a non-invasive way on top of + vLLM by leveraging the same slot mappings that the vLLM scheduler uses + for the KV Cache. + + Conceptually, this means we are mapping vLLM's cache mapping: + (num_blocks, block_size) + + to 3D tensors of shape: + (num_blocks, block_size, feature_size) + + Note that feature_size may vary across multimodal_outputs. + """ + + def __init__( + self, + num_blocks: int, + block_size: int, + hidden_size: int, + hs_dtype: torch.dtype, + ): + self.num_blocks = num_blocks + self.block_size = block_size + self.default_hidden_size = hidden_size + + # Initialize the hidden states cache immediately + self.hidden_states_cache = self._get_cache_tensor(dtype=hs_dtype) + + # Defer initialization of the mm_outputs_cache until we + # actually see mm output tensors dependent on num tokens. + self.mm_outputs_cache = {} + self.mm_cache_keys = set() + self._new_req_cache_hit_ids: set[str] = set() + + def maybe_init_missing_mm_cache_keys(self, multimodal_outputs: dict, seq_len: int): + """Given multimodal outputs from executing the model, dynamically + determine which multimodal outputs are tensors depending on sequence + length and should be cached, and initialize the cache tensors + accordingly. + + NOTE: This is done to avoid the need for explicit specification of + cache keys for every model/stage and aligns with the current way + that we slice the multimodal outputs based on the first dimension. + + This will usually be called by the first forward pass, i.e., + determined by the warmup. 
+ """ + for key, val in multimodal_outputs.items(): + if isinstance(val, torch.Tensor) and val.shape[0] == seq_len and key not in self.mm_cache_keys: + feat_dim = val.shape[-1] + self.mm_outputs_cache[key] = self._get_cache_tensor( + dtype=val.dtype, + hidden_size=feat_dim, + ) + self.mm_cache_keys.add(key) + new_tensor_shape = self.mm_outputs_cache[key].shape + logger.info("Initializing multimodal output cache of size %s for key: %s", list(new_tensor_shape), key) + + def _get_cache_tensor(self, dtype: torch.dtype, hidden_size: int | None = None) -> torch.Tensor: + """Allocate a CPU cache tensor for a specific key.""" + actual_hidden_size = hidden_size if hidden_size is not None else self.default_hidden_size + return torch.zeros( + (self.num_blocks, self.block_size, actual_hidden_size), + dtype=dtype, + device="cpu", + ) + + def add_prefix_cached_new_req_id(self, req_id: str): + """Adds a new request ID to the set of prefix cache hits on the batch.""" + self._new_req_cache_hit_ids.add(req_id) + + def reset_prefix_cached_new_req_ids(self): + """Clears the cache hit IDs to prepare for a new engine step.""" + self._new_req_cache_hit_ids.clear() + + @staticmethod + def _coerce_to_cpu_tensor(maybe_gpu_tensor: torch.Tensor) -> torch.Tensor: + """Convert GPU tensors -> contiguous CPU tensors if needed.""" + return maybe_gpu_tensor.detach().cpu().contiguous() + + def update_omni_tensor_prefix_cache( + self, + hidden_states: torch.Tensor | None, + multimodal_outputs: dict[str, torch.Tensor] | None, + num_tokens_unpadded: int, + slot_mapping: torch.Tensor, + num_tokens_padded: int | None = None, + ): + """Updates the hidden cache state for the provided hidden states and multimodal outputs. + + Args: + hidden_states: Hidden states tensor to cache (if any) + multimodal_outputs: Multimodal dict whose tensors may be cached + num_tokens_unpadded: Number of tokens without padding + slot_mapping: Slot mapping for the input sequence + num_tokens_padded: Total number of tokens including padding + """ + unpadded_slot_mapping = slot_mapping[:num_tokens_unpadded] + if num_tokens_padded is None: + num_tokens_padded = num_tokens_unpadded + + if hidden_states is not None: + # Slice to unpadded portion before caching + hidden_states = hidden_states[:num_tokens_unpadded] + # Ensure that hidden states are on the CPU + hidden_states = OmniTensorPrefixCache._coerce_to_cpu_tensor(hidden_states) + # View the cache as 2D so that we can treat our slots as row indices + flat_cache = self.hidden_states_cache.view(-1, self.hidden_states_cache.shape[-1]) + flat_cache[unpadded_slot_mapping] = hidden_states + logger.debug("Writing to hidden states for %s tokens", num_tokens_unpadded) + + # Do the same for the stage's cached multimodal outputs + if multimodal_outputs is not None: + # If we haven't initialized the keys already, do it now + # We check against the padded token count since we haven't sliced yet + self.maybe_init_missing_mm_cache_keys( + multimodal_outputs, + seq_len=num_tokens_padded, + ) + + for mm_out_key, mm_cache in self.mm_outputs_cache.items(): + if mm_out_key in multimodal_outputs: + # Slice to unpadded portion before caching + mm_state = multimodal_outputs[mm_out_key][:num_tokens_unpadded] + mm_state = OmniTensorPrefixCache._coerce_to_cpu_tensor(mm_state) + flat_cache = mm_cache.view(-1, mm_cache.shape[-1]) + flat_cache[unpadded_slot_mapping] = mm_state + logger.debug("Writing to mm output cache for %s tokens", num_tokens_unpadded) + + def _coerce_to_payload_dict( + self, + element: object, + query_start_loc: 
torch.Tensor, + input_batch: InputBatch, + num_scheduled_tokens: dict[str, int], + ) -> dict[str, object]: + """Build the multimodal passthrough data per request for + the object under consideration. This is identical to the case + for no prefix cache when we tensor does have a first dimension + matching the seq len. + """ + elem_dict = {} + for req_id in input_batch.req_ids: + req_idx = input_batch.req_id_to_index[req_id] + start = query_start_loc[req_idx] + end = start + num_scheduled_tokens[req_id] + elem_dict[req_id] = to_payload_element( + element, req_idx, start=start, end=end, pass_lists_through=True, seq_len=None + ) + return elem_dict + + def get_merged_multimodal_states( + self, + query_start_loc: torch.Tensor, + input_batch: InputBatch, + multimodal_outputs: dict, + num_scheduled_tokens: dict[str, int], + ): + """Get the merged multimodal states if hidden state prefix caching is enabled.""" + combined_multimodal_outputs = {} + # First get the prefix cached tensors that are present in the mm data + for mm_key in self.mm_cache_keys: + if mm_key in multimodal_outputs: + combined_multimodal_outputs[mm_key] = self._get_merged_tensors( + query_start_loc=query_start_loc, + input_batch=input_batch, + cache=self.mm_outputs_cache[mm_key], + hidden_states=multimodal_outputs[mm_key], + num_scheduled_tokens=num_scheduled_tokens, + ) + + # Then, get everything else (passthrough data); first, convert to CPU + # tensors similarly to the non prefix cached path, and then populate + # the subdicts mapping request IDs -> payload objects + passthrough_keys = set(multimodal_outputs.keys()) - self.mm_cache_keys + passthrough_mm_data = {k: v for k, v in multimodal_outputs.items() if k in passthrough_keys} + mm_cpu = build_mm_cpu(multimodal_outputs=passthrough_mm_data) + + for mm_key, mm_val in mm_cpu.items(): + combined_multimodal_outputs[mm_key] = self._coerce_to_payload_dict( + element=mm_val, + query_start_loc=query_start_loc, + input_batch=input_batch, + num_scheduled_tokens=num_scheduled_tokens, + ) + return combined_multimodal_outputs + + def get_merged_hidden_states(self, *args, **kwargs) -> dict[str, torch.Tensor]: + """Get the merged hidden states.""" + return self._get_merged_tensors( + *args, + **kwargs, + cache=self.hidden_states_cache, + ) + + def _get_merged_tensors( + self, + query_start_loc: torch.Tensor, + input_batch: InputBatch, + cache: torch.Tensor, + hidden_states: torch.Tensor, + num_scheduled_tokens: dict[str, int], + ) -> dict[str, torch.Tensor]: + """When hidden state caching is enabled, takes the input hidden_states, + which only correspond to the scheduled tokens, and returns a mapping + from request IDs to their full hidden states. This is accomplished by + looking up the block IDs & scheduled token counts to split the + hidden_states. + """ + # We do not support hybrid caches at the moment. + if len(input_batch.block_table.block_tables) > 1: + logger.warning_once( + "Omni prefix caching is enabled, but the batch block table appears to" + " have multiple kv groups; only the first group will be used!" 
+ ) + + combined_hidden_states = {} + hidden_states = OmniTensorPrefixCache._coerce_to_cpu_tensor(hidden_states) + for req_id in input_batch.req_ids: + req_idx = input_batch.req_id_to_index[req_id] + + if req_id in self._new_req_cache_hit_ids: + block_ids = self._get_cached_block_ids(req_idx, input_batch) + cached_hs = cache[block_ids].reshape(-1, cache.shape[-1]) + + # Slice the hidden states corresponding to this request; + # we do this by using the query start + start = query_start_loc[req_idx] + new_hs = hidden_states[start : start + num_scheduled_tokens[req_id]] + combined_hidden_states[req_id] = torch.cat([cached_hs, new_hs], dim=0) + else: + # cache miss for this request, pass through normally + start = query_start_loc[req_idx] + new_hs = hidden_states[start : start + num_scheduled_tokens[req_id]] + combined_hidden_states[req_id] = new_hs + + return combined_hidden_states + + def _get_cached_block_ids(self, req_idx: int, input_batch: InputBatch) -> torch.Tensor: + """Given an input batch and request index in the batch (not ID), get the + block IDs corresponding to the cache hit. + """ + num_computed = input_batch.num_computed_tokens_cpu[req_idx] + # NOTE: vLLM only caches full blocks + num_cached_blocks = num_computed // self.block_size + # Get the block IDs attached to this cache hit and reindex into + # the flattened cached hidden states (i.e., 1 row per token). + return input_batch.block_table[0].block_table.cpu[req_idx, :num_cached_blocks] diff --git a/vllm_omni/utils/mm_outputs.py b/vllm_omni/utils/mm_outputs.py new file mode 100644 index 0000000000..66d4e6ffe0 --- /dev/null +++ b/vllm_omni/utils/mm_outputs.py @@ -0,0 +1,93 @@ +"""Utilities for handling multimodal outputs / building multimodal output +payloads, most of which are shared by the prefix cache / no prefix cache path. +""" + +import torch +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def build_mm_cpu(multimodal_outputs: dict) -> dict[str, object]: + """Pre-copies multimodal tensor to CPU once (not per-request) to avoid + redundant D2H transfers when gpu_resident_buffer_keys keeps them on GPU. + + In the case of prefix caching, the multimodal outputs provided will + only contain the passthrough data. + + Args: + multimodal_outputs: Multimodal dict mapping strings to objects. + """ + # Pre-copy multimodal tensors to CPU once (not per-request) to avoid + # redundant D2H transfers when gpu_resident_buffer_keys keeps them on GPU. + mm_cpu: dict[str, object] = {} + # Currently there are some cases where this is true at the + # moment, which should be fixed. 
+ if not isinstance(multimodal_outputs, dict): + logger.warning("Multimodal outputs are not a dict and will not be passed") + + if multimodal_outputs: + for k, v in multimodal_outputs.items(): + if isinstance(v, torch.Tensor): + mm_cpu[k] = v.detach().to("cpu").contiguous() + elif isinstance(v, dict): + sub_dict: dict[str, torch.Tensor] = {} + for sk, sv in v.items(): + if isinstance(sv, torch.Tensor): + sub_dict[str(sk)] = sv.detach().to("cpu").contiguous() + if sub_dict: + mm_cpu[k] = sub_dict + elif isinstance(v, list) and len(v) > 0: + cpu_list = [] + for elem in v: + if isinstance(elem, torch.Tensor): + cpu_list.append(elem.detach().to("cpu").contiguous()) + else: + cpu_list.append(elem) + mm_cpu[k] = cpu_list + elif v is not None: + mm_cpu[k] = v + return mm_cpu + + +def to_payload_element( + element: object, idx: int, start: int, end: int, pass_lists_through: bool = False, seq_len: int | None = None +): + """Build an mm payload element corresponding to one request index + from an element containing 0 or more CPU tensors. + + Args: + element: The object to be added to the payload. + idx: The index of the request. + start: The start index corresponding to the request idx. + end: The end index corresponding to the request idx. + pass_lists_through: bool Whether or not lists should be treated as + passthrough data; this should be False in normal cases, but True + if we need to avoid splitting nonempty lists prior to calling + postprocess, which is the case for prefix cache. + seq_len: Optional sequence length (i.e., dim 0 of hidden states). + This should be set to None in the prefix caching case, because + the condition that would be executed here is the same as the + criteria for being added to the multimodal outputs cache. + """ + # Prefix cache won't hit this case because this is the condition + # for being a mm_cache_key in the multimodal outputs tensor. + if seq_len is not None and isinstance(element, torch.Tensor) and element.shape[0] == seq_len: + return element[start:end].contiguous() + # Every other case is shared between prefix cache (passthrough data) + # and running a model without prefix caching. + elif isinstance(element, dict): + return {sk: sv[start:end].contiguous() for sk, sv in element.items()} + elif isinstance(element, list): + # For lists, clone tensors to avoid cross-request aliasing + if pass_lists_through: + return [elem.clone() if isinstance(elem, torch.Tensor) else elem for elem in element] + element = element[idx] if idx < len(element) else element[0] + if isinstance(element, torch.Tensor): + element = element.clone() + return element + elif isinstance(element, torch.Tensor): + # List-derived tensor payloads are request-invariant; clone to + # avoid accidental cross-request aliasing on downstream mutation. 
+        return element.clone()
+    return element
diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py
index 62a0c85716..f37b2224ef 100644
--- a/vllm_omni/worker/gpu_ar_model_runner.py
+++ b/vllm_omni/worker/gpu_ar_model_runner.py
@@ -39,6 +39,7 @@
 from vllm_omni.distributed.omni_connectors.kv_transfer_manager import OmniKVTransferManager
 from vllm_omni.outputs import OmniModelRunnerOutput
+from vllm_omni.utils.mm_outputs import build_mm_cpu, to_payload_element
 from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner
 from vllm_omni.worker.omni_connector_model_runner_mixin import OmniConnectorModelRunnerMixin
@@ -201,6 +202,63 @@ def _capture_talker_mtp_graphs(self) -> None:
         finally:
             set_cudagraph_capturing_enabled(False)
+    def _maybe_update_prefix_cache(
+        self,
+        hidden_states: torch.Tensor,
+        multimodal_outputs: dict,
+        num_tokens_unpadded: int,
+        num_tokens_padded: int,
+    ):
+        """If prefix caching is enabled and this is the last pipeline parallelism rank,
+        write the hidden states & multimodal outputs into the prefix cache based
+        on our batch slot mappings.
+        """
+        # Cache hidden states if hidden state prefix caching is enabled
+        # and this is the last pipeline parallelism rank.
+        if self.omni_prefix_cache is not None and get_pp_group().is_last_rank:
+            # If this happens, it generally means the model is not following the correct
+            # interface yet and is therefore currently not compatible with prefix cache.
+            if multimodal_outputs is not None and not isinstance(multimodal_outputs, dict):
+                logger.warning_once(
+                    "prefix caching expects mm outputs to be a dict, but got %s",
+                    type(multimodal_outputs),
+                )
+
+            self.omni_prefix_cache.update_omni_tensor_prefix_cache(
+                hidden_states=hidden_states,
+                multimodal_outputs=multimodal_outputs,
+                num_tokens_unpadded=num_tokens_unpadded,
+                slot_mapping=self.input_batch.block_table[0].slot_mapping.cpu,
+                num_tokens_padded=num_tokens_padded,
+            )
+
+    def _maybe_get_combined_prefix_cache_tensors(
+        self,
+        hidden_states: torch.Tensor,
+        multimodal_outputs: dict,
+        num_scheduled_tokens: dict[str, int],
+    ) -> tuple[dict[str, torch.Tensor] | None, dict | None]:
+        """If prefix caching is enabled, extract the merged hidden states and multimodal outputs for
+        all requests in the batch (including those that are not prefix cache hits).
+        """
+        # Prior to applying the post-processing func, extract
+        # the prefix cached hidden states and multimodal states.
+        combined_hidden_states, combined_multimodal_outputs = None, None
+        if self.omni_prefix_cache is not None:
+            combined_hidden_states = self.omni_prefix_cache.get_merged_hidden_states(
+                query_start_loc=self.query_start_loc.cpu,
+                input_batch=self.input_batch,
+                hidden_states=hidden_states,
+                num_scheduled_tokens=num_scheduled_tokens,
+            )
+            combined_multimodal_outputs = self.omni_prefix_cache.get_merged_multimodal_states(
+                query_start_loc=self.query_start_loc.cpu,
+                input_batch=self.input_batch,
+                multimodal_outputs=multimodal_outputs,
+                num_scheduled_tokens=num_scheduled_tokens,
+            )
+        return combined_hidden_states, combined_multimodal_outputs
+
     @torch.inference_mode()
     def execute_model(
         self,
@@ -476,6 +534,15 @@ def execute_model(
         hidden_states, multimodal_outputs = self.extract_multimodal_outputs(model_output)
+        # Cache hidden states & multimodal outputs if hidden state prefix
+        # caching is enabled and this is the last pipeline parallelism rank.
+ self._maybe_update_prefix_cache( + hidden_states=hidden_states, + multimodal_outputs=multimodal_outputs, + num_tokens_unpadded=num_tokens_unpadded, + num_tokens_padded=num_tokens_padded, + ) + if not self.broadcast_pp_output: # Common case. if not get_pp_group().is_last_rank: @@ -589,6 +656,23 @@ def _sample( return super()._sample(logits, spec_decode_metadata) + @staticmethod + def _resolve_req_hidden_states( + hidden_states_cpu: torch.Tensor, + combined_hidden_states: dict[str, torch.Tensor] | None, + rid: str, + start: int, + end: int, + ): + if combined_hidden_states is not None: + # We always have all request IDs for prefix cache, even for + # partial cache misses, so this should never happen. + if rid not in combined_hidden_states: + raise RuntimeError("Request IDs in the batch are missing from the merged states!") + return combined_hidden_states[rid] + # Prefix caching is disabled + return hidden_states_cpu[start:end] + @torch.inference_mode() def sample_tokens( self, @@ -597,6 +681,13 @@ def sample_tokens( kv_extracted_req_ids = getattr(self, "kv_extracted_req_ids", None) self.kv_extracted_req_ids = None + # Used for prefix cache + combined_hidden_states = None + combined_multimodal_outputs = None + # Used when we don't use prefix cache; prefix cache builds the payloads + # internally since it already needs to do this for the cached tensors + mm_cpu = {} + if self.execute_model_state is None: kv_connector_output = self.kv_connector_output self.kv_connector_output = None @@ -628,6 +719,7 @@ def sample_tokens( slot_mappings, # OMNI: unpack slot_mappings for drafter ) = self.execute_model_state self.execute_model_state = None + seq_len = hidden_states.shape[0] # Apply structured output bitmasks if present. if grammar_output is not None: @@ -749,67 +841,73 @@ def propose_draft_token_ids(sampled_token_ids): dtype=np.int32, ) + # Prior to applying the post-processing func, extract + # the prefix cached hidden states and multimodal states. + if self.omni_prefix_cache is not None: + ( + combined_hidden_states, + combined_multimodal_outputs, + ) = self._maybe_get_combined_prefix_cache_tensors( + hidden_states, + multimodal_outputs, + scheduler_output.num_scheduled_tokens, + ) + # Otherwise we don't have the mm CPU data yet, so we still need to build it + if self.omni_prefix_cache is None: + mm_cpu = build_mm_cpu(multimodal_outputs) + self._process_additional_information_updates( - hidden_states, multimodal_outputs, num_scheduled_tokens_np, scheduler_output + hidden_states, + multimodal_outputs, + num_scheduled_tokens_np, + scheduler_output, + combined_hidden_states, + combined_multimodal_outputs, ) - # Pre-copy multimodal tensors to CPU once (not per-request) to avoid - # redundant D2H transfers when gpu_resident_buffer_keys keeps them on GPU. 
- mm_cpu: dict[str, object] = {} - if isinstance(multimodal_outputs, dict) and multimodal_outputs: - for k, v in multimodal_outputs.items(): - try: - if isinstance(v, torch.Tensor) and v.shape[0] == hidden_states_cpu.shape[0]: - mm_cpu[k] = v.detach().to("cpu").contiguous() - elif isinstance(v, dict): - sub_dict: dict[str, torch.Tensor] = {} - for sk, sv in v.items(): - if isinstance(sv, torch.Tensor) and sv.shape[0] == hidden_states_cpu.shape[0]: - sub_dict[str(sk)] = sv.detach().to("cpu").contiguous() - if sub_dict: - mm_cpu[k] = sub_dict - elif isinstance(v, list): - if len(v) == 0: - continue - cpu_list = [] - for elem in v: - if isinstance(elem, torch.Tensor): - cpu_list.append(elem.detach().to("cpu").contiguous()) - else: - cpu_list.append(elem) - mm_cpu[k] = cpu_list - except Exception as e: - logger.error(f"Error in merge multimodal outputs: {e}") - pooler_output: list[dict[str, object]] = [] for rid in req_ids_output_copy: idx = req_id_to_index_output_copy[rid] start = int(self.query_start_loc.cpu[idx]) sched = int(num_scheduled_tokens_np[idx]) end = start + sched - hidden_slice = hidden_states_cpu[start:end] - payload: dict[str, object] = {"hidden": hidden_slice} - if mm_cpu: - mm_payload: dict[str, object] = {} - for k, v in mm_cpu.items(): - if isinstance(v, torch.Tensor) and v.shape[0] == hidden_states_cpu.shape[0]: - mm_payload[k] = v[start:end].contiguous() - elif isinstance(v, dict): - mm_payload[k] = {sk: sv[start:end].contiguous() for sk, sv in v.items()} - elif isinstance(v, list): - element = v[idx] if idx < len(v) else v[0] - if element is not None: - if isinstance(element, torch.Tensor): - element = element.clone() - mm_payload[k] = element - # Skip None elements: msgspec cannot serialize None - # in dict[str, torch.Tensor] typed fields. - elif isinstance(v, torch.Tensor): - # List-derived tensor payloads are request-invariant; clone to - # avoid accidental cross-request aliasing on downstream mutation. - mm_payload[k] = v.clone() - else: - mm_payload[k] = v + # If prefix cache is enabled, we have already split everything + # by request and converted the states to CPU tensors + req_hidden_states = self._resolve_req_hidden_states( + hidden_states_cpu, + combined_hidden_states, + rid, + start, + end, + ) + payload: dict[str, object] = {"hidden": req_hidden_states} + + mm_payload: dict[str, object] = {} + if combined_multimodal_outputs or mm_cpu: + if combined_multimodal_outputs: + # Prefix cache enabled; all items have already been processed + # and split apart for each request as needed, and all tensors + # have already been detached to the CPU. The only exception is + # lists, which we keep as passthrough data for consistent behavior + # in postprocess. 
+ for mm_key in combined_multimodal_outputs.keys(): + value = combined_multimodal_outputs[mm_key][rid] + if isinstance(value, list): + mm_payload[mm_key] = value[idx] if idx < len(value) else value[0] + else: + mm_payload[mm_key] = value + + else: + # Prefix cache disabled; we still need to process the data + for mm_key, mm_val in mm_cpu.items(): + mm_payload[mm_key] = to_payload_element( + element=mm_val, + idx=idx, + start=start, + end=end, + pass_lists_through=False, + seq_len=seq_len, + ) payload.update(mm_payload) pooler_output.append(payload) with record_function_or_nullcontext("gpu_model_runner: ModelRunnerOutput"): diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 5ff62c11b4..de78011c75 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -20,6 +20,7 @@ from vllm.v1.worker.gpu_model_runner import GPUModelRunner, IntermediateTensors, PerLayerAttnMetadata from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices +from vllm_omni.core.prefix_cache import OmniTensorPrefixCache from vllm_omni.engine.serialization import deserialize_additional_information from vllm_omni.model_executor.layers.rotary_embedding.mrope import OmniMRotaryEmbedding as MRotaryEmbedding from vllm_omni.model_executor.models.output_templates import OmniOutput @@ -43,6 +44,9 @@ def __init__(self, *args, **kwargs): self.model_intermediate_buffer: dict[str, dict[str, Any]] = {} self._omni_num_scheduled_tokens_np: np.ndarray | None = None self._omni_last_model_output: object | None = None + # The Omni tensor prefix cache will be allocated + # when we initialize the metadata builders if enabled + self.omni_prefix_cache = None def initialize_metadata_builders(self, kv_cache_config, kernel_block_sizes): """Override to fix scheduler_metadata buffer size for FA3 + CUDA graph. @@ -70,6 +74,16 @@ def initialize_metadata_builders(self, kv_cache_config, kernel_block_sizes): device=sm.device, ) + # Initialize the wrapper for both multimodal output tensors + # and for hidden states to be passed between stages + if self.cache_config.enable_prefix_caching: + self.omni_prefix_cache = OmniTensorPrefixCache( + num_blocks=kv_cache_config.num_blocks, + block_size=self.cache_config.block_size, + hidden_size=self.model_config.get_hidden_size(), + hs_dtype=self.dtype, + ) + @instrument(span_name="Loading (GPU)") def load_model(self, *args, **kwargs) -> None: super().load_model(*args, **kwargs) @@ -234,6 +248,10 @@ def _update_states(self, scheduler_output: "SchedulerOutput"): The SamplingMetadata is updated and copied to the GPU if there is a new/resumed/paused/finished request in the batch. """ + # Used for prefix cache + if self.omni_prefix_cache is not None: + self.omni_prefix_cache.reset_prefix_cached_new_req_ids() + # Remove finished requests from the cached states. for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) @@ -294,6 +312,13 @@ def _update_states(self, scheduler_output: "SchedulerOutput"): reqs_to_add.append(req_state) continue + # Since this is the first time the request has been scheduled, + # num_computed_tokens > 0 means that we have a hit in prefix + # caching; mark it so that we can manage the hidden states + # later on as needed. 
+ if self.omni_prefix_cache is not None and new_req_data.num_computed_tokens > 0: + self.omni_prefix_cache.add_prefix_cached_new_req_id(req_id) + sampling_params = new_req_data.sampling_params pooling_params = new_req_data.pooling_params @@ -1010,6 +1035,8 @@ def _process_additional_information_updates( multimodal_outputs: object, num_scheduled_tokens_np: np.ndarray, scheduler_output: "SchedulerOutput", + combined_hidden_states: dict[str, torch.Tensor] | None = None, + combined_multimodal_outputs: dict[str, object] | None = None, ) -> None: """Process model-provided per-request updates and merge into model_intermediate_buffer.""" try: @@ -1018,21 +1045,31 @@ def _process_additional_information_updates( if hasattr(self.model, "has_postprocess") and self.model.has_postprocess: for req_index, req_id in enumerate(self.input_batch.req_ids): req_infos = self.model_intermediate_buffer.get(req_id, {}) - start_offset = int(self.query_start_loc.cpu[req_index]) - sched_tokens = int(num_scheduled_tokens_np[req_index]) - s, e = start_offset, start_offset + sched_tokens - # only consider to store data into update dict. - hidden_states_slice = hidden_states[s:e] + if combined_hidden_states: + # Combined hidden states contains all hidden states for every request + hidden_states_slice = combined_hidden_states[req_id] + else: + start_offset = int(self.query_start_loc.cpu[req_index]) + sched_tokens = int(num_scheduled_tokens_np[req_index]) + s, e = start_offset, start_offset + sched_tokens + # only consider to store data into update dict. + hidden_states_slice = hidden_states[s:e] + + if combined_multimodal_outputs: + # NOTE this is a bit ugly, but the mm data is structured as a list of + # keys mapping to request IDs, and if enabled, we will always have all + # request IDs in every subdict, including for cache misses. 
+ mm_out = {k: v[req_id] for k, v in combined_multimodal_outputs.items()} + else: + mm_out = multimodal_outputs update_dict = self.model.postprocess( - hidden_states_slice, multimodal_outputs=multimodal_outputs, **req_infos + hidden_states_slice, + multimodal_outputs=mm_out, + **req_infos, ) self._update_intermediate_buffer(req_id, update_dict) except Exception as e: - logger.error( - f"Error merging for requests:{self.input_batch.req_ids} " - f"additional information update: {e}, with the multimodal_outputs " - f"as {multimodal_outputs}" - ) + logger.error(f"Error merging for requests:{self.input_batch.req_ids} additional information update: {e}") import traceback traceback.print_exc() From e9581137e9d887c0876885d1c4a74ea7d63ba2eb Mon Sep 17 00:00:00 2001 From: Didan Deng <33117903+wtomin@users.noreply.github.com> Date: Thu, 16 Apr 2026 01:45:16 +0800 Subject: [PATCH 187/204] [Perf] Add Performance Test for Qwen-Image Step-Level Execution (#2707) Signed-off-by: Didan Deng <33117903+wtomin@users.noreply.github.com> --- .../perf/tests/test_qwen_image_vllm_omni.json | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index 1f3a2bbf77..5ec7f1cc2b 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -44,6 +44,52 @@ } ] }, + { + "test_name": "test_qwen_image_single_device_step_execution", + "description": "Single-device baseline (no parallelism) with step execution", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image", + "serve_args": { + "enable-diffusion-pipeline-profiler": true, + "step-execution": true + } + }, + "benchmark_params": [ + { + "name": "512x512_steps20", + "dataset": "random", + "task": "t2i", + "width": 512, + "height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.30, + "latency_mean": 3.50, + "peak_memory_mb_mean": 67000 + } + }, + { + "name": "1536x1536_steps35", + "dataset": "random", + "task": "t2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.037, + "latency_mean": 27.0, + "peak_memory_mb_mean": 74000 + } + } + ] + }, { "test_name": "test_qwen_image_ulysses2_cfg2_vae_patch4", "description": "Ulysses SP=2 + CFG-parallel=2 + VAE Patch Parallel=4", From 880a758b1f4b8be49618affbe4a735352f070993 Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Thu, 16 Apr 2026 10:15:39 +0800 Subject: [PATCH 188/204] [CI] Skip test_thinker_prefix_caching in tests/e2e/online_serving/test_qwen3_omni.py (#2836) Signed-off-by: wangyu <410167048@qq.com> --- tests/e2e/online_serving/test_qwen3_omni.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py index c05f8f5067..13af2ad110 100644 --- a/tests/e2e/online_serving/test_qwen3_omni.py +++ b/tests/e2e/online_serving/test_qwen3_omni.py @@ -183,6 +183,7 @@ def test_text_to_text_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", prefix_test_params, indirect=True) +@pytest.mark.skip(reason="issue: #2833") def test_thinker_prefix_caching(omni_server, 
openai_client) -> None: """ Test thinker prefix caching by sending identical requests with an image (i.e., From c83f664fe17a372e0cfcf31b81b423ffee940e6b Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Thu, 16 Apr 2026 11:13:41 +0800 Subject: [PATCH 189/204] [CI][Perf] Add nightly PR labels, consolidate pipeline, and switch benchmark flag to --test-config-file (#2816) Signed-off-by: wangyu <410167048@qq.com> Co-authored-by: Y. Fisher Co-authored-by: inaniloquentee --- .buildkite/pipeline.yml | 12 +- .buildkite/test-nightly-diffusion.yml | 417 ----------------- .buildkite/test-nightly.yml | 432 ++++++++++++++++-- docs/contributing/ci/CI_5levels.md | 7 +- .../test_examples/l4_performance_tests.inc.md | 2 +- docs/contributing/ci/test_guide.md | 5 +- tests/dfx/conftest.py | 12 + tests/dfx/perf/scripts/run_benchmark.py | 49 +- .../perf/scripts/run_diffusion_benchmark.py | 25 +- .../tests/{test.json => test_qwen_omni.json} | 32 -- tests/dfx/perf/tests/test_tts.json | 34 ++ 11 files changed, 493 insertions(+), 534 deletions(-) delete mode 100644 .buildkite/test-nightly-diffusion.yml rename tests/dfx/perf/tests/{test.json => test_qwen_omni.json} (92%) create mode 100644 tests/dfx/perf/tests/test_tts.json diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index d9a2315953..00823951dc 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -44,11 +44,19 @@ steps: agents: queue: "cpu_queue_premerge" - # L4 Test — main+NIGHTLY=1 (scheduled), or PR with label nightly-test (e.g. add label then Rebuild) + # L4 Test — main+NIGHTLY=1 (scheduled), or PR with specific label (e.g. add label then Rebuild) - label: "Upload Nightly Pipeline" depends_on: image-build key: upload-nightly-pipeline - if: '(build.branch == "main" && build.env("NIGHTLY") == "1") || (build.branch != "main" && build.pull_request.labels includes "nightly-test")' + if: >- + (build.branch == "main" && build.env("NIGHTLY") == "1") || + (build.branch != "main" && ( + build.pull_request.labels includes "nightly-test" || + build.pull_request.labels includes "omni-test" || + build.pull_request.labels includes "tts-test" || + build.pull_request.labels includes "diffusion-x2iat-test" || + build.pull_request.labels includes "diffusion-x2v-test" + )) commands: - buildkite-agent pipeline upload .buildkite/test-nightly.yml agents: diff --git a/.buildkite/test-nightly-diffusion.yml b/.buildkite/test-nightly-diffusion.yml deleted file mode 100644 index b5ba8a117c..0000000000 --- a/.buildkite/test-nightly-diffusion.yml +++ /dev/null @@ -1,417 +0,0 @@ -# Nightly diffusion GPU tests — appended to the main nightly build via -# buildkite-agent pipeline upload .buildkite/test-nightly-diffusion.yml -# from test-nightly.yml (step key: nightly-diffusion-model-test). Top-level groups are -# foldable in the Buildkite UI (Other / Wan / Qwen-Image). 
-env: - VLLM_WORKER_MULTIPROC_METHOD: spawn - HF_HUB_DOWNLOAD_TIMEOUT: 300 - HF_HUB_ETAG_TIMEOUT: 60 - -steps: - - group: ":card_index_dividers: Other Model Test" - key: nightly-other-model-test-group - steps: - - label: ":full_moon: Diffusion · Other · Function Test with H100" - timeout_in_minutes: 120 - # Shared nightly vs PR label conditional; referenced below as *nightly_or_pr_label - if: &nightly_or_pr_label 'build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"' - commands: - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not test_qwen_image" -m "advanced_model and diffusion and H100" --run-level "advanced_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion · Other · Function Test with L4" - timeout_in_minutes: 60 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" - agents: - queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - shm-size: "8gb" - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" - - - label: ":full_moon: Diffusion · Other · Doc Test" - timeout_in_minutes: 60 - if: *nightly_or_pr_label - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - - pytest -s -v tests/examples/online_serving/test_text_to_image.py tests/examples/offline_inference/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - group: ":card_index_dividers: Wan Series Model Test" - key: nightly-wan-model-test-group - steps: - - label: ":full_moon: Diffusion · Wan · Function Test" - timeout_in_minutes: 90 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py -m "advanced_model" --run-level "advanced_model" - agents: - 
queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion · Wan · Accuracy Test" - key: nightly-wan22-i2v-accuracy - timeout_in_minutes: 180 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level advanced_model - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - group: ":card_index_dividers: Qwen-Image Series Model Test" - key: nightly-qwen-image-edit-group - steps: - - label: ":full_moon: Diffusion · Qwen-Image · Function Test with H100" - timeout_in_minutes: 120 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/online_serving/test_qwen_image*_expansion.py -m "advanced_model and diffusion and H100" --run-level "advanced_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion · Qwen-Image · GEBench Accuracy Test" - key: nightly-gebench-accuracy - timeout_in_minutes: 60 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1 - - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - 
resources: - limits: - nvidia.com/gpu: 1 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion · Qwen-Image · GEdit-Bench Accuracy Test" - key: nightly-gedit-bench-accuracy - timeout_in_minutes: 60 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1 - - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv" - - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 1 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_HTTP_TIMEOUT_KEEP_ALIVE - value: "120" - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion · Qwen-Image · Perf Test" - key: nightly-qwen-image-performance - timeout_in_minutes: 180 - if: *nightly_or_pr_label - commands: - - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results - - export DIFFUSION_ATTENTION_BACKEND=FLASH_ATTN - - export CACHE_DIT_VERSION=1.3.0 - # [HACK]: run upload in the same command block as pytest. - # Because `exit` aborts the entire commands list. - - | - set +e - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json - EXIT1=$$? - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json - EXIT2=$$? - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json - EXIT3=$$? 
- if [ $$EXIT1 -eq 0 ] || [ $$EXIT2 -eq 0 ] || [ $$EXIT3 -eq 0 ]; then - buildkite-agent artifact upload "tests/dfx/perf/results/diffusion_result_*.json" - buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" - fi - exit $$((EXIT1 | EXIT2 | EXIT3)) - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion · Qwen-Image · Accuracy Test" - key: nightly-qwen-image-accuracy - timeout_in_minutes: 180 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/accuracy/test_qwen_image*.py --run-level advanced_model - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 1 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 31b3e17976..58e1e55af7 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -7,12 +7,11 @@ steps: # Group: collapses under one heading in the Buildkite UI; child steps still run in parallel. 
- group: ":card_index_dividers: Omni Model Test" key: nightly-omni-test-group + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" || build.pull_request.labels includes "omni-test" steps: - - label: ":full_moon: Omni · Function Test with H100" + - label: ":full_moon: Omni · Function Test" timeout_in_minutes: 90 - depends_on: upload-nightly-pipeline - # Shared nightly vs PR label conditional; referenced below as *nightly_or_pr_label - if: &nightly_or_pr_label 'build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"' commands: - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and H100 and omni" --run-level "advanced_model" agents: @@ -49,13 +48,11 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - - label: ":full_moon: Omni · Function Test with L4" + - label: ":full_moon: Omni · Doc Test with L4" timeout_in_minutes: 90 - depends_on: upload-nightly-pipeline - if: *nightly_or_pr_label commands: - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model" + - pytest -s -v tests/examples/ -m "advanced_model and omni and L4" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: @@ -70,13 +67,203 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: ":full_moon: Omni · Doc Test with L4" + - label: ":full_moon: Omni · Doc Test with H100" + timeout_in_minutes: 90 + commands: + - pytest -s -v tests/examples/ -m "advanced_model and omni and H100" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Omni · Perf Test" + key: nightly-omni-performance + timeout_in_minutes: 180 + commands: + - export BENCHMARK_DIR=tests/dfx/perf/results + - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_omni.json + - buildkite-agent artifact upload "tests/dfx/perf/results/*.json" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + + - group: ":card_index_dividers: TTS Model Test" + key: 
nightly-tts-test-group + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" || build.pull_request.labels includes "tts-test" + steps: + - label: ":full_moon: TTS · Function Test" timeout_in_minutes: 90 - depends_on: upload-nightly-pipeline - if: *nightly_or_pr_label commands: - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - pytest -s -v tests/examples/ -m "advanced_model and omni and L4" --run-level "advanced_model" + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model" + agents: + queue: "gpu_1_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + + - label: ":full_moon: TTS · Perf Test" + key: nightly-tts-performance + timeout_in_minutes: 180 + commands: + - export BENCHMARK_DIR=tests/dfx/perf/results + - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" + - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py --test-config-file tests/dfx/perf/tests/test_tts.json + - buildkite-agent artifact upload "tests/dfx/perf/results/*.json" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + # Diffusion X2I suite: x2i / x2a / x2t and related non-video paths; x2v is only in "Diffusion X2V Model Test" below. 
+ - group: ":card_index_dividers: Diffusion X2I(&A&T) Model Test" + key: nightly-diffusion-x2iat-group + depends_on: upload-nightly-pipeline + if: >- + build.env("NIGHTLY") == "1" || + build.pull_request.labels includes "nightly-test" || + build.pull_request.labels includes "diffusion-x2iat-test" + steps: + - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100" + timeout_in_minutes: 120 + commands: + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "advanced_model and diffusion and H100" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4" + timeout_in_minutes: 60 + commands: + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "advanced_model and diffusion and L4" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: @@ -91,12 +278,11 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: ":full_moon: Omni · Doc Test with H100" - timeout_in_minutes: 90 - depends_on: upload-nightly-pipeline - if: *nightly_or_pr_label + - label: ":full_moon: Diffusion X2I(&A&T) · Doc Test" + timeout_in_minutes: 60 commands: - - pytest -s -v tests/examples/ -m "advanced_model and omni and H100" --run-level "advanced_model" + - export VLLM_TEST_CLEAN_GPU_MEMORY="1" + - pytest -s -v tests/examples/*/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model" agents: queue: "mithril-h100-pool" plugins: @@ -131,16 +317,109 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - - label: ":full_moon: Omni · Perf Test" - key: nightly-omni-performance + - label: ":full_moon: Diffusion X2I(&A&T) · GEBench Accuracy Test" + timeout_in_minutes: 60 + commands: + - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1 + - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: 
gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Diffusion X2I(&A&T) · GEdit-Bench Accuracy Test" + timeout_in_minutes: 60 + commands: + - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1 + - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv" + - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_HTTP_TIMEOUT_KEEP_ALIVE + value: "120" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Diffusion X2I(&A&T) · Perf Test" + key: nightly-diffusion-x2iat-performance timeout_in_minutes: 180 - depends_on: upload-nightly-pipeline - if: *nightly_or_pr_label commands: - - export BENCHMARK_DIR=tests/dfx/perf/results - - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py - - buildkite-agent artifact upload "tests/dfx/perf/results/*.json" + - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results + - export DIFFUSION_ATTENTION_BACKEND=FLASH_ATTN + - export CACHE_DIT_VERSION=1.3.0 + # [HACK]: run upload in the same command block as pytest. + # Because `exit` aborts the entire commands list. + - | + set +e + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json + EXIT1=$$? + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json + EXIT2=$$? + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json + EXIT3=$$? + if [ $$EXIT1 -eq 0 ] || [ $$EXIT2 -eq 0 ] || [ $$EXIT3 -eq 0 ]; then + buildkite-agent artifact upload "tests/dfx/perf/results/diffusion_result_*.json" + buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" + fi + exit $$((EXIT1 | EXIT2 | EXIT3)) agents: queue: "mithril-h100-pool" plugins: @@ -150,7 +429,7 @@ steps: - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT resources: limits: - nvidia.com/gpu: 2 + nvidia.com/gpu: 4 volumeMounts: - name: devshm mountPath: /dev/shm @@ -175,23 +454,96 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - # Dynamically appends steps from test-nightly-diffusion.yml into this build (same mechanism as - # pipeline.yml → test-ready.yml / test-merge.yml / test-nightly.yml). 
Foldable groups stay in the - # uploaded YAML (Other / Wan / Qwen-Image). - - label: ":card_index_dividers: Diffusion Model Test" - key: nightly-diffusion-model-test + # Diffusion x2v only (Wan, HunyuanVideo, …). x2i/x2a/x2t live in the X2I group above, not here. + - group: ":card_index_dividers: Diffusion X2V Model Test" + key: nightly-diffusion-x2v-group depends_on: upload-nightly-pipeline - if: *nightly_or_pr_label - commands: - - buildkite-agent pipeline upload .buildkite/test-nightly-diffusion.yml - agents: - queue: "cpu_queue_premerge" + if: >- + build.env("NIGHTLY") == "1" || + build.pull_request.labels includes "nightly-test" || + build.pull_request.labels includes "diffusion-x2v-test" + steps: + - label: ":full_moon: Diffusion X2V · Function Test" + timeout_in_minutes: 90 + commands: + - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py -m "advanced_model" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Diffusion X2V · Accuracy Test" + timeout_in_minutes: 180 + commands: + - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level advanced_model + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - label: ":bar_chart: Testcase Statistics" key: nightly-testcase-statistics timeout_in_minutes: 120 depends_on: upload-nightly-pipeline - if: *nightly_or_pr_label + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - python tools/nightly/buildkite_testcase_statistics.py -o tests/dfx/perf/results/buildkite_testcase_statistics.html - buildkite-agent artifact upload "tests/dfx/perf/results/*.html" @@ -234,15 +586,17 @@ steps: key: nightly-perf-distribution depends_on: - nightly-omni-performance - - nightly-qwen-image-performance + - nightly-tts-performance + - nightly-diffusion-x2iat-performance - nightly-testcase-statistics if: build.env("NIGHTLY") == "1" commands: - pip install openpyxl - export DEFAULT_INPUT_DIR=tests/dfx/perf/results - export DEFAULT_OUTPUT_DIR=tests/dfx/perf/results + - buildkite-agent artifact download 
"tests/dfx/perf/results/*.json" . --step nightly-tts-performance - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-omni-performance - - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-qwen-image-performance + - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-diffusion-x2iat-performance - buildkite-agent artifact download "tests/dfx/perf/results/*.html" . --step nightly-testcase-statistics - python tools/nightly/generate_nightly_perf_excel.py - python tools/nightly/generate_nightly_perf_html.py diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index 9306035738..b0428ddd7d 100644 --- a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -86,7 +86,8 @@ Through five levels (L1-L5) and common (Common) specifications, the system clari /tests/e2e/online_serving/test_{model_name}_expansion.py
/tests/e2e/offline_inference/test_{model_name}_expansion.py
Performance:
- /tests/dfx/perf/tests/test.json
+ /tests/dfx/perf/tests/test_qwen_omni.json (Omni), test_tts.json (TTS),
+ and /tests/dfx/perf/tests/test_{diffusion_model}_vllm_omni.json (Diffusion)
Doc Test:
tests/example/online_serving/test_{model_name}.py
tests/example/offline_inference/test_{model_name}.py @@ -530,13 +531,13 @@ L4 level testing is a comprehensive quality audit before a version release. It e ### 3.2 Testing Content and Scope - ***Full Functionality Testing***: Executes all test cases defined in `test_{model_name}_expansion.py`, covering all implemented features, positive flows, boundary conditions, and exception handling. -- ***Performance Testing***: Uses the `tests/dfx/perf/tests/test.json` configuration file to drive performance testing tools for stress, load, and endurance tests, collecting metrics like throughput, response time, and resource utilization. +- ***Performance Testing***: Uses `tests/dfx/perf/tests/test_qwen_omni.json`, `tests/dfx/perf/tests/test_tts.json`, and diffusion configs in the form `tests/dfx/perf/tests/test_*_vllm_omni.json` (passed to `run_benchmark.py` via `--test-config-file`) to drive performance testing tools for stress, load, and endurance tests, collecting metrics like throughput, response time, and resource utilization. - ***Documentation Testing***: Verifies whether the example code provided to users is runnable and its results match the description. ### 3.3 Test Directory and Execution Files - ***Functional Testing***: Same directories as L3. -- ***Performance Test Configuration***: `tests/dfx/perf/tests/test.json` +- ***Performance Test Configuration***: `tests/dfx/perf/tests/test_qwen_omni.json`, `tests/dfx/perf/tests/test_tts.json`, and diffusion configs `tests/dfx/perf/tests/test_*_vllm_omni.json` (e.g. `test_qwen_image_vllm_omni.json`) - ***Documentation Example Tests***: - - `tests/example/online_serving/test_{model_name}.py` - `tests/example/offline_inference/test_{model_name}.py` diff --git a/docs/contributing/ci/test_examples/l4_performance_tests.inc.md b/docs/contributing/ci/test_examples/l4_performance_tests.inc.md index 8093e1459f..f1f3073dc5 100644 --- a/docs/contributing/ci/test_examples/l4_performance_tests.inc.md +++ b/docs/contributing/ci/test_examples/l4_performance_tests.inc.md @@ -1,4 +1,4 @@ -When you want to add L4-level ***performance test*** cases, you can refer to the following format for case addition in tests/dfx/perf/tests/test.json: +When you want to add L4-level ***performance test*** cases, you can refer to the following format for case addition in `tests/dfx/perf/tests/test_qwen_omni.json`, `tests/dfx/perf/tests/test_tts.json`, or diffusion configs such as `tests/dfx/perf/tests/test_*_vllm_omni.json` (selected via `pytest ... run_benchmark.py --test-config-file `): ```JSON { diff --git a/docs/contributing/ci/test_guide.md b/docs/contributing/ci/test_guide.md index 425f24332c..08b2e3b4ea 100644 --- a/docs/contributing/ci/test_guide.md +++ b/docs/contributing/ci/test_guide.md @@ -45,7 +45,6 @@ Our test scripts use the pytest framework. First, please use `git clone https:// === "L3 level & L4 level" ```bash - cd tests pytest -s -v -m "advanced_model" --run-level=advanced_model ``` If you only want to run L3 test case, you can use: @@ -60,9 +59,9 @@ Our test scripts use the pytest framework. 
First, please use `git clone https:// ```bash pytest -s -v -m "core_model and distributed_cuda and L4" --run-level=core_model ``` - Note: To run performance tests, use: + Note: To run performance tests (defaults to ``test_qwen_omni.json``; use ``--test-config-file tests/dfx/perf/tests/test_tts.json`` for TTS): ```bash - pytest -s -v perf/scripts/run_benchmark.py + pytest -s -v tests/dfx/perf/scripts/run_benchmark.py ``` The latest L3 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-merge.yml). diff --git a/tests/dfx/conftest.py b/tests/dfx/conftest.py index e54141b344..997f25e6e5 100644 --- a/tests/dfx/conftest.py +++ b/tests/dfx/conftest.py @@ -2,6 +2,8 @@ from pathlib import Path from typing import Any +import pytest + from tests.conftest import modify_stage_config @@ -95,3 +97,13 @@ def create_benchmark_indices( indices.append((test_name, idx)) return indices + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Register shared CLI options for DFX benchmark suites.""" + parser.addoption( + "--test-config-file", + action="store", + default=None, + help=("Path to benchmark config JSON. Example: --test-config-file tests/dfx/perf/tests/test_tts.json"), + ) diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index 67dedcd048..d5ef1b49e7 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -21,10 +21,30 @@ os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" -CONFIG_FILE_PATH = str(Path(__file__).parent.parent / "tests" / "test.json") -BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) -STAGE_INIT_TIMEOUT = 600 +def _get_config_file_from_argv() -> str | None: + """Read ``--test-config-file`` from ``sys.argv`` at import time so parametrization can use it.""" + import sys + + for i, arg in enumerate(sys.argv): + if arg == "--test-config-file" and i + 1 < len(sys.argv): + return sys.argv[i + 1] + if arg.startswith("--test-config-file="): + return arg.split("=", 1)[1] + return None + + +_PERF_TESTS_DIR = Path(__file__).resolve().parent.parent / "tests" +_DEFAULT_CONFIG_FILE = str(_PERF_TESTS_DIR / "test_qwen_omni.json") + +CONFIG_FILE_PATH = _get_config_file_from_argv() +if CONFIG_FILE_PATH is None: + print( + "No --test-config-file in argv, using default: tests/dfx/perf/tests/test_qwen_omni.json " + "(override with e.g. 
--test-config-file tests/dfx/perf/tests/test_tts.json)" + ) + CONFIG_FILE_PATH = _DEFAULT_CONFIG_FILE +BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) STAGE_CONFIGS_DIR = Path(__file__).parent.parent / "stage_configs" test_params = create_unique_server_params(BENCHMARK_CONFIGS, STAGE_CONFIGS_DIR) @@ -44,7 +64,7 @@ def omni_server(request): print(f"Starting OmniServer with test: {test_name}, model: {model}") - server_args = ["--stage-init-timeout", str(STAGE_INIT_TIMEOUT), "--init-timeout", "900"] + server_args = ["--stage-init-timeout", "300", "--init-timeout", "900"] if stage_config_path: server_args = ["--stage-configs-path", stage_config_path] + server_args with OmniServer(model, server_args) as server: @@ -97,8 +117,6 @@ def run_benchmark( ["vllm", "bench", "serve", "--omni"] + args + [ - "--num-warmups", - "2", "--save-result", "--result-dir", os.environ.get("BENCHMARK_DIR", "tests"), @@ -141,7 +159,6 @@ def run_benchmark( result["random_output_len"] = random_output_len with open(result_path, "w", encoding="utf-8") as f: json.dump(result, f, ensure_ascii=False, indent=2) - return result @@ -207,10 +224,6 @@ def _resolve_baseline_value( f"or request_rate={request_rate!r}; keys={list(baseline_raw.keys())!r}" ) if isinstance(baseline_raw, (list, tuple)): - if sweep_index is None: - raise ValueError("list baseline requires sweep_index") - if not (0 <= sweep_index < len(baseline_raw)): - raise IndexError(f"baseline list len={len(baseline_raw)} has no index {sweep_index}") return baseline_raw[sweep_index] return baseline_raw @@ -245,14 +258,14 @@ def assert_result( ) -> None: assert result["completed"] == num_prompt, "Request failures exist" baseline_data = params.get("baseline", {}) - thresholds = _baseline_thresholds_for_step( - baseline_data, - sweep_index=sweep_index, - max_concurrency=max_concurrency, - request_rate=request_rate, - ) - for metric_name, baseline_value in thresholds.items(): + for metric_name, baseline_raw in baseline_data.items(): current_value = result[metric_name] + baseline_value = _resolve_baseline_value( + baseline_raw, + sweep_index=sweep_index, + max_concurrency=max_concurrency, + request_rate=request_rate, + ) if "throughput" in metric_name: if current_value <= baseline_value: print( diff --git a/tests/dfx/perf/scripts/run_diffusion_benchmark.py b/tests/dfx/perf/scripts/run_diffusion_benchmark.py index 123f21405e..8eeeec8df2 100644 --- a/tests/dfx/perf/scripts/run_diffusion_benchmark.py +++ b/tests/dfx/perf/scripts/run_diffusion_benchmark.py @@ -5,8 +5,8 @@ - vllm-omni (default): starts DiffusionServer via vllm_omni.entrypoints.cli.main, benchmarks with diffusion_benchmark_serving.py --backend vllm-omni -A config JSON file is REQUIRED via --config-file: - pytest run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +A config JSON file is REQUIRED via --test-config-file: + pytest run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json JSON config entries use a "server_type" field, and this runner executes the vllm-omni path. @@ -55,16 +55,16 @@ def _get_config_file_from_argv() -> str | None: - """Read --config-file from sys.argv at import time so pytest parametrize can use it. + """Read --test-config-file from sys.argv at import time so pytest parametrize can use it. pytest_addoption (below) registers the same flag so pytest does not reject it. - Supports both ``--config-file path`` and ``--config-file=path`` forms. 
+ Supports both ``--test-config-file path`` and ``--test-config-file=path`` forms. Returns None if the flag is not present; callers must handle the missing case. """ for i, arg in enumerate(sys.argv): - if arg == "--config-file" and i + 1 < len(sys.argv): + if arg == "--test-config-file" and i + 1 < len(sys.argv): return sys.argv[i + 1] - if arg.startswith("--config-file="): + if arg.startswith("--test-config-file="): return arg.split("=", 1)[1] return None @@ -133,19 +133,6 @@ def _append_to_aggregated_file(record: dict[str, Any]) -> None: json.dump(records, f, indent=2, ensure_ascii=False) -# Register --config-file with pytest so it does not reject the argument. -def pytest_addoption(parser: pytest.Parser) -> None: - parser.addoption( - "--config-file", - action="store", - default=None, - help=( - "Path to the benchmark config JSON file (required). " - "Example: --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json" - ), - ) - - _server_lock = threading.Lock() # --------------------------------------------------------------------------- diff --git a/tests/dfx/perf/tests/test.json b/tests/dfx/perf/tests/test_qwen_omni.json similarity index 92% rename from tests/dfx/perf/tests/test.json rename to tests/dfx/perf/tests/test_qwen_omni.json index 159e27a064..4662f8c0c7 100644 --- a/tests/dfx/perf/tests/test.json +++ b/tests/dfx/perf/tests/test_qwen_omni.json @@ -329,37 +329,5 @@ } } ] - }, - { - "test_name": "test_qwen3_tts", - "server_params": { - "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" - }, - "benchmark_params": [ - { - "dataset_name": "random", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "num_prompts": [ - 10, - 40 - ], - "max_concurrency": [ - 1, - 4 - ], - "random_input_len": 100, - "random_output_len": 100, - "extra_body": { - "voice": "Vivian", - "language": "English" - }, - "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_audio_ttfp_ms": [6000, 6000], - "mean_audio_rtf": [0.3, 0.3] - } - } - ] } ] diff --git a/tests/dfx/perf/tests/test_tts.json b/tests/dfx/perf/tests/test_tts.json new file mode 100644 index 0000000000..3583b45b4f --- /dev/null +++ b/tests/dfx/perf/tests/test_tts.json @@ -0,0 +1,34 @@ +[ + { + "test_name": "test_qwen3_tts", + "server_params": { + "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" + }, + "benchmark_params": [ + { + "dataset_name": "random", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "num_prompts": [ + 10, + 40 + ], + "max_concurrency": [ + 1, + 4 + ], + "random_input_len": 100, + "random_output_len": 100, + "extra_body": { + "voice": "Vivian", + "language": "English" + }, + "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_audio_ttfp_ms": [6000, 6000], + "mean_audio_rtf": [0.3, 0.3] + } + } + ] + } +] From de5f8a23b2cc4c51bdfe9d59f9887965c146d5d8 Mon Sep 17 00:00:00 2001 From: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com> Date: Thu, 16 Apr 2026 11:26:48 +0800 Subject: [PATCH 190/204] [Doc][Misc] Update DreamID-Omni Example; Add DreamID-Omni post process function (#2809) Signed-off-by: yuanheng --- .../offline_inference/x_to_video_audio.md | 28 ++++++++++++-- .../x_to_video_audio/x_to_video_audio.md | 28 ++++++++++++-- .../x_to_video_audio/x_to_video_audio.py | 38 +++++++++++++++---- .../dreamid_omni/pipeline_dreamid_omni.py | 15 ++++++++ vllm_omni/diffusion/registry.py | 1 + 5 files changed, 96 insertions(+), 14 deletions(-) diff --git 
a/docs/user_guide/examples/offline_inference/x_to_video_audio.md b/docs/user_guide/examples/offline_inference/x_to_video_audio.md index 8ea39d8115..cec8d47c59 100644 --- a/docs/user_guide/examples/offline_inference/x_to_video_audio.md +++ b/docs/user_guide/examples/offline_inference/x_to_video_audio.md @@ -31,9 +31,9 @@ dreamid_omni/ ``` ### Run the Inference -``` +```python python x_to_video_audio.py \ - --model /xx/dreamid_omni \ + --model /path/to/dreamid_omni \ --prompt "Two people walking together and singing happily" \ --image-path ./example0.png ./example1.png \ --audio-path ./example0.wav ./example1.wav \ @@ -43,11 +43,33 @@ python x_to_video_audio.py \ --num-inference-steps 45 \ --height 704 \ --width 1280 \ - --output dreamid_omni.mp4 + --output out_dreamid_omni_twoip.mp4 ``` In the current test scenario (2 images + 2 audio inputs), the VRAM requirement is 72GB, regardless of whether cfg-parallel is enabled or disabled. The VRAM usage can be reduced by enabling CPU offload via --enable-cpu-offload. + +You could take reference images/audios from the test cases in the official repo: https://github.com/Guoxu1233/DreamID-Omni + +For example, single IP ref resources can be found under https://github.com/Guoxu1233/DreamID-Omni/tree/main/test_case/oneip, you could download them correspondingly to your local and use them for testing. + +```python +# Example usage for oneip, ref media from the official repo DreamID-Omni +python x_to_video_audio.py \ + --model /path/to/dreamid_omni \ + --prompt ": In the frame, a woman with black long hair is identified as .\n**Overall Environment/Scene**: A lively open-kitchen café at night; stove flames flare, steam rises, and warm pendant lights swing slightly as staff move behind her. The shot is an upper-body close-up.\n**Main Characters/Subjects Appearance**: is a young woman with thick dark wavy hair and a side part. She wears a fitted black top under a light apron, a thin gold chain necklace, and small stud earrings.\n**Main Characters/Subjects Actions**: tastes the sauce with a spoon, then turns her face toward the camera while still holding the spoon, her expression shifting from focused to conflicted.\n maintains eye contact, swallows as if choosing her words, and says, I keep telling myself I’m fine,but some nights it feels like I’m just performing calm." \ + --image-path 9.png \ + --audio-path 9.wav \ + --video-negative-prompt "jitter, bad hands, blur, distortion" \ + --audio-negative-prompt "robotic, muffled, echo, distorted" \ + --cfg-parallel-size 2 \ + --num-inference-steps 45 \ + --height 704 \ + --width 1280 \ + --output out_dreamid_omni_oneip.mp4 +``` + + Key arguments: - `--prompt`: text description (string). - `--model`: path to the model local directory. 
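For reference, DreamID-Omni returns the video as a (C, F, H, W) float tensor in [-1, 1] together with a mono audio waveform, and the bundled example script muxes them into the output MP4 with `mux_video_audio_bytes`. The sketch below condenses that post-processing; the helper name is illustrative, and the 24 fps / 16 kHz defaults simply mirror the example in this patch, so treat them as assumptions if a checkpoint reports different values.

```python
import numpy as np

from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes


def save_dreamid_output(video, audio, output_path, fps=24, sample_rate=16000):
    """Convert (C, F, H, W) frames in [-1, 1] to (F, H, W, C) uint8 and mux with audio."""
    frames = np.asarray(video).transpose(1, 2, 3, 0)  # (C, F, H, W) -> (F, H, W, C)
    frames = (np.clip((frames + 1.0) / 2.0, 0.0, 1.0) * 255.0).round().astype(np.uint8)

    audio_np = None
    if audio is not None:
        audio_np = np.squeeze(np.asarray(audio)).astype(np.float32)  # mono float32 waveform

    video_bytes = mux_video_audio_bytes(frames, audio_np, fps=float(fps), audio_sample_rate=sample_rate)
    with open(output_path, "wb") as f:
        f.write(video_bytes)
```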
diff --git a/examples/offline_inference/x_to_video_audio/x_to_video_audio.md b/examples/offline_inference/x_to_video_audio/x_to_video_audio.md index 4b5188f41b..13f2cfe7c0 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.md +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.md @@ -30,9 +30,9 @@ dreamid_omni/ ``` ### Run the Inference -``` +```python python x_to_video_audio.py \ - --model /xx/dreamid_omni \ + --model /path/to/dreamid_omni \ --prompt "Two people walking together and singing happily" \ --image-path ./example0.png ./example1.png \ --audio-path ./example0.wav ./example1.wav \ @@ -42,11 +42,33 @@ python x_to_video_audio.py \ --num-inference-steps 45 \ --height 704 \ --width 1280 \ - --output dreamid_omni.mp4 + --output out_dreamid_omni_twoip.mp4 ``` In the current test scenario (2 images + 2 audio inputs), the VRAM requirement is 72GB, regardless of whether cfg-parallel is enabled or disabled. The VRAM usage can be reduced by enabling CPU offload via --enable-cpu-offload. + +You could take reference images/audios from the test cases in the official repo: https://github.com/Guoxu1233/DreamID-Omni + +For example, single IP ref resources can be found under https://github.com/Guoxu1233/DreamID-Omni/tree/main/test_case/oneip, you could download them correspondingly to your local and use them for testing. + +```python +# Example usage for oneip, ref media from the official repo DreamID-Omni +python x_to_video_audio.py \ + --model /path/to/dreamid_omni \ + --prompt ": In the frame, a woman with black long hair is identified as .\n**Overall Environment/Scene**: A lively open-kitchen café at night; stove flames flare, steam rises, and warm pendant lights swing slightly as staff move behind her. The shot is an upper-body close-up.\n**Main Characters/Subjects Appearance**: is a young woman with thick dark wavy hair and a side part. She wears a fitted black top under a light apron, a thin gold chain necklace, and small stud earrings.\n**Main Characters/Subjects Actions**: tastes the sauce with a spoon, then turns her face toward the camera while still holding the spoon, her expression shifting from focused to conflicted.\n maintains eye contact, swallows as if choosing her words, and says, I keep telling myself I’m fine,but some nights it feels like I’m just performing calm." \ + --image-path 9.png \ + --audio-path 9.wav \ + --video-negative-prompt "jitter, bad hands, blur, distortion" \ + --audio-negative-prompt "robotic, muffled, echo, distorted" \ + --cfg-parallel-size 2 \ + --num-inference-steps 45 \ + --height 704 \ + --width 1280 \ + --output out_dreamid_omni_oneip.mp4 +``` + + Key arguments: - `--prompt`: text description (string). - `--model`: path to the model local directory. 
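The example script in the next diff reads its audio metadata from `result.multimodal_output`; that dict is produced by the new per-pipeline post-process hook registered for `DreamIDOmniPipeline`. A minimal sketch of the hook's contract follows, with field names and constants taken from this patch; how the dict is routed back to callers is described only as far as the updated example script reads it.

```python
def dreamid_omni_post_process(output):
    """Normalize DreamID-Omni's raw (video, audio) tuple into a keyed output dict."""
    if isinstance(output, tuple) and len(output) == 2:
        video, audio = output
        return {
            "video": video,              # (C, F, H, W) float frames in [-1, 1]
            "audio": audio,              # mono waveform
            "audio_sample_rate": 16000,  # sample rate used by this patch
            "fps": 24,                   # frame rate used by this patch
        }
    return output  # anything already post-processed passes through unchanged


# Consumer side, as read by the updated x_to_video_audio.py: the video surfaces as
# result.images[0], and the remaining fields via result.multimodal_output, e.g.
#   mm = result.multimodal_output or {}
#   audio = mm.get("audio")
#   fps = int(mm.get("fps", 24))
#   sample_rate = int(mm.get("audio_sample_rate", 16000))
```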
diff --git a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py index 49a0f496f8..322b184e52 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py @@ -5,10 +5,12 @@ import re import time +import numpy as np from PIL import Image from vllm.multimodal.media.audio import load_audio from vllm_omni.diffusion.data import DiffusionParallelConfig +from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -131,15 +133,35 @@ def main() -> None: if not outputs: raise RuntimeError("No output returned from DreamID-Omni.") - output = outputs[0].request_output - generated_video = output.images[0][0] - generated_audio = output.images[0][1] - try: - from dreamid_omni.utils.io_utils import save_video - except Exception as e: - raise RuntimeError(f"Failed to extract video and audio from DreamID-Omni output. Error: {e}") + result = outputs[0] + if not result.images: + raise RuntimeError("No video frames found in DreamID-Omni output.") + generated_video = result.images[0] + mm = result.multimodal_output or {} + generated_audio = mm.get("audio") + fps = int(mm.get("fps", 24)) + sample_rate = int(mm.get("audio_sample_rate", 16000)) + + # DreamID-Omni returns video as (C, F, H, W) float32 in [-1, 1]. + # mux_video_audio_bytes expects (F, H, W, C) uint8. + if not isinstance(generated_video, np.ndarray) or generated_video.ndim != 4: + raise RuntimeError(f"Unexpected video shape: {getattr(generated_video, 'shape', None)}") + frames = generated_video.transpose(1, 2, 3, 0) + frames = (np.clip((frames + 1.0) / 2.0, 0.0, 1.0) * 255.0).round().astype(np.uint8) + + audio_np = None + if generated_audio is not None: + audio_np = np.squeeze(np.asarray(generated_audio)).astype(np.float32) + output_path = args.output - save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000) + video_bytes = mux_video_audio_bytes( + frames, + audio_np, + fps=float(fps), + audio_sample_rate=sample_rate, + ) + with open(output_path, "wb") as f: + f.write(video_bytes) print(f"Saved generated video to {output_path}") print(f"Total time: {elapsed:.2f}s") diff --git a/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py b/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py index 974cc582f1..c7ab4662d1 100644 --- a/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py +++ b/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py @@ -38,6 +38,21 @@ logger = logging.getLogger(__name__) +def get_dreamid_omni_post_process_func(*args, **kwargs): + def post_process(output): + if isinstance(output, tuple) and len(output) == 2: + video, audio = output + return { + "video": video, + "audio": audio, + "audio_sample_rate": 16000, + "fps": 24, + } + return output + + return post_process + + AUDIO_CONFIG = { "patch_size": [1], "model_type": "t2a", diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index 517b061ece..0bf8c04517 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -375,6 +375,7 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) - "HunyuanVideo15ImageToVideoPipeline": "get_hunyuan_video_15_i2v_post_process_func", "MagiHumanPipeline": "get_magi_human_post_process_func", "OmniVoicePipeline": 
"get_omnivoice_post_process_func", + "DreamIDOmniPipeline": "get_dreamid_omni_post_process_func", } _DIFFUSION_PRE_PROCESS_FUNCS = { From b43c6c6663311090e5a276826f2e2005d13ac05f Mon Sep 17 00:00:00 2001 From: Lancer Date: Thu, 16 Apr 2026 12:20:46 +0800 Subject: [PATCH 191/204] [Feat] add GLM-Image SP support (#1983) Signed-off-by: Lancer Signed-off-by: Didan Deng <33117903+wtomin@users.noreply.github.com> Co-authored-by: Didan Deng <33117903+wtomin@users.noreply.github.com> --- .../models/glm_image/test_glm_image_sp.py | 134 ++++++++ .../diffusion/attention/parallel/ulysses.py | 4 - .../models/glm_image/glm_image_transformer.py | 288 ++++++++++++++---- .../models/glm_image/pipeline_glm_image.py | 35 ++- 4 files changed, 397 insertions(+), 64 deletions(-) create mode 100644 tests/diffusion/models/glm_image/test_glm_image_sp.py diff --git a/tests/diffusion/models/glm_image/test_glm_image_sp.py b/tests/diffusion/models/glm_image/test_glm_image_sp.py new file mode 100644 index 0000000000..1b1c8d7a75 --- /dev/null +++ b/tests/diffusion/models/glm_image/test_glm_image_sp.py @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for GLM-Image Sequence Parallelism support.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from vllm_omni.diffusion.data import DiffusionParallelConfig + + +@pytest.fixture(scope="function", autouse=True) +def setup_sp_groups(): + """Set up SP and TP groups for each test function.""" + with patch("vllm_omni.diffusion.distributed.parallel_state.get_sp_group") as mock_get_sp_group: + with patch("vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", return_value=1): + with patch("vllm.distributed.parallel_state.get_tp_group") as mock_get_tp_group: + mock_sp_group = MagicMock() + mock_sp_group.world_size = 4 + mock_get_sp_group.return_value = mock_sp_group + + mock_tp_group = MagicMock() + mock_tp_group.world_size = 1 + mock_get_tp_group.return_value = mock_tp_group + yield + + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def test_glm_image_sp_plan_defined(): + """Test that _sp_plan is properly defined on GlmImageTransformer2DModel.""" + from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageTransformer2DModel, + ) + + assert hasattr(GlmImageTransformer2DModel, "_sp_plan") + plan = GlmImageTransformer2DModel._sp_plan + assert plan is not None + + # Verify plan structure + assert "prepare" in plan + assert "proj_out" in plan + + +def test_glm_image_sp_plan_valid(): + """Validate _sp_plan structure.""" + from vllm_omni.diffusion.distributed.sp_plan import validate_sp_plan + from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageTransformer2DModel, + ) + + plan = GlmImageTransformer2DModel._sp_plan + validate_sp_plan(plan) + + +def test_glm_image_prepare_module_exists(): + """Test that GlmImagePrepare module exists.""" + from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImagePrepare, + ) + + assert GlmImagePrepare is not None + + +def test_glm_image_attention_accepts_parallel_config(): + """Test that GlmImageAttention accepts parallel_config parameter.""" + from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageAttention, + ) + + parallel_config = DiffusionParallelConfig( + ulysses_degree=2, + ring_degree=2, + tensor_parallel_size=1, + sequence_parallel_size=4, + ) + + attn = GlmImageAttention( + dim=2560, + num_heads=64, + 
head_dim=40, + parallel_config=parallel_config, + ) + + assert attn.parallel_config is not None + assert attn.parallel_config.sequence_parallel_size == 4 + + +def test_glm_image_transformer_block_accepts_parallel_config(): + """Test that GlmImageTransformerBlock accepts parallel_config parameter.""" + from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageTransformerBlock, + ) + + parallel_config = DiffusionParallelConfig( + ulysses_degree=2, + ring_degree=2, + tensor_parallel_size=1, + sequence_parallel_size=4, + ) + + block = GlmImageTransformerBlock( + dim=2560, + num_attention_heads=64, + attention_head_dim=40, + time_embed_dim=512, + parallel_config=parallel_config, + ) + + assert block.attn1.parallel_config is not None + assert block.attn1.parallel_config.sequence_parallel_size == 4 + + +def test_glm_image_has_sp_support(): + """Test that GLM-Image has SP support implemented.""" + from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageTransformer2DModel, + ) + + # Check that the model has parallel_config support + assert hasattr(GlmImageTransformer2DModel, "__init__") + + # Verify the model can be instantiated with SP config + + # This test just verifies the structure exists + # Actual SP testing requires multi-GPU setup + + +@pytest.mark.cuda +@pytest.mark.sp +def test_glm_image_sp_inference(): + """Test SP inference (requires multi-GPU setup).""" + pytest.skip("Requires multi-GPU SP setup") diff --git a/vllm_omni/diffusion/attention/parallel/ulysses.py b/vllm_omni/diffusion/attention/parallel/ulysses.py index 5d860b3350..326b5d4567 100644 --- a/vllm_omni/diffusion/attention/parallel/ulysses.py +++ b/vllm_omni/diffusion/attention/parallel/ulysses.py @@ -414,10 +414,6 @@ def pre_attention( def post_attention(self, attn_output: torch.Tensor, ctx: ParallelAttentionContext | None) -> torch.Tensor: assert isinstance(ctx, _UlyssesCtx), f"Unexpected ctx type: {type(ctx)!r}" - # If we have joint tensors (Text), they were Head-Sliced. - # The main sequence (Image) was Sequence-Sliced. - # attn_output contains [Joint_Sliced | Image_Sliced] (if strategy='front'). 
- if ctx.joint_len > 0: joint_len = ctx.joint_len diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py index 490e0198b9..7ff42a5f00 100644 --- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py +++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py @@ -19,10 +19,16 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata from vllm_omni.diffusion.attention.layer import Attention from vllm_omni.diffusion.cache.base import CachedTransformer -from vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.data import DiffusionParallelConfig, OmniDiffusionConfig from vllm_omni.diffusion.distributed.hsdp_utils import is_transformer_block_module +from vllm_omni.diffusion.distributed.sp_plan import ( + SequenceParallelInput, + SequenceParallelOutput, +) +from vllm_omni.diffusion.forward_context import get_forward_context logger = init_logger(__name__) @@ -108,8 +114,8 @@ def __init__( def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: batch_size, channel, height, width = hidden_states.shape - post_patch_height = height // self.patch_size - post_patch_width = width // self.patch_size + post_patch_height = torch.tensor(height // self.patch_size, device=hidden_states.device, dtype=torch.int64) + post_patch_width = torch.tensor(width // self.patch_size, device=hidden_states.device, dtype=torch.int64) # Reshape: [B, C, H, W] -> [B, H', W', C*p*p] -> [B, H'*W', C*p*p] hidden_states = hidden_states.reshape( @@ -159,6 +165,65 @@ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tens return (freqs.cos(), freqs.sin()) +class GlmImagePrepare(nn.Module): + """Prepare module for GLM-Image that handles patch embedding and RoPE computation. + + This module encapsulates the input processing pipeline to create a module boundary + where _sp_plan can shard outputs via split_output=True. + + Similar to Qwen-Image's ImageRopePrepare, this ensures hidden_states and RoPE + embeddings are sharded together to maintain dimension alignment. + """ + + def __init__( + self, + image_projector: nn.Module, + rope: GlmImageRotaryPosEmbed, + patch_size: int, + ): + super().__init__() + self.image_projector = image_projector + self.rope = rope + self.patch_size = patch_size + + def forward( + self, + hidden_states: torch.Tensor, + prior_hidden_states: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Process hidden_states and compute RoPE embeddings. 
+ + Args: + hidden_states: Input latent tensor [B, C, H, W] + prior_hidden_states: Optional prior embedding to add + + Returns: + hidden_states: Patched hidden states [B, seq_len, D] + rope_cos: RoPE cos embeddings [seq_len, dim] + rope_sin: RoPE sin embeddings [seq_len, dim] + post_patch_height: Scalar tensor for height after patching + post_patch_width: Scalar tensor for width after patching + """ + batch_size, num_channels, height, width = hidden_states.shape + + post_patch_height = torch.tensor(height // self.patch_size, device=hidden_states.device, dtype=torch.int64) + post_patch_width = torch.tensor(width // self.patch_size, device=hidden_states.device, dtype=torch.int64) + + # Compute RoPE (uses original 4D hidden_states shape) + image_rotary_emb = self.rope(hidden_states) + rope_cos = image_rotary_emb[0].to(hidden_states.device) + rope_sin = image_rotary_emb[1].to(hidden_states.device) + + # Patch embedding: [B, C, H, W] -> [B, seq_len, D] + hidden_states = self.image_projector(hidden_states) + + # Add prior embedding if provided + if prior_hidden_states is not None: + hidden_states = hidden_states + prior_hidden_states + + return hidden_states, rope_cos, rope_sin, post_patch_height, post_patch_width + + class GlmImageAdaLayerNormZero(nn.Module): """Adaptive LayerNorm with zero initialization for both image and text streams.""" @@ -397,6 +462,7 @@ def __init__( dim: int, num_heads: int, head_dim: int, + parallel_config: DiffusionParallelConfig | None = None, out_bias: bool = True, eps: float = 1e-5, ): @@ -404,6 +470,7 @@ def __init__( self.dim = dim self.total_num_heads = num_heads self.head_dim = head_dim + self.parallel_config = parallel_config # QKV projection (fused for efficiency) self.to_qkv = QKVParallelLinear( @@ -450,16 +517,19 @@ def forward( attention_mask: torch.Tensor | None = None, kv_cache: GlmImageLayerKVCache | None = None, kv_cache_mode: KVCacheMode | None = None, + hidden_states_mask: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: """ Forward pass for joint attention. 
Args: - hidden_states: Image hidden states [B, img_seq_len, D] - encoder_hidden_states: Text hidden states [B, text_seq_len, D] - image_rotary_emb: Tuple of (cos, sin) for RoPE + hidden_states: Image hidden states [B, img_seq_len, D] (sharded in SP mode) + encoder_hidden_states: Text hidden states [B, text_seq_len, D] (full in SP mode) + image_rotary_emb: Tuple of (cos, sin) for RoPE (sharded in SP mode) + attention_mask: Optional attention mask kv_cache: Optional layer KV cache for image editing kv_cache_mode: Cache mode (WRITE, READ, SKIP) + hidden_states_mask: Mask for SP padding (True=valid, False=padding) Returns: Tuple of (image_hidden_states, text_hidden_states) @@ -467,6 +537,13 @@ def forward( dtype = encoder_hidden_states.dtype batch_size, text_seq_length, _ = encoder_hidden_states.shape + # Check if SP is enabled + sp_size = self.parallel_config.sequence_parallel_size if self.parallel_config else None + use_sp = sp_size is not None and sp_size > 1 + if use_sp: + forward_ctx = get_forward_context() + use_sp = not forward_ctx.split_text_embed_in_sp + # Concatenate text and image: [text, image] hidden_states_combined = torch.cat([encoder_hidden_states, hidden_states], dim=1) @@ -485,41 +562,88 @@ def forward( query = self.norm_q(query).to(dtype=dtype) key = self.norm_k(key).to(dtype=dtype) - # Apply RoPE only to image tokens (not text tokens) - if image_rotary_emb is not None: - # Only apply RoPE to image part (after text_seq_length) - query_img = query[:, text_seq_length:, :, :] - key_img = key[:, text_seq_length:, :, :] - from diffusers.models.embeddings import apply_rotary_emb - - query_img = apply_rotary_emb(query_img, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) - key_img = apply_rotary_emb(key_img, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) - query = torch.cat([query[:, :text_seq_length, :, :], query_img], dim=1) - key = torch.cat([key[:, :text_seq_length, :, :], key_img], dim=1) - - # Handle KV cache for image editing - if kv_cache is not None and kv_cache_mode is not None: - if kv_cache_mode == KVCacheMode.WRITE: - kv_cache.store(key, value) - elif kv_cache_mode == KVCacheMode.READ: - k_cached, v_cached = kv_cache.get() - if k_cached is not None: - key = torch.cat([k_cached, key], dim=1) - value = torch.cat([v_cached, value], dim=1) - # KVCacheMode.SKIP: do nothing - - # Attention computation - hidden_states_out = self.attn(query, key, value) - hidden_states_out = hidden_states_out.flatten(2, 3) - hidden_states_out = hidden_states_out.to(dtype) + if use_sp: + # SP mode: use joint attention mechanism + # Split Q/K/V into text and image parts + text_query = query[:, :text_seq_length, :, :] + text_key = key[:, :text_seq_length, :, :] + text_value = value[:, :text_seq_length, :, :] + img_query = query[:, text_seq_length:, :, :] + img_key = key[:, text_seq_length:, :, :] + img_value = value[:, text_seq_length:, :, :] + + # Apply RoPE only to image part + if image_rotary_emb is not None: + from diffusers.models.embeddings import apply_rotary_emb + + img_query = apply_rotary_emb(img_query, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) + img_key = apply_rotary_emb(img_key, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) + + # Create attention metadata for joint attention + attn_metadata = AttentionMetadata( + joint_query=text_query, + joint_key=text_key, + joint_value=text_value, + joint_strategy="front", + ) - # Output projection - for module in self.to_out: - hidden_states_out = module(hidden_states_out) + # Add padding mask for 
SP if available + if hidden_states_mask is not None: + attn_metadata.attn_mask = hidden_states_mask + + # Attention computation with joint text/image + # Note: Ulysses post_attention returns [text, image] concatenated + joint_hidden_states_out = self.attn(img_query, img_key, img_value, attn_metadata) + + # Project combined [text, image] outputs, then split. + # This keeps SP numerically aligned with the non-SP path. + joint_hidden_states_out = joint_hidden_states_out.flatten(2, 3).to(dtype) + for module in self.to_out: + joint_hidden_states_out = module(joint_hidden_states_out) - # Split back to text and image - encoder_hidden_states_out = hidden_states_out[:, :text_seq_length, :] - hidden_states_out = hidden_states_out[:, text_seq_length:, :] + encoder_hidden_states_out = joint_hidden_states_out[:, :text_seq_length, :] + hidden_states_out = joint_hidden_states_out[:, text_seq_length:, :] + else: + # Non-SP mode: original logic + # Apply RoPE only to image tokens (not text tokens) + if image_rotary_emb is not None: + query_img = query[:, text_seq_length:, :, :] + key_img = key[:, text_seq_length:, :, :] + from diffusers.models.embeddings import apply_rotary_emb + + query_img = apply_rotary_emb(query_img, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) + key_img = apply_rotary_emb(key_img, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) + query = torch.cat([query[:, :text_seq_length, :, :], query_img], dim=1) + key = torch.cat([key[:, :text_seq_length, :, :], key_img], dim=1) + + # Handle KV cache for image editing + if kv_cache is not None and kv_cache_mode is not None: + if kv_cache_mode == KVCacheMode.WRITE: + kv_cache.store(key, value) + elif kv_cache_mode == KVCacheMode.READ: + k_cached, v_cached = kv_cache.get() + if k_cached is not None: + key = torch.cat([k_cached, key], dim=1) + value = torch.cat([v_cached, value], dim=1) + + # Attention computation + attn_metadata = None + if attention_mask is not None: + if attention_mask.dim() == 3: + attention_mask = attention_mask.unsqueeze(1) + attn_metadata = AttentionMetadata(attn_mask=attention_mask) + + hidden_states_out = self.attn(query, key, value, attn_metadata) + hidden_states_out = hidden_states_out.flatten(2, 3) + hidden_states_out = hidden_states_out.to(dtype) + + # Output projection + for module in self.to_out: + hidden_states_out = module(hidden_states_out) + + # Split back to text and image + encoder_hidden_states_out = hidden_states_out[:, :text_seq_length, :] + hidden_states_out = hidden_states_out[:, text_seq_length:, :] return hidden_states_out, encoder_hidden_states_out @@ -628,6 +752,7 @@ def __init__( attention_head_dim: int = 40, time_embed_dim: int = 512, ffn_hidden_dim: int | None = None, + parallel_config: DiffusionParallelConfig | None = None, ) -> None: super().__init__() @@ -637,6 +762,7 @@ def __init__( dim=dim, num_heads=num_attention_heads, head_dim=attention_head_dim, + parallel_config=parallel_config, ) # 2. Feedforward @@ -654,6 +780,7 @@ def forward( attention_kwargs: dict[str, Any] | None = None, kv_cache: GlmImageLayerKVCache | None = None, kv_cache_mode: KVCacheMode | None = None, + hidden_states_mask: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: """ Forward pass for transformer block. 
@@ -667,6 +794,7 @@ def forward( attention_kwargs: Additional attention arguments kv_cache: Layer-specific KV cache for image editing kv_cache_mode: Cache mode (WRITE, READ, SKIP) + hidden_states_mask: Mask for SP padding (True=valid, False=padding) Returns: Tuple of (image_hidden_states, text_hidden_states) @@ -693,6 +821,7 @@ def forward( attention_mask=attention_mask, kv_cache=kv_cache, kv_cache_mode=kv_cache_mode, + hidden_states_mask=hidden_states_mask, ) hidden_states = hidden_states + attn_hidden_states * gate_msa.unsqueeze(1) encoder_hidden_states = encoder_hidden_states + attn_encoder_hidden_states * c_gate_msa.unsqueeze(1) @@ -724,6 +853,26 @@ class GlmImageTransformer2DModel(CachedTransformer): """ _repeated_blocks = ["GlmImageTransformerBlock"] + # SP plan using GlmImagePrepare module for sharding hidden_states and RoPE together. + # Similar to Qwen-Image's ImageRopePrepare, this creates a module boundary where + # _sp_plan can shard outputs via split_output=True. + # + # Key insight: hidden_states and RoPE embeddings MUST be sharded together + # to maintain dimension alignment for RoPE computation in attention layers. + _sp_plan = { + # Shard GlmImagePrepare outputs (hidden_states and RoPE must be sharded together) + "prepare": { + # hidden_states: [B, seq_len, D] - shard along sequence dimension + 0: SequenceParallelInput(split_dim=1, expected_dims=3, split_output=True, auto_pad=True), + # RoPE cos: [seq_len, dim] - shard along sequence dimension + 1: SequenceParallelInput(split_dim=0, expected_dims=2, split_output=True, auto_pad=True), + # RoPE sin: [seq_len, dim] - shard along sequence dimension + 2: SequenceParallelInput(split_dim=0, expected_dims=2, split_output=True, auto_pad=True), + # post_patch_height and post_patch_width are scalars, not sharded + }, + # Gather output at proj_out + "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), + } _hsdp_shard_conditions = [is_transformer_block_module] @@ -790,6 +939,9 @@ def __init__( dim=inner_dim, dim_out=inner_dim, inner_dim=inner_dim, activation_fn="linear-silu" ) + # Prepare module for SP (encapsulates patch embedding and RoPE for _sp_plan) + self.prepare = GlmImagePrepare(self.image_projector, self.rope, patch_size) + self.time_condition_embed = GlmImageCombinedTimestepSizeEmbeddings( embedding_dim=time_embed_dim, condition_dim=condition_dim, @@ -806,6 +958,7 @@ def __init__( attention_head_dim, time_embed_dim, ffn_hidden_dim=ffn_hidden_dim, + parallel_config=self.parallel_config, ) for _ in range(num_layers) ] @@ -859,33 +1012,51 @@ def forward( # Get KV cache mode kv_cache_mode = kv_cache.mode if kv_cache is not None else None - # 1. RoPE - if image_rotary_emb is None: - image_rotary_emb = self.rope(hidden_states) - # Move to correct device - image_rotary_emb = ( - image_rotary_emb[0].to(hidden_states.device), - image_rotary_emb[1].to(hidden_states.device), - ) - - # 2. 
Patch & Timestep embeddings - p = self.patch_size - post_patch_height = height // p - post_patch_width = width // p + # Set SP context if enabled + sp_size = self.parallel_config.sequence_parallel_size + if sp_size is not None and sp_size > 1: + get_forward_context().split_text_embed_in_sp = False - hidden_states = self.image_projector(hidden_states) + # Text embedding projection encoder_hidden_states = self.glyph_projector(encoder_hidden_states) # Prior embedding with dropout prior_embedding = self.prior_token_embedding(prior_token_id) prior_embedding[prior_token_drop] *= 0.0 prior_hidden_states = self.prior_projector(prior_embedding) - hidden_states = hidden_states + prior_hidden_states + + # 1. Prepare hidden_states and RoPE via GlmImagePrepare module + # _sp_plan will shard hidden_states and RoPE together via split_output=True + hidden_states, rope_cos, rope_sin, post_patch_height_t, post_patch_width_t = self.prepare( + hidden_states, prior_hidden_states + ) + image_rotary_emb = (rope_cos, rope_sin) + post_patch_height = int(post_patch_height_t.item()) + post_patch_width = int(post_patch_width_t.item()) # Timestep conditioning temb = self.time_condition_embed(timestep, target_size, crop_coords, hidden_states.dtype) - # 3. Transformer blocks + # Create padding mask for SP if needed (after _sp_plan hooks have run) + hidden_states_mask = None + if sp_size is not None and sp_size > 1: + from vllm_omni.diffusion.forward_context import is_forward_context_available + + if is_forward_context_available(): + ctx = get_forward_context() + if ctx.sp_original_seq_len is not None and ctx.sp_padding_size > 0: + img_padded_seq_len = ctx.sp_original_seq_len + ctx.sp_padding_size + hidden_states_mask = torch.ones( + batch_size, + img_padded_seq_len, + dtype=torch.bool, + device=hidden_states.device, + ) + hidden_states_mask[:, ctx.sp_original_seq_len :] = False + if hidden_states_mask.all(): + hidden_states_mask = None + + # 2. Transformer blocks for layer_idx, block in enumerate(self.transformer_blocks): # Get layer-specific KV cache if available layer_kv_cache = kv_cache[layer_idx] if kv_cache is not None else None @@ -899,13 +1070,16 @@ def forward( attention_kwargs, kv_cache=layer_kv_cache, kv_cache_mode=kv_cache_mode, + hidden_states_mask=hidden_states_mask, ) - # 4. Output norm & projection + # 3. Output norm & projection + # _sp_plan will gather hidden_states via proj_out hook hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) - # 5. Unpatchify: [B, H'*W', C*p*p] -> [B, C, H, W] + # 4. 
Unpatchify: [B, H'*W', C*p*p] -> [B, C, H, W] + p = self.patch_size hidden_states = hidden_states.reshape(batch_size, post_patch_height, post_patch_width, -1, p, p) output = hidden_states.permute(0, 3, 1, 4, 2, 5).flatten(4, 5).flatten(2, 3) diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index 375f7e7b80..0386364998 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -712,6 +712,14 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: if img is not None: preprocessed_images = [img] + # Priority: prompt dict (from ar2diffusion) > sampling_params + # ar2diffusion returns adjusted height/width that matches prior_token_ids + if not isinstance(first_prompt, str): + ar_height = first_prompt.get("height") + ar_width = first_prompt.get("width") + else: + ar_height = ar_width = None + img_height = req.sampling_params.height img_width = req.sampling_params.width @@ -719,12 +727,19 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: # Treat that as t2i warmup to avoid requiring i2i-only KV-cache inputs. is_image_edit = (preprocessed_images is not None) and (not is_dummy_warmup) - # Use image dimensions as default if available - height = req.sampling_params.height or img_height or self.default_sample_size * self.vae_scale_factor - width = req.sampling_params.width or img_width or self.default_sample_size * self.vae_scale_factor + # Use prompt dict dimensions (from ar2diffusion) as priority, then sampling_params + height = ( + ar_height or req.sampling_params.height or img_height or self.default_sample_size * self.vae_scale_factor + ) + width = ar_width or req.sampling_params.width or img_width or self.default_sample_size * self.vae_scale_factor num_inference_steps = req.sampling_params.num_inference_steps or 50 guidance_scale = req.sampling_params.guidance_scale or 1.5 + # Ensure dimensions are multiples of vae_scale_factor * patch_size + multiple_of = self.vae_scale_factor * self._patch_size + height = height // multiple_of * multiple_of + width = width // multiple_of * multiple_of + self.check_inputs(prompt=prompt, height=height, width=width, prompt_embeds=prompt_embeds) batch_size = 1 @@ -753,6 +768,20 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: prior_token_id = prior_token_id.to(device=self.device, dtype=torch.long) if prior_token_id.dim() == 1: prior_token_id = prior_token_id.unsqueeze(0) + + # Validate that prior_token_id seq_len matches dimensions + prior_seq_len = prior_token_id.shape[1] + expected_seq_len = (height // self.vae_scale_factor // self._patch_size) * ( + width // self.vae_scale_factor // self._patch_size + ) + if prior_seq_len != expected_seq_len: + raise ValueError( + f"prior_token_ids seq_len ({prior_seq_len}) doesn't match dimensions " + f"({height}x{width}, expected seq_len={expected_seq_len}). " + f"This indicates a mismatch between AR output and Diffusion input. " + f"Please ensure ar2diffusion returns correct height/width." 
+ ) + prior_token_image_ids = None if external_prior_image_ids is not None: if isinstance(external_prior_image_ids, torch.Tensor): From 24e61f4d7bccb61d020f9020c22da51546a4c7c5 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Thu, 16 Apr 2026 14:52:53 +0800 Subject: [PATCH 192/204] [CI] add qwen image and layered accuracy test (#2772) Signed-off-by: david6666666 <530634352@qq.com> --- tests/e2e/accuracy/test_qwen_image.py | 124 ++++++++++++++ tests/e2e/accuracy/test_qwen_image_layered.py | 151 ++++++++++++++++++ tests/e2e/accuracy/utils.py | 47 ++++-- 3 files changed, 313 insertions(+), 9 deletions(-) create mode 100644 tests/e2e/accuracy/test_qwen_image.py create mode 100644 tests/e2e/accuracy/test_qwen_image_layered.py diff --git a/tests/e2e/accuracy/test_qwen_image.py b/tests/e2e/accuracy/test_qwen_image.py new file mode 100644 index 0000000000..e73195017a --- /dev/null +++ b/tests/e2e/accuracy/test_qwen_image.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +import base64 +import gc +import io +import os +from pathlib import Path + +import pytest +import requests +import torch +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from PIL import Image + +from tests.conftest import ( + OmniServer, + _run_post_test_cleanup, + _run_pre_test_cleanup, +) +from tests.e2e.accuracy.utils import assert_similarity, model_output_dir +from tests.utils import hardware_test + +MODEL_ID = "Qwen/Qwen-Image" +MODEL_ENV_VAR = "QWEN_IMAGE_MODEL" +PROMPT = "A photo of a cat sitting on a laptop keyboard, digital art style." +NEGATIVE_PROMPT = "blurry, low quality" +WIDTH = 512 +HEIGHT = 512 +NUM_INFERENCE_STEPS = 20 +TRUE_CFG_SCALE = 4.0 +SEED = 42 +SSIM_THRESHOLD = 0.97 +PSNR_THRESHOLD = 30.0 + + +def _model_name() -> str: + return os.environ.get(MODEL_ENV_VAR, MODEL_ID) + + +def _local_files_only(model: str) -> bool: + return Path(model).exists() + + +def _run_vllm_omni_qwen_image(*, model: str, output_path: Path) -> Image.Image: + server_args = ["--num-gpus", "1", "--stage-init-timeout", "300", "--init-timeout", "900"] + with OmniServer(model, server_args, use_omni=True) as omni_server: + response = requests.post( + f"http://{omni_server.host}:{omni_server.port}/v1/images/generations", + json={ + "model": omni_server.model, + "prompt": PROMPT, + "size": f"{WIDTH}x{HEIGHT}", + "n": 1, + "response_format": "b64_json", + "negative_prompt": NEGATIVE_PROMPT, + "num_inference_steps": NUM_INFERENCE_STEPS, + "true_cfg_scale": TRUE_CFG_SCALE, + "seed": SEED, + }, + timeout=600, + ) + response.raise_for_status() + payload = response.json() + assert len(payload["data"]) == 1 + image_bytes = base64.b64decode(payload["data"][0]["b64_json"]) + image = Image.open(io.BytesIO(image_bytes)).convert("RGB") + image.load() + image.save(output_path) + return image + + +def _run_diffusers_qwen_image(*, model: str, output_path: Path) -> Image.Image: + _run_pre_test_cleanup(enable_force=True) + pipe: DiffusionPipeline | None = None + try: + pipe = DiffusionPipeline.from_pretrained( + model, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + local_files_only=_local_files_only(model), + ).to("cuda") + generator = torch.Generator(device="cuda").manual_seed(SEED) + result = pipe( # pyright: ignore[reportCallIssue] + prompt=PROMPT, + negative_prompt=NEGATIVE_PROMPT, + width=WIDTH, + height=HEIGHT, + num_inference_steps=NUM_INFERENCE_STEPS, + true_cfg_scale=TRUE_CFG_SCALE, + generator=generator, + ) + output_image = result.images[0].convert("RGB") + 
output_image.save(output_path) + return output_image + finally: + if pipe is not None and hasattr(pipe, "maybe_free_model_hooks"): + pipe.maybe_free_model_hooks() + del pipe + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + _run_post_test_cleanup(enable_force=True) + + +@pytest.mark.advanced_model +@pytest.mark.benchmark +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}, num_cards=1) +def test_qwen_image_matches_diffusers(accuracy_artifact_root: Path) -> None: + model = _model_name() + output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID) + + vllm_output = _run_vllm_omni_qwen_image(model=model, output_path=output_dir / "vllm_omni.png") + diffusers_output = _run_diffusers_qwen_image(model=model, output_path=output_dir / "diffusers.png") + + assert_similarity( + model_name=MODEL_ID, + vllm_image=vllm_output, + diffusers_image=diffusers_output, + width=WIDTH, + height=HEIGHT, + ssim_threshold=SSIM_THRESHOLD, + psnr_threshold=PSNR_THRESHOLD, + ) diff --git a/tests/e2e/accuracy/test_qwen_image_layered.py b/tests/e2e/accuracy/test_qwen_image_layered.py new file mode 100644 index 0000000000..04b13df3bb --- /dev/null +++ b/tests/e2e/accuracy/test_qwen_image_layered.py @@ -0,0 +1,151 @@ +from __future__ import annotations + +import base64 +import gc +import io +import os +from pathlib import Path + +import pytest +import requests +import torch +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from PIL import Image + +from tests.conftest import ( + OmniServer, + _run_post_test_cleanup, + _run_pre_test_cleanup, +) +from tests.e2e.accuracy.utils import assert_image_sequence_similarity, model_output_dir +from tests.utils import hardware_test + +MODEL_ID = "Qwen/Qwen-Image-Layered" +MODEL_ENV_VAR = "QWEN_IMAGE_LAYERED_MODEL" +PROMPT = "decompose into layers" +NEGATIVE_PROMPT = " " +NUM_INFERENCE_STEPS = 20 +TRUE_CFG_SCALE = 4.0 +SEED = 777 +LAYERS = 3 +RESOLUTION = 640 +SSIM_THRESHOLD = 0.97 +PSNR_THRESHOLD = 30.0 + + +def _model_name() -> str: + return os.environ.get(MODEL_ENV_VAR, MODEL_ID) + + +def _local_files_only(model: str) -> bool: + return Path(model).exists() + + +def _normalize_layered_images(images: object) -> list[Image.Image]: + if not isinstance(images, list) or not images: + raise AssertionError(f"Unexpected layered output container: {type(images).__name__}") + + first_item = images[0] + if isinstance(first_item, Image.Image): + return [image.convert("RGBA") for image in images if isinstance(image, Image.Image)] + if isinstance(first_item, (list, tuple)): + return [image.convert("RGBA") for image in first_item if isinstance(image, Image.Image)] + raise AssertionError(f"Unexpected layered image element type: {type(first_item).__name__}") + + +def _run_vllm_omni_qwen_image_layered(*, model: str, input_image: Image.Image, output_dir: Path) -> list[Image.Image]: + input_image.save(output_dir / "input.png") + server_args = ["--num-gpus", "1", "--stage-init-timeout", "300", "--init-timeout", "900"] + with OmniServer(model, server_args, use_omni=True) as omni_server: + buffer = io.BytesIO() + input_image.save(buffer, format="PNG") + buffer.seek(0) + response = requests.post( + f"http://{omni_server.host}:{omni_server.port}/v1/images/edits", + data={ + "model": omni_server.model, + "prompt": PROMPT, + "size": "auto", + "n": 1, + "response_format": "b64_json", + "negative_prompt": NEGATIVE_PROMPT, + "num_inference_steps": NUM_INFERENCE_STEPS, + "true_cfg_scale": TRUE_CFG_SCALE, + "seed": SEED, + "layers": LAYERS, + "resolution": 
RESOLUTION, + }, + files=[("image", ("input.png", buffer, "image/png"))], + timeout=600, + ) + response.raise_for_status() + payload = response.json() + assert len(payload["data"]) == LAYERS + output_images = [] + for item in payload["data"]: + image_bytes = base64.b64decode(item["b64_json"]) + image = Image.open(io.BytesIO(image_bytes)).convert("RGBA") + image.load() + output_images.append(image) + for index, image in enumerate(output_images, start=1): + image.save(output_dir / f"vllm_omni_layer_{index}.png") + return output_images + + +def _run_diffusers_qwen_image_layered(*, model: str, input_image: Image.Image, output_dir: Path) -> list[Image.Image]: + _run_pre_test_cleanup(enable_force=True) + pipe: DiffusionPipeline | None = None + try: + pipe = DiffusionPipeline.from_pretrained( + model, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + local_files_only=_local_files_only(model), + ).to("cuda") + generator = torch.Generator(device="cuda").manual_seed(SEED) + result = pipe( # pyright: ignore[reportCallIssue] + image=input_image, + prompt=PROMPT, + negative_prompt=NEGATIVE_PROMPT, + num_inference_steps=NUM_INFERENCE_STEPS, + true_cfg_scale=TRUE_CFG_SCALE, + generator=generator, + num_images_per_prompt=1, + layers=LAYERS, + resolution=RESOLUTION, + ) + output_images = _normalize_layered_images(result.images) + assert len(output_images) == LAYERS, f"Expected {LAYERS} diffusers layers, got {len(output_images)}" + for index, image in enumerate(output_images, start=1): + image.save(output_dir / f"diffusers_layer_{index}.png") + return output_images + finally: + if pipe is not None and hasattr(pipe, "maybe_free_model_hooks"): + pipe.maybe_free_model_hooks() + del pipe + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + _run_post_test_cleanup(enable_force=True) + + +@pytest.mark.advanced_model +@pytest.mark.benchmark +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}, num_cards=1) +def test_qwen_image_layered_matches_diffusers(accuracy_artifact_root: Path, qwen_bear_image: Image.Image) -> None: + model = _model_name() + output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID) + input_image = qwen_bear_image.convert("RGBA") + + vllm_outputs = _run_vllm_omni_qwen_image_layered(model=model, input_image=input_image, output_dir=output_dir) + diffusers_outputs = _run_diffusers_qwen_image_layered(model=model, input_image=input_image, output_dir=output_dir) + + assert_image_sequence_similarity( + model_name=MODEL_ID, + vllm_images=vllm_outputs, + diffusers_images=diffusers_outputs, + ssim_threshold=SSIM_THRESHOLD, + psnr_threshold=PSNR_THRESHOLD, + compare_mode="RGBA", + ) diff --git a/tests/e2e/accuracy/utils.py b/tests/e2e/accuracy/utils.py index eb0eea757e..d722b69b01 100644 --- a/tests/e2e/accuracy/utils.py +++ b/tests/e2e/accuracy/utils.py @@ -21,13 +21,14 @@ def assert_similarity( model_name: str, vllm_image: Image.Image, diffusers_image: Image.Image, - width: int, - height: int, ssim_threshold: float, psnr_threshold: float, + width: int | None = None, + height: int | None = None, + compare_mode: str = "RGB", ) -> None: - requested_size = (width, height) - if diffusers_image.size != requested_size: + requested_size = (width, height) if width is not None and height is not None else None + if requested_size is not None and diffusers_image.size != requested_size: pytest.skip( "Skipping as diffusers baseline output is corrupt and not comparable: " f"dimensions do not match requested size; requested={requested_size}, got={diffusers_image.size}." 
@@ -37,7 +38,11 @@ def assert_similarity( f"Online and diffusers output sizes mismatch: online={vllm_image.size}, diffusers={diffusers_image.size}" ) - ssim_score, psnr_score = compute_image_ssim_psnr(prediction=vllm_image, reference=diffusers_image) + ssim_score, psnr_score = compute_image_ssim_psnr( + prediction=vllm_image, + reference=diffusers_image, + compare_mode=compare_mode, + ) print(f"{model_name} similarity metrics:") print(f" SSIM: value={ssim_score:.6f}, threshold>={ssim_threshold:.6f}, range=[-1, 1], higher_is_better=True") print( @@ -52,13 +57,37 @@ def assert_similarity( ) +def assert_image_sequence_similarity( + *, + model_name: str, + vllm_images: list[Image.Image], + diffusers_images: list[Image.Image], + ssim_threshold: float, + psnr_threshold: float, + compare_mode: str = "RGB", +) -> None: + assert len(vllm_images) == len(diffusers_images), ( + f"Output image count mismatch for {model_name}: online={len(vllm_images)}, diffusers={len(diffusers_images)}" + ) + for index, (vllm_image, diffusers_image) in enumerate(zip(vllm_images, diffusers_images, strict=True), start=1): + assert_similarity( + model_name=f"{model_name}[layer={index}]", + vllm_image=vllm_image, + diffusers_image=diffusers_image, + ssim_threshold=ssim_threshold, + psnr_threshold=psnr_threshold, + compare_mode=compare_mode, + ) + + def compute_image_ssim_psnr( *, prediction: Image.Image, reference: Image.Image, + compare_mode: str = "RGB", ) -> tuple[float, float]: - pred_tensor = _pil_to_batched_tensor(prediction) - ref_tensor = _pil_to_batched_tensor(reference) + pred_tensor = _pil_to_batched_tensor(prediction, compare_mode=compare_mode) + ref_tensor = _pil_to_batched_tensor(reference, compare_mode=compare_mode) ssim_metric = StructuralSimilarityIndexMeasure(data_range=1.0) psnr_metric = PeakSignalNoiseRatio(data_range=1.0) @@ -68,7 +97,7 @@ def compute_image_ssim_psnr( return ssim_value, psnr_value -def _pil_to_batched_tensor(image: Image.Image) -> torch.Tensor: - array = np.asarray(image.convert("RGB"), dtype=np.float32) / 255.0 +def _pil_to_batched_tensor(image: Image.Image, *, compare_mode: str) -> torch.Tensor: + array = np.asarray(image.convert(compare_mode), dtype=np.float32) / 255.0 tensor = torch.from_numpy(array).permute(2, 0, 1).unsqueeze(0) return tensor From 4d816ff1ded1e35393d6175d8f0dbbe07d570add Mon Sep 17 00:00:00 2001 From: NATURE Date: Thu, 16 Apr 2026 16:25:13 +0800 Subject: [PATCH 193/204] [Feature] Bagel: Support tp+cfg parallel using mooncake transfer engine connector (#2705) Signed-off-by: natureofnature Co-authored-by: Hongsheng Liu --- .../omni_connectors/test_tp_rank_aware.py | 716 +++++++++++++++++ .../test_async_omni_engine_stage_init.py | 69 ++ tests/engine/test_single_stage_mode.py | 2 + .../distributed/group_coordinator.py | 5 +- .../diffusion/models/bagel/pipeline_bagel.py | 36 +- .../omni_connectors/kv_transfer_manager.py | 721 ++++++++++++------ .../omni_connectors/utils/kv_utils.py | 367 ++++++++- vllm_omni/engine/async_omni_engine.py | 16 +- vllm_omni/engine/stage_engine_core_client.py | 7 +- vllm_omni/engine/stage_init_utils.py | 116 ++- vllm_omni/entrypoints/openai/serving_chat.py | 53 +- vllm_omni/inputs/data.py | 4 + 12 files changed, 1846 insertions(+), 266 deletions(-) create mode 100644 tests/distributed/omni_connectors/test_tp_rank_aware.py diff --git a/tests/distributed/omni_connectors/test_tp_rank_aware.py b/tests/distributed/omni_connectors/test_tp_rank_aware.py new file mode 100644 index 0000000000..d4793479aa --- /dev/null +++ 
b/tests/distributed/omni_connectors/test_tp_rank_aware.py @@ -0,0 +1,716 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for rank-aware KV transfer (TP > 1) and heterogeneous TP support. + +Covers: +- _build_rank_aware_send_keys / _build_rank_aware_recv_keys +- _get_kv_source_ranks / _get_kv_target_ranks / get_kv_connector_key +- update_sender_info storing base host/port +- receive path constructing per-rank metadata for connector.get() +- Mooncake connector _query_metadata_at and partial-metadata get() path +""" + +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from vllm_omni.distributed.omni_connectors.kv_transfer_manager import ( + KVCacheTransferData, + OmniKVCacheConfig, + OmniKVTransferManager, +) +from vllm_omni.distributed.omni_connectors.utils.initialization import ( + KV_RANK_PORT_STRIDE, +) +from vllm_omni.distributed.omni_connectors.utils.kv_utils import ( + KVTPTopology, + build_rank_aware_recv_keys, + build_rank_aware_send_keys, + get_kv_connector_key, + get_kv_source_ranks, + get_kv_target_ranks, + merge_received_rank_shards, + slice_received_rank_shard, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def _make_manager( + from_tp: int = 1, + to_tp: int = 1, + local_rank: int = 0, + from_stage: str = "stage0", + to_stage: str = "stage1", + stage_id: str = "stage1", + need_recv: bool = True, + need_send: bool = False, + recv_timeout: float = 0.3, +) -> OmniKVTransferManager: + """Build a manager with TP params injected, bypassing torch.distributed.""" + config = OmniKVCacheConfig( + connector_config={"type": "mock"}, + from_stage=from_stage, + to_stage=to_stage, + stage_id=stage_id, + need_recv_cache=need_recv, + need_send_cache=need_send, + recv_timeout=recv_timeout, + from_tp=from_tp, + to_tp=to_tp, + ) + with ( + patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_local_tp_rank", return_value=local_rank), + patch( + "vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_tp_world_size", + return_value=max(from_tp, to_tp), + ), + ): + mgr = OmniKVTransferManager(config) + return mgr + + +def _make_payload(head_values: list[float], request_id: str = "req-1") -> dict: + head_tensor = torch.tensor(head_values, dtype=torch.float32).view(1, len(head_values), 1).repeat(2, 1, 1) + return { + "request_id": request_id, + "layer_blocks": { + "key_cache": [head_tensor.clone()], + "value_cache": [(head_tensor + 100).clone()], + }, + "block_ids": [0], + "metadata": {"seq_len": 2}, + } + + +def _make_transfer_data(head_values: list[float], request_id: str = "req-1") -> KVCacheTransferData: + payload = _make_payload(head_values, request_id=request_id) + return KVCacheTransferData( + request_id=request_id, + layer_blocks=payload["layer_blocks"], + block_ids=payload["block_ids"], + metadata=payload["metadata"], + ) + + +# ── Key format helper ──────────────────────────────────────────────── + + +class TestConnectorKeyFormat: + def test_key_format_matches_pr2677(self): + key = get_kv_connector_key("req-1", "stage0", 0, 1, 2) + assert key == "req-1_stage0_0_1_2" + + def test_key_fields_are_positional(self): + key = get_kv_connector_key("r", "s", 5, 3, 7) + parts = key.split("_") + assert parts == ["r", "s", "5", "3", "7"] + + +# ── Source / target rank mapping ───────────────────────────────────── + + +class TestRankMapping: + """Verify get_kv_target_ranks and get_kv_source_ranks for various TP configs.""" + + 
def test_homogeneous_tp2_rank0(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=0) + assert get_kv_target_ranks(topo) == [0] + assert get_kv_source_ranks(topo) == [0] + + def test_homogeneous_tp2_rank1(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=1) + assert get_kv_target_ranks(topo) == [1] + assert get_kv_source_ranks(topo) == [1] + + def test_homogeneous_tp4_rank3(self): + topo = KVTPTopology(source_tp_size=4, target_tp_size=4, local_rank=3) + assert get_kv_target_ranks(topo) == [3] + assert get_kv_source_ranks(topo) == [3] + + def test_sender_gt_receiver_tp4_to_tp2_rank0(self): + """Receiver rank 0 should receive from sender rank 0 and 1.""" + topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=0) + assert get_kv_source_ranks(topo) == [0, 1] + + def test_sender_gt_receiver_tp4_to_tp2_rank1(self): + """Receiver rank 1 should receive from sender rank 2 and 3.""" + topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=1) + assert get_kv_source_ranks(topo) == [2, 3] + + def test_sender_lt_receiver_tp2_to_tp4_rank0(self): + """Sender rank 0 should send to receiver ranks 0 and 1.""" + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=0) + assert get_kv_target_ranks(topo) == [0, 1] + + def test_sender_lt_receiver_tp2_to_tp4_rank1(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=1) + assert get_kv_target_ranks(topo) == [2, 3] + + def test_receiver_lt_sender_source_ranks(self): + """Receiver rank 0 with tp2_to_tp4 should source from rank 0 only.""" + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=0) + assert get_kv_source_ranks(topo) == [0] + + def test_invalid_topology_raises(self): + topo = KVTPTopology(source_tp_size=3, target_tp_size=2, local_rank=0) + with pytest.raises(ValueError, match="divisible"): + get_kv_source_ranks(topo) + + +# ── _build_rank_aware_recv_keys ────────────────────────────────────── + + +class TestBuildRankAwareRecvKeys: + """Verify build_rank_aware_recv_keys returns (key, from_rank) tuples.""" + + def test_tp1_returns_legacy_key_with_none_rank(self): + topo = KVTPTopology(source_tp_size=1, target_tp_size=1, local_rank=0) + pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) + assert len(pairs) == 1 + key, rank = pairs[0] + assert key == "omni_stage0_to_stage1_kv_cache_req-1" + assert rank is None + + def test_homogeneous_tp2_rank0(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=0) + pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) + assert len(pairs) == 1 + key, rank = pairs[0] + assert key == "req-1_stage0_0_0_0" + assert rank == 0 + + def test_homogeneous_tp2_rank1(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=1) + pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) + assert len(pairs) == 1 + key, rank = pairs[0] + assert key == "req-1_stage0_0_1_1" + assert rank == 1 + + def test_heterogeneous_tp4_to_tp2_rank0_gets_two_keys(self): + """Receiver rank 0 with source_tp=4, target_tp=2 should get 2 keys.""" + topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=0) + pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) + assert len(pairs) == 2 + + keys = [k for k, _ in pairs] + ranks = [r for _, r in pairs] + assert keys == ["req-1_stage0_0_0_0", "req-1_stage0_0_1_0"] + assert ranks == [0, 1] + + def test_heterogeneous_tp4_to_tp2_rank1_gets_two_keys(self): + topo = 
KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=1) + pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) + assert len(pairs) == 2 + + ranks = [r for _, r in pairs] + assert ranks == [2, 3] + + def test_heterogeneous_tp2_to_tp4_rank2_gets_one_key(self): + """Receiver rank 2 with source_tp=2, target_tp=4 should get 1 key from sender rank 1.""" + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=2) + pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) + assert len(pairs) == 1 + key, rank = pairs[0] + assert rank == 1 + assert key == "req-1_stage0_0_1_2" + + +# ── _build_rank_aware_send_keys ────────────────────────────────────── + + +class TestBuildRankAwareSendKeys: + def test_tp1_returns_legacy_key(self): + topo = KVTPTopology(source_tp_size=1, target_tp_size=1, local_rank=0) + keys = build_rank_aware_send_keys("req-1", "stage0", "stage1", topo) + assert keys == ["omni_stage0_to_stage1_kv_cache_req-1"] + + def test_homogeneous_tp2_rank0(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=0) + keys = build_rank_aware_send_keys("req-1", "stage0", "stage1", topo) + assert keys == ["req-1_stage0_0_0_0"] + + def test_sender_lt_receiver_tp2_to_tp4_rank0_sends_two_keys(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=0) + keys = build_rank_aware_send_keys("req-1", "stage0", "stage1", topo) + assert len(keys) == 2 + assert keys == ["req-1_stage0_0_0_0", "req-1_stage0_0_0_1"] + + +# ── update_sender_info stores base host/port ───────────────────────── + + +class TestUpdateSenderInfoBase: + def test_stores_base_host_and_port(self): + mgr = _make_manager(from_tp=2, to_tp=2, local_rank=0) + mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) + + assert mgr._sender_base_host == "10.0.0.1" + assert mgr._sender_base_zmq_port == 50151 + + def test_rank1_adjusts_default_port_but_preserves_base(self): + mgr = _make_manager(from_tp=2, to_tp=2, local_rank=1) + mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) + + assert mgr._sender_base_host == "10.0.0.1" + assert mgr._sender_base_zmq_port == 50151 + expected_adjusted = 50151 + 1 * KV_RANK_PORT_STRIDE + assert mgr.config.connector_config["sender_zmq_port"] == expected_adjusted + + def test_nested_sender_info_resolves_correctly(self): + """Nested sender_info keyed by integer stage id should resolve + using recv_stages (engine_input_source → recv_from).""" + config = OmniKVCacheConfig( + connector_config={"type": "mock"}, + stage_id=2, + engine_input_source=[1], + need_recv_cache=True, + from_tp=2, + to_tp=2, + ) + with ( + patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_local_tp_rank", return_value=0), + patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_tp_world_size", return_value=2), + ): + mgr = OmniKVTransferManager(config) + mgr.update_sender_info( + { + 0: {"host": "10.0.0.1", "zmq_port": 50151}, + 1: {"host": "10.0.0.2", "zmq_port": 50152}, + } + ) + assert mgr._sender_base_host == "10.0.0.2" + assert mgr._sender_base_zmq_port == 50152 + + +# ── receive path constructs per-rank metadata ──────────────────────── + + +class TestReceiveConstructsMetadata: + """Verify that receive_kv_cache_for_request passes metadata with + correct (host, port) to connector.get() for heterogeneous TP.""" + + def test_tp1_no_metadata_passed(self): + """TP=1: connector.get() should be called WITHOUT metadata.""" + mgr = _make_manager(from_tp=1, to_tp=1, local_rank=0, recv_timeout=0.05) + 
mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) + + calls = [] + + class _Connector: + def get(self, from_stage, to_stage, get_key, metadata=None): + calls.append({"key": get_key, "metadata": metadata}) + return None + + mgr._connector = _Connector() + mgr.receive_kv_cache_for_request("req-1") + + assert len(calls) > 0 + assert calls[0]["metadata"] is None + + def test_homogeneous_tp2_rank0_passes_metadata(self): + """TP=2 rank 0: metadata should point to sender rank 0's port.""" + mgr = _make_manager(from_tp=2, to_tp=2, local_rank=0, recv_timeout=0.05) + mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) + + calls = [] + + class _Connector: + def get(self, from_stage, to_stage, get_key, metadata=None): + calls.append({"key": get_key, "metadata": metadata}) + return None + + mgr._connector = _Connector() + mgr.receive_kv_cache_for_request("req-1") + + assert len(calls) > 0 + meta = calls[0]["metadata"] + assert meta is not None + assert meta["source_host"] == "10.0.0.1" + assert meta["source_port"] == 50151 + 0 * KV_RANK_PORT_STRIDE + + def test_homogeneous_tp2_rank1_passes_metadata_with_offset(self): + mgr = _make_manager(from_tp=2, to_tp=2, local_rank=1, recv_timeout=0.05) + mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) + + calls = [] + + class _Connector: + def get(self, from_stage, to_stage, get_key, metadata=None): + calls.append({"key": get_key, "metadata": metadata}) + return None + + mgr._connector = _Connector() + mgr.receive_kv_cache_for_request("req-1") + + meta = calls[0]["metadata"] + assert meta["source_port"] == 50151 + 1 * KV_RANK_PORT_STRIDE + + def test_heterogeneous_tp4_to_tp2_rank0_multiple_metadata(self): + """Receiver rank 0 with source_tp=4, target_tp=2 should call get() with + two different metadata entries for sender ranks 0 and 1.""" + mgr = _make_manager(from_tp=4, to_tp=2, local_rank=0, recv_timeout=0.05) + mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) + + calls = [] + + class _Connector: + def get(self, from_stage, to_stage, get_key, metadata=None): + calls.append({"key": get_key, "metadata": metadata}) + return None + + mgr._connector = _Connector() + mgr.receive_kv_cache_for_request("req-1") + + seen_ports = set() + for c in calls: + if c["metadata"]: + seen_ports.add(c["metadata"]["source_port"]) + expected_ports = { + 50151 + 0 * KV_RANK_PORT_STRIDE, + 50151 + 1 * KV_RANK_PORT_STRIDE, + } + assert expected_ports.issubset(seen_ports) + + +# ── Mooncake connector _query_metadata_at ──────────────────────────── + + +class TestMooncakeQueryMetadataAt: + """Test the connector's _query_metadata_at method and partial-metadata + path in get() without requiring real RDMA/Mooncake.""" + + def test_query_metadata_at_returns_full_metadata(self): + """Mock the ZMQ interaction to verify _query_metadata_at returns + complete metadata including data_size.""" + + try: + from vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector import ( + MooncakeTransferEngineConnector, + QueryResponse, + ) + except ImportError: + pytest.skip("Mooncake not available") + + import msgspec + + connector = MagicMock(spec=MooncakeTransferEngineConnector) + connector._get_req_socket = MagicMock() + + mock_socket = MagicMock() + resp = QueryResponse(request_id="test_key@s0_s1", data_size=4096, is_fast_path=True) + mock_socket.recv.return_value = msgspec.msgpack.encode(resp) + connector._get_req_socket.return_value = mock_socket + + result = MooncakeTransferEngineConnector._query_metadata_at( + 
connector, + "test_key@s0_s1", + "10.0.0.1", + 50151, + ) + + assert result is not None + assert result["source_host"] == "10.0.0.1" + assert result["source_port"] == 50151 + assert result["data_size"] == 4096 + assert result["is_fast_path"] is True + + def test_query_metadata_at_returns_none_on_not_found(self): + try: + from vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector import ( + INFO_NOT_FOUND, + MooncakeTransferEngineConnector, + ) + except ImportError: + pytest.skip("Mooncake not available") + + connector = MagicMock(spec=MooncakeTransferEngineConnector) + mock_socket = MagicMock() + mock_socket.recv.return_value = INFO_NOT_FOUND + connector._get_req_socket.return_value = mock_socket + + result = MooncakeTransferEngineConnector._query_metadata_at( + connector, + "test_key@s0_s1", + "10.0.0.1", + 50151, + ) + assert result is None + + +# ── Merge / slice hooks ────────────────────────────────────────────── + + +class TestMergeSliceHooks: + def test_single_shard_passes_through(self): + payload = {"layer_blocks": {"key_cache": [1]}} + assert merge_received_rank_shards([payload]) == payload + + def test_default_merger_concats_head_dim(self): + p0 = _make_payload([0.0]) + p1 = _make_payload([1.0]) + result = merge_received_rank_shards([p0, p1]) + key_cache = result["layer_blocks"]["key_cache"][0] + value_cache = result["layer_blocks"]["value_cache"][0] + assert key_cache.shape == (2, 2, 1) + assert value_cache.shape == (2, 2, 1) + assert torch.equal(key_cache[:, :, 0], torch.tensor([[0.0, 1.0], [0.0, 1.0]])) + assert torch.equal(value_cache[:, :, 0], torch.tensor([[100.0, 101.0], [100.0, 101.0]])) + + def test_custom_merger_hook_called(self): + merged = {"merged": True} + assert merge_received_rank_shards([{}, {}], merger=lambda payloads: merged) == merged + + def test_slicer_hook_called(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=0) + sliced = {"sliced": True} + assert slice_received_rank_shard({"full": True}, topo, slicer=lambda payload: sliced) == sliced + + def test_default_slicer_extracts_rank_local_heads(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=1) + payload = _make_payload([0.0, 1.0]) + result = slice_received_rank_shard(payload, topo) + key_cache = result["layer_blocks"]["key_cache"][0] + value_cache = result["layer_blocks"]["value_cache"][0] + assert key_cache.shape == (2, 1, 1) + assert value_cache.shape == (2, 1, 1) + assert torch.equal(key_cache[:, :, 0], torch.tensor([[1.0], [1.0]])) + assert torch.equal(value_cache[:, :, 0], torch.tensor([[101.0], [101.0]])) + + def test_presliced_payload_is_not_sliced_twice(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=1) + payload = _make_payload([1.0]) + payload["metadata"]["tp_head_slice"] = {"applied": True, "target_rank": 1} + result = slice_received_rank_shard(payload, topo) + assert result is payload + + def test_round_trip_merge_from_tp4_to_tp2(self): + topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=1) + source_ranks = get_kv_source_ranks(topo) + payloads = [_make_payload([float(rank)]) for rank in source_ranks] + result = merge_received_rank_shards(payloads) + key_cache = result["layer_blocks"]["key_cache"][0] + assert torch.equal(key_cache[:, :, 0], torch.tensor([[2.0, 3.0], [2.0, 3.0]])) + + def test_round_trip_slice_from_tp2_to_tp4(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=3) + payload = _make_payload([2.0, 3.0]) + result = 
slice_received_rank_shard(payload, topo) + key_cache = result["layer_blocks"]["key_cache"][0] + assert torch.equal(key_cache[:, :, 0], torch.tensor([[3.0], [3.0]])) + + +class TestSenderSideSlicing: + def test_transfer_slices_before_sending_to_multiple_targets(self): + mgr = _make_manager( + from_tp=2, + to_tp=4, + local_rank=0, + need_send=True, + need_recv=False, + ) + sent_payloads = [] + + class _Connector: + supports_raw_data = False + + def put(self, from_stage, to_stage, put_key, data): + sent_payloads.append((put_key, KVCacheTransferData.from_bytes(data))) + return True, len(data), {} + + mgr._connector = _Connector() + mgr._transfer_kv_cache(_make_transfer_data([0.0, 1.0]), "req-1") + + assert [key for key, _ in sent_payloads] == ["req-1_stage0_0_0_0", "req-1_stage0_0_0_1"] + assert sent_payloads[0][1]["layer_blocks"]["key_cache"][0].shape == (2, 1, 1) + assert sent_payloads[1][1]["layer_blocks"]["key_cache"][0].shape == (2, 1, 1) + assert torch.equal( + sent_payloads[0][1]["layer_blocks"]["key_cache"][0][:, :, 0], + torch.tensor([[0.0], [0.0]]), + ) + assert torch.equal( + sent_payloads[1][1]["layer_blocks"]["key_cache"][0][:, :, 0], + torch.tensor([[1.0], [1.0]]), + ) + assert sent_payloads[0][1]["metadata"]["tp_head_slice"]["target_rank"] == 0 + assert sent_payloads[1][1]["metadata"]["tp_head_slice"]["target_rank"] == 1 + + +class _MockBroadcastGroup: + def __init__(self, world_size: int, rank_in_group: int, broadcast_value=None, recv_value=None): + self.world_size = world_size + self.rank_in_group = rank_in_group + self.broadcast_value = broadcast_value + self.recv_value = recv_value + self.broadcast_calls = [] + self.send_calls = [] + self.recv_calls = [] + self.shm_broadcaster = None + + def broadcast_object(self, obj=None, src: int = 0): + self.broadcast_calls.append((obj, src)) + return self.broadcast_value if self.broadcast_value is not None else obj + + def send_object(self, obj, dst: int): + self.send_calls.append((dst, obj)) + + def recv_object(self, src: int): + self.recv_calls.append(src) + return self.recv_value + + +class TestDistributedReceive: + def test_tp_cfg_leader_receives_then_sends_branch_local_payloads(self): + mgr = _make_manager(from_tp=2, to_tp=4, local_rank=0) + req = SimpleNamespace(request_id="req-1", sampling_params=SimpleNamespace()) + world_group = _MockBroadcastGroup(world_size=4, rank_in_group=2) + cfg_group = _MockBroadcastGroup(world_size=3, rank_in_group=0) + + def _receive(req_obj, cfg_func, target_device): + req_obj.past_key_values = SimpleNamespace(key_cache=[torch.tensor([1.0])]) + req_obj.kv_metadata = {"source": "leader"} + req_obj.sampling_params.past_key_values = req_obj.past_key_values + req_obj.sampling_params.kv_metadata = req_obj.kv_metadata + req_obj.sampling_params.cfg_text_past_key_values = SimpleNamespace(key_cache=[torch.tensor([2.0])]) + req_obj.sampling_params.cfg_text_kv_metadata = {"source": "cfg_text"} + req_obj.sampling_params.cfg_img_past_key_values = SimpleNamespace(key_cache=[torch.tensor([3.0])]) + req_obj.sampling_params.cfg_img_kv_metadata = {"source": "cfg_img"} + return True + + mgr.receive_multi_kv_cache = MagicMock(side_effect=_receive) + with ( + patch("vllm_omni.diffusion.distributed.parallel_state.get_world_group", return_value=world_group), + patch( + "vllm_omni.diffusion.distributed.parallel_state.get_classifier_free_guidance_world_size", + return_value=3, + ), + patch( + "vllm_omni.diffusion.distributed.parallel_state.get_classifier_free_guidance_rank", + return_value=0, + ), + 
patch("vllm_omni.diffusion.distributed.parallel_state.get_cfg_group", return_value=cfg_group), + ): + assert mgr.receive_multi_kv_cache_distributed(req) is True + + mgr.receive_multi_kv_cache.assert_called_once() + assert mgr.receive_multi_kv_cache.call_args.args[2] == torch.device("cpu") + assert req.kv_metadata == {"source": "leader"} + assert cfg_group.broadcast_calls == [] + assert [dst for dst, _ in cfg_group.send_calls] == [1, 2] + rank1_payload = cfg_group.send_calls[0][1] + rank2_payload = cfg_group.send_calls[1][1] + assert torch.equal(rank1_payload["past_key_values"].key_cache[0], torch.tensor([1.0])) + assert torch.equal(rank2_payload["past_key_values"].key_cache[0], torch.tensor([1.0])) + assert rank1_payload["sp.cfg_active_branch"] == "cfg_text" + assert rank2_payload["sp.cfg_active_branch"] == "cfg_img" + assert rank1_payload["sp.cfg_branch_roles"] == ["cfg_text", "cfg_img"] + assert rank2_payload["sp.cfg_branch_roles"] == ["cfg_text", "cfg_img"] + assert "sp.cfg_branch_past_key_values" in rank1_payload + assert "sp.cfg_branch_past_key_values" in rank2_payload + assert list(rank1_payload["sp.cfg_branch_past_key_values"].keys()) == ["cfg_text"] + assert list(rank2_payload["sp.cfg_branch_past_key_values"].keys()) == ["cfg_img"] + assert "sp.cfg_text_past_key_values" in rank1_payload + assert "sp.cfg_img_past_key_values" not in rank1_payload + assert "sp.cfg_img_past_key_values" in rank2_payload + assert "sp.cfg_text_past_key_values" not in rank2_payload + + def test_tp_cfg_follower_receives_local_payload_without_receiving(self): + mgr = _make_manager(from_tp=2, to_tp=4, local_rank=1) + req = SimpleNamespace(request_id="req-1", sampling_params=SimpleNamespace()) + world_group = _MockBroadcastGroup(world_size=4, rank_in_group=3) + cfg_payload = { + "past_key_values": SimpleNamespace(key_cache=[torch.tensor([1.0])]), + "kv_metadata": {"source": "main"}, + "sp.past_key_values": SimpleNamespace(key_cache=[torch.tensor([1.0])]), + "sp.kv_metadata": {"source": "main"}, + "sp.cfg_active_branch": "cfg_text", + "sp.cfg_branch_roles": ["cfg_text", "cfg_img"], + "sp.cfg_branch_past_key_values": { + "cfg_text": SimpleNamespace(key_cache=[torch.tensor([2.0])]), + }, + "sp.cfg_branch_kv_metadata": {"cfg_text": {"source": "cfg-text"}}, + "sp.cfg_text_past_key_values": SimpleNamespace(key_cache=[torch.tensor([2.0])]), + } + cfg_group = _MockBroadcastGroup(world_size=2, rank_in_group=1, recv_value=cfg_payload) + + mgr.receive_multi_kv_cache = MagicMock(return_value=True) + with ( + patch("vllm_omni.diffusion.distributed.parallel_state.get_world_group", return_value=world_group), + patch( + "vllm_omni.diffusion.distributed.parallel_state.get_classifier_free_guidance_world_size", + return_value=2, + ), + patch( + "vllm_omni.diffusion.distributed.parallel_state.get_classifier_free_guidance_rank", + return_value=1, + ), + patch("vllm_omni.diffusion.distributed.parallel_state.get_cfg_group", return_value=cfg_group), + ): + assert mgr.receive_multi_kv_cache_distributed(req) is True + + mgr.receive_multi_kv_cache.assert_not_called() + assert req.kv_metadata == {"source": "main"} + assert torch.equal(req.past_key_values.key_cache[0], torch.tensor([1.0])) + assert torch.equal(req.sampling_params.past_key_values.key_cache[0], torch.tensor([1.0])) + assert req.sampling_params.cfg_active_branch == "cfg_text" + assert req.sampling_params.cfg_branch_roles == ["cfg_text", "cfg_img"] + assert torch.equal( + req.sampling_params.cfg_branch_past_key_values["cfg_text"].key_cache[0], + torch.tensor([2.0]), + ) + 
assert req.sampling_params.cfg_branch_kv_metadata == {"cfg_text": {"source": "cfg-text"}} + assert torch.equal(req.sampling_params.cfg_text_past_key_values.key_cache[0], torch.tensor([2.0])) + assert cfg_group.broadcast_calls == [] + assert cfg_group.recv_calls == [0] + + def test_tp_without_cfg_keeps_independent_receive_path(self): + mgr = _make_manager(from_tp=2, to_tp=2, local_rank=1) + req = SimpleNamespace(request_id="req-1", sampling_params=SimpleNamespace()) + world_group = _MockBroadcastGroup(world_size=2, rank_in_group=1) + mgr.receive_multi_kv_cache = MagicMock(return_value=True) + + with patch("vllm_omni.diffusion.distributed.parallel_state.get_world_group", return_value=world_group): + assert mgr.receive_multi_kv_cache_distributed(req, target_device=torch.device("cpu")) is True + + mgr.receive_multi_kv_cache.assert_called_once_with(req, None, torch.device("cpu")) + + +# ── TP auto-detect ─────────────────────────────────────────────────── + + +class TestAutoDetectTP: + def test_auto_detect_when_config_defaults(self): + """When config from_tp/to_tp == 1 (default), manager should auto-detect.""" + config = OmniKVCacheConfig( + connector_config={"type": "mock"}, + from_stage="s0", + stage_id="s1", + need_recv_cache=True, + ) + with ( + patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_local_tp_rank", return_value=0), + patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_tp_world_size", return_value=4), + ): + mgr = OmniKVTransferManager(config) + assert mgr._tp_topo.source_tp_size == 4 + assert mgr._tp_topo.target_tp_size == 4 + + def test_explicit_tp_overrides_auto_detect(self): + config = OmniKVCacheConfig( + connector_config={"type": "mock"}, + from_stage="s0", + stage_id="s1", + need_recv_cache=True, + from_tp=2, + to_tp=4, + ) + with ( + patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_local_tp_rank", return_value=0), + patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_tp_world_size", return_value=8), + ): + mgr = OmniKVTransferManager(config) + assert mgr._tp_topo.source_tp_size == 2 + assert mgr._tp_topo.target_tp_size == 4 diff --git a/tests/engine/test_async_omni_engine_stage_init.py b/tests/engine/test_async_omni_engine_stage_init.py index 84b0cb0bed..5c2a9edb77 100644 --- a/tests/engine/test_async_omni_engine_stage_init.py +++ b/tests/engine/test_async_omni_engine_stage_init.py @@ -183,6 +183,7 @@ def test_launch_llm_stage_passes_stage_init_timeout_to_complete_stage_handshake( engine.model = "dummy-model" engine.single_stage_mode = False engine._omni_master_server = None + engine.stage_configs = [] metadata = types.SimpleNamespace(stage_id=0, runtime_cfg={"devices": "0"}) fake_vllm_config = types.SimpleNamespace() @@ -238,6 +239,7 @@ def test_launch_llm_stage_releases_launch_lock_before_complete_stage_handshake(m engine.model = "dummy-model" engine.single_stage_mode = False engine._omni_master_server = None + engine.stage_configs = [] fake_vllm_config = types.SimpleNamespace() fake_addresses = types.SimpleNamespace() @@ -378,3 +380,70 @@ def __init__(self, vllm_config, renderer=None): assert input_processor is not None assert isinstance(input_processor.input_preprocessor, DummyOmniInputPreprocessor) assert input_processor.input_preprocessor.renderer is input_processor.renderer + + +def test_inject_kv_stage_info_infers_sender_tp_topology(): + from vllm_omni.engine.stage_init_utils import inject_kv_stage_info + + stage0 = types.SimpleNamespace( + stage_id=0, + engine_args={ + 
"tensor_parallel_size": 4, + "omni_kv_config": { + "need_send_cache": True, + "omni_from_stage": "0", + "omni_to_stage": "1", + }, + }, + engine_input_source=[], + ) + stage1 = types.SimpleNamespace( + stage_id=1, + engine_args={ + "parallel_config": { + "tensor_parallel_size": 2, + "cfg_parallel_size": 1, + }, + "omni_kv_config": {"need_recv_cache": True}, + }, + engine_input_source=[0], + ) + + inject_kv_stage_info(stage0, 0, [stage0, stage1]) + + assert stage0.engine_args["omni_kv_config"]["stage_id"] == 0 + assert stage0.engine_args["omni_kv_config"]["rank_mapping"] == {"from_tp": 4, "to_tp": 2} + + +def test_inject_kv_stage_info_infers_receiver_tp_topology(): + from vllm_omni.engine.stage_init_utils import inject_kv_stage_info + + stage0 = types.SimpleNamespace( + stage_id=0, + engine_args={ + "tensor_parallel_size": 4, + "omni_kv_config": {"need_send_cache": True}, + }, + engine_input_source=[], + ) + stage1 = types.SimpleNamespace( + stage_id=1, + engine_args={ + "parallel_config": { + "tensor_parallel_size": 2, + "cfg_parallel_size": 1, + }, + "omni_kv_config": { + "need_recv_cache": True, + "omni_from_stage": "0", + "omni_to_stage": "1", + }, + }, + engine_input_source=[0], + ) + + inject_kv_stage_info(stage1, 1, [stage0, stage1]) + + assert stage1.engine_args["omni_kv_config"]["stage_id"] == 1 + assert stage1.engine_args["omni_kv_config"]["engine_input_source"] == [0] + assert stage1.engine_args["omni_kv_config"]["rank_mapping"] == {"from_tp": 4, "to_tp": 2} diff --git a/tests/engine/test_single_stage_mode.py b/tests/engine/test_single_stage_mode.py index 608e92ac49..28ccccaa2b 100644 --- a/tests/engine/test_single_stage_mode.py +++ b/tests/engine/test_single_stage_mode.py @@ -1555,6 +1555,7 @@ def _build_engine_with_oms(self, mocker: MockerFixture) -> AsyncOmniEngine: engine.single_stage_mode = True engine._single_stage_id_filter = 0 engine._llm_stage_launch_lock = threading.Lock() + engine.stage_configs = [] mock_oms = mocker.Mock(spec=OmniMasterServer) mock_oms.address = "127.0.0.1" mock_oms.port = 25000 @@ -1629,6 +1630,7 @@ def test_spawn_stage_core_used_in_normal_mode(self, mocker: MockerFixture): engine.single_stage_mode = False engine._omni_master_server = None engine._llm_stage_launch_lock = threading.Lock() + engine.stage_configs = [] fake_vllm_config = mocker.Mock() fake_executor_cls = mocker.Mock() diff --git a/vllm_omni/diffusion/distributed/group_coordinator.py b/vllm_omni/diffusion/distributed/group_coordinator.py index 8ab38f2a65..5294e6c9ed 100644 --- a/vllm_omni/diffusion/distributed/group_coordinator.py +++ b/vllm_omni/diffusion/distributed/group_coordinator.py @@ -104,6 +104,7 @@ def __init__( self.local_rank = local_rank self.device_group = None self.cpu_group = None + self.shm_broadcaster = None for ranks in group_ranks: device_group = torch.distributed.new_group(ranks, backend=torch_distributed_backend) @@ -316,7 +317,7 @@ def send_object(self, obj: Any, dst: int) -> None: assert dst < self.world_size, f"Invalid dst rank ({dst})" - assert dst != self.rank, "Invalid destination rank. Destination rank is the same as the current rank." + assert dst != self.rank_in_group, "Invalid destination rank. Destination rank is the same as the current rank." # Serialize object to tensor and get the size as well object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8) @@ -338,7 +339,7 @@ def recv_object(self, src: int) -> Any: assert src < self.world_size, f"Invalid src rank ({src})" - assert src != self.rank, "Invalid source rank. 
Source rank is the same as the current rank." + assert src != self.rank_in_group, "Invalid source rank. Source rank is the same as the current rank." size_tensor = torch.empty(1, dtype=torch.long, device="cpu") diff --git a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py index 72e53e7f48..a3d2259e64 100644 --- a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py +++ b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py @@ -365,28 +365,52 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: if req.sampling_params.kv_metadata and "image_shape" in req.sampling_params.kv_metadata: image_shape = tuple(req.sampling_params.kv_metadata["image_shape"]) - cfg_text_kv = getattr(req.sampling_params, "cfg_text_past_key_values", None) + branch_kvs = getattr(req.sampling_params, "cfg_branch_past_key_values", None) or {} + branch_metadata = getattr(req.sampling_params, "cfg_branch_kv_metadata", None) or {} + active_branch = getattr(req.sampling_params, "cfg_active_branch", None) + branch_roles = getattr(req.sampling_params, "cfg_branch_roles", None) or list(branch_kvs.keys()) + + cfg_text_kv = getattr(req.sampling_params, "cfg_text_past_key_values", None) or branch_kvs.get("cfg_text") + cfg_text_metadata = getattr(req.sampling_params, "cfg_text_kv_metadata", None) or branch_metadata.get( + "cfg_text" + ) + cfg_img_kv = getattr(req.sampling_params, "cfg_img_past_key_values", None) or branch_kvs.get("cfg_img") + cfg_img_metadata = getattr(req.sampling_params, "cfg_img_kv_metadata", None) or branch_metadata.get( + "cfg_img" + ) + + cfg_parallel_contract = ( + active_branch is not None or bool(branch_roles) or cfg_text_kv is not None or cfg_img_kv is not None + ) + if cfg_parallel_contract: + logger.info( + "CFG enabled with injected branch KV context roles=%s active=%s", + branch_roles, + active_branch, + ) + if cfg_text_kv is not None: - logger.info("CFG enabled with multi-KV: using injected cfg_text KV Cache") cfg_text_seq_len = cfg_text_kv.key_cache[0].shape[0] cfg_text_context["past_key_values"] = cfg_text_kv cfg_text_context["kv_lens"] = [cfg_text_seq_len] - cfg_text_metadata = getattr(req.sampling_params, "cfg_text_kv_metadata", None) if cfg_text_metadata and "ropes" in cfg_text_metadata: cfg_text_context["ropes"] = cfg_text_metadata["ropes"] else: cfg_text_context["ropes"] = [cfg_text_seq_len] - cfg_img_kv = getattr(req.sampling_params, "cfg_img_past_key_values", None) or injected_kv + if cfg_img_kv is None and cfg_text_kv is not None: + cfg_img_kv = injected_kv + + if cfg_img_kv is not None: cfg_img_seq_len = cfg_img_kv.key_cache[0].shape[0] cfg_img_context["past_key_values"] = cfg_img_kv cfg_img_context["kv_lens"] = [cfg_img_seq_len] - cfg_img_metadata = getattr(req.sampling_params, "cfg_img_kv_metadata", None) if cfg_img_metadata and "ropes" in cfg_img_metadata: cfg_img_context["ropes"] = cfg_img_metadata["ropes"] else: cfg_img_context["ropes"] = [cfg_img_seq_len] - else: + + if not cfg_parallel_contract: logger.warning("CFG is disabled: only single KV cache available") gen_params = BagelGenParams( num_timesteps=gen_params.num_timesteps, diff --git a/vllm_omni/distributed/omni_connectors/kv_transfer_manager.py b/vllm_omni/distributed/omni_connectors/kv_transfer_manager.py index 1958c9d40a..ad008c3971 100644 --- a/vllm_omni/distributed/omni_connectors/kv_transfer_manager.py +++ b/vllm_omni/distributed/omni_connectors/kv_transfer_manager.py @@ -14,8 +14,20 @@ from .factory import OmniConnectorFactory from .utils.config import 
ConnectorSpec -from .utils.initialization import KV_TRANSFER_PORT_OFFSET -from .utils.kv_utils import normalize_layer_kv +from .utils.initialization import KV_RANK_PORT_STRIDE +from .utils.kv_utils import ( + KVTPTopology, + build_rank_aware_recv_keys, + build_rank_aware_send_keys, + get_kv_target_ranks, + get_local_tp_rank, + get_tp_world_size, + kv_zmq_port, + merge_received_rank_shards, + normalize_layer_kv, + slice_layer_blocks, + slice_received_rank_shard, +) logger = init_logger(__name__) @@ -57,6 +69,8 @@ class OmniKVCacheConfig: need_recv_cache: bool = False need_send_cache: bool = False recv_timeout: float = 30.0 + from_tp: int = 1 + to_tp: int = 1 @dataclass @@ -72,82 +86,44 @@ def to_dict(self) -> dict[str, Any]: """Convert to dictionary for serialization.""" return asdict(self) - def to_bytes(self) -> bytes: - """Convert to compact binary format for fast transfer.""" - tensors_desc: list[dict[str, Any]] = [] - tensor_bufs: list[bytes] = [] - data_offset = 0 - - for cache_name in ("key_cache", "value_cache"): - cache_list = self.layer_blocks.get(cache_name, []) - for layer_idx, tensor in enumerate(cache_list): - if tensor is None: - tensors_desc.append({"n": f"{cache_name}_{layer_idx}", "x": True}) - continue - - t = tensor.detach().cpu().contiguous() - dtype_str = str(t.dtype).removeprefix("torch.") - raw = t.view(torch.uint8).numpy().tobytes() - tensors_desc.append( - { - "n": f"{cache_name}_{layer_idx}", - "i": layer_idx, - "d": dtype_str, - "s": list(t.shape), - "o": data_offset, - "b": len(raw), - } - ) - tensor_bufs.append(raw) - data_offset += len(raw) - - header = json.dumps( - { - "rid": self.request_id, - "bids": self.block_ids, - "meta": self.metadata, - "td": tensors_desc, - "nl": len(self.layer_blocks.get("key_cache", [])), - }, - separators=(",", ":"), - ).encode("utf-8") - return b"".join([struct.pack(">I", len(header)), header] + tensor_bufs) + def _build_tensors_desc(self, *, cpu: bool) -> tuple[list[dict[str, Any]], list, int, torch.device | None]: + """Iterate layer blocks and build tensor descriptors + data chunks. - def to_gpu_tensor(self) -> torch.Tensor: - """Convert to a packed GPU tensor for raw-data connectors.""" + Returns ``(tensors_desc, chunks, total_bytes, device)``. + *chunks* contains ``bytes`` when *cpu* is True, flat uint8 GPU tensors otherwise. 
+ """ tensors_desc: list[dict[str, Any]] = [] - gpu_tensors: list[torch.Tensor] = [] + chunks: list = [] data_offset = 0 device = None for cache_name in ("key_cache", "value_cache"): - cache_list = self.layer_blocks.get(cache_name, []) - for layer_idx, tensor in enumerate(cache_list): + for layer_idx, tensor in enumerate(self.layer_blocks.get(cache_name, [])): if tensor is None: tensors_desc.append({"n": f"{cache_name}_{layer_idx}", "x": True}) continue - t = tensor.detach().contiguous() - if device is None and t.is_cuda: + if cpu: + t = t.cpu() + elif device is None and t.is_cuda: device = t.device - dtype_str = str(t.dtype).removeprefix("torch.") nbytes = t.numel() * t.element_size() tensors_desc.append( { "n": f"{cache_name}_{layer_idx}", "i": layer_idx, - "d": dtype_str, + "d": str(t.dtype).removeprefix("torch."), "s": list(t.shape), "o": data_offset, "b": nbytes, } ) - gpu_tensors.append(t.view(torch.uint8).flatten()) + chunks.append(t.view(torch.uint8).numpy().tobytes() if cpu else t.view(torch.uint8).flatten()) data_offset += nbytes - if device is None: - raise RuntimeError("No CUDA tensors found, use to_bytes() instead") + return tensors_desc, chunks, data_offset, device + def _build_header_bytes(self, tensors_desc: list[dict[str, Any]]) -> bytes: header = json.dumps( { "rid": self.request_id, @@ -158,19 +134,26 @@ def to_gpu_tensor(self) -> torch.Tensor: }, separators=(",", ":"), ).encode("utf-8") + return struct.pack(">I", len(header)) + header - header_prefix = struct.pack(">I", len(header)) + header - total_size = len(header_prefix) + data_offset - output = torch.empty(total_size, dtype=torch.uint8, device=device) - header_tensor = torch.frombuffer(bytearray(header_prefix), dtype=torch.uint8) - output[: len(header_prefix)].copy_(header_tensor) + def to_bytes(self) -> bytes: + """Convert to compact binary format for fast transfer.""" + tensors_desc, chunks, _, _ = self._build_tensors_desc(cpu=True) + return b"".join([self._build_header_bytes(tensors_desc)] + chunks) + def to_gpu_tensor(self) -> torch.Tensor: + """Convert to a packed GPU tensor for raw-data connectors.""" + tensors_desc, chunks, data_offset, device = self._build_tensors_desc(cpu=False) + if device is None: + raise RuntimeError("No CUDA tensors found, use to_bytes() instead") + header_prefix = self._build_header_bytes(tensors_desc) + output = torch.empty(len(header_prefix) + data_offset, dtype=torch.uint8, device=device) + output[: len(header_prefix)].copy_(torch.frombuffer(bytearray(header_prefix), dtype=torch.uint8)) pos = len(header_prefix) - for t_flat in gpu_tensors: + for t_flat in chunks: n = t_flat.numel() output[pos : pos + n].copy_(t_flat) pos += n - return output @staticmethod @@ -237,11 +220,8 @@ def _resolve_layer_idx(info: dict[str, Any], num_layers: int) -> int: return layer_idx @staticmethod - def from_bytes(raw: "bytes | bytearray | memoryview") -> dict[str, Any]: - """Reconstruct KV cache data from the packed bytes format.""" - raw_mv = memoryview(raw) if not isinstance(raw, memoryview) else raw - header, tensor_data_mv = KVCacheTransferData._load_header_from_memoryview(raw_mv) - + def _populate_caches(header: dict[str, Any], get_tensor: callable) -> dict[str, Any]: + """Shared deserialization loop for both CPU and GPU paths.""" num_layers = header["nl"] key_cache: list[torch.Tensor | None] = [None] * num_layers value_cache: list[torch.Tensor | None] = [None] * num_layers @@ -249,20 +229,9 @@ def from_bytes(raw: "bytes | bytearray | memoryview") -> dict[str, Any]: for info in header["td"]: if 
info.get("x"): continue - name: str = info["n"] torch_dtype = KVCacheTransferData._resolve_torch_dtype(info["d"]) - offset, nbytes = KVCacheTransferData._validate_tensor_span(name, info, len(tensor_data_mv)) - t = ( - torch.frombuffer( - tensor_data_mv, - dtype=torch.uint8, - offset=offset, - count=nbytes, - ) - .view(torch_dtype) - .reshape(info["s"]) - ) + t = get_tensor(info).view(torch_dtype).reshape(info["s"]) layer_idx = KVCacheTransferData._resolve_layer_idx(info, num_layers) if name.startswith("key_cache_"): key_cache[layer_idx] = t @@ -276,37 +245,30 @@ def from_bytes(raw: "bytes | bytearray | memoryview") -> dict[str, Any]: "metadata": header["meta"], } + @staticmethod + def from_bytes(raw: "bytes | bytearray | memoryview") -> dict[str, Any]: + """Reconstruct KV cache data from the packed bytes format.""" + raw_mv = memoryview(raw) if not isinstance(raw, memoryview) else raw + header, tensor_data_mv = KVCacheTransferData._load_header_from_memoryview(raw_mv) + data_len = len(tensor_data_mv) + + def _get(info: dict) -> torch.Tensor: + offset, nbytes = KVCacheTransferData._validate_tensor_span(info["n"], info, data_len) + return torch.frombuffer(tensor_data_mv, dtype=torch.uint8, offset=offset, count=nbytes) + + return KVCacheTransferData._populate_caches(header, _get) + @staticmethod def from_bytes_gpu(gpu_tensor: torch.Tensor) -> dict[str, Any]: """Reconstruct KV cache data from a packed GPU tensor.""" header, data_start = KVCacheTransferData._load_header_from_tensor(gpu_tensor) + data_len = int(gpu_tensor.numel()) - data_start - num_layers = header["nl"] - key_cache: list[torch.Tensor | None] = [None] * num_layers - value_cache: list[torch.Tensor | None] = [None] * num_layers - tensor_data_bytes = int(gpu_tensor.numel()) - data_start + def _get(info: dict) -> torch.Tensor: + offset, nbytes = KVCacheTransferData._validate_tensor_span(info["n"], info, data_len) + return gpu_tensor[data_start + offset : data_start + offset + nbytes].clone() - for info in header["td"]: - if info.get("x"): - continue - - name: str = info["n"] - torch_dtype = KVCacheTransferData._resolve_torch_dtype(info["d"]) - offset, nbytes = KVCacheTransferData._validate_tensor_span(name, info, tensor_data_bytes) - t = gpu_tensor[data_start + offset : data_start + offset + nbytes].clone() - t = t.view(torch_dtype).reshape(info["s"]) - layer_idx = KVCacheTransferData._resolve_layer_idx(info, num_layers) - if name.startswith("key_cache_"): - key_cache[layer_idx] = t - elif name.startswith("value_cache_"): - value_cache[layer_idx] = t - - return { - "request_id": header["rid"], - "layer_blocks": {"key_cache": key_cache, "value_cache": value_cache}, - "block_ids": header["bids"], - "metadata": header["meta"], - } + return KVCacheTransferData._populate_caches(header, _get) class OmniKVTransferManager: @@ -341,6 +303,30 @@ def __init__(self, config: OmniKVCacheConfig): else (None, None) ) + local_rank = get_local_tp_rank() + + if config.from_tp <= 1 and config.to_tp <= 1: + detected_tp = get_tp_world_size() + from_tp = detected_tp + to_tp = detected_tp + else: + from_tp = config.from_tp + to_tp = config.to_tp + + self._tp_topo = KVTPTopology(source_tp_size=from_tp, target_tp_size=to_tp, local_rank=local_rank) + + # Injectable hooks (compatible with PR #2677 OmniConnectorModelRunnerMixin). 
+ self.kv_send_key_builder: Callable | None = None + self.kv_recv_key_builder: Callable | None = None + self.kv_payload_merger: Callable | None = None + self.kv_payload_slicer: Callable | None = None + + # Base sender endpoint (rank-0 host/port) stored during + # update_sender_info(). Used by the receive path to construct + # per-rank metadata for heterogeneous TP without querying a registry. + self._sender_base_host: str | None = None + self._sender_base_zmq_port: int | None = None + if config.need_send_cache and config.connector_config: try: _ = self.connector @@ -348,11 +334,20 @@ def __init__(self, config: OmniKVCacheConfig): except Exception as e: logger.warning("Failed to eagerly initialize sender connector: %s", e) + # ------------------------------------------------------------------ # + # Factory helpers + # ------------------------------------------------------------------ # + @classmethod def _create(cls, cfg: dict | None) -> "OmniKVTransferManager": """Create manager from raw config dict.""" if not cfg or not isinstance(cfg, dict): return cls(OmniKVCacheConfig()) + + rank_mapping = cfg.get("rank_mapping", {}) + if not isinstance(rank_mapping, dict): + rank_mapping = {} + return cls( OmniKVCacheConfig( connector_config=cfg.get("connector_config"), @@ -363,19 +358,18 @@ def _create(cls, cfg: dict | None) -> "OmniKVTransferManager": need_recv_cache=cfg.get("need_recv_cache", False), need_send_cache=cfg.get("need_send_cache", False), recv_timeout=cfg.get("recv_timeout", 30.0), + from_tp=int(rank_mapping.get("from_tp", 1)), + to_tp=int(rank_mapping.get("to_tp", 1)), ) ) - @classmethod - def from_model_config(cls, config: Any) -> "OmniKVTransferManager": - """Create from model config (for AR model runner).""" - return cls._create(getattr(config, "omni_kv_config", None)) - @classmethod def from_od_config(cls, config: Any) -> "OmniKVTransferManager": - """Create from OmniDiffusion config (for diffusion runner).""" + """Create from model or OmniDiffusion config.""" return cls._create(getattr(config, "omni_kv_config", None)) + from_model_config = from_od_config + @classmethod def from_vllm_config(cls, vllm_config: Any, model_config: Any) -> "OmniKVTransferManager": """Create from vllm config with fallback to kv_transfer_config.""" @@ -417,45 +411,33 @@ def connector(self): ) c_extra["to_stage"] = str(self.config.to_stage) if self.config.to_stage is not None else "1" + try: + stage_int = int(self.config.from_stage) if self.config.from_stage is not None else 0 + except (TypeError, ValueError): + stage_int = 0 + zmq_port = kv_zmq_port(base_port, stage_int, self._tp_topo.local_rank) + if self.config.need_send_cache: c_extra["role"] = "sender" - from_stage = self.config.from_stage - if from_stage is not None: - try: - c_extra["zmq_port"] = base_port + KV_TRANSFER_PORT_OFFSET + int(from_stage) - except (TypeError, ValueError): - c_extra["zmq_port"] = base_port + KV_TRANSFER_PORT_OFFSET + c_extra["zmq_port"] = zmq_port elif self.config.need_recv_cache: c_extra["role"] = "receiver" - from_stage = self.config.from_stage - sender_port = base_port + KV_TRANSFER_PORT_OFFSET - if from_stage is not None: - try: - sender_port = base_port + KV_TRANSFER_PORT_OFFSET + int(from_stage) - except (TypeError, ValueError): - pass c_extra.setdefault("sender_host", c_extra.get("host", "127.0.0.1")) - c_extra.setdefault("sender_zmq_port", sender_port) + c_extra.setdefault("sender_zmq_port", zmq_port) logger.info( - "Initializing OmniConnector (purpose=kv_transfer) with config: %s, role: %s", - cfg, + "Initializing 
OmniConnector type=%s role=%s", + c_type, c_extra.get("role", "N/A"), ) self._connector = OmniConnectorFactory.create_connector(ConnectorSpec(name=c_type, extra=c_extra)) - except Exception as e: - logger.error(f"Failed to initialize OmniConnector: {e}") - import traceback - - traceback.print_exc() - # Cache failure sentinel to avoid repeated initialization attempts in hot paths. + except Exception: + logger.exception("Failed to initialize OmniConnector") self._connector = False return self._connector if self._connector else None - def get_connector(self): - """Get connector (compatibility wrapper for existing code).""" - return self.connector + get_connector = property(lambda self: self.connector) def _resolve_sender_info( self, sender_info: dict[str, Any], sender_stage_id: str | int | None = None @@ -513,8 +495,187 @@ def _clone_received_payload_tensors(data: dict[str, Any]) -> dict[str, Any]: cache_list[idx] = tensor.clone() return data + def _slice_transfer_data_for_target(self, kv_data: KVCacheTransferData, target_rank: int) -> KVCacheTransferData: + """Pre-slice sender payload for one target rank when sender TP < receiver TP.""" + topo = self._tp_topo + ratio = topo.target_tp_size // topo.source_tp_size + offset_in_sender = target_rank % ratio + metadata = dict(kv_data.metadata) if isinstance(kv_data.metadata, dict) else {} + metadata["tp_head_slice"] = { + "applied": True, + "side": "sender", + "target_rank": target_rank, + "source_rank": topo.local_rank, + "from_tp": topo.source_tp_size, + "to_tp": topo.target_tp_size, + "offset_in_shard": offset_in_sender, + "num_slices": ratio, + } + return KVCacheTransferData( + request_id=kv_data.request_id, + layer_blocks=slice_layer_blocks(kv_data.layer_blocks, offset_in_sender, ratio), + block_ids=list(kv_data.block_ids), + metadata=metadata, + ) + + def _serialize_transfer_payload(self, kv_data: KVCacheTransferData) -> torch.Tensor | bytes | dict[str, Any]: + """Serialize KV transfer data using the connector's fastest supported path.""" + if getattr(self.connector, "supports_raw_data", False): + try: + return kv_data.to_gpu_tensor() + except Exception: + pass + try: + return kv_data.to_bytes() + except Exception: + return kv_data.to_dict() + + @staticmethod + def _collect_request_kv_payload(req: Any) -> dict[str, object]: + """Collect request-side KV objects for object broadcast.""" + kv_payload: dict[str, object] = {} + for attr in ("past_key_values", "kv_metadata"): + val = getattr(req, attr, None) + if val is not None: + kv_payload[attr] = val + + if hasattr(req, "sampling_params") and req.sampling_params is not None: + for key in list(vars(req.sampling_params).keys()): + if key in ("past_key_values", "kv_metadata") or ( + key.startswith("cfg_") + and ( + key.endswith("_past_key_values") + or key.endswith("_kv_metadata") + or key + in ( + "cfg_kv_request_ids", + "cfg_active_branch", + "cfg_branch_roles", + "cfg_branch_past_key_values", + "cfg_branch_kv_metadata", + ) + ) + ): + val = getattr(req.sampling_params, key, None) + if val is not None: + kv_payload[f"sp.{key}"] = val + + return kv_payload + + @staticmethod + def _apply_request_kv_payload( + req: Any, + kv_payload: dict[str, object], + target_device: torch.device | None = None, + ) -> None: + """Apply a broadcast KV payload back onto a request object.""" + for attr in ("past_key_values", "kv_metadata"): + val = kv_payload.get(attr) + if val is not None: + if target_device is not None: + val = _move_to_device(val, target_device) + setattr(req, attr, val) + + if hasattr(req, 
"sampling_params") and req.sampling_params is not None: + for key, val in kv_payload.items(): + if key.startswith("sp."): + if target_device is not None: + val = _move_to_device(val, target_device) + setattr(req.sampling_params, key[3:], val) + + @staticmethod + def _discover_cfg_branch_roles(req: Any) -> list[str]: + """Discover CFG branch roles in a stable order.""" + sampling_params = getattr(req, "sampling_params", None) + if sampling_params is None: + return [] + + roles: list[str] = [] + branch_map = getattr(sampling_params, "cfg_branch_past_key_values", None) or {} + for preferred_role in ("cfg_text", "cfg_img"): + if ( + preferred_role in branch_map + or getattr(sampling_params, f"{preferred_role}_past_key_values", None) is not None + ): + roles.append(preferred_role) + + for role in branch_map.keys(): + if role not in roles and branch_map.get(role) is not None: + roles.append(role) + + for key in vars(sampling_params).keys(): + if not (key.startswith("cfg_") and key.endswith("_past_key_values")): + continue + role = key.removesuffix("_past_key_values") + if role in ("cfg_branch",) or role in roles: + continue + if getattr(sampling_params, key, None) is not None: + roles.append(role) + + return roles + + @classmethod + def _build_cfg_rank_local_payloads(cls, req: Any, cfg_size: int) -> list[dict[str, object] | None]: + """Build per-cfg-rank payloads so each rank receives only its branch KV.""" + full_payload = cls._collect_request_kv_payload(req) + payloads: list[dict[str, object] | None] = [] + + main_payload = { + key: value + for key, value in full_payload.items() + if key in ("past_key_values", "kv_metadata", "sp.past_key_values", "sp.kv_metadata") + } + branch_roles = cls._discover_cfg_branch_roles(req) + if branch_roles: + main_payload["sp.cfg_branch_roles"] = list(branch_roles) + main_payload["sp.cfg_active_branch"] = None + payloads.append(main_payload or None) + + sampling_params = getattr(req, "sampling_params", None) + branch_map = getattr(sampling_params, "cfg_branch_past_key_values", None) or {} + branch_metadata_map = getattr(sampling_params, "cfg_branch_kv_metadata", None) or {} + + for role in branch_roles: + if sampling_params is None: + payloads.append(None) + continue + + branch_kv = branch_map.get(role) + if branch_kv is None: + branch_kv = getattr(sampling_params, f"{role}_past_key_values", None) + branch_metadata = branch_metadata_map.get(role) + if branch_metadata is None: + branch_metadata = getattr(sampling_params, f"{role}_kv_metadata", None) + if branch_kv is None: + payloads.append(None) + continue + + local_payload = dict(main_payload) + local_payload["sp.cfg_active_branch"] = role + local_payload["sp.cfg_branch_roles"] = list(branch_roles) + local_payload["sp.cfg_branch_past_key_values"] = {role: branch_kv} + local_payload[f"sp.{role}_past_key_values"] = branch_kv + if branch_metadata is not None: + local_payload["sp.cfg_branch_kv_metadata"] = {role: branch_metadata} + local_payload[f"sp.{role}_kv_metadata"] = branch_metadata + + payloads.append(local_payload) + + while len(payloads) < cfg_size: + payloads.append(None) + + return payloads[:cfg_size] + def update_sender_info(self, sender_info: dict[str, Any], sender_stage_id: str | int | None = None) -> None: - """Update receiver-side sender info before loading remote KV cache.""" + """Update receiver-side sender info before loading remote KV cache. + + The orchestrator always reports rank-0's ZMQ port. 
When TP > 1 the + receiver must offset the port so that each TP rank connects to the + corresponding sender rank's port. + + The base host/port are also stored so that the receive path can + construct per-rank metadata for heterogeneous TP scenarios. + """ if not self.config.need_recv_cache: return @@ -523,18 +684,39 @@ def update_sender_info(self, sender_info: dict[str, Any], sender_stage_id: str | logger.warning("Invalid sender_info format: %s", sender_info) return + sender_host = actual_info.get("host") + base_zmq_port = actual_info.get("zmq_port") + + # Store base sender info for per-rank metadata construction. + self._sender_base_host = sender_host + if base_zmq_port is not None: + self._sender_base_zmq_port = int(base_zmq_port) + + # --- Default sender: offset to match this receiver's corresponding sender rank --- + zmq_port = base_zmq_port + if zmq_port is not None and self._tp_topo.local_rank > 0: + zmq_port = int(zmq_port) + self._tp_topo.local_rank * KV_RANK_PORT_STRIDE + if self.config.connector_config: - self.config.connector_config["sender_host"] = actual_info.get("host") - self.config.connector_config["sender_zmq_port"] = actual_info.get("zmq_port") + self.config.connector_config["sender_host"] = sender_host + self.config.connector_config["sender_zmq_port"] = zmq_port if self._connector and hasattr(self._connector, "update_sender_info"): try: - self._connector.update_sender_info(actual_info.get("host"), actual_info.get("zmq_port")) + self._connector.update_sender_info(sender_host, zmq_port) except Exception: if hasattr(self._connector, "sender_host"): - self._connector.sender_host = actual_info.get("host") + self._connector.sender_host = sender_host if hasattr(self._connector, "sender_zmq_port"): - self._connector.sender_zmq_port = actual_info.get("zmq_port") + self._connector.sender_zmq_port = zmq_port + + logger.info( + "Sender info updated: host=%s, base_port=%s, adjusted_port=%s (local_rank=%s)", + sender_host, + base_zmq_port, + zmq_port, + self._tp_topo.local_rank, + ) def handle_finished_requests_kv_transfer( self, @@ -692,35 +874,54 @@ def _transfer_kv_cache(self, kv_data: KVCacheTransferData, transfer_req_id: str) kv_data.request_id = transfer_req_id serialization_start = time.perf_counter() - transfer_data: torch.Tensor | bytes | dict[str, Any] - supports_raw = getattr(self.connector, "supports_raw_data", False) + topo = self._tp_topo + send_keys = build_rank_aware_send_keys( + transfer_req_id, from_stage, to_stage, topo, hook=self.kv_send_key_builder + ) + sender_slice_active = ( + topo.source_tp_size < topo.target_tp_size and len(send_keys) > 1 and not callable(self.kv_send_key_builder) + ) + per_key_payloads: list[tuple[str, torch.Tensor | bytes | dict[str, Any]]] = [] - try: - if supports_raw: - transfer_data = kv_data.to_gpu_tensor() + if sender_slice_active: + target_ranks = get_kv_target_ranks(topo) + if len(target_ranks) != len(send_keys): + logger.warning( + "Skip sender-side KV slicing because target rank count does not match send key count: " + "target_ranks=%s send_keys=%s", + len(target_ranks), + len(send_keys), + ) + sender_slice_active = False else: - raise RuntimeError("Connector does not support raw tensor") - except Exception: - try: - transfer_data = kv_data.to_bytes() - except Exception: - data_dict = kv_data.to_dict() - data_dict["request_id"] = transfer_req_id - transfer_data = data_dict + for put_key, target_rank in zip(send_keys, target_ranks, strict=False): + sliced_kv_data = self._slice_transfer_data_for_target(kv_data, target_rank) + 
per_key_payloads.append((put_key, self._serialize_transfer_payload(sliced_kv_data))) + + if not per_key_payloads: + transfer_data = self._serialize_transfer_payload(kv_data) + per_key_payloads = [(put_key, transfer_data) for put_key in send_keys] serialization_ms = (time.perf_counter() - serialization_start) * 1000 logger.info("KV cache serialized for %s in %.1f ms", transfer_req_id, serialization_ms) transfer_start = time.perf_counter() - success, size, _ = self._transfer_with_retry(from_stage, to_stage, f"kv_cache_{transfer_req_id}", transfer_data) + total_size = 0 + all_succeeded = True + for put_key, transfer_data in per_key_payloads: + success, size, _ = self._transfer_with_retry(from_stage, to_stage, put_key, transfer_data) + total_size += size + all_succeeded = all_succeeded and success + elapsed = time.perf_counter() - transfer_start - if success: - mbps = (size / 1024 / 1024) / elapsed if elapsed > 0 else 0 + if all_succeeded: + mbps = (total_size / 1024 / 1024) / elapsed if elapsed > 0 else 0 logger.info( - "KV transfer OK: %s, %s bytes, %.3fs, %.1f MB/s", + "KV transfer OK: %s, %s bytes across %s key(s), %.3fs, %.1f MB/s", transfer_req_id, - size, + total_size, + len(send_keys), elapsed, mbps, ) @@ -731,7 +932,7 @@ def _transfer_with_retry( self, from_stage: str, to_stage: str, - request_id: str, + put_key: str, data: "dict[str, Any] | bytes | torch.Tensor", max_retries: int = 3, ) -> tuple[bool, int, dict[str, Any] | None]: @@ -740,7 +941,7 @@ def _transfer_with_retry( Args: from_stage: Source stage identifier to_stage: Target stage identifier - request_id: Request identifier for the key + put_key: Pre-built connector key (rank-aware when TP > 1) data: Data to transfer max_retries: Maximum number of retry attempts @@ -749,14 +950,12 @@ def _transfer_with_retry( """ for attempt in range(max_retries): try: - # Build the full key for connector - full_request_id = f"omni_{from_stage}_to_{to_stage}_{request_id}" success, size, metadata = self.connector.put( - from_stage=from_stage, to_stage=to_stage, put_key=full_request_id, data=data + from_stage=from_stage, to_stage=to_stage, put_key=put_key, data=data ) if success: return success, size, metadata - logger.warning(f"Transfer attempt {attempt + 1} failed for {request_id}") + logger.warning(f"Transfer attempt {attempt + 1} failed for {put_key}") except Exception as e: logger.warning(f"Transfer attempt {attempt + 1} exception: {e}") @@ -801,22 +1000,46 @@ def receive_kv_cache_for_request( poll_interval = 0.01 max_poll_interval = 0.5 - logger.info(f"Wait for KV cache for request {request_id} from stage {from_stage} to {to_stage}...") + topo = self._tp_topo + recv_key_pairs = build_rank_aware_recv_keys( + request_id, from_stage, to_stage, topo, hook=self.kv_recv_key_builder + ) + pending_pairs = list(recv_key_pairs) + received_payloads: dict[str, tuple[dict[str, Any], int]] = {} + + logger.info( + "Wait for KV cache for request %s from stage %s to %s via %s key(s)...", + request_id, + from_stage, + to_stage, + len(recv_key_pairs), + ) try: while True: - # Build the full key for connector - full_request_id = f"omni_{from_stage}_to_{to_stage}_kv_cache_{request_id}" link_start = time.perf_counter() - result = self.connector.get( - from_stage=from_stage, - to_stage=to_stage, - get_key=full_request_id, - ) - if result: + for get_key, from_rank in list(pending_pairs): + # Construct per-rank metadata so the connector queries + # the correct sender endpoint (heterogeneous TP path). 
+ # When from_rank is None (TP<=1), metadata stays None + # and the connector falls back to its default sender. + rank_metadata: dict[str, Any] | None = None + if from_rank is not None and self._sender_base_host and self._sender_base_zmq_port is not None: + rank_metadata = { + "source_host": self._sender_base_host, + "source_port": self._sender_base_zmq_port + from_rank * KV_RANK_PORT_STRIDE, + } + + result = self.connector.get( + from_stage=from_stage, + to_stage=to_stage, + get_key=get_key, + metadata=rank_metadata, + ) + if not result: + continue + raw_data, size = result - elapsed = time.time() - start_time - link_ms = (time.perf_counter() - link_start) * 1000 managed_buffer = None if hasattr(raw_data, "tensor") and hasattr(raw_data, "release"): @@ -844,6 +1067,21 @@ def receive_kv_cache_for_request( else: data = raw_data + received_payloads[get_key] = (data, size) + pending_pairs.remove((get_key, from_rank)) + + if not pending_pairs and received_payloads: + elapsed = time.time() - start_time + link_ms = (time.perf_counter() - link_start) * 1000 + ordered_payloads = [received_payloads[key][0] for key, _ in recv_key_pairs] + total_size = sum(received_payloads[key][1] for key, _ in recv_key_pairs) + + if len(ordered_payloads) == 1: + data = ordered_payloads[0] + else: + data = merge_received_rank_shards(ordered_payloads, merger=self.kv_payload_merger) + data = slice_received_rank_shard(data, topo, slicer=self.kv_payload_slicer) + try: if isinstance(data, dict) and "layer_blocks" in data: layer_blocks = data["layer_blocks"] @@ -856,18 +1094,18 @@ def receive_kv_cache_for_request( continue if target_device is not None and tensor.device != target_device: cache_list[i] = tensor.to(target_device).contiguous() - finally: - if managed_buffer is not None: - managed_buffer.release() + except Exception: + logger.exception("Failed to move KV cache tensors to target device") logger.info( - "Successfully received KV cache for %s, %s bytes, wait=%.3fs, link=%.1fms", + "Successfully received KV cache for %s, %s bytes across %s key(s), wait=%.3fs, link=%.1fms", request_id, - size, + total_size, + len(recv_key_pairs), elapsed, link_ms, ) - return data, size + return data, total_size if time.time() - start_time > timeout: logger.error(f"Timeout waiting for KV cache for request {request_id} after {timeout}s") @@ -876,11 +1114,8 @@ def receive_kv_cache_for_request( time.sleep(poll_interval) poll_interval = min(poll_interval * 2, max_poll_interval) - except Exception as e: - logger.error(f"Error receiving KV cache for {request_id}: {e}") - import traceback - - traceback.print_exc() + except Exception: + logger.exception("Error receiving KV cache for %s", request_id) return None, 0 def apply_kv_cache_to_request(self, req: Any, data: dict[str, Any]) -> None: @@ -994,73 +1229,79 @@ def receive_multi_kv_cache_distributed( cfg_kv_collect_func: Callable | None = None, target_device: torch.device | None = None, ) -> bool: - """Broadcast-aware wrapper around :meth:`receive_multi_kv_cache`. - - SharedMemory connector is single-reader: once rank 0 consumes the - segment it is deleted. For multi-GPU stages (e.g. sequence-parallel) - only rank 0 receives; the result is then broadcast to every other - rank via the world process-group. - - For single-worker stages this is equivalent to calling - :meth:`receive_multi_kv_cache` directly. + """Distributed wrapper around :meth:`receive_multi_kv_cache`. 
+ + TP-aware path selection: + - world size 1: direct receive + - TP active, cfg size 1: each rank independently receives + - TP active, cfg size > 1: cfg-rank 0 receives, then broadcasts to + peers that share the same TP rank + - TP inactive: legacy rank-0 receive then world broadcast """ - from vllm_omni.diffusion.distributed.parallel_state import get_world_group + from vllm_omni.diffusion.distributed.parallel_state import ( + get_cfg_group, + get_classifier_free_guidance_rank, + get_classifier_free_guidance_world_size, + get_world_group, + ) world = get_world_group() if world.world_size <= 1: return self.receive_multi_kv_cache(req, cfg_kv_collect_func, target_device) - # --- rank 0: receive to CPU (needed for pickle-based broadcast) --- - if world.rank_in_group == 0: - self.receive_multi_kv_cache(req, cfg_kv_collect_func, torch.device("cpu")) + topo = self._tp_topo + tp_active = topo.source_tp_size > 1 or topo.target_tp_size > 1 + cfg_size = 1 + cfg_rank = 0 + cfg_group = None + try: + cfg_size = get_classifier_free_guidance_world_size() + cfg_rank = get_classifier_free_guidance_rank() + cfg_group = get_cfg_group() + except Exception: + cfg_size = 1 + cfg_rank = 0 + cfg_group = None - kv_payload: dict[str, object] = {} - for attr in ("past_key_values", "kv_metadata"): - val = getattr(req, attr, None) - if val is not None: - kv_payload[attr] = val + if tp_active and cfg_size <= 1: + logger.info( + "Rank-aware KV receive: rank %s independently receiving (from_tp=%s, to_tp=%s)", + topo.local_rank, + topo.source_tp_size, + topo.target_tp_size, + ) + return self.receive_multi_kv_cache(req, cfg_kv_collect_func, target_device) - if hasattr(req, "sampling_params") and req.sampling_params is not None: - for key in list(vars(req.sampling_params).keys()): - if (key.startswith("cfg_") and key.endswith("_past_key_values")) or key in ( - "past_key_values", - "kv_metadata", - ): - val = getattr(req.sampling_params, key, None) - if val is not None: - kv_payload[f"sp.{key}"] = val - - payload_list = [kv_payload] - # Use broadcast_object_list (pickle-based) instead of broadcast_tensor_dict - # because the KV cache is a heterogeneous nested structure (NaiveCache objects - # with metadata + tensors), not a flat tensor dict. This runs once before - # the denoising loop so the serialization cost is negligible. 
- torch.distributed.broadcast_object_list(payload_list, src=world.ranks[0], group=world.cpu_group) - kv_payload = payload_list[0] - else: - payload_list: list[dict[str, object] | None] = [None] - torch.distributed.broadcast_object_list(payload_list, src=world.ranks[0], group=world.cpu_group) - kv_payload = payload_list[0] + if tp_active and cfg_size > 1 and cfg_group is not None: + kv_payload: dict[str, object] | None = None + if cfg_rank == 0: + received = self.receive_multi_kv_cache(req, cfg_kv_collect_func, torch.device("cpu")) + rank_payloads = self._build_cfg_rank_local_payloads(req, cfg_size) if received else [None] * cfg_size + kv_payload = rank_payloads[0] + for dst_rank in range(1, cfg_size): + cfg_group.send_object(rank_payloads[dst_rank], dst_rank) + else: + kv_payload = cfg_group.recv_object(0) - # --- apply on ALL ranks (rank 0 also needs CPU→GPU move) --- - if not kv_payload: - return False + if not kv_payload: + return False - for attr in ("past_key_values", "kv_metadata"): - val = kv_payload.get(attr) - if val is not None: - if target_device is not None: - val = _move_to_device(val, target_device) - setattr(req, attr, val) + self._apply_request_kv_payload(req, kv_payload, target_device) + return True - if hasattr(req, "sampling_params") and req.sampling_params is not None: - for key, val in kv_payload.items(): - if key.startswith("sp."): - if target_device is not None: - val = _move_to_device(val, target_device) - setattr(req.sampling_params, key[3:], val) + kv_payload: dict[str, object] | None = None + if world.rank_in_group == 0: + received = self.receive_multi_kv_cache(req, cfg_kv_collect_func, torch.device("cpu")) + if received: + kv_payload = self._collect_request_kv_payload(req) + + kv_payload = world.broadcast_object(kv_payload, src=0) + + if not kv_payload: + return False + self._apply_request_kv_payload(req, kv_payload, target_device) return True diff --git a/vllm_omni/distributed/omni_connectors/utils/kv_utils.py b/vllm_omni/distributed/omni_connectors/utils/kv_utils.py index 2cb48a8b34..12b9b3d4f7 100644 --- a/vllm_omni/distributed/omni_connectors/utils/kv_utils.py +++ b/vllm_omni/distributed/omni_connectors/utils/kv_utils.py @@ -1,15 +1,380 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Utility helpers for KV cache manipulation.""" +"""Utility helpers for KV cache manipulation, TP routing, and merge/slice.""" + +from __future__ import annotations + +import os +from collections.abc import Callable +from dataclasses import dataclass +from typing import Any import torch +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from vllm.logger import init_logger +from .initialization import KV_RANK_PORT_STRIDE, KV_TRANSFER_PORT_OFFSET + logger = init_logger(__name__) LayerKV = torch.Tensor | tuple[torch.Tensor, torch.Tensor] +# ------------------------------------------------------------------ # +# TP Topology +# ------------------------------------------------------------------ # + + +@dataclass(frozen=True) +class KVTPTopology: + """Immutable descriptor for a KV-transfer parallel mapping. + + Captures sender/receiver parallel sizes and the local rank within + that parallel dimension. Works for any divisible parallel dimension + (TP, SP, Ring Attention). 
+ """ + + source_tp_size: int + target_tp_size: int + local_rank: int + + def __post_init__(self) -> None: + if self.source_tp_size <= 0 or self.target_tp_size <= 0: + raise ValueError( + f"Parallel sizes must be positive: " + f"source_tp_size={self.source_tp_size}, target_tp_size={self.target_tp_size}" + ) + if self.local_rank < 0: + raise ValueError(f"local_rank must be non-negative, got {self.local_rank}") + + @property + def is_heterogeneous(self) -> bool: + return self.source_tp_size != self.target_tp_size + + @property + def ratio(self) -> int: + """Larger parallel size divided by smaller. Always >= 1.""" + return max(self.source_tp_size, self.target_tp_size) // min(self.source_tp_size, self.target_tp_size) + + +# ------------------------------------------------------------------ # +# Runtime TP detection +# ------------------------------------------------------------------ # + + +def get_local_tp_rank() -> int: + """Return the TP-local rank of this worker process. + + Uses ``get_tensor_model_parallel_rank()`` which returns the rank + within the TP group only, not the stage-global rank. + """ + try: + return get_tensor_model_parallel_rank() + except Exception: + logger.debug("TP parallel state not initialized, falling back to LOCAL_RANK env", exc_info=True) + try: + return int(os.environ.get("LOCAL_RANK", "0")) + except (ValueError, TypeError): + return 0 + + +def get_tp_world_size() -> int: + """Return the TP world size (tensor-parallel dimension only). + + Uses ``get_tensor_model_parallel_world_size()`` so that + cfg_parallel, SP, PP etc. are not included in the count. + """ + try: + return get_tensor_model_parallel_world_size() + except Exception: + logger.debug("TP parallel state not initialized, defaulting world_size=1", exc_info=True) + return 1 + + +# ------------------------------------------------------------------ # +# ZMQ port computation +# ------------------------------------------------------------------ # + + +def kv_zmq_port(base_port: int, from_stage: int, local_rank: int = 0) -> int: + """Compute the ZMQ port for a KV-transfer connector. + + Each TP rank gets its own port so that TP > 1 deployments do not + cause ``EADDRINUSE`` when multiple sender workers bind on the same + host. The formula is backward-compatible: rank 0 produces the same + port as the previous ``base + OFFSET + stage`` formula. 
+ """ + return base_port + KV_TRANSFER_PORT_OFFSET + local_rank * KV_RANK_PORT_STRIDE + from_stage + + +# ------------------------------------------------------------------ # +# TP topology validation and rank routing +# ------------------------------------------------------------------ # + + +def validate_kv_tp_topology(topo: KVTPTopology) -> None: + """Reject heterogeneous TP mappings that cannot be routed losslessly.""" + larger = max(topo.source_tp_size, topo.target_tp_size) + smaller = min(topo.source_tp_size, topo.target_tp_size) + if larger % smaller != 0: + raise ValueError( + f"KV TP mapping must be divisible: " + f"source_tp_size={topo.source_tp_size}, " + f"target_tp_size={topo.target_tp_size}" + ) + + +def get_kv_target_ranks(topo: KVTPTopology) -> list[int]: + """Which remote ranks this local rank sends KV shards to (send side).""" + validate_kv_tp_topology(topo) + if topo.source_tp_size == topo.target_tp_size: + return [topo.local_rank] + if topo.source_tp_size > topo.target_tp_size: + return [topo.local_rank // (topo.source_tp_size // topo.target_tp_size)] + ratio = topo.target_tp_size // topo.source_tp_size + return [topo.local_rank * ratio + i for i in range(ratio)] + + +def get_kv_source_ranks(topo: KVTPTopology) -> list[int]: + """Which remote ranks this local rank receives KV shards from (recv side).""" + validate_kv_tp_topology(topo) + if topo.source_tp_size == topo.target_tp_size: + return [topo.local_rank] + if topo.source_tp_size > topo.target_tp_size: + ratio = topo.source_tp_size // topo.target_tp_size + return [topo.local_rank * ratio + i for i in range(ratio)] + return [topo.local_rank // (topo.target_tp_size // topo.source_tp_size)] + + +# ------------------------------------------------------------------ # +# Rank-aware connector key building +# ------------------------------------------------------------------ # + + +def get_kv_connector_key( + req_id: str, + from_stage: int | str, + chunk_id: int, + from_rank: int, + to_rank: int, +) -> str: + """Build connector key that includes rank info for KV transfers. + + Format matches PR #2677: ``{req_id}_{from_stage}_{chunk_id}_{from_rank}_{to_rank}`` + """ + return f"{req_id}_{from_stage}_{chunk_id}_{from_rank}_{to_rank}" + + +def build_rank_aware_send_keys( + request_id: str, + from_stage: str, + to_stage: str, + topo: KVTPTopology, + hook: Callable | None = None, +) -> list[str]: + """Build send-side connector keys, checking injectable hook first.""" + if callable(hook): + keys = list(hook(request_id, from_stage, to_stage)) + if keys: + return keys + if topo.source_tp_size <= 1 and topo.target_tp_size <= 1: + return [f"omni_{from_stage}_to_{to_stage}_kv_cache_{request_id}"] + target_ranks = get_kv_target_ranks(topo) + return [get_kv_connector_key(request_id, from_stage, 0, topo.local_rank, r) for r in target_ranks] + + +def build_rank_aware_recv_keys( + request_id: str, + from_stage: str, + to_stage: str, + topo: KVTPTopology, + hook: Callable | None = None, +) -> list[tuple[str, int | None]]: + """Build recv-side connector keys with sender rank info. + + Returns a list of ``(key, from_rank)`` tuples. ``from_rank`` is + ``None`` when TP <= 1 (single sender, no per-rank routing needed). + For TP > 1, ``from_rank`` identifies which sender rank owns the + key so that the connector can route metadata queries to the + correct endpoint. + """ + if callable(hook): + raw = list(hook(request_id, from_stage, to_stage)) + if raw: + if isinstance(raw[0], tuple): + return raw + # Hook returned plain strings (e.g. 
OmniConnectorModelRunnerMixin. + # get_rank_aware_kv_keys). Reconstruct from_rank from topology so + # Mooncake connector can route metadata queries to the correct + # sender endpoint in heterogeneous TP. + # TODO: have the mixin return (key, from_rank) tuples directly + # to avoid this indirect reconstruction. + source_ranks = get_kv_source_ranks(topo) + if len(raw) == len(source_ranks): + return list(zip(raw, source_ranks)) + return [(k, None) for k in raw] + if topo.source_tp_size <= 1 and topo.target_tp_size <= 1: + return [(f"omni_{from_stage}_to_{to_stage}_kv_cache_{request_id}", None)] + source_ranks = get_kv_source_ranks(topo) + return [(get_kv_connector_key(request_id, from_stage, 0, r, topo.local_rank), r) for r in source_ranks] + + +# ------------------------------------------------------------------ # +# KV tensor head slicing (heterogeneous TP) +# ------------------------------------------------------------------ # + + +def slice_kv_tensor_heads( + tensor: torch.Tensor | None, + offset_in_shard: int, + num_slices: int, +) -> torch.Tensor | None: + """Slice one KV tensor along its head dimension (dim 1).""" + if tensor is None: + return None + if not isinstance(tensor, torch.Tensor): + return tensor + if tensor.dim() < 2: + raise ValueError(f"Expected KV tensor with a head dimension, got shape={tuple(tensor.shape)}") + if num_slices <= 0: + raise ValueError(f"num_slices must be > 0, got {num_slices}") + if not (0 <= offset_in_shard < num_slices): + raise ValueError(f"offset_in_shard must be in [0, {num_slices}), got {offset_in_shard}") + + heads_in_shard = tensor.shape[1] + if heads_in_shard % num_slices != 0: + raise ValueError( + "KV head count must be divisible for heterogeneous TP slicing: " + f"heads_in_shard={heads_in_shard}, num_slices={num_slices}" + ) + + heads_per_slice = heads_in_shard // num_slices + start = offset_in_shard * heads_per_slice + end = start + heads_per_slice + return tensor[:, start:end, ...].contiguous() + + +def slice_layer_blocks( + layer_blocks: dict[str, Any], + offset_in_shard: int, + num_slices: int, +) -> dict[str, list[torch.Tensor | None]]: + """Slice all KV layers for one logical receiver rank.""" + sliced_blocks: dict[str, list[torch.Tensor | None]] = {} + for cache_name in ("key_cache", "value_cache"): + cache_list = layer_blocks.get(cache_name, []) + sliced_blocks[cache_name] = [ + slice_kv_tensor_heads(tensor, offset_in_shard, num_slices) for tensor in cache_list + ] + return sliced_blocks + + +# ------------------------------------------------------------------ # +# Multi-rank merge and receiver-side slice +# ------------------------------------------------------------------ # + + +def merge_received_rank_shards( + payloads: list[dict[str, Any]], + merger: Callable | None = None, +) -> dict[str, Any] | None: + """Merge multiple source-rank KV shards for one target rank. + + When *merger* is provided (injectable hook), it is called directly. + Otherwise the default merges along the head dimension (dim 1). 
+ """ + if callable(merger): + return merger(payloads) + if not payloads: + return None + if len(payloads) == 1: + return payloads[0] + + base_payload = payloads[0] + if not isinstance(base_payload, dict) or "layer_blocks" not in base_payload: + return base_payload + + merged: dict[str, Any] = { + "request_id": base_payload.get("request_id"), + "block_ids": list(base_payload.get("block_ids", [])), + "metadata": dict(base_payload.get("metadata", {})), + } + merged_layer_blocks: dict[str, list[torch.Tensor | None]] = {} + + for cache_name in ("key_cache", "value_cache"): + cache_lists = [payload.get("layer_blocks", {}).get(cache_name, []) for payload in payloads] + num_layers = max((len(cache_list) for cache_list in cache_lists), default=0) + merged_cache: list[torch.Tensor | None] = [] + + for layer_idx in range(num_layers): + layer_tensors = [ + cache_list[layer_idx] + for cache_list in cache_lists + if layer_idx < len(cache_list) and cache_list[layer_idx] is not None + ] + if not layer_tensors: + merged_cache.append(None) + elif len(layer_tensors) == 1 or not isinstance(layer_tensors[0], torch.Tensor): + merged_cache.append(layer_tensors[0]) + else: + merged_cache.append(torch.cat(layer_tensors, dim=1).contiguous()) + + merged_layer_blocks[cache_name] = merged_cache + + merged["layer_blocks"] = merged_layer_blocks + return merged + + +def slice_received_rank_shard( + payload: dict[str, Any] | None, + topo: KVTPTopology, + slicer: Callable | None = None, +) -> dict[str, Any] | None: + """Optionally slice a received payload to extract this rank's portion. + + Used when ``to_tp > from_tp``: the sender sent full heads and each + receiver rank slices out its own subset. + """ + if callable(slicer): + return slicer(payload) + if not payload or topo.target_tp_size <= topo.source_tp_size or "layer_blocks" not in payload: + return payload + + metadata = payload.get("metadata", {}) + slice_metadata = metadata.get("tp_head_slice") if isinstance(metadata, dict) else None + if isinstance(slice_metadata, dict) and slice_metadata.get("applied"): + tagged_rank = slice_metadata.get("target_rank") + if tagged_rank is not None and tagged_rank != topo.local_rank: + logger.warning( + "Received pre-sliced KV payload for unexpected target rank: expected=%s got=%s", + topo.local_rank, + tagged_rank, + ) + return payload + + ratio = topo.target_tp_size // topo.source_tp_size + offset_in_sender = topo.local_rank % ratio + updated_metadata = dict(metadata) if isinstance(metadata, dict) else {} + updated_metadata["tp_head_slice"] = { + "applied": True, + "side": "receiver", + "target_rank": topo.local_rank, + "from_tp": topo.source_tp_size, + "to_tp": topo.target_tp_size, + "offset_in_shard": offset_in_sender, + "num_slices": ratio, + } + return { + "request_id": payload.get("request_id"), + "layer_blocks": slice_layer_blocks(payload["layer_blocks"], offset_in_sender, ratio), + "block_ids": list(payload.get("block_ids", [])), + "metadata": updated_metadata, + } + + def normalize_layer_kv( layer_kv: LayerKV, *, diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 054d5342d9..23a85e9f5f 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -61,6 +61,7 @@ ) from vllm_omni.engine.stage_init_utils import ( StartedLlmStage, + _inject_inferred_kv_tp_topology, acquire_device_locks, build_diffusion_config, build_engine_args_dict, @@ -78,7 +79,10 @@ setup_stage_devices, terminate_alive_proc, ) -from vllm_omni.entrypoints.utils import 
load_and_resolve_stage_configs +from vllm_omni.entrypoints.utils import ( + inject_omni_kv_config, + load_and_resolve_stage_configs, +) from vllm_omni.inputs.preprocess import OmniInputPreprocessor from vllm_omni.platforms import current_omni_platform @@ -378,6 +382,12 @@ def _launch_llm_stage( omni_kv["omni_to_stage"] = omni_to omni_kv.setdefault("stage_id", metadata.stage_id) engine_args_dict["omni_kv_config"] = omni_kv + if self.stage_configs: + _inject_inferred_kv_tp_topology( + engine_args_dict.get("omni_kv_config"), + metadata.stage_id, + self.stage_configs, + ) vllm_config, executor_class = build_vllm_config( stage_cfg, self.model, @@ -747,10 +757,8 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: setup_stage_devices(configured_stage_id, metadata.runtime_cfg) omni_conn_cfg, omni_from, omni_to = omni_kv_connector if omni_conn_cfg: - from vllm_omni.entrypoints.utils import inject_omni_kv_config - inject_omni_kv_config(stage_cfg, omni_conn_cfg, omni_from, omni_to) - inject_kv_stage_info(stage_cfg, configured_stage_id) + inject_kv_stage_info(stage_cfg, configured_stage_id, self.stage_configs) if self.single_stage_mode: assert self._omni_master_server is not None stage_clients[stage_idx] = self._launch_diffusion_stage( diff --git a/vllm_omni/engine/stage_engine_core_client.py b/vllm_omni/engine/stage_engine_core_client.py index 52e674f476..ab2de757ba 100644 --- a/vllm_omni/engine/stage_engine_core_client.py +++ b/vllm_omni/engine/stage_engine_core_client.py @@ -14,7 +14,9 @@ from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import AsyncMPClient, DPLBAsyncMPClient -from vllm_omni.distributed.omni_connectors.utils.initialization import KV_TRANSFER_PORT_OFFSET +from vllm_omni.distributed.omni_connectors.utils.initialization import ( + KV_TRANSFER_PORT_OFFSET, +) from vllm_omni.engine.stage_init_utils import StageMetadata if TYPE_CHECKING: @@ -246,6 +248,8 @@ def _initialize_kv_sender_endpoint(self) -> None: from_stage = omni_kv_config.get("omni_from_stage", from_stage) try: + # Orchestrator always reports rank-0's port; receiver + # workers add their own local_rank * KV_RANK_PORT_STRIDE. sender_port = int(base_port) + KV_TRANSFER_PORT_OFFSET + int(from_stage) except (TypeError, ValueError): logger.warning( @@ -284,6 +288,7 @@ def get_kv_sender_info( self._kv_sender_host = self._resolve_contact_host() if self._kv_sender_host is None: return None + # rank-0 base port; receiver workers adjust per KV_RANK_PORT_STRIDE. 
return { "host": self._kv_sender_host, "zmq_port": base_port + kv_transfer_port_offset + int(self.stage_id), diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 3a7fe4bad7..c697e34bac 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -13,7 +13,7 @@ import multiprocessing as mp import os import time -from collections.abc import Callable +from collections.abc import Callable, Sequence from dataclasses import dataclass from typing import Any, Literal @@ -101,8 +101,110 @@ def resolve_worker_cls(engine_args: dict[str, Any]) -> None: raise ValueError(f"Unknown worker_type: {worker_type}") -def inject_kv_stage_info(stage_cfg: Any, stage_id: int) -> None: - """Inject stage metadata into omni_kv_config when present.""" +def _get_attr_or_item(obj: Any, key: str, default: Any = None) -> Any: + """Read *key* from *obj* regardless of whether it's a dict or object.""" + if hasattr(obj, "get"): + return obj.get(key, default) + return getattr(obj, key, default) + + +def _tp_size_for_stage(stage_configs: Sequence[Any], stage_id: Any) -> int | None: + """Resolve tensor_parallel_size for *stage_id* from the loaded stage configs.""" + id_strs = {str(stage_id)} + try: + id_strs.add(str(int(stage_id))) + except (TypeError, ValueError): + pass + + for stage_cfg in stage_configs: + if str(getattr(stage_cfg, "stage_id", None)) not in id_strs: + continue + engine_args = getattr(stage_cfg, "engine_args", None) + if engine_args is None: + return 1 + parallel_config = _get_attr_or_item(engine_args, "parallel_config") + if parallel_config is not None: + tp = _get_attr_or_item(parallel_config, "tensor_parallel_size", 1) + else: + tp = _get_attr_or_item(engine_args, "tensor_parallel_size", 1) + try: + return max(1, int(tp)) + except (TypeError, ValueError): + return 1 + return None + + +def _inject_inferred_kv_tp_topology( + omni_kv: Any, + stage_id: int, + stage_configs: Sequence[Any], + engine_input_source: Sequence[int] | None = None, +) -> None: + """Infer adjacent-stage TP topology and inject it into omni_kv_config. + + This keeps heterogeneous TP working without requiring user-authored + rank_mapping blocks in config files. 
+ """ + if omni_kv is None: + return + + if hasattr(omni_kv, "get"): + need_send = bool(omni_kv.get("need_send_cache", False)) + need_recv = bool(omni_kv.get("need_recv_cache", False)) + omni_from_stage = omni_kv.get("omni_from_stage") + omni_to_stage = omni_kv.get("omni_to_stage") + rank_mapping = omni_kv.get("rank_mapping") + else: + need_send = bool(getattr(omni_kv, "need_send_cache", False)) + need_recv = bool(getattr(omni_kv, "need_recv_cache", False)) + omni_from_stage = getattr(omni_kv, "omni_from_stage", None) + omni_to_stage = getattr(omni_kv, "omni_to_stage", None) + rank_mapping = getattr(omni_kv, "rank_mapping", None) + + if not need_send and not need_recv: + return + + current_tp = _tp_size_for_stage(stage_configs, stage_id) + if current_tp is None: + return + + peer_stage_id = None + from_tp = None + to_tp = None + if str(omni_from_stage) == str(stage_id): + peer_stage_id = omni_to_stage + from_tp = current_tp + to_tp = _tp_size_for_stage(stage_configs, peer_stage_id) + elif str(omni_to_stage) == str(stage_id): + peer_stage_id = omni_from_stage + from_tp = _tp_size_for_stage(stage_configs, peer_stage_id) + to_tp = current_tp + elif need_recv and engine_input_source: + peer_stage_id = engine_input_source[0] + from_tp = _tp_size_for_stage(stage_configs, peer_stage_id) + to_tp = current_tp + + if from_tp is None or to_tp is None: + return + + if not isinstance(rank_mapping, dict): + rank_mapping = {} + rank_mapping.setdefault("from_tp", int(from_tp)) + rank_mapping.setdefault("to_tp", int(to_tp)) + + if hasattr(omni_kv, "__setitem__"): + omni_kv["rank_mapping"] = rank_mapping + else: + setattr(omni_kv, "rank_mapping", rank_mapping) + + +def inject_kv_stage_info(stage_cfg: Any, stage_id: int, stage_configs: Sequence[Any] | None = None) -> None: + """Inject stage_id, engine_input_source, and inferred TP topology into omni_kv_config. + + When *stage_configs* is provided, also infers from_tp/to_tp for + heterogeneous TP topologies so the KV transfer manager can compute + rank mappings automatically. 
+ """ try: engine_args = stage_cfg.engine_args if hasattr(engine_args, "get"): @@ -125,6 +227,14 @@ def inject_kv_stage_info(stage_cfg: Any, stage_id: int) -> None: omni_kv.setdefault("engine_input_source", list(engine_input_source)) elif hasattr(omni_kv, "__setitem__") and "engine_input_source" not in omni_kv: omni_kv["engine_input_source"] = list(engine_input_source) + + if stage_configs: + _inject_inferred_kv_tp_topology( + omni_kv, + stage_id=stage_id, + stage_configs=stage_configs, + engine_input_source=engine_input_source, + ) except Exception as e: logger.debug("Failed to inject stage info into omni_kv_config: %s", e) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 39fcbc9a0a..4b3a7045ca 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -86,6 +86,7 @@ from vllm_omni.entrypoints.openai.protocol import OmniChatCompletionStreamResponse from vllm_omni.entrypoints.openai.protocol.audio import AudioResponse, CreateAudio from vllm_omni.entrypoints.openai.utils import ( + get_stage_type, get_supported_speakers_from_hf_config, parse_lora_request, validate_requested_speaker, @@ -294,6 +295,8 @@ async def create_chat_completion( ) num_inference_steps = None + cfg_text_scale = None + cfg_img_scale = None # Omni multistage image generation: Stage-0 (AR) should receive a clean # text prompt (and optional conditioning image/size) so the model's own # processor can construct the correct inputs. @@ -342,6 +345,8 @@ async def create_chat_completion( except Exception: pass negative_prompt = extra_body.get("negative_prompt") + cfg_text_scale = extra_body.get("cfg_text_scale") + cfg_img_scale = extra_body.get("cfg_img_scale") engine_prompt_image: dict[str, Any] | None = None is_img2img = False @@ -397,14 +402,18 @@ async def create_chat_completion( sampling_params_list = self._build_sampling_params_list_from_request(request) # Apply user-specified overrides to diffusion stage(s) for image generation - if _image_gen_height is not None or _image_gen_width is not None or num_inference_steps is not None: - for idx, sp in enumerate(sampling_params_list): - if hasattr(sp, "height") and _image_gen_height is not None: - sp.height = _image_gen_height - if hasattr(sp, "width") and _image_gen_width is not None: - sp.width = _image_gen_width - if hasattr(sp, "num_inference_steps") and num_inference_steps is not None: - sp.num_inference_steps = num_inference_steps + for idx, sp in enumerate(sampling_params_list): + if hasattr(sp, "height") and _image_gen_height is not None: + sp.height = _image_gen_height + if hasattr(sp, "width") and _image_gen_width is not None: + sp.width = _image_gen_width + if hasattr(sp, "num_inference_steps") and num_inference_steps is not None: + sp.num_inference_steps = num_inference_steps + if hasattr(sp, "extra_args") and sp.extra_args is not None: + if cfg_text_scale is not None: + sp.extra_args["cfg_text_scale"] = cfg_text_scale + if cfg_img_scale is not None: + sp.extra_args["cfg_img_scale"] = cfg_img_scale self._log_inputs( request_id, @@ -2108,6 +2117,8 @@ async def _create_diffusion_chat_completion( num_inference_steps = extra_body.get("num_inference_steps") guidance_scale = extra_body.get("guidance_scale") true_cfg_scale = extra_body.get("true_cfg_scale") or extra_body.get("cfg_scale") + cfg_text_scale = extra_body.get("cfg_text_scale") + cfg_img_scale = extra_body.get("cfg_img_scale") seed = extra_body.get("seed") negative_prompt = 
extra_body.get("negative_prompt") num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1) @@ -2162,6 +2173,10 @@ async def _create_diffusion_chat_completion( gen_params.guidance_scale = guidance_scale if true_cfg_scale is not None: gen_params.true_cfg_scale = true_cfg_scale + if cfg_text_scale is not None: + gen_params.extra_args["cfg_text_scale"] = cfg_text_scale + if cfg_img_scale is not None: + gen_params.extra_args["cfg_img_scale"] = cfg_img_scale if num_frames is not None: gen_params.num_frames = num_frames if guidance_scale_2 is not None: @@ -2206,10 +2221,30 @@ async def _create_diffusion_chat_completion( # Generate image diffusion_engine = cast(AsyncOmni, self._diffusion_engine) + stage_configs = list(getattr(diffusion_engine, "stage_configs", []) or []) + default_params_list = list(getattr(diffusion_engine, "default_sampling_params_list", []) or []) + + sampling_params_list: list[Any] = [] + for idx, stage_cfg in enumerate(stage_configs): + if get_stage_type(stage_cfg) == "diffusion": + sampling_params_list.append(gen_params) + continue + + default_stage_params = default_params_list[idx] if idx < len(default_params_list) else SamplingParams() + if hasattr(default_stage_params, "clone"): + try: + default_stage_params = default_stage_params.clone() + except Exception: + pass + sampling_params_list.append(default_stage_params) + + if not sampling_params_list: + sampling_params_list = [gen_params] + result = None async for output in diffusion_engine.generate( prompt=gen_prompt, - sampling_params_list=[gen_params], # Pass as single-stage params + sampling_params_list=sampling_params_list, request_id=request_id, ): result = output diff --git a/vllm_omni/inputs/data.py b/vllm_omni/inputs/data.py index 85faf6b949..e4c33a58c2 100644 --- a/vllm_omni/inputs/data.py +++ b/vllm_omni/inputs/data.py @@ -267,6 +267,10 @@ class OmniDiffusionSamplingParams: cfg_text_kv_metadata: dict[str, Any] | None = None cfg_img_kv_metadata: dict[str, Any] | None = None cfg_kv_request_ids: dict[str, str] | None = None + cfg_active_branch: str | None = None + cfg_branch_roles: list[str] | None = None + cfg_branch_past_key_values: dict[str, Any] | None = None + cfg_branch_kv_metadata: dict[str, dict[str, Any]] | None = None # Component modules modules: dict[str, Any] = field(default_factory=dict) From f1cb4ebe4ce200ccddb8297c88203c8da9b4fd53 Mon Sep 17 00:00:00 2001 From: fan2956 Date: Thu, 16 Apr 2026 18:21:34 +0800 Subject: [PATCH 194/204] [PERF] Wan2.2 support rmsnorm fused op (#2583) Signed-off-by: fan2956 Signed-off-by: gcanlin Co-authored-by: gcanlin --- tests/diffusion/layers/test_norm.py | 453 ++++++++++++++++++ vllm_omni/diffusion/layers/adalayernorm.py | 3 +- vllm_omni/diffusion/layers/norm.py | 110 +++++ .../models/wan2_2/wan2_2_transformer.py | 29 +- 4 files changed, 585 insertions(+), 10 deletions(-) create mode 100644 tests/diffusion/layers/test_norm.py create mode 100644 vllm_omni/diffusion/layers/norm.py diff --git a/tests/diffusion/layers/test_norm.py b/tests/diffusion/layers/test_norm.py new file mode 100644 index 0000000000..e420415285 --- /dev/null +++ b/tests/diffusion/layers/test_norm.py @@ -0,0 +1,453 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for LayerNorm and RMSNorm custom ops in diffusion layers.""" + +import pytest +import torch + +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] + + +# ── Import tests ── + + +def test_layernorm_import(): + """Verify LayerNorm 
can be imported from the norm module.""" + from vllm_omni.diffusion.layers.norm import LayerNorm # noqa: F401 + + +def test_rmsnorm_import(): + """Verify RMSNorm can be imported from the norm module.""" + from vllm_omni.diffusion.layers.norm import RMSNorm # noqa: F401 + + +# ── LayerNorm tests ── + + +def test_layernorm_forward_shape(): + """LayerNorm produces correct output shapes.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + batch = 2 + seq_len = 4 + norm = LayerNorm(dim) + + x = torch.randn(batch, seq_len, dim) + out = norm(x) + + assert out.shape == (batch, seq_len, dim) + + +def test_layernorm_forward_shape_2d(): + """LayerNorm works with 2D input tensors.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + batch = 2 + norm = LayerNorm(dim) + + x = torch.randn(batch, dim) + out = norm(x) + + assert out.shape == (batch, dim) + + +def test_layernorm_preserves_dtype_fp32(): + """LayerNorm preserves float32 dtype.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim) + + x = torch.randn(2, 4, dim, dtype=torch.float32) + out = norm(x) + + assert out.dtype == torch.float32 + + +def test_layernorm_preserves_dtype_fp16(): + """LayerNorm preserves float16 dtype.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim) + + x = torch.randn(2, 4, dim, dtype=torch.float16) + out = norm(x) + + assert out.dtype == torch.float16 + + +def test_layernorm_preserves_dtype_bf16(): + """LayerNorm preserves bfloat16 dtype.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim) + + x = torch.randn(2, 4, dim, dtype=torch.bfloat16) + out = norm(x) + + assert out.dtype == torch.bfloat16 + + +def test_layernorm_without_elementwise_affine(): + """LayerNorm works without elementwise_affine (no learned parameters).""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim, elementwise_affine=False) + + assert norm.weight is None + assert norm.bias is None + + x = torch.randn(2, 4, dim) + out = norm(x) + + assert out.shape == (2, 4, dim) + + +def test_layernorm_custom_eps(): + """LayerNorm accepts custom epsilon value.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + eps = 1e-5 + norm = LayerNorm(dim, eps=eps) + + assert norm.eps == eps + + +def test_layernorm_has_learnable_parameters(): + """LayerNorm has learnable weight and bias by default.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim) + + assert norm.weight is not None + assert norm.bias is not None + assert norm.weight.shape == (dim,) + assert norm.bias.shape == (dim,) + + +def test_layernorm_matches_fp32_reference(): + """Verify LayerNorm produces identical output to FP32 nn.LayerNorm.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + eps = 1e-6 + torch.manual_seed(42) + + ours = LayerNorm(dim, eps=eps) + ref = torch.nn.LayerNorm(dim, eps=eps) + + # Copy weights + ref.weight.data.copy_(ours.weight.data) + ref.bias.data.copy_(ours.bias.data) + + x = torch.randn(2, 4, dim) + + out_ours = ours(x) + out_ref = ref(x.float()).to(x.dtype) + + torch.testing.assert_close(out_ours, out_ref, atol=1e-5, rtol=1e-5) + + +def test_layernorm_matches_diffusers_fp32layernorm(): + """Verify LayerNorm produces identical output to diffusers FP32LayerNorm.""" + from diffusers.models.normalization import FP32LayerNorm + + from vllm_omni.diffusion.layers.norm import LayerNorm + + 
dim = 64 + eps = 1e-6 + torch.manual_seed(42) + + ours = LayerNorm(dim, eps=eps) + ref = FP32LayerNorm(dim, eps=eps) + + # Copy weights + ref.weight.data.copy_(ours.weight.data) + ref.bias.data.copy_(ours.bias.data) + + # Test with fp16 input to verify FP32 computation + x = torch.randn(2, 4, dim, dtype=torch.float16) + + out_ours = ours(x) + out_ref = ref(x) + + torch.testing.assert_close(out_ours, out_ref, atol=1e-3, rtol=1e-3) + + +# ── RMSNorm tests ── + + +def test_rmsnorm_forward_shape(): + """RMSNorm produces correct output shapes.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + batch = 2 + seq_len = 4 + norm = RMSNorm(hidden_size) + + x = torch.randn(batch, seq_len, hidden_size) + out = norm(x) + + assert out.shape == (batch, seq_len, hidden_size) + + +def test_rmsnorm_forward_shape_2d(): + """RMSNorm works with 2D input tensors.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + batch = 2 + norm = RMSNorm(hidden_size) + + x = torch.randn(batch, hidden_size) + out = norm(x) + + assert out.shape == (batch, hidden_size) + + +def test_rmsnorm_preserves_dtype_fp32(): + """RMSNorm preserves float32 dtype.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + norm = RMSNorm(hidden_size) + + x = torch.randn(2, 4, hidden_size, dtype=torch.float32) + out = norm(x) + + assert out.dtype == torch.float32 + + +def test_rmsnorm_preserves_dtype_fp16(): + """RMSNorm preserves float16 dtype.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + norm = RMSNorm(hidden_size) + + x = torch.randn(2, 4, hidden_size, dtype=torch.float16) + out = norm(x) + + assert out.dtype == torch.float16 + + +def test_rmsnorm_preserves_dtype_bf16(): + """RMSNorm preserves bfloat16 dtype.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + norm = RMSNorm(hidden_size) + + x = torch.randn(2, 4, hidden_size, dtype=torch.bfloat16) + out = norm(x) + + assert out.dtype == torch.bfloat16 + + +def test_rmsnorm_custom_eps(): + """RMSNorm accepts custom epsilon value.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + eps = 1e-5 + norm = RMSNorm(hidden_size, eps=eps) + + assert norm.variance_epsilon == eps + + +def test_rmsnorm_has_weight_parameter(): + """RMSNorm has learnable weight parameter initialized to ones.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + norm = RMSNorm(hidden_size) + + assert norm.weight is not None + assert norm.weight.shape == (hidden_size,) + torch.testing.assert_close(norm.weight, torch.ones(hidden_size)) + + +def test_rmsnorm_numerical_correctness(): + """Verify RMSNorm produces numerically correct output.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + eps = 1e-6 + torch.manual_seed(42) + + norm = RMSNorm(hidden_size, eps=eps) + x = torch.randn(2, 4, hidden_size) + + # Compute expected output manually + x_fp32 = x.to(torch.float32) + variance = x_fp32.pow(2).mean(-1, keepdim=True) + expected = x_fp32 * torch.rsqrt(variance + eps) + expected = norm.weight.to(torch.float32) * expected + expected = expected.to(x.dtype) + + out = norm(x) + + torch.testing.assert_close(out, expected, atol=1e-5, rtol=1e-5) + + +def test_rmsnorm_matches_reference_implementation(): + """Verify RMSNorm matches a reference implementation.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + def reference_rmsnorm(x, weight, eps): + """Reference RMSNorm implementation.""" + input_dtype = 
x.dtype + x = x.to(torch.float32) + variance = x.pow(2).mean(-1, keepdim=True) + out = x * torch.rsqrt(variance + eps) + out = weight.to(torch.float32) * out + return out.to(input_dtype) + + hidden_size = 128 + eps = 1e-6 + torch.manual_seed(123) + + norm = RMSNorm(hidden_size, eps=eps) + + # Test with various dtypes + for dtype in [torch.float32, torch.float16, torch.bfloat16]: + x = torch.randn(4, 8, hidden_size, dtype=dtype) + expected = reference_rmsnorm(x, norm.weight, eps) + out = norm(x) + torch.testing.assert_close(out, expected, atol=1e-3, rtol=1e-3) + + +# ── CustomOp dispatch tests ── + + +def test_layernorm_inherits_from_customop(): + """LayerNorm inherits from CustomOp for platform dispatch.""" + from vllm_omni.diffusion.layers.custom_op import CustomOp + from vllm_omni.diffusion.layers.norm import LayerNorm + + norm = LayerNorm(64) + assert isinstance(norm, CustomOp) + + +def test_rmsnorm_inherits_from_customop(): + """RMSNorm inherits from CustomOp for platform dispatch.""" + from vllm_omni.diffusion.layers.custom_op import CustomOp + from vllm_omni.diffusion.layers.norm import RMSNorm + + norm = RMSNorm(64) + assert isinstance(norm, CustomOp) + + +def test_layernorm_has_platform_methods(): + """LayerNorm has forward methods for each platform.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + norm = LayerNorm(64) + + assert hasattr(norm, "forward_cuda") + assert hasattr(norm, "forward_hip") + assert hasattr(norm, "forward_xpu") + assert hasattr(norm, "forward_npu") + assert hasattr(norm, "forward_native") + + +def test_rmsnorm_has_platform_methods(): + """RMSNorm has forward methods for each platform.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + norm = RMSNorm(64) + + assert hasattr(norm, "forward_cuda") + assert hasattr(norm, "forward_hip") + assert hasattr(norm, "forward_xpu") + assert hasattr(norm, "forward_npu") + assert hasattr(norm, "forward_native") + + +def test_layernorm_forward_native_directly(): + """LayerNorm.forward_native can be called directly.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim) + x = torch.randn(2, 4, dim) + + out = norm.forward_native(x) + + assert out.shape == (2, 4, dim) + + +def test_rmsnorm_forward_native_directly(): + """RMSNorm.forward_native can be called directly.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + norm = RMSNorm(hidden_size) + x = torch.randn(2, 4, hidden_size) + + out = norm.forward_native(x) + + assert out.shape == (2, 4, hidden_size) + + +# ── Edge case tests ── + + +def test_layernorm_with_large_dim(): + """LayerNorm works with large hidden dimensions.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 4096 + norm = LayerNorm(dim) + x = torch.randn(1, 16, dim) + + out = norm(x) + + assert out.shape == (1, 16, dim) + + +def test_rmsnorm_with_large_dim(): + """RMSNorm works with large hidden dimensions.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 4096 + norm = RMSNorm(hidden_size) + x = torch.randn(1, 16, hidden_size) + + out = norm(x) + + assert out.shape == (1, 16, hidden_size) + + +def test_layernorm_with_single_element_batch(): + """LayerNorm works with batch size of 1.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim) + x = torch.randn(1, 1, dim) + + out = norm(x) + + assert out.shape == (1, 1, dim) + + +def test_rmsnorm_with_single_element_batch(): + """RMSNorm works with batch size of 1.""" + from 
vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + norm = RMSNorm(hidden_size) + x = torch.randn(1, 1, hidden_size) + + out = norm(x) + + assert out.shape == (1, 1, hidden_size) diff --git a/vllm_omni/diffusion/layers/adalayernorm.py b/vllm_omni/diffusion/layers/adalayernorm.py index 4d70ed52f7..d147bdcfeb 100644 --- a/vllm_omni/diffusion/layers/adalayernorm.py +++ b/vllm_omni/diffusion/layers/adalayernorm.py @@ -7,6 +7,7 @@ from vllm.model_executor.layers.linear import ReplicatedLinear from vllm_omni.diffusion.layers.custom_op import CustomOp +from vllm_omni.diffusion.layers.norm import LayerNorm if TYPE_CHECKING: from vllm.model_executor.layers.quantization.base_config import QuantizationConfig @@ -27,7 +28,7 @@ def __init__(self, hidden_size: int, elementwise_affine: bool = False, eps: floa self.eps = eps self.elementwise_affine = elementwise_affine self.hidden_size = hidden_size - self.layernorm = nn.LayerNorm(self.hidden_size, elementwise_affine=self.elementwise_affine, eps=self.eps) + self.layernorm = LayerNorm(self.hidden_size, elementwise_affine=self.elementwise_affine, eps=self.eps) def forward_cuda( self, diff --git a/vllm_omni/diffusion/layers/norm.py b/vllm_omni/diffusion/layers/norm.py new file mode 100644 index 0000000000..6096ad7c37 --- /dev/null +++ b/vllm_omni/diffusion/layers/norm.py @@ -0,0 +1,110 @@ +from importlib.util import find_spec + +import torch +import torch.nn as nn +import torch.nn.functional as F +from vllm.logger import init_logger + +from vllm_omni.diffusion.layers.custom_op import CustomOp + +logger = init_logger(__name__) + +_HAS_MINDIESD = find_spec("mindiesd") is not None + + +class LayerNorm(nn.LayerNorm, CustomOp): + """ + LayerNorm implementation that inherits from both ``nn.LayerNorm`` and ``CustomOp``. + NPU: + Uses ``mindiesd.fast_layernorm(self, x)`` when MindIE-SD is installed. + CUDA / HIP / XPU / native: + Falls back to FP32 nn.LayerNorm implementation. + """ + + def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine: bool = True): + super().__init__(normalized_shape=dim, eps=eps, elementwise_affine=elementwise_affine) + # CustomOp.__init__ cannot be called here because it would re-run + # nn.Module initialization and clear LayerNorm parameters. 
+ self._forward_method = CustomOp.dispatch_forward(self) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self._forward_method(x) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + return self.forward_native(x) + + def forward_hip(self, x: torch.Tensor) -> torch.Tensor: + return self.forward_native(x) + + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + return self.forward_native(x) + + def forward_npu(self, x: torch.Tensor) -> torch.Tensor: + if _HAS_MINDIESD: + try: + from mindiesd import fast_layernorm + + return fast_layernorm(self, x) + except ImportError as e: + logger.warning_once( + "mindiesd.fast_layernorm import failed, falling back to FP32 layer_norm: %s", + e, + ) + + return self.forward_native(x) + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + origin_dtype = x.dtype + return F.layer_norm( + x.float(), + self.normalized_shape, + self.weight.float() if self.weight is not None else None, + self.bias.float() if self.bias is not None else None, + self.eps, + ).to(origin_dtype) + + +class RMSNorm(CustomOp): + def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward_cuda( + self, + x: torch.Tensor, + ) -> torch.Tensor: + return self.forward_native(x) + + def forward_hip( + self, + x: torch.Tensor, + ) -> torch.Tensor: + return self.forward_native(x) + + def forward_npu( + self, + x: torch.Tensor, + ) -> torch.Tensor: + import torch_npu + + output = torch_npu.npu_rms_norm(x, gamma=self.weight, epsilon=self.variance_epsilon)[0] + + return output + + def forward_xpu( + self, + x: torch.Tensor, + ) -> torch.Tensor: + return self.forward_native(x) + + def forward_native( + self, + x: torch.Tensor, + ) -> torch.Tensor: + input_dtype = x.dtype + x = x.to(torch.float32) + variance = x.pow(2).mean(-1, keepdim=True) + out = x * torch.rsqrt(variance + self.variance_epsilon) + out = self.weight.to(torch.float32) * out + return out.to(input_dtype) diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py index b870193a14..d4d81b78eb 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py @@ -11,7 +11,6 @@ from diffusers.models.attention import FeedForward from diffusers.models.embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps from diffusers.models.modeling_outputs import Transformer2DModelOutput -from diffusers.models.normalization import FP32LayerNorm from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -30,6 +29,7 @@ ) from vllm_omni.diffusion.forward_context import get_forward_context from vllm_omni.diffusion.layers.adalayernorm import AdaLayerNorm +from vllm_omni.diffusion.layers.norm import LayerNorm, RMSNorm from vllm_omni.platforms import current_omni_platform logger = init_logger(__name__) @@ -236,9 +236,9 @@ class WanImageEmbedding(nn.Module): def __init__(self, in_features: int, out_features: int, pos_embed_seq_len: int | None = None): super().__init__() - self.norm1 = FP32LayerNorm(in_features) + self.norm1 = LayerNorm(in_features) self.ff = FeedForward(in_features, out_features, mult=1, activation_fn="gelu") - self.norm2 = FP32LayerNorm(out_features) + self.norm2 = LayerNorm(out_features) if pos_embed_seq_len is not None: self.pos_embed = nn.Parameter(torch.zeros(1, pos_embed_seq_len, 
in_features)) else: @@ -378,8 +378,12 @@ def __init__( self.tp_inner_dim = self.num_heads * head_dim # QK normalization using vLLM's RMSNorm - self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) - self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + if get_tensor_model_parallel_world_size() > 1: + self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + else: + self.norm_q = RMSNorm(self.tp_inner_dim, eps=eps) + self.norm_k = RMSNorm(self.tp_inner_dim, eps=eps) self.to_out = RowParallelLinear( self.inner_dim, @@ -498,8 +502,12 @@ def __init__( self.tp_inner_dim = self.num_heads * head_dim # QK normalization - self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) - self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + if get_tensor_model_parallel_world_size() > 1: + self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + else: + self.norm_q = RMSNorm(self.tp_inner_dim, eps=eps) + self.norm_k = RMSNorm(self.tp_inner_dim, eps=eps) # Optional added KV projections for I2V (image embeddings) self.added_kv_proj_dim = added_kv_proj_dim @@ -518,7 +526,10 @@ def __init__( gather_output=False, return_bias=False, ) - self.norm_added_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + if get_tensor_model_parallel_world_size() > 1: + self.norm_added_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + else: + self.norm_added_k = RMSNorm(self.tp_inner_dim, eps=eps) else: self.add_k_proj = None self.add_v_proj = None @@ -637,7 +648,7 @@ def __init__( eps=eps, added_kv_proj_dim=added_kv_proj_dim, ) - self.norm2 = FP32LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity() + self.norm2 = LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity() # 3. Feed-forward self.ffn = WanFeedForward(dim=dim, inner_dim=ffn_dim, dim_out=dim) From e8658b55d14482cdd30b5ee9cc2b6ca8e81d3f15 Mon Sep 17 00:00:00 2001 From: John Liu BUAA Date: Thu, 16 Apr 2026 18:49:59 +0800 Subject: [PATCH 195/204] [Test] Add performance tests for Qwen-Image-Layered model (#2807) Signed-off-by: John Liu BUAA --- .buildkite/test-nightly.yml | 4 +- .../test_qwen_image_layered_vllm_omni.json | 49 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 58e1e55af7..ac43b597d1 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -415,7 +415,9 @@ steps: EXIT2=$$? pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json EXIT3=$$? - if [ $$EXIT1 -eq 0 ] || [ $$EXIT2 -eq 0 ] || [ $$EXIT3 -eq 0 ]; then + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json + EXIT4=$$? 
+ if [ $$EXIT1 -eq 0 ] || [ $$EXIT2 -eq 0 ] || [ $$EXIT3 -eq 0 ] || [ $$EXIT4 -eq 0 ]; then buildkite-agent artifact upload "tests/dfx/perf/results/diffusion_result_*.json" buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" fi diff --git a/tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json new file mode 100644 index 0000000000..3cf13509c8 --- /dev/null +++ b/tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json @@ -0,0 +1,49 @@ +[ + { + "test_name": "test_qwen_image_layered_single_device", + "description": "Single-device baseline", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Layered", + "serve_args": { + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "640x640_steps20_i2i", + "dataset": "random", + "task": "i2i", + "width": 640, + "height": 640, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.02, + "latency_mean": 40.0, + "peak_memory_mb_max": 70000, + "peak_memory_mb_mean": 70000 + } + }, + { + "name": "1024x1024_steps35_i2i", + "dataset": "random", + "task": "i2i", + "width": 1024, + "height": 1024, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.005, + "latency_mean": 80.0, + "peak_memory_mb_max": 70000, + "peak_memory_mb_mean": 70000 + } + } + ] + } +] From 322620fd5774ffaf938395f0c065d703f85eed90 Mon Sep 17 00:00:00 2001 From: Sy03 <1370724210@qq.com> Date: Thu, 16 Apr 2026 20:47:39 +0800 Subject: [PATCH 196/204] [Fix][Fish Speech] Remove redundant get_vocab() in control token encoding (#2842) Signed-off-by: Sy03 <1370724210@qq.com> --- vllm_omni/model_executor/models/fish_speech/prompt_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm_omni/model_executor/models/fish_speech/prompt_utils.py b/vllm_omni/model_executor/models/fish_speech/prompt_utils.py index 923e97b63a..8b8d8559ea 100644 --- a/vllm_omni/model_executor/models/fish_speech/prompt_utils.py +++ b/vllm_omni/model_executor/models/fish_speech/prompt_utils.py @@ -38,10 +38,7 @@ def _encode_plain_text(tokenizer: Any, text: str) -> list[int]: def _encode_control_token(tokenizer: Any, token: str) -> list[int]: - vocab = tokenizer.get_vocab() if hasattr(tokenizer, "get_vocab") else {} - token_id = vocab.get(token) - if token_id is None: - token_id = tokenizer.convert_tokens_to_ids(token) + token_id = tokenizer.convert_tokens_to_ids(token) if token_id is None or token_id == getattr(tokenizer, "unk_token_id", None): raise ValueError(f"Fish Speech tokenizer is missing required control token: {token}") return [int(token_id)] From 45760d61d231d433b01fb798f8180d146d3bc7ab Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Thu, 16 Apr 2026 21:27:43 +0800 Subject: [PATCH 197/204] [Test] Skip tests for known issues in audio and speaker recognition (#2851) --- tests/e2e/online_serving/test_qwen3_omni_expansion.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index 3065439084..06847f3d51 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -371,6 +371,7 @@ def test_mix_to_text_audio_001(omni_server, openai_client) -> None: 
@pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) +@pytest.mark.skip(reason="issue: #2827") def test_audio_in_video_001(omni_server, openai_client) -> None: """ Input Modal: text + video (synthetic MP4 with embedded audio; ``use_audio_in_video`` uses audio from the video). @@ -491,6 +492,7 @@ def test_speaker_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) +@pytest.mark.skip(reason="Known issue: occasional inaccuracy in voice recognition.") def test_speaker_002(omni_server, openai_client) -> None: """ Input Modal: text only (one-word answer constraint). From 2ec91d4dfd4dbfe8cb70ed448b56397c28cdd96b Mon Sep 17 00:00:00 2001 From: Mike Qiu Date: Thu, 16 Apr 2026 22:40:34 +0800 Subject: [PATCH 198/204] [FIX] Preserve YAML default stop words when request sends empty list (#2855) Signed-off-by: Mike_Qiu Co-authored-by: Mike_Qiu Co-authored-by: Claude Opus 4.6 --- .../test_serving_chat_sampling_params.py | 179 ++++++++++++++++++ vllm_omni/entrypoints/openai/serving_chat.py | 2 +- 2 files changed, 180 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/openai_api/test_serving_chat_sampling_params.py b/tests/entrypoints/openai_api/test_serving_chat_sampling_params.py index fa4c1e195d..4190b1fbb1 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_sampling_params.py +++ b/tests/entrypoints/openai_api/test_serving_chat_sampling_params.py @@ -284,6 +284,185 @@ def test_apply_request_overrides_applies_values(serving_chat, mock_request, defa assert result.top_k == 1 # YAML custom param preserved +# ============================================================================= +# Tests for empty-list handling in _apply_request_overrides +# ============================================================================= + + +def test_apply_overrides_empty_stop_list_preserves_default(serving_chat, mocker): + """Test that request.stop=[] does NOT override YAML default stop words.""" + default_params = SamplingParams(temperature=0.5, stop=["<|im_end|>"]) + request = mocker.MagicMock() + request.temperature = None + request.top_p = None + request.top_k = None + request.max_tokens = None + request.min_tokens = None + request.seed = None + request.ignore_eos = None + request.stop = [] # empty list — should be treated as "not set" + request.stop_token_ids = None + request.frequency_penalty = None + request.presence_penalty = None + + result = serving_chat._apply_request_overrides(default_params, request) + + assert result.stop == ["<|im_end|>"] # YAML default preserved + + +def test_apply_overrides_nonempty_stop_list_overrides_default(serving_chat, mocker): + """Test that request.stop=["\\n"] overrides YAML default stop words.""" + default_params = SamplingParams(temperature=0.5, stop=["<|im_end|>"]) + request = mocker.MagicMock() + request.temperature = None + request.top_p = None + request.top_k = None + request.max_tokens = None + request.min_tokens = None + request.seed = None + request.ignore_eos = None + request.stop = ["\n"] # non-empty list — should override + request.stop_token_ids = None + request.frequency_penalty = None + request.presence_penalty = None + + result = serving_chat._apply_request_overrides(default_params, request) + + assert result.stop == ["\n"] # Overridden by request + + +def 
test_apply_overrides_empty_stop_token_ids_preserves_default(serving_chat, mocker): + """Test that request.stop_token_ids=[] does NOT override YAML default.""" + default_params = SamplingParams(temperature=0.5, stop_token_ids=[2, 3]) + request = mocker.MagicMock() + request.temperature = None + request.top_p = None + request.top_k = None + request.max_tokens = None + request.min_tokens = None + request.seed = None + request.ignore_eos = None + request.stop = None + request.stop_token_ids = [] # empty list — should be treated as "not set" + request.frequency_penalty = None + request.presence_penalty = None + + result = serving_chat._apply_request_overrides(default_params, request) + + assert result.stop_token_ids == [2, 3] # YAML default preserved + + +def test_apply_overrides_nonempty_stop_token_ids_overrides_default(serving_chat, mocker): + """Test that request.stop_token_ids=[100] overrides YAML default.""" + default_params = SamplingParams(temperature=0.5, stop_token_ids=[2, 3]) + request = mocker.MagicMock() + request.temperature = None + request.top_p = None + request.top_k = None + request.max_tokens = None + request.min_tokens = None + request.seed = None + request.ignore_eos = None + request.stop = None + request.stop_token_ids = [100] # non-empty list — should override + request.frequency_penalty = None + request.presence_penalty = None + + result = serving_chat._apply_request_overrides(default_params, request) + + assert result.stop_token_ids == [100] # Overridden by request + + +def test_apply_overrides_mixed_empty_and_nonempty_lists(serving_chat, mocker): + """Test mixing empty and non-empty list fields with scalar fields.""" + default_params = SamplingParams( + temperature=0.4, + stop=["<|end|>"], + stop_token_ids=[2], + ) + request = mocker.MagicMock() + request.temperature = 0.9 + request.top_p = None + request.top_k = None + request.max_tokens = None + request.min_tokens = None + request.seed = None + request.ignore_eos = None + request.stop = [] # empty — should NOT override + request.stop_token_ids = [100, 200] # non-empty — SHOULD override + request.frequency_penalty = None + request.presence_penalty = None + + result = serving_chat._apply_request_overrides(default_params, request) + + assert result.temperature == 0.9 # Scalar override works + assert result.stop == ["<|end|>"] # Empty list did NOT override + assert result.stop_token_ids == [100, 200] # Non-empty list DID override + + +def test_apply_overrides_none_scalar_still_preserves_default(serving_chat, mocker): + """Regression: ensure None scalar values still don't override defaults.""" + default_params = SamplingParams(temperature=0.5, max_tokens=100, seed=42) + request = mocker.MagicMock() + request.temperature = None + request.top_p = None + request.top_k = None + request.max_tokens = None + request.min_tokens = None + request.seed = None + request.ignore_eos = None + request.stop = None + request.stop_token_ids = None + request.frequency_penalty = None + request.presence_penalty = None + + result = serving_chat._apply_request_overrides(default_params, request) + + assert result.temperature == 0.5 + assert result.max_tokens == 100 + assert result.seed == 42 + + +def test_apply_overrides_both_lists_empty_preserves_defaults(serving_chat, mocker): + """Test that both stop=[] and stop_token_ids=[] preserve YAML defaults.""" + default_params = SamplingParams( + temperature=0.5, + stop=["<|end|>", "\\n"], + stop_token_ids=[2, 32000], + ) + request = mocker.MagicMock() + request.temperature = None + request.top_p = None 
+ request.top_k = None + request.max_tokens = None + request.min_tokens = None + request.seed = None + request.ignore_eos = None + request.stop = [] + request.stop_token_ids = [] + request.frequency_penalty = None + request.presence_penalty = None + + result = serving_chat._apply_request_overrides(default_params, request) + + assert result.stop == ["<|end|>", "\\n"] + assert result.stop_token_ids == [2, 32000] + + +def test_build_sampling_params_list_empty_stop_preserves_yaml(serving_chat, mock_request): + """Test that empty stop list in request preserves YAML defaults via + _build_sampling_params_list_from_request.""" + mock_request.stop = [] + mock_request.stop_token_ids = [] + + result = serving_chat._build_sampling_params_list_from_request(mock_request) + + comprehension_params = result[0] + # Empty lists should NOT override — YAML defaults are preserved + assert comprehension_params.stop == [] + assert comprehension_params.stop_token_ids == [] + + # ============================================================================= # Tests for _get_comprehension_stage_index # ============================================================================= diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 4b3a7045ca..34ddbbd302 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -728,7 +728,7 @@ def _apply_request_overrides( for field_name in self._OPENAI_SAMPLING_FIELDS: value = getattr(request, field_name, None) - if value is not None: + if (value is not None and not isinstance(value, list)) or (isinstance(value, list) and len(value) > 0): setattr(params, field_name, value) return params From 7d64a7c9964ed7f285fec120dcb7396e027d600c Mon Sep 17 00:00:00 2001 From: Sy03 <1370724210@qq.com> Date: Thu, 16 Apr 2026 23:07:48 +0800 Subject: [PATCH 199/204] [BugFix][VoxCPM2]: split multichar Chinese tokens to match training tokenization (#2832) Signed-off-by: Sy03 <1370724210@qq.com> --- .../entrypoints/openai/serving_speech.py | 25 +++++++- .../models/voxcpm2/voxcpm2_talker.py | 63 ++++++++++++++++++- 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 1f78f5691b..3eaf18111c 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -216,6 +216,8 @@ def __init__(self, *args, **kwargs): "Re-upload voices after each restart if needed." 
) self._tts_tokenizer = None + self._voxcpm2_tokenizer = None + self._voxcpm2_split_map: dict[int, list[int]] = {} logger.info(f"Loaded {len(self.supported_speakers)} supported speakers: {sorted(self.supported_speakers)}") @@ -812,6 +814,25 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non return None # VoxCPM2 accepts any text input return self._validate_qwen_tts_request(request) + def _voxcpm2_encode(self, text: str) -> list[int]: + """Tokenize text for VoxCPM2, splitting multichar Chinese tokens.""" + from vllm_omni.model_executor.models.voxcpm2.voxcpm2_talker import ( + build_cjk_split_map, + split_multichar_chinese, + ) + + if self._voxcpm2_tokenizer is None: + from transformers import AutoTokenizer + + model_name = self.engine_client.model_config.model + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + self._voxcpm2_split_map = build_cjk_split_map(tokenizer) + self._voxcpm2_tokenizer = tokenizer + logger.info("VoxCPM2 serving: built multichar split map (%d entries)", len(self._voxcpm2_split_map)) + + ids = self._voxcpm2_tokenizer.encode(text, add_special_tokens=True) + return split_multichar_chinese(ids, self._voxcpm2_split_map) + def _validate_ref_audio_format(self, ref_audio: str) -> str | None: """Validate ref_audio is a supported URI format. Returns error or None.""" if not ( @@ -1508,7 +1529,9 @@ async def _prepare_speech_generation( if request.ref_audio is not None: wav_list, sr = await self._resolve_ref_audio(request.ref_audio) additional["reference_audio"] = [[wav_list, sr]] - prompt = {"prompt": request.input} + # Pre-split multichar Chinese tokens (VoxCPM2 was trained with single-char CJK IDs). + token_ids = self._voxcpm2_encode(request.input) + prompt: dict[str, Any] = {"prompt_token_ids": token_ids} if additional: prompt["additional_information"] = additional elif self._is_tts: diff --git a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py index 02bcae821e..b666e41ebc 100644 --- a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py +++ b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py @@ -41,6 +41,45 @@ _ENABLE_PROFILING = os.environ.get("VOXCPM2_PROFILE", "0") == "1" +def is_cjk_char(c: str) -> bool: + """Check if a character is a CJK ideograph.""" + cp = ord(c) + return ( + 0x4E00 <= cp <= 0x9FFF # CJK Unified Ideographs + or 0x3400 <= cp <= 0x4DBF # Extension A + or 0xF900 <= cp <= 0xFAFF # Compatibility Ideographs + or 0x20000 <= cp <= 0x2A6DF # Extension B + or 0x2A700 <= cp <= 0x2B73F # Extension C + or 0x2B740 <= cp <= 0x2B81F # Extension D + or 0x2F800 <= cp <= 0x2FA1F # Compatibility Supplement + ) + + +def build_cjk_split_map(tokenizer: Any) -> dict[int, list[int]]: + """Build {multichar_cjk_token_id: [single_char_ids]} from tokenizer vocab.""" + vocab = tokenizer.get_vocab() + split_map: dict[int, list[int]] = {} + for token, token_id in vocab.items(): + clean = token.replace("\u2581", "") + if len(clean) >= 2 and all(is_cjk_char(c) for c in clean): + char_ids = tokenizer.convert_tokens_to_ids(list(clean)) + if all(cid != tokenizer.unk_token_id for cid in char_ids): + split_map[token_id] = char_ids + return split_map + + +def split_multichar_chinese(token_ids: list[int], split_map: dict[int, list[int]]) -> list[int]: + """Replace multichar Chinese token IDs with single-char IDs (idempotent).""" + result: list[int] = [] + for tid in token_ids: + expansion = split_map.get(tid) + if expansion is not None: + 
result.extend(expansion) + else: + result.append(tid) + return result + + def _encode_raw_audio( tts: nn.Module, samples: list[float] | torch.Tensor, @@ -354,6 +393,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._cuda_graph_warmup_steps = 0 self._cuda_graph_warmup_threshold = 3 + self._multichar_zh_split: dict[int, list[int]] | None = None + self._active_states: dict[str, _RequestState] = {} self._current_request_id: str | None = None self._pending_requests: list[tuple[str, bool, torch.Tensor | None, int]] = [] @@ -985,6 +1026,17 @@ def make_omni_output(self, model_outputs: torch.Tensor | OmniOutput, **kwargs: A return OmniOutput(text_hidden_states=model_outputs, multimodal_outputs=mm) + # -------------------- Chinese token splitting -------------------- + + def _get_multichar_zh_split(self) -> dict[int, list[int]]: + """Lazy-build {multichar_chinese_token_id: [char_id, ...]} map.""" + if self._multichar_zh_split is not None: + return self._multichar_zh_split + base_tokenizer = self.tts.text_tokenizer.tokenizer + self._multichar_zh_split = build_cjk_split_map(base_tokenizer) + logger.info("VoxCPM2: built multichar Chinese split map (%d entries)", len(self._multichar_zh_split)) + return self._multichar_zh_split + # -------------------- preprocess / postprocess -------------------- def preprocess( @@ -1011,8 +1063,17 @@ def preprocess( for rid in [r for r, s in self._active_states.items() if r not in pending_ids and s.prefill_completed]: self._cleanup_request(rid) - # VoxCPM2Tokenizer does char-level Chinese splitting, so use input_ids directly token_ids = input_ids.tolist() + # Fail-fast: unsplit multichar Chinese IDs in input_ids means the + # serving layer didn't pre-split. Silent fixup here would cause + # input_ids/embeds length mismatch (scheduler slot count is fixed). + split_map = self._get_multichar_zh_split() + if split_map and any(tid in split_map for tid in token_ids): + raise ValueError( + "VoxCPM2 preprocess received unsplit multichar Chinese " + "token IDs. The serving layer must send prompt_token_ids " + "with single-char CJK IDs (see _voxcpm2_encode)." 
+ ) if token_ids and token_ids[0] == self.config.bos_token_id: token_ids = token_ids[1:] From c3ca5daafb05acec828a66e3ba5f84951715fcf2 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2587297563@qq.com> Date: Thu, 16 Apr 2026 23:15:25 +0800 Subject: [PATCH 200/204] Feat/Add HunyuanImage-3.0-Instruct ar part support: (#2713) Signed-off-by: TaffyOfficial <2324465096@qq.com> Co-authored-by: TaffyOfficial <2324465096@qq.com> Co-authored-by: Claude Opus 4.6 (1M context) --- .../hunyuan_image3/prompt_utils.py | 88 ++++++++ .../test_hunyuan_image3_sampler.py | 190 +++++++++++++++++ .../test_hunyuanimage3_text2img.py | 2 +- .../models/hunyuan_image3/hunyuan_image3.py | 195 ++++++++++++++++++ .../stage_configs/hunyuan_image3_i2t.yaml | 44 ++++ .../stage_configs/hunyuan_image3_it2i.yaml | 78 +++++++ .../stage_configs/hunyuan_image3_moe.yaml | 81 -------- .../stage_configs/hunyuan_image3_t2t.yaml | 45 ++++ .../stage_input_processors/hunyuan_image3.py | 123 +++++++++++ vllm_omni/patch.py | 52 +++++ 10 files changed, 816 insertions(+), 82 deletions(-) create mode 100644 examples/offline_inference/hunyuan_image3/prompt_utils.py create mode 100644 tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_sampler.py create mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml create mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i.yaml delete mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml create mode 100644 vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml create mode 100644 vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py diff --git a/examples/offline_inference/hunyuan_image3/prompt_utils.py b/examples/offline_inference/hunyuan_image3/prompt_utils.py new file mode 100644 index 0000000000..a5ef8e1536 --- /dev/null +++ b/examples/offline_inference/hunyuan_image3/prompt_utils.py @@ -0,0 +1,88 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Prompt construction utilities for HunyuanImage-3.0-Instruct examples. + +Wraps system_prompt.get_system_prompt() with task-aware presets so that +examples and tests don't need to manually concatenate system prompts, +, , and tags. 
+ +Usage: + from prompt_utils import build_prompt + + # IT2I (image editing, think+recaption mode) + prompt = build_prompt("Make the petals neon pink", task="it2i_think") + + # I2T (image understanding) + prompt = build_prompt("Describe the content of the picture.", task="i2t") +""" + +from __future__ import annotations + +from vllm_omni.diffusion.models.hunyuan_image3.system_prompt import ( + get_system_prompt, +) + +# task → (sys_type, bot_task, trigger_tag) +# trigger_tag: "", "", or None +_TASK_PRESETS: dict[str, tuple[str, str | None, str | None]] = { + # Pure text generation (text → text, no image) + "t2t": ("en_unified", None, None), + # Image understanding (image → text) + "i2t": ("en_unified", None, None), + # Image editing (image+text → image), think+recaption mode + "it2i_think": ("en_unified", "think", ""), + # Image editing, recaption-only mode + "it2i_recaption": ("en_unified", "recaption", ""), + # Text-to-image, think mode + "t2i_think": ("en_unified", "think", ""), + # Text-to-image, recaption mode + "t2i_recaption": ("en_unified", "recaption", ""), + # Text-to-image, vanilla (no CoT) + "t2i_vanilla": ("en_vanilla", "image", None), +} + + +def build_prompt( + user_prompt: str, + task: str = "it2i_think", + sys_type: str | None = None, + custom_system_prompt: str | None = None, +) -> str: + """Build a complete HunyuanImage-3.0 prompt with auto-selected system + prompt and mode trigger tags. + + Args: + user_prompt: The user's raw instruction or question. + task: One of the preset task keys (see _TASK_PRESETS). + sys_type: Override the preset's sys_type for get_system_prompt(). + custom_system_prompt: Custom system prompt text (used when + sys_type="custom"). + + Returns: + Fully formatted prompt string ready for Omni.generate(). + """ + if task not in _TASK_PRESETS: + raise ValueError(f"Unknown task {task!r}. Choose from: {sorted(_TASK_PRESETS)}") + + preset_sys_type, preset_bot_task, trigger_tag = _TASK_PRESETS[task] + effective_sys_type = sys_type or preset_sys_type + + system_prompt = get_system_prompt(effective_sys_type, preset_bot_task, custom_system_prompt) + sys_text = system_prompt.strip() if system_prompt else "" + + has_image_input = task.startswith("i2t") or task.startswith("it2i") + + parts = ["<|startoftext|>"] + if sys_text: + parts.append(sys_text) + # Instruct conversation template: \n\nUser: ... \n\nAssistant: + parts.append("\n\nUser: ") + if has_image_input: + parts.append("") + parts.append(user_prompt) + parts.append("\n\nAssistant: ") + if trigger_tag: + parts.append(trigger_tag) + + return "".join(parts) diff --git a/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_sampler.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_sampler.py new file mode 100644 index 0000000000..51f6a85f58 --- /dev/null +++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_image3_sampler.py @@ -0,0 +1,190 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for HunyuanImage3 AR sampler logic (stage transitions, +ratio restriction, comprehension blocking).""" + +import pytest +import torch + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + +# Fake token IDs for testing (avoid importing the real model). 
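+# The constants below stand in for the model's special-token IDs so the
+# sampler logic can be exercised without loading real weights. The
+# generation-mode flow these tests walk through is:
+#   END_OF_THINK     -> forces RECAPTION
+#   END_OF_RECAPTION -> forces ANSWER, then BOI, then SIZE_TOKEN
+#   SIZE_TOKEN       -> vocab restricted to ratio tokens (greedy pick)
+#   ratio token      -> forces EOS
+# Comprehension mode instead masks BOI, SIZE_TOKEN and all ratio tokens.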
+END_OF_THINK = 100 +RECAPTION = 101 +END_OF_RECAPTION = 102 +ANSWER = 103 +BOI = 104 +SIZE_TOKEN = 105 +EOS = 106 +RATIO_START = 200 +RATIO_END = 210 +RATIO_OTHER_START = 220 +RATIO_OTHER_END = 223 + + +class FakeSamplerModel: + """Minimal stub that replicates the sampler-relevant attributes of + HunyuanImage3ForConditionalGeneration without loading real weights.""" + + def __init__(self, *, is_comprehension: bool = False): + self._is_comprehension = is_comprehension + self._eos_token_id = EOS + self._end_of_think_id = END_OF_THINK + self._recaption_id = RECAPTION + self._end_of_recaption_id = END_OF_RECAPTION + self._answer_id = ANSWER + self._mrope_boi_token_id = BOI + self._size_token_id = SIZE_TOKEN + self._start_ratio_id = RATIO_START + self._end_ratio_id = RATIO_END + self._ratio_other_slices = [(RATIO_OTHER_START, RATIO_OTHER_END + 1)] + self._all_ratio_ids = set(range(RATIO_START, RATIO_END + 1)) + self._all_ratio_ids.update(range(RATIO_OTHER_START, RATIO_OTHER_END + 1)) + + self._stage_transitions: dict[int, list[int]] = {} + if not is_comprehension: + self._stage_transitions[END_OF_THINK] = [RECAPTION] + self._stage_transitions[END_OF_RECAPTION] = [ANSWER, BOI, SIZE_TOKEN] + + self._blocked_token_ids: set[int] = set() + if is_comprehension: + self._blocked_token_ids.update([BOI, SIZE_TOKEN]) + self._blocked_token_ids.update(self._all_ratio_ids) + + # Bind the real methods from the model class. + from vllm_omni.model_executor.models.hunyuan_image3.hunyuan_image3 import ( + HunyuanImage3ForConditionalGeneration as _Real, + ) + + _get_forced_token = _Real._get_forced_token + _apply_ratio_restriction = _Real._apply_ratio_restriction + + +class TestGetForcedToken: + """Tests for the stateless _get_forced_token method.""" + + def setup_method(self): + self.model = FakeSamplerModel(is_comprehension=False) + + def test_no_trigger_returns_none(self): + assert self.model._get_forced_token([1, 2, 3]) is None + + def test_empty_history_returns_none(self): + assert self.model._get_forced_token([]) is None + + def test_end_of_think_forces_recaption(self): + assert self.model._get_forced_token([END_OF_THINK]) == RECAPTION + + def test_end_of_think_completed(self): + assert self.model._get_forced_token([END_OF_THINK, RECAPTION]) is None + + def test_end_of_recaption_forces_answer(self): + tokens = [END_OF_THINK, RECAPTION, END_OF_RECAPTION] + assert self.model._get_forced_token(tokens) == ANSWER + + def test_end_of_recaption_forces_boi_after_answer(self): + tokens = [END_OF_THINK, RECAPTION, END_OF_RECAPTION, ANSWER] + assert self.model._get_forced_token(tokens) == BOI + + def test_end_of_recaption_forces_size_after_boi(self): + tokens = [END_OF_THINK, RECAPTION, END_OF_RECAPTION, ANSWER, BOI] + assert self.model._get_forced_token(tokens) == SIZE_TOKEN + + def test_full_sequence_complete(self): + tokens = [END_OF_THINK, RECAPTION, END_OF_RECAPTION, ANSWER, BOI, SIZE_TOKEN] + assert self.model._get_forced_token(tokens) is None + + def test_diverged_history_returns_none(self): + tokens = [END_OF_RECAPTION, 999] # 999 != ANSWER + assert self.model._get_forced_token(tokens) is None + + def test_later_trigger_takes_precedence(self): + tokens = [END_OF_THINK, RECAPTION, END_OF_RECAPTION] + assert self.model._get_forced_token(tokens) == ANSWER + + def test_trigger_with_extra_tokens_before(self): + tokens = [1, 2, 3, END_OF_THINK] + assert self.model._get_forced_token(tokens) == RECAPTION + + +class TestComprehensionBlocking: + """Tests for comprehension mode token blocking.""" + + def 
test_blocked_tokens_masked(self): + model = FakeSamplerModel(is_comprehension=True) + vocab_size = 300 + logits = torch.zeros(1, vocab_size) + logits[0, BOI] = 5.0 + logits[0, SIZE_TOKEN] = 3.0 + logits[0, RATIO_START] = 2.0 + min_score = torch.finfo(logits.dtype).min + + for tid in model._blocked_token_ids: + if tid < vocab_size: + logits[0, tid] = min_score + + assert logits[0, BOI].item() == min_score + assert logits[0, SIZE_TOKEN].item() == min_score + assert logits[0, RATIO_START].item() == min_score + + def test_non_blocked_tokens_preserved(self): + model = FakeSamplerModel(is_comprehension=True) + vocab_size = 300 + logits = torch.zeros(1, vocab_size) + logits[0, 50] = 7.0 + min_score = torch.finfo(logits.dtype).min + + for tid in model._blocked_token_ids: + if tid < vocab_size: + logits[0, tid] = min_score + + assert logits[0, 50].item() == 7.0 + + +class TestRatioRestriction: + """Tests for _apply_ratio_restriction (greedy: only argmax ratio survives).""" + + def test_greedy_selects_single_ratio_token(self): + model = FakeSamplerModel(is_comprehension=False) + vocab_size = 300 + logits = torch.zeros(1, vocab_size) + logits[0, RATIO_START + 3] = 10.0 + logits[0, RATIO_START + 1] = 5.0 + logits[0, 50] = 20.0 # non-ratio, should be masked + min_score = torch.finfo(logits.dtype).min + + model._apply_ratio_restriction(logits, 0, min_score) + + assert logits[0, RATIO_START + 3].item() == 0 + assert logits[0, RATIO_START + 1].item() == min_score + assert logits[0, 50].item() == min_score + + def test_extra_ratio_slices_considered(self): + model = FakeSamplerModel(is_comprehension=False) + vocab_size = 300 + logits = torch.zeros(1, vocab_size) + logits[0, RATIO_OTHER_START] = 15.0 + logits[0, RATIO_START] = 5.0 + min_score = torch.finfo(logits.dtype).min + + model._apply_ratio_restriction(logits, 0, min_score) + + assert logits[0, RATIO_OTHER_START].item() == 0 + assert logits[0, RATIO_START].item() == min_score + + +class TestForceEosAfterRatio: + """Tests that a ratio token as last_token forces EOS.""" + + def test_ratio_token_forces_eos(self): + model = FakeSamplerModel(is_comprehension=False) + vocab_size = 300 + logits = torch.randn(1, vocab_size) + min_score = torch.finfo(logits.dtype).min + + logits[0].fill_(min_score) + logits[0, model._eos_token_id] = 0 + + assert logits[0, EOS].item() == 0 + non_eos_max = logits[0, :EOS].max().item() + assert non_eos_max == min_score diff --git a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py index 6898763e40..ec4f4693d7 100644 --- a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py +++ b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py @@ -17,7 +17,7 @@ MODEL_NAME = "tencent/HunyuanImage-3.0" LOCAL_CLIP_PATH = "openai/clip-vit-base-patch32" REPO_ROOT = Path(__file__).resolve().parents[3] -STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image3_moe.yaml" +STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image3_t2i.yaml" pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion] diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index 5c280ddcf4..6304eeab29 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -77,7 +77,9 @@ from vllm.sequence import IntermediateTensors from 
vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils.tensor_schema import TensorSchema +from vllm.v1.outputs import SamplerOutput from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.sampler import Sampler from vllm_omni.model_executor.models.hunyuan_image3.autoencoder_kl_3d import AutoencoderKLConv3D from vllm_omni.model_executor.models.hunyuan_image3.siglip2 import LightProjector, Siglip2VisionTransformer @@ -175,8 +177,11 @@ def contains_unexpected_keyword(name, keywords): return True return False + skipped_unexpected: set[str] = set() + for name, loaded_weight in weights: if contains_unexpected_keyword(name, unexpected_keywords): + skipped_unexpected.add(name) continue if "rotary_emb.inv_freq" in name: @@ -362,6 +367,17 @@ def contains_unexpected_keyword(name, keywords): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) + + if skipped_unexpected: + logger.warning_once( + "Skipped %d weights matching unexpected_keywords " + "(e.g. vae, vision_model, patch_embed, timestep_emb). " + "If upstream renamed components, these may be silently " + "lost. Skipped names: %s", + len(skipped_unexpected), + sorted(skipped_unexpected)[:10], + ) + return loaded_params @@ -1149,6 +1165,8 @@ class HunyuanImage3ForConditionalGeneration(nn.Module, SupportsMultiModal, Suppo HunyuanImage3Inputs: TypeAlias = HunyuanImage3PixelInputs + prefer_model_sampler = True + packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -1199,6 +1217,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() + # --- AR-stage components --- + # These are needed for image encoding in the AR stage. + # If a future text-only stage is added, gate on vllm_config.model_config.model_stage. + # vae self.vae = AutoencoderKLConv3D.from_config(config.vae) self.patch_embed = UNetDown( @@ -1226,6 +1248,63 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._mrope_joint_img_sep_token_id = tokenizer.convert_tokens_to_ids("") self._mrope_max_num_patches = config.vit_processor.get("max_num_patches", 729) + # Special token IDs for logits processors (stage transitions). + # These mirror the official tokenization_hunyuan_image_3.py setup. + self._end_of_think_id = tokenizer.convert_tokens_to_ids("") + self._recaption_id = tokenizer.convert_tokens_to_ids("") + self._end_of_recaption_id = tokenizer.convert_tokens_to_ids("") + self._answer_id = tokenizer.convert_tokens_to_ids("") + self._end_of_answer_id = tokenizer.convert_tokens_to_ids("") + image_base_size = getattr(config, "image_base_size", 1024) + self._size_token_id = tokenizer.convert_tokens_to_ids(f"") + self._start_ratio_id = tokenizer.convert_tokens_to_ids("") + self._end_ratio_id = tokenizer.convert_tokens_to_ids("") + ratio_33 = tokenizer.convert_tokens_to_ids("") + ratio_36 = tokenizer.convert_tokens_to_ids("") + self._ratio_other_slices = [(ratio_33, ratio_36 + 1)] + # Build the full set of ratio token IDs for use as stop tokens. + self._all_ratio_ids = set(range(self._start_ratio_id, self._end_ratio_id + 1)) + for s, e in self._ratio_other_slices: + self._all_ratio_ids.update(range(s, e)) + + # Determine mode: comprehension (I2T/T2T) vs generation (IT2I/T2I). 
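+ # engine_output_type is unset (None) or "text" in the I2T/T2T stage
+ # configs and "latent" in the IT2I AR stage config, so it doubles as the
+ # comprehension-vs-generation mode switch here.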
+ engine_output_type = getattr(vllm_config.model_config, "engine_output_type", None) + self._is_comprehension = engine_output_type in (None, "text") + + # For comprehension mode, block image generation tokens but allow + # text structure tokens (, , etc.) so the model can + # follow its natural generation pattern. Stop tokens in YAML will + # terminate at or EOS. + self._blocked_token_ids: set[int] = set() + if self._is_comprehension: + self._blocked_token_ids.update( + [ + self._mrope_boi_token_id, # + self._mrope_eoi_token_id, # + self._size_token_id, # + ] + ) + self._blocked_token_ids.update(self._all_ratio_ids) + + # For generation mode, build stage transition map. + # Official logic: → [], + # → [, , ] + # After , restrict vocab to ratio tokens only. + # Stage-transition forced sequences, keyed by trigger token. + self._stage_transitions: dict[int, list[int]] = {} + if not self._is_comprehension: + self._stage_transitions[self._end_of_think_id] = [ + self._recaption_id, + ] + self._stage_transitions[self._end_of_recaption_id] = [ + self._answer_id, + self._mrope_boi_token_id, + self._size_token_id, + ] + + self._sampler: Sampler | None = None + self._eos_token_id: int = tokenizer.eos_token_id + self._replace_rotary_embeddings() def _replace_rotary_embeddings(self): @@ -1257,6 +1336,12 @@ def _replace_rotary_embeddings(self): head_dim, rope_theta, ) + if replaced == 0: + raise RuntimeError( + "HunyuanImage3: _replace_rotary_embeddings replaced 0 layers. " + "The custom interleaved 2D mRoPE is not active — model outputs " + "will be incorrect. Check that model.layers[*].self_attn.rotary_emb exists." + ) def _parse_and_validate_image_input( self, @@ -1274,6 +1359,10 @@ def _parse_and_validate_image_input( if vit_pixel_values is None or vae_pixel_values is None: return None + # Handle empty batch (e.g., during profiling with 0 images / T2T mode) + if vit_pixel_values.numel() == 0 or vae_pixel_values.numel() == 0: + return None + return HunyuanImage3PixelInputs( type="pixel_values", pixel_values={ @@ -1472,6 +1561,112 @@ def compute_logits( logits = self.logits_processor(self.lm_head, hidden_states) return logits + # ------------------------------------------------------------------ + # Custom sampler — applies HunyuanImage3-specific logits processors + # before the standard sampling step. + # + # Comprehension (I2T / T2T): + # Block generation-specific special tokens so sampling can't + # accidentally produce , , ratio tokens, etc. + # + # Generation (IT2I / T2I think): + # 1. _StageTransitionLogitsProcessor — force token sequences at + # transition boundaries (, etc.) + # 2. _ConditionalSliceVocabLogitsProcessor — after , + # restrict vocab to ratio tokens only (greedy). 
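+ # Worked example (generation mode): once the decoded history ends with
+ # _end_of_recaption_id, the next three sampled tokens are forced to
+ # _answer_id, _mrope_boi_token_id and _size_token_id in that order; the
+ # step after _size_token_id keeps only ratio-token logits and takes the
+ # argmax, and a ratio token as last_token then forces EOS.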
+ # ------------------------------------------------------------------ + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> SamplerOutput | None: + if logits is None or logits.numel() == 0: + return None + + if self._sampler is None: + self._sampler = Sampler() + + min_score = torch.finfo(logits.dtype).min + + assert logits.shape[0] == 1, f"HunyuanImage3 sampler requires max_num_seqs=1, got batch size {logits.shape[0]}" + + for req_idx in range(logits.shape[0]): + decoded_tokens: list[int] = ( + sampling_metadata.output_token_ids[req_idx] if req_idx < len(sampling_metadata.output_token_ids) else [] + ) + last_token = decoded_tokens[-1] if decoded_tokens else -1 + + if self._is_comprehension: + for tid in self._blocked_token_ids: + logits[req_idx, tid] = min_score + else: + forced = self._get_forced_token(decoded_tokens) + if forced is not None: + logits[req_idx].fill_(min_score) + logits[req_idx, forced] = 0 + elif last_token == self._size_token_id: + self._apply_ratio_restriction(logits, req_idx, min_score) + elif last_token in self._all_ratio_ids: + logits[req_idx].fill_(min_score) + logits[req_idx, self._eos_token_id] = 0 + + return self._sampler(logits=logits, sampling_metadata=sampling_metadata) + + def _get_forced_token(self, decoded_tokens: list[int]) -> int | None: + """Derive the next forced token from output history (stateless). + + Scans decoded_tokens backwards for the most recent trigger token, + then prefix-matches the forced sequence against what followed. + Returns the next token to force, or None if the sequence is complete + or history has diverged from the expected forced sequence. + """ + for i in range(len(decoded_tokens) - 1, -1, -1): + trigger = decoded_tokens[i] + if trigger not in self._stage_transitions: + continue + + forced_seq = self._stage_transitions[trigger] + emitted = decoded_tokens[i + 1 :] + + matched = 0 + for expected, actual in zip(forced_seq, emitted): + if actual != expected: + # History diverged from the expected forced sequence. + # Stop applying transition forcing for safety. + return None + matched += 1 + + if matched < len(forced_seq): + return forced_seq[matched] + return None + + return None + + def _apply_ratio_restriction( + self, + logits: torch.Tensor, + req_idx: int, + min_score: float, + ) -> None: + """Port of official _ConditionalSliceVocabLogitsProcessor.__call__. + + After the size token, only allow ratio tokens and pick greedily. + """ + original = logits[req_idx].clone() + logits[req_idx].fill_(min_score) + # Allow primary ratio range. + logits[req_idx, self._start_ratio_id : self._end_ratio_id + 1] = original[ + self._start_ratio_id : self._end_ratio_id + 1 + ] + # Allow extra ratio slices. + for s, e in self._ratio_other_slices: + logits[req_idx, s:e] = original[s:e] + # Force greedy: keep only the argmax. + max_id = logits[req_idx].argmax().item() + logits[req_idx].fill_(min_score) + logits[req_idx, max_id] = 0 + def make_empty_intermediate_tensors( self, batch_size: int, dtype: torch.dtype, device: torch.device ) -> IntermediateTensors: diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml new file mode 100644 index 0000000000..203b54f257 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/hunyuan_image3_i2t.yaml @@ -0,0 +1,44 @@ +# Stage config for HunyuanImage-3.0 Image-to-Text (I2T / image understanding). +# Single LLM stage: AR model reads image + text prompt, generates text output. 
+ +stage_args: + - stage_id: 0 + stage_type: llm + runtime: + process: true + devices: "0,1,2,3" + max_batch_size: 1 + requires_multimodal_data: true + engine_args: + model_stage: AR + max_num_seqs: 1 + model_arch: HunyuanImage3ForCausalMM + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.95 + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + max_num_batched_tokens: 32768 + tensor_parallel_size: 4 + pipeline_parallel_size: 1 + hf_overrides: + rope_parameters: + mrope_section: [0, 32, 32] + rope_type: default + is_comprehension: true + final_output: true + final_output_type: text + default_sampling_params: + temperature: 0.0 + top_p: 0.95 + top_k: 1024 + max_tokens: 2048 + stop_token_ids: [127957, 128026] # <|endoftext|>, + detokenize: True + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i.yaml new file mode 100644 index 0000000000..9f6adece0f --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/hunyuan_image3_it2i.yaml @@ -0,0 +1,78 @@ +# Stage config for HunyuanImage-3.0 Image+Text-to-Image (image editing). +# Stage 0: AR (HunyuanImage3ForConditionalGeneration) — reads (image, text), emits latent tokens +# Stage 1: Diffusion (HunyuanImage3Pipeline / DiT + VAE) — denoise + decode latents → image + +stage_args: + # Stage 0: AR Model + - stage_id: 0 + stage_type: llm + runtime: + process: true + devices: "0,1,2,3" + max_batch_size: 1 + requires_multimodal_data: true # AR needs the original image + engine_args: + model_stage: AR + model_arch: HunyuanImage3ForCausalMM + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.95 + enforce_eager: true + trust_remote_code: true + engine_output_type: latent # AR outputs latent for DiT + enable_prefix_caching: false + max_num_batched_tokens: 32768 + tensor_parallel_size: 4 + pipeline_parallel_size: 1 + hf_overrides: + rope_parameters: + mrope_section: [0, 32, 32] + rope_type: default + is_comprehension: false # Generation task, not comprehension + final_output: false # AR is not the final output + default_sampling_params: + temperature: 0.6 + top_p: 0.95 + top_k: 1024 + max_tokens: 4096 + stop_token_ids: [127957] # <|endoftext|> + detokenize: false + + # Stage 1: Diffusion (DiT + VAE) + # Receives latents from AR stage, performs denoising + VAE decode + - stage_id: 1 + stage_type: diffusion + runtime: + process: true + devices: "4,5,6,7" + max_batch_size: 1 + requires_multimodal_data: true # May need condition images + engine_args: + model_stage: dit + model_arch: HunyuanImage3ForCausalMM + enforce_eager: true + trust_remote_code: true + distributed_executor_backend: "mp" + parallel_config: + tensor_parallel_size: 4 + enable_expert_parallel: true + omni_kv_config: + need_recv_cache: true + engine_input_source: [0] # Input from AR stage + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.hunyuan_image3.ar2diffusion + final_output: true + final_output_type: image + default_sampling_params: + num_inference_steps: 50 + guidance_scale: 2.5 + +# Top-level runtime config +runtime: + enabled: true + defaults: + window_size: -1 # Trigger downstream only after full upstream completion + max_inflight: 1 # Process serially within each 
stage + edges: + - from: 0 # AR → Diffusion + to: 1 + window_size: -1 diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml deleted file mode 100644 index 808b4619f7..0000000000 --- a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml +++ /dev/null @@ -1,81 +0,0 @@ -# Stage config for running Hunyuan-Image3.0 for multi-stage omni runtime. -# Stage 0: AR Model (vLLM implementation) - -# The following config has been verified on 8x L40S-48G GPU. -modes: - - mode: text-to-image - stages: [1] - - mode: image-to-text - stages: [0] -stage_args: - - stage_id: 0 - stage_type: llm # Use llm stage type for AR stages - runtime: - process: true # Run this stage in a separate process - devices: "0,1,2,3,4,5,6,7" # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device) - engine_args: - model_stage: AR - max_num_seqs: 1 - model_arch: HunyuanImage3ForCausalMM - worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker - scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.3 - enforce_eager: true # Now we only support eager mode - trust_remote_code: true - engine_output_type: latent - enable_prefix_caching: false - max_num_batched_tokens: 32768 - tensor_parallel_size: 8 - pipeline_parallel_size: 1 - hf_overrides: - rope_parameters: - mrope_section: [0, 32, 32] - rope_type: default - is_comprehension: true - final_output: true - final_output_type: text - default_sampling_params: - temperature: 0.0 - top_p: 1.0 - top_k: -1 - max_tokens: 2048 - seed: 42 - detokenize: True - repetition_penalty: 1.1 - - stage_id: 1 - stage_type: diffusion - runtime: - process: true - devices: "0,1,2,3,4,5,6,7" - max_batch_size: 1 - engine_args: - model_stage: diffusion - enforce_eager: true - distributed_executor_backend: "mp" - vae_use_slicing: false - vae_use_tiling: false - cache_backend: null - cache_config: null - enable_cache_dit_summary: false - parallel_config: - pipeline_parallel_size: 1 - data_parallel_size: 1 - tensor_parallel_size: 8 - enable_expert_parallel: false - sequence_parallel_size: 1 - ulysses_degree: 1 - ring_degree: 1 - cfg_parallel_size: 1 - vae_patch_parallel_size: 1 - use_hsdp: false - hsdp_shard_size: -1 - hsdp_replicate_size: 1 - final_output: true - final_output_type: image - -# Top-level runtime config (concise): default windows and stage edges -runtime: - enabled: true - defaults: - window_size: -1 # Simplified: trigger downstream only after full upstream completion - max_inflight: 1 # Simplified: process serially within each stage diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml new file mode 100644 index 0000000000..60da8e0bc7 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2t.yaml @@ -0,0 +1,45 @@ +# Stage config for HunyuanImage-3.0 Text-to-Text (T2T / pure text generation). +# Single LLM stage: AR model reads text prompt only, generates text output. +# Sampling params aligned with official generation_config.json. 
+ +stage_args: + - stage_id: 0 + stage_type: llm + runtime: + process: true + devices: "0,1,2,3" + max_batch_size: 1 + requires_multimodal_data: false + engine_args: + model_stage: AR + max_num_seqs: 1 + model_arch: HunyuanImage3ForCausalMM + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.95 + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + max_num_batched_tokens: 32768 + tensor_parallel_size: 4 + pipeline_parallel_size: 1 + hf_overrides: + rope_parameters: + mrope_section: [0, 32, 32] + rope_type: default + is_comprehension: true + final_output: true + final_output_type: text + default_sampling_params: + temperature: 0.0 + top_p: 0.95 + top_k: 1024 + max_tokens: 2048 + stop_token_ids: [127957, 128026] # <|endoftext|>, + detokenize: True + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 diff --git a/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py new file mode 100644 index 0000000000..89a7a28f6c --- /dev/null +++ b/vllm_omni/model_executor/stage_input_processors/hunyuan_image3.py @@ -0,0 +1,123 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Stage input processor for HunyuanImage3: AR → Diffusion transition. + +In IT2I (image editing) mode: + - Stage 0 (AR) receives (image + edit instruction), generates CoT/latent tokens + - Stage 1 (DiT) receives the AR output + original image, denoises → edited image + +The ar2diffusion function bridges these two stages, following the same +signature pattern as glm_image.ar2diffusion. +""" + +from typing import Any + +import torch +from vllm.inputs import TextPrompt +from vllm.logger import init_logger + +from vllm_omni.inputs.data import OmniTokensPrompt + +logger = init_logger(__name__) + + +def ar2diffusion( + stage_list: list[Any], + engine_input_source: list[int], + prompt: OmniTokensPrompt | TextPrompt | list | None = None, + requires_multimodal_data: bool = False, +) -> list[dict[str, Any]]: + """Process AR stage outputs to create Diffusion stage inputs. + + Args: + stage_list: List of stage clients (set by orchestrator). + engine_input_source: List of source stage IDs (from YAML). + prompt: Original user prompt (may contain multimodal data). + requires_multimodal_data: Whether to forward multimodal data. + + Returns: + List of dicts, each consumable by the HunyuanImage3 diffusion pipeline. 
+ """ + if not engine_input_source: + raise ValueError("engine_input_source cannot be empty") + + source_stage_id = engine_input_source[0] + if source_stage_id >= len(stage_list): + raise IndexError(f"Invalid source stage_id: {source_stage_id}") + + if stage_list[source_stage_id].engine_outputs is None: + raise RuntimeError(f"Stage {source_stage_id} has no outputs yet") + + ar_outputs = stage_list[source_stage_id].engine_outputs + diffusion_inputs = [] + + # Normalize prompt to list + if not isinstance(prompt, list): + prompt = [prompt] if prompt is not None else [{}] + + for i, ar_output in enumerate(ar_outputs): + output = ar_output.outputs[0] + generated_token_ids = output.token_ids + generated_text = getattr(output, "text", "") or "" + + # Get original prompt info + original_prompt = prompt[i] if i < len(prompt) else {} + if isinstance(original_prompt, dict): + pass + elif hasattr(original_prompt, "_asdict"): + original_prompt = original_prompt._asdict() + elif hasattr(original_prompt, "__dict__"): + original_prompt = vars(original_prompt) + else: + original_prompt = {} + + height = original_prompt.get("height", 1024) + width = original_prompt.get("width", 1024) + text_prompt = original_prompt.get("prompt", "") + + logger.info( + "[ar2diffusion] Request %d: AR generated %d tokens, text length=%d, target size=%dx%d", + i, + len(generated_token_ids), + len(generated_text), + height, + width, + ) + + token_tensor = torch.tensor(generated_token_ids, dtype=torch.long) + + diffusion_input: dict[str, Any] = { + "prompt": text_prompt, + "height": height, + "width": width, + "extra": { + "ar_token_ids": token_tensor, + "ar_generated_text": generated_text, + }, + } + + # Forward multimodal data (original image for IT2I conditioning) + mm_data = original_prompt.get("multi_modal_data") + if mm_data: + pil_image = mm_data.get("image") + if pil_image is None: + images = mm_data.get("images") + if images: + pil_image = images[0] if isinstance(images, list) else images + if pil_image is not None: + diffusion_input["pil_image"] = pil_image + + # Forward multimodal output from AR (if any) + if hasattr(ar_output, "multimodal_output") and ar_output.multimodal_output: + mm_output = ar_output.multimodal_output + if isinstance(mm_output, dict): + diffusion_input["extra"]["ar_multimodal_output"] = mm_output + + # Forward sampling params + for key in ["seed", "num_inference_steps", "guidance_scale", "negative_prompt"]: + if key in original_prompt: + diffusion_input[key] = original_prompt[key] + + diffusion_inputs.append(diffusion_input) + + return diffusion_inputs diff --git a/vllm_omni/patch.py b/vllm_omni/patch.py index eafff821a2..d4ab78f13a 100644 --- a/vllm_omni/patch.py +++ b/vllm_omni/patch.py @@ -1,6 +1,8 @@ import sys +from functools import cached_property from aenum import extend_enum +from vllm.config import ModelConfig as _OriginalModelConfig from vllm.inputs import TokensPrompt as _OriginalTokensPrompt from vllm.model_executor.layers.rotary_embedding import ( MRotaryEmbedding as _OriginalMRotaryEmbedding, @@ -17,6 +19,56 @@ from vllm_omni.model_executor.layers.rotary_embedding import OmniMRotaryEmbedding from vllm_omni.request import OmniRequest +# ============================================================================= +# Patch ModelConfig.is_mm_prefix_lm to support omni-specific models +# ============================================================================= +# WHY: HunyuanImage-3.0 requires bidirectional attention for image tokens +# (cond_token_attn_type: "joint_full" in 
config.json). vLLM gates this on +# is_mm_prefix_lm, which checks an internal MM_PREFIX_LM_MODELS list that +# does not include "hunyuan_image_3_moe" (the upstream HF model_type). +# +# WHY NOT model-level: is_mm_prefix_lm is checked in vLLM core (scheduler, +# attention backend selection) before model code runs — no model-level hook. +# +# SCOPE: Only affects model_type in _OMNI_MM_PREFIX_LM_MODELS (currently +# just "hunyuan_image_3_moe"). All other models fall through to the +# original vLLM implementation unchanged. +# +# FRAGILITY: Relies on is_mm_prefix_lm being a cached_property on +# ModelConfig. The __dict__ access + __set_name__ dance works around a +# pydantic dataclass issue in vllm 0.19.0+. If vLLM changes +# is_mm_prefix_lm to a regular method or removes it, this will break. +# +# TODO: Upstream a configurable MM_PREFIX_LM_MODELS or a model_config flag +# so this patch can be removed. +_OMNI_MM_PREFIX_LM_MODELS = ("hunyuan_image_3_moe",) +# Access via __dict__ to avoid triggering cached_property.__get__ which fails +# with "Cannot use cached_property instance without calling __set_name__" in +# pydantic dataclasses (vllm 0.19.0+). +_cp = _OriginalModelConfig.__dict__["is_mm_prefix_lm"] +_original_is_mm_prefix_lm = _cp.func if hasattr(_cp, "func") else _cp.fget + + +def _patched_is_mm_prefix_lm(self): + if _original_is_mm_prefix_lm(self): + return True + model_type = getattr(self.hf_config, "model_type", "") + return model_type in _OMNI_MM_PREFIX_LM_MODELS + + +_patched_cp = cached_property(_patched_is_mm_prefix_lm) +_patched_cp.__set_name__(_OriginalModelConfig, "is_mm_prefix_lm") +_OriginalModelConfig.is_mm_prefix_lm = _patched_cp + +# Sanity check: verify the patch is active. If vLLM changes the descriptor +# type or __set_name__ semantics, this will fail loudly at import time +# rather than silently falling back to unpatched behavior. +_installed = _OriginalModelConfig.__dict__.get("is_mm_prefix_lm") +assert _installed is _patched_cp, ( + "is_mm_prefix_lm patch failed to install — bidirectional attention " + "for HunyuanImage3 will not work. Check vLLM ModelConfig changes." 
+) + # ============================================================================= # Patch GlmImageTextConfig to expose mrope_section in rope_parameters # ============================================================================= From 817e32d548de74d374b34b6f7dcdccb8342cf4cd Mon Sep 17 00:00:00 2001 From: Zhang Jian Date: Thu, 16 Apr 2026 23:33:40 +0800 Subject: [PATCH 201/204] [Quantization] feat: add FP8 for Omnigen2 (#2441) Signed-off-by: Zhang --- .../models/omnigen2/omnigen2_transformer.py | 177 +++++++++++++++++- .../models/omnigen2/pipeline_omnigen2.py | 7 +- 2 files changed, 174 insertions(+), 10 deletions(-) diff --git a/vllm_omni/diffusion/models/omnigen2/omnigen2_transformer.py b/vllm_omni/diffusion/models/omnigen2/omnigen2_transformer.py index 9ff681a3c0..3f03563a1c 100644 --- a/vllm_omni/diffusion/models/omnigen2/omnigen2_transformer.py +++ b/vllm_omni/diffusion/models/omnigen2/omnigen2_transformer.py @@ -5,6 +5,8 @@ import torch import torch.nn as nn +import torch.nn.functional as F +import vllm._custom_ops as ops from diffusers.models.activations import get_activation from diffusers.models.embeddings import Timesteps, get_1d_rotary_pos_embed from diffusers.models.modeling_outputs import Transformer2DModelOutput @@ -16,6 +18,7 @@ QKVParallelLinear, RowParallelLinear, ) +from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm_omni.diffusion.attention.layer import Attention @@ -24,6 +27,105 @@ logger = logging.getLogger(__name__) +def _patch_cutlass_padded_fp8(): + """Monkey-patch vllm._custom_ops.cutlass_scaled_mm to pad tensors whose + dimensions are not multiples of 16, so the CUTLASS FP8 kernel is used. + + OmniGen2 has hidden_size=2520 (2520 % 16 == 8). Without this patch, + vLLM's cutlass_scaled_mm falls back to a Triton scaled_mm kernel for + every FP8 linear layer (QKV, attn output, gate_up_proj, down_proj), + which is dramatically slower than the native CUTLASS FP8 tensor-core + path on H100/H200 GPUs. + + Weight tensors (b) are constant across forward passes, so padded + versions are computed once and cached by data_ptr to avoid repeated + allocation and column-major conversion overhead. + """ + _orig_cutlass_scaled_mm = ops.cutlass_scaled_mm + # Cache: data_ptr → (padded_b, padded_bias, padded_scale_b, pad_k, pad_n, orig_n) + _weight_cache: dict[int, tuple] = {} + + def _padded_cutlass_scaled_mm( + a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: torch.dtype, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + if b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0: + return _orig_cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) + + # Reshape to 2D (mirrors the original function) + target_shape = (*a.shape[:-1], b.shape[1]) + a = a.view(-1, a.shape[-1]) + orig_n = b.shape[1] + + # Cache the padded weight — it's a model parameter that never changes. + key = b.data_ptr() + if key not in _weight_cache: + pad_k = (16 - b.shape[0] % 16) % 16 + pad_n = (16 - orig_n % 16) % 16 + b_pad = b + if pad_k > 0: + b_pad = F.pad(b_pad, (0, 0, 0, pad_k)) + if pad_n > 0: + b_pad = F.pad(b_pad, (0, pad_n)) + # CUTLASS requires b column-major (stride(0)==1). 
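+ # Transposing, materializing contiguously, then transposing back keeps
+ # the logical (K, N) shape while laying the data out column-major
+ # (stride(0) == 1), which is the layout the kernel expects for b.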
+ b_pad = b_pad.t().contiguous().t() + + bias_pad = None + if bias is not None and pad_n > 0: + bias_pad = F.pad(bias, (0, pad_n)) + + scale_b_pad = scale_b + if scale_b.numel() > 1 and pad_n > 0: + scale_b_pad = F.pad( + scale_b.view(-1, scale_b.shape[-1]), + (0, pad_n), + value=1.0, + ) + + _weight_cache[key] = ( + b_pad, + bias_pad, + scale_b_pad, + pad_k, + pad_n, + orig_n, + ) + + b_pad, bias_pad, scale_b_pad, pad_k, pad_n, orig_n = _weight_cache[key] + + # Pad activations on K dimension (cheap — activations are small). + if pad_k > 0: + a = F.pad(a, (0, pad_k)).contiguous() + + out = torch.empty((a.shape[0], b_pad.shape[1]), dtype=out_dtype, device=a.device) + torch.ops._C.cutlass_scaled_mm( + out, + a, + b_pad, + scale_a, + scale_b_pad, + bias_pad if bias is not None else None, + ) + + if pad_n > 0: + out = out[:, :orig_n] + + return out.view(*target_shape) + + ops.cutlass_scaled_mm = _padded_cutlass_scaled_mm + logger.info( + "Patched vllm._custom_ops.cutlass_scaled_mm with CUTLASS-padded FP8 " + "variant (avoids slow Triton fallback for non-%%16 dimensions)" + ) + + +_patch_cutlass_padded_fp8() + + class OmniGen2Attention(nn.Module): def __init__( self, @@ -31,6 +133,8 @@ def __init__( num_heads: int, num_kv_heads: int, eps: float = 1e-5, + quant_config: QuantizationConfig | None = None, + prefix: str = "", ): super().__init__() self.dim = dim @@ -46,12 +150,26 @@ def __init__( total_num_kv_heads=num_kv_heads, disable_tp=True, bias=False, + quant_config=quant_config, + prefix=f"{prefix}.to_qkv", ) self.norm_q = RMSNorm(self.head_dim, eps=eps) self.norm_k = RMSNorm(self.head_dim, eps=eps) - self.to_out = nn.ModuleList([nn.Linear(dim, dim, bias=False)]) + self.to_out = nn.ModuleList( + [ + RowParallelLinear( + dim, + dim, + bias=False, + input_is_parallel=False, + quant_config=quant_config, + return_bias=False, + prefix=f"{prefix}.to_out.0", + ) + ] + ) self.attn = Attention( num_heads=num_heads, head_size=self.head_dim, @@ -78,6 +196,9 @@ def forward( """ batch_size = hidden_states.shape[0] + # Contiguous layout for FP8 quantized linear GEMMs (matches FLUX DiT). 
+ hidden_states = hidden_states.contiguous() + # Get Query-Key-Value Pair qkv, _ = self.to_qkv(hidden_states) @@ -121,7 +242,7 @@ def forward( hidden_states = hidden_states.reshape(batch_size, -1, self.num_heads * self.head_dim) hidden_states = hidden_states.to(dtype) - hidden_states = self.to_out[0](hidden_states) + hidden_states = self.to_out[0](hidden_states.contiguous()) return hidden_states @@ -233,6 +354,7 @@ def __init__( embedding_dim: int, norm_eps: float, norm_elementwise_affine: bool, + **kwargs, ): super().__init__() self.silu = nn.SiLU() @@ -325,6 +447,8 @@ def __init__( inner_dim: int, multiple_of: int | None = 256, ffn_dim_multiplier: float | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", ): super().__init__() @@ -338,6 +462,8 @@ def __init__( [inner_dim, inner_dim], bias=False, return_bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", ) self.act_fn = get_act_and_mul_fn("silu") self.down_proj = RowParallelLinear( @@ -346,6 +472,8 @@ def __init__( bias=False, input_is_parallel=True, return_bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj", ) def forward(self, x): @@ -591,6 +719,8 @@ def __init__( ffn_dim_multiplier: float, norm_eps: float, modulation: bool = True, + quant_config: QuantizationConfig | None = None, + prefix: str = "", ) -> None: """Initialize the transformer block.""" super().__init__() @@ -602,6 +732,8 @@ def __init__( num_heads=num_attention_heads, num_kv_heads=num_kv_heads, eps=1e-5, + quant_config=quant_config, + prefix=f"{prefix}.attn", ) # Initialize feed-forward network @@ -610,11 +742,19 @@ def __init__( inner_dim=4 * dim, multiple_of=multiple_of, ffn_dim_multiplier=ffn_dim_multiplier, + quant_config=quant_config, + prefix=f"{prefix}.feed_forward", ) # Initialize normalization layers if modulation: - self.norm1 = LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, norm_elementwise_affine=True) + self.norm1 = LuminaRMSNormZero( + embedding_dim=dim, + norm_eps=norm_eps, + norm_elementwise_affine=True, + quant_config=quant_config, + prefix=f"{prefix}.norm1", + ) else: self.norm1 = RMSNorm(dim, eps=norm_eps) @@ -713,6 +853,7 @@ def __init__( axes_lens: tuple[int, int, int] = (1024, 1664, 1664), text_feat_dim: int = 2048, timestep_scale: float = 1000.0, + quant_config: QuantizationConfig | None = None, ) -> None: """Initialize the OmniGen2 transformer model.""" super().__init__() @@ -770,8 +911,10 @@ def __init__( ffn_dim_multiplier, norm_eps, modulation=True, + quant_config=quant_config, + prefix=f"noise_refiner.{i}", ) - for _ in range(num_refiner_layers) + for i in range(num_refiner_layers) ] ) @@ -785,8 +928,10 @@ def __init__( ffn_dim_multiplier, norm_eps, modulation=True, + quant_config=quant_config, + prefix=f"ref_image_refiner.{i}", ) - for _ in range(num_refiner_layers) + for i in range(num_refiner_layers) ] ) @@ -800,8 +945,10 @@ def __init__( ffn_dim_multiplier, norm_eps, modulation=False, + quant_config=quant_config, + prefix=f"context_refiner.{i}", ) - for _ in range(num_refiner_layers) + for i in range(num_refiner_layers) ] ) @@ -816,8 +963,10 @@ def __init__( ffn_dim_multiplier, norm_eps, modulation=True, + quant_config=quant_config, + prefix=f"layers.{i}", ) - for _ in range(num_layers) + for i in range(num_layers) ] ) @@ -847,11 +996,25 @@ def img_patch_embed_and_refine( temb, ): batch_size = len(hidden_states) + has_ref_tokens = any(ref_img_len > 0 for ref_lens in l_effective_ref_img_len for ref_img_len in ref_lens) max_combined_img_len = max( [img_len + 
sum(ref_img_len) for img_len, ref_img_len in zip(l_effective_img_len, l_effective_ref_img_len)] ) hidden_states = self.x_embedder(hidden_states) + if not has_ref_tokens: + # FP8 kernels do not support zero-token GEMM on ref_image_patch_embedder; skip that path only. + # Still run noise_refiner and return the same combined layout as the no-ref case below + # (batch, max_combined_img_len, hidden) — not raw noise tokens alone. + for layer in self.noise_refiner: + hidden_states = layer(hidden_states, padded_img_mask, noise_rotary_emb, temb) + combined_img_hidden_states = hidden_states.new_zeros( + batch_size, max_combined_img_len, self.config.hidden_size + ) + for i, img_len in enumerate(l_effective_img_len): + combined_img_hidden_states[i, :img_len] = hidden_states[i, :img_len] + return combined_img_hidden_states + ref_image_hidden_states = self.ref_image_patch_embedder(ref_image_hidden_states) for i in range(batch_size): diff --git a/vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.py b/vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.py index e8e307b878..04720c932f 100644 --- a/vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.py +++ b/vllm_omni/diffusion/models/omnigen2/pipeline_omnigen2.py @@ -676,7 +676,10 @@ def __init__( ) transformer_kwargs = get_transformer_config_kwargs(od_config.tf_model_config, OmniGen2Transformer2DModel) - self.transformer = OmniGen2Transformer2DModel(**transformer_kwargs) + self.transformer = OmniGen2Transformer2DModel( + **transformer_kwargs, + quant_config=od_config.quantization_config, + ) self.mllm = Qwen2_5_VLForConditionalGeneration.from_pretrained( model, subfolder="mllm", local_files_only=local_files_only ).to(self.device) @@ -1253,8 +1256,6 @@ def predict( # broadcast to batch dimension in a way that's compatible with ONNX/Core ML timestep = t.expand(latents.shape[0]).to(latents.dtype) - batch_size, num_channels_latents, height, width = latents.shape - optional_kwargs = {} if "ref_image_hidden_states" in set(inspect.signature(self.transformer.forward).parameters.keys()): optional_kwargs["ref_image_hidden_states"] = ref_image_hidden_states From 72313386d2343ce9d2060ba22c17f234d36a0e44 Mon Sep 17 00:00:00 2001 From: lishunyang Date: Thu, 16 Apr 2026 23:55:44 +0800 Subject: [PATCH 202/204] Add Claude code review workflow Signed-off-by: lishunyang --- .github/workflows/claude-review.yml | 58 +++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 .github/workflows/claude-review.yml diff --git a/.github/workflows/claude-review.yml b/.github/workflows/claude-review.yml new file mode 100644 index 0000000000..6f78931429 --- /dev/null +++ b/.github/workflows/claude-review.yml @@ -0,0 +1,58 @@ +name: Claude Code Review + +on: + pull_request: + types: [opened, synchronize] + issue_comment: + types: [created] + +permissions: + contents: read + pull-requests: write + issues: write + +jobs: + claude-review: + runs-on: ubuntu-latest + timeout-minutes: 15 + # Run on new/updated PRs, or when someone mentions @claude in a PR comment. + # Skip PRs authored by the bot account itself. 
+ if: | + (github.event_name == 'pull_request' && + github.event.pull_request.user.login != 'lishunyang12') || + (github.event_name == 'issue_comment' && + github.event.issue.pull_request != null && + contains(github.event.comment.body, '@claude')) + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: anthropics/claude-code-action@v1 + with: + claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + prompt: | + Review this pull request for vLLM-OMNI. + + Style rules (strict): + - Post 2-6 inline comments MAX. Pick the highest-signal issues only. + - Around half of comments should be 1-line (e.g., "Seems unused", "Is this really needed?"). + - Do NOT prefix comments with "Nit:" — state the issue directly. + - Use GitHub ```suggestion blocks for obvious fixes. + - No inline praise ("Good placement", "Nice work") — skip it. + - Be direct: "Why not X?" instead of "Would it make sense to...?" + - Hedge only when genuinely uncertain ("Tbh I think..."). + - Review body: keep it ultra-short ("LGTM", "Some nits", "Please fix pre-commit") or leave empty. + - About half the time, leave the review body empty and only post inline comments. + + Focus areas (in priority order): + 1. Correctness bugs (off-by-one, race conditions, wrong dtype/device, missing error handling at boundaries) + 2. API/interface issues (breaking changes, bad naming, inconsistent with existing code) + 3. Performance regressions in hot paths + 4. Test coverage gaps for new logic + + Skip: + - Style nits already caught by pre-commit/ruff + - Speculative refactor suggestions + - Documentation wording unless clearly wrong + - Commenting on every file — only files with real issues From a5f46d7e72819aff2bf637445d49252fe2429ba3 Mon Sep 17 00:00:00 2001 From: lishunyang Date: Fri, 17 Apr 2026 00:11:12 +0800 Subject: [PATCH 203/204] Fix claude-review trigger condition Signed-off-by: lishunyang --- .github/workflows/claude-review.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/claude-review.yml b/.github/workflows/claude-review.yml index 6f78931429..955fdfddf4 100644 --- a/.github/workflows/claude-review.yml +++ b/.github/workflows/claude-review.yml @@ -15,11 +15,8 @@ jobs: claude-review: runs-on: ubuntu-latest timeout-minutes: 15 - # Run on new/updated PRs, or when someone mentions @claude in a PR comment. - # Skip PRs authored by the bot account itself. if: | - (github.event_name == 'pull_request' && - github.event.pull_request.user.login != 'lishunyang12') || + github.event_name == 'pull_request' || (github.event_name == 'issue_comment' && github.event.issue.pull_request != null && contains(github.event.comment.body, '@claude')) From b721f10943db53dc70e7dd753453bf0c453f7b46 Mon Sep 17 00:00:00 2001 From: lishunyang Date: Fri, 17 Apr 2026 00:19:13 +0800 Subject: [PATCH 204/204] Add id-token permission for claude-code-action Signed-off-by: lishunyang --- .github/workflows/claude-review.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/claude-review.yml b/.github/workflows/claude-review.yml index 955fdfddf4..68d1890e3e 100644 --- a/.github/workflows/claude-review.yml +++ b/.github/workflows/claude-review.yml @@ -10,6 +10,7 @@ permissions: contents: read pull-requests: write issues: write + id-token: write jobs: claude-review:
