From 95de2a1263a16043819f502f64fa242d1fdac442 Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Tue, 31 Mar 2026 18:08:32 +0800 Subject: [PATCH 01/76] feat: support multi-stage Signed-off-by: ZhengWG Made-with: Cursor --- vllm_omni/engine/async_omni_engine.py | 75 ++++- vllm_omni/engine/orchestrator.py | 272 +++++++++++++------ vllm_omni/engine/stage_engine_core_client.py | 12 +- vllm_omni/engine/stage_init_utils.py | 7 + 4 files changed, 280 insertions(+), 86 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index a4d87c96e4a..f7a5a5186a7 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -435,10 +435,19 @@ def _attach_llm_stage( return stage_client, output_processor, started.vllm_config, input_processor def _initialize_stages(self, stage_init_timeout: int) -> None: - """Initialize stage clients/processors in orchestrator thread and assign to self.""" + """Initialize stage clients/processors in orchestrator thread and assign to self. + + Multi-replica support: when a stage config contains + ``runtime.num_replicas > 1``, multiple clients are created for the same + logical stage and the flat ``stage_clients`` list grows accordingly. + ``logical_stage_to_clients`` maps each logical stage id to the list of + client indices that belong to it. + """ device_control_env = current_omni_platform.device_control_env_var num_stages = self.num_stages + # These are indexed by *logical* stage_id during initialization, then + # expanded to flat client-indexed lists at the end. 
stage_clients: list[Any | None] = [None] * num_stages output_processors: list[Any | None] = [None] * num_stages stage_vllm_configs: list[Any | None] = [None] * num_stages @@ -448,6 +457,15 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: started_llm_stages: dict[int, StartedLlmStage] = {} llm_stage_launch_lock = threading.Lock() + # Track per-logical-stage replica count from config + replicas_per_stage: list[int] = [] + for stage_cfg in self.stage_configs: + runtime_cfg = getattr(stage_cfg, "runtime", {}) + num_replicas = int( + runtime_cfg.get("num_replicas", 1) if hasattr(runtime_cfg, "get") else getattr(runtime_cfg, "num_replicas", 1) + ) + replicas_per_stage.append(max(1, num_replicas)) + async_chunk = self.async_chunk prompt_expand_func = None llm_stage_count = sum( @@ -549,21 +567,61 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: ) raise - self.stage_clients = initialized_stage_clients - self.output_processors = output_processors - self.stage_vllm_configs = stage_vllm_configs + # ---- Multi-replica expansion ---- + # Expand the logical-indexed lists into flat client-indexed lists. + # For replica_index > 0 the same client/processor/config is shared + # (they point to the same underlying EngineCore) — full per-replica + # process isolation is out-of-scope for this first iteration. 
+ flat_clients: list[Any] = [] + flat_output_processors: list[Any] = [] + flat_vllm_configs: list[Any] = [] + logical_stage_to_clients: list[list[int]] = [] + # sampling_params and stage_metadata are per-logical-stage + logical_default_sampling_params: list[Any] = [] + logical_stage_metadata: list[dict[str, Any]] = [] + + for logical_id, client in enumerate(initialized_stage_clients): + num_replicas = replicas_per_stage[logical_id] + client_indices: list[int] = [] + for replica_idx in range(num_replicas): + ci = len(flat_clients) + client_indices.append(ci) + if replica_idx == 0: + # First replica uses the already-created objects + flat_clients.append(client) + flat_output_processors.append(output_processors[logical_id]) + flat_vllm_configs.append(stage_vllm_configs[logical_id]) + else: + # Additional replicas: for now, share the same client. + # True per-replica process isolation will be added later. + # TODO: launch separate EngineCore processes for replica_idx > 0 + flat_clients.append(client) + flat_output_processors.append(output_processors[logical_id]) + flat_vllm_configs.append(stage_vllm_configs[logical_id]) + logger.info( + "[AsyncOmniEngine] Logical stage %s replica %s → client %s (shared)", + logical_id, replica_idx, ci, + ) + logical_stage_to_clients.append(client_indices) + logical_default_sampling_params.append(default_sampling_params_list[logical_id]) + logical_stage_metadata.append(stage_metadata[logical_id]) + + self.stage_clients = flat_clients + self.output_processors = flat_output_processors + self.stage_vllm_configs = flat_vllm_configs + self.logical_stage_to_clients = logical_stage_to_clients self.input_processor = input_processor self.prompt_expand_func = prompt_expand_func # TODO(Peiqi): Hack here supported_tasks: set[str] = set() - if any(getattr(stage_client, "is_comprehension", False) for stage_client in initialized_stage_clients): + if any(getattr(stage_client, "is_comprehension", False) for stage_client in flat_clients): 
supported_tasks.add("generate") - if any(metadata.get("final_output_type") == "audio" for metadata in stage_metadata): + if any(metadata.get("final_output_type") == "audio" for metadata in logical_stage_metadata): supported_tasks.add("speech") self.supported_tasks = tuple(supported_tasks) if supported_tasks else ("generate",) - self.default_sampling_params_list = default_sampling_params_list - self.stage_metadata = stage_metadata + self.default_sampling_params_list = logical_default_sampling_params + self.stage_metadata = logical_stage_metadata def _initialize_janus_queues(self) -> None: """Initialize janus queues inside orchestrator thread loop context.""" @@ -594,6 +652,7 @@ async def _run_orchestrator() -> None: stage_clients=self.stage_clients, output_processors=self.output_processors, stage_vllm_configs=self.stage_vllm_configs, + logical_stage_to_clients=getattr(self, "logical_stage_to_clients", None), ) if not startup_future.done(): startup_future.set_result(asyncio.get_running_loop()) diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index e6373ec96ea..3106b876809 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -104,6 +104,11 @@ class OrchestratorRequestState: # Metrics: timestamp when request was submitted to each stage stage_submit_ts: dict[int, float] = field(default_factory=dict) + # Multi-replica: maps logical_stage_id -> client_index chosen for this + # request. Ensures the same request always hits the same replica within + # a given logical stage (KV / intermediate-state affinity). + chosen_client_index: dict[int, int] = field(default_factory=dict) + class Orchestrator: """Runs inside a background thread's asyncio event loop. 
@@ -122,18 +127,39 @@ def __init__( stage_vllm_configs: list[Any], *, async_chunk: bool = False, + logical_stage_to_clients: list[list[int]] | None = None, ) -> None: self.request_async_queue = request_async_queue self.output_async_queue = output_async_queue self.rpc_async_queue = rpc_async_queue - self.num_stages = len(stage_clients) + self.num_clients = len(stage_clients) self.async_chunk = bool(async_chunk) self.stage_clients: list[Any] = stage_clients self.output_processors: list[Any] = output_processors self.stage_vllm_configs: list[Any] = stage_vllm_configs + # Multi-replica mapping: logical_stage_id -> list of client indices. + # When not provided (single-replica), default to identity mapping. + if logical_stage_to_clients is not None: + self.logical_stage_to_clients = logical_stage_to_clients + else: + self.logical_stage_to_clients = [[i] for i in range(self.num_clients)] + self.num_logical_stages = len(self.logical_stage_to_clients) + + # Reverse mapping: client_index -> logical_stage_id + self._client_to_logical: list[int] = [0] * self.num_clients + for logical_id, client_indices in enumerate(self.logical_stage_to_clients): + for ci in client_indices: + self._client_to_logical[ci] = logical_id + + # Round-robin counters for replica selection per logical stage + self._replica_rr: list[int] = [0] * self.num_logical_stages + + # Backward compat: num_stages now means num_logical_stages + self.num_stages = self.num_logical_stages + # Per-request state self.request_states: dict[str, OrchestratorRequestState] = {} @@ -144,15 +170,41 @@ def __init__( self._companion_done: dict[str, set[str]] = {} self._deferred_parents: dict[str, dict[str, Any]] = {} - # Per-stage metrics accumulators. - self._batch_seq: list[int] = [0] * self.num_stages - self._agg_total_tokens: list[int] = [0] * self.num_stages - self._agg_total_gen_time_ms: list[float] = [0.0] * self.num_stages + # Per-client metrics accumulators. 
+ self._batch_seq: list[int] = [0] * self.num_clients + self._agg_total_tokens: list[int] = [0] * self.num_clients + self._agg_total_gen_time_ms: list[float] = [0.0] * self.num_clients # Shutdown coordination self._shutdown_event = asyncio.Event() self._stages_shutdown = False + def _choose_client_index( + self, + logical_stage_id: int, + req_state: OrchestratorRequestState, + ) -> int: + """Pick a client for *logical_stage_id* and record the choice. + + If this request already has a chosen client for the logical stage, + return the existing one (affinity). Otherwise round-robin among the + available replicas. + """ + existing = req_state.chosen_client_index.get(logical_stage_id) + if existing is not None: + return existing + + candidates = self.logical_stage_to_clients[logical_stage_id] + if len(candidates) == 1: + chosen = candidates[0] + else: + rr = self._replica_rr[logical_stage_id] + chosen = candidates[rr % len(candidates)] + self._replica_rr[logical_stage_id] = rr + 1 + + req_state.chosen_client_index[logical_stage_id] = chosen + return chosen + async def run(self) -> None: """Main entry point for the Orchestrator event loop.""" logger.info("[Orchestrator] Starting event loop") @@ -226,31 +278,38 @@ async def _orchestration_loop(self) -> None: """Inner loop for _orchestration_output_handler (clean cancellation). Control flow: poll raw → process through output processor → route. + + Multi-replica: iterates over every *client_index* (not logical stage), + and resolves the logical_stage_id from client metadata for routing. """ while not self._shutdown_event.is_set(): idle = True - for stage_id in range(self.num_stages): + for client_index in range(self.num_clients): if self._shutdown_event.is_set(): return + logical_stage_id = self._client_to_logical[client_index] + # 1) Diffusion stage: poll non-blocking queue - # TODO (Peiqi): the output of diffusion stage is OmniRequestOutput, - # which is different from EngineCoreOutputs (LLM stages). 
We may want to unify - # the output format in the future to simplify the processing logic in Orchestrator. - stage_client = self.stage_clients[stage_id] + stage_client = self.stage_clients[client_index] if stage_client.stage_type == "diffusion": output = stage_client.get_diffusion_output_async() if output is not None: idle = False req_state = self.request_states.get(output.request_id) if req_state is not None: - stage_metrics = self._build_stage_metrics(stage_id, output.request_id, [output], req_state) - await self._route_output(stage_id, output, req_state, stage_metrics) + stage_metrics = self._build_stage_metrics( + client_index, output.request_id, [output], req_state + ) + await self._route_output( + logical_stage_id, output, req_state, stage_metrics, + client_index=client_index, + ) continue - # 1) Poll raw outputs from the stage + # 1) Poll raw outputs from the client try: - raw_outputs = await asyncio.wait_for(self._poll_stage_raw(stage_id), timeout=0.001) + raw_outputs = await asyncio.wait_for(self._poll_stage_raw(client_index), timeout=0.001) except asyncio.TimeoutError: continue except asyncio.CancelledError: @@ -259,8 +318,9 @@ async def _orchestration_loop(self) -> None: if self._shutdown_event.is_set(): return logger.exception( - "[Orchestrator] _poll_stage_raw failed for stage-%s", - stage_id, + "[Orchestrator] _poll_stage_raw failed for client-%s (logical stage-%s)", + client_index, + logical_stage_id, ) raise @@ -269,28 +329,33 @@ async def _orchestration_loop(self) -> None: idle = False # 2) Process raw outputs through the output processor - request_outputs = await self._process_stage_outputs(stage_id, raw_outputs) + request_outputs = await self._process_stage_outputs(client_index, raw_outputs) # 3) Route each processed output for output in request_outputs: req_state = self.request_states.get(output.request_id) if req_state is None: logger.warning( - "[Orchestrator] Dropping output for unknown req %s at stage-%s (known reqs: %s)", + "[Orchestrator] 
Dropping output for unknown req %s at client-%s " + "(logical stage-%s, known reqs: %s)", output.request_id, - stage_id, + client_index, + logical_stage_id, list(self.request_states.keys()), ) continue stage_metrics = None if output.finished: stage_metrics = self._build_stage_metrics( - stage_id, + client_index, output.request_id, [output], req_state, ) - await self._route_output(stage_id, output, req_state, stage_metrics) + await self._route_output( + logical_stage_id, output, req_state, stage_metrics, + client_index=client_index, + ) if idle: await asyncio.sleep(0.001) @@ -303,12 +368,22 @@ async def _route_output( output: Any, req_state: OrchestratorRequestState, stage_metrics: Any, + *, + client_index: int | None = None, ) -> None: - """Route a processed output: send to main thread and/or forward to next stage.""" + """Route a processed output: send to main thread and/or forward to next stage. + + Args: + stage_id: Logical stage id. + client_index: Physical client index that produced this output. + Defaults to stage_id for backward compat. + """ + if client_index is None: + client_index = stage_id req_id = output.request_id finished = output.finished submit_ts = req_state.stage_submit_ts.get(stage_id) - stage_client = self.stage_clients[stage_id] + stage_client = self.stage_clients[client_index] # CFG companion handling: companions don't produce user-visible output # and don't forward to the next stage directly. 
@@ -331,6 +406,7 @@ async def _route_output( deferred["stage_id"], deferred["output"], parent_state, + client_index=deferred.get("client_index", deferred["stage_id"]), ) self.request_states.pop(req_id, None) return @@ -364,13 +440,16 @@ async def _route_output( self._deferred_parents[req_id] = { "stage_id": stage_id, "output": output, + "client_index": client_index, } logger.debug( "[Orchestrator] Parent %s deferred, waiting for CFG companions", req_id, ) else: - await self._forward_to_next_stage(req_id, stage_id, output, req_state) + await self._forward_to_next_stage( + req_id, stage_id, output, req_state, client_index=client_index, + ) if finished and stage_id == req_state.final_stage_id: self._cleanup_companion_state(req_id) @@ -395,35 +474,36 @@ def _all_companions_done(self, parent_id: str) -> bool: def _build_stage_metrics( self, - stage_id: int, + client_index: int, req_id: str, request_outputs: list[RequestOutput], req_state: OrchestratorRequestState, ) -> StageRequestMetrics: - """Build StageRequestMetrics for a finished request at a stage. + """Build StageRequestMetrics for a finished request at a client. Reuses StageRequestMetrics so OrchestratorMetrics and downstream metric handlers can consume a stable schema. """ + logical_stage_id = self._client_to_logical[client_index] now = _time.time() - submit_ts = req_state.stage_submit_ts.get(stage_id, now) + submit_ts = req_state.stage_submit_ts.get(logical_stage_id, now) stage_gen_time_ms = (now - submit_ts) * 1000.0 num_tokens_out = count_tokens_from_outputs(request_outputs) num_tokens_in = 0 - if stage_id == 0: + if logical_stage_id == 0: for ro in request_outputs: ptids = getattr(ro, "prompt_token_ids", None) if ptids is not None: num_tokens_in += len(ptids) - # Monotonic batch counter per stage. - self._batch_seq[stage_id] += 1 - batch_id = self._batch_seq[stage_id] + # Monotonic batch counter per client. 
+ self._batch_seq[client_index] += 1 + batch_id = self._batch_seq[client_index] # Accumulate for running-average stage_stats - self._agg_total_tokens[stage_id] += num_tokens_out - self._agg_total_gen_time_ms[stage_id] += stage_gen_time_ms + self._agg_total_tokens[client_index] += num_tokens_out + self._agg_total_gen_time_ms[client_index] += stage_gen_time_ms return StageRequestMetrics( num_tokens_in=num_tokens_in, @@ -435,8 +515,8 @@ def _build_stage_metrics( rx_transfer_bytes=0, rx_in_flight_time_ms=0.0, stage_stats=StageStats( - total_token=self._agg_total_tokens[stage_id], - total_gen_time_ms=self._agg_total_gen_time_ms[stage_id], + total_token=self._agg_total_tokens[client_index], + total_gen_time_ms=self._agg_total_gen_time_ms[client_index], ), ) @@ -446,18 +526,28 @@ async def _forward_to_next_stage( stage_id: int, output: Any, req_state: OrchestratorRequestState, + *, + client_index: int | None = None, ) -> None: """Forward output from current stage to the next stage. Handles the full pipeline: set outputs on current stage, compute next-stage inputs, build lightweight requests, and submit them. + + Args: + stage_id: Logical stage id that produced the output. + client_index: Physical client index that produced the output. 
""" - next_stage_id = stage_id + 1 - next_client = self.stage_clients[next_stage_id] - params = req_state.sampling_params_list[next_stage_id] + if client_index is None: + client_index = stage_id + + next_logical = stage_id + 1 + next_ci = self._choose_client_index(next_logical, req_state) + next_client = self.stage_clients[next_ci] + params = req_state.sampling_params_list[next_logical] if next_client.stage_type == "diffusion": - self.stage_clients[stage_id].set_engine_outputs([output]) + self.stage_clients[client_index].set_engine_outputs([output]) if next_client.custom_process_input_func is not None: diffusion_prompt = next_client.custom_process_input_func( self.stage_clients, @@ -493,22 +583,25 @@ async def _forward_to_next_stage( ) else: await next_client.add_request_async(req_id, diffusion_prompt, params) - req_state.stage_submit_ts[next_stage_id] = _time.time() + req_state.stage_submit_ts[next_logical] = _time.time() return - self.stage_clients[stage_id].set_engine_outputs([output]) + # Set outputs on the client that actually produced them + self.stage_clients[client_index].set_engine_outputs([output]) # Process inputs for next stage try: next_inputs = next_client.process_engine_inputs( stage_list=self.stage_clients, prompt=req_state.prompt, + source_client_index=client_index, ) except Exception: logger.exception( - "[Orchestrator] req=%s process_engine_inputs FAILED for stage-%s", + "[Orchestrator] req=%s process_engine_inputs FAILED for logical stage-%s (client-%s)", req_id, - next_stage_id, + next_logical, + next_ci, ) raise @@ -518,13 +611,13 @@ async def _forward_to_next_stage( request_id=req_id, prompt=next_input, params=params, - model_config=self.stage_vllm_configs[next_stage_id].model_config, + model_config=self.stage_vllm_configs[next_ci].model_config, ) # TODO: Here we directly use the req id to assign. 
request.external_req_id = request.request_id - self.output_processors[next_stage_id].add_request( + self.output_processors[next_ci].add_request( request=request, prompt=None, parent_req=None, @@ -534,26 +627,26 @@ async def _forward_to_next_stage( await next_client.add_request_async(request) - # Record submit timestamp for the next stage - req_state.stage_submit_ts[next_stage_id] = _time.time() + # Record submit timestamp for the next logical stage + req_state.stage_submit_ts[next_logical] = _time.time() - async def _poll_stage_raw(self, stage_id: int) -> EngineCoreOutputs | None: + async def _poll_stage_raw(self, client_index: int) -> EngineCoreOutputs | None: """Pull raw EngineCoreOutputs from a stage client without processing. Returns the raw outputs object, or None when there is nothing to consume. """ - outputs = await self.stage_clients[stage_id].get_output_async() + outputs = await self.stage_clients[client_index].get_output_async() if not outputs.outputs: return None return outputs - async def _process_stage_outputs(self, stage_id: int, raw_outputs: EngineCoreOutputs) -> list[RequestOutput]: + async def _process_stage_outputs(self, client_index: int, raw_outputs: EngineCoreOutputs) -> list[RequestOutput]: """Run the output processor on raw outputs, returning RequestOutputs. Also handles abort forwarding and scheduler stats updates. 
""" - processor = self.output_processors[stage_id] + processor = self.output_processors[client_index] processed = processor.process_outputs( raw_outputs.outputs, @@ -562,7 +655,7 @@ async def _process_stage_outputs(self, stage_id: int, raw_outputs: EngineCoreOut ) if processed.reqs_to_abort: - await self.stage_clients[stage_id].abort_requests_async(processed.reqs_to_abort) + await self.stage_clients[client_index].abort_requests_async(processed.reqs_to_abort) if raw_outputs.scheduler_stats is not None: processor.update_scheduler_stats(raw_outputs.scheduler_stats) @@ -571,7 +664,7 @@ async def _process_stage_outputs(self, stage_id: int, raw_outputs: EngineCoreOut async def _handle_add_request(self, msg: dict[str, Any]) -> None: """Handle an add_request message from the main thread.""" - stage_id = 0 + logical_stage_id = 0 request_id = msg["request_id"] prompt = msg["prompt"] original_prompt = msg.get("original_prompt", prompt) @@ -585,7 +678,7 @@ async def _handle_add_request(self, msg: dict[str, Any]) -> None: "[Orchestrator] _handle_add_request: stage=%s req=%s " "prompt_type=%s original_prompt_type=%s final_stage=%s " "num_sampling_params=%d", - stage_id, + logical_stage_id, request_id, type(prompt).__name__, type(original_prompt).__name__, @@ -601,14 +694,17 @@ async def _handle_add_request(self, msg: dict[str, Any]) -> None: sampling_params_list=sampling_params_list, final_stage_id=final_stage_id, ) - req_state.stage_submit_ts[stage_id] = _time.time() self.request_states[request_id] = req_state + # Choose a replica for logical stage 0 + client_index = self._choose_client_index(logical_stage_id, req_state) + req_state.stage_submit_ts[logical_stage_id] = _time.time() + # Stage-0 prompt is already a fully-formed OmniEngineCoreRequest # (pre-processed by AsyncOmniEngine.add_request, output processor # already registered there) - submit directly. 
request = prompt - stage_client = self.stage_clients[stage_id] + stage_client = self.stage_clients[client_index] if stage_client.stage_type == "diffusion": if isinstance(prompt, list): await stage_client.add_batch_request_async( @@ -621,7 +717,7 @@ async def _handle_add_request(self, msg: dict[str, Any]) -> None: else: await stage_client.add_request_async(request) - if self.async_chunk and stage_id == 0 and final_stage_id > 0: + if self.async_chunk and logical_stage_id == 0 and final_stage_id > 0: await self._prewarm_async_chunk_stages(request_id, request, req_state) async def _prewarm_async_chunk_stages( @@ -635,6 +731,9 @@ async def _prewarm_async_chunk_stages( In async-chunk mode, stages exchange data through connectors/chunk adapters, so downstream stages should be armed once at request start instead of waiting for stage-finished forwarding. + + Multi-replica: uses _choose_client_index so the prewarm targets align + with the orchestration-face chosen replicas. """ if req_state.final_stage_id <= 0: return @@ -661,24 +760,25 @@ async def _prewarm_async_chunk_stages( base_input["multi_modal_data"] = None base_input["mm_processor_kwargs"] = None - for next_stage_id in range(1, req_state.final_stage_id + 1): - next_client = self.stage_clients[next_stage_id] - params = req_state.sampling_params_list[next_stage_id] + for next_logical in range(1, req_state.final_stage_id + 1): + next_ci = self._choose_client_index(next_logical, req_state) + next_client = self.stage_clients[next_ci] + params = req_state.sampling_params_list[next_logical] if next_client.stage_type == "diffusion": await next_client.add_request_async(request_id, req_state.prompt, params) - req_state.stage_submit_ts[next_stage_id] = _time.time() + req_state.stage_submit_ts[next_logical] = _time.time() continue request = build_engine_core_request_from_tokens( request_id=request_id, prompt=base_input, params=params, - model_config=self.stage_vllm_configs[next_stage_id].model_config, + 
model_config=self.stage_vllm_configs[next_ci].model_config, ) request.external_req_id = request.request_id - self.output_processors[next_stage_id].add_request( + self.output_processors[next_ci].add_request( request=request, prompt=None, parent_req=None, @@ -686,7 +786,7 @@ async def _prewarm_async_chunk_stages( queue=None, ) await next_client.add_request_async(request) - req_state.stage_submit_ts[next_stage_id] = _time.time() + req_state.stage_submit_ts[next_logical] = _time.time() async def _handle_add_companion(self, msg: dict[str, Any]) -> None: """Handle an add_companion_request message: submit companion to stage 0.""" @@ -710,18 +810,28 @@ async def _handle_add_companion(self, msg: dict[str, Any]) -> None: sampling_params_list=sampling_params_list, final_stage_id=0, ) - companion_state.stage_submit_ts[0] = _time.time() self.request_states[companion_id] = companion_state + # Use same replica as the parent for affinity, or choose one + parent_state = self.request_states.get(parent_id) + if parent_state is not None and 0 in parent_state.chosen_client_index: + client_index = parent_state.chosen_client_index[0] + companion_state.chosen_client_index[0] = client_index + else: + client_index = self._choose_client_index(0, companion_state) + + companion_state.stage_submit_ts[0] = _time.time() + request = companion_prompt # Already a processed OmniEngineCoreRequest - stage_client = self.stage_clients[0] + stage_client = self.stage_clients[client_index] await stage_client.add_request_async(request) logger.info( - "[Orchestrator] CFG companion submitted: %s (role=%s, parent=%s)", + "[Orchestrator] CFG companion submitted: %s (role=%s, parent=%s, client=%s)", companion_id, role, parent_id, + client_index, ) async def _handle_abort(self, msg: dict[str, Any]) -> None: @@ -740,8 +850,8 @@ async def _handle_abort(self, msg: dict[str, Any]) -> None: self._deferred_parents.pop(req_id, None) all_ids_to_abort = list(request_ids) + companion_ids_to_abort - for stage_id in 
range(self.num_stages): - await self.stage_clients[stage_id].abort_requests_async(all_ids_to_abort) + for ci in range(self.num_clients): + await self.stage_clients[ci].abort_requests_async(all_ids_to_abort) for req_id in request_ids: self.request_states.pop(req_id, None) logger.info("[Orchestrator] Aborted request(s) %s", request_ids) @@ -759,16 +869,26 @@ async def _handle_collective_rpc(self, msg: dict[str, Any]) -> None: args = tuple(msg.get("args", ())) kwargs = dict(msg.get("kwargs") or {}) requested_stage_ids = msg.get("stage_ids") - stage_ids = list(range(self.num_stages)) if requested_stage_ids is None else list(requested_stage_ids) + # When stage_ids are provided they refer to logical stages; expand + # to all client indices belonging to those logical stages. + if requested_stage_ids is None: + stage_ids = list(range(self.num_clients)) + else: + stage_ids = [] + for lid in requested_stage_ids: + if 0 <= lid < self.num_logical_stages: + stage_ids.extend(self.logical_stage_to_clients[lid]) + else: + stage_ids.append(lid) # keep invalid id for error reporting results: list[Any] = [] for stage_id in stage_ids: - if stage_id < 0 or stage_id >= self.num_stages: + if stage_id < 0 or stage_id >= self.num_clients: results.append( { "supported": False, "todo": True, - "error": f"Invalid stage id {stage_id}", + "error": f"Invalid client index {stage_id}", } ) continue @@ -817,10 +937,10 @@ def _shutdown_stages(self) -> None: return self._stages_shutdown = True - logger.info("[Orchestrator] Shutting down all stages") - for stage_id, stage_client in enumerate(self.stage_clients): + logger.info("[Orchestrator] Shutting down all %d client(s)", self.num_clients) + for ci, stage_client in enumerate(self.stage_clients): try: stage_client.shutdown() - logger.info(f"[Orchestrator] Stage {stage_id} shut down") + logger.info("[Orchestrator] Client %d shut down", ci) except Exception as e: - logger.warning(f"[Orchestrator] Failed to shutdown stage {stage_id}: {e}") + 
logger.warning("[Orchestrator] Failed to shutdown client %d: %s", ci, e) diff --git a/vllm_omni/engine/stage_engine_core_client.py b/vllm_omni/engine/stage_engine_core_client.py index 284cc2d31a2..395a9d84550 100644 --- a/vllm_omni/engine/stage_engine_core_client.py +++ b/vllm_omni/engine/stage_engine_core_client.py @@ -123,8 +123,16 @@ def process_engine_inputs( self, stage_list: list[Any], prompt: OmniTokensPrompt | list[OmniTokensPrompt] | None = None, + source_client_index: int | None = None, ) -> list[OmniTokensPrompt]: - """Process inputs from upstream stages.""" + """Process inputs from upstream stages. + + Args: + source_client_index: When multi-replica is enabled, specifies the + exact client index in *stage_list* that produced the upstream + output. Falls back to ``engine_input_source[0]`` for backward + compat. + """ from vllm_omni.inputs.data import OmniTokensPrompt if self.custom_process_input_func is not None: @@ -138,7 +146,7 @@ def process_engine_inputs( if not self.engine_input_source: raise ValueError(f"engine_input_source empty for stage {self.stage_id}") - source_id = self.engine_input_source[0] + source_id = source_client_index if source_client_index is not None else self.engine_input_source[0] source_outputs = stage_list[source_id].engine_outputs if not isinstance(prompt, list): diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 9c246ce6eb3..73c63255dcc 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -111,6 +111,11 @@ class StageMetadata: runtime_cfg: Any prompt_expand_func: Callable | None = None cfg_kv_collect_func: Callable | None = None + # Multi-replica fields: logical_stage_id is the original stage_id from + # the YAML config; replica_index distinguishes replicas of the same + # logical stage. For single-replica stages these default to stage_id / 0. 
+ logical_stage_id: int = -1 + replica_index: int = 0 @dataclass @@ -172,6 +177,7 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata: model_stage=None, runtime_cfg=runtime_cfg, cfg_kv_collect_func=cfg_kv_collect_func, + logical_stage_id=stage_id, ) model_stage = getattr(engine_args, "model_stage", None) @@ -193,6 +199,7 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata: model_stage=model_stage, runtime_cfg=runtime_cfg, prompt_expand_func=prompt_expand_func, + logical_stage_id=stage_id, ) From 0b774ca0d77002f2ffd9c546b330eb5a7d7b3454 Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Wed, 1 Apr 2026 12:18:57 +0800 Subject: [PATCH 02/76] feat: init multi engine-cores Signed-off-by: ZhengWG Made-with: Cursor --- vllm_omni/engine/async_omni_engine.py | 232 +++++++++++++++++--------- vllm_omni/engine/stage_init_utils.py | 45 +++++ 2 files changed, 194 insertions(+), 83 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index f7a5a5186a7..49b5bd5b161 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -9,6 +9,7 @@ import asyncio import concurrent.futures +import copy import dataclasses import json import os @@ -55,11 +56,13 @@ extract_stage_metadata, finalize_initialized_stages, get_stage_connector_spec, + get_stage_tp_size, initialize_diffusion_stage, load_omni_transfer_config_for_model, prepare_engine_environment, release_device_locks, setup_stage_devices, + split_devices_for_replicas, ) from vllm_omni.entrypoints.utils import ( load_and_resolve_stage_configs, @@ -438,24 +441,22 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: """Initialize stage clients/processors in orchestrator thread and assign to self. Multi-replica support: when a stage config contains - ``runtime.num_replicas > 1``, multiple clients are created for the same - logical stage and the flat ``stage_clients`` list grows accordingly. 
- ``logical_stage_to_clients`` maps each logical stage id to the list of - client indices that belong to it. + ``runtime.num_replicas > 1``, each replica launches its own EngineCore + process with a dedicated slice of devices. The flat ``stage_clients`` + list contains all replica clients; ``logical_stage_to_clients`` maps + each logical stage id to the list of client indices that belong to it. """ device_control_env = current_omni_platform.device_control_env_var num_stages = self.num_stages - # These are indexed by *logical* stage_id during initialization, then - # expanded to flat client-indexed lists at the end. - stage_clients: list[Any | None] = [None] * num_stages - output_processors: list[Any | None] = [None] * num_stages - stage_vllm_configs: list[Any | None] = [None] * num_stages input_processor: InputProcessor | None = None - llm_stage_ids: list[int] = [] - llm_launch_futures: dict[int, concurrent.futures.Future[StartedLlmStage]] = {} - started_llm_stages: dict[int, StartedLlmStage] = {} + # Keyed by (logical_stage_id, replica_idx) + llm_launch_keys: list[tuple[int, int]] = [] + llm_launch_futures: dict[tuple[int, int], concurrent.futures.Future[StartedLlmStage]] = {} + started_llm_stages: dict[tuple[int, int], StartedLlmStage] = {} llm_stage_launch_lock = threading.Lock() + # Diffusion stages (no multi-replica support yet) + diffusion_clients: dict[int, Any] = {} # Track per-logical-stage replica count from config replicas_per_stage: list[int] = [] @@ -466,22 +467,47 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: ) replicas_per_stage.append(max(1, num_replicas)) + # Pre-compute per-replica device assignments for multi-replica stages + replica_devices_map: dict[tuple[int, int], str] = {} + for logical_id, stage_cfg in enumerate(self.stage_configs): + num_replicas = replicas_per_stage[logical_id] + if num_replicas <= 1: + continue + runtime_cfg = getattr(stage_cfg, "runtime", {}) + devices_str = ( + runtime_cfg.get("devices") if 
hasattr(runtime_cfg, "get") + else getattr(runtime_cfg, "devices", None) + ) + tp_size = get_stage_tp_size(stage_cfg) + per_replica = split_devices_for_replicas(devices_str, num_replicas, tp_size, logical_id) + for r, dev_str in enumerate(per_replica): + replica_devices_map[(logical_id, r)] = dev_str + logger.info( + "[AsyncOmniEngine] Stage %s: %d replicas, tp=%d, devices split: %s", + logical_id, num_replicas, tp_size, per_replica, + ) + async_chunk = self.async_chunk prompt_expand_func = None - llm_stage_count = sum( - 1 for stage_cfg in self.stage_configs if getattr(stage_cfg, "stage_type", "llm") != "diffusion" + total_llm_replicas = sum( + replicas_per_stage[i] + for i, cfg in enumerate(self.stage_configs) + if getattr(cfg, "stage_type", "llm") != "diffusion" ) prepare_engine_environment() omni_transfer_config = load_omni_transfer_config_for_model(self.model, self.config_path) + # Initialized outside try so error handler can always access them + flat_clients: list[Any] = [] + all_clients: dict[tuple[int, int], Any] = {} + try: with concurrent.futures.ThreadPoolExecutor( - max_workers=max(1, llm_stage_count), + max_workers=max(1, total_llm_replicas), thread_name_prefix="llm-stage-launch", ) as launch_executor: for stage_id, stage_cfg in enumerate(self.stage_configs): - logger.info("[AsyncOmniEngine] Initializing stage %s", stage_id) metadata = extract_stage_metadata(stage_cfg) if metadata.prompt_expand_func is not None: prompt_expand_func = metadata.prompt_expand_func @@ -505,7 +531,7 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: inject_omni_kv_config(stage_cfg, omni_conn_cfg, omni_from, omni_to) _inject_kv_stage_info(stage_cfg, stage_id) - stage_clients[stage_id] = initialize_diffusion_stage( + diffusion_clients[stage_id] = initialize_diffusion_stage( self.model, stage_cfg, metadata, @@ -523,89 +549,129 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: 
current_omni_platform.set_device_control_env_var(previous_visible_devices) continue - llm_stage_ids.append(stage_id) - llm_launch_futures[stage_id] = launch_executor.submit( - self._launch_llm_stage, - stage_cfg, - metadata, - stage_connector_spec, - stage_init_timeout, - llm_stage_launch_lock, - omni_kv_connector, - ) + # Submit one launch future per replica + num_replicas = replicas_per_stage[stage_id] + for replica_idx in range(num_replicas): + key = (stage_id, replica_idx) + llm_launch_keys.append(key) + + # For replica > 0, deep-copy stage_cfg and override devices + if replica_idx > 0: + replica_cfg = copy.deepcopy(stage_cfg) + else: + replica_cfg = stage_cfg + + if key in replica_devices_map: + replica_cfg.runtime.devices = replica_devices_map[key] + + replica_metadata = extract_stage_metadata(replica_cfg) + replica_metadata.replica_index = replica_idx + + logger.info( + "[AsyncOmniEngine] Launching stage %s replica %s (devices=%s)", + stage_id, replica_idx, + getattr(getattr(replica_cfg, "runtime", None), "devices", "default"), + ) + + llm_launch_futures[key] = launch_executor.submit( + self._launch_llm_stage, + replica_cfg, + replica_metadata, + stage_connector_spec, + stage_init_timeout, + llm_stage_launch_lock, + omni_kv_connector, + ) concurrent.futures.wait(list(llm_launch_futures.values())) - for stage_id in llm_stage_ids: - started_llm_stages[stage_id] = llm_launch_futures[stage_id].result() + for key in llm_launch_keys: + started_llm_stages[key] = llm_launch_futures[key].result() + + # ---- Build flat client lists directly ---- + # Attach each launched replica and build the flat index structures. 
+ flat_output_processors: list[Any] = [] + flat_vllm_configs: list[Any] = [] + logical_stage_to_clients: list[list[int]] = [] + + # Per-logical-stage lists (not per-client) + logical_stage_clients_for_finalize: list[Any | None] = [None] * num_stages + all_output_processors: dict[tuple[int, int], Any] = {} + all_vllm_configs: dict[tuple[int, int], Any] = {} + + for key in llm_launch_keys: + stage_id, replica_idx = key + started = started_llm_stages[key] + client, output_proc, vllm_cfg, stage0_inp = self._attach_llm_stage(started) + all_clients[key] = client + all_output_processors[key] = output_proc + all_vllm_configs[key] = vllm_cfg + if stage0_inp is not None: + input_processor = stage0_inp + # Use first replica for finalize_initialized_stages + if replica_idx == 0: + logical_stage_clients_for_finalize[stage_id] = client - for stage_id in llm_stage_ids: - started = started_llm_stages[stage_id] - stage_client, output_processor, vllm_config, stage0_input_processor = self._attach_llm_stage(started) - stage_clients[stage_id] = stage_client - output_processors[stage_id] = output_processor - stage_vllm_configs[stage_id] = vllm_config - if stage0_input_processor is not None: - input_processor = stage0_input_processor + # Place diffusion clients into the logical list + for stage_id, diff_client in diffusion_clients.items(): + logical_stage_clients_for_finalize[stage_id] = diff_client initialized_stage_clients, default_sampling_params_list, stage_metadata = finalize_initialized_stages( - stage_clients, + logical_stage_clients_for_finalize, input_processor, ) + + # Now build flat lists in logical-stage order, replicas within + logical_default_sampling_params: list[Any] = [] + logical_stage_metadata: list[dict[str, Any]] = [] + + for logical_id in range(num_stages): + num_replicas = replicas_per_stage[logical_id] + client_indices: list[int] = [] + + if logical_id in diffusion_clients: + # Diffusion: single client, no multi-replica + ci = len(flat_clients) + 
client_indices.append(ci) + flat_clients.append(diffusion_clients[logical_id]) + flat_output_processors.append(None) + flat_vllm_configs.append(None) + else: + for replica_idx in range(num_replicas): + key = (logical_id, replica_idx) + ci = len(flat_clients) + client_indices.append(ci) + flat_clients.append(all_clients[key]) + flat_output_processors.append(all_output_processors[key]) + flat_vllm_configs.append(all_vllm_configs[key]) + if num_replicas > 1: + logger.info( + "[AsyncOmniEngine] Logical stage %s replica %s → client %s (isolated)", + logical_id, replica_idx, ci, + ) + + logical_stage_to_clients.append(client_indices) + logical_default_sampling_params.append(default_sampling_params_list[logical_id]) + logical_stage_metadata.append(stage_metadata[logical_id]) + except Exception: - for stage_id, future in llm_launch_futures.items(): + for key, future in llm_launch_futures.items(): if not future.done() or future.cancelled() or future.exception() is not None: continue - started_llm_stages.setdefault(stage_id, future.result()) + started_llm_stages.setdefault(key, future.result()) + # Collect all initialized clients for cleanup + cleanup_clients: list[Any] = list(diffusion_clients.values()) + list(all_clients.values()) + cleanup_clients = [c for c in cleanup_clients if c is not None] logger.exception( - "[AsyncOmniEngine] Stage initialization failed; shutting down %s initialized stage(s)", - len([stage_client for stage_client in stage_clients if stage_client is not None]), + "[AsyncOmniEngine] Stage initialization failed; shutting down %s initialized client(s)", + len(cleanup_clients), ) cleanup_failed_stage_initialization( - stage_clients, - [started_llm_stages[stage_id] for stage_id in llm_stage_ids if stage_id in started_llm_stages], + cleanup_clients, + list(started_llm_stages.values()), ) raise - # ---- Multi-replica expansion ---- - # Expand the logical-indexed lists into flat client-indexed lists. 
- # For replica_index > 0 the same client/processor/config is shared - # (they point to the same underlying EngineCore) — full per-replica - # process isolation is out-of-scope for this first iteration. - flat_clients: list[Any] = [] - flat_output_processors: list[Any] = [] - flat_vllm_configs: list[Any] = [] - logical_stage_to_clients: list[list[int]] = [] - # sampling_params and stage_metadata are per-logical-stage - logical_default_sampling_params: list[Any] = [] - logical_stage_metadata: list[dict[str, Any]] = [] - - for logical_id, client in enumerate(initialized_stage_clients): - num_replicas = replicas_per_stage[logical_id] - client_indices: list[int] = [] - for replica_idx in range(num_replicas): - ci = len(flat_clients) - client_indices.append(ci) - if replica_idx == 0: - # First replica uses the already-created objects - flat_clients.append(client) - flat_output_processors.append(output_processors[logical_id]) - flat_vllm_configs.append(stage_vllm_configs[logical_id]) - else: - # Additional replicas: for now, share the same client. - # True per-replica process isolation will be added later. 
- # TODO: launch separate EngineCore processes for replica_idx > 0 - flat_clients.append(client) - flat_output_processors.append(output_processors[logical_id]) - flat_vllm_configs.append(stage_vllm_configs[logical_id]) - logger.info( - "[AsyncOmniEngine] Logical stage %s replica %s → client %s (shared)", - logical_id, replica_idx, ci, - ) - logical_stage_to_clients.append(client_indices) - logical_default_sampling_params.append(default_sampling_params_list[logical_id]) - logical_stage_metadata.append(stage_metadata[logical_id]) - self.stage_clients = flat_clients self.output_processors = flat_output_processors self.stage_vllm_configs = flat_vllm_configs diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 73c63255dcc..1f360d85aa8 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -218,6 +218,51 @@ def prepare_engine_environment() -> None: pass +def split_devices_for_replicas( + devices_str: str | None, + num_replicas: int, + tp_size: int, + stage_id: int, +) -> list[str]: + """Split a devices string into per-replica subsets. + + When ``num_replicas`` is 1, returns ``[devices_str]`` unchanged. + Otherwise, the total number of device IDs must equal + ``num_replicas * tp_size``; each replica gets ``tp_size`` consecutive + device IDs. 
+ + Example:: + + split_devices_for_replicas("1,2,3,4", num_replicas=2, tp_size=2, stage_id=1) + # → ["1,2", "3,4"] + """ + if num_replicas <= 1 or devices_str is None: + return [devices_str] if devices_str is not None else [devices_str] + + device_list = [d.strip() for d in devices_str.split(",") if d.strip()] + required = num_replicas * tp_size + if len(device_list) != required: + raise ValueError( + f"Stage {stage_id}: num_replicas={num_replicas}, " + f"tensor_parallel_size={tp_size} requires " + f"{required} devices, got {len(device_list)}: {devices_str}" + ) + + result: list[str] = [] + for r in range(num_replicas): + chunk = device_list[r * tp_size : (r + 1) * tp_size] + result.append(",".join(chunk)) + return result + + +def get_stage_tp_size(stage_cfg: Any) -> int: + """Extract tensor_parallel_size from a stage config object.""" + engine_args = getattr(stage_cfg, "engine_args", {}) + if hasattr(engine_args, "get"): + return int(engine_args.get("tensor_parallel_size", 1) or 1) + return int(getattr(engine_args, "tensor_parallel_size", 1) or 1) + + def setup_stage_devices(stage_id: int, runtime_cfg: Any) -> None: """Device mapping via set_stage_devices for a single stage.""" physical_devices = set_stage_devices( From 4786f5eafdcbd20bb466626c9037d53d57f27ed3 Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Wed, 1 Apr 2026 14:30:46 +0800 Subject: [PATCH 03/76] fix lint Signed-off-by: ZhengWG Made-with: Cursor --- vllm_omni/engine/async_omni_engine.py | 19 +++++++++++++------ vllm_omni/engine/orchestrator.py | 16 +++++++++++++--- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 49b5bd5b161..c875fca82ac 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -463,7 +463,9 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: for stage_cfg in self.stage_configs: runtime_cfg = getattr(stage_cfg, "runtime", {}) 
num_replicas = int( - runtime_cfg.get("num_replicas", 1) if hasattr(runtime_cfg, "get") else getattr(runtime_cfg, "num_replicas", 1) + runtime_cfg.get("num_replicas", 1) + if hasattr(runtime_cfg, "get") + else getattr(runtime_cfg, "num_replicas", 1) ) replicas_per_stage.append(max(1, num_replicas)) @@ -475,8 +477,7 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: continue runtime_cfg = getattr(stage_cfg, "runtime", {}) devices_str = ( - runtime_cfg.get("devices") if hasattr(runtime_cfg, "get") - else getattr(runtime_cfg, "devices", None) + runtime_cfg.get("devices") if hasattr(runtime_cfg, "get") else getattr(runtime_cfg, "devices", None) ) tp_size = get_stage_tp_size(stage_cfg) per_replica = split_devices_for_replicas(devices_str, num_replicas, tp_size, logical_id) @@ -484,7 +485,10 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: replica_devices_map[(logical_id, r)] = dev_str logger.info( "[AsyncOmniEngine] Stage %s: %d replicas, tp=%d, devices split: %s", - logical_id, num_replicas, tp_size, per_replica, + logical_id, + num_replicas, + tp_size, + per_replica, ) async_chunk = self.async_chunk @@ -569,7 +573,8 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: logger.info( "[AsyncOmniEngine] Launching stage %s replica %s (devices=%s)", - stage_id, replica_idx, + stage_id, + replica_idx, getattr(getattr(replica_cfg, "runtime", None), "devices", "default"), ) @@ -647,7 +652,9 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: if num_replicas > 1: logger.info( "[AsyncOmniEngine] Logical stage %s replica %s → client %s (isolated)", - logical_id, replica_idx, ci, + logical_id, + replica_idx, + ci, ) logical_stage_to_clients.append(client_indices) diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index 3106b876809..9c44f8ab605 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -302,7 +302,10 @@ async def _orchestration_loop(self) -> None: 
+ client_index, output.request_id, [output], req_state ) await self._route_output( - logical_stage_id, output, req_state, stage_metrics, + logical_stage_id, + output, + req_state, + stage_metrics, client_index=client_index, ) continue @@ -353,7 +356,10 @@ async def _orchestration_loop(self) -> None: req_state, ) await self._route_output( - logical_stage_id, output, req_state, stage_metrics, + logical_stage_id, + output, + req_state, + stage_metrics, client_index=client_index, ) @@ -448,7 +454,11 @@ async def _route_output( ) else: await self._forward_to_next_stage( - req_id, stage_id, output, req_state, client_index=client_index, + req_id, + stage_id, + output, + req_state, + client_index=client_index, ) if finished and stage_id == req_state.final_stage_id: From 11976f10a51291aeb0801f7de81501ea9f5cd582 Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Thu, 2 Apr 2026 16:13:42 +0800 Subject: [PATCH 04/76] refactor: keep name consistency Signed-off-by: ZhengWG Made-with: Cursor --- vllm_omni/engine/async_omni_engine.py | 112 +++++---- vllm_omni/engine/orchestrator.py | 229 ++++++++++--------- vllm_omni/engine/stage_engine_core_client.py | 14 +- 3 files changed, 196 insertions(+), 159 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index c875fca82ac..d0dca006324 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -450,10 +450,10 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: num_stages = self.num_stages input_processor: InputProcessor | None = None - # Keyed by (logical_stage_id, replica_idx) - llm_launch_keys: list[tuple[int, int]] = [] - llm_launch_futures: dict[tuple[int, int], concurrent.futures.Future[StartedLlmStage]] = {} - started_llm_stages: dict[tuple[int, int], StartedLlmStage] = {} + # Per-stage launch futures and results: stage_id → [replicas] + llm_stage_ids: list[int] = [] + llm_launch_futures: dict[int, 
list[concurrent.futures.Future[StartedLlmStage]]] = {} + started_llm_stages: dict[int, list[StartedLlmStage]] = {} llm_stage_launch_lock = threading.Lock() # Diffusion stages (no multi-replica support yet) diffusion_clients: dict[int, Any] = {} @@ -470,7 +470,8 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: replicas_per_stage.append(max(1, num_replicas)) # Pre-compute per-replica device assignments for multi-replica stages - replica_devices_map: dict[tuple[int, int], str] = {} + # stage_id → [devices_str_per_replica] + replica_devices_map: dict[int, list[str]] = {} for logical_id, stage_cfg in enumerate(self.stage_configs): num_replicas = replicas_per_stage[logical_id] if num_replicas <= 1: @@ -480,15 +481,15 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: runtime_cfg.get("devices") if hasattr(runtime_cfg, "get") else getattr(runtime_cfg, "devices", None) ) tp_size = get_stage_tp_size(stage_cfg) - per_replica = split_devices_for_replicas(devices_str, num_replicas, tp_size, logical_id) - for r, dev_str in enumerate(per_replica): - replica_devices_map[(logical_id, r)] = dev_str + replica_devices_map[logical_id] = split_devices_for_replicas( + devices_str, num_replicas, tp_size, logical_id, + ) logger.info( "[AsyncOmniEngine] Stage %s: %d replicas, tp=%d, devices split: %s", logical_id, num_replicas, tp_size, - per_replica, + replica_devices_map[logical_id], ) async_chunk = self.async_chunk @@ -504,7 +505,8 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: # Initialized outside try so error handler can always access them flat_clients: list[Any] = [] - all_clients: dict[tuple[int, int], Any] = {} + # stage_id → [client_per_replica] + all_clients: dict[int, list[Any]] = {} try: with concurrent.futures.ThreadPoolExecutor( @@ -554,19 +556,19 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: continue # Submit one launch future per replica + llm_stage_ids.append(stage_id) num_replicas = 
replicas_per_stage[stage_id] - for replica_idx in range(num_replicas): - key = (stage_id, replica_idx) - llm_launch_keys.append(key) + stage_futures: list[concurrent.futures.Future[StartedLlmStage]] = [] + for replica_idx in range(num_replicas): # For replica > 0, deep-copy stage_cfg and override devices if replica_idx > 0: replica_cfg = copy.deepcopy(stage_cfg) else: replica_cfg = stage_cfg - if key in replica_devices_map: - replica_cfg.runtime.devices = replica_devices_map[key] + if stage_id in replica_devices_map: + replica_cfg.runtime.devices = replica_devices_map[stage_id][replica_idx] replica_metadata = extract_stage_metadata(replica_cfg) replica_metadata.replica_index = replica_idx @@ -578,7 +580,7 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: getattr(getattr(replica_cfg, "runtime", None), "devices", "default"), ) - llm_launch_futures[key] = launch_executor.submit( + stage_futures.append(launch_executor.submit( self._launch_llm_stage, replica_cfg, replica_metadata, @@ -586,12 +588,18 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: stage_init_timeout, llm_stage_launch_lock, omni_kv_connector, - ) + )) - concurrent.futures.wait(list(llm_launch_futures.values())) + llm_launch_futures[stage_id] = stage_futures - for key in llm_launch_keys: - started_llm_stages[key] = llm_launch_futures[key].result() + # Wait for all futures across all stages + all_futures = [f for futures in llm_launch_futures.values() for f in futures] + concurrent.futures.wait(all_futures) + + for stage_id in llm_stage_ids: + started_llm_stages[stage_id] = [ + f.result() for f in llm_launch_futures[stage_id] + ] # ---- Build flat client lists directly ---- # Attach each launched replica and build the flat index structures. 
@@ -601,21 +609,27 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: # Per-logical-stage lists (not per-client) logical_stage_clients_for_finalize: list[Any | None] = [None] * num_stages - all_output_processors: dict[tuple[int, int], Any] = {} - all_vllm_configs: dict[tuple[int, int], Any] = {} - - for key in llm_launch_keys: - stage_id, replica_idx = key - started = started_llm_stages[key] - client, output_proc, vllm_cfg, stage0_inp = self._attach_llm_stage(started) - all_clients[key] = client - all_output_processors[key] = output_proc - all_vllm_configs[key] = vllm_cfg - if stage0_inp is not None: - input_processor = stage0_inp + all_output_processors: dict[int, list[Any]] = {} + all_vllm_configs: dict[int, list[Any]] = {} + + for stage_id in llm_stage_ids: + stage_clients_list: list[Any] = [] + stage_output_procs: list[Any] = [] + stage_vllm_cfgs: list[Any] = [] + + for replica_idx, started in enumerate(started_llm_stages[stage_id]): + client, output_proc, vllm_cfg, stage0_inp = self._attach_llm_stage(started) + stage_clients_list.append(client) + stage_output_procs.append(output_proc) + stage_vllm_cfgs.append(vllm_cfg) + if stage0_inp is not None: + input_processor = stage0_inp + + all_clients[stage_id] = stage_clients_list + all_output_processors[stage_id] = stage_output_procs + all_vllm_configs[stage_id] = stage_vllm_cfgs # Use first replica for finalize_initialized_stages - if replica_idx == 0: - logical_stage_clients_for_finalize[stage_id] = client + logical_stage_clients_for_finalize[stage_id] = stage_clients_list[0] # Place diffusion clients into the logical list for stage_id, diff_client in diffusion_clients.items(): @@ -643,15 +657,14 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: flat_vllm_configs.append(None) else: for replica_idx in range(num_replicas): - key = (logical_id, replica_idx) ci = len(flat_clients) client_indices.append(ci) - flat_clients.append(all_clients[key]) - 
flat_output_processors.append(all_output_processors[key]) - flat_vllm_configs.append(all_vllm_configs[key]) + flat_clients.append(all_clients[logical_id][replica_idx]) + flat_output_processors.append(all_output_processors[logical_id][replica_idx]) + flat_vllm_configs.append(all_vllm_configs[logical_id][replica_idx]) if num_replicas > 1: logger.info( - "[AsyncOmniEngine] Logical stage %s replica %s → client %s (isolated)", + "[AsyncOmniEngine] Stage %s replica %s → client %s (isolated)", logical_id, replica_idx, ci, @@ -662,21 +675,22 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: logical_stage_metadata.append(stage_metadata[logical_id]) except Exception: - for key, future in llm_launch_futures.items(): - if not future.done() or future.cancelled() or future.exception() is not None: - continue - started_llm_stages.setdefault(key, future.result()) + for stage_id, futures in llm_launch_futures.items(): + for f in futures: + if not f.done() or f.cancelled() or f.exception() is not None: + continue + started_llm_stages.setdefault(stage_id, []).append(f.result()) # Collect all initialized clients for cleanup - cleanup_clients: list[Any] = list(diffusion_clients.values()) + list(all_clients.values()) + cleanup_clients: list[Any] = list(diffusion_clients.values()) + for clients in all_clients.values(): + cleanup_clients.extend(clients) cleanup_clients = [c for c in cleanup_clients if c is not None] + all_started = [s for stages in started_llm_stages.values() for s in stages] logger.exception( "[AsyncOmniEngine] Stage initialization failed; shutting down %s initialized client(s)", len(cleanup_clients), ) - cleanup_failed_stage_initialization( - cleanup_clients, - list(started_llm_stages.values()), - ) + cleanup_failed_stage_initialization(cleanup_clients, all_started) raise self.stage_clients = flat_clients @@ -725,7 +739,7 @@ async def _run_orchestrator() -> None: stage_clients=self.stage_clients, output_processors=self.output_processors, 
stage_vllm_configs=self.stage_vllm_configs, - logical_stage_to_clients=getattr(self, "logical_stage_to_clients", None), + logical_stage_to_clients=self.logical_stage_to_clients, ) if not startup_future.done(): startup_future.set_result(asyncio.get_running_loop()) diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index 9c44f8ab605..7e5e75a6ae2 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -148,11 +148,13 @@ def __init__( self.logical_stage_to_clients = [[i] for i in range(self.num_clients)] self.num_logical_stages = len(self.logical_stage_to_clients) - # Reverse mapping: client_index -> logical_stage_id + # Reverse mappings: client_index -> (logical_stage_id, replica_index) self._client_to_logical: list[int] = [0] * self.num_clients + self._client_to_replica: list[int] = [0] * self.num_clients for logical_id, client_indices in enumerate(self.logical_stage_to_clients): - for ci in client_indices: + for ri, ci in enumerate(client_indices): self._client_to_logical[ci] = logical_id + self._client_to_replica[ci] = ri # Round-robin counters for replica selection per logical stage self._replica_rr: list[int] = [0] * self.num_logical_stages @@ -205,6 +207,10 @@ def _choose_client_index( req_state.chosen_client_index[logical_stage_id] = chosen return chosen + def _resolve_client_index(self, stage_id: int, replica_index: int = 0) -> int: + """Resolve (stage_id, replica_index) to a flat client index.""" + return self.logical_stage_to_clients[stage_id][replica_index] + async def run(self) -> None: """Main entry point for the Orchestrator event loop.""" logger.info("[Orchestrator] Starting event loop") @@ -279,90 +285,92 @@ async def _orchestration_loop(self) -> None: Control flow: poll raw → process through output processor → route. - Multi-replica: iterates over every *client_index* (not logical stage), - and resolves the logical_stage_id from client metadata for routing. 
+ Multi-replica: iterates over every (stage_id, replica_index) pair, + resolves to a flat client_index internally for resource access. """ while not self._shutdown_event.is_set(): idle = True - for client_index in range(self.num_clients): - if self._shutdown_event.is_set(): - return - - logical_stage_id = self._client_to_logical[client_index] - - # 1) Diffusion stage: poll non-blocking queue - stage_client = self.stage_clients[client_index] - if stage_client.stage_type == "diffusion": - output = stage_client.get_diffusion_output_async() - if output is not None: - idle = False - req_state = self.request_states.get(output.request_id) - if req_state is not None: - stage_metrics = self._build_stage_metrics( - client_index, output.request_id, [output], req_state - ) - await self._route_output( - logical_stage_id, - output, - req_state, - stage_metrics, - client_index=client_index, - ) - continue - - # 1) Poll raw outputs from the client - try: - raw_outputs = await asyncio.wait_for(self._poll_stage_raw(client_index), timeout=0.001) - except asyncio.TimeoutError: - continue - except asyncio.CancelledError: - raise - except Exception: + for stage_id in range(self.num_logical_stages): + for replica_index in range(len(self.logical_stage_to_clients[stage_id])): if self._shutdown_event.is_set(): return - logger.exception( - "[Orchestrator] _poll_stage_raw failed for client-%s (logical stage-%s)", - client_index, - logical_stage_id, - ) - raise - - if raw_outputs is None: - continue - idle = False - - # 2) Process raw outputs through the output processor - request_outputs = await self._process_stage_outputs(client_index, raw_outputs) - - # 3) Route each processed output - for output in request_outputs: - req_state = self.request_states.get(output.request_id) - if req_state is None: - logger.warning( - "[Orchestrator] Dropping output for unknown req %s at client-%s " - "(logical stage-%s, known reqs: %s)", - output.request_id, - client_index, - logical_stage_id, - 
list(self.request_states.keys()), + + client_index = self._resolve_client_index(stage_id, replica_index) + + # 1) Diffusion stage: poll non-blocking queue + # TODO (Peiqi): the output of diffusion stage is OmniRequestOutput, + # which is different from EngineCoreOutputs (LLM stages). We may want to unify + # the output format in the future to simplify the processing logic in Orchestrator. + stage_client = self.stage_clients[client_index] + if stage_client.stage_type == "diffusion": + output = stage_client.get_diffusion_output_async() + if output is not None: + idle = False + req_state = self.request_states.get(output.request_id) + if req_state is not None: + stage_metrics = self._build_stage_metrics( + stage_id, output.request_id, [output], req_state, + replica_index=replica_index, + ) + await self._route_output( + stage_id, output, req_state, stage_metrics, + replica_index=replica_index, + ) + continue + + # 1) Poll raw outputs from the stage replica + try: + raw_outputs = await asyncio.wait_for( + self._poll_stage_raw(stage_id, replica_index=replica_index), + timeout=0.001, ) + except asyncio.TimeoutError: continue - stage_metrics = None - if output.finished: - stage_metrics = self._build_stage_metrics( - client_index, - output.request_id, - [output], - req_state, + except asyncio.CancelledError: + raise + except Exception: + if self._shutdown_event.is_set(): + return + logger.exception( + "[Orchestrator] _poll_stage_raw failed for stage-%s replica-%s", + stage_id, + replica_index, ) - await self._route_output( - logical_stage_id, - output, - req_state, - stage_metrics, - client_index=client_index, + raise + + if raw_outputs is None: + continue + idle = False + + # 2) Process raw outputs through the output processor + request_outputs = await self._process_stage_outputs( + stage_id, raw_outputs, replica_index=replica_index, ) + # 3) Route each processed output + for output in request_outputs: + req_state = self.request_states.get(output.request_id) + if req_state 
is None: + logger.warning( + "[Orchestrator] Dropping output for unknown req %s " + "at stage-%s replica-%s (known reqs: %s)", + output.request_id, + stage_id, + replica_index, + list(self.request_states.keys()), + ) + continue + stage_metrics = None + if output.finished: + stage_metrics = self._build_stage_metrics( + stage_id, output.request_id, [output], req_state, + replica_index=replica_index, + ) + await self._route_output( + stage_id, output, req_state, stage_metrics, + replica_index=replica_index, + ) + if idle: await asyncio.sleep(0.001) else: @@ -375,17 +383,15 @@ async def _route_output( req_state: OrchestratorRequestState, stage_metrics: Any, *, - client_index: int | None = None, + replica_index: int = 0, ) -> None: """Route a processed output: send to main thread and/or forward to next stage. Args: stage_id: Logical stage id. - client_index: Physical client index that produced this output. - Defaults to stage_id for backward compat. + replica_index: Replica index within the logical stage. 
""" - if client_index is None: - client_index = stage_id + client_index = self._resolve_client_index(stage_id, replica_index) req_id = output.request_id finished = output.finished submit_ts = req_state.stage_submit_ts.get(stage_id) @@ -412,7 +418,7 @@ async def _route_output( deferred["stage_id"], deferred["output"], parent_state, - client_index=deferred.get("client_index", deferred["stage_id"]), + replica_index=deferred.get("replica_index", 0), ) self.request_states.pop(req_id, None) return @@ -446,7 +452,7 @@ async def _route_output( self._deferred_parents[req_id] = { "stage_id": stage_id, "output": output, - "client_index": client_index, + "replica_index": replica_index, } logger.debug( "[Orchestrator] Parent %s deferred, waiting for CFG companions", @@ -458,7 +464,7 @@ async def _route_output( stage_id, output, req_state, - client_index=client_index, + replica_index=replica_index, ) if finished and stage_id == req_state.final_stage_id: @@ -484,24 +490,26 @@ def _all_companions_done(self, parent_id: str) -> bool: def _build_stage_metrics( self, - client_index: int, + stage_id: int, req_id: str, request_outputs: list[RequestOutput], req_state: OrchestratorRequestState, + *, + replica_index: int = 0, ) -> StageRequestMetrics: - """Build StageRequestMetrics for a finished request at a client. + """Build StageRequestMetrics for a finished request at a stage replica. Reuses StageRequestMetrics so OrchestratorMetrics and downstream metric handlers can consume a stable schema. 
""" - logical_stage_id = self._client_to_logical[client_index] + client_index = self._resolve_client_index(stage_id, replica_index) now = _time.time() - submit_ts = req_state.stage_submit_ts.get(logical_stage_id, now) + submit_ts = req_state.stage_submit_ts.get(stage_id, now) stage_gen_time_ms = (now - submit_ts) * 1000.0 num_tokens_out = count_tokens_from_outputs(request_outputs) num_tokens_in = 0 - if logical_stage_id == 0: + if stage_id == 0: for ro in request_outputs: ptids = getattr(ro, "prompt_token_ids", None) if ptids is not None: @@ -537,7 +545,7 @@ async def _forward_to_next_stage( output: Any, req_state: OrchestratorRequestState, *, - client_index: int | None = None, + replica_index: int = 0, ) -> None: """Forward output from current stage to the next stage. @@ -546,10 +554,9 @@ async def _forward_to_next_stage( Args: stage_id: Logical stage id that produced the output. - client_index: Physical client index that produced the output. + replica_index: Replica index of the stage that produced the output. 
""" - if client_index is None: - client_index = stage_id + client_index = self._resolve_client_index(stage_id, replica_index) next_logical = stage_id + 1 next_ci = self._choose_client_index(next_logical, req_state) @@ -604,14 +611,14 @@ async def _forward_to_next_stage( next_inputs = next_client.process_engine_inputs( stage_list=self.stage_clients, prompt=req_state.prompt, - source_client_index=client_index, + source_client=self.stage_clients[client_index], ) except Exception: logger.exception( - "[Orchestrator] req=%s process_engine_inputs FAILED for logical stage-%s (client-%s)", + "[Orchestrator] req=%s process_engine_inputs FAILED for stage-%s replica-%s", req_id, next_logical, - next_ci, + self._client_to_replica[next_ci], ) raise @@ -640,22 +647,28 @@ async def _forward_to_next_stage( # Record submit timestamp for the next logical stage req_state.stage_submit_ts[next_logical] = _time.time() - async def _poll_stage_raw(self, client_index: int) -> EngineCoreOutputs | None: - """Pull raw EngineCoreOutputs from a stage client without processing. + async def _poll_stage_raw( + self, stage_id: int, *, replica_index: int = 0, + ) -> EngineCoreOutputs | None: + """Pull raw EngineCoreOutputs from a stage replica without processing. Returns the raw outputs object, or None when there is nothing to consume. """ + client_index = self._resolve_client_index(stage_id, replica_index) outputs = await self.stage_clients[client_index].get_output_async() if not outputs.outputs: return None return outputs - async def _process_stage_outputs(self, client_index: int, raw_outputs: EngineCoreOutputs) -> list[RequestOutput]: + async def _process_stage_outputs( + self, stage_id: int, raw_outputs: EngineCoreOutputs, *, replica_index: int = 0, + ) -> list[RequestOutput]: """Run the output processor on raw outputs, returning RequestOutputs. Also handles abort forwarding and scheduler stats updates. 
""" + client_index = self._resolve_client_index(stage_id, replica_index) processor = self.output_processors[client_index] processed = processor.process_outputs( @@ -837,11 +850,12 @@ async def _handle_add_companion(self, msg: dict[str, Any]) -> None: await stage_client.add_request_async(request) logger.info( - "[Orchestrator] CFG companion submitted: %s (role=%s, parent=%s, client=%s)", + "[Orchestrator] CFG companion submitted: %s (role=%s, parent=%s, " + "stage-0 replica-%s)", companion_id, role, parent_id, - client_index, + self._client_to_replica[client_index], ) async def _handle_abort(self, msg: dict[str, Any]) -> None: @@ -951,6 +965,15 @@ def _shutdown_stages(self) -> None: for ci, stage_client in enumerate(self.stage_clients): try: stage_client.shutdown() - logger.info("[Orchestrator] Client %d shut down", ci) + logger.info( + "[Orchestrator] Stage %d replica %d shut down", + self._client_to_logical[ci], + self._client_to_replica[ci], + ) except Exception as e: - logger.warning("[Orchestrator] Failed to shutdown client %d: %s", ci, e) + logger.warning( + "[Orchestrator] Failed to shutdown stage %d replica %d: %s", + self._client_to_logical[ci], + self._client_to_replica[ci], + e, + ) diff --git a/vllm_omni/engine/stage_engine_core_client.py b/vllm_omni/engine/stage_engine_core_client.py index 395a9d84550..dd8c1b4fd55 100644 --- a/vllm_omni/engine/stage_engine_core_client.py +++ b/vllm_omni/engine/stage_engine_core_client.py @@ -123,15 +123,14 @@ def process_engine_inputs( self, stage_list: list[Any], prompt: OmniTokensPrompt | list[OmniTokensPrompt] | None = None, - source_client_index: int | None = None, + source_client: Any | None = None, ) -> list[OmniTokensPrompt]: """Process inputs from upstream stages. Args: - source_client_index: When multi-replica is enabled, specifies the - exact client index in *stage_list* that produced the upstream - output. Falls back to ``engine_input_source[0]`` for backward - compat. 
+ source_client: When multi-replica is enabled, the upstream client + object that produced the output. Falls back to + ``stage_list[engine_input_source[0]]`` for backward compat. """ from vllm_omni.inputs.data import OmniTokensPrompt @@ -146,8 +145,9 @@ def process_engine_inputs( if not self.engine_input_source: raise ValueError(f"engine_input_source empty for stage {self.stage_id}") - source_id = source_client_index if source_client_index is not None else self.engine_input_source[0] - source_outputs = stage_list[source_id].engine_outputs + if source_client is None: + source_client = stage_list[self.engine_input_source[0]] + source_outputs = source_client.engine_outputs if not isinstance(prompt, list): prompt = [prompt] From eaa9dfdab2df94f016166350a34613c9d3ebc3ed Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Sun, 5 Apr 2026 00:41:59 +0800 Subject: [PATCH 05/76] fix lint Signed-off-by: ZhengWG --- vllm_omni/engine/async_omni_engine.py | 29 +++++++++--------- vllm_omni/engine/orchestrator.py | 42 ++++++++++++++++++++------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index b7177ebe10f..050c635de23 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -482,7 +482,10 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: ) tp_size = get_stage_tp_size(stage_cfg) replica_devices_map[logical_id] = split_devices_for_replicas( - devices_str, num_replicas, tp_size, logical_id, + devices_str, + num_replicas, + tp_size, + logical_id, ) logger.info( "[AsyncOmniEngine] Stage %s: %d replicas, tp=%d, devices split: %s", @@ -580,15 +583,17 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: getattr(getattr(replica_cfg, "runtime", None), "devices", "default"), ) - stage_futures.append(launch_executor.submit( - self._launch_llm_stage, - replica_cfg, - replica_metadata, - stage_connector_spec, - stage_init_timeout, - 
llm_stage_launch_lock, - omni_kv_connector, - )) + stage_futures.append( + launch_executor.submit( + self._launch_llm_stage, + replica_cfg, + replica_metadata, + stage_connector_spec, + stage_init_timeout, + llm_stage_launch_lock, + omni_kv_connector, + ) + ) llm_launch_futures[stage_id] = stage_futures @@ -597,9 +602,7 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: concurrent.futures.wait(all_futures) for stage_id in llm_stage_ids: - started_llm_stages[stage_id] = [ - f.result() for f in llm_launch_futures[stage_id] - ] + started_llm_stages[stage_id] = [f.result() for f in llm_launch_futures[stage_id]] # ---- Build flat client lists directly ---- # Attach each launched replica and build the flat index structures. diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index daabdbe4d8b..b79f88933ff 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -309,11 +309,17 @@ async def _orchestration_loop(self) -> None: req_state = self.request_states.get(output.request_id) if req_state is not None: stage_metrics = self._build_stage_metrics( - stage_id, output.request_id, [output], req_state, + stage_id, + output.request_id, + [output], + req_state, replica_index=replica_index, ) await self._route_output( - stage_id, output, req_state, stage_metrics, + stage_id, + output, + req_state, + stage_metrics, replica_index=replica_index, ) continue @@ -344,12 +350,16 @@ async def _orchestration_loop(self) -> None: # Handle prefill-finished KV-ready signals before finished outputs. 
await self._handle_kv_ready_raw_outputs( - stage_id, raw_outputs, replica_index=replica_index, + stage_id, + raw_outputs, + replica_index=replica_index, ) # 2) Process raw outputs through the output processor request_outputs = await self._process_stage_outputs( - stage_id, raw_outputs, replica_index=replica_index, + stage_id, + raw_outputs, + replica_index=replica_index, ) # 3) Route each processed output @@ -368,11 +378,17 @@ async def _orchestration_loop(self) -> None: stage_metrics = None if output.finished: stage_metrics = self._build_stage_metrics( - stage_id, output.request_id, [output], req_state, + stage_id, + output.request_id, + [output], + req_state, replica_index=replica_index, ) await self._route_output( - stage_id, output, req_state, stage_metrics, + stage_id, + output, + req_state, + stage_metrics, replica_index=replica_index, ) @@ -698,7 +714,10 @@ async def _forward_to_next_stage( req_state.stage_submit_ts[next_logical] = _time.time() async def _poll_stage_raw( - self, stage_id: int, *, replica_index: int = 0, + self, + stage_id: int, + *, + replica_index: int = 0, ) -> EngineCoreOutputs | None: """Pull raw EngineCoreOutputs from a stage replica without processing. @@ -712,7 +731,11 @@ async def _poll_stage_raw( return outputs async def _process_stage_outputs( - self, stage_id: int, raw_outputs: EngineCoreOutputs, *, replica_index: int = 0, + self, + stage_id: int, + raw_outputs: EngineCoreOutputs, + *, + replica_index: int = 0, ) -> list[RequestOutput]: """Run the output processor on raw outputs, returning RequestOutputs. 
@@ -900,8 +923,7 @@ async def _handle_add_companion(self, msg: dict[str, Any]) -> None: await stage_client.add_request_async(request) logger.info( - "[Orchestrator] CFG companion submitted: %s (role=%s, parent=%s, " - "stage-0 replica-%s)", + "[Orchestrator] CFG companion submitted: %s (role=%s, parent=%s, stage-0 replica-%s)", companion_id, role, parent_id, From e12250119bc7f90745354a5349e550f391fa123b Mon Sep 17 00:00:00 2001 From: NATURE Date: Mon, 13 Apr 2026 11:36:20 +0800 Subject: [PATCH 06/76] [Bugfix] Fix Bagel online mode for 1. Hang after several requests 2. Non-deterministic image quality regression. (#2458) Signed-off-by: natureofnature --- vllm_omni/core/sched/omni_ar_scheduler.py | 105 +++++----- .../model_executor/models/bagel/bagel.py | 195 ++++++------------ .../npu/worker/npu_ar_model_runner.py | 26 ++- vllm_omni/worker/gpu_ar_model_runner.py | 35 +++- 4 files changed, 164 insertions(+), 197 deletions(-) diff --git a/vllm_omni/core/sched/omni_ar_scheduler.py b/vllm_omni/core/sched/omni_ar_scheduler.py index eac737b6e66..0ee8cd16a3a 100644 --- a/vllm_omni/core/sched/omni_ar_scheduler.py +++ b/vllm_omni/core/sched/omni_ar_scheduler.py @@ -59,6 +59,11 @@ def __init__(self, *args, **kwargs): # Track ACTIVE transfers (submitted to runner but not yet acked via kv_extracted_req_ids) self.active_kv_transfers: set[str] = set() + # Requests marked for deferred stop: keep running until KV extraction + # completes so that kv_ready can be emitted while the request is still + # alive. Stopped on the first scheduler step after extraction ack. + self.pending_stop_after_extraction: set[str] = set() + # [Omni] Pre-parse KV transfer criteria self.kv_transfer_criteria = self._get_kv_transfer_criteria() @@ -126,11 +131,16 @@ def _process_kv_transfer_trigger(self, request: Request, new_token_ids: list[int stop_decode_on_trigger = self.kv_transfer_criteria.get("stop_after_transfer", True) if request.request_id in self.transfer_triggered_requests: - # Already triggered. 
When stop_decode_on_trigger is True AND - # transfer was actually queued, the request was already stopped - # at trigger time (see below). Any request that reaches this - # point either has stop_decode_on_trigger=False (continue - # decoding) or was not actually queued (should not be stopped). + # Deferred stop: once KV extraction is complete (no longer in + # active_kv_transfers), stop the request. This guarantees the + # kv_ready signal was emitted while the request was still alive. + if ( + request.request_id in self.pending_stop_after_extraction + and request.request_id not in self.active_kv_transfers + ): + self.pending_stop_after_extraction.discard(request.request_id) + request.status = RequestStatus.FINISHED_STOPPED + return True return False if criteria_type == "prefill_finished": @@ -140,14 +150,11 @@ def _process_kv_transfer_trigger(self, request: Request, new_token_ids: list[int actually_queued = request.request_id in self.requests_needing_kv_transfer if stop_decode_on_trigger and actually_queued: - # Stop immediately so the request is NOT scheduled in - # the next step, freeing scheduling budget for companion - # requests whose chunked-prefill boundaries must be - # deterministic. waiting_for_transfer_free keeps blocks - # alive until the model runner finishes KV extraction. - self.waiting_for_transfer_free.add(request.request_id) - request.status = RequestStatus.FINISHED_STOPPED - return True + # Defer the stop until KV extraction completes so that + # the kv_ready signal can be emitted while the request + # is still alive. The request will be stopped on the + # next scheduler step after extraction ack arrives. 
+ self.pending_stop_after_extraction.add(request.request_id) return False @@ -167,9 +174,7 @@ def _process_kv_transfer_trigger(self, request: Request, new_token_ids: list[int actually_queued = request.request_id in self.requests_needing_kv_transfer if stop_decode_on_trigger and actually_queued: - self.waiting_for_transfer_free.add(request.request_id) - request.status = RequestStatus.FINISHED_STOPPED - return True + self.pending_stop_after_extraction.add(request.request_id) return False @@ -268,6 +273,26 @@ def update_from_output( num_scheduled_tokens, ) + # Pre-process KV extraction acks so that the per-request loop below + # can see up-to-date active_kv_transfers state and emit kv_ready + # signals while requests are still alive (before any deferred stop). + kv_extracted_ids = getattr(model_runner_output, "kv_extracted_req_ids", None) + if kv_extracted_ids: + for req_id in kv_extracted_ids: + try: + self.active_kv_transfers.discard(req_id) + req = self.requests.get(req_id) + if req is not None and not req.is_finished(): + outputs[req.client_index].append( + EngineCoreOutput( + request_id=req_id, + new_token_ids=[], + kv_transfer_params={"kv_ready": True}, + ) + ) + except Exception: + init_logger(__name__).exception("Failed to pre-process KV extraction for %s", req_id) + # NOTE(woosuk): As len(num_scheduled_tokens) can be up to 1K or more, # the below loop can be a performance bottleneck. We should do our best # to avoid expensive operations inside the loop. 
@@ -436,6 +461,7 @@ def update_from_output( self.transfer_triggered_requests.remove(req.request_id) if req.request_id in self.active_kv_transfers: self.active_kv_transfers.remove(req.request_id) + self.pending_stop_after_extraction.discard(req.request_id) # Same for preempted for req in stopped_preempted_reqs: @@ -444,6 +470,8 @@ def update_from_output( self.transfer_triggered_requests.remove(req.request_id) if req.request_id in self.active_kv_transfers: self.active_kv_transfers.remove(req.request_id) + self.pending_stop_after_extraction.discard(req.request_id) + # KV Connector: update state for finished KV Transfers. if kv_connector_output: self._update_from_kv_xfer_finished(kv_connector_output) @@ -489,35 +517,12 @@ def update_from_output( engine_core_outputs[0] = eco = EngineCoreOutputs() eco.scheduler_stats = stats - # This is where we free blocks that were held for transfer - try: - kv_extracted_ids = getattr(model_runner_output, "kv_extracted_req_ids", None) - if kv_extracted_ids: - for req_id in kv_extracted_ids: - # Emit a kv_ready signal so the orchestrator can forward - # the request to the DiT stage immediately after KV - # extraction, without waiting for AR decode to finish. - req = self.requests.get(req_id) - if req is not None and not req.is_finished(): - eco = engine_core_outputs.get(req.client_index) - if eco is None: - eco = EngineCoreOutputs() - engine_core_outputs[req.client_index] = eco - eco.outputs.append( - EngineCoreOutput( - request_id=req_id, - new_token_ids=[], - kv_transfer_params={"kv_ready": True}, - ) - ) - - # Mark transfer as finished - if req_id in self.active_kv_transfers: - self.active_kv_transfers.remove(req_id) - logger.debug(f"[Omni] KV Transfer finished for {req_id}") - + # Free blocks that were held for transfer (kv_ready and + # active_kv_transfers updates already done before the per-request loop). 
+ if kv_extracted_ids: + for req_id in kv_extracted_ids: + try: if req_id in self.waiting_for_transfer_free: - # Now it's safe to free blocks req = self.requests.get(req_id) if req: self.kv_cache_manager.free(req) @@ -525,13 +530,12 @@ def update_from_output( del self.requests[req_id] if req_id in self.transfer_triggered_requests: self.transfer_triggered_requests.remove(req_id) - if req_id in self.active_kv_transfers: - self.active_kv_transfers.remove(req_id) - + self.active_kv_transfers.discard(req_id) + self.pending_stop_after_extraction.discard(req_id) logger.debug(f"Freed blocks for {req_id} after transfer extraction") self.waiting_for_transfer_free.remove(req_id) - except Exception: - init_logger(__name__).exception("Failed to process finished transfer requests") + except Exception: + init_logger(__name__).exception("Failed to free blocks for %s after transfer", req_id) return engine_core_outputs @@ -564,8 +568,7 @@ def _free_request(self, request: Request, delay_free_blocks: bool = False) -> di kv_xfer_params = None return kv_xfer_params elif request_id in self.waiting_for_transfer_free: - # Stopped immediately by stop_decode_on_trigger; blocks are - # held until KV extraction completes in a future step. + # Blocks held until KV extraction completes in a future step. 
return None else: logger.debug( diff --git a/vllm_omni/model_executor/models/bagel/bagel.py b/vllm_omni/model_executor/models/bagel/bagel.py index acbbc28b4cf..cbb775680cc 100644 --- a/vllm_omni/model_executor/models/bagel/bagel.py +++ b/vllm_omni/model_executor/models/bagel/bagel.py @@ -1,4 +1,3 @@ -from collections import deque from collections.abc import Iterable, Mapping, Sequence from math import isqrt from typing import Any @@ -442,14 +441,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._pending_img2img_info: list[tuple[int, int, int, int]] = [] self._ropes_pending: list[dict[str, Any]] = [] self._ropes_metadata: dict[str, dict[str, Any]] = {} - self._cfg_companion_queue: deque[tuple[tuple[int, int, int, int], int]] = deque() - - # Per-request position offset for decode after img2img prefill. - # Prefill rewrites positions (VAE→0, ViT→1, text→2..N) but the model - # runner assigns decode positions starting from prefill_len, not N+1. - # offset = rope - prefill_len (a negative number). 
- self._pending_decode_offsets: list[int] = [] - self._decode_position_offsets: dict[str, int] = {} + self._last_img2img_info: tuple[int, int, int, int] | None = None from transformers import AutoTokenizer @@ -461,7 +453,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._start_of_image_id = int(_tok.convert_tokens_to_ids("<|vision_start|>")) self._end_of_image_id = int(_tok.convert_tokens_to_ids("<|vision_end|>")) self._img2img_token_id = int(_tok.convert_tokens_to_ids("<|fim_middle|>")) - self._vae_token_mask: torch.Tensor | None = None self.device = get_local_device() self._install_mot_modules(config) @@ -540,9 +531,7 @@ def _clear_warmup_state(self): self._ropes_pending.clear() self._ropes_metadata.clear() self._pending_img2img_info.clear() - self._cfg_companion_queue.clear() - self._pending_decode_offsets.clear() - self._decode_position_offsets.clear() + self._last_img2img_info = None self._vae_token_mask = None def get_kv_transfer_metadata( @@ -554,12 +543,10 @@ def get_kv_transfer_metadata( meta = self._ropes_metadata.pop(req_id, None) if meta is None: return None - # In think-mode img2img the prefill rope doesn't account for decoded - # thinking tokens; correct it to num_computed_tokens + offset. - # Skip correction when num_computed_tokens is unavailable (None). - offset = self._decode_position_offsets.pop(req_id, 0) - if offset != 0 and "ropes" in meta and num_computed_tokens is not None: - meta["ropes"] = [num_computed_tokens + offset] + if num_computed_tokens is not None and "image_shape" in meta: + prefill_rope = meta["ropes"][0] if meta.get("ropes") else 0 + if num_computed_tokens > prefill_rope: + meta["ropes"] = [num_computed_tokens] return meta def prepare_runner_inputs( @@ -572,48 +559,29 @@ def prepare_runner_inputs( num_scheduled_tokens: list[int], input_ids_buffer: torch.Tensor | None = None, ) -> tuple[torch.Tensor | None, torch.Tensor | None]: - """Model-runner hook: adjust inputs before ``forward()``. 
- - Returns ``(input_ids, positions)`` — possibly modified. - - Two adjustments for BAGEL img2img: - - 1. **Restore input_ids** when ``inputs_embeds`` is present so that - ``_adjust_positions_for_img2img`` can locate the - ``<|fim_middle|>`` placeholder. - 2. **Decode position offset**: prefill rewrites positions to a - compact scheme (rope ≪ prefill_len). The runner assigns decode - positions from ``num_computed_tokens``, which is far too large; - apply the stored per-request offset. - """ + """Restore input_ids so _adjust_positions_for_img2img can locate + the <|fim_middle|> placeholder for thinking-mode pre_text_len + detection.""" if inputs_embeds is not None and input_ids is None and input_ids_buffer is not None: input_ids = input_ids_buffer - - if self._decode_position_offsets and positions is not None: - token_start = 0 - for i, rid in enumerate(req_ids): - sched = num_scheduled_tokens[i] - offset = self._decode_position_offsets.get(rid, 0) - if offset != 0 and num_computed_tokens[i] > 0: - positions[token_start : token_start + sched] += offset - token_start += sched - return input_ids, positions def flush_pending_metadata(self, req_ids: list[str]) -> None: - """Map pending metadata (batch order) to req_ids after forward().""" + """Map pending metadata (batch order) to req_ids after forward(). + + Guard: if a request already has metadata with ``image_shape`` + (written during img2img prefill), don't overwrite it with + decode-step metadata that lacks ``image_shape``. 
+ """ pending = self._ropes_pending self._ropes_pending = [] for i, meta in enumerate(pending): if i < len(req_ids): - if req_ids[i] not in self._ropes_metadata: - self._ropes_metadata[req_ids[i]] = meta - - pending_offsets = self._pending_decode_offsets - self._pending_decode_offsets = [] - for i, offset in enumerate(pending_offsets): - if i < len(req_ids) and offset != 0: - self._decode_position_offsets[req_ids[i]] = offset + rid = req_ids[i] + existing = self._ropes_metadata.get(rid) + if existing and "image_shape" in existing and "image_shape" not in meta: + continue + self._ropes_metadata[rid] = meta def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: mm_input_by_modality = {} @@ -727,16 +695,7 @@ def _process_img2img_input(self, multimodal_input): num_vit = vit_emb.shape[0] + 2 info = (num_vae, num_vit, int(H), int(W)) self._pending_img2img_info.append(info) - # Only the gen (main) request should add a companion queue entry. - # Companion requests (cfg_text, cfg_img) also call this method with - # the same image, so guard by checking whether this exact info - # tuple is already enqueued. For batched img2img with multiple - # concurrent gen requests this correctly adds one entry per unique - # image; images with identical (num_vae, num_vit, H, W) that arrive - # in the same batch are indistinguishable here and will share one - # entry, but that is an uncommon edge case. - if not any(entry[0] == info for entry in self._cfg_companion_queue): - self._cfg_companion_queue.append((info, 2)) # cfg_text + cfg_img + self._last_img2img_info = info return tuple(results) @@ -755,31 +714,18 @@ def forward( positions = self._adjust_positions_for_img2img(positions, input_ids) use_mot = True - elif self._cfg_companion_queue: - # Guard: if this looks like a pure decode step (small token count, - # no multimodal embeddings), the queue has stale entries from a - # previous prefill cycle — clear them instead of consuming. 
- if inputs_embeds is None and seq_len <= 2: - self._cfg_companion_queue.clear() - else: - cached, remaining = self._cfg_companion_queue[0] - remaining -= 1 - num_vae, num_vit, img_H, img_W = cached - num_img2img = num_vae + 1 + num_vit # +1 separator - seq_len = inputs_embeds.shape[0] if inputs_embeds is not None else positions.shape[0] - - if inputs_embeds is not None and seq_len >= num_img2img: - self._pending_img2img_info = [cached] - positions = self._adjust_positions_for_img2img(positions, input_ids) - use_mot = True - else: - rope = int(positions[seq_len - 1].item()) + 1 - self._ropes_pending.append({"ropes": [rope]}) + elif self._last_img2img_info is not None: + info = self._last_img2img_info + num_vae, num_vit, _, _ = info + num_img2img = num_vae + 1 + num_vit - if remaining == 0: - self._cfg_companion_queue.popleft() - else: - self._cfg_companion_queue[0] = (cached, remaining) + if seq_len >= num_img2img: + self._pending_img2img_info = [info] + positions = self._adjust_positions_for_img2img(positions, input_ids) + use_mot = True + else: + rope = int(positions[seq_len - 1].item()) + 1 + self._ropes_pending.append({"ropes": [rope]}) if use_mot: return self._mot_forward(input_ids, positions, intermediate_tensors, inputs_embeds, **kwargs) @@ -790,27 +736,18 @@ def _adjust_positions_for_img2img( positions: torch.Tensor, input_ids: torch.Tensor | None = None, ) -> torch.Tensor: - """Rewrite position IDs to match the original BAGEL position scheme: - - If there are ``pre_text_len`` text tokens before the img2img block:: - - pre_text → 0, 1, ..., M-1 - VAE → M (all share) - separator→ M - ViT → M+1 (all share) - post_text→ M+2, M+3, ... + """Rewrite position IDs for img2img. - When no text precedes the img2img block (M=0), this reduces to the - simpler scheme: VAE→0, ViT→1, text→2, 3, ... 
+ Supports an optional ``pre_text_len`` prefix (thinking-mode) detected + via the ``<|fim_middle|>`` token in *input_ids*: - Also computes ``self._vae_token_mask`` (bool tensor, True for actual - VAE latent patches that should use gen-mode weights) and pushes - per-request ropes + image_shape to the FIFO consumed by - ``get_kv_transfer_metadata``. + pre_text -> 0 .. M-1 + VAE -> M (all share) + separator-> M + ViT -> M+1 (all share) + post_text-> M+2, M+3, ... - For img2img requests, also stores a decode position offset so that - subsequent autoregressive decode steps use positions that continue - from the rewritten scheme rather than from the original prefill length. + When M=0 (standard img2img) this reduces to VAE->0, ViT->1, text->2.. """ info_list = self._pending_img2img_info self._pending_img2img_info = [] @@ -836,70 +773,64 @@ def _adjust_positions_for_img2img( req_len = end - start if img2img_idx < len(info_list): - num_vae, num_vit, img_H, img_W = info_list[img2img_idx] + cur_info = info_list[img2img_idx] + elif self._last_img2img_info is not None: + cur_info = self._last_img2img_info + else: + cur_info = None + + if cur_info is not None: + num_vae, num_vit, img_H, img_W = cur_info num_img2img = num_vae + 1 + num_vit # +1 separator if req_len >= num_img2img: - # Detect offset of img2img tokens within this request - # by searching for the img2img placeholder token ID. 
pre_text_len = 0 if input_ids is not None: - req_ids = input_ids[start:end] - mask = req_ids == self._img2img_token_id - indices = mask.nonzero(as_tuple=True)[0] + req_ids_slice = input_ids[start:end] + indices = (req_ids_slice == self._img2img_token_id).nonzero(as_tuple=True)[0] if indices.numel() > 0: pre_text_len = int(indices[0].item()) - img_start = start + pre_text_len + M = pre_text_len + img_start = start + M post_text_start = img_start + num_img2img - # pre_text_pos: position base for image tokens - pre_text_pos = pre_text_len - # Pre-image text: sequential positions 0..pre_text_pos-1 - if pre_text_len > 0: + if M > 0: new_positions[start:img_start] = torch.arange( - 0, pre_text_pos, device=positions.device, dtype=positions.dtype + 0, M, device=positions.device, dtype=positions.dtype ) - # VAE tokens: all share position pre_text_pos - new_positions[img_start : img_start + num_vae] = pre_text_pos - # Separator: position pre_text_pos - new_positions[img_start + num_vae] = pre_text_pos - # ViT tokens: all share position pre_text_pos+1 + new_positions[img_start : img_start + num_vae] = M + new_positions[img_start + num_vae] = M # separator vit_start = img_start + num_vae + 1 - new_positions[vit_start : vit_start + num_vit] = pre_text_pos + 1 + new_positions[vit_start : vit_start + num_vit] = M + 1 - # Post-image text: sequential positions pre_text_pos+2, pre_text_pos+3, ... 
num_post_text = end - post_text_start if num_post_text > 0: new_positions[post_text_start:end] = torch.arange( - pre_text_pos + 2, - pre_text_pos + 2 + num_post_text, + M + 2, + M + 2 + num_post_text, device=positions.device, dtype=positions.dtype, ) - # VAE gen-mode mask: only actual VAE latent patches (not markers) - vae_patches_start = img_start + 1 # skip start_marker - vae_patches_end = img_start + num_vae - 1 # before end_marker + vae_patches_start = img_start + 1 + vae_patches_end = img_start + num_vae - 1 if vae_patches_end > vae_patches_start: vae_mask[vae_patches_start:vae_patches_end] = True - rope = pre_text_pos + 2 + num_post_text + rope = M + 2 + num_post_text self._ropes_pending.append( { "ropes": [rope], "image_shape": [img_H, img_W], } ) - decode_offset = rope - req_len - self._pending_decode_offsets.append(decode_offset) img2img_idx += 1 continue rope = int(new_positions[end - 1].item()) + 1 self._ropes_pending.append({"ropes": [rope]}) - self._pending_decode_offsets.append(0) self._vae_token_mask = vae_mask if vae_mask.any() else None return new_positions diff --git a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py index 138948064ba..ffb997048bd 100644 --- a/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py +++ b/vllm_omni/platforms/npu/worker/npu_ar_model_runner.py @@ -149,7 +149,15 @@ def execute_model( encoder_cache=self.encoder_cache, ) as ec_connector_output: self._execute_mm_encoder(scheduler_output) - return make_empty_encoder_model_runner_output(scheduler_output) + + kv_ids = self.kv_extracted_req_ids + self.kv_extracted_req_ids = None + + output = make_empty_encoder_model_runner_output(scheduler_output) + if kv_ids: + output = copy(output) + output.kv_extracted_req_ids = kv_ids + return output if not num_scheduled_tokens: if ( @@ -163,10 +171,20 @@ def execute_model( # dummy run to ensure coordinate_batch_across_dp # is called into to avoid out of sync issues. 
self._dummy_run(1) + + kv_ids = self.kv_extracted_req_ids + self.kv_extracted_req_ids = None + if not has_kv_transfer_group(): - # Return empty ModelRunnerOutput if no work to do. - return EMPTY_MODEL_RUNNER_OUTPUT - return self.kv_connector_no_forward(scheduler_output, self.vllm_config) + output = EMPTY_MODEL_RUNNER_OUTPUT + else: + output = self.kv_connector_no_forward(scheduler_output, self.vllm_config) + + if kv_ids: + output = copy(output) + output.kv_extracted_req_ids = kv_ids + + return output if self.cache_config.kv_sharing_fast_prefill: assert not self.num_prompt_logprobs, ( "--kv-sharing-fast-prefill produces incorrect " diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 01ec23acb47..554ac6355de 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -205,24 +205,39 @@ def execute_model( encoder_cache=self.encoder_cache, ) as ec_connector_output: self._execute_mm_encoder(scheduler_output) - return make_empty_encoder_model_runner_output(scheduler_output) + + kv_ids = self.kv_extracted_req_ids + self.kv_extracted_req_ids = None + + output = make_empty_encoder_model_runner_output(scheduler_output) + if kv_ids: + output = copy(output) + output.kv_extracted_req_ids = kv_ids + return output if not num_scheduled_tokens: if ( self.parallel_config.distributed_executor_backend == "external_launcher" and self.parallel_config.data_parallel_size > 1 ): - # this is a corner case when both external launcher - # and DP are enabled, num_scheduled_tokens could be - # 0, and has_unfinished_requests in the outer loop - # returns True. before returning early here we call - # dummy run to ensure coordinate_batch_across_dp - # is called into to avoid out of sync issues. self._dummy_run(1) + + # Capture KV extraction results before early return; + # sample_tokens() is skipped on this path so the IDs + # would otherwise be silently overwritten next step. 
+ kv_ids = self.kv_extracted_req_ids + self.kv_extracted_req_ids = None + if not has_kv_transfer_group(): - # Return empty ModelRunnerOutput if no work to do. - return EMPTY_MODEL_RUNNER_OUTPUT - return self.kv_connector_no_forward(scheduler_output, self.vllm_config) + output = EMPTY_MODEL_RUNNER_OUTPUT + else: + output = self.kv_connector_no_forward(scheduler_output, self.vllm_config) + + if kv_ids: + output = copy(output) + output.kv_extracted_req_ids = kv_ids + + return output if self.cache_config.kv_sharing_fast_prefill: assert not self.num_prompt_logprobs, ( From cb4d13a65806d18337628da0768539ba97c6cd4d Mon Sep 17 00:00:00 2001 From: Sy03 <1370724210@qq.com> Date: Mon, 13 Apr 2026 12:53:35 +0800 Subject: [PATCH 07/76] [Perf][Fish Speech] Enable CUDA Graph capture for Fast AR code predictor (#2520) Signed-off-by: Sy03 <1370724210@qq.com> --- .../models/fish_speech/fish_speech_fast_ar.py | 22 +++++-- .../models/fish_speech/fish_speech_slow_ar.py | 39 ++++++------ vllm_omni/worker/gpu_ar_model_runner.py | 62 +++++++++++++++++++ vllm_omni/worker/gpu_model_runner.py | 6 +- 4 files changed, 99 insertions(+), 30 deletions(-) diff --git a/vllm_omni/model_executor/models/fish_speech/fish_speech_fast_ar.py b/vllm_omni/model_executor/models/fish_speech/fish_speech_fast_ar.py index 8bbb643ebec..22a2744ff5d 100644 --- a/vllm_omni/model_executor/models/fish_speech/fish_speech_fast_ar.py +++ b/vllm_omni/model_executor/models/fish_speech/fish_speech_fast_ar.py @@ -310,6 +310,7 @@ def __init__( self._compiled_model_fwd: object | None = None self._compile_attempted = False self._compile_failed = False + self._disable_compile_for_graph = False def _ensure_buffers(self, bsz: int, device: torch.device, dtype: torch.dtype) -> None: max_seq = self._num_codebooks + 1 # hidden_state + num_codebooks codes @@ -327,11 +328,20 @@ def _setup_compile(self) -> None: if self._compile_attempted: return self._compile_attempted = True + if self._disable_compile_for_graph: + try: + 
self._compiled_model_fwd = torch.compile( + self.model.forward, + dynamic=True, + options={"epilogue_fusion": False}, + ) + except Exception as exc: + logger.warning("Fast AR torch.compile (graph mode) failed: %s", exc) + self._compiled_model_fwd = self.model.forward + return try: self._compiled_model_fwd = torch.compile( self.model.forward, - # Keep the helper compiler separate from vLLM's outer - # cudagraph-managed Stage-0 execution. mode="default", dynamic=True, fullgraph=False, @@ -366,10 +376,10 @@ def warmup_compile( @torch.inference_mode() def _run_model(self, step_input: torch.Tensor, step_pos_ids: torch.Tensor, bsz: int) -> torch.Tensor: - # Default-on compile only pays off for single-request decode. For - # batched decode, eager preserves loaded throughput and avoids the - # regression seen with batch>1 compiled execution. - model_fwd = self._compiled_model_fwd if bsz == 1 else self.model.forward + if self._disable_compile_for_graph: + model_fwd = self._compiled_model_fwd or self.model.forward + else: + model_fwd = self._compiled_model_fwd if bsz == 1 else self.model.forward try: return model_fwd(step_input, step_pos_ids) except Exception as exc: diff --git a/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py b/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py index 3813597caad..62776cbb31f 100644 --- a/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py +++ b/vllm_omni/model_executor/models/fish_speech/fish_speech_slow_ar.py @@ -194,6 +194,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.has_postprocess = True self.mtp_hidden_size = int(self.text_config.hidden_size) self.talker_mtp_output_key = "audio_codes" + self.talker_mtp_graph_safe = True self.gpu_resident_buffer_keys: set[str] = {"last_slow_ar_hidden"} # Qwen3 transformer backbone. 
@@ -236,6 +237,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): slow_ar_config=self.text_config, prefix="fast_ar", ) + if self.talker_mtp_graph_safe: + self.fast_ar._disable_compile_for_graph = True # Constant logit mask: allow only semantic tokens + im_end. vocab = int(self.text_config.vocab_size) @@ -680,18 +683,13 @@ def talker_mtp( inputs_embeds_out = input_embeds.reshape(bsz, -1).clone() semantic_mask = (input_ids[:, 0] >= self._semantic_begin_id) & (input_ids[:, 0] <= self._semantic_end_id) - if semantic_mask.any(): - semantic_codes = audio_codes[semantic_mask].clamp(min=0) - offsets = ( - torch.arange(self._num_codebooks, device=dev, dtype=semantic_codes.dtype) * self._codebook_size - ).unsqueeze(0) - codebook_sum = self.codebook_embeddings(semantic_codes + offsets).sum(dim=1).to(dtype=torch.bfloat16) - - # Normalize by sqrt(num_codebooks + 1) as in the reference model - # (scale_codebook_embeddings=True for fish_qwen3_omni). - inputs_embeds_out[semantic_mask] = (inputs_embeds_out[semantic_mask] + codebook_sum) / math.sqrt( - self._num_codebooks + 1 - ) + semantic_codes = audio_codes.clamp(min=0, max=self._codebook_size - 1) + offsets = ( + torch.arange(self._num_codebooks, device=dev, dtype=semantic_codes.dtype) * self._codebook_size + ).unsqueeze(0) + codebook_sum = self.codebook_embeddings(semantic_codes + offsets).sum(dim=1).to(dtype=torch.bfloat16) + norm_embeds = (inputs_embeds_out + codebook_sum) / math.sqrt(self._num_codebooks + 1) + inputs_embeds_out = torch.where(semantic_mask.unsqueeze(-1), norm_embeds, inputs_embeds_out) return inputs_embeds_out, audio_codes.to(dtype=torch.long) @@ -802,14 +800,15 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if truncated: logger.info("Truncated %d RoPE cos_sin_cache buffers to bf16 precision", truncated) - try: - self.fast_ar.warmup_compile( - device=self.codebook_embeddings.weight.device, - dtype=torch.bfloat16, - batch_sizes=(1,), - ) - except 
Exception as exc: - logger.warning("Fish Speech Fast AR compile warmup failed: %s", exc) + if not getattr(self, "talker_mtp_graph_safe", False): + try: + self.fast_ar.warmup_compile( + device=self.codebook_embeddings.weight.device, + dtype=torch.bfloat16, + batch_sizes=(1,), + ) + except Exception as exc: + logger.warning("Fish Speech Fast AR compile warmup failed: %s", exc) codec_device = self.codebook_embeddings.weight.device _load_dac_codec( diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 554ac6355de..72e745fb172 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -138,6 +138,68 @@ def _sampling_metadata_for_model_sampler(self, sampling_metadata): return sampling_metadata return replace(sampling_metadata, output_token_ids=output_token_ids) + def capture_model(self) -> int: + result = super().capture_model() + self._capture_talker_mtp_graphs() + return result + + def _capture_talker_mtp_graphs(self) -> None: + from vllm_omni.worker.gpu_model_runner import CUDAGraphWrapper + + if not self.has_talker_mtp or not isinstance(self.talker_mtp, CUDAGraphWrapper): + return + + from vllm.compilation.monitor import set_cudagraph_capturing_enabled + from vllm.distributed.parallel_state import graph_capture + + capture_sizes = self.compilation_config.cudagraph_capture_sizes + num_warmups = self.compilation_config.cudagraph_num_of_warmups + capture_sizes = sorted(capture_sizes, reverse=True) + logger.info("Capturing talker_mtp graphs for sizes %s", capture_sizes) + + set_cudagraph_capturing_enabled(True) + try: + with torch.inference_mode(), graph_capture(device=self.device): + for bsz in capture_sizes: + _, batch_desc, _, _, _ = self._determine_batch_execution_and_padding( + num_tokens=bsz, + num_reqs=bsz, + num_scheduled_tokens_np=np.ones(bsz, dtype=np.int32), + max_num_scheduled_tokens=1, + use_cascade_attn=False, + ) + n = batch_desc.num_tokens + ids = 
self.talker_mtp_input_ids.gpu[:n] + emb = self.talker_mtp_inputs_embeds.gpu[:n] + hid = self.last_talker_hidden.gpu[:n] + ts = self.text_step.gpu[:n] + + for _ in range(num_warmups): + with set_forward_context( + None, + self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.NONE, + batch_descriptor=batch_desc, + ): + self.talker_mtp(ids, emb, hid, ts) + + with set_forward_context( + None, + self.vllm_config, + cudagraph_runtime_mode=CUDAGraphMode.FULL, + batch_descriptor=batch_desc, + ): + self.talker_mtp(ids, emb, hid, ts) + torch.cuda.synchronize() + + logger.info("Captured talker_mtp graphs for %d sizes", len(capture_sizes)) + except RuntimeError as e: + raise RuntimeError( + f"talker_mtp graph capture failed for a model that declared talker_mtp_graph_safe=True: {e}" + ) from e + finally: + set_cudagraph_capturing_enabled(False) + @torch.inference_mode() def execute_model( self, diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 35e15984355..1f678b579fa 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -83,11 +83,9 @@ def load_model(self, *args, **kwargs) -> None: self.has_talker_mtp = True cudagraph_mode = self.compilation_config.cudagraph_mode assert cudagraph_mode is not None - # Only wrap talker_mtp in CUDAGraphWrapper for Omni models that - # have a separate .talker sub-module. TTS models' code predictor - # has internal AR loops / torch.multinomial — not graph-safe. has_separate_talker = getattr(self.model, "talker", None) is not None - if cudagraph_mode.has_full_cudagraphs() and has_separate_talker: + talker_mtp_graph_safe = getattr(self.model, "talker_mtp_graph_safe", False) + if cudagraph_mode.has_full_cudagraphs() and (has_separate_talker or talker_mtp_graph_safe): self.talker_mtp = CUDAGraphWrapper(talker_mtp, self.vllm_config, runtime_mode=CUDAGraphMode.FULL) # TTS exposes mtp_hidden_size; Omni uses hf_text_config.hidden_size. 
hidden_size = int( From 8097747a5dc0d90f267050ae4b77d53bbaea88ae Mon Sep 17 00:00:00 2001 From: Jiaqian Liu <61532106+Celeste-jq@users.noreply.github.com> Date: Mon, 13 Apr 2026 14:20:04 +0800 Subject: [PATCH 08/76] [Model] Adapt Wan2.2-I2V-A14B via LightX2V offline conversion path (#2134) Signed-off-by: Celeste-jq <591998922@qq.com> Co-authored-by: Canlin Guo --- docs/user_guide/diffusion/lora.md | 86 ++++ .../offline_inference/image_to_video.md | 6 +- .../image_to_video/README.md | 6 +- .../image_to_video/image_to_video.py | 13 + .../online_serving/image_to_video/README.md | 49 +++ .../image_to_video/run_curl_image_to_video.sh | 5 + .../openai_api/test_video_server.py | 22 + tools/wan22/assemble_wan22_i2v_diffusers.py | 385 ++++++++++++++++++ .../models/wan2_2/pipeline_wan2_2.py | 58 ++- .../models/wan2_2/pipeline_wan2_2_i2v.py | 21 +- .../models/wan2_2/pipeline_wan2_2_ti2v.py | 21 +- .../models/wan2_2/scheduling_wan_euler.py | 147 +++++++ .../models/wan2_2/wan2_2_transformer.py | 8 + vllm_omni/engine/async_omni_engine.py | 2 + 14 files changed, 804 insertions(+), 25 deletions(-) create mode 100644 tools/wan22/assemble_wan22_i2v_diffusers.py create mode 100644 vllm_omni/diffusion/models/wan2_2/scheduling_wan_euler.py diff --git a/docs/user_guide/diffusion/lora.md b/docs/user_guide/diffusion/lora.md index e45c033b848..256698752a1 100644 --- a/docs/user_guide/diffusion/lora.md +++ b/docs/user_guide/diffusion/lora.md @@ -56,6 +56,92 @@ outputs = omni.generate( !!! note "Server-side Path Requirement" The LoRA adapter path (`local_path`) must be readable on the **server** machine. If your client and server are on different machines, ensure the LoRA adapter is accessible via a shared mount or copied to the server. +## Wan2.2 LightX2V Offline Assembly + +This workflow is LoRA-adjacent: it uses external LightX2V conversion plus +`Wan2.2-Distill-Loras` to bake converted Wan2.2 I2V checkpoints into a local +Diffusers directory, instead of loading LoRA adapters at runtime. 
+ +### Required assets + +- Base model: `Wan-AI/Wan2.2-I2V-A14B` +- Diffusers skeleton: `Wan-AI/Wan2.2-I2V-A14B-Diffusers` +- Optional external converter from the LightX2V project (not shipped in this repository) +- Optional LoRA weights: `lightx2v/Wan2.2-Distill-Loras` + +### Step 1: Optional - convert high/low-noise DiT weights with LightX2V + +Install or clone LightX2V from the upstream repository +(`https://github.com/ModelTC/LightX2V`). After cloning, the converter used +below is available at `tools/convert/converter.py` inside the cloned repository. + +```bash +python /path/to/lightx2v/tools/convert/converter.py \ + --source /path/to/Wan2.2-I2V-A14B/high_noise_model \ + --output /tmp/wan22_lightx2v/high_noise_out \ + --output_ext .safetensors \ + --output_name diffusion_pytorch_model \ + --model_type wan_dit \ + --direction forward \ + --lora_path /path/to/wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step_1022.safetensors \ + --lora_key_convert auto \ + --single_file + +python /path/to/lightx2v/tools/convert/converter.py \ + --source /path/to/Wan2.2-I2V-A14B/low_noise_model \ + --output /tmp/wan22_lightx2v/low_noise_out \ + --output_ext .safetensors \ + --output_name diffusion_pytorch_model \ + --model_type wan_dit \ + --direction forward \ + --lora_path /path/to/wan2.2_i2v_A14b_low_noise_lora_rank64_lightx2v_4step_1022.safetensors \ + --lora_key_convert auto \ + --single_file +``` + +If you are not using LightX2V, skip this step and either keep the original +Diffusers weights from the skeleton or point Step 2 at any other converted +`transformer/` and `transformer_2/` checkpoints. 
+ +### Step 2: Assemble a final Diffusers-style directory + +```bash +python tools/wan22/assemble_wan22_i2v_diffusers.py \ + --diffusers-skeleton /path/to/Wan2.2-I2V-A14B-Diffusers \ + --transformer-weight /tmp/wan22_lightx2v/high_noise_out \ + --transformer-2-weight /tmp/wan22_lightx2v/low_noise_out \ + --output-dir /path/to/Wan2.2-I2V-A14B-Custom-Diffusers \ + --asset-mode symlink \ + --overwrite +``` + +`--transformer-weight` and `--transformer-2-weight` are optional. If you omit +them, the tool keeps the original weights from the Diffusers skeleton. + +### Step 3: Run offline inference + +```bash +python examples/offline_inference/image_to_video/image_to_video.py \ + --model /path/to/Wan2.2-I2V-A14B-Custom-Diffusers \ + --image /path/to/input.jpg \ + --prompt "A cat playing with yarn" \ + --num-frames 81 \ + --num-inference-steps 4 \ + --tensor-parallel-size 4 \ + --height 480 \ + --width 832 \ + --flow-shift 12 \ + --sample-solver euler \ + --guidance-scale 1.0 \ + --guidance-scale-high 1.0 \ + --boundary-ratio 0.875 +``` + +Notes: + +- This route avoids runtime LoRA loading changes in vLLM-Omni when you choose to bake converted weights into a local Diffusers directory. +- Output quality and speed depend on the replacement checkpoints and sampling params you choose. + ## See Also diff --git a/docs/user_guide/examples/offline_inference/image_to_video.md b/docs/user_guide/examples/offline_inference/image_to_video.md index 7a750aeff3b..6e105741a7e 100644 --- a/docs/user_guide/examples/offline_inference/image_to_video.md +++ b/docs/user_guide/examples/offline_inference/image_to_video.md @@ -62,12 +62,13 @@ Key arguments: - `--negative-prompt`: Optional list of artifacts to suppress. - `--boundary-ratio`: Boundary split ratio for two-stage MoE models. - `--flow-shift`: Scheduler flow shift (5.0 for 720p, 12.0 for 480p). +- `--sample-solver`: Wan2.2 sampling solver. Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints. 
- `--num-inference-steps`: Number of denoising steps (default 50). - `--fps`: Frames per second for the saved MP4 (requires `diffusers` export_to_video). - `--output`: Path to save the generated video. - `--vae-use-slicing`: Enable VAE slicing for memory optimization. - `--vae-use-tiling`: Enable VAE tiling for memory optimization. -- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism_acceleration.md#cfg-parallel). +- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism/cfg_parallel.md). - `--tensor-parallel-size`: tensor parallel size (effective for models that support TP, e.g. LTX2). - `--enable-cpu-offload`: enable CPU offloading for diffusion models. - `--use-hsdp`: Enable Hybrid Sharded Data Parallel to shard model weights across GPUs. @@ -78,6 +79,9 @@ Key arguments: > ℹ️ If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage. +For Wan2.2 LightX2V-converted local Diffusers directories and related LoRA +assets, see the [LoRA guide](../../diffusion/lora.md#wan22-lightx2v-offline-assembly). + ## Example materials ??? abstract "image_to_video.py" diff --git a/examples/offline_inference/image_to_video/README.md b/examples/offline_inference/image_to_video/README.md index 2692c76df26..a458850a02b 100644 --- a/examples/offline_inference/image_to_video/README.md +++ b/examples/offline_inference/image_to_video/README.md @@ -59,12 +59,13 @@ Key arguments: - `--negative-prompt`: Optional list of artifacts to suppress. - `--boundary-ratio`: Boundary split ratio for two-stage MoE models. - `--flow-shift`: Scheduler flow shift (5.0 for 720p, 12.0 for 480p). +- `--sample-solver`: Wan2.2 sampling solver. 
Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints. - `--num-inference-steps`: Number of denoising steps (default 50). - `--fps`: Frames per second for the saved MP4 (requires `diffusers` export_to_video). - `--output`: Path to save the generated video. - `--vae-use-slicing`: Enable VAE slicing for memory optimization. - `--vae-use-tiling`: Enable VAE tiling for memory optimization. -- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](../../../docs/user_guide/diffusion/parallelism_acceleration.md#cfg-parallel). +- `--cfg-parallel-size`: set it to 2 to enable CFG Parallel. See more examples in [`user_guide`](https://github.com/vllm-project/vllm-omni/tree/main/docs/user_guide/diffusion/parallelism/cfg_parallel.md). - `--tensor-parallel-size`: tensor parallel size (effective for models that support TP, e.g. LTX2). - `--enable-cpu-offload`: enable CPU offloading for diffusion models. - `--use-hsdp`: Enable Hybrid Sharded Data Parallel to shard model weights across GPUs. @@ -74,3 +75,6 @@ Key arguments: > ℹ️ If you encounter OOM errors, try using `--vae-use-slicing` and `--vae-use-tiling` to reduce memory usage. + +For Wan2.2 LightX2V-converted local Diffusers directories and related LoRA +assets, see the [LoRA guide](../../../docs/user_guide/diffusion/lora.md#wan22-lightx2v-offline-assembly). diff --git a/examples/offline_inference/image_to_video/image_to_video.py b/examples/offline_inference/image_to_video/image_to_video.py index 7e7cfbf84e8..53319c82211 100644 --- a/examples/offline_inference/image_to_video/image_to_video.py +++ b/examples/offline_inference/image_to_video/image_to_video.py @@ -84,6 +84,13 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--flow-shift", type=float, default=5.0, help="Scheduler flow_shift (5.0 for 720p, 12.0 for 480p)." 
) + parser.add_argument( + "--sample-solver", + type=str, + default="unipc", + choices=["unipc", "euler"], + help="Sampling solver for Wan2.2 pipelines. Use 'euler' for Lightning/Distill setups.", + ) parser.add_argument("--output", type=str, default="i2v_output.mp4", help="Path to save the video (mp4).") parser.add_argument("--fps", type=int, default=None, help="Frames per second for the output video.") parser.add_argument( @@ -305,6 +312,7 @@ def main(): print(f" Model: {args.model}") print(f" Inference steps: {args.num_inference_steps}") print(f" Frames: {args.num_frames}") + print(f" Solver: {args.sample_solver}") print( f" Parallel configuration: cfg_parallel_size={args.cfg_parallel_size}," f" tensor_parallel_size={args.tensor_parallel_size}, vae_patch_parallel_size={args.vae_patch_parallel_size}" @@ -326,9 +334,14 @@ def main(): generator=generator, guidance_scale=guidance_scale, guidance_scale_2=args.guidance_scale_high, + boundary_ratio=args.boundary_ratio, num_inference_steps=num_inference_steps, num_frames=num_frames, frame_rate=frame_rate, + extra_args={ + "sample_solver": args.sample_solver, + "flow_shift": args.flow_shift, + }, ), ) generation_end = time.perf_counter() diff --git a/examples/online_serving/image_to_video/README.md b/examples/online_serving/image_to_video/README.md index 49283bd9a06..285eeb27983 100644 --- a/examples/online_serving/image_to_video/README.md +++ b/examples/online_serving/image_to_video/README.md @@ -26,6 +26,23 @@ The script allows overriding: - `CACHE_BACKEND` (default: `none`) - `ENABLE_CACHE_DIT_SUMMARY` (default: `0`) +### Ascend / Local LightX2V Example + +For a local Wan2.2-LightX2V Diffusers directory on Ascend/NPU, you can start the server like this: + +```bash +vllm serve /path/to/Wan2.2-I2V-A14B-LightX2V-Diffusers-Lightning \ + --omni \ + --port 8091 \ + --flow-shift 12 \ + --cfg-parallel-size 1 \ + --ulysses-degree 4 \ + --use-hsdp \ + --trust-remote-code \ + --allowed-local-media-path / \ + --seed 42 +``` + ## 
Async Job Behavior `POST /v1/videos` is asynchronous. It creates a video job and immediately @@ -69,10 +86,35 @@ curl -X POST http://localhost:8091/v1/videos/sync \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ + -F 'extra_params={"sample_solver":"euler"}' \ -F "seed=42" \ -o sync_i2v_output.mp4 ``` +For Wan Lightning/Distill checkpoints, pass `{"sample_solver":"euler"}` via `extra_params`. The default solver is `unipc`. + +Example matching the local LightX2V deployment above: + +```bash +curl -sS -X POST http://localhost:8091/v1/videos/sync \ + -H "Accept: video/mp4" \ + -F "prompt=A cat playing with yarn" \ + -F "input_reference=@/path/to/input.jpg" \ + -F "width=832" \ + -F "height=480" \ + -F "num_frames=81" \ + -F "fps=16" \ + -F "num_inference_steps=4" \ + -F "guidance_scale=1.0" \ + -F "guidance_scale_2=1.0" \ + -F "boundary_ratio=0.875" \ + -F "seed=42" \ + -F 'extra_params={"sample_solver":"euler"}' \ + -o ./output.mp4 +``` + +Use `/v1/videos/sync` if you want to write the MP4 directly to a file. `POST /v1/videos` is async and returns job metadata, not inline `b64_json`. + ## Storage Generated video files are stored on local disk by the async video API. 
@@ -96,6 +138,9 @@ export VLLM_OMNI_STORAGE_MAX_CONCURRENCY=8 # Basic image-to-video generation bash run_curl_image_to_video.sh +# Wan Lightning/Distill checkpoints +SAMPLE_SOLVER=euler bash run_curl_image_to_video.sh + # Or execute directly (OpenAI-style multipart) create_response=$(curl -s http://localhost:8091/v1/videos \ -H "Accept: application/json" \ @@ -111,6 +156,7 @@ create_response=$(curl -s http://localhost:8091/v1/videos \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ + -F 'extra_params={"sample_solver":"euler"}' \ -F "seed=42") video_id=$(echo "$create_response" | jq -r '.id') @@ -169,9 +215,12 @@ curl -X POST http://localhost:8091/v1/videos \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ + -F 'extra_params={"sample_solver":"euler"}' \ -F "seed=42" ``` +`sample_solver` is supported by Wan2.2 online serving through the existing `extra_params` field, which is merged into the pipeline `extra_args`. Use `unipc` for the default multistep solver, or `euler` for Lightning/Distill checkpoints. + ## Create Response Format `POST /v1/videos` returns a job record, not inline base64 video data. diff --git a/examples/online_serving/image_to_video/run_curl_image_to_video.sh b/examples/online_serving/image_to_video/run_curl_image_to_video.sh index f4c1496a69a..6f6a6f96d59 100644 --- a/examples/online_serving/image_to_video/run_curl_image_to_video.sh +++ b/examples/online_serving/image_to_video/run_curl_image_to_video.sh @@ -7,6 +7,7 @@ INPUT_IMAGE="${INPUT_IMAGE:-../../offline_inference/image_to_video/qwen-bear.png BASE_URL="${BASE_URL:-http://localhost:8099}" OUTPUT_PATH="${OUTPUT_PATH:-wan22_i2v_output.mp4}" NEGATIVE_PROMPT="${NEGATIVE_PROMPT:-}" +SAMPLE_SOLVER="${SAMPLE_SOLVER:-}" POLL_INTERVAL="${POLL_INTERVAL:-2}" if [ ! 
-f "$INPUT_IMAGE" ]; then @@ -34,6 +35,10 @@ if [ -n "${NEGATIVE_PROMPT}" ]; then create_cmd+=(-F "negative_prompt=${NEGATIVE_PROMPT}") fi +if [ -n "${SAMPLE_SOLVER}" ]; then + create_cmd+=(-F "extra_params={\"sample_solver\":\"${SAMPLE_SOLVER}\"}") +fi + create_response="$("${create_cmd[@]}")" video_id="$(echo "${create_response}" | jq -r '.id')" if [ -z "${video_id}" ] || [ "${video_id}" = "null" ]; then diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py index 0fdee7a77a8..fd7d4df60da 100644 --- a/tests/entrypoints/openai_api/test_video_server.py +++ b/tests/entrypoints/openai_api/test_video_server.py @@ -766,6 +766,28 @@ def test_extra_params_merged_with_existing_extra_args(test_client, mocker: Mocke assert captured.extra_args["zero_steps"] == 2 +def test_sample_solver_forwarded_via_extra_params(test_client, mocker: MockerFixture): + """sample_solver can be passed through existing extra_params for Wan2.2 online serving.""" + mocker.patch( + "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", + return_value="Zg==", + ) + response = test_client.post( + "/v1/videos", + data={ + "prompt": "A fox running through snow.", + "extra_params": json.dumps({"sample_solver": "euler"}), + }, + ) + + assert response.status_code == 200 + video_id = response.json()["id"] + _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value) + engine = test_client.app.state.openai_serving_video._engine_client + captured = engine.captured_sampling_params_list[0] + assert captured.extra_args["sample_solver"] == "euler" + + # --------------------------------------------------------------------------- # Sync endpoint tests (POST /v1/videos/sync) # --------------------------------------------------------------------------- diff --git a/tools/wan22/assemble_wan22_i2v_diffusers.py b/tools/wan22/assemble_wan22_i2v_diffusers.py new file mode 100644 index 00000000000..8e14ca3c26d --- /dev/null +++ 
b/tools/wan22/assemble_wan22_i2v_diffusers.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python3 +""" +Assemble a Wan2.2-I2V-A14B-Diffusers-style model directory using a Diffusers +skeleton and optional replacement transformer checkpoints. + +This tool does NOT run any external conversion step. You can use it in these +ways: +- keep the original weights from the Diffusers skeleton +- replace transformer/transformer_2 with converted checkpoints such as + LightX2V outputs +- use legacy LightX2V arg names (--high-noise-weight/--low-noise-weight), + which are accepted as aliases + +Typical use: + python tools/wan22/assemble_wan22_i2v_diffusers.py \ + --diffusers-skeleton /path/to/Wan2.2-I2V-A14B-Diffusers \ + --transformer-weight /path/to/high_noise_out/diffusion_pytorch_model.safetensors \ + --transformer-2-weight /path/to/low_noise_out/diffusion_pytorch_model.safetensors \ + --output-dir /path/to/Wan2.2-I2V-A14B-Custom-Diffusers +""" + +from __future__ import annotations + +import argparse +import json +import shutil +import sys +from dataclasses import dataclass +from pathlib import Path + +WEIGHT_CANDIDATES = ( + "diffusion_pytorch_model.safetensors", + "diffusion_pytorch_model.bin", + "diffusion_pytorch_model.pt", + "model.safetensors", + "pytorch_model.bin", + "model.pt", +) +WEIGHT_INDEX_CANDIDATES = ( + "diffusion_pytorch_model.safetensors.index.json", + "model.safetensors.index.json", + "pytorch_model.bin.index.json", +) + +ROOT_REQUIRED_FILES = ("model_index.json",) +ROOT_REQUIRED_DIRS = ("tokenizer", "text_encoder", "vae", "transformer", "transformer_2") +OPTIONAL_DIRS = ("image_encoder", "image_processor", "scheduler", "feature_extractor") + + +class AssembleError(RuntimeError): + pass + + +@dataclass(frozen=True) +class WeightSpec: + kind: str # "single" | "sharded" + single_file: Path | None = None + index_file: Path | None = None + shard_files: tuple[Path, ...] 
= () + + +def _load_shard_files_from_index(index_file: Path, role: str) -> tuple[Path, ...]: + try: + with index_file.open(encoding="utf-8") as f: + payload = json.load(f) + except Exception as exc: + raise AssembleError(f"Failed to parse {role} index file: {index_file}. error={exc}") from exc + + weight_map = payload.get("weight_map") + if not isinstance(weight_map, dict) or not weight_map: + raise AssembleError(f"Invalid {role} index file (missing/empty weight_map): {index_file}") + + shard_names = sorted({str(v) for v in weight_map.values()}) + shard_paths: list[Path] = [] + missing: list[str] = [] + for shard_name in shard_names: + shard_path = index_file.parent / shard_name + if not shard_path.is_file(): + missing.append(str(shard_path)) + else: + shard_paths.append(shard_path) + + if missing: + raise AssembleError(f"{role} index references missing shard file(s): " + ", ".join(missing)) + + if not shard_paths: + raise AssembleError(f"No shard files referenced by {role} index: {index_file}") + + return tuple(shard_paths) + + +def _resolve_weight_spec(path: Path, role: str) -> WeightSpec: + if path.is_file(): + return WeightSpec(kind="single", single_file=path) + + if path.is_dir(): + for name in WEIGHT_CANDIDATES: + candidate = path / name + if candidate.is_file(): + return WeightSpec(kind="single", single_file=candidate) + + for index_name in WEIGHT_INDEX_CANDIDATES: + index_file = path / index_name + if not index_file.is_file(): + continue + shard_files = _load_shard_files_from_index(index_file, role=role) + return WeightSpec( + kind="sharded", + index_file=index_file, + shard_files=shard_files, + ) + + shard_candidates = sorted(path.glob("diffusion_pytorch_model-*.safetensors")) + if shard_candidates: + raise AssembleError( + f"Detected sharded {role} files under {path}, but index json is missing. " + f"Expected one of: {', '.join(WEIGHT_INDEX_CANDIDATES)}" + ) + + raise AssembleError( + f"Cannot find {role} weight under directory: {path}. 
" + f"Expected one of single files [{', '.join(WEIGHT_CANDIDATES)}] " + f"or sharded index files [{', '.join(WEIGHT_INDEX_CANDIDATES)}]." + ) + + raise AssembleError(f"{role} path does not exist: {path}") + + +def _canonical_weight_name(weight_file: Path) -> str: + suffix = weight_file.suffix.lower() + if suffix == ".safetensors": + return "diffusion_pytorch_model.safetensors" + if suffix == ".bin": + return "diffusion_pytorch_model.bin" + if suffix == ".pt": + return "diffusion_pytorch_model.pt" + return weight_file.name + + +def _validate_skeleton(skeleton: Path) -> None: + if not skeleton.is_dir(): + raise AssembleError(f"--diffusers-skeleton is not a directory: {skeleton}") + + for file_name in ROOT_REQUIRED_FILES: + if not (skeleton / file_name).is_file(): + raise AssembleError(f"Missing required file in skeleton: {skeleton / file_name}") + + for dir_name in ROOT_REQUIRED_DIRS: + if not (skeleton / dir_name).is_dir(): + raise AssembleError(f"Missing required directory in skeleton: {skeleton / dir_name}") + + if not (skeleton / "transformer" / "config.json").is_file(): + raise AssembleError(f"Missing transformer config: {skeleton / 'transformer/config.json'}") + + if not (skeleton / "transformer_2" / "config.json").is_file(): + raise AssembleError(f"Missing transformer_2 config: {skeleton / 'transformer_2/config.json'}") + + +def _ensure_clean_output(output_dir: Path, overwrite: bool) -> None: + if output_dir.exists(): + if not overwrite: + raise AssembleError( + f"Output directory already exists: {output_dir}. Use --overwrite to remove and recreate it." 
+ ) + shutil.rmtree(output_dir) + output_dir.mkdir(parents=True, exist_ok=False) + + +def _copy_or_link_dir(src: Path, dst: Path, asset_mode: str) -> None: + if asset_mode == "copy": + shutil.copytree(src, dst) + elif asset_mode == "symlink": + dst.symlink_to(src, target_is_directory=True) + else: + raise AssembleError(f"Unknown asset mode: {asset_mode}") + + +def _materialize_weight(weight: WeightSpec, dst_dir: Path, role: str) -> tuple[Path, ...]: + if weight.kind == "single": + assert weight.single_file is not None + dst = dst_dir / _canonical_weight_name(weight.single_file) + shutil.copy2(weight.single_file, dst) + return (dst,) + + if weight.kind == "sharded": + assert weight.index_file is not None + copied: list[Path] = [] + index_dst = dst_dir / weight.index_file.name + shutil.copy2(weight.index_file, index_dst) + copied.append(index_dst) + for shard_file in weight.shard_files: + shard_dst = dst_dir / shard_file.name + shutil.copy2(shard_file, shard_dst) + copied.append(shard_dst) + return tuple(copied) + + raise AssembleError(f"Unknown {role} weight kind: {weight.kind}") + + +def _assemble( + skeleton: Path, + output_dir: Path, + transformer_weight: WeightSpec, + transformer_2_weight: WeightSpec, + asset_mode: str, +) -> tuple[tuple[Path, ...], tuple[Path, ...]]: + shutil.copy2(skeleton / "model_index.json", output_dir / "model_index.json") + + for dir_name in ROOT_REQUIRED_DIRS: + if dir_name in ("transformer", "transformer_2"): + continue + _copy_or_link_dir(skeleton / dir_name, output_dir / dir_name, asset_mode) + + for dir_name in OPTIONAL_DIRS: + src_dir = skeleton / dir_name + if src_dir.is_dir(): + _copy_or_link_dir(src_dir, output_dir / dir_name, asset_mode) + + (output_dir / "transformer").mkdir(parents=True, exist_ok=True) + (output_dir / "transformer_2").mkdir(parents=True, exist_ok=True) + + shutil.copy2(skeleton / "transformer" / "config.json", output_dir / "transformer" / "config.json") + shutil.copy2(skeleton / "transformer_2" / 
"config.json", output_dir / "transformer_2" / "config.json") + + transformer_copied = _materialize_weight(transformer_weight, output_dir / "transformer", role="transformer") + transformer_2_copied = _materialize_weight( + transformer_2_weight, + output_dir / "transformer_2", + role="transformer_2", + ) + + return transformer_copied, transformer_2_copied + + +def _validate_output( + output_dir: Path, + transformer_copied: tuple[Path, ...], + transformer_2_copied: tuple[Path, ...], +) -> None: + if not (output_dir / "model_index.json").is_file(): + raise AssembleError("Output validation failed: model_index.json missing") + + required_paths = ( + output_dir / "tokenizer", + output_dir / "text_encoder", + output_dir / "vae", + output_dir / "transformer" / "config.json", + output_dir / "transformer_2" / "config.json", + *transformer_copied, + *transformer_2_copied, + ) + missing = [str(p) for p in required_paths if not p.exists()] + if missing: + raise AssembleError("Output validation failed, missing: " + ", ".join(missing)) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Assemble a Wan2.2-I2V-A14B-Diffusers directory while optionally " + "replacing transformer and transformer_2 weights." + ) + ) + parser.add_argument( + "--diffusers-skeleton", + type=Path, + required=True, + help="Path to a local Wan-AI/Wan2.2-I2V-A14B-Diffusers directory.", + ) + parser.add_argument( + "--transformer-weight", + type=Path, + help=( + "Optional checkpoint file, or directory containing either a single-file " + "weight or sharded index+shards for transformer/. If omitted, keep the " + "skeleton's original transformer weights." + ), + ) + parser.add_argument( + "--transformer-2-weight", + type=Path, + help=( + "Optional checkpoint file, or directory containing either a single-file " + "weight or sharded index+shards for transformer_2/. If omitted, keep the " + "skeleton's original transformer_2 weights." 
+ ), + ) + parser.add_argument( + "--high-noise-weight", + type=Path, + help=argparse.SUPPRESS, + ) + parser.add_argument( + "--low-noise-weight", + type=Path, + help=argparse.SUPPRESS, + ) + parser.add_argument( + "--output-dir", + type=Path, + required=True, + help="Output directory for the assembled model.", + ) + parser.add_argument( + "--asset-mode", + choices=("symlink", "copy"), + default="symlink", + help=( + "How to materialize non-transformer assets (tokenizer/text_encoder/vae/optional dirs). " + "symlink saves disk and is default." + ), + ) + parser.add_argument( + "--overwrite", + action="store_true", + help="Overwrite output-dir if it exists.", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + + skeleton = args.diffusers_skeleton.resolve() + output_dir = args.output_dir.resolve() + + if args.transformer_weight is not None and args.high_noise_weight is not None: + print( + "[ERROR] --transformer-weight and --high-noise-weight are aliases; please provide only one.", + file=sys.stderr, + ) + return 2 + if args.transformer_2_weight is not None and args.low_noise_weight is not None: + print( + "[ERROR] --transformer-2-weight and --low-noise-weight are aliases; please provide only one.", + file=sys.stderr, + ) + return 2 + + transformer_weight_arg = args.transformer_weight if args.transformer_weight is not None else args.high_noise_weight + transformer_2_weight_arg = ( + args.transformer_2_weight if args.transformer_2_weight is not None else args.low_noise_weight + ) + + transformer_input = ( + transformer_weight_arg.resolve() if transformer_weight_arg is not None else skeleton / "transformer" + ) + transformer_2_input = ( + transformer_2_weight_arg.resolve() if transformer_2_weight_arg is not None else skeleton / "transformer_2" + ) + + try: + _validate_skeleton(skeleton) + transformer_weight = _resolve_weight_spec(transformer_input, role="transformer") + transformer_2_weight = _resolve_weight_spec(transformer_2_input, 
role="transformer_2") + + _ensure_clean_output(output_dir, overwrite=args.overwrite) + transformer_copied, transformer_2_copied = _assemble( + skeleton=skeleton, + output_dir=output_dir, + transformer_weight=transformer_weight, + transformer_2_weight=transformer_2_weight, + asset_mode=args.asset_mode, + ) + _validate_output(output_dir, transformer_copied, transformer_2_copied) + except AssembleError as exc: + print(f"[ERROR] {exc}", file=sys.stderr) + return 2 + + def _weight_summary(copied: tuple[Path, ...]) -> str: + if len(copied) == 1: + return copied[0].name + return f"{copied[0].name} + {len(copied) - 1} shard files" + + print("[OK] Assembled Wan2.2 I2V Diffusers directory:") + print(f" output_dir: {output_dir}") + print(f" transformer weight: {_weight_summary(transformer_copied)}") + print(f" transformer_2 weight: {_weight_summary(transformer_2_copied)}") + print("\nUse it with vLLM-Omni, for example:") + print(f" vllm serve {output_dir} --omni --port 8091") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py index a550e576f01..84d89619e86 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py @@ -24,6 +24,7 @@ from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin, _is_rank_zero from vllm_omni.diffusion.models.schedulers import FlowUniPCMultistepScheduler +from vllm_omni.diffusion.models.wan2_2.scheduling_wan_euler import WanEulerScheduler from vllm_omni.diffusion.models.wan2_2.wan2_2_transformer import WanTransformer3DModel from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest @@ -32,6 +33,46 @@ logger = logging.getLogger(__name__) DEBUG_PERF = 
False +WAN_SAMPLE_SOLVER_CHOICES = {"unipc", "euler"} + + +def build_wan_scheduler(sample_solver: str, flow_shift: float) -> Any: + if sample_solver == "unipc": + return FlowUniPCMultistepScheduler( + num_train_timesteps=1000, + shift=flow_shift, + prediction_type="flow_prediction", + ) + if sample_solver == "euler": + return WanEulerScheduler( + num_train_timesteps=1000, + shift=flow_shift, + ) + + raise ValueError( + f"Unsupported Wan sample_solver: {sample_solver}. Expected one of: {sorted(WAN_SAMPLE_SOLVER_CHOICES)}" + ) + + +def resolve_wan_sample_solver(req: OmniDiffusionRequest, default: str = "unipc") -> str: + extra_args = getattr(req.sampling_params, "extra_args", {}) or {} + raw = extra_args.get("sample_solver", default) + sample_solver = str(raw).strip().lower() + if sample_solver not in WAN_SAMPLE_SOLVER_CHOICES: + raise ValueError(f"Invalid sample_solver={raw!r}. Expected one of: {sorted(WAN_SAMPLE_SOLVER_CHOICES)}") + return sample_solver + + +def resolve_wan_flow_shift(req: OmniDiffusionRequest, od_config: OmniDiffusionConfig) -> float: + extra_args = getattr(req.sampling_params, "extra_args", {}) or {} + raw_flow_shift = extra_args.get("flow_shift") + if raw_flow_shift is None: + raw_flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 + + try: + return float(raw_flow_shift) + except (TypeError, ValueError) as exc: + raise ValueError(f"Invalid flow_shift={raw_flow_shift!r}. 
flow_shift must be a float.") from exc def retrieve_latents( @@ -296,13 +337,9 @@ def __init__( else: raise RuntimeError("No transformer loaded") - # Initialize UniPC scheduler - flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 # default for 720p - self.scheduler = FlowUniPCMultistepScheduler( - num_train_timesteps=1000, - shift=flow_shift, - prediction_type="flow_prediction", - ) + self._sample_solver = "unipc" + self._flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 + self.scheduler = build_wan_scheduler(self._sample_solver, self._flow_shift) self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4 self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8 @@ -462,6 +499,13 @@ def forward( current_omni_platform.synchronize() _t_text_enc_ms = (time.perf_counter() - _t_text_enc_start) * 1000 + sample_solver = resolve_wan_sample_solver(req, default=self._sample_solver) + flow_shift = resolve_wan_flow_shift(req, self.od_config) + if sample_solver != self._sample_solver or abs(flow_shift - self._flow_shift) > 1e-6: + self.scheduler = build_wan_scheduler(sample_solver, flow_shift) + self._sample_solver = sample_solver + self._flow_shift = flow_shift + # Timesteps self.scheduler.set_timesteps(num_steps, device=device) timesteps = self.scheduler.timesteps diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py index c05ecc9c9a2..46484cd789d 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py @@ -24,10 +24,12 @@ from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.interface import SupportImageInput from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin, _is_rank_zero -from 
vllm_omni.diffusion.models.schedulers import FlowUniPCMultistepScheduler from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import ( + build_wan_scheduler, create_transformer_from_config, load_transformer_config, + resolve_wan_flow_shift, + resolve_wan_sample_solver, retrieve_latents, ) from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin @@ -230,13 +232,9 @@ def __init__( else: self.transformer_2 = None - # Initialize UniPC scheduler - flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 # default for 720p - self.scheduler = FlowUniPCMultistepScheduler( - num_train_timesteps=1000, - shift=flow_shift, - prediction_type="flow_prediction", - ) + self._sample_solver = "unipc" + self._flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 + self.scheduler = build_wan_scheduler(self._sample_solver, self._flow_shift) # VAE scale factors self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if hasattr(self.vae, "config") else 4 @@ -440,6 +438,13 @@ def forward( current_omni_platform.synchronize() _t_img_enc_ms = (time.perf_counter() - _t_img_enc_start) * 1000 + sample_solver = resolve_wan_sample_solver(req, default=self._sample_solver) + flow_shift = resolve_wan_flow_shift(req, self.od_config) + if sample_solver != self._sample_solver or abs(flow_shift - self._flow_shift) > 1e-6: + self.scheduler = build_wan_scheduler(sample_solver, flow_shift) + self._sample_solver = sample_solver + self._flow_shift = flow_shift + # Timesteps self.scheduler.set_timesteps(num_steps, device=device) timesteps = self.scheduler.timesteps diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py index 261f62fb798..939fe294a33 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py @@ -36,10 +36,12 @@ from 
vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.interface import SupportImageInput from vllm_omni.diffusion.models.progress_bar import ProgressBarMixin -from vllm_omni.diffusion.models.schedulers import FlowUniPCMultistepScheduler from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import ( + build_wan_scheduler, create_transformer_from_config, load_transformer_config, + resolve_wan_flow_shift, + resolve_wan_sample_solver, retrieve_latents, ) from vllm_omni.diffusion.request import OmniDiffusionRequest @@ -183,13 +185,9 @@ def __init__( transformer_config = load_transformer_config(model, "transformer", local_files_only) self.transformer = create_transformer_from_config(transformer_config) - # Initialize UniPC scheduler - flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 # default for 720p - self.scheduler = FlowUniPCMultistepScheduler( - num_train_timesteps=1000, - shift=flow_shift, - prediction_type="flow_prediction", - ) + self._sample_solver = "unipc" + self._flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 + self.scheduler = build_wan_scheduler(self._sample_solver, self._flow_shift) # VAE scale factors self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if hasattr(self.vae, "config") else 4 @@ -323,6 +321,13 @@ def forward( batch_size = prompt_embeds.shape[0] + sample_solver = resolve_wan_sample_solver(req, default=self._sample_solver) + flow_shift = resolve_wan_flow_shift(req, self.od_config) + if sample_solver != self._sample_solver or abs(flow_shift - self._flow_shift) > 1e-6: + self.scheduler = build_wan_scheduler(sample_solver, flow_shift) + self._sample_solver = sample_solver + self._flow_shift = flow_shift + # Timesteps self.scheduler.set_timesteps(num_steps, device=device) timesteps = self.scheduler.timesteps diff --git a/vllm_omni/diffusion/models/wan2_2/scheduling_wan_euler.py 
b/vllm_omni/diffusion/models/wan2_2/scheduling_wan_euler.py new file mode 100644 index 00000000000..25444044c2d --- /dev/null +++ b/vllm_omni/diffusion/models/wan2_2/scheduling_wan_euler.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from __future__ import annotations + +from dataclasses import dataclass +from types import SimpleNamespace + +import numpy as np +import torch + + +@dataclass +class WanEulerSchedulerOutput: + prev_sample: torch.FloatTensor + + +def _unsqueeze_to_ndim(in_tensor: torch.Tensor, target_ndim: int) -> torch.Tensor: + if in_tensor.ndim >= target_ndim: + return in_tensor + return in_tensor[(...,) + (None,) * (target_ndim - in_tensor.ndim)] + + +def _get_timesteps(num_steps: int, max_steps: int = 1000) -> np.ndarray: + # Keep num_steps + 1 points so Euler update can always access sigma_next. + return np.linspace(max_steps, 0, num_steps + 1, dtype=np.float32) + + +def _timestep_shift(timesteps: torch.Tensor, shift: float = 1.0) -> torch.Tensor: + return shift * timesteps / (1 + (shift - 1) * timesteps) + + +class WanEulerScheduler: + order = 1 + + def __init__( + self, + num_train_timesteps: int = 1000, + shift: float = 1.0, + device: torch.device | str = "cpu", + ) -> None: + self.num_train_timesteps = int(num_train_timesteps) + self._shift = float(shift) + self.device = device + self.config = SimpleNamespace(num_train_timesteps=self.num_train_timesteps) + self.init_noise_sigma = 1.0 + + self._step_index: int | None = None + self._begin_index: int | None = None + + self.timesteps = torch.empty(0, dtype=torch.float32) + self.sigmas = torch.empty(0, dtype=torch.float32) + self.timesteps_ori = torch.empty(0, dtype=torch.float32) + + self.set_timesteps(num_inference_steps=self.num_train_timesteps, device=self.device) + + @property + def step_index(self) -> int | None: + return self._step_index + + @property + def begin_index(self) -> int | None: + return 
self._begin_index + + def set_begin_index(self, begin_index: int = 0) -> None: + self._begin_index = int(begin_index) + + def index_for_timestep(self, timestep: torch.Tensor) -> int: + indices = (self.timesteps == timestep).nonzero() + if len(indices) > 0: + pos = 1 if len(indices) > 1 else 0 + return int(indices[pos].item()) + # Fallback for tiny float drift + return int(torch.argmin(torch.abs(self.timesteps - timestep)).item()) + + def _init_step_index(self, timestep: float | torch.Tensor) -> None: + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep_t = timestep.to(self.timesteps.device, dtype=self.timesteps.dtype) + else: + timestep_t = torch.tensor(timestep, device=self.timesteps.device, dtype=self.timesteps.dtype) + self._step_index = self.index_for_timestep(timestep_t) + else: + self._step_index = self._begin_index + + def set_shift(self, shift: float = 1.0) -> None: + # Compute shifted sigma schedule on [0, 1]. + sigmas_full = self.timesteps_ori / float(self.num_train_timesteps) + sigmas_full = _timestep_shift(sigmas_full, shift=float(shift)) + self.sigmas = sigmas_full + # Public timesteps are the first N points; next point is consumed as sigma_next. 
+ self.timesteps = self.sigmas[:-1] * self.num_train_timesteps + self._shift = float(shift) + + def set_timesteps( + self, + num_inference_steps: int, + device: torch.device | str | int | None = None, + **kwargs, # noqa: ARG002 - kept for scheduler API compatibility + ) -> None: + timesteps = _get_timesteps( + num_steps=int(num_inference_steps), + max_steps=self.num_train_timesteps, + ) + self.timesteps_ori = torch.from_numpy(timesteps).to( + dtype=torch.float32, + device=device or self.device, + ) + self.set_shift(self._shift) + self._step_index = None + self._begin_index = None + + def scale_model_input(self, sample: torch.Tensor, timestep: int | None = None) -> torch.Tensor: # noqa: ARG002 + return sample + + def step( + self, + model_output: torch.FloatTensor, + timestep: float | torch.FloatTensor, + sample: torch.FloatTensor, + return_dict: bool = True, + **kwargs, # noqa: ARG002 - kept for scheduler API compatibility + ) -> WanEulerSchedulerOutput | tuple[torch.FloatTensor]: + if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)): + raise ValueError( + "Passing integer indices as timesteps is not supported. Use one value from scheduler.timesteps instead." 
+ ) + + if self.step_index is None: + self._init_step_index(timestep) + assert self._step_index is not None + + sample_fp32 = sample.to(torch.float32) + sigma = _unsqueeze_to_ndim(self.sigmas[self._step_index], sample_fp32.ndim).to(sample_fp32.device) + sigma_next = _unsqueeze_to_ndim(self.sigmas[self._step_index + 1], sample_fp32.ndim).to(sample_fp32.device) + + prev_sample = sample_fp32 + (sigma_next - sigma) * model_output + prev_sample = prev_sample.to(model_output.dtype) + + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + return WanEulerSchedulerOutput(prev_sample=prev_sample) + + def __len__(self) -> int: + return self.num_train_timesteps diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py index 65a2d4390ae..3b43f3eaf51 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py @@ -1015,6 +1015,14 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: if ".to_out.0." in lookup_name: lookup_name = lookup_name.replace(".to_out.0.", ".to_out.") + # Compatibility: some Wan conversion pipelines still keep + # block modulation keys as `blocks.N.modulation` instead of + # `blocks.N.scale_shift_table`. 
+ if lookup_name.endswith(".modulation"): + modulation_alias = lookup_name[: -len(".modulation")] + ".scale_shift_table" + if modulation_alias in params_dict: + lookup_name = modulation_alias + if lookup_name not in params_dict: logger.warning(f"Skipping weight {original_name} -> {lookup_name}") continue diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 8e0b2b2df11..32e8336f6da 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -1221,6 +1221,8 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list: "enable_cpu_offload": kwargs.get("enable_cpu_offload", False), "enable_layerwise_offload": kwargs.get("enable_layerwise_offload", False), "enforce_eager": kwargs.get("enforce_eager", False), + "boundary_ratio": kwargs.get("boundary_ratio", None), + "flow_shift": kwargs.get("flow_shift", None), "diffusion_load_format": kwargs.get("diffusion_load_format", "default"), "custom_pipeline_args": kwargs.get("custom_pipeline_args", None), "worker_extension_cls": kwargs.get("worker_extension_cls", None), From d9e745ce2c562be06913cf27c3c9942a56154b93 Mon Sep 17 00:00:00 2001 From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Date: Mon, 13 Apr 2026 02:30:56 -0400 Subject: [PATCH 09/76] [Fix] VoxCPM2: support raw audio for voice cloning via OpenAI API (#2720) Signed-off-by: Yueqian Lin --- examples/online_serving/voxcpm2/README.md | 42 ++++++ .../voxcpm2/openai_speech_client.py | 108 +++++++++++++++ .../models/voxcpm2/voxcpm2_talker.py | 130 +++++++++++++++++- 3 files changed, 277 insertions(+), 3 deletions(-) create mode 100644 examples/online_serving/voxcpm2/README.md create mode 100644 examples/online_serving/voxcpm2/openai_speech_client.py diff --git a/examples/online_serving/voxcpm2/README.md b/examples/online_serving/voxcpm2/README.md new file mode 100644 index 00000000000..8735180f0ac --- /dev/null +++ b/examples/online_serving/voxcpm2/README.md @@ 
-0,0 +1,42 @@ +# VoxCPM2 Online Serving + +Serve VoxCPM2 TTS via the OpenAI-compatible `/v1/audio/speech` endpoint. + +## Start the Server + +```bash +python -m vllm_omni.entrypoints.openai.api_server \ + --model openbmb/VoxCPM2 \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm2.yaml \ + --host 0.0.0.0 --port 8000 +``` + +## Zero-shot Synthesis + +```bash +python openai_speech_client.py --text "Hello, this is VoxCPM2." +``` + +Or with curl: + +```bash +curl -X POST http://localhost:8000/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{"model": "voxcpm2", "input": "Hello, this is VoxCPM2.", "voice": "default"}' \ + --output output.wav +``` + +## Voice Cloning + +Clone a speaker's voice using a reference audio file: + +```bash +python openai_speech_client.py \ + --text "This should sound like the reference speaker." \ + --ref-audio /path/to/reference.wav +``` + +The `--ref-audio` parameter accepts: +- Local file path (auto-encoded to base64) +- URL (`https://...`) +- Base64 data URI (`data:audio/wav;base64,...`) diff --git a/examples/online_serving/voxcpm2/openai_speech_client.py b/examples/online_serving/voxcpm2/openai_speech_client.py new file mode 100644 index 00000000000..a117d24fd1a --- /dev/null +++ b/examples/online_serving/voxcpm2/openai_speech_client.py @@ -0,0 +1,108 @@ +"""OpenAI-compatible client for VoxCPM2 TTS via /v1/audio/speech endpoint. + +Examples: + # Zero-shot synthesis + python openai_speech_client.py --text "Hello, this is VoxCPM2." 
+ + # Voice cloning with a local reference audio file + python openai_speech_client.py --text "Hello world" \ + --ref-audio /path/to/reference.wav + + # Voice cloning with a URL + python openai_speech_client.py --text "Hello world" \ + --ref-audio "https://example.com/reference.wav" + +Server setup: + python -m vllm_omni.entrypoints.openai.api_server \ + --model openbmb/VoxCPM2 \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm2.yaml \ + --host 0.0.0.0 --port 8000 +""" + +from __future__ import annotations + +import argparse +import base64 +import os + +import httpx + +DEFAULT_API_BASE = "http://localhost:8000" +DEFAULT_API_KEY = "sk-empty" + + +def encode_audio_to_base64(audio_path: str) -> str: + """Encode a local audio file to a base64 data URL.""" + if not os.path.exists(audio_path): + raise FileNotFoundError(f"Audio file not found: {audio_path}") + + ext = audio_path.lower().rsplit(".", 1)[-1] + mime = { + "wav": "audio/wav", + "mp3": "audio/mpeg", + "flac": "audio/flac", + "ogg": "audio/ogg", + }.get(ext, "audio/wav") + + with open(audio_path, "rb") as f: + b64 = base64.b64encode(f.read()).decode("utf-8") + return f"data:{mime};base64,{b64}" + + +def main() -> None: + parser = argparse.ArgumentParser(description="VoxCPM2 OpenAI speech client") + parser.add_argument("--text", type=str, required=True, help="Text to synthesize") + parser.add_argument( + "--ref-audio", + type=str, + default=None, + help="Reference audio for voice cloning (local path, URL, or data: URI)", + ) + parser.add_argument("--model", type=str, default="voxcpm2") + parser.add_argument("--output", type=str, default="output.wav") + parser.add_argument("--api-base", type=str, default=DEFAULT_API_BASE) + parser.add_argument("--api-key", type=str, default=DEFAULT_API_KEY) + parser.add_argument("--response-format", type=str, default="wav") + args = parser.parse_args() + + # VoxCPM2 has no predefined voices. 
The "voice" field is required by + # the OpenAI API schema but ignored by VoxCPM2 — use any placeholder. + # For voice cloning, pass --ref-audio instead. + payload: dict = { + "model": args.model, + "input": args.text, + "voice": "default", + "response_format": args.response_format, + } + + if args.ref_audio: + ref = args.ref_audio + if ref.startswith(("http://", "https://", "data:")): + payload["ref_audio"] = ref + else: + payload["ref_audio"] = encode_audio_to_base64(ref) + + url = f"{args.api_base}/v1/audio/speech" + print(f"POST {url}") + print(f" text: {args.text}") + if args.ref_audio: + print(f" ref_audio: {args.ref_audio[:80]}...") + + with httpx.Client(timeout=300) as client: + resp = client.post( + url, + json=payload, + headers={"Authorization": f"Bearer {args.api_key}"}, + ) + + if resp.status_code != 200: + print(f"Error {resp.status_code}: {resp.text[:500]}") + return + + with open(args.output, "wb") as f: + f.write(resp.content) + print(f"Saved: {args.output} ({len(resp.content):,} bytes)") + + +if __name__ == "__main__": + main() diff --git a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py index ade68b673b7..b9faf9fa3b8 100644 --- a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py +++ b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py @@ -22,6 +22,7 @@ from collections.abc import Iterable from typing import Any +import librosa import torch import torch.nn as nn from vllm.config import VllmConfig @@ -41,6 +42,53 @@ logger = init_logger(__name__) +def _encode_raw_audio( + tts: nn.Module, + samples: list[float] | torch.Tensor, + sr: int, + padding_mode: str = "right", +) -> torch.Tensor: + """Encode raw audio samples using the native VoxCPM2 AudioVAE. + + Mirrors ``VoxCPM2Model._encode_wav`` but accepts in-memory samples + instead of a file path. This is needed for the OpenAI speech API + where ``_resolve_ref_audio`` returns decoded audio data. 
+ + Args: + tts: Native VoxCPM2 tts_model instance. + samples: Audio samples (mono, float32). + sr: Sample rate of the input audio. + padding_mode: "right" (default) or "left" padding. + + Returns: + audio_feat: (T, P, D) tensor of latent patches. + """ + if isinstance(samples, list): + audio = torch.tensor(samples, dtype=torch.float32) + else: + audio = samples.float() + + if audio.ndim == 1: + audio = audio.unsqueeze(0) + + # Resample to the model's expected encoding sample rate + encode_sr = tts._encode_sample_rate + if sr != encode_sr: + audio_np = audio.squeeze(0).numpy() + audio_np = librosa.resample(audio_np, orig_sr=sr, target_sr=encode_sr) + audio = torch.from_numpy(audio_np).unsqueeze(0) + + # Pad to patch boundary + patch_len = tts.patch_size * tts.chunk_size + if audio.size(1) % patch_len != 0: + padding_size = patch_len - audio.size(1) % patch_len + pad = (padding_size, 0) if padding_mode == "left" else (0, padding_size) + audio = torch.nn.functional.pad(audio, pad) + + feat = tts.audio_vae.encode(audio.to(tts.device), encode_sr).cpu() + return feat.view(tts.audio_vae.latent_dim, -1, tts.patch_size).permute(1, 2, 0) + + class VoxCPM2TalkerForConditionalGeneration(nn.Module): """VoxCPM2 talker using native MiniCPM4 base_lm. @@ -83,6 +131,82 @@ def tts(self) -> nn.Module: assert self._tts is not None, "Model not loaded yet" return self._tts + def _build_prompt_cache( + self, + ref_audio: Any = None, + prompt_audio: Any = None, + prompt_text: str | None = None, + ) -> dict | None: + """Build prompt cache, handling both file paths and raw audio data. + + The OpenAI speech API sends decoded audio as [samples_list, sr] + via ``_resolve_ref_audio``, while offline usage sends file paths. + This method detects the format and routes accordingly. 
+ """ + tts = self.tts + + def _is_raw_audio(v: Any) -> bool: + """Check if value is [samples, sr] from serving_speech.""" + return ( + isinstance(v, (list, tuple)) + and len(v) == 2 + and isinstance(v[1], int) + and isinstance(v[0], (list, torch.Tensor)) + ) + + # If all inputs are file paths (or None), use native build_prompt_cache + if not _is_raw_audio(ref_audio) and not _is_raw_audio(prompt_audio): + return tts.build_prompt_cache( + prompt_text=prompt_text, + prompt_wav_path=prompt_audio, + reference_wav_path=ref_audio, + ) + + # Raw audio path: encode directly + cache: dict[str, Any] = {} + + if ref_audio is not None: + if _is_raw_audio(ref_audio): + samples, sr = ref_audio + cache["ref_audio_feat"] = _encode_raw_audio( + tts, + samples, + sr, + padding_mode="right", + ) + else: + cache["ref_audio_feat"] = tts._encode_wav( + ref_audio, + padding_mode="right", + ) + + if prompt_audio is not None and prompt_text is not None: + cache["prompt_text"] = prompt_text + if _is_raw_audio(prompt_audio): + samples, sr = prompt_audio + cache["audio_feat"] = _encode_raw_audio( + tts, + samples, + sr, + padding_mode="left", + ) + else: + cache["audio_feat"] = tts._encode_wav( + prompt_audio, + padding_mode="left", + ) + + has_ref = "ref_audio_feat" in cache + has_prompt = "audio_feat" in cache + if has_ref and has_prompt: + cache["mode"] = "ref_continuation" + elif has_ref: + cache["mode"] = "reference" + else: + cache["mode"] = "continuation" + + return cache + # -------------------- vllm hooks -------------------- def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor: @@ -482,10 +606,10 @@ def preprocess( self._prompt_cache = None if ref_audio or (prompt_audio and prompt_text): try: - self._prompt_cache = self.tts.build_prompt_cache( + self._prompt_cache = self._build_prompt_cache( + ref_audio=ref_audio, + prompt_audio=prompt_audio, prompt_text=prompt_text, - prompt_wav_path=prompt_audio, - reference_wav_path=ref_audio, ) except Exception as e: 
logger.warning("build_prompt_cache failed: %s; falling back to zero-shot", e) From 22261430b42b3e91d2019367da9fe1a8bac7f58a Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Mon, 13 Apr 2026 14:47:55 +0800 Subject: [PATCH 10/76] [CI][Bugfix] Refactor the test case to add support for increasing init timeout and stage init timeout in order to resolve the CI timeout error. (#2711) Signed-off-by: wangyu <410167048@qq.com> --- .buildkite/test-merge.yml | 2 +- .buildkite/test-nightly.yml | 3 +- tests/conftest.py | 8 +- .../offline_inference/test_bagel_img2img.py | 15 +- .../e2e/offline_inference/test_bagel_lora.py | 11 +- .../offline_inference/test_bagel_text2img.py | 32 ++-- .../test_bagel_understanding.py | 27 +-- tests/e2e/offline_inference/test_cache_dit.py | 35 +--- .../test_diffusion_cpu_offload.py | 43 ++--- .../test_diffusion_layerwise_offload.py | 56 +++--- .../offline_inference/test_diffusion_lora.py | 14 +- .../e2e/offline_inference/test_dynin_omni.py | 73 ++------ .../offline_inference/test_expert_parallel.py | 51 +++--- .../test_flux_autoround_w4a16.py | 40 ++--- .../offline_inference/test_flux_kontext.py | 97 +++++----- .../test_hunyuanimage3_text2img.py | 14 +- .../e2e/offline_inference/test_magi_human.py | 17 +- .../offline_inference/test_mammoth_moda2.py | 11 +- tests/e2e/offline_inference/test_omnivoice.py | 55 +++--- .../test_quantization_fp8.py | 19 +- .../test_qwen_image_diffusion_batching.py | 165 ++++++++---------- .../test_sequence_parallel.py | 63 ++++--- .../test_stable_audio_model.py | 21 +-- tests/e2e/offline_inference/test_t2i_model.py | 101 +++++------ tests/e2e/offline_inference/test_t2v_model.py | 51 +++--- tests/e2e/offline_inference/test_teacache.py | 37 +--- .../test_vae_decode_parallelism.py | 36 ++-- tests/e2e/offline_inference/test_voxcpm2.py | 7 +- .../e2e/offline_inference/test_voxtral_tts.py | 17 +- .../test_zimage_parallelism.py | 112 ++++++------ .../test_images_generations_lora.py | 2 +- 31 
files changed, 497 insertions(+), 738 deletions(-) diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 7355e2b4c7c..24fc6dd3dc2 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -113,7 +113,7 @@ steps: - "/fsx/hf_cache:/fsx/hf_cache" - label: "Diffusion Sequence Parallelism Test" - timeout_in_minutes: 20 + timeout_in_minutes: 25 depends_on: upload-merge-pipeline commands: - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py tests/diffusion/distributed/test_ulysses_uaa_perf.py diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 06b7c14ae1d..31b3e17976c 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -141,7 +141,6 @@ steps: - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py - buildkite-agent artifact upload "tests/dfx/perf/results/*.json" - - buildkite-agent artifact upload "tests/dfx/perf/results/*.html" agents: queue: "mithril-h100-pool" plugins: @@ -244,7 +243,7 @@ steps: - export DEFAULT_OUTPUT_DIR=tests/dfx/perf/results - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-omni-performance - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-qwen-image-performance - - buildkite-agent artifact download "tests/dfx/perf/results/*.html" . --step nightly-omni-performance + - buildkite-agent artifact download "tests/dfx/perf/results/*.html" . 
--step nightly-testcase-statistics - python tools/nightly/generate_nightly_perf_excel.py - python tools/nightly/generate_nightly_perf_html.py - python tools/nightly/send_nightly_email.py --report-file "tests/dfx/perf/results/*.xlsx, tests/dfx/perf/results/*.html" diff --git a/tests/conftest.py b/tests/conftest.py index 18a0ee57d97..9c739533b83 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1771,8 +1771,12 @@ def omni_server(request: pytest.FixtureRequest, run_level: str, model_prefix: st server_args = params.server_args or [] if params.use_omni and params.stage_init_timeout is not None: server_args = [*server_args, "--stage-init-timeout", str(params.stage_init_timeout)] + else: + server_args = [*server_args, "--stage-init-timeout", "600"] if params.init_timeout is not None: server_args = [*server_args, "--init-timeout", str(params.init_timeout)] + else: + server_args = [*server_args, "--init-timeout", "900"] if params.use_stage_cli: if not params.use_omni: raise ValueError("omni_server with use_stage_cli=True requires use_omni=True") @@ -2870,9 +2874,9 @@ def __init__( self, model_name: str, seed: int = 42, - stage_init_timeout: int = 300, + stage_init_timeout: int = 600, batch_timeout: int = 10, - init_timeout: int = 300, + init_timeout: int = 900, shm_threshold_bytes: int = 65536, log_stats: bool = False, stage_configs_path: str | None = None, diff --git a/tests/e2e/offline_inference/test_bagel_img2img.py b/tests/e2e/offline_inference/test_bagel_img2img.py index a0c3f6cc9fc..63d2a37da79 100644 --- a/tests/e2e/offline_inference/test_bagel_img2img.py +++ b/tests/e2e/offline_inference/test_bagel_img2img.py @@ -22,9 +22,9 @@ from PIL import Image from vllm.assets.image import ImageAsset -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni +from vllm_omni import Omni from vllm_omni.platforms import current_omni_platform 
# Reference pixel data extracted from the known-good output image @@ -210,11 +210,10 @@ def test_bagel_img2img_shared_memory_connector(run_level): input_image = _load_input_image() config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") config_path = _resolve_stage_config(config_path, run_level) - omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300) - - try: - generated_image = _generate_bagel_img2img(omni, input_image) + with OmniRunner( + "ByteDance-Seed/BAGEL-7B-MoT", + stage_configs_path=config_path, + ) as runner: + generated_image = _generate_bagel_img2img(runner.omni, input_image) if run_level == "advanced_model": _validate_pixels(generated_image) - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_bagel_lora.py b/tests/e2e/offline_inference/test_bagel_lora.py index 593a640478d..501d23eaa88 100644 --- a/tests/e2e/offline_inference/test_bagel_lora.py +++ b/tests/e2e/offline_inference/test_bagel_lora.py @@ -22,7 +22,6 @@ from vllm_omni.outputs import OmniRequestOutput os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" from pathlib import Path @@ -32,9 +31,9 @@ from PIL import Image from safetensors.torch import save_file -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni +from vllm_omni import Omni from vllm_omni.lora.request import LoRARequest from vllm_omni.lora.utils import stable_lora_int_id @@ -154,8 +153,8 @@ def _make_file_lora_request(adapter_dir: Path) -> LoRARequest: def test_bagel_lora_scale_and_deactivation(run_level, tmp_path): """Validate LoRA effect, bounded perturbation, and clean deactivation.""" config_path = _resolve_stage_config(BAGEL_STAGE_CONFIG, run_level) - omni = Omni(model=MODEL, stage_configs_path=config_path, stage_init_timeout=300) - try: + 
with OmniRunner(MODEL, stage_configs_path=config_path) as runner: + omni = runner.omni lora_request = _make_file_lora_request(tmp_path / "bagel_lora") # 1) Baseline (no LoRA) @@ -194,5 +193,3 @@ def test_bagel_lora_scale_and_deactivation(run_level, tmp_path): # (d) Deactivation fully restores base model assert diff_restored == 0.0, f"Base model not restored after LoRA deactivation: diff={diff_restored}" - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_bagel_text2img.py b/tests/e2e/offline_inference/test_bagel_text2img.py index 7cce8da3a73..e45d64f2ac5 100644 --- a/tests/e2e/offline_inference/test_bagel_text2img.py +++ b/tests/e2e/offline_inference/test_bagel_text2img.py @@ -16,7 +16,6 @@ import os os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" import signal import socket import subprocess @@ -28,9 +27,9 @@ import pytest from PIL import Image -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni +from vllm_omni import Omni from vllm_omni.platforms import current_omni_platform # Reference pixel data extracted from the known-good output image @@ -199,14 +198,13 @@ def test_bagel_text2img_shared_memory_connector(run_level): """Test Bagel text2img with shared memory connector.""" config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") config_path = _resolve_stage_config(config_path, run_level) - omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=config_path, stage_init_timeout=300) - - try: - generated_image = _generate_bagel_image(omni) + with OmniRunner( + "ByteDance-Seed/BAGEL-7B-MoT", + stage_configs_path=config_path, + ) as runner: + generated_image = _generate_bagel_image(runner.omni) if run_level == "advanced_model": _validate_pixels(generated_image) - finally: - omni.close() def _wait_for_port(host: 
str, port: int, timeout: int = 30) -> bool: @@ -319,7 +317,6 @@ def test_bagel_text2img_mooncake_connector(run_level): mooncake_master_proc = None temp_config_file = None - omni = None try: _cleanup_mooncake_processes() @@ -349,15 +346,16 @@ def test_bagel_text2img_mooncake_connector(run_level): ) temp_config_file = _resolve_stage_config(temp_config_file, run_level) - omni = Omni(model="ByteDance-Seed/BAGEL-7B-MoT", stage_configs_path=temp_config_file, stage_init_timeout=300) - - generated_image = _generate_bagel_image(omni) - if run_level == "advanced_model": - _validate_pixels(generated_image) + with OmniRunner( + "ByteDance-Seed/BAGEL-7B-MoT", + stage_configs_path=temp_config_file, + stage_init_timeout=300, + ) as runner: + generated_image = _generate_bagel_image(runner.omni) + if run_level == "advanced_model": + _validate_pixels(generated_image) finally: - if omni: - omni.close() if temp_config_file: try: os.unlink(temp_config_file) diff --git a/tests/e2e/offline_inference/test_bagel_understanding.py b/tests/e2e/offline_inference/test_bagel_understanding.py index 6f95e7ee00f..bbee3298079 100644 --- a/tests/e2e/offline_inference/test_bagel_understanding.py +++ b/tests/e2e/offline_inference/test_bagel_understanding.py @@ -21,15 +21,13 @@ import os os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" from pathlib import Path import pytest from vllm.assets.image import ImageAsset -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni MODEL_NAME = "ByteDance-Seed/BAGEL-7B-MoT" STAGE_CONFIG = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml") @@ -76,13 +74,11 @@ def _extract_text(omni_outputs: list) -> str: def test_bagel_text2text(run_level): """Test Bagel text2text produces correct text output.""" config_path = _resolve_stage_config(STAGE_CONFIG, run_level) - 
omni = Omni( - model=MODEL_NAME, + with OmniRunner( + MODEL_NAME, stage_configs_path=config_path, - stage_init_timeout=300, - ) - - try: + ) as runner: + omni = runner.omni prompt = "<|im_start|>user\nWhere is the capital of France?<|im_end|>\n<|im_start|>assistant\n" params_list = omni.default_sampling_params_list omni_outputs = list( @@ -100,8 +96,6 @@ def test_bagel_text2text(run_level): assert text == REFERENCE_TEXT_TEXT2TEXT, ( f"Text mismatch: expected {REFERENCE_TEXT_TEXT2TEXT!r}, got {text!r}" ) - finally: - omni.close() @pytest.mark.core_model @@ -112,13 +106,12 @@ def test_bagel_img2text(run_level): """Test Bagel img2text produces correct text output.""" input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB") config_path = _resolve_stage_config(STAGE_CONFIG, run_level) - omni = Omni( - model=MODEL_NAME, + with OmniRunner( + MODEL_NAME, stage_configs_path=config_path, stage_init_timeout=300, - ) - - try: + ) as runner: + omni = runner.omni prompt = "<|im_start|>user\n<|image_pad|>\nPlease describe this image<|im_end|>\n<|im_start|>assistant\n" params_list = omni.default_sampling_params_list omni_outputs = list( @@ -140,5 +133,3 @@ def test_bagel_img2text(run_level): if run_level == "advanced_model": assert text == REFERENCE_TEXT_IMG2TEXT, f"Text mismatch: expected {REFERENCE_TEXT_IMG2TEXT!r}, got {text!r}" - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_cache_dit.py b/tests/e2e/offline_inference/test_cache_dit.py index 0e31413dc07..fc08da7bedf 100644 --- a/tests/e2e/offline_inference/test_cache_dit.py +++ b/tests/e2e/offline_inference/test_cache_dit.py @@ -8,27 +8,15 @@ It uses minimal settings to keep test time short for CI. 
""" -import os -import sys -from pathlib import Path - import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams - -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" - # Use random weights model for testing models = ["riverclouds/qwen_image_random"] @@ -48,20 +36,17 @@ def test_cache_dit(model_name: str): "residual_diff_threshold": 0.24, "max_continuous_cached_steps": 3, } - m = None - try: - m = Omni( - model=model_name, - cache_backend="cache_dit", - cache_config=cache_config, - ) - + with OmniRunner( + model_name, + cache_backend="cache_dit", + cache_config=cache_config, + ) as runner: # Use minimal settings for fast testing height = 256 width = 256 num_inference_steps = 4 # Minimal steps for fast test - outputs = m.generate( + outputs = runner.omni.generate( "a photo of a cat sitting on a laptop keyboard", OmniDiffusionSamplingParams( height=height, @@ -90,9 +75,3 @@ def test_cache_dit(model_name: str): # Check image size assert images[0].width == width assert images[0].height == height - except Exception as e: - print(f"Test failed with error: {e}") - raise - finally: - if m is not None and hasattr(m, "close"): - m.close() diff --git a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py index f3830f02e97..257755ef8b9 100644 --- a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py +++ b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py @@ -1,22 +1,14 @@ import gc -import sys -from pathlib import Path import pytest import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory +from 
tests.conftest import OmniRunner from tests.utils import DeviceMemoryMonitor, hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni - models = ["riverclouds/qwen_image_random"] @@ -27,30 +19,29 @@ def inference(model_name: str, offload: bool = True): current_omni_platform.reset_peak_memory_stats() monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - m = Omni( - model=model_name, + with OmniRunner( + model_name, # TODO: we might want to add overlapped feature e2e tests # cache_backend="cache_dit", enable_cpu_offload=offload, - ) - current_omni_platform.reset_peak_memory_stats() - height = 256 - width = 256 + ) as runner: + current_omni_platform.reset_peak_memory_stats() + height = 256 + width = 256 - m.generate( - "a photo of a cat sitting on a laptop keyboard", - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=9, - guidance_scale=0.0, - generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - ), - ) + runner.omni.generate( + "a photo of a cat sitting on a laptop keyboard", + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=9, + guidance_scale=0.0, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + ), + ) peak = monitor.peak_used_mb monitor.stop() - del m gc.collect() current_omni_platform.empty_cache() diff --git a/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py b/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py index 6132f1bd0eb..bdfd594c774 100644 --- a/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py +++ b/tests/e2e/offline_inference/test_diffusion_layerwise_offload.py @@ -1,21 +1,12 @@ -import sys 
-from pathlib import Path - import pytest import torch from vllm.distributed.parallel_state import cleanup_dist_env_and_memory +from tests.conftest import OmniRunner from tests.utils import DeviceMemoryMonitor from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni - # Models to test and expected saved memory in MB, correspondingly MODELS_SAVED_MEMORY_MB = { "riverclouds/qwen_image_random": 4500, @@ -33,34 +24,33 @@ def run_inference( monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - m = Omni( - model=model_name, + with OmniRunner( + model_name, enable_layerwise_offload=layerwise_offload, # TODO: we might want to add overlapped feature e2e tests # cache_backend="cache_dit", boundary_ratio=0.875, flow_shift=5.0, - ) - - current_omni_platform.reset_peak_memory_stats() - - # Refer to tests/e2e/offline_inference/test_t2v_model.py - # Use minimal settings for testing - height = 480 - width = 640 - num_frames = 5 - - m.generate( - "A cat sitting on a table", - OmniDiffusionSamplingParams( - height=height, - width=width, - generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - guidance_scale=1.0, - num_inference_steps=num_inference_steps, - num_frames=num_frames, - ), - ) + ) as runner: + current_omni_platform.reset_peak_memory_stats() + + # Refer to tests/e2e/offline_inference/test_t2v_model.py + # Use minimal settings for testing + height = 480 + width = 640 + num_frames = 5 + + runner.omni.generate( + "A cat sitting on a table", + OmniDiffusionSamplingParams( + height=height, + width=width, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + guidance_scale=1.0, + num_inference_steps=num_inference_steps, + num_frames=num_frames, + ), + 
) peak = monitor.peak_used_mb monitor.stop() diff --git a/tests/e2e/offline_inference/test_diffusion_lora.py b/tests/e2e/offline_inference/test_diffusion_lora.py index b414fe30eeb..7edd03f20d1 100644 --- a/tests/e2e/offline_inference/test_diffusion_lora.py +++ b/tests/e2e/offline_inference/test_diffusion_lora.py @@ -7,6 +7,7 @@ import torch from safetensors.torch import save_file +from tests.conftest import OmniRunner from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform @@ -16,15 +17,12 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) -from vllm_omni import Omni - os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" # This test is specific to Z-Image LoRA behavior. Keep it focused on a single # model to reduce runtime and avoid extra downloads. models = ["Tongyi-MAI/Z-Image-Turbo"] -DIFFUSION_INIT_TIMEOUT_S = 600 @pytest.mark.parametrize("model_name", models) @@ -77,12 +75,8 @@ def _write_zimage_lora(adapter_dir: Path) -> str: ) return str(adapter_dir) - m = Omni( - model=model_name, - stage_init_timeout=DIFFUSION_INIT_TIMEOUT_S, - init_timeout=DIFFUSION_INIT_TIMEOUT_S, - ) - try: + with OmniRunner(model_name) as runner: + m = runner.omni # high resolution may cause OOM on L4 height = 256 width = 256 @@ -140,5 +134,3 @@ def _write_zimage_lora(adapter_dir: Path) -> str: diff = np.abs(np.array(images[0], dtype=np.int16) - np.array(images_lora[0], dtype=np.int16)).mean() assert diff > 0.0 - finally: - m.close() diff --git a/tests/e2e/offline_inference/test_dynin_omni.py b/tests/e2e/offline_inference/test_dynin_omni.py index d17e7b81755..5388ac67468 100644 --- a/tests/e2e/offline_inference/test_dynin_omni.py +++ b/tests/e2e/offline_inference/test_dynin_omni.py @@ -18,7 +18,6 @@ import torch from transformers import AutoTokenizer -from tests.conftest import OmniRunner from tests.utils import hardware_test 
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -37,6 +36,7 @@ pytestmark = [ pytest.mark.core_model, pytest.mark.omni, + pytest.mark.parametrize("omni_runner", test_params, indirect=True), ] @@ -291,20 +291,11 @@ def _numel(value: Any) -> int: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_t2i_decode_to_image(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_t2i_decode_to_image(omni_runner) -> None: _configure_dynin_config_env() prompt = _build_t2i_decode_prompt(dynin_config_path=DYNIN_CONFIG_PATH) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) image_output = _find_stage_output(outputs, "image") assert image_output is not None @@ -314,25 +305,16 @@ def test_dynin_t2i_decode_to_image(test_config: tuple[str, str]) -> None: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_mmu_to_text(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_mmu_to_text(omni_runner) -> None: _configure_dynin_config_env() - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(omni_runner.model_name, trust_remote_code=True) prompt = _build_mmu_prompt( tokenizer=tokenizer, question="What is 2 + 2? 
Answer in one short sentence.", dynin_config_path=DYNIN_CONFIG_PATH, ) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) text_output = _find_stage_output(outputs, "text") assert text_output is not None @@ -341,11 +323,9 @@ def test_dynin_mmu_to_text(test_config: tuple[str, str]) -> None: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_image_to_text(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_image_to_text(omni_runner) -> None: _configure_dynin_config_env() - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(omni_runner.model_name, trust_remote_code=True) prompt = _build_mmu_multimodal_prompt( tokenizer=tokenizer, question="Describe the image briefly in one sentence.", @@ -353,14 +333,7 @@ def test_dynin_image_to_text(test_config: tuple[str, str]) -> None: image=_generate_synthetic_image(), ) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) text_output = _find_stage_output(outputs, "text") assert text_output is not None @@ -369,11 +342,9 @@ def test_dynin_image_to_text(test_config: tuple[str, str]) -> None: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_speech_to_text(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_speech_to_text(omni_runner) -> None: _configure_dynin_config_env() - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + tokenizer = 
AutoTokenizer.from_pretrained(omni_runner.model_name, trust_remote_code=True) prompt = _build_mmu_multimodal_prompt( tokenizer=tokenizer, question="Transcribe the audio briefly in one sentence.", @@ -381,14 +352,7 @@ def test_dynin_speech_to_text(test_config: tuple[str, str]) -> None: audio=_generate_synthetic_audio(), ) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) text_output = _find_stage_output(outputs, "text") assert text_output is not None @@ -397,20 +361,11 @@ def test_dynin_speech_to_text(test_config: tuple[str, str]) -> None: @hardware_test(res={"cuda": "L4", "rocm": "MI325"}) -@pytest.mark.parametrize("test_config", test_params) -def test_dynin_t2s_decode_to_audio(test_config: tuple[str, str]) -> None: - model, stage_config_path = test_config +def test_dynin_t2s_decode_to_audio(omni_runner) -> None: _configure_dynin_config_env() prompt = _build_t2s_decode_prompt(dynin_config_path=DYNIN_CONFIG_PATH) - with OmniRunner( - model, - seed=42, - stage_configs_path=stage_config_path, - stage_init_timeout=600, - init_timeout=600, - ) as runner: - outputs = runner.generate([prompt]) + outputs = omni_runner.generate([prompt]) audio_output = _find_stage_output(outputs, "audio") assert audio_output is not None diff --git a/tests/e2e/offline_inference/test_expert_parallel.py b/tests/e2e/offline_inference/test_expert_parallel.py index ba126986ec7..29d84d7a3e2 100644 --- a/tests/e2e/offline_inference/test_expert_parallel.py +++ b/tests/e2e/offline_inference/test_expert_parallel.py @@ -18,8 +18,8 @@ import torch.distributed as dist from PIL import Image +from tests.conftest import OmniRunner from tests.utils import hardware_test -from vllm_omni import Omni from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams from 
vllm_omni.platforms import current_omni_platform @@ -96,12 +96,26 @@ def _run_inference( tensor_parallel_size=tensor_parallel_size, enable_expert_parallel=enable_expert_parallel, ) - omni = Omni(model=model_name, parallel_config=parallel_config) - try: - # Warmup run (not timed) - if warmup: - _ = omni.generate( + with OmniRunner(model_name, parallel_config=parallel_config) as runner: + omni = runner.omni + # Warmup run (not timed) + if warmup: + _ = omni.generate( + PROMPT, + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=DEFAULT_STEPS, + guidance_scale=guidance_scale, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), + num_outputs_per_prompt=1, + ), + ) + + # Timed run + start = time.time() + outputs = omni.generate( PROMPT, OmniDiffusionSamplingParams( height=height, @@ -112,28 +126,13 @@ def _run_inference( num_outputs_per_prompt=1, ), ) + elapsed_ms = (time.time() - start) * 1000 - # Timed run - start = time.time() - outputs = omni.generate( - PROMPT, - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=DEFAULT_STEPS, - guidance_scale=guidance_scale, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), - num_outputs_per_prompt=1, - ), - ) - elapsed_ms = (time.time() - start) * 1000 - - return InferenceResult( - images=outputs[0].images, - elapsed_ms=elapsed_ms, - ) + return InferenceResult( + images=outputs[0].images, + elapsed_ms=elapsed_ms, + ) finally: - omni.close() _cleanup_distributed() diff --git a/tests/e2e/offline_inference/test_flux_autoround_w4a16.py b/tests/e2e/offline_inference/test_flux_autoround_w4a16.py index 42aab7f26a8..cbcd1009dd5 100644 --- a/tests/e2e/offline_inference/test_flux_autoround_w4a16.py +++ b/tests/e2e/offline_inference/test_flux_autoround_w4a16.py @@ -8,31 +8,21 @@ """ import gc -import sys -from pathlib import Path +import os as _os import pytest import torch from vllm.distributed.parallel_state 
import cleanup_dist_env_and_memory +from tests.conftest import OmniRunner from tests.utils import DeviceMemoryMonitor, hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni - QUANTIZED_MODEL = "vllm-project-org/FLUX.1-dev-AutoRound-w4a16" BASELINE_MODEL = "black-forest-labs/FLUX.1-dev" -# Allow overriding via environment for local testing -import os as _os - QUANTIZED_MODEL = _os.environ.get("FLUX_AUTOROUND_MODEL", QUANTIZED_MODEL) BASELINE_MODEL = _os.environ.get("FLUX_BASELINE_MODEL", BASELINE_MODEL) @@ -51,19 +41,18 @@ def _generate_image(model_name: str, **extra_kwargs) -> tuple[list, float]: monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - m = Omni(model=model_name, enforce_eager=True, **extra_kwargs) - - current_omni_platform.reset_peak_memory_stats() - outputs = m.generate( - "a photo of a cat sitting on a laptop keyboard", - OmniDiffusionSamplingParams( - height=HEIGHT, - width=WIDTH, - num_inference_steps=NUM_STEPS, - guidance_scale=0.0, - generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - ), - ) + with OmniRunner(model_name, enforce_eager=True, **extra_kwargs) as runner: + current_omni_platform.reset_peak_memory_stats() + outputs = runner.omni.generate( + "a photo of a cat sitting on a laptop keyboard", + OmniDiffusionSamplingParams( + height=HEIGHT, + width=WIDTH, + num_inference_steps=NUM_STEPS, + guidance_scale=0.0, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + ), + ) peak = monitor.peak_used_mb monitor.stop() @@ -74,7 +63,6 @@ def _generate_image(model_name: str, **extra_kwargs) -> tuple[list, float]: assert isinstance(req_out, 
OmniRequestOutput) and hasattr(req_out, "images") images = req_out.images - del m gc.collect() current_omni_platform.empty_cache() diff --git a/tests/e2e/offline_inference/test_flux_kontext.py b/tests/e2e/offline_inference/test_flux_kontext.py index 93dca21c9ad..cd711d6b818 100644 --- a/tests/e2e/offline_inference/test_flux_kontext.py +++ b/tests/e2e/offline_inference/test_flux_kontext.py @@ -9,23 +9,14 @@ - Image editing with text guidance """ -import os -import sys -from pathlib import Path - import pytest from PIL import Image +from vllm.assets.image import ImageAsset +from tests.conftest import OmniRunner from vllm_omni.diffusion.data import DiffusionParallelConfig -from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" - MODEL = "black-forest-labs/FLUX.1-Kontext-dev" @@ -33,17 +24,15 @@ @pytest.mark.diffusion def test_flux_kontext_text_to_image(): """Test FluxKontext text-to-image generation with real model.""" - omni = Omni( - model=MODEL, + with OmniRunner( + MODEL, parallel_config=DiffusionParallelConfig( tensor_parallel_size=2, ), enable_cpu_offload=False, - ) - - try: + ) as runner: omni_outputs = list( - omni.generate( + runner.omni.generate( prompts=["A photo of a cat sitting on a laptop"], sampling_params_list=OmniDiffusionSamplingParams( height=512, @@ -54,43 +43,37 @@ def test_flux_kontext_text_to_image(): ) ) - assert len(omni_outputs) > 0 - output = omni_outputs[0] - images = None - if output.images: - images = output.images - elif hasattr(output, "request_output") and output.request_output: - for stage_out in output.request_output: - if hasattr(stage_out, "images") and stage_out.images: - images = stage_out.images - break + assert len(omni_outputs) > 0 + output = omni_outputs[0] + images = None + if output.images: + images = 
output.images + elif hasattr(output, "request_output") and output.request_output: + for stage_out in output.request_output: + if hasattr(stage_out, "images") and stage_out.images: + images = stage_out.images + break - assert images is not None - assert len(images) > 0 - assert isinstance(images[0], Image.Image) - assert images[0].size == (512, 512) - finally: - omni.close() + assert images is not None + assert len(images) > 0 + assert isinstance(images[0], Image.Image) + assert images[0].size == (512, 512) @pytest.mark.core_model @pytest.mark.diffusion def test_flux_kontext_image_edit(): """Test FluxKontext image-to-image editing with real model.""" - from vllm.assets.image import ImageAsset - input_image = ImageAsset("2560px-Gfp-wisconsin-madison-the-nature-boardwalk").pil_image.convert("RGB") - omni = Omni( - model=MODEL, + with OmniRunner( + MODEL, parallel_config=DiffusionParallelConfig( tensor_parallel_size=2, ), enable_cpu_offload=False, - ) - - try: + ) as runner: omni_outputs = list( - omni.generate( + runner.omni.generate( prompts=[ { "prompt": "Transform this image into a Vincent van Gogh style painting", @@ -107,20 +90,18 @@ def test_flux_kontext_image_edit(): ) ) - assert len(omni_outputs) > 0 - output = omni_outputs[0] - images = None - if output.images: - images = output.images - elif hasattr(output, "request_output") and output.request_output: - for stage_out in output.request_output: - if hasattr(stage_out, "images") and stage_out.images: - images = stage_out.images - break - - assert images is not None - assert len(images) > 0 - assert isinstance(images[0], Image.Image) - assert images[0].size == (512, 512) - finally: - omni.close() + assert len(omni_outputs) > 0 + output = omni_outputs[0] + images = None + if output.images: + images = output.images + elif hasattr(output, "request_output") and output.request_output: + for stage_out in output.request_output: + if hasattr(stage_out, "images") and stage_out.images: + images = stage_out.images + break 
+ + assert images is not None + assert len(images) > 0 + assert isinstance(images[0], Image.Image) + assert images[0].size == (512, 512) diff --git a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py index 5522f33eaa7..79bb64dca1b 100644 --- a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py +++ b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py @@ -8,6 +8,7 @@ from PIL import Image from transformers import CLIPModel, CLIPProcessor +from tests.conftest import OmniRunner from vllm_omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform @@ -271,16 +272,11 @@ def clip_bundle() -> tuple[CLIPModel, CLIPProcessor]: @pytest.fixture(scope="module") def omni() -> Generator[Omni, None, None]: - engine = Omni( - model=MODEL_NAME, + with OmniRunner( + MODEL_NAME, stage_configs_path=str(STAGE_CONFIG_PATH), - stage_init_timeout=600, - init_timeout=900, - ) - try: - yield engine - finally: - engine.close() + ) as runner: + yield runner.omni def _extract_generated_image(outputs: list[object]) -> Image.Image: diff --git a/tests/e2e/offline_inference/test_magi_human.py b/tests/e2e/offline_inference/test_magi_human.py index 8648216a92f..abb7f9c163c 100644 --- a/tests/e2e/offline_inference/test_magi_human.py +++ b/tests/e2e/offline_inference/test_magi_human.py @@ -8,9 +8,9 @@ import numpy as np import pytest +from tests.conftest import OmniRunner from tests.utils import hardware_test from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes -from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -49,12 +49,6 @@ def test_magi_human_e2e(run_level): model_path = "SII-GAIR/daVinci-MagiHuman-Base-1080p" - omni = Omni( - model=model_path, - init_timeout=1200, - tensor_parallel_size=2, - ) - prompt = ( "A young woman with long, wavy golden blonde hair and 
bright blue eyes, " "wearing a fitted ivory silk blouse with a delicate lace collar, sits " @@ -94,7 +88,12 @@ def test_magi_human_e2e(run_level): }, ) - try: + with OmniRunner( + model_path, + init_timeout=1200, + tensor_parallel_size=2, + ) as runner: + omni = runner.omni outputs = list( omni.generate( prompts=[prompt], @@ -140,5 +139,3 @@ def test_magi_human_e2e(run_level): assert len(video_bytes) > 1000, f"MP4 too small ({len(video_bytes)} bytes)" _validate_mp4(video_bytes) - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_mammoth_moda2.py b/tests/e2e/offline_inference/test_mammoth_moda2.py index 5293b5ed1b7..ff744c86e1e 100644 --- a/tests/e2e/offline_inference/test_mammoth_moda2.py +++ b/tests/e2e/offline_inference/test_mammoth_moda2.py @@ -23,10 +23,9 @@ import torch from vllm.sampling_params import SamplingParams +from tests.conftest import OmniRunner from tests.utils import hardware_test -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" - # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- @@ -116,8 +115,6 @@ def test_mammothmoda2_t2i_e2e(): - A fixed set of pixel values matches a golden reference (regenerate with ``UPDATE_GOLDEN=1``). """ - from vllm_omni import Omni - if not Path(MODEL_PATH).exists(): pytest.skip(f"Model weights not found at {MODEL_PATH}") if not Path(T2I_STAGE_CONFIG).exists(): @@ -135,8 +132,8 @@ def test_mammothmoda2_t2i_e2e(): prompt_text = "A cat sitting on a laptop keyboard" formatted_prompt = _format_t2i_prompt(prompt_text, ar_width, ar_height) - omni = Omni(model=MODEL_PATH, stage_configs_path=T2I_STAGE_CONFIG, trust_remote_code=True) - try: + with OmniRunner(MODEL_PATH, stage_configs_path=T2I_STAGE_CONFIG, trust_remote_code=True) as runner: + omni = runner.omni # Greedy / deterministic sampling so pixel values are reproducible. 
ar_sampling = SamplingParams( temperature=0.0, @@ -211,5 +208,3 @@ def test_mammothmoda2_t2i_e2e(): found_image = True assert found_image, "No image tensor found in pipeline output" - finally: - omni.close() diff --git a/tests/e2e/offline_inference/test_omnivoice.py b/tests/e2e/offline_inference/test_omnivoice.py index 4b093e357d9..bb4c8a5dd7e 100644 --- a/tests/e2e/offline_inference/test_omnivoice.py +++ b/tests/e2e/offline_inference/test_omnivoice.py @@ -16,6 +16,7 @@ import numpy as np import pytest +from tests.conftest import OmniRunner from tests.utils import hardware_test MODEL = "k2-fsa/OmniVoice" @@ -37,48 +38,42 @@ def test_omnivoice_text_to_audio() -> None: Input Modal: text Output Modal: audio """ - from vllm_omni.entrypoints.omni import Omni + from vllm_omni.inputs.data import OmniDiffusionSamplingParams - omni = Omni( - model=MODEL, + with OmniRunner( + MODEL, stage_configs_path=get_stage_config(), trust_remote_code=True, log_stats=True, - ) - - try: + ) as runner: prompts = {"prompt": "Hello, this is a test for text to audio."} - from vllm_omni.inputs.data import OmniDiffusionSamplingParams - sampling_params_list = [OmniDiffusionSamplingParams()] - outputs = list(omni.generate(prompts, sampling_params_list=sampling_params_list)) + outputs = list(runner.omni.generate(prompts, sampling_params_list=sampling_params_list)) - assert len(outputs) > 0, "No outputs generated" + assert len(outputs) > 0, "No outputs generated" - # Check final output has audio - final_output = outputs[-1] - ro = final_output.request_output - assert ro is not None, "No request_output" + # Check final output has audio + final_output = outputs[-1] + ro = final_output.request_output + assert ro is not None, "No request_output" - mm = getattr(ro, "multimodal_output", None) - if not mm and ro.outputs: - mm = getattr(ro.outputs[0], "multimodal_output", None) + mm = getattr(ro, "multimodal_output", None) + if not mm and ro.outputs: + mm = getattr(ro.outputs[0], "multimodal_output", None) 
- assert mm is not None, "No multimodal_output" - assert "audio" in mm, f"No 'audio' key in multimodal_output: {mm.keys()}" + assert mm is not None, "No multimodal_output" + assert "audio" in mm, f"No 'audio' key in multimodal_output: {mm.keys()}" - audio = mm["audio"] - if isinstance(audio, np.ndarray): - audio_np = audio - else: - audio_np = audio.cpu().numpy().squeeze() + audio = mm["audio"] + if isinstance(audio, np.ndarray): + audio_np = audio + else: + audio_np = audio.cpu().numpy().squeeze() - assert audio_np.size > 0, "Audio output is empty" - rms = np.sqrt(np.mean(audio_np**2)) - assert rms > 0.01, f"Audio RMS too low ({rms:.4f}), likely silence" + assert audio_np.size > 0, "Audio output is empty" + rms = np.sqrt(np.mean(audio_np**2)) + assert rms > 0.01, f"Audio RMS too low ({rms:.4f}), likely silence" - print(f"Generated audio: {len(audio_np) / 24000:.2f}s, rms={rms:.4f}") - finally: - omni.close() + print(f"Generated audio: {len(audio_np) / 24000:.2f}s, rms={rms:.4f}") diff --git a/tests/e2e/offline_inference/test_quantization_fp8.py b/tests/e2e/offline_inference/test_quantization_fp8.py index f71c53de74c..291779fd931 100644 --- a/tests/e2e/offline_inference/test_quantization_fp8.py +++ b/tests/e2e/offline_inference/test_quantization_fp8.py @@ -29,7 +29,6 @@ import os os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" from pathlib import Path from typing import Any @@ -37,8 +36,8 @@ import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test -from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform @@ -61,16 +60,15 @@ def _generate_single_stage_image( Returns (images, peak_memory_gib). 
""" - omni_kwargs: dict[str, Any] = {"model": model, **extra_omni_kwargs} + omni_kwargs: dict[str, Any] = dict(extra_omni_kwargs) if quantization: omni_kwargs["quantization"] = quantization - omni = Omni(**omni_kwargs) - try: + with OmniRunner(model, **omni_kwargs) as runner: torch.cuda.reset_peak_memory_stats() generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(seed) - outputs = omni.generate( + outputs = runner.omni.generate( "a photo of a cat sitting on a laptop keyboard", OmniDiffusionSamplingParams( height=height, @@ -94,8 +92,6 @@ def _generate_single_stage_image( assert images[0].height == height return images, peak_mem - finally: - omni.close() def _generate_bagel_image( @@ -115,8 +111,9 @@ def _generate_bagel_image( if quantization_config: omni_kwargs["quantization_config"] = quantization_config - omni = Omni(**omni_kwargs) - try: + model_name = omni_kwargs.pop("model") + with OmniRunner(model_name, **omni_kwargs) as runner: + omni = runner.omni torch.cuda.reset_peak_memory_stats() params_list = omni.default_sampling_params_list @@ -168,8 +165,6 @@ def _generate_bagel_image( ) return generated_image, peak_mem - finally: - omni.close() # ─── Single-stage diffusion model tests ────────────────────────────────────── diff --git a/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py b/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py index d5f82f893e6..f0b0b55c9f6 100644 --- a/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py +++ b/tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py @@ -28,7 +28,6 @@ import argparse import asyncio -import os import sys import time import uuid @@ -37,6 +36,7 @@ import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test from vllm_omni.entrypoints.async_omni import AsyncOmni from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -48,9 +48,6 @@ if str(REPO_ROOT) not in sys.path: 
sys.path.insert(0, str(REPO_ROOT)) -from vllm_omni import Omni - -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" # ------------------------------------------------------------------ models = ["tiny-random/Qwen-Image"] @@ -391,31 +388,28 @@ async def main(model: str, num_prompts: int, mode: str, batch_size: int = 1) -> def test_diffusion_batching_sync_sequential(model_name: str): """Test that synchronous Omni can generate images for multiple prompts submitted sequentially (one at a time) and each returns a valid image.""" - m = None try: - m = Omni(model=model_name) - sp = _default_sync_sampling_params() - prompts = TEST_PROMPTS[:4] + with OmniRunner(model_name) as runner: + m = runner.omni + sp = _default_sync_sampling_params() + prompts = TEST_PROMPTS[:4] - for i, prompt in enumerate(prompts): - outputs = m.generate(prompt, sp) - first_output = outputs[0] - assert first_output.final_output_type == "image", ( - f"Expected 'image', got '{first_output.final_output_type}'" - ) + for i, prompt in enumerate(prompts): + outputs = m.generate(prompt, sp) + first_output = outputs[0] + assert first_output.final_output_type == "image", ( + f"Expected 'image', got '{first_output.final_output_type}'" + ) - # Images are surfaced both at top-level and inside request_output - images = _extract_images(first_output) - assert len(images) >= 1, f"Expected at least 1 image for prompt {i}, got {len(images)}" - assert images[0].width == 256 - assert images[0].height == 256 - print(f" prompt {i}: OK ({len(images)} images)") + # Images are surfaced both at top-level and inside request_output + images = _extract_images(first_output) + assert len(images) >= 1, f"Expected at least 1 image for prompt {i}, got {len(images)}" + assert images[0].width == 256 + assert images[0].height == 256 + print(f" prompt {i}: OK ({len(images)} images)") except Exception as e: print(f"Test failed with error: {e}") raise - finally: - if m is not None and hasattr(m, "close"): - m.close() @pytest.mark.core_model 
@@ -431,34 +425,31 @@ def test_diffusion_batching_sync_multi_prompt(model_name: str): handling at the diffusion stage, not the explicit list-batch path (which is only available via AsyncOmni). """ - m = None try: - m = Omni(model=model_name) - sp = _default_sync_sampling_params() - prompts = TEST_PROMPTS[:4] + with OmniRunner(model_name) as runner: + m = runner.omni + sp = _default_sync_sampling_params() + prompts = TEST_PROMPTS[:4] - outputs = m.generate(prompts, sp) - assert len(outputs) == len(prompts), f"Expected {len(prompts)} outputs, got {len(outputs)}" + outputs = m.generate(prompts, sp) + assert len(outputs) == len(prompts), f"Expected {len(prompts)} outputs, got {len(outputs)}" - for i, output in enumerate(outputs): - assert output.final_output_type == "image", ( - f"Output {i} final_output_type expected 'image', got '{output.final_output_type}'" - ) - images = _extract_images(output) - assert images and len(images) >= 1, f"Expected at least 1 image for prompt {i}" - assert images[0].width == 256 - assert images[0].height == 256 - print(f" prompt {i}: OK ({len(images)} images, request_id={output.request_id})") - - # Verify all request_ids are distinct - request_ids = [o.request_id for o in outputs] - assert len(set(request_ids)) == len(request_ids), f"Duplicate request_ids found: {request_ids}" + for i, output in enumerate(outputs): + assert output.final_output_type == "image", ( + f"Output {i} final_output_type expected 'image', got '{output.final_output_type}'" + ) + images = _extract_images(output) + assert images and len(images) >= 1, f"Expected at least 1 image for prompt {i}" + assert images[0].width == 256 + assert images[0].height == 256 + print(f" prompt {i}: OK ({len(images)} images, request_id={output.request_id})") + + # Verify all request_ids are distinct + request_ids = [o.request_id for o in outputs] + assert len(set(request_ids)) == len(request_ids), f"Duplicate request_ids found: {request_ids}" except Exception as e: print(f"Test failed 
with error: {e}") raise - finally: - if m is not None and hasattr(m, "close"): - m.close() @pytest.mark.core_model @@ -552,32 +543,29 @@ async def _inner(): def test_diffusion_batching_num_outputs(model_name: str): """Test that the diffusion model respects num_outputs_per_prompt and generates the correct number of images per request.""" - m = None try: - m = Omni(model=model_name) - num_outputs = 2 - sp = _default_sync_sampling_params(num_outputs_per_prompt=num_outputs) - - outputs = m.generate( - "a photo of a cat sitting on a laptop keyboard", - sp, - ) + with OmniRunner(model_name) as runner: + m = runner.omni + num_outputs = 2 + sp = _default_sync_sampling_params(num_outputs_per_prompt=num_outputs) + + outputs = m.generate( + "a photo of a cat sitting on a laptop keyboard", + sp, + ) - first_output = outputs[0] - assert first_output.final_output_type == "image" - images = _extract_images(first_output) - assert images is not None and len(images) == num_outputs, ( - f"Expected {num_outputs} images, got {len(images) if images else 0}" - ) - for img in images: - assert img.width == 256 - assert img.height == 256 + first_output = outputs[0] + assert first_output.final_output_type == "image" + images = _extract_images(first_output) + assert images is not None and len(images) == num_outputs, ( + f"Expected {num_outputs} images, got {len(images) if images else 0}" + ) + for img in images: + assert img.width == 256 + assert img.height == 256 except Exception as e: print(f"Test failed with error: {e}") raise - finally: - if m is not None and hasattr(m, "close"): - m.close() @pytest.mark.core_model @@ -587,34 +575,31 @@ def test_diffusion_batching_num_outputs(model_name: str): def test_diffusion_batching_distinct_results(model_name: str): """Test that different prompts produce distinct images when batched, ensuring the batching logic does not mix up results across requests.""" - m = None try: - m = Omni(model=model_name) - sp = _default_sync_sampling_params() - prompts = 
[ - {"prompt": "a bright red apple on a white table", "negative_prompt": "blurry"}, - {"prompt": "a blue ocean with white waves crashing", "negative_prompt": "blurry"}, - ] - - outputs = m.generate(prompts, sp) - assert len(outputs) == len(prompts), f"Expected {len(prompts)} outputs, got {len(outputs)}" - - # Verify each output has a unique request_id - request_ids = [o.request_id for o in outputs] - assert len(set(request_ids)) == len(request_ids), f"Duplicate request_ids: {request_ids}" - - # Verify each output has images - for i, output in enumerate(outputs): - images = _extract_images(output) - assert images and len(images) >= 1, f"No images for prompt {i}" - assert images[0].width == 256 - assert images[0].height == 256 + with OmniRunner(model_name) as runner: + m = runner.omni + sp = _default_sync_sampling_params() + prompts = [ + {"prompt": "a bright red apple on a white table", "negative_prompt": "blurry"}, + {"prompt": "a blue ocean with white waves crashing", "negative_prompt": "blurry"}, + ] + + outputs = m.generate(prompts, sp) + assert len(outputs) == len(prompts), f"Expected {len(prompts)} outputs, got {len(outputs)}" + + # Verify each output has a unique request_id + request_ids = [o.request_id for o in outputs] + assert len(set(request_ids)) == len(request_ids), f"Duplicate request_ids: {request_ids}" + + # Verify each output has images + for i, output in enumerate(outputs): + images = _extract_images(output) + assert images and len(images) >= 1, f"No images for prompt {i}" + assert images[0].width == 256 + assert images[0].height == 256 except Exception as e: print(f"Test failed with error: {e}") raise - finally: - if m is not None and hasattr(m, "close"): - m.close() # ------------------------------------------------------------------ diff --git a/tests/e2e/offline_inference/test_sequence_parallel.py b/tests/e2e/offline_inference/test_sequence_parallel.py index 16239a1c52f..d3abccd78cf 100644 --- 
a/tests/e2e/offline_inference/test_sequence_parallel.py +++ b/tests/e2e/offline_inference/test_sequence_parallel.py @@ -20,8 +20,8 @@ import torch.distributed as dist from PIL import Image +from tests.conftest import OmniRunner from tests.utils import hardware_test -from vllm_omni import Omni from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.platforms import current_omni_platform @@ -92,49 +92,48 @@ def _run_inference( warmup: If True, run one warmup iteration before the timed run. """ parallel_config = DiffusionParallelConfig(ulysses_degree=ulysses_degree, ring_degree=ring_degree) - omni = Omni( - model=model_name, - parallel_config=parallel_config, - dtype=dtype, - attention_backend=attn_backend, - ) - try: - # Warmup run (not timed) - if warmup: - _ = omni.generate( + with OmniRunner( + model_name, + parallel_config=parallel_config, + dtype=dtype, + attention_backend=attn_backend, + ) as runner: + omni = runner.omni + # Warmup run (not timed) + if warmup: + _ = omni.generate( + PROMPT, + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=DEFAULT_STEPS, + guidance_scale=0.0, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed + 1000), + num_outputs_per_prompt=1, + ), + ) + + # Timed run + start = time.time() + outputs = omni.generate( PROMPT, OmniDiffusionSamplingParams( height=height, width=width, num_inference_steps=DEFAULT_STEPS, guidance_scale=0.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed + 1000), + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), num_outputs_per_prompt=1, ), ) + elapsed_ms = (time.time() - start) * 1000 - # Timed run - start = time.time() - outputs = omni.generate( - PROMPT, - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=DEFAULT_STEPS, - guidance_scale=0.0, - 
generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), - num_outputs_per_prompt=1, - ), - ) - elapsed_ms = (time.time() - start) * 1000 - - return InferenceResult( - images=outputs[0].request_output.images, - elapsed_ms=elapsed_ms, - ) + return InferenceResult( + images=outputs[0].request_output.images, + elapsed_ms=elapsed_ms, + ) finally: - omni.close() _cleanup_distributed() diff --git a/tests/e2e/offline_inference/test_stable_audio_model.py b/tests/e2e/offline_inference/test_stable_audio_model.py index ff4d9b40172..21d75aad52a 100644 --- a/tests/e2e/offline_inference/test_stable_audio_model.py +++ b/tests/e2e/offline_inference/test_stable_audio_model.py @@ -1,6 +1,3 @@ -import sys -from pathlib import Path - import numpy as np import pytest import torch @@ -10,31 +7,25 @@ from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni - # Use random weights model for CI testing (small, no authentication required) models = ["linyueqian/stable_audio_random"] +# omni_runner expects (model, stage_configs_path); single-stage diffusion has no YAML. 
+test_params = [(m, None) for m in models] + @pytest.mark.core_model @pytest.mark.diffusion @hardware_test(res={"cuda": "L4", "xpu": "B60"}) -@pytest.mark.parametrize("model_name", models) -def test_stable_audio_model(model_name: str): - m = Omni(model=model_name) - +@pytest.mark.parametrize("omni_runner", test_params, indirect=True) +def test_stable_audio_model(omni_runner): # Use minimal settings for testing # Generate a short 2-second audio clip with minimal inference steps audio_start_in_s = 0.0 audio_end_in_s = 2.0 # Short duration for fast testing sample_rate = 44100 # Stable Audio uses 44100 Hz - outputs = m.generate( + outputs = omni_runner.omni.generate( prompts={ "prompt": "The sound of a dog barking", "negative_prompt": "Low quality.", diff --git a/tests/e2e/offline_inference/test_t2i_model.py b/tests/e2e/offline_inference/test_t2i_model.py index 55a154f61b9..fc54f9a7ff1 100644 --- a/tests/e2e/offline_inference/test_t2i_model.py +++ b/tests/e2e/offline_inference/test_t2i_model.py @@ -1,7 +1,3 @@ -import os -import sys -from pathlib import Path - import pytest import torch @@ -10,14 +6,12 @@ from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) +# Match unprefixed HF id even when MODEL_PREFIX is set (omni_runner resolves full path). +_QWEN_IMAGE_RANDOM_ID = "riverclouds/qwen_image_random" -from vllm_omni import Omni -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" +def _is_qwen_image_random(model_path: str) -> bool: + return model_path.rstrip("/").endswith(_QWEN_IMAGE_RANDOM_ID) models = ["Tongyi-MAI/Z-Image-Turbo", "riverclouds/qwen_image_random"] @@ -27,56 +21,55 @@ if current_omni_platform.is_npu(): models = ["Tongyi-MAI/Z-Image-Turbo", "Qwen/Qwen-Image"] +# omni_runner expects (model, stage_configs_path); single-stage diffusion has no YAML. 
+test_params = [(m, None) for m in models] + @pytest.mark.core_model @pytest.mark.advanced_model @pytest.mark.diffusion @hardware_test(res={"cuda": "L4", "rocm": "MI325", "xpu": "B60"}, num_cards={"cuda": 1, "rocm": 1, "xpu": 2}) -@pytest.mark.parametrize("model_name", models) -def test_diffusion_model(model_name: str, run_level): - if run_level == "core_model" and model_name != "riverclouds/qwen_image_random": +@pytest.mark.parametrize("omni_runner", test_params, indirect=True) +def test_diffusion_model(omni_runner, run_level): + resolved = omni_runner.model_name + if run_level == "core_model" and not _is_qwen_image_random(resolved): pytest.skip() - if run_level == "advanced_model" and model_name == "riverclouds/qwen_image_random": + if run_level == "advanced_model" and _is_qwen_image_random(resolved): pytest.skip() - m = None - try: - m = Omni(model=model_name) - # high resolution may cause OOM on L4 - height = 256 - width = 256 - outputs = m.generate( - "a photo of a cat sitting on a laptop keyboard", - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=2, - guidance_scale=0.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), - num_outputs_per_prompt=2, - ), - ) - # Extract images from request_output['images'] - first_output = outputs[0] - assert first_output.final_output_type == "image" - if not hasattr(first_output, "request_output") or not first_output.request_output: - raise ValueError("No request_output found in OmniRequestOutput") - - req_out = first_output.request_output - if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"): - raise ValueError("Invalid request_output structure or missing 'images' key") - - images = req_out.images - - assert len(images) == 2 - # check image size - assert images[0].width == width - assert images[0].height == height - images[0].save("image_output.png") - except Exception as e: - print(f"Test failed with error: {e}") - raise - finally: 
- if m is not None and hasattr(m, "close"): - m.close() + # high resolution may cause OOM on L4 + height = 256 + width = 256 + sampling = OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=2, + guidance_scale=0.0, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), + num_outputs_per_prompt=2, + ) + + # OmniRunner.generate() is typed for list[TextPrompt]; diffusion uses Omni.generate(str, ...). + outputs = omni_runner.omni.generate( + "a photo of a cat sitting on a laptop keyboard", + sampling, + ) + + # Extract images from request_output['images'] + first_output = outputs[0] + assert first_output.final_output_type == "image" + if not hasattr(first_output, "request_output") or not first_output.request_output: + raise ValueError("No request_output found in OmniRequestOutput") + + req_out = first_output.request_output + if not isinstance(req_out, OmniRequestOutput) or not hasattr(req_out, "images"): + raise ValueError("Invalid request_output structure or missing 'images' key") + + images = req_out.images + + assert len(images) == 2 + # check image size + assert images[0].width == width + assert images[0].height == height + images[0].save("image_output.png") diff --git a/tests/e2e/offline_inference/test_t2v_model.py b/tests/e2e/offline_inference/test_t2v_model.py index 94c9dedf741..6fe623cfc82 100644 --- a/tests/e2e/offline_inference/test_t2v_model.py +++ b/tests/e2e/offline_inference/test_t2v_model.py @@ -1,22 +1,13 @@ import os -import sys -from pathlib import Path import pytest import torch +from tests.conftest import OmniRunner from vllm_omni.inputs.data import OmniDiffusionSamplingParams - -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = 
"1" os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" models = ["Wan-AI/Wan2.2-T2V-A14B-Diffusers"] @@ -24,28 +15,28 @@ @pytest.mark.parametrize("model_name", models) def test_video_diffusion_model(model_name: str): - m = Omni( - model=model_name, + with OmniRunner( + model_name, boundary_ratio=0.875, flow_shift=5.0, - ) - # Use minimal settings for testing - # num_frames must satisfy: num_frames % vae_scale_factor_temporal == 1 - # For Wan2.2, vae_scale_factor_temporal=4, so valid values are 5, 9, 13, 17, ... - height = 480 - width = 640 - num_frames = 5 - outputs = m.generate( - prompts="A cat sitting on a table", - sampling_params_list=OmniDiffusionSamplingParams( - height=height, - width=width, - num_frames=num_frames, - num_inference_steps=2, - guidance_scale=1.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), - ), - ) + ) as runner: + # Use minimal settings for testing + # num_frames must satisfy: num_frames % vae_scale_factor_temporal == 1 + # For Wan2.2, vae_scale_factor_temporal=4, so valid values are 5, 9, 13, 17, ... + height = 480 + width = 640 + num_frames = 5 + outputs = runner.omni.generate( + prompts="A cat sitting on a table", + sampling_params_list=OmniDiffusionSamplingParams( + height=height, + width=width, + num_frames=num_frames, + num_inference_steps=2, + guidance_scale=1.0, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), + ), + ) first_output = outputs[0] assert first_output.final_output_type == "image" if not hasattr(first_output, "request_output") or not first_output.request_output: diff --git a/tests/e2e/offline_inference/test_teacache.py b/tests/e2e/offline_inference/test_teacache.py index efc0e43e86f..7cd1c5a4797 100644 --- a/tests/e2e/offline_inference/test_teacache.py +++ b/tests/e2e/offline_inference/test_teacache.py @@ -8,26 +8,14 @@ It uses minimal settings to keep test time short for CI. 
""" -import os -import sys -from pathlib import Path - import pytest import torch +from tests.conftest import OmniRunner from tests.utils import hardware_test from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from vllm_omni.platforms import current_omni_platform - -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from vllm_omni import Omni from vllm_omni.outputs import OmniRequestOutput - -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" +from vllm_omni.platforms import current_omni_platform # Use random weights model for testing models = ["riverclouds/qwen_image_random"] @@ -44,20 +32,17 @@ def test_teacache(model_name: str): cache_config = { "rel_l1_thresh": 0.2, # Default threshold } - m = None - try: - m = Omni( - model=model_name, - cache_backend="tea_cache", - cache_config=cache_config, - ) - + with OmniRunner( + model_name, + cache_backend="tea_cache", + cache_config=cache_config, + ) as runner: # Use minimal settings for fast testing height = 256 width = 256 num_inference_steps = 4 # Minimal steps for fast test - outputs = m.generate( + outputs = runner.omni.generate( "a photo of a cat sitting on a laptop keyboard", OmniDiffusionSamplingParams( height=height, @@ -86,9 +71,3 @@ def test_teacache(model_name: str): # Check image size assert images[0].width == width assert images[0].height == height - except Exception as e: - print(f"Test failed with error: {e}") - raise - finally: - if m is not None and hasattr(m, "close"): - m.close() diff --git a/tests/e2e/offline_inference/test_vae_decode_parallelism.py b/tests/e2e/offline_inference/test_vae_decode_parallelism.py index cee76fac2e9..0fce28d6692 100644 --- a/tests/e2e/offline_inference/test_vae_decode_parallelism.py +++ b/tests/e2e/offline_inference/test_vae_decode_parallelism.py @@ -18,7 +18,7 @@ import time -from vllm_omni import Omni +from tests.conftest import OmniRunner from vllm_omni.platforms 
import current_omni_platform # os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" @@ -72,23 +72,22 @@ def is_nextstep_model(model_name: str) -> bool: def model_run(model_configs, tp, out_height, out_width, out_frames, using_tile, vae_patch_parallel_size=1): - m = None - try: - parallel_config = DiffusionParallelConfig( - tensor_parallel_size=tp, - vae_patch_parallel_size=vae_patch_parallel_size, - ) + parallel_config = DiffusionParallelConfig( + tensor_parallel_size=tp, + vae_patch_parallel_size=vae_patch_parallel_size, + ) - omni_kwargs = { - "model": model_configs["model_name"], - "vae_use_tiling": using_tile, - "parallel_config": parallel_config, - } - use_nextstep = is_nextstep_model(model_configs["model_name"]) - if use_nextstep: - # NextStep-1.1 requires explicit pipeline class - omni_kwargs["model_class_name"] = "NextStep11Pipeline" - m = Omni(**omni_kwargs) + omni_kwargs = { + "vae_use_tiling": using_tile, + "parallel_config": parallel_config, + } + use_nextstep = is_nextstep_model(model_configs["model_name"]) + if use_nextstep: + # NextStep-1.1 requires explicit pipeline class + omni_kwargs["model_class_name"] = "NextStep11Pipeline" + + with OmniRunner(model_configs["model_name"], **omni_kwargs) as runner: + m = runner.omni image = Image.new("RGB", (out_width, out_height), (0, 0, 0)) start = time.perf_counter() outputs = m.generate( @@ -115,9 +114,6 @@ def model_run(model_configs, tp, out_height, out_width, out_frames, using_tile, # frames shape: (batch, num_frames, height, width, channels) cost = (end - start) * 1000 return frames, cost - finally: - if m is not None: - m.close() cleanup_dist_env_and_memory() diff --git a/tests/e2e/offline_inference/test_voxcpm2.py b/tests/e2e/offline_inference/test_voxcpm2.py index 7e17c6a3691..4e4f635d5c4 100644 --- a/tests/e2e/offline_inference/test_voxcpm2.py +++ b/tests/e2e/offline_inference/test_voxcpm2.py @@ -5,6 +5,7 @@ import pytest import torch +from tests.conftest import OmniRunner from tests.utils import 
hardware_test VOXCPM2_MODEL = "openbmb/VoxCPM2" @@ -24,10 +25,8 @@ @pytest.fixture(scope="module") def voxcpm2_engine(): """Create VoxCPM2 engine for testing.""" - from vllm_omni import Omni - - engine = Omni(model=VOXCPM2_MODEL, stage_configs_path=STAGE_CONFIG) - yield engine + with OmniRunner(VOXCPM2_MODEL, stage_configs_path=STAGE_CONFIG) as runner: + yield runner.omni def _extract_audio(multimodal_output: dict) -> torch.Tensor: diff --git a/tests/e2e/offline_inference/test_voxtral_tts.py b/tests/e2e/offline_inference/test_voxtral_tts.py index b559cc252dc..4f440f243bf 100644 --- a/tests/e2e/offline_inference/test_voxtral_tts.py +++ b/tests/e2e/offline_inference/test_voxtral_tts.py @@ -19,7 +19,6 @@ import uuid os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1" from pathlib import Path @@ -30,10 +29,9 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer from vllm import SamplingParams -from tests.conftest import modify_stage_config +from tests.conftest import OmniRunner, modify_stage_config from tests.utils import hardware_test from vllm_omni.entrypoints.async_omni import AsyncOmni -from vllm_omni.entrypoints.omni import Omni MODEL = "mistralai/Voxtral-4B-TTS-2603" STAGE_CONFIG = str( @@ -83,14 +81,12 @@ def test_voxtral_tts_offline_basic(run_level): """Test basic Voxtral TTS offline inference with a voice preset.""" stage_config = _resolve_stage_config(run_level) - omni = Omni( - model=MODEL, + with OmniRunner( + MODEL, stage_configs_path=stage_config, - stage_init_timeout=300, enforce_eager=True, - ) - - try: + ) as runner: + omni = runner.omni inputs = _compose_request(MODEL, TEST_TEXT, VOICE) sampling_params = SamplingParams(max_tokens=2500) @@ -127,9 +123,6 @@ def test_voxtral_tts_offline_basic(run_level): # Verify audio isn't all zeros / silence assert np.max(np.abs(audio_array)) > 0.01, "Audio appears to be silence" - finally: - omni.close() - @pytest.mark.advanced_model 
@pytest.mark.omni diff --git a/tests/e2e/offline_inference/test_zimage_parallelism.py b/tests/e2e/offline_inference/test_zimage_parallelism.py index b685704ae4b..27edc48f205 100644 --- a/tests/e2e/offline_inference/test_zimage_parallelism.py +++ b/tests/e2e/offline_inference/test_zimage_parallelism.py @@ -12,7 +12,6 @@ """ import os -import sys import time from pathlib import Path @@ -20,21 +19,14 @@ import pytest import torch from PIL import Image -from vllm.distributed.parallel_state import cleanup_dist_env_and_memory +from tests.conftest import OmniRunner from tests.utils import DeviceMemoryMonitor, hardware_test -from vllm_omni import Omni from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput from vllm_omni.platforms import current_omni_platform -# ruff: noqa: E402 -REPO_ROOT = Path(__file__).resolve().parents[2] -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" PROMPT = "a photo of a cat sitting on a laptop keyboard" @@ -97,61 +89,61 @@ def _run_zimage_generate( device_index = current_omni_platform.current_device() monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) monitor.start() - m = Omni( - model=_get_zimage_model(), - parallel_config=DiffusionParallelConfig( - tensor_parallel_size=tp_size, - vae_patch_parallel_size=vae_patch_parallel_size, - ), - enforce_eager=enforce_eager, - vae_use_tiling=vae_use_tiling, - ) try: - # NOTE: Omni closes itself when a generate() call is exhausted. - # To avoid measuring teardown time (process shutdown, memory cleanup), - # we measure the latency to produce *subsequent* outputs within a single - # generator run. - # - # This also serves as a warmup: the first output may include extra - # compilation/caching overhead, while later outputs are closer to - # steady-state inference. 
- gen = m.generate( - [PROMPT] * num_requests, - OmniDiffusionSamplingParams( - height=height, - width=width, - num_inference_steps=num_inference_steps, - guidance_scale=0.0, - seed=seed, - num_outputs_per_prompt=1, + # Each run needs a distinct DiffusionParallelConfig; use OmniRunner per call (not the + # parametrized omni_runner fixture, which is fixed per module). + with OmniRunner( + _get_zimage_model(), + parallel_config=DiffusionParallelConfig( + tensor_parallel_size=tp_size, + vae_patch_parallel_size=vae_patch_parallel_size, ), - py_generator=True, - ) - - warmup_output = next(gen) - - t_prev = time.perf_counter() - per_request_times_s: list[float] = [] - last_output = warmup_output - for _ in range(num_requests - 1): - last_output = next(gen) - t_now = time.perf_counter() - per_request_times_s.append(t_now - t_prev) - t_prev = t_now - - # Ensure the generator is fully consumed so it can clean up. - for _ in gen: - pass - - median_time_s = float(np.median(per_request_times_s)) - - peak_memory_mb = monitor.peak_used_mb - - return _extract_single_image([last_output]), median_time_s, peak_memory_mb + enforce_eager=enforce_eager, + vae_use_tiling=vae_use_tiling, + ) as runner: + # NOTE: Omni closes itself when a generate() call is exhausted. + # To avoid measuring teardown time (process shutdown, memory cleanup), + # we measure the latency to produce *subsequent* outputs within a single + # generator run. + # + # This also serves as a warmup: the first output may include extra + # compilation/caching overhead, while later outputs are closer to + # steady-state inference. 
+ gen = runner.omni.generate( + [PROMPT] * num_requests, + OmniDiffusionSamplingParams( + height=height, + width=width, + num_inference_steps=num_inference_steps, + guidance_scale=0.0, + seed=seed, + num_outputs_per_prompt=1, + ), + py_generator=True, + ) + + warmup_output = next(gen) + + t_prev = time.perf_counter() + per_request_times_s: list[float] = [] + last_output = warmup_output + for _ in range(num_requests - 1): + last_output = next(gen) + t_now = time.perf_counter() + per_request_times_s.append(t_now - t_prev) + t_prev = t_now + + # Ensure the generator is fully consumed so it can clean up. + for _ in gen: + pass + + median_time_s = float(np.median(per_request_times_s)) + + peak_memory_mb = monitor.peak_used_mb + + return _extract_single_image([last_output]), median_time_s, peak_memory_mb finally: monitor.stop() - m.close() - cleanup_dist_env_and_memory() @pytest.mark.advanced_model diff --git a/tests/e2e/online_serving/test_images_generations_lora.py b/tests/e2e/online_serving/test_images_generations_lora.py index 8c826591a56..fb1e3ea1e0f 100644 --- a/tests/e2e/online_serving/test_images_generations_lora.py +++ b/tests/e2e/online_serving/test_images_generations_lora.py @@ -28,7 +28,7 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" MODEL = "Tongyi-MAI/Z-Image-Turbo" -DIFFUSION_INIT_TIMEOUT_S = 700 +DIFFUSION_INIT_TIMEOUT_S = 900 PROMPT = "a photo of a cat sitting on a laptop keyboard" From d369648e668b66ce6191003157fc5ad17dd67597 Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Mon, 13 Apr 2026 14:51:34 +0800 Subject: [PATCH 11/76] refactor: add stage_pool Signed-off-by: ZhengWG --- vllm_omni/engine/orchestrator.py | 81 ++++++++++++--------- vllm_omni/engine/stage_pool.py | 116 +++++++++++++++++++++++++++++++ 2 files changed, 165 insertions(+), 32 deletions(-) create mode 100644 vllm_omni/engine/stage_pool.py diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index b79f88933ff..da16d3ca66c 100644 --- 
a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -28,6 +28,7 @@ OmniEngineCoreRequest, ) from vllm_omni.engine.serialization import serialize_additional_information +from vllm_omni.engine.stage_pool import StagePool, StageReplica, build_stage_pools from vllm_omni.metrics.stats import StageRequestStats as StageRequestMetrics from vllm_omni.metrics.stats import StageStats from vllm_omni.metrics.utils import count_tokens_from_outputs @@ -136,6 +137,12 @@ def __init__( self.num_clients = len(stage_clients) self.async_chunk = bool(async_chunk) + # Flat-list view: retained as a compatibility layer so existing call + # sites that index by flat client_index (metrics, shutdown, collective + # RPC fan-out, etc.) keep working. StagePool below is the canonical + # path for replica selection and should be preferred in new code. + # TODO(stage-pool): migrate remaining flat-list readers onto + # self.stage_pools and drop these attributes. self.stage_clients: list[Any] = stage_clients self.output_processors: list[Any] = output_processors self.stage_vllm_configs: list[Any] = stage_vllm_configs @@ -148,7 +155,16 @@ def __init__( self.logical_stage_to_clients = [[i] for i in range(self.num_clients)] self.num_logical_stages = len(self.logical_stage_to_clients) - # Reverse mappings: client_index -> (logical_stage_id, replica_index) + # Canonical per-logical-stage replica container. + self.stage_pools: list[StagePool] = build_stage_pools( + stage_clients, + output_processors, + stage_vllm_configs, + self.logical_stage_to_clients, + ) + + # Reverse mappings: client_index -> (logical_stage_id, replica_index). + # Kept for metrics/shutdown log lines that index by flat client_index. 
self._client_to_logical: list[int] = [0] * self.num_clients self._client_to_replica: list[int] = [0] * self.num_clients for logical_id, client_indices in enumerate(self.logical_stage_to_clients): @@ -156,9 +172,6 @@ def __init__( self._client_to_logical[ci] = logical_id self._client_to_replica[ci] = ri - # Round-robin counters for replica selection per logical stage - self._replica_rr: list[int] = [0] * self.num_logical_stages - # Backward compat: num_stages now means num_logical_stages self.num_stages = self.num_logical_stages @@ -186,26 +199,14 @@ def _choose_client_index( logical_stage_id: int, req_state: OrchestratorRequestState, ) -> int: - """Pick a client for *logical_stage_id* and record the choice. + """Pick a flat client_index for *logical_stage_id* via the stage pool. - If this request already has a chosen client for the logical stage, - return the existing one (affinity). Otherwise round-robin among the - available replicas. + Thin wrapper that delegates to ``StagePool.select_replica`` so the + flat-index-based call sites keep working. New code should call the + pool directly when the StageReplica object itself is useful. 
""" - existing = req_state.chosen_client_index.get(logical_stage_id) - if existing is not None: - return existing - - candidates = self.logical_stage_to_clients[logical_stage_id] - if len(candidates) == 1: - chosen = candidates[0] - else: - rr = self._replica_rr[logical_stage_id] - chosen = candidates[rr % len(candidates)] - self._replica_rr[logical_stage_id] = rr + 1 - - req_state.chosen_client_index[logical_stage_id] = chosen - return chosen + replica = self.stage_pools[logical_stage_id].select_replica(req_state) + return replica.flat_index def _resolve_client_index(self, stage_id: int, replica_index: int = 0) -> int: """Resolve (stage_id, replica_index) to a flat client index.""" @@ -251,7 +252,14 @@ async def run(self) -> None: await asyncio.gather(*pending, return_exceptions=True) async def _request_handler(self) -> None: - """Read messages from the main thread via request_async_queue.""" + """Read messages from the main thread via request_async_queue. + + TODO(stage-pool): the while loop below has no top-level try/except, so + any unhandled exception inside a _handle_* coroutine kills this task + and leaves the orchestrator unable to consume further messages. Wrap + each dispatch in a per-message try/except so one bad request can't + wedge the whole engine. + """ while True: msg = await self.request_async_queue.get() msg_type = msg.get("type") @@ -908,26 +916,35 @@ async def _handle_add_companion(self, msg: dict[str, Any]) -> None: ) self.request_states[companion_id] = companion_state - # Use same replica as the parent for affinity, or choose one + # CFG companions must land on the same stage-0 replica as their + # parent so the diffusion stage can fetch both KV caches from a + # single device. Pass affinity_from explicitly; if the parent is + # already gone (aborted between add_request and add_companion) fall + # back to round-robin rather than failing the companion. 
+ stage0_pool = self.stage_pools[0] parent_state = self.request_states.get(parent_id) - if parent_state is not None and 0 in parent_state.chosen_client_index: - client_index = parent_state.chosen_client_index[0] - companion_state.chosen_client_index[0] = client_index - else: - client_index = self._choose_client_index(0, companion_state) + parent_replica: StageReplica | None = None + if parent_state is not None: + parent_flat = parent_state.chosen_client_index.get(0) + if parent_flat is not None: + parent_replica = stage0_pool.get_replica_by_flat_index(parent_flat) + + companion_replica = stage0_pool.select_replica( + companion_state, + affinity_from=parent_replica, + ) companion_state.stage_submit_ts[0] = _time.time() request = companion_prompt # Already a processed OmniEngineCoreRequest - stage_client = self.stage_clients[client_index] - await stage_client.add_request_async(request) + await companion_replica.client.add_request_async(request) logger.info( "[Orchestrator] CFG companion submitted: %s (role=%s, parent=%s, stage-0 replica-%s)", companion_id, role, parent_id, - self._client_to_replica[client_index], + companion_replica.replica_index, ) async def _handle_abort(self, msg: dict[str, Any]) -> None: diff --git a/vllm_omni/engine/stage_pool.py b/vllm_omni/engine/stage_pool.py new file mode 100644 index 00000000000..86a2fdcd77f --- /dev/null +++ b/vllm_omni/engine/stage_pool.py @@ -0,0 +1,116 @@ +"""StagePool: per-logical-stage replica container. + +Groups the {client, output_processor, vllm_config} triple of each replica +under a single logical stage and centralizes replica selection (round-robin ++ per-request affinity). The Orchestrator still owns flat lists as a +compatibility view; StagePool is the canonical lookup going forward. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from vllm_omni.engine.orchestrator import OrchestratorRequestState + + +@dataclass +class StageReplica: + """One replica of a logical stage. + + flat_index is the index into Orchestrator's flat stage_clients list; it + is the value cached in OrchestratorRequestState.chosen_client_index so + existing call sites that resolve a flat client keep working unchanged. + """ + + logical_stage_id: int + replica_index: int + flat_index: int + client: Any + output_processor: Any + vllm_config: Any + + +class StagePool: + """Replicas of one logical stage with RR + affinity selection.""" + + def __init__( + self, + logical_stage_id: int, + stage_type: str | None, + replicas: list[StageReplica], + ) -> None: + if not replicas: + raise ValueError(f"StagePool for logical stage {logical_stage_id} has no replicas") + self.logical_stage_id = logical_stage_id + self.stage_type = stage_type + self.replicas: list[StageReplica] = replicas + self._rr_cursor = 0 + self._by_flat_index: dict[int, StageReplica] = {r.flat_index: r for r in replicas} + + @property + def num_replicas(self) -> int: + return len(self.replicas) + + def get_replica_by_flat_index(self, flat_index: int) -> StageReplica: + return self._by_flat_index[flat_index] + + def select_replica( + self, + req_state: OrchestratorRequestState, + *, + affinity_from: StageReplica | None = None, + ) -> StageReplica: + """Pick a replica for *req_state* and cache the choice. + + Resolution order: + 1. Existing choice recorded on req_state (per-request affinity). + 2. affinity_from (explicit cross-request binding, e.g. CFG companion + inheriting its parent's replica at stage 0). + 3. Round-robin across replicas. 
+ """ + cached = req_state.chosen_client_index.get(self.logical_stage_id) + if cached is not None: + return self._by_flat_index[cached] + + if affinity_from is not None: + if affinity_from.logical_stage_id != self.logical_stage_id: + raise ValueError( + f"affinity_from is for logical stage {affinity_from.logical_stage_id}, " + f"cannot be used to select in stage {self.logical_stage_id}" + ) + chosen = affinity_from + elif self.num_replicas == 1: + chosen = self.replicas[0] + else: + chosen = self.replicas[self._rr_cursor % self.num_replicas] + self._rr_cursor += 1 + + req_state.chosen_client_index[self.logical_stage_id] = chosen.flat_index + return chosen + + +def build_stage_pools( + stage_clients: list[Any], + output_processors: list[Any], + stage_vllm_configs: list[Any], + logical_stage_to_clients: list[list[int]], +) -> list[StagePool]: + """Assemble StagePool list from the flat-list view owned by the engine.""" + pools: list[StagePool] = [] + for logical_id, client_indices in enumerate(logical_stage_to_clients): + replicas = [ + StageReplica( + logical_stage_id=logical_id, + replica_index=ri, + flat_index=ci, + client=stage_clients[ci], + output_processor=output_processors[ci], + vllm_config=stage_vllm_configs[ci], + ) + for ri, ci in enumerate(client_indices) + ] + stage_type = getattr(stage_clients[client_indices[0]], "stage_type", None) + pools.append(StagePool(logical_id, stage_type, replicas)) + return pools From 2b70e89535aca2f29eff74687a6b07b5fd2bd077 Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Mon, 13 Apr 2026 14:55:16 +0800 Subject: [PATCH 12/76] =?UTF-8?q?[Revert]=20Revert=20"[Log]=20Wire=20stat?= =?UTF-8?q?=20loggers=20into=20AsyncOmniEngine=20to=20match=20AsyncLL?= =?UTF-8?q?=E2=80=A6=20(#2716)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: amy-why-3459 --- .../test_async_omni_engine_do_log_stats.py | 56 ------------------ .../test_async_omni_engine_stage_init.py | 2 - 
tests/engine/test_single_stage_mode.py | 3 - vllm_omni/engine/async_omni_engine.py | 58 +------------------ vllm_omni/engine/orchestrator.py | 26 +-------- vllm_omni/entrypoints/async_omni.py | 7 ++- 6 files changed, 8 insertions(+), 144 deletions(-) delete mode 100644 tests/engine/test_async_omni_engine_do_log_stats.py diff --git a/tests/engine/test_async_omni_engine_do_log_stats.py b/tests/engine/test_async_omni_engine_do_log_stats.py deleted file mode 100644 index e2b8c03b935..00000000000 --- a/tests/engine/test_async_omni_engine_do_log_stats.py +++ /dev/null @@ -1,56 +0,0 @@ -"""Guard tests for AsyncOmniEngine.do_log_stats edge cases. - -These are pure-Python tests that bypass __init__ and only exercise the -no-op branches of do_log_stats, so no stage cores / threads are needed. -""" - -import asyncio - -import pytest - -from vllm_omni.engine.async_omni_engine import AsyncOmniEngine - -pytestmark = [pytest.mark.core_model, pytest.mark.cpu] - - -def _make_bare_engine() -> AsyncOmniEngine: - # Bypass __init__ so we don't spin up stage cores; we only need the - # attributes do_log_stats touches. 
- return AsyncOmniEngine.__new__(AsyncOmniEngine) - - -@pytest.mark.asyncio -async def test_do_log_stats_noop_when_manager_missing(): - engine = _make_bare_engine() - engine.logger_manager = None - engine.orchestrator_loop = None - await engine.do_log_stats() # should silently return - - -@pytest.mark.asyncio -async def test_do_log_stats_noop_when_loop_missing(): - engine = _make_bare_engine() - - class _Manager: - def log(self) -> None: # pragma: no cover - must not be called - raise AssertionError("log() should not be called without a loop") - - engine.logger_manager = _Manager() - engine.orchestrator_loop = None - await engine.do_log_stats() - - -@pytest.mark.asyncio -async def test_do_log_stats_noop_when_loop_not_running(): - engine = _make_bare_engine() - - class _Manager: - def log(self) -> None: # pragma: no cover - must not be called - raise AssertionError("log() should not be called on a stopped loop") - - dead_loop = asyncio.new_event_loop() - dead_loop.close() - - engine.logger_manager = _Manager() - engine.orchestrator_loop = dead_loop - await engine.do_log_stats() diff --git a/tests/engine/test_async_omni_engine_stage_init.py b/tests/engine/test_async_omni_engine_stage_init.py index f3973079365..6993f391ebc 100644 --- a/tests/engine/test_async_omni_engine_stage_init.py +++ b/tests/engine/test_async_omni_engine_stage_init.py @@ -31,7 +31,6 @@ def test_initialize_stages_restores_device_visibility_after_diffusion_init(monke from vllm_omni.platforms import current_omni_platform engine = object.__new__(AsyncOmniEngine) - engine.log_stats = False engine.model = "dummy-model" engine.config_path = "dummy-config" engine.num_stages = 1 @@ -283,7 +282,6 @@ def __init__(self, vllm_config, renderer=None): ) engine = object.__new__(AsyncOmniEngine) - engine.log_stats = False _stage_client, _out_proc, _vllm_cfg, input_processor = engine._attach_llm_stage(started) diff --git a/tests/engine/test_single_stage_mode.py b/tests/engine/test_single_stage_mode.py index 
1afe2fd6d9c..2c5bf6cc79c 100644 --- a/tests/engine/test_single_stage_mode.py +++ b/tests/engine/test_single_stage_mode.py @@ -461,7 +461,6 @@ def _build_engine_skeleton( engine.stage_configs = stage_cfgs engine.num_stages = len(stage_cfgs) engine.async_chunk = False - engine.log_stats = False engine.single_stage_mode = single_stage_mode engine._single_stage_id_filter = stage_id_filter engine._omni_master_address = omni_master_address @@ -1367,7 +1366,6 @@ class TestLaunchLlmStageSingleStageMode: def _build_engine_with_oms(self) -> AsyncOmniEngine: engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" - engine.log_stats = False engine.single_stage_mode = True engine._single_stage_id_filter = 0 engine._llm_stage_launch_lock = threading.Lock() @@ -1448,7 +1446,6 @@ def test_spawn_stage_core_used_in_normal_mode(self): """~single_stage_mode → spawn_stage_core + complete_stage_handshake.""" engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" - engine.log_stats = False engine.single_stage_mode = False engine._omni_master_server = None engine._llm_stage_launch_lock = threading.Lock() diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 32e8336f6da..0a2e02d66ef 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -31,7 +31,6 @@ from vllm.tokenizers import cached_tokenizer_from_config from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.input_processor import InputProcessor -from vllm.v1.metrics.loggers import StatLoggerManager from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient @@ -285,7 +284,6 @@ def __init__( self.num_stages = len(self.stage_configs) stage0_args = getattr(self.stage_configs[0], "engine_args", None) if self.num_stages > 0 else None self.async_chunk = bool(getattr(stage0_args, "async_chunk", False)) - self.log_stats = not 
bool(getattr(stage0_args, "disable_log_stats", False)) self.stage_clients: list[Any] = [] self.stage_vllm_configs: list[Any] = [] self.output_processors: list[MultimodalOutputProcessor | None] = [] @@ -415,7 +413,7 @@ def _launch_llm_stage( addresses, proc, handshake_address = spawn_stage_core( vllm_config=vllm_config, executor_class=executor_class, - log_stats=self.log_stats, + log_stats=False, ) started_stage = StartedLlmStage( stage_id=metadata.stage_id, @@ -617,7 +615,7 @@ def _attach_llm_stage( ) output_processor = MultimodalOutputProcessor( tokenizer=tokenizer, - log_stats=self.log_stats, + log_stats=False, engine_core_output_type=started.metadata.engine_output_type, ) input_processor = None @@ -872,30 +870,6 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: self.default_sampling_params_list = default_sampling_params_list self.stage_metadata = stage_metadata - # Single StatLoggerManager for the whole pipeline, mirroring how - # vLLM AsyncLLM uses one manager with multiple engine indices for DP. - # We treat each stage as a separate "engine_idx" so logs are - # distinguishable as "Engine 000/001/002/...". Using a single manager - # also avoids PrometheusStatLogger registry collisions. 
- self.logger_manager: StatLoggerManager | None = None - if self.log_stats: - base_vllm_config = next( - (cfg for cfg in self.stage_vllm_configs if cfg is not None), - None, - ) - if base_vllm_config is not None: - try: - self.logger_manager = StatLoggerManager( - vllm_config=base_vllm_config, - engine_idxs=list(range(self.num_stages)), - custom_stat_loggers=None, - enable_default_loggers=True, - ) - self.logger_manager.log_engine_initialized() - except Exception: - logger.exception("[AsyncOmniEngine] Failed to build StatLoggerManager") - self.logger_manager = None - def _initialize_janus_queues(self) -> None: """Initialize janus queues inside orchestrator thread loop context.""" self.request_queue = janus.Queue() @@ -912,10 +886,6 @@ def _bootstrap_orchestrator( loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) - # Expose the orchestrator loop so other threads (API server) can - # schedule coroutines onto it via run_coroutine_threadsafe, keeping - # single-threaded access to StatLoggerManager (mirrors AsyncLLM). - self.orchestrator_loop = loop async def _run_orchestrator() -> None: self._initialize_janus_queues() @@ -929,7 +899,6 @@ async def _run_orchestrator() -> None: stage_clients=self.stage_clients, output_processors=self.output_processors, stage_vllm_configs=self.stage_vllm_configs, - logger_manager=self.logger_manager, ) if not startup_future.done(): startup_future.set_result(asyncio.get_running_loop()) @@ -1554,29 +1523,6 @@ async def abort_async(self, request_ids: list[str]) -> None: """Async abort API.""" self.abort(request_ids) - async def do_log_stats(self) -> None: - """Flush the StatLoggerManager on the orchestrator thread. - - ``StatLoggerManager`` is only safe to access from the orchestrator - loop (where ``record()`` runs). Schedule ``log()`` onto that loop - via ``run_coroutine_threadsafe`` so all access stays single-threaded, - matching upstream vLLM ``AsyncLLM``. 
- """ - manager = self.logger_manager - if manager is None: - return - loop = getattr(self, "orchestrator_loop", None) - if loop is None or not loop.is_running(): - return - - async def _log() -> None: - manager.log() - - try: - await asyncio.wrap_future(asyncio.run_coroutine_threadsafe(_log(), loop)) - except Exception: - logger.exception("[AsyncOmniEngine] do_log_stats failed") - def collective_rpc( self, method: str, diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index e64fd3685cf..386b545eb75 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -22,8 +22,6 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.v1.engine import EngineCoreOutputs -from vllm.v1.metrics.loggers import StatLoggerManager -from vllm.v1.metrics.stats import IterationStats from vllm_omni.distributed.omni_connectors.adapter import compute_talker_prompt_ids_length from vllm_omni.engine import ( @@ -124,7 +122,6 @@ def __init__( stage_vllm_configs: list[Any], *, async_chunk: bool = False, - logger_manager: StatLoggerManager | None = None, ) -> None: self.request_async_queue = request_async_queue self.output_async_queue = output_async_queue @@ -136,8 +133,6 @@ def __init__( self.stage_clients: list[Any] = stage_clients self.output_processors: list[Any] = output_processors self.stage_vllm_configs: list[Any] = stage_vllm_configs - self.logger_manager: StatLoggerManager | None = logger_manager - self.log_stats = self.logger_manager is not None # Per-request state self.request_states: dict[str, OrchestratorRequestState] = {} @@ -629,13 +624,10 @@ async def _process_stage_outputs(self, stage_id: int, raw_outputs: EngineCoreOut """ processor = self.output_processors[stage_id] - num_outputs = len(raw_outputs.outputs) - iteration_stats = IterationStats() if (self.log_stats and num_outputs) else None - processed = processor.process_outputs( raw_outputs.outputs, 
raw_outputs.timestamp, - iteration_stats, + None, ) if processed.reqs_to_abort: @@ -644,22 +636,6 @@ async def _process_stage_outputs(self, stage_id: int, raw_outputs: EngineCoreOut if raw_outputs.scheduler_stats is not None: processor.update_scheduler_stats(raw_outputs.scheduler_stats) - # Mirror vLLM AsyncLLM output_handler: feed stats to the logger - # manager so LoggingStatLogger can periodically print KV cache / - # prefix cache hit rate, and PrometheusStatLogger can publish. - if self.logger_manager is not None: - try: - self.logger_manager.record( - engine_idx=stage_id, - scheduler_stats=raw_outputs.scheduler_stats, - iteration_stats=iteration_stats, - ) - except Exception: - logger.exception( - "[Orchestrator] stat logger record failed for stage-%s", - stage_id, - ) - return processed.request_outputs async def _handle_add_request(self, msg: dict[str, Any]) -> None: diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py index 0b25ce71418..129ef3c99d8 100644 --- a/vllm_omni/entrypoints/async_omni.py +++ b/vllm_omni/entrypoints/async_omni.py @@ -743,8 +743,11 @@ async def is_tracing_enabled(self) -> bool: return False async def do_log_stats(self) -> None: - """Log statistics via the engine, mirroring vLLM ``AsyncLLM``.""" - await self.engine.do_log_stats() + """Log statistics. + + TODO: Forward to Orchestrator process via message. 
+ """ + pass async def get_supported_tasks(self) -> tuple[SupportedTask, ...]: """Return the task set exposed by the orchestrator-backed engine.""" From 0d4e975e1bf6c574babc7e8279db2b4ff612dd22 Mon Sep 17 00:00:00 2001 From: NATURE Date: Mon, 13 Apr 2026 16:01:14 +0800 Subject: [PATCH 13/76] [core]refactor communication layer: PR1(Added Refactor Infra Only) (#1555) Signed-off-by: natureofnature Co-authored-by: Hongsheng Liu --- .../test_chunk_scheduling_coordinator.py | 690 ++++++ tests/worker/test_omni_connector_mixin.py | 1419 +++++++++++ .../core/sched/omni_scheduling_coordinator.py | 380 +++ .../worker/diffusion_model_runner.py | 3 +- vllm_omni/outputs.py | 28 + vllm_omni/worker/gpu_ar_model_runner.py | 3 +- .../worker/gpu_generation_model_runner.py | 3 +- .../omni_connector_model_runner_mixin.py | 2125 +++++++++++++++++ vllm_omni/worker/payload_span.py | 64 + 9 files changed, 4712 insertions(+), 3 deletions(-) create mode 100644 tests/core/sched/test_chunk_scheduling_coordinator.py create mode 100644 tests/worker/test_omni_connector_mixin.py create mode 100644 vllm_omni/core/sched/omni_scheduling_coordinator.py create mode 100644 vllm_omni/worker/omni_connector_model_runner_mixin.py create mode 100644 vllm_omni/worker/payload_span.py diff --git a/tests/core/sched/test_chunk_scheduling_coordinator.py b/tests/core/sched/test_chunk_scheduling_coordinator.py new file mode 100644 index 00000000000..5e19465e224 --- /dev/null +++ b/tests/core/sched/test_chunk_scheduling_coordinator.py @@ -0,0 +1,690 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for OmniSchedulingCoordinator (formerly ChunkSchedulingCoordinator). + +These tests use mock request objects and mock queues. They do not require +GPU, vLLM runtime, or any connector. 
+""" + +from __future__ import annotations + +import unittest +from types import SimpleNamespace + +import vllm_omni.core.sched.omni_scheduling_coordinator as coord_mod +from vllm_omni.core.sched.omni_scheduling_coordinator import ( + ChunkSchedulingCoordinator, + OmniSchedulingCoordinator, +) + +# ------------------------------------------------------------------ # +# Mock helpers +# ------------------------------------------------------------------ # + + +class _RequestStatus: + WAITING = "waiting" + RUNNING = "running" + WAITING_FOR_CHUNK = "waiting_for_chunk" + WAITING_FOR_INPUT = "waiting_for_input" + FINISHED_STOPPED = "finished_stopped" + + +# Patch RequestStatus for tests that don't import vllm +try: + from vllm.v1.request import RequestStatus +except ImportError: + RequestStatus = _RequestStatus # type: ignore[misc,assignment] + +if not hasattr(RequestStatus, "WAITING_FOR_INPUT"): + coord_mod.RequestStatus = _RequestStatus # type: ignore[assignment] + RequestStatus = _RequestStatus # type: ignore[misc,assignment] + + +def _make_request(req_id: str, status: str = "waiting") -> SimpleNamespace: + return SimpleNamespace( + request_id=req_id, + external_req_id=req_id, + status=status, + additional_information=None, + prompt_token_ids=[], + num_prompt_tokens=0, + num_computed_tokens=0, + _all_token_ids=[], + _output_token_ids=[], + ) + + +class MockQueue: + """Simplified queue that mimics the Scheduler waiting queue interface.""" + + def __init__(self, items: list | None = None): + self._items: list = list(items or []) + + def __iter__(self): + return iter(self._items) + + def __len__(self): + return len(self._items) + + def __contains__(self, item): + return item in self._items + + def add_request(self, request): + self._items.append(request) + + def prepend_requests(self, requests): + self._items = list(requests) + self._items + + def remove(self, request): + self._items.remove(request) + + def remove_requests(self, requests): + remove_set = set(id(r) for r 
in requests) + self._items = [r for r in self._items if id(r) not in remove_set] + + +# ------------------------------------------------------------------ # +# Tests +# ------------------------------------------------------------------ # + + +class TestChunkCoordinatorStateTransition(unittest.TestCase): + """Test 5: process_pending_chunks transitions WAITING_FOR_CHUNK → target.""" + + def test_ready_request_transitions_to_waiting(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1, async_chunk=True) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids={"r1"}, + chunk_finished_req_ids=set(), + ) + + self.assertEqual(req.status, RequestStatus.WAITING) + self.assertIn("r1", coord.requests_with_ready_chunks) + + def test_non_ready_stays_waiting_for_chunk(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1, async_chunk=True) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids=set(), + chunk_finished_req_ids=set(), + ) + + self.assertEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) + + def test_stage_0_is_noop(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=0) + req = _make_request("r1") + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids={"r1"}, + chunk_finished_req_ids=set(), + ) + self.assertNotEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) + + +class TestChunkCoordinatorRestoreQueues(unittest.TestCase): + """Test 6: restore_queues returns waiting-for-chunk requests.""" + + def test_restore(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + r1 = _make_request("r1") + r2 = 
_make_request("r2") + coord._waiting_for_chunk_waiting.append(r1) + coord._waiting_for_chunk_running.append(r2) + + waiting = MockQueue() + running: list = [] + + coord.restore_queues(waiting, running) + + self.assertIn(r1, waiting) + self.assertIn(r2, running) + self.assertEqual(len(coord._waiting_for_chunk_waiting), 0) + self.assertEqual(len(coord._waiting_for_chunk_running), 0) + + +class TestChunkCoordinatorFinishedSignal(unittest.TestCase): + """Test 8: chunk_finished_req_ids → finished_requests.""" + + def test_finished_signal(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1, async_chunk=True) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids={"r1"}, + chunk_finished_req_ids={"r1"}, + ) + + self.assertIn("r1", coord.finished_requests) + + +class TestChunkCoordinatorUpdateRequestMetadata(unittest.TestCase): + """Test update_request_metadata applies scheduling metadata to requests.""" + + def test_ar_mode_no_longer_sets_additional_information(self): + """AR mode only processes scheduling metadata, not full payloads.""" + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + req = _make_request("r1") + requests = {"r1": req} + + # Only scheduling metadata is passed now (full payload stays in model runner) + request_metadata = {"r1": {"next_stage_prompt_len": 50}} + + coord.update_request_metadata(requests, request_metadata, model_mode="ar") + + # next_stage_prompt_len should update prompt_token_ids + self.assertEqual(len(req.prompt_token_ids), 50) + self.assertEqual(req.num_prompt_tokens, 50) + # additional_information should NOT be set + self.assertIsNone(getattr(req, "additional_information", None)) + + def test_generation_mode(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + req = _make_request("r1") + 
req.prompt_token_ids = [0, 0, 0] + requests = {"r1": req} + + request_metadata = { + "r1": { + "code_predictor_codes": [10, 20, 30], + "left_context_size": 25, + } + } + + coord.update_request_metadata(requests, request_metadata, model_mode="generation") + + self.assertEqual(req.prompt_token_ids, [10, 20, 30]) + self.assertEqual(req.num_computed_tokens, 0) + self.assertIsNone(req.additional_information) + self.assertEqual(req._omni_initial_model_buffer, {"left_context_size": 25}) + + +class TestChunkCoordinatorPostprocess(unittest.TestCase): + """Test postprocess_scheduler_output clears ready chunks.""" + + def test_clear_ready(self): + coord = ChunkSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + coord.requests_with_ready_chunks = {"r1", "r2"} + + new_req = SimpleNamespace(req_id="r1") + cached_reqs = SimpleNamespace(req_ids=["r2"]) + scheduler_output = SimpleNamespace( + scheduled_new_reqs=[new_req], + scheduled_cached_reqs=cached_reqs, + ) + + coord.postprocess_scheduler_output(scheduler_output) + + self.assertEqual(coord.requests_with_ready_chunks, set()) + + +class TestWaitingForInputTransition(unittest.TestCase): + """Test B8: process_pending_full_payload_inputs transitions WAITING_FOR_INPUT.""" + + def test_transition_on_recv(self): + coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_full_payload_inputs( + waiting, + running, + stage_recv_req_ids={"r1"}, + ) + + self.assertEqual(req.status, RequestStatus.WAITING) + + def test_stays_waiting_for_input_if_not_received(self): + coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_full_payload_inputs( + waiting, + running, + stage_recv_req_ids=set(), + ) + + 
self.assertEqual(req.status, RequestStatus.WAITING_FOR_INPUT) + self.assertEqual(len(coord._waiting_for_input), 1) + + def test_stage_0_is_noop(self): + coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=0) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_full_payload_inputs( + waiting, + running, + stage_recv_req_ids={"r1"}, + ) + self.assertEqual(req.status, RequestStatus.WAITING_FOR_INPUT) + + def test_restore_queues_includes_waiting_for_input(self): + coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + r1 = _make_request("r1") + coord._waiting_for_input.append(r1) + + waiting = MockQueue() + running: list = [] + + coord.restore_queues(waiting, running) + + self.assertIn(r1, waiting) + self.assertEqual(len(coord._waiting_for_input), 0) + + def test_full_payload_mode_auto_transitions_waiting_to_waiting_for_input(self): + """In full_payload_mode (async_chunk=False), fresh WAITING requests on + non-Stage-0 should be transitioned to WAITING_FOR_INPUT.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=False, + ) + + req = _make_request("r1", status=RequestStatus.WAITING) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_full_payload_inputs( + waiting, + running, + stage_recv_req_ids=set(), + ) + + self.assertEqual(req.status, RequestStatus.WAITING_FOR_INPUT) + self.assertEqual(len(coord._waiting_for_input), 1) + self.assertEqual(len(coord.pending_input_registrations), 1) + + def test_async_chunk_mode_does_not_auto_transition(self): + """In async_chunk mode, fresh WAITING requests should NOT be + transitioned to WAITING_FOR_INPUT.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=True, + ) + + req = _make_request("r1", status=RequestStatus.WAITING) + waiting = MockQueue([req]) + running: list = [] + + 
coord.process_pending_full_payload_inputs( + waiting, + running, + stage_recv_req_ids=set(), + ) + + self.assertEqual(req.status, RequestStatus.WAITING) + + def test_pending_input_registrations(self): + coord = OmniSchedulingCoordinator(scheduler_max_num_seqs=10, stage_id=1) + + req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_full_payload_inputs( + waiting, + running, + stage_recv_req_ids=set(), + ) + + self.assertEqual(len(coord.pending_input_registrations), 1) + self.assertEqual(coord.pending_input_registrations[0].request_id, "r1") + + +class TestTimeoutDetection(unittest.TestCase): + """Regression tests for orphaned pending-recv timeout detection. + + Covers the full lifecycle: + 1. Request enters WAITING_FOR_CHUNK from either waiting or running queue + 2. restore_queues() moves it back to the scheduler queue + 3. Timeout fires via collect_timed_out_request_ids() + 4. Scheduler removes from both queues and calls _free_request() + """ + + def test_waiting_since_recorded_on_chunk_wait(self): + """_waiting_since is set when a request enters WAITING_FOR_CHUNK.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=True, + ) + req = _make_request("r1", status=RequestStatus.WAITING) + waiting = MockQueue([req]) + + coord.process_pending_chunks( + waiting, + [], + chunk_ready_req_ids=set(), + chunk_finished_req_ids=set(), + ) + + self.assertIn("r1", coord._waiting_since) + self.assertEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) + + def test_waiting_since_cleared_on_chunk_arrival(self): + """_waiting_since is cleared when a chunk arrives.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=True, + ) + req = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) + waiting = MockQueue([req]) + + coord.process_pending_chunks( + waiting, + [], + chunk_ready_req_ids={"r1"}, + 
chunk_finished_req_ids=set(), + ) + + self.assertNotIn("r1", coord._waiting_since) + + def test_waiting_since_recorded_on_input_wait(self): + """_waiting_since is set when a request enters WAITING_FOR_INPUT.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=False, + ) + req = _make_request("r1", status=RequestStatus.WAITING) + waiting = MockQueue([req]) + + coord.process_pending_full_payload_inputs( + waiting, + [], + stage_recv_req_ids=set(), + ) + + self.assertIn("r1", coord._waiting_since) + + def test_waiting_since_cleared_on_input_arrival(self): + """_waiting_since is cleared when input data arrives.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=False, + ) + req = _make_request("r1", status=RequestStatus.WAITING_FOR_INPUT) + coord._waiting_for_input.append(req) + coord._waiting_since["r1"] = 0.0 + + waiting = MockQueue() + coord.process_pending_full_payload_inputs( + waiting, + [], + stage_recv_req_ids={"r1"}, + ) + + self.assertNotIn("r1", coord._waiting_since) + self.assertEqual(req.status, RequestStatus.WAITING) + + def test_collect_timed_out_request_ids_no_timeout(self): + """No IDs returned when nothing has timed out.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + ) + import time + + coord._waiting_since["r1"] = time.monotonic() + + result = coord.collect_timed_out_request_ids(timeout_s=300.0) + self.assertEqual(result, set()) + + def test_collect_timed_out_request_ids_expired(self): + """Timed-out IDs are returned and _waiting_since is cleared.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + ) + coord._waiting_since["r1"] = 0.0 # epoch → definitely expired + coord._waiting_since["r2"] = 0.0 + + import time + + coord._waiting_since["r3"] = time.monotonic() + 9999 # far future + + result = coord.collect_timed_out_request_ids(timeout_s=1.0) + + self.assertEqual(result, {"r1", "r2"}) + 
self.assertNotIn("r1", coord._waiting_since) + self.assertNotIn("r2", coord._waiting_since) + self.assertIn("r3", coord._waiting_since) + + def test_collect_removes_from_coordinator_queues(self): + """Timed-out requests are defensively removed from internal queues.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + ) + r1 = _make_request("r1") + r2 = _make_request("r2") + coord._waiting_for_chunk_waiting.append(r1) + coord._waiting_for_input.append(r2) + coord._waiting_since["r1"] = 0.0 + coord._waiting_since["r2"] = 0.0 + + result = coord.collect_timed_out_request_ids(timeout_s=1.0) + + self.assertEqual(result, {"r1", "r2"}) + self.assertEqual(len(coord._waiting_for_chunk_waiting), 0) + self.assertEqual(len(coord._waiting_for_input), 0) + + def test_free_finished_request_clears_waiting_since(self): + """free_finished_request clears _waiting_since.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + ) + coord._waiting_since["r1"] = 0.0 + coord.free_finished_request("r1") + self.assertNotIn("r1", coord._waiting_since) + + def test_timeout_from_running_queue_full_lifecycle(self): + """End-to-end: request from running → WAITING_FOR_CHUNK → restore → + timeout → removed from running list. + + This is the critical regression case: WAITING_FOR_CHUNK requests + that originated from self.running are placed back into self.running + by restore_queues(), but their status remains WAITING_FOR_CHUNK. + The scheduler must remove from BOTH queues unconditionally. 
+ """ + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=True, + ) + + # 1) Request starts in running queue with WAITING status + req = _make_request("r1", status=RequestStatus.WAITING) + running = [req] + waiting = MockQueue() + + # 2) process_pending_chunks: moves to WAITING_FOR_CHUNK + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids=set(), + chunk_finished_req_ids=set(), + ) + self.assertEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) + self.assertIn("r1", coord._waiting_since) + self.assertEqual(len(coord._waiting_for_chunk_running), 1) + + # 3) restore_queues: back to running (status stays WAITING_FOR_CHUNK) + coord.restore_queues(waiting, running) + self.assertIn(req, running) + self.assertEqual(len(coord._waiting_for_chunk_running), 0) + self.assertEqual(req.status, RequestStatus.WAITING_FOR_CHUNK) + + # 4) Force timeout by setting _waiting_since to epoch + coord._waiting_since["r1"] = 0.0 + + timed_out_ids = coord.collect_timed_out_request_ids(timeout_s=1.0) + self.assertEqual(timed_out_ids, {"r1"}) + + # 5) Scheduler removes from both queues (simulating the scheduler path) + timed_out_id_set = {id(req)} + running = [r for r in running if id(r) not in timed_out_id_set] + waiting.remove_requests([req]) + + self.assertNotIn(req, running) + self.assertEqual(len(waiting), 0) + + def test_timeout_from_waiting_queue_full_lifecycle(self): + """End-to-end: request from waiting → WAITING_FOR_CHUNK → restore → + timeout → removed from waiting queue.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=10, + stage_id=1, + async_chunk=True, + ) + + req = _make_request("r1", status=RequestStatus.WAITING) + waiting = MockQueue([req]) + running: list = [] + + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids=set(), + chunk_finished_req_ids=set(), + ) + self.assertEqual(len(coord._waiting_for_chunk_waiting), 1) + + coord.restore_queues(waiting, running) + 
self.assertIn(req, waiting) + + coord._waiting_since["r1"] = 0.0 + timed_out_ids = coord.collect_timed_out_request_ids(timeout_s=1.0) + self.assertEqual(timed_out_ids, {"r1"}) + + waiting.remove_requests([req]) + self.assertEqual(len(waiting), 0) + + +class TestOverflowPreemption(unittest.TestCase): + """Tests for P1-1: overflow requests must get WAITING status. + + Overflow happens when multiple WAITING_FOR_CHUNK requests in + ``_waiting_for_chunk_running`` receive their chunk in the same cycle. + ``_process_chunk_queue`` restores them to RUNNING (``continue`` + path) while RUNNING requests without chunks are moved out. If the + net result exceeds ``scheduler_max_num_seqs``, the tail is pushed + to ``waiting_queue`` and must have status == WAITING. + """ + + def test_overflow_sets_waiting_status(self): + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=1, + stage_id=1, + async_chunk=True, + ) + + # r1 is currently RUNNING in the queue. + # r2, r3 were previously moved to _waiting_for_chunk_running. 
+ r1 = _make_request("r1", status=RequestStatus.RUNNING) + r2 = _make_request("r2", status=RequestStatus.WAITING_FOR_CHUNK) + r3 = _make_request("r3", status=RequestStatus.WAITING_FOR_CHUNK) + + running = [r1] + waiting = MockQueue([]) + coord._waiting_for_chunk_running.extend([r2, r3]) + + # restore_queues puts r2, r3 back into running + coord.restore_queues(waiting, running) + self.assertEqual(len(running), 3) + + # Now process_pending_chunks with r2, r3 chunks ready: + # _process_chunk_queue will: + # r1 (RUNNING) → no chunk → move to _waiting_for_chunk_running + # r2 (WAITING_FOR_CHUNK, chunk ready) → set RUNNING, stay in running + # r3 (WAITING_FOR_CHUNK, chunk ready) → set RUNNING, stay in running + # running = [r2, r3], len=2 > max=1 → overflow + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids={"r2", "r3"}, + chunk_finished_req_ids=set(), + ) + + self.assertEqual(len(running), 1) + self.assertEqual(len(waiting), 1) + overflow_req = list(waiting)[0] + self.assertEqual( + overflow_req.status, + RequestStatus.WAITING, + f"Overflowed request should have WAITING status, got {overflow_req.status}", + ) + + def test_overflow_does_not_strand_request(self): + """Without the fix, the overflowed request would keep its + RUNNING status in the waiting queue and never be re-scheduled.""" + coord = OmniSchedulingCoordinator( + scheduler_max_num_seqs=1, + stage_id=1, + async_chunk=True, + ) + + r1 = _make_request("r1", status=RequestStatus.WAITING_FOR_CHUNK) + r2 = _make_request("r2", status=RequestStatus.WAITING_FOR_CHUNK) + coord._waiting_for_chunk_running.extend([r1, r2]) + + running: list = [] + waiting = MockQueue([]) + + coord.restore_queues(waiting, running) + self.assertEqual(len(running), 2) + + coord.process_pending_chunks( + waiting, + running, + chunk_ready_req_ids={"r1", "r2"}, + chunk_finished_req_ids=set(), + ) + + self.assertEqual(len(running), 1) + self.assertEqual(len(waiting), 1) + for req in waiting: + 
self.assertNotEqual(req.status, RequestStatus.RUNNING, "Overflowed request must not keep RUNNING status") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/worker/test_omni_connector_mixin.py b/tests/worker/test_omni_connector_mixin.py new file mode 100644 index 00000000000..0e162a37e5b --- /dev/null +++ b/tests/worker/test_omni_connector_mixin.py @@ -0,0 +1,1419 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for OmniConnectorModelRunnerMixin. + +These tests use a mock connector (in-memory dict store) and do not require +GPU or vLLM runtime. +""" + +from __future__ import annotations + +import time +import unittest +from types import SimpleNamespace +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from vllm_omni.outputs import OmniConnectorOutput +from vllm_omni.worker.omni_connector_model_runner_mixin import ( + OmniConnectorModelRunnerMixin, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + +# ------------------------------------------------------------------ # +# Mock helpers +# ------------------------------------------------------------------ # + + +class MockConnector: + """In-memory connector for testing (mimics OmniConnectorBase).""" + + def __init__(self, stage_id: int = 0): + self.stage_id = stage_id + self._store: dict[str, Any] = {} + + def put(self, from_stage, to_stage, put_key, data): + key = f"{from_stage}_{to_stage}_{put_key}" + self._store[key] = data + return True, len(str(data)), None + + def get(self, from_stage, to_stage, get_key, metadata=None): + key = f"{from_stage}_{to_stage}_{get_key}" + data = self._store.pop(key, None) + if data is None: + return None + return data, len(str(data)) + + def close(self): + pass + + +def _make_model_config( + stage_id: int = 0, + async_chunk: bool = False, + worker_type: str = "ar", + custom_func: str | None = None, +) -> SimpleNamespace: + 
return SimpleNamespace( + stage_connector_config=None, + async_chunk=async_chunk, + worker_type=worker_type, + custom_process_next_stage_input_func=custom_func, + ) + + +def _make_request(req_id: str, external_req_id: str | None = None): + r = SimpleNamespace( + request_id=req_id, + external_req_id=external_req_id or req_id, + additional_information=None, + prompt_token_ids=[], + num_computed_tokens=0, + ) + return r + + +class MixinHost(OmniConnectorModelRunnerMixin): + """Minimal class that mixes in the mixin for testing.""" + + pass + + +class _FakeTPGroup: + def __init__(self, *, world_size: int, rank_in_group: int, follower_result: Any = None): + self.world_size = world_size + self.rank_in_group = rank_in_group + self.follower_result = follower_result + self.broadcast_inputs: list[Any] = [] + + def broadcast_object(self, obj: Any | None = None, src: int = 0): + self.broadcast_inputs.append(obj) + if self.rank_in_group == src: + return obj + return self.follower_result + + +# ------------------------------------------------------------------ # +# Test cases +# ------------------------------------------------------------------ # + + +class TestMixinAsyncChunkSendRecv(unittest.TestCase): + """Test 2: Async chunk send/recv + bg threads.""" + + def test_send_chunk_passes_is_finished_and_connector(self): + connector = MockConnector(stage_id=0) + + sender = MixinHost() + sender.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=0, async_chunk=True), + ) + sender._omni_connector = connector + sender._stage_id = 0 + sender._async_chunk = True + + seen = {} + + def mock_process(transfer_manager, pooling_output, request, is_finished=False): + seen["connector"] = transfer_manager.connector + seen["is_finished"] = is_finished + return {"data": pooling_output, "finished": is_finished} + + sender._custom_process_func = mock_process + + request = _make_request("req-1", "ext-req-1") + request.is_finished = lambda: True + 
sender._send_single_request( + { + "stage_id": 0, + "next_stage_id": 1, + "request_id": "ext-req-1", + "request": request, + "pooling_output": {"value": 42}, + } + ) + self.assertIs(seen["connector"], connector) + self.assertTrue(seen["is_finished"]) + + sender.shutdown_omni_connectors() + + def test_send_chunk_does_not_retry_real_type_error(self): + connector = MockConnector(stage_id=0) + + sender = MixinHost() + sender.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(stage_id=0, async_chunk=True), + ) + sender._omni_connector = connector + sender._stage_id = 0 + sender._async_chunk = True + + seen = {"calls": 0} + + def broken_process(transfer_manager, pooling_output, request, is_finished=""): + seen["calls"] += 1 + return {"data": is_finished + "tail"} + + sender._custom_process_func = broken_process + + request = _make_request("req-1", "ext-req-1") + request.is_finished = lambda: True + ok = sender.send_chunk(request, pooling_output={"value": 42}) + self.assertFalse(ok) + self.assertEqual(seen["calls"], 1) + + sender.shutdown_omni_connectors() + + +class TestMixinKVCacheTransfer(unittest.TestCase): + """Test 3: KV cache delegation to OmniKVTransferManager.""" + + def test_send_kv_delegates(self): + mock_kvm = MagicMock() + mock_kvm.handle_finished_requests_kv_transfer.return_value = ["req-1"] + + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + kv_transfer_manager=mock_kvm, + ) + + result = host.send_kv_cache( + finished_reqs={"req-1": {"seq_len": 10, "block_ids": [0]}}, + kv_caches=[], + block_size=16, + cache_dtype="float16", + ) + self.assertEqual(result, ["req-1"]) + mock_kvm.handle_finished_requests_kv_transfer.assert_called_once() + + host.shutdown_omni_connectors() + + def test_recv_kv_delegates(self): + mock_kvm = MagicMock() + mock_kvm.receive_kv_cache_for_request.return_value = ({"layer_blocks": {}}, 100) + + host = MixinHost() + host.init_omni_connectors( + 
vllm_config=None, + model_config=_make_model_config(), + kv_transfer_manager=mock_kvm, + ) + + data, size = host.recv_kv_cache("req-1") + self.assertIsNotNone(data) + self.assertEqual(size, 100) + mock_kvm.receive_kv_cache_for_request.assert_called_once() + + host.shutdown_omni_connectors() + + def test_receive_multi_kv_fetches_companions_via_mixin(self): + mock_kvm = MagicMock() + + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + kv_transfer_manager=mock_kvm, + ) + + host.recv_kv_cache = MagicMock( + side_effect=[({"layer_blocks": {"k": [1]}}, 64), ({"layer_blocks": {"k": [2]}}, 32)] + ) + seen = {} + + def collect_cfg(request_id, cfg_role_payloads): + seen["request_id"] = request_id + seen["cfg_role_payloads"] = cfg_role_payloads + return {"cfg_text_kv_metadata": {"seq_len": 3}} + + req = SimpleNamespace( + request_id="req-1", + sampling_params=SimpleNamespace(cfg_kv_request_ids={"cfg_text": "req-1__cfg_text"}), + ) + ok = host.receive_multi_kv_cache(req, cfg_kv_collect_func=collect_cfg) + self.assertTrue(ok) + host.recv_kv_cache.assert_any_call("req-1", target_device=None) + host.recv_kv_cache.assert_any_call("req-1__cfg_text", target_device=None) + mock_kvm.apply_kv_cache_to_request.assert_called_once_with(req, {"layer_blocks": {"k": [1]}}) + self.assertEqual(seen["request_id"], "req-1") + self.assertEqual( + seen["cfg_role_payloads"], + {"cfg_text": ({"layer_blocks": {"k": [2]}}, 32)}, + ) + self.assertEqual(req.sampling_params.cfg_text_kv_metadata, {"seq_len": 3}) + + host.shutdown_omni_connectors() + + def test_receive_multi_kv_skips_inactive_request(self): + mock_kvm = MagicMock() + + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + kv_transfer_manager=mock_kvm, + ) + + host.requests = {} + host.recv_kv_cache = MagicMock(return_value=({"layer_blocks": {"k": [1]}}, 64)) + req = SimpleNamespace(request_id="req-1", sampling_params=None) + + ok = 
host.receive_multi_kv_cache(req) + + self.assertFalse(ok) + host.recv_kv_cache.assert_not_called() + mock_kvm.apply_kv_cache_to_request.assert_not_called() + + host.shutdown_omni_connectors() + + +class TestOmniConnectorOutput(unittest.TestCase): + """Test 4: Output aggregation across transfer modes.""" + + def test_output_aggregation(self): + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + ) + + host._chunk_ready_req_ids.add("req-1") + host._chunk_finished_req_ids.add("req-2") + host._local_request_metadata["req-1"] = {"next_stage_prompt_len": 10} + host._stage_recv_req_ids.add("req-3") + + output = host.get_omni_connector_output() + self.assertIsInstance(output, OmniConnectorOutput) + self.assertEqual(output.chunk_ready_req_ids, {"req-1"}) + self.assertEqual(output.chunk_finished_req_ids, {"req-2"}) + self.assertEqual(output.request_metadata, {"req-1": {"next_stage_prompt_len": 10}}) + self.assertEqual(output.stage_recv_req_ids, {"req-3"}) + + output2 = host.get_omni_connector_output() + self.assertEqual(output2.chunk_ready_req_ids, set()) + self.assertEqual(output2.request_metadata, {}) + + host.shutdown_omni_connectors() + + +class TestMixinNoConnector(unittest.TestCase): + """Edge case: mixin works gracefully without a connector.""" + + def test_no_connector(self): + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + ) + self.assertIsNone(host._omni_connector) + + results = host.recv_full_payload_inputs(scheduler_output=None) + self.assertIsNone(results) + + sent = host.send_full_payload_outputs(None, {"req-1": {}}) + self.assertEqual(sent, []) + + ok = host.send_chunk(_make_request("req-1"), pooling_output={}) + self.assertFalse(ok) + + output = host.get_omni_connector_output() + self.assertIsInstance(output, OmniConnectorOutput) + + host.shutdown_omni_connectors() + + +class TestFinishedLoadReqsDrain(unittest.TestCase): + """Test A1 fix: 
get_omni_connector_output drains _finished_load_reqs.""" + + def test_finished_load_reqs_flow_to_chunk_ready(self): + host = MixinHost() + host.init_omni_connectors( + vllm_config=None, + model_config=_make_model_config(), + ) + + host._finished_load_reqs.add("req-1") + host._finished_load_reqs.add("req-2") + + output = host.get_omni_connector_output() + self.assertIn("req-1", output.chunk_ready_req_ids) + self.assertIn("req-2", output.chunk_ready_req_ids) + + self.assertEqual(len(host._finished_load_reqs), 0) + self.assertEqual(len(host._chunk_ready_req_ids), 0) + + host.shutdown_omni_connectors() + + +class TestLoadCustomFuncSelection(unittest.TestCase): + def test_skips_legacy_stage_list_processors_for_full_payload_mode(self): + legacy_paths = [ + "vllm_omni.model_executor.stage_input_processors.mimo_audio.llm2code2wav", + "vllm_omni.model_executor.stage_input_processors.mammoth_moda2.ar2dit", + "vllm_omni.model_executor.stage_input_processors.cosyvoice3.text2flow", + "vllm_omni.model_executor.stage_input_processors.glm_image.ar2diffusion", + ] + + for func_path in legacy_paths: + selected_path, func = MixinHost._load_custom_func( + SimpleNamespace( + async_chunk=False, + custom_process_input_func=func_path, + custom_process_next_stage_input_func=None, + ) + ) + assert selected_path != func_path + assert func is None or MixinHost._is_connector_payload_builder(func) + + +class TestFullPayloadSendWithCustomFunc(unittest.TestCase): + """Test B4: send_full_payload_outputs with full_payload_mode custom process func.""" + + def test_full_payload_send_passes_is_finished_and_connector(self): + seen = {} + + def full_payload_func(transfer_manager, pooling_output, request, is_finished=False): + seen["connector"] = transfer_manager.connector + seen["is_finished"] = is_finished + seen["data"] = pooling_output + seen["rid"] = request.request_id if request else None + return {"processed": True, "finished": is_finished} + + host = MixinHost() + host.init_omni_connectors( + 
# Continues TestFullPayloadSendWithCustomFunc
# .test_full_payload_send_passes_is_finished_and_connector: remaining
# arguments of the init_omni_connectors( call opened on the previous line.
            vllm_config=None,
            model_config=_make_model_config(),
        )
        host._omni_connector = MockConnector(stage_id=0)
        host._stage_id = 0
        host._custom_process_func = full_payload_func

        req = _make_request("req-1")
        # Force the finished state so the is_finished=True path is exercised.
        req.is_finished = lambda: True
        sent = host.send_full_payload_outputs(
            scheduler_output=None,
            outputs={"req-1": ({"raw": 100}, req)},
        )
        self.assertEqual(sent, ["req-1"])
        self.assertEqual(
            seen,
            {
                "connector": host._omni_connector,
                "is_finished": True,
                "data": {"raw": 100},
                "rid": "req-1",
            },
        )

        host.shutdown_omni_connectors()

    def test_accumulate_and_flush(self):
        """An accumulated output is held until flushed, then processed exactly once."""
        call_log = []

        def full_payload_func(transfer_manager, pooling_output, request):
            call_log.append(request.request_id if request else None)
            return {"processed": True}

        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(),
        )
        host._omni_connector = MockConnector(stage_id=0)
        host._stage_id = 0
        host._custom_process_func = full_payload_func

        req = _make_request("req-1")
        host.accumulate_full_payload_output("req-1", {"raw": 42}, req)
        self.assertEqual(len(host._pending_full_payload_send), 1)

        host.flush_full_payload_outputs({"req-1"})
        self.assertEqual(len(host._pending_full_payload_send), 0)
        self.assertEqual(len(call_log), 1)
        self.assertEqual(call_log[0], "req-1")

        # NOTE(review): presumably gives background send work time to finish
        # before shutdown — confirm; a fixed sleep slows the suite.
        time.sleep(0.1)
        host.shutdown_omni_connectors()


class TestKVSentReqIdsAccumulation(unittest.TestCase):
    """Test that kv_sent_req_ids accumulates results from send_kv_cache."""

    def test_kv_sent_accumulation(self):
        """Ids returned by the KV manager show up once in the connector output."""
        mock_kvm = MagicMock()
        mock_kvm.handle_finished_requests_kv_transfer.return_value = ["req-1", "req-2"]

        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(),
            kv_transfer_manager=mock_kvm,
        )

        host.send_kv_cache(
            finished_reqs={"req-1": {}, "req-2": {}},
            kv_caches=[],
            block_size=16,
            cache_dtype="float16",
        )

        output = host.get_omni_connector_output()
        self.assertIn("req-1", output.kv_sent_req_ids)
        self.assertIn("req-2", output.kv_sent_req_ids)

        # The list is emptied by the first read.
        output2 = host.get_omni_connector_output()
        self.assertEqual(output2.kv_sent_req_ids, [])

        host.shutdown_omni_connectors()


class TestChunkStreamCompletedGuard(unittest.TestCase):
    """Test that register_chunk_recv is skipped after finish sentinel.

    This validates the fix for the race condition where the scheduling
    coordinator re-registers a request for chunk polling after its
    upstream chunk stream has already finished (is_finished sentinel
    received), causing the bg recv thread to poll for a non-existent
    shared-memory segment (e.g. ``_0_7`` when only 7 chunks 0–6 exist).
    """

    def _make_host(self, stage_id: int = 1) -> MixinHost:
        """Build an async-chunk host for ``stage_id`` backed by a MockConnector."""
        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(stage_id=stage_id, async_chunk=True),
        )
        host._omni_connector = MockConnector(stage_id=stage_id)
        host._stage_id = stage_id
        host._async_chunk = True
        return host

    def test_register_blocked_after_finish_sentinel(self):
        """register_chunk_recv must be a no-op after the finish sentinel."""
        host = self._make_host(stage_id=1)

        req = _make_request("req-1", "ext-req-1")

        # Simulate the bg thread having received the finish sentinel:
        with host._lock:
            host._chunk_stream_completed.add("req-1")

        # Now try to re-register — this mimics the coordinator asking
        # the model runner to poll for the next (non-existent) chunk.
# Continues TestChunkStreamCompletedGuard
# .test_register_blocked_after_finish_sentinel.
        host.register_chunk_recv(req)

        # The request must NOT appear in _pending_load_reqs
        self.assertNotIn(
            "req-1",
            host._pending_load_reqs,
            "register_chunk_recv should skip requests whose chunk stream is already complete",
        )

        host.shutdown_omni_connectors()

    def test_register_allowed_before_finish(self):
        """register_chunk_recv works normally before finish sentinel."""
        host = self._make_host(stage_id=1)
        req = _make_request("req-1", "ext-req-1")

        host.register_chunk_recv(req)
        self.assertIn(
            "req-1",
            host._pending_load_reqs,
            "register_chunk_recv should add request to pending when stream is not yet complete",
        )

        host.shutdown_omni_connectors()

    def test_finish_sentinel_populates_completed_set(self):
        """Receiving is_finished=True adds to _chunk_stream_completed."""
        host = self._make_host(stage_id=1)

        # Simulate _poll_single_request receiving is_finished=True
        req_id = "req-1"
        with host._lock:
            host._chunk_finished_req_ids.add(req_id)
            host._chunk_stream_completed.add(req_id)
            host._local_stage_payload_cache[req_id] = {"finished": True}
            host._local_request_metadata[req_id] = {}
            host._finished_load_reqs.add(req_id)
            host._pending_load_reqs.pop(req_id, None)

        self.assertIn(req_id, host._chunk_stream_completed)

        # Subsequent register_chunk_recv should be blocked
        req = _make_request(req_id, f"ext-{req_id}")
        host.register_chunk_recv(req)
        self.assertNotIn(req_id, host._pending_load_reqs)

        host.shutdown_omni_connectors()

    def test_stage_0_always_skipped(self):
        """Stage-0 has no upstream, register_chunk_recv is always no-op."""
        host = self._make_host(stage_id=0)
        # NOTE(review): _make_host(stage_id=0) already assigns _stage_id = 0,
        # so this assignment looks redundant.
        host._stage_id = 0

        req = _make_request("req-1")
        host.register_chunk_recv(req)
        self.assertNotIn("req-1", host._pending_load_reqs)

        host.shutdown_omni_connectors()

    def test_full_payload_recv_guard_still_works(self):
        """Pre-existing guard: staged full-payload results prevent registration."""
        host = self._make_host(stage_id=1)

        with host._lock:
            host._stage_recv_req_ids.add("req-1")

        req = _make_request("req-1", "ext-req-1")
        host.register_chunk_recv(req)
        self.assertNotIn("req-1", host._pending_load_reqs)

        host.shutdown_omni_connectors()


class TestCleanupFinishedRequest(unittest.TestCase):
    """Test cleanup_finished_request frees per-request mixin state."""

    def _make_host(self, stage_id: int = 1) -> MixinHost:
        """Build an async-chunk host for ``stage_id`` backed by a MockConnector."""
        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(stage_id=stage_id, async_chunk=True),
        )
        host._omni_connector = MockConnector(stage_id=stage_id)
        host._stage_id = stage_id
        host._async_chunk = True
        return host

    def test_cleanup_removes_all_state(self):
        """cleanup_finished_request removes all tracking dicts/sets."""
        host = self._make_host(stage_id=1)
        req_id = "req-1"
        ext_id = "ext-req-1"

        # Simulate state accumulated during a request's lifetime
        host._request_ids_mapping[req_id] = ext_id
        host._put_req_chunk[ext_id] = 5
        host._get_req_chunk[req_id] = 3
        host._send_side_request_payload[ext_id] = {"some": "data"}
        host._code_prompt_token_ids[ext_id] = [[1, 2, 3]]
        host._chunk_stream_completed.add(req_id)
        host._stage_recv_req_ids.add(req_id)
        host._local_stage_payload_cache[req_id] = {"engine_inputs": {}}
        host._local_request_metadata[req_id] = {"prompt_len": 10}

        # Cleanup
        host.cleanup_finished_request(req_id)

        # All state should be gone
        self.assertNotIn(req_id, host._request_ids_mapping)
        self.assertNotIn(ext_id, host._put_req_chunk)
        self.assertNotIn(req_id, host._get_req_chunk)
        self.assertNotIn(ext_id, host._send_side_request_payload)
        self.assertNotIn(ext_id, host._code_prompt_token_ids)
        self.assertNotIn(req_id, host._chunk_stream_completed)
        self.assertNotIn(req_id, host._stage_recv_req_ids)
        self.assertNotIn(req_id, host._local_stage_payload_cache)
        self.assertNotIn(req_id, host._local_request_metadata)
# Continues TestCleanupFinishedRequest.test_cleanup_removes_all_state.
        host.shutdown_omni_connectors()

    def test_cleanup_removes_per_cycle_ready_state(self):
        """cleanup_finished_request clears ready/finished carry-over for req-id reuse."""
        host = self._make_host(stage_id=1)
        req_id = "req-1"

        host._pending_load_reqs[req_id] = _make_request(req_id, "ext-req-1")
        host._finished_load_reqs.add(req_id)
        host._chunk_ready_req_ids.add(req_id)
        host._chunk_finished_req_ids.add(req_id)

        host.cleanup_finished_request(req_id)

        self.assertNotIn(req_id, host._pending_load_reqs)
        self.assertNotIn(req_id, host._finished_load_reqs)
        self.assertNotIn(req_id, host._chunk_ready_req_ids)
        self.assertNotIn(req_id, host._chunk_finished_req_ids)

        host.shutdown_omni_connectors()

    def test_cleanup_without_mapping(self):
        """cleanup works for Stage-0 where _request_ids_mapping isn't set."""
        host = self._make_host(stage_id=0)
        # NOTE(review): _make_host(stage_id=0) already assigns _stage_id = 0,
        # so this assignment looks redundant.
        host._stage_id = 0
        req_id = "req-1"

        # Stage-0 uses req_id directly (no ext_id mapping)
        host._put_req_chunk[req_id] = 3
        host._get_req_chunk[req_id] = 0

        host.cleanup_finished_request(req_id)

        self.assertNotIn(req_id, host._put_req_chunk)
        self.assertNotIn(req_id, host._get_req_chunk)

        host.shutdown_omni_connectors()

    def test_prune_inactive_requests_cleans_stale_state_but_keeps_active(self):
        """Inactive request IDs should be pruned without touching active ones."""
        host = self._make_host(stage_id=1)
        active_req_id = "req-active"
        stale_req_id = "req-stale"
        stale_ext_id = "ext-stale"

        host._request_ids_mapping[active_req_id] = "ext-active"
        host._request_ids_mapping[stale_req_id] = stale_ext_id
        host._put_req_chunk[stale_ext_id] = 2
        host._get_req_chunk[stale_req_id] = 1
        host._finished_load_reqs.add(stale_req_id)
        host._chunk_ready_req_ids.update({active_req_id, stale_req_id})
        host._chunk_finished_req_ids.add(stale_req_id)
        host._chunk_stream_completed.add(stale_req_id)
        host._stage_recv_req_ids.add(active_req_id)
        host._send_side_request_payload[stale_ext_id] = {"stale": True}
        host._code_prompt_token_ids[stale_ext_id] = [[1, 2, 3]]

        # Only the active id is passed, so the stale one must be pruned.
        pruned = host.prune_inactive_requests({active_req_id})

        self.assertEqual(pruned, {stale_req_id})
        self.assertIn(active_req_id, host._request_ids_mapping)
        self.assertIn(active_req_id, host._chunk_ready_req_ids)
        self.assertIn(active_req_id, host._stage_recv_req_ids)
        self.assertNotIn(stale_req_id, host._request_ids_mapping)
        self.assertNotIn(stale_ext_id, host._put_req_chunk)
        self.assertNotIn(stale_req_id, host._get_req_chunk)
        self.assertNotIn(stale_req_id, host._pending_load_reqs)
        self.assertNotIn(stale_req_id, host._finished_load_reqs)
        self.assertNotIn(stale_req_id, host._chunk_ready_req_ids)
        self.assertNotIn(stale_req_id, host._chunk_finished_req_ids)
        self.assertNotIn(stale_req_id, host._chunk_stream_completed)
        self.assertNotIn(stale_req_id, host._stage_recv_req_ids)
        self.assertNotIn(stale_ext_id, host._send_side_request_payload)
        self.assertNotIn(stale_ext_id, host._code_prompt_token_ids)

        host.shutdown_omni_connectors()

    def test_prune_inactive_requests_keeps_recently_received_full_payload_state(self):
        """Late bg-thread receives must survive until the scheduler catches up."""
        host = self._make_host(stage_id=1)
        req_id = "req-recv-race"
        ext_id = "ext-recv-race"

        host._request_ids_mapping[req_id] = ext_id
        host._put_req_chunk[ext_id] = 1
        host._local_stage_payload_cache[req_id] = {"engine_inputs": {"ids": [1, 2, 3]}}
        host._local_request_metadata[req_id] = {"next_stage_prompt_len": 3}
        host._stage_recv_req_ids.add(req_id)

        pruned = host.prune_inactive_requests(set())

        self.assertEqual(pruned, set())
        self.assertIn(req_id, host._request_ids_mapping)
        self.assertIn(req_id, host._local_stage_payload_cache)
        self.assertIn(req_id, host._local_request_metadata)
        self.assertIn(req_id, host._stage_recv_req_ids)
        self.assertIn(ext_id, host._put_req_chunk)

        # Once the scheduler has consumed the wake-up and the request really
        # disappears from all protected sets, prune should clean it up.
        host._stage_recv_req_ids.clear()
        host._local_stage_payload_cache.clear()
        host._local_request_metadata.clear()

        pruned = host.prune_inactive_requests(set())

        self.assertEqual(pruned, {req_id})
        self.assertNotIn(req_id, host._request_ids_mapping)
        self.assertNotIn(ext_id, host._put_req_chunk)

        host.shutdown_omni_connectors()


class TestSendChunkCachesMapping(unittest.TestCase):
    """Test that send_chunk caches internal→external req ID mapping."""

    def test_send_chunk_populates_request_ids_mapping(self):
        """send_chunk should cache the internal→external mapping."""
        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(stage_id=0, async_chunk=True),
        )
        host._omni_connector = MockConnector(stage_id=0)
        host._stage_id = 0
        host._async_chunk = True

        def mock_process(transfer_manager, pooling_output, request):
            return {"data": "test", "finished": False}

        host._custom_process_func = mock_process

        request = _make_request("internal-1", "external-1")
        host.send_chunk(request, pooling_output={"v": 1})

        # The mapping should be cached
        self.assertEqual(
            host._request_ids_mapping.get("internal-1"),
            "external-1",
        )

        # NOTE(review): presumably gives background send work time to finish
        # before shutdown — confirm; a fixed sleep slows the suite.
        time.sleep(0.1)
        host.shutdown_omni_connectors()


class TestLocalPayloadCacheLifecycle(unittest.TestCase):
    """Unit tests for the local payload cache API (RFC §2.4)."""

    def _make_host(self) -> MixinHost:
        """Build a stage-0 host backed by a MockConnector."""
        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(stage_id=0),
        )
        host._omni_connector = MockConnector(stage_id=0)
        host._stage_id = 0
        return host

    def test_put_get_pop(self):
        """put stores, get reads back, pop returns the payload and removes it."""
        host = self._make_host()
        payload = {"engine_inputs": {"ids": [1, 2, 3]}}
        host.put_local_stage_payload("r1", payload)

        self.assertEqual(host.get_local_stage_payload("r1"), payload)
        popped = host.pop_local_stage_payload("r1")
        self.assertEqual(popped, payload)
        self.assertIsNone(host.get_local_stage_payload("r1"))
        host.shutdown_omni_connectors()

    def test_recv_full_payload_inputs_populates_local_cache(self):
        """A payload staged before the call is still readable from the cache after it."""
        host = self._make_host()
        # NOTE(review): these two assignments repeat what _make_host() already
        # set (a fresh MockConnector(stage_id=0) and _stage_id = 0); they look
        # redundant.
        host._omni_connector = MockConnector(stage_id=0)
        host._stage_id = 0

        # Simulate a full payload already staged by the bg recv path
        with host._lock:
            host._local_stage_payload_cache["r1"] = {"tok": [10]}
            host._stage_recv_req_ids.add("r1")

        host.recv_full_payload_inputs(scheduler_output=None)
        self.assertEqual(host.get_local_stage_payload("r1"), {"tok": [10]})
        host.shutdown_omni_connectors()

    def test_rank0_only_polls_connector_for_tp_full_payload(self):
        """With local rank 0 in a 2-way TP group, the poll hits the connector once.

        The payload is cached and the request is marked as pending broadcast;
        it is not yet in ``_stage_recv_req_ids`` and has no request metadata.
        """
        host = self._make_host()
        host._omni_connector = MagicMock()
        host._stage_id = 2
        host._local_rank = 0
        host._request_ids_mapping["r1"] = "ext-r1"
        host._get_req_chunk["r1"] = 0
        payload = {"tok": [10], "finished": torch.tensor(True)}
        connector_result = (payload, 123)
        host._omni_connector.get.return_value = connector_result
        tp_group = _FakeTPGroup(world_size=2, rank_in_group=0)

        with patch("vllm_omni.worker.omni_connector_model_runner_mixin.get_tp_group", return_value=tp_group):
            made_progress = host._poll_single_request("r1")

        self.assertTrue(made_progress)
        host._omni_connector.get.assert_called_once_with("1", "2", "ext-r1_1_0")
        self.assertEqual(tp_group.broadcast_inputs, [])
        self.assertEqual(host.get_local_stage_payload("r1"), payload)
        self.assertIn("r1", host._full_payload_pending_broadcast_req_ids)
        self.assertNotIn("r1", host._stage_recv_req_ids)
        self.assertIsNone(host.get_local_request_metadata("r1"))
        host.shutdown_omni_connectors()

    def test_tp_follower_skips_connector_poll_for_full_payload(self):
        """With local rank 1, the poll makes no progress and never calls the connector."""
        host = self._make_host()
        host._omni_connector = MagicMock()
        host._stage_id = 2
        host._local_rank = 1
        host._request_ids_mapping["r1"] = "ext-r1"
        host._get_req_chunk["r1"] = 0
        tp_group = _FakeTPGroup(world_size=2, rank_in_group=1)

        with patch("vllm_omni.worker.omni_connector_model_runner_mixin.get_tp_group", return_value=tp_group):
            made_progress = host._poll_single_request("r1")

        self.assertFalse(made_progress)
        host._omni_connector.get.assert_not_called()
        self.assertEqual(tp_group.broadcast_inputs, [])
        self.assertNotIn("r1", host._local_stage_payload_cache)
        host.shutdown_omni_connectors()

    def test_recv_full_payload_inputs_broadcasts_tp_leader_results_to_followers(self):
        """A rank-1 host adopts the result the fake TP group hands to followers."""
        host = self._make_host()
        host._omni_connector = MagicMock()
        host._stage_id = 2
        host._local_rank = 1
        host._pending_load_reqs["r1"] = object()
        payload = {"tok": [10], "finished": torch.tensor(True)}
        tp_group = _FakeTPGroup(world_size=2, rank_in_group=1, follower_result={"r1": payload})

        with patch("vllm_omni.worker.omni_connector_model_runner_mixin.get_tp_group", return_value=tp_group):
            results = host.recv_full_payload_inputs(scheduler_output=None)

        self.assertEqual(results, {"r1": payload})
        self.assertEqual(host.get_local_stage_payload("r1"), payload)
        self.assertEqual(host.get_local_request_metadata("r1"), {})
        self.assertEqual(host._stage_recv_req_ids, {"r1"})
        self.assertNotIn("r1", host._pending_load_reqs)
        self.assertEqual(tp_group.broadcast_inputs, [None])
        host.shutdown_omni_connectors()


class TestTPAsyncChunkFanout(unittest.TestCase):
    """Async-chunk receive behaviour per tensor-parallel rank."""

    def _make_host(self, rank: int) -> MixinHost:
        """Build a stage-2 ``gen`` host with a MagicMock connector and local rank ``rank``."""
        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(stage_id=2, async_chunk=True, worker_type="gen"),
        )
        host._omni_connector = MagicMock()
        host._stage_id = 2
        host._async_chunk = True
        host._model_mode = "gen"
        host._local_rank = rank
        host._request_ids_mapping["r1"] = "ext-r1"
        host._get_req_chunk["r1"] = 0
        return host

    def test_rank0_only_polls_connector_for_tp_async_chunk(self):
        """With local rank 0, one poll fetches the chunk and stages it locally."""
        host = self._make_host(rank=0)
        payload = {
            "code_predictor_codes": [10, 11],
            "left_context_size": 0,
            "finished": torch.tensor(False),
        }
# Continues TestTPAsyncChunkFanout
# .test_rank0_only_polls_connector_for_tp_async_chunk.
        host._omni_connector.get.return_value = (payload, 123)
        tp_group = _FakeTPGroup(world_size=2, rank_in_group=0)

        with patch("vllm_omni.worker.omni_connector_model_runner_mixin.get_tp_group", return_value=tp_group):
            made_progress = host._poll_single_request("r1")

        self.assertTrue(made_progress)
        host._omni_connector.get.assert_called_once_with("1", "2", "ext-r1_1_0")
        self.assertEqual(host.get_local_stage_payload("r1"), payload)
        self.assertIn("r1", host._finished_load_reqs)
        self.assertIn("r1", host._async_chunk_updated_req_ids)
        # Polling alone must not trigger a broadcast on the fake TP group.
        self.assertEqual(tp_group.broadcast_inputs, [])
        host.shutdown_omni_connectors()

    def test_tp_follower_skips_connector_poll_for_async_chunk(self):
        """With local rank 1, the poll makes no progress and never calls the connector."""
        host = self._make_host(rank=1)
        tp_group = _FakeTPGroup(world_size=2, rank_in_group=1)

        with patch("vllm_omni.worker.omni_connector_model_runner_mixin.get_tp_group", return_value=tp_group):
            made_progress = host._poll_single_request("r1")

        self.assertFalse(made_progress)
        host._omni_connector.get.assert_not_called()
        self.assertIsNone(host.get_local_stage_payload("r1"))
        self.assertEqual(tp_group.broadcast_inputs, [])
        host.shutdown_omni_connectors()

    def test_get_output_broadcasts_tp_async_chunk_payloads_to_followers(self):
        """A rank-1 host builds its output from the packet the fake TP group returns."""
        host = self._make_host(rank=1)
        host._pending_load_reqs["r1"] = object()
        payload = {
            "code_predictor_codes": [10, 11],
            "left_context_size": 0,
            "finished": torch.tensor(True),
        }
        # Packet handed to the follower by _FakeTPGroup in place of a real
        # broadcast.
        packet = {
            "staged_payloads": {"r1": payload},
            "request_metadata": {"r1": {"code_predictor_codes": [10, 11], "left_context_size": 0}},
            "newly_finished": {"r1"},
            "chunk_finished": {"r1"},
        }
        tp_group = _FakeTPGroup(world_size=2, rank_in_group=1, follower_result=packet)

        with patch("vllm_omni.worker.omni_connector_model_runner_mixin.get_tp_group", return_value=tp_group):
            output = host.get_omni_connector_output()

        self.assertEqual(output.chunk_ready_req_ids, {"r1"})
        self.assertEqual(output.chunk_finished_req_ids, {"r1"})
# Continues TestTPAsyncChunkFanout
# .test_get_output_broadcasts_tp_async_chunk_payloads_to_followers.
        self.assertEqual(
            output.request_metadata,
            {"r1": {"code_predictor_codes": [10, 11], "left_context_size": 0}},
        )
        self.assertEqual(host.get_local_stage_payload("r1"), payload)
        self.assertNotIn("r1", host._pending_load_reqs)
        self.assertIn("r1", host._chunk_stream_completed)
        self.assertEqual(tp_group.broadcast_inputs, [None])
        host.shutdown_omni_connectors()


class TestKVTransferLifecycle(unittest.TestCase):
    """Unit tests for KV transfer lifecycle methods."""

    def _make_host(self) -> MixinHost:
        """Build a stage-0 host with no connector override."""
        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(stage_id=0),
        )
        return host

    def test_mark_drain_ack_complete(self):
        """Walk one transfer through mark, drain-pending, ack and drain-completed."""
        host = self._make_host()
        self.assertFalse(host.has_pending_kv_work())

        host.mark_kv_transfer("r1", seq_len=100, block_ids=[0, 1, 2])
        self.assertTrue(host.has_pending_kv_work())
        self.assertTrue(host.is_kv_transfer_triggered("r1"))

        # Drain moves pending → active
        pending = host.drain_pending_kv_transfers()
        self.assertEqual(pending, {"r1": {"seq_len": 100, "block_ids": [0, 1, 2]}})
        self.assertIn("r1", host._kv_active_transfers)
        self.assertTrue(host.has_pending_kv_work())

        # Ack moves active → completed
        host.ack_kv_transfers(["r1"])
        self.assertNotIn("r1", host._kv_active_transfers)
        self.assertIn("r1", host._kv_completed_transfers)

        # Drain completed
        completed = host.drain_completed_kv_transfers()
        self.assertEqual(completed, {"r1"})
        self.assertFalse(host.has_pending_kv_work())
        host.shutdown_omni_connectors()

    def test_mark_dedup(self):
        """Marking the same request twice keeps the values from the first mark."""
        host = self._make_host()
        host.mark_kv_transfer("r1", seq_len=100, block_ids=[0])
        host.mark_kv_transfer("r1", seq_len=200, block_ids=[0, 1])
        # Second mark is a no-op
        self.assertEqual(host._kv_pending_transfers["r1"]["seq_len"], 100)
        host.shutdown_omni_connectors()

    def test_cleanup_removes_kv_state(self):
        """cleanup_finished_request clears KV bookkeeping for a drained transfer."""
        host = self._make_host()
        host.mark_kv_transfer("r1", seq_len=50, block_ids=[0])
# Continues TestKVTransferLifecycle.test_cleanup_removes_kv_state.
        host.drain_pending_kv_transfers()
        host.cleanup_finished_request("r1")
        self.assertFalse(host.is_kv_transfer_triggered("r1"))
        self.assertNotIn("r1", host._kv_active_transfers)
        self.assertFalse(host.has_pending_kv_work())
        host.shutdown_omni_connectors()


class TestAsyncPayloadLifecycle(unittest.TestCase):
    """Regression tests for async payload delivery lifecycle."""

    def test_send_side_request_payload_not_cleared_before_payload_is_consumable(self):
        """The accumulated payload is still present after one output read."""
        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(stage_id=1, async_chunk=True, worker_type="ar"),
        )
        host._request_ids_mapping["r1"] = "r1"
        payload = {
            "thinker_decode_embeddings": torch.ones(1, 2),
            "thinker_output_token_ids": [1],
            "override_keys": ["thinker_decode_embeddings", "thinker_output_token_ids"],
            "finished": torch.tensor(False),
        }

        # Pass a copy so the local ``payload`` dict stays untouched.
        host._accumulate_payload("r1", dict(payload))
        with host._lock:
            host._finished_load_reqs.add("r1")

        host.get_omni_connector_output()
        self.assertIn("r1", host._send_side_request_payload)
        host.shutdown_omni_connectors()

    def test_payload_consumable_ignores_token_horizon_only_updates(self):
        """A payload without ``thinker_decode_embeddings`` is reported as not consumable."""
        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(stage_id=1, async_chunk=True, worker_type="ar"),
        )
        payload = {
            "thinker_output_token_ids": [1, 2, 3],
            "finished": torch.tensor(False),
            "override_keys": [
                "thinker_output_token_ids",
                "thinker_decode_embeddings_token_start",
                "thinker_decode_embeddings_token_end",
            ],
            "thinker_decode_embeddings_token_start": 2,
            "thinker_decode_embeddings_token_end": 3,
        }
        self.assertFalse(host._payload_is_consumable(payload))
        host.shutdown_omni_connectors()

    def test_payload_consumable_accepts_decode_embeddings(self):
        """A payload carrying ``thinker_decode_embeddings`` is reported as consumable."""
        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(stage_id=1, async_chunk=True, worker_type="ar"),
        )
        payload = {
            "thinker_output_token_ids": [1, 2, 3],
            "thinker_decode_embeddings": torch.ones(1, 2),
            "finished": torch.tensor(False),
        }
        self.assertTrue(host._payload_is_consumable(payload))
        host.shutdown_omni_connectors()

    def test_ar_metadata_only_followup_chunk_does_not_rewake_request(self):
        """In ``ar`` mode a second chunk without embeddings updates metadata only.

        The first poll returns decode embeddings and marks the request
        chunk-ready; the second returns only ``next_stage_prompt_len`` and
        must leave ``chunk_ready_req_ids`` empty while still reporting the
        metadata.
        """
        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(stage_id=1, async_chunk=True, worker_type="ar"),
        )
        host._omni_connector = MagicMock()
        host._stage_id = 1
        host._async_chunk = True
        host._model_mode = "ar"
        host._request_ids_mapping["r1"] = "ext-r1"
        host._get_req_chunk["r1"] = 0

        # One connector result per poll, in order.
        host._omni_connector.get.side_effect = [
            (
                {
                    "thinker_decode_embeddings": torch.ones(1, 2),
                    "finished": torch.tensor(False),
                },
                1,
            ),
            (
                {
                    "next_stage_prompt_len": 7,
                    "finished": torch.tensor(False),
                },
                1,
            ),
        ]

        host._poll_single_request("r1")
        output1 = host.get_omni_connector_output()
        self.assertEqual(output1.chunk_ready_req_ids, {"r1"})

        host._poll_single_request("r1")
        output2 = host.get_omni_connector_output()
        self.assertEqual(output2.chunk_ready_req_ids, set())
        self.assertEqual(output2.request_metadata, {"r1": {"next_stage_prompt_len": 7}})

        host.shutdown_omni_connectors()

    def test_non_ar_recv_does_not_overwrite_unconsumed_staged_chunk(self):
        """In ``gen`` mode a poll is skipped while a staged payload is still cached."""
        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(stage_id=2, async_chunk=True, worker_type="gen"),
        )
        host._omni_connector = MagicMock()
        host._stage_id = 2
        host._async_chunk = True
        host._model_mode = "gen"
        host._request_ids_mapping["r1"] = "ext-r1"
        host._get_req_chunk["r1"] = 1
        # Pre-seed the cache with a chunk that has not been consumed yet.
        host._local_stage_payload_cache["r1"] = {
            "code_predictor_codes": [1, 2, 3],
            "left_context_size": 0,
            "finished": torch.tensor(False),
        }

        made_progress = host._poll_single_request("r1")

        self.assertFalse(made_progress)
        host._omni_connector.get.assert_not_called()
# Continues TestAsyncPayloadLifecycle
# .test_non_ar_recv_does_not_overwrite_unconsumed_staged_chunk.
        self.assertEqual(host._get_req_chunk["r1"], 1)

        host.shutdown_omni_connectors()

    def test_non_ar_recv_waits_for_scheduler_handoff_before_fetching_next_chunk(self):
        """In ``gen`` mode the next chunk is fetched only after the output was read.

        While ``r1`` sits in ``_finished_load_reqs`` the poll does nothing;
        after ``get_omni_connector_output()`` has reported it, the next poll
        calls the connector and advances the chunk counter from 1 to 2.
        """
        host = MixinHost()
        host.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(stage_id=2, async_chunk=True, worker_type="gen"),
        )
        host._omni_connector = MagicMock()
        host._stage_id = 2
        host._async_chunk = True
        host._model_mode = "gen"
        host._request_ids_mapping["r1"] = "ext-r1"
        host._get_req_chunk["r1"] = 1
        host._local_request_metadata["r1"] = {
            "code_predictor_codes": [10, 11, 12],
            "left_context_size": 0,
        }
        host._finished_load_reqs.add("r1")

        made_progress = host._poll_single_request("r1")

        self.assertFalse(made_progress)
        host._omni_connector.get.assert_not_called()
        self.assertEqual(host._get_req_chunk["r1"], 1)

        output = host.get_omni_connector_output()
        self.assertEqual(output.request_metadata["r1"]["code_predictor_codes"], [10, 11, 12])
        self.assertEqual(output.chunk_ready_req_ids, {"r1"})

        host._omni_connector.get.return_value = (
            {
                "code_predictor_codes": [20, 21, 22],
                "left_context_size": 0,
                "finished": torch.tensor(False),
            },
            1,
        )
        made_progress = host._poll_single_request("r1")

        self.assertTrue(made_progress)
        host._omni_connector.get.assert_called_once()
        self.assertEqual(host._get_req_chunk["r1"], 2)

        host.shutdown_omni_connectors()


class TestRankAwareKVRouting(unittest.TestCase):
    """KV key naming and payload reshaping for differing TP sizes."""

    def _make_host(self, *, from_tp: int, to_tp: int, local_rank: int) -> MixinHost:
        """Build a stage-1 host with the given TP sizes and local rank."""
        host = MixinHost()
        host.init_omni_connectors(vllm_config=None, model_config=_make_model_config(stage_id=1))
        host._from_tp = from_tp
        host._to_tp = to_tp
        host._local_rank = local_rank
        return host

    def test_recv_keys_use_remote_rank_as_from_rank(self):
        """from_tp=4, to_tp=2, local rank 1 yields the two keys ending in ``_2_1`` and ``_3_1``."""
        host = self._make_host(from_tp=4, to_tp=2, local_rank=1)
        self.assertEqual(
            host.get_rank_aware_kv_keys("req", from_stage=0),
            ["req_0_0_2_1", "req_0_0_3_1"],
        )
# Continues TestRankAwareKVRouting
# .test_recv_keys_use_remote_rank_as_from_rank.
        host.shutdown_omni_connectors()

    def test_send_keys_route_from_rank_gt_to_rank(self):
        """from_tp=4, to_tp=2, local rank 3 yields the single send key ``req_0_0_3_1``."""
        host = self._make_host(from_tp=4, to_tp=2, local_rank=3)
        self.assertEqual(host.get_rank_aware_kv_send_keys("req", from_stage=0), ["req_0_0_3_1"])
        host.shutdown_omni_connectors()

    def test_invalid_recv_rank_mapping_raises(self):
        """from_tp=3 with to_tp=2 makes the receive-key lookup raise ValueError."""
        host = self._make_host(from_tp=3, to_tp=2, local_rank=1)
        with self.assertRaises(ValueError):
            host.get_rank_aware_kv_keys("req", from_stage=0)
        host.shutdown_omni_connectors()

    def test_invalid_send_rank_mapping_raises(self):
        """from_tp=3 with to_tp=2 makes the send-key lookup raise ValueError."""
        host = self._make_host(from_tp=3, to_tp=2, local_rank=1)
        with self.assertRaises(ValueError):
            host.get_rank_aware_kv_send_keys("req", from_stage=0)
        host.shutdown_omni_connectors()

    def test_merge_rank_sharded_payloads_concatenates_head_dimension(self):
        """Two (2, 1, 3) shards merge into one (2, 2, 3) tensor, shard order kept on axis 1."""
        host = self._make_host(from_tp=4, to_tp=2, local_rank=0)
        payloads = [
            {"layer_blocks": {"key_cache": [torch.ones(2, 1, 3)], "value_cache": [torch.ones(2, 1, 3)]}},
            {"layer_blocks": {"key_cache": [torch.full((2, 1, 3), 2.0)], "value_cache": [torch.full((2, 1, 3), 2.0)]}},
        ]
        merged = host._merge_rank_sharded_kv_payloads(payloads)
        self.assertEqual(tuple(merged["layer_blocks"]["key_cache"][0].shape), (2, 2, 3))
        self.assertTrue(torch.equal(merged["layer_blocks"]["key_cache"][0][:, 0], torch.ones(2, 3)))
        self.assertTrue(torch.equal(merged["layer_blocks"]["key_cache"][0][:, 1], torch.full((2, 3), 2.0)))
        host.shutdown_omni_connectors()

    def test_slice_rank_sharded_payload_splits_head_dimension(self):
        """With from_tp=2, to_tp=4, local rank 1 keeps columns 2:4 of axis 1."""
        host = self._make_host(from_tp=2, to_tp=4, local_rank=1)
        payload = {
            "layer_blocks": {
                "key_cache": [torch.arange(24, dtype=torch.float32).reshape(2, 4, 3)],
                "value_cache": [torch.arange(24, dtype=torch.float32).reshape(2, 4, 3)],
            },
            "metadata": {},
        }
        sliced = host._slice_rank_sharded_kv_payload(payload)
        self.assertEqual(tuple(sliced["layer_blocks"]["key_cache"][0].shape), (2, 2, 3))
        expected = torch.arange(24, dtype=torch.float32).reshape(2, 4, 3)[:, 2:4, :]
        self.assertTrue(torch.equal(sliced["layer_blocks"]["key_cache"][0], expected))
        host.shutdown_omni_connectors()


class TestAttachOmniConnectorOutput(unittest.TestCase):
    """Attaching connector output to a model-runner output object."""

    def test_wraps_empty_model_runner_output_when_signals_exist(self):
        """The shared empty-output object is not returned as-is when signals exist."""
        from vllm.v1.worker.gpu_model_runner import EMPTY_MODEL_RUNNER_OUTPUT

        # No init_omni_connectors() here: get_omni_connector_output is stubbed
        # directly on the instance.
        host = MixinHost()
        host.get_omni_connector_output = lambda: OmniConnectorOutput(chunk_ready_req_ids={"req-1"})

        wrapped = host.attach_omni_connector_output(EMPTY_MODEL_RUNNER_OUTPUT)

        self.assertIsNot(wrapped, EMPTY_MODEL_RUNNER_OUTPUT)
        self.assertEqual(wrapped.omni_connector_output.chunk_ready_req_ids, {"req-1"})


class TestConnectorConfigValidation(unittest.TestCase):
    """Validation of ``stage_connector_config`` during initialisation."""

    def test_invalid_connector_name_raises(self):
        """A whitespace-only connector name makes init raise RuntimeError."""
        host = MixinHost()
        model_config = _make_model_config(stage_id=1)
        # NOTE(review): whitespace in this chunk was collapsed, so the literal
        # may originally have held more than one space — confirm against the
        # source file.
        model_config.stage_connector_config = {"name": " "}

        with self.assertRaisesRegex(RuntimeError, "missing connector name"):
            host.init_omni_connectors(vllm_config=None, model_config=model_config)


class _FailingConnector:
    """Connector whose put() fails a configurable number of times."""

    def __init__(self, fail_count: int = 1, raise_on_fail: bool = False):
        """Store how many initial put() calls fail and whether they raise.

        Args:
            fail_count: Number of leading put() calls that fail.
            raise_on_fail: If true a failing put() raises ConnectionError;
                otherwise it returns ``(False, 0, None)``.
        """
        self._fail_count = fail_count
        self._raise_on_fail = raise_on_fail
        # Total number of put() calls seen so far.
        self.attempt = 0

    def put(self, from_stage, to_stage, put_key, data):
        """Fail for the first ``fail_count`` calls, then report success.

        Returns:
            ``(False, 0, None)`` on a non-raising failure, otherwise
            ``(True, len(str(data)), None)``.

        Raises:
            ConnectionError: On a failing call when ``raise_on_fail`` is set.
        """
        self.attempt += 1
        if self.attempt <= self._fail_count:
            if self._raise_on_fail:
                raise ConnectionError("transient connector error")
            return False, 0, None
        return True, len(str(data)), None

    def get(self, *a, **kw):
        """Always return None; this stub never has data to hand back."""
        return None

    def close(self):
        """No-op; the stub holds no resources."""
        pass


class TestSendRetry(unittest.TestCase):
    """Tests for P1-2: failed connector sends must be retried."""

    def _make_sender(self, connector):
        """Build a stage-0 async-chunk host that sends through ``connector``."""
        sender = MixinHost()
        sender.init_omni_connectors(
            vllm_config=None,
            model_config=_make_model_config(stage_id=0, async_chunk=True),
        )
        sender._omni_connector = connector
sender._stage_id = 0 + sender._async_chunk = True + return sender + + def _make_task(self, req_id="r1"): + return { + "stage_id": 0, + "next_stage_id": 1, + "request_id": req_id, + "data": {"payload": "test"}, + } + + def test_send_single_request_returns_false_on_put_failure(self): + connector = _FailingConnector(fail_count=999) + sender = self._make_sender(connector) + + result = sender._send_single_request(self._make_task()) + self.assertFalse(result) + sender.shutdown_omni_connectors() + + def test_send_single_request_does_not_decrement_on_failure(self): + connector = _FailingConnector(fail_count=999) + sender = self._make_sender(connector) + sender._pending_save_counts["r1"] = 1 + + sender._send_single_request(self._make_task()) + self.assertEqual(sender._pending_save_counts.get("r1"), 1, "pending count must NOT be decremented on failure") + sender.shutdown_omni_connectors() + + def test_send_single_request_decrements_on_success(self): + connector = MockConnector(stage_id=0) + sender = self._make_sender(connector) + sender._pending_save_counts["r1"] = 1 + + result = sender._send_single_request(self._make_task()) + self.assertTrue(result) + self.assertNotIn("r1", sender._pending_save_counts, "pending count should be zero/removed on success") + sender.shutdown_omni_connectors() + + def test_requeue_or_drop_requeues_on_first_failure(self): + sender = self._make_sender(MockConnector(stage_id=0)) + task = self._make_task() + + sender._requeue_or_drop_failed_send(task) + + self.assertEqual(task.get("_retry_count"), 1) + with sender._lock: + dq = sender._pending_save_reqs.get("r1") + self.assertIsNotNone(dq) + self.assertEqual(len(dq), 1) + sender.shutdown_omni_connectors() + + def test_requeue_or_drop_drops_after_max_retries(self): + sender = self._make_sender(MockConnector(stage_id=0)) + sender._pending_save_counts["r1"] = 1 + task = self._make_task() + task["_retry_count"] = sender._MAX_SEND_RETRIES # already at max + + sender._requeue_or_drop_failed_send(task) + + 
with sender._lock: + dq = sender._pending_save_reqs.get("r1") + self.assertTrue(dq is None or len(dq) == 0, "task should NOT be re-enqueued after max retries") + self.assertNotIn("r1", sender._pending_save_counts, "pending count should be cleaned up on final drop") + sender.shutdown_omni_connectors() + + def test_save_loop_retries_on_exception(self): + """Integration: _save_loop retries a task when put() raises.""" + from collections import deque + + connector = _FailingConnector(fail_count=1, raise_on_fail=True) + sender = self._make_sender(connector) + task = self._make_task() + + with sender._lock: + sender._pending_save_reqs["r1"] = deque([task]) + sender._pending_save_counts["r1"] = 1 + + sender._stop_event.clear() + + def run_one_loop(): + sender._save_loop() + + sender._stop_event.set() # will exit after one iteration + # Run manually instead of threading + # Simulate: pop task, send fails, requeue + popped_task = None + with sender._lock: + dq = sender._pending_save_reqs.get("r1") + if dq: + popped_task = dq.popleft() + if not dq: + del sender._pending_save_reqs["r1"] + + if popped_task is not None: + success = False + try: + success = sender._send_single_request(popped_task) + except Exception: + pass + if not success: + sender._requeue_or_drop_failed_send(popped_task) + + # After first failure, task should be re-enqueued + with sender._lock: + dq = sender._pending_save_reqs.get("r1") + self.assertIsNotNone(dq) + self.assertEqual(len(dq), 1) + requeued = dq[0] + self.assertEqual(requeued.get("_retry_count"), 1) + + # Second attempt should succeed (connector now returns True) + success = sender._send_single_request(requeued) + self.assertTrue(success) + sender.shutdown_omni_connectors() + + +if __name__ == "__main__": + unittest.main() diff --git a/vllm_omni/core/sched/omni_scheduling_coordinator.py b/vllm_omni/core/sched/omni_scheduling_coordinator.py new file mode 100644 index 00000000000..c9d891afb41 --- /dev/null +++ 
b/vllm_omni/core/sched/omni_scheduling_coordinator.py @@ -0,0 +1,380 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Scheduling-side coordination for chunk and full_payload input waiting. + +Manages WAITING_FOR_CHUNK and WAITING_FOR_INPUT state transitions +based on readiness signals from OmniConnectorOutput, without ever +calling connector.put()/get(). + +This replaces the scheduling half of OmniChunkTransferAdapter; the +transport half lives in OmniConnectorModelRunnerMixin. +""" + +from __future__ import annotations + +import time +from collections import deque +from typing import Any + +from vllm.logger import init_logger +from vllm.v1.request import Request, RequestStatus + +logger = init_logger(__name__) + + +class OmniSchedulingCoordinator: + """Pure-scheduling coordinator for chunk and full_payload input waiting. + + The Scheduler owns an instance of this class. It consumes readiness + signals produced by the Model Runner's ``OmniConnectorModelRunnerMixin`` + (via ``OmniConnectorOutput``) and manages ``WAITING_FOR_CHUNK`` and + ``WAITING_FOR_INPUT`` state transitions accordingly. + """ + + def __init__(self, scheduler_max_num_seqs: int, stage_id: int = 0, async_chunk: bool = False): + self._stage_id = stage_id + self._scheduler_max_num_seqs = scheduler_max_num_seqs + self._async_chunk = async_chunk + + self.finished_requests: set[str] = set() + self.requests_with_ready_chunks: set[str] = set() + self._full_payload_input_received: set[str] = set() + + self._waiting_for_chunk_waiting: deque[Any] = deque() + self._waiting_for_chunk_running: deque[Any] = deque() + + # Request IDs that were newly registered for chunk recv this cycle. + # The engine/Model Runner should call register_chunk_recv() for these + # so the bg thread starts polling. + self.pending_chunk_registrations: list[Any] = [] + + # Requests waiting for full_payload stage input (WAITING_FOR_INPUT). 
+ self._waiting_for_input: deque[Any] = deque() + self.pending_input_registrations: list[Any] = [] + + # Monotonic timestamp recording when each request first entered + # WAITING_FOR_CHUNK or WAITING_FOR_INPUT. Used by + # collect_timed_out_request_ids() to detect orphaned waits. + self._waiting_since: dict[str, float] = {} + + # ------------------------------------------------------------------ # + # Core scheduling methods + # ------------------------------------------------------------------ # + + def process_pending_chunks( + self, + waiting_queue: Any, + running_queue: list[Request], + chunk_ready_req_ids: set[str], + chunk_finished_req_ids: set[str], + ) -> None: + """Transition requests whose chunks have arrived. + + Args: + waiting_queue: Scheduler's waiting request queue. + running_queue: Scheduler's running request list. + chunk_ready_req_ids: IDs with a newly arrived chunk this cycle. + chunk_finished_req_ids: IDs whose final chunk has arrived. + """ + if self._stage_id == 0 or not self._async_chunk: + return + + terminal_ready_req_ids = chunk_ready_req_ids.intersection(chunk_finished_req_ids) + self.finished_requests.update(chunk_finished_req_ids - terminal_ready_req_ids) + self.pending_chunk_registrations = [] + + self._process_chunk_queue( + waiting_queue, + self._waiting_for_chunk_waiting, + RequestStatus.WAITING, + chunk_ready_req_ids, + ) + self._process_chunk_queue( + running_queue, + self._waiting_for_chunk_running, + RequestStatus.RUNNING, + chunk_ready_req_ids, + ) + self.finished_requests.update(terminal_ready_req_ids) + + while len(running_queue) > self._scheduler_max_num_seqs: + request = running_queue.pop() + # Must reset status to WAITING so the scheduler treats it as + # schedulable work. KV blocks are NOT freed here (unlike a + # real preemption), so PREEMPTED would be incorrect. 
+ request.status = RequestStatus.WAITING + waiting_queue.prepend_requests([request]) + + def process_pending_full_payload_inputs( + self, + waiting_queue: Any, + running_queue: list[Request], + stage_recv_req_ids: set[str], + ) -> None: + """Manage WAITING_FOR_INPUT lifecycle for full_payload_mode. + + For non-Stage-0 stages in full_payload_mode (``async_chunk=False``): + 1. Fresh WAITING requests are transitioned to WAITING_FOR_INPUT + and registered for bg-thread polling. + 2. WAITING_FOR_INPUT requests whose data has arrived (in + ``stage_recv_req_ids``) are transitioned back to WAITING. + """ + if self._stage_id == 0: + return + + self._full_payload_input_received.update(stage_recv_req_ids) + if not self._async_chunk and stage_recv_req_ids: + self.finished_requests.update(stage_recv_req_ids) + logger.debug( + "[Coordinator stage-%s] full_payload recv -> finished_requests: %s", + self._stage_id, + stage_recv_req_ids, + ) + self.pending_input_registrations = [] + + remaining: deque[Any] = deque() + for request in self._waiting_for_input: + if request.request_id in stage_recv_req_ids: + request.status = RequestStatus.WAITING + self._waiting_since.pop(request.request_id, None) + waiting_queue.add_request(request) + else: + remaining.append(request) + self._waiting_for_input = remaining + + if not self._async_chunk: + to_remove: list[Any] = [] + queue_snapshot = list(waiting_queue) + for request in queue_snapshot: + if request.status == RequestStatus.WAITING: + if request.request_id in self._full_payload_input_received: + continue + if request.request_id in self.requests_with_ready_chunks: + continue + if request.request_id in self.finished_requests: + continue + request.status = RequestStatus.WAITING_FOR_INPUT + self._waiting_since.setdefault(request.request_id, time.monotonic()) + to_remove.append(request) + self._waiting_for_input.append(request) + self.pending_input_registrations.append(request) + elif request.status == RequestStatus.WAITING_FOR_INPUT: + if 
request.request_id in stage_recv_req_ids: + request.status = RequestStatus.WAITING + self._waiting_since.pop(request.request_id, None) + else: + to_remove.append(request) + self._waiting_for_input.append(request) + self.pending_input_registrations.append(request) + for request in to_remove: + waiting_queue.remove(request) + + def process_pending_full_payload_inputs_legacy( + self, + waiting_queue: Any, + running_queue: list[Request], + stage_recv_req_ids: set[str], + ) -> None: + """Compatibility wrapper for ``process_pending_full_payload_inputs``.""" + self.process_pending_full_payload_inputs(waiting_queue, running_queue, stage_recv_req_ids) + + def free_finished_request(self, request_id: str) -> None: + """Prune internal tracking sets for a freed request to prevent unbounded growth.""" + self._full_payload_input_received.discard(request_id) + self.finished_requests.discard(request_id) + self.requests_with_ready_chunks.discard(request_id) + self._waiting_since.pop(request_id, None) + + def collect_timed_out_request_ids( + self, + timeout_s: float, + ) -> set[str]: + """Return IDs of requests that have been waiting longer than *timeout_s*. + + Uses ``_waiting_since`` timestamps (always up-to-date) to detect + timed-out requests. This method is safe to call at any point in + the scheduling cycle — it does **not** rely on coordinator internal + queues (which are empty after ``restore_queues()``). + + Clears ``_waiting_since`` for timed-out IDs and defensively removes + them from coordinator internal queues if present. The caller + (scheduler) should then remove the requests from its queues, + set ``FINISHED_ERROR``, and call ``_free_request()`` so that + ``cleanup_finished_request()`` fires in the model runner mixin. 
+ """ + if timeout_s <= 0: + return set() + now = time.monotonic() + timed_out_ids: set[str] = set() + for req_id, start_time in self._waiting_since.items(): + if now - start_time > timeout_s: + timed_out_ids.add(req_id) + if not timed_out_ids: + return set() + + # Defensively remove from coordinator internal queues (may already + # be empty if restore_queues() has run). + for queue_attr in ( + "_waiting_for_chunk_waiting", + "_waiting_for_chunk_running", + "_waiting_for_input", + ): + queue = getattr(self, queue_attr) + remaining: deque[Any] = deque() + for request in queue: + if request.request_id not in timed_out_ids: + remaining.append(request) + setattr(self, queue_attr, remaining) + + for req_id in timed_out_ids: + self._waiting_since.pop(req_id, None) + logger.warning( + "[Coordinator stage-%s] Request %s timed out waiting for chunk/input (waited > %.0fs)", + self._stage_id, + req_id, + timeout_s, + ) + + return timed_out_ids + + def restore_queues( + self, + waiting_queue: Any, + running_queue: list[Request], + ) -> None: + """Return waiting-for-chunk/input requests to scheduling queues.""" + for request in self._waiting_for_chunk_waiting: + waiting_queue.add_request(request) + self._waiting_for_chunk_waiting = deque() + + if self._waiting_for_chunk_running: + running_queue.extend(self._waiting_for_chunk_running) + self._waiting_for_chunk_running = deque() + + for request in self._waiting_for_input: + waiting_queue.add_request(request) + self._waiting_for_input = deque() + + def update_request_metadata( + self, + requests: dict[str, Request], + request_metadata: dict[str, dict[str, Any]], + model_mode: str = "ar", + ) -> None: + """Apply received scheduling metadata to request objects. + + For AR mode: only scheduler-visible metadata is applied locally. + For Generation mode: updates ``request.prompt_token_ids``. + + Additionally, if the payload contains ``next_stage_prompt_len``, + updates the request's ``prompt_token_ids`` to the correct length. 
+ """ + for req_id, metadata in request_metadata.items(): + request = requests.get(req_id) + if request is None: + continue + + # Handle next_stage_prompt_len if present (for models like Qwen3-Omni). + # Only apply when the request has not started decoding yet + # (no output tokens). Resetting a mid-decode request would + # destroy generated tokens and desync KV cache state. + if "next_stage_prompt_len" in metadata: + next_len = metadata["next_stage_prompt_len"] + if isinstance(next_len, int) and next_len > 0: + output_token_ids = getattr(request, "_output_token_ids", None) + has_decode_output = output_token_ids is not None and len(output_token_ids) > 0 + if has_decode_output: + logger.debug( + "[Coordinator stage-%s] Skipping prompt resize for req %s: " + "request already has %s output tokens", + self._stage_id, + req_id, + len(output_token_ids), + ) + else: + current_prompt_ids = getattr(request, "prompt_token_ids", []) or [] + current_prompt_len = len(current_prompt_ids) + if current_prompt_len != next_len or getattr(request, "num_prompt_tokens", None) != next_len: + new_prompt = [0] * next_len + request.prompt_token_ids = new_prompt + request.num_prompt_tokens = next_len + request._all_token_ids.clear() + request._all_token_ids.extend(new_prompt) + request._output_token_ids.clear() + request.num_computed_tokens = 0 + logger.debug( + "[Coordinator stage-%s] Updated prompt_token_ids length to %s for req %s", + self._stage_id, + next_len, + req_id, + ) + + if model_mode != "ar": + new_ids = metadata.get("code_predictor_codes", []) + runtime_seed = None + if "left_context_size" in metadata: + runtime_seed = { + "left_context_size": metadata["left_context_size"], + } + request._omni_initial_model_buffer = runtime_seed + if new_ids: + request.prompt_token_ids = new_ids + request.num_computed_tokens = 0 + + def postprocess_scheduler_output( + self, + scheduler_output: Any, + requests: dict[str, Request] | None = None, + ) -> None: + """Clear per-cycle ready state 
after scheduler output is materialized.""" + self._clear_chunk_ready(scheduler_output) + + # ------------------------------------------------------------------ # + # Internal helpers + # ------------------------------------------------------------------ # + + def _process_chunk_queue( + self, + queue: Any, + waiting_for_chunk_list: deque[Any], + target_status: RequestStatus, + chunk_ready_req_ids: set[str], + ) -> None: + queue_snapshot = list(queue) + for request in queue_snapshot: + if request.status != RequestStatus.WAITING_FOR_CHUNK: + if request.request_id in self.requests_with_ready_chunks: + continue + if request.request_id in self.finished_requests: + continue + if request.status == RequestStatus.WAITING_FOR_INPUT: + continue + if request.request_id in chunk_ready_req_ids: + self.requests_with_ready_chunks.add(request.request_id) + continue + self.pending_chunk_registrations.append(request) + request.status = RequestStatus.WAITING_FOR_CHUNK + self._waiting_since.setdefault(request.request_id, time.monotonic()) + else: + if request.request_id in chunk_ready_req_ids: + request.status = target_status + self.requests_with_ready_chunks.add(request.request_id) + self._waiting_since.pop(request.request_id, None) + continue + queue.remove(request) + waiting_for_chunk_list.append(request) + + def _clear_chunk_ready(self, scheduler_output: Any) -> None: + if scheduler_output.scheduled_new_reqs: + for req_data in scheduler_output.scheduled_new_reqs: + self.requests_with_ready_chunks.discard( + getattr(req_data, "req_id", None), + ) + + if scheduler_output.scheduled_cached_reqs: + for req_id in scheduler_output.scheduled_cached_reqs.req_ids: + self.requests_with_ready_chunks.discard(req_id) + + +# Backward-compatible alias +ChunkSchedulingCoordinator = OmniSchedulingCoordinator diff --git a/vllm_omni/diffusion/worker/diffusion_model_runner.py b/vllm_omni/diffusion/worker/diffusion_model_runner.py index 32ea5bf64dc..535f053c388 100644 --- 
a/vllm_omni/diffusion/worker/diffusion_model_runner.py +++ b/vllm_omni/diffusion/worker/diffusion_model_runner.py @@ -35,11 +35,12 @@ from vllm_omni.diffusion.worker.utils import DiffusionRequestState, RunnerOutput from vllm_omni.distributed.omni_connectors.kv_transfer_manager import OmniKVTransferManager from vllm_omni.platforms import current_omni_platform +from vllm_omni.worker.omni_connector_model_runner_mixin import OmniConnectorModelRunnerMixin logger = init_logger(__name__) -class DiffusionModelRunner: +class DiffusionModelRunner(OmniConnectorModelRunnerMixin): """ Model runner that handles model loading and execution for diffusion models. diff --git a/vllm_omni/outputs.py b/vllm_omni/outputs.py index 9a7bb670658..2c2c1d21c11 100644 --- a/vllm_omni/outputs.py +++ b/vllm_omni/outputs.py @@ -9,6 +9,33 @@ from vllm_omni.inputs.data import OmniPromptType +@dataclass +class OmniConnectorOutput: + """Communication results from Model Runner to Scheduler. + + Carries transfer readiness signals so the Scheduler can make scheduling + decisions without ever calling connector.put()/get() directly. + + Attributes: + chunk_ready_req_ids: Request IDs with newly arrived chunks this cycle. + chunk_finished_req_ids: Request IDs whose final chunk has arrived. + request_metadata: Lightweight scheduling metadata keyed by request ID + (e.g. next_stage_prompt_len, code_predictor_codes, left_context_size). + Full payloads are owned by the Model Runner's local cache. + kv_sent_req_ids: Request IDs whose KV cache was successfully sent. + stage_recv_req_ids: Request IDs that received batch stage inputs. + has_pending_kv_work: True if the mixin has pending, active, or + completed KV transfers that the scheduler should account for. 
+ """ + + chunk_ready_req_ids: set[str] = field(default_factory=set) + chunk_finished_req_ids: set[str] = field(default_factory=set) + request_metadata: dict[str, dict[str, Any]] = field(default_factory=dict) + kv_sent_req_ids: list[str] = field(default_factory=list) + stage_recv_req_ids: set[str] = field(default_factory=set) + has_pending_kv_work: bool = False + + class OmniModelRunnerOutput(ModelRunnerOutput): """Model runner output for omni models. @@ -24,6 +51,7 @@ class OmniModelRunnerOutput(ModelRunnerOutput): # IDs of requests whose KV cache has been extracted from GPU/NPU to CPU. # The Scheduler can safely free the block tables for these requests. kv_extracted_req_ids: list[str] | None = None + omni_connector_output: OmniConnectorOutput | None = None @dataclass diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 72e745fb172..868140d265b 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -40,6 +40,7 @@ from vllm_omni.distributed.omni_connectors.kv_transfer_manager import OmniKVTransferManager from vllm_omni.outputs import OmniModelRunnerOutput from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner +from vllm_omni.worker.omni_connector_model_runner_mixin import OmniConnectorModelRunnerMixin logger = init_logger(__name__) @@ -60,7 +61,7 @@ class ExecuteModelState(NamedTuple): slot_mappings: dict[str, torch.Tensor] | list[dict[str, torch.Tensor]] | None = None -class GPUARModelRunner(OmniGPUModelRunner): +class GPUARModelRunner(OmniGPUModelRunner, OmniConnectorModelRunnerMixin): """Autoregressive GPU model runner that returns hidden states per request. 
Follows the v0.12 two-phase execute/sample flow from GPUModelRunner, and diff --git a/vllm_omni/worker/gpu_generation_model_runner.py b/vllm_omni/worker/gpu_generation_model_runner.py index d95b676f6d6..f10115c8e90 100644 --- a/vllm_omni/worker/gpu_generation_model_runner.py +++ b/vllm_omni/worker/gpu_generation_model_runner.py @@ -39,11 +39,12 @@ from vllm_omni.outputs import OmniModelRunnerOutput from vllm_omni.worker.gpu_ar_model_runner import ExecuteModelState from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner +from vllm_omni.worker.omni_connector_model_runner_mixin import OmniConnectorModelRunnerMixin logger = logging.getLogger(__name__) -class GPUGenerationModelRunner(OmniGPUModelRunner): +class GPUGenerationModelRunner(OmniGPUModelRunner, OmniConnectorModelRunnerMixin): """Generation model runner for vLLM-Omni (non-autoregressive). - Reuses GPUModelRunner preparation, multimodal handling, and TP/PP/DP glue. diff --git a/vllm_omni/worker/omni_connector_model_runner_mixin.py b/vllm_omni/worker/omni_connector_model_runner_mixin.py new file mode 100644 index 00000000000..e0df3ba3d7a --- /dev/null +++ b/vllm_omni/worker/omni_connector_model_runner_mixin.py @@ -0,0 +1,2125 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unified data-plane communication mixin for Model Runners. + +All connector.put()/get() calls are consolidated here. Background I/O +threads handle async_chunk and full_payload_mode transfers; KV cache is delegated to +the existing OmniKVTransferManager (to be absorbed later). + +The mixin reports transfer results via OmniConnectorOutput so that the +Scheduler can make scheduling decisions without ever touching a connector. 
+""" + +from __future__ import annotations + +import importlib +import inspect +import os +import threading +from collections import defaultdict, deque +from types import SimpleNamespace +from typing import TYPE_CHECKING, Any + +import torch +from vllm.distributed.parallel_state import get_tp_group +from vllm.logger import init_logger + +from vllm_omni.distributed.omni_connectors.factory import OmniConnectorFactory +from vllm_omni.distributed.omni_connectors.utils.config import ConnectorSpec +from vllm_omni.outputs import OmniConnectorOutput +from vllm_omni.worker.payload_span import ( + THINKER_DECODE_EMBEDDINGS_KEY, + THINKER_DECODE_TOKEN_END_KEY, + THINKER_DECODE_TOKEN_START_KEY, + THINKER_OUTPUT_TOKEN_IDS_KEY, + get_tensor_span, + merge_tensor_spans, +) + +if TYPE_CHECKING: + from vllm_omni.distributed.omni_connectors.connectors.base import ( + OmniConnectorBase, + ) + from vllm_omni.distributed.omni_connectors.kv_transfer_manager import ( + OmniKVTransferManager, + ) + +logger = init_logger(__name__) + + +class OmniConnectorModelRunnerMixin: + """Unified data-plane communication mixin for Model Runners. + + Provides three transfer modes through a single pair of bg I/O threads: + - **full_payload_mode**: ``recv_full_payload_inputs`` / ``send_full_payload_outputs`` + - **Streaming (async_chunk)**: ``recv_chunk`` / ``send_chunk`` + - **KV cache**: ``send_kv_cache`` / ``recv_kv_cache`` (delegates to + the existing ``OmniKVTransferManager``) + + The mixin owns connector instances and background threads. It never + touches scheduling queues -- readiness is communicated to the Scheduler + via ``OmniConnectorOutput``. 
+ """ + + # ------------------------------------------------------------------ # + # Init / Shutdown + # ------------------------------------------------------------------ # + + def init_omni_connectors( + self, + vllm_config: Any, + model_config: Any, + kv_transfer_manager: OmniKVTransferManager | None = None, + ) -> None: + """Initialize connectors and background threads. + + Args: + vllm_config: Full vLLM config object. + model_config: Stage-level model config with connector settings. + kv_transfer_manager: Existing KV transfer manager to delegate to. + """ + self._omni_connector: OmniConnectorBase | None = self._create_connector(model_config) + self._kv_transfer_manager = kv_transfer_manager + + self._async_chunk: bool = getattr(model_config, "async_chunk", False) + self._model_mode: str = getattr(model_config, "worker_type", "ar") + stage_id = getattr(model_config, "stage_id", 0) + if isinstance(stage_id, str): + stage_id = int(stage_id) + self._stage_id: int = stage_id if isinstance(stage_id, int) else 0 + + self._custom_process_func_path, self._custom_process_func = self._load_custom_func(model_config) + self._custom_process_supports_is_finished = self._custom_process_supports_is_finished_kwarg() + logger.info( + "[Stage-%s] init_omni_connectors: async_chunk=%s, custom_process_func=%s, connector=%s, func_path=%s", + self._stage_id, + self._async_chunk, + self._custom_process_func, + type(self._omni_connector).__name__ if self._omni_connector else None, + self._custom_process_func_path, + ) + + # -- next stage ID (from connector config or default stage_id + 1) -- + self._next_stage_id: int = self._resolve_next_stage_id(model_config) + + # -- heterogeneous TP rank support -- + rank_cfg = self._parse_rank_mapping(model_config) + self._from_tp: int = rank_cfg["from_tp"] + self._to_tp: int = rank_cfg["to_tp"] + self._local_rank: int = rank_cfg["local_rank"] + if self._kv_transfer_manager is not None: + self._kv_transfer_manager.kv_send_key_builder = 
self.get_rank_aware_kv_send_keys + self._kv_transfer_manager.kv_recv_key_builder = self.get_rank_aware_kv_keys + self._kv_transfer_manager.kv_payload_merger = self._merge_rank_sharded_kv_payloads + self._kv_transfer_manager.kv_payload_slicer = self._slice_rank_sharded_kv_payload + + # -- chunk index tracking (ported from OmniChunkTransferAdapter) -- + self._put_req_chunk: dict[str, int] = defaultdict(int) + self._get_req_chunk: dict[str, int] = defaultdict(int) + # Send-side async accumulation / staging buffer. Receive-side payload + # ownership lives in ``_local_stage_payload_cache``. + self._send_side_request_payload: dict[str, dict[str, Any]] = {} + self._code_prompt_token_ids: dict[str, list[list[int]]] = defaultdict(list) + self._request_ids_mapping: dict[str, str] = {} + + # -- async I/O state (shared by chunk + full_payload_mode) -- + self._pending_load_reqs: dict[str, Any] = {} + self._finished_load_reqs: set[str] = set() + self._pending_save_reqs: dict[str, deque] = {} + self._pending_save_counts: dict[str, int] = defaultdict(int) + self._deferred_send_cleanup: set[str] = set() + # -- per-cycle output accumulator -- + self._chunk_ready_req_ids: set[str] = set() + self._chunk_finished_req_ids: set[str] = set() + self._stage_recv_req_ids: set[str] = set() + self._full_payload_pending_broadcast_req_ids: set[str] = set() + self._async_chunk_updated_req_ids: set[str] = set() + + # -- Model Runner local payload cache (RFC §2.4) -- + # Full stage payloads land here first on the recv side. We + # intentionally do not write connector recv results straight into + # `model_intermediate_buffer`: runner-owned runtime state is + # materialized later by `_sync_local_stage_payloads()` on the + # model thread. This keeps recv timing separate from execute-step + # visibility and avoids mixing connector I/O with model runtime + # ownership. + self._local_stage_payload_cache: dict[str, dict[str, Any]] = {} + # Lightweight scheduling metadata pending delivery to the Scheduler. 
+ self._local_request_metadata: dict[str, dict[str, Any]] = {} + + # -- persistent set of request IDs whose chunk stream is complete -- + # Prevents re-registration after the finish sentinel has been received. + self._chunk_stream_completed: set[str] = set() + + # -- full_payload_mode: accumulate latest pooler_output per request, + # send only when the request finishes (next-cycle flush) -- + self._pending_full_payload_send: dict[str, tuple[Any, Any]] = {} + + # -- KV sent accumulator -- + self._kv_sent_req_ids: list[str] = [] + + # -- KV transfer lifecycle (absorbed from scheduler) -- + # Requests marked for KV transfer: {req_id: {seq_len, block_ids}} + self._kv_pending_transfers: dict[str, dict[str, Any]] = {} + # Requests whose KV transfer has been submitted but not yet acked + self._kv_active_transfers: set[str] = set() + # Requests whose KV transfer is complete (acked by kv_extracted_req_ids) + self._kv_completed_transfers: set[str] = set() + # Dedup guard: requests that have already triggered KV transfer + self._kv_triggered_requests: set[str] = set() + + self._lock = threading.Lock() + self._stop_event = threading.Event() + self._work_available = threading.Event() + + # Start background threads only when there's a connector + self._recv_thread: threading.Thread | None = None + self._save_thread: threading.Thread | None = None + if self._omni_connector is not None: + self._recv_thread = threading.Thread( + target=self._recv_loop, + daemon=True, + name="omni-mixin-recv", + ) + self._recv_thread.start() + self._save_thread = threading.Thread( + target=self._save_loop, + daemon=True, + name="omni-mixin-save", + ) + self._save_thread.start() + + def shutdown_omni_connectors(self) -> None: + """Stop background threads and release connector resources.""" + self._stop_event.set() + if self._recv_thread is not None: + self._recv_thread.join(timeout=5) + if self._save_thread is not None: + self._save_thread.join(timeout=5) + if self._omni_connector is not None: + try: 
+ self._omni_connector.close() + except Exception: + pass + + def cleanup_finished_request(self, req_id: str) -> None: + """Clean up per-request state after a request is fully finished. + + Call this when a request is freed from the model runner to prevent + memory leaks in the mixin's tracking dicts/sets. The external + request ID is resolved before cleaning up ``_put_req_chunk`` which + is keyed by external ID. + """ + ext_id = self._request_ids_mapping.pop(req_id, None) + send_req_id = ext_id if ext_id is not None else req_id + + with self._lock: + if self._pending_save_counts.get(send_req_id, 0): + self._deferred_send_cleanup.add(send_req_id) + else: + self._put_req_chunk.pop(send_req_id, None) + self._send_side_request_payload.pop(send_req_id, None) + self._code_prompt_token_ids.pop(send_req_id, None) + self._kv_pending_transfers.pop(req_id, None) + self._kv_active_transfers.discard(req_id) + self._kv_completed_transfers.discard(req_id) + self._kv_triggered_requests.discard(req_id) + self._cleanup_recv_delivery_state(req_id) + + def drop_inactive_request_delivery_state(self, req_id: str) -> None: + """Clear recv-side state for inactive requests.""" + ext_id = self._request_ids_mapping.pop(req_id, None) + if hasattr(self, "_lock"): + with self._lock: + self._drop_send_side_payload_state(req_id, ext_id) + else: + self._drop_send_side_payload_state(req_id, ext_id) + self._cleanup_recv_delivery_state(req_id) + + def _drop_send_side_payload_state(self, req_id: str, ext_id: str | None) -> None: + if ext_id is not None: + self._send_side_request_payload.pop(ext_id, None) + self._send_side_request_payload.pop(req_id, None) + + def _cleanup_recv_delivery_state(self, req_id: str) -> None: + """Clear recv-side delivery-cycle state.""" + if hasattr(self, "_lock"): + with self._lock: + self._clear_recv_delivery_state(req_id) + else: + self._clear_recv_delivery_state(req_id) + + def _clear_recv_delivery_state(self, req_id: str) -> None: + self._get_req_chunk.pop(req_id, 
None) + self._pending_load_reqs.pop(req_id, None) + self._finished_load_reqs.discard(req_id) + self._chunk_ready_req_ids.discard(req_id) + self._chunk_finished_req_ids.discard(req_id) + self._chunk_stream_completed.discard(req_id) + self._stage_recv_req_ids.discard(req_id) + self._full_payload_pending_broadcast_req_ids.discard(req_id) + self._async_chunk_updated_req_ids.discard(req_id) + self._local_stage_payload_cache.pop(req_id, None) + self._local_request_metadata.pop(req_id, None) + + def prune_inactive_requests(self, active_req_ids: Any) -> set[str]: + """Drop connector state for requests that no longer exist locally. + + Preempted / unscheduled requests are expected to stay in + ``self.requests`` and therefore remain untouched. This only prunes + stale request IDs that have already fallen out of the active request + map, preventing background recv/send bookkeeping from outliving the + request lifecycle. + """ + if active_req_ids is None: + return set() + + active_req_ids = set(active_req_ids) + pending_req_ids = set(getattr(self, "_pending_load_reqs", {}).keys()) + received_req_ids = set(getattr(self, "_stage_recv_req_ids", set())) + received_req_ids.update(getattr(self, "_full_payload_pending_broadcast_req_ids", set())) + received_req_ids.update(getattr(self, "_local_request_metadata", {}).keys()) + # Pending recv requests may not yet be in the caller's active set + # (e.g. WAITING_FOR_CHUNK requests live in the coordinator's internal + # queues, not in model runner self.requests). Protect them so that + # legitimate waiting requests are not pruned. + # + # Likewise, a full payload can arrive on the background recv thread + # after the scheduler_output snapshot for the current execute_model() + # cycle was already materialized. Those requests may briefly live only + # in recv-side buffers/local cache until the next scheduler cycle wakes + # them up; pruning them here drops the payload before stage_recv can be + # published. 
+ active_req_ids.update(pending_req_ids) + active_req_ids.update(received_req_ids) + stale_req_ids: set[str] = set() + + # NOTE: _pending_load_reqs is excluded from the scan list because + # all its entries are unconditionally protected above. The mixin + # cannot distinguish a legitimately-waiting pending recv from an + # orphaned one (only the coordinator/scheduler knows). + # + # Requests with freshly received full payloads / local stage payloads + # are also protected above. Their scheduler wake-up may lag the recv + # thread by one execute_model() cycle, especially when the request was + # added after the current scheduler_output snapshot. + # + # Orphaned pending recv entries (e.g. from upstream stage crash) + # are handled by OmniSchedulingCoordinator.collect_timed_out_request_ids() + # which detects wait-time violations. The scheduler then removes the + # request from its queues, sets FINISHED_ERROR, and calls _free_request() + # which ultimately triggers cleanup_finished_request() here. 
+ for attr_name in ( + "_request_ids_mapping", + "_get_req_chunk", + "_finished_load_reqs", + "_chunk_ready_req_ids", + "_chunk_finished_req_ids", + "_chunk_stream_completed", + "_stage_recv_req_ids", + "_full_payload_pending_broadcast_req_ids", + "_async_chunk_updated_req_ids", + "_local_stage_payload_cache", + "_local_request_metadata", + "_kv_pending_transfers", + "_kv_active_transfers", + "_kv_completed_transfers", + "_kv_triggered_requests", + ): + state = getattr(self, attr_name, None) + if isinstance(state, dict): + stale_req_ids.update(req_id for req_id in state if req_id not in active_req_ids) + elif isinstance(state, set): + stale_req_ids.update(req_id for req_id in state if req_id not in active_req_ids) + + for req_id in stale_req_ids: + self.cleanup_finished_request(req_id) + + return stale_req_ids + + # ------------------------------------------------------------------ # + # Local payload cache (RFC §2.4 – Model Runner ownership) + # ------------------------------------------------------------------ # + + def put_local_stage_payload(self, req_id: str, payload: dict[str, Any]) -> None: + """Store a full stage payload in the local cache.""" + self._local_stage_payload_cache[req_id] = payload + + def get_local_stage_payload(self, req_id: str) -> dict[str, Any] | None: + """Read a stage payload without removing it.""" + return self._local_stage_payload_cache.get(req_id) + + def pop_local_stage_payload(self, req_id: str) -> dict[str, Any] | None: + """Remove and return a stage payload (consume after use).""" + return self._local_stage_payload_cache.pop(req_id, None) + + def put_local_request_metadata(self, req_id: str, metadata: dict[str, Any]) -> None: + """Store lightweight scheduling metadata for a request.""" + self._local_request_metadata[req_id] = metadata + + def get_local_request_metadata(self, req_id: str) -> dict[str, Any] | None: + """Retrieve scheduling metadata for a request.""" + return self._local_request_metadata.get(req_id) + + # 
------------------------------------------------------------------ # + # Scheduling metadata extraction + # ------------------------------------------------------------------ # + + _SCHEDULING_METADATA_KEYS = ( + "next_stage_prompt_len", + "code_predictor_codes", + "left_context_size", + ) + + @classmethod + def _extract_scheduling_metadata(cls, payload: dict[str, Any]) -> dict[str, Any]: + """Extract only the fields the scheduler needs from a full payload.""" + return {k: payload[k] for k in cls._SCHEDULING_METADATA_KEYS if k in payload} + + _NON_CONSUMABLE_PAYLOAD_KEYS = { + "finished", + "override_keys", + "next_stage_prompt_len", + "left_context_size", + THINKER_OUTPUT_TOKEN_IDS_KEY, + THINKER_DECODE_TOKEN_START_KEY, + THINKER_DECODE_TOKEN_END_KEY, + } + + @staticmethod + def _payload_value_has_content(value: Any) -> bool: + if value is None: + return False + if isinstance(value, torch.Tensor): + return value.numel() > 0 + if isinstance(value, (list, tuple, dict, set)): + return len(value) > 0 + return True + + @classmethod + def _payload_is_consumable(cls, payload: dict[str, Any] | None) -> bool: + """Return True when an async payload can drive a real forward step. + + Metadata-only wake-ups should not transition WAITING_FOR_CHUNK requests + back to schedulable state. In particular, a widened token horizon without + any newly visible thinker decode embeds should not force a placeholder-only + talker decode step. 
+ """ + if not isinstance(payload, dict) or not payload: + return False + + decode_embeddings = payload.get(THINKER_DECODE_EMBEDDINGS_KEY) + if isinstance(decode_embeddings, torch.Tensor): + if decode_embeddings.ndim == 0: + return True + return decode_embeddings.numel() > 0 and decode_embeddings.shape[0] > 0 + + if "code_predictor_codes" in payload: + code_predictor_codes = payload.get("code_predictor_codes") + if isinstance(code_predictor_codes, torch.Tensor): + return code_predictor_codes.numel() > 0 + # Codec code 0 is valid; non-empty code payloads are consumable. + if hasattr(code_predictor_codes, "__len__"): + return len(code_predictor_codes) > 0 + else: + return code_predictor_codes is not None + + for key, value in payload.items(): + if key in cls._NON_CONSUMABLE_PAYLOAD_KEYS: + continue + if cls._payload_value_has_content(value): + return True + return False + + @staticmethod + def _get_local_tp_group() -> Any | None: + """Return the local TP group when tensor parallelism is initialized.""" + try: + return get_tp_group() + except Exception: + return None + + def _recv_ordinary_stage_result( + self, + connector: OmniConnectorBase, + from_stage: str, + to_stage: str, + connector_get_key: str, + ) -> Any: + """Receive one ordinary non-KV stage payload on the local leader rank only.""" + tp_group = self._get_local_tp_group() + if tp_group is None or getattr(tp_group, "world_size", 1) <= 1: + return connector.get(from_stage, to_stage, connector_get_key) + if not self.is_data_transfer_rank(): + return None + return connector.get(from_stage, to_stage, connector_get_key) + + def _recv_full_payload_result( + self, + connector: OmniConnectorBase, + from_stage: str, + to_stage: str, + connector_get_key: str, + ) -> Any: + """Receive one full-payload transfer on the local leader rank only.""" + return self._recv_ordinary_stage_result( + connector, + from_stage, + to_stage, + connector_get_key, + ) + + def _recv_async_chunk_result( + self, + connector: 
OmniConnectorBase, + from_stage: str, + to_stage: str, + connector_get_key: str, + ) -> Any: + """Receive one ordinary async chunk on the local leader rank only.""" + return self._recv_ordinary_stage_result( + connector, + from_stage, + to_stage, + connector_get_key, + ) + + @staticmethod + def _snapshot_payload(payload: Any) -> Any: + if isinstance(payload, dict): + return dict(payload) + return payload + + def _broadcast_tp_payload_packet(self, packet: Any) -> Any: + """Broadcast one ordinary payload packet from TP rank 0 when TP is active.""" + tp_group = self._get_local_tp_group() + if tp_group is None or getattr(tp_group, "world_size", 1) <= 1: + return packet + leader_packet = packet if self.is_data_transfer_rank() else None + return tp_group.broadcast_object(leader_packet, src=0) + + def _apply_staged_payloads_locked(self, staged_payloads: dict[str, Any]) -> None: + for req_id, payload in staged_payloads.items(): + self._local_stage_payload_cache[req_id] = self._snapshot_payload(payload) + + def _collect_full_payload_results_locked(self) -> dict[str, Any] | None: + if not self._full_payload_pending_broadcast_req_ids: + return None + results: dict[str, Any] = {} + missing_req_ids: list[str] = [] + for req_id in tuple(self._full_payload_pending_broadcast_req_ids): + payload = self._local_stage_payload_cache.get(req_id) + if payload is None: + missing_req_ids.append(req_id) + continue + results[req_id] = self._snapshot_payload(payload) + self._full_payload_pending_broadcast_req_ids.discard(req_id) + if missing_req_ids: + logger.warning( + "[Stage-%s] _collect_full_payload_results_locked: " + "pending full-payload reqs missing from local cache: %s", + self._stage_id, + missing_req_ids, + ) + return results or None + + def _collect_async_chunk_fanout_packet_locked(self) -> dict[str, Any] | None: + payload_req_ids = set(self._async_chunk_updated_req_ids) + payload_req_ids.update(self._finished_load_reqs) + payload_req_ids.update(self._chunk_finished_req_ids) + 
payload_req_ids.update(self._local_request_metadata) + if not ( + payload_req_ids or self._finished_load_reqs or self._chunk_finished_req_ids or self._local_request_metadata + ): + return None + + staged_payloads = { + req_id: self._snapshot_payload(self._local_stage_payload_cache[req_id]) + for req_id in payload_req_ids + if req_id in self._local_stage_payload_cache + } + packet = { + "staged_payloads": staged_payloads, + "request_metadata": dict(self._local_request_metadata), + "newly_finished": set(self._finished_load_reqs), + "chunk_finished": set(self._chunk_finished_req_ids), + } + + self._async_chunk_updated_req_ids.clear() + self._finished_load_reqs.clear() + self._chunk_finished_req_ids.clear() + self._local_request_metadata.clear() + + for req_id in packet["chunk_finished"]: + if req_id not in self._local_stage_payload_cache: + continue + ext_req_id = self._request_ids_mapping.get(req_id, req_id) + self._send_side_request_payload.pop(ext_req_id, None) + if ext_req_id != req_id: + self._send_side_request_payload.pop(req_id, None) + + return packet + + def _apply_async_chunk_fanout_packet(self, packet: dict[str, Any]) -> None: + staged_payloads = packet.get("staged_payloads", {}) + chunk_finished = set(packet.get("chunk_finished", ())) + with self._lock: + self._apply_staged_payloads_locked(staged_payloads) + for req_id in chunk_finished: + self._pending_load_reqs.pop(req_id, None) + self._chunk_stream_completed.add(req_id) + + # ------------------------------------------------------------------ # + # full_payload_mode (recv_full_payload_inputs / send_full_payload_outputs) + # ------------------------------------------------------------------ # + + def recv_full_payload_inputs(self, scheduler_output: Any) -> dict[str, Any] | None: + """Check for incoming full_payload_mode stage inputs (non-blocking). + + Returns a dict mapping ``request_id -> engine_inputs`` for data + that has arrived, or ``None`` if nothing is ready. 
Stores full + payloads in the local cache and extracts scheduling metadata. + """ + with self._lock: + results = self._collect_full_payload_results_locked() if self.is_data_transfer_rank() else None + results = self._broadcast_tp_payload_packet(results) + if not results: + return None + with self._lock: + self._stage_recv_req_ids.update(results.keys()) + for req_id in results: + self._pending_load_reqs.pop(req_id, None) + self._apply_staged_payloads_locked(results) + for req_id, payload in results.items(): + self._local_request_metadata[req_id] = self._extract_scheduling_metadata(payload) + logger.info( + "[Stage-%s] recv_full_payload_inputs: consumed %s reqs: %s, stage_recv_req_ids now=%s", + self._stage_id, + len(results), + list(results.keys()), + self._stage_recv_req_ids, + ) + return results + + @staticmethod + def _is_all_zero_tensor(t: Any) -> bool: + """Return True if *t* is a torch.Tensor whose elements are all zero.""" + return isinstance(t, torch.Tensor) and t.numel() > 0 and not t.any() + + def accumulate_full_payload_output( + self, + req_id: str, + pooler_output: Any, + request: Any, + ) -> None: + """Accumulate pooler_output for a request across steps (full_payload_mode). + + Per-token tensors (2-D+, matching trailing dims) are concatenated + along dim-0. Scalar / global tensors (1-D or 0-D) are replaced + with the latest value. + + All-zero tensors (e.g. ``code_predictor_codes`` emitted during + prefill) are dropped so that they do not pollute downstream stages + with garbage / noise frames. + + The data is actually sent when ``flush_full_payload_outputs`` is called + with the finished request IDs from the next scheduler cycle. 
+ """ + # ---- Filter out all-zero tensors from the incoming pooler_output ---- + filtered: dict[str, Any] = {} + dropped_zero_keys: list[tuple[str, tuple[int, ...]]] = [] + for k, v in pooler_output.items(): + if self._is_all_zero_tensor(v): + dropped_zero_keys.append((k, tuple(v.shape))) + continue # skip prefill zero-filled placeholders + filtered[k] = v + if dropped_zero_keys: + logger.info( + "[Stage-%s] accumulate_full_payload_output: req=%s dropped_zero_keys=%s", + self._stage_id, + req_id, + dropped_zero_keys, + ) + pooler_output = filtered + + existing = self._pending_full_payload_send.get(req_id) + if existing is None: + self._pending_full_payload_send[req_id] = (pooler_output, request) + return + + prev_output, _ = existing + merged: dict[str, Any] = {} + for k in set(prev_output) | set(pooler_output): + v_new = pooler_output.get(k) + v_old = prev_output.get(k) + if v_new is None: + merged[k] = v_old + elif v_old is None: + merged[k] = v_new + elif ( + isinstance(v_new, torch.Tensor) + and isinstance(v_old, torch.Tensor) + and v_new.dim() >= 2 + and v_old.dim() >= 2 + and v_new.shape[1:] == v_old.shape[1:] + ): + merged[k] = torch.cat([v_old, v_new], dim=0) + else: + merged[k] = v_new + self._pending_full_payload_send[req_id] = (merged, request) + + def flush_full_payload_outputs(self, finished_req_ids: set[str]) -> None: + """Send accumulated full_payload outputs for requests that just finished.""" + logger.info( + "[Stage-%s] flush_full_payload_outputs: finished_req_ids=%s, pending=%s", + self._stage_id, + finished_req_ids, + list(self._pending_full_payload_send.keys()), + ) + to_send: dict[str, tuple[Any, Any]] = {} + for req_id in finished_req_ids: + entry = self._pending_full_payload_send.pop(req_id, None) + if entry is not None: + to_send[req_id] = entry + logger.info("[Stage-%s] flush_full_payload_outputs: to_send=%s", self._stage_id, list(to_send.keys())) + if to_send: + self.send_full_payload_outputs(scheduler_output=None, outputs=to_send) + + 
def send_full_payload_outputs( + self, + scheduler_output: Any, + outputs: dict[str, tuple[Any, Any] | Any], + ) -> list[str]: + """Send full_payload stage outputs to the next stage via connector. + + Args: + outputs: Mapping of ``req_id`` to either a + ``(pooling_output, request)`` tuple (preferred) or a raw + payload dict. When a tuple is supplied the request object + is forwarded to ``custom_process_stage_input_func``. + + Returns list of request IDs successfully enqueued. + """ + if self._omni_connector is None: + logger.info("[Stage-%s] send_full_payload_outputs: connector is None, skip", self._stage_id) + return [] + if not self.is_data_transfer_rank(): + logger.info( + "[Stage-%s] send_full_payload_outputs: not data_transfer_rank (rank=%s), skip", + self._stage_id, + self._local_rank, + ) + return list(outputs.keys()) + sent_ids: list[str] = [] + next_stage_id = self._next_stage_id + for req_id, value in outputs.items(): + if isinstance(value, tuple) and len(value) == 2: + raw_output, request = value + else: + raw_output, request = value, None + + payload = raw_output + if self._custom_process_func is not None: + payload = self._build_custom_process_payload( + request_id=req_id, + request=request, + pooling_output=raw_output, + ) + if payload is None: + continue + if payload is None: + logger.info("[Stage-%s] send_full_payload_outputs: payload is None for %s", self._stage_id, req_id) + continue + if isinstance(payload, dict): + code_predictor_codes = payload.get("code_predictor_codes") + if isinstance(code_predictor_codes, torch.Tensor): + code_len = int(code_predictor_codes.numel()) + elif hasattr(code_predictor_codes, "__len__"): + code_len = len(code_predictor_codes) + else: + code_len = None + logger.info( + "[Stage-%s] send_full_payload_outputs: req=%s payload_keys=%s code_len=%s left_context_size=%s", + self._stage_id, + req_id, + sorted(payload.keys()), + code_len, + payload.get("left_context_size"), + ) + + external_req_id = 
self._resolve_external_req_id(request, req_id) + chunk_id = self._put_req_chunk[req_id] + self._put_req_chunk[req_id] += 1 + connector_put_key = f"{external_req_id}_{self._stage_id}_{chunk_id}" + + logger.info( + "[Stage-%s] send_full_payload_outputs: enqueue req=%s put_key=%s next_stage=%s", + self._stage_id, + req_id, + connector_put_key, + next_stage_id, + ) + task = { + "stage_id": self._stage_id, + "next_stage_id": next_stage_id, + "put_key": connector_put_key, + "data": payload, + "request_id": req_id, + } + with self._lock: + self._pending_save_reqs.setdefault(req_id, deque()).append(task) + self._pending_save_counts[req_id] += 1 + sent_ids.append(req_id) + if sent_ids: + self._work_available.set() + return sent_ids + + def recv_stage_inputs(self, scheduler_output: Any) -> dict[str, Any] | None: + """Compatibility wrapper for ``recv_full_payload_inputs``.""" + return self.recv_full_payload_inputs(scheduler_output) + + def accumulate_batch_output( + self, + req_id: str, + pooler_output: Any, + request: Any, + ) -> None: + """Compatibility wrapper for ``accumulate_full_payload_output``.""" + self.accumulate_full_payload_output(req_id, pooler_output, request) + + def flush_batch_outputs(self, finished_req_ids: set[str]) -> None: + """Compatibility wrapper for ``flush_full_payload_outputs``.""" + self.flush_full_payload_outputs(finished_req_ids) + + def send_stage_outputs( + self, + scheduler_output: Any, + outputs: dict[str, tuple[Any, Any] | Any], + ) -> list[str]: + """Compatibility wrapper for ``send_full_payload_outputs``.""" + return self.send_full_payload_outputs(scheduler_output, outputs) + + # ------------------------------------------------------------------ # + # Streaming chunk mode (recv_chunk / send_chunk) + # ------------------------------------------------------------------ # + + def register_chunk_recv(self, request: Any) -> None: + """Register a request for async chunk retrieval by the bg thread. 
+ + Stage-0 has no upstream producer so this is a no-op there. + Skips requests whose batch data has already been received to + prevent the bg thread from polling for non-existent chunks. + """ + if self._stage_id == 0: + return + request_id = request.request_id + self._request_ids_mapping[request_id] = getattr( + request, + "external_req_id", + request_id, + ) + with self._lock: + if request_id in self._stage_recv_req_ids: + return + # Don't re-register if the finish sentinel was already received + if request_id in self._chunk_stream_completed: + return + self._pending_load_reqs[request_id] = request + self._work_available.set() + + def recv_chunk(self) -> dict[str, Any]: + """Collect chunks received by the bg thread since last call. + + Returns a dict ``{request_id: chunk_payload}`` for newly arrived + chunks. Empty dict when nothing is ready. + + This method reads from ``_finished_load_reqs`` without clearing + it -- ``get_omni_connector_output()`` is the sole consumer that + drains and resets ``_finished_load_reqs`` at the end of each + ``execute_model`` cycle. + + Returns **shallow copies** of the cached payloads so that the + caller can read them without racing against the background recv + thread, which may concurrently mutate the live cache entries via + ``dict.update()``. + """ + with self._lock: + finished = set(self._finished_load_reqs) + if not finished: + return {} + # Snapshot the payloads under the lock to avoid racing with + # _poll_single_request which does existing.update(payload_data) + # on the same dict objects. + result = {} + for rid in finished: + payload = self._local_stage_payload_cache.get(rid) + result[rid] = dict(payload) if isinstance(payload, dict) else payload + + self._chunk_ready_req_ids.update(finished) + return result + + def send_chunk( + self, + request: Any, + pooling_output: Any | None = None, + ) -> bool: + """Derive and enqueue one chunk for async sending. 
+ + Payload extraction runs in the caller thread (via + ``custom_process_stage_input_func``); the actual + ``connector.put()`` is done by the background save thread. + Non-KV data is identical across TP ranks; only rank 0 sends. + """ + if self._omni_connector is None: + logger.warning("[Stage-%s] send_chunk: connector is None", self._stage_id) + return False + if not self.is_data_transfer_rank(): + return True + raw_req_id = getattr(request, "request_id", None) or getattr(request, "req_id", None) + request_id = self._resolve_external_req_id(request, raw_req_id) + # Cache the internal→external mapping so that finish sentinels can + # resolve the external ID even after the request is freed. + if raw_req_id and raw_req_id != request_id: + self._request_ids_mapping.setdefault(raw_req_id, request_id) + chunk_id = self._put_req_chunk[request_id] + + payload_data = self._build_custom_process_payload( + request_id=request_id, + request=request, + pooling_output=pooling_output, + ) + if payload_data is None: + if chunk_id == 0: + logger.warning( + "[Stage-%s] send_chunk: payload is None for req=%s chunk=%s (process_func=%s)", + self._stage_id, + request_id, + chunk_id, + self._custom_process_func, + ) + return False + + self._put_req_chunk[request_id] += 1 + next_stage_id = self._next_stage_id + connector_put_key = f"{request_id}_{self._stage_id}_{chunk_id}" + + if chunk_id == 0: + logger.info( + "[Stage-%s] send_chunk: first chunk enqueued, req=%s key=%s", + self._stage_id, + request_id, + connector_put_key, + ) + + task = { + "stage_id": self._stage_id, + "next_stage_id": next_stage_id, + "put_key": connector_put_key, + "data": payload_data, + "request_id": request_id, + } + with self._lock: + self._pending_save_reqs.setdefault(request_id, deque()).append(task) + self._pending_save_counts[request_id] += 1 + self._work_available.set() + return True + + # ------------------------------------------------------------------ # + # KV cache (delegates to OmniKVTransferManager) 
+ # ------------------------------------------------------------------ # + + def send_kv_cache( + self, + finished_reqs: dict[str, dict[str, Any]], + kv_caches: list[torch.Tensor], + block_size: int, + cache_dtype: str, + request_id_resolver: Any | None = None, + ) -> list[str]: + """Send KV cache for finished requests. + + Delegates to the existing ``OmniKVTransferManager``. + """ + if self._kv_transfer_manager is None: + return list(finished_reqs.keys()) if finished_reqs else [] + result = self._kv_transfer_manager.handle_finished_requests_kv_transfer( + finished_reqs=finished_reqs, + kv_caches=kv_caches, + block_size=block_size, + cache_dtype=cache_dtype, + request_id_resolver=request_id_resolver, + ) + if result: + self._kv_sent_req_ids.extend(result) + return result + + def recv_kv_cache( + self, + request_id: str, + target_device: torch.device | None = None, + ) -> tuple[dict[str, Any] | None, int]: + """Receive KV cache for a request. + + Delegates to the existing ``OmniKVTransferManager``. + """ + if self._kv_transfer_manager is None: + return None, 0 + return self._kv_transfer_manager.receive_kv_cache_for_request( + request_id=request_id, + target_device=target_device, + ) + + def receive_cfg_companion_kv_payloads( + self, + cfg_request_ids: dict[str, str], + target_device: torch.device | None = None, + ) -> dict[str, tuple[dict[str, Any] | None, int]]: + """Receive raw CFG companion KV payloads keyed by role.""" + return { + role: self.recv_kv_cache(companion_rid, target_device=target_device) + for role, companion_rid in cfg_request_ids.items() + } + + def receive_multi_kv_cache( + self, + req: Any, + cfg_kv_collect_func: Any | None = None, + target_device: torch.device | None = None, + ) -> bool: + """Receive primary and optional companion KV caches for a request. + + The mixin owns the runner-facing orchestration: primary KV receive, + companion payload fetch, and applying any model-specific CFG fields back + onto ``req.sampling_params``. 
+ """ + if self._kv_transfer_manager is None: + return False + + request_id = getattr(req, "request_id", None) or ( + req.request_ids[0] if hasattr(req, "request_ids") and req.request_ids else None + ) + if not request_id: + logger.warning("Request has no ID, cannot receive KV cache") + return False + + active_requests = getattr(self, "requests", None) + if active_requests is not None and request_id not in active_requests: + logger.info("Skip receiving KV cache for inactive request %s", request_id) + return False + + primary_ok = False + data, _size = self.recv_kv_cache(request_id, target_device=target_device) + if data: + self._kv_transfer_manager.apply_kv_cache_to_request(req, data) + primary_ok = True + + cfg_ids = getattr(getattr(req, "sampling_params", None), "cfg_kv_request_ids", None) + if cfg_ids and cfg_kv_collect_func: + try: + cfg_role_payloads = self.receive_cfg_companion_kv_payloads( + cfg_ids, + target_device=target_device, + ) + cfg_kvs = cfg_kv_collect_func(request_id, cfg_role_payloads) + if cfg_kvs and hasattr(req, "sampling_params") and req.sampling_params is not None: + for key, value in cfg_kvs.items(): + setattr(req.sampling_params, key, value) + logger.info("Applied CFG KV caches: %s", list(cfg_kvs.keys())) + except Exception: + logger.exception("Failed to collect CFG KV caches for %s", request_id) + + return primary_ok + + # ------------------------------------------------------------------ # + # Rank-aware KV transfer routing + # ------------------------------------------------------------------ # + + def get_rank_aware_kv_keys( + self, + req_id: str, + from_stage: int, + to_stage: int | None = None, + chunk_id: int = 0, + ) -> list[str]: + """Build recv-side connector keys for all remote ranks this rank needs. + + For heterogeneous TP receive, the local rank is the target rank and must + fetch one or more source-rank shards keyed as ``from_rank -> to_rank``. 
+ """ + remote_ranks = self.get_kv_remote_ranks() + return [ + self.get_kv_connector_key( + req_id=req_id, + from_stage=from_stage, + chunk_id=chunk_id, + from_rank=remote_rank, + to_rank=self._local_rank, + ) + for remote_rank in remote_ranks + ] + + def get_kv_target_ranks_for_send(self) -> list[int]: + """Determine which target ranks this local rank should send KV shards to.""" + self._validate_kv_tp_topology() + if self._from_tp == self._to_tp: + return [self._local_rank] + if self._from_tp > self._to_tp: + tp_ratio = self._from_tp // self._to_tp + return [self._local_rank // tp_ratio] + tp_ratio = self._to_tp // self._from_tp + base_rank = self._local_rank * tp_ratio + return [base_rank + i for i in range(tp_ratio)] + + def get_rank_aware_kv_send_keys( + self, + req_id: str, + from_stage: int, + to_stage: int | None = None, + chunk_id: int = 0, + ) -> list[str]: + """Build send-side connector keys for this rank's KV shard(s).""" + target_ranks = self.get_kv_target_ranks_for_send() + return [ + self.get_kv_connector_key( + req_id=req_id, + from_stage=from_stage, + chunk_id=chunk_id, + from_rank=self._local_rank, + to_rank=target_rank, + ) + for target_rank in target_ranks + ] + + @staticmethod + def _merge_rank_sharded_kv_payloads(payloads: list[dict[str, Any]]) -> dict[str, Any] | None: + """Merge multiple source-rank KV shards for one target rank.""" + payloads = [payload for payload in payloads if isinstance(payload, dict)] + if not payloads: + return None + if len(payloads) == 1: + return payloads[0] + + merged = dict(payloads[0]) + layer_blocks = merged.get("layer_blocks") + if not isinstance(layer_blocks, dict): + return merged + + def _merge_tensor_lists(name: str) -> list[torch.Tensor | None]: + merged_list: list[torch.Tensor | None] = [] + cache_lists = [payload.get("layer_blocks", {}).get(name, []) for payload in payloads] + max_len = max((len(cache_list) for cache_list in cache_lists), default=0) + for idx in range(max_len): + tensors = 
[cache_list[idx] for cache_list in cache_lists if idx < len(cache_list)] + tensors = [tensor for tensor in tensors if isinstance(tensor, torch.Tensor)] + if not tensors: + merged_list.append(None) + elif len(tensors) == 1: + merged_list.append(tensors[0]) + else: + merged_list.append(torch.cat(tensors, dim=-2).contiguous()) + return merged_list + + merged["layer_blocks"] = { + "key_cache": _merge_tensor_lists("key_cache"), + "value_cache": _merge_tensor_lists("value_cache"), + } + metadata = dict(merged.get("metadata", {})) + metadata["merged_remote_rank_count"] = len(payloads) + merged["metadata"] = metadata + return merged + + def _slice_rank_sharded_kv_payload(self, payload: dict[str, Any] | None) -> dict[str, Any] | None: + """Slice a duplicated source-rank KV shard for ``from_tp < to_tp`` cases.""" + if payload is None or self._from_tp >= self._to_tp: + return payload + + tp_ratio = self._to_tp // self._from_tp + shard_index = self._local_rank % tp_ratio + layer_blocks = payload.get("layer_blocks") if isinstance(payload, dict) else None + if not isinstance(layer_blocks, dict): + return payload + + def _slice_tensor_list(name: str) -> list[torch.Tensor | None]: + sliced: list[torch.Tensor | None] = [] + for tensor in layer_blocks.get(name, []): + if not isinstance(tensor, torch.Tensor) or tensor.ndim < 2: + sliced.append(tensor) + continue + head_dim = tensor.shape[-2] + if head_dim % tp_ratio != 0: + sliced.append(tensor) + continue + per_rank = head_dim // tp_ratio + start = shard_index * per_rank + sliced.append(tensor.narrow(-2, start, per_rank).contiguous()) + return sliced + + payload = dict(payload) + payload["layer_blocks"] = { + "key_cache": _slice_tensor_list("key_cache"), + "value_cache": _slice_tensor_list("value_cache"), + } + metadata = dict(payload.get("metadata", {})) + metadata["sliced_for_local_rank"] = self._local_rank + payload["metadata"] = metadata + return payload + + def should_replicate_payload(self) -> bool: + """Whether non-KV 
payloads should be replicated across ranks. + + Data payloads (stage inputs, chunks) are identical after all-gather, + so only rank 0 transfers them. KV payloads are rank-specific and + all ranks participate. + """ + return self._local_rank != 0 + + def get_kv_rank_mapping(self) -> dict[str, Any]: + """Return the current rank mapping configuration. + + Useful for debugging and for downstream code that needs to know + the TP topology without re-parsing model config. + """ + return { + "from_tp": self._from_tp, + "to_tp": self._to_tp, + "local_rank": self._local_rank, + "remote_ranks": self.get_kv_remote_ranks(), + "is_data_transfer_rank": self.is_data_transfer_rank(), + } + + # ------------------------------------------------------------------ # + # KV transfer lifecycle (RFC – mixin-owned) + # ------------------------------------------------------------------ # + + def mark_kv_transfer( + self, + req_id: str, + seq_len: int, + block_ids: list[int], + custom_metadata: dict[str, Any] | None = None, + ) -> None: + """Mark a request as needing KV cache transfer. + + Called by the scheduler when a transfer trigger fires. The mixin + owns the lifecycle from this point: pending → active → completed. + """ + if req_id in self._kv_pending_transfers: + return + self._kv_triggered_requests.add(req_id) + transfer = { + "seq_len": seq_len, + "block_ids": block_ids, + } + if custom_metadata is not None: + transfer["custom_metadata"] = custom_metadata + self._kv_pending_transfers[req_id] = transfer + + def drain_pending_kv_transfers(self) -> dict[str, dict[str, Any]]: + """Drain pending KV transfers and move them to active. + + Returns ``{req_id: {seq_len, block_ids}}`` for the model runner + to submit to ``send_kv_cache``. 
+ """ + if not self._kv_pending_transfers: + return {} + pending = dict(self._kv_pending_transfers) + self._kv_active_transfers.update(pending.keys()) + self._kv_pending_transfers.clear() + return pending + + def ack_kv_transfers(self, req_ids: list[str] | set[str]) -> None: + """Acknowledge completed KV transfers (from kv_extracted_req_ids). + + Moves requests from active to completed so the scheduler can + safely free their blocks. + """ + for req_id in req_ids: + self._kv_active_transfers.discard(req_id) + self._kv_completed_transfers.add(req_id) + + def drain_completed_kv_transfers(self) -> set[str]: + """Drain and return completed KV transfer request IDs. + + The scheduler calls this to know which requests' blocks can be freed. + """ + completed = set(self._kv_completed_transfers) + self._kv_completed_transfers.clear() + return completed + + def is_kv_transfer_triggered(self, req_id: str) -> bool: + """Check if a request has already triggered KV transfer.""" + return req_id in self._kv_triggered_requests + + def has_pending_kv_work(self) -> bool: + """True if any KV transfers are pending, active, or awaiting ack.""" + return bool(self._kv_pending_transfers or self._kv_active_transfers or self._kv_completed_transfers) + + # Output aggregation + # ------------------------------------------------------------------ # + + def _empty_output_with_connector_signals(self) -> Any: + """Return a minimal ModelRunnerOutput carrying pending connector signals. + + Used by early-return paths (e.g. ``num_scheduled_tokens == 0``) + that still need to deliver ``omni_connector_output`` to the + Scheduler so that WAITING_FOR_INPUT / WAITING_FOR_CHUNK + transitions are not lost. 
+ """ + from vllm_omni.outputs import OmniModelRunnerOutput + + output = OmniModelRunnerOutput(req_ids=[], req_id_to_index={}) + output.omni_connector_output = self.get_omni_connector_output() + return output + + def get_omni_connector_output(self) -> OmniConnectorOutput: + """Collect and reset transfer results for this execute_model cycle. + + ``request_metadata`` carries only lightweight scheduling metadata. + Full payloads remain owned by the Model Runner local cache for all + paths. + """ + if not hasattr(self, "_lock"): + return OmniConnectorOutput() + + tp_group = self._get_local_tp_group() + if self._async_chunk and tp_group is not None and getattr(tp_group, "world_size", 1) > 1: + if self.is_data_transfer_rank(): + with self._lock: + fanout_packet = self._collect_async_chunk_fanout_packet_locked() + else: + fanout_packet = None + fanout_packet = self._broadcast_tp_payload_packet(fanout_packet) + if fanout_packet is None: + newly_finished = set() + chunk_finished = set() + request_metadata = {} + else: + if not self.is_data_transfer_rank(): + self._apply_async_chunk_fanout_packet(fanout_packet) + newly_finished = set(fanout_packet["newly_finished"]) + chunk_finished = set(fanout_packet["chunk_finished"]) + request_metadata = dict(fanout_packet["request_metadata"]) + else: + with self._lock: + newly_finished = set(self._finished_load_reqs) + self._finished_load_reqs.clear() + chunk_finished = set(self._chunk_finished_req_ids) + self._chunk_finished_req_ids.clear() + request_metadata = dict(self._local_request_metadata) + self._local_request_metadata.clear() + # _send_side_request_payload is the async accumulation buffer for + # future recv chunks. Clearing it on every consumable wake-up drops + # intermediate + # thinker decode spans before the model side can consume them. + # Only terminal chunk_finished requests may release that buffer. 
+ for req_id in chunk_finished: + if req_id not in self._local_stage_payload_cache: + continue + ext_req_id = self._request_ids_mapping.get(req_id, req_id) + self._send_side_request_payload.pop(ext_req_id, None) + if ext_req_id != req_id: + self._send_side_request_payload.pop(req_id, None) + self._chunk_ready_req_ids.update(newly_finished) + + output = OmniConnectorOutput( + chunk_ready_req_ids=set(self._chunk_ready_req_ids), + chunk_finished_req_ids=chunk_finished, + request_metadata=request_metadata, + kv_sent_req_ids=list(self._kv_sent_req_ids), + stage_recv_req_ids=set(self._stage_recv_req_ids), + has_pending_kv_work=self.has_pending_kv_work(), + ) + if output.stage_recv_req_ids or chunk_finished or newly_finished: + logger.info( + "[Stage-%s] get_omni_connector_output: stage_recv=%s, chunk_finished=%s, chunk_ready=%s", + self._stage_id, + output.stage_recv_req_ids, + chunk_finished, + output.chunk_ready_req_ids, + ) + self._chunk_ready_req_ids.clear() + self._kv_sent_req_ids.clear() + self._stage_recv_req_ids.clear() + return output + + @staticmethod + def _connector_output_has_signals(output: OmniConnectorOutput) -> bool: + return bool( + output.chunk_ready_req_ids + or output.chunk_finished_req_ids + or output.request_metadata + or output.kv_sent_req_ids + or output.stage_recv_req_ids + or output.has_pending_kv_work + ) + + def attach_omni_connector_output(self, result: Any | None) -> Any: + omni_output = self.get_omni_connector_output() + if not self._connector_output_has_signals(omni_output): + return result + + from copy import copy + + from vllm.v1.worker.gpu_model_runner import EMPTY_MODEL_RUNNER_OUTPUT + + wrapped = copy(result if result is not None else EMPTY_MODEL_RUNNER_OUTPUT) + wrapped.omni_connector_output = omni_output + return wrapped + + # ------------------------------------------------------------------ # + # Properties for compatibility with custom_process funcs that access + # transfer_manager.put_req_chunk / request_payload / 
code_prompt_token_ids + # ------------------------------------------------------------------ # + + @property + def put_req_chunk(self) -> dict[str, int]: + return self._put_req_chunk + + @property + def request_payload(self) -> dict[str, dict[str, Any]]: + return self._send_side_request_payload + + @request_payload.setter + def request_payload(self, value: dict[str, dict[str, Any]]) -> None: + self._send_side_request_payload = value + + @property + def code_prompt_token_ids(self) -> dict[str, list[list[int]]]: + return self._code_prompt_token_ids + + @property + def connector(self) -> Any | None: + return self._omni_connector + + # ------------------------------------------------------------------ # + # Background I/O threads + # ------------------------------------------------------------------ # + + def _recv_loop(self) -> None: + """Background thread: poll connector for incoming data.""" + _recv_poll_count = 0 + while not self._stop_event.is_set(): + with self._lock: + pending_ids = list(self._pending_load_reqs.keys()) + + if not pending_ids: + self._work_available.wait(timeout=0.01) + self._work_available.clear() + continue + + _recv_poll_count += 1 + if _recv_poll_count % 5000 == 1: + logger.info( + "[Stage-%s] _recv_loop: polling %s pending reqs: %s (poll#%s)", + self._stage_id, + len(pending_ids), + pending_ids[:5], + _recv_poll_count, + ) + + made_progress = False + for req_id in pending_ids: + if self._stop_event.is_set(): + break + try: + made_progress = self._poll_single_request(req_id) or made_progress + except Exception: + logger.warning("Error receiving data for %s", req_id, exc_info=True) + + if not made_progress and not self._stop_event.is_set(): + self._work_available.wait(timeout=0.001) + self._work_available.clear() + + _MAX_SEND_RETRIES = 3 + + def _save_loop(self) -> None: + """Background thread: send outgoing data via connector.""" + while not self._stop_event.is_set(): + task = None + with self._lock: + for req_id in 
list(self._pending_save_reqs.keys()): + dq = self._pending_save_reqs[req_id] + if dq: + task = dq.popleft() + if not dq: + del self._pending_save_reqs[req_id] + break + del self._pending_save_reqs[req_id] + + if task is not None: + success = False + try: + success = self._send_single_request(task) + except Exception: + logger.error( + "Error saving data for %s", + task.get("request_id"), + exc_info=True, + ) + if not success: + self._requeue_or_drop_failed_send(task) + continue + + self._work_available.wait(timeout=0.01) + self._work_available.clear() + + def _requeue_or_drop_failed_send(self, task: dict) -> None: + """Re-enqueue a failed send task or drop it after max retries.""" + retry_count = task.get("_retry_count", 0) + 1 + req_id = task.get("request_id") + if retry_count <= self._MAX_SEND_RETRIES: + task["_retry_count"] = retry_count + logger.warning( + "[Stage-%s] Re-enqueuing failed send for %s (retry %d/%d)", + getattr(self, "_stage_id", "?"), + req_id, + retry_count, + self._MAX_SEND_RETRIES, + ) + with self._lock: + dq = self._pending_save_reqs.setdefault(req_id, deque()) + dq.appendleft(task) + else: + logger.error( + "[Stage-%s] Giving up on send for %s after %d retries", + getattr(self, "_stage_id", "?"), + req_id, + self._MAX_SEND_RETRIES, + ) + self._decrement_pending_save_count(req_id) + + # ------------------------------------------------------------------ # + # Chunk-level poll / send (ported from OmniChunkTransferAdapter) + # ------------------------------------------------------------------ # + + def _poll_single_request(self, req_id: str) -> bool: + """Poll connector for one chunk of a request (non-blocking).""" + connector = self._omni_connector + if connector is None: + return False + + if self._async_chunk and self._model_mode != "ar": + with self._lock: + staged_payload = self._local_stage_payload_cache.get(req_id) + metadata_in_flight = req_id in self._local_request_metadata + scheduler_wakeup_pending = req_id in self._finished_load_reqs 
+ if self._payload_is_consumable(staged_payload) or metadata_in_flight or scheduler_wakeup_pending: + logger.debug( + "[Stage-%s] delaying recv for req=%s until staged async payload is handed to scheduler", + self._stage_id, + req_id, + ) + return False + + target_stage_id = self._stage_id - 1 + chunk_id = self._get_req_chunk[req_id] + external_req_id = self._request_ids_mapping.get(req_id, req_id) + connector_get_key = f"{external_req_id}_{target_stage_id}_{chunk_id}" + + if self._async_chunk: + result = self._recv_async_chunk_result( + connector, + str(target_stage_id), + str(self._stage_id), + connector_get_key, + ) + else: + result = self._recv_full_payload_result( + connector, + str(target_stage_id), + str(self._stage_id), + connector_get_key, + ) + + if result is None: + return False + + payload_data, _size = result + if not payload_data: + return False + if isinstance(payload_data, dict): + logger.info( + "[Stage-%s] recv_chunk_result: req=%s ext=%s key=%s keys=%s finished=%s", + self._stage_id, + req_id, + external_req_id, + connector_get_key, + sorted(payload_data.keys()), + bool(payload_data.get("finished")) if "finished" in payload_data else None, + ) + + self._get_req_chunk[req_id] += 1 + + if self._async_chunk: + is_finished = bool(payload_data.get("finished")) + incoming_payload_consumable = self._payload_is_consumable(payload_data) + + if self._model_mode == "ar": + payload_data = self._accumulate_payload(external_req_id, payload_data) + payload_consumable = incoming_payload_consumable + else: + new_ids = payload_data.get("code_predictor_codes", []) + if not new_ids and not is_finished: + return False + payload_consumable = self._payload_is_consumable(payload_data) + + with self._lock: + if is_finished: + self._chunk_finished_req_ids.add(req_id) + self._chunk_stream_completed.add(req_id) + # Local cache (RFC §2.4) — merge, don't replace, so that + # earlier chunk keys (e.g. 
thinker_prefill_embeddings from + # chunk 0) are not overwritten by later chunks. + existing = self._local_stage_payload_cache.get(req_id) + if existing is not None and isinstance(existing, dict) and isinstance(payload_data, dict): + existing.update(payload_data) + else: + self._local_stage_payload_cache[req_id] = payload_data + staged_payload = self._local_stage_payload_cache[req_id] + self._async_chunk_updated_req_ids.add(req_id) + self.put_local_request_metadata(req_id, self._extract_scheduling_metadata(staged_payload)) + # A finish-only sentinel still needs one terminal wake-up so + # the downstream stage can sync the merged local payload and + # flush/finish even when the last recv carries no new + # consumable chunk bytes. + if payload_consumable or is_finished: + self._finished_load_reqs.add(req_id) + if is_finished and not payload_consumable: + logger.debug( + "[Stage-%s] finish sentinel arrived for req=%s without new consumable payload", + self._stage_id, + req_id, + ) + elif not payload_consumable: + logger.debug( + "[Stage-%s] req=%s received metadata-only / non-consumable async payload; delaying wake-up", + self._stage_id, + req_id, + ) + if is_finished: + self._pending_load_reqs.pop(req_id, None) + else: + # full_payload_mode: the complete payload arrives in a single get(), + # so always unregister immediately. + if isinstance(payload_data, dict): + engine_inputs = payload_data.get("engine_inputs", payload_data) + else: + engine_inputs = payload_data + with self._lock: + self._local_stage_payload_cache[req_id] = self._snapshot_payload(engine_inputs) + # Publish full-payload readiness only after the aligned TP broadcast + # path in recv_full_payload_inputs() has materialized the payload on all + # local ranks. Publishing metadata / stage_recv from the background recv + # thread can let the scheduler observe a request before the payload is + # actually visible to the model thread. 
+ self._full_payload_pending_broadcast_req_ids.add(req_id) + self._pending_load_reqs.pop(req_id, None) + logger.info( + "[Stage-%s] full_payload recv complete: req=%s key=%s payload_type=%s", + self._stage_id, + req_id, + connector_get_key, + type(engine_inputs).__name__, + ) + + logger.debug("[Stage-%s] Received data for key %s", self._stage_id, connector_get_key) + return True + + def _build_custom_process_payload( + self, + request_id: str | None, + request: Any | None, + pooling_output: Any | None, + ) -> Any | None: + """Run the custom process hook with a best-effort finished kwarg.""" + if self._custom_process_func is None: + return None + + kwargs = { + "transfer_manager": self, + "pooling_output": pooling_output, + "request": request, + } + supports_is_finished = getattr( + self, + "_custom_process_supports_is_finished", + self._custom_process_supports_is_finished_kwarg(), + ) + is_finished_fn = getattr(request, "is_finished", None) + if callable(is_finished_fn): + try: + if supports_is_finished is not False: + kwargs["is_finished"] = bool(is_finished_fn()) + except Exception: + logger.debug("request.is_finished() failed for %s", request_id, exc_info=True) + + try: + return self._custom_process_func(**kwargs) + except TypeError as exc: + if "is_finished" not in kwargs or not self._is_unexpected_is_finished_kwarg_error(exc): + logger.exception("custom_process_stage_input_func failed for chunk %s", request_id) + return None + kwargs.pop("is_finished", None) + try: + return self._custom_process_func(**kwargs) + except Exception: + logger.exception("custom_process_stage_input_func failed for chunk %s", request_id) + return None + except Exception: + logger.exception("custom_process_stage_input_func failed for chunk %s", request_id) + return None + + def _custom_process_supports_is_finished_kwarg(self) -> bool | None: + """Return whether the custom process hook accepts `is_finished`.""" + if self._custom_process_func is None: + return None + try: + signature = 
inspect.signature(self._custom_process_func) + except (TypeError, ValueError): + return None + + for param in signature.parameters.values(): + if param.kind == inspect.Parameter.VAR_KEYWORD: + return True + + is_finished_param = signature.parameters.get("is_finished") + if is_finished_param is None: + return False + return is_finished_param.kind in ( + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + ) + + @staticmethod + def _is_unexpected_is_finished_kwarg_error(exc: TypeError) -> bool: + message = str(exc) + return ( + "unexpected keyword argument 'is_finished'" in message + or 'unexpected keyword argument "is_finished"' in message + or "positional-only arguments passed as keyword arguments: 'is_finished'" in message + ) + + def _send_single_request(self, task: dict) -> bool: + """Send one queued task via connector.put(). + + Returns True on success. On failure (put() raises or returns + ``success=False``), returns False **without** decrementing + ``_pending_save_counts`` so the caller can retry or clean up. 
+ """ + connector = self._omni_connector + if connector is None: + return True + + request_id = task.get("request_id") + payload_data = task.get("data") + if payload_data is None and task.get("request") is not None: + payload_data = self._build_custom_process_payload( + request_id=request_id, + request=task.get("request"), + pooling_output=task.get("pooling_output"), + ) + put_key = task.get("put_key") + + success, _size, _metadata = connector.put( + from_stage=str(task["stage_id"]), + to_stage=str(task["next_stage_id"]), + put_key=put_key, + data=payload_data, + ) + logger.info( + "[Stage-%s] _send_single_request: put_key=%s success=%s size=%s", + task["stage_id"], + put_key, + success, + _size, + ) + + if not success: + return False + + self._decrement_pending_save_count(request_id) + return True + + def _decrement_pending_save_count(self, request_id: str) -> None: + """Decrement pending save count and run deferred cleanup if zero.""" + cleanup_req_id = None + with self._lock: + remaining = self._pending_save_counts.get(request_id, 0) + if remaining > 1: + self._pending_save_counts[request_id] = remaining - 1 + elif remaining == 1: + self._pending_save_counts.pop(request_id, None) + if request_id in self._deferred_send_cleanup: + self._deferred_send_cleanup.remove(request_id) + cleanup_req_id = request_id + if cleanup_req_id is not None: + self._put_req_chunk.pop(cleanup_req_id, None) + self._send_side_request_payload.pop(cleanup_req_id, None) + self._code_prompt_token_ids.pop(cleanup_req_id, None) + + # ------------------------------------------------------------------ # + # Payload accumulation (ported from OmniChunkTransferAdapter) + # ------------------------------------------------------------------ # + + def _accumulate_payload(self, req_id: str, payload_data: dict[str, Any]) -> dict[str, Any]: + """Accumulate chunk payloads (concat tensors, extend lists). + + Returns a **shallow copy** of the accumulated state so callers + (e.g. 
``_poll_single_request``) can store it in + ``_local_stage_payload_cache`` without aliasing the authoritative + ``_send_side_request_payload`` dict. + """ + if req_id not in self._send_side_request_payload: + self._send_side_request_payload[req_id] = dict(payload_data) + return dict(self._send_side_request_payload[req_id]) + + origin = self._send_side_request_payload[req_id] + merged = dict(origin) + override_keys = payload_data.get("override_keys", ()) + drop_decode_span = False + decode_span_handled = False + for key, value in payload_data.items(): + if key == "finished": + merged[key] = value + continue + if key == THINKER_DECODE_EMBEDDINGS_KEY: + merged_span = merge_tensor_spans( + get_tensor_span( + origin, + tensor_key=THINKER_DECODE_EMBEDDINGS_KEY, + start_key=THINKER_DECODE_TOKEN_START_KEY, + end_key=THINKER_DECODE_TOKEN_END_KEY, + ), + get_tensor_span( + payload_data, + tensor_key=THINKER_DECODE_EMBEDDINGS_KEY, + start_key=THINKER_DECODE_TOKEN_START_KEY, + end_key=THINKER_DECODE_TOKEN_END_KEY, + ), + ) + if merged_span is not None: + merged[key], merged[THINKER_DECODE_TOKEN_START_KEY], merged[THINKER_DECODE_TOKEN_END_KEY] = ( + merged_span + ) + decode_span_handled = True + continue + if isinstance(value, torch.Tensor) and key in origin: + if ( + THINKER_DECODE_TOKEN_START_KEY in origin + or THINKER_DECODE_TOKEN_END_KEY in origin + or THINKER_DECODE_TOKEN_START_KEY in payload_data + or THINKER_DECODE_TOKEN_END_KEY in payload_data + ): + logger.warning( + "[Stage-%s] req=%s falling back to legacy thinker decode " + "merge due to missing/invalid/non-contiguous span " + "metadata", + self._stage_id, + req_id, + ) + drop_decode_span = True + merged[key] = torch.cat([origin[key], value], dim=0) + continue + merged[key] = value + continue + if key in {THINKER_DECODE_TOKEN_START_KEY, THINKER_DECODE_TOKEN_END_KEY}: + if decode_span_handled or drop_decode_span: + continue + merged[key] = value + continue + if key in override_keys: + merged[key] = value + continue + 
if isinstance(value, torch.Tensor) and key in origin: + merged[key] = torch.cat([origin[key], value], dim=0) + elif isinstance(value, list) and key in origin: + merged[key] = origin[key] + value + else: + merged[key] = value + + if drop_decode_span: + merged.pop(THINKER_DECODE_TOKEN_START_KEY, None) + merged.pop(THINKER_DECODE_TOKEN_END_KEY, None) + self._send_side_request_payload[req_id] = merged + return dict(merged) + + def drop_inactive_request_runtime_state(self, req_id: str) -> None: + """Clear inactive request state used by both the runner and mixin. + + This centralizes the model-runner-side cleanup pattern so + ``OmniGPUModelRunner`` can reuse it instead of open-coding the same + inactive-request state mutations. + """ + if hasattr(self, "model_intermediate_buffer"): + self.model_intermediate_buffer.pop(req_id, None) + self.drop_inactive_request_delivery_state(req_id) + + # ------------------------------------------------------------------ # + # Helpers + # ------------------------------------------------------------------ # + + @staticmethod + def _freeze_request_attr(value: Any) -> Any: + if isinstance(value, list): + return list(value) + if isinstance(value, tuple): + return list(value) + if isinstance(value, torch.Tensor): + return value.clone() + raw_list = getattr(value, "_x", None) + if raw_list is not None: + return list(raw_list) + return value + + def _snapshot_request_for_send(self, request: Any, external_req_id: str) -> Any: + finished = bool(getattr(request, "is_finished", lambda: False)()) + attrs: dict[str, Any] = {} + try: + attrs.update(vars(request)) + except TypeError: + pass + + for name in ( + "request_id", + "req_id", + "external_req_id", + "prompt_token_ids", + "output_token_ids", + "all_token_ids", + "additional_information", + "sampling_params", + "multi_modal_data", + "mm_hashes", + ): + if hasattr(request, name): + attrs[name] = self._freeze_request_attr(getattr(request, name)) + + attrs["external_req_id"] = external_req_id + 
attrs["_frozen_is_finished"] = finished + snapshot = SimpleNamespace(**attrs) + snapshot.is_finished = lambda: finished + return snapshot + + @staticmethod + def _create_connector(model_config: Any) -> OmniConnectorBase | None: + """Create a connector from model_config, or None if unconfigured.""" + connector_config = getattr(model_config, "stage_connector_config", None) + if connector_config is None: + return None + + if not isinstance(connector_config, dict): + connector_config = { + "name": getattr(connector_config, "name", None), + "extra": getattr(connector_config, "extra", None), + } + + name = connector_config.get("name") + if not isinstance(name, str) or not name.strip(): + raise RuntimeError("Invalid stage connector config: missing connector name") + name = name.strip() + + extra = connector_config.get("extra") + if extra is None: + extra = {} + elif not isinstance(extra, dict): + raise RuntimeError(f"Invalid extra config for connector {name}: expected dict, got {type(extra).__name__}") + + spec = ConnectorSpec(name=name, extra=extra) + try: + return OmniConnectorFactory.create_connector(spec) + except Exception as exc: + raise RuntimeError(f"Failed to create connector {name}") from exc + + @staticmethod + def _load_custom_func(model_config: Any) -> tuple[str | None, Any | None]: + """Load the connector payload builder for the downstream stage. + + Preferred source is ``custom_process_next_stage_input_func``. Some + full_payload_mode configs (async_chunk=false) only expose the next-stage prompt builder via + ``custom_process_input_func`` (for example ``thinker2talker``), while the + connector payload builder lives beside it as ``thinker2talker_full_payload``. + In that case, derive the full_payload_mode builder path automatically. 
+ """ + candidates: list[str] = [] + + next_stage_func = getattr(model_config, "custom_process_next_stage_input_func", None) + if isinstance(next_stage_func, str) and next_stage_func: + candidates.append(next_stage_func) + + if not getattr(model_config, "async_chunk", False): + input_func = getattr(model_config, "custom_process_input_func", None) + if isinstance(input_func, str) and input_func: + try: + module_path, func_name = input_func.rsplit(".", 1) + if func_name.endswith("_full_payload") or func_name.endswith("_batch"): + candidates.append(f"{module_path}.{func_name}") + else: + candidates.append(f"{module_path}.{func_name}_full_payload") + candidates.append(f"{module_path}.{func_name}_batch") + candidates.append(input_func) + except ValueError: + candidates.append(input_func) + + tried: set[str] = set() + for func_path in candidates: + if func_path in tried: + continue + tried.add(func_path) + try: + module_path, func_name = func_path.rsplit(".", 1) + module = importlib.import_module(module_path) + func = getattr(module, func_name, None) + if callable(func): + if not OmniConnectorModelRunnerMixin._is_connector_payload_builder(func): + logger.debug( + "Skipping incompatible connector payload hook %s; signature=%s", + func_path, + inspect.signature(func), + ) + continue + return func_path, func + except Exception: + logger.warning("Failed to load custom func: %s", func_path, exc_info=True) + + return None, None + + @staticmethod + def _is_connector_payload_builder(func: Any) -> bool: + """Whether *func* matches the mixin payload-builder contract.""" + try: + signature = inspect.signature(func) + except (TypeError, ValueError): + return False + + params = signature.parameters + if any(param.kind == inspect.Parameter.VAR_KEYWORD for param in params.values()): + return True + + required = {"transfer_manager", "pooling_output", "request"} + supported = { + name + for name, param in params.items() + if param.kind + in ( + inspect.Parameter.POSITIONAL_OR_KEYWORD, + 
inspect.Parameter.KEYWORD_ONLY, + ) + } + return required.issubset(supported) + + def _resolve_external_req_id(self, request: Any, fallback_req_id: str) -> str: + """Resolve the external request ID consistently. + + Checks ``_request_ids_mapping`` first (populated by + ``register_chunk_recv``), then falls back to the request's + ``external_req_id`` attribute, and finally to the given + ``fallback_req_id``. + """ + mapped = self._request_ids_mapping.get(fallback_req_id) + if mapped is not None: + return mapped + if request is not None: + return getattr(request, "external_req_id", fallback_req_id) + return fallback_req_id + + def _resolve_next_stage_id(self, model_config: Any) -> int: + """Determine the downstream stage ID from connector config. + + Falls back to ``stage_id + 1`` when the config does not specify + a ``to_stage`` explicitly. + """ + connector_config = getattr(model_config, "stage_connector_config", None) + if connector_config is not None: + if isinstance(connector_config, dict): + to_stage = connector_config.get("to_stage") + else: + to_stage = getattr(connector_config, "to_stage", None) + if isinstance(to_stage, int): + return to_stage + if isinstance(to_stage, str) and to_stage.strip(): + return int(to_stage) + return self._stage_id + 1 + + @staticmethod + def _parse_rank_mapping(model_config: Any) -> dict[str, int]: + """Parse rank_mapping from connector config (optional). + + Returns ``{"from_tp": int, "to_tp": int, "local_rank": int}``. + When ``rank_mapping`` is absent, assumes 1:1 homogeneous mapping. 
+ """ + connector_config = getattr(model_config, "stage_connector_config", None) + if connector_config is not None and not isinstance(connector_config, dict): + connector_config = getattr(connector_config, "__dict__", {}) + + rank_mapping: dict = {} + if isinstance(connector_config, dict): + rank_mapping = connector_config.get("rank_mapping", {}) + + from_tp = int(rank_mapping.get("from_tp", 1)) + to_tp = int(rank_mapping.get("to_tp", 1)) + + local_rank = 0 + try: + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + except (ValueError, TypeError): + pass + + return {"from_tp": from_tp, "to_tp": to_tp, "local_rank": local_rank} + + # ------------------------------------------------------------------ # + # Heterogeneous TP rank support + # ------------------------------------------------------------------ # + + def _validate_kv_tp_topology(self) -> None: + """Reject heterogeneous TP mappings that cannot be routed losslessly.""" + if self._from_tp <= 0 or self._to_tp <= 0: + raise ValueError(f"Invalid KV TP mapping: from_tp={self._from_tp}, to_tp={self._to_tp}") + larger = max(self._from_tp, self._to_tp) + smaller = min(self._from_tp, self._to_tp) + if larger % smaller != 0: + raise ValueError( + f"KV TP mapping must be divisible for rank-aware routing: from_tp={self._from_tp}, to_tp={self._to_tp}" + ) + + def get_kv_remote_ranks(self) -> list[int]: + """Determine which remote ranks this local rank exchanges KV with. 
+ + Follows vLLM's ``TpKVTopology.get_target_remote_ranks()`` pattern: + - ``from_tp > to_tp``: each to-rank reads from multiple from-ranks + - ``from_tp < to_tp``: multiple to-ranks read from the same from-rank + - ``from_tp == to_tp``: 1:1 mapping + """ + self._validate_kv_tp_topology() + if self._from_tp == self._to_tp: + return [self._local_rank] + + if self._from_tp > self._to_tp: + tp_ratio = self._from_tp // self._to_tp + return [self._local_rank * tp_ratio + i for i in range(tp_ratio)] + else: + tp_ratio = self._to_tp // self._from_tp + return [self._local_rank // tp_ratio] + + def is_data_transfer_rank(self) -> bool: + """Whether this rank should participate in data (non-KV) transfer. + + Ordinary stage payloads are TP-identical, so exactly one TP rank + should talk to the connector. When TP is initialized, use TP rank 0 + so the connector leader matches TP-local broadcast source rank. + Otherwise fall back to LOCAL_RANK==0 for the single-rank case. + """ + tp_group = self._get_local_tp_group() + if tp_group is not None and getattr(tp_group, "world_size", 1) > 1: + return getattr(tp_group, "rank_in_group", 0) == 0 + return self._local_rank == 0 + + def get_kv_connector_key( + self, + req_id: str, + from_stage: int, + chunk_id: int, + from_rank: int, + to_rank: int, + ) -> str: + """Build connector key that includes rank info for KV transfers.""" + return f"{req_id}_{from_stage}_{chunk_id}_{from_rank}_{to_rank}" diff --git a/vllm_omni/worker/payload_span.py b/vllm_omni/worker/payload_span.py new file mode 100644 index 00000000000..994392343a9 --- /dev/null +++ b/vllm_omni/worker/payload_span.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Helpers for explicit thinker decode span metadata.""" + +from collections.abc import Mapping +from typing import Any + +import torch + +THINKER_DECODE_EMBEDDINGS_KEY = "thinker_decode_embeddings" +THINKER_OUTPUT_TOKEN_IDS_KEY = 
"thinker_output_token_ids" +THINKER_DECODE_TOKEN_START_KEY = "thinker_decode_embeddings_token_start" +THINKER_DECODE_TOKEN_END_KEY = "thinker_decode_embeddings_token_end" + +CACHED_THINKER_DECODE_EMBEDDINGS_KEY = "cached_thinker_decode_embeddings" +CACHED_THINKER_DECODE_TOKEN_START_KEY = "cached_thinker_decode_embeddings_token_start" +CACHED_THINKER_DECODE_TOKEN_END_KEY = "cached_thinker_decode_embeddings_token_end" + +TensorSpan = tuple[torch.Tensor, int, int] + + +def get_tensor_span(payload: Mapping[str, Any], *, tensor_key: str, start_key: str, end_key: str) -> TensorSpan | None: + tensor = payload.get(tensor_key) + start = payload.get(start_key) + end = payload.get(end_key) + if not isinstance(tensor, torch.Tensor): + return None + if not isinstance(start, int) or not isinstance(end, int): + return None + if start < 0 or end < start or (end - start) != int(tensor.shape[0]): + return None + return tensor, start, end + + +def merge_tensor_spans(existing_span: TensorSpan | None, incoming_span: TensorSpan | None) -> TensorSpan | None: + if existing_span is None or incoming_span is None: + return None + + existing_tensor, existing_start, existing_end = existing_span + incoming_tensor, incoming_start, incoming_end = incoming_span + if incoming_tensor.device != existing_tensor.device or incoming_tensor.dtype != existing_tensor.dtype: + incoming_tensor = incoming_tensor.to(device=existing_tensor.device, dtype=existing_tensor.dtype) + if incoming_start == existing_end: + return torch.cat([existing_tensor, incoming_tensor], dim=0), existing_start, incoming_end + if incoming_start < existing_end: + overlap = existing_end - incoming_start + if overlap >= int(incoming_tensor.shape[0]): + return existing_tensor, existing_start, existing_end + trimmed_tensor = incoming_tensor[overlap:] + return ( + torch.cat([existing_tensor, trimmed_tensor], dim=0), + existing_start, + existing_end + int(trimmed_tensor.shape[0]), + ) + return None + + +def get_tensor_span_row(span: 
TensorSpan | None, index: int) -> torch.Tensor | None: + if span is None: + return None + tensor, start, end = span + if index < start or index >= end: + return None + return tensor[index - start] From cd2761e15c8e49ea7c53cd551f820318155b4988 Mon Sep 17 00:00:00 2001 From: JohnJan Date: Mon, 13 Apr 2026 17:51:48 +0800 Subject: [PATCH 14/76] [Feature]: support Flux.2-dev tea_cache (#1871) Co-authored-by: wuzhongjian --- docs/user_guide/diffusion_features.md | 2 +- .../cache/test_teacache_extractors.py | 105 ++++++++++++- .../cache/teacache/coefficient_estimator.py | 27 ++++ vllm_omni/diffusion/cache/teacache/config.py | 9 ++ .../diffusion/cache/teacache/extractors.py | 140 ++++++++++++++++++ 5 files changed, 281 insertions(+), 2 deletions(-) diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 2f28131ee55..ac140ff84a0 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -110,7 +110,7 @@ The following tables show which models support each feature: | **FLUX.1-dev** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | | **FLUX.2-klein** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | | **FLUX.1-Kontext-dev** | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **FLUX.2-dev** | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | +| **FLUX.2-dev** | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **GLM-Image** | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | **HunyuanImage3** | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | | **LongCat-Image** | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | diff --git a/tests/diffusion/cache/test_teacache_extractors.py b/tests/diffusion/cache/test_teacache_extractors.py index a52e11b3d46..c22a60e227e 100644 --- a/tests/diffusion/cache/test_teacache_extractors.py +++ b/tests/diffusion/cache/test_teacache_extractors.py @@ -22,7 +22,7 @@ import torch from tests.utils import hardware_test -from vllm_omni.diffusion.cache.teacache.extractors import extract_flux2_klein_context +from 
vllm_omni.diffusion.cache.teacache.extractors import extract_flux2_context, extract_flux2_klein_context from vllm_omni.diffusion.models.flux2_klein.flux2_klein_transformer import ( Flux2Transformer2DModel, ) @@ -174,3 +174,106 @@ def test_invalid_module_raises_error(self): img_ids=torch.randint(0, 64, (1, 1024, 4)), txt_ids=torch.randint(0, 64, (1, 512, 4)), ) + + +class TestFlux2Extractor(BaseExtractorTest): + """Test extract_flux2_context function.""" + + def get_extractor(self): + return extract_flux2_context + + @pytest.fixture + def flux2_module(self): + """Create a minimal Flux2Transformer2DModel for testing.""" + from vllm_omni.diffusion.models.flux2.flux2_transformer import Flux2Transformer2DModel + + model = Flux2Transformer2DModel( + num_layers=2, + num_single_layers=2, + num_attention_heads=48, + attention_head_dim=128, + joint_attention_dim=15360, + ) + return model + + def get_module(self, flux2_module): + return flux2_module + + @pytest.fixture + def sample_inputs(self): + """Create sample input tensors for Flux2. + + Note: hidden_states uses in_channels=128 (default for Flux2), + not inner_dim=6144. The x_embedder projects from 128 -> 6144. + encoder_hidden_states uses joint_attention_dim=15360 (model default), + which then gets projected to inner_dim=6144 by context_embedder. 
+ """ + batch_size = 1 + img_seq_len = 1024 + txt_seq_len = 512 + in_channels = 128 # Model default in_channels + txt_dim = 15360 # Model default joint_attention_dim + + return { + "hidden_states": torch.randn(batch_size, img_seq_len, in_channels), + "encoder_hidden_states": torch.randn(batch_size, txt_seq_len, txt_dim), + "timestep": torch.tensor([500]), + "img_ids": torch.randint(0, 64, (batch_size, img_seq_len, 4)), + "txt_ids": torch.randint(0, 64, (batch_size, txt_seq_len, 4)), + "guidance": torch.tensor([3.5]), + } + + def get_sample_inputs(self, sample_inputs): + return sample_inputs + + @hardware_test(res={"cuda": "L4"}, num_cards=1) + def test_modulated_input_shape(self, flux2_module, sample_inputs): + """Test that modulated_input has correct shape matching the model's inner_dim. + + Note: After x_embedder projection, hidden_states are projected from + in_channels (128) to inner_dim (6144), so modulated_input should match + the projected shape, not the input shape. + """ + context = extract_flux2_klein_context(flux2_module, **sample_inputs) + + batch_size, img_seq_len, _ = sample_inputs["hidden_states"].shape + inner_dim = flux2_module.inner_dim + assert context.modulated_input.shape == (batch_size, img_seq_len, inner_dim) + + @hardware_test(res={"cuda": "L4"}, num_cards=1) + def test_run_transformer_blocks_callable(self, flux2_module, sample_inputs): + """Test that run_transformer_blocks is callable.""" + context = extract_flux2_context(flux2_module, **sample_inputs) + assert callable(context.run_transformer_blocks) + + @hardware_test(res={"cuda": "L4"}, num_cards=1) + def test_postprocess_callable(self, flux2_module, sample_inputs): + """Test that postprocess is callable.""" + context = extract_flux2_context(flux2_module, **sample_inputs) + assert callable(context.postprocess) + + def test_without_guidance(self, flux2_module, sample_inputs): + """Test context extraction works without guidance (no CFG).""" + inputs = sample_inputs.copy() + 
inputs["guidance"] = None + + context = extract_flux2_context(flux2_module, **inputs) + + assert context is not None + assert context.temb is not None + + @pytest.mark.cpu + def test_invalid_module_raises_error(self): + """Test that invalid module without transformer_blocks raises ValueError.""" + invalid_module = Mock() + invalid_module.transformer_blocks = [] + + with pytest.raises(ValueError, match="Module must have transformer_blocks"): + extract_flux2_context( + invalid_module, + hidden_states=torch.randn(1, 1024, 6144), + encoder_hidden_states=torch.randn(1, 512, 15360), + timestep=torch.tensor([500]), + img_ids=torch.randint(0, 64, (1, 1024, 4)), + txt_ids=torch.randint(0, 64, (1, 512, 4)), + ) diff --git a/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py b/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py index 5dd80718d11..baec21c2762 100644 --- a/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py +++ b/vllm_omni/diffusion/cache/teacache/coefficient_estimator.py @@ -13,6 +13,7 @@ from vllm_omni.diffusion.hooks import HookRegistry, ModelHook from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.bagel.pipeline_bagel import BagelPipeline +from vllm_omni.diffusion.models.flux2.pipeline_flux2 import Flux2Pipeline from vllm_omni.diffusion.models.stable_audio.pipeline_stable_audio import StableAudioPipeline from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -103,6 +104,31 @@ def install_hook(transformer: Any, hook: DataCollectionHook) -> None: registry.register_hook(hook._HOOK_NAME, hook) +class Flux2Adapter: + """Adapter for Flux2 model coefficient estimation.""" + + @staticmethod + def load_pipeline(model_path: str, device: str = "cuda", dtype: torch.dtype = torch.bfloat16) -> Flux2Pipeline: + """Load Flux2 pipeline for coefficient estimation.""" + od_config = 
OmniDiffusionConfig.from_kwargs(model=model_path, dtype=dtype) + od_config.model_class_name = "Flux2Pipeline" + + pipeline = Flux2Pipeline(od_config=od_config) + loader = DiffusersPipelineLoader(LoadConfig()) + loader.load_weights(pipeline) + pipeline.to(device) + return pipeline + + @staticmethod + def get_transformer(pipeline: Any) -> tuple[Any, str]: + return pipeline.transformer, pipeline.transformer.__class__.__name__ + + @staticmethod + def install_hook(transformer: Any, hook: DataCollectionHook) -> None: + registry = HookRegistry.get_or_create(transformer) + registry.register_hook(hook._HOOK_NAME, hook) + + class DefaultAdapter: """Default adapter for standard diffusers pipelines.""" @@ -123,6 +149,7 @@ def install_hook(transformer: Any, hook: DataCollectionHook) -> None: _MODEL_ADAPTERS: dict[str, type] = { "Bagel": BagelAdapter, "StableAudio": StableAudioAdapter, + "Flux2": Flux2Adapter, } _EPSILON = 1e-6 diff --git a/vllm_omni/diffusion/cache/teacache/config.py b/vllm_omni/diffusion/cache/teacache/config.py index 96cf3f03eec..ecf3bfc1d3d 100644 --- a/vllm_omni/diffusion/cache/teacache/config.py +++ b/vllm_omni/diffusion/cache/teacache/config.py @@ -64,6 +64,15 @@ -1.04182570e01, 6.78098549e-01, ], + # Flux2 transformer coefficients + # Copied from Qwen-Image, need to be tuned specifically for Flux2 in future + "Flux2Transformer2DModel": [ + -4.50000000e02, + 2.80000000e02, + -4.50000000e01, + 3.20000000e00, + -2.00000000e-02, + ], } diff --git a/vllm_omni/diffusion/cache/teacache/extractors.py b/vllm_omni/diffusion/cache/teacache/extractors.py index bdb3f6a7865..3d247e31878 100644 --- a/vllm_omni/diffusion/cache/teacache/extractors.py +++ b/vllm_omni/diffusion/cache/teacache/extractors.py @@ -21,6 +21,7 @@ import torch.nn as nn from vllm_omni.diffusion.forward_context import get_forward_context +from vllm_omni.platforms import current_omni_platform @dataclass @@ -827,6 +828,144 @@ def postprocess(h: torch.Tensor) -> Any: ) +def extract_flux2_context( + 
module: nn.Module, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor = None, + timestep: torch.LongTensor = None, + img_ids: torch.Tensor = None, + txt_ids: torch.Tensor = None, + guidance: torch.Tensor | None = None, + joint_attention_kwargs: dict[str, Any] | None = None, + return_dict: bool = True, + **kwargs: Any, +) -> CacheContext: + """ + Extract cache context for Flux2Transformer2DModel. + + This is the ONLY Flux2-specific code needed for TeaCache support. + It encapsulates preprocessing, modulated input extraction, transformer execution, + and postprocessing logic. + + Args: + module: Flux2Transformer2DModel instance + hidden_states: Input hidden states tensor + encoder_hidden_states: Text encoder outputs + timestep: Current diffusion timestep + img_ids: Image inputs for position embedding + txt_ids: Text inputs for position embedding + guidance: Optional guidance scale for CFG + joint_attention_kwargs: Additional attention arguments + return_dict: Whether to return a Transformer2DModelOutput instead of a plain tensor + **kwargs: Additional keyword arguments ignored by this extractor + + Returns: + CacheContext with all information needed for generic caching + """ + + from diffusers.models.modeling_outputs import Transformer2DModelOutput + + if not hasattr(module, "transformer_blocks") or len(module.transformer_blocks) == 0: + raise ValueError("Module must have transformer_blocks") + + # ============================================================================ + # PREPROCESSING (Flux2-specific) + # ============================================================================ + num_txt_tokens = encoder_hidden_states.shape[1] + + timestep = timestep.to(hidden_states.dtype) * 1000 + if guidance is not None: + guidance = guidance.to(hidden_states.dtype) * 1000 + + temb = module.time_guidance_embed(timestep, guidance) + + double_stream_mod_img = module.double_stream_modulation_img(temb) + double_stream_mod_txt = 
module.double_stream_modulation_txt(temb) + single_stream_mod = module.single_stream_modulation(temb)[0] + + hidden_states = module.x_embedder(hidden_states) + encoder_hidden_states = module.context_embedder(encoder_hidden_states) + + if img_ids.ndim == 3: + img_ids = img_ids[0] + if txt_ids.ndim == 3: + txt_ids = txt_ids[0] + + if current_omni_platform.is_npu(): + freqs_cos_image, freqs_sin_image = module.pos_embed(img_ids.cpu()) + image_rotary_emb = (freqs_cos_image.npu(), freqs_sin_image.npu()) + freqs_cos_text, freqs_sin_text = module.pos_embed(txt_ids.cpu()) + text_rotary_emb = (freqs_cos_text.npu(), freqs_sin_text.npu()) + else: + image_rotary_emb = module.pos_embed(img_ids) + text_rotary_emb = module.pos_embed(txt_ids) + concat_rotary_emb = ( + torch.cat([text_rotary_emb[0], image_rotary_emb[0]], dim=0), + torch.cat([text_rotary_emb[1], image_rotary_emb[1]], dim=0), + ) + + # ============================================================================ + # EXTRACT MODULATED INPUT (for cache decision) + # ============================================================================ + block = module.transformer_blocks[0] + (shift_msa, scale_msa, gate_msa), _ = double_stream_mod_img + modulated_input = block.norm1(hidden_states) + modulated_input = (1 + scale_msa) * modulated_input + shift_msa + + # ============================================================================ + # DEFINE TRANSFORMER EXECUTION (Flux2-specific) + # ============================================================================ + def run_transformer_blocks(): + """Execute all Flux2 transformer blocks.""" + h = hidden_states + e = encoder_hidden_states + + for transformer_block in module.transformer_blocks: + e, h = transformer_block( + hidden_states=h, + encoder_hidden_states=e, + temb_mod_params_img=double_stream_mod_img, + temb_mod_params_txt=double_stream_mod_txt, + image_rotary_emb=concat_rotary_emb, + joint_attention_kwargs=joint_attention_kwargs, + ) + h = torch.cat([e, h], dim=1) 
+ + for single_transformer_block in module.single_transformer_blocks: + h = single_transformer_block( + hidden_states=h, + encoder_hidden_states=None, + temb_mod_params=single_stream_mod, + image_rotary_emb=concat_rotary_emb, + joint_attention_kwargs=joint_attention_kwargs, + ) + + h = h[:, num_txt_tokens:, ...] + return (h,) + + # ============================================================================ + # DEFINE POSTPROCESSING + # ============================================================================ + def postprocess(h): + h = module.norm_out(h, temb) + output = module.proj_out(h) + if not return_dict: + return (output,) + return Transformer2DModelOutput(sample=output) + + # ============================================================================ + # RETURN CONTEXT + # ============================================================================ + return CacheContext( + modulated_input=modulated_input, + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + temb=temb, + run_transformer_blocks=run_transformer_blocks, + postprocess=postprocess, + ) + + # Registry for model-specific extractors # Key: Transformer class name # Value: extractor function with signature (module, *args, **kwargs) -> CacheContext @@ -839,6 +978,7 @@ def postprocess(h: torch.Tensor) -> Any: "ZImageTransformer2DModel": extract_zimage_context, "Flux2Klein": extract_flux2_klein_context, "StableAudioDiTModel": extract_stable_audio_context, + "Flux2Transformer2DModel": extract_flux2_context, # Future models: # "FluxTransformer2DModel": extract_flux_context, # "CogVideoXTransformer3DModel": extract_cogvideox_context, From 155583f49f9a20477ea95a0119a7abfddbf0c646 Mon Sep 17 00:00:00 2001 From: Chenguang Zheng <645327136@qq.com> Date: Mon, 13 Apr 2026 18:35:59 +0800 Subject: [PATCH 15/76] [Bugfix] Release stage launch lock before handshake (#2717) Signed-off-by: Chenguang ZHENG <645327136@qq.com> --- .../test_async_omni_engine_stage_init.py | 89 
+++++++++++++++++++ vllm_omni/engine/async_omni_engine.py | 23 ++--- 2 files changed, 101 insertions(+), 11 deletions(-) diff --git a/tests/engine/test_async_omni_engine_stage_init.py b/tests/engine/test_async_omni_engine_stage_init.py index 6993f391ebc..7b995fe70db 100644 --- a/tests/engine/test_async_omni_engine_stage_init.py +++ b/tests/engine/test_async_omni_engine_stage_init.py @@ -227,6 +227,95 @@ def _capture_stage_timeout(_proc, _handshake_addr, _addresses, _vllm_cfg, handsh assert captured_timeout == 302 +def test_launch_llm_stage_releases_launch_lock_before_complete_stage_handshake(monkeypatch): + """Regression test for parallel LLM stage startup during handshake wait.""" + import vllm_omni.engine.async_omni_engine as engine_mod + from vllm_omni.platforms import current_omni_platform + + engine = object.__new__(AsyncOmniEngine) + engine.log_stats = False + engine.model = "dummy-model" + engine.single_stage_mode = False + engine._omni_master_server = None + + fake_vllm_config = types.SimpleNamespace() + fake_addresses = types.SimpleNamespace() + shared_launch_lock = threading.Lock() + counter_lock = threading.Lock() + first_handshake_started = threading.Event() + second_stage_spawned = threading.Event() + allow_first_handshake_to_finish = threading.Event() + launch_errors: list[BaseException] = [] + spawn_count = 0 + + device_env_var = current_omni_platform.device_control_env_var + prev_device_env = os.environ.get(device_env_var) + os.environ[device_env_var] = "0" + + monkeypatch.setattr(engine_mod, "setup_stage_devices", lambda *_: None) + monkeypatch.setattr(engine_mod, "build_engine_args_dict", lambda *_, **__: {}) + monkeypatch.setattr(engine_mod, "build_vllm_config", lambda *_, **__: (fake_vllm_config, object)) + monkeypatch.setattr(engine_mod, "acquire_device_locks", lambda *_: []) + + def _spawn_stage_core(**_): + nonlocal spawn_count + with counter_lock: + spawn_count += 1 + call_idx = spawn_count + if call_idx == 2: + second_stage_spawned.set() + 
return fake_addresses, types.SimpleNamespace(), f"ipc://handshake-{call_idx}" + + def _complete_stage_handshake(_proc, handshake_address, _addresses, _vllm_cfg, _timeout): + if handshake_address == "ipc://handshake-1": + first_handshake_started.set() + assert second_stage_spawned.wait(timeout=1), ( + "second stage did not reach spawn_stage_core while first stage waited in handshake" + ) + assert allow_first_handshake_to_finish.wait(timeout=1), ( + "second stage did not enter handshake while first stage was still waiting" + ) + else: + allow_first_handshake_to_finish.set() + + monkeypatch.setattr(engine_mod, "spawn_stage_core", _spawn_stage_core) + monkeypatch.setattr(engine_mod, "complete_stage_handshake", _complete_stage_handshake) + + def _launch_stage(stage_id: int) -> None: + metadata = types.SimpleNamespace(stage_id=stage_id, runtime_cfg={"devices": str(stage_id)}) + try: + engine._launch_llm_stage( + stage_cfg=types.SimpleNamespace(engine_args={}), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=302, + llm_stage_launch_lock=shared_launch_lock, + ) + except BaseException as exc: # pragma: no cover - surfaced through assertion below + launch_errors.append(exc) + + try: + first_thread = threading.Thread(target=_launch_stage, args=(0,)) + first_thread.start() + assert first_handshake_started.wait(timeout=1), "first stage never entered handshake" + + second_thread = threading.Thread(target=_launch_stage, args=(1,)) + second_thread.start() + + first_thread.join(timeout=3) + second_thread.join(timeout=3) + finally: + if prev_device_env is None: + os.environ.pop(device_env_var, None) + else: + os.environ[device_env_var] = prev_device_env + + assert not first_thread.is_alive() + assert not second_thread.is_alive() + assert second_stage_spawned.is_set() + assert not launch_errors + + def test_attach_llm_stage_uses_omni_input_preprocessor(monkeypatch): """Regression test for GLM-Image t2i preprocessing path. 
diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 0a2e02d66ef..9609cf6e26b 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -424,23 +424,24 @@ def _launch_llm_stage( proc=proc, ) logger.info("[AsyncOmniEngine] Stage %s engine launch started", metadata.stage_id) - # Keep the stage-specific device visibility until vLLM - # finishes starting all child processes. - if self.single_stage_mode and self._omni_master_server is not None: - launch_stack.close() - else: - assert proc is not None - assert handshake_address is not None - complete_stage_handshake( - proc, handshake_address, addresses, vllm_config, stage_init_timeout - ) - logger.info("[AsyncOmniEngine] Stage %s engine startup completed", metadata.stage_id) finally: if previous_visible_devices is None: current_omni_platform.unset_device_control_env_var() else: current_omni_platform.set_device_control_env_var(previous_visible_devices) + # After StageEngineCoreProc has been spawned it carries its + # stage-specific device visibility into descendants, so the + # slow HELLO/READY handshake can run without holding the + # process-wide launch lock. 
+ if self.single_stage_mode and self._omni_master_server is not None: + launch_stack.close() + else: + assert proc is not None + assert handshake_address is not None + complete_stage_handshake(proc, handshake_address, addresses, vllm_config, stage_init_timeout) + logger.info("[AsyncOmniEngine] Stage %s engine startup completed", metadata.stage_id) + assert started_stage is not None return started_stage except Exception: From ef3f72b9ae0bee0baf45258abde55bec3ae6752d Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Mon, 13 Apr 2026 19:03:13 +0800 Subject: [PATCH 16/76] [Tests][Qwen3-Omni]Modify Qwen3-Omni performance test cases (#2600) Signed-off-by: amy-why-3459 --- tests/dfx/perf/scripts/run_benchmark.py | 2 + tests/dfx/perf/tests/test.json | 305 +++++++++++++++++------- 2 files changed, 219 insertions(+), 88 deletions(-) diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index c566c2e0a0a..b64cc0d9503 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -72,6 +72,8 @@ def run_benchmark( ["vllm", "bench", "serve", "--omni"] + args + [ + "--num-warmups", + "2", "--save-result", "--result-dir", os.environ.get("BENCHMARK_DIR", "tests"), diff --git a/tests/dfx/perf/tests/test.json b/tests/dfx/perf/tests/test.json index fe7e3804698..159e27a064b 100644 --- a/tests/dfx/perf/tests/test.json +++ b/tests/dfx/perf/tests/test.json @@ -10,83 +10,97 @@ "dataset_name": "random", "backend": "openai-chat-omni", "endpoint": "/v1/chat/completions", - "num_prompts": [ - 10, - 40, - 100 - ], - "max_concurrency": [ - 1, - 4, - 10 - ], + "num_prompts": [4, 16, 40], + "max_concurrency": [1, 4, 10], + "random_input_len": 2500, + "random_output_len": 900, + "ignore_eos": true, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [1000, 3000, 5000], + "mean_audio_ttfp_ms": [30000, 60000, 90000], + "mean_audio_rtf": [0.35, 0.45, 0.55] + } 
+ }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [10], + "request_rate": [0.1], "random_input_len": 100, "random_output_len": 100, + "random_range_ratio": 0.0, "ignore_eos": true, + "random_mm_base_items_per_request": 1, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "audio": 1 + }, + "random_mm_bucket_config": { + "(0, 60, 3)": 1.0 + }, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_ttft_ms": [1000, 3000, 5000], - "mean_audio_ttfp_ms": [8000, 10000, 13000], - "mean_audio_rtf": [0.2, 0.25, 0.45] + "mean_ttft_ms": [2000], + "mean_audio_ttfp_ms": [10000], + "mean_audio_rtf": [0.25] } }, { "dataset_name": "random-mm", "backend": "openai-chat-omni", "endpoint": "/v1/chat/completions", - "num_prompts": [ - 10, - 40, - 100 - ], - "request_rate": [ - 0.1, - 0.3, - 0.5 - ], + "num_prompts": [40], + "request_rate": [0.3], "random_input_len": 100, "random_output_len": 100, "random_range_ratio": 0.0, "ignore_eos": true, - "random_mm_base_items_per_request": 3, - "random_mm_num_mm_items_range_ratio": 0, + "random_mm_base_items_per_request": 2, + "random_mm_num_mm_items_range_ratio": 0.5, "random_mm_limit_mm_per_prompt": { "image": 1, - "video": 1, - "audio": 1 + "video": 1 }, "random_mm_bucket_config": { - "(32, 32, 1)": 0.5, - "(0, 1, 1)": 0.1, - "(32, 32, 2)": 0.4 + "(256, 256, 1)": 0.5, + "(720, 1280, 2)": 0.5 }, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_ttft_ms": [2000, 4000, 6000], - "mean_audio_ttfp_ms": [10000, 13000, 15000], - "mean_audio_rtf": [0.25, 0.35, 0.45] + "mean_ttft_ms": [4000], + "mean_audio_ttfp_ms": [13000], + "mean_audio_rtf": [0.35] } }, { - "dataset_name": "random", + "dataset_name": "random-mm", "backend": "openai-chat-omni", "endpoint": "/v1/chat/completions", - "num_prompts": [ - 4, - 16 - ], - "max_concurrency": [ - 1, - 4 
- ], - "random_input_len": 2500, - "random_output_len": 900, + "num_prompts": [100], + "request_rate": [0.5], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, "ignore_eos": true, + "random_mm_base_items_per_request": 3, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "image": 1, + "video": 1, + "audio": 1 + }, + "random_mm_bucket_config": { + "(256, 256, 1)": 0.34, + "(720, 1280, 2)": 0.33, + "(0, 60, 3)": 0.33 + }, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_ttft_ms": [1000, 3000], - "mean_audio_ttfp_ms": [30000, 60000], - "mean_audio_rtf": [0.35, 0.45] + "mean_ttft_ms": [6000], + "mean_audio_ttfp_ms": [15000], + "mean_audio_rtf": [0.45] } } ] @@ -120,18 +134,10 @@ "dataset_name": "random", "backend": "openai-chat-omni", "endpoint": "/v1/chat/completions", - "num_prompts": [ - 10, - 40, - 100 - ], - "max_concurrency": [ - 1, - 4, - 10 - ], - "random_input_len": 100, - "random_output_len": 100, + "num_prompts": [4, 16, 40], + "max_concurrency": [1, 4, 10], + "random_input_len": 2500, + "random_output_len": 900, "ignore_eos": true, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { @@ -144,59 +150,182 @@ "dataset_name": "random-mm", "backend": "openai-chat-omni", "endpoint": "/v1/chat/completions", - "num_prompts": [ - 10, - 40, - 100 - ], - "request_rate": [ - 0.1, - 0.3, - 0.5 - ], + "num_prompts": [10], + "request_rate": [0.1], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, + "ignore_eos": true, + "random_mm_base_items_per_request": 1, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "audio": 1 + }, + "random_mm_bucket_config": { + "(0, 60, 3)": 1.0 + }, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [2000], + "mean_audio_ttfp_ms": [2000], + 
"mean_audio_rtf": [0.25] + } + }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [40], + "request_rate": [0.3], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, + "ignore_eos": true, + "random_mm_base_items_per_request": 2, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "image": 1, + "video": 1 + }, + "random_mm_bucket_config": { + "(256, 256, 1)": 0.5, + "(720, 1280, 2)": 0.5 + }, + "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_ttft_ms": [4000], + "mean_audio_ttfp_ms": [4000], + "mean_audio_rtf": [0.4] + } + }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [100], + "request_rate": [0.5], "random_input_len": 100, "random_output_len": 100, "random_range_ratio": 0.0, "ignore_eos": true, "random_mm_base_items_per_request": 3, - "random_mm_num_mm_items_range_ratio": 0, + "random_mm_num_mm_items_range_ratio": 0.5, "random_mm_limit_mm_per_prompt": { "image": 1, "video": 1, "audio": 1 }, "random_mm_bucket_config": { - "(32, 32, 1)": 0.5, - "(0, 1, 1)": 0.1, - "(32, 32, 2)": 0.4 + "(256, 256, 1)": 0.34, + "(720, 1280, 2)": 0.33, + "(0, 60, 3)": 0.33 }, "percentile-metrics": "ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", "baseline": { - "mean_ttft_ms": [2000, 4000, 6000], - "mean_audio_ttfp_ms": [2000, 4000, 6000], - "mean_audio_rtf": [0.25, 0.4, 0.7] + "mean_ttft_ms": [6000], + "mean_audio_ttfp_ms": [6000], + "mean_audio_rtf": [0.7] } }, { "dataset_name": "random", "backend": "openai-chat-omni", "endpoint": "/v1/chat/completions", - "num_prompts": [ - 4, - 16 - ], - "max_concurrency": [ - 1, - 4 - ], + "num_prompts": [4, 16, 40], + "max_concurrency": [1, 4, 10], "random_input_len": 2500, "random_output_len": 900, "ignore_eos": true, - "percentile-metrics": 
"ttft,tpot,itl,e2el,audio_rtf,audio_ttfp,audio_duration", + "extra_body": { + "modalities": ["text"] + }, + "percentile-metrics": "ttft,tpot,itl,e2el", + "baseline": { + "mean_ttft_ms": [1000, 3000, 5000] + } + }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [10], + "request_rate": [0.1], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, + "ignore_eos": true, + "extra_body": { + "modalities": ["text"] + }, + "random_mm_base_items_per_request": 1, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "audio": 1 + }, + "random_mm_bucket_config": { + "(0, 60, 3)": 1.0 + }, + "percentile-metrics": "ttft,tpot,itl,e2el", + "baseline": { + "mean_ttft_ms": [2000] + } + }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [40], + "request_rate": [0.3], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, + "ignore_eos": true, + "extra_body": { + "modalities": ["text"] + }, + "random_mm_base_items_per_request": 2, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "image": 1, + "video": 1 + }, + "random_mm_bucket_config": { + "(256, 256, 1)": 0.5, + "(720, 1280, 2)": 0.5 + }, + "percentile-metrics": "ttft,tpot,itl,e2el", + "baseline": { + "mean_ttft_ms": [4000] + } + }, + { + "dataset_name": "random-mm", + "backend": "openai-chat-omni", + "endpoint": "/v1/chat/completions", + "num_prompts": [100], + "request_rate": [0.5], + "random_input_len": 100, + "random_output_len": 100, + "random_range_ratio": 0.0, + "ignore_eos": true, + "extra_body": { + "modalities": ["text"] + }, + "random_mm_base_items_per_request": 3, + "random_mm_num_mm_items_range_ratio": 0.5, + "random_mm_limit_mm_per_prompt": { + "image": 1, + "video": 1, + "audio": 1 + }, + "random_mm_bucket_config": { + "(256, 256, 1)": 0.34, + 
"(720, 1280, 2)": 0.33, + "(0, 60, 3)": 0.33 + }, + "percentile-metrics": "ttft,tpot,itl,e2el", "baseline": { - "mean_ttft_ms": [1000, 3000], - "mean_audio_ttfp_ms": [1000, 3000], - "mean_audio_rtf": [0.35, 0.45] + "mean_ttft_ms": [6000] } } ] From 2c67c30550ad91e62a5919b0008caba459a09049 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Mon, 13 Apr 2026 19:15:49 +0800 Subject: [PATCH 17/76] [Bagel]: Support `think mode` in single stage deployment of Bagel (#2650) Signed-off-by: princepride --- examples/offline_inference/bagel/end2end.py | 98 ++++++++---- .../models/bagel/bagel_transformer.py | 113 +++++++++++++- .../diffusion/models/bagel/pipeline_bagel.py | 146 +++++++++++++++--- 3 files changed, 301 insertions(+), 56 deletions(-) diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index 472d748d1e6..ed5fa57e8d6 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -97,6 +97,24 @@ def parse_args(): default=False, help="Enable thinking mode: AR stage decodes ... 
planning tokens before image generation.", ) + parser.add_argument( + "--max-think-tokens", + type=int, + default=1000, + help="Maximum number of tokens for thinking text generation (default: 1000).", + ) + parser.add_argument( + "--do-sample", + action="store_true", + default=False, + help="Enable sampling for text generation (default: greedy).", + ) + parser.add_argument( + "--text-temperature", + type=float, + default=0.3, + help="Temperature for text generation sampling (default: 0.3).", + ) args = parser.parse_args() return args @@ -108,7 +126,6 @@ def main(): model_name = args.model prompts: list[OmniPromptType] = [] try: - # Preferred: load from txt file (one prompt per line) if getattr(args, "txt_prompts", None) and args.prompt_type == "text": with open(args.txt_prompts, encoding="utf-8") as f: lines = [ln.strip() for ln in f.readlines()] @@ -121,10 +138,8 @@ def main(): raise if not prompts: - # Default prompt for text2img test if none provided prompts = ["A cute cat"] print(f"[Info] No prompts provided, using default: {prompts}") - omni_outputs = [] from PIL import Image @@ -132,11 +147,13 @@ def main(): omni_kwargs = {} stage_configs_path = args.stage_configs_path + is_single_stage = stage_configs_path and "single_stage" in stage_configs_path if args.think and stage_configs_path is None: stage_configs_path = "vllm_omni/model_executor/stage_configs/bagel_think.yaml" print(f"[Info] Think mode enabled, using stage config: {stage_configs_path}") if stage_configs_path: omni_kwargs["stage_configs_path"] = stage_configs_path + is_single_stage = "single_stage" in stage_configs_path omni_kwargs.update( { @@ -198,40 +215,61 @@ def main(): formatted_prompts.append(prompt_dict) params_list = omni.default_sampling_params_list + + # For single-stage DiT, think/text params go into the diffusion sampling params extra_args. + # For 2-stage, diffusion params are at index 1. 
+ diffusion_params_idx = 0 if is_single_stage else (1 if len(params_list) > 1 else 0) + diffusion_params = params_list[diffusion_params_idx] + if args.modality in ("text2img", "img2img"): - if len(params_list) > 1: - diffusion_params = params_list[1] - diffusion_params.num_inference_steps = args.steps # type: ignore - diffusion_params.cfg_parallel_size = args.cfg_parallel_size # type: ignore - if args.seed is not None: - diffusion_params.seed = args.seed # type: ignore - extra = { - "cfg_text_scale": args.cfg_text_scale, - "cfg_img_scale": args.cfg_img_scale, - } - if args.cfg_interval is not None: - extra["cfg_interval"] = tuple(args.cfg_interval) - if args.cfg_renorm_type is not None: - extra["cfg_renorm_type"] = args.cfg_renorm_type - if args.cfg_renorm_min is not None: - extra["cfg_renorm_min"] = args.cfg_renorm_min - if args.negative_prompt is not None: - extra["negative_prompt"] = args.negative_prompt - diffusion_params.extra_args = extra # type: ignore + diffusion_params.num_inference_steps = args.steps # type: ignore + diffusion_params.cfg_parallel_size = args.cfg_parallel_size # type: ignore + if args.seed is not None: + diffusion_params.seed = args.seed # type: ignore + + extra = getattr(diffusion_params, "extra_args", {}) or {} + extra["cfg_text_scale"] = args.cfg_text_scale + extra["cfg_img_scale"] = args.cfg_img_scale + if args.cfg_interval is not None: + extra["cfg_interval"] = tuple(args.cfg_interval) + if args.cfg_renorm_type is not None: + extra["cfg_renorm_type"] = args.cfg_renorm_type + if args.cfg_renorm_min is not None: + extra["cfg_renorm_min"] = args.cfg_renorm_min + if args.negative_prompt is not None: + extra["negative_prompt"] = args.negative_prompt + + needs_text_gen = is_single_stage and (args.think or args.modality in ("text2text", "img2text")) + if needs_text_gen: + if args.think: + extra["think"] = True + extra["max_think_tokens"] = args.max_think_tokens + extra["do_sample"] = args.do_sample + extra["text_temperature"] = 
args.text_temperature + diffusion_params.extra_args = extra # type: ignore omni_outputs = list(omni.generate(prompts=formatted_prompts, sampling_params_list=params_list)) img_idx = 0 for req_output in omni_outputs: - if args.think: - ro = getattr(req_output, "request_output", None) - if ro and getattr(ro, "outputs", None): - txt = "".join(getattr(o, "text", "") or "" for o in ro.outputs) - if txt: - print(txt) + # 2-stage think mode: text output from thinker stage + ro = getattr(req_output, "request_output", None) + if ro and getattr(ro, "outputs", None): + txt = "".join(getattr(o, "text", "") or "" for o in ro.outputs) + if txt: + if args.think: + print(f"[Think]\n{txt}") + else: + print(f"[Output] Text:\n{txt}") - images = getattr(req_output, "images", None) + # Single-stage DiT: text from custom_output + custom = getattr(req_output, "_custom_output", {}) or {} + if custom.get("think_text"): + print(f"[Think]\n{custom['think_text']}") + if custom.get("text_output"): + print(f"[Output] Text:\n{custom['text_output']}") + images = getattr(req_output, "images", None) if not images: continue @@ -241,8 +279,6 @@ def main(): print(f"[Output] Saved image to {save_path}") img_idx += 1 - print(omni_outputs) - if __name__ == "__main__": main() diff --git a/vllm_omni/diffusion/models/bagel/bagel_transformer.py b/vllm_omni/diffusion/models/bagel/bagel_transformer.py index f8480775687..d1254f84566 100644 --- a/vllm_omni/diffusion/models/bagel/bagel_transformer.py +++ b/vllm_omni/diffusion/models/bagel/bagel_transformer.py @@ -854,6 +854,7 @@ def __init__( config, parallel_config=parallel_config, quant_config=quant_config, prefix=f"{prefix}.model" ) self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) # Initialize weights and apply final processing self.post_init() @@ -864,6 +865,12 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.model.embed_tokens = value + def 
get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + def set_decoder(self, decoder): self.model = decoder @@ -1207,7 +1214,7 @@ def prepare_prompts(self, curr_kvlens, curr_rope, prompts, tokenizer, new_token_ packed_key_value_indexes.extend(range(curr, curr + curr_kvlen)) curr += curr_kvlen - text_ids = tokenizer.encode(prompt) + text_ids = tokenizer.encode(prompt, add_special_tokens=False) text_ids = [new_token_ids["bos_token_id"]] + text_ids + [new_token_ids["eos_token_id"]] text_token_lens.append(len(text_ids)) packed_text_ids.extend(text_ids) @@ -1619,10 +1626,110 @@ def _merge_naive_caches(caches: list) -> NaiveCache: num_layers = len(caches[0].key_cache) merged = NaiveCache(num_layers) for layer_idx in range(num_layers): - merged.key_cache[layer_idx] = torch.cat([c.key_cache[layer_idx] for c in caches], dim=0) - merged.value_cache[layer_idx] = torch.cat([c.value_cache[layer_idx] for c in caches], dim=0) + key_parts = [c.key_cache[layer_idx] for c in caches if c.key_cache[layer_idx] is not None] + val_parts = [c.value_cache[layer_idx] for c in caches if c.value_cache[layer_idx] is not None] + merged.key_cache[layer_idx] = torch.cat(key_parts, dim=0) if key_parts else None + merged.value_cache[layer_idx] = torch.cat(val_parts, dim=0) if val_parts else None return merged + def prepare_start_tokens(self, curr_kvlens, curr_rope, new_token_ids): + """Prepare start tokens for autoregressive text generation. + + Ported from the original BAGEL ``Bagel.prepare_start_tokens``. 
+ """ + packed_start_tokens, packed_key_value_indexes = list(), list() + packed_query_position_ids = list() + + curr = 0 + for curr_kvlen, curr_position_id in zip(curr_kvlens, curr_rope): + packed_key_value_indexes.extend(range(curr, curr + curr_kvlen)) + packed_start_tokens.append(new_token_ids["bos_token_id"]) + packed_query_position_ids.append(curr_position_id) + curr += curr_kvlen + + generation_input = { + "packed_start_tokens": torch.tensor(packed_start_tokens, dtype=torch.long), + "packed_query_position_ids": torch.tensor(packed_query_position_ids, dtype=torch.long), + "key_values_lens": torch.tensor(curr_kvlens, dtype=torch.int), + "packed_key_value_indexes": torch.tensor(packed_key_value_indexes, dtype=torch.long), + } + return generation_input + + @torch.no_grad() + def generate_text( + self, + past_key_values: NaiveCache, + packed_key_value_indexes: torch.LongTensor, + key_values_lens: torch.IntTensor, + packed_start_tokens: torch.LongTensor, + packed_query_position_ids: torch.LongTensor, + max_length: int, + do_sample: bool = False, + temperature: float = 1.0, + end_token_id: int | None = None, + ): + """Autoregressive text generation (ported from original BAGEL). + + Decodes tokens one at a time, appending to ``past_key_values`` + until ``max_length`` is reached or ``end_token_id`` is generated. 
+ """ + step = 0 + generated_sequence = [] + curr_tokens = packed_start_tokens + while step < max_length: + generated_sequence.append(curr_tokens) + packed_text_embedding = self.language_model.model.embed_tokens(curr_tokens) + query_lens = torch.ones_like(curr_tokens) + packed_query_indexes = torch.cumsum(key_values_lens, dim=0) + torch.arange( + 0, + len(key_values_lens), + device=key_values_lens.device, + dtype=key_values_lens.dtype, + ) + + uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0)) + for i in range(len(uppacked)): + uppacked[i] += i + packed_key_value_indexes = torch.cat(uppacked, dim=0) + + output = self.language_model( + packed_query_sequence=packed_text_embedding, + query_lens=query_lens, + packed_query_position_ids=packed_query_position_ids, + packed_query_indexes=packed_query_indexes, + past_key_values=past_key_values, + key_values_lens=key_values_lens, + packed_key_value_indexes=packed_key_value_indexes, + update_past_key_values=True, + is_causal=True, + mode="und", + ) + past_key_values = output.past_key_values + packed_query_sequence = output.packed_query_sequence + pred_logits = self.language_model.lm_head(packed_query_sequence) + + if do_sample: + probs = nn.functional.softmax(pred_logits / temperature, dim=-1) + curr_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + else: + curr_tokens = torch.argmax(pred_logits, dim=-1) + + uppacked = list(packed_key_value_indexes.split(key_values_lens.tolist(), dim=0)) + for i in range(len(uppacked)): + uppacked[i] = torch.cat( + [uppacked[i], torch.tensor([uppacked[i][-1] + 1], device=uppacked[i].device)], dim=0 + ) + packed_key_value_indexes = torch.cat(uppacked, dim=0) + key_values_lens = key_values_lens + 1 + packed_query_position_ids = packed_query_position_ids + 1 + step += 1 + + if end_token_id is not None and curr_tokens[0] == end_token_id: + break + + output_device = generated_sequence[0].device + return torch.stack([i.to(output_device) for i in 
generated_sequence], dim=0) + def generate_image( self, packed_text_ids: torch.LongTensor, diff --git a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py index 13d0cc2093b..72e53e7f48f 100644 --- a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py +++ b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py @@ -495,11 +495,15 @@ def vae_transforms(img): cfg_text_context = deepcopy(gen_context) + # Strip <|im_start|>/<|im_end|> wrappers that end2end.py may have + # already added, so prepare_prompts doesn't double-add bos/eos. + clean_prompt = prompt.removeprefix("<|im_start|>").removesuffix("<|im_end|>") + # Update gen_context with text prompt generation_input, newlens, new_rope = self.bagel.prepare_prompts( curr_kvlens=gen_context["kv_lens"], curr_rope=gen_context["ropes"], - prompts=[prompt], + prompts=[clean_prompt], tokenizer=self.tokenizer, new_token_ids=self.new_token_ids, ) @@ -527,34 +531,37 @@ def vae_transforms(img): gen_context["kv_lens"] = newlens gen_context["ropes"] = new_rope - # cfg_text_context: update with negative prompt (no text condition) + # cfg_text_context: update with negative prompt (no text condition). + # When empty, keep cfg_text_context as-is (kv_lens=0) to match + # original BAGEL; _merge_naive_caches handles None KV entries. 
neg_prompt = extra_args.get("negative_prompt", "") - neg_input, neg_newlens, neg_rope = self.bagel.prepare_prompts( - curr_kvlens=cfg_text_context["kv_lens"], - curr_rope=cfg_text_context["ropes"], - prompts=[neg_prompt], - tokenizer=self.tokenizer, - new_token_ids=self.new_token_ids, - ) - for k, v in neg_input.items(): - if torch.is_tensor(v): - neg_input[k] = v.to(self.device) - with torch.autocast( - device_type=self.device.type, - enabled=self.device.type != "cpu", - dtype=self.od_config.dtype, - ): - cfg_text_context["past_key_values"] = self.bagel.forward_cache_update_text( - cfg_text_context["past_key_values"], **neg_input + if neg_prompt: + neg_input, neg_newlens, neg_rope = self.bagel.prepare_prompts( + curr_kvlens=cfg_text_context["kv_lens"], + curr_rope=cfg_text_context["ropes"], + prompts=[neg_prompt], + tokenizer=self.tokenizer, + new_token_ids=self.new_token_ids, ) - cfg_text_context["kv_lens"] = neg_newlens - cfg_text_context["ropes"] = neg_rope + for k, v in neg_input.items(): + if torch.is_tensor(v): + neg_input[k] = v.to(self.device) + with torch.autocast( + device_type=self.device.type, + enabled=self.device.type != "cpu", + dtype=self.od_config.dtype, + ): + cfg_text_context["past_key_values"] = self.bagel.forward_cache_update_text( + cfg_text_context["past_key_values"], **neg_input + ) + cfg_text_context["kv_lens"] = neg_newlens + cfg_text_context["ropes"] = neg_rope # cfg_img_context: update with text prompt (no image condition) cfg_img_generation_input, cfg_img_newlens, cfg_img_new_rope = self.bagel.prepare_prompts( curr_kvlens=cfg_img_context["kv_lens"], curr_rope=cfg_img_context["ropes"], - prompts=[prompt], + prompts=[clean_prompt], tokenizer=self.tokenizer, new_token_ids=self.new_token_ids, ) @@ -572,6 +579,96 @@ def vae_transforms(img): cfg_img_context["kv_lens"] = cfg_img_newlens cfg_img_context["ropes"] = cfg_img_new_rope + # ---- Detect output modality and think mode ---- + modalities = first_prompt.get("modalities", []) if 
isinstance(first_prompt, dict) else [] + is_text_output = "text" in modalities + think_enabled = extra_args.get("think", False) + think_text = None + + if think_enabled and injected_kv is None: + max_think_tokens = int(extra_args.get("max_think_tokens", 1000)) + do_sample = bool(extra_args.get("do_sample", False)) + text_temperature = float(extra_args.get("text_temperature", 0.3)) + + with torch.autocast( + device_type=self.device.type, + enabled=self.device.type != "cpu", + dtype=self.od_config.dtype, + ): + start_input = self.bagel.prepare_start_tokens( + gen_context["kv_lens"], gen_context["ropes"], self.new_token_ids + ) + for k, v in start_input.items(): + if torch.is_tensor(v): + start_input[k] = v.to(self.device) + + gen_ctx_copy = deepcopy(gen_context) + token_ids = self.bagel.generate_text( + past_key_values=gen_ctx_copy["past_key_values"], + max_length=max_think_tokens, + do_sample=do_sample, + temperature=text_temperature, + end_token_id=self.new_token_ids["eos_token_id"], + **start_input, + ) + # token_ids shape: (seq_len, batch=1) + decoded = self.tokenizer.decode(token_ids[:, 0].tolist()) + # Strip chat markers to get clean text + think_text = decoded.split("<|im_end|>")[0] + if "<|im_start|>" in think_text: + think_text = think_text.split("<|im_start|>")[-1] + logger.info("Think mode generated %d tokens", token_ids.shape[0]) + + if not is_text_output: + # Use the autoregressive KV cache from think generation + # directly, instead of decode→re-encode which adds extra + # bos/eos and may alter tokenization. 
+ num_think_tokens = token_ids.shape[0] + gen_context["past_key_values"] = gen_ctx_copy["past_key_values"] + gen_context["kv_lens"] = [kl + num_think_tokens for kl in gen_context["kv_lens"]] + gen_context["ropes"] = [r + num_think_tokens for r in gen_context["ropes"]] + + # ---- Text-only output (text2text / img2text) ---- + if is_text_output and injected_kv is None: + if think_text is not None: + # Think mode already generated the text (including reasoning) + text_output = think_text + else: + max_text_tokens = int(extra_args.get("max_think_tokens", 500)) + do_sample = bool(extra_args.get("do_sample", False)) + text_temperature = float(extra_args.get("text_temperature", 0.3)) + + with torch.autocast( + device_type=self.device.type, + enabled=self.device.type != "cpu", + dtype=self.od_config.dtype, + ): + start_input = self.bagel.prepare_start_tokens( + gen_context["kv_lens"], gen_context["ropes"], self.new_token_ids + ) + for k, v in start_input.items(): + if torch.is_tensor(v): + start_input[k] = v.to(self.device) + token_ids = self.bagel.generate_text( + past_key_values=gen_context["past_key_values"], + max_length=max_text_tokens, + do_sample=do_sample, + temperature=text_temperature, + end_token_id=self.new_token_ids["eos_token_id"], + **start_input, + ) + decoded = self.tokenizer.decode(token_ids[:, 0].tolist()) + text_output = decoded.split("<|im_end|>")[0] + if "<|im_start|>" in text_output: + text_output = text_output.split("<|im_start|>")[-1] + + return DiffusionOutput( + output=text_output, + custom_output={"text_output": text_output}, + stage_durations=self.stage_durations if hasattr(self, "stage_durations") else None, + ) + + # ---- Image generation (text2img / img2img) ---- if req.sampling_params.seed is not None: torch.manual_seed(req.sampling_params.seed) if self.device.type == "cuda": @@ -676,12 +773,17 @@ def vae_transforms(img): if trajectory_log_probs: trajectory_log_probs_stacked = torch.stack(trajectory_log_probs) + custom = {} + if think_text 
is not None: + custom["think_text"] = think_text + return DiffusionOutput( output=img, trajectory_latents=trajectory_latents_stacked, trajectory_timesteps=trajectory_timesteps_stacked, trajectory_log_probs=trajectory_log_probs_stacked, trajectory_decoded=trajectory_decoded, + custom_output=custom, stage_durations=self.stage_durations if hasattr(self, "stage_durations") else None, ) From e0cdbe9a5d7ec654bbbe26c2fb6e76abe41446d2 Mon Sep 17 00:00:00 2001 From: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com> Date: Mon, 13 Apr 2026 19:21:42 +0800 Subject: [PATCH 18/76] [Misc] Cleanup: use consistent pytest-mock in unit tests (#2698) Signed-off-by: yuanheng --- tests/comfyui/conftest.py | 18 +- tests/comfyui/test_comfyui_integration.py | 95 +- .../test_generation_scheduler_restore.py | 27 +- .../test_distributed_vae_executor.py | 41 +- .../models/bagel/test_trajectory_recording.py | 34 +- .../models/flux2/test_flux2_transformer_tp.py | 20 +- .../offloader/test_sequential_backend.py | 120 +- .../quantization/test_int8_config.py | 32 +- tests/diffusion/test_diffusion_scheduler.py | 103 +- .../diffusion/test_diffusion_step_pipeline.py | 26 +- .../test_diffusion_worker_cuda_profiler.py | 6 +- .../test_multiproc_engine_concurrency.py | 28 +- tests/engine/test_arg_utils.py | 9 +- tests/engine/test_async_omni_engine_input.py | 15 +- .../engine/test_async_omni_engine_outputs.py | 20 +- tests/engine/test_single_stage_mode.py | 1533 ++++++++++------- .../openai_api/test_serving_chat_speaker.py | 40 +- .../openai_api/test_serving_speech.py | 215 ++- .../openai_api/test_serving_speech_stream.py | 117 +- tests/entrypoints/test_omni_base_profiler.py | 27 +- tests/entrypoints/test_serve.py | 188 +- .../test_mimo_audio_code2wav_batch_decode.py | 40 +- .../qwen2_5_omni/test_qwen2_5_omni_embed.py | 37 +- .../qwen3_tts/test_code_predictor_dtype.py | 131 +- .../models/test_fish_speech_voice_cache.py | 30 +- tests/test_fish_speech_voice_cache.py | 39 +- 26 files changed, 1610 
insertions(+), 1381 deletions(-) diff --git a/tests/comfyui/conftest.py b/tests/comfyui/conftest.py index 0b4565e9465..4280d3506ff 100644 --- a/tests/comfyui/conftest.py +++ b/tests/comfyui/conftest.py @@ -9,8 +9,8 @@ import os import sys +from types import ModuleType, SimpleNamespace from typing import BinaryIO, TypedDict -from unittest.mock import MagicMock def pytest_configure(config): @@ -58,15 +58,15 @@ def save_to(self, file: str | BinaryIO): else: file.write(self._data) - mock_comfy_api = MagicMock() - mock_comfy_api_input = MagicMock() + mock_comfy_api = ModuleType("comfy_api") + mock_comfy_api_input = ModuleType("comfy_api.input") mock_comfy_api_input.AudioInput = AudioInput mock_comfy_api_input.VideoInput = VideoInput mock_comfy_api.input = mock_comfy_api_input - mock_comfy_api_latest = MagicMock() - mock_comfy_api_latest.Types.VideoComponents = MagicMock(side_effect=lambda **kwargs: kwargs) - mock_comfy_api_latest.InputImpl.VideoFromComponents = MagicMock( - side_effect=lambda _: VideoInput(b"mock_video_from_components") + mock_comfy_api_latest = ModuleType("comfy_api.latest") + mock_comfy_api_latest.Types = SimpleNamespace(VideoComponents=lambda **kwargs: kwargs) + mock_comfy_api_latest.InputImpl = SimpleNamespace( + VideoFromComponents=lambda _: VideoInput(b"mock_video_from_components") ) mock_comfy_api.latest = mock_comfy_api_latest @@ -76,8 +76,8 @@ def mock_load(_: str | BinaryIO): sample_rate = 24000 return waveform, sample_rate - mock_comfy_extras = MagicMock() - mock_nodes_audio = MagicMock() + mock_comfy_extras = ModuleType("comfy_extras") + mock_nodes_audio = ModuleType("comfy_extras.nodes_audio") mock_nodes_audio.load = mock_load mock_comfy_extras.nodes_audio = mock_nodes_audio diff --git a/tests/comfyui/test_comfyui_integration.py b/tests/comfyui/test_comfyui_integration.py index f6ce82f9b28..80e86d82412 100644 --- a/tests/comfyui/test_comfyui_integration.py +++ b/tests/comfyui/test_comfyui_integration.py @@ -13,7 +13,6 @@ from enum import 
StrEnum, auto from types import SimpleNamespace from typing import Any, NamedTuple -from unittest.mock import AsyncMock, MagicMock, patch import pytest import requests @@ -28,6 +27,7 @@ ) from comfyui_vllm_omni.utils.types import AutoregressionSamplingParams, DiffusionSamplingParams, WanModelSpecificParams from PIL import Image +from pytest_mock import MockerFixture from vllm import SamplingParams from vllm.outputs import CompletionOutput, RequestOutput from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -217,9 +217,10 @@ def _build_diffusion_video_output() -> OmniRequestOutput: def _build_diffusion_image_output_for_chat_endpoint() -> OmniRequestOutput: - request_output = MagicMock() - request_output.images = [_build_image_output(color="blue")] - request_output.finished = True + request_output = SimpleNamespace( + images=[_build_image_output(color="blue")], + finished=True, + ) return OmniRequestOutput( request_id="test_req_img_chat", finished=True, @@ -389,51 +390,55 @@ def sampling_case(request) -> SamplingCase: @pytest.fixture -def mock_async_omni(server_case: ServerCase, sampling_case: SamplingCase): +def mock_async_omni( + server_case: ServerCase, + sampling_case: SamplingCase, + monkeypatch: pytest.MonkeyPatch, + mocker: MockerFixture, +): async def _mock_preprocess_chat(self, *args, **kwargs): return ([{"role": "user", "content": "test"}], [{"prompt": "test prompt"}]) # Need to mock AsyncOmni itself (not only its generate method) because # 1. The API layer uses its stage_list and stage_configs attributes # 2. Its __init__ method has slow side effects (model & config loading). 
- with ( - patch("vllm_omni.entrypoints.openai.api_server.AsyncOmni") as MockAsyncOmni, - patch( - "vllm_omni.entrypoints.openai.serving_chat.OmniOpenAIServingChat._preprocess_chat", - new=_mock_preprocess_chat, - ), - ): - mock_instance = AsyncMock(spec=RealAsyncOmni) - mock_instance.generate = _build_mock_outputs(server_case.outputs, sampling_case, server_case) - - mock_instance.stage_list = server_case.stage_list - mock_instance.stage_configs = server_case.stage_configs - mock_instance.output_modalities = _build_output_modalities(server_case.stage_configs) - mock_instance.default_sampling_params_list = [ - SamplingParams() if _stage_type(stage) != "diffusion" else MagicMock() - for stage in server_case.stage_configs - ] - mock_instance.errored = False - mock_instance.dead_error = RuntimeError("Mock engine error") - mock_instance.model_config = MagicMock( - max_model_len=4096, - io_processor_plugin=None, - allowed_local_media_path=None, - allowed_media_domains=None, - ) - # Mimic Qwen3-TTS talker speaker config so CustomVoice validation passes. 
- mock_instance.model_config.hf_config = MagicMock() - mock_instance.model_config.hf_config.talker_config = MagicMock() - mock_instance.model_config.hf_config.talker_config.speaker_id = {"Vivian": 0} - mock_instance.io_processor = MagicMock() - mock_instance.input_processor = MagicMock() - mock_instance.shutdown = MagicMock() - mock_instance.get_vllm_config = AsyncMock(return_value=None) - mock_instance.get_supported_tasks = AsyncMock(return_value=["generate"]) - mock_instance.get_tokenizer = AsyncMock(return_value=None) + mock_async_omni_cls = mocker.patch("vllm_omni.entrypoints.openai.api_server.AsyncOmni") + monkeypatch.setattr( + "vllm_omni.entrypoints.openai.serving_chat.OmniOpenAIServingChat._preprocess_chat", + _mock_preprocess_chat, + ) + + mock_instance = mocker.AsyncMock(spec=RealAsyncOmni) + mock_instance.generate = _build_mock_outputs(server_case.outputs, sampling_case, server_case) + + mock_instance.stage_list = server_case.stage_list + mock_instance.stage_configs = server_case.stage_configs + mock_instance.output_modalities = _build_output_modalities(server_case.stage_configs) + mock_instance.default_sampling_params_list = [ + SamplingParams() if _stage_type(stage) != "diffusion" else mocker.MagicMock() + for stage in server_case.stage_configs + ] + mock_instance.errored = False + mock_instance.dead_error = RuntimeError("Mock engine error") + mock_instance.model_config = mocker.MagicMock( + max_model_len=4096, + io_processor_plugin=None, + allowed_local_media_path=None, + allowed_media_domains=None, + ) + # Mimic Qwen3-TTS talker speaker config so CustomVoice validation passes. 
+ mock_instance.model_config.hf_config = mocker.MagicMock() + mock_instance.model_config.hf_config.talker_config = mocker.MagicMock() + mock_instance.model_config.hf_config.talker_config.speaker_id = {"Vivian": 0} + mock_instance.io_processor = mocker.MagicMock() + mock_instance.input_processor = mocker.MagicMock() + mock_instance.shutdown = mocker.MagicMock() + mock_instance.get_vllm_config = mocker.AsyncMock(return_value=None) + mock_instance.get_supported_tasks = mocker.AsyncMock(return_value=["generate"]) + mock_instance.get_tokenizer = mocker.AsyncMock(return_value=None) - MockAsyncOmni.return_value = mock_instance - yield MockAsyncOmni + mock_async_omni_cls.return_value = mock_instance + yield mock_async_omni_cls @pytest.fixture @@ -583,9 +588,9 @@ async def test_image_generation_node(api_server: str, model: str, image_input: b ServerCase( served_model="Qwen/Qwen2.5-Omni-7B", stage_list=[ - MagicMock(is_comprehension=True, model_stage="llm"), - MagicMock(is_comprehension=False, model_stage="llm"), - MagicMock(is_comprehension=False, model_stage="llm"), + SimpleNamespace(is_comprehension=True, model_stage="llm"), + SimpleNamespace(is_comprehension=False, model_stage="llm"), + SimpleNamespace(is_comprehension=False, model_stage="llm"), ], stage_configs=[ _make_stage_config("llm", is_comprehension=True, model_stage="thinker"), diff --git a/tests/core/sched/test_generation_scheduler_restore.py b/tests/core/sched/test_generation_scheduler_restore.py index 154f40b3995..5cc1cab7025 100644 --- a/tests/core/sched/test_generation_scheduler_restore.py +++ b/tests/core/sched/test_generation_scheduler_restore.py @@ -6,7 +6,6 @@ those requests are permanently orphaned. 
""" -import unittest from collections import deque import pytest @@ -39,7 +38,7 @@ def postprocess_scheduler_output(self, output): pass -class TestRestoreQueuesOnError(unittest.TestCase): +class TestRestoreQueuesOnError: """Verify that restore_queues is called even when rewrapping raises.""" def test_requests_not_lost_on_exception(self): @@ -52,8 +51,8 @@ def test_requests_not_lost_on_exception(self): # Step 1: process_pending_chunks moves req-B out adapter.process_pending_chunks(waiting=[], running=running) - self.assertEqual(running, ["req-A"]) - self.assertEqual(len(adapter.waiting_for_chunk_running_requests), 1) + assert running == ["req-A"] + assert len(adapter.waiting_for_chunk_running_requests) == 1 # Step 2: simulate the try/except/finally pattern try: @@ -65,9 +64,9 @@ def test_requests_not_lost_on_exception(self): adapter.restore_queues(waiting=[], running=running) # Step 3: verify request is restored - self.assertTrue(adapter.restore_called) - self.assertIn("req-B", running) - self.assertEqual(len(adapter.waiting_for_chunk_running_requests), 0) + assert adapter.restore_called is True + assert "req-B" in running + assert len(adapter.waiting_for_chunk_running_requests) == 0 def test_requests_lost_without_fix(self): """Demonstrate the bug: without restore in except, request is lost.""" @@ -76,7 +75,7 @@ def test_requests_lost_without_fix(self): running = ["req-A", "req-B"] adapter.process_pending_chunks(waiting=[], running=running) - self.assertEqual(running, ["req-A"]) + assert running == ["req-A"] # Simulate the BUGGY code: except without restore try: @@ -85,8 +84,8 @@ def test_requests_lost_without_fix(self): pass # Bug: no restore_queues call # Request is lost! 
- self.assertNotIn("req-B", running) - self.assertEqual(len(adapter.waiting_for_chunk_running_requests), 1) + assert "req-B" not in running + assert len(adapter.waiting_for_chunk_running_requests) == 1 def test_happy_path_restores_via_finally(self): """When no exception, restore_queues is still called via finally.""" @@ -102,9 +101,5 @@ def test_happy_path_restores_via_finally(self): finally: adapter.restore_queues(waiting=[], running=running) - self.assertTrue(adapter.restore_called) - self.assertIn("req-B", running) - - -if __name__ == "__main__": - unittest.main() + assert adapter.restore_called is True + assert "req-B" in running diff --git a/tests/diffusion/distributed/test_distributed_vae_executor.py b/tests/diffusion/distributed/test_distributed_vae_executor.py index dc491dcdaf1..b2ee7c10d33 100644 --- a/tests/diffusion/distributed/test_distributed_vae_executor.py +++ b/tests/diffusion/distributed/test_distributed_vae_executor.py @@ -1,4 +1,4 @@ -from unittest.mock import MagicMock, patch +from types import SimpleNamespace import pytest import torch @@ -61,40 +61,31 @@ def merge(self, coord_tensor_map, grid_spec): class DummyMixin(DistributedVaeMixin): def __init__(self): self.use_tiling = True - self.distributed_executor = MagicMock() - self.distributed_executor.parallel_size = 2 - self.distributed_executor.group = None + self.distributed_executor = SimpleNamespace(parallel_size=2, group=None) @pytest.fixture(autouse=True) -def mock_dist(): - with ( - patch.object(dist, "get_world_size", return_value=2), - patch.object(dist, "get_rank", return_value=0), - patch.object(dist, "is_initialized", return_value=True), - patch.object(dist, "all_reduce", return_value=None), - patch.object(dist, "gather", return_value=None), - patch.object(dist, "broadcast", return_value=None), - ): - yield +def mock_dist(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setattr(dist, "get_world_size", lambda *args, **kwargs: 2) + monkeypatch.setattr(dist, "get_rank", lambda *args, 
**kwargs: 0) + monkeypatch.setattr(dist, "is_initialized", lambda: True) + monkeypatch.setattr(dist, "all_reduce", lambda *args, **kwargs: None) + monkeypatch.setattr(dist, "gather", lambda *args, **kwargs: None) + monkeypatch.setattr(dist, "broadcast", lambda *args, **kwargs: None) @pytest.fixture(autouse=True) -def mock_dit_group(): - with patch( +def mock_dit_group(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setattr( "vllm_omni.diffusion.distributed.autoencoders.distributed_vae_executor.get_dit_group", - new=MagicMock(return_value=None), - ): - yield + lambda: None, + ) @pytest.fixture(autouse=True) -def mock_dist_vae_executor(): - with ( - patch.object(DistributedVaeExecutor, "gather_tensors", side_effect=lambda x: [x]), - patch.object(DistributedVaeExecutor, "broadcast_tensor", side_effect=lambda x: x), - ): - yield +def mock_dist_vae_executor(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setattr(DistributedVaeExecutor, "gather_tensors", lambda self, x: [x]) + monkeypatch.setattr(DistributedVaeExecutor, "broadcast_tensor", lambda self, x: x) # ============================ diff --git a/tests/diffusion/models/bagel/test_trajectory_recording.py b/tests/diffusion/models/bagel/test_trajectory_recording.py index 80b3f9d9ba7..345eac10784 100644 --- a/tests/diffusion/models/bagel/test_trajectory_recording.py +++ b/tests/diffusion/models/bagel/test_trajectory_recording.py @@ -4,10 +4,10 @@ import types from dataclasses import dataclass -from unittest.mock import MagicMock, patch import pytest import torch +from pytest_mock import MockerFixture from vllm_omni.diffusion.models.bagel.bagel_transformer import ( Bagel, @@ -23,9 +23,9 @@ EXPECTED_STEPS = NUM_TIMESTEPS - 1 -def _make_mock_bagel(): +def _make_mock_bagel(mocker: MockerFixture): """Create a mock Bagel with forward returning constant velocity.""" - mock = MagicMock(spec=Bagel) + mock = mocker.MagicMock(spec=Bagel) mock._sp_size = 1 # forward returns a small constant velocity so x_t changes each step @@ 
-78,18 +78,22 @@ def _make_generate_args(num_tokens=NUM_TOKENS, hidden_dim=HIDDEN_DIM, cfg=False) @pytest.fixture(params=[False, True], ids=["no_cfg", "batched_cfg"]) -def bagel_and_args(request): +def bagel_and_args( + request, + monkeypatch: pytest.MonkeyPatch, + mocker: MockerFixture, +): """Mock Bagel instance and generate_image arguments. Parametrized over CFG mode so every test runs on both the no-CFG and batched-CFG code paths. """ cfg = request.param - with patch( + monkeypatch.setattr( "vllm_omni.diffusion.models.bagel.bagel_transformer.get_classifier_free_guidance_world_size", - return_value=1, - ): - yield _make_mock_bagel(), _make_generate_args(cfg=cfg) + lambda: 1, + ) + yield _make_mock_bagel(mocker), _make_generate_args(cfg=cfg) class TestTrajectoryRecording: @@ -188,12 +192,16 @@ class TestTrajectoryLogProbs: """Tests for log-prob recording when a scheduler is provided.""" @pytest.fixture() - def bagel_scheduler_args(self): - with patch( + def bagel_scheduler_args( + self, + monkeypatch: pytest.MonkeyPatch, + mocker: MockerFixture, + ): + monkeypatch.setattr( "vllm_omni.diffusion.models.bagel.bagel_transformer.get_classifier_free_guidance_world_size", - return_value=1, - ): - yield _make_mock_bagel(), _make_generate_args(), _MockScheduler() + lambda: 1, + ) + yield _make_mock_bagel(mocker), _make_generate_args(), _MockScheduler() def test_log_probs_recorded_with_scheduler(self, bagel_scheduler_args): bagel, args, scheduler = bagel_scheduler_args diff --git a/tests/diffusion/models/flux2/test_flux2_transformer_tp.py b/tests/diffusion/models/flux2/test_flux2_transformer_tp.py index faad08afd1c..54dda1dd07e 100644 --- a/tests/diffusion/models/flux2/test_flux2_transformer_tp.py +++ b/tests/diffusion/models/flux2/test_flux2_transformer_tp.py @@ -1,7 +1,6 @@ -from unittest.mock import MagicMock, patch - import pytest import torch +from pytest_mock import MockerFixture from tests.utils import hardware_test from 
vllm_omni.diffusion.models.flux2.flux2_transformer import ( @@ -12,14 +11,17 @@ # Initialize TP group before tests @pytest.fixture(scope="function", autouse=True) -def setup_tp_group(): +def setup_tp_group(mocker: MockerFixture): """Set up TP group for each test function""" - with patch("vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", return_value=2): - with patch("vllm.distributed.parallel_state.get_tp_group") as mock_get_tp_group: - mock_tp_group = MagicMock() - mock_tp_group.world_size = 2 - mock_get_tp_group.return_value = mock_tp_group - yield + mocker.patch( + "vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", + return_value=2, + ) + mock_get_tp_group = mocker.patch("vllm.distributed.parallel_state.get_tp_group") + mock_tp_group = mocker.MagicMock() + mock_tp_group.world_size = 2 + mock_get_tp_group.return_value = mock_tp_group + yield class TestFlux2TransformerWeightLoading: diff --git a/tests/diffusion/offloader/test_sequential_backend.py b/tests/diffusion/offloader/test_sequential_backend.py index d18637a780e..2539cc06895 100644 --- a/tests/diffusion/offloader/test_sequential_backend.py +++ b/tests/diffusion/offloader/test_sequential_backend.py @@ -3,8 +3,6 @@ """Unit tests for SequentialOffloadBackend.""" -from unittest.mock import patch - import pytest import torch from torch import nn @@ -44,7 +42,7 @@ def mock(self): class TestMoveParamsPinMemory: - def test_dtensor_skips_pin_memory(self, accelerator_device): + def test_dtensor_skips_pin_memory(self, accelerator_device, monkeypatch: pytest.MonkeyPatch): """DTensor should skip pin_memory to avoid RuntimeError.""" module = _create_simple_module().to(accelerator_device) tracker, mock_pin = _track_pin_memory_calls() @@ -56,73 +54,73 @@ def fake_isinstance(obj, cls): return True return original_isinstance(obj, cls) - with patch.object(torch.Tensor, "pin_memory", mock_pin): - with patch("builtins.isinstance", fake_isinstance): - hook = SequentialOffloadHook( - 
offload_targets=[], - device=accelerator_device, - pin_memory=True, - use_hsdp=False, - ) - hook._move_params( - module, - torch.device("cpu"), - non_blocking=False, - pin_memory=True, - ) - assert not tracker["called"], "pin_memory should not be called for DTensor" - - def test_regular_tensor_calls_pin_memory(self, accelerator_device): + monkeypatch.setattr(torch.Tensor, "pin_memory", mock_pin) + monkeypatch.setattr("builtins.isinstance", fake_isinstance) + hook = SequentialOffloadHook( + offload_targets=[], + device=accelerator_device, + pin_memory=True, + use_hsdp=False, + ) + hook._move_params( + module, + torch.device("cpu"), + non_blocking=False, + pin_memory=True, + ) + assert not tracker["called"], "pin_memory should not be called for DTensor" + + def test_regular_tensor_calls_pin_memory(self, accelerator_device, monkeypatch: pytest.MonkeyPatch): """Regular tensor should call pin_memory when moving to CPU.""" module = _create_simple_module().to(accelerator_device) tracker, mock_pin = _track_pin_memory_calls() - with patch.object(torch.Tensor, "pin_memory", mock_pin): - hook = SequentialOffloadHook( - offload_targets=[], - device=accelerator_device, - pin_memory=True, - use_hsdp=False, - ) - hook._move_params( - module, - torch.device("cpu"), - non_blocking=False, - pin_memory=True, - ) - assert tracker["called"], "pin_memory should be called for regular tensors" - - def test_pin_memory_skipped_when_disabled(self, accelerator_device): + monkeypatch.setattr(torch.Tensor, "pin_memory", mock_pin) + hook = SequentialOffloadHook( + offload_targets=[], + device=accelerator_device, + pin_memory=True, + use_hsdp=False, + ) + hook._move_params( + module, + torch.device("cpu"), + non_blocking=False, + pin_memory=True, + ) + assert tracker["called"], "pin_memory should be called for regular tensors" + + def test_pin_memory_skipped_when_disabled(self, accelerator_device, monkeypatch: pytest.MonkeyPatch): """pin_memory should not be called when pin_memory=False.""" 
module = _create_simple_module().to(accelerator_device) tracker, mock_pin = _track_pin_memory_calls() - with patch.object(torch.Tensor, "pin_memory", mock_pin): - hook = SequentialOffloadHook( - offload_targets=[], - device=accelerator_device, - pin_memory=False, - use_hsdp=False, - ) - hook._move_params( - module, - torch.device("cpu"), - non_blocking=False, - pin_memory=False, - ) - assert not tracker["called"], "pin_memory should not be called when disabled" - - def test_pin_memory_skipped_for_non_cpu_target(self, accelerator_device): + monkeypatch.setattr(torch.Tensor, "pin_memory", mock_pin) + hook = SequentialOffloadHook( + offload_targets=[], + device=accelerator_device, + pin_memory=False, + use_hsdp=False, + ) + hook._move_params( + module, + torch.device("cpu"), + non_blocking=False, + pin_memory=False, + ) + assert not tracker["called"], "pin_memory should not be called when disabled" + + def test_pin_memory_skipped_for_non_cpu_target(self, accelerator_device, monkeypatch: pytest.MonkeyPatch): """pin_memory should not be called for non-CPU targets.""" module = _create_simple_module().to("cpu") tracker, mock_pin = _track_pin_memory_calls() - with patch.object(torch.Tensor, "pin_memory", mock_pin): - hook = SequentialOffloadHook( - offload_targets=[], - device=torch.device("cpu"), - pin_memory=True, - use_hsdp=False, - ) - hook._move_params(module, accelerator_device, non_blocking=False, pin_memory=True) - assert not tracker["called"], "pin_memory should not be called for non-CPU target" + monkeypatch.setattr(torch.Tensor, "pin_memory", mock_pin) + hook = SequentialOffloadHook( + offload_targets=[], + device=torch.device("cpu"), + pin_memory=True, + use_hsdp=False, + ) + hook._move_params(module, accelerator_device, non_blocking=False, pin_memory=True) + assert not tracker["called"], "pin_memory should not be called for non-CPU target" diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py index 
d4d5aa5a7fe..875277ece42 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/diffusion/quantization/test_int8_config.py @@ -2,8 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Unit tests for Int8 quantization config.""" -from unittest.mock import MagicMock, patch - import pytest import torch from pytest_mock import MockerFixture @@ -102,7 +100,7 @@ def test_quantization_config_string_and_dict_equivalent(): assert config_str.quantization_config.activation_scheme == config_dict.quantization_config.activation_scheme -def test_get_quant_method(mocker: MockerFixture): +def test_get_quant_method(mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch): """Test for get_quant_method method for GPU""" from vllm_omni.quantization.int8_config import Int8OnlineLinearMethod @@ -111,18 +109,16 @@ def test_get_quant_method(mocker: MockerFixture): def _fake_init(self, quant_config): pass - layer = MagicMock(spec=LinearBase) + layer = mocker.Mock(spec=LinearBase) mocker.patch.object(Int8OnlineLinearMethod, "__init__", _fake_init) prefix = "test_layer" # Mock the platform to be GPU - with ( - patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=True), - patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=False), - ): - method = config.get_quant_method(layer, prefix) - assert isinstance(method, Int8OnlineLinearMethod) + monkeypatch.setattr(current_omni_platform, "is_cuda", lambda: True) + monkeypatch.setattr(current_omni_platform, "is_npu", lambda: False) + method = config.get_quant_method(layer, prefix) + assert isinstance(method, Int8OnlineLinearMethod) # Test skipping quantization for a layer config.ignored_layers = [prefix] @@ -130,22 +126,20 @@ def _fake_init(self, quant_config): assert isinstance(method, UnquantizedLinearMethod) -def test_get_npu_quant_method(): +def test_get_npu_quant_method(mocker: MockerFixture, monkeypatch: pytest.MonkeyPatch): """Test for get_quant_method method for 
NPU""" from vllm_omni.quantization.int8_config import NPUInt8OnlineLinearMethod config = build_quant_config("int8") - layer = MagicMock(spec=LinearBase) + layer = mocker.Mock(spec=LinearBase) prefix = "test_layer" # Mock the platform to be NPU - with ( - patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=False), - patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=True), - ): - method = config.get_quant_method(layer, prefix) - assert isinstance(method, NPUInt8OnlineLinearMethod) + monkeypatch.setattr(current_omni_platform, "is_cuda", lambda: False) + monkeypatch.setattr(current_omni_platform, "is_npu", lambda: True) + method = config.get_quant_method(layer, prefix) + assert isinstance(method, NPUInt8OnlineLinearMethod) # Test skipping quantization for a layer config.ignored_layers = [prefix] @@ -245,7 +239,7 @@ class TestNPUInt8LinearMethod: @pytest.fixture def mock_torch_npu(self, mocker): - torch_npu = MagicMock() + torch_npu = mocker.MagicMock() mocker.patch("vllm_omni.quantization.int8_config.torch_npu", return_value=torch_npu) mocker.patch( diff --git a/tests/diffusion/test_diffusion_scheduler.py b/tests/diffusion/test_diffusion_scheduler.py index 4324ba1e630..a64d9920e03 100644 --- a/tests/diffusion/test_diffusion_scheduler.py +++ b/tests/diffusion/test_diffusion_scheduler.py @@ -4,10 +4,10 @@ import queue import threading from types import SimpleNamespace -from unittest.mock import Mock, patch import pytest import torch +from pytest_mock import MockerFixture from vllm_omni.diffusion.data import DiffusionOutput, DiffusionRequestAbortedError from vllm_omni.diffusion.diffusion_engine import DiffusionEngine @@ -97,19 +97,19 @@ def initialize(self, od_config) -> None: def add_request(self, request: OmniDiffusionRequest) -> str: assert request is self._request - self._state = Mock(sched_req_id=self._sched_req_id, req=request) + self._state = SimpleNamespace(sched_req_id=self._sched_req_id, req=request) return 
self._sched_req_id def schedule(self): if self._scheduled or self._state is None: - return Mock( + return SimpleNamespace( scheduled_new_reqs=[], scheduled_cached_reqs=CachedRequestData.make_empty(), scheduled_req_ids=[], is_empty=True, ) self._scheduled = True - return Mock( + return SimpleNamespace( scheduled_new_reqs=[NewRequestData.from_state(self._state)], scheduled_cached_reqs=CachedRequestData.make_empty(), scheduled_req_ids=[self._state.sched_req_id], @@ -153,7 +153,7 @@ def close(self) -> None: class TestRequestScheduler: def setup_method(self) -> None: self.scheduler: RequestScheduler = RequestScheduler() - self.scheduler.initialize(Mock()) + self.scheduler.initialize(SimpleNamespace()) def test_single_request_success_lifecycle(self) -> None: req_id = self.scheduler.add_request(_make_request("a")) @@ -276,23 +276,23 @@ def test_request_id_mapping_lifecycle(self) -> None: class TestDiffusionEngine: - def test_add_req_and_wait_for_response_single_path(self) -> None: + def test_add_req_and_wait_for_response_single_path(self, mocker: MockerFixture) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.scheduler = RequestScheduler() - engine.scheduler.initialize(Mock()) + engine.scheduler.initialize(SimpleNamespace()) engine._rpc_lock = threading.RLock() engine.abort_queue = queue.Queue() request = _make_request("engine") runner_output = _make_request_output("engine") - engine.execute_fn = Mock(return_value=runner_output) + engine.execute_fn = mocker.Mock(return_value=runner_output) output = engine.add_req_and_wait_for_response(request) assert output is runner_output.result engine.execute_fn.assert_called_once() - def test_supports_scheduler_interface_injection(self) -> None: + def test_supports_scheduler_interface_injection(self, mocker: MockerFixture) -> None: request = _make_request("engine_iface") runner_output = _make_request_output("engine_iface") scheduler = _StubScheduler(request, runner_output) @@ -301,33 +301,45 @@ def 
test_supports_scheduler_interface_injection(self) -> None: engine.scheduler = scheduler engine._rpc_lock = threading.RLock() engine.abort_queue = queue.Queue() - engine.execute_fn = Mock(return_value=runner_output) + engine.execute_fn = mocker.Mock(return_value=runner_output) output = engine.add_req_and_wait_for_response(request) assert output is runner_output.result engine.execute_fn.assert_called_once() - def test_initializes_injected_scheduler(self) -> None: + def test_initializes_injected_scheduler( + self, + monkeypatch: pytest.MonkeyPatch, + mocker: MockerFixture, + ) -> None: request = _make_request("init") scheduler = _StubScheduler(request, DiffusionOutput(output=None)) - od_config = Mock(model_class_name="mock_model") - fake_executor_cls = Mock(return_value=Mock()) + od_config = SimpleNamespace(model_class_name="mock_model") + fake_executor_cls = mocker.Mock(return_value=mocker.Mock()) - with ( - patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_post_process_func", return_value=None), - patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_pre_process_func", return_value=None), - patch("vllm_omni.diffusion.diffusion_engine.DiffusionExecutor.get_class", return_value=fake_executor_cls), - patch.object(DiffusionEngine, "_dummy_run", return_value=None), - ): - DiffusionEngine(od_config, scheduler=scheduler) + monkeypatch.setattr( + "vllm_omni.diffusion.diffusion_engine.get_diffusion_post_process_func", + lambda *args, **kwargs: None, + ) + monkeypatch.setattr( + "vllm_omni.diffusion.diffusion_engine.get_diffusion_pre_process_func", + lambda *args, **kwargs: None, + ) + monkeypatch.setattr( + "vllm_omni.diffusion.diffusion_engine.DiffusionExecutor.get_class", + lambda *args, **kwargs: fake_executor_cls, + ) + monkeypatch.setattr(DiffusionEngine, "_dummy_run", lambda self: None) + + DiffusionEngine(od_config, scheduler=scheduler) assert scheduler.initialized_with is od_config fake_executor_cls.assert_called_once_with(od_config) def 
test_scheduler_alias_keeps_default_request_scheduler(self) -> None: scheduler = Scheduler() - scheduler.initialize(Mock()) + scheduler.initialize(SimpleNamespace()) req_id = scheduler.add_request(_make_request("alias")) sched_output = scheduler.schedule() @@ -336,10 +348,10 @@ def test_scheduler_alias_keeps_default_request_scheduler(self) -> None: assert req_id in finished assert scheduler.get_request_state(req_id).status == DiffusionRequestStatus.FINISHED_COMPLETED - def test_step_raises_aborted_error(self) -> None: + def test_step_raises_aborted_error(self, mocker: MockerFixture) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.pre_process_func = None - engine.add_req_and_wait_for_response = Mock( + engine.add_req_and_wait_for_response = mocker.Mock( return_value=DiffusionOutput(aborted=True, abort_message="Request req-abort aborted.") ) @@ -349,7 +361,7 @@ def test_step_raises_aborted_error(self) -> None: def test_abort_queue_marks_request_finished_aborted(self) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.scheduler = RequestScheduler() - engine.scheduler.initialize(Mock()) + engine.scheduler.initialize(SimpleNamespace()) engine.abort_queue = queue.Queue() req_id = engine.scheduler.add_request(_make_request("req-abort")) @@ -361,7 +373,7 @@ def test_abort_queue_marks_request_finished_aborted(self) -> None: def test_finalize_finished_request_returns_aborted_output(self) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) engine.scheduler = RequestScheduler() - engine.scheduler.initialize(Mock()) + engine.scheduler.initialize(SimpleNamespace()) req_id = engine.scheduler.add_request(_make_request("req-finalize")) engine.scheduler.finish_requests(req_id, DiffusionRequestStatus.FINISHED_ABORTED) @@ -371,29 +383,40 @@ def test_finalize_finished_request_returns_aborted_output(self) -> None: assert output.aborted is True assert output.abort_message == "Request req-finalize aborted." 
- def test_initializes_step_scheduler_when_step_execution_enabled(self) -> None: - od_config = Mock(model_class_name="mock_model") + def test_initializes_step_scheduler_when_step_execution_enabled( + self, + monkeypatch: pytest.MonkeyPatch, + mocker: MockerFixture, + ) -> None: + od_config = SimpleNamespace(model_class_name="mock_model") od_config.step_execution = True - fake_executor = Mock() - fake_executor_cls = Mock(return_value=fake_executor) + fake_executor = mocker.Mock() + fake_executor_cls = mocker.Mock(return_value=fake_executor) - with ( - patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_post_process_func", return_value=None), - patch("vllm_omni.diffusion.diffusion_engine.get_diffusion_pre_process_func", return_value=None), - patch("vllm_omni.diffusion.diffusion_engine.DiffusionExecutor.get_class", return_value=fake_executor_cls), - patch.object(DiffusionEngine, "_dummy_run", return_value=None), - ): - engine = DiffusionEngine(od_config) + monkeypatch.setattr( + "vllm_omni.diffusion.diffusion_engine.get_diffusion_post_process_func", + lambda *args, **kwargs: None, + ) + monkeypatch.setattr( + "vllm_omni.diffusion.diffusion_engine.get_diffusion_pre_process_func", + lambda *args, **kwargs: None, + ) + monkeypatch.setattr( + "vllm_omni.diffusion.diffusion_engine.DiffusionExecutor.get_class", + lambda *args, **kwargs: fake_executor_cls, + ) + monkeypatch.setattr(DiffusionEngine, "_dummy_run", lambda self: None) + engine = DiffusionEngine(od_config) assert isinstance(engine.scheduler, StepScheduler) assert engine.execute_fn is fake_executor.execute_step fake_executor_cls.assert_called_once_with(od_config) - def test_dummy_run_raises_on_output_error(self) -> None: + def test_dummy_run_raises_on_output_error(self, mocker: MockerFixture) -> None: engine = DiffusionEngine.__new__(DiffusionEngine) - engine.od_config = Mock(model_class_name="mock_model") + engine.od_config = SimpleNamespace(model_class_name="mock_model") engine.pre_process_func = None - 
engine.add_req_and_wait_for_response = Mock(return_value=DiffusionOutput(error="boom")) + engine.add_req_and_wait_for_response = mocker.Mock(return_value=DiffusionOutput(error="boom")) with pytest.raises(RuntimeError, match="Dummy run failed: boom"): engine._dummy_run() @@ -402,7 +425,7 @@ def test_dummy_run_raises_on_output_error(self) -> None: class TestStepScheduler: def setup_method(self) -> None: self.scheduler: StepScheduler = StepScheduler() - self.scheduler.initialize(Mock()) + self.scheduler.initialize(SimpleNamespace()) def test_single_request_step_lifecycle(self) -> None: request = _make_step_request("step", num_inference_steps=3) diff --git a/tests/diffusion/test_diffusion_step_pipeline.py b/tests/diffusion/test_diffusion_step_pipeline.py index 68aba9ba3bf..42687d4a1ed 100644 --- a/tests/diffusion/test_diffusion_step_pipeline.py +++ b/tests/diffusion/test_diffusion_step_pipeline.py @@ -7,10 +7,10 @@ import threading from contextlib import contextmanager from types import SimpleNamespace -from unittest.mock import Mock import pytest import torch +from pytest_mock import MockerFixture import vllm_omni.diffusion.worker.diffusion_model_runner as model_runner_module from tests.utils import hardware_test @@ -542,11 +542,11 @@ def test_rejects_lora_requests_in_step_mode(self): class TestExecutor: """MultiprocDiffusionExecutor.execute_step""" - def test_execute_step_passes_through_runner_output(self): + def test_execute_step_passes_through_runner_output(self, mocker: MockerFixture): executor = object.__new__(MultiprocDiffusionExecutor) executor._ensure_open = lambda: None expected = RunnerOutput(req_id="req-step", step_index=1, finished=False, result=None) - executor.collective_rpc = Mock(return_value=expected) + executor.collective_rpc = mocker.Mock(return_value=expected) request = _make_engine_request("req-step", num_inference_steps=2) scheduler_output = _make_scheduler_output(request, sched_req_id="req-step") @@ -578,9 +578,9 @@ class TestEngine: ), ], ) - 
def test_step_engine_returns_error(self, execute_fn, expected_error): + def test_step_engine_returns_error(self, execute_fn, expected_error, mocker: MockerFixture): scheduler = StepScheduler() - scheduler.initialize(Mock()) + scheduler.initialize(mocker.Mock()) engine = _make_engine(scheduler, execute_fn=execute_fn) output = engine.add_req_and_wait_for_response(_make_engine_request("req-error", num_inference_steps=2)) @@ -588,9 +588,9 @@ def test_step_engine_returns_error(self, execute_fn, expected_error): assert output.output is None assert expected_error in output.error - def test_step_execution_completes(self): + def test_step_execution_completes(self, mocker: MockerFixture): scheduler = StepScheduler() - scheduler.initialize(Mock()) + scheduler.initialize(mocker.Mock()) engine = _make_engine(scheduler) request = _make_engine_request("req-step", num_inference_steps=2) @@ -614,9 +614,9 @@ def execute_fn(_): assert output.error is None assert torch.equal(output.output, torch.tensor([2.0])) - def test_step_abort_stops_rescheduling_after_first_step(self): + def test_step_abort_stops_rescheduling_after_first_step(self, mocker: MockerFixture): scheduler = StepScheduler() - scheduler.initialize(Mock()) + scheduler.initialize(mocker.Mock()) engine = _make_engine(scheduler) request = _make_engine_request("req-stop", num_inference_steps=4) @@ -639,9 +639,9 @@ def execute_fn(_): assert step["n"] == 1 _assert_aborted_output(output, "req-stop") - def test_step_abort_after_reschedule_returns_aborted_output(self): + def test_step_abort_after_reschedule_returns_aborted_output(self, mocker: MockerFixture): scheduler = StepScheduler() - scheduler.initialize(Mock()) + scheduler.initialize(mocker.Mock()) engine = _make_engine(scheduler) request = _make_engine_request("req-mid", num_inference_steps=4) @@ -666,9 +666,9 @@ def execute_fn(sched_output): assert step["n"] == 2 _assert_aborted_output(output, "req-mid") - def test_finished_step_without_result_returns_error(self): + def 
test_finished_step_without_result_returns_error(self, mocker: MockerFixture): scheduler = StepScheduler() - scheduler.initialize(Mock()) + scheduler.initialize(mocker.Mock()) engine = _make_engine( scheduler, execute_fn=lambda _: RunnerOutput( diff --git a/tests/diffusion/test_diffusion_worker_cuda_profiler.py b/tests/diffusion/test_diffusion_worker_cuda_profiler.py index ddc2aed2fc2..4a3b22c212e 100644 --- a/tests/diffusion/test_diffusion_worker_cuda_profiler.py +++ b/tests/diffusion/test_diffusion_worker_cuda_profiler.py @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from unittest.mock import MagicMock - import pytest from pytest_mock import MockerFixture @@ -55,8 +53,8 @@ def test_profile_start_stop_delegates_to_cuda_profiler( mock_diffusion_worker_dependencies, ): fake_profiler = mocker.Mock() - fake_profiler.start = MagicMock() - fake_profiler.stop = MagicMock() + fake_profiler.start = mocker.Mock() + fake_profiler.stop = mocker.Mock() mocker.patch( "vllm_omni.diffusion.worker.diffusion_worker.CudaProfilerWrapper", return_value=fake_profiler, diff --git a/tests/diffusion/test_multiproc_engine_concurrency.py b/tests/diffusion/test_multiproc_engine_concurrency.py index 517f98ddaa9..4bc3e05fe91 100644 --- a/tests/diffusion/test_multiproc_engine_concurrency.py +++ b/tests/diffusion/test_multiproc_engine_concurrency.py @@ -3,7 +3,7 @@ import queue import threading -from unittest.mock import Mock, patch +from types import SimpleNamespace import pytest import torch @@ -24,11 +24,9 @@ def _tagged_output(tag: str) -> DiffusionOutput: return DiffusionOutput(output=torch.tensor([0]), error=tag) -def _mock_request(tag: str) -> Mock: - """Return a mock ``OmniDiffusionRequest`` identifiable by *tag*.""" - req = Mock() - req.request_ids = [tag] - return req +def _mock_request(tag: str): + """Return a lightweight request object identifiable by *tag*.""" + return SimpleNamespace(request_ids=[tag]) def 
_make_executor(num_gpus: int = 1): @@ -36,20 +34,18 @@ def _make_executor(num_gpus: int = 1): Returns ``(executor, request_queue, result_queue)``. """ - od_cfg = Mock() - od_cfg.num_gpus = num_gpus - - with patch.object(MultiprocDiffusionExecutor, "_init_executor"): - executor = MultiprocDiffusionExecutor(od_cfg) + od_cfg = SimpleNamespace(num_gpus=num_gpus) + monkeypatch = pytest.MonkeyPatch() + monkeypatch.setattr(MultiprocDiffusionExecutor, "_init_executor", lambda self: None) + executor = MultiprocDiffusionExecutor(od_cfg) + monkeypatch.undo() req_q: queue.Queue = queue.Queue() res_q: queue.Queue = queue.Queue() - mock_broadcast_mq = Mock() - mock_broadcast_mq.enqueue = req_q.put + mock_broadcast_mq = SimpleNamespace(enqueue=req_q.put) - mock_rmq = Mock() - mock_rmq.dequeue = lambda timeout=None: res_q.get(timeout=timeout if timeout is not None else 10) + mock_rmq = SimpleNamespace(dequeue=lambda timeout=None: res_q.get(timeout=timeout if timeout is not None else 10)) executor._broadcast_mq = mock_broadcast_mq executor._result_mq = mock_rmq @@ -63,7 +59,7 @@ def _make_engine(num_gpus: int = 1): executor, req_q, res_q = _make_executor(num_gpus) engine = DiffusionEngine.__new__(DiffusionEngine) sched = RequestScheduler() - sched.initialize(Mock()) + sched.initialize(SimpleNamespace()) engine.scheduler = sched engine.executor = executor engine._rpc_lock = threading.RLock() diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index cb1f31164ca..a1fc18f8456 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -6,7 +6,7 @@ import argparse import inspect -from unittest.mock import Mock +from types import SimpleNamespace import pytest from pydantic import ValidationError @@ -102,7 +102,7 @@ def test_qwen3_tts_codec_frame_rate_patching(): vllm_config = EngineArgs().create_model_config() # Create a mock talking config with a dummy value for position_id_per_seconds - mock_talker_config = Mock() + mock_talker_config = 
SimpleNamespace() mock_talker_config.position_id_per_seconds = 12.3 vllm_config.hf_config.talker_config = mock_talker_config @@ -146,13 +146,12 @@ def test_stage_specific_text_config_override(): # Switch the created hf text config with a mock whose # values we want to pull through the text config helper stage_text_config = vllm_config.hf_text_config - vllm_config.hf_text_config = Mock() + vllm_config.hf_text_config = SimpleNamespace() stage_text_config.sliding_window = 4096 stage_text_config.attention_chunk_size = 2048 # Move the stage config's text config getter & thinker config - mock_stage_config = Mock() - mock_stage_config.get_text_config.return_value = stage_text_config + mock_stage_config = SimpleNamespace(get_text_config=lambda: stage_text_config) vllm_config.hf_config.thinker_config = mock_stage_config # Ensure that create from a vLLM config correctly pulls the diff --git a/tests/engine/test_async_omni_engine_input.py b/tests/engine/test_async_omni_engine_input.py index ed6a7277b46..3700e426d42 100644 --- a/tests/engine/test_async_omni_engine_input.py +++ b/tests/engine/test_async_omni_engine_input.py @@ -1,6 +1,5 @@ -from unittest.mock import Mock - import pytest +from pytest_mock import MockerFixture from vllm.sampling_params import SamplingParams from vllm.v1.engine import EngineCoreRequest @@ -24,18 +23,18 @@ def _make_engine_core_request() -> EngineCoreRequest: ) -def test_build_add_request_message_preserves_additional_information(): +def test_build_add_request_message_preserves_additional_information(mocker: MockerFixture): engine = object.__new__(AsyncOmniEngine) params = SamplingParams(max_tokens=8) engine.default_sampling_params_list = [params] engine.stage_metadata = [{"stage_type": "llm"}] engine.supported_tasks = ("speech",) - input_processor = Mock() + input_processor = mocker.Mock() input_processor.process_inputs.return_value = _make_engine_core_request() engine.input_processor = input_processor - output_processor = Mock() + output_processor 
= mocker.Mock() engine.output_processors = [output_processor] prompt = { @@ -63,18 +62,18 @@ def test_build_add_request_message_preserves_additional_information(): output_processor.add_request.assert_called_once() -def test_build_add_request_message_with_resumable_streaming(): +def test_build_add_request_message_with_resumable_streaming(mocker: MockerFixture): engine = object.__new__(AsyncOmniEngine) params = SamplingParams(max_tokens=8) engine.default_sampling_params_list = [params] engine.stage_metadata = [{"stage_type": "llm"}] engine.supported_tasks = ("generate",) - input_processor = Mock() + input_processor = mocker.Mock() input_processor.process_inputs.return_value = _make_engine_core_request() engine.input_processor = input_processor - output_processor = Mock() + output_processor = mocker.Mock() engine.output_processors = [output_processor] msg = engine._build_add_request_message( diff --git a/tests/engine/test_async_omni_engine_outputs.py b/tests/engine/test_async_omni_engine_outputs.py index ccf9e8cb6b6..ef3cfab3bf8 100644 --- a/tests/engine/test_async_omni_engine_outputs.py +++ b/tests/engine/test_async_omni_engine_outputs.py @@ -5,36 +5,36 @@ """ import queue -from unittest.mock import MagicMock import pytest +from pytest_mock import MockerFixture from vllm_omni.engine.async_omni_engine import AsyncOmniEngine pytestmark = [pytest.mark.core_model, pytest.mark.cpu] -def _make_engine(output_queue, *, thread_alive: bool = True) -> AsyncOmniEngine: +def _make_engine(output_queue, mocker: MockerFixture, *, thread_alive: bool = True) -> AsyncOmniEngine: """Create an AsyncOmniEngine bypassing __init__.""" engine = object.__new__(AsyncOmniEngine) engine.output_queue = output_queue - engine.orchestrator_thread = MagicMock( - is_alive=MagicMock(return_value=thread_alive), + engine.orchestrator_thread = mocker.MagicMock( + is_alive=mocker.MagicMock(return_value=thread_alive), ) return engine -def test_try_get_output_raises_after_orchestrator_dies(): +def 
test_try_get_output_raises_after_orchestrator_dies(mocker: MockerFixture): """Draining remaining results then hitting an empty queue with a dead orchestrator must raise RuntimeError so callers know the pipeline is gone.""" - mock_queue = MagicMock() + mock_queue = mocker.MagicMock() # First call succeeds; second call finds the queue empty. mock_queue.sync_q.get.side_effect = [ {"type": "output", "request_id": "r1"}, queue.Empty, ] - engine = _make_engine(mock_queue, thread_alive=True) + engine = _make_engine(mock_queue, mocker, thread_alive=True) # Collect the one buffered result. assert engine.try_get_output()["request_id"] == "r1" @@ -47,15 +47,15 @@ def test_try_get_output_raises_after_orchestrator_dies(): @pytest.mark.asyncio -async def test_try_get_output_async_raises_after_orchestrator_dies(): +async def test_try_get_output_async_raises_after_orchestrator_dies(mocker: MockerFixture): """Same scenario as above but for the async variant.""" - mock_queue = MagicMock() + mock_queue = mocker.MagicMock() mock_queue.sync_q.get_nowait.side_effect = [ {"type": "output", "request_id": "r1"}, queue.Empty, ] - engine = _make_engine(mock_queue, thread_alive=True) + engine = _make_engine(mock_queue, mocker, thread_alive=True) assert (await engine.try_get_output_async())["request_id"] == "r1" diff --git a/tests/engine/test_single_stage_mode.py b/tests/engine/test_single_stage_mode.py index 2c5bf6cc79c..608e92ac49e 100644 --- a/tests/engine/test_single_stage_mode.py +++ b/tests/engine/test_single_stage_mode.py @@ -17,10 +17,11 @@ import threading from contextlib import contextmanager +from types import SimpleNamespace from typing import Any -from unittest.mock import MagicMock, Mock, patch import pytest +from pytest_mock import MockerFixture from vllm.v1.engine.utils import EngineZmqAddresses from vllm_omni.engine.async_omni_engine import AsyncOmniEngine @@ -41,31 +42,33 @@ # --------------------------------------------------------------------------- -def 
_make_stage_cfg(stage_id: int, stage_type: str = "llm") -> Mock: +def _make_stage_cfg(stage_id: int, stage_type: str = "llm"): """Return a lightweight stage config mock.""" - cfg = Mock() - cfg.stage_id = stage_id - cfg.stage_type = stage_type - cfg.engine_args = MagicMock() - cfg.engine_args.async_chunk = False - cfg.engine_args.model_stage = None - cfg.engine_args.engine_output_type = None - return cfg + return SimpleNamespace( + stage_id=stage_id, + stage_type=stage_type, + engine_args=SimpleNamespace( + async_chunk=False, + model_stage=None, + engine_output_type=None, + ), + ) def _make_started_llm_stage(stage_id: int) -> StartedLlmStage: """Return a minimal StartedLlmStage for mocking.""" - addresses = Mock() - addresses.inputs = ["tcp://127.0.0.1:5000"] - addresses.outputs = ["tcp://127.0.0.1:5001"] - addresses.frontend_stats_publish_address = None + addresses = SimpleNamespace( + inputs=["tcp://127.0.0.1:5000"], + outputs=["tcp://127.0.0.1:5001"], + frontend_stats_publish_address=None, + ) return StartedLlmStage( stage_id=stage_id, - metadata=Mock(stage_id=stage_id), - vllm_config=Mock(), - executor_class=Mock(), - engine_manager=Mock(), - coordinator=Mock(), + metadata=SimpleNamespace(stage_id=stage_id), + vllm_config=SimpleNamespace(), + executor_class=SimpleNamespace(), + engine_manager=SimpleNamespace(), + coordinator=SimpleNamespace(), addresses=addresses, ) @@ -348,74 +351,80 @@ class TestSingleStageModeDetection: the orchestrator thread, so no actual engines are started. 
""" - def _make_engine_no_thread(self, **kwargs: Any) -> AsyncOmniEngine: + def _make_engine_no_thread(self, mocker: MockerFixture, **kwargs: Any) -> AsyncOmniEngine: """Create an AsyncOmniEngine without starting the orchestrator thread.""" stage_cfg = _make_stage_cfg(0) mock_stage_configs = [stage_cfg] - with ( - patch.object( - AsyncOmniEngine, - "_resolve_stage_configs", - return_value=("/fake/path", mock_stage_configs), - ), - patch.object( - AsyncOmniEngine, - "_bootstrap_orchestrator", - ), - patch("threading.Thread") as mock_thread_cls, - patch("concurrent.futures.Future") as mock_future_cls, - ): - mock_future = Mock() - mock_future.result.return_value = Mock() # simulates a loop - mock_future_cls.return_value = mock_future + mocker.patch.object( + AsyncOmniEngine, + "_resolve_stage_configs", + return_value=("/fake/path", mock_stage_configs), + ) + mocker.patch.object( + AsyncOmniEngine, + "_bootstrap_orchestrator", + ) + mock_thread_cls = mocker.patch("threading.Thread") + mock_future_cls = mocker.patch("concurrent.futures.Future") + + mock_future = mocker.Mock() + mock_future.result.return_value = mocker.Mock() # simulates a loop + mock_future_cls.return_value = mock_future - mock_thread = Mock() - mock_thread.is_alive.return_value = False - mock_thread_cls.return_value = mock_thread + mock_thread = mocker.Mock() + mock_thread.is_alive.return_value = False + mock_thread_cls.return_value = mock_thread - engine = AsyncOmniEngine(model="fake-model", **kwargs) + engine = AsyncOmniEngine(model="fake-model", **kwargs) return engine - def test_explicit_single_stage_mode_true(self): + def test_explicit_single_stage_mode_true(self, mocker: MockerFixture): engine = self._make_engine_no_thread( + mocker, single_stage_mode=True, omni_master_address="127.0.0.1", omni_master_port=20000, ) assert engine.single_stage_mode is True - def test_stage_id_kwarg_promotes_to_single_stage_mode(self): + def test_stage_id_kwarg_promotes_to_single_stage_mode(self, mocker: 
MockerFixture): engine = self._make_engine_no_thread( + mocker, stage_id=0, omni_master_address="127.0.0.1", omni_master_port=20001, ) assert engine.single_stage_mode is True - def test_stage_id_kwarg_sets_filter(self): + def test_stage_id_kwarg_sets_filter(self, mocker: MockerFixture): engine = self._make_engine_no_thread( + mocker, stage_id=1, omni_master_address="127.0.0.1", omni_master_port=20002, ) assert engine._single_stage_id_filter == 1 - def test_no_stage_id_no_single_stage_mode(self): - engine = self._make_engine_no_thread() + def test_no_stage_id_no_single_stage_mode(self, mocker: MockerFixture): + engine = self._make_engine_no_thread( + mocker, + ) assert engine.single_stage_mode is False assert engine._single_stage_id_filter is None - def test_single_stage_mode_without_stage_id_has_no_filter(self): + def test_single_stage_mode_without_stage_id_has_no_filter(self, mocker: MockerFixture): engine = self._make_engine_no_thread( + mocker, single_stage_mode=True, omni_master_address="127.0.0.1", omni_master_port=20003, ) assert engine._single_stage_id_filter is None - def test_master_address_and_port_stored(self): + def test_master_address_and_port_stored(self, mocker: MockerFixture): engine = self._make_engine_no_thread( + mocker, stage_id=0, omni_master_address="10.0.0.1", omni_master_port=12345, @@ -423,8 +432,10 @@ def test_master_address_and_port_stored(self): assert engine._omni_master_address == "10.0.0.1" assert engine._omni_master_port == 12345 - def test_omni_master_server_starts_as_none(self): - engine = self._make_engine_no_thread() + def test_omni_master_server_starts_as_none(self, mocker: MockerFixture): + engine = self._make_engine_no_thread( + mocker, + ) assert engine._omni_master_server is None @@ -448,7 +459,7 @@ class TestInitializeStagesRouting: def _build_engine_skeleton( self, - stage_cfgs: list[Mock], + stage_cfgs: list[Any], single_stage_mode: bool, stage_id_filter: int | None, omni_master_address: str = "127.0.0.1", @@ -478,8 
+489,8 @@ def _build_engine_skeleton( engine.prompt_expand_func = None return engine - def _fake_metadata(self, stage_id: int, stage_type: str = "llm") -> Mock: - meta = Mock() + def _fake_metadata(self, mocker: MockerFixture, stage_id: int, stage_type: str = "llm") -> Any: + meta = mocker.Mock() meta.stage_id = stage_id meta.stage_type = stage_type meta.runtime_cfg = {} @@ -492,13 +503,14 @@ def _fake_metadata(self, stage_id: int, stage_type: str = "llm") -> Mock: def _run_initialize_stages_mocked( self, + mocker: MockerFixture, engine: AsyncOmniEngine, - stage_cfgs: list[Mock], + stage_cfgs: list[Any], *, launch_side_effect: Any = None, remote_side_effect: Any = None, attach_result: Any = None, - ) -> tuple[Mock, Mock]: + ) -> tuple[Any, Any]: """Execute _initialize_stages with all heavy helpers mocked. Returns (mock_launch_llm_stage, mock_create_remote_llm_stage). @@ -509,167 +521,217 @@ def _run_initialize_stages_mocked( if getattr(cfg, "stage_type", "llm") != "diffusion" } - default_attach = (Mock(), Mock(), Mock(), Mock()) + default_attach = (mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()) - mock_launch = Mock( + mock_launch = mocker.Mock( side_effect=launch_side_effect or (lambda cfg, meta, spec, timeout, llm_stage_launch_lock, kv: started_by_stage[meta.stage_id]) ) - mock_remote = Mock( + mock_remote = mocker.Mock( side_effect=remote_side_effect or (lambda cfg, meta, spec, timeout, srv: started_by_stage[meta.stage_id]) ) - mock_attach = Mock(return_value=attach_result or default_attach) + mock_attach = mocker.Mock(return_value=attach_result or default_attach) - mock_oms = Mock(spec=OmniMasterServer) - mock_oms.get_zmq_addresses.side_effect = lambda sid: Mock() + mock_oms = mocker.Mock(spec=OmniMasterServer) + mock_oms.get_zmq_addresses.side_effect = lambda sid: mocker.Mock() finalized = ( - [Mock() for _ in stage_cfgs], - [Mock() for _ in stage_cfgs], + [mocker.Mock() for _ in stage_cfgs], + [mocker.Mock() for _ in stage_cfgs], [{"final_output": 
True, "final_output_type": None, "stage_type": "llm"} for _ in stage_cfgs], ) - with ( - patch.object(engine, "_launch_llm_stage", mock_launch), - patch.object(engine, "_create_remote_llm_stage", mock_remote), - patch.object(engine, "_attach_llm_stage", mock_attach), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch( - "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", - return_value=None, - ), - patch( - "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", - return_value={}, - ), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", - return_value=(None, None, None), - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id, getattr(cfg, "stage_type", "llm")), - ), - patch( - "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", - return_value=finalized, + mocker.patch.object(engine, "_launch_llm_stage", mock_launch) + mocker.patch.object(engine, "_create_remote_llm_stage", mock_remote) + mocker.patch.object(engine, "_attach_llm_stage", mock_attach) + mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.prepare_engine_environment", + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata( + mocker, + cfg.stage_id, + getattr(cfg, "stage_type", "llm"), ), - ): - 
engine._initialize_stages(stage_init_timeout=60) + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) return mock_launch, mock_remote # -- single-stage mode: stage matches filter → local launch --------------- - def test_matching_stage_uses_launch_llm_stage(self): + def test_matching_stage_uses_launch_llm_stage(self, mocker: MockerFixture): """stage_id == _single_stage_id_filter → _launch_llm_stage is called.""" stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) - mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + mock_launch, mock_remote = self._run_initialize_stages_mocked(mocker, engine, stage_cfgs) launched_ids = [c.args[1].stage_id for c in mock_launch.call_args_list] assert 0 in launched_ids, "_launch_llm_stage should be called for stage 0" - def test_non_matching_stage_uses_create_remote_llm_stage(self): + def test_non_matching_stage_uses_create_remote_llm_stage(self, mocker: MockerFixture): """stage_id != _single_stage_id_filter → _create_remote_llm_stage is called.""" stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) - mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + mock_launch, mock_remote = self._run_initialize_stages_mocked(mocker, engine, stage_cfgs) remote_ids = [c.args[1].stage_id for c in mock_remote.call_args_list] assert 1 in remote_ids, "_create_remote_llm_stage should be called for stage 1" - def test_filter_1_routes_correctly(self): + def test_filter_1_routes_correctly(self, mocker: MockerFixture): """With filter=1, stage 0 is remote and stage 1 is local.""" stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] engine = self._build_engine_skeleton(stage_cfgs, 
single_stage_mode=True, stage_id_filter=1) - mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + mock_launch, mock_remote = self._run_initialize_stages_mocked(mocker, engine, stage_cfgs) launched_ids = [c.args[1].stage_id for c in mock_launch.call_args_list] remote_ids = [c.args[1].stage_id for c in mock_remote.call_args_list] assert 1 in launched_ids, "stage 1 should be launched locally with filter=1" assert 0 in remote_ids, "stage 0 should use remote path with filter=1" - def test_no_filter_all_stages_use_launch_path(self): + def test_no_filter_all_stages_use_launch_path(self, mocker: MockerFixture): """single_stage_mode=True but no filter → all stages use _launch_llm_stage.""" stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=None) - mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + mock_launch, mock_remote = self._run_initialize_stages_mocked(mocker, engine, stage_cfgs) assert mock_remote.call_count == 0, "No remote launches without a filter" launched_ids = [c.args[1].stage_id for c in mock_launch.call_args_list] assert set(launched_ids) == {0, 1} - def test_non_single_stage_mode_never_calls_create_remote(self): + def test_non_single_stage_mode_never_calls_create_remote(self, mocker: MockerFixture): """Outside single_stage_mode, _create_remote_llm_stage must not be called.""" stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=False, stage_id_filter=None) - mock_launch, mock_remote = self._run_initialize_stages_mocked(engine, stage_cfgs) + mock_launch, mock_remote = self._run_initialize_stages_mocked(mocker, engine, stage_cfgs) assert mock_remote.call_count == 0 - def test_omni_master_server_started_in_single_stage_mode(self): + def test_omni_master_server_started_in_single_stage_mode(self, mocker: MockerFixture): 
"""OmniMasterServer.start() must be called when single_stage_mode=True.""" stage_cfgs = [_make_stage_cfg(0)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) - mock_oms = Mock(spec=OmniMasterServer) - mock_oms.get_zmq_addresses.return_value = Mock() - finalized = ([Mock()], [Mock()], [{"final_output": True, "final_output_type": None, "stage_type": "llm"}]) - - with ( - patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)), - patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(0)), - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id), - ), - patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + mock_oms = mocker.Mock(spec=OmniMasterServer) + mock_oms.get_zmq_addresses.return_value = mocker.Mock() + finalized = ( + [mocker.Mock()], + [mocker.Mock()], + [{"final_output": True, "final_output_type": None, "stage_type": "llm"}], + ) + + mocker.patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)) + mocker.patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(0)) + mocker.patch.object( + engine, + "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), 
mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(mocker, cfg.stage_id), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) mock_oms.start.assert_called_once() - def test_omni_master_server_uses_configured_stage_ids(self): + def test_omni_master_server_uses_configured_stage_ids(self, mocker: MockerFixture): """Configured stage IDs, not list indexes, should drive pre-allocation.""" stage_cfgs = [_make_stage_cfg(7), _make_stage_cfg(11)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=7) - mock_oms = Mock(spec=OmniMasterServer) - mock_oms.get_zmq_addresses.return_value = Mock() + mock_oms = mocker.Mock(spec=OmniMasterServer) + mock_oms.get_zmq_addresses.return_value = mocker.Mock() finalized = ( - [Mock(), Mock()], - [Mock(), Mock()], + [mocker.Mock(), mocker.Mock()], + [mocker.Mock(), mocker.Mock()], [{"final_output": False, "final_output_type": None, "stage_type": "llm"} for _ in stage_cfgs], ) - with ( - patch.object( - engine, "_launch_llm_stage", side_effect=[_make_started_llm_stage(7), _make_started_llm_stage(11)] - ), - patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(11)), - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), 
Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms) as mock_oms_cls, - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id), - ), - patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + mocker.patch.object( + engine, + "_launch_llm_stage", + side_effect=[_make_started_llm_stage(7), _make_started_llm_stage(11)], + ) + mocker.patch.object( + engine, + "_create_remote_llm_stage", + return_value=_make_started_llm_stage(11), + ) + mocker.patch.object( + engine, + "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()), + ) + mock_oms_cls = mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(mocker, cfg.stage_id), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + 
engine._initialize_stages(stage_init_timeout=60) mock_oms_cls.assert_called_once_with( master_address=engine._omni_master_address, @@ -677,73 +739,121 @@ def test_omni_master_server_uses_configured_stage_ids(self): stage_ids=[7, 11], ) - def test_single_stage_filter_uses_configured_stage_ids(self): + def test_single_stage_filter_uses_configured_stage_ids(self, mocker: MockerFixture): """Local/remote dispatch should compare against configured stage IDs.""" stage_cfgs = [_make_stage_cfg(7), _make_stage_cfg(11)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=7) - mock_oms = Mock(spec=OmniMasterServer) + mock_oms = mocker.Mock(spec=OmniMasterServer) finalized = ( - [Mock(), Mock()], - [Mock(), Mock()], + [mocker.Mock(), mocker.Mock()], + [mocker.Mock(), mocker.Mock()], [{"final_output": False, "final_output_type": None, "stage_type": "llm"} for _ in stage_cfgs], ) - with ( - patch.object(engine, "_launch_llm_stage", side_effect=[_make_started_llm_stage(7)]) as mock_launch, - patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(11)) as mock_remote, - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id), - ), - patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + 
mock_launch = mocker.patch.object( + engine, + "_launch_llm_stage", + side_effect=[_make_started_llm_stage(7)], + ) + mock_remote = mocker.patch.object( + engine, + "_create_remote_llm_stage", + return_value=_make_started_llm_stage(11), + ) + mocker.patch.object( + engine, + "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(mocker, cfg.stage_id), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) assert [call.args[1].stage_id for call in mock_launch.call_args_list] == [7] assert [call.args[1].stage_id for call in mock_remote.call_args_list] == [11] - def test_omni_master_server_preallocates_diffusion_stage_ids(self): + def test_omni_master_server_preallocates_diffusion_stage_ids(self, mocker: MockerFixture): """Diffusion stages should also receive OmniMasterServer allocations.""" stage_cfgs = [_make_stage_cfg(7), _make_stage_cfg(11, stage_type="diffusion")] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=7) - mock_oms = Mock(spec=OmniMasterServer) + mock_oms = mocker.Mock(spec=OmniMasterServer) finalized = ( - [Mock(), Mock()], - [Mock(), Mock()], + [mocker.Mock(), mocker.Mock()], + [mocker.Mock(), 
mocker.Mock()], [ {"final_output": False, "final_output_type": None, "stage_type": "llm"}, {"final_output": False, "final_output_type": None, "stage_type": "diffusion"}, ], ) - with ( - patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(7)), - patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(7)), - patch.object(engine, "_launch_diffusion_stage", return_value=Mock()), - patch.object(engine, "_create_remote_diffusion_stage", return_value=Mock()), - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms) as mock_oms_cls, - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id, getattr(cfg, "stage_type", "llm")), + mocker.patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(7)) + mocker.patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(7)) + mocker.patch.object(engine, "_launch_diffusion_stage", return_value=mocker.Mock()) + mocker.patch.object( + engine, + "_create_remote_diffusion_stage", + return_value=mocker.Mock(), + ) + mocker.patch.object( + engine, + "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()), + ) + mock_oms_cls = mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + 
"vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata( + mocker, + cfg.stage_id, + getattr(cfg, "stage_type", "llm"), ), - patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) mock_oms_cls.assert_called_once_with( master_address=engine._omni_master_address, @@ -751,135 +861,200 @@ def test_omni_master_server_preallocates_diffusion_stage_ids(self): stage_ids=[7, 11], ) - def test_duplicate_llm_stage_ids_raise(self): + def test_duplicate_llm_stage_ids_raise(self, mocker: MockerFixture): """Duplicate configured LLM stage IDs should fail fast.""" stage_cfgs = [_make_stage_cfg(3), _make_stage_cfg(3)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=3) - with ( - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - pytest.raises(ValueError, match="Duplicate stage_id"), - ): + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + with pytest.raises(ValueError, match="Duplicate stage_id"): engine._initialize_stages(stage_init_timeout=60) - def test_omni_master_server_not_started_in_normal_mode(self): + def 
test_omni_master_server_not_started_in_normal_mode(self, mocker: MockerFixture): """OmniMasterServer must NOT be instantiated outside single_stage_mode.""" stage_cfgs = [_make_stage_cfg(0)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=False, stage_id_filter=None) - finalized = ([Mock()], [Mock()], [{"final_output": True, "final_output_type": None, "stage_type": "llm"}]) - - with ( - patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)), - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer") as mock_oms_cls, - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id), - ), - patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + finalized = ( + [mocker.Mock()], + [mocker.Mock()], + [{"final_output": True, "final_output_type": None, "stage_type": "llm"}], + ) + + mocker.patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)) + mocker.patch.object( + engine, + "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()), + ) + mock_oms_cls = mocker.patch("vllm_omni.engine.async_omni_engine.OmniMasterServer") + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + 
mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata(mocker, cfg.stage_id), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) mock_oms_cls.assert_not_called() - def test_single_stage_mode_missing_master_address_raises(self): + def test_single_stage_mode_missing_master_address_raises(self, mocker: MockerFixture): """single_stage_mode without master address/port raises ValueError.""" stage_cfgs = [_make_stage_cfg(0)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) engine._omni_master_address = None # missing engine._omni_master_port = None - with ( - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - pytest.raises(ValueError, match="omni_master_address"), - ): + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + with pytest.raises(ValueError, match="omni_master_address"): engine._initialize_stages(stage_init_timeout=60) - def test_matching_diffusion_stage_uses_local_registered_launch(self): + def test_matching_diffusion_stage_uses_local_registered_launch(self, mocker: MockerFixture): """A local diffusion stage should use the registered single-stage launch path.""" stage_cfgs = [_make_stage_cfg(0, stage_type="diffusion"), _make_stage_cfg(1)] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) - mock_oms = 
Mock(spec=OmniMasterServer) - diffusion_client = Mock(stage_type="diffusion") + mock_oms = mocker.Mock(spec=OmniMasterServer) + diffusion_client = mocker.Mock(stage_type="diffusion") finalized = ( - [diffusion_client, Mock()], - [Mock(), Mock()], + [diffusion_client, mocker.Mock()], + [mocker.Mock(), mocker.Mock()], [ {"final_output": False, "final_output_type": None, "stage_type": "diffusion"}, {"final_output": False, "final_output_type": None, "stage_type": "llm"}, ], ) - with ( - patch.object(engine, "_launch_diffusion_stage", return_value=diffusion_client) as mock_local_diff, - patch.object(engine, "_create_remote_diffusion_stage") as mock_remote_diff, - patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(1)), - patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(1)), - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id, getattr(cfg, "stage_type", "llm")), + mock_local_diff = mocker.patch.object( + engine, + "_launch_diffusion_stage", + return_value=diffusion_client, + ) + mock_remote_diff = mocker.patch.object(engine, "_create_remote_diffusion_stage") + mocker.patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(1)) + mocker.patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(1)) + mocker.patch.object( + engine, 
+ "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata( + mocker, + cfg.stage_id, + getattr(cfg, "stage_type", "llm"), ), - patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) assert mock_local_diff.call_count == 1 assert mock_local_diff.call_args.args[1].stage_id == 0 mock_remote_diff.assert_not_called() - def test_non_matching_diffusion_stage_uses_remote_diffusion_client(self): + def test_non_matching_diffusion_stage_uses_remote_diffusion_client(self, mocker: MockerFixture): """A non-local diffusion stage should attach via the remote diffusion path.""" stage_cfgs = [_make_stage_cfg(0), _make_stage_cfg(1, stage_type="diffusion")] engine = self._build_engine_skeleton(stage_cfgs, single_stage_mode=True, stage_id_filter=0) - mock_oms = Mock(spec=OmniMasterServer) - remote_diffusion_client = Mock(stage_type="diffusion") + mock_oms = mocker.Mock(spec=OmniMasterServer) + remote_diffusion_client = mocker.Mock(stage_type="diffusion") finalized = ( - [Mock(), remote_diffusion_client], - [Mock(), Mock()], + 
[mocker.Mock(), remote_diffusion_client], + [mocker.Mock(), mocker.Mock()], [ {"final_output": False, "final_output_type": None, "stage_type": "llm"}, {"final_output": False, "final_output_type": None, "stage_type": "diffusion"}, ], ) - with ( - patch.object(engine, "_launch_diffusion_stage") as mock_local_diff, - patch.object( - engine, "_create_remote_diffusion_stage", return_value=remote_diffusion_client - ) as mock_remote_diff, - patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)), - patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(0)), - patch.object(engine, "_attach_llm_stage", return_value=(Mock(), Mock(), Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.OmniMasterServer", return_value=mock_oms), - patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment"), - patch("vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", return_value=None), - patch("vllm_omni.engine.async_omni_engine.get_stage_connector_spec", return_value={}), - patch( - "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", return_value=(None, None, None) - ), - patch( - "vllm_omni.engine.async_omni_engine.extract_stage_metadata", - side_effect=lambda cfg: self._fake_metadata(cfg.stage_id, getattr(cfg, "stage_type", "llm")), + mock_local_diff = mocker.patch.object(engine, "_launch_diffusion_stage") + mock_remote_diff = mocker.patch.object( + engine, + "_create_remote_diffusion_stage", + return_value=remote_diffusion_client, + ) + mocker.patch.object(engine, "_launch_llm_stage", return_value=_make_started_llm_stage(0)) + mocker.patch.object(engine, "_create_remote_llm_stage", return_value=_make_started_llm_stage(0)) + mocker.patch.object( + engine, + "_attach_llm_stage", + return_value=(mocker.Mock(), mocker.Mock(), mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.OmniMasterServer", + return_value=mock_oms, + ) + 
mocker.patch("vllm_omni.engine.async_omni_engine.prepare_engine_environment") + mocker.patch( + "vllm_omni.engine.async_omni_engine.load_omni_transfer_config_for_model", + return_value=None, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.get_stage_connector_spec", + return_value={}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.extract_stage_metadata", + side_effect=lambda cfg: self._fake_metadata( + mocker, + cfg.stage_id, + getattr(cfg, "stage_type", "llm"), ), - patch("vllm_omni.engine.async_omni_engine.finalize_initialized_stages", return_value=finalized), - ): - engine._initialize_stages(stage_init_timeout=60) + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.finalize_initialized_stages", + return_value=finalized, + ) + + engine._initialize_stages(stage_init_timeout=60) mock_local_diff.assert_not_called() assert mock_remote_diff.call_count == 1 @@ -894,45 +1069,47 @@ def test_non_matching_diffusion_stage_uses_remote_diffusion_client(self): class TestLaunchDiffusionStage: """Test local diffusion stage launch wiring.""" - def test_registers_stage_with_public_master_properties(self): + def test_registers_stage_with_public_master_properties(self, mocker: MockerFixture): engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" engine.diffusion_batch_size = 4 stage_cfg = _make_stage_cfg(5, stage_type="diffusion") - metadata = Mock(stage_id=5) - omni_master_server = Mock(spec=OmniMasterServer) + metadata = mocker.Mock(stage_id=5) + omni_master_server = mocker.Mock(spec=OmniMasterServer) omni_master_server.address = "127.0.0.1" omni_master_server.port = 25000 - proc = Mock() - diffusion_client = Mock() - - with ( - patch("vllm_omni.engine.async_omni_engine.build_diffusion_config", return_value="diffusion-config"), - patch( - "vllm_omni.engine.async_omni_engine.register_stage_with_omni_master", - 
return_value=( - "tcp://127.0.0.1:25001", - "tcp://127.0.0.1:25002", - "tcp://127.0.0.1:25003", - ), - ) as mock_register, - patch( - "vllm_omni.engine.async_omni_engine.spawn_diffusion_proc", - return_value=(proc, None, None, None), - ) as mock_spawn, - patch("vllm_omni.engine.async_omni_engine.complete_diffusion_handshake") as mock_handshake, - patch( - "vllm_omni.engine.async_omni_engine.StageDiffusionClient.from_addresses", - return_value=diffusion_client, - ) as mock_from_addresses, - ): - result = engine._launch_diffusion_stage( - stage_cfg=stage_cfg, - metadata=metadata, - omni_master_server=omni_master_server, - ) + proc = mocker.Mock() + diffusion_client = mocker.Mock() + + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_diffusion_config", + return_value="diffusion-config", + ) + mock_register = mocker.patch( + "vllm_omni.engine.async_omni_engine.register_stage_with_omni_master", + return_value=( + "tcp://127.0.0.1:25001", + "tcp://127.0.0.1:25002", + "tcp://127.0.0.1:25003", + ), + ) + mock_spawn = mocker.patch( + "vllm_omni.engine.async_omni_engine.spawn_diffusion_proc", + return_value=(proc, None, None, None), + ) + mock_handshake = mocker.patch("vllm_omni.engine.async_omni_engine.complete_diffusion_handshake") + mock_from_addresses = mocker.patch( + "vllm_omni.engine.async_omni_engine.StageDiffusionClient.from_addresses", + return_value=diffusion_client, + ) + + result = engine._launch_diffusion_stage( + stage_cfg=stage_cfg, + metadata=metadata, + omni_master_server=omni_master_server, + ) mock_register.assert_called_once_with( omni_master_address="127.0.0.1", @@ -967,14 +1144,14 @@ def test_registers_stage_with_public_master_properties(self): class TestCreateRemoteLlmStage: """Test _create_remote_llm_stage delegates correctly.""" - def _engine(self) -> AsyncOmniEngine: + def _engine(self, mocker: MockerFixture) -> AsyncOmniEngine: engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" engine.single_stage_mode = True 
engine._single_stage_id_filter = 0 - engine._omni_master_server = Mock(spec=OmniMasterServer) - engine._omni_master_server.get_zmq_addresses.return_value = Mock() - engine._omni_master_server.get_allocation.return_value = Mock() + engine._omni_master_server = mocker.Mock(spec=OmniMasterServer) + engine._omni_master_server.get_zmq_addresses.return_value = mocker.Mock() + engine._omni_master_server.get_allocation.return_value = mocker.Mock() engine._omni_master_server.get_stage_config.return_value = { "stage_id": 0, "stage_type": "llm", @@ -982,42 +1159,40 @@ def _engine(self) -> AsyncOmniEngine: } return engine - @contextmanager - def _patch_build_and_connect(self, stage_id: int): - fake_vllm_config = Mock() - fake_executor_cls = Mock() - fake_addresses = Mock() + def _mock_build_and_connect(self, mocker: MockerFixture, stage_id: int): + fake_vllm_config = mocker.Mock() + fake_executor_cls = mocker.Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None - eng_mgr = Mock() - coordinator = Mock() + eng_mgr = mocker.Mock() + coordinator = mocker.Mock() @contextmanager def fake_connect_cm(*args, **kwargs): yield eng_mgr, coordinator, fake_addresses - with ( - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": stage_id}, - ), - patch( - "vllm_omni.engine.async_omni_engine.build_vllm_config", - return_value=(fake_vllm_config, fake_executor_cls), - ), - patch( - "vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", - return_value=fake_connect_cm(), - ) as mock_connect, - ): - yield mock_connect, fake_vllm_config, fake_executor_cls, fake_addresses + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": stage_id}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + 
return_value=(fake_vllm_config, fake_executor_cls), + ) + mock_connect = mocker.patch( + "vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", + return_value=fake_connect_cm(), + ) + + return mock_connect, fake_vllm_config, fake_executor_cls, fake_addresses - def test_returns_started_llm_stage_with_correct_stage_id(self): - engine = self._engine() + def test_returns_started_llm_stage_with_correct_stage_id(self, mocker: MockerFixture): + engine = self._engine(mocker) stage_cfg = _make_stage_cfg(1) - metadata = Mock(stage_id=1) + metadata = mocker.Mock(stage_id=1) omni_ms = engine._omni_master_server omni_ms.get_stage_config.return_value = { "stage_id": 1, @@ -1025,93 +1200,93 @@ def test_returns_started_llm_stage_with_correct_stage_id(self): "engine_args": {}, } - with self._patch_build_and_connect(1): - result = engine._create_remote_llm_stage( - stage_cfg=stage_cfg, - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - omni_master_server=omni_ms, - ) + self._mock_build_and_connect(mocker, 1) + result = engine._create_remote_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + omni_master_server=omni_ms, + ) assert isinstance(result, StartedLlmStage) assert result.stage_id == 1 - def test_connect_remote_engine_cores_called_with_stage_id(self): - engine = self._engine() + def test_connect_remote_engine_cores_called_with_stage_id(self, mocker: MockerFixture): + engine = self._engine(mocker) stage_cfg = _make_stage_cfg(2) - metadata = Mock(stage_id=2) + metadata = mocker.Mock(stage_id=2) omni_ms = engine._omni_master_server - omni_ms.get_zmq_addresses.return_value = Mock(inputs=["x"], outputs=["y"]) + omni_ms.get_zmq_addresses.return_value = mocker.Mock(inputs=["x"], outputs=["y"]) omni_ms.get_stage_config.return_value = { "stage_id": 2, "stage_type": "llm", "engine_args": {}, } - fake_vllm_config = Mock() - fake_executor_cls = Mock() - fake_addresses = Mock() + fake_vllm_config = 
mocker.Mock() + fake_executor_cls = mocker.Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None @contextmanager def fake_connect_cm(*args, **kwargs): - yield Mock(), Mock(), fake_addresses + yield mocker.Mock(), mocker.Mock(), fake_addresses - with ( - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": 2}, - ), - patch( - "vllm_omni.engine.async_omni_engine.build_vllm_config", - return_value=(fake_vllm_config, fake_executor_cls), - ), - patch( - "vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", return_value=fake_connect_cm() - ) as mock_connect, - ): - engine._create_remote_llm_stage( - stage_cfg=stage_cfg, - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - omni_master_server=omni_ms, - ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 2}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(fake_vllm_config, fake_executor_cls), + ) + mock_connect = mocker.patch( + "vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", + return_value=fake_connect_cm(), + ) + + engine._create_remote_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + omni_master_server=omni_ms, + ) mock_connect.assert_called_once() _, kwargs = mock_connect.call_args assert kwargs.get("stage_id") == 2 or mock_connect.call_args.args[-1] == 2 omni_ms.get_stage_config.assert_called_once_with(2, timeout_s=60) - def test_missing_registered_stage_config_raises_value_error(self): - engine = self._engine() + def test_missing_registered_stage_config_raises_value_error(self, mocker: MockerFixture): + engine = self._engine(mocker) stage_cfg = _make_stage_cfg(3) - metadata = Mock(stage_id=3) 
+ metadata = mocker.Mock(stage_id=3) omni_ms = engine._omni_master_server omni_ms.get_stage_config.return_value = None - with patch("vllm_omni.engine.async_omni_engine.build_engine_args_dict") as mock_build_args: - with pytest.raises( - ValueError, - match="Remote stage 3 registered without stage config", - ): - engine._create_remote_llm_stage( - stage_cfg=stage_cfg, - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - omni_master_server=omni_ms, - ) + mock_build_args = mocker.patch("vllm_omni.engine.async_omni_engine.build_engine_args_dict") + with pytest.raises( + ValueError, + match="Remote stage 3 registered without stage config", + ): + engine._create_remote_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + omni_master_server=omni_ms, + ) mock_build_args.assert_not_called() - def test_exception_during_connect_closes_started_stage(self): + def test_exception_during_connect_closes_started_stage(self, mocker: MockerFixture): """If an error occurs after StartedLlmStage creation, close_started_llm_stage is called.""" - engine = self._engine() + engine = self._engine(mocker) stage_cfg = _make_stage_cfg(1) - metadata = Mock(stage_id=1) + metadata = mocker.Mock(stage_id=1) omni_ms = engine._omni_master_server omni_ms.get_stage_config.return_value = { "stage_id": 1, @@ -1121,26 +1296,30 @@ def test_exception_during_connect_closes_started_stage(self): @contextmanager def boom(*args, **kwargs): - yield Mock(), Mock(), Mock() + yield mocker.Mock(), mocker.Mock(), mocker.Mock() raise RuntimeError("handshake failed") - with ( - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": 1}, - ), - patch("vllm_omni.engine.async_omni_engine.build_vllm_config", return_value=(Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", return_value=boom()), - 
patch("vllm_omni.engine.async_omni_engine.close_started_llm_stage") as mock_close, - ): - with pytest.raises(RuntimeError, match="handshake failed"): - engine._create_remote_llm_stage( - stage_cfg=stage_cfg, - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - omni_master_server=omni_ms, - ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 1}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.connect_remote_engine_cores", + return_value=boom(), + ) + mock_close = mocker.patch("vllm_omni.engine.async_omni_engine.close_started_llm_stage") + with pytest.raises(RuntimeError, match="handshake failed"): + engine._create_remote_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + omni_master_server=omni_ms, + ) mock_close.assert_called_once() @@ -1148,27 +1327,29 @@ class TestConnectRemoteEngineCoresCoordinator: """Test coordinator launch parity with launch_core_engines.""" @staticmethod - def _build_vllm_config(*, dp_rank: int = 0, offline_mode: bool = False, needs_dp_coordinator: bool = True) -> Mock: - parallel_config = Mock() + def _build_vllm_config( + mocker: MockerFixture, *, dp_rank: int = 0, offline_mode: bool = False, needs_dp_coordinator: bool = True + ) -> Any: + parallel_config = mocker.Mock() parallel_config.data_parallel_size_local = 1 parallel_config.data_parallel_size = 2 parallel_config.data_parallel_rank = dp_rank parallel_config.data_parallel_rank_local = 0 if offline_mode else None - vllm_config = Mock() + vllm_config = mocker.Mock() vllm_config.parallel_config = parallel_config vllm_config.needs_dp_coordinator = needs_dp_coordinator - vllm_config.model_config = Mock(is_moe=False) + vllm_config.model_config = mocker.Mock(is_moe=False) return vllm_config - def 
test_uses_registered_coordinator_addresses(self): - vllm_config = self._build_vllm_config(dp_rank=0, offline_mode=False, needs_dp_coordinator=True) + def test_uses_registered_coordinator_addresses(self, mocker: MockerFixture): + vllm_config = self._build_vllm_config(mocker, dp_rank=0, offline_mode=False, needs_dp_coordinator=True) - omni_master_server = Mock(spec=OmniMasterServer) + omni_master_server = mocker.Mock(spec=OmniMasterServer) omni_master_server.get_zmq_addresses.return_value = EngineZmqAddresses( inputs=["tcp://client-in"], outputs=["tcp://client-out"] ) - omni_master_server.get_allocation.return_value = Mock(handshake_bind_address="tcp://127.0.0.1:26001") + omni_master_server.get_allocation.return_value = mocker.Mock(handshake_bind_address="tcp://127.0.0.1:26001") omni_master_server.get_stage_coordinator_addresses.return_value = StageCoordinatorAddresses( coordinator_input="tcp://coord-in", coordinator_output="tcp://coord-out", @@ -1177,103 +1358,107 @@ def test_uses_registered_coordinator_addresses(self): @contextmanager def fake_socket_ctx(*args, **kwargs): - yield Mock() + yield mocker.Mock() - with ( - patch("vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", return_value=fake_socket_ctx()), - patch("vllm_omni.engine.stage_engine_startup._wait_for_omni_engine_startup") as mock_wait, - ): - with connect_remote_engine_cores( - vllm_config=vllm_config, - omni_master_server=omni_master_server, - stage_id=7, - ) as (_, yielded_coordinator, yielded_addresses): - assert yielded_coordinator is None - assert yielded_addresses.coordinator_input == "tcp://coord-in" - assert yielded_addresses.coordinator_output == "tcp://coord-out" - assert yielded_addresses.frontend_stats_publish_address == "tcp://stats" + mocker.patch( + "vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", + return_value=fake_socket_ctx(), + ) + mock_wait = mocker.patch("vllm_omni.engine.stage_engine_startup._wait_for_omni_engine_startup") + with connect_remote_engine_cores( + 
vllm_config=vllm_config, + omni_master_server=omni_master_server, + stage_id=7, + ) as (_, yielded_coordinator, yielded_addresses): + assert yielded_coordinator is None + assert yielded_addresses.coordinator_input == "tcp://coord-in" + assert yielded_addresses.coordinator_output == "tcp://coord-out" + assert yielded_addresses.frontend_stats_publish_address == "tcp://stats" omni_master_server.get_stage_coordinator_addresses.assert_called_once_with(7) mock_wait.assert_called_once() - def test_defaults_to_no_coordinator_addresses_when_none_registered(self): + def test_defaults_to_no_coordinator_addresses_when_none_registered(self, mocker: MockerFixture): vllm_config = self._build_vllm_config( + mocker, dp_rank=0, offline_mode=False, needs_dp_coordinator=True, ) - omni_master_server = Mock(spec=OmniMasterServer) + omni_master_server = mocker.Mock(spec=OmniMasterServer) omni_master_server.get_zmq_addresses.return_value = EngineZmqAddresses( inputs=["tcp://client-in"], outputs=["tcp://client-out"] ) - omni_master_server.get_allocation.return_value = Mock(handshake_bind_address="tcp://127.0.0.1:26001") + omni_master_server.get_allocation.return_value = mocker.Mock(handshake_bind_address="tcp://127.0.0.1:26001") omni_master_server.get_stage_coordinator_addresses.return_value = StageCoordinatorAddresses() @contextmanager def fake_socket_ctx(*args, **kwargs): - yield Mock() + yield mocker.Mock() - with ( - patch("vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", return_value=fake_socket_ctx()), - patch("vllm_omni.engine.stage_engine_startup._wait_for_omni_engine_startup"), - ): - with connect_remote_engine_cores( - vllm_config=vllm_config, - omni_master_server=omni_master_server, - stage_id=7, - ) as (_, yielded_coordinator, yielded_addresses): - assert yielded_coordinator is None - assert yielded_addresses.coordinator_input is None - assert yielded_addresses.coordinator_output is None - assert yielded_addresses.frontend_stats_publish_address is None + mocker.patch( + 
"vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", + return_value=fake_socket_ctx(), + ) + mocker.patch("vllm_omni.engine.stage_engine_startup._wait_for_omni_engine_startup") + with connect_remote_engine_cores( + vllm_config=vllm_config, + omni_master_server=omni_master_server, + stage_id=7, + ) as (_, yielded_coordinator, yielded_addresses): + assert yielded_coordinator is None + assert yielded_addresses.coordinator_input is None + assert yielded_addresses.coordinator_output is None + assert yielded_addresses.frontend_stats_publish_address is None class TestLaunchOmniCoreEngines: """Tests for local omni engine launch wiring.""" - def test_registers_stage_once_and_reuses_handshake_for_all_local_engines(self): - parallel_config = Mock( + def test_registers_stage_once_and_reuses_handshake_for_all_local_engines(self, mocker: MockerFixture): + parallel_config = mocker.Mock( data_parallel_size_local=2, data_parallel_size=4, data_parallel_rank=3, ) - vllm_config = Mock(parallel_config=parallel_config) + vllm_config = mocker.Mock(parallel_config=parallel_config) - omni_master_server = Mock(spec=OmniMasterServer) + omni_master_server = mocker.Mock(spec=OmniMasterServer) omni_master_server.address = "127.0.0.1" omni_master_server.port = 26000 - omni_master_server.get_allocation.return_value = Mock(handshake_bind_address="tcp://127.0.0.1:26001") + omni_master_server.get_allocation.return_value = mocker.Mock(handshake_bind_address="tcp://127.0.0.1:26001") stage_config = {"stage_id": 7, "stage_type": "llm"} - local_engine_manager = Mock() + local_engine_manager = mocker.Mock() @contextmanager def fake_socket_ctx(*args, **kwargs): - yield Mock() - - with ( - patch( - "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", - return_value="tcp://127.0.0.1:26001", - ) as mock_register, - patch("vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", return_value=fake_socket_ctx()), - patch( - "vllm_omni.engine.stage_engine_startup.CoreEngineProcManager", - 
return_value=local_engine_manager, - ) as mock_manager_cls, - patch("vllm_omni.engine.stage_engine_startup.wait_for_engine_startup"), - ): - with launch_omni_core_engines( - vllm_config=vllm_config, - executor_class=Mock(), - log_stats=False, - omni_master_server=omni_master_server, - stage_id=7, - stage_config=stage_config, - ) as (yielded_manager, yielded_coordinator, yielded_addresses): - assert yielded_manager is local_engine_manager - assert yielded_coordinator is None + yield mocker.Mock() + + mock_register = mocker.patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value="tcp://127.0.0.1:26001", + ) + mocker.patch( + "vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", + return_value=fake_socket_ctx(), + ) + mock_manager_cls = mocker.patch( + "vllm_omni.engine.stage_engine_startup.CoreEngineProcManager", + return_value=local_engine_manager, + ) + mocker.patch("vllm_omni.engine.stage_engine_startup.wait_for_engine_startup") + with launch_omni_core_engines( + vllm_config=vllm_config, + executor_class=mocker.Mock(), + log_stats=False, + omni_master_server=omni_master_server, + stage_id=7, + stage_config=stage_config, + ) as (yielded_manager, yielded_coordinator, yielded_addresses): + assert yielded_manager is local_engine_manager + assert yielded_coordinator is None mock_register.assert_called_once_with( omni_master_address="127.0.0.1", @@ -1292,55 +1477,56 @@ def fake_socket_ctx(*args, **kwargs): assert manager_kwargs["handshake_address"] == "tcp://127.0.0.1:26001" assert manager_kwargs["executor_class"] is not None - def test_registers_stage_with_coordinator_when_started(self): - parallel_config = Mock( + def test_registers_stage_with_coordinator_when_started(self, mocker: MockerFixture): + parallel_config = mocker.Mock( data_parallel_size_local=1, data_parallel_size=2, data_parallel_rank=0, ) - vllm_config = Mock(parallel_config=parallel_config) + vllm_config = mocker.Mock(parallel_config=parallel_config) 
vllm_config.needs_dp_coordinator = True - vllm_config.model_config = Mock(is_moe=False) + vllm_config.model_config = mocker.Mock(is_moe=False) - omni_master_server = Mock(spec=OmniMasterServer) + omni_master_server = mocker.Mock(spec=OmniMasterServer) omni_master_server.address = "127.0.0.1" omni_master_server.port = 26000 omni_master_server.get_zmq_addresses.return_value = EngineZmqAddresses( inputs=["tcp://client-in"], outputs=["tcp://client-out"] ) - omni_master_server.get_allocation.return_value = Mock(handshake_bind_address="tcp://127.0.0.1:26001") + omni_master_server.get_allocation.return_value = mocker.Mock(handshake_bind_address="tcp://127.0.0.1:26001") - coordinator = Mock() + coordinator = mocker.Mock() coordinator.proc.pid = 1234 coordinator.get_engine_socket_addresses.return_value = ("tcp://coord-in", "tcp://coord-out") coordinator.get_stats_publish_address.return_value = "tcp://stats" @contextmanager def fake_socket_ctx(*args, **kwargs): - yield Mock() - - with ( - patch("vllm_omni.engine.stage_engine_startup.DPCoordinator", return_value=coordinator), - patch( - "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", - return_value="tcp://127.0.0.1:26001", - ) as mock_register, - patch("vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", return_value=fake_socket_ctx()), - patch( - "vllm_omni.engine.stage_engine_startup.CoreEngineProcManager", - return_value=Mock(), - ) as mock_manager_cls, - patch("vllm_omni.engine.stage_engine_startup.wait_for_engine_startup") as mock_wait, + yield mocker.Mock() + + mocker.patch("vllm_omni.engine.stage_engine_startup.DPCoordinator", return_value=coordinator) + mock_register = mocker.patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value="tcp://127.0.0.1:26001", + ) + mocker.patch( + "vllm_omni.engine.stage_engine_startup.zmq_socket_ctx", + return_value=fake_socket_ctx(), + ) + mock_manager_cls = mocker.patch( + 
"vllm_omni.engine.stage_engine_startup.CoreEngineProcManager", + return_value=mocker.Mock(), + ) + mock_wait = mocker.patch("vllm_omni.engine.stage_engine_startup.wait_for_engine_startup") + with launch_omni_core_engines( + vllm_config=vllm_config, + executor_class=mocker.Mock(), + log_stats=False, + omni_master_server=omni_master_server, + stage_id=7, + stage_config={"stage_id": 7}, ): - with launch_omni_core_engines( - vllm_config=vllm_config, - executor_class=Mock(), - log_stats=False, - omni_master_server=omni_master_server, - stage_id=7, - stage_config={"stage_id": 7}, - ): - pass + pass mock_register.assert_called_once_with( omni_master_address="127.0.0.1", @@ -1363,19 +1549,19 @@ class TestLaunchLlmStageSingleStageMode: """Test that _launch_llm_stage selects launch_omni_core_engines when single_stage_mode=True and _omni_master_server is set.""" - def _build_engine_with_oms(self) -> AsyncOmniEngine: + def _build_engine_with_oms(self, mocker: MockerFixture) -> AsyncOmniEngine: engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" engine.single_stage_mode = True engine._single_stage_id_filter = 0 engine._llm_stage_launch_lock = threading.Lock() - mock_oms = Mock(spec=OmniMasterServer) + mock_oms = mocker.Mock(spec=OmniMasterServer) mock_oms.address = "127.0.0.1" mock_oms.port = 25000 - alloc = Mock() + alloc = mocker.Mock() alloc.handshake_bind_address = "tcp://127.0.0.1:25001" mock_oms.get_allocation.return_value = alloc - fake_addresses = Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None @@ -1383,66 +1569,60 @@ def _build_engine_with_oms(self) -> AsyncOmniEngine: engine._omni_master_server = mock_oms return engine - @contextmanager - def _patch_launch_omni_cm(self, stage_id: int): - fake_vllm_config = Mock() - fake_executor_cls = Mock() - fake_addresses = Mock() + def _mock_launch_omni(self, mocker: 
MockerFixture, stage_id: int): + fake_vllm_config = mocker.Mock() + fake_executor_cls = mocker.Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None - eng_mgr = Mock() + eng_mgr = mocker.Mock() @contextmanager def fake_launch_omni(*args, **kwargs): yield eng_mgr, None, fake_addresses - with ( - patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": stage_id}, - ), - patch( - "vllm_omni.engine.async_omni_engine.build_vllm_config", - return_value=(fake_vllm_config, fake_executor_cls), - ), - patch( - "vllm_omni.engine.async_omni_engine.acquire_device_locks", - return_value=[], - ), - patch( - "vllm_omni.engine.async_omni_engine.release_device_locks", - ), - patch( - "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", - return_value=fake_launch_omni(), - ) as mock_launch_omni, - ): - yield mock_launch_omni + mocker.patch("vllm_omni.engine.async_omni_engine.setup_stage_devices") + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": stage_id}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(fake_vllm_config, fake_executor_cls), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.acquire_device_locks", + return_value=[], + ) + mocker.patch("vllm_omni.engine.async_omni_engine.release_device_locks") + return mocker.patch( + "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", + return_value=fake_launch_omni(), + ) - def test_launch_omni_core_engines_used_in_single_stage_mode(self): + def test_launch_omni_core_engines_used_in_single_stage_mode(self, mocker: MockerFixture): """single_stage_mode + _omni_master_server → launch_omni_core_engines.""" - engine = 
self._build_engine_with_oms() - metadata = Mock(stage_id=0, runtime_cfg={}) + engine = self._build_engine_with_oms(mocker) + metadata = mocker.Mock(stage_id=0, runtime_cfg={}) stage_cfg = _make_stage_cfg(0) - with self._patch_launch_omni_cm(0) as mock_launch_omni: - result = engine._launch_llm_stage( - stage_cfg=stage_cfg, - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - llm_stage_launch_lock=threading.Lock(), - ) + mock_launch_omni = self._mock_launch_omni(mocker, 0) + result = engine._launch_llm_stage( + stage_cfg=stage_cfg, + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + llm_stage_launch_lock=threading.Lock(), + ) mock_launch_omni.assert_called_once() assert mock_launch_omni.call_args.kwargs["stage_config"] is stage_cfg assert isinstance(result, StartedLlmStage) assert result.stage_id == 0 - def test_spawn_stage_core_used_in_normal_mode(self): + def test_spawn_stage_core_used_in_normal_mode(self, mocker: MockerFixture): """~single_stage_mode → spawn_stage_core + complete_stage_handshake.""" engine = object.__new__(AsyncOmniEngine) engine.model = "fake-model" @@ -1450,44 +1630,45 @@ def test_spawn_stage_core_used_in_normal_mode(self): engine._omni_master_server = None engine._llm_stage_launch_lock = threading.Lock() - fake_vllm_config = Mock() - fake_executor_cls = Mock() - fake_addresses = Mock() + fake_vllm_config = mocker.Mock() + fake_executor_cls = mocker.Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None - fake_proc = Mock() + fake_proc = mocker.Mock() fake_handshake_address = "ipc:///tmp/fake-handshake" stage_init_timeout = 60 - with ( - patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": 0}, - ), - patch( - 
"vllm_omni.engine.async_omni_engine.build_vllm_config", - return_value=(fake_vllm_config, fake_executor_cls), - ), - patch("vllm_omni.engine.async_omni_engine.acquire_device_locks", return_value=[]), - patch("vllm_omni.engine.async_omni_engine.release_device_locks"), - patch( - "vllm_omni.engine.async_omni_engine.spawn_stage_core", - return_value=(fake_addresses, fake_proc, fake_handshake_address), - ) as mock_spawn, - patch("vllm_omni.engine.async_omni_engine.complete_stage_handshake") as mock_handshake, - patch("vllm_omni.engine.async_omni_engine.launch_omni_core_engines") as mock_omni, - ): - metadata = Mock(stage_id=0, runtime_cfg={}) - result = engine._launch_llm_stage( - stage_cfg=_make_stage_cfg(0), - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=stage_init_timeout, - llm_stage_launch_lock=threading.Lock(), - ) + mocker.patch("vllm_omni.engine.async_omni_engine.setup_stage_devices") + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 0}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(fake_vllm_config, fake_executor_cls), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.acquire_device_locks", + return_value=[], + ) + mocker.patch("vllm_omni.engine.async_omni_engine.release_device_locks") + mock_spawn = mocker.patch( + "vllm_omni.engine.async_omni_engine.spawn_stage_core", + return_value=(fake_addresses, fake_proc, fake_handshake_address), + ) + mock_handshake = mocker.patch("vllm_omni.engine.async_omni_engine.complete_stage_handshake") + mock_omni = mocker.patch("vllm_omni.engine.async_omni_engine.launch_omni_core_engines") + metadata = mocker.Mock(stage_id=0, runtime_cfg={}) + result = engine._launch_llm_stage( + stage_cfg=_make_stage_cfg(0), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=stage_init_timeout, + llm_stage_launch_lock=threading.Lock(), + ) mock_spawn.assert_called_once_with( 
vllm_config=fake_vllm_config, @@ -1505,50 +1686,58 @@ def test_spawn_stage_core_used_in_normal_mode(self): assert isinstance(result, StartedLlmStage) assert result.proc is fake_proc - def test_launch_omni_passes_stage_id_and_master_server(self): + def test_launch_omni_passes_stage_id_and_master_server(self, mocker: MockerFixture): """launch_omni_core_engines receives the correct stage_id and omni_master_server.""" - engine = self._build_engine_with_oms() - metadata = Mock(stage_id=0, runtime_cfg={}) + engine = self._build_engine_with_oms(mocker) + metadata = mocker.Mock(stage_id=0, runtime_cfg={}) captured_kwargs: dict[str, Any] = {} @contextmanager def capturing_launch(*args, **kwargs): captured_kwargs.update(kwargs) - fake_addresses = Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None - yield Mock(), None, fake_addresses + yield mocker.Mock(), None, fake_addresses - with ( - patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": 0}, - ), - patch("vllm_omni.engine.async_omni_engine.build_vllm_config", return_value=(Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.acquire_device_locks", return_value=[]), - patch("vllm_omni.engine.async_omni_engine.release_device_locks"), - patch("vllm_omni.engine.async_omni_engine.launch_omni_core_engines", side_effect=capturing_launch), - ): - engine._launch_llm_stage( - stage_cfg=_make_stage_cfg(0), - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - llm_stage_launch_lock=threading.Lock(), - ) + mocker.patch("vllm_omni.engine.async_omni_engine.setup_stage_devices") + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 0}, + ) + mocker.patch( + 
"vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.acquire_device_locks", + return_value=[], + ) + mocker.patch("vllm_omni.engine.async_omni_engine.release_device_locks") + mocker.patch( + "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", + side_effect=capturing_launch, + ) + + engine._launch_llm_stage( + stage_cfg=_make_stage_cfg(0), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + llm_stage_launch_lock=threading.Lock(), + ) assert captured_kwargs.get("stage_id") == 0 assert captured_kwargs.get("omni_master_server") is engine._omni_master_server - def test_launch_omni_context_exits_before_stage_cleanup_on_error(self): + def test_launch_omni_context_exits_before_stage_cleanup_on_error(self, mocker: MockerFixture): """Errors after entering the omni launch context still unwind it first.""" - engine = self._build_engine_with_oms() - metadata = Mock(stage_id=0, runtime_cfg={}) + engine = self._build_engine_with_oms(mocker) + metadata = mocker.Mock(stage_id=0, runtime_cfg={}) - fake_addresses = Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None @@ -1558,47 +1747,51 @@ def test_launch_omni_context_exits_before_stage_cleanup_on_error(self): @contextmanager def fake_launch_omni(*args, **kwargs): try: - yield Mock(), None, fake_addresses + yield mocker.Mock(), None, fake_addresses finally: events.append("launch_exit") - with ( - patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": 0}, - ), - patch("vllm_omni.engine.async_omni_engine.build_vllm_config", return_value=(Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.acquire_device_locks", 
return_value=[]), - patch("vllm_omni.engine.async_omni_engine.release_device_locks"), - patch( - "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", - return_value=fake_launch_omni(), - ), - patch("vllm_omni.engine.async_omni_engine.logger.info", side_effect=RuntimeError("boom")), - patch( - "vllm_omni.engine.async_omni_engine.close_started_llm_stage", - side_effect=lambda _started: events.append("stage_close"), - ) as mock_close_stage, - ): - with pytest.raises(RuntimeError, match="boom"): - engine._launch_llm_stage( - stage_cfg=_make_stage_cfg(0), - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - llm_stage_launch_lock=threading.Lock(), - ) + mocker.patch("vllm_omni.engine.async_omni_engine.setup_stage_devices") + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": "fake", "stage_id": 0}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.acquire_device_locks", + return_value=[], + ) + mocker.patch("vllm_omni.engine.async_omni_engine.release_device_locks") + mocker.patch( + "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", + return_value=fake_launch_omni(), + ) + mocker.patch("vllm_omni.engine.async_omni_engine.logger.info", side_effect=RuntimeError("boom")) + mock_close_stage = mocker.patch( + "vllm_omni.engine.async_omni_engine.close_started_llm_stage", + side_effect=lambda _started: events.append("stage_close"), + ) + with pytest.raises(RuntimeError, match="boom"): + engine._launch_llm_stage( + stage_cfg=_make_stage_cfg(0), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + llm_stage_launch_lock=threading.Lock(), + ) mock_close_stage.assert_called_once() assert events == ["launch_exit", "stage_close"] - def test_base_exception_propagates_without_started_stage_cleanup(self): + def 
test_base_exception_propagates_without_started_stage_cleanup(self, mocker: MockerFixture): """BaseException subclasses should bypass the Exception cleanup path.""" - engine = self._build_engine_with_oms() - metadata = Mock(stage_id=0, runtime_cfg={}) + engine = self._build_engine_with_oms(mocker) + metadata = mocker.Mock(stage_id=0, runtime_cfg={}) - fake_addresses = Mock() + fake_addresses = mocker.Mock() fake_addresses.inputs = ["tcp://127.0.0.1:5000"] fake_addresses.outputs = ["tcp://127.0.0.1:5001"] fake_addresses.frontend_stats_publish_address = None @@ -1611,37 +1804,41 @@ class FatalLaunchInterrupt(BaseException): @contextmanager def fake_launch_omni(*args, **kwargs): try: - yield Mock(), None, fake_addresses + yield mocker.Mock(), None, fake_addresses finally: events.append("launch_exit") - with ( - patch("vllm_omni.engine.async_omni_engine.setup_stage_devices"), - patch( - "vllm_omni.engine.async_omni_engine.build_engine_args_dict", - return_value={"model": "fake", "stage_id": 0}, - ), - patch("vllm_omni.engine.async_omni_engine.build_vllm_config", return_value=(Mock(), Mock())), - patch("vllm_omni.engine.async_omni_engine.acquire_device_locks", return_value=[]), - patch("vllm_omni.engine.async_omni_engine.release_device_locks"), - patch( - "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", - return_value=fake_launch_omni(), - ), - patch( - "vllm_omni.engine.async_omni_engine.logger.info", - side_effect=FatalLaunchInterrupt("stop"), - ), - patch("vllm_omni.engine.async_omni_engine.close_started_llm_stage") as mock_close_stage, - ): - with pytest.raises(FatalLaunchInterrupt, match="stop"): - engine._launch_llm_stage( - stage_cfg=_make_stage_cfg(0), - metadata=metadata, - stage_connector_spec={}, - stage_init_timeout=60, - llm_stage_launch_lock=threading.Lock(), - ) + mocker.patch("vllm_omni.engine.async_omni_engine.setup_stage_devices") + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_engine_args_dict", + return_value={"model": 
"fake", "stage_id": 0}, + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.build_vllm_config", + return_value=(mocker.Mock(), mocker.Mock()), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.acquire_device_locks", + return_value=[], + ) + mocker.patch("vllm_omni.engine.async_omni_engine.release_device_locks") + mocker.patch( + "vllm_omni.engine.async_omni_engine.launch_omni_core_engines", + return_value=fake_launch_omni(), + ) + mocker.patch( + "vllm_omni.engine.async_omni_engine.logger.info", + side_effect=FatalLaunchInterrupt("stop"), + ) + mock_close_stage = mocker.patch("vllm_omni.engine.async_omni_engine.close_started_llm_stage") + with pytest.raises(FatalLaunchInterrupt, match="stop"): + engine._launch_llm_stage( + stage_cfg=_make_stage_cfg(0), + metadata=metadata, + stage_connector_spec={}, + stage_init_timeout=60, + llm_stage_launch_lock=threading.Lock(), + ) mock_close_stage.assert_not_called() assert events == ["launch_exit"] diff --git a/tests/entrypoints/openai_api/test_serving_chat_speaker.py b/tests/entrypoints/openai_api/test_serving_chat_speaker.py index 3b9151120e0..97c05e45b41 100644 --- a/tests/entrypoints/openai_api/test_serving_chat_speaker.py +++ b/tests/entrypoints/openai_api/test_serving_chat_speaker.py @@ -4,9 +4,9 @@ import asyncio from types import SimpleNamespace -from unittest.mock import AsyncMock, MagicMock import pytest +from pytest_mock import MockerFixture from vllm_omni.entrypoints.openai.utils import ( get_supported_speakers_from_hf_config, @@ -25,9 +25,9 @@ def serving_chat(): return instance -def _make_hf_config(*, speaker_id: dict | None = None, spk_id: dict | None = None): - hf_config = MagicMock() - talker_config = MagicMock() +def _make_hf_config(mocker: MockerFixture, *, speaker_id: dict | None = None, spk_id: dict | None = None): + hf_config = mocker.MagicMock() + talker_config = mocker.MagicMock() talker_config.speaker_id = speaker_id talker_config.spk_id = spk_id hf_config.talker_config = talker_config 
@@ -51,14 +51,14 @@ def test_validate_requested_speaker_skips_validation_when_supported_empty(): assert validate_requested_speaker(" ", {"vivian"}) is None -def test_get_supported_speakers_from_hf_config_uses_spk_id_fallback(): - hf_config = _make_hf_config(speaker_id=None, spk_id={"Serena": 0}) +def test_get_supported_speakers_from_hf_config_uses_spk_id_fallback(mocker: MockerFixture): + hf_config = _make_hf_config(mocker, speaker_id=None, spk_id={"Serena": 0}) assert get_supported_speakers_from_hf_config(hf_config) == {"serena"} -def test_get_supported_speakers_caches_normalized_keys(serving_chat): - serving_chat.model_config = MagicMock() - serving_chat.model_config.hf_config = _make_hf_config(speaker_id={"Vivian": 0, "Ethan": 1}) +def test_get_supported_speakers_caches_normalized_keys(mocker: MockerFixture, serving_chat): + serving_chat.model_config = mocker.MagicMock() + serving_chat.model_config.hf_config = _make_hf_config(mocker, speaker_id={"Vivian": 0, "Ethan": 1}) assert serving_chat._get_supported_speakers() == {"vivian", "ethan"} @@ -67,15 +67,15 @@ def test_get_supported_speakers_caches_normalized_keys(serving_chat): assert serving_chat._get_supported_speakers() == {"vivian", "ethan"} -def test_create_chat_completion_converts_value_error_to_error_response(serving_chat): +def test_create_chat_completion_converts_value_error_to_error_response(mocker: MockerFixture, serving_chat): serving_chat._diffusion_mode = False - serving_chat._check_model = AsyncMock(return_value=None) - serving_chat.engine_client = MagicMock(errored=False) - serving_chat._maybe_get_adapters = MagicMock(return_value=None) - serving_chat.models = MagicMock() + serving_chat._check_model = mocker.AsyncMock(return_value=None) + serving_chat.engine_client = mocker.MagicMock(errored=False) + serving_chat._maybe_get_adapters = mocker.MagicMock(return_value=None) + serving_chat.models = mocker.MagicMock() serving_chat.models.model_name.return_value = "test-model" - serving_chat.renderer = 
MagicMock() - serving_chat.renderer.get_tokenizer.return_value = MagicMock() + serving_chat.renderer = mocker.MagicMock() + serving_chat.renderer.get_tokenizer.return_value = mocker.MagicMock() serving_chat.reasoning_parser_cls = None serving_chat.tool_parser = None serving_chat.use_harmony = False @@ -85,12 +85,12 @@ def test_create_chat_completion_converts_value_error_to_error_response(serving_c serving_chat.chat_template = None serving_chat.chat_template_content_format = "string" serving_chat.default_chat_template_kwargs = {} - serving_chat._validate_chat_template = MagicMock(return_value=None) - serving_chat._prepare_extra_chat_template_kwargs = MagicMock(return_value={}) - serving_chat._preprocess_chat = AsyncMock( + serving_chat._validate_chat_template = mocker.MagicMock(return_value=None) + serving_chat._prepare_extra_chat_template_kwargs = mocker.MagicMock(return_value={}) + serving_chat._preprocess_chat = mocker.AsyncMock( side_effect=ValueError("Invalid speaker 'uncle_fu'. Supported: ethan, vivian") ) - serving_chat.create_error_response = MagicMock(return_value="error-response") + serving_chat.create_error_response = mocker.MagicMock(return_value="error-response") request = SimpleNamespace( tool_choice=None, diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 06b6f5c16c1..c8841206207 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -6,7 +6,6 @@ from inspect import Signature, signature from pathlib import Path from types import SimpleNamespace -from unittest.mock import AsyncMock, MagicMock, patch import numpy as np import pytest @@ -901,7 +900,7 @@ def test_load_supported_speakers(self, mocker: MockerFixture): # Verify speakers are normalized to lowercase assert server.supported_speakers == {"ryan", "vivian", "aiden"} - def test_build_tts_params_with_uploaded_voice(self, speech_server): + def 
test_build_tts_params_with_uploaded_voice(self, speech_server, mocker: MockerFixture): """Test _build_tts_params auto-sets ref_audio for uploaded voices (x_vector only).""" speech_server.uploaded_speakers = { "custom_voice": { @@ -914,18 +913,18 @@ def test_build_tts_params_with_uploaded_voice(self, speech_server): } speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"} - with patch.object(speech_server, "_get_uploaded_audio_data") as mock_get_audio: - mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv" - req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice") - params = speech_server._build_tts_params(req) + mock_get_audio = mocker.patch.object(speech_server, "_get_uploaded_audio_data") + mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv" + req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice") + params = speech_server._build_tts_params(req) - assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] - assert params["x_vector_only_mode"] == [True] - assert params["task_type"] == ["Base"] - assert params["voice_created_at"] == [1711234567.89] - assert "ref_text" not in params + assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] + assert params["x_vector_only_mode"] == [True] + assert params["task_type"] == ["Base"] + assert params["voice_created_at"] == [1711234567.89] + assert "ref_text" not in params - def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server): + def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server, mocker: MockerFixture): """Test _build_tts_params enables in-context cloning when ref_text is stored.""" speech_server.uploaded_speakers = { "custom_voice": { @@ -938,16 +937,16 @@ def test_build_tts_params_with_uploaded_voice_ref_text(self, speech_server): } speech_server.supported_speakers = {"ryan", "vivian", "custom_voice"} - with patch.object(speech_server, "_get_uploaded_audio_data") as mock_get_audio: - 
mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv" - req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice") - params = speech_server._build_tts_params(req) + mock_get_audio = mocker.patch.object(speech_server, "_get_uploaded_audio_data") + mock_get_audio.return_value = "data:audio/wav;base64,ZmFrZWF1ZGlv" + req = OpenAICreateSpeechRequest(input="Hello", voice="custom_voice") + params = speech_server._build_tts_params(req) - assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] - assert params["x_vector_only_mode"] == [False] - assert params["task_type"] == ["Base"] - assert params["ref_text"] == ["Hello world transcript"] - assert params["voice_created_at"] == [1711234567.89] + assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZWF1ZGlv"] + assert params["x_vector_only_mode"] == [False] + assert params["task_type"] == ["Base"] + assert params["ref_text"] == ["Hello world transcript"] + assert params["voice_created_at"] == [1711234567.89] def test_build_tts_params_without_uploaded_voice(self, speech_server): """Test _build_tts_params does not auto-set ref_audio for non-uploaded voices.""" @@ -989,45 +988,43 @@ def test_build_tts_params_with_explicit_ref_audio(self, speech_server): # x_vector_only_mode should not be set when explicit ref_audio is provided assert "x_vector_only_mode" not in params - def test_get_uploaded_audio_data(self, speech_server): + def test_get_uploaded_audio_data(self, speech_server, mocker: MockerFixture): """Test _get_uploaded_audio_data function.""" # Mock file operations - with ( - patch("builtins.open", create=True) as mock_open, - patch("base64.b64encode") as mock_b64encode, - patch("pathlib.Path.exists") as mock_exists, - ): - mock_exists.return_value = True - mock_b64encode.return_value = b"ZmFrZWF1ZGlv" - - # Setup mock file - mock_file = MagicMock() - mock_file.read.return_value = b"fakeaudio" - mock_open.return_value.__enter__.return_value = mock_file - - # Setup uploaded speaker - 
speech_server.uploaded_speakers = { - "test_voice": {"name": "test_voice", "file_path": "/tmp/test.wav", "mime_type": "audio/wav"} - } - result = speech_server._get_uploaded_audio_data("test_voice") + mock_open = mocker.patch("builtins.open", create=True) + mock_b64encode = mocker.patch("base64.b64encode") + mock_exists = mocker.patch("pathlib.Path.exists") + mock_exists.return_value = True + mock_b64encode.return_value = b"ZmFrZWF1ZGlv" + + # Setup mock file + mock_file = mocker.MagicMock() + mock_file.read.return_value = b"fakeaudio" + mock_open.return_value.__enter__.return_value = mock_file + + # Setup uploaded speaker + speech_server.uploaded_speakers = { + "test_voice": {"name": "test_voice", "file_path": "/tmp/test.wav", "mime_type": "audio/wav"} + } + result = speech_server._get_uploaded_audio_data("test_voice") - assert result == "data:audio/wav;base64,ZmFrZWF1ZGlv" - mock_open.assert_called_once_with(Path("/tmp/test.wav"), "rb") - mock_b64encode.assert_called_once_with(b"fakeaudio") + assert result == "data:audio/wav;base64,ZmFrZWF1ZGlv" + mock_open.assert_called_once_with(Path("/tmp/test.wav"), "rb") + mock_b64encode.assert_called_once_with(b"fakeaudio") - def test_get_uploaded_audio_data_missing_file(self, speech_server): + def test_get_uploaded_audio_data_missing_file(self, speech_server, mocker: MockerFixture): """Test _get_uploaded_audio_data when file is missing.""" - with patch("pathlib.Path.exists") as mock_exists: - mock_exists.return_value = False + mock_exists = mocker.patch("pathlib.Path.exists") + mock_exists.return_value = False - # Setup uploaded speaker - speech_server.uploaded_speakers = { - "test_voice": {"name": "test_voice", "file_path": "/tmp/test.wav", "mime_type": "audio/wav"} - } + # Setup uploaded speaker + speech_server.uploaded_speakers = { + "test_voice": {"name": "test_voice", "file_path": "/tmp/test.wav", "mime_type": "audio/wav"} + } - result = speech_server._get_uploaded_audio_data("test_voice") + result = 
speech_server._get_uploaded_audio_data("test_voice") - assert result is None + assert result is None def test_get_uploaded_audio_data_voice_not_found(self, speech_server): """Test _get_uploaded_audio_data when voice is not in uploaded_speakers.""" @@ -1049,7 +1046,7 @@ def test_voice_field_still_accepted(self): req = OpenAICreateSpeechRequest.model_validate({"input": "Hello", "voice": "custom_voice"}) assert req.voice == "custom_voice" - def test_speaker_alias_in_base_task_with_uploaded_voice(self, speech_server): + def test_speaker_alias_in_base_task_with_uploaded_voice(self, speech_server, mocker: MockerFixture): """Using 'speaker' key with an uploaded voice should work for Base task.""" speech_server.uploaded_speakers = { "utesf": { @@ -1061,13 +1058,13 @@ def test_speaker_alias_in_base_task_with_uploaded_voice(self, speech_server): } req = OpenAICreateSpeechRequest.model_validate({"input": "Hello", "speaker": "UTESF", "task_type": "Base"}) assert req.voice == "UTESF" - with patch("pathlib.Path.exists", return_value=True): - result = speech_server._validate_qwen_tts_request(req) + mocker.patch("pathlib.Path.exists", return_value=True) + result = speech_server._validate_qwen_tts_request(req) assert result is None # ── uploaded voice with embedding ── - def test_build_tts_params_with_uploaded_voice_embedding(self, speech_server): + def test_build_tts_params_with_uploaded_voice_embedding(self, speech_server, mocker: MockerFixture): """Test _build_tts_params loads embedding for embedding-uploaded voices.""" speech_server.uploaded_speakers = { "emb_voice": { @@ -1083,20 +1080,20 @@ def test_build_tts_params_with_uploaded_voice_embedding(self, speech_server): speech_server.supported_speakers = {"ryan", "vivian", "emb_voice"} fake_embedding = [0.1] * 1024 - with patch.object(speech_server, "_get_uploaded_speaker_embedding") as mock_get_emb: - mock_get_emb.return_value = fake_embedding - req = OpenAICreateSpeechRequest(input="Hello", voice="emb_voice") - params = 
speech_server._build_tts_params(req) + mock_get_emb = mocker.patch.object(speech_server, "_get_uploaded_speaker_embedding") + mock_get_emb.return_value = fake_embedding + req = OpenAICreateSpeechRequest(input="Hello", voice="emb_voice") + params = speech_server._build_tts_params(req) - assert "voice_clone_prompt" in params - assert params["voice_clone_prompt"][0]["ref_spk_embedding"] == fake_embedding - assert params["task_type"] == ["Base"] - assert params["x_vector_only_mode"] == [True] - assert "ref_audio" not in params + assert "voice_clone_prompt" in params + assert params["voice_clone_prompt"][0]["ref_spk_embedding"] == fake_embedding + assert params["task_type"] == ["Base"] + assert params["x_vector_only_mode"] == [True] + assert "ref_audio" not in params # ── regression: full flow from issue #1603 ── - def test_regression_1603_speaker_key_with_uploaded_audio_voice(self, speech_server): + def test_regression_1603_speaker_key_with_uploaded_audio_voice(self, speech_server, mocker: MockerFixture): """Regression test for #1603: upload audio voice, then invoke TTS with 'speaker' key. Verifies the full validate → build_params pipeline works end-to-end. 
@@ -1116,14 +1113,14 @@ def test_regression_1603_speaker_key_with_uploaded_audio_voice(self, speech_serv assert req.voice == "UTESF" # Validation should pass (file exists) - with patch("pathlib.Path.exists", return_value=True): - err = speech_server._validate_qwen_tts_request(req) + mocker.patch("pathlib.Path.exists", return_value=True) + err = speech_server._validate_qwen_tts_request(req) assert err is None, f"Validation failed: {err}" # Build params should auto-set ref_audio from stored file - with patch.object(speech_server, "_get_uploaded_audio_data") as mock_audio: - mock_audio.return_value = "data:audio/wav;base64,ZmFrZQ==" - params = speech_server._build_tts_params(req) + mock_audio = mocker.patch.object(speech_server, "_get_uploaded_audio_data") + mock_audio.return_value = "data:audio/wav;base64,ZmFrZQ==" + params = speech_server._build_tts_params(req) assert params["task_type"] == ["Base"] assert params["ref_audio"] == ["data:audio/wav;base64,ZmFrZQ=="] @@ -1131,7 +1128,7 @@ def test_regression_1603_speaker_key_with_uploaded_audio_voice(self, speech_serv assert params["x_vector_only_mode"] == [False] assert params["speaker"] == ["utesf"] - def test_regression_1603_speaker_key_with_uploaded_embedding_voice(self, speech_server): + def test_regression_1603_speaker_key_with_uploaded_embedding_voice(self, speech_server, mocker: MockerFixture): """Regression test for #1603: upload embedding voice, then invoke TTS with 'speaker' key. Verifies embedding-uploaded voices are loaded as voice_clone_prompt, not as audio. 
@@ -1154,15 +1151,15 @@ def test_regression_1603_speaker_key_with_uploaded_embedding_voice(self, speech_ assert req.voice == "myvoice" # Validation should pass - with patch("pathlib.Path.exists", return_value=True): - err = speech_server._validate_qwen_tts_request(req) + mocker.patch("pathlib.Path.exists", return_value=True) + err = speech_server._validate_qwen_tts_request(req) assert err is None, f"Validation failed: {err}" # Build params should use embedding, NOT audio fake_emb = [0.1] * 1024 - with patch.object(speech_server, "_get_uploaded_speaker_embedding") as mock_emb: - mock_emb.return_value = fake_emb - params = speech_server._build_tts_params(req) + mock_emb = mocker.patch.object(speech_server, "_get_uploaded_speaker_embedding") + mock_emb.return_value = fake_emb + params = speech_server._build_tts_params(req) assert params["task_type"] == ["Base"] assert params["x_vector_only_mode"] == [True] @@ -1171,7 +1168,7 @@ def test_regression_1603_speaker_key_with_uploaded_embedding_voice(self, speech_ # Must NOT have ref_audio — that would fail for safetensors files assert "ref_audio" not in params - def test_validate_rejects_embedding_voice_with_pending_cache(self, speech_server): + def test_validate_rejects_embedding_voice_with_pending_cache(self, speech_server, mocker: MockerFixture): """Validation should reject embedding voices whose cache is not yet ready.""" speech_server.uploaded_speakers = { "myvoice": { @@ -1184,12 +1181,12 @@ def test_validate_rejects_embedding_voice_with_pending_cache(self, speech_server } } req = OpenAICreateSpeechRequest.model_validate({"input": "Hello", "speaker": "myvoice", "task_type": "Base"}) - with patch("pathlib.Path.exists", return_value=True): - err = speech_server._validate_qwen_tts_request(req) + mocker.patch("pathlib.Path.exists", return_value=True) + err = speech_server._validate_qwen_tts_request(req) assert err is not None assert "not yet ready" in err - def 
test_x_vector_only_mode_not_overwritten_for_uploaded_embedding(self, speech_server): + def test_x_vector_only_mode_not_overwritten_for_uploaded_embedding(self, speech_server, mocker: MockerFixture): """x_vector_only_mode set by uploaded embedding must not be overwritten by request field.""" speech_server.uploaded_speakers = { "emb_voice": { @@ -1203,11 +1200,11 @@ def test_x_vector_only_mode_not_overwritten_for_uploaded_embedding(self, speech_ } } fake_emb = [0.1] * 1024 - with patch.object(speech_server, "_get_uploaded_speaker_embedding") as mock_emb: - mock_emb.return_value = fake_emb - # Client explicitly sends x_vector_only_mode=False, but embedding requires True - req = OpenAICreateSpeechRequest(input="Hello", voice="emb_voice", x_vector_only_mode=False) - params = speech_server._build_tts_params(req) + mock_emb = mocker.patch.object(speech_server, "_get_uploaded_speaker_embedding") + mock_emb.return_value = fake_emb + # Client explicitly sends x_vector_only_mode=False, but embedding requires True + req = OpenAICreateSpeechRequest(input="Hello", voice="emb_voice", x_vector_only_mode=False) + params = speech_server._build_tts_params(req) assert params["x_vector_only_mode"] == [True] assert "voice_clone_prompt" in params @@ -1654,9 +1651,9 @@ async def test_omni_model_includes_generate(self): assert "generate" in tasks -def test_api_server_create_speech_wraps_error_response_status(): - handler = MagicMock() - handler.create_speech = AsyncMock( +def test_api_server_create_speech_wraps_error_response_status(mocker: MockerFixture): + handler = mocker.MagicMock() + handler.create_speech = mocker.AsyncMock( return_value=ErrorResponse( error=ErrorInfo(message="bad request", type="BadRequestError", param=None, code=400), ) @@ -1851,9 +1848,9 @@ def test_build_fish_prompt_normalizes_legacy_speaker_tags(self, fish_speech_serv assert "<|speaker:0|>你好,[laughing]欢迎回来。<|speaker:1|>我也来了。" in encoded_texts assert all(allowed_special is None for _, _, allowed_special in 
tokenizer.calls) - def test_build_fish_clone_prompt_normalizes_text_fields(self, fish_speech_server): + def test_build_fish_clone_prompt_normalizes_text_fields(self, fish_speech_server, mocker: MockerFixture): fish_speech_server._fish_speech_tokenizer = _FakeFishTokenizer() - fish_speech_server._estimate_fish_prompt_len = MagicMock(return_value=123) + fish_speech_server._estimate_fish_prompt_len = mocker.MagicMock(return_value=123) request = OpenAICreateSpeechRequest( input="你好,欢迎回来。", @@ -1904,8 +1901,10 @@ def test_build_fish_prompt_rejects_unsafe_control_tokens(self, fish_speech_serve with pytest.raises(ValueError, match="unsupported control token"): fish_speech_server._build_fish_speech_prompt(request) - def test_prepare_speech_generation_overrides_fish_default_max_tokens(self, fish_speech_server): - fish_speech_server._build_fish_speech_prompt_async = AsyncMock( + def test_prepare_speech_generation_overrides_fish_default_max_tokens( + self, fish_speech_server, mocker: MockerFixture + ): + fish_speech_server._build_fish_speech_prompt_async = mocker.AsyncMock( return_value={ "prompt_token_ids": [1, 2, 3], "additional_information": {}, @@ -1924,8 +1923,8 @@ def test_prepare_speech_generation_overrides_fish_default_max_tokens(self, fish_ assert sampling_params_list[0].max_tokens == 4096 assert fish_speech_server.engine_client.default_sampling_params_list[0].max_tokens == 2048 - def test_prepare_speech_generation_uses_stage_default_max_tokens(self, fish_speech_server): - fish_speech_server._build_fish_speech_prompt_async = AsyncMock( + def test_prepare_speech_generation_uses_stage_default_max_tokens(self, fish_speech_server, mocker: MockerFixture): + fish_speech_server._build_fish_speech_prompt_async = mocker.AsyncMock( return_value={ "prompt_token_ids": [1, 2, 3], "additional_information": {}, @@ -1956,9 +1955,9 @@ def test_prepare_speech_generation_rejects_invalid_fish_max_new_tokens(self, fis fish_speech_server.engine_client.generate.assert_not_called() - def 
test_create_speech_batch_allows_fish_text_only_items(self, fish_speech_server): - fish_speech_server._check_model = AsyncMock(return_value=None) - fish_speech_server._generate_audio_bytes = AsyncMock(return_value=("YWJj", "audio/wav")) + def test_create_speech_batch_allows_fish_text_only_items(self, fish_speech_server, mocker: MockerFixture): + fish_speech_server._check_model = mocker.AsyncMock(return_value=None) + fish_speech_server._generate_audio_bytes = mocker.AsyncMock(return_value=("YWJj", "audio/wav")) batch = BatchSpeechRequest(items=[SpeechBatchItem(input="hello fish")]) response = asyncio.run(fish_speech_server.create_speech_batch(batch)) @@ -2154,8 +2153,8 @@ def test_validate_cosyvoice3_max_new_tokens_range(self, cosyvoice3_server): assert error is not None assert "max_new_tokens" in error - def test_prepare_speech_generation_cosyvoice3(self, cosyvoice3_server): - cosyvoice3_server._build_cosyvoice3_prompt = AsyncMock( + def test_prepare_speech_generation_cosyvoice3(self, cosyvoice3_server, mocker: MockerFixture): + cosyvoice3_server._build_cosyvoice3_prompt = mocker.AsyncMock( return_value={ "prompt": "Hello", "multi_modal_data": {"audio": (np.zeros(24000), 24000)}, @@ -2236,9 +2235,9 @@ def qwen3_tts_server(self, mocker: MockerFixture): yield server server.shutdown() - def test_prepare_speech_generation_awaits_voxtral_async(self, voxtral_server): + def test_prepare_speech_generation_awaits_voxtral_async(self, voxtral_server, mocker: MockerFixture): """Voxtral path in _prepare_speech_generation should call the async wrapper.""" - voxtral_server._build_voxtral_prompt_async = AsyncMock( + voxtral_server._build_voxtral_prompt_async = mocker.AsyncMock( return_value={ "prompt_token_ids": [1, 2, 3], "additional_information": {"voice": ["test"]}, @@ -2248,13 +2247,13 @@ def test_prepare_speech_generation_awaits_voxtral_async(self, voxtral_server): asyncio.run(voxtral_server._prepare_speech_generation(request)) 
voxtral_server._build_voxtral_prompt_async.assert_awaited_once() - def test_prepare_speech_generation_awaits_qwen3_tts_async(self, qwen3_tts_server): + def test_prepare_speech_generation_awaits_qwen3_tts_async(self, qwen3_tts_server, mocker: MockerFixture): """Qwen3 TTS path should call _estimate_prompt_len_async.""" - qwen3_tts_server._validate_tts_request = MagicMock(return_value=None) - qwen3_tts_server._build_tts_params = MagicMock( + qwen3_tts_server._validate_tts_request = mocker.MagicMock(return_value=None) + qwen3_tts_server._build_tts_params = mocker.MagicMock( return_value={"text": ["hello"], "task_type": ["CustomVoice"], "speaker": ["Vivian"]} ) - qwen3_tts_server._estimate_prompt_len_async = AsyncMock(return_value=512) + qwen3_tts_server._estimate_prompt_len_async = mocker.AsyncMock(return_value=512) request = OpenAICreateSpeechRequest(input="hello") asyncio.run(qwen3_tts_server._prepare_speech_generation(request)) qwen3_tts_server._build_tts_params.assert_called_once() @@ -2281,8 +2280,8 @@ def test_shutdown_is_idempotent(self, mocker: MockerFixture): server.shutdown() # Should not raise assert server._tts_executor is None - def test_diffusion_instance_shutdown_safe(self): + def test_diffusion_instance_shutdown_safe(self, mocker: MockerFixture): """Diffusion instances (created via for_diffusion) should have safe shutdown.""" - server = OmniOpenAIServingSpeech.for_diffusion(diffusion_engine=MagicMock(), model_name="test-model") + server = OmniOpenAIServingSpeech.for_diffusion(diffusion_engine=mocker.MagicMock(), model_name="test-model") assert server._tts_executor is None server.shutdown() # Should not raise diff --git a/tests/entrypoints/openai_api/test_serving_speech_stream.py b/tests/entrypoints/openai_api/test_serving_speech_stream.py index 1d26b5855f1..1b93ef58e24 100644 --- a/tests/entrypoints/openai_api/test_serving_speech_stream.py +++ b/tests/entrypoints/openai_api/test_serving_speech_stream.py @@ -1,8 +1,8 @@ import asyncio -from unittest.mock 
import AsyncMock, MagicMock import pytest from fastapi import FastAPI, WebSocket +from pytest_mock import MockerFixture from starlette.testclient import TestClient from starlette.websockets import WebSocketDisconnect @@ -13,19 +13,26 @@ pytestmark = [pytest.mark.core_model, pytest.mark.cpu] -def _build_test_app(speech_service=None, *, idle_timeout=30.0, config_timeout=10.0): +def _build_test_app( + speech_service=None, + *, + idle_timeout=30.0, + config_timeout=10.0, + mocker: MockerFixture | None = None, +): if speech_service is None: - speech_service = MagicMock(spec=OmniOpenAIServingSpeech) - speech_service._generate_audio_bytes = AsyncMock(return_value=(b"RIFF" + b"\x00" * 32, "audio/wav")) - speech_service._prepare_speech_generation = AsyncMock(return_value=("req-1", object(), {})) + assert mocker is not None + speech_service = mocker.MagicMock(spec=OmniOpenAIServingSpeech) + speech_service._generate_audio_bytes = mocker.AsyncMock(return_value=(b"RIFF" + b"\x00" * 32, "audio/wav")) + speech_service._prepare_speech_generation = mocker.AsyncMock(return_value=("req-1", object(), {})) async def mock_generate_pcm_chunks(_generator, _request_id): for chunk in (b"\x01\x02", b"\x03\x04\x05"): yield chunk speech_service._generate_pcm_chunks = mock_generate_pcm_chunks - speech_service.engine_client = MagicMock() - speech_service.engine_client.abort = AsyncMock() + speech_service.engine_client = mocker.MagicMock() + speech_service.engine_client.abort = mocker.AsyncMock() handler = OmniStreamingSpeechHandler( speech_service=speech_service, @@ -42,8 +49,8 @@ async def ws_endpoint(websocket: WebSocket): class TestStreamingSpeechWebSocket: - def test_non_streaming_single_frame(self): - app, speech_service = _build_test_app() + def test_non_streaming_single_frame(self, mocker: MockerFixture): + app, speech_service = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -68,13 +75,13 @@ def 
test_non_streaming_single_frame(self): assert speech_service._generate_audio_bytes.await_count == 1 - def test_streaming_multiple_binary_frames(self): + def test_streaming_multiple_binary_frames(self, mocker: MockerFixture): captured_requests = [] - speech_service = MagicMock(spec=OmniOpenAIServingSpeech) - speech_service._generate_audio_bytes = AsyncMock(return_value=(b"", "audio/wav")) - speech_service.engine_client = MagicMock() - speech_service.engine_client.abort = AsyncMock() + speech_service = mocker.MagicMock(spec=OmniOpenAIServingSpeech) + speech_service._generate_audio_bytes = mocker.AsyncMock(return_value=(b"", "audio/wav")) + speech_service.engine_client = mocker.MagicMock() + speech_service.engine_client.abort = mocker.AsyncMock() async def mock_prepare_speech_generation(request): captured_requests.append(request) @@ -123,8 +130,8 @@ async def mock_generate_pcm_chunks(_generator, _request_id): assert captured_requests[0].initial_codec_chunk_frames == 12 assert speech_service._generate_audio_bytes.await_count == 0 - def test_flush_on_input_done(self): - app, _ = _build_test_app() + def test_flush_on_input_done(self, mocker: MockerFixture): + app, _ = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -142,8 +149,8 @@ def test_flush_on_input_done(self): } assert ws.receive_json() == {"type": "session.done", "total_sentences": 1} - def test_invalid_streaming_config(self): - app, _ = _build_test_app() + def test_invalid_streaming_config(self, mocker: MockerFixture): + app, _ = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -159,8 +166,8 @@ def test_invalid_streaming_config(self): assert error["type"] == "error" assert "response_format='pcm'" in error["message"] - def test_empty_input_text_emits_no_audio(self): - app, speech_service = _build_test_app() + def 
test_empty_input_text_emits_no_audio(self, mocker: MockerFixture): + app, speech_service = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -172,8 +179,8 @@ def test_empty_input_text_emits_no_audio(self): assert speech_service._generate_audio_bytes.await_count == 0 - def test_multiple_sentences_increment_indices(self): - app, _ = _build_test_app() + def test_multiple_sentences_increment_indices(self, mocker: MockerFixture): + app, _ = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -203,8 +210,8 @@ def test_multiple_sentences_increment_indices(self): ws.send_json({"type": "input.done"}) assert ws.receive_json() == {"type": "session.done", "total_sentences": 2} - def test_unknown_message_type_keeps_session_open(self): - app, _ = _build_test_app() + def test_unknown_message_type_keeps_session_open(self, mocker: MockerFixture): + app, _ = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -227,21 +234,21 @@ def test_unknown_message_type_keeps_session_open(self): ws.send_json({"type": "input.done"}) assert ws.receive_json() == {"type": "session.done", "total_sentences": 1} - def test_config_timeout_closes_session(self): - app, _ = _build_test_app(config_timeout=0.01) + def test_config_timeout_closes_session(self, mocker: MockerFixture): + app, _ = _build_test_app(config_timeout=0.01, mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: error = ws.receive_json() assert error == {"type": "error", "message": "Timeout waiting for session.config"} - def test_generation_error_marks_audio_done(self): - speech_service = MagicMock(spec=OmniOpenAIServingSpeech) - speech_service._generate_audio_bytes = AsyncMock(side_effect=RuntimeError("boom")) - 
speech_service._prepare_speech_generation = AsyncMock(return_value=("req-err", object(), {})) - speech_service._generate_pcm_chunks = AsyncMock() - speech_service.engine_client = MagicMock() - speech_service.engine_client.abort = AsyncMock() + def test_generation_error_marks_audio_done(self, mocker: MockerFixture): + speech_service = mocker.MagicMock(spec=OmniOpenAIServingSpeech) + speech_service._generate_audio_bytes = mocker.AsyncMock(side_effect=RuntimeError("boom")) + speech_service._prepare_speech_generation = mocker.AsyncMock(return_value=("req-err", object(), {})) + speech_service._generate_pcm_chunks = mocker.AsyncMock() + speech_service.engine_client = mocker.MagicMock() + speech_service.engine_client.abort = mocker.AsyncMock() app, _ = _build_test_app(speech_service) with TestClient(app) as client: @@ -256,12 +263,12 @@ def test_generation_error_marks_audio_done(self): ws.send_json({"type": "input.done"}) assert ws.receive_json() == {"type": "session.done", "total_sentences": 1} - def test_streaming_generation_error_marks_audio_done(self): - speech_service = MagicMock(spec=OmniOpenAIServingSpeech) - speech_service._generate_audio_bytes = AsyncMock(return_value=(b"", "audio/wav")) - speech_service._prepare_speech_generation = AsyncMock(return_value=("req-stream-err", object(), {})) - speech_service.engine_client = MagicMock() - speech_service.engine_client.abort = AsyncMock() + def test_streaming_generation_error_marks_audio_done(self, mocker: MockerFixture): + speech_service = mocker.MagicMock(spec=OmniOpenAIServingSpeech) + speech_service._generate_audio_bytes = mocker.AsyncMock(return_value=(b"", "audio/wav")) + speech_service._prepare_speech_generation = mocker.AsyncMock(return_value=("req-stream-err", object(), {})) + speech_service.engine_client = mocker.MagicMock() + speech_service.engine_client.abort = mocker.AsyncMock() async def mock_generate_pcm_chunks(_generator, _request_id): yield b"\x01\x02" @@ -298,8 +305,8 @@ async def 
mock_generate_pcm_chunks(_generator, _request_id): ws.send_json({"type": "input.done"}) assert ws.receive_json() == {"type": "session.done", "total_sentences": 1} - def test_invalid_input_text_type_returns_validation_error(self): - app, speech_service = _build_test_app() + def test_invalid_input_text_type_returns_validation_error(self, mocker: MockerFixture): + app, speech_service = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -316,9 +323,9 @@ def test_invalid_input_text_type_returns_validation_error(self): assert speech_service._generate_audio_bytes.await_count == 0 - def test_input_text_message_too_large(self, monkeypatch): + def test_input_text_message_too_large(self, monkeypatch, mocker: MockerFixture): monkeypatch.setattr(streaming_speech_module, "_MAX_INPUT_TEXT_MESSAGE_SIZE", 32) - app, speech_service = _build_test_app() + app, speech_service = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -335,9 +342,9 @@ def test_input_text_message_too_large(self, monkeypatch): assert speech_service._generate_audio_bytes.await_count == 0 - def test_session_config_message_too_large(self, monkeypatch): + def test_session_config_message_too_large(self, monkeypatch, mocker: MockerFixture): monkeypatch.setattr(streaming_speech_module, "_MAX_CONFIG_MESSAGE_SIZE", 64) - app, _ = _build_test_app() + app, _ = _build_test_app(mocker=mocker) with TestClient(app) as client: with client.websocket_connect("/v1/audio/speech/stream") as ws: @@ -348,12 +355,12 @@ def test_session_config_message_too_large(self, monkeypatch): "message": "session.config message too large", } - def test_disconnect_aborts_streaming_request(self): - speech_service = MagicMock(spec=OmniOpenAIServingSpeech) - speech_service._generate_audio_bytes = AsyncMock(return_value=(b"", "audio/wav")) - speech_service._prepare_speech_generation = 
AsyncMock(return_value=("req-abort", object(), {})) - speech_service.engine_client = MagicMock() - speech_service.engine_client.abort = AsyncMock() + def test_disconnect_aborts_streaming_request(self, mocker: MockerFixture): + speech_service = mocker.MagicMock(spec=OmniOpenAIServingSpeech) + speech_service._generate_audio_bytes = mocker.AsyncMock(return_value=(b"", "audio/wav")) + speech_service._prepare_speech_generation = mocker.AsyncMock(return_value=("req-abort", object(), {})) + speech_service.engine_client = mocker.MagicMock() + speech_service.engine_client.abort = mocker.AsyncMock() async def mock_generate_pcm_chunks(_generator, _request_id): yield b"\x01\x02" @@ -361,11 +368,11 @@ async def mock_generate_pcm_chunks(_generator, _request_id): speech_service._generate_pcm_chunks = mock_generate_pcm_chunks handler = OmniStreamingSpeechHandler(speech_service=speech_service) - websocket = MagicMock() - websocket.send_json = AsyncMock(side_effect=[None, WebSocketDisconnect()]) - websocket.send_bytes = AsyncMock(side_effect=WebSocketDisconnect()) + websocket = mocker.MagicMock() + websocket.send_json = mocker.AsyncMock(side_effect=[None, WebSocketDisconnect()]) + websocket.send_bytes = mocker.AsyncMock(side_effect=WebSocketDisconnect()) - config = MagicMock() + config = mocker.MagicMock() config.model = None config.voice = "Vivian" config.task_type = None diff --git a/tests/entrypoints/test_omni_base_profiler.py b/tests/entrypoints/test_omni_base_profiler.py index 0c1ddc6a5db..ca10eed91f6 100644 --- a/tests/entrypoints/test_omni_base_profiler.py +++ b/tests/entrypoints/test_omni_base_profiler.py @@ -1,8 +1,7 @@ """Unit tests for OmniBase and AsyncOmni profiler methods.""" -from unittest.mock import MagicMock, patch - import pytest +from pytest_mock import MockerFixture pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -11,12 +10,12 @@ class TestOmniBaseProfiler: """Test suite for OmniBase profiler methods (start_profile, stop_profile).""" @pytest.fixture - 
def mock_engine(self): + def mock_engine(self, mocker: MockerFixture): """Create a mock AsyncOmniEngine for testing.""" - engine = MagicMock() + engine = mocker.MagicMock() engine.num_stages = 3 engine.is_alive.return_value = True - engine.default_sampling_params_list = [MagicMock() for _ in range(3)] + engine.default_sampling_params_list = [mocker.MagicMock() for _ in range(3)] engine.get_stage_metadata.side_effect = lambda i: { "final_output_type": "text" if i == 0 else "audio", "final_output": True, @@ -25,17 +24,15 @@ def mock_engine(self): return engine @pytest.fixture - def omni_base_instance(self, mock_engine): + def omni_base_instance(self, mock_engine, mocker: MockerFixture): """Create an OmniBase instance with mocked dependencies.""" - with ( - patch("vllm_omni.entrypoints.omni_base.AsyncOmniEngine", return_value=mock_engine), - patch("vllm_omni.entrypoints.omni_base.omni_snapshot_download", side_effect=lambda x: x), - patch("vllm_omni.entrypoints.omni_base.weakref.finalize"), - ): - from vllm_omni.entrypoints.omni_base import OmniBase - - instance = OmniBase(model="test-model") - return instance + mocker.patch("vllm_omni.entrypoints.omni_base.AsyncOmniEngine", return_value=mock_engine) + mocker.patch("vllm_omni.entrypoints.omni_base.omni_snapshot_download", side_effect=lambda x: x) + mocker.patch("vllm_omni.entrypoints.omni_base.weakref.finalize") + from vllm_omni.entrypoints.omni_base import OmniBase + + instance = OmniBase(model="test-model") + return instance def test_start_profile_calls_collective_rpc(self, omni_base_instance, mock_engine): """Test that start_profile calls collective_rpc with correct arguments.""" diff --git a/tests/entrypoints/test_serve.py b/tests/entrypoints/test_serve.py index 916db3cc22a..afa7fa82e4b 100644 --- a/tests/entrypoints/test_serve.py +++ b/tests/entrypoints/test_serve.py @@ -3,9 +3,9 @@ from __future__ import annotations import argparse -from unittest.mock import Mock, patch import pytest +from pytest_mock import 
MockerFixture from vllm_omni.entrypoints.cli.serve import run_headless @@ -26,45 +26,43 @@ def _make_headless_args() -> argparse.Namespace: ) -def test_run_headless_registers_stage_once_and_launches_all_local_engines() -> None: +def test_run_headless_registers_stage_once_and_launches_all_local_engines(mocker: MockerFixture) -> None: args = _make_headless_args() - stage_cfg = Mock(stage_id=3) + stage_cfg = mocker.Mock(stage_id=3) stage_cfgs = [stage_cfg] - parallel_config = Mock( + parallel_config = mocker.Mock( data_parallel_size_local=2, data_parallel_rank=4, data_parallel_rank_local=1, node_rank_within_dp=0, ) - vllm_config = Mock(parallel_config=parallel_config) - executor_class = Mock() - engine_manager = Mock() - - with ( - patch( - "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", - return_value=("/fake/stages.yaml", stage_cfgs), - ), - patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment"), - patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=Mock()), - patch("vllm_omni.engine.stage_init_utils.get_stage_connector_spec", return_value={}), - patch("vllm_omni.engine.stage_init_utils.build_engine_args_dict", return_value={}), - patch( - "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", - return_value=(None, None, None), - ), - patch( - "vllm_omni.engine.stage_init_utils.build_vllm_config", - return_value=(vllm_config, executor_class), - ) as mock_build_vllm_config, - patch( - "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", - return_value="tcp://127.0.0.1:26001", - ) as mock_register, - patch("vllm.v1.engine.utils.CoreEngineProcManager", return_value=engine_manager) as mock_manager_cls, - patch("signal.signal"), - ): - run_headless(args) + vllm_config = mocker.Mock(parallel_config=parallel_config) + executor_class = mocker.Mock() + engine_manager = mocker.Mock() + + mocker.patch( + 
"vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", + return_value=("/fake/stages.yaml", stage_cfgs), + ) + mocker.patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment") + mocker.patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=mocker.Mock()) + mocker.patch("vllm_omni.engine.stage_init_utils.get_stage_connector_spec", return_value={}) + mocker.patch("vllm_omni.engine.stage_init_utils.build_engine_args_dict", return_value={}) + mocker.patch( + "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mock_build_vllm_config = mocker.patch( + "vllm_omni.engine.stage_init_utils.build_vllm_config", + return_value=(vllm_config, executor_class), + ) + mock_register = mocker.patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value="tcp://127.0.0.1:26001", + ) + mock_manager_cls = mocker.patch("vllm.v1.engine.utils.CoreEngineProcManager", return_value=engine_manager) + mocker.patch("signal.signal") + run_headless(args) mock_build_vllm_config.assert_called_once_with( stage_cfg, @@ -92,89 +90,85 @@ def test_run_headless_registers_stage_once_and_launches_all_local_engines() -> N engine_manager.shutdown.assert_called_once_with() -def test_run_headless_honors_explicit_log_stats_flag() -> None: +def test_run_headless_honors_explicit_log_stats_flag(mocker: MockerFixture) -> None: args = _make_headless_args() args.log_stats = True - stage_cfg = Mock(stage_id=3) + stage_cfg = mocker.Mock(stage_id=3) stage_cfgs = [stage_cfg] - parallel_config = Mock( + parallel_config = mocker.Mock( data_parallel_size_local=2, data_parallel_rank=4, data_parallel_rank_local=1, node_rank_within_dp=0, ) - vllm_config = Mock(parallel_config=parallel_config) - executor_class = Mock() - engine_manager = Mock() - - with ( - patch( - "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", - 
return_value=("/fake/stages.yaml", stage_cfgs), - ), - patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment"), - patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=Mock()), - patch("vllm_omni.engine.stage_init_utils.get_stage_connector_spec", return_value={}), - patch("vllm_omni.engine.stage_init_utils.build_engine_args_dict", return_value={}), - patch( - "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", - return_value=(None, None, None), - ), - patch( - "vllm_omni.engine.stage_init_utils.build_vllm_config", - return_value=(vllm_config, executor_class), - ), - patch( - "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", - return_value="tcp://127.0.0.1:26001", - ), - patch("vllm.v1.engine.utils.CoreEngineProcManager", return_value=engine_manager) as mock_manager_cls, - patch("signal.signal"), - ): - run_headless(args) + vllm_config = mocker.Mock(parallel_config=parallel_config) + executor_class = mocker.Mock() + engine_manager = mocker.Mock() + + mocker.patch( + "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", + return_value=("/fake/stages.yaml", stage_cfgs), + ) + mocker.patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment") + mocker.patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=mocker.Mock()) + mocker.patch("vllm_omni.engine.stage_init_utils.get_stage_connector_spec", return_value={}) + mocker.patch("vllm_omni.engine.stage_init_utils.build_engine_args_dict", return_value={}) + mocker.patch( + "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch( + "vllm_omni.engine.stage_init_utils.build_vllm_config", + return_value=(vllm_config, executor_class), + ) + mocker.patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + 
return_value="tcp://127.0.0.1:26001", + ) + mock_manager_cls = mocker.patch("vllm.v1.engine.utils.CoreEngineProcManager", return_value=engine_manager) + mocker.patch("signal.signal") + run_headless(args) manager_kwargs = mock_manager_cls.call_args.kwargs assert manager_kwargs["log_stats"] is True -def test_run_headless_launches_diffusion_stage_via_omni_master() -> None: +def test_run_headless_launches_diffusion_stage_via_omni_master(mocker: MockerFixture) -> None: args = _make_headless_args() - stage_cfg = Mock(stage_id=3, stage_type="diffusion") - stage_cfg.engine_args = Mock() + stage_cfg = mocker.Mock(stage_id=3, stage_type="diffusion") + stage_cfg.engine_args = mocker.Mock() stage_cfg.engine_input_source = [] stage_cfgs = [stage_cfg] - metadata = Mock(stage_id=3) - od_config = Mock() - proc = Mock() + metadata = mocker.Mock(stage_id=3) + od_config = mocker.Mock() + proc = mocker.Mock() proc.exitcode = 0 proc.is_alive.return_value = False - with ( - patch( - "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", - return_value=("/fake/stages.yaml", stage_cfgs), - ), - patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment"), - patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=Mock()), - patch( - "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", - return_value=(None, None, None), - ), - patch("vllm_omni.engine.stage_init_utils.extract_stage_metadata", return_value=metadata), - patch("vllm_omni.engine.stage_init_utils.inject_kv_stage_info") as mock_inject_stage_info, - patch("vllm_omni.engine.stage_init_utils.build_diffusion_config", return_value=od_config), - patch( - "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", - return_value=("tcp://127.0.0.1:26001", "tcp://127.0.0.1:26002", "tcp://127.0.0.1:26003"), - ) as mock_register, - patch( - "vllm_omni.diffusion.stage_diffusion_proc.spawn_diffusion_proc", - return_value=(proc, 
"tcp://127.0.0.1:26001", "tcp://127.0.0.1:26002", "tcp://127.0.0.1:26003"), - ) as mock_spawn, - patch("vllm_omni.diffusion.stage_diffusion_proc.complete_diffusion_handshake") as mock_handshake, - patch("signal.signal"), - ): - run_headless(args) + mocker.patch( + "vllm_omni.entrypoints.utils.load_and_resolve_stage_configs", + return_value=("/fake/stages.yaml", stage_cfgs), + ) + mocker.patch("vllm_omni.engine.stage_init_utils.prepare_engine_environment") + mocker.patch("vllm_omni.engine.stage_init_utils.load_omni_transfer_config_for_model", return_value=mocker.Mock()) + mocker.patch( + "vllm_omni.distributed.omni_connectors.utils.initialization.resolve_omni_kv_config_for_stage", + return_value=(None, None, None), + ) + mocker.patch("vllm_omni.engine.stage_init_utils.extract_stage_metadata", return_value=metadata) + mock_inject_stage_info = mocker.patch("vllm_omni.engine.stage_init_utils.inject_kv_stage_info") + mocker.patch("vllm_omni.engine.stage_init_utils.build_diffusion_config", return_value=od_config) + mock_register = mocker.patch( + "vllm_omni.engine.stage_engine_startup.register_stage_with_omni_master", + return_value=("tcp://127.0.0.1:26001", "tcp://127.0.0.1:26002", "tcp://127.0.0.1:26003"), + ) + mock_spawn = mocker.patch( + "vllm_omni.diffusion.stage_diffusion_proc.spawn_diffusion_proc", + return_value=(proc, "tcp://127.0.0.1:26001", "tcp://127.0.0.1:26002", "tcp://127.0.0.1:26003"), + ) + mock_handshake = mocker.patch("vllm_omni.diffusion.stage_diffusion_proc.complete_diffusion_handshake") + mocker.patch("signal.signal") + run_headless(args) mock_inject_stage_info.assert_called_once_with(stage_cfg, 3) mock_register.assert_called_once_with( diff --git a/tests/model_executor/models/mimo_audio/test_mimo_audio_code2wav_batch_decode.py b/tests/model_executor/models/mimo_audio/test_mimo_audio_code2wav_batch_decode.py index 85c0e8b56e4..8858d1f8f16 100644 --- a/tests/model_executor/models/mimo_audio/test_mimo_audio_code2wav_batch_decode.py +++ 
b/tests/model_executor/models/mimo_audio/test_mimo_audio_code2wav_batch_decode.py @@ -2,10 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from types import SimpleNamespace -from unittest.mock import Mock import pytest import torch +from pytest_mock import MockerFixture from vllm_omni.model_executor.models.mimo_audio.config_mimo_audio import TALKER_CODEC_PAD_TOKEN_ID from vllm_omni.model_executor.models.mimo_audio.mimo_audio_code2wav import ( @@ -51,7 +51,7 @@ def _make_invalid_flat_immediate_eostm(eostm_id: int = 666) -> torch.Tensor: return g.reshape(-1) -def _minimal_model(): +def _minimal_model(mocker: MockerFixture): """Avoid __init__ (HF tokenizer paths); only fields used by _batch_decode_waveforms.""" model = object.__new__(MiMoAudioToken2WavForConditionalGenerationVLLM) model.device = torch.device("cpu") @@ -59,7 +59,7 @@ def _minimal_model(): model.streamer_config = AudioStreamerConfig(group_size=_GROUP, audio_channels=_AC) model.codes = _codes_ns() - decode_vq = Mock( + decode_vq = mocker.Mock( side_effect=lambda audio_codes: torch.ones( audio_codes.shape[1], 7, @@ -67,7 +67,7 @@ def _minimal_model(): device=audio_codes.device, ) ) - decoder = Mock() + decoder = mocker.Mock() audio_tok = SimpleNamespace( encoder=SimpleNamespace(decode_vq=decode_vq), @@ -78,9 +78,9 @@ def _minimal_model(): return model, audio_tok -def test_batch_decode_waveforms_empty_input_list(): +def test_batch_decode_waveforms_empty_input_list(mocker: MockerFixture): """Empty input list returns a single zero-length float32 tensor on model device.""" - model, _ = _minimal_model() + model, _ = _minimal_model(mocker) out = MiMoAudioToken2WavForConditionalGenerationVLLM._batch_decode_waveforms(model, []) assert len(out) == 1 assert out[0].dtype == torch.float32 @@ -88,9 +88,9 @@ def test_batch_decode_waveforms_empty_input_list(): assert out[0].device == model.device -def test_batch_decode_waveforms_single_vs_multiple_decoder_shapes(): +def 
test_batch_decode_waveforms_single_vs_multiple_decoder_shapes(mocker: MockerFixture): """Single and multi-request batches produce correctly shaped packed hidden states and trimmed waveforms.""" - model, audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) decoder = audio_tok.decoder # Single valid request: decoder output rank-3 for double squeeze path @@ -118,9 +118,9 @@ def test_batch_decode_waveforms_single_vs_multiple_decoder_shapes(): assert out2[1].shape == (8 * _FTP,) -def test_batch_decode_waveforms_mixed_valid_invalid_requests(): +def test_batch_decode_waveforms_mixed_valid_invalid_requests(mocker: MockerFixture): """Mixed valid and invalid requests: invalid slots get empty tensors, valid slots get decoded waveforms.""" - model, audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) valid_a = _make_valid_flat_codes(1) valid_b = _make_valid_flat_codes(1) dummy = _make_dummy_code_tensor() @@ -151,9 +151,9 @@ def test_batch_decode_waveforms_mixed_valid_invalid_requests(): assert input_lengths.tolist() == [4, 4] -def test_batch_decode_waveforms_all_invalid_returns_per_request_empty(): +def test_batch_decode_waveforms_all_invalid_returns_per_request_empty(mocker: MockerFixture): """All-invalid batch skips decoder entirely and returns empty tensors for every slot.""" - model, audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) out = MiMoAudioToken2WavForConditionalGenerationVLLM._batch_decode_waveforms( model, [None, _make_dummy_code_tensor(), torch.tensor([], dtype=torch.long)], @@ -163,9 +163,9 @@ def test_batch_decode_waveforms_all_invalid_returns_per_request_empty(): audio_tok.decoder.assert_not_called() -def test_batch_decode_waveforms_output_shape_trim_when_decoder_returns_extra_samples(): +def test_batch_decode_waveforms_output_shape_trim_when_decoder_returns_extra_samples(mocker: MockerFixture): """Decoder output longer than valid_len is trimmed to the exact expected waveform length.""" - model, 
audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) flat = _make_valid_flat_codes(1) # Longer than valid_len so branch wav = wav[:valid_len] runs audio_tok.decoder.return_value = torch.ones(1, 1, 10_000, dtype=torch.float32) @@ -175,9 +175,9 @@ def test_batch_decode_waveforms_output_shape_trim_when_decoder_returns_extra_sam assert out[0].dtype == torch.float32 -def test_batch_decode_waveforms_multi_request_trims_each_row_when_decoder_returns_extra(): +def test_batch_decode_waveforms_multi_request_trims_each_row_when_decoder_returns_extra(mocker: MockerFixture): """Else-branch split: per-request wav[:valid_len] when decoder pads each batch row.""" - model, audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) a = _make_valid_flat_codes(1) b = _make_valid_flat_codes(2) audio_tok.decoder.return_value = torch.ones(2, 1, 10_000, dtype=torch.float32) @@ -189,9 +189,9 @@ def test_batch_decode_waveforms_multi_request_trims_each_row_when_decoder_return assert out[1].dtype == torch.float32 -def test_batch_decode_waveforms_valid_only_at_edges_maps_to_correct_indices(): +def test_batch_decode_waveforms_valid_only_at_edges_maps_to_correct_indices(mocker: MockerFixture): """Tensor packing order must match valid_indices when invalid requests are in the middle.""" - model, audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) first = _make_valid_flat_codes(1) last = _make_valid_flat_codes(2) inputs = [ @@ -212,9 +212,9 @@ def test_batch_decode_waveforms_valid_only_at_edges_maps_to_correct_indices(): assert input_lengths.tolist() == [4, 8] -def test_batch_decode_waveforms_output_shapes_1d_float32_for_all_slots(): +def test_batch_decode_waveforms_output_shapes_1d_float32_for_all_slots(mocker: MockerFixture): """Every slot is a 1-D float32 vector (empty or waveform), matching downstream expectations.""" - model, audio_tok = _minimal_model() + model, audio_tok = _minimal_model(mocker) inputs = [_make_valid_flat_codes(1), None, 
_make_valid_flat_codes(1)] audio_tok.decoder.return_value = torch.ones(2, 1, 5000, dtype=torch.float32) out = MiMoAudioToken2WavForConditionalGenerationVLLM._batch_decode_waveforms(model, inputs) diff --git a/tests/model_executor/models/qwen2_5_omni/test_qwen2_5_omni_embed.py b/tests/model_executor/models/qwen2_5_omni/test_qwen2_5_omni_embed.py index 8e04b04966b..587e7f7f8b1 100644 --- a/tests/model_executor/models/qwen2_5_omni/test_qwen2_5_omni_embed.py +++ b/tests/model_executor/models/qwen2_5_omni/test_qwen2_5_omni_embed.py @@ -10,10 +10,9 @@ - Interleaved (use_audio_in_video) should also work correctly. """ -from unittest.mock import Mock - import pytest import torch +from pytest_mock import MockerFixture from vllm.model_executor.models.qwen2_5_omni_thinker import ( check_interleaved_audio_video, merge_interleaved_embeddings, @@ -107,7 +106,7 @@ def test_interleaved(self): # --------------------------------------------------------------------------- -def make_mock_model(hidden: int = 8): +def make_mock_model(mocker: MockerFixture, hidden: int = 8): """ Return a minimal mock of Qwen2_5OmniThinkerForConditionalGeneration that has enough structure to run embed_input_ids. @@ -116,10 +115,10 @@ def make_mock_model(hidden: int = 8): Qwen2_5OmniThinkerForConditionalGeneration, ) - model = Mock(spec=Qwen2_5OmniThinkerForConditionalGeneration) + model = mocker.Mock(spec=Qwen2_5OmniThinkerForConditionalGeneration) # Config with token IDs - cfg = Mock() + cfg = mocker.Mock() cfg.video_token_index = VIDEO_TOKEN_ID cfg.audio_token_index = AUDIO_TOKEN_ID model.config = cfg @@ -130,9 +129,9 @@ def fake_lm_embed(ids: torch.Tensor) -> torch.Tensor: # view with shared memory, which masked_scatter_ cannot handle). 
return ids.float().unsqueeze(-1).expand(-1, hidden).clone() - lang_model = Mock() + lang_model = mocker.Mock() lang_model.embed_input_ids = fake_lm_embed - model.get_language_model = Mock(return_value=lang_model) + model.get_language_model = mocker.Mock(return_value=lang_model) from vllm.model_executor.models.interfaces import SupportsMultiModal @@ -169,7 +168,7 @@ def build_mm_embeds(audio_n, image_n, video_n, hidden, audio_val=10.0, image_val class TestEmbedInputIds: - def _run(self, audio_n, image_n, video_n, hidden=8): + def _run(self, mocker: MockerFixture, audio_n, image_n, video_n, hidden=8): """ Run embed_input_ids for a non-interleaved mixed-modality sequence. Returns (result_embeds, input_ids, is_multimodal). @@ -177,33 +176,33 @@ def _run(self, audio_n, image_n, video_n, hidden=8): input_ids, is_multimodal = make_token_seq(audio_n, image_n, video_n) mm_embeds = build_mm_embeds(audio_n, image_n, video_n, hidden) - model, _ = make_mock_model(hidden) + model, _ = make_mock_model(mocker, hidden) result = model.embed_input_ids(input_ids, mm_embeds, is_multimodal=is_multimodal) return result, input_ids, is_multimodal - def test_audio_only(self): + def test_audio_only(self, mocker: MockerFixture): """Audio-only: audio positions get audio embeddings.""" audio_n, hidden = 5, 8 audio_val = 10.0 - result, input_ids, is_multimodal = self._run(audio_n, 0, 0, hidden) + result, input_ids, is_multimodal = self._run(mocker, audio_n, 0, 0, hidden) audio_pos = (input_ids == AUDIO_TOKEN_ID).nonzero(as_tuple=True)[0] assert result[audio_pos].allclose(torch.full((audio_n, hidden), audio_val)), ( "Audio positions should get audio embeddings" ) - def test_video_only(self): + def test_video_only(self, mocker: MockerFixture): """Video-only: video positions get video embeddings.""" video_n, hidden = 6, 8 video_val = 30.0 - result, input_ids, is_multimodal = self._run(0, 0, video_n, hidden) + result, input_ids, is_multimodal = self._run(mocker, 0, 0, video_n, hidden) video_pos = 
(input_ids == VIDEO_TOKEN_ID).nonzero(as_tuple=True)[0] assert result[video_pos].allclose(torch.full((video_n, hidden), video_val)), ( "Video positions should get video embeddings" ) - def test_mixed_modalities_audio_goes_to_audio_pos(self): + def test_mixed_modalities_audio_goes_to_audio_pos(self, mocker: MockerFixture): """ Regression test for GitHub issue #34506: With audio + image + video (non-interleaved), audio positions must @@ -212,7 +211,7 @@ def test_mixed_modalities_audio_goes_to_audio_pos(self): audio_n, image_n, video_n, hidden = 5, 4, 6, 8 audio_val, image_val, video_val = 10.0, 20.0, 30.0 - result, input_ids, is_multimodal = self._run(audio_n, image_n, video_n, hidden) + result, input_ids, is_multimodal = self._run(mocker, audio_n, image_n, video_n, hidden) audio_pos = (input_ids == AUDIO_TOKEN_ID).nonzero(as_tuple=True)[0] image_pos = (input_ids == IMAGE_TOKEN_ID).nonzero(as_tuple=True)[0] @@ -233,10 +232,10 @@ def test_mixed_modalities_audio_goes_to_audio_pos(self): f"Video emb wrong: expected {video_val}, got mean={mean_v:.1f}" ) - def test_text_positions_unchanged(self): + def test_text_positions_unchanged(self, mocker: MockerFixture): """Text positions should keep their text embeddings.""" audio_n, image_n, video_n, hidden = 3, 2, 4, 8 - result, input_ids, is_multimodal = self._run(audio_n, image_n, video_n, hidden) + result, input_ids, is_multimodal = self._run(mocker, audio_n, image_n, video_n, hidden) text_pos = (~is_multimodal).nonzero(as_tuple=True)[0] # Text tokens have value TEXT_TOKEN_ID=0, so embed -> 0.0 @@ -244,7 +243,7 @@ def test_text_positions_unchanged(self): "Text positions should keep text embeddings" ) - def test_interleaved_use_audio_in_video(self): + def test_interleaved_use_audio_in_video(self, mocker: MockerFixture): """ Interleaved (use_audio_in_video): video chunks interleaved with audio. Video embeddings must go to video positions, audio to audio positions. 
@@ -263,7 +262,7 @@ def test_interleaved_use_audio_in_video(self): torch.full((audio_n, hidden), audio_val), ] - model, _ = make_mock_model(hidden) + model, _ = make_mock_model(mocker, hidden) result = model.embed_input_ids(input_ids, mm_embeds, is_multimodal=is_multimodal) video_pos = (input_ids == VIDEO_TOKEN_ID).nonzero(as_tuple=True)[0] diff --git a/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py b/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py index e2970dcb2df..b0ce10a8d5e 100644 --- a/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py +++ b/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py @@ -15,9 +15,10 @@ import os import sys import types -from unittest.mock import MagicMock, patch +import pytest import torch +from pytest_mock import MockerFixture # Direct file import to avoid vllm_omni.__init__ patch dependencies. _BASE = os.path.join( @@ -41,28 +42,31 @@ def _load_module(name: str, filename: str): return mod -def _build_mock_modules() -> dict[str, object]: +def _build_mock_modules(mocker: MockerFixture) -> dict[str, object]: """Build the dict of modules to inject into sys.modules.""" - platforms_mock = MagicMock() + platforms_mock = mocker.MagicMock() platforms_mock.current_omni_platform.supports_torch_inductor.return_value = False - logger_mock = MagicMock() - logger_mock.init_logger = lambda name: MagicMock() + logger_mock = mocker.MagicMock() + logger_mock.init_logger = lambda name: mocker.MagicMock() - vllm_config_mod = MagicMock() - vllm_config_mod.set_current_vllm_config = lambda cfg: MagicMock(__enter__=MagicMock(), __exit__=MagicMock()) + vllm_config_mod = mocker.MagicMock() + vllm_config_mod.set_current_vllm_config = lambda cfg: mocker.MagicMock( + __enter__=mocker.MagicMock(), + __exit__=mocker.MagicMock(), + ) - weight_utils_mock = MagicMock() + weight_utils_mock = mocker.MagicMock() weight_utils_mock.default_weight_loader = lambda p, w: None pkg = 
types.ModuleType("vllm_omni.model_executor.models.qwen3_tts") pkg.__path__ = [os.path.abspath(_BASE)] return { - "vllm_omni": MagicMock(), + "vllm_omni": mocker.MagicMock(), "vllm_omni.platforms": platforms_mock, "vllm.logger": logger_mock, - "vllm.config": MagicMock(), + "vllm.config": mocker.MagicMock(), "vllm.config.vllm": vllm_config_mod, "vllm.model_executor.model_loader.weight_utils": weight_utils_mock, "vllm_omni.model_executor": types.ModuleType("vllm_omni.model_executor"), @@ -71,38 +75,47 @@ def _build_mock_modules() -> dict[str, object]: } -def _load_target_classes(): +def _load_target_classes(mocker: MockerFixture): """Load config and code predictor modules with mocked dependencies. - Uses patch.dict to ensure sys.modules is always restored, even on failure. + Uses mocker.patch.dict to ensure sys.modules is always restored, even on failure. """ - mocks = _build_mock_modules() - with patch.dict(sys.modules, mocks): - config_mod = _load_module( - "vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts", - "configuration_qwen3_tts.py", - ) - sys.modules["vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts"] = config_mod + mocks = _build_mock_modules(mocker) + mocker.patch.dict(sys.modules, mocks) + config_mod = _load_module( + "vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts", + "configuration_qwen3_tts.py", + ) + sys.modules["vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts"] = config_mod - cp_mod = _load_module( - "vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_code_predictor_vllm", - "qwen3_tts_code_predictor_vllm.py", - ) + cp_mod = _load_module( + "vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_code_predictor_vllm", + "qwen3_tts_code_predictor_vllm.py", + ) return config_mod, cp_mod -_config_mod, _cp_mod = _load_target_classes() - -Qwen3TTSTalkerCodePredictorConfig = _config_mod.Qwen3TTSTalkerCodePredictorConfig -Qwen3TTSTalkerConfig = _config_mod.Qwen3TTSTalkerConfig 
-CodePredictorWrapper = _cp_mod.Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM -CodePredictorModel = _cp_mod.Qwen3TTSTalkerCodePredictorModelVLLM +@pytest.fixture +def loaded_target_classes(mocker: MockerFixture): + config_mod, cp_mod = _load_target_classes(mocker) + return ( + config_mod.Qwen3TTSTalkerCodePredictorConfig, + config_mod.Qwen3TTSTalkerConfig, + cp_mod.Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM, + cp_mod.Qwen3TTSTalkerCodePredictorModelVLLM, + ) -def _make_tiny_config() -> tuple: +def _make_tiny_config(loaded_target_classes) -> tuple: """Create minimal configs for a tiny code predictor model.""" - cp_config = Qwen3TTSTalkerCodePredictorConfig( + ( + qwen3_tts_talker_code_predictor_config, + qwen3_tts_talker_config, + _, + _, + ) = loaded_target_classes + cp_config = qwen3_tts_talker_code_predictor_config( vocab_size=64, hidden_size=32, intermediate_size=64, @@ -113,16 +126,16 @@ def _make_tiny_config() -> tuple: num_code_groups=4, rms_norm_eps=1e-6, ) - talker_config = Qwen3TTSTalkerConfig( + talker_config = qwen3_tts_talker_config( hidden_size=32, num_code_groups=4, ) return cp_config, talker_config -def _make_vllm_config(max_num_seqs: int = 4) -> MagicMock: +def _make_vllm_config(mocker: MockerFixture, max_num_seqs: int = 4): """Create a mock VllmConfig with scheduler_config.""" - vllm_config = MagicMock() + vllm_config = mocker.MagicMock() vllm_config.scheduler_config.max_num_seqs = max_num_seqs return vllm_config @@ -130,12 +143,13 @@ def _make_vllm_config(max_num_seqs: int = 4) -> MagicMock: class TestCodePredictorDtypeAlignment: """Test that code predictor buffers match model parameter dtype.""" - def test_ensure_buffers_uses_given_dtype(self) -> None: + def test_ensure_buffers_uses_given_dtype(self, mocker: MockerFixture, loaded_target_classes) -> None: """_ensure_buffers should create proj_buf with the given dtype.""" - cp_config, talker_config = _make_tiny_config() - vllm_config = _make_vllm_config() + _, _, 
code_predictor_wrapper, _ = loaded_target_classes + cp_config, talker_config = _make_tiny_config(loaded_target_classes) + vllm_config = _make_vllm_config(mocker) - predictor = CodePredictorWrapper( + predictor = code_predictor_wrapper( vllm_config=vllm_config, config=cp_config, talker_config=talker_config, @@ -150,12 +164,13 @@ def test_ensure_buffers_uses_given_dtype(self) -> None: predictor._ensure_buffers(torch.device("cpu"), torch.float32) assert predictor._proj_buf.dtype == torch.float32 - def test_warmup_aligns_buffer_to_model_params(self) -> None: + def test_warmup_aligns_buffer_to_model_params(self, mocker: MockerFixture, loaded_target_classes) -> None: """_warmup_buckets should align proj_buf dtype to model parameters.""" - cp_config, talker_config = _make_tiny_config() - vllm_config = _make_vllm_config(max_num_seqs=2) + _, _, code_predictor_wrapper, _ = loaded_target_classes + cp_config, talker_config = _make_tiny_config(loaded_target_classes) + vllm_config = _make_vllm_config(mocker, max_num_seqs=2) - predictor = CodePredictorWrapper( + predictor = code_predictor_wrapper( vllm_config=vllm_config, config=cp_config, talker_config=talker_config, @@ -177,12 +192,13 @@ def test_warmup_aligns_buffer_to_model_params(self) -> None: assert predictor._proj_buf.dtype == torch.float16 - def test_setup_compile_caches_model_dtype(self) -> None: + def test_setup_compile_caches_model_dtype(self, mocker: MockerFixture, loaded_target_classes) -> None: """_setup_compile should cache model parameter dtype.""" - cp_config, talker_config = _make_tiny_config() - vllm_config = _make_vllm_config(max_num_seqs=2) + _, _, code_predictor_wrapper, _ = loaded_target_classes + cp_config, talker_config = _make_tiny_config(loaded_target_classes) + vllm_config = _make_vllm_config(mocker, max_num_seqs=2) - predictor = CodePredictorWrapper( + predictor = code_predictor_wrapper( vllm_config=vllm_config, config=cp_config, talker_config=talker_config, @@ -193,12 +209,13 @@ def 
test_setup_compile_caches_model_dtype(self) -> None: predictor._setup_compile() assert predictor._model_dtype == torch.float16 - def test_forward_with_mismatched_input_dtype(self) -> None: + def test_forward_with_mismatched_input_dtype(self, mocker: MockerFixture, loaded_target_classes) -> None: """forward() should not crash when inputs are float32 but model is float16.""" - cp_config, talker_config = _make_tiny_config() - vllm_config = _make_vllm_config(max_num_seqs=2) + _, _, code_predictor_wrapper, _ = loaded_target_classes + cp_config, talker_config = _make_tiny_config(loaded_target_classes) + vllm_config = _make_vllm_config(mocker, max_num_seqs=2) - predictor = CodePredictorWrapper( + predictor = code_predictor_wrapper( vllm_config=vllm_config, config=cp_config, talker_config=talker_config, @@ -231,10 +248,11 @@ def test_forward_with_mismatched_input_dtype(self) -> None: class TestCodePredictorModelDtype: """Test the inner model forward with different dtypes.""" - def test_model_forward_float16(self) -> None: + def test_model_forward_float16(self, loaded_target_classes) -> None: """Inner model forward should work in float16.""" - cp_config, _ = _make_tiny_config() - model = CodePredictorModel(cp_config, talker_hidden_size=32).to(torch.float16) + _, _, _, code_predictor_model = loaded_target_classes + cp_config, _ = _make_tiny_config(loaded_target_classes) + model = code_predictor_model(cp_config, talker_hidden_size=32).to(torch.float16) bsz, seq_len = 1, 4 inputs = torch.randn(bsz, seq_len, 32, dtype=torch.float16) @@ -244,10 +262,11 @@ def test_model_forward_float16(self) -> None: assert output.dtype == torch.float16 assert output.shape == (bsz, seq_len, 32) - def test_model_forward_float32(self) -> None: + def test_model_forward_float32(self, loaded_target_classes) -> None: """Inner model forward should work in float32.""" - cp_config, _ = _make_tiny_config() - model = CodePredictorModel(cp_config, talker_hidden_size=32).to(torch.float32) + _, _, _, 
code_predictor_model = loaded_target_classes + cp_config, _ = _make_tiny_config(loaded_target_classes) + model = code_predictor_model(cp_config, talker_hidden_size=32).to(torch.float32) bsz, seq_len = 1, 4 inputs = torch.randn(bsz, seq_len, 32, dtype=torch.float32) diff --git a/tests/model_executor/models/test_fish_speech_voice_cache.py b/tests/model_executor/models/test_fish_speech_voice_cache.py index 8fe7a4a4d11..fef4b551ab2 100644 --- a/tests/model_executor/models/test_fish_speech_voice_cache.py +++ b/tests/model_executor/models/test_fish_speech_voice_cache.py @@ -10,11 +10,11 @@ import os import tempfile -from unittest.mock import MagicMock, patch import numpy as np import pytest import torch +from pytest_mock import MockerFixture pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -61,18 +61,18 @@ class TestFishSpeechVoiceCacheIntegration: """Test the cache-hit / cache-miss / no-cache paths in the model.""" @pytest.fixture - def mock_model(self): + def mock_model(self, mocker: MockerFixture): """Create a mock FishSpeechSlowARForConditionalGeneration with cache.""" from vllm_omni.utils.voice_cache import VoiceEmbeddingCache - model = MagicMock() + model = mocker.MagicMock() model._voice_cache = VoiceEmbeddingCache(max_entries=4) model._semantic_begin_id = 151678 model._num_codebooks = 10 model._codebook_size = 4096 model.model_path = "/fake/model" - model.codebook_embeddings = MagicMock() - model.codebook_embeddings.weight = MagicMock() + model.codebook_embeddings = mocker.MagicMock() + model.codebook_embeddings.weight = mocker.MagicMock() model.codebook_embeddings.weight.device = torch.device("cpu") return model @@ -166,9 +166,9 @@ def test_created_at_zero_disables_cache(self, mock_model): class TestFishSpeechValidatorUploadedVoice: """Test _validate_fish_tts_request uploaded voice resolution.""" - def test_uploaded_voice_resolves_ref_audio(self): + def test_uploaded_voice_resolves_ref_audio(self, mocker: MockerFixture): """When voice matches an 
uploaded speaker, ref_audio should be auto-set.""" - request = MagicMock() + request = mocker.MagicMock() request.input = "Hello" request.voice = "alice" request.ref_audio = None @@ -185,17 +185,17 @@ def test_uploaded_voice_resolves_ref_audio(self): } # Simulate: voice in uploaded_speakers, file exists, get_audio returns data URL. - with patch("pathlib.Path.exists", return_value=True): - voice_lower = request.voice.lower() - assert voice_lower in uploaded_speakers + mocker.patch("pathlib.Path.exists", return_value=True) + voice_lower = request.voice.lower() + assert voice_lower in uploaded_speakers - speaker_info = uploaded_speakers[voice_lower] - ref_text_from_upload = speaker_info.get("ref_text") - assert ref_text_from_upload == "Hi this is Alice" + speaker_info = uploaded_speakers[voice_lower] + ref_text_from_upload = speaker_info.get("ref_text") + assert ref_text_from_upload == "Hi this is Alice" - def test_uploaded_voice_without_ref_text_uses_request_ref_text(self): + def test_uploaded_voice_without_ref_text_uses_request_ref_text(self, mocker: MockerFixture): """If upload has no ref_text but request provides it, use request's.""" - request = MagicMock() + request = mocker.MagicMock() request.input = "Hello" request.voice = "bob" request.ref_audio = None diff --git a/tests/test_fish_speech_voice_cache.py b/tests/test_fish_speech_voice_cache.py index 8fe7a4a4d11..1c299d80142 100644 --- a/tests/test_fish_speech_voice_cache.py +++ b/tests/test_fish_speech_voice_cache.py @@ -10,11 +10,12 @@ import os import tempfile -from unittest.mock import MagicMock, patch +from pathlib import Path import numpy as np import pytest import torch +from pytest_mock import MockerFixture pytestmark = [pytest.mark.core_model, pytest.mark.cpu] @@ -61,18 +62,18 @@ class TestFishSpeechVoiceCacheIntegration: """Test the cache-hit / cache-miss / no-cache paths in the model.""" @pytest.fixture - def mock_model(self): + def mock_model(self, mocker: MockerFixture): """Create a mock 
FishSpeechSlowARForConditionalGeneration with cache.""" from vllm_omni.utils.voice_cache import VoiceEmbeddingCache - model = MagicMock() + model = mocker.MagicMock() model._voice_cache = VoiceEmbeddingCache(max_entries=4) model._semantic_begin_id = 151678 model._num_codebooks = 10 model._codebook_size = 4096 model.model_path = "/fake/model" - model.codebook_embeddings = MagicMock() - model.codebook_embeddings.weight = MagicMock() + model.codebook_embeddings = mocker.MagicMock() + model.codebook_embeddings.weight = mocker.MagicMock() model.codebook_embeddings.weight.device = torch.device("cpu") return model @@ -166,9 +167,13 @@ def test_created_at_zero_disables_cache(self, mock_model): class TestFishSpeechValidatorUploadedVoice: """Test _validate_fish_tts_request uploaded voice resolution.""" - def test_uploaded_voice_resolves_ref_audio(self): + def test_uploaded_voice_resolves_ref_audio( + self, + monkeypatch: pytest.MonkeyPatch, + mocker: MockerFixture, + ): """When voice matches an uploaded speaker, ref_audio should be auto-set.""" - request = MagicMock() + request = mocker.MagicMock() request.input = "Hello" request.voice = "alice" request.ref_audio = None @@ -185,17 +190,21 @@ def test_uploaded_voice_resolves_ref_audio(self): } # Simulate: voice in uploaded_speakers, file exists, get_audio returns data URL. 
- with patch("pathlib.Path.exists", return_value=True): - voice_lower = request.voice.lower() - assert voice_lower in uploaded_speakers + monkeypatch.setattr(Path, "exists", lambda self: True) - speaker_info = uploaded_speakers[voice_lower] - ref_text_from_upload = speaker_info.get("ref_text") - assert ref_text_from_upload == "Hi this is Alice" + voice_lower = request.voice.lower() + assert voice_lower in uploaded_speakers + + speaker_info = uploaded_speakers[voice_lower] + ref_text_from_upload = speaker_info.get("ref_text") + assert ref_text_from_upload == "Hi this is Alice" - def test_uploaded_voice_without_ref_text_uses_request_ref_text(self): + def test_uploaded_voice_without_ref_text_uses_request_ref_text( + self, + mocker: MockerFixture, + ): """If upload has no ref_text but request provides it, use request's.""" - request = MagicMock() + request = mocker.MagicMock() request.input = "Hello" request.voice = "bob" request.ref_audio = None From 2a1d5060abbae97648d86f57d70fe5af57d41467 Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Mon, 13 Apr 2026 20:43:40 +0800 Subject: [PATCH 19/76] [skip ci][doc]Update async_chunk design diagram (#2420) Signed-off-by: amy-why-3459 --- docs/design/feature/async_chunk_design.md | 76 +++++++++++++++--- .../architecture/qwen3-omni-async-chunk.png | Bin 198564 -> 68497 bytes .../qwen3-omni-non-async-chunk.png | Bin 263596 -> 49242 bytes 3 files changed, 67 insertions(+), 9 deletions(-) diff --git a/docs/design/feature/async_chunk_design.md b/docs/design/feature/async_chunk_design.md index 202ef0e18e8..45314a0aec6 100644 --- a/docs/design/feature/async_chunk_design.md +++ b/docs/design/feature/async_chunk_design.md @@ -19,7 +19,7 @@ The `async_chunk` feature enables asynchronous, chunked processing of data acros For qwen3-omni: - **Thinker → Talker**: Per decode step (typically chunk_size=1) -- **Talker → Code2Wav**: Accumulated to `codec_chunk_frames` (default=25) before sending. 
During the initial phase, a dynamic initial chunk size (IC) is automatically selected based on server load to reduce TTFA. Use the per-request `initial_codec_chunk_frames` API field to override. +- **Talker → Code2Wav**: Accumulated to `codec_chunk_frames` (default=25) before sending. During the initial phase, a dynamic initial chunk size (IC) is automatically selected based on server load to reduce TTFP. Use the per-request `initial_codec_chunk_frames` API field to override. - **Code2Wav**: Streaming decode with code2wav chunk_size With `async_chunk`: @@ -75,26 +75,84 @@ Enabling **async_chunk** (False→True) sharply reduces time-to-first-audio (TTF

## Architecture -### Data Flow -#### Sequential Flow +### Async Chunk Pipeline Overview + +The following diagram illustrates the **Async Chunk Architecture** for multi-stage models (e.g., Qwen3-Omni with Thinker → Talker → Code2Wav), showing how data flows through the 4-stage pipeline with parallel processing and dual-stream output: +

- - Data Flow between stages + + Async Chunk Pipeline Architecture

-#### Async Chunk Flow +**Diagram Legend:** +| Step | Stage Type | Description | +|:------:|:-----------:|:------------| +| `prefill` | Initialization | Context processing, KV cache initialization | +| `decode` | Autoregressive | Token-by-token generation in AR stages | +| `codes` | Audio Encoding | RVQ codec codes from Talker stage | +| `output` | Final Output | Text chunks or audio waveforms | +### Data Flow + +#### Stage 0: Thinker (Multimodal Understanding + Text Generation) +- **Prefill**: Processes multimodal input (text/image/audio/video), initializes KV cache +- **Decode Loop**: Generates text tokens autoregressively +- **Chunk Triggers**: Each decode step (typically `chunk_size=1`) can trigger downstream processing +- **Dual Output**: + - **Text Stream**: `text_0`, `text_1`, `text_2`... `text_n` streamed to output + - **Hidden States**: Passed to Talker stage for audio synthesis + +#### Stage 1: Talker (Text → RVQ Audio Codes) +- **Prefill**: Receives hidden states from Thinker as semantic condition +- **Decode Loop**: Generates RVQ codec codes autoregressively +- **Accumulation**: Codes accumulate to `codec_chunk_frames` (default=25) before forwarding +- **Dynamic IC**: Initial chunk size auto-selected based on server load to optimize TTFP +- **Output**: `codes` blocks (chunk 0, 1, ... n) sent to Code2Wav + +#### Stage 2: Code2Wav (Vocoder Decoder) +- **Non-Autoregressive**: Processes RVQ codes in parallel batches +- **Streaming Decode**: Converts codes to audio waveforms chunk-by-chunk +- **Batching**: Supports batched inference for multiple concurrent requests +- **Output**: Audio segments `audio_0`, `audio_1`, ... `audio_n` + +#### Stage 3: Output (Dual Stream) +- **Text Streaming**: `text_0` → `text_1` → `text_2` → ... (user sees response in real-time) +- **Audio Streaming**: `audio_0` → `audio_1` → ... 
(user hears audio progressively) + +### Execution Timeline + +``` +Timeline: Parallel vs Sequential + +Sequential (async_chunk=false): +[Thinker: ████████████████████] (2.0s) + [Talker: ████████████████████] (3.0s) + [Code2Wav: ████] (1.0s) +Total: 6.0s, TTFP: 6.0s + +Async Chunk (async_chunk=true): +[Thinker: ████░░░░████░░░░████] (2.0s, streaming) + [Talker: ░░████░░░░████░░] (3.0s, parallel) + [Code2Wav: ░░░░████░░] (1.0s, batched) +Total: ~3.5s, TTFP: ~0.5s + +█ = Active computation ░ = Waiting/idle +``` + +#### Sequential Flow (for comparison)

- - Data Flow between stages + + Sequential Data Flow

-### Async Chunk architecture +In sequential mode, each stage must wait for the previous stage to complete entirely before starting. + +### Async Chunk System Architecture

diff --git a/docs/source/architecture/qwen3-omni-async-chunk.png b/docs/source/architecture/qwen3-omni-async-chunk.png index b2d98b80f3329f2e55084cd79529f485ab1f58b5..e73ca84b283ed767494ff748e1aec189ec63c676 100644 GIT binary patch literal 68497 zcmd?R1zS~76E=)N2-2m5lyoT)(&3>c1O(}Bq&r3G08*0DDIm?EyHmQmySw3C$LD$8 z@BI(oxh^k(v-jF-&CHs)=bl+RP+nFX9hC?b0RaJBQUaoYfPi?4fbcjME;p+)MKnb`VVy;) z@)w?;qoG8tkuk3j*QxpMKXP&~7a*-s^KOfADUPH?tLgpy%La?1Sw)YdB%ow_J@mJi zoss&-0vXxuwV{U6s@qF9ezFalSH&I|ckZ|Dl~(RrHyYbfQBhc6LjOKh*)<9(|N8~u z59Siae}DN#{GsZ}e}7)=z}o-cHC1nYdH?sb<@5jlT>e*j8&zp>LfebqZvF4CcgBU9 zwKNWzi|5;8c@q_1)wQ)3kjdQT|M1-2-yE2lnr>}vVFY~m!vihyyt@__7H)2C)~t1m z{)5rp-VVM4R|wvnMk+akLyENOzyJHZ&i`O!WaRqqBPki^JBFp2Et~Y+2mT@Ew4nPc zV>w&ZY{}brcb-tjfvpHL>YMzUE$oA&FtHYspU>n-b-j_|m01|<%;UKGf`o(w4{tQ* z6IJdG+z8p^DT%c`&c@12M{aL#Gc9p)ejD}0hmZ++j{KIzwmAWxg1x!g z*t_X;!xPT0>NjQQrp_rp8eaJJ%+fU+?M&Hi^xr$4|1H{$2zcdcGGFKN>C-0>!g|vY z8lPtC^WCW%B2Fdbb9rPW$Kw;i8u_cslWFajs^afTL)X_gaEB&K^krxi^OOr9bk_l7 zo^M_}2NH*5?oaXO7+)mMk>R9i{F;Ov$gkWSNDtg@ZOU~9O1!gPw2hQ3P!5*2x3ny5 z)|ssMN{sEWw@Nn3C_e{tySug}@u3e-v6KBx+RYOeAqm0cA9cC19VymrZOd=^^EvN1 zQfrFWefRwBcJiFN`VCa|k!cdI7yZhn+?IGL(48=(H-zwO34EN<)=Nb9A0citBc6QNy*=<}xZOs_W^d&00^iz@JyW zdK>2lv}>cUP4_PE1Exx(Jq02*4i3CdH-~Nx8nriZZq9a`4i}q|XUepQc#TI#KKQZ7 zRT2#j1B9narEqLPzYWJ|>$jLP|OoL`BsL&)$OOp?U3f{z^>LSg5`K8><~BU>>Bz;T?arC9bYD-gtho(EjMrBQlTEp^PvA z1_(XB>uFa{k1KP>VbAGHk;H|1JUjybP*RWWQ&a2)39t2E#uXM*Y&3e4;s^EACc>6! 
zRhJzj5f5{w(k}N`8!Ckw^d_n&8w1*9CZFVsnyduwid9FDpECC{ztGaW?T-&b#vn0) zO+o1P+X8X<-OlSA_ohd4Hs&>N|DaI!XzV`BYwD;8L%>y=GnGYgqO4zv7sphz_r|$( zRMNA`<>Ed|&{3NW>9gYAWEtK*IVv-7|E<~#jXVs{HlE&N0wg>FG<<3YM0Z9!Hm2EeOO!(s(uO zABkV1C3-KI}328z`VP%+jCk>9tw>hFRRpNnT#v_ZNVC?c za%N{NkBd`7-6y*Manze13N5f>3O_xyr;I1y7ptP29XfsXfL*gzy_V<4*g`6EJMeCH zMnPWVMqeZm&7sJ0S>rJVqx2js!GyZTFUPo?QX2(s|M;dYNx+w44}CW*a@e1i{E7PQ zeI2oL6sZ}|rO0HNY zJpq2a2=4JU3E@-u_S9pAYr*}UpkSl6L<)2$xwhVKV{?l4P_V|h8d4;tQElr7RV#0z zj4zzNyu=#vLnD03^8j=v3;`DS%--{2Z6w_VXd#wWyYT2GG!g7E+)p>iGpOLJu~NV9 z&_vnn)U4Dy#+8}h9N0;`PO@bLzU>9uV-E|5+y(6TytQq?NQ{NqTZnp0&u8k`NsS)r}!)#^f2oA9SDW%a4MNJ~lB}*S1$MdRjMS>Fuv)=%*>d51 zed_{1jQ>6)C9O%lv8=eC6h##ydJ#=IKJztcdRS{i$NuxZfyDQ@(*ItIkouc^_7Rrv z*R)0_W3Pk$`*}1kdk@P6;`yXI;{(RKy;b9kI^HlfH>q15sh)`UOL6?YRacjiy zbv}4sBKC?k#~%u@t~?5(uQVE6EH0l#U#WACoiy9FcVaONj6JC0z&Jvqp{%iOOLwa> zpP+cx&{OXduw{Q-N1<6Oz;|5#MCECIl)-^y!rMVczPA?`p-;iwsy?7NAv`TU2x)%C zzLHUwo>-TD6*inJ$ASNw55ei#$#RoSmMYC_lCJ*wtN!%W*%`sao3p%NCnIAriTH-M zGT(dP(XNWjmv_-~wedK3ueKb5>n}*@zz?)QKlR`QU%!wNc!)3dtvvGn2FAejB-Seg zgs{=+Mp|raj;*ogjdRQ!zRB`O*q?j^Fv|%lli~MR_5d7*@NFjiA&wp)JbVA$aN&>Y z!+_Xy8JK^i*QIy1fW4N+B9rB4OLpMXXhd*_wcv~DC&~R4g9|s+a&p6AFx~}?$5-IT zc~=3iJv+F>4f4oi)lsTvrO)e^=BB)?AAg#b*5tie(4fESocdxHfbsVj{Z#KE&zXAY zJI4|iW{ytZN>7T6Y777V=Vr>jTSDc!pn^W z&A)J-{uQYb6~V+%BTr3AFY(WnbiM zbz+KG8mu%Nb(L*&mCs@0EEF9|6HokQ(}te}DXy5D%WH8}?ZW(G8erN+q%gRBi4}xR z{e3mSFi-MQFlfPx3mNy~d2o;~x0h+vgRWRt9i*&OSf+k>YZ@tTJ@VsK z{#hbjusiab>4>OYz+emGeBj5my-DoU%i9gj`PN#RBFvR`c9sDn*1qSA^*k)G{`KL6 zU*6*&`@T@L(n{6LJRvHdFV~|GV<`Bde6A3WLpi0Twj90<3$NAH0HbiRru&H@`WHAB z#EH8T0NDyjpg1+%p}gfb=@92qjKdfx)SL_$#`$V^PIGY5>{Mh{wIJ(3l+fh%>`G=( zF10LnenDJ5Z5G|CGt$-Y_T3{01nP?q50lm^8sK5J->ld1i;-A!ZH`CQt(>xjc`hI{cEg?f?9Zc>b5kMxZXwYAVB&s44Jp>49+``EYNFD82gOM0A=sO`zSz|lFfRsK1_sD*;k zJq_(o?QEe*4cT5rKccKC*IGZ-S*L9iOsvK4*PSY(^0`ySP$)?@29BS-srr2#z6afPNZiLVo+&Yi zQn-0`kboi=_jwWTrtP{X6%Fo`Ty^iah*}JutZODEYwVklMZ~YKPE>)RA?WG~8-BBO zCqvbh?4$CGCbKu$CV$8^Bty;J{ZfUC9HQwSVo0Xvv4~oe`*Cfqpte-tuTy>R2M 
zvY0<*h^VFSpp=ZzF$PT1eX)+H@Fsdgvg5y)pKf!SoMQu&AY3QV7mddUcT$3QPSg(P z@@D#4Z34_vhI!TP4Y_Z~;^x6{f3zpvzEsyR>1-Jjd5bYTot%B~+wc+AP()-M zzN2h1j|dB_u;-7?Jb_Z+4xx>$VnH2Z^bB$i6k}bP^I6($ue6P(_N$rxGe`v0^+?}o z_AW_Yl)*h#pwxhdInZk7y+fU-L7t^IDp9f0%)Upj%#x&JC~eTo*#p_0g4@64AFuow ziamPN<*U;#4w-u}Jhs;H3Q8GjtdT1UXyQr!qy-5oRwUKErH@Xj9{;%wlNVMbo9s}R zpVI%oX1nP5^z!(2a8%f0z&2emMK;b0nr^Jj!LSk0h53DJ?}$rV!~$k_UnDQKrB?7Z z>!z`|UjGugSk$2PlDI9CX{jQ?#)W&xTh&4(b=V;`z*Ze?!}0}TMSSD;%wyF*sq_Qwi5!eJ)uxq^-8 zfn9ocXPL`Ea8cB+tk36b5MUjXct|oD%8s!%nwHF}=~}Y`LsuOb{v%eJ8`wK7>aCl_=nC0q_)Dp;Gg+_!v|ftdMRQ!t zV*XIwp8M#Dh`kmUl}~sUZ;*?fyQPZdO`tlAow&oHx_nfEIqBE1m*tVsG12Fm8kxWA z`Fd*09NXh3@m2ZD&C+$wz59p8rief8-X}O89D-@6tGbU^sV;@?*GYfySSdww!6gPO zlBC{$?8h|sDY3>&2pa7sptIZAP7(M-#FDvIamUycI6gWz>@h|~SgB&C!&Q{q{C}+| zlc$9x{Zs$RfMBER*>>f;>KhufQBo3?Un?I4h#?_VyMJdxgJzn=O}OIRTijiULy81vDGaO2-s!kdLvjFfjblZfwBX zu$n4e1Gv7QRQhY72}9e}+(fHfh%9fS1E2^5Jg!kxdf*u!vs#&TvhMz=WJ~mNsKD)O zo9*C^?nv4qwQ{LMoMl!B0m-qM5t~S2e%*~tajtJB*lag3*H+HJIqrNHn`aIUz4Qw>-RknueJ(SxHS2U z(Wk4<$U;;h=k>(|4BG24+x_}(f3jsO4b(JU7_VMBF z)@^H)2fvw1Uii-p{T*P8XLX?d_ecbUKfP2fq;J{Xo3hhJ9T2`vV~l+N_f2Zue>vR6 zeez8>`d<7{WrtjNcf3-w7WpB-?0dky@pe#%;H_=_EJY_XE=ThR@w&PTzJunI8SRm$ zn50LZ<9(6&ak->_p1tB(7PK+1SWA4k!3b_ltgug~an7N!H|;^ZUztF|_^STS{2IEL zurcdiwWwCDl5(@3wU`z^=*Sx7qsb#x9+)&l)RlM zvN)eQLrD>NylNfyh&U}Sdw*$rHC~HHc7!}-XeUjwPQ1Q3&;!Zs^_kU4L}A6ZZ{N@e zSqxIT(6;gja!|OUcj(9|vlQq}==Y|}snY9iwVyJ#ldi21Vtu+f-xFml(eDhsy1JU8 zJ-(g33R2Ede;*Oc`ky$q`i#oyPt=jCa~%DhSw?Ef$wEot)nU?+a7bVdouH?}ZfLKQ z>rd8o3>WvM{k(b457~9)x_z|f&vozTA@*mrE{e(qgeqq}z2Xa=Y3x&PsS8qu>e*#$ z%Dk2qpOKNU9G_z*!Pa4`iBYGC2aS?ysXH^!!s!LngzKTeW8=ObWP)2hb(9OV&kw## z)WKfs2{l$Q_YiKM%P}-2F2$zq5xcJl& zp)XR&$Llg3!L$@)EiRbSWvA7EgM~F%YS_#BzV3SPK3Z~suVe0ITXUf(&%THuK3+02 zpIf0!0;i{^<#jtcFxgpLU*69(T1umrC6ySrRcAgDd$?iffv$$V_gNo^us#jNoY8L_ zEIx%m8WZU)26eZU6ZT-g^!)`nbw-tX@6TAvs3@)I{&=&%C8&D%WyOp$h^bg`zwy>Gb`(D%HtGYi-@ai<%z~*jb|H$>=^UmY>fG_} zYONwBs){e~wb_KF%}>A7)YKY>o$0XqHYC(Xc0(@QR424)kurq4whzSz;cyOTZ($lD 
zI|Y_5q>Qz%l-t~DwWrGc+x*G?qoQ*;nYc|E$Z6KyWm0n1UM6$k$p`h>_{S6Sce}IX zl<6n*f@_~Ie=bLo$-O*pxu-N^KfL>zGArc>^^}`CJr|`skSzLPn>74c`z6uU8*F@! zn#SAM-uWkbg3hW1bQe_#M1r1t*!5M;&A;p#r8EhvzntFJ3X<>*)~jFWw)#I)>B;$& z0p09#Ql3%3Kv4;Dlbk*83rKlAo~L59+7;0fjq~c&{GZQ^I}^16Un(W56lSH@0zU z)aYv5P(|1eX{gC;m5ss#Jmt>xw$7H?#->Qph@n1qEmjY!Uh90& ztZIF4G(Dd;>gc<6;%n2x#_`^Bb!G7@f6eHYM}~IJP=xFydJC0!fo5%Xg(se z7l=z6`ilQLePrt47j(pUKuVKz%OmmBCExTVHZvDl?3E> zY<`Sgn(mEG3%irlkTUnHW6QakXDZ=gVPV|{l<89?R$u3ilB^M{gHvo}{rNMf(oIKx zo0h9#OR@j@*wFB`@pk-t+|3+9Cwo5+v-t8bHGZ>SYjtHC8Mnr1IU5&!sW+>h;&D_S zC)zvGeXEV+A6BcSlsU51@#7_xg9F?&h)|pi=&rop;i3iR7;eVVMzKsO97UrMLZLw{ zt7~iPQ8(I4WDp4Y^bLFD?`=J{Umrniy#M~fTrcjEox`P8 z?s%;?s}LKALISDQuzPSZOB;}ZLn;abR?t@qcQcu6 zBSrnmMqVxFt_}l1(2D59RAJl{nq{TWQVk6azkY#2UH>eod1Y3!;zW1_U?j~tAQx|6 z?p&Y!$j|V)y`shDFdOYMV3SGaQ%y`)R8%zXi!TRyo`|WtoBmljQ*pjER1SMD+ADk= zq3^Ig3Je+NX3T`WqoX58v8-KhYYMiJuL@IsZJ`lST(b2<=DRDR>)1(Z-2Fyk-S?Jv zyBh%4Bz&Z?va%YIBI9?p&kzd%WugnW3y(%8?RTPz4g>pN)t}YI!18Y$j}(9>ZPcvl zvTOnTpV#@6F?DV?sIn~7yE*wZ7aM(8jzu;a#^2Oj@VWof&Hd|u?tPzUXiyN) zJgR8HFE_X^b?WoEe{M^4#uF0s&@`@HnP)Px|<{M~JSjF7WjqrKNj$9T*Ojs-KybYXYd}^Zh#kiw52Cc_pY_xgB@YF1Q`k zpPrt!w6wfyyr(hRvK>tO5U{wo2vmM{u%P|d*0?|MB>_Q&PV;wg+3f7>YW+^x(N8iTrI?cX2Zp>L8qa2V%vMD}S+wfn zA|h;7I?yHmG_SXMp7e3)5o{kEEL6jg(SHH__w5hGY(&9t3ai(34)$${{Gz$65Z2QL zGlM)fCsd;OU%TA)*4^1R;tiL@$R8VP>@-2SXgDkQwW!r>)lpXjm6n#44>D#<=HuaQ zxNpF`j<_xw;SypGg+dg`zh1dIlyfz(?=MWIb5R?LER+mQGsg0ykqZ zJMVYADsFe`2sIkrZ<{&@X&cHke-uS{hxwt&W-D$7Ia3C-QaIrA;i+kTq#UUAHEZ&M zlf*(IC{C*dQ>`b02_w-^PgO3@+x18<#jPI&aTxOu(Eve`1(=FZCoMp|9L-N^_y_aE zQ;zJE_xU~V<#KoSY>+TWBp?BpWHB|A$ta`@xv6XIt0k=?vnb25VD#{CV}IVW?ZC?F zp8Df$?DFHJfcn6o^Ex&ffnaFgdAly=$ZnRl1t!8|iq1XLtVsDa835ZLe)Bv-HZ`()r^`fIWLnUNSu z|4AjHhxwbBL7>TMH{d+Y>uw}9a9}UA=&(!Si4kd^OGD1QN1xjKxpPtICv!|pyL$(p zx&&Ksw{oR0=es=YqKCUqzhJMs50KS6p&ewRDAq1Py}RzgMoVYG!J0n=-fqCKqNK7~ zmn0-4prN5LH9X0PPkv{9kKm7+=twI>z2ek#uGF+uieaQCOU|#mzhFZ7oM|*JF7Cw- z!|g$taX^FHy*1KKA z)S$}a|NRTc1kWgDALYD6_4~fYYhwd&Lc5Q%n_Gq)b>MHAg!DDh-(zN3g^!imx9llr 
zk-_D;pe&N}mL@U~eZJWmX^31eTnae3x0s>^v)_9punc<5%B3{n;~vTb9hg1ma_-c5 z0}5Gk%`Hinm(ID2%Qj!C^vl&s@=Gtn$-4;tV+^P*q~GEV*nR=o^%;SU3Z_BDZaMoU zS4of4_Y$;%bdFz$4w4J}y^CUQO9G>l;{&RaLo!YN3ljmSGjb%Xt(kyQ*h$45v@<}; zEa!OO4+=Nx!L22v*oUWXlJI%!5{3B7^rNDqx2sf9>Ld(ie4rK>k1n6GKp|=iU{~~S zX>GMJ1s4l3z~`RauJ-A_jT>Q-|5ATx}=;KT<41IG+9klo!5p zMTUKOsi~=dMxl`t`H4hCMYYiskS}1X+1r;Xwhf~!}3OVJm_9NF#L4DUW77XV^y88%^`y(BGH3*nIW zYrR(5+z-O6S3Lsw3*2DkT)l*JuxZcppZ1qmPnR&hEl%Dt`HARO&WxFr&soOSh?V04 zll;UGz1bHORVz(rutXhsT+Kp7#wb{+kJy6I3`$VQp;<%p{;SO*!v~uu-UL zp9P0KB8`htz`(3DOP+4hnp1-aObjaoRQEt*4z$ZEt|41f~juT2)!H{8nPg# z*cXMb7z0!VE56R)3>6vo2WqfL1Z&zjp7CoaDB#?PO;~W|jhWf|m?sXI7%(zY3E=)x z&NF0W)GzyPQuVP<=Z#KT{A{teuW}xK>%WZSsa^mtz()`z>uo}Ki809*&sv4-)YU^w z%pXi(X7B#~{<5du$YxFxHe~_#UZKV%UzjXx3FgPrfybY976NiLq>Glav?Z{0hc7&S za;g8@qbJbKTG9%0FHPKPlkh>Og~>0pknNx@MD_0h1+ltsUcB+mHZo-=0(TK9e*~L3 zM7c=_=dCIeK}9m*F=_SC&KXe%(*HNduhV5O-&;3J(!yzYg4?wBh(00000 z0j*AT@O?GxjlL8Z5)us*8qjT^=fPm_B-%l3o|cvdj~m<0PgxXUY6Z`pJyTFY?vap^ z(rcXx|7UQiuP2#zwz69TH+?QK+u=oG(PhVvFLOTH+n;7h$4+7;S5>{Y$o%Z{^XJdW z*~))gWm&b#0G*kYC`+OHm^4W?U;oGZL`Z%c+y<3cFez$7@C(9qFE zg_szDq5Wt2@?ft6*Az)3!*E~w<>59_8x@s0_X$ctW+nvuhC5YF6tdpOwGdUqvoBmU zVFxSwYh;(i#c{1xot24o?L~_9%gW@!o7k#3^d40YI9SG{djM8y$frl zTKJ*qc(zT3ql{!W7;sR8Ms~WJcK4DY+6*u<;3>cb?2Bh9s4~T@t{PN$@lhtbWlrpb zh*2Gr9sw(jIcRHdr&z#7R{t7Q;F77F7je*qTAN3;0Cx;Tw1!n9AhTd?xcvS#{$~F8 za~P5UHm}nXW_K=*^0R$E{=KeYzoRsdd;E*xbhHiDP!eu7YU(WGfn@_%d<3*NRVp@|GSSyanR zhE%jY+C2-j7xElt#tYPL*z1bCM?d?_FtRk*?EkIS_PX0E5`54L1Nhk(G|z3ek1^$q z)qr?onPFeAMTU#pDnd5UCPtm3^ykNqAG}_lEi5cFG)S;P7aigQ7K&Vy=4R;`G z1eyGw)`bPHNdWZ$x?(ecjymL)?>f4*!;3($A zlf)1mL>z|Xw5BEEo0<8#K$+2Ck57f85X~zl2kh;PI3Rn3KQdxV7}+^FImPhL;q}=L zc$y`^@2+2GD$dT%zK-TZj5-|QVFXYK2qY({XDTt2s}*FUujV?oE&gN!J|kL+`Xi}% zwwtL-Kx=cw_l_?=w{x|iI&WS~?md1o{#2oUN2IRA*R?*m;0A8C^8B{Xoe&l=e)X*b zaA5iDRTxp19V_+|K|>vNJU5AqF{qn`OZG_a$kyc34+iK?dD9f%ffr{ig)#rdL-l)u ze=0xLz+%zEoY4LQ`zsjGnl)2H{w2bGgjUf&Z!hSg)SL1hGaK8ow-_?y#D2n%p~O&q zx9-40>PU%;P7G@$s0#j^6|=Eg41$$Pa-?dw0cZBRnb0#E;x 
zFk46wDznGTmwgTMTQzq=CMVoy_u*OC& z6B82)3nqnR)DkhpT%}!eJNrCH#*oSJ@$r`cgq_3e(Ng$epLSd~lj3aFRA~?a|5^}_31+?c7=W2<5YlR$4rC=H z7FmrMA5z6Pq2Bn|WMl(j&RFcwk>j;ijxs(f4d^0H9>sDs<998>SZSaYJw3fiXGSmx z0tT=Wx;5+fUk3GIQyS;Y!)E0GaT^JTiKK-WhE_tc-dZCbQN1cqCdS1DH$G)n>VIu< zb$Dov8Aj}jf!!m{2LgpfflbbLlnKNwcx#gqI%o%1`i964Xl`I!hFiFtIpW9p;$={8 z#!xc$93=(3X%UKAUISTBiv$m zf8K>BCEd;14LBUkdw_6xdvX|t9XV!Jptnv$tkm+o;P`Cy{>pObQKV#5tF9dqkcFCV z?+KNtla&j%FUpTOOJjX8%NxrLfuUXcePk}P6JCnQot8Oh1rS=_mD2>=FRjNu{9sHFqB2R;cpSGTX8AIi<#t)0==BQ+_UrpmX&sI^{!$zSfmU! zwF{JIz*DZ6W|#)Bnh}F-RADeVo2!9?uQM7MLsg$r7g;9!%pD1eVjGQ>BOaAN13XAp zt^cn@G%?R{>Mhnp%`TD`1`Ie@Sl;+?qIp!+H?_SKZyk@wweB|KwoqBsTI$!-&3#I| z+w_~jR?eoE)4W{mnfWFGMN5)?L7#yC+DsxMKWO3w_-=UYD0H-Rr)6y@n)x?P05P0X ztr7x@x6C!7Lb~S>&VQ$P*7cZES2*$P=aiNx@GTW;1+zZ+cKBcq-a~(?k?dDl^CWR5 ze7)G8yPbnVC5k4b#|jDzDMVZ}5@1($D`LfEq$C0)z~E-BkAes;m6=H5*WZQ zR8iYd`tweq{b*DVALJ*gfFgO-QvP#sR7Tkob}gg=F&z2om7&rkAh}k~3keLwwhIcP z^+h4c9{Cj>zF2Aj(<&mY?mO@m*Ce!DV2?FSSaX5L0zPD5%hu8gqR*N3bM@*CEDcdB zNJ@pqbod!zQ-Ap>)dqUrwUS~GOW*_NxMmJSA(rteQJ^5(4}}C=Am_qPB%`Dh<&Jdh zyz4;!U=x1z0{F*oYH$jqiuwxj{PV%!>(lMxeB<_LMv*qEl$h9~NdSX}QD4^hTM{5yUEqWnX)Xn>@jbSiXXAI!P7Gn&+c4ky8 z9&@>pdscb&bIuYJ6gFIyzlTyseC?HPS+aip`n9!OchLh zn&i=hUuz>f)ENxe<_qfBvp>}#&t&&a2_$rCWUAD~U0dK}!VC{-*hSVv$)DEtUe6?J9si>lO4 z!yRf6brh20iTYP?j=nI(#P2CrYxV{M;EIA>NCBY>13&QF{$ZxND^p9cyN;?)Uj*#mpX%j>U*xv{ZCdl#=2cL#3q)_WLARFq;+* zQ2A#kGPkC9<%<%H&7+)0R49#~Sr3N`$r**C_`pTH2SKoC0`z{zMn%1*rrtVp^%Gh8 z7qtNh0a5!o(g(kp!Ub-E&Eikb9c$+n76dstE7tE3<>^7e)@u#;AHGlO z2U({9Ivkdvu&)Jb5@if^p)WOO0f(6*3>+rHs$a@+@k@1!N2Fd}4*;`SX06t?92^ln zM*=LGPO*r%2i$&mr#yOma~Cx#OiQU9j&o1F+fp#-R;O*y|< zt$-&fl(|PSgtcCVCbC^mm-zXTIY=8R_%s3t%tN8y+RZFDcEF>aC;Y^W;efft%VxLY zF~isA4PW1WnOZ@+9z8w%(m4)5Lu15UxTK_g1!sB6c_mmsOume#4w-;XZU9DC-(%C^ z1p%0EIjIET=_to{bwGtw(&59E&{PjUfVRka>dK!`*iP1!p`3TXu)HM3UhJpyhFOFi zum>{b`jIPdVjv_I#ypgzQxW_G_}A z$70_EP6y!M!t z>Q*Q%CudC7bFc{j9hNy{0t|9egJP9;SqMsQyo=GHoY|tL1JpA14M;MTQ^&3xT%kdk z%S4s=Yw)@+)C z2U2<75-2ST@Q%wUvU?0u~;IKYslr1{_3s 
zC9ouj6_~vf3h+EaUpqLxjdDZgEQ?$zfYTZ%aPi(^&;wK-{H|3zWnydq#dL;f*W0?W z(z$@q)%w9jbw09l21rVfv7uvr${UOBt6%mqW^v`0Df%Bb> zqyH-wg%5xdi;Ref$QYV8xE}#-VFT$3xmBua7ASI87|dSppIj?Z>UYQY8MRiNrZ07T z)cxREy(Fj7X3Yua3yL$3fG2aU7oej0kfkuS=UA?$DIg9W4Rm>yTb!C32Q0|k#i#k3 z_v>^J0T`lDVc3QsV>T;Sdu|NZ)ju^SL-(5cl$YMVc)r%j1Q8e44+?(`gk8%5Na8jz zD4_hFgM8)xy`k5D%_c7iG?aMB~E0CyO|9W6qMCA0dr zX!gj_M1_vW9{|wd1m2lZNolD`loa(0HzRPfa6S}l`M!h?v(G^ONCCHW`5pr{z*vlX zEFY6bOSL1NrPeaC!PSHOyFD*y%`MUUj; zy(3(*UpQta>ZD>|!2|+B=OXD49UkiQ`3Pnf4^R;TlPmQB_Jz+gZOvn$uej!BQ<8~IrAP|rSfORyyk^A^Ds>UqY4~=Lr0UuWytN3l? zbC8DrBd@WY0B${m8_~BQpx_X(6?>kk?+y92wT8U}_ivT+&LZUm03B-h#61;>I|<+d zNyyu`Z!Kcvuk>+naIl0>gmaaYIqy2b8IH=xClR0d3sJeiLJz;dH$ti&l26*F3^9-q zfqDbrq4UOYh8jF>s-6Jq1_EjO@$%jB^70$+tPY^x>zkXC0GYeHi-p@KAXNfkV7Zio zj)Ec$WZF!Fx~A(O9nuU17J9qZAxr|u*aBtPmyq_PHGP@9F(TkC=hlGdc=zA66pRD* zh|6L!+`$BduA7#`RvJ`%LPFe60G-)$6eN8~Z7n5^$*=b1Ws?8UJ3$U4HNS@u;8R{b zW@fmO{UV_+COkYZz}o5?*#OAUOl#L$r~gA^2yEK249>mf(~XFYF& z+eIJ_J2CgMdOcC}AWvUSE}z*0MuSew3C{imiGi)}u{d9H4nUo9UUUfMQUEW=YmH#^ zc&HW2Y@;Y(2AuxyUOdS=9(OwPdY>0yeZW z+ds+D7iRQ!{2<|XWq#1F+{h7C6L zLsjtS$}(}1Y$t&Ph~u}qyb3Ddh`JElKa*qtb%%?cf8}*iQPKJN`4<&bj{#l7=4sF# zgzs2O>Vx;RosFZykxd#4E2rBRRB#xgK^hz~r3))dl%<6iP!^ZA4`~mdAXK5%&J`-> zeP;L!I5WUV-c8c9&(IK{iK|}hk zLC9tCdYw=P70iKv?0>4X#{%5~(6shEM}fdmU^a1oR<>`q?{Hn6p(BV2@^$9 zX&3>2fMyT4p36!aaPhz@wF4s9+t&bg#W8x+9d!_|bcWGVva%L!Zf;7)vIHu#kL0Vq zi;%8!>I@?*$~A90PbPb3^wH7LF(gqQ1)QGM(i(nnu(>|n0(}_30l4uEfJOn($1u~> z)D%p}>In|UekM^pgHZyO1Pv8c4-G-GeaZUvxR-S~3LmtCfTMJIDn$avT~vU-z|WQ^ z1Zo65gtpGD51^Boo10r1W#_(`9 zp$+d?KHr-O11HzPIsBFuk*iVkKjXh;h_$t~vlO`90T9+)%Gz zVrFLM=bsz7j|a@0gS@=Fwe=x5B3|pTL*wAU;BbF?MJ*kZnVp@Tl?8fFRJXUGe}D@E zXZFDqBO)TeXT2)eBtL{m9WCasM(`je1+mcY7q zCys`-cShWT8JD%s7D|rK_^zyks%y^(woA=5oM`gmN@ROtE zs|(nW$&Zi_{DRHP%S+H$RoY0cR&Iek%6Qn|Kc61W0{ZyA|JBd>P$Tx^MG5Gk0up?t zxj6+hMn`G|tC=jqtoT}B@sK*`*V9Snr3o{RnGYdzKWqkDE$kJ)TG=q@3ugzXiT5X1 z9^I$oGgsp2Rd*YJ0xDkGED&u{V^gGf`BjLQWnvd^yQk&YpXhD_c-vtd 
z{^e?<9O$mHYP_Q{N@K&r$2VmAb@-V_N#fetxceuySjG{MG30b}$Imurq(9s?Rdit5j&iLL5{ zAz>FgTsEh1sjv8JkWl7bBBNWh2Y!?B*YID}a->*NrZ2SySj|+(Y)eJIwOjvK0sDM8 zD6B}Cg1$XgZ)i7gjh4T8&^Qui;8C)2Q#r5x0Rp!$`qq7rre!@_e%`?{kuC^l40G>O%e)XbF!dU186;qY41ZP#u_gC zYQ7W{Nqg?^b+U)qc+2?@b=s~fQ0sBLAB->7FLNh zv^kUkf0aQ~>-{u>jk8YGNh_w{^A~TvLG{pK>oN%}4i2x+uR@EAnceSBr(kf0w3QW` zSty%Z1YS0w{cuf)T@Txgeh>?bsNuo6@jPhArpgBI?3+(oo9-skM+~59 zCMK@&pc=hgxII=+Xl#5K$`A({^~^PYdQ53p1uDwA8Z0r+TugW425y3uFW~qT zR04wF4WgS7NlEpKhIi7eA3>K;X)*8FVybcaKlH&AkJoaxGvRiM!e>M2G<5sR?%paL zEF(6!zqkHF>%i^x4DUhRJC7Q2)x#isM&JU*r?a!}ZeFJl^!d@X?EQ7Q^Xe6pKzZiE zPEGx}N&$o9eN#)uUP?X*&UGAUs>MF}a|Y9oy}Y_IG&15e9nS0+81SerHHV3gG-`*W zERZt0T(n*Ve+2MJ!1XkzGMp+7wCs&mT4_^GpAQ9PBRxA?z`i&hjRQ>svL63^bAyAX zkk;!NRFCm|Rpq>LhXm!6gITCS_W3gpTcP2~gc|#UTVPrsos!lBMYKxLWd|DI5HEHE z4FXPB`*2wyThPrpr-kSm76=5ql%=b`fBf_yz#!n=o6LjWcgeKr;GGI+Dm~yFyxlUy zGS~?9HE-g2(C^;m^-}!Qrwn@FYyhN;-4Wv_>Hi_?J;15%|M-99M#yR)GlViLA$t~O zRrU@UA)8}U$U&&AY!S-d>)3mQBzy17WAE{Mow~oj>;C`#pXo*pQ9y-p9(e>vW&dRSL=J~anf zxxF-=YT$zl(#j)P$KtTPIdP=^m5CBxKq!MocIosgtW$jG$Ak% ziVl`2xJ~e3>`EZ;B^cYE47qUuF%DwR@87>$iBw1{k~^LWzi(tGTo9$BvfEwB<>JyfnU*Tj6_ISyF7$9-p6^Y? 
zIBw29VkB7kS>D{evUV&~MDev#fFt%{xTf9{rMfU)8<|VKLfR#k49V|X{KjOkW@iHG zumf)1u`T>{^_-2{PSrYd`DouaWtk;5XPTFLC)@I7_U~h1vV5eA6WUvrFRRVVyS{rV z{W|~8yx06>V`a8dOM%{%fKG@LLU0KPL?h=ZO#^lA$r}GWdvH`_<+3uU1XcD|%?}Gz zEV~!pC`P9;$TT8vTAKZa3S+hFx&B!~>YnsA{6$U|la7GPd-&msO?qD5d*t!)ND#=J zO%Xaq-k7i*HM zkr)B%ellK*9MW=~_iD;b*rR>GB5`ZabFDXQrBpf5T=2+3v?D6%w;5=s*GD=;e&~jvE8Y$NwFBFH*p*{pwm_qA-P0vkMt1mm-bsW$ z)+JeN+ac)l+&po>eB?qm25EpLzWO5_h0fN$x3v{GFQ*9!S#LN0NlL6yoQDu&fUE4> zKM+=Zat*UXqcfSHw&bM&w_mK>^(al6g+H8#8iN#i(?U~d)XcUF#h+BQ zLda zo8|RuneF4KBkwlX6%h03-x4sYh(WX&}@O=)EOEWzC)x%;s)k>Y1(lyJp8 z*4*DI_|hJt`&{smZoXlq0|H2DS2ziYI)R7+|*KV39r8f7hd_ESl`Tr3NR@)J4**I+0E;7?t z`ntz2(`9SA_r(^Qhj~=32)i0j$BnL) z73@d7{AnBXN`>;W_ut1?ZN)!mD+k_%p1@z!0bdu_PnTdV{dw;K^vFu;J!anlgq@4J z^_l>4T9HAA9>=bvr}b(rhrX`+M49F8%3sPuRNzML@OBhLp&uOi#+Y79LE&n zCI1ikfN?Qt<3y`>tg40lYtbBGehiqCg zioKP~EXgj$K9yh68j5~BX;IHEy3%T`f8^?;SyQQUek}Yhe*uRz(cM*xwC{2*3M?#o z23v1G49_!jC~JD&x_ryai{^OU;Non-s(VRQ0?C>8D|b=l;lJ?p!Trav2X5o|K?&UOQY(+k{L?xiXy zA}TSa$nRl9{*Mz@qs>UQ9BVWmP`CU;)>HkZE|rTMCpfXDh?TiUdz4-XHShD{8K(l> zWtHQVg>pjG5S+{U#hx=wVr$rITG*xo?(gLIdW#w}XUlQtA2&b# zy!o05Gg9d__+@g24-aL$8u1UftF#c1;N@&c--8vmg(upPTwRxp6SnvPX_h>*;N_tD z+WkR9ZjX{FPk?KQh4Ln{5~0%l5iKty%CtmxgWKiC*W5JjXnRjaMnSi=`yVpoGVl5q z$BW7TDpe-jFj~P==HXVEuQcTejwY5_qo9*>*pHKD!i^emZ{mGo)!CaG*C5?qA~Cz~ z(^(~DMuu7Nq96rdVj+ta7|j;!7H>x0ex!I zsdL|gVd>h9!!g;DvacgAg+93sYR+kNC8pXfMShLz=y`Zj6i7_1z{i+OSZ-qKCG$oq zHcE&l`ea`-fNw7YrA?%BJDXPsb+4s+I8MLJ(D*DQ&}+T0^QiS8b&-Ngk%zsy$)z}^ zSa7e;rD>poCBup57CLg40DZ;@L14=;Ab;=av1q{{rZ5dh1Qsb&{9NoNa>C@qew9`Z9H!Y> zB#p)($TL-?HV1Rsh)Hf)IJe?9s14>;8FOdsiT55gg(9LqIC_lsO<1(W@MpN~`HNiN zn)_U=$8|z#u{JmGsyr&?ab(Lct+n+hdoi>}?r65;m$9W+8-Cf)eqLH`^r)`&tdXLSfo$&6v>}Dj1YXJ<$FrF zyl8XRRz2ai>F031c$JmQV#N(^ooW&uLzH2jyln^-IUYIO#G}+dFA-I``Y1fkE#T|J zHm#M4FlKu$oiwYI`ObJ@oukvV&UlBMc9+4`L#|P^yYWx5jw?9#>{Qwa-zI--bGS4& zp7hz&Fz2@RmvC}j*Mbe@Dw+f$^MUx^g>i{UbBC0hd*lm8?XMiIa<=Sp9XnMvOrCXd zwkfw~x+5mcM^z56K1JeSO(urKQDo=YpB|IheR5Qd33Jh&WRxUe%A=LP%DK#58R5Y8 
zRTp6==p`{;!YZIRYq0e$`LN{Mcb5*4g6^-HMB!&yKbS8MMJ32zIdbN<^L$DCirmXp zr{wuhvoCcF{g_1hGC$+uVV;a__$_@ zv&2w!a_2`&eIn9bMMDemkgEw*d3IMuN_;pUQz;kVwN!smH^yz8y@mC;ZU;;dp@{dHeeJa zMN+LSkml?kU*BG&&8S?NNA$#YtQo20xWV_rm@D#@=kM8uX=4S`#*dnV=(n#lSOl%3 zGn@HtY<>`l{d$SlX5l9i(Rewd^t&H>Ibp=h9Mb_|e&=#$-ie=Hw~jv2@NrgjaCXJ^ z<@J^4cClCYe|)z3*&I5Ziq63rs?S+@&WFFgh@oB+G>bSDfuDHij=840TrDAHY zf%2MfS;XB1=e#Fk^V@$t;LiHIbsW;0t1f(_`b2q|;3!K~Os!yr*yLq-py0+V*OFkF z^Q));)TZJy_klX*?mbsA^el2eCz$qks+9o8qI3Ylek<|qEZ(~rsJn9CPcjeSI|gxf@@UbrHksE1y-_SIN^;~51vW#H!T z)W|#VBC)FB6sOPRdZ=i|Z_8z}K~vnkbIqknJ%mcDfC}lnDZ5D6a!RhTlW|#T-s#e? zy=@rz*v-dhsoV13J4Z~Yc<&Xf)UlQQX8;`LA6-?mk7`FuWH-~Z(eLBiphw5@#$C|8zgQ%9 z@r!Zmi|8aQ25mHn*zxU&k9igj0<4AkSqu{&WfgKiD;d)Zxq!Pm z?{+Ci-55wQ!Kg?hH!+y1jEA6^aC@1YW;OH@z^lE$>(FtV1>L#f@&3jPW1`MZ4=P8b zpj7CSW6=b113Y;{HC{DBA!o3$LVzPct|iHcrSdb|Tyvt=_=M;RY+Fyj=B(rPV%Jlu zuc~8ebs7F_9d$OclNL|^^4HfM2xX5t1VU^l^cs(eL*tA^)Z(ZE0e6JQ$_sc-KR-WO z0XtuWrnuy7J68U={14XgIinWo?oI|0)(8qrb^hUGmEF|W z#ZPrH{|~O~oRsCgPNN~CwyAUip9Y^#6TIt%Y!%i``iB^~k1R&gg(cRzuv6cgyTt_x zbCt`6yxmgq_P*_SUvGTp`Li{G53Uz z3Xb_@UwLEYR&woI1*(3>-9qA#OrugqtO-Ao+%F^pNQz&=aFczWccb3|kK>9OUoz1V zyy{JoDV_BETz4YrWdN^2^xKgE|vYJAEQo((v$kfcH=0_KL@ z1j+72qBRPt(cBTAv53&FgL?PA4f&(62)IY$utSJd!s+JRDW??X?<2dEn{Cll#~JX$ zm}`<2%>S?F!XELAPrPh9ivOy&_58t;ykr*-NYD9qR%~PpcDnQlnw{2_;)&KwGd?{z zMWdB2)+CA?YR@k>(0NR+I?Ci;?SDDSr9pOOG)wfVo=C-1#Oozfu{KT{yQJi;Z`>D8 z2IlSggKEykor(`@*sx^3SM80<*wOJZo%ykU)BHq41@;XC^MKzO`CO$S5#y zrYy*rc8vH$j0YgVmT1P$Hqq8nrN3IJOE?iHpIc=_$mnpSU^?H78 z|F*A?>W|7G*V@_AYGJ$m?>j3Enj`X>Bg6}|9XcXu%w%JuJFZLlZ1i`|CP$XMuCfYU zlS^EBBFwxzy3osdBau6+cdm&@NO5lUjLRbS7W>Svxhm?3C0bd(=v;Zd1l4);(6Qq7 zwvS)7zek?fi0jGhx47Vm${&5qHN04|5*iwmWX6g2>rmH!+Epb@9`Ts2LGjRuxUO>% zLXP{=hXZF~^c&fZ2#W*LtOdxt41&p>KmJXhdw+hIm0)!g#Bpmh;`31c7WqcW#ZgU5 zjkJzkMG$-f53`Gye)apk!dS(rRE^F;)N2b!X+2^IJ8@pCpJN};@wCY`5uMKwr116i zom87qDH)*>cIL&Dhx@PNK6J3hmlWp@@q(`&es?*##+aPBSiX?qP$t%$6| zD%Ru7P_5aK8uM8in2o~0dQ^USjb=fY^rgYo>1t1vh*5lm;szJ8=y+g17&YO|XlAoi 
z$jyHWgURF60!7+^+`0is75Yiqa!ypU8`X4>to3Om&A-M?48B)bWLf$kx~wg{#p89% z^_w@p0uEt;dw%ZZO@4Ih3wnF&kc)Z*Q;MB^)_n?3=jbbpj|}B!N8d@Bi!|pMO(BVs zjl)S+wOsv>xWvjQGXwfL;ir3{kp`RrU)Lu?7idEVou}LHJ1S?rCz^~Mj7<+!OHDXh zWJDLpwz|oQMRvu;LZtK)lAP;cdv-6~CF+4>KUEDw@_3aS3Vj7KEV;_*U2k&{1=sOE z&1DXd^}rywmNr8|bN@Y(Jok81p=Ix(zSZ#IPL~J_fKC=Z=)O$I8p0F1#L!yxE~; zayzFow$gyeixumA7HgurUA-@U&DlnaoOMkbisMJLeH6VP?ja9e zOXl)S-X(_$wQMbqs$O4re*vc@|ESC3%*jBNEw&u`>A?!FH9hdwaIh+Nx>$TT>YCQo z3)Ei+STjgzp~Ks?Y)p6_y-I!Na(mgV-Ysa%*4=vh+3_220x2a3yFfm{N~Fgkx9Xwv zBl;ZKv$Z5`rP&91in50@&51C@tqc{iq6@r>u8)3;N!C~T!mLQ$U+oqv6szHfyGMO$ zuBtxrl5X_dIkKLJZIN_Ogfo{|_e1IWm`=sBTMHMbYAXN1a_)zRh4^{N%Y_i+LW*Mi z#&SLbFp~Ga^ItY)t!i-nw5I*7G{Zqq_sNqVrEYtB*G{*qrG_el8hW8qKkYRwtiutE z?;1;wy=x4jkAs8Ft8>?b6HSQ(%1z$}yG_N`sg#xE7DL zgLce2eJc~WLumx;U}|0yrOiCpi_D_gn+8s{?p)r%ehO3KSV8YEfTfUcR^SWmYgZ_- zD8|k$1ff^N3|(g1Or6%+EGMd~w3p)+Cu+_h-Gekc_R`a<@+?GcS0#-8PYa;LY70Zj z=4klYNRbBu;0Pj54kjIM2c;9NTth#2r}wmyRyDqPb(rr(pL+ONZ#lc4s=%{e2J18j z;XBh2@8sAYDxu5HdToeJ~S(p`)6-&vJnh*!=|SFSN?Z&^mHQI!2U<&hIf64+IG=#Y5Qx)FxaZ$;Yk z#y#v^V>ir2Ob+JDeSDVi9T8lPI%kWj$G#;c?pdA*8y^PlTGP23&!Sd$J6GKn&Qgx3 z#bjSF7w8@O!4_Ct0K{-)L#wC7WN6T40PH=h&pZlP6(%XRoe!DNmuz{S*J6RPs%~f= z@Djg2Fv);`q7XP8v}qnI3VB|ju@sC~0E#|jqc-++?IRztx!AMbV-vDUU-`-mACYy{ch-N9=q$;ld>UI%kg z)YaqVQ_JymLdCzn#|dn~RI*naz@iXy&v|_kgalUL5j=^cT`pZ&uYHud2sv5XiEyR# z>cgc>aN!o*bb$&_5H9}p4pai^y~Qtk-y$T$17UC-&i8-_KbgYuOp;@1iFHkafoNdKy^&&B&asVZ2)a}W1diIySbkX0gPa_+;;h{r4l9Fq?%lh`!I9+!U~)0Hxz0o}FF^FHP#K$= zqL!#$jaBS`(5>MuQKm=flom)j4&amoz)B3}eNK-uA|xg@8ThK4u1F`!OD%WTX=M=7 z(*-pl+8}+O{2~~?1`?HD=EX2b>w*m9U~3`KV|NHdA+D3Z4DHI}`rO6Ey;-|QAkQC6 z&97JC5YTr36(a27RM!`f!}h}uWg;L ztpEoa_!HUy(jW3wQ1IKUBLfZTEa1OrqH>2*5`>+8!Y#7(Yef0^qsB(OfS?{4KP+MO z=Ym5hgq_lX7!MlstU8pV>bK3jn+LGyF(Dc^FLtfID6OJ_xhVf!zw^VOF!(t*-l?8K zI!(k$Z(Z;Lf-SqN>TYtTbj`$Z*w+_P8_D{imKwInuKwXf&T~Oc6C}}|dADbZZ3%@| zV|!D`$^873%&Ydtg2F|26pZw%(u!wz+1ScXP2+KQPR!nkVclPX)HkR`K;k6qWggp^ z_T;8kky*Z%Sc!O!EM~I%bLlvLe;7m;5cruF5BuS)lV?L6 zK;=Pi5wWiT@Y6n2MI3mvG&|V}`^e|zwGqW$lW^wQ&B~pL5=Q+t 
ziy$8*%j^?1-^jSdiM(lj`mU5`(9l(FzsF{fMSMdn64qV}!&paqL@=?Kx94DjkgI=vMz27BJ*@QK?=ySSY}n z+e+~5 z?;@k7HoF@Q&w)-F@(DMXRn+8k6&+!^sc|Ps*~p*u1@>uoc%iY&&;8E^fl1FF{77o` zjnFqaE!#X8m!S2zDiKdTaJj1NQTnMw9f7e1;fBx7@b(j2_BXgVykzuyrwN68i_LFu zy&2XUFNjj0&+fbHKHp5gXQh9#6uxm1gb*j8zPqtgscHfS)jYmoJx3GY(wfy`LL%Vl(W_K`&0z2ky=c9*4v&JKjJzy?c-=l8rB~y z8h2^#)p0SgI96cJLD($+IvgWL#9EigN(qYAqTUb z)Yzgda;l!5nF#<1`z9CC3y2Qaaq-{W`};!*pu$Fi_VUjAy0G=?M5dDc3%F?KcUsvd z7Hlc?_Pg!O%98dyhkTMQ zt1cy_gOKoEu@>hhZz#YjTJK53{E?aObMQI0nB7ZWp1V_DAhgdBPYAPNR0o(`^l}$; z$!okno4Z3Hl20@J*uL>1=DGqVL5SrZDbHe*xZK>>ILY%c)Bt_dGd3q_wgp}%!~Auw z5cFl&^0J0?!`qZF{hn7)qT)5bLYA7Aw#jut{MO$aXzJ+1X=Q`=38aLT9n;){Z0QeE zdaO#fxPruB(gk(va3$Ua<*D0&uPLJCBO@c5&nw7Z*tu8|Ue;tNk%s^A4Mu@p+mf1^ z+BRnw)byr0#}t@lPfbr}k9Z7!c5lHRRG`}t z=$6rCp-@M(x|Xe-6x4ap=pr8bWp!rz;s=zq`AB1&Xjtn^lKT92?vPo0D1$FXod4Vj zRJK{6waZ|&Cbw#77jnlxAl80Db7Wc`0XlXCx-Zz6tb(@}*0}>8bnC~W1BZV9 z@(yL;JPXK%Id6u<3fn9!+W0%7yAw)W_bY1$qDKoQi=YW!s(ZUeNWW@`Psb0Ue!KF~ z7k3#XW)xY%pk?AyM}I~w+hN>n4z^++kU&n^xUax*vP>)(Pt5P~{=AaE5KOW6$yOC6 zOuNrI8`5z@s>iG@VTwXFP%-^-(+r-_t| zI^*Dd1p|ji^70oWwd^@ZXC)Y+Z10)enbE7I&JQWsWO>jn19YZEI25p2ew41!jV zg%$TCvkx5UfJ13*mCfp!wEN>(skgxmu=%|P6bhpYp_$N~_ypuK`wU+(1w5zl!y$zc zul1hpp9d#K8H=23^GFvtPq8M5@!D+k9Cg4Zt!$%`5aN(?hNuOCOrCq@zb;sx=ypK; zbOvnn>oaN~qMArq!tlp*DV}BaSyr3AcyR+aKo&1HxyjC*&lfK`jWGXO8PucXYh8JsFwLcDdJ*pauJ4unMg7O?1RsZZPg7DKTNWa(AZu;ATfLx=_0q5 z1ZzI4g=bjn@31FmwNU0FWL1KDs^iAsD0xg)vt!Es@p)uqX5Ltg2GVSQ>@%(6a~G3R@9hSiqIjdT*$G`T~=VBv(`+DLuc-;qo)t$(-*-w z!|b8RzDl(fjI4rjrgaiX3jq9@vlBquxiB+So6Zb4pR^F2&XxQF678yk`B-3Ep zJP#Y31X!wre!Cf^`8w37?Tro>L?&x|Viu59yx+UK7ql5q!hp(xhs zG^qtUGyfGL9iQxCj>^6OusiEA<~H{87#=`yO#DHXTIPLkaNNKNaFv8}5u1Qmu*n6! 
z%Gm_bsrOkcZ9WiiFfw6a<-qYl?HwIqDnm)2FPsWtyx)GW_fwmYBwx1zMFU?CugKdA{*v>B{Z@0z#JGz&)qV9J5<0(INtO+ z5UfaGXUw5^XrUH++tdHm7e9%ZNwuE@u*P7+C&T9c>JoC}>e}sxDPY!ms=*7r3#jVt z6XTWr#}(2KAA<9_L>?^RwvOlOBHleTGB;;K=~+IJ!k|~m+cBV*fB=J1!je`(S(+ec zUq2&K3$=*%=;xQX%BZ_>B47%v&4k5G;ZPjtLsz(fG6g%8d`t6)FR7{dPcOpKWblsG zG|a*VGMp(rlyr1C&%S{z4V~=b58biED-yaY3CV&kO_>llyvct9F~v`k?Ao<}BX{Qf zfKp2`58!Y}H(+sL48`Cil?HIgMO^5{{l9KU%fmZ?iR6zze1j_<0dr2)?xo9@duAd; z4T^8|&lGELXl3(VzkZ#C1^3ly4)lW|%gDADEMb7odk&f=AflLp|Lx`kaA$&N8||-t zJ`X;O3mbJ~w>b&1Dlf6%ZM=zDkbToz5C>ot0GVE?C<7NPoPP*G%4)%P6Mh3tFYe~Q zz{})urU3qe?aV`i>pnVMroF}P#hobM+v+%S>&{I6#se4&B@9qezwkR0zx6GFHa6hQ znph7HRs+t=(iba}<^?t*ZT!KP2{i>)SB-CV>JulZ-HJiW5}04MovS_oI_Z_(?~RN; z^Zxm&Y3oQr@c)_=f#C@X1X@CdEt1z`Ulc7#9>gpxJZe>HMC)NA0BfAd2+}+2@EWbC zdfM0BHAjdRM>5yS$EkNYIf<~vD}#FZ#B3(TzX&Vh%WX&jNjVG zOI+_ad}zBbn|F!g5R~$s)7rC?e5)=!RG=(LtChb##>*uU%qr{ zx}oq&W{Y8dV`F1d?(!;l{2o7kY;K<6Ov_9!x1}rlDY>bCy;LEZMjB`(hQejC z)gX5MB&ElB2Blr97X|J&a-zwx8g*!lLuIxp&QUbbJ0nx-#jn2B%Z2re<%LkibSaw2 zs=+Hw1-D?YfQ%Yn7?Y-%X6pBi!I-`E{_O|uETd%Ms~ZJ%#~i;mBK3R61|A%*i3th{ zii*wz$4EC(`<5@yM)|lbe5BG8 ziLD?lT-`{G-b(lYq&fIi(OL0E>^~mx|JCJzR2ftATRw=Ew|vIF9@Wuj?g-`g>|1!f ze-&LdrgEVzBAsEVH8L`~J}_^5_nO5PEVT(4T2|+s+^U=tML+s8 zxL^B!pWb>ZM8ky^tA7G^xim*uIypg7jo`e8P~MMo>&%vshC@9>D|Q`oc4v!wuS`I9?=dgF3d1Dcb+11QWOSPKsmt3dCWR-MMl`Np5?81y)I_Em+OmsjTt~sY1WYW z3?~r5fsQ}D^O$nig?kyhb^E-db;o4JMG==ytHvRHyo83#{HX_o8csIY1F z0~*Vt?1*)3ExGuJ51z7cXl;0YNMYQ6hOWrvE9jj6Z$G*)6{2kEE#;!AG-T!7ahd*> z4s`ubad8}{WwF^E9sPNes%$ow*Ip%Xr)S5%s`7P9$u)d+I$CZ|9+bLLM)$cB4AYS- zh3Bk8&o=SPPRS(=Fc%KTYpZtvaDPO-2e$Lpx0_Th}nNtl?kK zomkp=oocV!KH@ki7G<$P&oRC6A~xzg{7$Ey!I)cPjOcfhLi~oI!GXD|kHN-U(I!AX zq8+Zx8>2j0p4Sqb#85c?H{N_yL{91gtN`dD8XREaDja)Sw+z-M*z<@;^)r*wz3{aj zT9%TO%+as0Djb3o5!!ps%ko1`CYNgXH{4qJ5f_uc@uZ) zegAw_U{rQ2qmmCjBDI@z_rE3LE5z04dfxIK$uiam{C0Ln_Z%rczCPhbpthJix#$F! 
zwV)J^?~PWk{Df6T=-l7j3<(nsGs_i;B$I6x9+UEM(t{}2{)UbFhXF=9iTb6jxtNXJ|60@qgs=~e$Vzt2dJ`-kOlo{YxZWWJ zyiSiNyw1h|LFes_HFceM#yI24%Jwx1C)`F6cYpoIH(hWTD1w~9%%9iq!)-tj-}&lW zQfCmg(-8Su24A-wc~p^$!N;^Rb^Q%%hO{2DS-S%`gM^n#fk^KBr5mHmjR?ox9kAy4 z4ulUV(rA5X2rjy-6cPbxkAo=4$pX`@;+aNUMyn5n~Gs1<9Rt3jHA!!ik!;WKf45>51^IVwl;J7W?OoIJN>`SN3x7bfvbkgIK6w+F+yio zGZSx`j&HSTk*x$4b)VYK_bFHLq1O(SVPTb5XJNMb#NIeoWo)|M_=Nw2mvtI{5L@?6M1^~;wFp#W$EY8!t%@p0U3?Yzn|84Y%rmQcsJ zKVMq3h|}4yZ}g)TDF{tpdBQZgR@NP*;PPXhnehK9osSh<0JskRiip|CyT)v_w0#M< zd`gOnCJoqzsGR&Pf?((zRroJkDd|8cJNCf$18CZ9hnaV)QE14uV>xU$I* z{Ke#@PSV1YE_1~$48Ls+{sF%PoaTy|ZAvZi*`$Rp{kLo$2}rM->N)7fartwsxXleS zoDj$B#Vb-!P%LElV5@w*KYE4ah5f5oZt9KeKmO5UE9)+t63(gUUfywq1xYo;1Yl>L z&w}{xQ&ad|@NAOHC0yQ)T*b(Yk={EHUxLKq{dvBk z&+`o~jbspf0m`3-)wO*#v?x8WUWVN}l^8;tNiFO!i^2cZrOQj?c_CYM*pRD@gBKBvY~?70xYm#(YT`Y=$lGfYO29~Y}s?#6w$%U zWONRy1pph~aTPflet!FU|18dbYIE~7X&vjqPe%w2!VaF8eyf1ax>K2M3R^E)9^gHB z53yzcvDliP61H62fH})NUD)#JrT(B0SQD-FT&Gp^-})syXG@tLpbCA!DmB?;hC6`7Ys@uMJU((=jmbwY%r5JqQE`@J3m#p&8i?Gh`*}FetAwmnKeER0L zZ_Czyhe&-QvdZ{w{U}%f5lx)H-W`IOb}%)%oTOuw62G=7HP@;R0Y6!Yw0v z!vPvoLY4UihI7m?M|m-aizEEM4SDl`v`+4z+woTHCiDtb>l%g)Qo3ztiGnzm zSk={yz@rMvJlsn_Eo$N^88lgfzOPD)i2V2wT35sTR9bpEQ{&#n|Gl$e=?FEi`HiqP z*e$t4HKF&FUS*{}4*KZb2p#cDT3Vo6Wmc#9X|A5|bBxv6 z%Zg$$81|PvX7bRg)%0_ky%l0R$*)~&lXQ<)R1ZF5O3L_KEM*OZ^|T?tIslraR<=f& z#IxhuqXxhMa@*sU*M;3&<`Q0|WT<+!8u+tO*?Cx8PfA*-DJawr9HabhX;F4yjn^Bj zQw8y6hBv`0-1z$nYwK@p{6RqebGCFU#M=?!Cqblo;yqxP@9XamQ`0*$EbFNsGug+1 z!T@{`7LE%xmQ_;9Z4Q5k#kK8PC6%RlaFd>)x@EKh1)CX&TQ&i-$=7#B-&DZWzz6JzgmiNxfM#H{<@}gQMFQ7+PIcfiJ8(@Wku_8yoxhZ;W zHDp+8V4=@av(VC_vZ!*c();}6LXh9YjwzG?L=mIq*&^DR=5~~!W5Sh`HrTITI_}X9 zPx6-wU_H^l=m4of&S;(JH7@F}*xrq8bDI#__}n>SSpw8`nDqE$Kc9yP zd$t(kN(AxbRgJ7$A)8kT9ymLfImdFQ57x@>>iaxSy~MKh#^O&r9t$hVfPOs?g$kzP z&6q_3IgvA=^!(&iUkM`1YqY@28bD%55KqNg5w*+66n8#5PB@b}hb+|jJxgrhavkFl zrr%&n%uo`c&J{JXy8HN{u;tZDk!Tf@A~A0Vvw!!XIXz}u-}JZ1 zeohYN5cd`ZXc=&MYr}{AkqbE%8^|r%${oC1Sc&%j{MksRgp;3ke}XLX2HrXsF&Pjm 
zkpTg>xTkGf5yE;J^nZ7tSfj`X!Vn$T2uh#!3me)dFF6e{l_Qv>l5Dge(BJ_kM|O?R z5OT5)ZU@tqwWcCaLNTkigN9UcP>d^%b~ zg>)t=;3MUEk()5RQ3vfrM|U@c(=Rsb!5DcKIfw-unF0uB^X50}KUoX#T-A>s7Jj>V zwPr^NLhU}sFoK1CJYV*f)PjJC3Sb*FdO&LO9YV$h;!jfDjefWK6_llZ1{TK3awU%# zN9+hK6w3iXk13h;uTaYdJY%Tp9oScCg#1$(?<_3^$mi!ZR~Yj>M3y)_r? zyf<|ETf@Hn4`>F8G(Er5!S5P7fnV~<1wf~FHt$1h!LWkBS;#;d+hWVoBzb=b_Pl)W zKTmU}o~wK?B;Pxw2$S|ujB;5U3UH~IpP%&U)G)9FP`32h`Ld8_ z-+=y=C6x)ojFj`Noj?bIZP4-pG!>rC0)c8wWPM!qZ|!0!VJb&L6QB(jwMqNO8mEOO zd7wa@FNFRm;f47or4Ig!QDbzPS|$+*2}N#NXl!NXpI|-~R@a{hfyvk~Y`N2WRbdRf z6L-%2trS9?vZQvNZP=2}!J)1N@b3nx?tYRC?=+2g$Zf-N|L$TjIthU>DGDsCN-{#* z!8A$Bia?t%^~^qwJu-h$EKzZ2;8w35KPHHcyy@ToJwUKx(&(4DBbYQtWIGl=@yd5) z;Qwvqeg0WP#n71>GkGr1uYwdNTK;XZN!=bMQ{a&@bOng@jX&ih(d%5a-a(Ke0m$<^ z&C5!0mTIs1vYHhsevJ?unM_7Ahv)G-6FV+95;@pj^ney=2*E z14N3|B7}M@{cerUT42D6aLE!LnW<}NM$QRAv;)HOR;B4*zN~Xy_g;o%3gL((e7(P? zX_9tb6~JTlmxMVc0zyJe@~dyh;bmY}ikMqGg~4cf8N!SesL2tKN&%*sy7GEh+hBXU zEKi+!e(Ls+B?ln1?pT1ZrZQ+g;@>_h4eKx4Y9OrLcE)qph9%s4wD9hrk>xl6TJl_3 z(6evyN=krs2f|ddrQy0F@4pqecxgch!@F|317Fhslp6IH3?l@O-u;J3p}`#Oy1}$N zE5s|tX74zsQ~mo}UltuIAQ%;i2JrmF@pFl=)M=?G1E9=9847p04#XhiG$7rwD}7?! 
zRsSv6ZxD*K-zk)zHHrQX zxwV846uTAiIJR@QPhCHs#JR%>QEEA~3E*=Zw(H(O`SM{Ns}5Iu#;ib2KpxaI^8njM z1UeJTjU27w3{Y~w&S^zq99W0~37h}EF!%a*p6)m?uLLA;JeENQmG3Td{03Au?b~YvRRUb> z5EF4`01UAZH(J|9c70m7NS1TR?R6dNQQk!{eiQ5oC{@4QU^D4*Bk?w+!#)1ly-nKGSJmWipMdtv$G&%q97+HNO8lejOj5Gwx563XAYO1R!Fx$> z?~YhAjh?fHq$@8~CqT-ZhFt(S8(;uX?s;qPmio&gi&F0p&+ltfkFaoBL}0QPzpvgKOKAg z28YmUw5-m=nxPIs@VAFJ`-6n^at}Q9RZ7ZMIsw=1o75!@{X?3Sm;6O zpiz+&kn%7yLuUSXhW^ZKFpuHwsyBDEn3pGj^{pnVJKvfT@VlVK65`^%^YvYF`hbo1 zZ=DIXz9_&jPLKqIghNb1PAjCYl_VnDPXH!``HhRz5e?1UwwYanXM6e92P1G$iqE!XrB`shE51q zWgDCY>YEG42G#YOLC%S?1iM;>D+*H20rUdS>(|Ku9SdPkqe0^RFPO%Ok&~~l260zA zgMbkV-Dvt%)4yGtf3YW}1x`xxvtD&_6BieU@b&;^`~co_$6S3WQ03vL_@EfAwNXyl z>yCzdJ-7tT&JVlHsPi(DPbv&~Mvtl6h0-M?~O&$Oi;zO;|S&yW6X<;vobOHc)T_cn%shkja3#^_xyd^3$T7LFwCC6MW zCv7#C2oU7ldX+DGSvGd@CLQVAR`NkT*!Ag-4<^}+Rl5rMjbbg#i3$hgGXT6w5PPYo zHx3vXwOl;~X^=Q!1d`m?&V%rTfZ|@@i7x=y0!eq`c6y>G=w>f#Uo?h~fQ$-0wO#;( zC{P7LA%}Osmgfkv#;tR|4B_qriw;0fnbm!7hhuYg_6Z3PkF72tbeyouM-OVz7vlaQ ztFgt->y@k14IYv%QN(O^vq|NzP->7Pci$~?+zSy|se@CrzDM&clrJRA=nAeI!$=KD zvAfY6Vq%GZb6`wB*@K*%8vDZ;xj-+yC^%>Yq)U(s@6yc~bs4GE0fpjIJ{IQoWp&Q;#tSh+16}~Xa zRvBS*XF(&xN^h$P3`Hs+u!CG)&3ZjyWd{y;7Q5nb2+2vGqLPxj9viDi6ub-~hs2e@ zfaeKsi(%UxZO`wW%os?I=l+yG{12Kl8&GW}nkp|=_&K=Tar$Yb^mTbcKhgR_k}Cm$ z=R{TK_M^@A85yO5wpwDsBkVHLzv+PkQ{;7zQy1Md#rTRBZ68gtwK^ZioRyiytBW}> zvJ1WJx7>P-4P)K}3c-z@_SVkQDmI7n)1~TV+SHv zCCD5r<^EaQX{NRY!29JMuler@hf6x+O;c@2aL8NnK!6!2C;zMxzzq5+oZC|w zG=p5Q(u7%L0wP@NQnWvv++V+Fi4vyX+J3ugA?S!d4$y3v=UgZ-qod9utR0w>qb6La z+d?KIaJB+&ii(~>IaPL?=h_8XCPwZV&?}((y%Yut|OfU6p z4d1^Z^nE?wsL1-prKfRf9ViY(8$tvu^LN+AM!w0dFHlD!)Sc#3FWQ7zG3bg;SPU0nxXy!$e(q*&5(_xrV5S=npeN@Za&F)<%M zKHD3wI7~&CSdM*^yH-0EQ)-!}f4By-6ADzqN@6GBe+M)i4gH+VyT0%v>6woMs&kYK zU+b!;1|lTBqy}9g8gGHaR3G?7qLE`nHz?<}@DbxK3$%=`TU7%nKZhxdPJwHT-qn6; zSsH@}!{`(Mn}0jD(;0*QrJP1&7DE4=b;;i3PoN`ipB$yB``R1Mra=P62&cKpHn zxLU#2(%Z1g+2XoTrk7niTp4s-z;UUc8T?C!K?sRjT>=6El;?n9kW-;n}<{ zDMrwNLFz2==Y=jP5$S9z?~&#F*%QOP%OcYo;(-CP2rG}FEfClLfu5lKy6x(6lj^X&OV=IB>;ZVqXg+={L6S(ZZ{B|_ 
z@t<34;3Hy+fmWkGcva@_#0YymI~u1B#$>m(DSJJ5eysHe{X0I?4zUq+C`v3kJBGOp zs~uDNY2Axhpws#TRe=vZk<1`eWN7E6_`CYRe2*Iw3WtkLE+cX7d>IavFqZSbp#sc* z*86+n9F~o}nVOH)4*K{qcgk~x_$M{9-|%nsZ+fk7?1U)&jK6A%t~9Y(!r8E&_Ay$; zT~FGKO?5n(#GNbAU~K=STbJn5K-ivD9QQ$`?!&oS)_>RDTv^L_{LZH-w-U*9#Gcxm zNZ0i=W+87x7QOhs*6$(c;k^Bi+tR*a3^}8s0dc4K`mv11rO-mtL|!M70I?@aKm1L2 zDfxl}^`F?>`;bP$zSwJWBY^ny)|kL8@7M^d$N@2;Skcn^X>GC@^p})^O39I0S@)CL zK8?NFzocrLB9d_+{9>@^8Rw~F`^?x|?&$T-cqS#?P4~7??E#Vy4PHk=H}hrt%mu;D zw^*YWpH`#(T};#6jcle*!k^&yt$JK#xSg|Hh)eqP;m>vEdyX9$W=-*?$>RrkBp%yp z8zZh3OVgpUJ5xV<0;f&YY4x)%(GSH`>@&rzcZN(;yf@sgyyfuxFlKu8cdV-P*f53I z0DD0EmD*04Va}(yUzW2z(8pZO4@`5fV9!w4;0f&MpvW|9o1WIoaXg`x``SH?2s~rF zl7gUBSYZ-%XpVV_r$SG6Rm(1y;(?e|=WV*{steObr2*qlH=}SS8IYSI=l-slZEU7m zER#iPhr?a!%|#@bJ)Wj@)|#q1)Rwh8mUfs5*9`9>CA)=dChajs=%yA+8zTIC|DWW` zVp@K=>O|I)Bj`;DyY`Y1!+t_dX%E3tQn573q(!OuZXPaDGTr<3-Z$kL%zFa$x&yS_ z7b^>9TaSEz>S<}Yte9GS~UKQg8~?>wP5ulFOW9=+H5$Vr*DX|;Sm(b>TVQwxcT z%v(jIS>@O|*W3R@OriPKN(6OzAPczv4{L7$6m|FikK&`^BOr(fNV&R-GzbVtt1KWb zEiK(0OD!OXE-jr)ONTT_Nq2WFB^^t5>^=DXeP`~Sd+*GD{`3FLKX2ZS>}+#fC|WF&s$#@<^RK^Rv$A|Gd_8-^APa>HqChst{VfsZ{7#dAmF3&MKTA z-pUR^%)iQ);)$Hru5uV6wd2Oyb~SOV8uGj{Pvd1{PpDhKiK)O26NC~f5DDma=-j!rw38z(yYlOq7o4PRMS+w6FAY7L35w2 z`Xz!fHdBU-%rK@O@oS zlEhYvm#0?ay$8nMn{oB7UmiAVT!sN5)X+FL=mi(CVK-~6zoCrQMJRrT*A1n+sqTGQ zE&mg8HPkG34dCB}q* z;P%0-@5{knEe&~Zyvl9Z{}Jq7tJ?Xe$zSKU&?JktZHdgmU~|J$^txh)M3KvVXV{vP z6qreOM~@!L=i)t-IDGbXBuTbWF;znO)u#qO>-Q%CF*`knB(JKcb`->gb{99TnyCII zQINbcz-2ypmG`zv9Cyd-MmWhivmZo=zs4&++d@nD^D0r*984TYs zW96OjU{7LDTIs^OGvVQH&ihMl|NM6yrqis{AoKDRiP(1fX#aOTz8<^**90U*^Q3!J zIfO|`wy|)D>yHM{^gW-kns3LlU5t#c-R#DKQ4~{fL!Bq7Dk;P_ArQ`VtO(2cm{Z?f zPr-QGNy*{W%vp1%1dV!)5X{VFi&NTKHx`NUTqW+|{3Cwvo|Qt*xT;gL&~Y!^0sE(r zV)@v5Ve_S`Q(wqwy&yI%n6vI|E~(d;kc>J3!nu))Rki^Vz(BI=wINN3DIQ14y6~fvX+Npkm5}9N_VRIeE)@h9N2H)Ts6MKw%!g&U>Z${qKGf_^MQ@ z1>CICk7ny#5R9c(9!mq{^*{Gme<+U16k#>vwBCG{aSf7Dubiy3YgxF9h`aSbKE-iA zBJ`IJmPA`!=d}3~6gug;6H_9B=Br}MKj|vb2u9hv{?h+8Wc5IPuK6h=`~v95|9zg^ 
zo*~0b^OZZNJMD`?&Dix?#-e;6bwHcoChj5RrAc$ToXX>?-6>@@hK(UV$i8!QiGbs; zn#%d91c)ljSs(I-&zUgs;pDG`Ql7K>!~-p)5X(zZXWj4*j&jh{MwD8WY;d?6^l}+h zNE8#ytAV>J-j$dOgEgIH*VwsVI|yArO1McjnZNc+17r~w>!|^-hvRVD&O36BS$}ri z(WC1pBW^@i$hD;~y)r>t`s}BLf{&rE}5ZylidTs~0Ob?eS!{yI!OwSH{ZAAdz)K-~TxGV`;EX_{8#k zbiZ^Pu0!88q}L1@UNU^Z2=A3B@kIjb1u!l&8J<5+2Nc>WI@#$OTOUBerERl&cyFe{ zakc+5veyGE2+9-YhR@A?nz?PeC^Ih`dRl#t#}DaR_d4A^av}bD&RA%l4we$@ij08& z=7+Q|ol8zPx0et#3Q9Isp6hL#T-QGl2RxvFT5{{CeaU&cf`l_nz_F2VS8!o$?d!v8 zh)4cGCXXj6T2IHp9Ibv}Q{nOKA3vlMLc{E~k!}O3HSJ9F$b<1avcSXfB%+Bugl0k- zmhKGAv+O8z+&%Z%5Q>L*W+v!{8?}0tZ=DzpSi|i^Pd+$~_Zvqq@v&U?@y}2ab1Ts# zy4DoeQ-S6vs937a+0wRe!QebX4=6X?@RU1$#W-LHUB$$VKUsc zrd&}Z=iKCBBA;yGJb(wbH)J2AGSI7TAQ)e^U!wMT>*}x}T4J8C$$j&i5cTom0?E5M zTi0yE{XrgAGvN@Xm0PF0IwqMl#q$as&ocA3F|uyQOXYELQ5#~>JdIhfhS-H;@yiWIVqrxGBXqO44!1hK-?}3;`ze}>39LG-vi_Sx&%bBwO#72R(-Hn?=3nD@xwv4E zslB>bq4G$z;G=-cPw3CNpR)TFIpHMJ#R&HKHC1lS)U(c*7N&}%@o0oAxV8Bf5?E3> z@7J&!X}X_T!IOG-s#ZHXHeZ&I3Iii*~yXhFX>qMoS(OFImPM_P!{OvcCi%W84P= zk&zll(G=DDh-Gp*@T;*gH)7TSR)4Mnq;mY|LF{2_ZK$RTN-9xeL^1#%S@GPKoz={9 zdca#ZN2mTO3R_@k3r@p>dAuvVi8O!&peQHU0t$$M2n1~^+_-OE>;7GdIIil|(R{#e z<}w>b!quo<&RBPO@# zST}W!J77J@-wAvCyLio~$O|25QyaH7NB^sPKZm-m_PB(LP)51tSbe+OH8XMA-?ipox%M?Lw$^nF znu;~_EGhqv{D-b?cMxY_!(6yuzcrJ8L*`Ld4=ci(b9R*jnPu$WofcfA)g+ER?fbpN zgeQmLNJ@I7IrFMmqnypC?p%;~WbJa4Dujn^-~D@FM^kQSgloUgog!l+qzo{dPth`Jx`)0fYJ*ujo9AfKM?wm|?pDl_o$ z{)dpqB$+Z>c?6~t<#;43wW!O^k#MqohOPn9ZMI*Cb0-k3QNu#q2l3fzY=HQWAqdVp z8>%d0(3jr`)^e1bRwsSoRRWni%RP!wX>>iyLAk0^tFm6g|HT5xCOu6Bxf&2GRay{l zs9tmY1yW1-=fgiek1@LXZ9@Bn5BA=wyHLRbf3Cocs{H z(^V!D_EE^Jw&n(!1OYUM!AFRs@EhMdo&s{UQzQdbL*J|)5CJm0cd=BN~!p^P}byC+OW!gEC>fS6~L?jA1})Sq$!K}iMy0nN-I-!ETwSpRiy z1kgTUDdl}hW>fLAGQvj>@8aY=F}IoBtJtxo$P zRQ>2vjXOMa1zp8PQ%HFVx5~#$b?s88eRQ;*r6xWT=_I%fIMU89Hl_}`FU89tSJ@{i z7fIAQBbIZ`1>3DyVYlPQV}CA%JuW81I3##Q9m4cvvr8$X^qg$Qsf~=pj4X`Y>GnCh z`;FL3%z!5c0vW3-OQlbcudB|q^fD}+W5t=@(tqu^EfOxyFydHC6ms6vlRu+XtJ7C- zd{|d7cIaNO-8c4Bt>l%_x}xBE0CL(NYw8k(EcK|3ZWsvy(_=D5z-15E(*Ve1FOa+e 
zJ_2C5Hhmx_D=X{dRQ}_KAxP;cnU?}izL1d6%*;%Pz+O98*ng;3_xFQr#u!k=0x6J| zw(AGDz~w@kfXoLRBrUJ=#+{v#Jx&nkxf|Bj){yHbiCh~XvNKYuz{$)Ka5(_`31rle zAtA#1$Fs&)tWNl*gX^RD8(UjwP@%Pq_qJBK%g+ogiI#7U?Dl6u*h>cDF0WXond6dLw`Z1o zhvgAj3HVg{M8T%BFAC>^x5F*>lFD2+WD$k~@UY6uLj1zYVc}@XXobNzz33}-_W7Rd zT+Ys)ig9qNB|(IRw}s~MRQ6*84b1BCc-=`VvXlU`>anc@?rqDBQ&SrIOLkD;A@%N^ zSjc0`7}5KmB!Ox)ryW>Eo{5{0zHxAH0O?ae0s=Y#W&SsoO@WFG$qVWGp}@ie#D5(r zLHyYaNCW_-qFVhnLPvGx0?_anOsg%41+0U{)*AAO!IU?w1T%JX`_{CZ-FK8v7{TDB zy>j{&UfZ!Nd6n5cTNZ&v+If=DGJNlr>o^*>q4U$pX^65sG6nfTN95^&Sl>`-yQV%lD{P< z4lNO`W4Bpor4Ch}DU^AP&a~WEK8)-u@6B_qZR0-=c(p!#+44kUoPmAA3AJ@zbD58* z_i%d)`$tSzaqkH|SQ&7WWU7yk?!{I5A={VH=l=QYEKnYzEp0le-r5*5o8rRUq&5r|Hpt;`?XgiGFA6bR z-}za5IX^_p^`Tg!O@%2pU&A$6tIXp2x!@h8QLCx^Xyp6pzB_E2vlS9Bvx6fh&6@6s z6maR-Mf5qp(;l$c&}P&P*H|IC zD%?(kY?2XYV^paI~6Spy9BOpVb51K`ntj_6gc>OAgEt-8tJAx#gNvMhg$l%u)`_AH5Odj~1i*q3)38spB<#d+YK3Vq)xXCs7p=^G7fDdRm1& z?f5pTdH4%*H#U_rC-b{vP08(>2dmVQz~5f#m)W5~sWmJzK)oifXz)TcKH zeN}!JJFQ;qrE(SV`^yC8U^rIKd%9*^)3vW8>NLr(|X;B zfrh@+t?O(O&L^_KgRiw;Ypu7BM~^+^irJEC)m)!2CznVXqxQV9hD`!~6O7hA6s>1mCl`>_dwuIWByZ%B~V)%ZumOJ5Hr zo%6f(&k3=|PT8I@hNv(y*=RGNZ83=Zp;_<)cfVY7F895m(pizhSrsSIj`OF^qepYc zg$YE0CMxn^Bg9_Q0S;AL>F7|l7Q z;W!8u#5tmIm*^=KFJg1mfFsK~$uv{UdfEC!bc9n?pR+&X6y)PYA^ldY`S>W?)3v|g z3LJv3#Ik=AYarA}^ZYNU6$dBqQO92_Ar8(nF}wdnB7Y-8{}c4Yd8zlmUo1lgh*I*X zhCBU@pW;XpJo{h!L^C?L*=QaE6nk}E*tzvigXgL$#q;p*OWx2b@K+516bvyhe7WZO zhv1GhR)6-lfs!9&qo!?bZ0?70X7*)Ivz9s@9q?Y}UK_Q8!)fSb2P%q1!@nKkcBNFM zWg_L$5~&LL`!gkISvbF-+fiwGxvb>t@3v6IrHquw@Yu`;3lAW!t0~&GuBJ4A_YIy5 zF`tp*2>y%1To0vsQxiROCnTa~<|ffkKB`Z67M7Nxu*@M-8svm-A87Gp80EAD5kyE6 zwv4EmYdmE>)cDrPYNT~$fr&mF`%j}QvEB>g2Y7{Tpg9)gX-%+p?thjqV#s!gSOVHy zTSJ+B*T*i$|K#&xo>*pdNGA%Y!3Z(JHO>tjGXHq-rPir?JlXw@l#b;P-T3&omM>){ zpTmJ&6|fbihYnrMc0=lB(3D!#-S;!&-b5R|7!eg66kI}&atrHn9AE!^dkiy+DjR@B#dDcQA-i3K_gaj< zVXvkjl(;^f#qcqR&w0(Bu1V$I1AxEK*g*cLMusD$mL7xY5!;$X+!bbS=VCeJBJ7zQhCJLSxvVRtX zk-%~&8Ni1+=@WIzH!lpkWQnsi`g-eZo4y|1D-9+*cEW6)mSpy6b^l6tKmUD`2s94x 
zHevvr=3%~TVN)C&>7=6$8e-sV>AGEe44@Dtz+ndXZCOrN?&^*H2vCbAgh-F_y^2r< za0NH;2biV-aKz-)lsOH+X)+d-m5~wyPe;cmU@o~+!0f~a0J#Z>5&iA&ULZ_$O=HfK zGpBio$p^p|LPElOlrOvS%u342MgUS1C0kYUcTw6mc+-LM27;gc$!~pt_LC{iU)@$< zkU3;m_CIM{rThN}r>K{T0=Z-PY7KpvLXy2P(pR*-R}vxF^xwH}Li?v5)9KMUJY3o7 zA$e2$|FtBL7I8MGdU=ziUcZK(V}sFAex=6J=_?SPatL}jAy{1H>+4aCdP+tn$09=l z+DP?+R|)|{l~F9raUs(z4<^b!zRU<%{EK$Qsnp>YVP^g%7CaRiZ`@YdUSPcCD~!iF1Vi`_bUJ6WnN-n|E@ zNu1xw$_-(3%0(RbALR2M-Aar2W#3`7_KLYZf6m9sst#IBWT>>l>Cf`AhD*M3gJRsjVb_?1EVJV(|4ywafeDJ#i_YHr`1k?ezuzbJyWlE$ zZ|-ZHI(-^hUC{lHQgAh7Q{k(wS<&p3$|?h#b%F(@&ab@4AVcwYKq>ve%S%=045OL z1|Gn~w&N8U3rr5ZjxFA&HOAH~@A%Lh3e+C{zblKCyMi&W&w8fC@K$A*W7ADWANQJBD|CI3cc>Eq+e zEZI@*URBoUn%^Ew@kn}(M9OBC{A}-vkNU$vYZ7?1vdnIlA2)ivto?RiXj&zY`spvZ zm_iVWM-a!LSz8jUpDVl96^6=w=1+EbYdB@Jh;=Y4cysyVw-j5dA|^PD*F$r0Dj5kH z;(_PT=;+bl52@qbh+|~?;>*8dOF5%cMV{d!{KiWh>J6My*nj>}EI?TaGf4m82gW5& zb=|hZ|1A#Qv+FMRVpKHD&3ef2)1G&~uo3*~R$&&xUIOfeB z+p6t+oTYb!ji#yZ-bFS+`!w zgO=_@UVD08KVpIZvvha?hphjcYd$rDrhBRKkR1fBad7@lC-8S`(EoSA>oPcx*#BGX z|L=YOM~cxyG&D4b2i6?a@8~&=?}K;J6N9POn!?Oe3Mc|}vI!q(sX71Cj0$Jmi1VX+ zcz6m-Q2QbP1O!ThQFSu!-~a2TZQlYEj#0tXnj^fn3tyt6$ABf2)Wi#R_JOW0CO$s( zmP)VA%%L?P>mq+PP+-wCKsfXvs#k#Ksv4|3zhB?FeS2bJBGRtj0pJF~Yh7JjvZ*HUjkmsh{d(=+_ai~>IvOG(0{ErFLHJn@B#rALZVCa_ z0D5|)T)8BeGhVho|89dnrdPjy{R&iXfB$~t_=f`lHcax+YP2|@7sv^n!s;m)<$+K7 zpgFK%X14X|@BcvTvfsm_rKzbD((4Wi0j#}3ujsYVXK=8St*yM+LGs{jT-6x2L`KJsXf?uxck8*~PQc$>nh6IkQz~a=D7{y=@ zKKsbbPm7w?=aad4*H&(4MEcZqQ&RV4)ZcLfb(X*=p#ZFqLG{Z*5{9h%_5Oq3w<2b+ zfU^Sz2dLnmCYBi7B`=+N4}{+xfvWRCa035CeXz3oAou&~b_P%?EEIKkfw2;ly1ess zaX9BaXcs_r3ae&5sTA`0y_I_gPM?wRX7nHm^al7k2mHA}vr+9r`NFQo&AruQaKIF z%gZ}o6EBshUH*~8ee3Co9|>HUrDMrn>9+=Yhj2>C%K%6wC9p}$z!G2Kv=JBV34lx`Ezd{8Z1knUg6EGu76tyZ&4Kw?WR>l!o+IHYO8+|HGXvvGiQDJGXLk`ZqpFYh2jY%N?U}A3W2KpI<^+B*29OsO|0apE+ zQ5}$$i%3jN1jN3G2m^ET_?Vb+FsT4m_@kN{u$ip{=RNg1CS;(~Af%*@%iR=94h{f> z41%VUlatp@)Ll2Er9p*($F>c4KcH=b_aoaum0ZD40K)|6Ju7UidP!yo&jHCeAZn~< zXc(E0kO086pvK_!rZlMe;`W}N5k6cT`a39DV`Jk?9bqXK8$d~Sm(BGC)geo`p$qB; 
zWQ*pANkNPk(96i&zvye*1BK<05oIt7d?q9Av;|RRy3hRUNN>JC=57GatnPMq!@(+X zS5KRb6KR;MY!$yHl(-mFMvu*rr%B21A`<}vaGtjLQ>i`w-6luAS%heOZ zF2Z29xUf)cJu6gDoTFCnQLE5DIXdbD1cPte3-*5pIDHRN-qt{qcftwbpy9L|(oiIX z(up2ln9;o)dSH!DONYE`ypJ1s{VoY6dZ`f;nqUfW_zvNCHxEF?&r9@QAJ{`5ai)@9 zp4=9*ZCqTtLtxkVnYSOZ!wi)d8!07B&H_gtkflfzLZ~y4;--DFw%(qb>n}A`1aQ`T z{m-}#d}t8MRk9lH>|_MuIAG0t^5hesVwiDT>grzGvV$R!HDTgcTv9?pLNWsJIt60% zz`i;g^cCZ_3$d-ey?KA~=gy!gC@n256O&Zg@{IGJpz-l>5EIU=Um?AASOzF^F4JEQ zJk~S7fA#|SiM}$e1@~KUDRBiRs>1>#zkdA!7m(ahO$r(y-3UO;fTzJ&;RaGyt$9qL~c-LZMJ0A$=F8`&_2A55c7UPjnw# zZpQe?l%8_&3j-dM0~`*BB!I6Gse(!>wsCR-L#}949=S;Q6%0Y(&ny5K4c9j**XTJR zH8r)v?O9K-V1f-s9rEGBwdwECLI7L2$-uh}5jiT5!GQ?5p8%H3@a%qec6MRSo+ZqTbfO~MEMn%CWzsOB)Zf;=t43ZKfHrNedI~q*QJJ{AH zzEPWckDNUE>hc^El-(G6s+u`;4! zUQzr4PRH)pt4sBIl`I3vl}vD_SBL1}c6M}R%OwdhsTK_HaB(2x_^&m8^VCWXlJ;L& z4Q8bve7*(|td^)n0LsWdNN@{Z!e>6^0&y-GaG?a&zv^wijIV&XC%`7_oSvn8Gb^{S z6k|vPlIz>E@Et$_l9c6yLf`d>Qjjx$y#L_A9_Us7_M1ck{pE-akSqC6Hp&IxT-{}g zE~M8nFJ>M9-wfmtfHrZ8`^M^U>1PBYQop~yB~OYjI1@yePEUa#HhgnjKKBzKf(UqA zI3siq%eOS*M~qwALQwr$o##_PrUhiX3r1`juPtc-V+I&NLI15KUx@7uxGjI{2_-&Z69mDIVuatrD;BSA5MvCoBkZS6n} z4-cn^e8T|STNt0??)GkXI69HgVlYDJv;5ilVm6J2$K~3$B)BM+6?%1b1$@vQ;qWm& zR;9Y+jsvXF0xMQT(zf7YB>6he ztLomK9|6CVXY$s*Tb$^M1T^JaHbmw5!RO25%8B zBg6XK^!$9=ogRw8U?e0Y)`>i4e(L%t;N?t(e;zWDu^LS<`v8xO3p<)wRO2tdVuT7` z?1+GMqb&H}rSDNuqd;C7oL(wSme>)ai*4Soy3DESHpU7(Uv8xt2B;w00YhdMVDR&w zz4N^`ng%k-VC?qo*M69`rK!Smw&8QF zccE4-8DLL?p(mTRB``%Z?)ho8o!q!PP60g_i0cigu(zBN=99Ml0>i?QQJ7z z>G>!l`cBsu&A9aLsk`MwIgY;v4ji1%Wr8zRU>SDU(gkm{$V1P{r zaaknJZ)&(4f_r3qlPS;B&31hK6*E5Y=VqeETH|Tn+}s>k-m<2D8Fiq*(Go1pU}9Wm z&!C0XP6kAXe|}t&WwDP;lu;HKmVf(0Q4BE-mz(w zyv%a*8l8Xi1&gS1IBO<~0}DAIcji>L)Y>gc0!|wz2S-LvoWNd?uJW+It%T(3ex@B> z;?uRvq(qNjox+b}Ug|c2b!)nz&m3L8o%4%De|K*`&10f?ADn2eX0cb9>SJ zv)gMneX38F#q(_NJ1sVHuhNr^_IBQ~JH(J#H`)qSuXbE!78K`2o`g89K)DUewa(XF zTQP$}oT6Fr*0)zq?MJ4e0A}{Q zwVtOANCh_d+{v9N<(5j(sdWY5x)czDhV1Cg`gRFFE75n&GgC)Y9Sp7Ltb2kJ4ItI6 znnQn&v*^@m!o=#>1G|G~cH~7Mo{XzNr+?NHix?>@HtLB3`hNv-4o^$cR@Tr?3#Ipo 
z)qXi-$Vr{*o^2Z{V5R?|7x<<4X&;pH(E3& z#K60{Rx;Zu@31b!%K`zA7e^hMRRB|4z&3JeyBoJ5y!-NjiyarTJuToDJ{VK$?w zxIn=FCtf>J`ab&kQB>P6r-z{+m!4z(&SkGxdXH_hBN2V@%`g&Zxt)P60SD0<044f>!lPjLxK-X;!ez065>jEumz&WV2-5h+e2Ih&eLCr*Wq@ceC`l= z98DNQ_pZdZb7uLJxa~2omPob%NGM%TDVr=Rwk2Oh)&u%z*TW6I>eoO(p8+VP z^*C|8$8@9kRbgKP9}VYNN?pguK~F-C)}^3(0Ka2SgU`4Q)YicGvYt*4m7$@^*Jo#b zOjI4v4%mRX1$UP^G%DOG*W+F)^h|4sE&|$@@}xgUx~5Bt{D*xqE2tfC4H!|EmXIJ0 zRk2)s_wYxr+uQ9%Uwr6brRg)rI8OE;>P^u_09aAK6Fe;Fm4S`PmP4d7`` zcE(^KCTG=ju9U%FDiMmg$paq)odr0L=@DuF30g$H-&*ViTM=xrC|Q=V0^d@QY*-$< ze$#6}cbpOj=OvSaM_@2WT7WZ<9}E{Tlz zy!D>475Su@$?O~O5*!?TagTdO-Xb(!MIZR9?EkdhBX(I$`KAPlfJ3&(-y$HyBAI?2 z(+PYC(FQNW@wx&18&P={UH&UcCZPXOeiKfPrbU)afQLMV`nMZdEP`g=(`3dU=Qq7H z0CAj{q|%W~3-#B_msF)tnGNS|saAhF_1BbBf-!ys8Uc6hyM_0vc)S>A^Wj8N*?-3ga z?F^S2EBv6H97@BD+$R|hB;`zN_B4Y14BtbvO-PBqZ*0C@6iFQ15O6oQlxSg4<(O$( z+?QP!)%N0~cPgriy^;g^QmiP2n}wQAxw9dvXu*#xST0YN?cBPO@S~ey%agI1?F1j; zgg^6~+a@|no8a6L(fTLDKwCoFZDlTx zzF_zzrL0f+3uEorLHUi>1b86gu>73G=}>!@W^*M0`(5XF0J>knnN!lKY<1@$>C)mR zOEJ%qb^ae50vd9{7@~qENzfP8}3UI%DuA6F%-^M|RY?t;U*>zwUHaFe-S<(L4fduj?>F>9zO@$fA7m@{Y z`{E)tV-qIK=;;@Qa(J2y#L@Ul z&ac8VIAUu+=>Y|HT^If3W7#bW`&mO`jzJRG1v*}-Gs#$5lGkxDGL~LF{JqO(yaZhg zfiUxyrLsRi?Nj_$ss(R9W~B8Kvbhtosc@8k8SuVcK-uE->8?)+-dF--TXm~D8+Fzz zN#l%h7wx@XI$y%Mg`1rCb&CF4Hd3DEN|QG7b5{@fC|&PYfhBB z8m&w1g8FLh2N?QO*@HW1t8Ts#Ex0)f`h!;;=B=tN(TjDUw{#SaPYDoXQ|_cNup-bN zX4;O3<8d+A=p1!migg9y48`E{1E!Q;~f>?}D4qWtXm*Y^exyjmh=zMR)7= zg{T)9X^iQNU-TT1?)K!qX^0tjvnSNP{+)k`JNSF|Uj(L$-?Q5!jXX}(1Qq5p`=l+` zV%qN=TX(7`%%M`l%_STA=)p_c{mydUE9ey`TuY+2CQLvu`QpZpmVz0kg{?)+ua+`h zXjvLCoY8cWGps$*aSUg1up5dh!DO+J*fE*FyHu}r#4Hj9cMGg@i;_eYJ$J8jQ+T6QWT zqK}us#nhH~Tk%h`Td(=CtY4hpK~}rygBpb-lr7CmT?Y?!-on;S^V}g??o!o=Ky6py zyJd}H7B%6K_B1aWs8>!-WY}V&37oSvMUw~9pyU!){EfWMEd?$!a5UfcWKsSbdqov8 zXGE75{vC`?+r({Q&pj&cK8>RSv|?*~YmD^^jB;@Yw6ipOFlT;ID-TX;Vs+L!chP5x z!$rz+#AEfPJvFKGL_#c7&ls#OR`Gv4k9+oo;0)>-JH2Sw#0k(G%%tkyEsL!S`^neZ zK8gYf5JVuzVla91}8>_>ChMPYwNY4=agKP+XRJ0{Yo8Kzcbqt zuCBz_2Ve|FEqFP&Q?P=4w_J4@qqkOHyxO;a* 
zb%Iuw^N3lFo(zqG z57y)a3wn#DmPZ~2k0HUVe`1Nxn3+jK@0xzwfy#EQM-IXw^qezU!v&Ld5}uynka!TNa=EpDaf)T>3!cXle=W)UooxB**2>}6Zr zju>8JyTeMIRVf%A;qBw99Ns6>_5Q&wqS7`}fY%0cz42zy;CpiBd3sYsPsB~suO2?r-Rv&~c1_iYhm@ru{FsL}3_EO|UKJdiy<<8H zt#y_T;XC(26)2v2CPnkj)agZRc3;Y+EHr+`lQ#oYEBf0ChR+T`d(wJt(tTjS)bEp! zEU)r%6@0p#B}vEv5yg;#lR$)WWRSGg<~&>2ciS=EB1sS3nQ!KYEpSZy;5LKO=u@XD z_s{`!_kP-mn-+d#cVjm)|7?LjONkc!iYz^yYlZn^3R|5bdr?ZF;vYPfsJDr3ECfmA zdlGI+{g|qgYlYn+E3`Mw3T3#o@D!B-<#TwxI7{{1R^$bDILBm740F ztdXvleM9l6{{xJZja-o61^!r1Pwu3K%`P&KRBl>rdR|uD{=0d@!YCdsg?wV{Vi;ID zQv%XIU0UobUks;26tk9ok&zxp(Hi|-`dPPOp{nk5u=;+5q6*T^^c@dy(mHRhz3sU< zR=a~|*crC(>atS6uU3AL_4Y@8s={VaMg*#pO~H+588QDxlvHS+i9{d&L#Oc4$N-y= zs=S)sJI_6~BW?P;kKZII^3^D~=05K+x@Rl%J-tmYin~A7#9~L;#~-t;X&Gtw2Q%|= zhM(3Gkt-P9U8MaSoMX&}#*mJsrYSn6udT~u#jN5wt$ts-am}jpcYd@$@zazJl}8 zi%{=FLygyy5tcAH>5tkZwW$&BID-wbS|llKAJB~hFAA{(woY~pHwOnY&IhtPODq2| zm0_r{NO*sBCX>iQZP{5qXbfBkkNu}lNiXa7n1w9g`6uyd-pQ*iwhX5K=729APZrFH zj#TCxOq*dXR`2N?xc$O7a;N+q1vv3lvwy@S9Np<0gxV|f_0Dj&VSDHKh7hU)yBFX@ zz)TYg$5R-MWwre62~K|1U|*Rv>9ngdtVeTg;NM~FH0>Y#@ocfzu{=6YI!-O?1pfP> zp`Xe1Cvt~Amgry-CqF&3w7=NCIFmls`q8IL5EB;rYDjf9WiNqozuYiNkhmj~GBT^b z2#q+Lvup|8x~LFaY5KzvP|L-^gS+7gw(CWOXn%Wf260(hy^!twecHf;-<{7*LdK}- z1@d;=AQy!_Mw6r69F$AY)$Mie2k#XW1~EP>FjyX{79lWQqrqK~pW^r_!)R)mrs+=I zwSor3UJTF;Q67c?m8_EDV#0K+ zuDNQaKe&nsRv*mPZ-3I%)C7d9oPtQ8Btgf_Y%v&EG4lv(Hd3Y+Sf^FIzJ+yz1CX>1 z;xLkr1<3QKXJ&3+nj42_5U!V+FykIPx`q>hyb>S`$fs0Y8!dx7sfc*pNkiL^vPo~V zw5$E(UJDZlt?sPnCMXb~!F8&pwsy+3{A#VDOubFdU4>PHb3TXxz;du4AYT@Idx;xF z+JSJ@+}7Hkwwgn5^?#H!x1x>Ws?Mt2CIV{mu~5>Mjs)lssz!$l#}EI9~b1mJ;(VZsq}aQ z#F3I&_=Lrp3BviKk8Aj)J2qb*#8>U4)ooEOZhs-NGTOo(U54uOBz5+l6l2GY>ao@z z_Wd_gpHX40=b^;2MuKG$80zs0Cs+isqRaj{{KH)W?4q`-jLfXd)aY^~+gn4hBlOv? 
zYenbmCRGJ2#pbB2`Qn0eQ7pRDz9&K(<8Ux&_ptsY?pIH(N;+2Rm*kq+x&`21$8K{4 zczAmHyplgDUYAH=Gy_iE4=#qFdJ}G#U%s_8IpkNihpt=RD^@w?Ydp>(vQC@BrW!tV zTLbTNKi-H4x8L#aR*=k3S^s;D@)~^`!&4QA+7CQ=3He+B;3r$*VyX{!H9XBJJ8-%vPyC*08S*KrUKnmX}cH zcK-!c;bir6i(%4Rn5SMq6}GXsw z-(6_o1+;ApBrh?Pd-j?+2YphQi?Sp1>AvXm%&wP(!J-~d?T-_@ zM=XQ4{taqRvP&W@vGb6g8LR6_1VuI9U$N@xCM-I*)GqmE9gEAHunV5|vcZra;e=>v zq3M!0qATIHI(79gWt%J1`sm`wW3LdqJA=Oa$NmPw8PDOt)`am%!bLj^i7oiYi-@f) zj{l-DGMZ|__*bq}UZ|CsuQ?YAPiLDlg$hFLb`~Xe&DCe3YH$m+AUlQBr#eX?z#O7k&1sMgRW?FWu+2?Ls{0?` z0_84{qf%Qb*<2^1d2O=nK+YC`3<{EoELX^sr6I0zp4W>k*9!!&%)xDlt9cJN?) zhaxm1UXGmR4KLOK5OQU=oy4ip(rh-nx6fCOJdXzyERAOidZ8n5f8=|#llBd*?egJw z2XLvDUsL={^{(vJ=z@NDu=u@gA#QFuz#?d47Zk#;3~?+6z~-CNupEXzzMGRLj)b33 z50yRwS4?0#b?sB%bSE5QHS})7hM@g(vkw#)yeT@$fG8w2zmtXMLdr7BbWQ>=_!yZrNq}*09C>&)zNvrYi zWrAwG^OgMuXJl{eGgV3?JGrD4Z52EjK$=ceMH*@eKQ<1En`yFW%$E zwxj;A{HXdKBOFRYO-)ThQ^IP{f-RCO6O$2{p7|n0&c@5n&tul?pV#`9cWcvcA{WH< zmM2Hd7uka-QAy`VeWbXBr`uo9LN(-l)JQzFsYUWw|sa`OMPNCgM~HE&epH1%O|=ITx2z$Tq=+G51h`Q z%RmQ#G+2&RJJ=-$#4ihVu*S>7yvpb(;#i*JuI__y7IkN(!}e_Ni;D}`;uLMJ2X>1H zM0?$5J*%~0d``^j4}MDCqkQ!q_KvPcp+uhKJ|BZLOxt<>cK*$`YS&m4@Poe2*P+=U z%kkV{FAsmE#Pz^A&QwjTb9A&^zfBx`+Rap@zmnW71H>W| z74xlTI_2wjoa_qrDkb}evw=lZI!ZU}89@h?b^)vw9JMUYXa~^1ni((LiEab19>TscI11c&LCP% zm2V{#oj*R_F1uV=&9UnJ4pJlpkdw3MBxVJ|_l}f@$H$%kT?||FfvTXR zk^6mYMrAM#sB)Ed=JzG=4)yhI_g^)I2oKtKg-j_~p_3E%TwI6yu>lE}RdL(C-VtJd zj#`iL+*vWl&WAT&LOlVtS^Q1?$v6U<>Jb7VeMn4vZKPuj7`ZATeOwv<-wc`!+6W#1k=(Sre*X8l7mA7`x}y>nrf$>@#L* z*85h(UxYChn~r_%e^RJ$brPYqp(qMlcd{&$nJ?AAG9-0tM5P>YCNsj}#~|zI8%T6| zSs|RbKHXLrZe4wp-;-0f*ArjypL}sc`j?x~2M-^ntMecL^Y2=-4L|{Y-Y9f&t+1S` zbDTQL&sF6~<2)UBmVGCjuw_f8y)JaCzxgRt;9V?ZRxo@P!Nbpg?n5LL17PVJs+Z#+ zaSkHslw4L>>Ns&-zie#58t3EpZBSuI)L^>ORvn$js#a_O(%vAq_?Xu@a8JzS&jg?( zZfQq+RZtu}+UAdVRF7Sc%st!~3qT?_C(ClrbZ_C}0$)jhk^y3Q$8LKqlV$l+rPe*~ zj6@T<1sp%kHUMs30J5;q+S%C|Ky1ng@s7Z=l?42J;R!c+BVVH8-f7=rKv#$L_nzm0xCg35u`(CO7A^%2o@l8=^dm=4M^_+ zL_xZ=(0eBkdJSb(^quQ`=ey3Cng6r?k?hUhS@kLReLsv+PwffI&QKFjZPv+YZ=(Ja 
zxQm((n4inb%OJng5l5Uo=%a7{4)W$ES&PJ7#(=>J;a!J3a5%x?Z~&g~qi6`Qyab02 z*)z?E8X5ZOyH~LvO7y+f?X{&@e9m@8ZqJjP-NYbca1m)y90Y(En&K2Zsq%U0v zUk-J?OimsIRzdgd{A*sjIholBP$THKjM#pUcak!B-80Y$K)v0p+eW?^i17mH$jcNI zC_JW&{4Nvs{6Wj7^H;$K{oi?4^@2rq=95sep!|G$EFUNYPf{)_ z14z0MK?biFWdFEo4{hr|o&(S^-*sZvN7WQi9~q1bP+b-<|9w>J>jWfWfmgyrNvS&r zfGG>(wI~3B19f(8mSWrtBS%7LIUC4Ez5ySluBU1{Kmgu-3Inn_3%hZiT{={c>3JSV7} zW2&U9dcge#p0+)$eK4M5G5Xb8s`RG>3;tt*`DPgw`F>s?0gcY#W|jRS<*i$&tp)kniCXK)k18RVoj4p0+&ny71^Bm!CK!wxAUq!H zHL_X(60cKHENfS;I?rs$qc7A^pi2QtCl@p@uSxV?#-R6r9s_pJEE34(!&k}xfLa3O zI0P(5q(GCF?h;~XAX!C#vWu}alTSZ=`t(*p61W>^9h)(Rya%Gy@|-Q?=O|4ui}J}r_#1NWy{u&TMSLE zM^d`NyMK)Px|rKoSDo(z_VYj)r~QB{zQ^{JY5P!Kft26xY>wKRmEGus-|hIQBD3BC z&f;M^UBjodQHT3OC%fg8&4GIO#yNwi#P~J+56j&Rj293d6}*x-x3%ZBK1Ms12ywV& ze8BM}Zw^1c5U`gMK5G4zn+>(&MMtI3}>b;scu)sQ@(zq6<1p z$YD8iqvZ+fZ=eC*(C}&7lGn(@WN~#>C*75|BlNiPEyn{d254)Lc#=q!y|`&tW31Em#+T?++sR>CkN>5zjrkpihbCU6Ix%P_aHe5yESCHjs_(<%#EcX>9>}YmmxUzEv8_aga-1HT~^p&<@)4`-ia$ITC!^N)L&yyTTrW5{ z&}=*8-RYN2PoFbzFme{p_Rx{5#FxQf~xZ-KzUXXog4Te04S-K0vG?)0C^^d7ArnJTr@nO7IZWf}MJ1N-3e z+6p7P3wTr5)Yd`vtehrh1&3YW;$1y?bD-!|RvUcu7p@K=3$M8ExS>=8ibgpfb+otc zoAH^)IU_Ap7Lu2?p9&#&$~D;EXdLiy>>KUpn#Ve_YTWHJ6D;25P{_*iE;9186D<7~ zmH1ZwFypT#Cj=6G`F{y-Z>#=V_$}5Up=KY?GgzW^t4oOTZ^ykWmLeEu6rU^o`xm6; zvk3|C58D4HNWNH_Ff%u2+6^H4pn3Yf|3Aaq%dE@QGi|H{Mz zR4JJ4PMcuFuvckEqG-ndg8LWxk5%yXRVHBmIXcpY02`9T{EJwVOhNNnjcMYa{e8Db zd^8mmz^rrxfhqt>o1dE_ydG7oeJ;Wm=dA)LuHk_JV-md!wEGe(A;Kl5 zeM^Hy8R*;tLrlvXarT_3eCiNz-o>^BT|)c$9RNk!lAzO&x4i%gKLVVHfB-VgnPw^g z`UMGhutz|?UIutVBLGnXD6f8@Yh8wHTL2O2``gH_w%T_8FV(L$Zv_pM@BjT+y~|ipfUrQ0MfT0phjYyDJ6%8hXZ?x{x=Id-Cai) zfS&}XBcQ2yy@|{5z`s==D6WV?MvZ1-(@0sr)E?jw| z`%F0wctI3nM^bRW%A8wW%||}TJlh9^wY6&wfR=q^e0EVfT}*1Ah;@lmQqt2AlX^~ljR+w4tW<%W;zoT zIs%lAyM%y@UG`g`Z-G`rAb^8u95Vt9DI5221&>uxQOUuoZyMwC?8bs+j@cy|oZi|< zQe?RCZiYz?+aFmj?t*&YB)xY4ZUKf|gzvwf-DZB9*%630aKKdawE^4~JmHINx?-M7 zY>(64AJ0=~fFVFSg=br%I*AMZ&bQnr*T1{QIZzA>SUMdJT&k7D4te_&qDL9~b1Paa 
zc;+bxC$Ezd=>Z=woJ|o(a;GBgGxY&d7XX24oYqqS+zeL$fouSqNfQ+ItG{rI_sem5 zU|=9xv*Jf3ULNt(PiNFK&lL8>TV)DBbB<}8<=Zx$nU=x4_ zzYFLbJUp9V8oUxX_p%rflTDZCnQMR<>ON(tFL?obHEHf1~^n8w*Lh;3Bq6Z01S@> z1{~o(VEvST<%Xp802Aj(>nx=18Eb|CpaX&>*CZte@ieKjnf3;JFrcbIGH?nmf{euM z^yK6n7yt2*9s<^G==2m@t-QqT= z-6sXOUGIK$#0%=Z&AAC!3vbE6FaX%Upr9ZC-1UGi@?=H+0l;L1DhCT7b%N1ro?lve z703_qyrh_C1IorCE`&BsTz&v*PlCRddG!iWpi5$$fC1di0&DTrDYjoAL_h)snZWCg z;>1u`TU(samlxrupOIKn-&+1$9s@j(x1syBIj~M-lWGoeWfJ~W#B!dLB0do)pzCXpsGhoh~ zBJ=>?89oE5I{>TBWURUZU2Hjuz<>250&}n&%KXxpQQ{7;*qNA^fQ2gPypcwlcW?KF z>=&*9`;dxib3iZHgwdbGGK)_Ai_={Y7;|dq>q7#06LD`c5_S*w{M5H^9!Cct00)?2 zfDndg0-KCMrSm2zC6yVL=F^<;+#Hu_mDK!2B;B9HrEX>qz>1+S`5>*5^4s{I9 zPELm)VbDSgUO4(b7Z(?JT6uXn06hlZ&Z6bmfr^aL#Z-pZ~H`+d{&^6hpWmN|hkL0q7%y)L>CvXO*u@hafQn z#$W7sFLh%D;CumF3HYg_-`>z;#Lvj!7P?Y(To&(PQdvKz|2YMjGX{m&M=Oz3A*6i) z1P&2CXCQS)QiunT@Ij$Ys90pnD=Yn^K;;K<#ZyHuo*O_s1q;i{xHf>j z^0H}Y2ys*NYq+EGF$fRNH!nlxyunsj$Iu7rLi`3pbtCm7dk@h}oc9 z5f{!qvmmXCxxY-nut`z?6IktisH=4&|0W@jItRdHv=jhtgR=?Zd3k+p?OF&ZI)2m- zX(JK4KzpPM4r5qoObi<=H|$(A3u$NKF)%V>fq}*S_XzZ2ZBDpC%ehc{{+;DtMkd4) zs`!BQgt%IJkbf$g{-0s>?kDi#A98SjkHw$`Wo7+B3=q11tYAnZ2At9@wx_^*kamgx z=VO3YJ+QTfwEc(9{m2rWy_8ThGc&LrzXBZq(hI&D0WMkc$t};D|FC%AzqxEL9h{ts zPBujVJI|9JbQD;j?+pyn|DLnGAU2Xox}HRZsdEb%1T+17sIX_u|BD?7{R)`zt(t>K9{>yfePLnYt?Jol#y3Dwvl^V2;C@xFTV9nQ z!{t8iF*?Ey8=ZH}R|}aRlJk{XEfX2nY0(t_^M#!aCQ5XYbI3HaIrrIxfva;6h!j~X zzXJ~@wg1*-b?EJ<814^2>Gk^VL^VWihgFK4)oac}uzsp5>Ej855qURb7k8 z7n4}YUO)e-{+i$elvR=|n^weNYW(ZWWp~cTcaLr`X#zDaW&f9g{%&hXG4}Yak`=BF zshiYPi&|cubP5#@71+ORP7Li+>B6p_7w_fR9JeCRP7bmUbj8sga=LfR6(8Jy>AigO zs&iZkAF(rorx0TA?kVRJ)8ZJshHOpS&v8s#va?7)Sn9WZ-a5ntiX?g^K0K({Meu45 z%5`(!j0lx?`>t5aZYD6hTh{$)euCl=T}j;Kx%6^=F$3)nvYU+T129lAF&$!k01MQWnWAYkw5f$Wc5pO951nBy;`yC(`1souC6Sjs8 z#iuw2U#WQUL|{A`9{Q)mwS+3;OzrOC2SQ9}pinyklLqQ$sYrz(6l%~BNnSOIW$EXA z79x$Ob64t9(TPq|y|Ar#iceqi8(K~vv*lyS9QF|DyF}y6OO8Lm? 
zeF|F2(8S`sUzadG?Z?3;CZfVd`{n~_uWkOC_ye=$ME5STBGnz;j*%{?n(ET_Amme5 zu|a`{q%W&YMb*d^CE{o*j?hGTrH}K9B?HCvg{_Pmint179L@xB6Dzd!qEi;KCbvsT z4tb9TONmTD)apq|bfC?~MriCe&NxutIB7MPo$D?0rB~fT^E`HvgOuBYdmRxz48i@S z;_R!}q4M7>o~*>?G>O;fl$51ge!t%7d z&Fa*pphv!fIX_C~4&vpXK9W@uw0{2EPuRqCMtSP8CW8o? zdyi%bHplHZ$D}nCt>~W=Q*BKb@uxO)t|ojqd#?9d*9Y>>)OtRpBn#Cb>FRG-;}Xip zTm#+uJ;D=$;}WgE^?sm%5*<3)$ox-2QHnlc;iO2<*3U?|(;|i$2D2;O^CcM9+?kUp zYnE~7{~oO2gx@&p2>%(aFd-UrG+R|1JwfhS|9sVNy83IoQ@+UE2%+)8?(V6*h^Zy2 z4ML6Xi>E=Qd8NQ->DavUfay+MtT5!7#}1Zw2!+N(^33O53>JqvzH#oo3tJhcNcwY3 zbMz<4FDe?I_F%Kj3h)NZUKa+TS$oFZoafg3vtQb8wZmW=&EpiV9E+3>?=8-%Ci(gA z?3sPJ+DB_cOz?%lYTR91jal;co!XfVqeeJPUiU>um^4QQC%v)GQh@*9@-U)=vWbn{ zFs`EPchlsx3ox54Idt7kFt$Bjud*7FfP1*)y9%c^%1U2{?oY{7t?H6nx=f6NiQ)v_7aDo>o^?GHdOEKU94!dY|CX`L5^tq~2+ZKc0hl zmH8q38yJizTRwiuf$6yX75t}ub%zlf3%QjQEmYd&sl3(OR05HOPrGDyU{>36&znA_ z+FRaxeE+L5Jgp`r^4ynGWFGSC>cc^d!lf454Uu2MPuz8^q*`iTycn%fl|eVH6IU&g zA2KCB14rBsS=A<0j~>>Z`&Ga0q})>99%GP5D=SllQ|fTWqb?D$QohZ&SHF*x2pOZo zC%vMh;#-ak513lY=NwFArtSWd*eHU(GMB!!8r4_Z*?pTP>Oi;%t#ov}ZlDqDKFSPp zAI-=_+DAy-PR-a18#s+L zbN?P_p6|BZG6-okX>B{VD4aXP{`5N@s0D7aq#Z>d6RvP1ZYpp-`xzbc3rR7rV&TRn zK+LCJX*b|%=X5o(X{kKYm~)l;c?t3T>dHvMi5#0?-zcrt8Z|9+*Ga|AvT zH1oQ&$%VGkYiUE{56EfTI)q@lvHZ-^&_#E=Dr#iR**BZ&J%b2xa!GEuQX@E0{}&r< zbl78iAhwh9c}G28agM#Zf-JHO?fmk&a{`|7W5+N03m7V^VM)7?(Pa|})Y$A{`3WZvRKZ`P7vbeLeBpA8YlIkKc6cq@6PFI% zWO~13YA6(zB`r}w2aj0n%+1Z0w_ZS@gT$?$YHtuBDiKr?CZJ^UEH9oy$^A+cZ)1|sRT5U?Em!MWQ|K6^q~F$iSi2agBsZ-Rb!WSK+(=ZC zxo*-v_Z`({D{OVwzwhy)&glI$n*eQXPM514J4+|GWe>uv#OT0|BD}s>1A{3%8C7>{ z;}Je`xb)*DFe7iw2w^uh&-QcA-MS79$!@8v!!`B25N?S_=2O>H0;P)~9cSCRwT?k6 zY}FUPmp=pIj(;>+bzX-GZggMhA}E=2Xd6|7$D zREFUIeaDmQD|=aK*c5+-@&lq{%^}&5vt|TSD1({olY&{>_JAiwzLa zjjwzj(2;5H(KD6vC~4Yh$2@BH>Yh>GYMDP;ZIuo>yBGG$HYs+DsENl5x#A?(X`LKS z-xxu><__ftAV^Qfq~yV%)UcubL^oWSKY5 zUP&4n)FL*f5MQR{W^N&Wt+yFWw>>V)m@^j|AXG-bvLe>M5zNwS;$q(=rb_EH4EZ#D 
z%gZ!grJlCV=`&@t3@6MLQa7UpLGqzt;Q|BdI$xtzUjUCYXU)!={X$>GgZbrY*Uw5keoiKP`inn?btnS$N)@IiL|6$=85luFPi-$6J}IF#KkZ!mB-3W(Xe14)eN{W;snD;Q z;g3%iQ+>wqhqvx_B!fP~?P;KvcY7|HC68oZ0b<~KSYPDqolfRb#+W66QbD7r6AsP< zjU*O~u%+WTzpD>%YD_srM7km2f6mFBj&?3|kM~E5MX+ z3&^&&T5r>J)`CPNxgzo+sFh-raGt0466hOE9+TtPzXn^VZUN zubD;Xdwg#=i7$r{yBHaT{98Tz@ix-ATGpqlepI@x_&UE#c1aK!S68icE97CA2#F%} z*H!pEEQ)*$_9!8dURFp>$mW z?%T>?C2B=7NpnPf8I(!CHG4BPuZLKI{wTXjw@uowVge}4?vYoVDcjk{d z*GY>zx422?-eSc@Wn^UK(vvfUYc0?s^zq$%k@EYUI@ivz0ahCT(IyuQFjfCcgxe6{vcLOdd!B!b$x+y0wnF4ml_JAqig}T)1a^vwF^C|lz!o*L zdUEIW^z{9=aSx`ooD`CYM@9MGdzT1jhu&7&t(Avphss zMqPDP06V?$WvcI7a?Z%x0GMQ^CyB@KiRuenxG0jfmk(FhPA_Tnh$a6Q^@39>id^@ zU285TrP5YcO2vsTbA5##T5IP`1QCplIb;$23nAdl+4}*^{zt~@bG2nv#wLZ+=@!## zGu1a8g(+UL=&xE5Q>v{^Y)@S0{F*7n+?%tfmb`bW*aBQRxHK2Pn|!Q*$k=|Y>&Vuu1Tl8!Y24EnPX?%!tHZGrRPw`}Er28Ke-ydK21Vw^=5RFUa<)8m zRBrD2R;kML*mWiNeN4^lo@kkkIzmm`vDj&D0iUP1^~Tw1r9zb5@6hUE zW1XHvBdVuSUO};;tBKr}F51mq(-DK01jzZ8w*QXUBV7NeN8==boJiy+Ic)qq)`o`s zWYbHn4%XgG_2j2mB2K-FK%*DCnjIGy%YMog0W3~G?-G|eK+7?s_M2pYZEO(WFa102 z#UThrU!InRKUY#CF3yjB;CeZ|D4NspEj-YtRRWGZEW2P&bA#zfOu(0{TKaisKO!S( zk%z3BxAm$3+&R*9Wm^C^LCf3PvfJK;G*Mr!MOD*(+ZD~ z?N;q{d6CV}##W@F5H3kmZ6nVi9y2UEAXvtkgX^YMO{5G-8gq^6Sas~`Z}Lu2fLPz# z=BrNRUsoMw5I%p?N5jft`N7tT>UR66s2E*l&}t>zevY7t5YM}5%#{7$$Kl&!LeOlI zs)d`01cBb5eTAC6yXoX!*4p>^`2H)8w-9{hyteAi1sSUwN!TJcb+@G!swHiCzh-W3 zgggJ?YDXB$h{G2;H}5fC!*E!Ji>q8NYH1#yv?TUbt?<=&oYngF;O#wn*>`UzNy3~j zmWC~nCqt1|_6}VuBgfhWXtZsTm+BXYCMI^g`LM6l5_->MmN2)ulh@P4YsIl9>;C2^ z%|k+PqZ%=vK(V;XD2Oi&*Ktex)ja8;&u)xhPh4ae5)^imH|HC@$AB zuvY`iy2&^i*z}2t8W{%e9Uq{pBWk->op9UD`f5WrKZbg9YRC=SV0y}C<*N5uG*#N~ zooo;{w0lB1_-4I2JKvNdFe+vnBlA~U>4)&zQX6i2N(#cK9dUkK9T{9Mz<-WR(?jIePr;#d}6*F(@^oUkx72BztmLruU(xjt4heD zCnW^8d?ol@kG>o^Iqcx-1L20I%mSY@*^{~3Y8 zwP7&b8UbZEd<^p^C*9|h0m_0Fs=hH4yOG`-|y5@%>NEqg4qeBcJF%~OJD!N%zvk>=u@WL=e; z$d&-!vga2+sRKQeND1hoNICaW;&6%&9T|vyKH@znb?sJV0#raw$NmeoHGBI9kVjG` z`BVwS(x#PFktFKc1FWMh$6c)8DyHFxqS4)gsG9LcQ*$=rDd&Da=!R~w87s$x+k>0x 
z>&w1b`NbEMBwjB~DM3Sas$*J9%PK^Ic1tW!4uot6=wq&v?M}Dqy}tgVlHJQKBIAiv zHaeT*phK|po9~P^N1h;e$RyxHn;x51eL`W+)93N|30e%!7@>yrs@-Z=yJMBDmgC!3 zus;}hv`=`5F>iciv;CiyT_}074Mv}tsTVsnOQGdbnqkuuy2pDI#`@JJ7xDD5#rE?k z2I5fZ+)1(${O8tIz;P1t~eqS9?__m~fN+&~kP)GTSBf}zx6 z{2VBE^})H4K@&a6k==Q*d; zG|8hveIGakrpg=W0+T`Teb&AVqbSU|j}7|R2wTBBrpjTl`r33*2g<9hYz%ca`N#_H1D7UQesEADY2g^u7LzQkCu) zJ=Yc-ZCf0jox44v>yqS9fp2gOn1-O|KAoPpn%iUCKI#mip?%#a6ZA&X1kRntnF$WIQ^@>#MJF^j#d>N-u9Ww$I-tm8o2IdeIFaQJ}VMmdT^DeEm1n6Jh$*$ zq0A`xMSX^-YQ-$O2lR@U$?cu`1b&B&jTJEWq+*xjkr8@je?^XyuRk_C#+7Cz(O^*1 zqh;B(*Z0>M7yUgpXSkRcXzlcstNR|59S?}O|2_Vhh-)hm`FUB#3NyH7e-zQDNpTe` zSXlBXz^QFPI(@>>O{#Mh#M7u#Qs9}#lynBFvOPjt~pD|QGfLUfJ^@i*Q> zwhid#Y`5NWHH3B9+|(Fm5>{I8^0CV4yIz6Zh;K@Anwh*n*r`?eG7?fyw*SE|SoGnz zdPT*-Ybf}>*ax-$b2=;WR?v;2wbV*^39p}|$gZYaU$^vSg6{w|*iR%M8DuB|(kR;yo*u%CS}JE&OH;@V1UfDJq73Tvq0 zw!>#mc=PDCwFO>X_t$DPkZ(MU@l6ecmIOAd;G?t|C(Ic5#4YzEgB1szOd>)<00$A%2(`Z=gV6Q|!o_o2RPQ{%q!i3`MSMhqN*7 z(1yKGtcS#_xt?T7D1WMnyvPHpv68Y(7b&X7HEdqN8v_E~=bWlL_b-d~%xMpsuL@6_ zoS+>O{)mjsZysldaa(LK-pQo9_Ub=5x5>#MV&Bzq9ch0#JA#PA+)Q%I_P)t_R*tMM zE7g_6*R_v(x6kH1q5+5j&_4Un-_lY{><+h-VRNTYq9M!I!1upP{r&xY|N9K# d|819@+?omIa=zg#V@&$x(h{=pych32{tqUKs*wNy literal 198564 zcmeFZbyQW`8vuw1N=T`+2uLZ7h;)e(k^-0RR4#Do4iN#R1?d(MX`~yZLrUqC?(V#^ zuRig<-@KVWXU&>5u7w=WuCMmj$F~Y{5;$07SSTncI8u_26;V*o3Q$n4qGDVDZ{7-Z z5TKx3lX)Q~rXVFIMx|f_GkIZdjDjNhHbV8PnvyEHkfYHf4E#4Q1M{P(?$Epp#8*KX z)>M9Uohp#z#w&mFdpUaJcV6Add39GU=jBaeGwqvA)d80Y(}Nf`WCo?pppKYWe#cgoL=s zI7x8JygX6Z^C*h<8t_$k-Y2@7ZRjXe&+yZ+=UxfC@kB*|5lR{#q2Q;5awkQwL8goG zS6cO6;t$@n{)AP^XkCG&du#V?z|^Bt43q-CB(@GTkyO0xY*bPDzLW-YukgorBSluP zev#LixJKO=Kvm);WF=Rjn3QlHAvZty;D^M6qtkB$+3m(Xt(OeSA4CZ+z?oa%B~@BY zo3{q?Gfdka8NBChs1R(AA^6m|j9(k^QR4BVY#bE-03Yx3r{BIQzHZDqeMasTj}5&N zjV~Xz(7f$?NJ1@4bd-H}M8BDz=1odP{)XQr z&SfehlL2GX>zWZNffb2?uLLdbzT{61qMNTKsjpu;YUP1{)gie1@-+QfcG!&&tIX8P ztt87znX@lj@E5$;L$UJ+y`P4?!?<2Y)A|NZKtTEM(l=`2hsN;Q1;1goYgg}2i3J)s zi4(aGg_B5Lqw>Y~ep6C*Y2Dj(YHD-wY8+=*v!L(J;k3My%p$D-l@^^Hfu$TVahv!P 
zUQ#>Jv*eWHkDl6;V_0St)8}Q}%6{!UH@H!7%dq49yie+5w5dBN{591H2kx%b8gKSf zkZq^(JX2^V@uzGu*_FiM#~CCbG|s8b4PhMEedl5;;UeDMeU^Fr$O)Hx?X9~XMV*{s z%a=kmrU5jyp=a5o&=y>#tyX{62W8W~rU9gZ3|E%}bvSj`vkSs(()rSFr-x+Lxb?2O z%{8d}S5=MD)gU^p>w|KLyG# za$k*F?b&%RB;!?Gp~Od{rhf3+YmtWj4V^&<4~==_Rm{ir??`A#rNr(NV15vpry|-_Q^`i2F8wqcT zJ~S=b6rtvbA0!7YTJH+D5UxE+Ntm-k@#obu4-9x>a8K1OZP@H%rJVH{mQ!aTbC zB-}u1>*Hg^*YA1X6TYW-Pt=b7lzZ+nB*08|?^E-$!i<0n!akSfd&?BdOk7-CoLqRt z&$+&g+$qK?b~iKaThFRi8dND&TF6Y#Wa^8`+E5h!;EZ&UNtn zPUyUSrw5Dg09uY@jzEsFMrCoAnP5Nt3(Xf-&9I9Dic6tMJsz!fB=gr;#p=}RhraSO zEe=pGtF3I>71+1i?JwIc?k<_FR4mdjb`9iw7t9`QYK&Hqxy>0>{;E95Iq1kEu$QEZ zq&GUvaJfZ`Y)2fHotTs;nWvm5x{*2I>K^Hl>=AsQ(CR)V+tbZ8LdC1sdFP zoa3C6eBL%01vnGg6O}UGi!|?G?%0b=XPHz}*QC>MhxQxo^=Wj+nrc7SHg#|DHJO2O z{`fR9@6Xa3X%S~w*|z_I!fe4T%2?Id@$2nk$3o&!Si#0E36cd4x%;^;WA<0=uLg^G zi6u0+OMEgcGkj+lJ`=D-G^5mH^~e z(~TE5yy;ovhXl6L0;&UAjSD+)RDD;B24UKxTEgY!m2G3o<1S+?V?V~YD|Ag)@>7R8 zhD-%8ZlB-A7Ss@&aISEkcU?a?ZRq&Q)B-IX{2?_E{y5x0#%0lBzVl#WetfQBTWV)` z`Nte|x#J*b-~FfQI>YYh(!p!Pj<)&Lsj5xK?WiNlUC)NS*{+4#*mbC{Ed+3!{x^aYv9${c&s}tHJm^R+nG;?-OSx?y!llRT=Yx0Ho3SyLE zNBNY>h%@-f>2My&y*7SL`{Ca8$J=)XA5R1o1jmx@**@UlW2|62w*K}y+LETqj*gguwGp=B^7>5djmRWf5^dFRl%8((f|lEf@iq7>Yk-X6DO|1-p+ zvR-^EXxhU|UKH|mR2{Pr_uD`$OU$@Xi)Gh$UU@8UEYIkGZo+RCai)B*t*Vn9-knkw z5u%T5&o!UdRWT;(E|^^|n1& zbh;u#4Otxd2i?fT@bC&b5%Lz2=gi7D4?g{d1#lN3lzUC*I}H?BcCb znmLlKGLIg;h%!=Dm6hpi(>NFz8TYMJ?Mt!uuSgWkaAjS`85-5vP^iuE8dfO5TUAKb6J$$v5?c^k5gQ)IkLPpS47z2;dssNrU8y0f znT5M@TgUpf%g4jky59QKvzCu7E+MA3J%=7vukFA6ctcfAJZY7O&|&!-n`mu3PbaVY zZs0-8Kw+XOUV>o~lyAeK=F52T+0VS(+S{vEE6=NbdZZkEXlaV;>>bB610*rSZRp?6R3`H-I>dIrAcYYRv}iVQJ%MY)i$M>XI_|}8R z#px7}e+ujO-?r)bnItWEnjW(5Yv$Y1<9BHJYNO-prD8SSm&tZ181sTlX1*Vmxx-AL{5gp>^ zl9VFtCfI-Ig|eERnyd_;AX4{f-b86&00$jgbkT;$!iDmVd46ovoU4k;N|6IW53UK|2_+t!D8!dX{YbRVrfhBdy_x=JT|s9w0U7= z_X1`~h1gf$0A_C|c;^n{po>4h;WT!7@%JQ4+kemk2(lqY*f?0(*)H}5O9c>b`4nC_ z8Jnv=eqjM<2F?)T;O2cG@N2>UY3T2fNK4g_mfmN-kGvFV=zo_&Y>jQiU>4v|JE6Y} z`)4uo;6Dom*bulQ(c(9uf4v2i7Qzx>yWpA-*2H_oY{1A{FCNP)gWtcz0|n)>7kIw= 
z`!{&LWQpeG+N_3xB7!3ISX9~R((3rtc-kjlh1V-ut&M1HCEv%3qDlFqT~0>D^7qF$ zM5V`nC(Ib_Lwxh=YqTkd$OnH~LgiPp@$cMbX-rus&nv}EJ^1+rD+>z+ImbEWPk2o~ zRa8_gXO#2ryN>s#9*40EVc=7VyhOQ#iuU(mR!i?a71ILeM^~hm|4gI1BpS(v`mgDT z5fN%MG4eUp1VCD~At=+^oDpndT zN0O{o@0V0ec!kM$fGy9#_3THB6+;@nTlA7DtLI1h4jRU80eFluogDfL+3qLsihE)F zGba8bDgs8C57d9brebFXG|4YhrUx|f05my@!4{-Of?E*avd_bh4UoQ13}_;kDT+iB z8^G)KBebR4V0al5qhPu1?q6<*Bx?e+#`y}HpS%$~wNUbn%_y+h)inV z#7zI3i{Js^M;1=2oajpwFBJ@o0*#bR*}s6HZ8HO+?5r-518}Yo;2Yt*G9*TVO#2>H z>vH=GuDhscE~0>$*oEeS7gzxnBA^}IOay45Uk5lmFR+V1qL~cfQ4G5oLXg(L%;L#0 z(!XfoFJ=SC3eUUb0A!y4@nwJTA`|=-7DfRUpLK;SAlm;Z^J;nkRWT{I4k}uOIDm>v z=>-)MR1fY_wVL5SMcjv3S&`6)Qqo>ZB$PE+z;JQ?J^-cX4j@>79+&hVJ_5nA1fY~B zkWqjOQHRhyU)@*fF8C~xd;)M9=S~-Y{8A(YkkiHJ15P*+%KiYSM^iX}g#L6uwG6l| zu^^F?22derOaU7JyZ8poq*;}FigffXP0UK)lZ1tjG!si3pI8VK_*0cKL@J1Qa_%?Eh7OjVxZm0I+YzJa0g9YLf3zwPq?fi20(f-T*BJz}{PdkDZxiJ^q|NX)_oNAtbe=mWr#yan8#bqPkkv~K{gce-MQ0N_;M=p!;@ z*@uEbojB_mI+0{35I}9th%1kfE~PgZzLKI0)PcwZ7{Df$dl`9L2Mx6o0L=4u0IMhYu}I9IDg~&F$pr-iU8H9PQ1~gdkAzi^I~ZP%Q=+*e zl8n#?g7WN#ASe z5K@FtMZx*}_b#@+gh~N;X^%gd0qDSf-Roq%l1F>>MY6HpGCGGS4_X0rh|FF})cm01F8IfSLa}c<^j^Q-@wEui}u~+{sT_i>abD@ly z`In~p&(Bg7SMJgF;CiX*IMc;D&&N!aj9OyL3mq9b?R0WCe3S~Wf6A0|Q)uh0{A)xo zLosfJi-fH&aMsA^qnFVz`iPxIM<pg&5Ps{C*Y92utVc)q;+CZ#*HjVqu+kgfQX2lt_eW z|8s}Bz_sENzUqQd^2b-z4Vwcox3HqOi3Yn|_J>Uy-rrNIZ@?4Q!eR!SS8T6;U#Z7t zyJZP3f#PR{;s(E5qBm2D*J7skD<{ub-Hv}Gb-7iU##@UY_N5C!UU29362T|}SM-}S zwL3;>LIkcm3>!x&Izd=Rty?G>lHjKc{|;0ylyi>->SgcjvfI6SqlBtj&no(vblxdV zlmN`}^8&x@1=n@)Z`?XRY!QyUGL}gNT`L~MbsV=>Mc>n$JXSe8Kd99`8iRKlh6oZp z9J6WwRAy85pMj{aq=A!$~I&Y7QW)4@b=lt>z+BeGh3nT{PWL zP2pUlLPu@#zD|&+e~}ttj8`^R(JG!M$51h$R-LWE@Z<%y9yd(`Y_36bidE)pQ(^Pp zVs1=B^v+;}(A}Yw<<4l1RJt`>`w5lR5JOUaht*_JInIS3`AhNa`t|v^f@?)T>P>}q zV%j-!Fbbx_IXnRqRJsU~)DC^v+)}oA*JO2db=BOqni_Vxg>^>+;io%(#JX-@W+PQ} zNiXC8L?XZeC~g^6$Jl)>QYbobRcO;|bFlWj*kJc$5?)i}@GMO)lzYo8j>PwLh7yj^ zEwCe#F~6LZcl+Uxp7-}WC%Vo@*enkuv6m`!fTFaTk1_Qnw=7H9T{%D5zysG*SIy&S zP8xc54P-Xbl4)+A&4$bSisABp>{Gb*#^Ypk@)LLLdvMisud&sh+Pr{Q?R1gL@l3t0 
zus_)-JDd&>v%3caY8W=;w$s@`dDT5=B|O37cqz@N3sycC`JKhq`rXdnfM%Z2%_Op; ztq@OvM*CE<@lWzeIvU-K6W2*#O#!vjEFXILu7r26R+>8vR!$H)fb_@r6R$O)^TU}b zKjBmT7YUg|wc90a^l|S4JCZu`;3u_ka(dZV<#V3wL7uL%eGTWzg9+CLv#cDIbF~95 z565Z8hC!70{bi~Z`mc%XV z#?Th}DT&ETS8plUYaNZ9>4uA>Z5*%UQ5IC66RuZo)YLdu%}{ukB)ryeUr2D&S;`ka zox+Q#kP7Xu&4@jaW-T{Lv%G%Xx0d2k%2xs(~awxQ%v zyH+x~PZzMIN};(SX49{%3%7nhD(Na)|D}fZbV>SE)>GR_p!$x(vgUnPmEikfdpV3P zcnh6zu$p1w69r4A{mR}o?%E^SxiY6yJdaIpaXd%}k`^JtEW5*WwA3j6Qsg0OC(tu_ zSNIObY?%^xPF4%0RWTq~SE`)X%NL%W%n;rWGPfSjNevO=bDLw*B`t8#BEKipw%)I* zOHp8%jSus2T}<(NbiM2Satl+zv9A7u{C8=Q8ksh8YE?jQN9mBM(e2r;!hvk=wbuZ` z-*#Pnb`-x)FGP4>$ZKOMJxOj$3k%zLKVlI(uQn0SX6mgU3THQ5QmRdZ{-KIUZh1Z zy6%s{YB#fV&oZ}%5#IP84Ke~!N@lg?oYxmMyiB~a?FZBu8tuQdkLp2EtK|~G_P@a`i#W`r@Nltz}R$9}cp?(!1$H;2MN^aUH zrlH>9Pj5V5Px#5?R4g?2-cbuzN3g(lYkN-ARvC}u)<9=j(&+UD|Q;f0fpTF=8dHqYUnEo>PT^Q1T(i`H@hqy#rDosukw3}JOs;s{(N54sx z1xE)9CTG@2$mH88^DUy`ZAZ>>MGGFkOys}~8|@MHNFOb5J{j4^sF>8*t6;zm<)Wc; zu)kSh=pPNsay?c7%JAvgY=(_{Tf~#S%&7M8tRlue!xLg{yJu!+vqZHuW$O^$BFWsM zW$DC^(Hf;%(PN5gg!N5^JNQOTTs6m6+4(oBY|A?`^4bjcATw%uUg*aqh zT&Q(By}WN0@wNJYRdbe`fmVm+>JZQHnas_oYFRhA^uGs@-Jc+#haR{namfk3B5WAI zMQIXYbpEafxl7paYW87G4R9p}()jlJl$cT;Whaf@Q7N@_me9Iced}@K;Os!fc5de2 zET#)ie-_`}qfKC6YnRIe(A3*3OTo+8}BXCUU}O zyWcBxB3@BXH10Kqjp{NLhow_JELKrXyM#ys{UHMh2o9l--Eiqel50iaR9T)!dK-p6 z35k1!mOh3*a`T_m)3hNsFn)J1yPStltVwAW5RlM@z*{YHN-~JtW8M z(xFYcDzZ)l!LeCLVIlq^rq|RJ^$m}M$_7k^cm*s8vP=Hd%2jvhEfL=RVLfq8PpkeE z|KyR}BAm9~$Ra(IxlhM_^%XWNi4O3Q71ayGmFF^CAQ6f}jSeTmQ+Yo`2Y}ws%Wopm zRCYP8IjCkcFYJ66(Y-ez0BksQ`C4v)YfkY%rc0QSvxE({$>Bh04dh(SC1~e%UYGyQ zyKGIBV%TTA9NdCn!TlktpI%p0DhTofMja>JnUefmeh3RBKy%A8Yb3-dprH71q2`By zn_R%%f5$74kc7m@0(cw{B97hiDPM9%pULqYcdJ3zBby+DThZX5Aug zp&zYvpP|O@xZZy*({!ucHw6RYUhSh)%@u-b9hhmjh4W zI6~ZI2EJtuM)YkT8htC!w>f!XM&WZoWsp(?9Xsl6@^>GC@(sbF=aYeG4#Ug=MaHBl zV1Dt)&GSXysO~pfjFnkaN9NQVFV8V%MpZU#Sd+ZR-H-*H)4&a8!OnH7ka z@YHP0w5Zl(X~L@QB~h!yaUsYl(~Hz`_1zxHP(tl#vM1IS+1vYx2UQGOz$=~$$;i75 
zW4UEH^nPT~6MGVbj+y9Hey-4|I|0m!>P|Hf(E-)k6bsDzqF=4S}UCt+Uo-%j<6H?iys11x~f6o<6GFpm$mJj&uz_8 zEEP@<$wWkr)DxM{kxN{*AQRFx%+KkSd*k_oR{yDb{)w&NO!R(vpu6&c`;BLIomf9a zJ}5v_i;YPrg)J+`b8;r)F~VgRlP!gQIgoc>R)WY=W)mA}gJ@U(hb(u_uAr|*dc4ik zIoqts(>0hUw(CZnC0p}v+mHx`&e1S#aM&uSfcRJd+F3tQ5i{C&E0t>({ngsDe#P}Ab z?+pgIY`5EOaeITh!eb7j77_FYB80?jlxD88?<6uU$H|xdv*#i;Sq1io4AV?ze&BWl zG~(GOT9l5LESHqHhz)sRdR8qBc>ecfzl%47C4?|@#rG{X{vC%CIH=SHk$B{~Em ze#WE@mCRdbxkw4 zBL-AJ9K6JdT-nIK#q$=jsS{@vggR`e>r6(~?51t6_YKZ2*+TZhH3t+|xRegxg}8mp zD1n2_fHDPlpTP#;KF`2*9wJ@Px=!zP<1@cf5yv11)E}>C7~d*PI?K*nJM60# zF9B>ngS6lf8o7gDH8;}}QR8-wC0T#(owvQBCpxeobWomo+(xD@mPoULWqrHDJt%K4 zk;rK|BVDDRe?HoGiKS~d$ z9{OKV*t1k|Nbp;)P6g3{(%Q4N%wvurGV_2le=lvg!rdB5=2m}I+$N?k?JbQ5VnN3F zLJtaIdWZWh!si^tF|(1iPlXLV4<MrB0N)Hrey2=!o zC`}@4ZH*DZZViZ5sx7O%N|RZN`xM;~-m%#kFm0b;TTR4yyQyRFlvmlF=Q~M|md31} z=?Ie-qaZLJJ;-&e&dT&S5)eaYlDO?`1PsGKqmk%Y(GTTzgCYAy0*T!wb`A?okgPbC z<6HVzgGk9)IHobH%T&Wabe78pzacO1YoCt_~0qx(P#LbGj^iP@GQUZM}VC&9Q$j&m0)g&G1!RvOas$R|)Y+%-Ff=JzbCjYap&YY{~~YKVKvFyzh?saTX=_Iz`hIZ@_T*7L!BA zD%9+ab`R^W3Msh6>R&ha@jP46O><^Z^~Yl=JKgROv!T-~Z)HOAd@d5Y zOG!}PH>9RC$7rG7K!i7$n?`xn`&fvskL8rM9KR{xce4sUWA&A47`}pCV8G9zFbuMN zV%HpfN2Zl<*lG@^@ssMJnFVw+CnQ57wB`s6)P8*K1|Inle$sl>*sbgBh-zUA#>(b4(*pDpX1{|shaj@y5f#HD}25cqNt`OM8CF|UD$;M(v#4UT;Gll zjy6otd#}xBlMgA*kGef|?wfzC;X~F5qRD5(Uc5AyP{$d2zUBBB2RN@JfL(V;@6tj7B{FqX8k z9}a(1bo{J|-LW8*dp*l_RL}X^R%AzNFkdEYe|0#mAyUhxt=-wTqB!3z@zAB2)O44n z?B}Zp!;ZD|YI;@GJ@*aQ$rVqhxieGYli?cYg@n=+hcKq$V}yTK^L)6~0oav@0b=PD zC1&YY+2XOJ1#(3Kz?Pg$AEZo_8#q^S-NzjAHJ3-!cmBi?BTPU2vu7EIKn7|Uqmd~* ztT^*&5S^&<;qhSYgjGWur8~8=jb1dLz1dte$Q0+;q06RG&B;~;ZOq5PSjUA(VAO#i zVdewOCbRa}qUFr2NK0(#L{<_ajpr%viM|I+_w!4I5?Pi_`GyL7&9w_KxZW3Gc@_ZD z6q=W2`cs=5N*J>K???pmg<8E%?eaT4VgrN za-(->zG<3R@CDEFbh{425$n$Ph^#2JPJd*!JyDo_#iJ@lc8!)%QeYn!@L_P~ZW!kp zKccku4|Tl;Kv_{^CKCb*`;t_-Hy}CzW;~)Qcm$-37jwYFeUMQ#uK!E`W~l=+)KLgq z{mUMVA|>!n&SC-7^O!&(NSin3%9D#Y_@xLgc*h%mQyUEbM~zbfbU~?DFUWOc;{D&_tqKOogOC`&KzlRgweMZ;pU0%Z#y 
zgeUk3&X8cII0QV{|80RA1q>S(YG{^tDWVS^eG>R6kf@5>7eE85nnB7vTS!#316>7K zqb7Y$fU5sdMGXZR0VW#BPH?xWpn&KDA4ZT+L~4`kDFy_{P0+ZEJS+uPgj?Z)S0Y&e zvK*oTAtW(<0*a`F7R|e$8zd3Ye(+71{4UZkQa8vNqJFJ8c67lAV3{icU>lqtJwc%! z(Y9ooK2e8M0sRa(m3cWe8jv#qEVg&K5|325^r`}#58DrEjuHJHa8Mj^LylUh6hJ>x z=Kkheu(%umHfO?Uhv=C4hiYE&2+6rmP=RRR@&Z_U-9fZ5{9$3w3V5Xhzd{0nc;qVb zpVheFXg;)!BJgjLr=Sr0KJ*NU9Y{s(yMX{LcBQmbq~RhdaCPaQrvvxB&I(Ypflvt< zElW_OwLiH=P6_%15glv}%XaZdfL}y|pqoh<9IYtYf($sKfzdMV*$nl6w_S*X;Xo=b z1j(|&(dq$a$a(k=AWh*f>Kw3}C@504yRdqX#I}3o*0-wg2|E z$BS3MMpppVSNsn_-_swgMe+bmT+Yb#PQhxVwv#`0l8+YLbNs3wpw~k0E@-<@c(jkq z4x}y^c7S2MOhhnpSH_>5DmbnI@ES)|2GJPxXYUG7d+%T4G;z54@-KRwghVQ@Xy9VYLfXijfk|~I^H&O?$ zHX!KoNCp=>(r_%IRRwba@d~MX>CfQ`4}i+(HIeZIDSA@y*ez4+g^cyzrj8y!$P6zDP&kwvEVlggNaBLpKpi33LK4s+M|O9b6KQcZLZ$D)UT)o>Lhe)hqcj#n zftZPH%7GleR|>F@Vwl6o>_BRps|UIVixa|!T-1mID=c0)kO5e)se)^*#j%Hk)}jM| zwzNBci-1Z&4Ls_n%e5d4BlQE-NrFpCLue;}1o$|jv9pAf>-K*)b{YXWYR6FI1cKKJ zE>}eH=Y1ri+^qm;EpmQ_gR7SV9--+2bx6a{6@iS>#y5ahJ%8IB{~RvT3($HJ$8iR_ zz)ZoTAZPy-WUP^Vg-QVT8j&NgmlwNl1A@1oTw4N~P!J)6!x`LGNW;s3jLUU*5pB!q zU~xj?r3ECDiQJI+vlh{@hwR>BC<7_W6m+`-7Fz{bKL`?>h=P*LfD=ZR-O`IqN~zmXb)z>TCN8GMoZLXG0fg ztzd@^Waooyoi8YB=6&}?ZpXl1L}*m*&QD)JTkKsg5LDi;f}d^RA@Ua)lL5>Hk|1!x zuBVSPS3Dk*4CQj(o>e3VnHA5aXu3$XJ6a&)so#LZ&HUb{3rS&#BqQ8Xny1S^0Fu#B zPu5Cx5BKsswfna_usk#TDg`5e+&feGNlg5^MRk1{O43mEap0k5%30Zo{;#;uyAE4L*7m?WU2y-*QqmtCqt@}{P_V|N15R`~a zq5D?qMB0DB5}v#A$0PuKQO6FZG}I{~8u-`npW9V4uNV{?K8*MNbD_ch0tiR;0r@eA zV!R$FzuPOw4w%bA3TZ^x5O&adASebzJP63aF*d?Jd`Q@dV1r{=MEXHF88n}LE=5c! 
z=;=rJG=J#zw>1@b>WIWrJtEoH?o>KzS^29~=iiM97`tIW$wirrssXw9SO@diiL+YC zDfLBUliZFvbXjLf>=3=u7yE+ksVf%hhPI${TED)ba7B?FO1gU!CjegeRDa zGhp!f@0(F8=Lr?CGM|;~B4wT>_$b|8eszcd+t^XhkBk#v&BL#L6s zoe05M+Ypl^O9I^@OC|-WQfQ zqaNF@Q~E_!uOL;N$#K`wzJ?Ov!}ptG9k7q<;8Q_R+yHQ5#ReOKOgwJg z7WMma1LyASM=?Ln@O$RSmzw4ms9RbDMI8 z;tDS$IliWZpSQ0j)X0e~N!j955NU>VS6H0Iy`dVAX zqvbr>fPk-TL*gNB(x^x3)a_6IgqJ;{UWr%*=N>=2sm~<_grwBAr#c5e8xR{otd-sI z!QT%~MML}9A!#4Lw;>$a+^@3rts#uqptT#PmcWQh1_F5&3Tf)?@pC{|uRUCrXb{h0 z`=OAe3`{;kY`3XW|sP0IP_$G^4CtVHq( zsC*kVX@=$2E_`9!qd7Ry8#42bU^E7u#ZNlH7C2I3=&!PUqca&X3o?1g&VfT8otX&` z@=r~{LkyUK0W(QoW^e*w6zQ7|oR1dW45e+X6$)z>u`lc+K({yIurywVjt4%8r#Nx_muf0`0Re- zLg%b7YvIWJ(?mt0q>2YtapBjDmWyI82gIGK1?7l zGd@ZPw&><`Ja*(euX(%BZa}`uPm#3A?dtHb*0Gs(CuzT^g2$KHbE9#?7ET#LM7@$?v;Bh>!laQDft9_+C}Z^`D*=U z-Hqt0=$b{3&~<&+6ZJ9%ct^LcJ=JNYLiD-psj7oP|$KJ)TR za^Y~B9yR?K?4fL9L(g@%LlY1cJqKs+HxLT7`pq1_?2!rhoA*s#6UA7PSAlN2Z6Zqf3%9x+5N4zV_5F!T@E5_SD=Ja)`BMn3ATkKtoeC31oQteivN{+p zvKz4>bpfMHNoUIe@qxx8iUfP#pTF zWgHnNH6e*=sZ;%`<1Pvr%FEyMC{M>3+8`+QnU1P<=y|4WJ z@0k#r1AO^Fc)1){S$KidEcoOuBDJlD@Hb(>$5BxHTa_1NaS;Rilkb?nM-WKC4X|)) zp$Fq)SR9cPp-yi2#R!q#84EdnSzX1$=G~gj_Feo>-OJdVrhe>(a@$Y`-ROv_`4{AZ zjIX!a;6dc!RgoS+SUbUZ`YD*H_0he~%P(9{?pvr}u*t7rE1B-o?GQ2A@^Hp9@YFgF zhb>9ZTPol8+u9Shgf zC25Pm!y}#Zh03EOVZ-^SHK{w@T$##=X;SGUvEeT#;bZYuE8&F=hHTxKc`|F*Y_1yWS^z(>!YH4u(_26~y z!-KKIFRAsT^ubu+M7Ic4w9RFPtYSFyQ)UZu&QO1Z*LJVFm^hTN93{#*yURK8#-tu< zQ*fFoFS(M{X2EoyE2{|~C4zR>=Gqf<^)>heqL2uv{Tz`o4IIG;UXu1mcBVU;+D<$4 z{?<3^*M4|I&a3orrWv&jFCLo??p0~E&E%^!UQ^9W6I1&l<$)iJ?vhGx>07DfVyo+t ztPdO|>!pN6h@6GOvl*+x(6Y9=qYh(BMcjBww&HknPYdaeESrTj{YlDdxM*HuZ^V%1 zIytEPc!pms;0=x!TR|4^*yL4A4L$>5w$mzRriO2a1Z~o-Ee@@GZxxyky@8;BL zt?yI1w^!2_C6SD^|O3gu5 zI!0>dm(U{;+{MUR%+m=IfTNdz}7P*Bj*3A;5hF;`dwTpBQJJFSjI zdpBOuq_Dk)-I%pjv2ZoqG9l}H#^3X!z?p`w({T`Q%FkgZxIUy>mmRYxQd`e1TJ2?4 z#I&&5>DsyaRJUWdIZs_CnfW$#`OEv}<==-kzu31BZXXz)3MJZ&?{4;BcWUe0f}{D1 zwyl!~;@J}GbzSY!B@mLA&K(fg2qvAP#OpHS+f9r<)h1}j#>;-l+Q?KIP=onTnjBoR 
zYLGrQYmjMSqPZ=P8p>Vyb2&l0Y#L8b8n|i9ypx}E5AjBwDw}57a@g7EMU4)`)%rOr zC2gSH_jf{AXCdtUG)wF99PBaMCb~|Hm^l6(_NFwPgr0J&?%d%bOcNf_Ei4o5&Zk^I zq)U5^duyA*g{ASvY~0lNNNt~$kcTt5%2ip4FGU``+vg32XKlnVF+(=9euyuJ?rn9` zsp_BYwl7V!sm5ZHUEMd$nT;G`vwWx;9nH+EFe@7hnH~P&h!DR;@Lz|hV4L#mkZxHO z+-}E~LirvrswMTIxcyFQy+B(I^vQUl?4wt0ZJvvcD-c%iPU+Jx&co;uesuFJ>!&ym zYd&naGGWTBds@pyM`q^@LP2NT%$ihwp;1>&9LMVUmTqN8em5k*<+q2{*uSp6_ikmcL@Z2Z0^kaPdZ_FEW?W+{+=oQGRU;8pm=Av#XO%FfZ z+RK^83izfpR?7)$7alZ4ze}DPzzprx`POV@x!d)U zX8Q=f>Lw=cTF<2--@#X8mYxi>0Us!9w6D+8&UCIgwEQS+;82x;No`UN>YuKkFsNa8 zybh<#Ydr4KO*g)65K+l>@5sZ$brjRbUC)auOCVh5x8AG)xk{GkB<`@{_v@b2&i>kL zI?YNgoD-kGM^lXTM8vHgROR2(;s)!<)&&}i3svNZbk>`TyM3X(OULr`r36ATW6O*N zXWcT5>Ez8DCR()BN}BNjNxEEuW4n*H*+^K%h-7m`0`%po9^Hd!x?#<5v&IbU#_9zH zG6>z4m>?tDH+PV2@>rPNSt|Jv)lW@6$MVXF>tD+5bR3jf`PVzwnx*>6Vp}WXJhe+>fnCa5+qrc%^Rdj2Pohz@YomLcvD(l z@ic#cQzxfLWqW*8x#u`f!X#qgb>^D-KHY|nrTGx-$0^I>-GD69*W|aQS*4*{C4-gC zeh0JxKSzDptVTLH>(koAFh;%*+9!gn@Sy|G)n8ijpRd{@h^E(GJJM0N2I|yKrr%LK zF-e;xDt=;H!%O~bu+e-f3S z^T-zd6Fc3PIQ-~E3lS{!)WZfcP|(IQ>O4%(P+ER_Kk9|Wl3Z2OW^~B6@h?3EEZ*i{ zr9eM}AMNB1rzUIu5*~Ts^UX1RxAQ3btZ76zdidMs{`#u_x!H7tz}C}jrzfDzR7C3| z?|DA2?sn^Xm4*E2L1a$V9uw^Bt^7wf*RH(mj<{jTlO2tMyF1lwNo?nKq+0iLV~QWe z5u?F7+zabBUb^p|>r%BZREKB~2Q+4Y{GK8snIrht!&g zmpRy+%-`K#n^_6Q=FjG^zDE33xLZqqW4_an_Eb z|9`Oe)=^Q0d%Li$fV6A-^D+mnTT_PRB&>i11 z#y;=f+xMLP&spF1t#j6z#ab@sndg4)`~KB+UH9)P4rAeNa3)E4>%7`psAp&L{mTNf ze!(%G|69H;%5Vk``K`@jXWO!k+p>Ac!_z5?hwPpVpJdZPd^aJpT2#HN{8gFZ6&zPi zerO1RCI3D;I59`sonOP$LgkxgL9-AAyy9qob;Ai_A`O-rTZ2|^JwC-^Ck~2gkIuv= zGpuO`{$`!+458`T)&T=ly86cEg)i=F81|b1;}&9cXrOERXk%HO$v7+T{N+TMWJ9B^ z;&9r;eSDoPsflTAV3EQG%X-Mt$EdlwMGBecW$~KmuvyQRVfYlr zf+%_*3~>2Rl}~@3nMM&oo!?1t##(J<=C>&qAfrVM$FWa1?mCeP9JuN^pR0;2Zohr< zv5Mhpve8~7S&VPGbSd+4qoc5MXZ~j8JQ@r1)>2m$)nn@wYUIt^qaZWc<4HGuh}(Qc zgt6a3u_;8#c5(~gq~`)$Qxv4EF6wK8t2_^p?Xx7I*jh#7CtoTcDRv!zKuGSH#(9+lGlWTuwue{1{%wYczw%ICIzO_0l`z^?AtKQ~qfK5b0o!(S zpVY{%Y}Z9#C^XH+U420g$Ce2o!}mh4x>(0+PP%MaT+F*_Eol;<)xKKVU4>Coj6B`< 
zVuXihZ!0?FXgGNJj^j2(kbGmlqgAaoYNIZP z%_BUpOi+Fn%#W?#TT|6{A3&Ah>pZMAZf6smbLeIK`$(=01!8=xY1m%Xmt`v@d;edR z&_m$B?;Q}NZvIjw)^fJS_F>Wh`bdP{8b)9JK-$%FSu&j+^acF;JQ`;9lHNEXVD7n? zk%;_t!qxAp4ilfK^$NeOU`?M7Z+=&Zf%eF_r;53kh~-zNU-Ah~AAF*e2^-0?*5F2X zo_?x=ndStj3FyPnp85Tzu|&~IWLnox+$= zG=73FaVR%3nV23v>LpOwZCz-PpG~jv~!U>r_Jfb)jQd* z8uMD{Lr4flLG=wHQtq1batavK`Sb?2D7YIvh z?y|9Uyb2n9X5*~pE42(BJY~(n3T&Sgo>;Wf!|}isKir}m5`eFYS4jRUAAbrK7Vis0 zPM_}8YeC|N^93y;7CR&m)3dPyL|d-CbnKwG;+pZ*Cv(T6A8&&WN~I9MX_17&rHF&6 zXsJ;f^MCr`Ht{Rxr5XKQPWBPfC#I#ho7<~2v;Wb5s~^R;M{FH6JYlGR%IZ=c%Zl&i zco$@|^@DU0hO6NK9iU<>u)XcZm{L&w)pu+K6AfTs*doj#PIc%gl=TCPf09{}JG9tP4xzp~%Cmxl+A~eRg zOp<{GkARH$*hn2bA}bbgo0;X=a~7e{U;m3jD&>g2z2~}9_`m#F3|Rcfe7jgcJ|Zg; zcpEe!546wYR}fKij5cLWay#pO_f?Q{$g!e)vwVi3vi!ozOgn7k*a96!T&47@1Cc$7 z;m&9T?p*!c55O(IJjehZ3n7LEgY7*48yUxqhEp=j(dhx8Zrh3|@Mux-KkdtCa+raLS|vUmFi2%!c8oNEUHB}@Jqis*oPrHptY-ET z!SIp?aHRNQB0zegzVbvwjV#vt0R{|~^cYypWMFn!hBWcOnYoMkvcM9$18s=@UtB^I zxK?P&sieao|1d>UX!<3p?cAMWAZAZ9Al-mv><#-L`|!V;UI5lq%U1-xj1D~JfrXPf zuzy;CfBye?b{G7I*s!aFw^e$s=d(9CA0Mt%=KeFjl~MPfn3{W!AF6{7%Ei(Up(Z;r z$Vzr(V&MacFGsp$_7;hJ8l^d}rsx;}7}GLxg&3GQW66KFyd`s?{om5uhc&2JOoF&5 zQ%TAXy?Xgj*h&`Q{2rjgeABt?bwoz{C0vTAfc@?Gm;K$iEc*KlrWdMNPW-b6|G_4W zDn9A|eHQ%9BFb4{{wL1jpN5RGc+tOFJSdWK<0~W??`>bACh_r8l1-dt&>}*VIBtGVQ=dX%$yhXvehLj zch3&d(%VCd-R0fNmctF3skC~cDrNq3fJjA2u_EmMYSCqnMMhoD*h#V)4quvn*YdbH zFBzVkx*rhc_kfpLt(A1f+z45{6UohAIY)}60cmF`ZX%JeP-~PUo{gH^gk+1&X2fqGSqtW{^ z#b=k>AxB$}de53|b($nbyQ_3Dt{V5%=Y9OF?iUNV=*3!>F0~lsTM?^W^OA!VWe)uFykuAiBGP4UFCKmuec$CkHL)2_8?otB!YppNpoy= zDzJ`RfMWfN7j{8|kvFZKzncj-^Nmy1D}FQ`sO3p7;Ih%Gf4adQedw@YfMgf;RIgsI zo)Oi4;{LsIdw0WU2cPJk@>b2~@-749O&^KFngPoEVr%*84FyU1rjUC(7hf!c5!m;S zwD*UkZFtxY+2DDy#lLhvlw&Bv$(HutSvGsQ>68p`Ot-@ za&6|5Ps9YtkZs?02keLUtm&(WqpPeiU|YOMxA~&ItEy-9is|hi4;(LaH5}?LA}Foh zO{d4s&5F0DFUBJM09u+LP^092vcy6L7AAM-RR1 zZN2Imfav^cod}fH{cllr`u*>A3LK6$R}jWwJ3e)dx^8*dI_GP|Gn=`~*XUxpi^kh49q2eIGFIMIR$=QqR>6Q*Gh=>@DG4!0fqOy1i%_6WLp~ 
z9HNJ@;dx${yHbBlLgH8b4l)gNnic!4+4ABBXuWu`|7NzZVIi-)3tFUau6F<(CC=6d zk=9zo0P$XjJb0&R_89(rc_EX*xD8N5M3_5|{gp2=8`Sx$Ejd1iyZ(sAXo(V@DSVdd zY(C%HB7DouN&H)i5f4b8xfmJ#n(*?Ifg29HKg@f77%RMQQpTQ096hzT z5Y@NS6BIg4e7TEYt#vFtvO7w8kkzzfYucS}QD*JDLVkr-(0Oksz-5J^+BNp_G$@l= zL=8lrFa}JuU-5HRZKofNnfPt3vrYo*m0;E~g1qPNu8f+ncrFf8rQ@_Pn18khL>adw zA&^{{Pu;vK8->5<3kto|e#qn4{pE)~gb<9GV9|dQ5IXEGdIAak6eB9po>-$~4q2p- zw4z9^IolmfZIeqBwm21L@W~vzdh;;q99eU>2vhxP-EO{+D5c_VwyKiORHN= z_0?bM1;AAYrU#fC>9{qov;e^?|Izd$zX#{Hsmji(Q3tIsg1hIP_(vb}L(ag?mYt$p z3n}Ga4>MC2Y8yHR(CpqtoZEx%7%_Zj9g**~=(lCx`X)bKK zlE?%M?pK$?pOb;|+3)`Dvtxi&Ts<(RK*QGAfdDboTH1=vor$Hon^=Fs$)mjd?dU<% zi9u|X_6c4Umm_bEy^Q)=3+0v5l?4$IK6b9|4P!# ztT3al`N4E&r1GA*1>{XNvbgCUyP&I9ZjurAG>6Tjb2J;5hi;-DdPOpQKNMUY9@H$p ziU`%Y@>_IswTlJf&UWR}7zWaLPDp>YH5#KT(NoGuEMe2m-T(U7o2hu25HvpvuCX_& zwiU?thV1voW(8}q6p}SH{sby};KaxXZPQ;9>~5g+n#s8wQX+3ooCoNY5MgY?K3jo8 z3-f9bwP+7quu*O<>u>Z7Xm89UbAL}Yj7zqJDh~tZd#&@t3RS|<{;h-yMQH)g-FEGO z{OZ}7rT$gYtj<5HAznHCbs7GpYCVa^Zul+ImS96kK8~)C$lT|Cp04>%9J^PCJz5wa zfH5V_S8Gwah+S<_piJKcpp<_t)Bm-0Fi&rQMCaKZ?C9NFXb%RoLW!tqt=zYW8BRR; zxQ`3S_j^PBy;|!Wb6-WZBnILts%fZNgzsPD>c3R9yId5m!0{W@b*LVaBN=IbU5$T1 z7XE3-fVVV#{8w)Y)Q0~B9$?nnOZ%;k{@MIiju`!a0*3sTA~b{-vHPz5gTfeT2}(iH zax`80uL#qB@~Ffqq5sge-2A&b;lGPE|EI$T{|6&LoYub*r~2*fp7~ZuD@@K=cRrSq z{XG3NqcOCurn3e8{qda^;$zb~1yZ#OH5GE8fLV_?@-ap>(LcIkoey4N@H-*YN&}ZP zzfHMs*K<_qTNmqB$}>HIrA}%U#^ttL{$y#Zt0gFOR`WDDgjkS1E?u?RfVqjErD682 zFIPMlpaH>}Fe}n08|vuvE|6d6=lN;oCMbKYR#lY)q@nHp!az@4+3|rpfOpsh;F-N{ z89@EKzA;~me}ox2XYaZGeOb#ObGui8f-G9ocxw2<)W_wX9dM4}^mO@S-K9Nvcf%iN z4tu&yL7TFmVXvw=Z)%GO=vI2uY0SO1W!(IE)Q~sehl_>Jq^j4A>-=IU>Np*q1r=9{ zkDG)VHs6R)lbI1Nc^$^}`Wvb#H2{WCVxYWG;9pIX6Ts2#m$LC$lx?3z`Z3*1aPw+y z`FNWUvzS)>@j*|V4pXRlqlMG)Tnja`rAjW(c!I!3zTi6`b+RfqRev6<=6Wh$r((g2 z7qMImg`H1$#xLL{lXn|cPPCD7dN|uNPK8Pb^%%fb@4pxY<3D`4GYHX3$a;UM3622(^FzE-GBh_9@9cnF@CoXBnuJ z1U=d_=Tjsh<-ti?#d)%qt$gRU5%rk!)c`9MPu?$ueY~RoF-w$zybiFN^_|xgpI^1>zV@zSkfaDfC-d3D2Rmj z>ir0lI$6V!IpiLys?WRLriEa;Y-@`Od$Y%W$<0!?!=~ymD6~k`ZAuHTJx)kau~OAC 
z^!w24TIX5(mX{}SH*@>DumWRLF|4O?KFRGoBTN3uTi{az1hRYH2&8IvG&taaTH5)D z^!8bzW!-lHdHSu$-IPX{w7SF|&JNHbC0B>B`d3jf`^?nQ+Xz)b^F3vr7MDcZZ}<*O>)T|mBWt-P5i zO1^>vBA3FscK!D!?8P$P{>Y@6f^Ik;e(qp3_R!5|z}dvIyyMcenw1kZkda zov>mWV#`4Z=tvINP$D;0HCZE3bXFMfRsPXXr zcjIxepSiEexAh1ZPtN`QZ{G~R5k8Bt`8gd0Q|i6_6vgyN_x8#2ZMQB)$({A7&)Fuq1H!u(xi@q^EGW9++=M7cAHyptB~n^60me)?p3$pB)N#u1Y9=~X{; zOP;(0GaF9ojsDKua?L5h2r>(v4!=1qk%^$z+Gk6nt@!2h*_V-#vG4`el=Jt%y4b&3 z@Wi#C?rONKzVl~7Uq`%%CjhZSo{CDjPj6U0rWTNO?n|K*ixw_-&-Q`u_DC%~zyaC1 z-KCZxSEv&wH18P1EnM3_%&b?W(w|p38llL2Q=z8)@Y_uNOA?4@0#?y6bPgLsLDI6_ zgul(Gx2L$_;0Y~sA^t+mM}$Jv@?G+i zD``8Qjc!S#Ci^w@5ww+XDP>AH+Db=WH{lcO@vQR~?M}^{8`c9oLG7kXJVL=sVRsbkD#WZgiY6sdtgG1 zoUm^ly89P=Ij&jph1)V1Txi~gIxL*y#`B~;Z*89$Bnf9zb`0!L_GU8)N@Jk=wy`rV zPoY#p=&^5Z&Ah-tDS%Q1UHa-!;i;yRi7?n{&bm18dver0U6gsPk+fhvtbC~664$#y28~P)G}hX zIsO=+*9){4U=Xc8E{O2!Bk%|K(`)#&`u;36WVfMoq1ouvK^4oJTT$i9`7r^nSm)D% zEf|kh&A*97Pl(=*Gn4?A9+o&3U+&U9uQHC(-`iC)5yPB4?%AM5hIBKP&nOa&2=@fC zjE3BVC@j}m%+}e7jWK3E zSw^Y3cTHC6`&Q2zx!ga>Oc&-yav0-Uf|ZOc<#HO&hf<>RDebxgS!RneXeWyU!Xpw^ z_73Oj?Y%#jEoOF~biK~an#uisvrragmkC7$J!luWpn^OL;`!d1lz~hdt<}z*q{(;< zv`9t;n@-g{?o!jVwX$Wd8ESu)p`0+BKjZv@4UE(Ci>*95bV-(G+YVUeN0Xo%V$s^C zIG|nUJX+YaR%_M+7*|;VIk1vVV$oWn`1o5pre;?9$}zyysE89tEf&LVi)@vYcAK{> zt~)c|6^kY?gWtSCdF|)td(btpX!e(Pj;LVwzi8YS1*RlhiW{{w*HKH;NzL4^mPcuK z0~h2~j&d?eSHs;T{&4j2WWcGMhNMVtIFj#~I7Wq^m`syc{-;g_AFx6h#aB2WY9_N* zN?7-iWXgb@$w3GW`6B}vqSRLxlWyN||KYq#{D3ivZvjj8@0*7Kh#k?N!zp26D#L*y z;5a0y!A zfR@j0Qg;b3XZ-Qe{HQIPfekgnO%wqp8UxGUf8JW<4JuY7OP%Qf7 z@;&@%A3CGRukH0k{^#~W&E>zxIpA`QKxuC{o}wc29jH;GF!1D+do+g^O zASdK>mR+{OJFK*{w8*A+HeIqx&lei1>+F{oIW9PE2SpF}T?l%?cVl9Y_8IA58Y%oC z{&4#cWjf)c4PtWKS5#C~(d%eXBeaAySsYN7;zTqKuuEP4Ws?KZVf1@3A$koKVluM1 z<)zQjIa*dEoR}+zsOQ**x>&j^q;KepqTZQup^Dtu5}azkr8@MOlSE)J#20c2K`2pfnLp6%fvHvnuDi~Xmuneo=-UfDZ z!{y_pZo4bZrnE0;5gG?U#{K3`9oD#+H1iTY5q)T=7<{gP0bLR<8}7XhHA?0ci*`|y zLi_E49Py;esF^d>b;5ixLJ{RXryT=VR?`ID0X_?&oxey6Zk&h8O(l71j%IoUu>jZ$J5l;X05W?E`+?t0Ug6yZ)B83&!i2til}dNhceZa6ao~!%~x~KpUZM 
z@l-)Y*F{DKt>!cHc=G(2Rou=a`EkwWHR-<*C)3=3tJ>y2^sF(|05~4elZvRR$_qKW;wI zs`HxVZV!+GCkc6e96fJ%-*hvs+@Xv~V4G5Kxcat_Dbjnd^xbmbX{Oy3uea7{VMbYY z$X=I0W|wk@Z3pt2TT=-6bl1WO2EoD-=V8KFQU5VTT4K%KfT*07nL~)fgH~aWLTGDp zYw{W|Y^7UtWc`_~cCEWXFbfgsb-h>u(9X`$D*STB{)HRQMDv9U#)B+_OlIeE&o=~k zh0rKIb4LSY=awbOUDeqqdph#+18Xp8x}(3u%4PX z2?s_#Zb9k^Kz!v%g}(F+Ng8J;5#-@a^?P5)^TQxSM!oT(XGufv(PJJXl#+|NU>bE! zR+`x>XqcGkfg@&XW4#P{g;0m1Hlx~i--T;!-S5uuLOy;jBS=L>Y_D-i#%C44Bw zYTfP6LjOW&{R;2msLvoY`=hUq;+~_zzy>lj>r>p*jiVWHwOMbUwGBnlgR_tb9>3ts z_T-KE>I|d3tfBAf$%|tr&*C^P<~&XX9m8Y8UN{6!U!`X$|! z_AhH<;!p>qE~$s`Tiy%adBX-WdDSdfRQ^ChOPh_ziho^Df#O_WTBDBUP_Vf=>8oSo zp3@%x89O)&JuwB7{guq-(H_q>`$ccu?E2xui?Vmz*bZxgVYNqYDJhW4t&RONr-kDa zno!{uy5?`8clnH$RvLUiWysIW^)Jkd5xP0K2@PlvK`j)wZJOB9UbcSYCwU28)5uH& zvT)gukcZnawy;DB?aa42>wG4+qKaK191thyK}&P+zGBnXH?f>O#Tp4R;=R2afwDxI ze*9boFC?)y=f^gkZk^36x%0{(PO+;pLmQKOt@}<+NKQ5KY51kp3#5yicqEQj=1<1% z3hp@jzQ?N5j;g4+P#Wb9Ozfv(Hgupq~==lLo4mJkxT0X}!R@wbQfrBmAO= z-Vdfs9};+O+bD8=CD7BZ`g{`lwNgQcO@0w z(y1u4bLG;%0H~=Zh0JduV)+BZ@dS9Tb?9zh&)bn;MnR{eGYZBB?QSTm2&e-)Rmt1 zO8ONw|zTL9TrlNRTFHn z=no$g74)dh2mQb9FC2$92(d7EJo6B85K=9=#8FsbgN>)G>>10WcbN?4-Mr%5sAd!e zm*S?{I3XWix;)&Us2cteuyRZjEJk={*Kvyve`)g#q-~Z$IEuK^`IBCdXAU)N^lF

L*%%s+?`k?l0EhN*sn}Zn^Qg9i3nKZ6yZuR5hPIZ-DopPQH zPu|XX?3wBPUI2R#8HvP5AOOi~v!16iQgEye-mCHqlu8MR$4XR~2vl=;MPGZ58%k$( z)8KdfxOQV57l?EDs#hH^CpmqEWpL;9o82yNH3SrV`zGd^Ye^==6&lr%HNQ_HM`GtDYa_P-$|AgOXEXMQV&dYdcOg6VP$=nLM; zZT>83PZG#am;Dwh7=2t!mp<1#n{TOF9rP(XkFyx=Jj2i0wS&S94AR>df>##@J_yLo z-Visbr(fSCWN9uA_Iv>paHWJt^svf8J-K3&@DZxk31n0+&_P$=1bv`w1hOYw9AfHe z5x-q<6|y~`xKc-ln>IA=a1-o)2pG!I z-QyN$8o(wH8=aV8Hr?5lv^Q0)e(RPu?1^L44T|kW=*T5N$X>>TSS-oi!wk(hG(1#N zc{{e2@cA$os2u$Z_{Z4#r!Q6gp`OSXwwU*C5 zSLM>x#Uhy8%5;^UdCClP?7jHZ6jZQFMU|$KCI?{}D8UfV2u&NBis^?$oSAqy2UM~C zgb@rP`UdQ9K++B9Iro1jN*RxCn5uzCR=L&2U**#uX2AU;2|yNw=|`@@T`7IE_Y#_w z>t)4bPD>aCH`G{ulmgLru*z=CHf^2-Ha?sT+Q3qd-sC%|k7&O?luZn)sPxo|6PS^V zh8rk{H^wHP8;$Pt7qL=g%xLj!cV+9kM(C0Wc&7j{LXuxV@}{2q%NUKGNypDl5|F;B zrN>>X%5+eNi*+M|AyzV)+}g&SqC+2uzs$%)PFdgFT6(*$+__J&+!7ni*Ig)X++Mid z#(9oyeY!e)Av}v^wtEf}L<1X-Ia|v=!Y8p@;Ps3@M)Tk}xAg4zHcVk2KE`581hV6% zRbUYCbQV@f05WeOyr5wCuEPuP3H9Qfhc)%a#9P%9SGR(1#10Al#=;k^M)?b9vXO`D z;~!&R8@ErTxQiDiyMHwdE~r#+%D$xVn1@S@6>0xK3X*WOhF~*X;96}wT+F*4~ zy`{IqIdifxOnSn0VbLKLq86G&{7mIrGMVxAPOoTce*sRt=>vnty>ga9Bcnl|q{)4{ zC!^jWu~;9?dXLx2Zo;+Fs7BT=>4bSs(_T&muV-ma0}#!S&ef`_H{Or%u`clE3Z=6} zNViS-%CRqB`DJ@!h)mv;Ir<3-hw{!JH~{A{D$Tvs5DY)He`kHV;J3)` zZ&CiGeQx?H=B-|+9U+K1cmF5_(w`a=G3MP7O~cbOl_T;1Hl6naj7`7iSZg=F*(13ltbzwh7%fOu z=YO`$W%Lvp94}qWnrglLjtI0@l{7+F-dkjBq@2D^AX6Ngbz%z~NS3N#fvO5s(t8M= z>0P*Nf)56pj%&p>1?XkCHA^l?`RSaXs;^0QY|s0JJVTSp);{*W{51FE7$`liwHb^{ zfjK%R>zh4FFAfq%*f!qqk*_=~Y%Q8tWpb?Eyv(*$@CovOKu2_>7ZZgfSV=CI62j=!W zHZfTk8UUzVDDn9dmq`~5u?K=2JF!6)<(fd1lfM{-{SSDE1XwI<`}*lUdYG%g?CUv1 z;Z%a?nvc+jhR8+NE)clXL2o?g-bxE7vPAp`5uz4u-Gv82hMaiCS0OjgrHGG2gyRoK zRs`B^jePHGTv+L)?&7!uVM;RwZsLeqU2(pOHVy%E0^j*ND4XH^VMm-jQ;SlG4(* z&WJ}Z2%WhjW)yM_tHu&Vb^8#6AR=tbj1%PW(8lp%)yiXmnTr9bQ%npL2&zXNR2Xm| zGp2nimC|-|`SmQ&Df#uAv<_E7g2^Ivi5+}SYIEf0fK!uvmfKy*U7?o|gG~M`j1Lv> z{Xx;+qZWgQ@+~_M(T(HP(muF3Zht<%7+hk3Y1*$gkEz=h;cK;IjYY`0z$u_hm#zIK zW6ij8J+zfAse2q-A1>qJr=R>ji4l@~ZE>u%la;ZRU(t>pr~y*Ux`CF}1O>m{JheRC 
z1o|E+-J#B$tzH{r=+d-suW5W#mpo4qLl-SXOs#2Us=rC*a#5VGIDfg`e7+@AuqWH@ zxo~JO*rl{bRA5mK3}pR^DX{3`H;SikXNe8WS>pJBTKVF{Lf{+XwEm@|%R9>?-!mOd8ES zCSPCi*V#z+oddu6k%>(~AY*2S59&wrrj>{yu78*)%d+_!5da5&naQ+g`DeIun?oiAL(sxzp0zWCk$y3m7vbIZ_RslQhd-maa>d+ z*ZFqDP=Y!LZBF0XaS4`4$EH|>=aDJb<*<2}M?}e6ss)~@wGV&0-ze+>n~4>Sq^jdX zV=xJkk-8nXAOkK%@7q+4w$kNo+tAH!2Qw+veURZV(9Y?1WIm{*34H{0*ldd1?;Fo{ zIT$qV5Dg6$ew>+y0*^l>i{?9#?wGh)tlfB)tLOID#WLW>^E~LNs|Rx@EssUj31(fD z+>RS#Nk}=jx5P}avaWB*{sAHYZK$(ljwMU$a{97VG6@U0UHXuE}jtn@}tYq#yfVbk~PV>UEcj-xmJBsM)xV_=wQpb@=gEa5!ga z;hUolh8UPHbl8?C&|-e8k}D7gKYUXP4O(&PEHJ0IQz74TaQiIDsco2qnb!-B_X3R) zjH+ugeKQRnb5OZ_d%x$s*&CVw6yB$+aJ)$@uPAd9^9R3SvyO9I5PM}&A+ouX+wDz) z5HE$b&{Q}Ir}0yI36|5Tl}oUCC~5I+JZY-x*lrvd1=Oiqc4R^Tca^_sJ+r4cUAU+i zi*0`y+bI+-Fc{49@j^=aGonmV={#ve0H_xom{b~qJ*QU1eo`nGg6ka5Yv@ic-XQXh zCi}?co|l^u>j;)cv`>Tt8&Rwwply7qFQ<|=(syfsB|{7g#Sb;*&3;-&vh8WWlMou! z=DNG97SScIgAXGcGzyX>?QfvKwugWG--}cLbz{NXr-zLfS@%6F%!5ET&u#VAIkss$ z3Te5MeC0;?oK9~+TrCPupIugEo=@ii`&gMIcW3(oN}LYS{o?e@?pY`+PM-uZUq~b# zXcNP6j;+uXvFmfLXu&j<5qI^v!#0X6l+Y94J;q;7SnNj_+L+Sk2#q=V+Ub^GEL^e= zB%03cR(|yH!%xU+CCtSV0}CoGy00Psj#`V~O*~NV=RCD^$<-l><@Kc$0BUTZ;9VYF0RtCkaQ?Pf!n)5}W0&sn>KG--QTAAU0bov)FUxzD}eT=2rly6S>p56eF~?v&ja5?qI2^HFxnz z&f(l~R+DX?sxo14t2%nb>@6YGY!%tMlt6r{O0r0`>S{JAJabjlznjco8ja>7tCwm zW@0{};4O!Yq`e%OG)Z?MPnD6Z8q6)kSer#G4wyg(;&NNJDI;d^g}8+HVr{ z+AS7TM7$Pk1c)29W`fccZ|c!;x3*6WT&1(;AckfRXw}3CncoKwK0H>stJApZW5!@1 z=4-f)9h98`H&~+JuMVxP$jf9m(*<&IxqgWxj<593RRCZp1jAT%$pDHdcxzuWz-STk zWqbQxGA$0YH4C$Gy779_b4U1s4WI4Gki^Vl>dm^!9XCc+SnRWn3oxh0kz6I#P|PI! zIk%jgJ*qa*2z3E|Ql(O6ZQGN5mn-X2a`WOw#v|tHRRdFim9${83~eV~-2n3eyDnJ+ zs9-0X>@cvi%i#tz^m7L3gMF_Oxaa1T4EDt*-QhlZ%A#glaV&RIoy#mQ}6}^a3n(zi+&U?~30$`b`I3yH$Xi=$CR`K(yow)TuJS z^$=bboosSTxXg^`NQ^&77{KuG#nHMpaq6zW_tKE(pWE(N*GAq5? 
zKQTL@=iSibE~u!<=_1-W3{)pSZ0)?uH#K8D8Pl<0G;BH~8yoP5nP4elrGovknujCo(MiGq*a7P7V-#z^Z3 z9X*%iELBD+ll_7tZ9XlUO&-27upK@bZ*aI7qr^fOB^1AGC7nFjnU#L%T%VR~zq2KC z%Ew=0yCcw0(1Fojt8MYEPt1~7E2|r%Ggyk@&w@>R{WhQ&QsX6w<1%~4Zm_Yy=ZImn zW3RZDJHyF7FjTy6_vrQ`e_;2*j=o!3ilQ9~@r|w;H0j-eT#_LsDxsbak8ow8KPzLL z+$^W4veI7Rw;`Xc3i5-2`(DkNkaO~^l)C(87J&1Thxmu2eXdB! zKrcMQGt^)H!zVxh-fA{vy%1 z)Ikd^*ncDz*E>-0_XZa;jI^~(@m6*yMb;C`MXagaWY&+#;q5d0RVT#SCm2LWH4km2 zM_Vi=@&Z{_PRkS-)!B4Y-F*Uav|bC-VIMY@9KdW%>geMq;#lc646-czSvVuRvw<)~ zn#qxCFRK-U><&ne%zmNYsh5+peE~*B?ybJ0Pn(8dmof&R1)$ZN(0>!-SDUSIh&bFD zmzB*RV%G7x^3esdL7{0B<@7(R0Cfiq^^9Xe8LP%H18&KZgNB1Kw)w(Y0j_Qu&KM^=zITRmtb7LE~N(b zphi0!SDX{xBH)WYdtV=4K%w@2nJ!3}x=V(mIurM`D!5~n6evtop#&ypswQr!mQsc1 zUj79yHtspF zp+)}asj;j=l+5CRKg}=g5b*a6w%0Re7#<$JkC@S>1{_^Y2RzeYmw>f7D_>;Ul1e1}LzO#FUi6JvnJ<#e~(@ZUHhJ>lgnn!!bL!YxWOy5)Low=gsU%m@~@E z3={%C-NT>VW5~&aKL=3VhPiAE#B8k>9zIZYEcNR7yJDYw1ZW=F&|Fk)vK9PDSpKu( z2i+0tkYmR?BbK9@O=?T1+baC8Z;L`f@2frlna8kaIJpXl^AzcES&WnwH;S`kKYZ&v zS40BH6c($jHADqdOvC`Ddp`jO^gQsH2XNzJw|`zZnZnB#mj6h)ZZM#->ohZJ=B+I< zGp(}60E44ztq1@kj2`xPp1Zzd=TlH15p)n5$r)2o$ZiJY6Xxts0=Tv_vFvxIO=Ex? zv+)jY-WH3lFY@{4y;X5~v*P+6S9C8Cr6lsr3n-XSZpy;7ZfMqO8C7&OUyK=E?;tup zNJb>A0u}7q=}6ak{3{mvG8F({@b7Aa5cX!#*?DFH{Bj*!Qc_O@%qNwBKy9cCjDHqP z{mV-InlgM)60urp`rX6r(P7iUa0&V3r9oJna$|6Vb+P4t~Elsu4VqS;ifbq46zpNwNk``3wsFjO;GCLutUe9^|8X;(hy0yf@O2h-8f_g4&^ zRUBHUYsDxiN1q#29IB_T7Mz^t#iLsut%@f*(qV#;%d&$w=f^_ z1(=zouV|_>XE*_pl@MyV6aY%HnyHpY;WI7@I<>*55?gOn-i! 
z&8CiB1sC+{RY1A%SLeyov%3$9(O6^KE4r>{bWq_7aimOgO^%Eb4+cf^|-xVYPO`s~7o}oKKjYe}N`ps-TXk#DMqOg!k zYs-RY&co%GVsOpHUtzw`4Fd zPt@UcL#OUKtl~UiPKCjl0Az}6yh84tPlpYChw5{nhdeBq6rBJP$j$QwP!$ znlDRcwmYhgju~p-v}A(`UGn;)1O!qnG}N=aRCxuq)NDx9MU0DCcWfR=G8!kJqaJ4I z@W5p1bdg=jqI2?nD8qBBiAz9SxaEBXY=)Cvqsbb(?7RLsIgj(y3gacGAHQ20q?@cT zk7-A&u9^e}2Os#8-0n>gR&YC_57JwC_K5Rs29s(YuU|$=J_}64Frj=)Dp*rM!dRX> zT^MqfjI}k9#u-~S|8NP8n0v+{XosGdLkUBrqLqE88rEd^Ts{=;pPGlnht?2Xt6u^s z_6OKqn7Ok6Tzl0N04RSFkN{;k2q>TrvcLmbIZ+nFQBT3_QbgF+pJyheDvtSiPA>LA zyRY-13GVV0_cIVgY}%>(PFt%7Hi_V#c;L%VgsDR=LaN=y?1IZ_hp86jU2$w5*(-GUJ)f1E_*&$~$(#@|@m8S?dRCC7I@)v92TbaJnOF{3N2 zG#?+jHYUN~1t(2^dJ%l~05rK@4ziA9;GXm&tC1edlDPR6fU1^xi;~CX_U3Z46>3;! zHVN`BHy^iK&Kpx&S`TKRvFe;&HMiyN18=rg?n^!z3%9SO6DU;9#=1B|Hjy6zFHIp? zpRC9S_SF}$hE=Cd;1$ud=VV}&5Ji9pe1145duqHn7jFoRm^DWhIuam{WcgP-dKcpMV$#%T>britn8X4MHsvy5+V7C{ zB7P38Q<==&I$~$M;8s;U)GcJ>&w8sbM018gamTf9+b$i*ha|u_|H#|#eMALISEhgh zIdirCd@|wdZxjpFU&(g>CjxaZh0^-M1N%lQXDv0GI=XTOWP2wE$-gdqUVhy+1 zK7O~_zzigGMQAxFq87+4;-L-E~pFIsrUSqt(+YD!nVgrbse0uvd8gtK;LRQ@? 
zP~O20G5xM1w)h;mne{Fta46;lqTRWuZ#kI*CvHJEdc+)H-sHr!u3AXyk7EVsat?9| znYR+D3jxQV`^5|dJxen zd|mIPp_yw+c}7?H;~J@mKg(S-_kSFpfd5p1X8}~&R?^gP37Ect4jFO*MY?C-EsvPu zvF@jd+Iu}cvv}L;3^U~gBH5*bFQ5=OL1b`_1Hu9Q;GsGA{Qt1`)lpTp+rElLNGT!h z0x2m00cmL@B?M`OMW=vt2-3M|B&0!F7AdJ9u;}hmVbP&etUo0z0W>p+&jiS z_ntq1F@Uw+_kEuE%sGF*Ie&AlQ?HD8I;ea|r`5Urb>6Pfqzir^a5)0$?N`xtqv&K0 zf;-QeBR@tO-`U1R7_n94;mmY5v&Fj0PVvh#ih#~<44wM#;}H58j-w2e4nDp)8T9me zsW8~X@Ei&);lxvtErRgUol-szSGD-=2zFS77?hV``V5-!nLPNNy2(^ro;}`~*hu3Z zY2U%gijyR_S#ap|ro(+uKz-@@%W_<& z@iCMjraLi)5hCQA(^6f0nz~khP$f&c2e=(a>Oa*!FcFk{k{A*nE=Z|QkFehRREPo6 z#HV_4-`kuO~&1MLA*PO z7kuLM1IlL^?>)~>q-Z3aiY6k3Jw3oDr4}f0hwrz?Fqv*I&apXk^8^zzNv61M2HHs< zzH6@`(ia~-SqJ*F_30{cyF<0~8XnfIkW2bnc-PN8jaZ7p>UVfKaXi6aX18wa@}cW8 zTc)<*^5WkF13-D#`fhLyOMWXrN|Y}7%1@t0jcz|^eWpJvpvhhxRl%-Hja$-p-$E5u zNdv2B*lcI|=%H9eb>Z^WR2+&Vhh;m(PYzuUbdzwyV6A8q1Pn4*VY%s|N6B-^D<{Q6 zvnu2Ekvqe9+oOj*Bd}LGSQ=+c+8bFW zEOG-9B`pX4_e~+7=g-3BY0Z`3zL?+z=ivCx+d65t4cn7q94s_tEmVDHove=)2g|ym z!enWMAn=b!Mzut{j)eKZv-g3fYN9jCyHWe)oNpaPxh`;809LT zSD8eo8#f7c2R-$zKP53JJtNxPSP6-U=H=(0Gm7-MJ?c10hd&Hque7Z{lQwu@xG}&* zzNB&l0KFVJwQMJCp%lQSJ8y3TwbkNa1#Q{4oKyRqZ6WuI*Mg-CyPI?RW2|D^gE4TqvM zR~;r&X3=%wWOQ)MOn90gz_{vTB>-E;rOH&uOoAp+uFjouzd)AIL7wZ8zt=dc?D zGx?yc!Eo!VJQmP)1&r7NtqlhsEeDW$tzG^{M((e~&L0|017CYJ`PByz_}}|`|1Bm` z^TccXherAw`b7m)Y!myf822ya+~6|q;N_pzaMK30e);XKfCdL1uNw*dtQV$1zgiE++N=5_Z1ZQ}}OuQx___X!PT+M{Vu zn@>5`Ep3Mx)>daF){BFF3S{~6k9foglA~Ii5F%VpRcnoo&npZ9w_kd&@a4jILN4Ex zp|8$@UZs|8vl7YrK@q-x39!hOYj$8s?2exWA^Mv!>9F2Ajf0}j8?~+v)Wp@?CRwWOov>&> zPGwt2aBfxNB?3s)qD(sCGY@arz zmm+hj64e<_*X2EgSy*?$usjGV#cbU5_mjG2U;{`A531lgEOreX5F5gDFepwwo!;km zSs%}JcLCB@DX;54D~4XGbbyM+2a>wqh+}iT5#yy_v4pO*_2r@TQEAJT0T@Y9iNw9W zFIOlk_l-5L|7fQirk9cVvKAcF_CUhydGFjKA-+oMP@!ub=g(7jh=_LQ03y!z*v{U5 z&Hg0qUB^&{8)2eJS*wmJW6If78j(dhTvk_c$xw1^%8cz$Vp zC%9JW;5Jv*p?kK{W2lQqa&)xxnf$ugQJi}|4_{$m0c{gi7H5sx*ImQTN6`t`x(KN! 
zyrgblQy5V?bBef&U^WHJR$med(5CMxsj^gDD52(F#%&6}<(>w|6;hU?Op4^02DNaJ ze}A{Y?Jbn!OJU7eH*eM-+MfVV9*>ZX(978AD_%O^av0^_MjQ>SXUFc1BHzo-jlpV%k}*s z)1ZK`zG4dcnuYL;P?ec4aR8&;wiW6MDEQtlH12)0%FvgJB{*t{ChU0jO{(~s{n4f= zB^Wo5^YZ6uI_EV=jjC+i{jE8fRwBj^TknhuZP85Mc zkP#ECsmjl5C=pF@{w=e!lAY4U6!zo;7NF0DtgwS+S!!hgF1F!7?6$CwkKzV_;Ij|C ze)gZ=dmaM_8-S<(E1>0ypi(XnaL^B$W4jg8Adr7v6C7uO@3I|*4ZDv7^DvmF1r6%& zpK$plX}vQOCFXWS6ngSA+d1zk(XoXBZHyPKf`B~`-bZ>}2G4LVizf2tSB1kx7OdWt%N0s}K zug0^0BApfoCAkMI8EO{7ev`lR%f`k5r3v;LM)&ziR&2SD@C=j#za#>-4`Xk=myK<1 zkegX<&eCdh54T;9_LV=U4`Bv>-#2VKCv;Hhmix3|Bt9I*o6;LQyct>_&r6{wP=krc;JSETau2Msv~nbmD) z&7+I%NWyddEilt(rGasmo1XITEPhFwEB^o|D#~2btO^*tE7tnVrIkaIFTI>q`X-zl z`hCF`b_>!ani5uU3;GQ2#+YrUdvp=48N##IzJ+dWOH5N>^d}&`q%Y5^h0QC;)hny!DX9l_$EI|zY00`0EA~5bK?A{= zQdL#}opZZ4>8>kU>*WcTI-M9Z>NU661Hq`!+E>sq1QeENfbh9M{HLLPpW*^qBz{vz zRk;MJSOzRNG*gO_Mp-N#g9vmGz`~6kPGJh*C3j)M*V>B9qt>f~v*@4|?~NSq;%*G* zF(nB7=`YZXB#5+nEWW)p($)^7ypddYZmQbJ^8~<6eA_gYL#bllSDB+a3nhc57(XPMUy=rq(DkSkEbYes~0?tE4*GI6?0Yt({z7 zJ3Hd%OAA#n07dRFsrEoXG% z#|V7jLPkMv5tWJvxYJ7eC*5d4hpNygP)PZjn8&jD=wSImSFuH*V;#c1IbzUDGQC{F z;;~HO_lrOgD~$WCl9WXd5!qN|){{%^x_4t6GdR9-WnR1f#hIgG)B&&*QbbKXjL8|; z8(G7lGnJDWy!;!lJSOY3Mi#v*bcctcY8X`3zeLxKULYWhiYiJTmvXe$okv1@v&rFo z!qX@%RiVf2sVyn!Hy88%ZwVD|kD6bSRlVp^%e-wkFw8A07m_0@Rk1AJp;Sxi>*Er~ zPq^vV_d;@MTi{Qhd+t05>Hz2USr)*0@&X03j`K++p%~vZaX{`QnIkRPIO?ELwvpXtbVh6t+`~;jJ3o`0eTOOj?ap5uwVuEQ*qMsv9E5!h3G#4Lfc!?Tr zmBcRxNqZU%aTQ62;WgxIK&)?In?!eSxH%XC)E_MH8taDxIz`EoSGJp1M6Wr+4dI(- z`jm!X9>+}#hI0{wctR7LsJ0|D6bS;FwMP`zGFTcKq=<2fa%3V(QhgS( zCTHOj-iIGmOu*%APAOsBVI3_Vp5N<|aGav! 
zz{qP~oHLjzToZ2#bB_|70}Y2jiLtT!JmCb4JVZrp;_Htjs9YjB7MSGK!!|NOsd;&C zGurySVQg{Q0BYP5JtGWJfEceap)NctDb_WhQ`>rE(OC=<8cBNYv#c5OQOC>3;(qN0 zZUFr`s9)aurQW!S0$u$te+#+*1`YP(%CU1*`zA@~mP$CZwnO9VC$$_ZP;QCAa<+;g z6B0hVf?rNvAI3q!$0rVn1~-$d5eqUUP`}lS3b|m;h0z%bU(v> z>&r7r*S8bIs_7r>4&D0#!ML;`+2(kX2qY`nD>*~up@UK0J(ckSd2BU=OxSP^kGz&b{=14u)JUrYUo>%aC=j@$J-z{ zVc{~kcY#syVOTM7zu#1w=+Ha%7@4^&e8FR_rfy#0KO0l$P$gQ#_1{`inHH2nbcC~?6^oG4Kl zkxg`HH@GYm7&PMAVjlbCx`r;<%fw$PZ95)t1LlV_6F#tskz(nzv{P%ynl6}e#UJs$ z2%v>oV@E5H-3WfqZli_Jn@lg6|C zh6ovpG)p~yKwqp&y+kGDFp*B@a55VLUc9MOuGt zpF|o=m;#LbF%y@6xo($}FmKu~#q|hyPM{che6oO>D+@Y?zQ_l5>%O2Qsh%_(5v3~` zeN(t%jPvy_;6StFHICv_$WB0KyX12f7Ar|85Wxfyd5&rqyBx$8s|?ZmFO96up(d!f zC~ry^!rt1L9gX39Zn@O`>hDJ7+`mtRTI^fDCy2jKb1V{;8pt+%xh;t->lkTC1*9(JmuCmO39;87 z&?~cU8>xn|sO4NA@;D$9EbIf#oA@Lje+36L2#O&A81MIloBqHuk>~!M&U4a z7C^BXtI4$5HqP>Irmk{Fa6SJ_$7i#dE>#*d*Cx{=sM+=RH4tFRw*QobEZ};8@$EZY zbRTYwzCpFn_a4-+{DU5y@D@MJCRJ!yPl<`LqjE?L=$#3sl zVC`u&niGz*7Q}mt06D~?WO?7IJ6UWgl8b!=@^^_`7as(z-JM(@H43Z!QyThARqrh1 z501ZBZ}#G^lc$SfybgZPc!BASQo*Apal!uF*R+U(F_PnJX&KBvw@2sOTS{FUNC9#T zAh0mMp@kbPPhab1K{j>I(TB|~h1Qd#U8{t>L}I#&5=2r9oi&ApaD?sP4#->Vmk>s$ zujy!SeO2lMT?9DV_vbPUjY8>JX#qqKR|Cdp!>X4 zg$r~A8=A=#Z>dAu%mCbVj#L9v$@=SZfHUj9;`qxz3Q&TCmeAij6_9)VeNT71rpbS+ zPV)a;(|l8QE!WRk0ROEU=>Kz%{eS3DQGN83ZqJfnE5dr)D2nE@pr<#U#%b?nG7)tL z4^4~;h@xv0|5)VwN7(`Dt&~Y`gF15~NVc7`9ZbQGKmZT^zn-XRux-s`+WLW$l0=5E z{lN)8&)5H2fm1nO_#xS^63u&zOpOSx-MvSme1cT@^8Xom%U_DhC`z1kss(fDkMY5c zK43(oDD*(A!%;JUc_u_v0nB@?bo`BC<-c0<%^jk8gZ&F%d_v(14xiuRi~m?I0r?|j zDr-TL8)C$-u+owaUViF{LC`}g^Nz!3KM{Kg8(;vE7bSS0Y;>+GoEfFOf-?f;mHek~ zkAJNu0S&6)wYF;)Xivl1rf&tcqlb-Cuk^3o8g~Vdt&XZdH?EyhW^4e&7UaTfpcYFG z*jW)E5qo12`<`bXZ|KP!yr%5L-g{9k?_;)om_p2glPkJ-EDxXABuZt2Ymt#k;ot29 zWzb=uuN%v)mm>e!Pr`I=w~-=MF-W8_vQF36y#tMiwbSg&wwj`d4cdo$OP`mIJ^Sdp+M2!48si9lTE)qa#NJ~i(MNL38^UV zob0PyVOCt*>MAnDvZWAr74M|HNIHf5d5nxVU+?x2(2@cpD<)I69cMzd9oF)2A;BbE zjvnBF{$V<2cyl9*_jPFS17cC6U3L*druXtdq42BwH&o~s@vzcJ4;mSmpV#z?vb?14 
z@KXu2XT!J738BnS4JGP~7R|T>0TV!>wHb^$mqD}ze(E14hMu^NKY5MK&AMq`GA?7p z6wMpGX)FV7f10#bOzZ6#AJOWbA7W6DI}0Yy@(~h#ZbB)l&6eB?zxT4M@^g7OyeIk| zYRahcm%2zDI&h8jf4BzIxx>+Scg{H}Xyls=^!HEPVObm8xy9!~I3Si>-PoV2L*1`z z`100uhoR^JrLnT)yRzo|X4!-1BQ8b6C~w4e+Ct!+17NrRGFbS0uW)W4%m5E^9`Ba? z6W@n10*reS16s&+lNG3C3!la`-m7YL+DbZV8aY}C>ZpsOgCp2E*U(F`#~_9I;?z>!$HEI8Zf7L}mD3qx{OeP=nfB!!9xRZ?!+_ugO_<0aFX9y^=w4aWi>9nTaM_Th*rKN3e6(Wi2li#*@B$!5@IWM z_iSULTYrzA>Bg36MhWJ`J<{NzSD}t$*`e*#h64?1TA#CJD7jh6;M}BGq)bQ`=*WS1 zwF_|u{Ft_r_8m8pO9!*Ku2qt{1pk2p`GLNusdzpEKjqq{9NlJouk6|d8hGdRGn;QZ z<8}%YX)S)UB{KMf75ZO01wpd@nf3$CkL>Y18o)?cy_2O+{Mj9;>x;+iQ}V`53Zj#C zXKRSs7)Q5RxI?l0Jui+@k6W-?k~2zu@~gz3t|ef{i=nE=zATMacYh5?ep20DppU zwe9D<(4paJTzfw=bea}b7~|{vs;h}P37jxPnFF>2Bl9v9vIARe8okah>2eVeRfu}K z!UTuTtGlml&YH!mZOAgnc@dM@vpd$ay=zA(bt$Ya&=LSv*pPbG+drb)lz6=Z&@|KJJrh4yh&*NmFU%ImO zwz+(aKx{kp*%|xZW63C(Mmhl;)-oi|={QOt#%+ej+Tj{EKafp8q~Ocw+g(2D#^EKT z^}1SRB_ReU$2c#S#@1uTHhg|nt}fU7LZY~m#@Yrx`8-DFAo$>di1y&kd=9f-0onj& z&&Lp!A0d780mvwr`BOR0!^wg<$~49`(~npvI5t?01DC`_5|J8TOR4)@wdlZ0Lp#dZE+a5r~=Z+KwTm7Tpd zNMzi?>QwY(j8#9YtocLf?QLX7x!+##9r(Hs)Mv>zStwnTwZ7VR1 z^~nsbcPbEL2H&x57s~!&c311q$*LsmBWW@h%WoG?H(c)Qmpnd>Xr%FoCm`u(4w5l* z4{h>9>%jduP`RS);s2n*;{B&=CDq&LM;O!{XM32uKfAebz)4`dpUiSxqesU3!J(;1 zWc+R;-tA->A2vKnW&s6LtQ%A{L8g8M4Wn_b-s4}64+u4rgdb05M8BaY<8o3Z;|iEz zZL=SfE>Ip@jd~m=r2Cfsb+(n9#)F4ko;T`rg_D(6u=vJ%PVpDq<>x6BJ}A8mj}A@E z(I1L(!nyq6E3fHu@)Os`WUphqTcaM)ypec!X0RdmDT{RG-f5Y2K)*uGHO)*ok;LuW zvW%h**ZRCIVyIQ2jNVnerxWcR^!w+~@Ls$KzK93c_IXvwcoLbLPd^MjSPoQ9i@Ek( zOzUcp^YvI}HrtV6E~D7jH!Asq2`^0HxWp@}^}jnk2eoIgqoLb7AN%*cdsgo zZ3i2^i@cO6X~L9X#k6hSKXCO?=Xfe58M4KYK)ZohpNsJ8&tB{o~w8EJBg9rJ9>ZZ6_+0WZoXLlC4cuau}Okb z9NcELRYw_i#@8RHUsD9jm0ONQ&}0~`MQ`W`S#Hf~`Xc*{B-~lJPdDl=!7X+qgIDs@ z`J`F-OLLpU&V!{rck$*quWh{h2$}PaC$?MWCi&yzEMw1>N4=UcrH6Xkp5FGkNNb6X z5!v9QHhL1}{iJ2dD4SlEQnUI6L_FJ-d7CQnMSK2=SpE}_1Acm1uL^_kmLqLQPI$~-*~GU4*-8{cEu=Y=$5cvr0`+H_xT-ZcNguXv zdfIjdY- zV$>qdy=SBaadT8?haTxArS?pAc$H+!vybV@NOR3hTy>tMxT&9V5g1s1iL06G&=a#h 
z#<(>lGfGQfEzwM#HT{ga+ZV)6BC3eE=*|5JlQSJ);qFw_v z^e*WlRb{=UBX$K(6?qe{p(^Q_D9WbKT|^VP`Bqe|l!>bRd2a!IQB=C=si}H{y;(@2 zu;?@K(_<;Onze@w+LU;j?#~}CVr6@@Z5xm?PxycIde)(^lX637a((uuCN1W|p`sgM z$c$Go93BuH%v`m6JG}p_zG2ZcN5VbNm>DBk|47Ls|FgRJE~29MNU|Wa zp=2=sp|rvqUFo~Su#`%}g>E~Q#Zs&B$f?bxps7u%MX97?<T!JmpQN7U`J$hF19f$4b*{_HxUX-EFvdw_?J<-4jDFkw0`V)_sHXu$gPD%y#S=qk0KEdffU9 zn#eJ&zP)!z@UyDk&O6lz0d!FXR`QecVzW{wl$yMiT(?YEl7tm2tS6{jlLwV$eAP-R z34$0Ovy|m!0WXt;?Cw~~7qV>tzdt3_f6EU3T?hWyL+hw?P!L|Cn(Op-5wjpKt;#sd z9q&r#Qqsv|1>qk1=dnRyT4L$@#3@}c{pO$!_O39_*8{d`gf+`KcUdgxWt2Ja>Kp+;LLrrhHojjg(A-$BEpm<#55P&R3kR&{;Z83}&qg zIUN4gI~||4gC*)*c9S% zGYt)fE?7j&u$E9Vt<@X-VySKin{ba47nOyvn^icZdtS_sYsc?e*X8QvM^27pB>&X` z<2ow3N+31~iWfp@pw40OIM;4{oedY3nW}!yQhw#?jhof|0nx(eeVBa;j#DDH;}B|) z{JkdYwuPCZvU{n|ro^C~T??A4H^2sb_mtA8gSTz7SEla6toY4k`krX)Lcr2Z5j1oS z4DSvVYzP*Fsb>~e5{Ir~G;$OA%Gu`1j~DWQ7uvhNe|d_JGQ37Qe7G`tV$m1s7cfF% z)`>=R4RnnPTuDC?e0V302!dnl6Ua?akpyN1-g{~H#g!BA0Oz=T<*A`rc-%V866YXH ztlAa6_;5V91y#sZPHN~Y3U8R<{09mUe!xX&*V!X? zV$^jbJWmg#T0%%@z-I)!%j!~1(yX#Y${cJDC^otCnD&b=59JfrFHn1edp_Vy7k5Fw z38n3!K&{F_4&sOi58fF99#Ej_B`QV&14;xpgDKcspuzcy$6B|lH7P0SYdIM)vOR_q z!!W`!TNIIS=c%{0&4){$i=@KH8Ql-JF=p%{z>XChlPFOOf=7&82!r{;Vvbej+_Au_ zw}3CoNs&LoLBIS>bKe3zj{^xl$MsdlGnDjhd8C+X{Q`t3#5?!~c)9+22vZ^IZCbBf{rOQC<+RM6e*@#A8i>-8$Iy^Wg0mDt#y>+hV zx$FgMg_^Vtr-%8glkZf_hVp2K@*YKr`hFp^hHGK*SWPkrpBM z-ZZ+;_V8?VthA}C?pq5PZ_aBMGX-gA>EKAol*>vXZyHPN=S>IV+17BLAqrmGOda+p z%O*c;UipN(?4c{bR(N6pYY*n46m5Y=PO$rkFEvNNsy#ZmaQ+H8%%#Wx7$^NL zD+#soI^m_vsKyh=^~o`YQsbW2Nj2-$O!UQPF)v4!%13lLyXf~c&;xyR9p#Vk0v$EMtXQ}o(TOwhw%-0${A)Bq9SU%kLGA<=~l(or@YNvxLB7yIDUT!A{TDGchl^`sY#gB`*%mxt$ zyA#$KVKllBrm}Iyg;9fWuAQkG$Bj2@{%U&2NrwqbUT}kVhXT3hF$r(qunQai)IhSc zGJ;#_IQguVWY~569lb?|B^888ICAdFE8w>_rzH@g2$@sH&H76|%%$4ZKb-BOM=Evr#|s)>rMa^vlcG znIOWpRMj6YGa1wvREaL8%O`SgPP=X(xqORf$26;4_pC77bE9BYL`9y>gzAXV5(9}! 
zUCC@<(;Z{RDJ*Gv>rEOiE}5GTtss3eqIzXWU}X`89kIS0CR#jG_EA@zOj`JgAqnOn}!GKmX_a9vK`z?IH!dbMc+9rT0um2M&LYyriIlN zGNx(rXGB`N_(iD>US2@V1h+R-2_|9*Ar!90wrUO{B&roY{`#_Hie($-qkj4MV($lK z){#$8iQ!f%PjV~SiM75Asj#BmewGkiaDWx0tJ*ACkNNaJqN`98@))#)*mz81+!3qg|+il#B{|2CD7mJKLQ1rW`Z4J|`p>`WB^SLgFyqQZqTH79jK`)HW3(_E_Fh z2NP;c3mb0#Obe30vs1j?5=FEV%2z*MG5S~(#2wNC%)31zQzsi`noI#6(@i@e_ zkabuGFU0A2n2vipZMM-%nF~p6*)9+t((*}F-Wmqslb&E($c&(s zCki;CM8Hl_cMQ#k?Q|~)&j}o~Sah6OyB{X+^)CGctLIyRKO;}wltJ{T&NZB!y4KT} zj1=iShANd99&92jo+qJeq<9?W=CW9BjGGsiTe1~D>r0o&b~!ZcWaY(&+auMBOfrje zl5Az_Mdr8=7SntO7}oC$YL@fz<#y5wZ}G2{4agzO*yzvi+8sUHB7}6vCzK)SP!Z0Q z8ss6gD0>mDz|@$I%DVCsl1|>ldkk4=tkLZ*Ir3a!T^W9NeBXr@x)kW75P6xWhv|#> z1a({hEGvh8IXCct78_l!;Bm>Q7Ws4Z(7OFE!Pzm7fTQieXhtu!WEy#sqk&9r zD4MoSuWTkR7qHlT+2+Jswa+0p$*Yu#1?vev=cd<|}2 zUmeO`8Tt^mn-ogMGtOlI&)tvFlG59``@uj4AdSnW&n~oq-2v zU`wvOAX2)73XESO5C|=wf)IPGm)?J5PWM0Qfw+l7d8t?pI(EsalB|2NFGF6mTSZaO zP@S2+_(_!dep?+*vS$4CcjaG0IJ@7X?uk9ac48cawAG+F@$2xrEiuDwg~i_eVBV4C zM8I7?1=Xt~1BceN-z1?C@p~Xya;d5>x;=1zD}qrO;j)nINFAKHN(o^ye6z%$051Ow zt&|ZJ1P!+yLz6HdOtadJFMejq2(i=`x&T+d&oWXMGULzDP}HWE2qu7c1sgFx%8Ry` zs9;H=z~o?sJY#a(*rqNERS@-7sB&8LxsQ>2<3@eB!?%yu6{UM#3pR~GAhMz*DUEyY zh|`1x3_&;pZgB(#qt$c!=6lzAS|G`B>iNh6X$GvB*6Mp8wnN9@hLRC52HHd(>k5LD z9|`1BR};giGV2*#1%Yqk8Vw$GPm8tR!QD;|7OwH(-_d3rZbruGv8Jh@H%V;oA95JB z`)q90#0$DsFzeRd(`LSv`T+Q;rac$@Yc)*_|gZP?$@<^$Bh2hMslZPAp@Fc;L0{o*ODg@2&iB zsaQ{r_9{DXCvUuq{qO|Vi@EyM*nu_dVS&smA$J#m*?vaE`!_AWpVU1Wgl!%~=yZQJ z-CM=VRt2#Ppj^{ndQ z>)?lXoL=$T8BxmG#sPSsqf(l8LF6P<9A~#c&MT6VQV*de3W&KnjF9 zxHB_HcXXlX@asBP+A@=gFX7&ou%y$xFR=iy`QwCS`kxbxy|9k1adeZG8cKNelT;kzA*5Ut(*e#X#bNoUm>Py-V+Q*3MKcz0jR354u(hKmsQ#6US3#M@!Xj43E`oF+hp5sQa^Q4#qX!+w<+(bbF zavLec_!UNc2orI!)ujU6x@uHfHF8T`&%Im#WIvyeOo)Nkj{=!DTT&rYKMwC4?5le% zF_=0+Ff|>|1A!v5K5$)J1s5N)3v3MnCQjJ16%jiR;L;-P+aukx^ybidEb3AF+(?%PoeXZP$ zngrzd4>J@h@Ifq?p8Xq#_D@oCbE-^nA7;Vww~8r3bnPzsF+nQt9#kGKnFjTJF$4)@ zo4w`?;5p`sYS!Be*V~yyW0`K4-ueXAJJRvauRsGpFA2?GfG_hE%KKnL+E^gwd+SsE 
z=@MA41YI5CPq{|4X+ZGCm|3XXU$5Didy^4{nTX2XWn-BmM13)RE!%VffRu$Dj@?Ij zWi}LM{T?gOL}OpEgRd)&KhiCG6kYrYET=nc@*2FX{87GITb~pa3&5+!qZdFv9JkdY zEbNxDmu}D+1`c=hTSm1po3vivAS2+CiB2-KU$!_d_KK`luv3HGZ~0H|J~)E?L|hKT z?-ouSlge8lN>uE&)x{&3x?ktIN9tUCd*vmr5G;YqT(fi zz}uygR?6$xlvJAE}^udpPmKtu(6Z0ju#ok9|WPPIL-C|v-Oq}ChGJV*qb-mZ?9GmVgM zm3y2VO1c#S3zJ~+hY=EDli;BPyTQzm-bX`)+0Xv;zJ!3Ec>7n5BK89AV7|H{h!rU| zZ77<~VMVLzel*qzTB%82y3IPuzt)F6Pn%e}eDww%tL=>Y-q>Y{k<@uo0e9f+0BRn# zIhe#E^sq;&=z1*zpBK%)UPV(W-8rJ7z}t-8U*xMgc?19o3zF+*jTLdHyyLWUj`p)= z!grNQdR$INmX%XP-E2D5W%w6l)s@F+L$kPCwoMGKE{7uyh?vymx&^l6-mStrLT+vx zB-^~1ATsVsAY7L@$z-qTkDyb0&792l9Bwc8wDIB+7)OU<$#Mbn6eD2zujqvzypx`N zpM>P<8$|Xa#BJ>=o6NDjJziek7KcVcCN*ggYO`#S?aD>kyo?W1U$8Cs!H}|*hRvz? zz$;5IF0cr%*AF@k50VUV3SZ@vfpQPWL z2j$5YP|p6JkxAg*rYOIrdipTqre?nd1Q0=Ec77G$xSD-CDo`#EFcc9_B>@-|15DT7 zvs~4JpG3~kH7M`PKaE1yVDyn!e+C?L5LTS!Q?Lij6IW=OLO^Ko|1Xh-rH>G#yq~YX zIzVEMy*15W9j~KbjWOE6iRnA(27KB1KRC!2_gyfcp|EludMP3nEvBK{l((*|=~34W z3a(Q?It<=Z@`S^X>`xLD;{m#DTxz3yH^FM#cg#IE_->~w8DhnpN|&bstW*V(S|wssgAbIa4%Zq!I3HeZ*- z3EqS{KZvV{f&c@&TIc>8b(KiZpjzwmkPg?DpOEUs{Vx~Mn4sm02R_H8c5%LPV;5xF z&G#JQBAbIXH@)@mgT1E5haays+zbaPsq@C4ID#8fW8!p-WU&Fe&eLr12grf(p)z+X zD$YkjNezRsbEfh*QLe6KrCnIwDBIEfz3q(ALFMFwtj$GmB=%T; z9SLYKEbIJKirW^JGvt;&GP^W^$`oCbHi)o|>l^0J4f8`?AS=l=B%T>7t{^P1*10L6 zKh=Sj*QVi%Bro=5DwfoRBfD6i3a0h2vDX8gE<^3I$$3cTX0EW2iIkHkC9MWdhf6ug zV7B@@c^0~rat_CJAF8Iod{aRCIHkBx;2fBIfIB6GYE(!B%!*uj{Bp7Q$e^_IV0|I} z`D~kotF{iK8gzyn(DSD&k_QgW)wS6^#aP6*8H`^)?5Q~ANpmu18a{rSNL>8wg>rM2 z;jCt?S)Wfq;=0Ry1+$@HWg%g)X82J@iIi$NOLouu`QFk`l6jA^3dkRordz5U%B_^H zNt!*(j~slX-O?X@Ix*;(VU;x-RzOGCzG`f0HJfe8Vcw@me&kBnJe^Y{cvAsGPYjtL(ZVo1nTbrm%v~VePv<{OySal5+R1%x5_D7^4Wwr8FTwf%n zTD zsOp<1BBq|{CktL4^NYseWQ^krUKN51M=6F=EnQajG$lf~Lb_Gjci+FumgVD)llMF^ zT`m=wH84ap&B|4n@PNNh9~@s9t7}c}d3jiVd;Ij~JA)A`=Z)aSlq(4h+i}<}cGi0& zj}Jaw$W%0WKHL^+CYea>@DpnHOW*7j9d0l8l-{eM|41ve6jygXF4<@$vcf$jR#^Mc z+rCSqwT+M>LE~_&-XJc?ZR$E|O9(kn>B-`Hc=8qq;jx9&3{&eajD1EdJFn|^1jO({ 
zI^z$9FyA?EPejc7Y~%ma!o}2d2p^WvXJuJ5KT|JM1nMm#~KVgb>E4L5T7F-H! zbNzz)ojOBC-T5=-nxD0!F0TQ?ITt10cB&9x$QeN&1q z=*i`+)yAYO&y4L}fe@0gRpnI{yxvor{?f7&VLZc3+d-*c(#lbL-DSAl_w_r(-DijD zo1gX-sk<jHW$bh z!6tDQyE0)$eAR`Kxzqiip8|fA!jf%!MYV|L`Gk%!T%u;l#Y}9;@k3k17>kP#<;aX- zMd-2l>Oi@#cJKJ1bE@#YvXwpig1oTJ*Hs7FymYtDMg8Icn9;n5-~>B^Y(yQYIl?w_ zAk47eRuS=GzK$csgRT*FM*|ZA)qvgOxC!NLgx&^qlE%;bm%%^f*@%GhN>CMREdg9% z$SH56EDCMdx8PZ5MB`7SS*3XMt7t5>D>6f3^Zov-$f&3&Ht-@h{4t{Y2zc|gfdClc zb;UtBpMp0g`bHo|e>6x8gc%0Y>d}a*)dw(N!hm52EOzXe>p{ zD7R0+s9QRc8Y)3^eJTt_m@HpFUg=wamWEWHnxLc+?B-RtD@2O%GiY|+f^mt)emS$- za&W$F&)20@S|3hYdF?8Hnb{Tx1#Djk^&gPmV zfN$UNA?xN>3VWjqDf#da>*pIs<0EcEnBwv-TcmJrNWD5)>q)0~Kw^IigT=@))~ovz z>^qs7g72?Pv3&|YRzaZ6J=oxaMw*~5_VcoB=o%mxu`38z*zQC6tCyZu&_AH(zHRoS zw083omsbG+yw)#aeV46zk=l8Qb|uiqS&>C<@5YiapO;AvkwW|(5m06zt)u3}`x^f~ zT=ihskCt%rvTAlWMHX_14M%Dy#Jw_K*J4c1)!|cfqNXg$sX^^Y*#8dt> zXz3)78GTvhsGt*7YpHJOdk?sVr|Az#QDsitzb$jt8q57C1e`~|r2z8AXAUG<*C9*> zIF>fQ?Ew{r|RxG|id)i~PJs53-|sH6ew8u$YpfXLs-TFiXWVL|+Dv(3w7@vvu(XadVe@E$ye?v9= zg*!!69Xm!7&^1U_NTn_S!qq2G)bu6_h}i!P5ZQdqF8H(41%{$XK8_h$$OHQGd1!_g z1b7PZw7aP8DoVaz$<+UB(?3a}|7V*}C;h+kX&PgN3hsY;X*(d#qjA3*6cH1+EkQ5M zk4}^v^`ALh5`dGoEp6&qcyKo)YsV9O5Z>-U(NSlgB4SN+_`0G1M9_7&?8Wh7MtCNu z@DcI*eQ#34b#(Fl!JfV+0o0RmlG`IpNQe45{3;Qo!~3F`AL_78WZ0=yX%!PmD+?NFI8f~; zMh7qZj-}2)R)mQHbq#A^JD@Nvo3B>Lyf67jk4MQUJBmnFbEz6&`sv}jfs^H(oy%7S z-A$389H`Y&>#(Mo`?w?>1cy@aN5fd*8(p5jtDYJ}EIz8h(&&g|ivy(%SueCpjp%l^ zwsX}hMWrW`n8aVGv%tS82PJm@bnF39;A%+MDY2&)&Vg z5KupG^AXnTjZun18b+P*PvOr%!xUgm3>15Q#+Nxo;QT!$>zTfRqPt8ur6BX{YYxL8 zP=}b-Z-r@q+u-P+3YixMFFPxd-o}K;W%wA8|6jiC>CuKPXqjS} z9X9AW+Fb)wID#3x*()Tjg0>+o$jRQgL#i+)e0UzUDiduFXQ|5$^3)D(s5y~v=p31MPr(ZD2m2^u3E z11=C7fnLH47J`6p*2ARslf$16$u$bJ2e*>jTK`*FfFwT-VFmGonsUX;>k_w5QQvuo zzhqB<@4awzdt5RQ&>jMx=<=v{$%<*br2Kh+qqg~H1!Q120USI9Qc4xF04&0npcP26 zp7=K{UGQ~676tm=wOu!u;;lX?7nuYss9Yly9<)C~hJQ85nvw+JUN1Zt5)AAWOh4LP zrUiyij)H^vrug@x)ei|R6dvFDn)@DSCZyuk7> z3shJQ%TGN!&||JFI1;4Fgs;3^4*q 
zr@h5!YSEq8Id)QHBDF5zFmBczOnC=-!!C!M0dDsZCXq85s0dlIlU-hCitdFS1)=@L zE-s+;NyFSv-WAw5ZYHyfm85(&(Dx6Ws8+ObEDAMa`ow309pp_e(tehOt2fl`k6HCq zJ1(AIIoFG0(bZNjd_<2<;rjL#j;JNZ2@oF+JtF8@d{$Gn)t*3s|33}b%qLTr+@Q|# z%N&-8V`u}Q9RgIMh;XIt8_;jWf0ld0qg3OxZ1_it2z8GFDG1-TARxwn_;l26ePwbP zhYd&Qr0Dm1k#gHwqfJMx!GerX&XP-%x_9vJvKFQ?K_W;Xp8$Dobf%xWai2jAscJ&3 z8`^Hh9RW55_PVVv|2S+n5lgcc2)4Jz>Q)zT*^_#aQ7pV7gL>JFpz6mVRS zvj8^zS4!9be${eT(%ffs$}%TJ9?+hdpohpN|3>Eve?VJ?K8uyb>cy0p4wjGK`aEEA z8Ur(_Tw*LOOA)w>|BB5hu*`b2k3ATIWQX&xF02tmc!|8N_H@+VGhTL^sah?L9yhMr zlZ9MXLrfh5&gDa20{#ISFPD1jM-Zh9gaqbfd-L#1nE#oT%5v~An;QCibpgp2ulJG2 z0x-SpBF=qWiv__>G~}ccBwCtWyNHf`Ey7O1Pq^myEd|FI5+K<~7igEi+-wM979)^z zVt_mpi%kEZQmp1^MvX!ytKq_OIn(W#rZjJz=znLak1!uxBGcbJ^!NsJ{D1UpZ?K|n zWhE8D?I;(Hkvj$=R0SoF+42bzmUP=5(+8uVQXqU6MZWFo5G5W>ObY(+d$l|%$DxTl zgFRrBN4gF`TJ?g=w(2t%PI@I82b!;LK709UsocNB`UTWwq$0&$u~_;EDxo--lG^UJ z1352>ozsPVQRDAlS*CrP?QAnv*2dv88_D0rHIU^A81zjNosBgan89Z)4}-T8*_$HVrrB3E#?e1PZWSKN!fs>_bz z$Nq0G0N+?{yb({z2s~PeR_4lym&6gv?kB&t9e_E3=;l;V;^4-Du_^@Hi1z%N&9}Yn~ z^x1D{?wNI!aR7l*FAahf=6&mX2pPo=Njax~Xhe=QJtUS7m9%~xw7(Tj2qHROI8<`2 z@vnRb7TBYk@cJ}zLL8ai!Od8APX4aGm7rWS_izR-3Qz{d^R6f@)tm;R{f_+(K}X3? zr|3n4u**62fpra`SP{lw?#m3Tg|i@(l$N4cY6wf8{P?2O5&eV07rsPDsJe@^M(?u4 zoB#IK3`nG>NftQ`C*@>Vz*LM9Jj&T!8D#XAc#|xetCAgw&~@x?0$6n*2uDJm>qB6K z$-lU3Tf-D!j!1sCKe_|SL}zW(Qjx;$)1m3WGNs$xi{>`BUbOfe!ZxX3VW3>+z0Z$R zZFEmx$Z3$i{IBK3fGjURhs*o#>Qh#@%FUM-lI-H)I3~%C<`h)ant&dbXqItH6e8UjeCG+_>$>&bmm{o<2wtcAi8{ zuS0Fw1}F`jIPF2pU`TlDE+#5O@Gw%@pAZ0IA-pSw65mY`!%_XW?7e_`2s`KYRHe7F zk6cDg((JX;GC>D{L^mpTj{Ed`Xpwv?cMgG~W+GQexv$m5yLbahr?tqWq0qg$Z(B*2 z2drX=vvzgTU8IA3(Lz#-TDX**e? 
z1@igtgubw*rK%m~nL70$p1{hy4yBAWD*P&8#OAd`21o*o>iLLqL)_Hlv@Y}x@TR2^ zl&>h?fb=%xaRKl5J4pMXMuWPBF{Bg~{K4`&r}6=N}_HOx|= zI)J13YPOy@S+7t&1PNyXv7W044AZr)Mby6t8Hh_kK;Oaq62)Bd;>t?Gdyq}e^Cy8> zLxQDt>j5I>SyI3L@+lg)KGeqtS<0=3;~^j!uO+Z<2KPAdVU9d|F=SI%GZ}YH2qi+~ zSHFUg7K6k0z)K-*TR;+beE;k}4^%(vsK00W`95OI5Do9|Hqa$>mRZUWCampy!4a8$ zmQ!xjf-P#$L0Ih2lJ;N?v|G2_$Eg2>2jLl^fC8l)in*)?%EXheS9iMjH+fPzVYg&w z&3pgHa{{sRsv-SI=?%OdHCCjOvl*|@A{~smYLqN!k?VtkDvjTxM(ysdH8oIWZ=_XZ zAfiVwjE%V@!*%tlDINcV+53;;cRn@j;)iQIXUVkkyA1&-)b{YE z2o)jP?#)iAUg2|H>(P>1gp=qcQMVty9>MahVYd45C4i(fnJMOnLK4@*U9mcl#hUK0 z3Q#5$di5C`)U4lC`z^I`>=xi&u(Un>D6G+A|3vP6?E|4d<(sboLi$&D$Gn~d<*f-E z&cp7$yOc48={^s8i?GW!L+LFF0(+{xF_-J4kW?fdMx^z6=(;u{IHS(jVuuK1f?9K` zX~Zv{g?M*3F5}UAp@Lr`?p_zq*NU`4Q#VL4Ha}UlF;5^`LPGcujCI0bYhu;suLT&pVZS5PxV8Z$C|7f4w#RQ z>2)m-F6$h|Z7O-KLkz3YoZ`NRliV}jUmI->v`K3?bUy_@H*J@s_@S;9QW zn}_-u=DN;G~aITp3Y@sMCy?8b0@tMMJi6$C;Xy90KPjKUwx( zKI=qonwH11$HDV)7lPeb$`f-9A{{4G2HVq}j=O28X5OcJ33m>liWK`0-TVg8))eEN z|C+^tq!GcS|LKv_{47*`GZcdlxhIVqb3;EGq7ouo7DQDs`46tQF?yj3QeWFeSB(1 zCmzF+-u-j(>9oy>K(vz)$y5VuW|$;z*grSFQDhV)psvcA51BPE>h+JmJi-YQ<5InT zcIG&f1=&uvwzX|{zDmEU8bv%^>sGpre%LK^Zf%Wn1d+jbW1`x=MYkI2S@HU4A|_2% z?Wb((?lQ5qI^prV*UBIv$w%JbX*Cxw5~X^KdSpG?^tER0rEK^j%a6cZbKIUdCR&N` zU>Q+y3_@Bd+*Vm(p#+HtQbOlrx2Y0v@#(f>JM`A0ILV%XGKz+b%es|6yg`4}%pR;Y z8L+xNDbT0J<#2ze5M^X<8&|f>ScJx5`s$3~4E$^2ll#QHZUU2AA#^FxG4g!$>>a&z zDc=S&WfO8tAO=zc15)+KeK`|qLKI&y4C}EnX(x|&x_xne^BoD59bbshfi@9c;OZ?g zk?n0bese8R_WAH5R8mn{jWbKR4oztdb@|sE^y+vTKb->h5vgzRS@Ju$s{`;ESGv9tn{>`jaN`5;+lH609N zsgXi-QWZRn8DgfjXuH*=VJ$8&93Kvr5(8hHo`Z(*9I2FX`UuyHY;e?R|BX9LktxdV8}JH`8wp z#-@}gllUKmVjJXsg~4N%!y2Db)>gDT)Ei zp`BLjy7~|-n=6E3*dn-VUb}>|$mIq8xUhUE!hSx%NVPEexlg!~NQOr5YMWtM9nC zo6!cnKh%`$6n({(rM0gK{DTzy0+2Pn{-nwLnVfKx%C9L6|yJ z=Y#ZfwcgyvLO?tM4doWA{frnd#O(AhRfvbJV%#?DwSvSMnv+S{Sy2XhNiEoB+3luk zPZ$Ga!1XkM@##bAkN<3bb`XY3h*n0pffMy8ufz+EbUj>5uSi_8P?p!Qo5TJN4lR7$ zws#_nNt{-RPog$k684H 
z-r`_A!K8ab1>(+lFv0Q^$W&jze+9^^)7l8z%C{-EMRMNR@Oxc;p7fXAbRKM{Y2z3*;$ia<(oFSUDLpVXd%kUilR_Od)i&cpW}>sIc(YwZ3d#}! zCc6-eV*$#gH{}O3h8?@Fr<)%CUrcZxUUIG4Ai!r4ZTGsv=^m#x;=FWPALG=npgjit z(489Hn?t#3k)&9z)VjH8jOx3tyIeM7*Y$%h3YutSXScMimS)jwG`6pQ2PBfnFM6@> z$)@wqg-!=RC&-GOC#*PKnRAN<%PdrknwE33Qy6OeqHm^U&#$cLe{=$GQgCSIO*@PU zNR7=nY99M*A!EL_m_sV$YT>v3<(v&*Et;74T|%X&-9mQg0F&E}PeX}y_a9;w4)v>} zM8RKk^kpQXC~QvpC?w1CncwT!0pLdiJAlVS?7?O;q_J9+MHb&@TQl;!1P|v3Xoc9L zj^{9!Fkc{uDYcyWmW8r4YE7T2!g9Z+(kP2%6CYj-b6|05y0IgC+1rREs`hXpNhzss z@cEJG1^bOO8k&p0CrSw-nBG9R*fL|s{8vJZ!AlF3;bCl!P+WFam5bcs( zEgj>nfp^`E%R@2(4@&@zZ&nveRcPt|vFbQj=kau0kmFZXa|a(}RGXh{w-tmn%yU@_ zw_O*yl9N3BwR-$gYI52B*^7>_e%LEdVbhlpGAlek+TIo6CbZRX(E-%PI@WG4FUuv z7Ai6K@ps)i8pXmIEjc-HHqtnj!4>QeE9)IYQw|iKdtoDD(h~bwiS*J(c#M^cP!bGj z4!)-&`%m3F0=!Td9SSPR17U^G5}jvY%cnnt9ghH^B1hm57!KwD6Q`t4b zPyw;?3U%z+<0aw{a43e+t{7F#l=aKUEca)R$18p|m}v}g*zJ`ZUodh?ZGeqTxXP`1 z@XH;-v_afWP{h@@!aiUgDtV$#x^>#O#bvv1uj*nZJ(RxWF0jtUZsC~)85EG*10Nr+ zeV3I1>39}h%NwzHqXEqw167It32FNPJ z^({nl_Uk)iBMx`3J-s^j^6daUjlvwC190LsaY|0Qq-QIXoA=W6XD9D$FH5^K>r{lh zH~3(cLJd^Aoyk1!pDy|j%IQiZL(?RDvyPwLFsAhR^6%<#R0fI}9f9uJ zdoLS+BZ^pOu0~a#5?M(ofpJ;nv|qrgsO}(}^!d~%liW>^XqqPob zs&yPH2qdk~fERb`)bg$~Zu^Z87B?{xg#+UCC7Azmy=H8yW0DU=#=GKo925rm1C@6A zMvV6P>>{ln;dM-wdfvt7nqMh^RN8IW@|_%oP-m}&@neSl1Sri75(2LN9VSj>Z8s0f z4Je@PcAPVQs5KKLJYx5>rj3lJMEhv_vpg&$6W!>BeIuU8t2DcXB7j$T zcUoy9{H1Bole;b&(6oFJ_*)*6!+`aPBnwi1_wi|N?a?H8AwA25r*h||%*sZp#|E!a zhh$mI6p)_uP*md8$B$S2w!StB!x2y%$xpsuHmmDIWM}40#sa*khgWs9k{9vIwRdb- zjUCE-*D~wr@jZ^AB-@%c_9%o>M6k(XqpkRc4C$CFO4EhJlQjiF`2TVpqf)Q}$?Ce^ zJQERC__i`ynr)I_!yu*biY7Cv0lXa4P%+-Ccsdd>Q#BhI1KV8bcMk&AN=nBnz>wej zia%X4Nd@DKWCw!U0;U2%X ze)QFQpXWFt7KvhVHcJ#rGsLc9Sedr#_YVv1q~Svu0pEMw_xtzDI-w3!;^-p#3rh+Rz96yU6|}T&?oy5*LoLJEnQ~OUEu=$z3PGk50gWbs*lzgaBqS;id+_ zA2fhULyhvrG+!grUDn9gWaHf*F^Y&Y2QNDVBWwTGbMyO50?(fMW-<2pjqH3!0X!M!ZQo6_37WV1hR#Xa z#z%n0MX1rgaM@6i{SJgn9B{26g@#44H4gF`JulJ-dKPsmnGuSl_a75!`NC*IUR}a6 ze&>~&H4Ir`DS0tE;UV@NE|MU7w{D#Z>|`tJG6g=v_|yGy 
zWP!qXb@7O1$tPPbFTq^DHi<3!m)|ia1FpvUC(qA4LKNL%sC&e-0JsWGv6hk%Lb>{B z8uolzkP_AnU}KAvtreYLe^DZ>VWu=cXEEx|guL6+|7X-5sh@gfCT!(T)21v~RdMGO7p-P@6CT(H@0N8|anVk0p zHaTCbj$Iug5RMUOX_pp^8mN|g=TSpTBpXwRYW6TZ>L;V1hRLVby06IMSO z6i#$CgkiB-XdDel`D{)S>(!=)l7(0x?_U{xXaHi39Vlf;x`VKs-CY~r@H&S7lHHE3 zK08{CAo#{Ze~-elt7D~JU!YTAR!lr=weTa$4;3ET@%HB)lMh_Kv9)LhrserKJCo82 zpb4Jts?1&pZk>XL1}voAusQpUXtmx(^3!=_g4F^sok7cu{(*OABbmjd(tm+hYa|mqoH8x^bn^+= z6t9aE(V~ACe-B(e@=-g{oXFl4_OvEB{nO`*PH_iXC_PqF*JFLePxH1r!j`hHK-XQ_ z)GH+B+YW84>lsiOrj528<9=tyLou_eVpksY5jHSHirv17`T;#Ux=#|(mX=&)eEX#` z#a8CuRrn9G3b#3a?lSvM)p>LP(ql+c@Aa+#yud*EpQ1VGMA~+TT$vJZn3IAEQsD@_ z6~j~ZJFwzh*zP@p-o-CdT-*UjXk>Vxhl+-%er|j>&``V70xvOYa@bDvu(bwa0g|{s zy_W5{oVUHTroO-O`8sU5u*RGcasK8X26&Df^Hv{WQ{0}rI4*S8SOB+#bdyNpZ+z`G zD&klKY&_B0Jzb|BA=eSUX@^fqhi3 z#M@sT?q+X&oBYHykt#42f27(m`w_&E3j19LbTCL(wgNKbVCUxSD_m;YX*^Zy79THo ze&YCB72_gA0>$C54(XEa#ZQu1NVgkk6h&;YK=G$4!qtTN@9WG=@DCOw25)y|qwBE3 zKePW5#Su{T0%s^Ps4uCZALA70B$SeRMzp!7T#W`i*=K0q+W*7p_1L&(Z%D_c(oZtJ zW6tAwzTAE!Fn8j{K&fVlthfvV%=KTQAU{MQ(K=;f>kImUdFYY>UULLz=s;WOPQ6Qt zn}UYIjS0DJ((*bp$Y?zJK(=H)Ix<add&w_nL?`MUwLajH1G1G+N`F z2W3ScNDELZG2TFp$L_IjE$aw&Zd`%AGR1cfAaZwM=Iht%ffU^?w+CpAL;y*eHi$1G zH^Ta#ZzL|z1cH^v@x@HqWm3?E3d%trA00HLn%GlB5%}>0q=&Z7apmi(?^UH?aLiuU zQ_Q_Z=ljJ;4I{P)Gkshr{7+XfWl@*|oNtydp<+Uwl)0`ep^9jtG9{xZ;)@a3tz!Ho z(UGE>ps_s+xrRJ7_+I~~;Ss-j;t3pqR`bqZ zYdIpG@)<`%TG`Z~va&CGbbv?tiEiOHXP=()`kB9e+stGM|K`EbQ=&N z2F1spA+L<;LDaK8XXX2y`=qYI_Z@hWMT~J1S(lHUNR$xb7^W*hhbGc;fqUAs6fwf; z;qpMMiFtr_D>({iz9EnjP#zM(O|X@g)WJFjEU&`%JM*P`J#Y_S`L=sEpXc9at$L0? 
zt-kpHNX3;Gn$37o!&B8-LU>wK4Yhk$%yvo_7)%v9k49SEf7~a`?DjbFmnHU zs1Rw=msZ+aGf=*Bs>f6Qo~mHN_#eLoXyBh*^vA&CfgOIth0PbNh)51rco>J{rjIZp z>j807ajaNt3DxKC_^^Z$g!``iPB_0_!{Yrg+48*Y$TcITHYc56@-E6XIZ z{_apHEf%Jb8X7(N&I}|dCJp<(>wkVd7Jf^CyrrlV>F-egEaLC412P?lU^wiV$SOh_ zRDu3970RKr@H;nazpade>E46TAPp+oHr#~+YL5aB^oTG-as zkg0B`|7QqqoFYmRsFS!K7RV*CBhX$JHC2qCZDZx_xN z_WKP+|GvTWJ5h8ukf%IUVMH3kK(!Cr-@hFMpMQjLo0ylj`aO!@6_BsLOP1b6^LrX_ z{hkJmz2lf#q!+wEv?%hXFxwH@9ACQ|F%CMF5?{YR`_+<2m=r_RZ!nJ|U__?xn^N5U z-PB;2t_Q?ui9icjP+@4@z7c5$hMx4ax3X~8N2t>G{@fK&8+~%2^zTO8H)s3ofA30b zT)n9FoasQb@8uuKyAsy#vcnG~T8txSCN3yQ9biXC^Lwt{5O@&pp(ozz5Os^07ny|N z1et$Q(g-rQ*Y!<_FZ_Nfn*Y80{|?Uo&Yb^UR{t9m{x@O*fc`g~?47PLaIepe-G~%8 z!f-py{b)13>e@uk?%n&_gRVPP85@MIYo}A7G|o7G5aB%r_U_tzM=O%C`k>6QeQy`3 zf`v(@eX*-i*q=oBQteAJk7nkDWb=L=BL?{?Pxl-S8`rALBlOg7F|x;#cExm-zZ_Qj zm4AHqXBupY^wzl?;W=ce(5P6e$)?4&YLafrL2Y#|8u{{+VMsdT@JVtFXI^^?0VNhR zb6T1@5Io%KKXiMDsb6V3C6+?zmSl4puY273l2B+Id9a<6efA7+k`G#@3_Ls=E&d9< z7iOcq5<21*Q!R4$#{-4tms~WE4iB-3C1G+uKj|@<>AFc~@cuZCq7!6twxY1iJzx2F zZ$3~YN_)+lC5id`q!k zt!GtPrC4Ee-eN4fYpS>@JOBD0e$DqE&S&phj5L4pmVJyKXeP0nP?6118vAIg6My;1 z;X2ar0Y{&LUh;-|r}DEMkJQ$^i)*Pk$j@e`*51sNxmFqLX+FWoZWw<0)0X~C)Q|7% znI^r*-6QcU_vY9VdFE-6W(g6i!!IgRtGg?-v#l<-uLVqxB7F$5Z5-X!Hl`{DUzgQO z44qVbMtX55w|$nP)x%rj+){G0S&`Ft4ULJffaCvP(>&;oW&@)Mg`l16(l^`^G2D_g zAxfm{Rb#tIYp8gOAP#j$imaD9G?>PCL3s^IGLda#;YHtQ!pY+N*3hoo>e!4WhP}nz zM&j`T2`Bmat?g0KoT-Fb>bK>Y@>?Aqv}p$-e>v2`E!m+pPKDlG-M8Yw&QFxQK`+b=qflKR;5IgvQZ;3_x&tpDNXX4w3Tkb#FDTSV9lC7`dSxjac{i5fe zzbwD3JPRWM9no_py4ZXZTzKG}QmFhTTLs}Tj}%!ZM&ucKf5I4-QqyT7 zb?$t3L+3(N{aJhgOR}6wzPS>;{=Tq|gu8ZP%Ep{TpUR{58?HV!D(*CG&}7gC`aDYt z@+eD0P?)r(2tAGDuzV8RloAs|n>SogIYPj>^k_{!tG8casPO4xBQtqY%1zBODZCEW zy=%=^y=_b2CsYfc6kTJ`>`{L9b7*~McshxBU+l`I44)gyxo?y>$8rNIC+)KzdFXx+ zdyO+pubfk~9-^ad@Rc&j_@iw0NgPJDj7PIJ-s@gKw?wT%3PFw8?v=pf3KGLFpR27}c z`)qUIpr_U8@D_1LP`@B+BBBeFjsL(gN@Itet0@mXt#K{l>$&aX9_my;ytocmJR>WY zwe_M{@`#vR&K%p^Bs+ge&fl5+Wi&$4=IPa1l@vGk{97JBhNs&<&$p9oCd1Ea@*NE; 
zy4Nc^G@NBJjfQwqoYY#zy>Y7Db1Gl8TVh7&c^N^R_5=Z`&;a@4FuC)!5qkI6Z*=Gf zQ_)|{o~>>&m$u$DyWH}_=S+AB6j;bq{Rbxy@2)e9mmq5rf4(`gi6pxMK zZ&rW?DSvda0qcM%9(0G91eyz>Fu##9rMF@I=d6MSZ7{=))(n!fwGTXLuya4yU@x{=!WV-!snC&;D0$9(j9!Xs`rFe88(w3;Quz`WGN|b zhin5OP`G0KNA+oo8kgi!QlL_)u6F49MYO_A1p4nEj;H_K(n24t6b-zR2_tFQKelBt zdgvXC=*X12?x+pl{@IjfyWYPA=`hDjTbmdQhwZD>R<`@O+ z(b_Z0Z%Aj04)iFCZ`3r330$4$WKy;>+g3VMl#WJ{I^DWyI^e_pP*A5-67a zPdE!mbBlTZ7LEm6yb{A1>U&7biwXZocve|$2MKsr5;_@x26E|GZNGfnX)XOQ=2E+K zYG%EDojUpsJiAz%(%oC|J1}fJZ{0>PAH)YevEZYI(ZGCMS^(W*!Y-u;!T;ZXF5CZS zt4;?0_sL$(1)Brf<$~6s9fYD8(Cryqw!|@|$tl`Zo`()d>kt5ZCE!c|YfFE!O-#fV z%4Ijj1RZZ{R135@Cm!E_F!XDC)&!I~eHAu3;JA~iy2?U@d)F5PeXSGKj)mE@6L*s)i1fKvu5YXJr#{Bc~%#6FESWvIum|v zrx6jmbVaI*7yF965B1f@moL9+_@NOQ{(KOgOc!B?<7?eB{q`1` zRVxDRne%jW>ah*gMKqraxED}`8xAmHX;4sbXrqUI)+pDVU!L4D!;vLVGKZ5sSO^KE zm+A$d9mOszE(Vdh0ZsUzyZuu_Vj_vVWb~cKn~z+71Y`yT1iV|Uvd8uX+fSSeoPYA% zTUl9o89veja;yi`yZQO2dZ3T);YY`zU=jd&;lXBRMn)_a-Ob$_+`?2uXn+XE^=Qtp z(Y0#M^$4UJ&f~B6oGtTbew{rlPAj-U1fs~NQXfKt^Es`CVsF#YC0NaNa!+3= z|4~kctF@tJ49?K`W-RX(l`J_-Of|e0Kv~?G{H#z1D1~aw1Ehzc{UJS{nwZY7}nOa&}GNPby8c_(!|3U z)+q17AYVX(iMItuI+mOm=K4Tt6C+A0_dY+v^LB+s7?m8PFZRGUBax4E&O~<_XjMz9 z#3+e*dqDZYFGRxlVXB~RB}L_}8RIrJw7RBCsB)2EJZ?3!GP?BFBU*vKcx9_g5mSG% zBm2nUS{Oc!gp$~Ov6G@cbFm>r@l?D4@-Y0c>&nxYUfd$wDC&A;-FIC1yd&@mNm?yg zNGl%@#JRy{**Zd;oaL&G`d3!}A^G#f7f`(n$@olL3}IqoJp~?3KjWE zH$=W?A2Kd|)CzMRr#P)dcj{2WK>J1mCsHyw&EWfwxCh{>DJXw|yR`rRWMx>Rb@}C>Fl? 
zN9>7t4Y;nIQs3cXTo%4 zVB}t2nXGm+)2{b=R&7SZLPSVN7{_QgT9U~w6sE>GpVG#Jt3!+QSJqg_vMQ_&1gDh1 z3|m9nofo$%KI0L%K+Dsc6`BN!pmiDn$;-7nRSc|f7%9;J90S%5|4aMWvsXz1m1stYuXSUj$Cf~X=i zCBHr#oP2eLHriRf-oQOdqv3GtQW6$WZza^9h-K3gb?+YFtns^5L!$ORcLW61$HuN7w+~8Fn{bW&tlEkosMXo21`H*Ic z@^tTmcfk{JbqR?;KXETFFHM44tb(E<*8D3ZKQY27g;H#5mT&i8W&P1~vn=?hY0Xtt zG*k$AjeZw`^EG?Ed{oDSgQW>Y2A*E?rNV=WIMWms(stz@rhW0Zx2E37=G!G{d(HO6 zBg@|lGNgnXSkQR5x}e{#>HF%p%}t?b(8;)T3Hl3!?DAOh+SMg-w2_T7P`J(E;nOLu zv@^=rr2|*10s^FV0q>a*x}_#!eO1Fi`!!Y@{GcnbVUrop&<{s&gNg3urNtwrd}u=R zk-`0Hw=a@YE2v?SaNX&RMnQukvM*qMs0tG931*Vu)2_{ziWh#AFV)(5H;U*k<7hms6NBN?>VU6XRv>;s zES?B&P9UJ+qC)*UMg{G!?z;p2L$i3xTpPY$G-pkyS>n;d)g_+){-Zj{){|8X)Z+YW zVvq6-ugu_FxWF7MEheJ6bUT<+ZwNU0rBKGg;HjLsY*11vH4gHL243kE*o#-_IR#Og z)2;beI@4aq1@KJTqVP;ib@6z_vN(jXC@54^7h&qLD@sr@Kn_s7`Q#ER#RIB%SFPZ% zdFromcuh+T*BhG{E|S`ki&DkM`N0rPUMX(um??0rec2Ya42d*qnd74r9UII&O)dK& zAb;<^Cn07hiAV^j(zJhUdj|#0GYqog^y{t!KmXR9%uy;QTwGTCyZ_i1*9zb>XkYTL z(wwT$ua!14&ZL?#Bq^4Qe;0we9+d5x!bm(*S-W_($Y*3=QI8r1QjW`zNKt3(VuBVW?N6R=Qezt&|^i9MRTTFU(i}W@${~i z($+8{Auc{X>m(^QC1nGBZ$?JKrZyj%y!63a{#2{xG(Dl7IaSR1ZEWu5Xf6GAD z&D+_O@%iDi0^i;h4Q4L|o0xFpg%^alqQ`2TDH@aMjP6N9%e$$?-AjUw>05I4nB=X3 z5~Cln`_BX^}Iga(*~e5}Q1GEuXgCyCo_) zLphaRxm?@Ca-uA<=&^w1E4?s$0>(qRi@`cXzF+(d$10E4T<_mmh)<(&WD9Svp#S;l z#{it%wrkK2cMjC zo_e!=;3{~3oT3wuY3Qd@zjM7aHBNL@{YLr;f!UJO6w#6G8M&&qHoh9nvrw$4bsUA{ z`=bAtlBYE};`Hx#(%n(!DUFSlagtAXJ+L`*G$0#Z-6+O}A&?@|726luBpKkvTW0@| zh49S~rAJ(&kXgZ)TDBnZVES3YK=#0jG{tau<5RIR7hZ3pAoJ7EM||{aISCAy_c*Q8 zBHVFuBrmeqP}!xj8h;;oQr*tVM!* z5&8vD(5g1A~JBsmmH1Y+@$S5Vk?@uW7L>o)wcZC>}^ot0b_jL~vs$z8cs{) zq_N~A`8~41aI(4?T|5_07ds&_es&ziuCx0;FqY4K_Gq!oEj7|j3&<45=66SGj>)vET#Tz7 z%Dnz?of)sXYg_H4eb4tOUQT&~hE z`bZqawCmj}i6n2!cJ6q}fEJ&+6%JC)!TwKtT$j z$_mrFrp~#J-9)M?xR_E?;y!IpQdC@RLl$F|Zzb|2Plszeo39<=*rxMP3aUyO>9=Rt z7;=WkHOjB-RdS?L4`*swG&@ax@xwnlIZftR3=&lu*Q`CGEHfFNbJ$dUm@BEVPbp~; z(Z7^(ZY8?tlBc#oer?3RAtjk^IoaDfA`zo^e@bg)$@X!cc6nWCdyod37mqtxfwsFG zqpjSbgay6Rx&F9PM@25%pySTGU%#381V4Wqp{e>&t_pYJhFb=+^L-re;1a)d{S^v6 
zMd8iR&B>K|wu13&hG{W>3HriZY?=Lm7|wRc}0oHC1;Oh+YnO!1AC=!&lNmEKa5 z-#&{Ecl|sWOyOoowz_hQKe!U-GQHXdt+=k$LFLh|HIq6wtCm+U64}lJ?!T5N#x7L& zdOi~R@|Nu--V`0N4?-UkE6hk~WwP(qCSjB7l4Ui2Px+wSFL4^jewKdDd{CjPe&KoD z^KcV1*Sy77N{M)tqfKKdAG+MG-q`KQiFOBZqw&I6O9Pi~;@l@D| zdJSqQY&bSrb~}2MZO!dxU(Iu{*mX>zZNAgn-(pVkq0GR6n!fGOhAO+UqU=SK-N2;G zb!l+1PL_$~e;Z3E4Oh=&yiYxd!Gx9M}EMn+nUJHuWme) zccV|`d`awT|3sMDQ?Yz&&mX;4op@TNY)<5Uoh-LuJ9%x675m!EjvpGvjfQzon!8;} zR=)`mH7H1%vADFXJNiHf^<%|+e-7T7#>g?*Tg!4WcTZJGOZpj(cKQFFb5n+=7{!J* z$a%{=^-L%1MDMXvSq?wU^h$8K8JRwx;6$^@!F*ytJ3rq{^x@0hqb3@<7P^&tMFN>2 z%t2Bc)I}DZwqsT7jEao>wC_5{^*zGamNQ4Rjt^3@7SHVJ@X-WbPN;XF>-I(m3XQZ| z-cwq2w7z?0bxw*`+9Jn5&zvQL`9j0q0e7QCZz-5r(X?T3fcK{1htqY1Rhr9}hHEJt zx$3kC-CdlBWpZiGag=Usu2xG9^%vUg=d1TIbn^-D*OW5mb^Rjqm|uQa_SiW^ok?d$ zZXn|Npfu*1jjX?UBLT)1!)SKXf!FR6>+Pa&Ed6Xfc7bzs;`d6j8x@AQS`y6957wa2 ziYLF5cC`g}3-v|0#h&e05km(X`RT|L!s5ZLFCBbJ$wwPZx{;Sk-&}O_YaF|lQ}CSd z$5$)inqAAxMxneV-TRfqDA-Fi)*D2I4mXxH(0HzUo_BN;qC`&O1a=i?obC7Uja4r0 zen91OTHFoIO0DM?PW|+;pL)5Qs{^}VcGEeNK@IQC7*mg2VC}Dw%k2}!%-A~Or;e;k z;~Q35y;c=M=iMz@yQdk?Dk^mrzr#FsYGZ?;?wU&Oj=?A8K))CD@Z_d*$f_Er$ zG12lXgmV=w;S&!!h$)<}`B4xL;N=mH^N$5pY=8Ka^lWLg2aDP<%slJ1KZt&n& z8~RWv;rhO9!hrv;{o^L};c2cuk2xulRF;ERznbM!9nJ<;9&>ZPvpTQLrFDO+RE@D! 
zvFCk1nL4vyGF2$(R}cD4D%>b2>Ov*+nqSD<;%f!%FZR`vkmWo-R}IYwyI^AorAo5a#Y){XtKx_vYYg-+l>$q1CVz<6%A04-%f2n+ZA&uCvtPL@dTUeX@`dLc_T{e7-R7h8h>T0( zvE_bv%%Fo6KPWblKHS8u&Pm8fnR#KufR0xDet9QZB>%|Q4-YFhRsB=0ZEr_IPZa=9 z7%2)!CHbnwjpr#B+)3hr!k@w$_jkciR>S&m- z@$PJGsW~++G>pCf=02>mVIh)m)h7P{!$b-9`-RQBLiA?Nm%~1@CAikM1t)(M$v=1M zkSS&uJ0z54X!Q1~5pZ5rk?iDiVa8+pMp5(Z);xLXVi_;3-;PDLzgK@xYo2RMI)<@X zavI)2P%6f1Sxzl8XI=Gic06AJ-53r9%vnD(M}3Pmq8Cl$y^+BxSTDk5^r8&~XtQ=nfg`#@p4SKPl$n`4VRc~+DJ$MSRDMKd{V=}Xxn;bffM4b;iGB1VVf z)G4^w&3Zrn5BA^Wz=g`wssw_He9q-*e4*%{i}Yu6tSZDqg($9nASyeWBW} zqB>I@5o31Zn}4wIho2Hkjh|X>DEwG=6C6!Gjuda9b@ZnV!B}vgJbs2t;ULVB?)#~` zIezPdfYF|zVC}29TZhy#Zf5RTZpF|?=M-rc!p5nMGLbdmbXQ~ucTyPlZPI4-iZ*YX zUfU?hAR-IvOs{dxV^ae=&lp2J)>3sbK};%qZRcBT;2^E0!h z<%>_6Yis%W8TtBNP$SCl)@g?Td@I7Y|27a5j(&bO{I)4g6K78(v=ypk_dUuwO=yWb zZi;HS=8*f`@R{u&tPPH_Mtzl_D1ANx1ncqHsLP;(O;BK9T|ghDTj(6u*j;;@eN*=Z5iWRB+{xH3!*Ch2f_511tagvrQmQHTi1(xhnU_pUeG zo#o~4r(HL4)h6dl*el@48Pu@AU&BD9t>yh&FbNCYaYUVTL>bhfC02tRl z<8~IqY5Yq275m&N3*6#H7PO9Bm6ezt-94MlZ8;sl!2gCsm&kP+Uw8PBiZGsl@ii3C172z{H?%ZG-21LQ}^~nm7(slr4o6Rh@fT58D^ZoxUtn$`SW&Tx!ZS9YQs&TD7jE*v;fiWp0RK@-2ju-JVu z6xVynR7-;oibRZK2S~U+N%GsTlHyFUK~r+M@6><(X4Jx>|B>NEzdCZKm%>9O4HgRQ z)Lu-`Hh&Sm*Ei|@^0-!-`|~4v5VeQ1y_}rW9c@_HitA`)DP$1Kx;!^M8a0$XqB}-= z4yT7yI?)9wuon_-gsL=`I+UlkIvYw}5T=|9Ycf*e(Mql)!fFzonsF4e zOeqDOiHdn;6b`@M!bh}7pG}42QKgRt*#N81LW=S5@IezVu#K_oybdBOitC<8YvRjR zc?t&tO0egt%_9Vm^u!TNO)ka(iGC>o6F*;HAc3=K{RVR$a({Fh#uggb!~l|A?5FUk zBd=VjN8PU}v)+-=;pZClmRJ@KAhjh|EsdBF`iT+BzJm{SN-UP&PA7#q*hqjuHDbVT zk5S7}1e!l~gC=0u^lYKoGotAd;JOe|4}QN^v3a=aMai{`)vt7c8_kR4oaMg@_W!J-#4!j{ou%+T03#}Yq_oyZigYnIG_a4?m>j!P$N zo6%4yA#$wgoF*iVaVO3r4@H1d7ufn99z9ezSi+_)>LA@QkxwA77jrq^RmmKX(TqN4 zh<<8~=0LOP3MiqD>s|PMV|N+!n*^YbX;vZCPSecg7dkrJo;-W=u`8l&Ue8q4;-A)E zn{6Pe0Z;XnAXrF#g7Hid>mP=mr2!r{cHmS15DzQt)^@0|B%LaI8AG%XSO>MJC%S)R zl_x-H=>NG9fJZgA^%vGsB_h1%HH4hl%S4?l(fM?Wey_6^i*rh@3mW|un*J5P;`EtH*ue*MID=xw$gStb9&J)SyV zinBPB;UN6OM(c5F?4YDE5suoWseoRHo$r6`?EcHkJlFP&_>4zfFH#9Z*;D;)?0LA; 
z6iYg3A(f$DhXZ@LKd0*eG*S2`QLTU6q59vv!>PgKPV;z{g7dZMjm4a~8Kwj{S%Slf z3b0xA$A8!i_@zg&1cP2)rahx5QV0d`8{NKtD{=r_MWS^-Bmpj}Y5RXK3i$f}pKW^6 z0W;-2>p`i{rGnpNy8J)IR)BI=eP7)~^xz5>|1DQ|;_Qqst9!Ep!1JB$t~HJX@Y0SK z5ELBT|2ilFDu_jb|KOf+&6jE*<#EvWQb@D{7`xUEv*{Fe>#={Rxl+3jnYcpPeaZEL zM+;<7A7rKzcYM%#xteuAuWD@B2CIo8!UJYIBpUaOJRp(guIulKN3l6F$nFe(N+P|l zsZrW!2bVa&Mtv%@TUSwH*OUUG4#HKve!aixjb8!y{?gS-0UyLlc&=yq2X|&Mhg5eT z*FQi)_?+Sl`NKYF?Sa>En?ELEIad(uEK;>R2yynLSYI1~VFOEt2Sj`FudUYx3a66# zlTXTL-|8a$k;|fJAF2r_BRhU$s>T%opp>{8G)DMBCY>mT!wH};JrccqMCjGaQ1+9l z0zL_F!J8@Pgco^$SYyW~C|_=}`GD$dt|FG!5BmscBtV8;&Vx7y245Eke!KzN3icKd z`}^a7-s8MBQ^9l{usGO~-~}+QXG$$n#T3d;g5mj-dz}~nJX;iV7FX=Khn(AG$83x9 zu#Xsl06!oRK^d*>?kooK=ns@V8NI; z|9Ziop*WYfO``!6(HuZ#ItLa()j?bRU;sgAN@~wPRqx5lX(-cHVm3bd*9Ub9w5qIx z33PlvT5UX;A5eDWUpv&lhq8b#po$+avOQdM=Ks8l`X4O}{Pr%=98@Rg&i}m6!xI$Y zJN?enb^U8WxUevKFAyAa0sk^M;+3{*pDhT-;{clOy_J8or4#$Zz&K4E0k z#96Bn@L_=Da@9sCl2NIR)qaB#jnx`_ERi#Ef3h;dNpC#?k$ooMn@8_^fqS7AQQ2lx zZ^6U;Ve~w#(Hnj0sEI67NMUq+j*xI0GH$@kV$!P zZ2iBvcyVw-4M}M6@z*JRi5F2-BfL$EhoW`Fz)zzNhq>cRgA1OMuO>!HP}#)NbSopyy9*(Br2%kfW@ zi;?lO!FeZC@CixJ(=s(Q&)3lxx?$txqRzQ~(W|y}(O-bETQs@1KMNw;+@^H2b#x?q z6nDSfsKE;qqpIU|s$z{>FM;3X>X;#$mv3Kasc+PM5BpA0P3mz^QKi z$~oADB_bxGeiYw2=U1M-%fB^?LRC(x?jSi<=Q!1kWM?Xy0QG3pw6l=0HwAEe}z&xcIk-#zTweo(SRs^m)$l8wR+*N%+*XkPji! 
zW0DNf9Hq+FMn_1-e8UEF)m9qzESm6LY|#^<*sdS2VJLO(u9UpwaZ*!(W)2+*24a5x z#}q zA0ChJG#|0Xu6+VF(f-c4mr^OV8$AC2%6t*8TBrHuS5@-;_i^33 zC;grYz`H5{7eH4*(Ol45E}E7qu`o!(Im4zgv-|rTcY9)KUK!q4b(WXoOZsUzSUK4+ zls)={&wZlmL;Ybsy{AXuE8hGbXnef3Oy5bO6jNO_SOH^eos2%hG*fuvHKWfHtYL6x zPSe6urt-1Q6XansD?|WA2wEn?n?D+O3(43}tWIgC&G1vORue{l`}9dpE~AEW$UsPv z&OHI~NbI40t3`dN@+2y1hfjejk0z*Id&s)l*#3 z6ImuZme&Cd#97lSEoO_p`DRuX&*uco&Q?K_LA{(U_#kG*J+(K&8p(9GufCzW)aQfN z`e#{(Nr)-y5Lf(|LF)Mi1A z)gYyUpk!l0&G$$8zc6=sO}h~C+<4f0Gk(Us2XdSVkKn+Lj{^-C~EF?-B)Gk zp)8`};H{|OO;Zw~YOKlTH@Y8{)W|ItydIzJ9q(vXK{%2`r-!+uNqdr41Z~sY%$AeQnF+E!bPxz0ho)wK`N+FBFt@AE9)&CjO+2nXIWLi7bFftONHkBIN=X4b z&Rv&(cODJCwR9-eayh$qowv<;tOEo-oum;k))q_hhyf0g0f4kJ1IQ&2*niC>#-Dm; z>fyRNwdW`@1s2gCyN58^CA{_Bha;pD)jR;P$GcH$b?S~b;8l_1WW7tRtWcJ6Y0?21 z>`6HD8sKM2ta=q|QZiKYn!UOQr@zA$Oe{Lix#Ayo;U==W?83#yJy|fF7>s+Vr^FX@ zXttb66w0n2x?EWH{UBY-gu;_l5#8*S{`C7=ozXwcRQ@UzTm|IsV zLM_y)-s+^G(r!mEr}v+`U=_iPoaFRaZlUH?t`2!y@smJe0q>a%uu$%b+~40_ zvWb&-#w?_yxP;|36UHMBHcEY73tpHj-JM_oSmh^3Y0wHfY_(yW%saNoklB$L4C+;t@WP$rnmn|BYYIg zh+;M?5)0HmL4U7(BJn{pwE|i=m+^v&4g8KUn&|tEzJG4&cl4+56Trn92SaNp9EIi2 z5BoEa8B6aD-0r9u#Vg72=YbINXDG5*{|+bSyNs8NMpNg@X)a83S`!pk^SRRB1iuDi z$o&d*dbYa|@}bUbG07MXE>bpVx&j7@IuUM#{CibZ!WOJ$7DVhB*7rKoE_G||4#9H8 z>O$Bt0qrKdmJ`pHI|v1VePR+3KcQWG4m1{&sE-GArGVmt?*>-L0kkmNsDEu?9@9Yt zV?;#onU!eNiRFtHBBsjCnDrD)1J9h2$->z4PJxCq8gXSZoN^X&G!TFE)EMko%+B}X z7mw%FN+pmys@{anb-#eo=X6Sc_f^;<+A6nQ!<(y)i>4CtyiPI54w7NEST4>|*YlEd zht>Dt*O%>#W)aQQI?PQdS&Xk=NRHo1f*olu!o&;}9&#H*z|sLhAGVWFCVp%B7Rv`i zTC;b+(G!4>3{T@c!AE{toav%DmDUXn`PT4;{zVnm(4VncweB}SLU`ilK=BOd-%$Et zcWMB|d~efD_+PZcuS3}j-oUxpWyZPwc9ju-M`mk_R!-AIE;GKFL)4>qQb_Uh|j zNDS!aPlau)GSf%2lqW~*=LE`5Zf|!Ntpm9pbDBgtk@KCShQ+GquvnF$7tzPCaWA3z zdg~UT!mfXw_52Y)ePH^<-WJmX)Q67JAL@hcZ~S8TFA`8Of&GR~`4;ebF%z$^0W5ud zU5;C&zaUZBQp~;X|IFD&9c{|pp(aoqDzUnx_2S18vd7h*s%C2ql(v5zApj$Jhst%C zq3on7eZ?}?vSH&)VxK-Nrj8XxC49j)=WPD0NJgoWaTV5U>@P1D-syG7WINBN5^H#C z7>@pExZa`6lZo{q{(glm6#Y6#?}5WKloH^97(HOhDeoVlVHXYfZ)hQs%8N|zA9nV& 
z?KYS5ft}bbZv59J2~agA7LISVXJZ<5`KJrT@YwT!xg)4X>6g-+3sp+O&}Vd&N(hgv zm@#gTHQtzMN(TKiSjqIn4r>G{x*Z)ePdr~h$ z@(7QE4uAH-C$V)%sM;SB)4Aa1<-|^yeY2{h>XhrU2cJl2NiB=YS{$F9oSa;jm-=@$ zdDS@dp%7$b8)*QSqTYXJfj^9+i{*0b!~@uS_CRg@zx&Pse9*bY$7&K0swnL+_J)Tp zr~!QJAyO_}?Yg<5K@Y{9tHpV;@?4W1s#HJC$Tq|B`GMV$r&~!~^!evI2Lblb9N?vc z{)DVY=-LvC_ehT(-QRzmRDSvBk@%yB`Rv_vG%w<=*&Z65{I{E;r}Lh&If3E%v{Rn; zjikHU%F^ZE=z)Wx1ds{TQ%+f%l14nSvcANK&%3XG9h{$^pPSzYyK-Gb(o~25|4weq zTzgLj2W&FfN&OI67FAGKtM~17Du6=)S_d*^lMO1q)LN`FO;xOfrWeQeNc$sv{J{Hm zaQa8N;o$HzTBq_B-;Rdg`kr-XOxLl(ZTgw)b&p2IPuA}E@`c^tu$H|qT3hJy*Ji4- z=+4M3-lsdMMq2@6ivp`7B+2}%$|Ry%yS7@Kng)4ZSC5+0XiHmAc-|kDK$oU|MZQmR zX4jVy^RnK2t=+Kf7Cnt${GOy5#gjirM5{IGg37w2a3S)}gEI2*Nnn98Jn_aZMKSD$ zpFYYlelLWLYE3Qckv}^LE7Ug_GjCMe8WHqbX150eJUywn`ZR_yKE_%7MJ|4?)4XP3 z8si4vnIZ;2gCaxHYQ+56H^{qCwlKmDsE+S9G0CyRex(=Hmg7z(yl&Zf0OU0shKO`kl%==~YzxPsG159_`9k8m_((+m> zLr+uErDt~MqOICa@*RYul$if#EOH)$hzwcf|MTcf{)g;=$0ANEZ)ftbOuql22ls#Q zGO;iR^lIBre;UhRO6}yHy(^u5`!vQ>wrbH`T`;{nkEoZM)mTiC7TWAe&+N~h^(ieo z{6&y>;z_mY8oN+w!ACiVj&jDBwZ>Zvu+ZdlQhA3JZ`pK>s6k|iak*B5Igda4MP(kd zuJSkIox9wR<0T*zv%8o55AKEta#hF5kPRBO^b+=G&!#_LL5A#FGdmIX63X=o6+Tu& z2MgJ=X$-}kqwfNCqEFDN}e7N6cltY z2){3|=D|74{>RqtC!UBB`_e4}03Zwg|GU(`ctjUQuW*th=1?w;|0R6_t^GV=bz~wy zIs16?u6ZItEm0ey_X;ZDba<;l--Dps&=YBJ)1-vvOr5%5 zl4D$plKb&$L@$3MNJgv@3o`PvVoNsJmc%xSgw-ZO+oAtG2%+Hso?I#Auq4Ukxd4^Q zc{5mzw1&|059W>5l>VhfIA|B&Z~%oFu)j$ ztCSj%EU^LX$XZYVAM6k{t&Xr!&#ze^SlgRtIw<7hjVM-{&1pFxG;D6Fs5jyh)2GB{ zJ+i91F_wX887sG{^o}+cR0ha9HTT#oJp$s_bt|yin=34-IzojknWm5|X-N)wR8InM zx?mH9rp+~SG^H+D-Z1r|D=jf6wINcn%9Vsp&wM|O(s1Pt#4mx zmc{xb)OQ+dOlvz*jOCAg8K0Ta&T8j}N{&Vmu;KT?eFR-_il>ipd>0(kx?{KuC7z{E z`86HjNhT9Uaflgew16@ji8Sq}taM6uO1G{4nN!ih(;CsoEGmFU6DH?~Ndsmn=KZTj z(<6Y;R1-U3BvR7@n3>K-W~Rx@QQtggFS5k%zMCZ>{RRt@Yf4PC#gjZY@4}+nd#2w= zce?0GjFn&ULLwKZ8{%vY3pYtka6!;y^leLpk#uXS5L&;<}%S zfTf8S5;-t>ZFQZl#W{;nU+M7$iM9ociGbvGgtd3Z7mPu}_Irqmr=WnVz@{&Eq-oGI zM$B;O+eIiAs}0fc_iQhe;>Lr^F`QXne^ZLr9YMtd-xBh5T^?21o`OequCd1KPpwAW 
z?SL(7+FADLS)utCZ`x=B30mdJUlWox3aGZz24U{9LfcB-@%4J4W0m}^s!pmq9&O8dX-Ac;d31sAWIHf>b;_>!rXZe#rD|KtIGi&CrQ=7> z0-Z)l#SpiiFuUmppuS-ZLU_t;JHi?VRcaTi7L>Py4cytVWZAz~st~>?OV;{$RRBpD z9Mh@WQp?c%p=?=O4ZVzkbGc)H74%Q>orjW+>a{p0F*t3Hgjb;ydD9_K7ISv0b|d^D z;hotlk!o03uIt>cGypk>koBK#1l0$r{YBc<(nOmRX7LNy3VN@FypP@s_MGQ-j8x1$ zer8z6D28_~6=d6%p|CYXf_~Awf?bSJf!fvw&=eipntQ&E6B_5gM_1>3+uO%WPj&=X zE@n7azP;Bo?(J7;!C=ADk5mN`mW*Xbms}MFSjJqw2uNG~_P>@wTDv4vv|L+02KBXsx-#9lT5E_iAEwB_AS!L&c_hq}1INR4Gr z(vrDrDkXd66;uliUx!xFYvg49P3Yj@#uDYMf=Ex>auq`za_z5^@uXW!cN~guhVOY2 z9o9cj>2CKAEr|Dhs+TZ5!R(b2ts`L?stcPWq8lGiy6ZPX)LfZeuoWB{eBu;9leZRvCV;&Y5EmjH{etc>7t&~a3uzm@tITD*8%6blWaA%&z z;qxnrAlvJWUv+vY?M(*_xB~$oXvPv=6dOL?Z9}wxzSH%^O~chZ$c+KTU;tmIB_%GEnvk zd)aW@bWmo!kqr7>GHn3LmNJXAl04;IS_5vZB z(z-X2dq0Z4d4JiEG*kJ7*Wu3&leORejwUrR>PB#ZurqUQYXULU3$pdW`w^=Cu&L|m z1Wg*)_j#!sB!;a0JRc%Cs(R8Apa}_|NQ{W}AbL1PN)u>B66pR`c!!h31j!#cyft`= zkmP~0xvtwz#+vzO1MS`+4q@P@Sq^Qg4*C}@lK)}(x_a|r6v48D?SqJ?Kzt?K2o=xV zxIfKB@R80^>KSoYKkaa!wLbZIRAw=lN-(Wi?0{06IM|9oqCQ2sef4258F6X;!rkV4 zsk-S4gU6dFxu$9AV6KpX?7{^~5l2i_Rd498D3U@@)%O=+%Rl@W09O`@`^>#j2_#=9 zouxDu!-wo5^e>)A@f_Ml>p+f@WA^cSAKo6h7|A)C6>x|^$To>*5Q*d;FWwUp{JbY} z`m|Jyy82y-_ex&JOl-)sZnmn02zUlx6F^ca!5&fp@9sfBUNM5Rj~k^1gkWLVs5v6P zdU#k9udBmueaQsMxrnu3zX|p9>=M-XaRV^Bm|7X5(~PPdMKjh{#uAWL^O_DPeoLX%;xx=Np+37 zGr@RtEn32*6PX#hPyNMLZLSHbH*`b z|5XZb@P0a@0&OASkGSmYZB@V?uT%e?FMurCJ=mgffQ63pJpN5SO4F?G_5~w^$5QtF zZr!H)eI&K9Jh z-{_=`h!Ff`;Hq+{0HA3&((uD-$<0N=9U^Vf?*SC5^yIXCL0PD|$;6pTt?>{+Y-~G- z6l|R4D}D#oUl@sYAmz(MVvByMCrT+FDd6;Lj&>nh{15?>1j3wW`1{^su1*v;ePVu# zfxoM!G90L|J=^!XxZFA2_2AmFwVy@acWE>dT~Z=~@)1!q0IR9y@QU^%0wHD7x9-98 zz$?7@d)a~ndUVYwWuXtG&uOR9A_S`OKo++5CqKm&FAMfy4r%)S74Hs-w*CO$%U)ww z%C8s|Nuos>SS|VM71`93FY5{8lBqV;jqnYJkK5lpE@`h@eF~V%Y)I#vcUSniZpGtS zmV%ocWxv7pslmjsWwS#>)bV03p$|6K-(5?m!vJucu)VFOL1=+718{2lP4hQNjCLjY zt_bAoG`}L8+(h!%)6I`tTsC0_296C|D(em4>qm)z&lxq_dgiq@CU9BG#IyYR+G4^; zXjM^eEL36pGEK1Qi(yYrf}p0Gsace!rEqMaU&4hq0bFj&Y~nzsbuD+C%{8PQMIMP0 
zYoQki)BsLYFGG*YK_6df>J^nP5&vPJomJ0R7AgQvi%(Ne>US*$dV(IpdR4!NcJA@t z;VFQ<(qbr_?0rx37F`q+dfv75YaQvC7OTTn7U*?ybTEL&E?*m#Wu}_#3qdZGyp>2T z^%m8q$_-n~Vecp+O*bZLNqR`lhi>@}T=|KfGycFvf8)i29Pz1fSe_QkoKpdG`iN4v zo)w%3!fa}BYO-_k&3d!nQAoaeMy6l6JoUW}DAX2Pst%qX3U+l~zw~18 z(d)MADmWwTl{A4&8Hia|dm$BlX#vCUz^GGN)nK)GJb9)uj9RnAZ{KNjP(YW_wy+}# z_Jzw>errWT^Gb|`7nQ%{l#NmoJVr#%iOa5=SEwVP{9e80PmO7~QEwI{c7i`A@3WXU zlC2b8F06mh!ahf-1>BNv@boWljBz0ue;Ohc&mW4AkbhC_i@Vq<}x1W*?f~ z;un8)@LfN4p@}QG*ewfbObB=ySUSk%*h45*xSD(Sh_yG)h$1lyXh|_4-~0GME;&w! zBml3|qm%E(d?Xtr;z&B%m4S8 zp%7rbj5RMF(((V!%S1tGrWTn#{TUZ_36I(KsSDPb=pTj%g4YntN9D8}C!ElGfv z)-7;KOq(1NVFc#x2>~+v4q(8oA4f3O9yMnV6?Oysboo@#Tt%{YqUX%$7mjDjKuo+!|0_@Oz^Ny&n)kUYG^stzodWFmo)RszF1TDf zfZ{-8yVwfg)=46oW<&v2ye>~vbUFY)ANJ7IHrM#H#b2i|D&B&yh+?kxCnjf17jq{|f8m?se^5+qwn7o$FenvS->Y&_LD` zT!HtR1538b<$8ptTd!Ix_R1}xytgi8&#;!Lv)GgECQH6E};PtmXVOI~zGDN*wn zwF}e*Pf|}$f@44D%jqfarO$4UP6DdFHyJTo%C{~CA-WinFrD>_b>iNXruW=7)6M>3 zl`2i7Bh_m;>m-Nb+wF`zMz`w;jZ9V6yngaC7Z826Ka$bxV7BQ~^S58xkHh{133QO? 
z^fe+#A}0%rj`#sYw@0QZzpHGjjVg0jUtN!HoZ+G#vwKKnbsaXzPLwIUT5 z-aN--x`&ID_L)*~g@o*T47rx5N3s9~r@-gL*fd53OepgoNsV1VX@mj;jFJ${`APXf zxhs+~aSnZ*C9hHhlveq+^;7vkP2sW!aPrE4Oy{4X9L@){K?s1*Hlfw24^XJTN`Yff zuvBG!^+$8tDr;IM(X9SMR?Gg6vfAhrWkeIl#Vl~0iyB9D-8Z!I`X>avm}4p8n%|h> zjTM0GfD9=_(4`j$tu;)&+Oe`1rfj7;Mof)9O+bY%8+NGJJOXjM>QDaOe`FH7Aud|3^)?8+t zmaNyGe1W+wXfy#`z>V?JL^}N4+5Svcoux2i9HJ|*6;myt{yb@fQvc!(7PR-UANEG@ zp}{ii1O8fvyO;4S`UvC`*0_^;K0NVkx_wGy480z%hi@*AdoJFu&2+^$o#GRl!8!P> z$50)J9NA?*iNal5zj7ID`~p_odblA*l2o~pO7jw@CmNm@4dXtPuu!gf(h>~U+ncHi z%DmX)LoZ|+4=DBVqKT_139PU*|K?`QvI%QuhPmxE!7NEpf)PRJZOwOANK=ibR4r7~ zg9=Ch}hh=xF2?hU-Scx4&gKaW^%>(z-derO{ zi8U%@og)KP&HN6b0R-PL#jns&wSDX`zBdA~L~CiZ_c_G2&dh$fx4wf|b!|OU`rJO& z-WizUokg@8ibfEC-CJ*$-ZXlQ&p7np1TGb2&W9velOmnBvZLK~ZtsHML_F{9iI_B#%VuC1YObn;`u2-6G8)&&soxk-!Jcudu3@wMbk4EoOizL3iT}3U>LSl zr>8zLB8~R+u0O}S{3`P-CWwal@yVKlpjYQb@_uctpxa`!`OS5u<`dWN#_!52WGAYd z?pvd8?U1`T^7n6@EwwHg$tKhh#kvZcpDpRH-P+obRuUucX)?ctKkljL!#?XE9UZ>Ps-pkW0#o&juYHRRAmmK;U zG{&;X7{li>L323AWkSnwTU_u+kY{IKH^Xl~$sk96GO(C@d>f$WmcMq?y>8%CWKAq> ze?QJVqqRO#=Y2lzd1;zSE|NC1e#w-}Ie$_K?Rx5dM}bRGh?u%v0eVtY^1;Q&Pvt#mBanzWwMey*}3uY>C9 zH2W=Nso!AI`fZ!I=5-zK3yw#*2U z`M)M)Qj8W0SHp3+f>DY>I+A%8THt0nV@0l1m6n&ErWsp!0URaD``C7q(6M+}UgdK( zCflEAgE=?b*kdM1Ho=^lQM4HX2wlmQ9mR77NH#6+0cTV=kB(YeO;5$CKz7r3XwBMmB7Qqd0!d3Ki#35o0io27J#EmB$42^@ zN+Yo|r|u)pHPE2hMVhE*ilOA@=~E5dIDTMZuO8OouPg%l8}hSTZqs6HgLj2S zBrR5EpZQ|>w7uu~-Y1UPN$c^}S4UeSz78WHVcg=0&l(j6gO$s@G&Co3yqGfI9Q@wG zbUPd}cjtZFt5fgvDo~s2(a`G-Kk>aIv|!7!Xl8OBK-oYyW*`S3_tg;vE=2klNxw$$ zwM&ZAWu4IpYimx?V|+6l-zBwt)dJMdI})T!i(f}8Rjfm~QqZAVT<^uViCge|VV*nq zn~oqc>Le6MS(&j6(G<^Y>((gql^~NiWi%d56eU)Ue8S2wFA`{DD5oRzA~T?hBJs!* zlaw8K7}p1iH*tg&OvBnuyhF=|mCNphk4t^#FIa3fO{nnA%?n4+$wo9)A#OkwQj4SnRiuhWdb)la3^0Lmy?!N84%(u`+Q`k5}|Nvf%4 zlthDz4eHZql4fP@EL;kcTPea~B)iZ@ik8iSXe5YI84y%(L*-=p)mpJ=v65gf&u<22 zcA521&o6H~E+OKT#B>uxTysC$VxF!$G{Xz+F+LN3^?MSX!q)UIv4&&~qy0c9_e%i@ zrt(-lk*9;_u_%y{a<9F*&smBJ3rS6+^Ogp5-p;Rwu7yx`xWG#NfExt({ 
z*%ZSEU51=AjVDiPi&xfOQg4R<9Np!OGVRVv>Fd_WI^QGqFbu|}x}N!9a*Z7enMJ2} zw~-YprQUc6tm-HXy@mSO2#u#>XuGAkL2mQ+iNirPuTK4+zH0?>3eKVgvd4^<4lDS{ z6TP6-zFFN7NapzB)(8&c#?H8!>A?ms9rx6{@-hh2-qZc)-*(YdJotMl=<5woanZ|t zdYdIaafj5f6h>x0{$AtY=Vbey0Pe($>4Lxh;TYRT$eP!yO;~|G-ea!=`9{>&F+q!s zwov&izE25KvM+xiXr>umjxjJ?@__)e3ODoYQrJ!}-359szTnSY#OA@qgB`Jd*z-aW z69_2woQl-YBaAwvLE`l8{8(jz>2&6xcw!&A8H`Yrft1#y(x9z7z z!-iY&F{n~;P31$pQbKtvvj0e`GZ46vrqTkTs~s8~`syyo5LpkC&v(3t9+N7i@ljRW z&#`4QW^UW(Y+?erAV(YT)?aA8j=_306WM~BM_aKdqojE2;#ez!CYItgVK%64<=>nD zF|`DC%2GVPoJyxd>r1ftdz%9Yx2P6l_ma&s160I%T@L#h0EM4 z@;Z&(iEj*YHN5|+H|aDwpBO1O87pzvxqffwuh>vcp?nbXoi(w8sNlIrXrwIar1e!# zx)O;d##o=*uR+Uf55Uwt&4{59z{7 zQdR{>Vrx=gJI}+~N4~%HbUVMFU*7PD(P)yR%GYgp@$T;8qYsXNbGK<|E%K0@@dl;H~kpkJOmp+vn#dmn-%Q)P%Le)1DnIcwgy4RxqO39v_$c zIJjRRcvWXogMoJ$zDO~8v1t{fqMN?xMFdUiIby-4fU~I?mJ(`Et$B)nHXUo$ z`tP-bNZ>()E4NVd6f;M<-@<4PgL)Qz(KZ0)Nf^&ro ztb9lAKOYEVM6HUu=2kvy{K?;})hNls;`>t5Nl`n~2N5*f95O z+4KHARqA*hS;|=RXlebW{Y5ubOIuJec7_3KIdw%#!h&dbD8R3xfNzC1UJC;13vh~l z;cb>}HeGiiVmmtwjxA;U%SyRUFVD~O6ob`fRP6K68ta2n0}M3zWTf1pq43m(-I@YQ ziguS*ONHjMadJ7P2^^5hgUqGbYxXT7?tudZDYGj6M0~r4nnJ2wV@CMyeu-+-+Le%O z5QgmOO8H{&x?@V|>*$s&yP(OLsUOYy90^IpodsFx+ z!iAK_X2DM7k5aL;fvf@o@YXDTlKiU=8z+ghq@?d?<-bgxzXh5PxDl)^DLQ;qo~kHzL67M=S2jKJmyeN2og) zlMfetrwK*^k#T}s@oa$aLkSP>jOBHhE9XkDANY%jlqE@f+e_&Tx|*$W@zl&BSL9J&;9RUeIUjiJBlNhpeOg;d z?3yXFDTj-BtnN={drxb99?*BVv{8CAdnSJrc{RBFI_3-|l`04j&qJ6E>(hN5aYs*~ zgw-Abr7tx-NAeKy31!r8DZDlu)IFcCbLy_w`hjAcfS+RL zjO?x_>EN1#nEs)wM1(l3%!!$qex63EzwK+L*Wi~PEHRAx^Q@Sa4*ZWQP%RR67S8a_ z&l-+(VEcghwL3vKa~$Ms;|QYSEbZ`vvz^)9hnWyHAwte+K<$Y#*R29RRR`<9;mh5Y zcxaI5bQq>D#CuTs@ze^INDubTN!Mb@})afIbR&Mm(3;K!T0}!4IItZu_>NHycjx5i_3eBkyL}3-_6h{PvVE z#s$vceQ5^OeqUBfIzRMnmIS+bs{@P*X@2i%XoL=gPG;Wk0SwYdcu5F2zWYRP3f1)u z)kmBkO6H|Dpjz{LPKdPMcd{p|76l6Ij@mmJ{L113y?MU!g=-#FAW1tyVZO4E}bWq83hSc`+0KyR%BY1(De}BpHPC#@OYqIw5AvtKxja$Og+34F8Qpv z=96^S*{o<`t2f4tA_S;EK*(J_`Kb4sv0WqSrj@^Fcs@|(hpb1urPPn%=m~g-P&GyM 
zJ)-=?=jHW;H~Bc;BEB0_>OcddHxn+eCsSK~@lM{5k@x~lr0~zUk57riZi{yntAX>h z_PI)Q|AR+h&hAb`9S3RA!r;v&3m?8#ETb6U`y9Exzi8mf>FX*ycoNTgH(9^J@m%0Pei71hSi3)lMa!jxW$-A!b$=rs(-FV(}u zZS&{GG}G1e5LYgpjrgV?9kcj^tbNp)XV$H#H@h$DP;p0T$5IbwI|?Fbi5SyYl(_eI z$1_+d&V_9S`Kdbt0?3(6AqekIAX2Q`rf>UPKXs`fsp29x76)?3Kma^z)Zs=CyahHa zw%=t2&aF=jE@j3YB!Ia%6tIyf0hn&2-QjJ-k`G1GiFB$MKx6{bQkq9KTUr)|4GqWw z#idgnW`h+KeU)?#V=!;hq4M!zo%Sdg+F2Um`A+N-F))nPo>L+QVf<-~2Yp2)7Q)>{ z5me+ewF6Ktz+gnejk^_IwbuMwg9Xl#`N~Qy1HlzwuIDH2hq>m5ffkq=&V?qCk@Dxh zT_mYHHiXWqH_;E}RPh^~pnaD@0QmASRqyn&0N|@~U2$?V1)mqArO^fQn4T^T_ldO) zuXy`m`L}Hnh@F&rOrwBgRdJ6+6FYeh*^Fw-E6^{uJYerZpVA(I?3+&+DFrJ-L^AS} z;64N$Yq5ew>|(QC!s-w8QYH;hFBuqq#AzmU*rGzcjU-S(9U^v56q-sVYiSIw(DN{G zb!jy_h#lx{KXwRv7L1@ZhY+-#*HJ`04Kn0|@zhHk6v$o5Ifd8l7kgKx{mJtsiWXEaaYXX~PHpZ$u_kF_mb+Mt zSJhtUtOFpypgdVbNMec3Bm2gC;^C0t8;?^lOW zuJ)yaSGZ)Hm#>;f_Triy?)RFXe9nOp8C9=m-DF2JSw+^U!UDrkFZUjUcMWnkVr9?^*i2+E5beD8@3rGvn-Jx_hBHb<0 zC0*wm%(c(m=Wbp5uC>>>_nvd_{AbQF=a`P)@B7~O{oZ(<=S$%BeJ^kE1=xd}uw&@n zLwhQQn54NcEw|6EHzzJ%c$S)WM$=YSG8ajTcChbM)*iHj19B^C0jI&_oY~5_57&K6 zPDZ#l{$~*8Fv)m5KnZw6{PzrLsr-c*35J0Tr`J5$&AQ4MK)UKICnv!gS?V;GBvqS| zz883-lcYsMN3G;#wa4GM-Xy!z}PY>hz18U=+qJW39@JN736ABh`gE z0e!){E(leU7qhf7fFbBV%d7qQhVovwXo!E)B|@Rb*N-IeA79}iEnMPp<}Y~rL@1fw z>LoEWA#7|IUwv^GbNUK0b&a`$Czqz@9VJUuCgV|-pF*{G`IBv6B*Wd6)f0WO2k0XG z3`#VcK1`+UI4$?1*-42ZLWB1?2;TDneW_OG_()f84vr^gF>C3#p8h&BsAX|?iub_8 zDfw)BJC%!S27m1IoKi;EHrpwqWM~oN1GrEO-ktBYD=uDIf+A&sY~G>k3HqRS%ap-= zD{ibodW@>f5D~)o-r0HT$7e)dfr=Cr#l@=YGn)xtH47=c4Oi}s+p0H)C8iTTd? 
z1?JN6H9cNr)$Jm=)2}JBb)qKkK&|`j2WDI0VWc*wrNP*P(0AB+M&4;{>*PuSo*$Z9 z`)M(@uf5VGM1>>2dH`vGL&eY})4n8jgH?C9j8n^yQ|9rF-*~4A3;Bs5x9o7n{kd4V zjK9*ndpl63{#{pmf4Fd!VOcp(nRX9^(lkP}YsbL?FMv$xXijzlyLLK(1 zSH0Eh4pwAKLNPn=zM#=78YmX8``ioQ!b=A5uU z+M4Sn3RHO?KICRDHaFd}WA$+%^TSXWt0-kZ_2c=&*GMx1;P|C!$iT`EmD=#CBc zzIR@ISn56^ZOyMW_@?k(eRfW*XJ5Tl9T5_(3_7lBqtNDy6ZFEptq4`kvJ6x!?i~yt zNWvAmcgD31p8k@p1Eke#H6rzZsHqsD=m6bOIGl(ABp18Xf4$vas(Fddp8prF>H3>*ME$mMDM#j zE5K#d+tqUFCEGPZ3b^^?d)E|beaf=Hi!^R=jrEenU%+hS^tG8`2X0hzL{-5rpOdh( zV;OWpaMlAOQH@w55!Q8ZEhvbtBLfWp-HKk(TUdDXC}W@E*L4@8!FpRsWR4X`1+Q=r z)XtBHkY@K}ee3&&ccwRsA}h+A=3Qa5QXhC@g`)DO%xWSpYto%Y{DQtNgJIra?fw&KB863 z$GC&>yR-90_Q1V#4El2_>wmNfdbhyf0=KiB3^&wX z*>edRO2nH;_AFeX4R8I%L;n550^eZy#sz!sT%i8a`lP%JJZ6HKt4jjYlq4qWV!-`< z^pqB%C+WJfK?wRfDLi`@J_Cg3BIoqE&EjW1ys-so>YnuL*J|zcd*;+nXwTB?w2bHyF|mgRIB~7miV@ z0jIhwiu8BF4gZ4m~1svSQL_vol=2U=iY${+pL1|)i?U?8vZ=TLGY7}pGiHbhGak= zHe2Pp@8p}KX6-{A*vV04W@E189a%=T&oPlSG%NKxG~B~o4u5SXUL$hIikEUPrBU3+ zIlPVz3SU=huw3Neb9y|74UB5n`ihgZ8}D>)wt-;WLZdV8o`u(E41_qHq)C-L4frpm zG%>XSAB22ODe8%u;GDOe(1ymdJF+}U*0$QdgfVD~9nw$sp7ga6L+nkt{FyPRi$JH# zb{W^b+H}m%NUY1fBT?dtlxLEtqD?%3mtjAmsfkFT*}w>K1+A>9H>c&SsoW^=*N(jp zcgXWu0m}nBA%2-R+Wm{rm6f|Zvp zWA8C(Xuetat^gqAsNHxD+k%xi;WqpZ?EKQ~{P~h*0kR~FL2>97<(rdeir87j{$kHn z%7khVzjK5?W2^2jw$l~N)y%6|lUr;+D)%)%!`cx(q}U6n{e>0(f-s_FFy&a(nx@@c zCjyM{?IYI;n3?(KU7VIW%5-@s81O=r{2s(I2LO_rU)77*Pp#Ysj8A)2D8HXxKT(%< zqBP*kOqKoNl1ISs8dti!B!1c51jVTc>8>UkBTT>4e*$VE|KK;0;dF~!k+FLIKgzwK zl$)MHZ$>?#U}PwU;W3&()Qs0=&KsG(0 zja<`cQ)O%cs1ti-EENn+T&(&BDuj2*mbI=*Fc|>6_Xbg67t+JI*u@Ur}RBe9?9skeR z^BiFVWs@^DO<=igJV>5b`9Yaq=TK0e!EcfLckr{QihFfXMN$AivZU3+g8uLm3kg`j z+btvt8&FmY*bU+a&nBTB(xK`GB|rK6nf!DIA*D8=v%v(Sn&7+h@|~p6U4Ati3sZh8j0iR5iDbOVa-#v6- z{{dX*jqkGZ9g6II-h-qGvsomW2{Z24z=p~j zF#OYg7V$S-2IG-k^~10KX9N-)L|Q-Y1_zv9W#9jWPKQ|Pvge@sRfpJYrjA%XS=Ns} z)>QO3-~N-q2o}JYL?RN5EkMJE8UZFH0|0Nyrd`u&boVZgS*q3$zbSY<^DcM`kbDP| zK(T3yc}$ZFWs+|Y)HoLRLdsS6Eq~bj#aL$ncK`RZ97GCz3h6jHTU3PIVAh9>kaCr{ z@(2ZvyHTE~e-Ulws= 
zbWPb{1s<%~M{*@R>tSj1m?)V>D8E|mR>B3)=_JLDk5DH}`Ug?L-|}uB0*-#}YuHAz z{i4UFS)A&$N1+6hNb8-$2LuliC=igq61ubW3Pvs--FC6ks+q!6s zWA8TD2NIsP20_UcIV#TxtJj8Bgag1jiYbpHHksgeAI3aXkwDS!F_-_F#76c%BC!FK z8Z_Y{E4Me~?xdHfDo>b;DeKvO*+Lt(irXtC^Pa@EKuk^8*@d*7NM^M7s~gl(rhiIA za&5-M^V;?I_b-kyy`se07uZ@R2Qb3WqK_`iK%$~L>n~XYZpD8@)&L;Y0es}v<>v9^ zR}Wg1T)7Y#Z2;4`%(LI$7jin=@KT#j;klszuvC$d|Km~YPj8R?R%l;4E8#DPw2eeG zqOvp|L{_g8dok)U{79{x5Fzy5XwAFC8fV zW~bMVCVtqM*2jyO^>ua zL*xZcRB)g>B4CtTG&b?uK6dRxqo`dW{yBZ#3lkMk?~1%V8QVWnvdxEL@payn*ib|g z5fz#$5NjerXkU^AJjPUOo2wCkUF&`0yR}yrDiT+AV52i@O1U*+DS60Zm@!H=ZRexc z^;#PseqnoSEawo(zk1xi^y7Qk%*BHbdVNu#Ov|ZmyHYA?iPGDWHRwqz!Qm{qgn^bi zjzY4Q?2mfwI6;!OlH$a?EMA z7T|p;4buT2<>$9mM69a6G3K#yioFmEw0u@_Ts@jcCJ2Vm38g&8F)0Eb#J(qCN$6Di zC00NQnu^Ld;ea`=tu_IbBMA9|U0w!vd!N{SR!JwxSb|O=7*2WN6c8|5!F;|of%C}x zWiTQ0(ERE8C(#Y=xXrA))g2|Zi+KcHUA*YvAPdBLgmSBwSu+&7pHVMb@%KbW{Jm~p zg^^w}GD6tm7beo}Rh+iHbkxXeUf#vU29$Ec{%dqKZgcD))G3b_K&i#7ZkG6z9SAh6 z5WTd%ZCEuYxjM0I6Fjxr6J+_T+ZF4;Oz^n>0RmtWaR%X>wX+#r%1%=;VZKb{KpCwa zCv!J$p}w0Uhs>~Dz88v-9i`K1^u~Db2x)TouH|r&vRwV?c>*_rn-ud`?T#{uKq%nB zJ&uT3HF!YnDLQe2QROdJGfCwInZ^L$45cd03NS}z^v#R;F%P%Sau&C!=Z3z@(rcJ> z4=&j&8#^Vjp4oiGmrE4ofYzmJ&>;1D{yGW6@)W;=s!Ur<+%bi}KbOUc z3`YnUF>cp=c>EleVehiBEkx(S!{?e@6<{M_lBRNQ9g$m%_OQ}K0ov$lmn$R}+SBSE z&8bX&4UjkTbTVs_v2dDjkgq3ZezcnPawB_TM8W}yW1Gg-)`vY4maR>;j}PB9OouIv z83_}{YY69q*{|e#Tz%rLz=N&5#t+x6?x|Y6MKb8Z7e60<#=y$>`HZC4N87oO z0o7ip)P<*WfuN7hx!o>P?O0EWT zq|%fR+fz=HB)WG}+hw>vC8!Atsa>@=Y`r{><{!mPIR@$uqFL-a=NQd;X51sQ1T5uQ z{*7g*Mux{X1~zsMlKCe78x%zc-dAAU*W8PILZFaAuP?`gy6l^jC{%z z@{w|5f867e9}~FLT4U$Pfh-R$7)I&M(2c%M9S;dzV%%I$@Gx(dXjyjWeWW}CeXx4Iab;%Nm^Db5?U;tc>ZW0q|oMmV%A)4X2n^x{$v{$|F;V118L zYrZzJO5u+9XNL6U2?XSU(TG{g8H%s~Cc(88-%?sraf`=s5tf$jI@PRY)_A$|@a!m! 
zqHeKPf2WPd<2dvT8WGF>%Ht*qRbUfqN_4(@s#`V?G+yq0^}HGqJSD}T5E7siRE2{p z05?W6EgqKYv}ynRmJZ#IVsK&cN8Vv%x8u`KG5q>cgRTc7eV{Y$elgr#X6BKyeQlTB ze;+x`nKVj;_j#Eh!a@ba_-Tkul z>%~>Q!e_}+!+}K4UsCu0e^L-F9H5jWyxu{LQJ3Af2TN%*lJus#W#7I zD4Cdo$TQ`**F2iA;#~PY_9^S(8cPSwM8tn+3v3ufjuk4XJKb1&bm>BK!p=(}WVCn^ zcka!XI>EGnV+wZKbixb5YC6TCfuBsqUfQKE^tJWL_f5dJSTjshRADb;)|lELSz&^; zzTx`S!;BxI4Xjvz_Mi%Y$+ zbLJBO!hg&8jIDNv5JN$4#^G!>`3Mf8L$k~SX6iuC8WVuy*G`q`JczfpEI$$o7!1Qt z_b-4mLRUtSg011@qq}~l@3P&Szx3@!ILsITEAK?B@z`(lP9=+!cTCIU<)d}5DYblO zcf6k44%&r)PE5Gl@~4|CL^>_e#8-`Ra{SF4KJf9c-uXQdf|h%7)c;;Wg2ks$=IJl* z=es9q!_^imXim?PlR~Qy+_qs9L1MEK^6-OM9ah|G2U;$pFme0GWbMH<<8XpONoF*i>4 z_tjO}6(dZ~n4HsnTa85JoL;UQ9*xkA_%gpF!F7VjD(zz{q}<0g==mz!9dQ5vXfkuL zBi!=+c<;!xb`;Gh%MT4i)!f4aHGrl-egk~W8z6x=nQSf_^IXp#X|{(gSZ@0@O(4Rt zL_wg~>nKuGeTX6hX?J3IBy9d?1}7aklLf?q%~?IQeURO-Ev~s>TM<4SG{hjdBg&?$ zc@_6S=a6O+c>;N8?bO|(u$J{T=rhe+KBjzU^YNb)zcw2XvNRl&o{W{H#=pfpoi({p zQOm>BEn|h2y7d3P)P>so_4k6s#+j@5lES%KQcdFXCnZ8&jKjfRUBFJf2$EbcahFqR z;vKK3_aDb)q=o5p@O6mZ&HKlvixyfA!}OfyWQZ)Aw^`GMLtMO;Qnl|}#cYy%j0sEg z#3t0A1h^ipIn)YEZLOo3LHrp`S8os0ky|V+Gcf4Y9!ftKc%9QbW+wc2q5oNh3qaWr z7LlPQIfkz&6{@9^XiV$YLhPR{Xj#Af1h5~v@6)6{3}Z_`TEdq^;)H)$I9M-~H-E3* zJx?-U6%N&Eo{E|{O(t}TVV3YS|I!5!qKPz+%LghU)637bZ=-1HOFOG9RMI3 zmf6KIV*4Dti;EVxzskudLJuF~=#d~ekTcTZB#`T9c>ygYY+()*Eb{3-zpnt%yZ`O& z(0P#4k8R^x)BV^;XIzK39s`2knk-L{4%~UZ;sDZ}#xK>$?Kx8yjc?=-m^60|yzHS9 zrU9Uea^tR+5S6tM@`Txg?f#@K`|3c9@F$ULv(Q(~GHKA}E>OKeTDoO+P8KF_!r{qX zzo*D=1z9qwuE!C3ve;n>G$6oQ%H(eF4?+&$;6INzKjQt=>%Cy4(I(98$>R^F=>!Fn z8GTsn$HPd6A+QK;1c+02t-t-hzr+-jOZ9+JrVRupO=54~)X*VlR_kyU&YZw|$WBGK zc*W-ipv|;bQ1b!CZniQA9vs-#NYwtXJU7$M9os<4f9{s zn{1;=Wl8`RIBjynz`r@T)>W*0ClIPbQb)vuSF&+7)YQ}#FRaf}jLeIhxBW!laM{(t}=UQl15gWCum_oLEmi^ z2MwfpX8p&6T3P|RxWl7u`-_U(L^VeV8eM8*TWw766yC3CdYgl* zLYB7DguWGX8Gl3b7c#z9Tz>uFu`Cr>Vf1_Gl%{R|m{yC;UNE@%%0*~gCZi}Iizv02 zd&+C~^HuZO=2(GPmB|>~cORO?_>OSW1Gi`L9xn$HxzeR$=`S7`uUA&SO8ihy`4x*+4)7E;}mH?@p;vOXDvoRwp;n8f`jHZA+Jw_$^3k@`GKs6)6f4+<~ 
zT^pf7g0!|@VDb)mx5w8Zu<2~Y-s1UiFV#S#j+luFeW6Aj`ywAAI-xHX*;9l}WErU! zFTBPIRFE~8HELNaO(&Q^($>1v7ND~;T?HE-Ldb*$^o>!aA~>mlz(?dt#FP)9Nj+it z0xdWD8(%9^r*rEYyj{HfsFNn{8B8_mE_;P^(gsD6eAbfoli7O8?dCNgzR_>QLMGT4 zWd8An+NH$dFrP@5BkAJp3BzIlp(rBAcHW#jF@bEyM@Pdn-812G+BrooIIe!L zReO4UqERlb5N8K&!4*gPVJ+3;qfvV5(o+E#)rSzaQHWfGQNk zj-V-?hu;eXGtUgSt1*<1Y&=)~ePU_%5#=6KJ7iXD_k(msFU2_GPKQ$dw7dLvbv6cQ znax%Md07y~Kt-zE%7{cbm}MeK{qq^^mD&5?IDguKEum(^2vzkHraSyjs3*c!2+5Z& zHYuEU&(9#NAGNYERW%uX0qnxWQwH5SC|!thv_~Lq@J6_RMo)O#r3X9m73T{X4*ml? zkQK^!4AwCAE?-$p76f~m0XdqpIs1+qf4x|(Lq2_NsiLhac z2cj2k3Zr3~%s$C@76Iyt3#%`Ss-X7dw>S|pSn}D1G!j>wy}CN}i(=%Bca|=IADrBr zyHR|im{Ar-xGk>KM0_;h_FHjL93_bhuVOZL(U47Ig zl8N9!GxHjoCU+R%!-CAC5JA)zbTmeURxAoX4uvB#TdOt4hR*Fx{g?LEh6?aVT|5T|&hwXjWtC(5R!p1#74nvy#MixJw=%*_| zFBL8#`-V~*WXg)WF_J)(?ZbGgzHpx)~9Cca0B zrY|#4QrZ-W=(~6pD?J`>9CY}G6gsHm&p_07+i%jb@?t`6v4R8$eJiw9|4dg56pZ;S z;Xw1yV_nu4kgu%UpuN61=8Y|*)l@fr?d|pwXSH;1zona*w3Dy94&ogj@ld-DH=;VY zKc&J2gctjMrj4c&T;cto?UL{zuw)?qXZmN{7hvuOjCIOE*1(}~zv8o#E0aY)tU*+( zJm4v@+g=Xm>xDdB#Gtr(O?<-RD+CZ8-qB`3IvedsuNd-JRt-2d3lU1dx*quz$^tPy zYOT1ClN8~!&iO1gS5%r^CZ#9Ez62v@f?(7MJc*!{=2I=?I(<~Ba4!0MUbS{lA;?$l zRl#AbPCWi5rixC|jk6uIikB3C8|JV&Pf(bhZ#s7bl{;x})C@70^NHD0%`_fj-VCo08u8 zCODi8{>*o#W%mm}!TrxJDjJs*9b{u8XO0A*q!h75g`do>xJK$&Aa zq|=b0NbFQR2{?5|QHO3}BXoe*@@MJB|7?I4pI_!YAI-(8*J7hcr%CFX^gm-Vbo8^N zt0j<4_#^jDBTdu)l=0U87Gvc9!sz%v4CiY%h9_kA@-@;i>d&{`pz1TsiFK$a+q}B8 z3~Ou(B2X4HRvSbAYF*; zw=8{M?l1(S0(d-eT?po<|CvwV0j@}SYk;H{_Ib3)Tt zdX;kG)gcwt-=@}}R}EEnkB34Fey{G{jR5f&YQhJ?$3tP36F=#46_2M&MZ4`d^#Rg+ zC(>9f)Xj87~=O38?(_dA8A_Xu(g~fd5@}VhT_d z1rOg|S0l0+q;DY!j%qP5uQGgWYz&B7Naqgh;VeO5z4N;MWiq;9kqkBae9Ru=r z$)Mq;qMAH(0H5SvHLJyPKg-!(k0ZK~cEb^%8U0bT)87Kiwz8EP=hENe)O{i!~f%qSQ z177A50K|hoMaK?}-Zw=c`f6bZJ3ogd-sAUT+^eJ=Ykvz$>be49LBUWs0`*g#hR+rQ z^sGng3jY`Np%_Ywz{{9W^Gn>xtu!ftqHgOM+ujq(6*qvaNjb?(tOXh_jE=`SIRQ|i zlImsk#O849;Rxv(b)0irx`b9L@6;O~Rn#BNzRi zN-)&2c^`f%6VzTN==>ggTZoq^oKy=rVSNOKN2o-z44H%d<>+3NJ@d+=Z2?yN@XTWH 
zo5+A)&s0sM>;eRK3B)hSA(P7dRb{%^U_OZjbdvph4ftVqqSme*-xy*(bY2+BB=dfY zYaHDX7(-hnCQ?(P(KJy!Z7yanLK!J6Jnx@L!gN@w4X59p|HdhnFwEXWQ6j6C^tKzcC0ihEtU zJh?l`L-wQ4spVC%vqxJ}2C6<)eIo1SdtQ&F=MFd#zbMmw?r@xt&mct zRG(=v^;;axzBgQR6Lzvgl!csq8h;+D_HzNbUJ{r%PKTZZaILcM-xW+~p{Wq>pHm@_ ztM>7PdsV47jbNC*Ia1D5KUNE5h=c*vTsefpvK0uf?|e~trYt}Cj8ewbp^p}b@}d*5 zFM@)#a!T57Iaus#=TK>uvH2otQc+aBDO}I1Hlz#8X+hhj5U@ zdv92ht1MrS7-dR$-T@IIxbgYgYGDLm7@G1@$iF>i1SzOn$UlR9UNXrBhM%n}Imov9c zuEKdCl!-4bdas%;C+Z5 zVFW-Q2uv2hP(x=kW83|E)iJ*rOI5hLjoCCG7NLus@si_8c59BX?5~27!2V0O zO5Uu8bd=CNf32S>Y6Fd{d|76LanRRQ1KrF2sILg-0O+T6=eC1ki3;#8SMcCudS`9z zeR;h-E9~>Uf%hRp&Ng7wzV|%V1Ny#3THv}nCYDm&gleGvehKSmq@U`gg_Ed}oQ2Co zX$rJK_V+v@tX-DT+S}~sZkIXALXkFwQ*oFmw|)l6u^R81&)(BO;|9|MZhVIMcMtkC zp85_6v;pu9+-i?KDtOf}es_ZkoP{-q_`niWH^wAXfy08 z)UMEHb;A8?Pi4P~b+?1|fW^J=9lvyi0rknk)PS?PgD!;XtNpXQ6ahXppv(ESZO!q1 z?>$Qdx2NudLt(;JF_!~hZx(xo0Y9_u5pz=eE2wyL^_G*n#Z{Tjkx}W>bM#aIxy+T{ z4GC~rAS!Vq;oc$b!z@wAgJk0==wbkvDFf02zC^i4pzSxh0!hq8z{XE{Q0spAved4@@<%8U@YN|$9#!)D%3`cf%HQtYYExW# z6Z>aZ?fGZh?{YI?$LEq6X!W&&?J>oinKA9ie(99b7jJ?pI1{$C-br!MkPS*^^C>}G z!b0*vxfhJem=|y9PkIk)uuf;%t~Xh6x(_EF^K=4DZFJ=(BtV0?5qbSpTKL_<8AOrz z-w0GCxZUirfp~h*`i|{e_pli?G!hB^ok#?xzpyOKwymBdiVLGIgsTJUl+9AJ`))_f z3m6zI-i68KJ|fB@Dkw=fgJm59gFL%uVeyI$x-36uuLxq;D;9I;<|ea$8l6?W1Ao#L zVf>Qrlh?V7NlRz6!aO(4YXP+wxMG!_g#cXK0$9ffPuKPI+QS-*D%c*fL~MEdY(ey5 zY+$cD5&$#bzCdd}HvhvcHP+fKex7_Utu_Jc_dGA5cijXmmt_DGR5jjsQWXQvGw$R+ zpJ%}SLi-gl?%Pd-wl>ABXbl`MX>`)v=Q(d|rXBr@S*7l>sljwA( zm9VO3a5HObao=JteKwL*-s~x2jdHvH%y-v(-Y3{#I_QaTsz3!1fN5mz+!8PWPiDEW zSECq+gAU9GdR5+83%n3^Ob#QMFwru26uCzGvN#m5t&jT}aa~Wo{-Om}YmB`_U-93E z;?>fnFrW#U{{#L9a-irBD?0pG673z zmUhUw{ne60q~CEbDnE*TdU=$eRrn4#MD7dQ!Yr!nvo7M*%czILY`fgI`hmUsnTFfF zv+`X+l%EzuGUuLy`3wEw*0prgf*R<}9h~q`K>77AV=2ByudajVAZ`{GxIc!p)GmL^ zT1x@~{EGGMRO8!gL#CIcIuE=U-%vRQ+_B=$8dqE*(!pg7KYF}dYiN9ykVVl`YV5UE;7e=k&K z70da@Ij8XF2DEaUR)DZ|m$Rh91J)_3Y<@$kOk-r}n+m-bBvv(|B@*Av=ag(aoUvXG zeHgyn(}#4nRt+K9?=^KJp;o!euXe*)D=IOK<*s*d5Z)CtptpV18N7TjMg@i~QaV~( 
z0v%V^7J1%{(c1abV+1;-z!Z6NtVq-QQB2x;;7D-N);`aQUHocAYkr zvFYB$A}O>+l!HRI_y)YLJVE9aAE1^NlE(6Mp+V8P3_w8iH^)N(B|6h|wIiA>r(`LF z{pK55iPDp~#DPbAuDGmDj*@rj0J>5bqpGzTt`npFH73snHiq*ji?N&O_`&`6;6jta zy$f^UmhvzPcI0aT$!oY>^y-jz zUg!gFcglQQVZ6n7*`{EI{}2a(6vs1i4F)gz>^Y2RoX2*WL#4IGWCQt)dc$3-Hhb#P8@PqTS8B*H$Np-8O$L+>^sVcrCH(IJ{`4D6W9}} ze0;z4luSb+WPHCyH;-DwT(OaWAXlTO8gR!1d+BnA=tRdQx!gWK;8FB@ZA5&SD9CQc zF@f5|3l~(sSFbgmFSn34G>{ki@J+3VQa&UdS&7Nx*(xQh1u+X`&7iI5d2PPsU~gtI8B2*92xs_C#Jr5Nf_>L&!%o{Dwb49WMQQpr4|`a!>v6rCHHRkJN? zh-5Gj{g&dxenYpHB(-QLS&8NsC(D|Qu#S6Q%bipcin?jnzSx&H4&ao}p5PqTQNd94VREfTNrD~b92I4V zX8Xy>b6+$rA<#wGm+^GFG8w0w6Hm98y^&7Sml0PvnDkvOL3#Z&)pw*4^B1vK7Rc3_ z+vr~+*E+Zd8CLl081%Ee%{6BNuZB_24jOGu_3QC9uzeY`VQ52Od>J8`3K0z8pjaB={}rSDMyl-SAUR>Lqi`AB_H6dhA~8wIe{(=>cRSEvwla$___5NPyJTVK zF~i{`P9%6O6?CdT+4cqs(Kr_eHF*@s15`E@;)Q!CkoimLaqS)#86QP&3V3zsXYd_I ze;vk_h|?6lz6)-#ZnPD}oiwm!S9%eN@Zw;tCI_K^WXafmHv4bITvyuSe&vHDyVz0{nouG^_}+^AU-sE zOqlYh?iglN;IjvKRAT=!1mStdAB-0^139nAwT2RX%((g*PF|Wbwo*Uv z`4&8?(fv-mE2o8oib6{c-$GWQv70Fjsy^Y9Ts{#5O8uZ63}PR7+YqYXIRG40pNm${ z14d^{V=L)>J#DawF1O$n2U$aDfQ4}wK^6>AAUdx3*A>LjAf;gIi2}T8h8XzN4??kS zc6{r-0C2-%N*umuoTO;KQYR0hXxwRIG?(h({P)EVi(Sox;PVamPKwTbnf(KNGDl zW0>#9f>28|;HhpjrlqS5SuJ!e5nfE-^ELPme;aGuV%OzhvVj|LTfc+6lObSvH)YQU zuzUU@HQ=$!h@> z76cp%qQ(LF_sGVt!(e<1-)xmdRuPFRn>f@0OHIJ|tx?UQKxpN6bLNoaMpEJXe*Qdo z`{s$31ii+m2eEYX8f*eGqMae6?=apw(+s3G4q0E*%9Fp^RwXR^tlfzACR4;9xSJ zfw6j-XcYlJxs6$X6bWP4ixHR{_?ejCQV#nT+e+uIH|VH@B&Kg_KPHZT@v&KBGz;QE zMv&|7?H3R`Qo%}>Cv1;F-CsI8S`lHYoUuCwA&nQlxga2lO!gD;yK zTUbkcSad9OTKW>l`xS<`dTK0$l?@@eagR7BVni!MD~Bt|qq3;hW~w63ImLy}ayPpA z;D?9+k0JGb4m>zvHok=zo5A}Qj&5s!7|ucb+aUFbTK}4Dk65k7-n*rCd+O+wWM`Sw z{Sfr^YWH|)LUU5o5c`KVpO)TqCqBJ}sC`|d6|30Z=FJ(?rdKKqqDo15o{8zOmZSGE z#%pIuZ|3JZu_OM_;5Uu&LcUS;2AH0T@-H>FAIc5-vS=zAkY02zLmoF>*$bO5aHvm= zGk$nm=7oHe*q6%0W536$k*=#1-jW?0 zgKRlBv8VhI7&F_O5+Cow$Brn-|xJoZil-5$DpN%SYnKpK5m%qZ&0`w3y(!z-YcQm+Kuq3!Tldqzpl<0kSE>Skz3AFTV|C((aoCWk`Bx7)pHzUKjF@MMemvIM?h>E=#WaJ$(W0iSGP6(@mof}X`5MMjdh57+-vVj%%g7e4-7(2{>ov`# 
z+ctUVyk~d(4XqQ?Zt8s;cGPqIc}iviHkIq9BRjWojeY92?T))hCd&hlRw{3Ee;hFi z6W&8#E75JXUff5@5-w4WD^5{pt@uV@beoG7?JxfHdiWDnU@^m-kFCN9YfFLD6719KMo8&4XZuKssUJLWXsqCH;_w<=@ zO|FZPwNt@G#OosI_>CCVU4u!(ynC>%DD5r_kWUv5hp|yKTfn(ksMSov76r!}6Gl1M z^`dT7UyDyAvBkw4&$=GWS^ir3Nnw_B*w@Wt-iC_;X90>48yT$gOmITdAXe;4Ll3jC z-=2`c_*5at!~&k*G_Fl}Cee`iPX=jH588|0`|b9^T)$wF87%;13V3`8SVA05m8R+A zU@{3V%hfM>djcKd3rZeUheY`t@YGSOX-2864F4M$`9 zl3W()l3bQ$nZwB4PFO3b5Jo%?{J##4;rqKTjYNgrb&xdoeUwu}|#O#b7nIN*% zHm#vh7kDtRHT&GYKog$}0?dklq%)sH?YViboOs7~vOL9#2{YXr%;*EsPW*Cii!|Ao zb%7Jh&!n>lz|+PdO>WP);%kBQr<+O*afx&!Os z$r;u<16rfX-CIKisff%Id1JpG@J3-?e|RI)Y zYo+Zrgn`{G1eGW%73vYktz3JET><3nwgl2C5&{eiPzMm{4jw}f#Fqnk)w_%*FQ0&; zOw?MMK^B@6|2}_7MuG63GQTYjt#|9f(Mf^fzZC@sxUru-vf6xLw}dH@IP@NsAc`eGi-O zBvKE9K~k^qNJ0KJ$V8>l#xsfNp@}%$9|9u?LIE2`sB_ekTqU2)JT=F@z+Uu*Er@gH zFl*Lo&JN_&iA3d$(L?@C3*gQsEz7UZeT$`w67req8U5mOKF;-q-70QRW*`xPT!ub& zLswpRKcuqdmDK|m@Xjjq)Z-Wu{kmNo<;**rl2Z%?m0sOFiUGz44>V8C9X$?13*nsm zh`;|7e1;2j7}Y+Tt={a}PLyr*aSy3x>&~D&b>94B0$6E9xprkg-rF*svZuBI*8r&L z_sdO}gjI#7I0r$|hBdlv6LzbU4qUb3QAq6-rn-E94ma`oFB{?bHerpcM zz#9*5zWmvrplD)6u=Zhy8qxG8G;|8`;FL4|#Ele(x-1HjgbVHde4wQ{I-OcA<=d%- z3iIToX5tv-Z$FN|&gn~#lfXd>OOxN$s>|Ztx~A?)*R1CAY)V|AMrwBE=W*p_4t;^? 
zrrfu=XhrXAb z_BR!Y@yq6vgQybIIC;DSmS9;U$QO#t9=Ow_^zvpzFXE#+8q`3xjH(1iQ3uoW>LvA3X-NK+k$xWgu(Nbi9OHM^M2JZ( z#*8jE%wHaMF1UW0vf$$2+T3<-E}U6kCzI>$ZgivZi~fM!(E+3_it(;TT>sJSSZJWZ zWybu^rFutRr2Xn7mH}bUSa7Kor!MI&SN+HGyvuOGqrjKkqZ;SYJ9#o8Je-#O;KcFA zr`iH##b%yKqDX5P!|T#7y3fr?_^0U{>WM3^N$I}(*T!bO@OA?FIU}_Tcxg{RhDZ~c zi%!M1Y$+S8xL;RHruW#s$=a@C+dz=$UY6Vyto3&P_#XcNCWS0SN#L^N)lF}u`KeYE ztYFtz{%DNan1`>W-c#bCbnwIbW#+%;J`~Wc{5tTMPLjagFnJ==X+nRQIuO=Bz1kg| z#ZQ+s+4Nw9IsAi;`m7|Smnk%_JbDJp=w05NK&xVX>?V9Lv@LESe)OfcaFKo-(((hT z_q_$^3ih<6$wH0u1(87x2DGd5Rr=3e9bSU+Not{E%V}?-pJzc03Ye?--Gdf_Qd5OI zH~#w?fp^OkvNfC0Pnb|VX}#S}elq=K*>4MPh~8Eg)EZ_aG5%iiNY(ZFRww1pD$VAI za9g^9snNOe?D@HJ=Vu-mprJ-bcF3uJHK@0FzeExRG0OgtKaZC6G&hH!{2Rd#h{h=> zkXiZEh8|=FQkA#|lD0G<+!+<>SZ@$}-x*h05my`AFVUxZR-C4MNC<2*Zpp`f*oVBM z_B!W~U^HXi$3Y>E6*7eB%ZUCS;ZYPwr609Q4CjeKn{JKJ_T(R$Zb2vzv$3EUea}Q@xCZMbc8)Pm_F4A45CH`3mU)n~u^VAb-w_%$iQWviF!jbc~&1sVH=K;<2%Q z9hV%4tPv+gl58V$Y$uOgZANSPUD#s{$DS^go59D(=)f%OantQ#W{x z&iUJs=}83}6}f!k>j{}(N#L$qWb_(1z2q^H@3j5-rhA!?LbxvC2mRh8NLxNYRA)Zm z?5HbvZK`~Y%a7d)ckG8uWV_~z&!4*WlG|LF&o_AFU0{?QxSAL%7*gFLoO zCx6TFv+nPTU2f&TWu&=vKBVE@b-Te^D6gVacy!?eJO9H1k|v|!=N%2$8If6*oFR6!_YP>AsL_su z?1F%;>p;8D*P2bBN*<{pI@Lw3bz5*AXJk?Ksy>*){=d*QKt10zH)J^-QGl<`ZO~pW%(!K)CXN$alm$>{h9iI>8 z!z0{EMJ5Es--)ixLBCnF+eVD(JvnqgcK$WZ(J5Hq>{yhDtU;iHoHF}uY5h;M%K!iV zAI$$d`+qS;{W++LhEonZ3~rWH|GV}7?j{%*ad<{Qp&fVScX2ezG=nxkw{VcL`#p9-{U(^>o9jaO*qU;ZG4{_vusXfv1t z&G=bX#LPcSeDe^7iecNef9eju*Uee_d- zjsPq;f7=57hlg1IpPwsS@ouopyg@u%OUGg&neMDZO=dGjG}mmJkyBnRqoM>$T@0pI zzZus4!_yYzz~(|>5)GOAI;O z%a;-k%$>ojIL7^B)70i*3H5iw+sCjb3MM=f&>kVm-70xhIE>{<54_$*7m-gq9MPSw z+JTWm`ETsKbyU?|w>C^E8v#L(kVaBc0qImiKtLpF#c%MM_e-yPHin z@7mz~oacGZdGGT+-x%LFzCX@AlyNiM`?qSYIj=ddYpowyn8=gV6{#(xuMcY=pz3kT z=^yc2$+ACEUVPz}9n3`hdqVjAWllVs(|QhEIc<3?0y%y8d7()Zw|vFhzCre;mEqpF zl5qBdsU<7={?xO{?^bV)b5*IQ@Bw}Mp!45jHjO9BqB1T^7{KgJW$1K=doyiJDOSUO z`%HD;ArqD04fO4u799Mh@FF#3wM;ywn9%A&1BQY|S}LLT00e4}_sX8>NHGsj(ry`K zHwLgH|Bu}47`%sr#s)wxZ~tPB_inKtoOl8 
zh%x+oq+k2P65QS~!A?9=+uwAQ{vxxr2~tnoe()BDQ&O4EZ2vsisJbKmgy*&QeDe#7 zIzE&6x8oHKiB$+MmEMLBjSW8fq+GhE_G6J7*uiPXzhMWZhzKm+d381S`^;vTtzk`z zCcRJQweHcCN)@5ui;MEzW0B=NM(U|6dxAsDpy_8lGe5hmR;h*D9bL~|h5p(jQX%VY zoQh2jReHL%m~n<#!|3C5rWy(1`)qgXYQyzpqL@Q=!Rd~xerJqG{FS=@F7i;i z2o?}ja9QdlW3*9UQ~pMF>P_XVk&)P;dDJ@>+slyYZFN5_-K0PGo~$>JiUhrbWlnvF{>(=W9kwV8TQN~WoF$eemJe0* z>zwTPP>&T1o z5~x!D#13tY}C5ejTZ1kGBkPAA&ft1f+-U!x!K1g|Rf3217SW4usg;Qr^5<=6|{ zYxC2?n$t43od?ObUdctGjJa=u9d%teY{o96%5M5on|;4;}mIL(yGAjlM+ za9Y>78%)aTD^2&Vsu>bzqv1lQobg60nm0{@cLxq?NiLu^m%?Sxm^mM*>vlOteP()~ zxR%k5iJ3+iv3Sp{UWZy&r=56AG5>RD!kVVQugC|d&y2)}lUx?V&=2 z>*v9}8Mg(s@yeC2gQ9q{uJyHyyDO4;tuNaHM3Dcft_+rAzIA7`I`QgTk#H9Anub#S z#rL(z&X_mD#>y@n&nXtrF4MrR&aSVoEbq<)AOCjm!DCO;qC&N&g;!2}p!D4y6Z@~tyr~6l54Bg*+BVC`bMl4MoeN9`|BF}SCispA7Uwl)Kq_tRp=6S5gpeMJR|fI zxp3rN{K0wcX-S_`YAj71iF_C@f*D`UC)}>-#M*nt5~icIn0yDuCb9TXAuGJXl!j&I za>kRM4aZR*ps1U~0*bmwqCIi)S>Lc0pSiN)=ITnD6&3k3#9viqh-{R{8aLOj;IlWC?)dL*5@DW+Z&;AKc=FjrgpSX#dCej$a z9T*nziY_tPw{YCN|2?yc-8cg+GT{rW64qThxH~S-j46VzMGHo)+I|jYNo6ixcWFE| z5~w?zyqEp8CQdWlw6;c~e`B(0o0iXJCnaqpwJ<4qFSttPRIxFg3Ep+&G(Kvr;7mq@ z`?`dMC0AITjO-Tk$@;4zxT1i2>vVpbE#SVNT}xW9=OSmOM~jJNHF=(|`klg{`<|N9 zJ)9oN_!^G1O|FZH2?p~w;lGnS&qMnum*fZKMjIHJrk1yi-YgMNEo4ZFeu8R82(^k? 
z0+y-sQxwi}7nQB(hIS=$l8Ndfv4&;mr9wosE>(CRqoMk8wSw>381iB_LT@^+KB5>~ z`lLhnb+>@c9@>=Mlp43A{B(W9wM+W_A=5Wc*Lu;+M4-Ej`-aE&9cZX#PG8x4oRaOC z!xKlw@4t7n$t{j?txppFxh&>e)Oh`~yk|Jf(e!D}A8fg?-NPb!>SFI!oKopL<3{6M z5D-l7z)_64)|pHE3U8(Vbq34z+OT*}a`}gQIRj_+9Qs*}+F;loPCId&Xu5@Dg~_<& zfSa;txT~9#c=&Ui54FjmO(*^f`m22t_KCs{PT$n}Y7o>%B`>Z%VMkm#iPD~*w42vD zoWznof56oG%fJb9-F_QhmwJ|tixr-So;A`@@>_A5#ijG;-Lo}9E+_Gb+;6@b1e0Mp zml92#)DEX`KbzFJ=dt#Zhm-SRXudH(xw(u1DYa5QlpJj`8Hd$W}l&1}PO9%?G zgGxh*e`>uKQaj^k6g}VtQq4?r{;R61s4?A9}e~sn^j`uhd{{F`$IWz~o zsn0+6uC&&Zls#UBE1XI-{B$_X5;k;*HAHSQPbB<7gTE*4W3Gr$%dA*qYV%bns{yh6 zm%vt9=!~+y$$P990XV(lh4&4GxvT=>gIlEd9OluWd3k&_Um7&YA`9%DIPH6hrg><_ zn8bP*4mzBIvxJ{Gl(HV}@QJ&asS38?eC8b6-fsK&rt@e|dYaF5v+S`ma}wftIJ2{+ zof8%Jc}o8;QNKMZOr?f1NU?qVCU}*01Zlc+*2|^!Q%WUoIzd5xFzUVN)K(3o}k5^ z{;JLDy}^YX%l`ax@iDyBeaanF`37-v$HhCRPGy%|4jgsYtubjV`SOg+eCLWTOM=9; z+TwQ@i+5#aRAm~j!X~$vtJRV=@$BB=_$wyUVTQr7Hnr(2Mj-?t%#x~KPIkm^_4F9q z#O?0|vfu=~%?)`j_fdvwdf&Kp5vNR!>odzHlUcpmAS@oLp}8gR?<(Ii-s!Hm!?A(r4kzy+qei zl88b|H|;B((FOfz-}`#MS&tbd#?h+B4^iLDM6I7Z_SsE4s!ULqMqMF#b#MNKw^f!F z9dpC}PjnX}bg4@dySF(#@ig>&6AkbYPeK{t!VyL@UpTZfkTR7xytKS(SE+wX;wbt+ zKxC&5A7ccHO|!Q8?7Z$#Ag=V)v<-CJ_vGL)$l~kHShw;}08R2f6u??HI<4V)KHq_m zQaR6$&uL{2V->xs*;WJfcP5O#8{j!OYKAL*h-g? 
zdF#&0p)_Yn8G6>PGkTgf#u$1<;r5KK`)m91QXdCvi}HgZuj3k*(Z$x5>LnwVcC|Rf zR+Ozm8x*7V@OI7U-L=J}2k5WWB?yN}q=;Sz5iuNG97j~wGbMI#Mu?m7mc~UgKCGdk z>9us;9aE7nd{zPH2Zf{=KQz0znx~L7D4boI5s9GMkU7c~Ymr#|+N1ZIMtd#d2ZhPT z6!(VFggMeDcU4f0nGAo){{-n1rB3k=UMH>elD6)E7FAjb;mlAFgk<8M{pfUiJ7qFe z^=>NFFN(;gWl8?%S?*8~?@*Si=&4cdNu7OD)3Zmmd;F%7D<)|+^tQC;x?cK(C*iuN zxiepzL_GOe%~vWGnhEz!$7q+GeoVNjJ8sRe7VDVaY0Ye_8!>y{eJa=0|2)CpCbWc9 z_NAXvlK?71Qhm1Hj`g{}_BvHD#?*OK7>|Crzbstt;hlT8 zn_iC2f4(gwA!E*Sq%UaZm=@Vh_&K8~60=kFTu;YfUT|%9)?x5`#r4Ud;+fu}9(Q^d znZ*X3;x9=%ewFwR#rA~?$YS8jS@)8%!=G11yo@c{$zNq?YnLM>I+7WFoFa;fic$xk z>QG!>FkW9hL>3NOv*{VGsf(&`9L3~$zYRb3qv#9HRMWIJB#im^53Kx-nM$&eLC zt6ZV=EJvk=ZAr=l{dC~))KVL7r>)X;NqW56&}`f<&$&u4+fpR!MZ|oEUAlco!2E_H zaYi4hnS}i*W^Z2AYzd3?HkGcQKFjIW3G=n4OSS(pEeJ!8DIdyuRiQdXq!-{t?&&5{ zHGDsnz3vh?`mH2lKUp#6+8b?Pl9RsaYjYZmlO-CD&=-C}Qg1r+;Oy;#vPh;^##|cN zEZ^P1Twjia2BrYtPlHAiH~JSM2-QKK`7z z{7z4tnAi=LuX9i!%KisL{f;m-C%w?344p?3sSL1QLuvu}ng&>>Hoa&gG+Txw?@IaMKbE-$;+SWjWCHdaZC!#>}W0tu~MQMmf# zkY#=PxI^4aWXS2V)c<^tqS0+kUp7$0l0VJ;cy&Ra4TmU^w|(Yo3KeQjF9kEjYZ{@r zDA#TU>8RNA6%CsP(i&wIhlmD04*9z{QYv_|$6;E}=nYKUh8Nge`b}d5j08;7oV+&g zFE8;$*e^?V9*JZdZlrXOlj5s&Zi~6rNJZ)J^lunF0kg9yk^(;!}=i;&J$`N*jPb5Qr-nTc>>&64coWD*H^wLCw)O+}Zwhftn!jAd z_hYgD6;8!w@SAMB!i3OfV>o!b?pQ-sM3PZ|tv{U&T(R}F@+BRc{9ObxI0SL(-6^}7 zzLi9`f?>svARH3-vxs#vAIR+2LSIMt{Nq=u*Q7fud|nI! 
zYrci15aqcIDF>Rz0|aTl~lQFA?AIv9GV`3UYIqsrH#P4OLUZ1lURd%S0>sm-NhU8W-i>hiVr z7FOo(FZ(t}zgy8n(8@k8+7kT<8xC1K3Y*z<+;Ug6_S4V$@UcJ9P@E3{ul2wck_(PW zG)epZf(-@^VUDWt+$e~UqX&-6Y`WHZ@s$=0)uVfmY~QE{_wk=LYF!>}O=Qbwzm42Y zx+`4jd3;HyUTys*HrUn!)=d>`gWpN;j2gA0@3re34;eoAAn`kr>6}6YycECzj}MtY zbui)JG2uLRfj6Tn*hxhZ1g9OIiHhIMq#_7}P2>AT6DR;PO4E{~-;5m&k_P*Io*U~$ zBxfWo3T$lz8-YYI@-wnCSrQG9OvxY~dr<6d^~#w{$dkG8ulHB@n-C!g-7I4YNGfpR zrBa?@mS9o&9Ndml8W6me68F}cU`j`PCq8duriqkH6C)#q+*e=fU*#`!o1hDk!J2Q+8-&h!VqgU-$Ha2m(3KmFNbidla9J$}L6n0D zIWipfmqq5y5n-+P!y*Q4@#-2m>^&R!x2>4LT3PU&T!XdZcx=|83R;QTT{HSh|E?fy zoYQjgRrPX=QTSVAue`MPBFW%67NDpTfJp0M08vJ1eDU^n0li?94+Ia(Hwf&?FdO@v zXjJk*2y4mPt!xn|A+k{zalS$N_tOTZ52EbhMB7J(`N{&UeX|Xj7lQ?ZEY{7sy8~GQ z0|my3pr8y`>x8@8h+<@sKyHJ(JyF>!8#_^E_>x{R&oAwAzgNJYLppS2Z>cl0$1fnN zM~XKRw>sH}7xY~NEcBkl?#u;d$(61L&+Qx(BfXVsS1J8KpRZK&3%BV`yz1D$)(tRa z+AYWZz=Zepr%O`&iegZH0rOoamsQ~K4YJK33YcDU$w^k2X-IxWf=xErd&;nnFCvQ% zcRd6xVe$bc&9B?A5il?woIuEVAnxw|;5Tec48XaO%#zdJ?rWIlL~CA1ryYFd-YFuc z^}Z*yPIo*{REBiCgldJc>@~Q%FRR2||jftwe{HyJHPo zgC6Kbj0ULQw}#q-d#s9sjDPX zt@B0wES+Y{EIiLlbWI_3J@FVWP?z++hm(?-;25wzn4L=1f&<*fFW5RFqW#M_L4a*X zQ6OYbloArC^#FxYVt&MsKbgm2s%9D*q5XAti!B0L}PAdUo87)0<5 z-g%WXuCXNcvmOw<1PhyAR8hYp-f4(0bw*`2d82|G5n~5~7ia7j&~3gaM}CXyguZPy z=sXohRnC--PeEm_4WpKh%X;py{e6XmfX(1db72FQ`K){O+~sh-D#ef5*L%snBq`u2 zIh~gkW-|rjt)*E?=KVm{uWYzET0EF4hQZiwPRwgxq*95?X)?|hL5Qlq32w1WPaycU z)BvskA1pU|$M~cGn7H_jD}4Vid)h-DyTo>LT1?e$lbrw>S}4w^uUa^&CrSnxebDZO z=mCdKJ8)6*y2dTOpjIGRk)GzE9$l~7Vj{~+T2ogM{0;_26;eJ&>p|xZZ{kedMR3Svir>tXIwZhP9vHgMgNR@^51e<UZkTE0#Z%1QYpP7#N~6ANOZI3^A}Z2EIFeQ&1ZgMabeIyF23-tRmN; zZc+Dh2v49R6$|Vnoj*L>4G_M4PIrDZ`$@8~_mT2KTPVlolr>llEHinMPAFO)*r8wc zQgVKc19vr$DO1_~*isa?mtbK`J7#07l;%kaAry^;r>(Ro=Ut(6?GK)s037;$@HeH0 zWT4~e2&-NfKDcc(3YYl%eNz57$K!2c=t^&5qCMEpZWqa>WTgI$AWXVjgKkjW=hD;h z*Y*#;#V~=f=}8iZH7nwH@+GaqM)tYwXe?cer`+g_Cq<|*6U45sR7{$eH zJldYhU0xzBLd9iBVo)v-?wF5#Kq313a75D#LI`l-<%+=dvzzTomI(ZyMHl(Mbd^l! 
zkwYUgf+N@=2W;k~oCvP>lubXx*$gYMcTq52M>47oj=hS}tO>v@`MTt~{={D;dTXLG z69tgzILNa?ZBc+1{4?NoYS08ap?C*|r zGB+5%Lc{>ZEdUEb3-|vn2$fOIDuRlXGZ$3c1u9J+bVoWZOdyD>4D()?U9$>=IF6XzuizPCu2&P_n8g-6+Bbsi+ADXl7H1TAH&N}@AaZ^xSt}xTkY#+^TD<(RK2lhyo1DPs)c;P?1qT`k zpcd7q#~pAUi#H}o|92)i5Gk1QBGTdI0SL{pu2-k}V$L52vxIWr6^5kYJke3eRSX6m zM6}jR)(W;4>aMTOW1_(OMZ~_)$YdCt>k@NWdhM((Fb6R4xvT{TI79-jaVq5>MNsjWf{?6KzjybkfxOueq;wyxe|)+# z^aX!^sS}UUp=l0eF`_J_+- zv){>8Vq#_eVy~GG(o}+QMwPPmqOUT6&bvR5v)!)UOaLaz;f|FE9D!vRX>ys;*&00L zffV**+NFBke)_7#3zsGJ*H=!}v!6h%{L~GEWC|C6vOdoWOXH}0x1=%|E%NB$-eBke z`yvhP5(th#!sIDf1hNOL5z@b(RQLw3ybnKp`T+jInuB(+_TuXqh&;af{^7w!2+wvB z`AeT~sRkgf4px|`N>;rh4@A}f5O9aC8;4m#Ext|xydVaku_eC7k3%^kG%^YDKRMBv zeS@f9i?GmfsMpx0!!R50GY%{9nWn~xjH>J|SU|bj2#*0lf*tj+n1cVWJkC?hH+N^v z$uN}=bYnCl4UY>4B93VWmw`Leqr-4rOe~pm=(=#bfQnHvuFwO}z6{=~ld}AMqo8K+ zz#A2>NB$^}Wl=$zyR17i4%gsSW>EAj24gc-Azj3LwmCa6oR|82iX_+#-4oU@9WBw7qn1 zPlC%)za(=X2%&<@M==9?#-;Rywc*nscc6Ql=Euc7?|Yk>K!{)fq#Eh(0SK8yI|08l z=hjpWhZd<>Ko7!K?6%HB`+YlIO~!fbb9-@(Xck>Rj#n#k@J`@;J2m3msA9ZEh9F(d zS(O8k7vzv~I=`mBK9Munp8Woqx=2vI!gvf6MIJqU4;Rm6C3?8G*9>-rbYOtP5<4lU zIa#<*6sqNA-z0#{L+M|QGmn^!hVwqgR;kwx?gh{nyopi}rGJU4Sz-L>@?bzFCW_l` zo2Zxv%MWJwnlOjrOZ+cGA{+9`Q+GNZ=$SeX7QI-=eOFpy!igZ%$OwjS`ingAAwmjJ zQwEDr!P56I1Q^KJp!`qSO;Els22wZ(e~Ol;Db{}#X>Kxpz;AUTHGAT{h(xw;zbWjH zVSWMSkZ<;wk1PR_eQ^bcH02CY7TO^%!vI+XOeTvpqW#}^Jx~(az+!`d&$j?DiZ$Z# z|Ly0qsde&IDx)I{RLt1sNi{>Q#wY%@I0A|u_w?iuK;`o>Ebq1f*IWD(SQN}bQT|K7 z3cf!xtgQKOeIGd9f35j|iJz@TLI-*M|6Psf|LxBwU5#;!Rb)FZSspYbGmp2DW8M@b z$+&@4^^H*yAd=I|YP|wL5*XF&S1_E+v@j6V$#lGnN95XOi89{SijOC)u?Vl1Oq`33TQ;)-BN zwxSo&;qncNI1|PVuj5~4valF0V4>$Jlp`qQD-DM;QY^4Z#c@wyZL=qdPZ7vh>&QwT zdHRO@F5?PhoHIonXU~=KzPDJavvRevBKUP2%{kz?O7t^L zw_?Zi)JrV+OeU*;!ut2=6dWO6J~S!qRw;_J-Qjr;>tFUfASx~i0K;=_1W(LmI|=(F z9Ztuat&cb+rI&0D)Vg%n{^o}t6_W#WB~=2rZ7-0d5}wUq^}zS?L7!eecgZ1VdRH)c z&@J-?{ez=Spig`QIEdZpX(HCZ~JP<5X3{Z;=^naK*t zM3pdNAP&uaFz_`LZ-cPcaT9?mQoT*rwuwUJkjp26s%-=pxmzRjFKJh#~j>}SMyxMiZR;5DE zlWI~*@W^;=rfgZ=#^!TQ?4dJ)74M?GdWp_-%ZO0b$wL9+sSOld#G 
zLa9{JOwN=9FsNTgi|DiTjxK|>HVy8O&sKXPS+PJP-Jd3~qTDofm(5HmP~st@@vnNj zYoRkE*F859=~2B@A~CwdCK_$7>ZB8Uz**0GR4d9R!eC;CNZPb{?CWM z?+c9l_OTIFrB^7B*m2?QsK`t4zV|kl`Us3OuEy(#^dT!ht!6dS^DOnD{=0E1211|Q zDPh%Q;VRm39uKEPl2@9ciSu=2>0SnH_4Ro~*ykaZYI%^h2EmKd20N3$>Pu*)!DO>x zg@ylJTZ6GMQ4H5s_K8!Dqx3r+Wa3^m+x?|$hiyAWj5($w(;XPiFJT|#cRjYTmf4YH z98-FqGqrRVw(!TAu3=34HStvEDN9CclN+SPUNz_^7AHo{o|NE1hY^9bini&e$TdgI zr%Tv$Z?oK~u4iSg#hB60y0EsXliVLa&Hr|gZ8F@ZLHr`omG1}4I?u{;`g@@`>_^1&LWyI+XEV$PyOKN6p6cfum(P8`(fW_SuGec)4QAVVLLmZGhUCO!A1Z|@q~Hv@*4$|x0)F5%`lN0 zYT&!$RS7eGhbR+1^qy0Utp}9s$94Mz>-&ibk&a68Sz}Oc*reKNI=eG*Rn}4Qpt4LA zIJ5&#F$->7D#z(y2e2y+O4UbJsLo8z)L`4JcV{xdr|v4tq~d#TI_ErRWz!f-ZD1&IKpXkx{LU3RxwgZv_rFUmd{vt<532~dERuvlP$o4rt| zj0j+zg@VquQ0R~XHZ?&q?+wu4uL_5H!nmh9YWOgT+I6~NuM=OnbJKI$sO6@2( zjE}CEFXVx->X8r{)Ug>`}5p1F4%*)!3}qu%l&%dh_+=3O*acNk`@B93F9si%s{&UU!Uklm)GqZ@t|6{YT zvhvNw{htl?{}0#2Lr$+*?`(atJI7Y3R_g$=SXL4L+a+=X?$VnE9_dy#g06MTr3sR4 zK@K*@zO;rA8x_%ug%DZqEsEc0i-0tu7xLuxQax@YcsSO9ug~V;WDhSH3&*c>X+X6IIv9N!Gz!WJTHf=`#D3VkVF!)c%4tHi| zw5~y2GRtzQBhnm6m?n(rh2l)T%Nii%D)Fmrw^VZ8rUy_F!QYPj7mYcLeTES-S+tCM(B`_Hf>Es&Y2!QVb zmQR-0sBSj}gu=%O>GMFP@)igU80H{w9#0P=5DZ4p3?Jq@gm^3zEm(t=6TBsmdMuwO+{IRN%H6P2lo zOv@mWL3!}9#C|3Z5a7&MA+QP)_~U?NuT0L&%vJ-++@2(ktK?QsFUTf`ZTx18^Btes@K)qJP+Jrp7c8pWX!b`ze>|6|*AK+79I?$d~F#Ne0?S z4kME~zef@K6R!7AcBdl%VNovMT$M0i9tEUEhmLC;wJG_3p4eli-=b| z^l9VX3)Jv$@U_j_Fis_=tLyb)hf_?Raw)Z5eUgaux2GOGJUhISrEg`wmg0NB!M^@V z76Axwl7Xs7RYemNH0K2}{;L+MPM2fXQbYMmo;Di3!litM$W(u#MZjSiN7W6)L)p^t z+>yIv$cuophhb9#EpYLqe-z#xiHiBH&F`UD18I@#<>5QY7Ql)n0}!fwGdBP{)1%d) z5&oH*Mq#q?y(XXqz)sXcUI}W9HLDlL5V$C!g60<1O8~l{+zXj&)`!qE3Fr>H`kMjK z<-1*fZVI!X8zEe{%4TEW-a!jfJ15m7t1Q*J3eCnbZdj|=pH`k zg#ADo!|;sMc^7`lcESW~G9?rs*7ycQChz@UwP0W0&_1gvfeaNzE?`TQ!JY(bi0l*z zj9H{55HHgc8AFARP{6fztHfOL1rorbV7o`?5#O`A?U7*P6Zix}E8rGX!rx`T^#&3o z;0WFMJOC{=0JwaH&wwXO0Z&tI@+5i#w#&pLheX%2C+cpOheD>@<2R~NL&;g78A^A( zJX79Wz}C-v9erbVX*^)xuTB3f0)^L`j%i`izAGRynW`QNpU=)rNC7g_X|QMLJ>F?J 
zc$2tYSV%Slzay>bdYTS1=2|g=L?2z-+8Ml^^E?0WVn9ffrPUHpfuoIE&WMYY0fX;` z(?6d6qmc*p=UbI)9>^p`QY==>uRr#scmpNkH5KNT{1fSCHtX_0iZIpSkLYT5ablrm z@rA>lqXtjY`7_LE!o0~1Ou&Nh9{hfJFsy2({1KGyA&kl;UO)n-?f(*JA?ab`8ZRwE z{WC5itXaM=`$C#hqk#-`sYL!MBD4l{4}(m>1gGuqAjSxcEkaB&&E&6VuCGQaJocBn zDKsQtIs}+sj%oyiYZhQ0t!EqIC0qEMj?xP2&pN=oV7__>{91px{s+82xe6!cZa2rw zZ`~+$z1W!J-vpfINjF1lbkZ3^;RP3VCLhnZ2`sVqt_;wRj#cd>|!7s@mX@s!x45t^ruoT?m zeRtaRe2Wc4gds=VA{JKGp>yzVAIx4IAc#}!2s;H-&gyz@VTH*K5D-f_zk&PXg4{4el&(Wj{mT%kJirdhW({CBoitEw0^brHQMZeS zjgPv~Nf(;!2Bf>?WNW{Z5ihUKcVIhO&1tKg?c*Tr1a&=-r8Q?7{J2$+v{N;z!tbKZ z0p0HOAl81oSv+jmUtOno3t)}d^@-qRYrO+dB3#y` zBZ})o6B74M?S9I*#aen)PA3{rBH=%m3VRHbfjjPZbELAb0Yjik$bXmi>T{MMCj9Ld zh=kkuV$;L5^4qN861RLd*0frjFUd2CsL4aH8YmAUnD|h5{#c3NQ}}6&K$*h8Wu>y! zkwVfb=l$-!a4+r);Qj-CWdKmI-j!!MRh^EMS+iXoIPJ7kmt~?WTk3Xw)y#@Cd;63{ zOrWv#J8Me^ODd9I1omFDDzcjGn6B&q83NK;>(-uK#5(o6sc-3TJ+(W z(Nb!ogTt7zemoAVIzENA$yEV#R@bRko>eA+SLMz7!dlW{AcDg#bzyMl2Nan+&|7|6 z!-R=d!B+Qa%$Ca>6nHE=ktmzH1o*Thngsh#KqHiO2Z)lNh=WCxuq7lwzxgyA!XAY8 za8M3LlD6iD0omb|3eb7`$Fjdhu@}e#j4w~dFBl`sF}Bj}8pJ>U0IYdy!&> zl@(wfUeCZpU!bVOZ<>R@ffBFod+=;my;NT&E<6B&du+E<2LUlo%vU17&Acrh`K@;w z&oRSZzi;+?ur50TaAlw>>$R41_w$iAg&K8P&Zm181HR-xK->}syOS6kh1dT6HeI4G zzj9ilR!liepUq|O7tMOfFIN4YW)`vF1-a4_hgm9L0J?%>Nl$3wLa`$XghLpeC{!08SaYm0rHuI0L8qHSN{sULHBx{~;8t544e zD*Ce)o*yIbM+VrNw7 zJv^qod-?$4J)TzLqb>cpU9-av6N)x1vs8kz7-sJ{jL8SesCdi8b|Iimp;_)YQYh4j z2S_8+hepk#6Ac?}}Bh+e=@BCP8g&s$_x$coB-NT_19`7~c(7i*Bac$b@_GT`T>cUF?dBkE(r|A|>fE>~GP`Q+jr22W}!B&t~mE(Wg<{g+WSJ(q!+y zoTw54n1Ez>#Jj(N1QKzg)c}1#`9rM`U*ddo=kbO4OCPjk4)EF(#R9dpsxx6^{EMgq zAZ1gVN~IsO1+sy{^?l5pMTWN4)>rK*xDzes{!q=4xm(*z!T!eY!VML#P8ACcNq{_C z@5P>;`0U2fIQRUA3mKWtol3LcEPwXNt49c`m8R*T0$1NeLx{wIF-b<-8lf*;3fg}; z1a3eMHQW)#lN$opaR9;sO}WNZAnOjewJqWub$P;AseW!RA}N;n?(C1W;TCToF%Diw zQ3Gnp*N*C0H?HSxG~ABeFJ#*wg6FWf_^l&@vTh5VfbESN$LuAUg_3x`1&utsSDu$S z`LjPu=Pt;LP$ zNpxme>%8xvoUDf-fWcr-Jpk4GP3pAyOzQ*I{TnAp#txq#(kW1Qiiwx-A^*Fu{?*6R z{=<%QPX3GCp9k#}p=r7PPxSG;^1g1cu;Ksln{Rz#(8d7hKG@u@oabG*f#UgPEW1ex 
zQ2z7*Pl#ajDqp31kS{Tu&*}*-+_Y~{)Tg4P7DP0?N)Ttc&1VJkj8R$+x2IxZ0*ZOQ z7do63+Tb8Y+4~;|^UCl*=9UTO3Rr-zmtwD2FUZt7*O!}Lpo)Hg*-2Oi(ur!)`V#=Ha|v@x?Zf@F$$?;H`mIc_B&8o)ComKFvBJ<4Isl8f!_3#~!IP({uzf+IUJ4f}37=U}DV*I70U%uBK)| z=3^n8^!~@E^_{!UvyeDv;ItQ6#e)BAP&fp5vLaCQpO+dIIGIJXK@q8?=KCY~X_dsS zve#g0Vy_1$z1B5TwJ$|3mt)N`KqB?joPG4UL4TU~x?wa-)~sGeo(HN6&4j)CC?S7x zH8RX#xZVqJR)7yK~r3jNs*{iJ@`^$cybRQ}J z{#Wwjqntm%Qy0ORfED;o$w}?+o!7n7@r|9bLkPndW4gOIfQOk`-vDAEDNEo2dU38@!c+& zHNK7*`fH+cEq#@{OA;b5Tx&O`8|9(7I<6hc*E%l31CM|N!Ne!W1PZGVyIO6}UI@>u zJL25}^omifzy~gMw4#Dx3PHKDC^zkU#N2DvXO6d@_b0z%I92N0PdKjKjo)#mc@1`Y zo;uwOiJK_zmDZe6=d(5cp1q z5;_iiP`h#?4;uD3o$zXz;$!f$$ z(7f7<`sw4EzdKaude|8HE2vN13J(cMSzV9~>1N`|iq`Hm7h<;l)}E@i`EJHCU5?Yx zqqsuFkFl}D?B=koI_h#f_0l=Xk+?zhG*jU636bsShq;Vc(+(H29oOR;juYmM@9}I{ zuo?2g^9v>Q8@UUPm35W-*_pIh1*PM|3n@M;5T&{?9VB*g5Bnbkq}qi`Tvpqux;(q z@-#~1#zd^icD;lv>A^-U$n3kZM>r2DwOMd#2_q5oU#zwh z16so zakFl49_T2whxumcq?fB=Vw+fn=mf!{RY0lv6BIQ#wPNuIX~^2gIR)ze>_8OS-LgzR z#*6iL4FrxWo_M*?7tb|{hF$!xONg$$9S$rPKa2UC(EHk`&%3?h1;fVx;K-^|b+>St zk&*z`c`~6{iJ!(8hW~`sY7mLwUSgx8uUhyW`N?=~6j!(#mny z^%t$3wiLkvV!z##&B)}~@MTZBQX)kN=i}qL%Q*};NT|!EwpFDe<&CKkVx1&jpC)>& zJX_w{2bRorF(7a$H%)D_fQhLSj3|Y7yI=siDk?hb4X;wUmhbusT+FR5kJq{WB%H?V z0UI~cq2ai02YttJq()_YW6*r`gReI@Bvz)|Vdl`O+Vn_2uCKqqhV_B5 zW*Y>EIeme1Wyu;JC<>&cbtMDucDFtKaR(63us!5Eso50R8%?kgl3!iaf(i~vPuJ*H z%Q$q%i8|+(==u_{H77xLuCI1f^?>Se(j1h`d_e8J(s}9SAt|=NQ;ZuK^9fbfYTVmP9MrySl^0 zul@0O%C7Km!U_0o()#nMor+ptP!NCLrO9DDri^&gYkaC*;ALszY{v~#RbFxe2=;TN z;p#Bb7JUtPENfQ2+eL#%KtzBR^Is={UyB=g>G7j16F(R-2_d*Qp?X0D~_>bFg<3X@Goly6l`EKe7^fZgN;jyBq` z1i7%20aJtvk-GKCa2~b@*DKc-iRM5-TfS*~G^X!+B57} zPQ&64C^E!F67#jCc?;i+ixLuai0T79jfQf?LX^hOPXG1jBH&hh8ZMTYDJa2F zPy&Enu|e^XJ*Cc+S>vP$&j+LwwYuZrFeoZ0G?^I|%KZc~vaqO4n<$a6Z5xm{oWJgz^k>+9CIak3sEcc%9u zE4_frF4et;1=%@(`%Mf-et&l-%eA6+g2e^c{;<3=Ux-QUu=tsAVj4Wa+pc~;o^i|O zHnf(%*mJm=8hRg8hFv==Km~`sGJ}!l<87z85I|Dn_Km|#qcid(;8_VP^2W~ zzg&~4+0y^)CGNaYkf?-3^U+Vty5QGm;8j<-irTGpABV7srdf5fRw=TVo2|d;^>`>r( 
zU*H1>zeZ2mZUU9>WaVbot$z*rU+legTvc1wHhfeR6e*EzQB;sp0cjAGbEGUv1Ozss zbT@1f2_+4>L_~=#v5`(eN@Al(mvonOe`9U%+@5;g^M2obzwh(?p7Res<;PlUuDRwM zbBt?T;~H$_=mnw8)#;JJO36)&R=XY-Ss|ycSXgEYfseNvrPjGntE^YgQ!0t2|1fl@2MJjdt6ui;oP9zN{9B7Hb^DM}vbwyO)rpVDoNm#4PlXnb#=3~BZFH6KA~74DCyr-&uEBJGyyGHKMC z5a(|grfw|xt%_ZvUBH?Hj>p<+t?#yi5(4uw4FmQrjAw)bfB#(A)5MN9tRF z%DB8R0l01qtTpHB%P^)}ewVj5j65GaUYwYxD#@U$HXN?Af%BN^`mrCqM2=8+eDq!# z7`N#ih5J9bi4YNCA16L#O{P_i+p#E=u&|VI4~SvhLKz%V_obL%eC0QO`7u=350jQ; z@P4oQ0wzKo?9~fWWO|a0{xdzuc5(Y>dj45G|7@;*Y|lTYnf%XBu7t}Z#M`Hz)Qi`< z+Xk=3X`m6=j$0n+jTZD|J3r8Uo;&%rE<+@oXH@9`uC(}&#Udo6Q<#Rqtz@1%2cUyy zi$A4NU^Wm>UK>@?ZV0cA$V!S>S236sq$gLdfMa#`?BYz298d16c2p=^?>p|v2v!%>^l~A-uWCRB zNZ<69k-d$9AP;T^LL^=lOofNG(BmUpF=xA5&`Wk`A;;y|7Mw%9Ri166YT=l&OuxTY z8)WJ8Z6oc9*Sg}oSnLZS7n;JveiJ?Gqg^Xr55drvrhWwesuHh9Ce3Ey92y){4Sy?$ z*!&!PgQK{b$vFjC8rinv1S0EGeBH!r1+EV;^H;6}2ANVzIzfmjB(0LCzK+=sku*ou zWjjh#d_?({-vO*h(Evkw9IFw57tOZ2BNavPZFk~q)_&|~-qOA5qjzTKPJX`t7tU(r zZ2IfndJ^Ejs!-ex1Ez4Qnw3Ym9-?LRaP(NuyS>sa$O`)A4M+7SXQS*`Qa1- zgQj2rLOxpzR4Bh`-D`mY5<)x9t=j>jfxXxIO<-`^4~ZcH?;F1U{m)|rq??yJf4x`n zRroIt$6ob*t45z~-znjKiM>oR{=I#`5rem#fOqJEOYY2Oa8%L9M8J69@ut*ULY!6f zJ>>(hTDkYULy$@228PFHe6-hJ8d}WGV6_|)vh`xP{psmU_0Q$ z&!$13Lh?>2LXcZ*^cN+=w&Q}G!1d7qk(=C(YYYI?fEJf&{-Es|D6FukyuBHe&z(98 zxZk~VQ#t)N1rgl$K&n&Mcq!Clw_>>kQN=vez>ACRG)eSp<7M?+Hkb#GHQ=H4#+Yhc z#4|0}6$=ZQ^VP1Ie390lf7vssUPTA6VNd47nQl+D2(`FRH9)s2LqM!gb^WePl18&< zgdB5$@=(TP*U6vfa4Qvwb)r|lyct^btMslMt9ZI0qFYsp>5P4;{%*Vi3-Oj`rPvUE zu8e-M3N-?{=5uFSM3`jlZ{y7!wDpM@U2dDII21q;AmI3DtHU>D!<75esdhgttkj84 z%Np*uh{_E|C{8A7OBKnQ57;U`v$@q^$~}9ANwOm@L=i_Aw>DVV-LUd?vBQiE3bW0$ zvInvNBKCrPE}pGDw!&%`xMdP4iFR$h_a@5HuG)1)Ky3M|J2Ib@A}k-&`a_D>^~cQh zrdV($z)-l`7VRjJGnSNGT75jG^K`wJ#C)vzS2se^$mF?tLeRG&La+C;Xsr5WvrRiU z^z_6yZTeIuILaTbqMtr4ExGNozp!>o-;9&gl2-~l9sg!7AByoyjkM;%gY~p_!iszM zQsra{h$Kf|xZixemWy-YDcD>{1=v8N4A3bCfJ$P5?~##VZwl^&yG%sRlSbOavP?EN 
zYNFFO;3FE}o7Q({O%7@I-`OAJ%i3-Qq6vMkms5uztv8YQj5Ya9r>1BqYqHVveMb-_ze{CS}MpLW0aE1;fz==56R(3E6a}L&nK#qE+4CP-`=A?Qh`6-afcD~WlZ?eKmtjjz3Ut1B>rJY`6QQ8W7I z)+RD!oO#OM-D})uI*Kmm(&B&j788V|x$qzjf~+B{AF*_vuRq@lq|yY2Xu%91S{|^4 z2YW;H{ORc9CW4TJ7cgOkb%4^_J5xqR2DF;KhUY1Ti|uK^#5`EYd}{>`d3r^W(O2(7 zdJ$Bk4789|(m;9A90A9L3P%-WIxyQ4CI-_HeVL{mJ1uc@dxm!H&OBVjcykj7 zm^tIq!vLHb3M*1~9%LE@;)7Wm^KkH9b(uA1uoPx(!*bfIpBolWt6+U5f`wfM*jebp zKZ2xK8&l?)S!=EI%nHf1galk>?Ib|Q4B-g9w-NrX_*4~r`*GsZ zn~N&|(D1$MI)L|lswi`S@OiVIWtNc=SE$cBOV3e%Y+9+!%E)fL!IZFLJ0-Adw=z{# z{_aRu@2H;M#tEffMB_2qsSl9NGw>R1d^2pyVAkIVN6bu(_9ixd4Om)C zeJzcG^G&705=*(gbU6D<2&Y$LmBqYdmkws>6V_@M39sWPpHc*;41U5g7h3rc04>w6 zIB7zrFIe$>b}q!4l6_i5oT-{c?a7pIaLyp`f3Jy|lg3oO z5AL~Rb4Ikl&>3tIfZ7iyxE|)Z5blZX^b@~ZJ9U&;mc_+->WMDVyr)nBn=cir3&`U38wo25sGZZbmg(SZLCm3a97mxd=t!K>vU$Si^;J8fAPiE&fH32e}S!YOYfH? zR;Ou-m&=3fwcVfD*JyN~9=fHyl-Dw7IzB#LV-hgEP_l=Nk+Ho8nHET&^a-COj-gj4 zYO30tHF$XGjW)OMaC<8Vj#Q49UmG?+<+5|-)2HLN7QBK8g<6)edMnWW%bIC<#9P|0t z*R(5B?e5@Cj@y%F7~n`jXvnk;4gAYJ)4WcXRB8lC@Z(D-=-!yJ^Mb6A3wyWqV!-iU z-!t{JcgF|(G1=gNvu$3Rni|L#>=LN8fM*E^a(H6g!bbPc+)i%VAmP`I|f5q?(>Rex& z3H#a!|Fk1yg`f0@v4Y_ZKkY2S3zW;?ywfT|_THj+QVG4<$R?qhY*>>!94$H{kD09> z7oT=e^4=2BY#M~kWobXir7b))8s+@0nytC}Jl;2bSwgJ6Txt#}_UzlWq5OKR-m6{H zt~3<~u*^$2l_cwXYM^2T%!37Q{scC7(NvHKo0oX+)V8B@U&?Fu>{LR*AhH+oH=H2av?~_520nEuo@Wzh21S8-JU{4(ocDxr}R6Z#Z_bO>;3u)v`J9 zsL+njHUENMLl#EK>*L#CvROA4m@1x>isy7{S;s+c=%6H+pl^Ak57qc+c)g7IbT}r2s`#-Tka30-*s{4LXt-RJZ-W3zE;tRLs|A=O2^ynQ zxG%%{bup6Y7p8EI%&Ey39!FfH{hZ`6PXwB~s`z~$>$HsMV-|C?8K;56Fg(JeiRP{v z6G_}z3%hnt?)`WW+c$A7CPX=NSl91Z(UwI=}t@;lr=>q;kZlc|aHzv*Y*xl=~sATHraF~x{5jS6& z7VI!SGsS;ROuysT9=%%(6{&K+5_Pt*B0yem^d-r6vStr$7o(3uQ3jd5o+ z%JH=MzPtcOeDM|rl=F*LhRD(C zoSsk0XyB*DJ}LdE-MZ$)QET7Ep~P#1JB;!9QV$J&Yu-#@_q z-O2~BeeWa}5|uobZ8E)>9Zwjims9sXi&;mV1GzBD?yCZJez<2Mpsrm83~Uxqie{`o z*Y9iI_vpBa6}CUL30dmq6X}(nV7CokED=h~xLsNBS+*!p&_tf{f^hdqT+P^mRl4KM zqLU?&w2D$F)5C4keB%6zL((dXW)H@jF$c$0-X0`NMe|{979Cli#i0*4iA`DPWQ1ju ze!16Jb|)j7s4+v*Mnji%sllDhQ7_IDV 
zfwMgeZEH?5u@f8O>*kBhEc+otoLc5DIzijla^Sd9nAny4Q2|%9jM*2wXEA@{1uVRU z)m%04izjAoKfvs^AOWZ!In@tyUAYsu(9qu0yP3avsJXNFtQgkSr9()>q5U17Q@4sv z&7|$Y#yhpf`Dk@fu}BY*WR(=6ab|ng4-2+(1u{k%bZ1I|;q_#%WMliJ*6QMNw2DD& z{~n_j+s#eE2#Pyq=?>oSqrMY>8m1i7^~GqCpEdRIVVrf`jcUoGc%!$%f2zdLK=9bw$M&pziIc`zE0MPO)J5X19y3Kw zA-2|D*Yc}Bh`t-D*CvPkm|>g0XuP(%FuT#k&}?Tud@KJKh583oZc8Cu9(T5}7KTIK zxEe@_6_#pmt+vE=iP0E&>vnQIz*sbn{=EGqo^vmx{fh5ozSX@NKBu|5%Jz7J$yu(avm522 z^?8{e@^M+A&kBnt+%~s8wKO(2aH@DB9mf*!I~G~b-jXK{ZSgoc@Hy=pYM-EIHETO9 z>ZR&Hfl|&D8M?GREaNLLRHd7f;^j)k(RugWt=dBUI;J)U_>86=_|fU!a%p+%#M+r{ zJ(^R*)>gf1_zsyufHo*sI`FO0k6@QK^JpbV(Y#h!ks6QEOqU}uh~<%5**&W`Y@`13 zI{>iUSPna!Tl@6fp6783_jL}XYS$~Mk0ntx5p|Ja;^_3qVY$VnC9k5QqFb1k^%1#* z46*=0G2YGFaXOmFM=l0tnnGvw<=5H|E7-g8DUIHBznXZkcvY$OjC@R6e#3Fsx~Xob zsj0E3%iPmk`L|<=pS}>#{H_Jiq@uGr-;J*0llZdHoxerRJk=Ma#y!oOKWE&nGHdj_ z=+3$J3M2<7f5A;4S=FnlZH}WwvU5}R_CA-_cW|1%cFXnQUHL}$?Tw|IZ|_~-*;nKv zu~MI;RoIE8Hyz6|dwVD9qhE&HYjgjQjsAQ_W)YWpk;U-#dc1lSHicV4w8RsX=^uN{25OFg7;^M(#F!Pg! zF~7mrmV<=aG#hr#Q!OqvQ{1b3GX1!2XCJ;deK<oHUu`Z(MP!$-Kn zoMAJ$yl=!FQT6jv18Oqz~@etYj{HHl&l~TFb06_m*0aA+Ml$ zJU?jP8Pq%=(_)7x$_An62vD-!9)^IOoV8ai^|F#5jMP(uYorAu|CjZYtCZ zVUO2d4YNkmY`^ZJ_06^O&aI}YAcJdy4tCF zPF-+%Vh!byu^`)m!3~NB1?&QtAZCjz8lqzP=GtK+d9ck)f7o9erMJGfEGl|7vXpc- zyWEayRn9|CK}=#vuS1rbb3Jz2j#s_2F;#&SUEE>1bb`!7Ze$ZI}-D~R+sX?F~@ztwQ1A&o{;3K7E)y61eSWi=fzNFr1TDvu!rpG~6M=zV< zoP?|)xLT989SM{AMa97BEI3nx_cbF@N*=FGlYw(S*2WMi?ld>ia&Xvo!~Ov#g#n%F#8irzbTxy;!AH9%V3Kurf6>=bA`sC*AkEItogc$PD zII6~WzhwJOMWXC_ zZ7~pgdUtv3ezos7zOO2)#7_cd>d>A-EyrA^`?Y^NfjSfavFW2#mpK~oZW-rMNY7v5 zCs1!na97{4od`QGJ}$_Cf~V?oY6vrakutfR#>Gxm8M=}Zs4EZ~?X1i?$oG%#<(fNk zD=X%(9rjRr>`!O*w^IVN}` zgD3w#Gx!IXp<$rwyi40TXTJc)B1kLkk0ZSi*tE__gbk-|-d-+l;XokyOsq15WK9aS z9Py^ZhTx-^m#t{kO$fkqfO~Ic5wrO2&0v;l{MA6^pv$574?)fvcnY{N9COJYqbR1O zP*F42P+=pL%C!zW0x@t3TdZOZr#zfg7b1^AF9JukJrq}6Xrt2CC< zBfo;T`2({0LI{>__qlA*$E?Xwfs;FP^AJ`()pg5Xp?0@!KwlmEJEu-Oz}TM8!pCO2 zc2fesT3NpOUqJeHWq?(F91*sCf7?!wCVvh9P5xsvzY>f_{J{7K)ld2#sWLo{+95&l 
z^&)(o=fD-(q4xiO8CVdN{aQy)v}pEqC2;{HoXYJGxpx?4^YVF70Hn?2$&krN@P}d0 zXOg3o<4EsFZyRi1ncGnQkz0sxa(AZslC;Y4%cE<*QfR>fAZv;E&)3rLOv;F@Pu4W& zg>AlA4O7Oay90l|-iiO4?P#z)&7fUS2DL@uKKU(LPJgveeKFQpf2Z0gA<5cEf#m*; zNqnfVaH#b;P`|1JBHywaz4B7sURF|m$d)VToIBFVq(1svQ)h=#mKAW}fYj=D`moz$ znCQ>MSN~{44q(G$_WZD_ubhDu=J@_&0TB%q>EgT-sjE!7ipoDrcz{PmYYg{ zwL!+0rB!y8!dscRPgN$Vm*ZHIIfMA{BL*IzI+z)G$R{5k&+%eS*5ecfa@&hw*r0~b z6#|w0L_+s&Z(}}Ecwv#7@ES!oLw|GF~dhp8O4(&yOO$^?gyLoEL<4ly`J|DBWb z$FsIX53&rMblO&R0y+SW)ps@j(%(KKAHQYgF~QR zXklu1SXxDtZ6`(HwjT-%IA0I^D^pkq5JbFD-n$`_P4W5XB?_QNd|&H`xub>! z_2vDkowM2k*=C>qxOo~TsZDy6y*D>E8(LaIMDP6Nf~EfBwf~8Y_s6yM7k=E`Yu?BWGG z;fEnOr+i)fRSsT+|I7$K+eq4 zqGvHH`>(9}w)ZuY?7?)s4RO3O4_6TY!K}>GL9wa<@fVtzg_g~S;Pj0(l!}7Bb{QuRX`g^Y44VAA*?sTdXR8}6R zL3wVcNfZzSM6O=$kp-&K>z!B&yhgv9h!aJav4d<@u<_md{Hjw6TI`Slk)#$*2{;)k zen28ak3U~aK<)a`pRmrq*kMQYvbb1seqmwIzwPeXzc{7;B<%Y8!OS05 z41DB2^FtsruCBfEQMDB6i6VURLB--Pm=kcE!~O}#og4BB%eZ4xP|SqS+-dEOxE83) zyYeIhSZ4DxkGBne!V|;*s)6SS>F3|=B!A3U!fV`xUnd=KWeQ|)Z2!${_1}-VCv5|5 zKm*JZTK!4VAcbQe|J;50lhX*9{v8GiP4-{9;K0-S)8hb%5*7~qHc9~N?vF&bqQ65B zME`Gkrhhyjk@~d=(S9=Ya+wZ@jv-Odt$b&7HOo1YsEM0Z5~9ZMNPOl@{;&Fu|1QDa-!n~rFABY)?uJw;)=xsdh!nCW3W&|$ zs~&YCewzJP6ebDk-;*wJ{|=`LJ$vt(X)2Nph88bFrsJ=!1~xsqwkq*!L<49P;&+|@ zb4UJn@kn0R%gOu$pL%*&R##UoD~-?6=T0g^2&)`_XlN%^JqkG9)9UkoaX9{ap)tgz z!dLS_qTY^Y3Sn3MW5ND+QA5OnH7TolptqLLxbi}f3;Ex}0(U~ka+AMY3~*Tf4m6Vz z+6$GNEFKJ|IkEaxND*BxP(MxO*3?Lla?mnAq3%}fN}`gBVJnDV-E6Nk%z?f$We86j z>j0&(2Z1A}B$vMrUzVjD2c=to7CG3-;_*=^n0<6Snii>W&z^8aGSUc%hI>*^;KK&+ z;3k6WEY^|A(0CV&r#;lD_v_7ngK~XwgliNRQOI$H_&c(D;lEnG)|^oLcH^TfK?U!2 zl!yTb^Dp6DH%0J4qb|1vlq~Pvwre(&KiIY3W4sNCVDGP=T%({Z-UbjdfbRVx(95#g z;ALsxygpm$1*G3$*ytA9!2DY>g{fC&LimU_pI?5877yuI%DsFdr3$uFQ|By^8_9uf z^jMDlhqxOmvLRqyVTB)1Ugrk2f!~B#uN1&I?E4k{;ax9OOY8*m4RsNsNp{$`1g1&6V?H$#7PlJ3FL!^+_L;7r`NMtR0B%9dw|(S=-Z?q z>UJ6INYYj2AMq?H55(9Xmif1UaevP~gX>tdHdPbV5v*ZK2ONh;Kso;=^akwq=N(3~ zedRj6r5=AD>HB_${|=!x&Az*qa(;e0q1(yN;CIQ?Ng2)n<-~^$J8*C{7;+T$%6~s# z%4KVoF{0T})L*Rwa(;i`9Q~7c_@4A^ 
zwrSC%;VGCYz111uJiz`Jj;|OyKE-D>Tt{}spl-|_jezC%r3wiI<5`onzOfr{Rnd?3 zQf&j~SJ;u@HywiC9ajZef%sJJ0~WO3~`rtxk~f;rS;8u^XRf1=C~%N{Bk6pW}0LKnz&#&zF>y5`gYz7vCHKmb_K>wu!t^hMU+Vud{|LliYocZ$U04P! z26oxzGmotY6epcX65?>}?5j^4roo5p^df3d`MUoY8YbyhLa2)SsH) zpo>Ia=p!XCL;$gA*8B|_4(?{uH5SyH!jQ_Ppm&`Fo7L_Z!Dq5Me_pd@J-~NG!z602 zO$62I<~A{NaQ|}$|54k(OjPw#^F{@sLEWihw8hqoV5gD~fOH%Q?Lu-rc)#>86-s=* zq}{xdq$POql28+?(YQ%A3DBvzzGT*JRgGM{qIx3wJ@V(b-GSn$m&b%!*a0k-I{q}* zb%L|X5w)>uzP2EFIa*0oME-!zumSZFZ>-!@-kIk%Zh4co0SAC)Hg%v;&HeexW|n%6 zX6R}Ck80N&@iZ<0s9{!)yZ<}*55`%Z*f~2Me%5?eo(^r%UW<31G8Gbt(u&fir#%H_fvRX4dXmKLay># zgd8DJuT3pm;HS%i`5b`1co__Ss%wb<8r9?FVBv9OvYf9LPZL52t5()t=_3)K2EC(r4t=bb&^f6!+Hu_iz{lg zRvq5c(`It#ons1biLvaI$h%p>QwDPaB$8N--g&p|kf2NuHu@1LRVWh1(EfmopcO~6qgzMq~xFy zdTSs?97UQ{TXAD(8oztd@rrS$PJyTgJ|a5sH;h`T*EZMKp=XC(D& zUAVZYv9QoqN^bhlm%7E5EXF7-$2;@n-voJP6czsH4I_HUaU+&rH>-o9A zEjVl<=u@5DoX<~N?at4a$l%U$xY24j+?6k3`esjxnk!nZ&`|kJzFk3&Z&;gE+)&6? zXF^b023AEMS39crsEA?nwxnrri z3qaz*v$V)d!_IyY4swGI8adPc8u@2(Cmd*qv4r_J^&A(wiVp9qeZ_p0Qd|-mhI%2( z#2yi5{Kt(Y4C*ULn=&_zS}=6WH1nnf#pe>+BFrd;jC1u{h76#y1aKY$q}D5A?CU3* zP8-zZwmH06)BZ$*T1a}}FSAX6BnO^(-Nurqew!)@i{tId9D~y?%|klx{qsHPtcLuZ zd`<13W}C1KVbzm~&wtJFiVhb~lh87+%nwS+5*8-0CZccHdW1Nwuj>xJ zxF<9es!uNlcW-2;f!1&7u098wag!{Pwn;Tj$zayx@mKr4%TV4>W6k&0d?HTZy3*HC zwmK2p7sYek0HvL+@$P!7bFxR2?dXcDQA^7HWEITMc~SwzKj%0smdu@I9*aHzMl%ye zu8MN&l3}v*gMtkYsu-od+1gFuP#9p0z zfbr{V3`6!5d;mTPH&9^|%cyL}GbQbWrS9X*L{KU{QxdbB+Av!xvQHFj z?fyogj#D4BuZ0H};_|p>wLiD$$2R9}xu643)3Z_;lQc47U88rP->a$aW zewuwuxLFztCyO^3tVVjt@sc~M6wl_dcrm(N*-K=t{Pl?}Go5|WK)671JR+nZtHdX^^My(;;~ zJXL0e)glaFqa0iFDAnS5fYCA(rQUh=S2SdHBo6G1mTGSwFGC7?8}gBzkn_6V)=xJEF9s?Pj)gL8ua-1vHyAiRjY=WQ$UoEc{u5;k zzuqPLQp#xZXvo5ltrvpA#|&tN99th%LenqPd|JLg-z!&eW&~6Ki1-gDbzF;Q9Vaof zmpQl8X(dZ8(NBjX@?vqdZv}CJ^9M0~#h14Rj!W*R(X_rw>pEN1s3?kLa@nAzpuq(+ zFLF%Wm+FG$XOsG3WrviXc{t%(;6gvk);UH<@v(L9I_vz1OjvRpld*3%hE{Qqdsg@+ zFiRDe;l^}R#ao*c`OeE_ETm>m9UYh6DQ{eaf@&mf+Y5_^p)Xv*WmPI+AxX86b4m3u 
zl>5hchJav1Zz`yEU5Qy1z=1%S*+?e73Xe7IBqTBgJvbf7c2?gcQcB$gVhg?Zl$>tn zAYx_*5%nfSK7c$X^Q_I-m)J#;w;zUu@B;c|Y%HC;-TcIn4d*JzXjRKElRBr8Qt<7gIh;jgCXY8kVg6ux^iCCNA~K4g%Z@BNqj-yq0?``o z9f!)+K+SdfUPQ)YrgK=rF5zus5XihDYGyR1xr4#6Jc#^y8B$Um+oG(Ucs`ISMmYfGsRaIZkH|QXz6g>HE^zepf}X`e5Nru41(!|m9BTb?C&Cc$f--o! zUJyf>Ea|BU;U$C48BX5C-n7$K9^*ztK*h589o2%xXIn5 zJso{k=}?>v$j{yvLVKiwkDmt-T^2F-Qn}RwLSbc~ zS;eR-l=DH`BX7CS$7}HsuGlT{ z{wTLu+#WJ9zBKei28byiTqlw~IA3wIkI`%EfpUT9j04W8TP=S$OB*_4^(G~!(6>{g z({3*_naVKWrE;VaciVt?!Nrj6(rr9Q1j=Nieg%?X;U?SG96-N{0o`sef?HpiV?dJL zmsqwC21`F^qv^8e%OuC8gWd~HKoDfkVjT{~G>$BsQoD%iy=%E;f|4^?x?)uA z7Rdb4k0HY30xGFmU6#H`Ew3)-OTM6s*mbe4%k`_$jVqNr73Fg zcB}gu>W(%bQ#3=_QD+jIXxr1wfN0=fnkbHC<=Q9R#!Kwqik_k6Kimn$5?0xhprPP5 zk&>1*$ATkmi@L?`AJ%0fbXMLO?LOy&*0w08lIf_psh(rr>n)Mq<0;>Fj{hM8^F(=? z;r!Ds2O-^<-nqx`br+pegZnF7ptndw1){d*m{S3bXc@|e!#B=J_rqe-sBzU3Kxw~c zv?!QR%uBaz!UwNN#p7k3`(-;8lHenGRdnvzKFQ7u13dw3s?~xuC&)r?c?*BI&D|Xp zBtB3;;hvR)Dt_1sIz_GxhwRHmtz8$Awf%&9XrCV>lkHmt#h&II==tZ>ZLru>d}&cE z@zP>bzMG4l{plQuRyIuJkkMo|eJOStLVT2^6k(0L67xrk$*xD2K}apAW7P=uhL z{qG4`7~4}dxcKkWMXo`=%YEnL-=~9yi6ohc&)SFlGQoE!?I(=wX#2G1E_D_%UG%ex z$?t_f6ywC^YAM!I)zV+iCdi5#evoL(D5MVotv9JcLOU%evh3&5Ub!syB22u+#5{xo z9YD&Kq20-t?jpn6z1F?tABvNe`1XWk(iDG|JqOMSCq+-}HUP0}xBhL{wzJ1*4S^ey$8|gPq#H&uPI)y@t)NiIoqt-*T-~${|g;+ z(ZQ4efCf^dNVYEDT4YSURFG_d(eKK$@q;3Hib_OytFqnY)?}PQz_~u}U<)uL*|EYX zvLUlJi2|Kvg8`&fgV(A)PMo1gqh@u+^%MWTXJsK1Z5%Rs<&zf7pA0a^Jj2SSXCAxs zsa}%hMMUyteIqt*zaP=8aE=mpH%Txnh~VFa58ujn9=G|&DtBlSAMPQN_$9(3`!dslH*UBy9LyO#Nj#5<+@ zVGz!5-PfasQCiPKIWtYyw_lk#EGg%{fhEk$&Own~Woc(#s$o9RgT;+_e(F_QbSH?j zqq7%Z_kXu6|Mgy`&UY;Utm5uOdxM^@&i(M(slODcrFJ^n$l(hl{Rvj+W+%lJz?XQgT zl(geyKE8RVr{WaQRnZly5>tfzMZ~(3D`jp*7tMc`tR<**5too>~9+UNS{= zy;XGMy_VpOzKpoOxukvEeLaYPh90C^xYtO{ykqj;OMr~#^o)Yccv)G+d#}>UR2tGn zhVhl_EbCkCqGaYLA7e;Zp1!bloZ`~OxS`Z6KNlFn>-dMye)n^Y! 
z<_%JdUXl?}K38+$s_9g_W7ELO)pCZ##FxV$_8I3XQoUJdOGw7dC>+c{e*Y%|)<6b= zZ|YT?aYS{z%m)Zqf7(SP_0w+hD$(6T#I_^Rew9zYB&+hyYjKDw+>H&T?8W@jufq;~ zI5JHPGISpv=6_T@c?jv^8`^m(D=I;^APqIunQI$LvGu|Jb8A+$%`pzTe3OXl1o0OA z6 zEHY1c@~W@>F^&(Pw4g1SgJ>6#`Zs7O*^!m4`?T7Jg25z2Sm5gB=29{88_V)XvzOj! zf-1#R9?iT~L8|Sk6W{%QtKP{jbnf(3)4OgHyLaQ~87B6XV#)fa!8tyQj_RXt6kRf3 zp5e(GInuLQPGZ58civhF0!0_2RLy)rNAoHs_MI;q@fi{q?u`` zY%oQ+vtFlNNPQ+0H}EeAUXCA$hW_j09WpTzE>AxS#E%n{F}O2Ze5@h5<-c+Jy#2xl z%a}9vmM=K&FF8islSs2K_g3LwxPJ}D-Es%pV$ZA|-rAEN0A7QrHN(fQ2xo02o>#~k zxGyzDS!ED#s@kPsg8bK)cu48JTB>#JeDF~Gx2aeFhk6<6&Zi2=UeC20)D3@gBIt85 z8r{=f_@>*W4SVdhV3$E-q-x@G&}^)%S7wlAlPf1PBjvD*>boV7dL64j(VBkoA&bJ7 zR9w{MdtVZsxYTo{7JL?Jitv@Bbv&p0fk<>3{Lyc9bDqqc0S_RI2)HvyYVVu8EO-@6%&)1vR2J{oHpd6BI~%es~EspN8{u3?aL>7x#TB*F&w% zu^w=;`-bwaQEU0p3k*Okm^NvwV$Dq=XXWFWL}yWB!ch>;sHXeWyTju8Yk7+PlLPPS zVtV|!Nq%Y6B_Ka8#{D?^Ox0Y#>8paK`-!=~%!rV`1Uptqu`p^;Lvi* z%I~gsQZPtdx6b_Xrfe?2bCoVYN-5xHXwnL;+!QHDhy}XuC?+IOu$9|C6qXcG>xtLs z^PnCJg)T*+u_05-&^*4~{BpKZ8JU3CMW1#--H@}!tqp~BRQS(m`TVC?k6d0stPFm^ zf)zLrcz>m%VVI;ZOYS4P9OVN-aos!hk?KcYEw$>A^<;QC)O_gpdAL5xWlq8LSdV#X z_7h8mV{Wc|`&}E@dBS2xRAuGPajO1ns7YQPVq>< zDW++X;{OMwvq^!L?>TdK!^7Ye%aaiL`HCdSr*9 zx+K=-hH*>X{u(`i3OHMpJ;VTlx>5n1hlMOmsF#L-E;X!yJL(+}c67S@w2@e!Gh1lN zZT*68#v`vh?o^RC=p(;0B2n_G{_Y|F$XL@Z@{LgfwV*4LNK?FTt zCwCMz>iq0$myp4ymlvBS;3WN;Z$K)Mwv@2F^-L)no)c8eiOe6*wlnRluyS#qAp3O> zUZ61dxc@cts0Rj#ZHsv4@PpLffU3vJr~u>Vu4hgplvwL6Ew&3*+=(YbKYl_Grq1V*}}-vOr;5I zQBzi9b0gh1s;t+#(OVl0vVeR%tFwwb_jSy5CBZ2P`@2(2mvx6)bdx$m*b z#YoDRjDCufJr#5m`D8g%LY)Sm8#19|>%$|=BGw61wARk2wXTPM>EgP3)~NZ^*55_1ATdalWgM{ONAx zs1^dRqH5oqz0{)sx|FzAQqWoALVG@dh;yMM>|ISPGtp_bN8!9p6GW7YL2ywDg8WV_rGk7m0Htickw3W3Kt+7^3(T|C)4#Izz z9Ly41`Jl&hLa$=qGU)rSEG9&&UcG1~l#7d1rOJBw;nc#ig_T~SRA1B*1f~ypV%aRf zOIh+O1UYfW)YobIIq4I<;~)4MY0Kc_8KQspcD>Ew4$MT&KrD3rG^)=BVuFztJec`ZRv1^dv0^k;vr z9W{xq0fk}N+ItSyvQ0JlWA$~vX;p)t{o&}(yr+xIdWuflPN*fqZhhj^P0D_&`dG-V zzR5$+w14r+!=V_%AMYS}F1WQc~`H*4DqfapPgo)woO5-~ztc8cr(gij^T0 z1~f$<*4t3$E4ml*Zrj~7aWOM$#ypB# 
z1OEBP#5d752a5Jr&sh+W@>O>+1;2M?A9}QZ>7Z>#)>J%ng8SB}B-tk6q67uXv5l$I zYLXfp>q;Qa+vhvlnjTjhE+8kr$-_^0xYeN|U(Rr*UT4z(W9g|ytDK8~fG&4GiVmhpz5Br4Z1>!YuCy#i0);3MfzOzh zvXjQHoGzE_eQ(K{LH?ng#&Nh^u=HxDLwI8?!`N_rBf-L>%Yo^lLKB7`%0nsSnVvPi zR+uYfn6{sb=de^~4_~pw=Fa(kr4{zt2`=&ppg013aZ{qIOLNP>0FLTa23_>`Wp?3^ zd9gfUYRAp0&m?A@IsA*zxnuQw?t``YEI%*5XV#i~-K`&UD78GBI&1K}#WXGDctd=6 zs74ts&vugjo~q0jYZD>)$0% zF9+1~DEh^RobHeepV5-DSV|D`Gl(#Y_IycCmh*7V`u$qZ=xM9n_b0QKUN&kKt|xjyRj=I@@~lY0kgp>x&MO4hkGa5 zvxvBTU*7#DPz`)oYoR3bKCMlFVE79#jqI33* zUO4A(KP;+v zcXLR-L9$7)5rW~j?xItykDvxWM%-rk{^4KAvuZ;JaTeeG-+v>;>O(+|#S+mp{_8i7 zp~v%Ky44*c1@XvPu*-J^JHOBQPrKFG?*|46yrcEp`3f)YBd63%&ETi+$5fT(Za)Sg z(ymUB(>?+gmaLGbIZ-&Uw%SxD`@>NxnmUr8M~8ULKDTJ2xrYwt%b#KgsMyl-s0UqR zf0mG0H=`}rphVrW`b8`6u%nX~Yxi2Y zQqcwK%HGgQ1RutGh#4aACoM-u$OM3gbm)_qW!MQSE|us0OroCQ;gaYgjCQfayjD`w zYVj~QCZ39o&gOtI%Z(J?5EWpF7{U;9vAE1X32nSkW*Xo5CU>^>WX#2(usyp!L>|8R zO0tWz&WSph#te3#B59%ZBC=Q8fgwP1jv~z|CI-fner3R z8=$&XMBHi7_ttTHT@>e52bn*Un7#Rch~oj2xpUqmbXDQDW_so z+54vGk-8CS4u=^;SC5EXx)_U&7c>@s$VJ zxWr9~)JH93sxD8bDm=u@=kOEn!~kxLtmKDZeKui7#V-+_JFnu{NshfXj9h*O{w@@` zoMh+n4gN=gLCivTevOaz5PYUed4N20Z4cx2A0=ABGot9=KDF>`WEjjajQP#b3ssi` z7&3m{OuKeVOLKM!!r!6q!=sUB*uW4I&%osY$mK8${N>#&&ybPp{pvyqOR;M9a-Jh& z<0_Oa`LAC!;Fserg)0G!J0ZgkB889m@cx;w!cS~mZ?wrG+;?{UPT;rmhvr}ca*yVm z|Lt#gE+0TlNTjQ{G3l?t)UT&3+`Q!2{lLMeUk_~if&Y3WesKpjf`zhNT}TuD{{6ns z%l03J{B%B8?g7_#r}6uS*pBBEDW$4Dj<%Ey{q~ygce(}f1JC}H-(2(mwD#TcRQK=y z-BO{*PNdt&COesNtn5ujWR)#@D`kX8_MXWIgL<4ldhxCapfAA3$p>uKOV`34IXA7 zL}VV7j*ScLjZ|SAEAix^v7@**}0#~XTiBOU!Cgd1EIp?`fxiKWKKmzS+#`Y=eHA zdgCnIy8engP!>G_Kez1Lot;^xJpHmFo8ij74IfD6JDw_04Ln`_8w)XyNP#h~Lo}z6 zA*FyFJ7g`Dd)w~KIkjbnPiF*C;i2vRKVnT6@{E-I(gv@eS614M>aYP$d zc`{TTYaZ7ucQhZ_y0X7jGgUX&ne#|9SM|NymA@%r97}X16wbG39{^5DOoXo9+t7#3 zF~KjVVmb7|oyVYU`goXv5(4k|0-7S`td+0%7y|YW4Ox>FkAuRz$WN|X#OY_sDMFd|l4B!@ z(Lk1(1Qc+wf|e^5iac7F;ukb)jd@cdz!+s|y*0&WNsqi)(yghXUhdBXFXx1~$C+Uv<(xVwFP+JgS?01(1f zs05xmJ4PstX>31DK5*~neS}ElIDDe{5jsYi2o|hu8nZeRZZ%F8vXiCYYM%UkT{Pc$ 
z3B)aFYv-tYU*1yjfD^ZvY(R{5)}R)0zKy~g!q3mqW*tozYF)pi5Rg-0h}LQ5AU`D> zmRP~zUodj#;QE@>)jhdaGt*X}5{fy?d-Jje5n7!A_>Y4TZ46eag15W9FDnaZ5$yDXugAxFi#R|3tT5IWX|u-@cV&R0lR- z>s|PYHZ);m2W3NCl!E-c=e^8WI7C9UUcfd*4Oe+JAp#=S7r8WiCbbXf%5(XWpTkU& z;p}r>5aL!m=lm?g)#rsy)tcjYYG`VAR?zXuDQclETk>+N>Ox=&w1$0mZ4)5gsnPH0 zh+?>Yi9@gCc4a7hg|?Y54>jvU%hhct?{9*Fij*{bs!Ny0vI0lS7j`$7IoIcVI8Au8 zXsrRka6H)C)E8#e9`@NBQqPjXA2?HxOS9djq0#`N7bb{!G9aEZWYsQUUK}WkgCWTp zYkU|;23U$-WMg&L29r!CS{FHp#Vt|7(TxI~OSf&mb1FsvIb`C89zF7n&-autcvpYJV(dDlt2-8clrFv#NlhS#PW z%orjs-h?Qi4pH936Oo{%-KVXqARXo=A-=gt@tA#TNcOh(wTkN%uCBgsVp>v!cQ>qS zv!5sqdCnJBd)!KK(VnS<__5Xw8VFn)uMhJXuSxy5$Sb#GDuaFM)tS$NK09l@7b_Mb zyC`?UAHKU*8%-49{VnW^w%Nmhj9uQKQ(N^^#;sMvPPyXformHQ?z^nZ2Vk=|E&)H#;`YEG8-!cQfc+gNSlY=U0QjFiWb$&$e{(54x9=g8(*LF-<{n@hWG&b;UiTu)L&{l@2>y^ zHb#!-`KaSCk5v~lH(gHIv#Zx@Q1+E8GDdeGwwG2>?-DVjEXp1AqNL0>4x@D3f)pMD zPINvgE6hytWLQqB2$1}))q$*5xi`4hI;qpss=yP4uJk)qB)2{aB&1ws%{j<=B<4sl zBQ~V|Oq&C=9~qj3dY>1GoAcHod+sk?F(et!o-E;)T7lpBYI8881p&YZ8FhZOH{vvZ z<4#m}*Avi!1DuGI)do?cEhfG}m-Ot|q>r%7ceEjo%vMOG|@vY6N%1LS6`s1GC_>>s9 z+R3}n!Q(|GsOKJ=MpYGq7)@wca>-qZxX92T;o_{jS=f0|y|t8bY0=MJUW!#cAsUzL zhZo#0a$JAY#ht~pop(l~E7*4q*L*dvj{)c|^!$8WZ8*=~AMf()u=#;!oq9oRt?SQSH(6O7mU~CfSJjbbc*b zNR(P7G_c(={+feA5uUE6dRm~C)q^Dc^LM^G2sS61#0O^@r+{auHiidD4=S>rxWX#Fl%W%Kd4vfw_kmF6q;(vnt2@zF#0bRmF4AP&}$0Jjd9h zHY1tiWtCBslW{MPH0os2l8#Dk5|6n&Yp3MLDS3-gkduu1>m+{H;vUbwnbzsU*}!Oi z+6NP%j4$DAJI~KPTCVH`16aMsJvW!AO*mc^-LS|A+_Bt7s0GHHxa+MCZWiN`FkUb> zh$^jU?Swv_Pp0C%OT;^&ap^jRhXW(A~ z#l}i8U&D&zbl&Unp(gOY6haqcl%GeKe>j_xm)OJ_Up5CJghFEh-i=|=#M%Y9W?tL@ zpKlLO7)*%x7hL5V{52&Jmms1kY4fhg6sOU_!InI!(HEC0mXp(VOu+$la(zOe`EiU5 zx1mByULa-o42P%HaNQXrcHkf)@W@!*?HLN}pS6APr`lb?b?xN~o)^K(Qt76*Dthvc6dR@J&&mn(U|guf?7Z+%NslZAZ`{cY-9i7sP^Xzp*Xudvo z@Q?Qy$4mM1$^^MN`L4t=&h5iMINHSBT=^JQhsDBgsH~OaF(KJ7I+27{0n30olJlBW}WQd?jathDMhBBm;vSKIWn}S$eZa@uUdDtyc5q& zm7^A)eT`kXCF$g2+46jhoJ>NC9iI(H*I0Ar*ECdolt>)ITItqEGRf_i9+Az|&XBYq zI#(P@Z)Q9AT8&kw=$yF_+Nh3_gpvsc>phGAE-zg_n;J+!%?eboF6wllErVIYIjzjy 
zu1u_u?$5%O5WkU=;dcrOw)|Y-P(Cg?sIySZL3%-s=OF@VJtZzG{D+dL4>B;jGhaFi5wpv z;!JqClEhNzS9^Or$$R#^x?f-z&=}JS=h-nT0n_@2Z4P5iv!X0Tymz9mlIHP(9 zCoOt-@9BS<*p#>E_b2Zcv}$r6V@QsgI}%xyBmZ#a*RTDNpU}fWbwjhjd9PulK^Mm9xUlBsRh|%HwCBdcEL9xt z3NN!63vmQm;Ad`|($46Zl>h2G^m``%37)of>lA+X!4b@$cn zq?ZkMc1-OH7oUV|APR0>pV^AnLU9nbO6e&@xpFxR<3@F`Dhzo8bGLJ}theAy}P%{`O`g zpX+bvh~GJYHDV+=?-z=v^Wo}xT8#ocVQI7j@G*7bkwO2KV9(p-=1_KzaARc&#nz?% z?CaI}8ad%ZId~U~(oKbrl92^CU#Tza84e~!^95Hkx*fntTewEj$h6r1>gA#)Td+U- zs~p2?&_gVke<)9bDFLR6Pq0Kwv=##{@lFu?9VPQka|mW#yWN_ZA3<{6ZtEEEOL7Ww zjsWptYzed)PBa$Qb~1gFe{%L(Ac5$bYLb9lRG{pnt`J0wi4aiS;fRp(JPc;JDDZ4 zeV}{loF*f(TIdBuFJxz@T`f>|#uILSwC-?MI-h1Ex3*1r`F8rN!b&lS|IL(me}~;L z^jH_XTC?Knv`?mqi=EU3YaUANv)0Z%E=slw%Rg3K@bSFYCnh0eTHWJu`>J*AkzPbL z=xF0$XQqMa@SiE7$=2~{HAYC+M@SiZb@5WAJ1Q1;C6wQOcu+t^nvvuKMXBN^|6BEn z`-Qy-aJ$SzZ#G+NU%g%L7LviZlE!bXovXBn)Y|!Y@zLHb1}0lJyoZXhlE)P)S!y2! z=?*$TX0zFM>AaD{M<r zg?jwDWp*h*!=m_dq6E%pfm~_=2E{%G944Up__!`TW$6@SV2?yjvSY~F;(!j{naHUm zo>b9Hxt353G?`wmmnV#&n}Er&-)xt^+&>+Q%meIr?O@78^d0UjgNoSp6N}vRSX#-z zE!1)*>g4q-5=KQr`U`KArt=|bw~t}e>G9SC4#34x!;&JW$W+;%wD5!iPe#B}@{I?^ zNGc3Ky~}WzaD5de=S3zn^<}S6L5DWnfG%%~MACg2R__{b@!&}kQ-R}TBidnHW|_|j za?x33Vo&>NkTpq3YLpGw2~WQ)yYU{2=|YEapwO@GoF?KH_NvAm(;h;A>$Vpj^8-zs zCi`$(#aMk2l$LCxO(3-99q(t=FSS8Y1s^x-q*v&KVVj~b z$P$B;{*0gLVe6*an@pYssL-TAfZ=AQ{3+CufLZ7A12_Jh;X1x^+zFYqZuR$BI_$)7 zbP>83pMr~K^=x#^R9MGX;j*>)p4M;`8BB*q{XDDd)@aax$I7rD-W6j8GFpY{6OyOa zdA|Y<6qHcvL&Bu|FyX?M++wHuR^8a-h+`NqgCeC+6(ZFyg>jT~`1J;h?-L`DT~ihk z*~N1nWM(+qam!U8K&u^Z;XKSHQCa$ z#AEe(5Q?Rfv-xdaNQaWk*^fhTONif`#Ct3Jn`y21$I?IwtE~~FXXSRECq#$;$V55D zB^ZPkOQLLpw4e{+Lc#cOXM`O6V0m-fvp~E#+_0Slw>y_?`zQR*%zXzZuk|V82pSY) z!lQl}GFT~7J;gH>T~Qa;oib^o-?-stb$!(`|4GRH8^W&wD6hR=E$mA0iwFAozw%^& zpsBCF@wj&Ao_@Ya6Y&<%Ns(o{smk?LjDDzj-RZZ2RX&^foYU0pgS__+W7P+YTvn@e z30tFgP$g8|gAy&5op1H!*oMvE{z6p}UJX8i>b~P3DrlZ3vbT?>`+9t#=j#4)#8c^= z*~75j5KE=9LbG0pZKtgE-BHS>yrCrbAoZX_^L{l-Jnsy;d+&SD;I~Gs?U@1m3><5% 
zA;bFbSxz&ehj7kag)jDQci2Y`*3xSD2gNq~%O{l`jYr7U2fkmIIAl->ucnFTk)vJiBmo6Rd1)Ob8ysDLUa2N$T>~M$c{t*#2DEFZ8#kYmco?-Lm%fF~&#X*W|XH__^4T z^s00I1yULQhBk|d+ELGrm&D}PvxWW;Y#zZLBifqWd^^#9>+NGs@&i))c&A6IHU1AM z4kHT`ZZs<`TY@Sou7lf4AmXJbpHtEgfeRpc>ey@86KD3T=ti%h@JZ-Q4p$T=b9UkA zHUtuD4;Cdi7`3Aa>~tz{*o6*%7!USS-&ZYyic@9H`osiGU0k$u#}8>Eh4e201)7q$ zTD4XN)+bIrI7A+<2E7aU+n3j*8->E}^!s@3=UNa-q^Ae4<1cT&-%T)bm6wu zCS&Gk;J>8(KS$6n6B|z(GZre>9;eh=$^;C~;>cgZcf}))yyd(nWigHQm#@*p2mKqT ze?*-VbZ~lVo4Jw|&&!amLC1NRZO6!7Km7tXIR1 z7%8fc*ID31W5WkTrhlT$BmS}}onK3F`oA+FZb_^90@cj|XW05qzk%K`eQN>}_LfN` z6K9BREGs}C*>f(_bv51jZ>k<{%T8DJvL*Td+*Ago^cw*M_BytaUW^>k$EM`{#EXCf zX^atgFGlnA3cUNkRKj+E#6sL_F4;vpK2U9Hnj49J_S$YjiO8Ik|znt=`H(JnXlP$p~&FW736EphP9r^iS5y$=1 z8O+MP)p7aHIefT?c1xmM(mbR&X`9h*OKeO8U=;kU+%=aHdFm3T4&2XvU3NS@Ec z#$TZSai5t#V~1CpKmC4l)l`~Hnc*RwMw(MFx^u*{uQ~7!7431OJsa96mD`R5=9c zVCg70>Lq}2Z#YJq68(sRdmio-=!Zv|Hs2&12=yxxDCijZ|qeyjL=Fg ziTU7bhmJyhfdp>Tz-14ML$mttZbB#o*LKA1V+*Y+9rmgcW(U%@CqN9MEjt)s&phtBRL#j(W++StVcld?ToW+FPaw!bLuGk(NsifZ`=K|VbtP4T3bYFo zODx(^2zO}qlu=Zsyd_CM1(^8@AY5|omhYI7b>dHZZ7HdAfLUUQ^4Mg@*gFvVZ$%Y7#ny zrs^S}O_}BMqbCTW5m2PGh81f3&`=k>hrk5ekcBNoOf@Gpft;GfY8(W5rF$!^l7Xil z1ANtV))BPLjmORyeCdK_5)25Ug#_{Fo6VHgsAD2S0J2w20#>S`;JgD0=gFdqrAUEc zcgd-i6t*|9Wtv3>x&)WM91(g|a3PKMK&%%q6OO_=1EtRkr(SkMFf1cZs@G z_pxBraT$vc+qZigas%qS_fMQAc$J+ZQxbCBd!w^-2 zDLMxv#Sbah6d!f%c)oO55S0Vi)KYLT^(#LMVIP@n{mlQV7vr`MZI#(82eaAtw($Eztx4;zybqx#8gqlw_gN+3r&n+gm$N1ggx%z$rg87TK?D95_Ioum0g{k;WX9jlFO&oMv% zjy|AWs>$8rb(zyFn>En#cBo4O;#U~;P&gg3RbobYtz1=Fz7sJkv5_FZeV=j>!F!;j zSF%gF%5&WeI*}tffQGgV$rJ;_rFU@7j6mEdR;D`vATFOxo4bi%Yi{cms3k&}wEHUHACB~{I4KySfB6yuKO4}2 zrHexIi`of2v_@A=zSrhb>0%+NEJpowCLATfDL!`P0G9QeA*_gQz`%1hd!v+5=?-Xv zYzOplNzi;CaF(T>q}F~?blw7r#H$x|?D-Z?vOzb4oN<<^;raeo&%W=D9FUW}Q|DkU z0(C-D!c)0Tel_pPHaY6oU_J#ul@u9=09l&xpAz-ZVI?4-qg-O|J@LPw_6yaWtCl6D zvI7S2Ue1{IH8~wjqNrD?^U`2nBEqJcug$cVQfv3UFuorzwQ#95CPZM4j7<9;DmIWM z-z_FYYDHd^y|WF4&wYw*PC9#7(+<>7HjlX^Hw`nNiB(JyIfYJxeq} 
z5!jXCsXr}&DYz4!Kw?P*5CILe=iYix%GXPAMN`ckW>dxt*J;^inxB+E<$JD~35{SX z9!w=UHieL}p(GFBL8f{sL1`h(v6_G0aM;IUYZyvg^vx=#NMROu_~LU=J6Yg7^^DEd z;2Ttx782O(g)+zKiLMZ|OAWpNMoZ_qxxKaTOOSW`uX9g8Fiw`pUO#caf7S{LL8Jpn z%PcOBz>LbdOFli$0R=#jwgPUhgHW02+{i-e0DcnR_*)3`B+)g%QGT9Gcy@8dPwkVLKq`^%$D`| zWD^4EtcJM`L(G5n(N7W4(DA7)NTk2RiT01kruFcoOW#L84&b|)-tO-_`<6xP4k%%4 zb|OBlLR7xBqf6O20HZT=PBOAWOS9W3>>&rB;9o!`0|w+Ds+f)o%dD3eSUu5SphEaY zPzsbOW@f~`&tzgGILlM(YCV%3c!N9#TXYoYi79L++j-&6PU@jZ#cNN6V}C}4|3)`{ znqdC25l(>F92?DGmZzG{bOuqYguBPEcf;){)Gvz#)cjH(hg5hcVZ~@o(3$eTNzqVf z$CCI^R83AB^9E{+%xWJHbtiW}FQ1|uP|g8=K>iWttvANBtPxNPJxE=kU`I432B6F> z)vv--jwMw|{2{$N6ep=-u+g+;mOKv4c~4SxfOe{Pfwl4>2-vO%TaMbvDGT*miugVfo{UqI)x zs^Azv8X3tADKbVO-33%q*rOB~SPI6Jp$A>t=KemfTA8sC6n!JLL znGh?16g}0N#t8-7IX3+z4ISlQKb!>l{M_eREoPx3NANDoNl9ov8xXUX{${#9(_Z6h zE{W+7^?|^|H7w0{r-7)b;PPdRc~_V`dC2b zY=ejAy;ObJ(G-Ydc)(G);mUR)EqiP#UdPy+)J-%qj4G&@d=jM`$!%Kyy4mD9zh$L> z;tEhFuC-VBJ-_&!Xat;mxwl7EZv9KA*cq4D7 z!F~9JgjbD5IW%PO32u4`N6N(c`bQ4Xx11TWekxxYz`h=@-pW$80ERu&I5>eBvycbH7^lbI7tbKV zJc-E8u!N3T=m)W!Mi;HF7@h?l0f{YLuZqxG15r>N@eUe9D1eY#Qdl5AbNJd^%w+6e zM~`oHJMVT@-PFf)NUc*Q<}pcFwPSC4;$=-yHA$^hA8j_>fS~HKcgY9`<)wazYRVe^ zbZXj&w(l6546W*AY#tV~SN0=@)u78iDO(o{adp4zi%e7;Bh}_qR|ph`d{1%hm+K0> z8p=p#ROw!<@o4iA-VN3du=Vw4pQ0i{_z{+=(o$}usT7ok?PlAiAH!dHbqs1Hu8ERN zLW`gXAOLhVKtf6`+Wr+<(8P7-2-fpWMErSV2aqGHM6rv|pm1dQdSa$39h(5L=P-PJ zzHFrGJlMHaIPL)VCtk&Ln2p2P%;POZ1>j|7hNj+)&;9-_?I4=B<#G+d{BTvpOIJ-| zPvJJrHEbFDSD|`27yQ}D_V`p{=3}y%#Ht}QbA+A=?q!ME*K1B_ks1`61n?S3a%wSm z^1nh-N=Fcc3A+55AVny_A?|w;YKud91Yy6+;~L{Q)7GL~z^{hCg_5%gsO3gGCGlC( zfi;c*(M_*dVS(N&`4$MC=;&gHp(^&+x7*jDD}y6A!}+e)<`DhDaO5-XQJaIAyBdD; z>GugDfqp0-+JF)Ug9PXrFrL6?(MEF?>I|!<3lX<4e{6fs3vrIg9#aYGdFl+yvFA9; zAgjOtb;OV*6DCg)n43d4H~T=ZYqv>>@%gYd*fb>yx;g#iL%j1}m&1qHblh-DLS_!^ zdnoPv)~f%=QeGv=O(a^+Ojlw&Nx;M_19M4_s9vFeK;PM{;JUxvzx<_J z?wH-Fe8DX51>A}&)<$gL0b`1;R|o|+#Iftr&E@fXu33yw*nUbD)F2RE;4)&)+5lIv zq5L3gD!JpyG@`4>eK5JC_A&QHNnrr{P}1ri#JJ81rMbONXplBh`EGZc#?WC?A2tr@a)nn|CPcelExsS 
zIHKgrhI$fUPq*46PsczM#^i{PUj~YvP1G`{!RxlZ5eN(|? z^*4*vGwGcuBD8Y{gyoA5d<>65z#5SZ_3y(M=YAjo|2+#@2DJRv7Tl-&KF~x!DQTVv zbY~0Sj7&0Rxhnrn7#aVZiHap(Po)%bf1v?Xi+TmcIs!tGKfbc ziJX~A#_PX50NTlDgd0FEscb??9eCQIf=OJ(bL5VPBt@s}mb|66WN-!#z^~ zVH)BE!!M8FK$&%L2%}&cs1CiM`BlrK{&p@&r6VIyV1jVp$q{ugzUvwe2#v5YFG9wL z1_iWdkbn33n)GJ5i#VKU8+W4_fAO4Z0esx9+e&+C_AkMO3JNxKX+r{@4Hm{vw_ORp z32B(Ico!TfQV7Z1+AKzG<_IJBRl9A1j)=& zO3~!t=DMXM=_{&8bWQBN?<92E`K$NbZ=g)XCRnf4eX`Xf$h*-hH~P@#ND+Wn z$0$oq^4Nu?*8RqULn!|rC;jCPyl4O8J<5MRNs5pHTvp4L~b6wsPRl zrY^WX2Pp+|6gQyRgC*=tqISaZ|A4{&QhORF_^~`6i5331zW-%{x=3;*{W|{lqJ#fl zB*dHpJM*cymiV_L@ng5e$zk=)i7r0<1LT6|M81qi77TRgpIb_!`&`L>1wG zVm|qmWt%7!+{-KLdp@jUq9|#O&700HuPv;kI2HQo`Jk)Llg}iIUnR5~TX3GfoyU}z zAp4f7m|?&e`uvW8--F|`&KERtq_xj#q>nrWy?e82YHHSofAshItR)ZRscL9E2QvaM zmiO9Qs5zpc35ScqZ{MKlgcRPneOFDWdGGd(JzW{b?VEWSW8B*}zvH^dZr}JZMswW0 zK~slV{rjd^{QsYU?pCm*cd<5I#iXS{ug(vg*7_}Uv0wg>2qR(EsxTkTSKXPMe5Qtu zjy_doH`nNcKwcv_UI;VN(9mRLWX#mLjg?}AwU-%n;)S(m8sMgm{W9vL67c*S7>I#^ zVW}&tdUd`O$>+LV?S7C64{(?7`Nr&Zyq+-%l>*W_XAdtAJlPs!)o=8^I9!>7-ZWIk zrCV-}q8^L3>sz96VpvevXO>5a{7h2C`%M;>O}dkqcu#m+S7<$*YI zXle89aJC$SeDY&*@~zzr)RWJvtCJhNBz{PZ6=$T>xmwOO&-i;r_NcR%t!+>IG8=Fc zS`O{{{qbIGZ0ug`77IR3N^C5<>-P9)fqK~6oCvqit~<`V&^1Kh$>HxoJ;)kjnzHW9 zb}oSyqFd)GEidn}-;4=k+2l3tOC+YIh9a*#2x@lVk~F$|`OtFeYd7oJXd}Ke?;PLL z3LaUT8$@*0Jc;>eM){AX6J=Q_p20i*;w-M4cg;O4K5g*e(W#vcQaHU{>*U zL3NHRGGoQMXS*{~m2y2#q*?K(9`5cEG{E88GfRXyCN_O|I5^V{UT#5nv*a!f$A4w* z8ZWYgai|GjzqWNo%yVj~|M-Ya%37>d<)G8@W-HnVW;Heovr=E{Pl2%jL`Yvfa)v7^ zD!%eQ-)|2i4eMfeP!Qnq?PbQbJutpwAA7^3eKOk%-TN~;oG#*zj!klPF#vV#eD05d zs|B+h&CQ5}?q907>C9Cn^W8KB8D^GM+8zD=ima}#9vK-CnjSt9!Zw>xsVdg)Jj7Cn z^r71+XGI?82o-7k65Z00IXfBCFR&mzn)Ns;iNu4~QiP`<<}SRD=ka@v_@{ew`h9&s zfBm#Qc&*%*qovr`*vMw`WmB$Q-ERa_Dadx!G&CL~j*p&%HL0BOEG)1fH@#MS6P7x{ zhpQ~(8VpSulCI3D+<7W@CW%kYKL;1;)}4ODq_Q7=UZ7q=#ciJ+Y^_fHFh?(L{6)igHf~lxEu;aS}^3v&~Z*`IAJsatt5q@_c&~9uG%3h zn4fn)@;PFKusiX<%xz?{+ zHBurq_`ywirgoS}|6#O(;Dz{>7V6o8117jP*L%<$g)|n(iyPkpUBrjTaBO^8cPGOe 
zAwm1&^`Y}~_c1*W#OV&?@R+Do$4gV6^lwdAFfid(AF*6F!3u4LGu(wRNhw@d6VI&V zwLlv`G8lkJe9+|y?myKAKv#Slqk*#cVY;(eeP=TX4X*#ry^FtKKt=v2(H+siWm|iZ z&DwZw2w<$Z*3{P*xzHMvH0`N91xJq06;ovB)>b1T)Mu*M@|vZCDla8#13f%EfOP|* znzf1j%m@1ZcC247=^@zck!MtY$ONyChr`LaDlL5wbDhXOc3M%fDvrjJ8i%Z%JF&9K z0$yt3iPNr01uPmyZ|wf6h|8z)Z#2@3v}@1Jw)lvh=h9bk5)vf!WxLSy<5BWHgSf%$ za$IQia&(ivK^6~*^?>B;7c&YC02)Hr4kw-u;9&NJhj`V>&+w9ST5~%t2g70D8ZlzpH;v64rp{F9J0K_9IxQPj6+R4W;?QMd1YD@1Xi zu{hZuMEyQ({kODck0K88qv{xs7rdPW%d>6@+z@6XGM`UmA>Wr z?ISrTHSf)nVK!+d(g{dSw{{di?+K`n%L7`_Aw?c}rT4M5DQ*0;o1C@rd*Z=Ni7G{8 zL_WB2P{t3Ajz+vl>n1ldSy_4Gy>~oazmceRZ_n`Bd4zfi9`g3j1|I3`rc+95naAa| zLZgq8c;%Ze1g~U?>L?gMnD|x69W?&M!P@ zA>L~Dw;vGQraW)X4(2m$BX$^)N(7+hyGzXi$&gW&jpz*l0{MYVFlE`nar$)q`Mu}EZx@9w7HR2L3US+98~ZH` zYZ|go&VIKrl;}>d3;eS+_gDOSP+WcQGU#XO?&g8^rdK*|on@noiBO%1j-p2zYze#5 z=4DZt+;DM2{TCqyFkg49GK}-0ntPMvXB{iVK+MZ5S2?|uicGc-rKw4{b=20tN2_v8 zQsQk_lKUyK%dN0Yx>fv@c0SdwgPyrOTzd#E&(JT<{BwJh2}G~WkKqJXhfXXjYdTtx z>raO7Qxw*x?M_FD4u=iRI=QDm@e0&Aexu7?CvhIG@GHHa(0CZ*`I1q(1K+XUjf25? 
zmy6F>?VBh34XgFwdQm$DC-{h4FI2uupGTIE*Yl~Z`DxeH^<1GA%y%F)q@e_=;i*un z0WZ-y>1$R9;&gJea2@sD3KdE!?}&PsW%zXKub{~5JcS%x$E!k0J@*d_v6P)=oh&ar z)dIn@>c*=vNd%v~GDeOPIl}n`ny4K>Oy@@mw(#bGSF1P2qiVOyPtg#kc(7aE1NdW& zh804*_0M1HB5OW$cV$N4*hqGSf8Akiya`WtP9lkzefdk4{!v5FnOt>+v3LlPsHusr zfQY9MpMbdOO~HDdb56W$fl`m7=`;gGy8}0YRUyabF+6|$$+eM11ZxG0k4xoexinrG z{p0Px!XK|X9g1Z2Q<#6r3bOEuXoY(ke72TzZyMSAtGm&mJvk@vK{%~n&aSY_Adh^c zVZiT>8|Ijcn=VQM$$Bh?@BU1@Dvaxj=J|!C*${I|H_`fami-%d9>pXx;ol9{AhELmzFFZ!^#Rh2$^MKDgSuDB-;MVK z)H-^Iw-$;Z29(;|RLZQ$@h;-x%i|xJqV;UFbiK5oZYr!nEoE%wGK|(+8Huh-S{o*; zU==cr^<4)I+7dWLQz1v#%IbE}e}#r;{nK3CD=c&^Edoq-`1ES@Q7D+&BKT+@*LE%_ z(`73aa`UH;^o7~*9~&bo94h52MGeh0IQQf?TL{ucB_ezm6)eYOCkD=RSBJ~nz1*`x zAH^DmlPmioA6#@D!?wgLtd75_=A*C0jnD2BJWg2*oSF*AWBt2VAV2+(jj^ZkAzIw~ zRqe52IrZoZugDN?fdac3*=H_u4$+o$;=g1ax2;fTj`AVWM z1P!n>e?D5R&RF{9V6wfpuAhGa=V{PsOHQw(WwxZ57;01x1{g zmh8+Qy3+j)ByxEBin-`l3Zwh4Lnj}6(TGi>#Y#aCGct8n?{`OxUE-Dg$b8E&SG&kc z>cP7cGs&{th!tNpd4d&&Rn4nSV}1EYwzZf!&zx&}$>-3x6U>38yv*^`L}0=}=~oL2 z?st;;!LpBZ#9&3Uh2N&McSGtgR+Csk7%kKho*MNEK4YZXiL6k9dYz3k-_w&%@`M^i zhF{KPHVG!U7K+S`ku;3tqVguLu?7 zLt*}5LDM|amXhy53>grpyN5blA`N*1a(>L8b92-|SJ5kuf>o$XAG zWr$%GS-n`D_1e2$O|mQd2A$-qNP@%M;~CO?kmbEMOgh#3Aq=S?iUOdSSlBVkv+WD* za&q!57|c5pYI<|h+yP-Yv*C|obeuYHf)IC;S@SjN?6(VAkR#VisYJNcV1sG+)?Ix> z7S7u`swMN({0~Ecp`oGaWM-SE8+^f6FoKe{WS%F2@*h*&fZwTKd@9X-m{iTvMbxfVs=+WYj8?OPzWZz7luzUcpV< zhg1iTcp*RTk8bZ_nzcs`A$oLmJJgYbga$E@ff*w6!W=DOY^46vZJfa$% zQWdNxowzKJO^3VIW41S^74wM~&gIs3e8Fp|6zezcmE|#I%(KVFZV8dv61Th>t?8}pr6)Sj+On$hV5M97EUC+kCU!>Nyt7WzL~F|4j? 
zHtKxWnA;a!ZA#mk8O$itKU-U+j52CUxje;PQl?(bi(hWwYuH=5r&X7!9)K}fVL=PQ z#M6Yav~6wkxeJR=!mQRBX~0jQ)^j@unvyMD%9r-FP1S%-z=yY<3dxuJ{UJ?6(0#LE zR)l@P+o^B`fmF@7Ccy*1f=BFot)9~$V8Om%y>M(w&W)5_9_j{m{))!nM-(zXFCA7k zLiLt!=y9%E$+reDDxbW$EDCTEOdy(LEsO!2tK?@P^Jhq>E2#6O*nbDXY z>Tu+`J?XXbCoU=~3Ye?8ZW#N?S*ekF1UMz7Y|rrQBG1slLXB~M)@Udy-XNa!szmOJ!o;EP&T=t2S$CJNFe(QBl?`peIn)^9YP^{yJ58qAAHwa`d z$6K|(q0FXLI~uB)Sn?^b`;3Dn@s5K9qRxSulDLN9pumP&kn0D6vX6|*!qqDWGpN4u zFzm%ya92dz*TV9C(y3^eNNxS`M1Jq@O>WxYnr!B?IQCo6w*wo zo11*K6~b5Iqx58OGm=&cJv5l9YMXU0E<%g{*(9xvRbdlw%!U5z-!P|LdR(uCNs6&>ZciRl)ed+pCj2rPsA3`DT z7vk!R)sbYp&bh(Qo(+uGTIB%93%k%~y58Q-6Fj z%@t~QVOmFLaangNBx~n%bA7huTsyid=OlEkv!zwv<>e$q5dHBUrll?bE#bb-`anWa zjG&o?kq*fGXzs>z4~g_D>KU4a_UcDM zjmm<1lX=Qrw&1(uZ%f7ol6jQHJLukNU!2@H%?URen)RO<5oL#T9SsQ0H9?_8Juf7( zB*SB3VUz%KMTko!19N8c-N#&sc`RYLFsCL${R~?EHlDN5SyB z&$4E|7ZTz>=Hu(@@72&DyM-7a?{idRB`3e`N&~)beG;~{N;&0)gETr!p6K+wQrZfw zh)#;<+C3S2rx00|!ov>6SIUyC@oHQI<57)M{&{ayv%*qPFC3jt|D$ls4N;nWngs+%y_?-m5*xX}PkRym* zUS1}*tC#me9llkl5sL0uKGQi4h_uHCLqn8AH&<$Pazw2ZpZuvKDIXF<+m=|LKP{7Z zLYnI&8>`K)oZnGtGsOw`yt7#^yOtzn`g$I^T#uuEw~Dda=L0qGHL8sUc9nC&Ccnz3 z@Se1j>L2$GhLN(SvYGt(AsriI)+d|Fv8Sfn3RtTE*RAD*p@!z!Ys?7B8|mw-X{P~? zo3W*hjg4?J4$r-YGx~R_f?$amGrTA3@EW~_Bv^5e(;*(^M5C46>23pra3oX0QRYt? 
z@|dA048-aC*v7beSZP-`)-Gk{p5o6*XmEkwAd@GeR8_0AaxOG7SnC%4hf0a zI5^$_MiD1lXM4~aug(a{3nAq9J!m=kCx3r2F)@A54Lba);WTas!yDxFocB>5C7-KC zks85;rO!78>_MHQ)jn`p=a(n!?)x8#tZEv8R1Z+N40Ola+gnd>Hs6I@y=W@eMQJiP zG*m`j-gcx5-xLZQp*tqf8NxxM-kd$_jhpEzbaFl4#W;AuZ2csR=K1(jLxPC}H;v0` z2hB$O*9q_FW2MCp4-XxkoCNqUC(6yJeU67NPqv`QnjG>y!1WYpSK0O@avbefyEb^9 zyl*;D5W22#%E?J`&Ue!j7rePTx;znRD3S@nqdfL(2*NCC1hxxaS65d%w>0z5K03=J7ZZpD%-5B=x5_xcSmPY0?7M~`* zaZh|XVlZebj845$Ui)~5c3^IY+KTSwb8ggY{Ag*@nK!8qUgu_)QruyEti%ImO8)AD zUl1m2!g@0ZXcHNZuTXam+o#*tn}i65g%FLle-6wM@yNR%YMWCgc z;Zp)#M{}ns!`O4mLWZEIr*@tTNOZwW_#N}%ycw_}lCXymqVF?cF_4+%n!0?v*H{E2m z)ijD*gfEMg+!xoBL?DjNv-{_&3=(~A=X+IKIi1r`tMs?c-L068M|jIfXSGd^y`dZG zJ*KymGSrxnn^RRK!G-!wdRwAbkM`>E4PYR4@}tfNnV9w6*V(|J92VQ4U&sY$3<~Qy zUk(JY3#GB->VmNBnypslt5eI~n(xKQRUB{Co1xLXKg|$Qg_~pTW?U`~OC8pEUQ7vb zmPz^4n&&@9vH~9xU|A0qigX)@v!n=E@Ih+|%>h{0dkwVsdguh_tjcaGq`>$bh`OJT1~s4}R^9 zin6MLDcp#_(Nv{=X|NhMjkT3$8FV?nX@HWb85eP_XQ3+kN^DnK1u00 z#VQB5;2ggUBVFt7-@oE*)|^L_bqo?xQedcp(LYnWV%D7G?0%@n-3O0j=1pOtr4N%ke58aZ=b$ey(Mg1`Oi+ zsql{d$BK$KFa`;C_i8W^0+=7&ix(jY7lI-=!nfb#|Es0LrW$E`xaLGJANXCsRhLbz@2 zHO;eUpVgwNKD;4YT3i%=#>N(-ho&dLLlyJ&>kk*yO6*z7Fn+h=V2HW8R_c{hX)uA5 zn)T<6J77QPV?$<>XBBp+3}IRmSKvgEZc1x~rlU|YW@ctaj)PG!N=8N1KkbXJsQodQ z3&atMR2(oGK|x?AsD(*_*=nyl*KTiHBjR^LLPE6ERa!Tkc@s^}@5gqxji6{$vH>DZ zG2qO_+gvv*P{c3yNkJcS15xzl3l8G$$^O3mBv*A@D7pCO>gFVHZ#E|wdBh#WTmY5C zmvfdF_+OFKcZN_yE1{Q2rq{2BkUn1a_OIqvYU3=wiA>M0Z0nMbN1?c)>_Z*6{r1~N z)Kf~S_{HJrkHb6a>-m8`l}|uuX)k!kih{|#Kl$XmufzjNkQ)uy<>T#%a)+f3uQTtP z*2a5OkMZ;jcV3nU+n49$Zd?NqL1JQJfW3*fzC8oG>noY161;_@&X(+FH}&NKMR@`xBMPbpEq1YH2BEYD$kC z29{|_g>QoEkBLtbJ+YTfs>8J38$1Z#KKl#Ol>8rl`+>wL2(;al&VHX_;Q_~N*j=0S z7$*>{C~o^cCZ;xkK}ONnyeabSxUXNo{wxAU_9YVwa%sjqK!M=3D8r%ftRH)^K33HZk`STAJ77hG>b7*MjRFGb43}uh% zhl=p05Wok-1V2;oaXSfxp%Nb76S)N;h8F zK}<&{`4XAz!Z=G0jL-%C6blTOE6pFz!N33eUP z+xj342Z!pGmbb8u&UBt~&PYnB*8AqZQod&l41Aoc#v~zUgB#8-ZG%HX3{n6>3x_Y% z6pDz09YS7?H6c%@Cc5q@cwe+b1$f@@&u`nANeLWmGgyS(y?d90Nn^0u!KjGw@Cn$| 
ziHQlY%r_49)G%9#k})7jxUxHOZ1)f3d24HH!X(&UMRxZ$Ru7{=UYN61{qg_S0*vG; zp_5{#hDRhlPWui$t`5TXOJYKv9|@g!I0NyBh=>%Z6>ZMcRl$ib9{V>(efySe8@MA! zQBCd244!2J>+*b15(7S+j32B9I+$*ni*hm-N;&Tm*ew1&BLE0|xIIAe^-q?Uf@U`$ojS@0-yOFkh!(n9z&~0W7~4A08e5tAJ3Jg46EngO^>2PiPF~P> z63}OAHIf@GpTp;}sp3)$zRdiD0Vb22Z2g8zSDjWysX&dH;KdEp7ND%XeQBR*SVV*x zLK*D%Z#Rorj^MZQ^8H_5D3h@0Q=IqP0<7dDqicrvV6}COa<#gj26>}Hw=x8 zh=_<_EHdmYy<}h8+S-2ncy=@5W46}?kBH2r5lo;{LU zC>0bO>`wC(S&0Nxm=8goKhG8M#{jO3ltNuKcW=SS$mrU>D72>>BL@o)Xa$&`?|Db9 zc6&KSCo#(wyvWD{6%0fkek;85+r`AxG&?&xvCs5m5T6Ls%@{ZI8mVT3o#{#wP@wYC zN)e%--;vvfs=^QqExfb;nk0;lgJZN&B4Eh;OK9j}?8NGB57|gNC^8r&m-WXP%eRJq zhyXgJ~swDfCovQGKj`+;0h z`21roV`Jl2bwHDTSAm38W8W{2AGHDz!NIwC`dwa6trRKZRnEnNmuyvg$TZ%8%jWC4 z$ovw2TXPO1aSb$S3;GaRQnB?zmt_^ zWMZlqxKNOm#=^tH!^d~Sd8+7ON!kmrpF{c?9oFd9=H+7mkw;58D;tGB_h)A-l@z;1DGWwBGT2Q0FR#b z1Cc=Y`}glbV$pPtUU`N=N?iPW{jXb58AiJ`XHIsusi|px&@((@_3J{BoeQ!jPx4F; z3bm{HT}DyDI6DCstE8mV2un(@)bxMP=8)^*^IE5$PSZc}10fZa zZTSRnkdl%Zc6>noW1v>%DtN+Lt%q?t!(XpNy@$~o;=CVNDB_^?laP`!B`WH7k)5Va z97k?74LyB_;jdCsdT<{XaGqDu6V`f_)|#H4wSA__QY!27D?N<$3|5NCm)s-UxKahJ z0h<1VV7A%c1{+Z`^?AU zV&b2b@_vhlg(4KS5@a}mAVvWGl2i{Ru2E3bBP^q@A?pP#fi&vm)4ApC0fwC({bU69l7a11lmN0N2{Qpgrs|>e z=CUePmJatxdM7mTV%SL2fA~Hoz{lw51$E`O;Es9_L86f=hB2w=OLs06_fc93!#~8Q zrV4`kozCys7(sZc0EEB`fY2zD|1E)@Dg{UT9S6i|YPt#_ZRRVVJF<9m2CO?7{^R?* zY*5l9DJA9At(G;KGnzv}L}ZjE8k3y7&A_OJdg2}MM7LiP)~4k(BNuo8tv|fVT?ao@ zt`a$b_(qDU>BNF|XX>P+q!5=c(Qt)0!rEgJ6Y~kd!~yDS)0#zqVcf}qT&doxK`}dD z;A8k_T(KTmS;FU^)6unC6eu<5m8fZ)uYXHOgjxb|92^|vzh($QAr2kJom6Fy3j3Ph_ilXp>JTomf&q(<{$5E3AF08Nciool>&_O>lk?-R+i1r zEs1&{piXA($2JWtWxC2tdMM&E*PVG0#+a;o+w;KIL>Ivv=kEZusSg{q$KeX)fiAf@kL{uIQSJ~vq^?U0xy|cLTX)UVw}Nf6Y}Ja5WM(e4Ev!qOjKVBp`laVj%u~I^u-zlE0X+?aaa& z{D@&{6*PyL9ZsKUt&e3+Yx1!9mJQ7lJm?>m4yKI7kHIlIKtDCBDIGX&lXMH+=D=dE5+|OY)dS$7@kr(_dFc zCRCXZhnP3_i{$WMlzc!{1DgI~K8wFvfhzOs+_z{3P55rfm`e9}MSwmwUj10cD%SE^ z%mOwJ(;9t;+1d0E_K$=OcfN%SxLpsW5UdVo5g4ZB6Q<>PSRk*s{YF-brlzxKQrHY|M>}~*86hv76Go=bBoid(Wm1jyXJIP(V 
z2p7%=4|=!s{ve$|Sez0Of0?!!#BSD4!S7z0_%TfRyW@<0j`4~-2OqHozVl1f_djzJ zp5~)VUJMIwD{F1n#4NEco|hY4FFyh+P`}j%!FUdps9FZ~=+Bn8=@H4G>V;AXUzLC1 z+`q~pu_!$Pu)j~)C_|wV|0jl#qYKbcenGbi zVvO6Ed+DX8c3P?Lpp2nZMj)mcm+SNV*wY6%7J+HiX4CS7zSZ|!t8WtC5fBi7@PUdE zoM+Dgm4(*$xv-g3RmkeOzKPpng@Uj-P)=-h;c+vPTr!!NAW=Fz%as6ztRUd4V2n9N zyC2vRfKMlyRo0^t5-fxWNl8fw2t*V>@U7_cKl{OYSXr2%Lacdb;Sh;! zTCTZ6w@^{rEVv@V;r!+?&L~B$H!6-`r~DWA3=K22F+r|}gB%4EAOa2ZhFCq9{)vC; zL)Wv}l&+?iE1sU%^Z~@llW#LeUf^3c`Xd}M3q})x$4N}IOj{uUa{zeUy~BS8oZtX= z^zXW;`^aQ!ZEw&y4w@TRGkM)MEVKRpP*QM}g5d3?+pC0kww_>#?D5RnDB6hXFRCt5 z1?4qwCt>*xt{5XjG14FZQ{Ed~{L(X59vg#WDcyKCJ)5UGuk};AVFJ)tfOnqeuO5Tb zgfwtq!s!323m9g&^4mnzz$cVc{^j4Yzw2D-phdhsI6}j2e%R~ zz3j|>S)wu(h!;FPF9D}a2^P9_l0W(hTU{dxgh+l5K-4lX3sO%6nqqm|b@E?zY%*&3 zRhZorQz(HPcEXrY`Y~2n&;ymoG+g{6Ds%EKue)8qtKgKj(1EwPwXApRndkq7=h~ z)k8QrD-(|hz}&sP>nACdbB5^{7|MSD&3^tL(Lk(n@esE05jFmR!y;Dz@lR+|foZwN zM41W5GMuguOJ==(`7%H8&c}8`KpZKpJi!SBjL9*`~OJVUptQ~3Y3mJ=@=WlAsc-V7V7L; zs#OG3_q2#k#S}1g+As?|aHwNrd#@XcicHU$|2tjxz&=x11qJmmrfp0{*uu}@$p`-i zXE0klK(=i1f<@EP`bUBa66mcvLD_Q|#(**~{VC!6Y#abI^syVYtl*U8ki7bklvF)H zjuAxH;N+(=3le@xDW>3D_IvHtK>!2`E))^7Y?TE8F@5X{2xPIAh-9SYffv`m`k<57 z22d;@D5xMqQ{fJppZ+c0-)fXUSE&XQ*pAtHkBAI02~%DTrf=ZXj%W#eX&syJ{rh(i z+UnKW&eZ1LfHnSh`|ksr3v%V7vo&W>P*AxLzy_rM zf0W%AcUC$84}gaXZTF!zJ9njHuFB$vUmAdwWg@}e0Y`Nd<@uj2+jk@hlL){&HH`v- zfa3upHR&?-dc018{@Jr<^z^kWZb0)A|7&ypk?*gc z?(m3S3{!v*emjLWD=Q1cjDHj;wr3#V{qWzv(?=+(q3{2a%@|Kn(Q~t!&ps`mCZ`}g zJT=KZlS>$ z6jW5Z)gJ?zKXmo?t64o~B9XPXx7QSSii_c&I24tdYP)xix{>k-M6racKpj^(Mm|^? 
z1PIVRpZEi`r@gbY;Bm``S3iFK1Qr@I_KP+9t*mT8S4RR+i=O|WXK-ONHTSN!CYu9L zD=@vyi84>g$t9FU$96AI90KJF`xSWCk5!4Q0L8V7qCFxaC5?}d&mrT=HN^!iWQ4Z* z^2$m(Ac|BV&z`l-e)tQb@O&Hyurm%%FvG?$RM$(j>qoX5Opyx}F3G+$=O0a10LqJ}*&AUo~ zpmQeVE!g!@OI2lZfEpDn0IhYTidtGkZ5X*d4wnII4n1E80_oqS&Pe?lM~NvelHH{=EAH>i zWW7Z>iJV>aM-$4%g*F8Sg{yO?{@pz|(rpH0mBGJLRAPUzEd=yKjn=+F6UNOL+bS4-@IFc{krH74l%9cceq@AS)PLC&uMZUw2ipwjt%~A*&=j2j^8{lL z2nJ`ZTCkfSm&wkqyj2TEO5jiA@ILHjxJ_IW?CBhxr8Zw)xa|?-tu+m<*6fhLgFh{2?Yt5b&y${C75AIhZs` zg~e0Jg|aonvQwWBOkN_!hb#vOim<_w?(?SD!V5JajvIgJ5F*6F;|v@XKy?;^sNog= znMQn1)1&C-Xy|dT{owSx6mmS>z<+yt`xy~XB_J`%>D4rR~h-`9Aap zF_$fuG;%Wt5`wYPz_i`*LC1y?aEumE{~VJHhs^SzW7Pvm5<+GmwZWl#YJlvQhg0RaKvQMwi8(Uk&zRkuLFDHm9*U9}5}cT$p*DF_KOrSFWL^sqLyb#_88 zH*(@xbpO_p^f_DfrQN?aEkL|PIE*|pzjnrk}u2PRJvoxR6QFZ-3(8|;nOrS-I zp8gKyJ<cI1tO329pkXmw*{r|*8}CuW#S#6x@Ad#lH->YpbcZ;= zIYfc~;B)Ym$NW9^82bBFr&-Usx~|sh*!M5)y#r&JoP&5;{RZb||5EWa7)C4$!&M zZ6ksU&{f-XzKC7alUED$3Z%GC_sAa;a1Hb9_s7(eMMv-=ZQvb2sfF+csxNx?CnqjA z+XC|^JqJK7VSL%9Ucu59_Rjm1x}nx(4wPa6^QtJ85J#Z|1hhXs1~t792PnCNokA<< z>gBSX&UIl{&G39vzkjBb!aJMc!mMZ9HUANExKlL;DlGl}g2)&IDLlSc=Kzz)=RBWl_4p;#nH!WE-u@N6}YT-bL8a`p|1taj+-( zpgnNyltgX^+A6qqyfF;HnsD{g18Ftl_g@+l*LN;0~(y1oGUS*;2_K&!GNKEdbSx+z{;%IcPUzND|P`DgiwyPuIJ7)FLCTbSD{dkE-RUsK;2XpOB8! 
z47AWvQJ?*9w;>A}^MwU!Q!G>c!#X1;WBYaxTXL|8HnBT;bcK)uKJn{k7EXJ zJd6c&h>fNj70@rK-#G5<|11ycrLI38+dZqa8vLc|AF(redb9QYVpWVZMZa-l#}$m= zd47}ysZ5dWKrYOp{(i8@?mD3`nu~>oHa7-y(MAju1tmHiJ)oZ{un?hYSD&*zL=>Ox z+9(#N&16Hbuf8uG7~OcRM%2PAIo1zPzs;3EAERAiX|}p7_lNktI$WtDvE`3K3{PYl zt@-xOd_SGGUz%ZWx-OC-y`H(WSe|#V8VIRyw>*%=pH_1ouUu)B;=_3?NMj_XHzfE3 zodMrYi~5U9EG3d8$C6Xto{imUkZYEcwZL{S|Lf(12zo&uxXi0wQacqX|?p z+nV~)a_=8VV48qFCy5yfRP*Zo`{h>3gHX*kc<0?JI1uP1_kV#+)j zU${Rhsn#FOLvUJUtCg3c*oQSXI~9}Wl`uepsFvM8wRQgG`$B>!cz*4KIX?wx#H{P@ ztWAj1nYBRYrYy~SFj@&pUG1f1{G2mVf(ze^{;asV*^NT=%(Uyp>JbY8(~Gjtg19AA zLWrC9dOfwzKSthIT_h8nxhfi=kJx#vrFU%UwCf)&{uqye+1gP`Wih&eYLi?9|s9HZ%o9M zGrgABs~_%6iF>w$$wN=D`nmtwH0Y7fx@NYAfxo03GpGGQQ)pcIvkY2?dx}d@94d0< z^Ql##!H)nshN!3Xd%iYYM|A#_&-umpO}>`oO>pw~e)rYgXziQVygF-F`GJ8SeZ zgmeVr^x7&c7S@Xh>x@N*nppq8`(dApI(pVrkinSDI(#5yCeu-dP@GICQ{O=V{AWr3 zx6%#eFhS>QBl5DC7K%<7iw|sEU!p*X^-EL{)*ibNdah;{S$?jPqKP`=RlS4d@%~w) zY)YNJUM$sIox5;UQn|eN@>s$`{CgG#&I>LZ(hv`S_`41{^#;dF<#!+uBezKDYH6Ffx#56;Y+&`9QESS{&C( zvxF9ecn|t5D`hk~N$>{FsIDr4SN87^L@#+&g+A7*Vsx0f7vviD+4-fb*X$qeH{zA1 ziA4~I)Tv3Bv(4K6P5xv+0>!Rul{e4S#-Gg<%;cQytk3UfuMjOplvD}o1FSoJ1G8R* zU*Z|zt=jDyks0d&M`fV2h=MPQ?r%YOt9~qUH&+iYz{JEfPHh}6=AADb-V$z!eueBo z;_93A>R!$5VB#&O6>O^dPB(5~jvo9HY4DUENV(ow`?RH(zT)EsE8Lt1N3EuGJOjuS z9(9b@SPCVm~HeN|A)o20U!lU)Ypq4+pq<(e~tLg|@p(_UD5LSZ6 zSmfQ}9INe5)$(WI=j-cLtXCIBoJsV79JQy@0ZuPl);Uc&V;0<0yq<$nq~ALT#pS&L z!V9x4$4?jpPihkNaLBNd8V@HcJfuJatoZvgN>T^kO4ZJX(KDEZhK2Ie`_7Ye2(ky& z+K;cr8AoA>^p0Y;DbZ>5_)H5*NPTh4=;8ON2L_$&R|q`Y_#= zQzqP&dmCxZPHnsyD*lpMIiFQeD4>UQ4SBQ%vGS~!&N(j8Tarx_`}f{OM{}B*z^3aeq&;{NWY&0r6u%44=XJ`DtJ(agufcR{mJCVEfCRR1J3S)>@*b;Z~rUG$2- zo_3~xlMoMp5q7?6>A&3J|MxQCK2f-0vB1{aT5vEqXMD=N~RSNB&bME0*wg~BuX}*AUQXw0YwEQiAWG6iR2_XgNS4a5+w(L zCU%pW2Ac5gW@esu-tYdrKkjqu&)`5+ovJ!#?|t@OYi-Z1w&54Zzlj#sIay+TXpOG= zUY&K#d}ByFHxyl=H5~rAvh+D5wCMKQ?~td&MQc(-?%oHWPJeQ@oUbAWMBEW&q3pbM zXF^Y3SS1+@-}wgC*eLhuv2^o?0e^bmOdlfM^HHqadA!hW6{S_+pI|m6HaQSPKmAHB zu~?@FF|KzjQ^OxD(^pU2a;d9Sh;};+QGSgaJ)-#p-{GOZt>B`MV 
z!1C3NSkJW%oL#dMzsfDYh)|mde$-V!J~vvfYQ4ZX(6ecTH!rwr1j)6_iz*7Z^u+uw zq^N~YPn9_v)=>n`G2*{2k(Tiynv2ym2M2Cc=8u*8Cg9oIJ@?Xn8EEmj<`w$hnm0^# zJ3r4XetFaAY)D#2)`mV*P99Fq6|$aWKhBoOrYOi4>0n)o9hH<8#gB`=WYrb<`sKUg zTCio~dznXyF?2u~uJ}N(jQe_*w7U2PAUR$b9kk;;jnmt#Zn&^y=G+g zRkQv(erGhgLlU$*tG2_YedSiOV>;l}wy{@jH2lmgvN02a+Ho5gwY7$hXHm5cAQRBd zK(82ZTipD|ZbIGT}jR+C_BoBZ`y64P070fThbT1BRJ`E zvLILA%CN9)>fCsfMy24)w%SHW+A)63@`b=H?p z<+*I&nboO;{BSh;=AUvdLzFd{G=}g}i8u1LTeZxZ^Skx1QbP2!BwL4w8IDYFk?Png zl0fdoRZ3D@r|a&?cqBf!uKH5zUGuxEzEtu@uU@>RPGO6P_v~^CQC|!b4HaVJay@DWiG|e3l4@dSk`M#mIB4zgsoKNt!OrF;}~k z{Je;npsFKWE8)ZybOm-bDJta3!Cg!CC2KsEvQbm(NrINDzGa)N+w8F6<_)mL>WjEUArlu3*;&-(dC5kKskne5 zb28**AT#-Tlh(gEkdRO9Eq|vjd^&UV3!KBfH>gZu7ufcK*KN3%nILffEBwM?;z!eb zb~$dsU$D2F0U<-KmU!{{y?txeo7w*Y6q$-WDn|+rg>O^O5977B)g^Sa zsk1!%dhS}Za8A|o9_l}o@e%FXZ z*_$JCyF^)DBxLjCa||oj1Sq|%$xk%K_a!C8{}%48wJU8Y_oqH9F@3d_?v}lx2$l0h(Lh<{Tgy?lvvDByoQ2O(s zWzR<6$HG&M%ylNUq~1@I*>vCKj%#NODaz6_w0U#}W4p049k;u^szTUEiZHyRq|{Z8 zM9}Y-`AkxPHUjcb0G*8$x1Y`iS+&LC478#IP!?xKDoIexIB5U848FJkFjV;YE!P+N z=lgPuN_$J~vefaP3m_FA3ZMbysB2B8*X9iTJ)uwqxL1&bxFRd-3FQy7hK8?DzXS?7 zl$xx|W-nBlQ>gy6nD?f$Q?0fBz=?FAQTdkW3sfGX_&$7?Q6C_=aNGy9Wn5gfd7nVw zvEP8g1Zo15xuxwUZw2~PZA|zBMDsM1ah6N499-mMsxjmopy389-HPiIts%(^Y1%$R zw?%)!s?_BU5T%$MDzyixife4z2el0xHE-N_<62?+=f>im7g@Ig^5;azBbH4^nO|co z1GF1R#`QX@BCr8fux@Q|0XGr_jF`&xvmsy3B4%@)l*u{Mou#K8L0Hun=qs`Pwiq z5&m`gjJ|-jt!?f|;Wulj^ZJU#JQC%i{MO$IkQ(irSYMXSMc*S&IBh;`YfV&bA-?@& z+%SNWm~RQGiOvhar@p@zohpyL82wRS|F*F+Nr?c7yZA1oIWS>QhG5)?=9d``_>@h| z%^jpNF$<&qhR@W>_+<**w4vrBnU8AU_)(D#zdD)U&xfS(QNYV14oSf0Ic(4P9L{Tq zH2?uZsEvUYLr^In@#X}OKMG*`rl0=03-#SJ0nU19jNkLq2;OAH-VT^QbmGoqSkPNk z@E$+G!60ap%3lQoLY&BhyY^c+mFfcs*|qP$D9l~O^Wx$a?U6DB9lWN+ygNIrTrqkk)&^{$0Nag>;cK$7y`d0p|*@I|eK~Ans1e zC7=kW`WKzFb`Ggh1gh3^jeg7r z)S-x6Cj9h*B4EUhYh{F`U4bXL-+LLBA0(0KGx@hCYx@TRs9530PvTFC_7m7(*$@fs z##iZ2pFSaah!3jC6K{aD*n9dob$THu0DgLpAf|p@1U3;i7MOnRTq)UTt8NQb)11lT z_bGAihNxKAxiq-`ya5e@gD%&XY=hFtXk_|*PmoRyIPyvDhc&R}x430|R%KN&L$gO3!?Yuy6&1 
zgM)928i=eFHX{?Msi|j$p3RWQxd4^#moEiYt(6G{{4H#oBz~yeZB%;4C&Z;@13QZcSaw;~qaCPPqKukxe!~&_~C=CS{)O>gDUltet*uPuTR(b4r6CFZO7|f*OS!d8&AXk8~ z+u6Rlbb;9x&NYX18`#A}DT(Y-c9UpgFvyg$g9fP=#X$%DqX~jgval)!CM7T|D$V6I zZF}O+0qGUM7)%ww$%4xBY{jt?IyyS=9nBH~0Y;Fmv}A|h`|gRQaP9>52l(#GXdKuo ztOuB->g^T(ps+C(eY+;E8;Q$7lDHWlyaQDnyxpT0<-HGoXnNydEe;9sK;^jm__3L` zS%-SS?}VQ!wEHqbKh6RGzMjwTqod75q6q?&*ROYon=d(lJ|Ya8@}6|+1As0ImQJ3m z%((u0JX`pTZO;j+PxnlK72~IXu;Z3{mM<SSmif}>*NE(;J^ zezU6I1nUFesx`qzA_l;9@D7cy%QXytudh}`kn0rJ#nk*1?%xlY`;8W3f!`L0&$O5o zGSJhfC0)zQ%j=%%D8KlLl^B~0zk`Y~`9VbYZ{dD6upYp-V02*UiN7A`>t_zIKt*g* zOG^-f0UQs#NOTs&4(DlVxTK|}$rcd)w%aKZ@%;ojQ1k+H36#|U`I4chUp@@;DjsL2 zBYEA;?9OYf6yr&sG2m@PMMq~yya5>84E=$>Py5CLKnB3@4o}0M1K6z>`<{C#d4L5}3WhVp0jQN$3L|WtvYRz{ zcm`}CkR3r@c!BMt0~mpFaV}2In#z10FB@PkLx2z6=cw<+aiPstmA8 zpxXjqribqpg>BJM5R_)97`A4CUJ6Bm!u2a>j>7C-q_LuYpB95|lgLdT!j_wa{!E6fe zz5YQ#brv4QYfcjY2Pcu=fJ-!Wo)#V^I(k0V7{V!w{C=3xnmCa20ie{UdnqG?H#ES1 zoz|M1o=)$O!%RC!96FuRqpP7YIr0F-&-}c2kdS@sTWQ99UEOpvH9}Pr9t#jMumAwR zh}Tusgf#>hO#tPR7gOJXLoDUPh7h-fhW>GEFu!~TCKV`s0P^4t*O(xD&2gK%pciVt zcpNz9Kw1r_^{N6aid*3=fHFr$Hab$gq|-U*^-#X|L;z6vcHMoA6Y*W(+>pkht@P1w$0N9p;1g1#J7np`3Oo1p`Y)*xH z!{b~C+|oDswlo=o7M$Dw#Ng(CG{0X8T)_n`1O>n#uV3agrvmEz^+L4_B?bnD^yA;> zHG$(U5)G7$HeFh5tyOWxGs^a<^F3QJlQc~B5;1^_D(<(ynV8xm4!~yi-`9%Z*JP0{ zDK1v^m$C72FM^pAWqiM8E2bdS6jqxue4icK(tZoFPH>MIm+HX-K#TKw_}#$#@P*6dF0El8?Ty3`L7$B5l9C4m$#;@7sTR=?huu%`&UU0A8x9S!Z*VG?28Z}K z*Vx2l-r(2?9DJ1F7HKf_?Pn>hfHd{+@;jMN^u#Jo!&N>hM!a#lVqxCkB6;Y8i#U@^ zt$EP4?!R%U8s8vR$#eq8 zkSDDPE}JrJ?89(gH!QGDpVb7B6HU#{D?icQ6ET23nq*h}CI~+A&?%g*H=Q;$l?ktx zjY`c4x;Q=SGM9C^Gt#CH>=4VJDkt)nYx0RN!X(>Y97R_|u@03gz(D#fl1C}T$j)+q zW%ybQnlui2ZpI_i*COf2%t&QTKE?WK(?^!&N=!+ih%+T;C)knV=nE?hp!&iMW8|4co^OiK)^1Ya;5oN?yH zMcf;z+ULNT(=Lv{?fS;ghh2=8((6*$I$*Z^M0Blci7fjc@9p9WmIDwxFGdcf&^NiErL8QuDT% z##($E{_O3Nz3jg7^>aD&*D9PmG-tiH%7?9pQfh^r&irVmtC(vDvO`0Bk0~q6aPX%< z_l~W^tNG;(HN@F#%B)fmM|%$TJQyb!yUWf~TNkoruYkQ7fW`IzHVP8gOLtAA zPMswX-=fd(kisiFDTz-A&ssdFWJcxd#uZ&a@P0Dk{ec|B_ewWk6T?4trM$y?g`M%u 
z*g+nALwuV9+5W=_yqaSc7ZAjqB1u;p*)-fw=~b%s-oFU**t425bwcdofOjDyS;)c$ zX#fBMXqtIn_6}^ifj16BUEfB0^`1w~VMZ2^www8t_+mZ0awR6uHdu0g`Ocx{wnUzC zH{bR&X+!FBCc-sXi_s{PG|q~+*=a9&5fIvY?$SgMP%NJDToJ=w={_}!~V=VsrPP-SDi9J?D zOUkFtq&mPvb!v4{!$#5lffa5T%f& zNwapBYgW9`7t4zj4^M3_2&zizYgC@e$=6Jpt*^8&>nB{0I(83W;65M|oo&xqmNrV?VqiAB#OjQxG^%sChU=)Y-#kM6^Okh1LihN# zWNX69r|QzhJUV3#X(ovA(m!v2t%t8ItrP-L z^LfG2v;X5^O>vceaXRcNiqywL#PxJG!reMy=cgxnFva+hmHe+?0rD8Wlc_-sDq_Ip zL(y^u|Lcrxdk_fDWQ+0_*zK2hJS#1Z+O(ZQ!l2UC-qbz*#pk6;!lZ+TV~YdtWO}W> zzCNmT`k{WzA^QE`g6z}uVdGP>T$=Qh-o#|XT>e%2a~k^NX4ja&6K6k@*650Ie@0Z~3kBFM?TAn(%StMg*0u$MU}hOYXiQ=Y)BQ$nn- zJnKDdg_ei`vq~^?vB3iMs1ENfFSKuabrdd>nFaB0+nnTI88JLn^=1HS9Yh2`t%zBO zPvxOaC)VJF1R;!ocpM^$z8MDpB!FP@@t-9xszayp-aI6C1Nr9C>3si4M9Y+oSoZ^& z2wfY&wFj(jcR%#u9~SC@K1P;S277}%0d^YyhE%|Z_+fqzb=dxjlqAQ`XRH19YHaXV z5;}eI5>r=6m6SDB#exQ98jjK>aT-E=nOc`FNglfTA47_W=W0iM1|o$p&xTYd>xoYa z;BwvWZb-YTw%m4A&I1w#uY*|el+gXG{|?j(mtMmA#ZBcX{dEoL3#!MYQ$r&H*}4ye z0vlD^U*;3m4cuIFQ&aBXVl0+j%!VU|h60Fp|3*v*#GwoAAL*vDj?VUz;+=kEHh=7S zj&YI(+;@KeLPR(v?ABm63QolXG#yeq#@+87tKQ%E{2fS==iJ*_Z#T_GUMrt827Oh2 zT7ig`Rj%rVYxhMQZr!=V1h*{Lo&gX4pIZu|x8kEtH@8*T5P4FweeR^(N~KDKN9V5@ z+w9!L!mgs#UKto$*KEeVfu6v$mg)Wk{X575!+x@m*Vsi#D7Eo=)`n`BIOEi!jf+gF zch_veesqN9J_j4eD_259DEYxfQC;{u#6WnRn6`4iXx((1UK%1wtpS3n91es6+-TD zK=PuWCtr8}hC_u%&Sis*9{)L<6)!Tp=6;)BDDAV)$}>I4Bz_erV{b((1|mG&wm@zd__3!Hbdn|abl7RGGEl2zhxu@AuELwfKw%XZLz7p|8| zY^g$j<>kN|PY(Yjucf606CBoxti6^H-@jRlzo#sDp~BV?Heojsb5m1qk@I!ez0~+= zpZ1xi7TC+Rer<%{=X_u$vkziTGy^oUI&n2pu&- zwhvh;|M&b5W2c{_0Yd{a0EY8-!l_yJQ+8?Bx>>gb$U<;ROAnhBYOT5*RsOFHK=jm9 zGA3ic{FUFIDNot{jMDK-xrrPB1*Dd6(ctS#a!IY|A``@6qVI& z_g-9FJhqiara0;A;3E8tTjcH@dJBxBI7=G%?y$AvZG3pAt{*JU{}_Re0Js(O<4L}k z%1+KKJp!SG07T+%*c3l^QNL=2=t;3iXC zG_Y5OA3*J6)`2Vzl1)h=y6^nS*{iNgNB0kMGH>o7Y`38Z6e)vogC;cM zkk|u{+gqTDN7ey=20eh89?3UphY&0Yt(b~3Ha6DK(112H;5~X4E?j3`a5)?BdleMs z4)N^ou*VJ4I3ZO4aSJ)4>RNABnEd6-nZO%9wLlN&MnbAlR8$+Ee(+_RPA$lLUNP^@ z1U0Am2M62KqeU#YhRvM%rYhY|EsLYw 
z&9iRbU96LYn#ICkz7YzP70At$7!94}@SP_n{&fgc6!%hu&Yi1`5tG{8_(MsUd9^ui=S_TYP$X+f$4eJc`GCpsv~;c_k4(m89UWJIAbSMdj5zUffU zF&uurhnc5Qs)jq$vTl8g^W05CE0R~FszEpMvDw_KqfBOf*^3Sm+{#f7eFkVhUo#sj z6{wKFPTcKJpeX9c`rZYU@bk?6l8(s5VL#~F0F9;6$qkIi&H!R4SDmch&!3UxVheD7 zveZ+8x@PqYOz*>FgK`|u+}lUDW)g#AohSZk_3sIC(y86hZt!o+Pg8Nk_iN_MB~%VL zngvILP~P3h%P$1TWMafPhmcTyeEc`lYrD%0>}E8xus=uru!3}kQ;xU4zkin2JoFU9 z3|bb?YFHO`T{Pc-J5)81lAo_$Pzi8FMbjlD&X?B>Ip!#$>Vw&HU?WQ&>fz8PSX+~Z zv@7uDazL;XRHDy2vu+NU=fp_4sG*?VnRqivx8tIDq!8Ag&c*l(@n(Rk@WYNp-rBBm zKtVtgntM)8>oMq&G&Xi=*K0`R_bEHDbY-;Q<3ap4)>q%*mtt(Y&zZoyH&A?ddkgRJ zj9fRfs^VIYa?)m(a<*BXsDmUH`C0>Dnfw9*G=f2DO#n%b546?Q)iu>?ed*3YzSv4r zONc-34Ybx`7hiGvNnni@vHo0RLv;dU6?iHHIB0WZL2A(TFoUgAtZbRS0CX*40}WbD z6d)STepjYu0+t+zi^G)MLT*GqKDoh$*jF(kV?Z?rd14Kw#R zvj|^7G*f{d^T|hzzdwH*;UeF`D^@F8f7U|^Bw%{F$pp%I-46`=8(E+&FhKRmc`hwQ zDy|g7oE56lUXT|EoMd(lBmjvFkzi#Ku)7hQkZAh*{};HdP9GB}xsEzQ!*Yg3nHs64 zj^BU#h*T#sX9+q^fq>k~>Z&JcQ8e%-9%}z{JsB!ccQ@_efFk)y^Cb``0!>A3Nb>f_ zL+C)h-HC{gAH~hOFGjQ^S)Vm?xm5|WdUs=`ARP%s;(t7eF{_OGMs5Qe3EJx^PL|C} z8kxT>?d{IpeM(J+Ex0)?3{5(CVEOniUi9_%XNKw*6uZ@iz26IyZ-J2S{R4&%rhi}g zZ3u`03xZ zQl=+DLIkTWKb_t~8UKvDaxW!cw_%@B4MrtC@n&!klkgo>?8hbDhHQg10d9^T!?vV% zH`(dv2u&L7|F~1C=Js}3hZ$8;Tg;5<(uWV>7my(HK*nvPTc7^-Zva#f!Z@SR6Ry zRd3(+^VN5%qjzOIX(Mpu%ut+)@IT=IV)x(ni~H~*)yfA?VPbI%0est(ge4mNN}f%M zxscge4z3c8mnhYz)Z8Aswr!M&e8oq zOeI-H8JgWQ(jL*1*?3JTBcCAdJQBOgyU*5nsrUSO+6dRHocJat{nz<746SjP(M$Q^ zrx&FQA54c-Mn{+gxu+5fivzcfTQ-v@dZvGX?hx)u;aRcmj|)!~t63NUk4#xC=!RZn z?HgHd&NGsxvF!eWdF9JuR5UrbBh&7cbJIQc$EnOVD* ztZWE2Ppt;TK*B3Ws@spPfBu7Xl$73DDd1XN6~#_j$L?*qwT})bWwQ)gd2TdiQoCx; zcC}5-?%9Ul8qT0RucC8R*3*9EZG~vV(9i?u`m>Hc5*U*|h%s%YnI1%;jDIx;Rt+$C z=v6!8(tKn+%cdob5@YxtT)Byw%zISqkQ#*EkN&PFz2r0(`w%T*I#BckSy)!N$GB#9LMQ=LMw5UnG<>}SdS3Vvgs47M%2BY-e2H& zlVIAO!&d%HScnkJFlu>>vN)6PWQag;U|B|?ZzIdikA4g!Q9eT+WM|H(BO$%megC@6 z^fNjNx2vAKfU7dw)rOf6qHm(JBV}Bi^>d0MVn+&)xTe3lwDDqdq@^c=TAtf$3etM| zedI-u=(;(;{e6C$v?{wS=BbfGK~!!N*eFfcd(0xf?eE&LuyQLnWpI7QvlO0dV%yt< 
z#S``%GafO2%7z_3`m*d92lgGM=N8S@;Xrn#p;Nk2q?T8M{Y{iz7C7xqg4+{dg~cq3 zpT6t6zvwJ)@(T{~a3Z5qsLjah>V9aS)^Zm4t6cx~Q1qz5A=looSFAzb-NAS9jLvM0 zRrYsUob@kK+_}*myWy%XV*AbYY|>n72<^~sp66{0`0L*a7YHMLvX`farv_=*80mQW z(PnYJT$J}VB=(y4jRF>W-NtI!u!8;APpi1<$F0+ht2A5NF+!}{oSqrJzC*vOn$8M3 z%$zh*LY+sq_6ntI_zQQ>IT|oOAANEime;59xu3v^Zx&QU=@PFf#<@OQb?Y5j7QDNS zU&)G=mhowA5xOb!5GUf&V?FnAeoM;Zqwj~Gk=4`1d0`%s=|>C@gsjG>M$QY}L#F5&{FN&XKc-$5 zbtt)IRzGz4I_isQjYRtkj|YTVL8RTLfx&>QlwOYI=oLKXeYs0(cx}qurx0h$SB=Y` z(MB7$_?2|>pH#aEForydo1Sz*i+s|rG~SYBU3)m@KKs}$xy`ZW-tF#oVd+l=iVi39Ix3se}T-o$3E2(7j{X} zWOEU?=V~=QLvcm>rs1!Pc}9x@XUw%4ddx-b;EfS`oUE3f}r`1mrqo6WESv^7EBwgB5Q`_1np?QW?x|18C zf*9Pngr%`&4EN$Cx{#KeQbVH6^L1U4MzOz&zMAiRN79w%_Vo}@lkewUfxM-Y|FCxS z6n~NRgFGD+a>7l2-Xojv#J)<8Fx^-#S!CvrrrZ8GJmIy=kYwoATVHqCVw)7Eym`yu zVg|a8J2iPiCRU&L^e?ZP(D_xQpzpTck#!}{6e?xKF80(u&-zpLq90gBRPCH=w|JPX zWW$&i8q1QPt%mGU-MG3S=E~#x#aI`)Ukfs<>RLh0d)*l;4=ZiMb*C*A3*Cn1THtF^a#R%`Eg^wI`mqbqlBRj3vz7>ru-)t?C-ZBtC zQm$5IJS4qW$haE2fp15i6gO@b8g5QLn}V)dGu7KzY_JaZyS3JOq2~}?e}WM=T}UYn zc23e2QQ2p^ zs)d;K_TQ0q#^_lcj&mb%9rNAoziOX-sOZ&1y-g0r*ggFj{kr{=hoWoc`tB*UrOEOX z`4>;|$(bK0tDLjuXSGG$uY6zqDVb*&%Xx0Z=vwCNtWfaU2BBwlgkiROt^3-hVf%+q zGAiGWm1d5+VGq!S_?BC*yR;AA;Lp4m?fE+i*Ga?=Ijf?6IFKBl_QhtpJoj#?h#jbM znUbRmIsBJ&RIIM$uRl2@JhJ}2lY1qJ6G633jS~TZ&EMxrBROJ zGaaZ8(b4s56&RvW1I0_R(KgMqgA1&b-4-v--R$-_!K!<0dFPlRP5HW}>)kAw828cW zy)v9;hU1(dl1}tqlWUc>i=K1r$mUhIx?*k=s!FeTqr~@z(P$gy7D?dXImJomY<2!C zrO$h~8!&h$mkBI8V|$d%=j~9WlyFH%F#(hZIO4Zb3>i%&~kR%XMSK;~jx% zJAGQS&k35tXpUN}lyW9c2knfXqzg$9XGXK^))0*-N0`RW&5CVyt%dBlJ9gQIq|$Yb zus+Ltm|mL}BN*JOL&);AK)3PphMv=R!&LY-_Lh!XoeKHPUM1r^Z53hG2yUlhWJq%N zwJ6rT+>sb^bo4=D^qVz0I-Sj$3A`d|vUpqY85?6riCbChuvH@Yo1IXLDw-3>%TBe} zbM0b?L1|g>6z5i&mBkrmj^nEtXzYsbplGA-+c_)Tex!^3XiMVHzZChE{nh~~^LGr% zzgu8Fv<+>izoOO_RyV3)#P8G*DSVt5{e2r9TiZ?TLhwcSb@)aC=G_ySL9$bmD(o_@ z4UL)&il1+|e;q5ddh?c6ws3VyP~1hf(OOSC>vYJBfTfiEt0+rrjbC1W26yZ(kLFJQ z92S5ZwNTQYAM_7W$}(psT{Njv*;GvFdN-vm5ymRC67F`S?_>zOtdj*@$osHkGLjdN 
z2Yp6*XDmiQvNOjdVAFQJkR2F6PA_|mmVM13+8%Qv3NR0;M+9bYsGJIq0TLXKCC7>hdr6+2@>DcH-Wl|8R#CD@qp zGz8)LsU`EQT$cFT>p0kz=LLj@NH_W1)&qD>;)IAFchPUYF7&Ubc<@CHWAQGh$Y#xE zcU8FH4fuxL4F& ziiU~nX%t-nuILE&n#Az=j`eysBnq`Fx|v9A!h+{@$5+Yh_0LJ`-yKqz+bAg3j^MAS z?91~*AOBRP_r(yu%%0MqsQDb)t|xntsjXRC#{GGJkew+p#FHm}v2GQD*ZLC~R^8}U zth5}7`xS3EEEFr7i!HBs9Opf9Z7rci(Rj5$kWqj%3Y>+no79-Ms}a@V_14l#_tbkz?~ zGi)Z(uZRwbti()37qLry#gBy-+9z+X7FC_oW@Yr73EMI@BC@?ccZ_K>4UnP z8H=Y|^9S!Lr}Bz2g;8B3a?g!tIG50Pn@-%;m>8vVE~onG#-4lo<+QV>B+sj%JU;IJ zE;)=mxv9Fx%sVW1iu?t>e@k;b)ts5a(b-+rK~y`I;%VRUxhqG2saNAOCBKAJEz=^& zOT9ru#?q~GUa2M2@+E-V_0v{UhxyNjk0iWu zWHsrx6DiVX;YpiVks%&yQYo8S+8`t>DZ9S>p$Bs_Tf%U{eYNU^^)AM=y9 zXS0xxKie9v;$SV`>?YUX;yHY7@S1?==he~RKIJ{4D6jtmYkzC=)y;L}N!cNZY=2@t zdHRJ5M?;cK%PIGb`T+6F3C^zI4EC*PWy4TbfzsvW+ru)&I?S_;5vb*L{hR?)gUtJuI)|Bl26h?!_CnI_h)4gr91Q35ELNL24M$ zt9Hb7|FvAHD=I;thF)q>XdG$s!`YU%(<*13vKo+7!g|N^`1NF;OsHd&@28~DcAFbB zEzSIJ@sH_O7!+NSaDgyt#Hi3g?R#shKn@pDO)$MSOHi3KokC<=c(C^!s9k!bpSU9m zeGPUVy4y`3D0c;JAO(VPHuFo^6h55?8=a#yHBVb}f>IYM3rm{IIbfyRuQr}>T=AVL zx3a+h@r}BIVfPDCWq&;t+xE1R3Qy!BFcFwmwgU*qMyhgl>X0XGYhFKX>n|H!dLV3; zwN5+SnVdI3_Lqnyt;B9xw?M7OP|Jbx4Np!W#`(wBIksaj{xsl4h`8Qrs7e5drK6nT z6-`7-PrTeg?;6OCb5@$y=#c9lzNSl_WbyLia$K1?BnXT4JbnW@osiA-^)3_1y_Z=X zw2s%h)m1dRPSkYvk?#qKn1g@k!w^ekJP`R919>R&f4$z-y}e%sRbuk-Dqh5Lz_)Qa zFnXUPwu5tn_+~C6!f923+LLVew}AEW}o%jAjN3ttBScyGP=_k4~p;Y^)%!bl8x zJYHf|k$WvQ-k=Z{`1Wmj)%|}Gh@2NCBwQQcadK7fh>U;4u4QwnvPil*ObE5VZ3OB| zh9d{ut>(wW%f7xXQ=R)M-$O5&Lb-3=xZx2NDm1&vxR$NxNpf0r#f{%AVx%W54iaZm z;<9^l|G7az8tAyg%SFBqTdp?{UDuZ~v2t!pxwy65&>gau=P+B;VFhsX7H*YT{o!-H znYjHggXn14A&!zO2XJ@cGSw?#+J-v+ zlPK(`79INfxb7{mOUUN}UkHyICOP4I%P%SyTJ(bkdRy^`tGT@VOObQ`CKpG^j2^br zjF4NlLy-;u_#Fh5LO*`&Pf}v!Z?J?JVr={+{NF$S?*GfQ`{@uQO4EN07MiChQRsGB zSvLMnJwhBhEph@LgX+?P!rxExKm5N?251xUVa2Tp&lK?b+kG{=9~i>^-;65&M+F9q z%(EW#pstr6`&~`+@&!S#*}s$8*VT2Q<%yO3C|ll}rO(+VccS)Z3Hda+MQpC!CKLIj zM!h066p#?&(VY9gR(>8LTmTx2$?8AHjc6qqFjIHm4yEs{^`Gs9XKBi!H^&!FHK>pQPPaoZ{F9*X_;5p~)|*!5A~LRZ+r9OzBk)%+ 
zm78&B#YxEP{!V<51l&H0g)4~I~vmT?NWRgo#_11LTV!7-3f(yByBq*-Iaorpe z^2ilv87OQ$vW@kwt`Kwm^vqk{jePZ$40RMLNs0V-UN(say{bUQ1Hy>!t=>J-QlOu; zxrjhP!9(2VALAc3sM%P@{g^I4@`-CHzvbQlYh5REV8z z<&`hFzy{gg-g5HdMR>`Oi6uYG3frQDGoUqEoV!@L_T6d~nNL}^6sa_qsthr4Qwg5I z7-baH+($g3tLVw6Q5gvrvabMt((Kpgx>G7rxpq^{DK`Qzdoi?bqh`2a!6Jp9j>6`K zsTgT76X3lFZgu8vC^htzx*$6tnF6wS7$~&4_JPV9lu<%ybi92X=DMGQP85{n$aTey z#pJ+IE452v`oAvc-T(83Q9s_X_;6dVASLbMU^)JWU+7%JF~JHOkPT~>Ch9{iVNGD) zj>HRL@{RyFAC5=k-(Qmz)+NzawzkVJb|`5VWK*Bq7P|yoILy}CEYLonGa>+0bI{`C zad>&foAky*aS)`8?3ADc?+Z<>Oo+JQq@*NQ#@BaHAbSaQT@-5O?`g>~HWMf8YzC}2 zsL&F!|1KnalGM6Ez8J7%|5u8d{D1ytz`_sz59{NS9i@FrEu%{wYFyA12?sfJs2T_yezAa`Tdwrk+>ARBelrQt{nI z&i4gh3gEaV-L@08IL%aRTqcp!-;F{|jPRy@7yr`<60i~H{nk|J`8`XfTq>no|M|Gz z^>NX)tLL9LjTW!*vl~{OzilBJhqnuk#a=6bsB(>YFPr0R&Sb`7*Y{9*pB^a zpnIf$h|=?4I7MMNfBA-Uuh0wA-fW-I8|L^u^&|Vbe_%nJ&Efw|KAnmcH+W2{mzsSP zNGiWF9ID(xg}36mPkiH)VCXj#z|#u}xuc zvN&t9$O&>AF&@&Qo6|z}FUWVj%U~f|-oDuodw{OgeC3p_reqTS(D_^czHoyUpNI&7 z+s(`)>2EsCoFlkm%eB)nJVQC29wxKzzX;tsL7ePnBY(p{rdY|R7x?=hw@MHsGdzG& zjZAJ%vDjQH>B`w-eR;@_iM((j@-Rt|^FAUFF%L);vr@T3uV-dv0Q?JeYTy{|SA(N| zOJtd3j2nb+$7lmU5@64ON&yUzy=6}@nXC%Ozj4ZcSL)xKWca20J@TMsGSPHp2|0he zz5lP)IvEq5vH(;xuv>2H3jpZ-GEx|SN1%Km4{!@N6%`FYckv&WM#UjvLhV$J0r_$W z3*elbs<5$7!_X=g5CVYvd+5)=zm~mkqvK_V^T;Zg7TX)kzzJ5DHQcw$H>jBvdn+!g z>hi1RDpZ%yEJ&;B7hpiZQz@M9EC6kU+ND;(#r5PE8R0=o77o|W@%pNQ0%#AM!0xe; z;9FqTHOKCaUl?v5D#9=?UaHxYw3{Sy8A&i;K9c%>wsYT&T!KIcvs6P#ZUjE!xc_<` z5(xwE02;B_pO5un_jmyQ@k3v|fQ}HpDTR&8l*-U?-cSlk^=H+M+IQzXp-3+2yfg-? 
zg1gXUc-DZCJd*38QO~Fwg4v-Cza%>9ZolZ*XY5H>u08kWxwu4n4`KN6RaqCT-~C|U z#!wX*@hA1iD=*3W0t^*!T=eWu$oPYDE&2pI_$Sov!`MQhjgTf8PN54_k+9vfg}e=i zc$D+s0ILarJ-ywf3hUZ?<;lZ6x-`_5Qqf!8{y9>K*^Y*Wu_IF)rvZEbhZsxH3Joyv z=FV8o%PT_nFF-uvSq#saqhP=G*$(jfUgUW49u4D9SiHGR(jw*fUtV1Gk%fR)dO0I=mCp?pF8)~?^J^2|pt z6_6{P8z#C2t!=)XxUju1*MhBzAojeYH~QiZAR(9n>nA22$L)y$gV?7QAbAi%I9#Ne z0vYOkWZ32>t{P1+=i}d?(G?#^nYAqwnm%x&D+zsk$oHe971rg4ngN1WInl_=~B0L*M&{a4R4V0EBXt3y^c z6NId(Zr<6V`qrkZMr5w{PmI;c5FpM2yhHwlIG$B&x4W)o*6TG$$iGI-2=AoeVs(4gm5D^IM>2hJxFj zGzQGyXp&1zie|uQznNGc2xt;9&!^aYt_v4VUOObyGcd+5Q27!jt4!M28ek@I6?h+k zcMia5zgsc>=41l>%@~P7#YqUWT^V0?Bsi&Q1S?d{^BJcO$96y2e@iMbPj0ElowzE8J5KXuR1qFe>Co5YGpxB11 zu0&i`O9{VT(L#4xSC@+HFl?d>j*3x&ptsv>QUpw8&^W(=x%6u)-N3ev9wJvJ&;tS2 zRLBl(bq@K$`w3)|c7;%Qb90kv;hpLWXg5D6=6W~^P%Qu>YG89)03#;7mk+Uo*fBs# z0{U^z7NfAU&N?_}5aJ|q+6sU&op2w}kZ^>Z9H=HVgCs0LAl(jNNrrZIE7sLJ7m##? zNQuaSzI4_E^!)*z-RGoi)^bKrntoXGQqiA5IaW$MRZ<;L?WG?ffQFMm?YcV{AYXc; zHfbY@hr`)LEJt8GAYW^5X9v!h5VyCDl~TFY3j!#(l0QL$RU7iC8|P`Ou`kMX3ywBO zbC7ihJo9BfZatd@mE&rs#R0OeGX5>^Ro69B!k*uTCyn!(5pCi8i5vaz2CJk=jDj{_ zq$NG-1I`*@2tCE49&UB?+A{{w-KnE(Z)bEb@XQr?lxvjr1{UMJ)iCOM|A(S0BfW1C zBfjyYFnU645TrK9O)<@S(pSZq%@q!$o`hy;08-NLxiXiP( z?L1uXZoDMW-qz-UjX|$vr|Lr&C=I#VNwlHclYuWw$C*%HU@xMOxu;*@YM6M585+rn zlGPT-ve8aOpid4cAUYQT%e96$le$qMulNM1)&S+3GCLcehIGhiIak6QfRe`D+koN` zZ^tZZrGYl($&u;na`zsYni0&l&iT+6dy3rR9zNfkK@crOFmvw%J!F<~oXeolQhlK{ zyIJRIzF}nFh1uHBtfJt}`mJxcC7);kx3po$5ND1^U7lJsQ%T$eUUVQZB1pdG&Ryw(keykjx)L>F*ofw9wX zQ~MGP0}kn^%N;y8?@aFa7;hG3)D>IFrd4*LWn%Ldt$(F$k4DdG2-_XT$2tQAmc#u? 
zU0{L%JRe35hPowN=|i}+QNn%slk&yAt7ME;Dp}UMJyzP>+H{~AWY!LL`jJwLr7*OUlItnBMA3a8AlT(;X=t~*>l}G7+6Z`DffD;j{Thx zLwx%H8-RGabVqAZ_{Q-#_?O0uaxM57A^fkp(%rO=orfbwW0+G^Sj3&3M|T&Pk;tH7 z_IVs`_nlh)mwKQ-bjv%My;kQ?U_5I}ln zjFn*fkU^?4*m@FV(2g~9`&HY9>goHos6NPmiEyZ0&Mm${eEL$Jd}*(K;gHPnwx-Cx zqT*}n%@^{*PO+o4W(}h+ye^gX+?`&U>HaS3c?{!dm4Y`pRFtXoTjT>(_#P>Ae z#I;>Plbef-Jg(4HJXvlOL|+l{BTuuKCxnOpPjTNF*3|O#i+a=}3K$U(5Y&jGbOoe@ zA`psn1?d5lUII#Q0Th)W3L+p%kzSPErAB)1C3Hk;fCMQaKmxoI&i_2mdq3Ux-uvOs z2Og5lp1t?Xnl&@OwbpN~1ixc_p{oCsR>Q)vHHU6ZLk0NFa3-%Dh$+h2o`*o9eSSv0 zmpx34L#vo>jq}n^>1Aos5P2LX&-*fzZcqOiGE~Jd!DzCsMoEP+5I_J>HmgwN&jG(N{|1V&lwU1A5jD${ zSfu4)>1^Uk@bjXF7V+<} zbB4y}r^*TAZ^N}2!dQ)i=&dEP7aP&j@vpBXh#XFnX6G%gD+i;Z#VHnm9dW?Gs^#38 znA}2U4yP%}?5e1Y73hhKzWa%Pd(ayuu&1~|+2D;WZQ*E;YMrKJOqi;hLm+ z_w0Os_q$a)=>%XURhQqQyl#U|B?Qa#rFRZ(H@y=*OH@`8CvFaAE%oa&TCm)-57>6~ zSBw*#!^d(k@q?Q5_PZ^Mvr2x^ehQ>FiYdz?_DvfBGDD_oIcbMNcX*88w1*pSwMJnn z!Z2nW!@1`-?X2eN=lVVUuSA<}xb`tx(%Fs`UBKd_Wz_r<)L3@u&qXh_nt24QI=oz^ z%17YW=EHMM31>5f6@im^OxWYx@jNv_APvu!O^~t6cYMZ+M&5VSCtid^sLK*6pA?2hZVi4g({4$me>PG1_vBqX3OF64OG?QowF;~DX1ia+gtvFf?u zYJll;CP)PLfnon%bM$p7hw=wM;YGuAg`~g3E=GJd2QpjJ^5Jv*N34q;L;w5aq;e@4 zdE{F1*VwG_X2Lg4dop@5mUy}at+Hhfss{V?R<81#ZI!MyYwSpmJSd`{tWl)55!-IL zWzYNkQ;jm_OdGWAi7Drs#d^L>b@IEXZR2={KU5&AcvQM{tFrZv(G45o)gZO&UBu{! 
z8%&7BM&&4u{5tckJNvx$YJ)q)Z-gbroJmMRZV_V(%7X!oR++(9mD;s;H|3j|ANrfd z`tFx8NRMI6k`|_~{ZiI`I_ss{d%9dM5gsb9chJ97Sjh@rW;hW)V{im_mgPPcZ|#=piHr0yZc^%83a z$gC9VlA^8My)4I>=z*m_t+(kA-DtR-nz~;S^B5D3URe}H1>DGH50lBLIHzO$Cp|)< z=*^o=ug()l*kcQRoq{v*6{61&3?=)xj;=iPk=r^gWp|0f%T6P1p|ZFkmgGipdBPog^t zNb-u%fKdci1AvmB;h8hzZf40tJ4QFC&3g45s`?tvUoP4s-I!v^&#U>-@9Z>lVXZ&k zn@?+Y4Xe#4G1@j{&~dT)%}F<;skLG8)PC>?8qLLRJ-Bg)Hc75+lp~JMA3;nMS|+Qb zr(Ef3h5i4I039+K9`@-&sXIqwpKXr0NT98hItMX!H!50Mj-yh4^U$9xZ|A`|={OJc*wP|j-yiebjyk%ynjYYPSYJnQfUoM1;-__*Ly z!*TwSVg-<7*Jz8CQiMi39`=yFdGeSF+6=s0dkcq)x-}-9c2gTK6P|9E5YxOYehaMn z1B%%El%XI**RYIv{5PKj_ay@Vg<^#B*wea%GSbic-0Dd`GWH(Y(MVfu&0*jP~h?RUlv9g_k(G=yr_f17dKOGIC`fc z=sMAiFzESeifA)jdF{=Bx_Gd<*fD)?8k#G;R9}GpL#LZCYcI^1XsOMkCPOag%1(mM zZKzLvcPAeC;_ki3kz3`w0?W%qw-)Z|U^y^WZVABeBSUMN7ut_+=nJq*CM_`2$F8$^ zrke_TwcypNYWK?fM!uCshN`5S?T4%$TF&R4wR#yIuaSrngr@8{On*(smk*yiER?=M zEPK*~%VrT2F2kMX?42{B%MPN8J-+xsYk~Y=%e-bZ5b>so$uUuIfQ-9I`*naw0_vm` zu`Vin^KDdTrokA$8H_-%opptAE=ybywZ z-{I<-((q>M`C!*Q!_S^VPJ2Yg$KxPyUnnQEbjmotMGLpl94j9YrjT0Zm-Z#S zNXQqPf}P%ddYBmdST)m<5fU-Lt4t2Q`L=|0e}A|_j$e^5@xy4OdT75Z&656cmygH3 zHA>d`RmR8hm~E7#$4+$#6q0GTav_5_DBbv+(y_P~%)wZQgV_)0>*D4u3-8t=ld8)~1POnb1(Am&qEY!yn_j_Ksat9_Z9xod( z1+;jY6L{lk6lrv9jI%>F%Z;B|8M*f4i!d;OPJvVUJ~k~k55X_GqSv~_GCfiu?GH}< zAaJ6jzZCTRo+!X20d0hGlmGgl>xt;R^qDhD5?2>>3Bc@&FtK-)9^F|?299H&P&NUK z-h`pvAJr*28c(zwX?04U69$XEv?lP{6bUlh=@qTV{=w5WB+0>zwov;DB+-HJHr(3O zABvfA41F$bkImP89eBJF1HP70W!5e_yxwr-OTOXrtI9ku#IaEbp;u~aY71;hTP(h)iSE)G)vBU) z(tJ!$zM+o)YAFE~gse3&oJI3-#nr-;&*XxvOMQQ%y?nIVJyv^?0?{_mE@_)5LA!4$bxXw<9OJ!u+e;_9^WB zwHh5`9RC={>~li49D23Cov!8({e`awGjq;5p;{Ygt+m0)+#m(s$5v~_PAi*JV-3!W z9BOUV2Df-lV>u_s#tlQ!dqj?om$XJdcA~r$iUqF6aQII>+b?3TS$t9J9FQoz?C3KF zNbpm9lGGSt!kD6xm$xTAM#;H`rDw+13iQ{vRmB5$Ee*Jp zg!;n&?FwGMPlTy?Sg>W!pSwp!bI9IosDF~Tce3D2a9-^`Z)`?=>)w65NT8|Bi z?n9nQB^1hUJnZsFq3D#b?;PmCr($`OoiEGp|2|`I%Guc&BH<^bfs^O!NDlf2?XU85 zrk;FV+aKwK-9n(PDtt+VY z-M(7@MCewOAGw~zTC~h>bXn4LiFztGx_>_qP1zW59yOFu#Xh|q#vy%xn`naegWQJc 
z4tMIOr_JE^^2SS09rWoAg;%pJhC^B{!lg6Zt^wTZvY(&dpuM}js*#)JyZ%xUu9H~W zGN-Y;s8?pR4JoDp9=u%OszCU8TkkhN=%bf%kx5_q`)kO&OA z(PMWdv0En9O@jGJU(4YXhWi`9T&VC;B?t>JzZr`m7iTC@emu_xce*5;*$wl&7XWZu zU<=PDDp61V)R0w>-K)%b^iI~A8KeJrwwr|N9&r(Ig7S5|R5%%MTYU1vvN1E?GyfT= z!(E(6?>AF5#2zi6{2Ay<-mo>FfZ4Xksz}U^L+2L*8(`_SICH(|<1ckhG{VT^r7Wi2bsv zG2-lNwmBG8ay>Zgt;#Qp7iVRSZ5>z#eO2B1;ICQR*xh%<~g|GnizXOt-DVvw;pv^CR}IWAN2))K}ncZ4Hx z1OWL!)!hEa1?MNVB3i}E>qkrs_DV=wevwzos_a9r%oLseOK5kv!|vtIM+=gluB^t4Nr255b>ZUdpEGQgw( zY~;5Qz+eU-s|VoZ!v-WA=#+gBbPB{P`uj0=8w(3zU_}Le2r?cXfwQ1)yUtyQUI{?c z44h+3O{ajv3=m|XnzBAojdnoLB_xPz)6>n(N;ns4+ccl*L9Xtj|9WjdyL0}teDyB< z>-D1Zuh&b)zwR^Y{}RZ5zVIL451|`W{c8-!r9t0D`!$< zBEF@zyo3!@3bMHcK6eIEKO2?W-+%&Z1b~^7^72e7s0(VD?tgof%N!hvT(YueIQc-y z3b6Wu+`33ds=7&a;|Hz0r?Q=W2S7BS0C^v{Ae^%+1x7vT;0OW-*s&rPFRvQ)TB^+- zEe?b6Qm5ek>&jGw8&T;7i2$sY)Q*5}vU<3?KR&v%5|vM6NJ>fy4)ol0Y9~CfxBpJq zJO?^x53rjg*%Jr^pn$r&x};qfz%ocv8WSJS2c4RlI?@fjtzh7?m;(X=AfvUYD$sSM zfX2QQ_>P9CB+8(osDOD=ghPhgF&efvAb+C=*lq(xG2;6A!^%%eS-?g`n$Tf80PLl= zjWlOV?1y+DtU~5AjHW>E42op~^KrTZO)t$vwfDg2=q^|SRayGQzYQ(7wzdMAhUpq# zs>fxsKbN>l=}Lk$EASg z1C5?Ks?cap0djd>XklR?u)&3eh5bbxdfQ%b=lkBrjva%31OzN`8`Ozu`{U?K&}Yz( zfEAMr4tf862i8}{2#{&>EgEEf@1F-h(702v{q`yl(q83;OW7SA8hU&*bpj=+&O4z= zAlJF<(ddGuf<~Jgh^aY*DmsmTv#G95j%qZa`A=PQ8yjxu_V)HaYJ8BI{LtX9Xw@4~ z@aq5lyH~!13T0uHv?Ixn+ztSYh&89HZ@V$(VK1!ZQ!**K0{@72b zZPBT&0I&uGo_Y`Zi|=KI@;|=4Y{$TSDEJ$sWQ6`+Tg#_zrD*OW{*R|OxQfPH6}%j^ z{EVXN#ArZab`|Qfxu9WeY2tt}C_Ujv6MtG4-rIE( ziQfjj#1>_(%0lx~e)5M)hBiEJnoqBkuXcNPt)iSr(j0JdRbIOW zF)ntBMs5wZXSRncKXlrxy}|z6*X8Nzi;iZ7`3O2ly04+qXrx))sCdzn^=z%>wS&lf zfqO;du|9?zNky)h>9-5v3XBvgq}HdZ8A}6wKaA%%Jm*L$LgCrR>Ha@6!BwJs>aeeA_t++l&@b5bjOq) z{#GwVqwNPX)Btl*h%g6_?cJr79932Rb~No>y`Tle&{zaVBpiM`<)a$yP;5!6H4Lz} z`XM(jrLKgaOT1i~&II*q!B3*m>SHck%Wcf_VjLHvp9{z-7!y2v{I$gb&4 zoZ0Tj%83OnCocEQ`99KM9JRHO;VZz#dN#3C@vB4Ewl0 zXnv5TueB@ zB&VYA@kvx})$6JfeqGn>g^H_+zL6$TML&U;Gm#QIuxHHg8g7_=JybOAj8TEIj>1E^ z?z$fpL5Rdb%;_9AQ@pVeMtFCtjZ2E< 
z%C`+TtJ4%Qy2)6S-O%Ktl(_m>vOf6rM_4G#XE&p4t0|UwNXLI5e?lCQ#l=s#2}eqQ zj*f|m@pN-uosb)AvXcMEW=R(PIDH#?Cn=2wLj0^Dq1bGCYn(mFQo*WLGoR1lJcOv`a86P_syYS8f?MHs zHo1uGjU2cIfv}c-ds~n^XN&gvD5oF(bpK-gRphtC969*BWcROU%kCq+0o>Fr)&2da z)u(7g%bX|U8=xDM)2Tj2G2U&JHorpYp)OCz$(0@hpU%&OUz1 z=JH8k3iDwnWTHqvX&@1iivqe+oiFG%8A8!g(rYy@!mQ{Y$0Zp?q=Fm8ARzCIPG$lpW>WI_k=+P37zz9Q<}2gE$Wv-Ps6IYCgY};4KFpf1{Q(xyCUuueU~UdaZSw%rs z-uk?Ly}JxZ%vI#6%H@svKn;V1P|T| zw#M2zCu6$xD#Dh%WrN0l-wR?viK*UrWMFU$ddv)7vtUG$Up=vSP`$vUAAQj}&xgBp zvKHCG&D~6pHg?K*GxohJY^q(=q;zM9Tg@irRzls+mqeevHG9Z|OR{{94tvqYP#3Zf zVM;5Q-i#OeebJ17&Fe2;yxgZwoEaMR`u^yGvt@jRE<=-k+rdE@lxdp}=(qdQAnZn4 zr~F`0{mZ4w+|{e6(w&gE29Pt`xUHbqHx4(IDGyVN1%^MD`aWCjJKN%+q~X;mcLUqw zXneOH6Ev%HD6O{`%#+3p^YPYtg|T44Kq8-(Sy`Ii%5fZEZmz@@+?Ihp@-nX*m>eH5 z0_uOnb0e{+SQ{>F*HI+tDMx(-M(SQQL)B*^(^8+f71)he2vydU;QRfDq9zB%k|{sn z$E1i|5ZXO7FFbsog5AV!>dac@P@Sy1CGZO5rRPx39mde>xhCZm)CieU+8K9nOD_=g>f&l|#x84xbzJo;?Ab zOT1EP0&b4K@}TR6fgjfqg>{DOr6|Ee|B()lTXV`*J>)o%Z`Z zpV4+fmXqk#*R2qy3_6zl1H(A^Tn^uAp~Jl?72p4Y}ZZez8}7-+BJKu`uz_?KdTts(_6bwJ2M=BQ1#U#q10oTL8O$ zFG~>fND^3^0#U7vPt3I@BZU)uY;XDsx#$U)b!DgJw##yO>vsfX(S9sRPL;}1 zCJ3@_N$L1W7Qx|4D)*BuICTi+%{g_pdift{h3kZ%bN)`iWG>f1+#V?O_Ok|!8&rjd znEbj8Wr4PIPT6@`WYrvYibv_J;Ao9TqRGpZA346MYItldBw~Rf-71?*r)*K}clCLI z2K&8YHJiI|E{oX(BYKY~SH%6^ZQt)2G2FI3+~^)%%VC17$IEmomk-X=2>`W&FXFe^ z(yK-JV{?u*VA&kfj1t328%}qQxtkuIi5(9<*gPnbhdg}G%4wT#JW&=Dj8o}=B6`PIpq8QO4OtdP!JfPld6Q~46VJzeB5GpI(MECWfF8jAw3&LMCAsw!ZnDK z6H-GOWa{Er#;>qwrc*9TuRm8N7kJ_IR6m|NW3V-^q7Q+rd|iY0=z@egDVAiDz&$I@BDG0#=1U!ha2LVarX zord83zyd*$)AkP!yANivFAVfhAtv2+tv<$m7|AS(7rK1gta|Um6s%+&KGXc6Mal2L zVXwk9^O66aADpE5LYU#w6Vp#nN;S-a`xs?uM{Xux$R^#I*wK6*Np=ht}FXBle z_8-3Xyz+UAp6kAR{!;C=`N0ZTTu65PK8No+%029k%<$>}NOs*j_*ncGYhVVU5i7^j zV0J0eW689-#5d0@ls2F^D}$nELM{Jn2BcWhmcsMB@a4DdA&`e!yZG3MCdp$Dxp{cj zCSn(PZKZDUG?)vuw0po|pX8hHy3Wdg|1DGy|2gemnQH!*D}TjE_%t)0m zcBxOIkM%8Q<~Y@bYEvUUWk38Qy{JWa7c117s?gSz&|WvR__|lI&^H|Mnf(^j#RoYO z)d1zeR`jiH4F(kblzC>~$yJJ;6F~1w+toC=GM06rlJ(n`Ue%Z;oHv-UeTjeL7tL0F 
z%ko38hmeBCrt9n@SAVt@r$3hztY9fc$ZJrZss)l>uX?G6Zn1T5y#;r9C#p`l>T7#4 z_UwQjq31#TovcVzxK-|SV@(%c7usK6M!`EZ-C#xO5LxspP{cHjn390YPNv-*9UYPe zL4EhecY0JhAG|ld_~>qBnzdis2gT>#B$Th8&d%+qt#hBtU?kIZcK{8jtYZ;WH8%R~ z-9>{0EoI;75UYF3oAnd?+~K)&9)jbwD0BF7AOGIC?-M7J<&d3chaP48?>MU1P)9ps z@&BCgX>Mg*m06u>VV?65PDV(ZO!&1hdRlxjz^(-BPJnaI)z?MZ{OWq0L0$ejK%`(0AZv z2cT6ej;)zbx8uH=elP^f(FbMbC||^o!$S9A4Z_J$5mZc)2SO2;k*Cp1NtJjL>0`@1 z{+YRSbL4d+>jCm&UI9;Z*AW%hrp14NFLPCJ!BP3&;a${9|38nb)cgOh2;Be06X6a| YT)HgO;`QJLl6v(6MNN2~y!p%j0u1++O8@`> literal 263596 zcmeEuXH=70w=M!AQBj1A0)l`@Zz|HOqV(Q-5kv32Bf>@n0fi8h-jS}McMzo{^iJr~ zd++V8xWBXaKHryp?yoz>{c#v0D5%Li>)K^i{}hmVD!}yRS&|UomUvzx?f%mBDX6YC`eIGHzNs zvYkUN6z|Y^fwK_4`n<)zR3c5{(J9Op7~ z{nalAxD_x`GO}yQL`mm%g@uppEn=u)y5}qBg_#V$wO+)bRUyv0GW$ySjXy4qJ()E0 z5QjM9Jzr8Zm&VT$;^h|Om&Ak2j+vLsSRE@b8&U1N4V`*&LWqM9OyWY}i>F=R%ENuk z*q7316A<;387{tZ>8q0A_*J^^p|qs|q7I6cs!0iF(TZ~uk9wsZ9iFt^$U{QAS}r`V zcoZWxf6m@~URtgHV}okoOSXCIljksj#!3<7ryH5ymx$}4)1{t1$s@uE3H?3rOt!5} zHRyZpiOTJO_$$@~ABmMB=bN^I56J1nXsMJO=rf9+v>Dg)Xmsg1=6^iS zyW|)qeTQ6V##x@1-ppg%n&Kg+`5S*^3{g{Z-+5lk+xgEs4_4W4*h|wmv$uuF1%15i zC?3F1hx?r94gHM3R}%@*?B8`p-$q zO^8YI+$@WllO#HvK1&@nX0Ru4jIXA3^RRg=svpnlTm3YahAbTF%VNX5#sP`|qz5k_E~}ejp^Nr*C<4e&YrW?}awH zTfEToy7||`TvsnWoRWC=+(YuF?@$!E^i|qm;=nhhQ;yR84QNI$TUGu%E9X;xjpQfg!?{*|CH{5b z@wZO~bSUZ&Jsq+E=G8jrMn5&>RvN#GGLk%m=7-siG?5U|;0-cpeqBKX>%h(jFJ~z) z$0a@fZEQ)POQ0t;H=8Zu-w369zIhL~*^%9x$(#=dHbK+B zysEJ?>3-3q)PD+oLhy3GJ0Kt+sm>uq>cT-cbuGy?-zl4+jows(`cD$Lzvqenm_p)R zs1;b)SnR#jR8KWj?-L(zdjr>xmN@8@H1Va#hDYSSH}&Tq-oMg@7y1~&jK|g>s0=B6 zxtdI{a`E^vsl~9aFBM!SRf%D1Ns|jd`@#*Lu1qCe7GrnPX9>Gs<^ZgR! 
z>9-H$_b6l}9^QbYiqFx>y?^+ENA<0Zgw+R{4-zeoGweNYN1v|Ii?k%Qc>gBRuR`5N zSn{Mf%?1`2oZ`5k{)jZ;?akC53r@wj`I7s|VGE8s!d_&nPmGn$167~8xH zefFG1n}D(Lrg4V0fM!^~DaVGWQB&m?!xd`9?-cx3++VWs<3}`KJdpRfP#zbPb7T$UX=7b-$;?xxZh-cMs3M3q!Xo!q^G6JrQ7xBAt;A~R^+eU z&3ex;bJt10lkbr5kmL|@i1#e&xy)wzQ`I1t0E`Sq4ZDfFD9bmCrx9wUu$$SWQj{H< zP1fhN#Ii)a#KyzJ!_9NO#DeE5`fkbP5??FxzO~#MwL#4?wfUT^9Jaog+;vs4Oh2_# zvM|Y_)WdF7i^JmCVqG)@ErQk~3Jy*qP?GrkRXI&IO=UW=>5Aa*b~5u_S>0C_XLs;+ zuIv~pgk(lDYDE=Rh!@Fjzn^n$_u~j2z|WV?7tV+3R+V&EiS#pC>RDd0x>6EaQf8gh z<=0YAK6jNn0XVGf8a)EK7V<5jrByZ%$_m7(L zcerCJUR5M{h8_C7>n86Y@BWx!y40*sxh-j*mzb0&U8r98cs*y_+ZXPa>=%B#A4t(z{(sHgY{dxAP&r;Lj-uChw;78&`u@gHI)!BpW1E(}*OQ z-g$+6YVvCuZn6%GAm9IZU*1RlJEOHic&xDLB7#3pU-RPfz1+^eu1DY1O)A@q8+QUf zmFQIKe9#FTHgL`n7!hvt$@h%&O#1!pj~AhyH(hU*v9rK!Q0%B(coxTmwvHZyuCI0f zi`_om&RBB;3j=fC=3ujFYwq4m^jru>H{3SPw5oM4mD+0FDh8?rb#J&+;$C!X#2&Fu zB}G2}K=EOL*Qo0y*Gu6N0TKy~zEYW{<)$A@qozYQZ%(T<6;k?potQIeW$9;8E0!@? zC=n(oKT3gqOX_gzz@2BEXCt*F4P@ku9}?co2(1Ziffk{Nw1Ssk4B8uv=!;cURJD#S zjd_i7jP{Q5RT`Nue@Poc4VeoQ-Z{H-MMPI*+_Tbi&U#6pKBr5^Vco-CdqZVBEr9vj~J#ZMQZ1pOCxFIEwB5{ce?LI}B{L*|rV{+)Eg%F{Px zEoECmDLDJ}k6?{p5yG-7F~66|OESGyG~_;141xyTOJyN>x^;K(>3A3-JeFeD`OyPG z)=JhR$F`u4o)5B2DZciax;NK`Z%63ea2Fb9G`ukZ7gv&2_^jX&MgG2A@GG08PtQsE zb}K>a&zZU3RArwaJ78xl0cryZi;Aua95OY${a#Be+2&ak@~8wi4?$l+|D%9MPjsK7 z*NP)fr2Tm6#1dj^AJiz-ljYqKzI6l{yzKpSJheZSiDH~0mpjqS%eNDg`CqpCUf%6k0NPmR$Lq(+}d~IE9}gY z*yT&r!dtSCaoetw5q?$QBsar;`UNOG)@T^fffQY98;IrjG$z_?*U=-ObUE;{|Hsfy zvNqc|b3yyf>f>#J?J^GWo9SE@dKR@`JWTdeC!g_zW`{c8JlCC3FRolDWOys58xj_B zi%jf<@gS~}B#cC-b=SV8c5I9KEa9LkJcBgGPWz}rUE8amEjzDzW%^+Q!g)gJp!Vl+ zz$^tyNV7bDs5-J-6|se8ozURPEz!JH;8$+8728yjSCV0UuFK~;HtZC$#j%{dOtxIb z`i(7?PuoVjV#*Y%Ior>Ao)a7vd^^U@r%0xJvB#{geOK>YDa_WhPNRCasdUmdtY5Q# zcI55Il~S?s4_~HtuCLHoxl^vNj~rUYywK88kZ*6*-AALxf~&OpQd~nS6GgJUIoF7W zM)cQ}>+%DJl}oR$sI7SKnW7^G_g=5s3)PD%pGVCq-M;?t8l3lTm^0;ZEF!itHY!f& z`tNtFtT(L$4vMBbt8^9ga<47lF?0;_NejO`0oh&SFuBF0<)PJK6jqM*Wv-Yy^ikT;dQFG- z>6F5v-f0=n`p~*!os!S_Ok-X|dIVXLjaZ4#wPPQ1T6<%0WX$Fz|BLsNN6{~EHs(2T 
zELw5yPu6}Ad{!bx_2_9y7WLCfF_Zo~PF<5pav~>L5o^Eu^9^1YD#OEJ`>dpNkv4=t zJpPh6lOmO6qjyDvZ&uG62m3D?J1inx)F-#qR%Pl!ymu&=H4N$(w(D?WZUj=q&Hca! z=|u=sTgFU50f!Y_6XM`spv1Wdt}cL==!M&VUQ1qJ#=-mbIW7*)YfBu=lbCxH!3*>A z9=tGr|M89&8H{rY{6zv@9?7`>bvHqHGTwh(U)%xT;XGE8kdXoJYNk$5sGYNgy~`|$ zsSSJpagf$=#=*JCfO%bzQN6bT=nq+{YrAMG$P1d<+j5$`uzwEa^ssfn(7_S*5CoUD zP!|(g4_g~MXF(5 zUYH50K9&5}1Ox=Q?my&u_>cqK!Qt#_=VIc) zVdqT$2a*4x^AzfA>SXEQVrg$ji=k`s+}_nifITd1`43WCq3%eZceZk?^kv{^P5^hWy*3+JAeLmyiGN zkN)kee}7cN8R{fqZwm%>5&dhy{`K(Rzx>yO!dw{M|0at+1pVt%AZgLd!d!o@n&{N8lIpAMfD$Li@;$Q0*2DjyR6Y)5q!_7gokDl_zKEIQ z0xmuwF|GK2T-;x|C`lL|d}+A)YNf+)qi#+4A&*8CB?$;ANLXz*0Wv>hcimir%ML9@9n<($GLY& z7)M`}5-FIH`X}kU#0lUeCZ=+2xVL!spG8QD`zlS5xr*SQ@3?}GFJ*9Frc(MJ`CABj zIr*6sS@6%xxqy42LFJZwBqzx~vMxwR%fA$au*Rkge+Z{iT->4WVpMu4c8viy1t<~{ zh6|>wQ2z5B|0|Mzm(Blb$$#vi{~tv{-hsP4-^o>E-pRQ=*PeTIIls)1g3nH$g2$@Q z!R&$Ce=vdB(Rm~n8oF6c`k61_;xTM2q(#7sIz_8Sz1MxA-`}WS6I%I#_;FJYRx+S# zoy?g+jg>#I4s*c+s-Z8TozviC&kAvc?8 z#IE9B{eac@C6ED|HZampNNgc7xs+STX`bk5cKGd9t@2vcdhJ8El_95%hD(E83kmB! z=C?TnqOnrJKe`LdO=zSuA+a_Gu9ktThQRbYZse`HgC9=kr~8AG3-x%{+3meYMvztG zy9NC{Ot%>RH0OW2s8_y8Ohq(1%)^UozzHaBLHc~LabKwjY?rHAW+Q%Z#LG#GgSL_n z*i7Mleqd|^P4Dcy!U-S)woLQoPdaSm2=ks`7WRApqYJofOtk#E0%Dzvf1B2yJd0NW z`|kE5Ug@QHGayZ!=7yb~`h|uMw{S1u25O2sJhI4=Y{uqQ$x{I_I>E2O&y1PDUw^8DJkKuOWr-B>qf@MoZqZxr)PZjfIFL#EshZNfG+aXtuPc%^0OPIUc!Qb4W< z!l#dI6*Q{=xe}#-Ys9o&abU;uQAA*kcZ~Zn`W=?52_YuO0n)*twR-iZ+W!5*7Tkxf zcVfMO``6aT!YeKR7A38?aTkaQ30C_S$gzn{LLI2%Q>KA0gjX+uA(cNqGsQwlJUI?5 z`Pd`9BU$5*fa&1jpxd0-$mJS{I~1d{#4iyNFDL+oi+k6MP0wk+SOI#uO&YmCuU-P_ z;1Kg*4V40T!F{&Iz+?q<05$}dSFNxq6IgmrB4XP678il7aIy5WCAHSZ z4}JxPWNu4+fMpBPb}4`*w~OWQWw_r425+?aMildR;s0y=fPP@ybXV1q@$d@Z(^}Y_@(%5a3AlNc-fcY=75caT{j+V$^@3ant+g) zycpd5ScpeZ5{rGPCk7sYsQWDsJFdiIFywD$-Q?J;*8(Hhczo^fS`u3+}OWffe@xioVM@?TB@1d6Ynfavb}CV&c&PVdFeiz$RhwWFj!`ZEZyc@#G>f zZiiPDB{pRO(l-Rsy{ZeSCMQr*V>4aGZ-80^FMOxG8IXb*@^X>IRcy=P5(PG1oSB>v z;PGz@V8v^IqI*Sc@?)KvW*AVR{&Fjzm=}Qd5#gjVVv{g! 
zS1=fth&_vhwo3;vZC+y#hDDh!;J%_S1mh6o1FBt$Vd2Jg81YPphNPh07i z$iA${0oq4^qe6;J!XWqn;}XEtAjAuKfa#*~0&Kbo4BLl6FwXzF)jxLH|GL$G*Wv$l ztN&q1{+nB!+#9jy(lYdf9_|+U8{F|;`TEeT?c+n=qaDQg*>Q}-b&=w-EKlrv8UKg{ zxGJXjtQ)jnmM+e9CK)AquT?xW?}$fObf-RanrV6HwX^Wh@8kdh5}^E}YvmvU#T)B<1J_N;Bt{@7h9-!9xY~}Od~`z z@A&R?-#Yc1Y54#R;~8MKE~qf0;5L)TpWY=h|30MfUY|z8F8I5Hhp*^kpCj(IL1Da~(7&^CRM-PQ&W|k}*@^$PvLs zL1q}wIm$*=Im;(sIn8}CubnJ|gh@u+saa_{C%A2+%sXOJ^w6~F9Y*mYC$2;oiu10V zeeu}*xz#0cJ^Hp?c}hG`t=(g4_fZvz^me|9>Xk2z*PwH4pP+>`d#oTcJ^r%&z!;l$ z{i!1xcLUVl==<*W&(4dTdkQS3M;UG$2R%uX49kg@G;QYXrzd6Xvq}q+8nG=i5LieK zL=NcL8-Q2ME#XcYf*A1}Bu2;Ws#Z%84pZL{?YtwfHgnci%{l3XkFq5|QY8ctd}A(Z zokU|?s>ANtv^c#G*blgL=W~Qo9B+R0H1#@j zp;vz{5^TckqtD#E{e`*rsC+<0!RndLlk~y5vt8)v?qH$wSRp}@e-D_524~Gqj}w?I zT#mCMogfrKwtdv<6pSf>?2mifR~*j8Eibb?p4OW<8bEA!2n{l1&v2H0EgR_H(^YT4{{*EF1EV_w#W?e>0Z z1r>9#x|Wg1=;;WtTA!UX8tV6#;o;YO$PHF7Y+UFptKH3~ym>k{acW@T>K&l*b>V(H@MKROz$GqgX-Vx4KEIdiD_I-xta^2t0g z`*`5qJ!#{OK$?@}v$G-72o4CIs^vq=o^(aGv63TZhkgyM=ym_|<5^JacsBeZN&zW$ z+A9{#Q96{m!^basQ5XLCcr(IZ|LmynyfQdATiE;jc*1D^k?fD|Hy`Mc)fL-Tzw5#u z3$GUUB8SbB48n!h%D0;MYQIuBw>@yB_%3>_!SdhD!Y`}(8H9aY__(>+#)p&4-d)wJ zw)SlPXY>ANsUh!!q@9!E^lJA{*ZXCoF46dWS*bfen*}@=d;VZERN)P06@iH}UwFRa zO=&)EZ_CT22f2pH z36r165>K{b>e!9Wwqj72^+l|fj=;-!QT^2Y>?);WZX@|$8X*>l&UV|fG0nB|>2QqY z>Q^?c*&WC=QG-{HSY{FFCxgiW&(aws*bUX8>^+=Ogv=QVL$)onfQ;);3bw4ddvD2L`a8b4=7A={5uvpRD z4XT>TZo|-+$io(AqcxAMRwlWeH;MwYFsG&LEI42F6j4b#U)^Cl*X4$rJOrJ*&hTX> zcG)Dg?!D{h8`n#oSnL|r!MTrYFsN{y4$2&^4BHZTVofV1v z5c&q+0`ZCS>FLyJNw)5-l+Q%US#3B@`0hvRb8B!9 zH9Ol;qNC3&n^AU3_iT)j;m-|VtZEJG`puR-A+@TfJ5fm|=_lPxhJPPRb@z&;F@%CS9U( zbPjA}_1Zej`n=iyMCx!_>}+~gi6c7%SwdO>wNJuC1e(mVXh=rS0479f%=k#{wU9E4 zDN5%XT7^~d%r6BVL$Dj> zggGUspUFtrO?tOwUUa{L@=f|ThG;?H z5S8+2EnPWMlyg)N>BCC?N4=CcS8MjiOOBWt?I(ZW)s?eHLDLs!Ek8hwcfd?`yVG8qUhi6nHfl-+di zY`?Bf^Q<^WOLZzzoS972lB+?Q^#wv$7hWMprt~U}9U8;1d3IcPUKOkfQ70*QVV|I= zPF2GC@?j>I-)876h!OJ}T1LkAN-N*#JxNhN`Ht~d>`X~MPDC9KOUtTfIe30UepG!N z?TlbIfHxfj)-~CjRmXOi{I{6;1)`+{R4ZVB(xrm+)YUbV5z^Y_Z-XfPqC 
z@A+cVS$1MsD#M}E8HyT`ezsJ2mJ|HSa$C;mNR=p@M11!qm4Qq5GoGDeOd7ZU0_k&) zF!U|DAf+Qr+#p0F4s4oot@&bZnR1ErZJRR5lHUcZt4RSI`2%%z0>=yosrHp{DVusTkhN^{YOqZu4Yq2p3%XT!}v(QjjG2qAsI6-fJnl|L%5-XWO{y zHEfvz;f@B#^R!szZ!BbqcTO(k8l74h+_eLVPu}BavGU}RHGzm7{k_$S#mu-RgcHW! zq)2~@WB&k8Bcn{e%P`eGake@!kkJM-$1hR9dG5*{C(;>_u8$iT;kT3KYLpnpH(u^D zb;am2otkw;Whhr|xzAQq5<4bJoqGBt6rV-t5vq^F|Ap{%(GBCnzAP~fHhn=IM_5fL zCREq`u)BVKwqAlngx}-;?w~BT@$*I-?@|pZVWunPg7PW<0LHB&r-(p{|_a4++=x*2@2U z9nz|*0kxA3I7`U+vk1P<`dq{i|_4Uf9UGzM}oKCLgMlp1gJenEZubvli)u~9FFY&7d z9@+Z73y5F_mG9B1yh_P0Mdyf$9sfL8ydIUpZE8{F#@v3-{jn~DhIf-J{A#z@K7R_H}1_=)PGfU+}k;F2Phm52} zh*sJ}q!lMXPr^m^hAOM|=9ruxfZE5c-yKB?qms0qIlyM|JNc*I2^Oq0!X}wDG(;M| zCZ0+=!2Hsmb3|(mqm~rxz0?w#6Xy>?eBlWVGCjVMpRL3mI za23;IN!$$jaQ@Ws9VPNe5vKvLac;%OZ5OfY>>*mugtM8EqE8`S10YR1IeI3A8_C@% zT=ZhNQYMPA|J1q1?@6k@klZvhA`xHvc&*BQOw8h0fCi;u&k5Ig3~QqQahZRv+_#Ka zmS{mpDx>_zFDplNC$n73)0ntVVDcp%(p+@=6D5+pF5WQl{d*?VjPJ=bO`%)?badF6 z<@j1jc=2}4x^Lmhin&;BKY#WcSuy9a#KinPW@H)r*%{;V2Sb09RfVUGLs*^FkMi?jTlILhT7*%k%x=PxvJ$@sM77k z4pwKrcw2?9G%Ux2Dxa+mye9aWfLG710-9f{Q=31KL|T7usCCPW=CDOdkc0|ql&Wsa zz+VZX&ZTnhM0}x|i>%khP8>WZ11?#W70oN{@ym9>=V!}L_eQMCT`=*XIOnm5Ehf4y zYUj3R+N?h^YU=r$JKF9@3kOQLL%B@v<|0>Pi7na4> zc~MQXY>yj4_Rs>SJH5+(ykfxXTmMkXF*ju8UmD(J?}g6}fx2KFVme%CV3HLqm^aB? 
zs8U7{x1P<{X9in&^UmI9ClTid5xdp+s(3f3GSEzP>hOA0xu&`!EaZ>WJ1f!4s$wZ8 z*Q#H+XgM)j;Pru7JP=NeAUZ{l z`<4zwE{?C?XVT!l5*|@3`qE zK^3Vts=gCzdh8(sPHN<~4j2NWZ)uD2qe10r8R4Ka&Os**my@%Ij62Jgmc7^p>M7d$ zS*=b^adwr9p@JVm(t?<>`DI<@JCvTu;s(YD4QQK&RW>Vq%iJVwiaFe#i!tJnLsCz$ zYs5@002T_!ICaiA=rrw{F=1n5#=dGhH*teXe2*r){>el~L^{@ten952sAcx$q$JLl&C&MjL9)!+V>Q_75kW?KiL%NtPmgu)3B7Q?(`8if zZJUi(BGCkb{K&{xNvA93s80X0ZCw`l`!&}>Z-e2SY#3#KwV9^N$zEAqzT7QChlYzK zs!RT0!BqBZWpNh%?~&C_DriNy=B{*(VTaIubtS&u-Ufw;mrK4=t0*5Rf9V-S=FiU- zB`NIHwW>Paeni&~{jG<=R4YX2ya@Yq5Qdxx+64_;R?b%*Rdmt}UDPjH7T`7WvP+}Bb0p|Zz+1mZHI{ms@y z1sy}KWTzWw7Tx=*y9r~sCKX0m^39Lwh3rT7Mj}4-UK31pMG8dM6RYdlDlUtCXQkAsIK<8wfMS3T|BF5hgrjlzVn zxvC_==|2|=L7cYK<9z5`C54v#U3O0w4eEJIuG=E02@ZdD!|L*21)B4NW>ie zlP~Ud$73Z)q>S>~d$1rqnuGdFOGA4A@hMo!z7J>3o{eDdgfkqAo*#C`aCh*V{!S+K zwGH8w74IZ#(6I*4N1XOP)3;He%2oEQ<~6)dQpmr9{}!jd_OA*eCPq#_Vt(AFB1Nsy z41-SVO1(-WRpd`uqjMcHc^%#9yXMT`{Zt{I|EsI4-UR9UjCLPrXKEQwzi~5_vwXgw zZ1TXi^iZ^)-+t8u2hG{WYik;kOoo?ng05D+)VsnH|FG{^F#B%tfEpZaE?7G}vLCOe z96bSLDRUx4J5U^P0tL!}%Q5PeJ6D595GQNyb-Bj?>68-mKnKk})poEsJ+ij@O;Vyv(2eyW>&tRqHT`v_}Iz~vbc9#(P9*)ojK`l zoJo#`pt{UMbqu(h-@+tF)!7vDNxa8~9CbdECx_eCJnF3d2rktR%wHcI5ohVUN zie|7TYIvE#@QPN`MLl7-zg?n#I}exx+aSt^%JKUR*XlU?T2DWN2z)u6f_0$x`Epkn z*-<2cBDs;e^Lw-2MS_BFdpu4*c<8u-Its!tK@cppQ)YLaQ% zs0HjcFuow6+LTeVQIJ3y^V~!(!%x2JXI( zdD;M6Pc`N(s>c}uRV|5}okyJL?kUTDiQHg0tyoA6mCZ3$P1pftJN2*QuhQb3G4=Nq zDcIRuCFGGI$P5R3<4Dl=>MM#MX%EBIMKY2gK|oFWViSw^KXv+KaxTUYGnjN>YVunl zTGgqdg&Ll)uVRDvl8~96LC5K)e%2iPr%EEb>aC)Zp6m_s=tYXmSzXUF9rJh{#nZ#= z+1nEblB@nlXUz((-5|wsk`*BtJvp5?_vLAu9v)nLFL5nf?ow$->x(=KqYP*r$l5=J z;!AA~`yHhaPAUi2c=Rjnz_L)vsKGA2Zbt`edI&1AUY+E;A;&SXBF;)wz$qWi1 zbfK3ZSK~2-kZ+&xS%AGa94ngJ=~XHWXa`4+Dr~wWcNo6QSiE0K0h&;u8S67IG{K^S zVDD$vDYZk+v_cbPqLwNk&q*ThQOS-B8!h;*dm9-D7VsePi3R2{`5I9gVz3*`O*_{N zq1CBNI$-q##=zqv^X7vHT(u!eoY9*bQw-6WQgwxdVqEMQ;JgL|*6Q*M!f3~)a40Ij zG>`1TW(A1iP#}mh%uOi(5$CUJZr3y>&n2?dJ+<-eJcN8f6e^)R{gfB&_g-0DG!Wp6 zmbcvh+I%PX`1GE(*Xgp6QDGb{i;3Kx29lh2++$j|zduJhL?*|R#1A@L{;_)5V4c-o 
zV2uO1KjzM5U0CFnE5uVTYvgx`G8)+Ki2<9mj6fJ-|B!1@?@5LmoTQD!KU-XGL3g1h!z$ENng9u&Y9qi_y+0;0*JJsb~b zV+LpEReia^@m?iN8=j=s-D6QasdCa$RY5P?`+( zPjH;OWb%vFO%EM^ey4nOZ&cKwS`(Ou1t~xfJUDB#!olo-j%jcKP(5NLTPhYQ)L`#d6S?oPR zjz4})jVJwTT}9 zVbK#i98(5!f`z#&g|Lv%LFzeSM>#9P*h&jt;AGd!>flp7E3pSA^3BOD%$fRC|DgPs zzQlm~1PVADAM28ZPg)v(m7T}x7&W*H#f}!XmKckraX?v03ipF%{fBx64gO&z=3^X& znbvjZbR-nulnX9Me*4`gW6R=@-j?C2EOE3rtBG`uE{Q6G@UZuKjT~GYCHf4)?FUK? zefR05HK{3_!?KQu9tsEUP_;8)B5Hn4lZuiYg~5xTlZOCec4#oj9{K@UuVk z%N*NbNs)Py#=Z#8Nme(`K(GdAp(E^bW#?PyBl5i2GVZda$UcctSIk&q4$yh>Nf_?t zJX5ha0cu97G4&`X4@gzd#7TiUkJ8-o@O&Z%#YCsyUXh?VTh4M9}XFDO^RSlv;zPw|RoBr{IQ*CHx1Usnj48Vwv(&hvJW>cC+fmI99 zr&Ei_(`8iwx%}p6YM#iyc^Mc?*_b`h1o`wX3URSE4eZ9zEr9;6XvWvOl z$+u8Y8x2R`kHA>etTsNZ=Wk^9E7oS^3-dKR&CO>wiD_07C^8}CwH;uuf-`g@XjJ!} zBb}^jG^k3@iuF;n%w$s8O)?DaC!X7Jz8@vEBG^f9MFEjzOs|1*F6A4jPL+Oc89k6x z3Hu=YKI@wG%-$>|IVoPZd$%I8xGdSWV@s!F1X`Erw*|8)I=$Mp3~z7h ziT3H%iuhv%F4xNeK$@X!^E#nRwm5Dmz9s|)j^lfM@`nq>=}e^2(p`}_Ip)lp6`j5X zoR4FmyM~{_CLM>~}L61Au6`({j>YS`MU^=F2+3DibJ;(LIF(QRxN>Z(03P#cN|=MXscZ=vFqx|sw2L@6N&Meu2ZSr`!uSrMMeKI^4}_B#vYiH`~8#KA_0Ej>PTV!EmnGefh~<4z_; zfVRwv4&|);EDTc4Sq%FAc)ia5ldz`~K!LO^t;H^5g%Q6L7X#d%25mpWZ8VsK>)Ekj zpaGt174yiZ*I=6BaXUsH<7JY=IP3QE0Uo19};lXS^>s z6-0`}#K@SF-cq#lL##_rxMKt=(f&f6_b`zBL{PYP>XBiJPCDDkLcL1+#BC18R842W z2{YzEoBU<21$J!r-`g;tfJF_-ToY{WiLjUhK>Op`NC6gD8tAyVl~pL#9rIT+gfRoq z3pU8YB6cMg2UKvLTZP~R?1B4M<2{tRG?fm`r0KEU4OlB)wysIg4-vyiLqV4@EG7sGCl6(2AKdL$_CY}>%FiN)nL0HX=b$#6lg0{m|G`vhG)Y{<3b zD}W0Fe#=UEV`d}&0=7qU<);c(e)Fz@`7J4fvr&9F-~+C5fUV!=0`6bn z&_8`KHmh#zfDy)^&jL|^3FwcabW^jd$HvFi=U`acL@ERN`&2;8jVACgY&$Z*4)o=? 
zqU0Bu{?=H5gFN_IM9b>#QIfLR6X_JQ6FTfi?G-&qiw z$-~!70rbdzTE*f7#DGSUn(7bSSY`+K1!YH-taO9dU|(S5*Yfi*_W9iedSqqd4t9+$ zm;pR!9h({xYoCHI`^7GKi5qK!Z5F({b6XhO`~oozc`b}j-3hPq0m2S(j?;fbd5#Kn zz5TEM1@PyJ8{fyo4BIOUw*yQtO(z6VHgB-a!38vle74S%m4f__VQ8`tM)w8my%daF zfWU*>)F}zE%~S%@K7`NMO-$%g1PI5OD@|Zq7;-DXcnWV?lSDEKtg`{HNhQ{VycPp$VTnL^b69Dv= z8U__{YeJX*iY#Z_xI6P~K|Q?)#FpzUuc@4*Kk?e?R%{{JPzrow>URLZ3gJt740x2CQI}E^GEmNMQ`Hr`Z=Wec%0S3N8$NHFsx-%>+o3=$m! z==wCWVgk!pz<@N54&G64V;ZiF?)YppH~|!AGzM!eyg%y9w*r_FJyJ(Kro(2B zC(iu@fPyajJp;Ow=S^;lXl3M~sss%p%SFv$x3zdDe73hhN(lpqCxj?3z<@+v&e#W_ zNhrY1upbuvUq_pS;`<#m(!lL~m-SqADgkr`c7t9LrhX?QmD!t|!+&OfnDkd+I$9EQ z7A4a)gHpe)zB6=p)Yete{UITM(RR6fND6Y`+dn5+7gV`o%x17*x{F5)YFt|J_V0G}kM8dG;7@)Cz9SCvU_5_5#*9d>L z7gTR|u+@PgF$&DD|D8v-+?_&0DDKcE2E;T}Snf*U93;L+{JVX6@ORrq3UsqaQQJcA4O=OyNd*#cflk5u!SY@!Vx?33U#`)4r!BJCq7l zrc!*T232`Q14B@yxWQsqqu#aN&Nhzio9w!K9!CD;y=}umnPaWXg$9h_=Z&HQojj&> z>r+g~`1!e*LxqVEk*09Nw7mXeKF_hY&~N44sKhgQZmoUMou>&!7sFhuq-+N5(Cy3Y zVrAi3NeNnOo)&JVRmqrU)y`Fi&!Et3+gO@^_r_#H&*InCE)>4a2iv=Sl{W=#z*J+YHRVl|$F3p0!WN zT#2oVTC+qaj8*d;Ilr?WH@eNcKH*9kIN;6in^0wJvVF%dRk+^=zB^AoQT=**|EXwm z6K~S0z)jvHgF~vi{liYgS#jL{25fHoL6YfV6|Djact?KGOgZM4zX%T%cfiyon|I?mLv`)pgh7PJ+7 zY7R@gK4hs=1@Z86oN}E#VOwV&oY-|BGJ~HYu8Yky{VZk-@|=Dw6s@%`;ycXk6u4?W zZ=lCnebnK!EfTTkwe34h3O}af!*eEG847gcytG!ji2C2Z@@pYFqixh} z8mgqkyg$}k$gDNKC0=J|p=4;Ek2p2E4L7|un%y?!ghaSG4vR-H7?6l^$txA7jI$eB@?2_hao_Vz2io`H=LfAnb~F9H z6}v@Hm&NNFR!p1Cz85LwxGjGqA%0k8z0Q8V6?g?1KX)3ZZ1uZTR64tE_2)Kw(Mxl@ z-c$_aIz^sIVYh_}Rv!3HBRUo1VglVKDj_82Uz)pzV!R5i1i+vx6% zd4?j+3-mayT_sP? 
z*RCcD4$2ZcZ`kQYpT?f8ja3hB&Er21+P#TMr|0*%Y)fBq?LwC}u$%g@Nqu{(@w zcgN5_=KVTP;b_5%h-nexbS`Sy(^El&+uiVRsnbuZqw=KGFzHye?qiP+Z*{3>(&_`> zD8%me`cy@`H*J6lx@IG`%5Z71=FG0BH452%+-@P-oEhDo$?kidDX3Kp|G;ZoxBMjY z*vofK?(}(_!T^_jUXp24%xT)2AO?4y$4h*QdPh2Ys-5er?m~87RZd68?0Sh}PN#qO zNu22M3HDu%iHX)*_s%QJs-2mxvggaMt~eB)c#b~e`P|le*nE3%;?Vd|aT{GzMdPoO z9_H(Y&ZBZ~DS%EBF>xGA5nO+CUOtJ`TjQD#nnrFZJBPW2SD#7oo(otdt)i;JM~i}e zt#=4>M=Vhle@hI0MIr%qx3u#cRP0hkw!X(Z9fs}qEybLK?xgJvr`U{)mLnaT*6mr% z^}XbX=;nk+%pCeomUl+${nSWps?#bUecJ~=E)Mp)ne*^{oa?YIWM$+FRD*8ACEQr- zt?YD5!cVUJAL`yZD(bcEA2mQlL8K9e97&Oo8XBpgVF(dv>FzE;X&4wJXXtLErInTj z>6C6zQu=(^dq3~*eYWi9{NCSM=d5$qnm=5_@;i52_jP^ZntKk=*o?KD%&Si-@0z2l zZa8}$CwYq}y}_9)KET1)PVx;53>m^GATc|jxh?78Ie5$)KglH(+^5~uJ& zZuR2L`es_Wh)?!-{e_L2Yok+c-p4Bx+KSs27Rg>lGo|gPC(REvtW*vytyUY0fb3Z} zN5S}$x&C5%_jo5!Er0n$S^eWiWlxe=trZOJvGlOZiZ@$fh5P1~P?Cewo*QZwt!1lz zboZ>@*h1-HLbWPS9ftBA(wrva+qt(;Pb`mMQebfOJzMdF{bbhbb!-9?A%P=g z+uRu~@dm#sR61KT-oEj(9=uP%2uNUH@N4=lYF#4G_NzN+O?V_Yn%||muYda%` zjkIy=k3pV$yEs?+JeDFsr?-0k(5s@hwcOEMw{+zY74rsp6iBQ?TDh)hcMbcf8Ff`*nF3qo?+pH9b*w zO*pl&WL&d3FO9OofU`DqCK`BYqK(ZxeIV(LEZmuMt}_ZJ>9qXeF>HQ8^?A6*cw9~4 z-1D4iL!tR@?^J2sUWb6$$C(og$=2beo>?3^uAPv( zxcA9Q+eaMLyO}uS!KyjxkUidb6g~CvqL_8T$IXz_FU{YGu}g?;PLl`t?LAY=Or4 z7)XbWAD2v{GQ(a&MJhvzs>^2TY8uCX)J}V)Zu=VrwHvPw>J$Rm z#B(7le3A=z_&{CTuA4d8c*t6LNRjoe>FS=W-7OY|Dp&hGC%3Ye;qEOi=JCrs$Ep6h z0B@&@3Ql6}ekQe%?UIAGjnQ)pC(Fam`2Nk&Vcw#erH0+T2gSvww1)8=PT^}kUTW;f zQOg*9(n%HWN;t4>&c}vjeXt3Oqxz8or(vc5NTxw!Hd$qy(ysadF95ROMl)6xhqR1_ zQhSX~aF3X+x`QDj>!Pp25CUvds^s_;j?>SH!*whGmbDVJH?m@{ICPwB1E2TB5pfO^ z7;8J(SEsDmd&7T*tyuMJ=mxYocNyuRKtH>^#4yghyhSf#OuANF4_7)PBT#~kJ8ChX zQIqdC#!>5?Mr&mkpbah=%I#sMkoY6<4S_n22&9ge<4>#}ZGP;7?D5jMj0$8U9hZ5` zzNYuMJH$KO5TkCE<8*ULU^H3(bs4w%VP@w_?@1rad5pjf3ac1zzJ(+1(LVt64WrBD zouJLfL-*h8x_VuL6-L%)Pf<%Er}hSCcRAdH> zTg-aQfX2#eUo1?WSab}!sh92~EbpXfruRYj{7Gg#8VnRFHj@cLJp6S=-%G4;o-!F> zw=r$S*CVsIX8oX^F^+%#R(gNtEi!C)Y52f&5ADpX1=nORxhS?^BlZgspefy*^R&i5 z;R9Hkt-cx|R9yY_P}ViC+O;AFBX)wP^sA%Q6zx>ph(1efIl^1~%*Fm4!>L?`&=yF& 
zi1u8zRvAg^QV8zaF`V_cpcTv>ug#rC^`;M6;q5LiPx#AC7J`A8Wj`oKq25*hoa#k* zlG^F>Z&g}7Up`i6WL4$I_*sgtPzycOddzL)faN-p_$%NB^$RsC4%)XaMLi!`Jc*~f z$A8gA1zTRAyKHn^K#n-K1P?6mEI5R!+zon1%&aiw0kVj-I)b?Itc!H1B7$^si(DTi zP)y6EH@etX<+%6U(%`bl9qAOyY(L~xTYa2V{|1-DaJ+2FyX-V89B%9cl_N$!4`GckFQ6J( zB#Ns!WaU_6y_SKhU3vkEFE*8{Jw7cw7?051&RyO~LOa|-`|j$Q6;@t#_}=!Q9`*5P ziuNsU*?5)BKpIWYg&Jv86-9~0`CVdxqks*CS%KdgZGgLZdC5UEIUR#6mj`7sW(Sf_ z(O!8E{wf_FIzaaHIWw_WO@AtwopR|8`-$Gub2in|6_$7i$LYZuQL=s;wr2Z|-|`8i zEA9A947Dp{WxDMs;Itn{|m$k{N>7$83tc_6< zt5sSK6LeZrKJN0BB9&S>9nye0I_-X|J#u*SptH6)zUO7q;ovMMX*lq~oH3)#o#m)X z_lAIP`E`V^lT$r+3^%$nll5H_4=lL%k{gzny}aR}wZo%HI~6`A+{t0Ci4}mHAwKMc z8WqttKj#^o^@zzX?wYW~+(pX$N>t1ef(nuj&Q5Ft@wZ=bpdNFi=?$+7&;aRH5G^g<19k&}SW_e1aZSU;L;bUOmB(YJbdT7en@zA@~3AhWO z*plxzE2cl}D(!98U6$CHmZ=Ol%O|UnyEIJD+wfU&Rx>NdW*P%|{k!eIz!onY14?Jt;?#e_+?N9 zW7y?p_#uNek42Td^Vi?aS2BWC7n>;0+oe>8$7<8#o5k!^p(Zi$kA<9i?DiY@qOcom zlQU(OIAkUu|57Cr2|G#yh_vjXJlPm=7?gL`)EqT9bu z2Wa<#-*@1L@U8@FbUlup(PXZ^n5hXm=zxLOvliPytmzo=!>!ftTK~GvuwR+-^of@1 zp6Ak(dQ~($%t^t4m~yc>KO{*bPdsfAqkvsPVG9BRSlE@UkYmti5+SPC*dqD zVAsmF3ou<4RjwVc_hjHyVQOU^XAq1ihl0?YtBF~#MMPK1LLDHej!9G zq-)UGIvt+5K!KR^@!uM}eNqAEo+P-;O!m*w3OAye**I*RFD0l~#Zn}HLxYC6-GQFQ zRFMIVoJ0k5f7jEs!2LhJo51#1oXfE>Qf{>GmEbsm5Tt!4l47l7qxqh75wL4R>ME1wT5&FU<<&k zYEI|uTIG(STSG1%fdEp*Wtf~;wW?5-B>Gb?4Grmue;{ z=!rBPp~vIs+)?5^$r|l<;jZF5+x_u{LMOl75KJ5i%$JI%@Q(POPu+(C4-sEl2b8Ck z?wZ20ADiYnj{w`}(|IQbbaNot=ywRv&v~mE__E?W`*;mye*e(UL7z-)hWg3u7q)OP z(kis$5=kK;cp06%=Tb%f6K>ljbeY5_v@-DJP2&*vZkXdW|hLWwv8b7VT?2wWji1OP1PB++S(O@#$DB zKR|-?+k=!XTI*Y43}s3U4rXYAqa;g?W)y$*KE98Ot}POE@9m@GWa3-f=r(LesA^rW zy}1N^PUG$=1oOIQ4(C$t7ktZJCy3@-(FV-Kau;CgfYBINe zY3$&Y%>12qab$B*!5 zDzz}h1qrEoQi3qDUCG8<6}hhl_J|W6`Rhq>g-VuCv9n=fz*3#K$j*no9hq}5pXS+v ztCac*$q4L{unJf9=d10%l}H6+G)IV$N@P#4Dp}d?l94jc7F)#9rg*bEUMv!~Sw#(3 zr5C9dOWpK?XZH~I*{3(R7`mj_xIQ8EZUs}$Y85t zv?wDZ|DYsM8lPA63>8anv1_8nuk|tp$_=xc@~p=0hq0|1=dG2}56)W~5zt-8goX2& zbVJ|Ccr*6PB1V7aH^oizzswEZVg?1>`@!_;=*P#K?(r+JyNowEa9>oLtN0>ru57DP 
z`nfHNf4+gf_F3Xq$Jy#9Z9cS}WAm3un-O7T8$+JF&IU<23PT&J)+DL=%c0$*1W@vy z`KKqZ`PY5D-!c7I31Q`Z+9YYcL>;PfYp5!r_F-RGaj!Q40|WJ|!G&g4Nvme@8<+|R zcO0ptdy$lK0Ww~pom9~}t}DHTiW!I`4XqGKv-Vc&(UQOrw8425i!rfKzsIgo@w)7% z%wC*T{Aen_)b4-toq$)Tzmb1D+HnSJbCYvJUTh6k7njnDF83-fB`_U5cUBQzKSMAO zD#@9H$hw&g@@gVT2%P-UB;Wq_q69icLm^6cGeSB^;uYy0gjCq&TaBO>Iqhxi>x%4b z8Rz!*tH7jS!gl2syO$!tnKR^kk9O1!Pd4nM?R!oh92lu5SeZjVFwZ0DZvV{n;mcz7 zHB%AEV|u?{-^r)-Ysgk_kK$$`31gTFqZz(v7}MECy>^)=V_?yDe8-=fe(9aDV%6G? zGdY^~4-?pA<)RRifzfh2%!?|Y!o=3sGIPH1v*W}eP1`Hy?87-H~@&|@kWDG-YyHvd7jELojRr-gDd|8I$VIKiyh3Fn+$z2ot?P2Vv zk1GW-)HS@sM+O)4(n~yC0+{UYcg_$_@j_hSS1cYPvqEW*^;0%o>}!_lE!s)U=eIb} zlLxDJ5Cn|R_)XW!Y2O{|IVilWJ8?>2zHm@?`POkK>{ehTD(j|nWI0TV8o|L!ITkWN zCHH-#_wU9d7{e&}J zYjlHGeQ^eYL&_d7#%3EKmu9d(86$_W%IR9y?FXj8MJO*m$1w}DBopdf{YtYIPtLf? z;btk~kHxM!`1NdHMP$jEs1;|0@3%Y%WX8|Jbp1Q6h@e*yt;|{~L+Gn!)oUzSj^5rT zHT_v@F}UI&(|rW9Wd7%S%WoWxScchX(BC@H*ruLn@`DEs%4^j6-?0h3;s%M-Fwvfl@(#gprM`OuUVu2s zlA6w%x+A74QnCc5w6jVQlry*&9J@2z04WZkm8Y;aajP+t60Z`cu2$h^T$ni{8=ioY z6mmJok%dT%%iV8L%yk#D9cup&Mxk))iSTc){uQ*7ddM!5aI9n}bu?7XJgO_xD46aR zt&CS|;Q$mVUs#M{-dF1osgQ70?J8_!I&-%lMmN-A9px~HZxSNL{alNEZmI#krMgSl z#hE`~YK4$EFG|E~_-a>ZBqcyb5aCl4oyqAXFGEsAgw4~P))<^I!^MLY)bA72UG46M zb_Z~sPWTH>GzaK&zljmoQ?9&x${^_u=Ggz`rtL=ll+IuO;vSzwh}e3_UMcTr!-Se< zHp7kSF_%t;wHr&PQ`ls%-gG6twAz@^N))fM7sR3%`-Sb0`FsA~YvEN4sLH3z27{yV zNitK`22`{;z7}UXrj`qJJs=?bS^6lE8UbBX^3G;K_lJ(DcCZYoj;&Sdyd)9*>5I3G zYcho6`eawqT)`9{V{!?nztsz9%b;nE)2<@PHKKlVRwCl*s)f;+&&jCxjm#{plmpLm z2zErTNKwRT>LeqF^o&d^YQwf`{iYD+(k+eIjJ43rCZ$x1C3WA+tcuK4elyaZPjQiJ zZS9QBaQ{9zm|=s1+NkxtjhHz{^h(Cv{`!j^hYr7-+_f3OHL(|AVmB*)VGGxFdnXE2 zDFm_RZtPpX(%}wHZy>+vC3xKd5Z1xFR|A=FeS)oly)xIaro_Qf^|km@rl71Qei2H5 z(A+9viHN+$BnKgW5IHZMuBDjD#LF}0Q9LdFca5M(5<8@}WeR6>^{|}P+(wf* zG-3HJap#nd98bG6vQL9HKtE=Pk0c(~B%$3BI)%9s4yhM^$!EJEUfevLZroyi7`pWc`2Y-n=_z45S@Tf9fRx^`1P+lPj|(Ma=U1n z@?Sp@PGov{>go5Bk=TzoNU5J!xl>4K?3B-Ukx|%2U2CZILIS5w^|^896B3E^$9F<4 z_@SMFv648@=f>sQF+xj-(q{#Z`t=!)C!Hi@t!^OK@dW}M3yR%;OrwN9ogWwFLO9cq8b>T5m`89T=IP*Gjl@VRv%f 
zGZ@>13MJAm;n6ie8pJ|C^q8TVKlk^uFr21n=Ow}R)I)dBKCOAc7!FTh*}2yEf#IYv zAb-R3tvI$8_}tgq3Lw?4GuOG4+W{e3IT0#veN1?jsSPh!2zn_rC|x&BK0UT|$h(I+ zT>ef$irqC^`e3w&Xj{iOK&@wMfqZvC>UaDFyhHgB?IaqaWSVh6Ed50Mb*>om>Vgkp zb-!zkm{fg~f%^NGJ~&hxi9T@NL7mId-~#2to=8D6-LimtZ&`CSz?^kw^f!6Kt-W0e zN@2?12m&pJs@W9{)8Uz&(t6;(O{%SOkGKyvD=Ys&EI$-6SD(*clqUIYDECu83EypX&DFC(d{dHYR#`$peFGfse!po> z0WCkK=Xp2!qa1HnyFPh4QtmmzA|7=`ku?~9V&V(Pop zxJfAc*e8A=;12HX@N3hY@GVby%GLRthFpS!uxBP_j&UT=6TSr-DbpxoqOkqNU)nbZ z#7zh!kgwZ^M%RW1PTEoWhCg4X6Gc8bq2c8_>DpQ5z|Gqw_XrI-4*b4`+k-|5IXQS!hj6p`=TDQ_%X5+L42_a8w7k|T#R|K zC%mP^(@o+g%}N^*PeM4P!`C99^|z*aPzUW}H3k|?4~G0A?Yfy}?|6lkpl06pTd8yR z;ooGHY=M=BiawHVc;El^sNgB1n<|8>vw|<;+mK^cCd@b2u0HFl+y?nUWpo~dcdO}w zI#z52?XYQxg!3sq+>OtG>c9&OuMCDTVsB0je8_^_&rLNpz82))Vf9G~Nyl<8EwNgW z+T?xtwx;dnm?5VC57nLgScNa({MdandXgtgs@ih3Qffr2QM~o*CXZ$da2_C^ODkt( z%5;_sbA&P51wK=u25*6Yw=@=7Pp$zwQ$hz#&E`>4(OVifi0_m7%ZD16B2E*#s8SF|%dEC9O zY~8>>3l?s~g}!E4Qi(=YDDn0Ys?QCI7a^p$@V@jPGk6P(tQJY!RivHI5ix^*)S^r= zZI4X%35AfR(ZiYF@*GLF!jb~PHWLZP>7{DvXzdveLp;2fEd+uokQx!2l^F?Jjvp-u z4Fi~^Cwr$kiK2>jz@hxEN%u#N&~DkS>5Tc>e@Hbg7>XdjUpJ2P-N7_&dXm2HpQEMZ zLh@oce3o!6RitbkaONOE#4uZqSFH$&^mlDp;64`_AAyV;WDWKYZs(XIul|}W$F76z5Po!F4F&P-44|nR(R;|$(-~~_>;0_;^V6Zta(c#OW zoZeg;{c|Zgcueq@hr;Q9jDg`;4SNY+Z|~xp%g=hSB4W`rx!RqLyw0@J_`}il!P3tH zEX6^W-j+^}^Ob9G#XoY`z02w^EM} zO*BPYwyn;f^z<+%m5;f50lnGPHuke)zT&paIz}%pS*UCh? 
zdCdHrs=WZJj_x{5htkJO>g{}v&O`Ij5BJLg%FVI0r8Ua{mBlSNW12eulRD99z_O zk(9U&bryqiLD4eH5cJVs8_%T}%*1Gu6 z5f6HJLAA?HCS;d({?alS91OnfV)^%g3t-L|i-zR>^Bvr*z>*3AGt#U(QUbZ+ z{k-IA#c4%c8WV&|UWf1>t)tPyMLgf14G9H#lQrL^(Vy>l zQ9mzkJ0u_&oW4Ddw);33JnI0Q+hfK5Nvs0A(9Fy>rMd#E@FCsQw$mKmomw%nRtYFI zgJ6wX9X-6J9SVE~N>~sGV-)lMIE;aU?1Lr(JVEO}jq>(OP$FYnmM4#k*SFUB)xD!< zt#@Dc{uDB~J}Td$i3Hwgh26uFpsW?K#h0v^ra;Z-xO1HV5&rMT%w0Da%>8f$U%%_i zHy6`A*eA{HJ>AQLZNFFc3!8hB9={7Cf4p;!0QML$!f6MYwy80s%FF=?8Hw|Lfb;-?;p9g)Rj#c`Au{U#gOP9~RNm2H3 z;+l8=^||8)#`{NCb$`8vR%Mhs9YAQz2OUcxW+8>BtyfRILq3;Cv6z*2a~tLj(!0fZ;aPr45XcJnZKu< z<79^GPnhtn)GKgl(wt;bFzJe03nxe^Dj{%kCyvB>^U>`XpTE>is@Mevatsf@+AtrF zt5d9Xb&6A;z%?yF@HlyQu(p>aJknR_zO$&xp|WTLp1fniZLgEy*X zg3~Mg&B{4PWhPX(sr5fRdYn9%+5Qf%iPbm`>w(03ea)J}uDx?lsC6r%7d7H9{bJK! zfdBqL(3;W3D_ae&HaL_J`dXmX^V-A-y!iqsmiS_0p2skNB#I^jgLM9DkVtcBn2y&S zECOR3nn8NceI_xn)-QxO!s=9) zK4p0+JX19K-GM{DJI$TVX${G|0;q(1wi$z~W0MLg6(x5Vn(6c>o+abm9Oo>R`4SKe zQBnsuA;25XxdZPyH8q;MuajS$A~fHVNUN&|SZwnN)asnbhh>b>MqxPx2>B86w${v1Et7(f8@OSr1elG zaV$7AcT`7pzGHYKG^XZ0K1eehm#3;zxpuLf8Yeo+XHoA%sI1wefdS^{rn_W126?<8 zYc3478JiPg%rS{m@TO)`7}Z_})j{Ycf=v>)&B~ZGuMhM#zXIp6a*Pf^E`ZaRXm2?q zuEQFb^Z;_YZSaW!eqxlu0Qa!Z{&UzFcA15_5AW)nGwfCtMX#dFS{@=P2TkZFNb(0> zD>G8(w%mJLQ0)wiI;UZuqf?A9aqI`jdhaH|*p?)SQ_bKNNQJm!c*C_Kcu}%tXjlhW zjW~ELF4PihIH#NMeFAW>zQDcjz2SZ{*iLQRfi@(Qp(eAW4tbrYiybbpyg3Psnb`hX zrxwWiBv$~56mjDqV4hxBm&F!{!`Zjog3qx3br%4D*pAVZL7Y;ICHI5w@l;&C4Hpi1Lb51dtMNz8u;hAcUs0Rz5V3kjC&gr=R17au3!HM;T#+Q|5o5&D0NJ;>s`K8>Pp}@d_|k|2(+K`{N`qWX z?(n=+&j>}jItGFf5)(&wq!;f!y^l$n7VhZn@7`rQ-7fxO^x$sOqPlC3l4%`lUoH>{8OBCrPXP8Ru z-C2-Q-EOVx>)0c+E4*tdc58?tIQ@oCeTyD&B5tQ$od`|aa?gNNha#m`L`GiM^qKRMaHK6k5IAl^!ae!;KJ^=R*5!!+wv`5Jha1><_D{B2 zYQ0;&hIs>SWy+i59Uy@Gg41Wc2@8j5Byriwf83rG-|d&8Ug0H<4Tek{&4>4k*&2L} z<-0yxQsQ4xjrkn36a{?f&9@WpM3q3t{=om}p^KZBY*jpT_kJJ4UuX&y$1DSLKhat^LUT}3DR9oofln@4aIamQ!WGjLoJHDeRrNp|%B9ygdnq%B z{*qdMR|(*O|7RhS_w?(FfjKP5!Hmv;rt14td+X-F@%#whHzX6t4Mmq9WQ!ITq$D!o z`aeQ7iWzOA8dtDD7c0ezERHUb5AcmM_yI 
zzC$JUp^qMf2a9{IJ&sBm3-(WIrj7&*TF>bDjyJs8(jZP%B2u2Hs#8^!#3|af^<^qK z>XFFOw$xO91v3LO;nslz>J>l?p8`nTm$=&N0Se{NmGm2}%AHbBL(X~7V^cGgg(>o0 z02FVELCnrks0I+^-)K)Vz$Csd&|X&C?r+Uf9i6WzdOI~rEiC84@W>P2|M;Ymds~GP z2V8YkLF8K0Jl9M?9tYX<0_6_@Fk`Zm$m))^iG#75GL0!5_w1JP_Q~^BM5{wx6vR#n z*$7lg18cWCbff3;#A=BsAub#{wnzfj&J6ycV-A<)`@!G`ZBydcQCa|SOI)eBnLR!+ zTy&s^L{nR2Ax>Nx#BZp{uSpac99@7XqBtBX;HV6fP5#?v$&)Xjd+xJfbS)fRre%;2a@-oEy4@0|Rgkmtz+0 zov^kXbZ07RiHtYjh8&6Xe`1|Vh)^9OQPWViAgP>F5l;hr{i^=VqS68%QoF8cugOa zE_wwwFEjzT`Nnp?`ZVTyDr)(aWAf?sQJpgT2tb?MqHCvO09@O{7y>Fu!`CZjx<+dI z4 zHo&bnGEdaD9!}y$3y;e0ML6kscwGc?46dc;K8m(|6=$P1=VB$VB};OT z#@1oc0i%+fKTK^_=uWK0iX!xMi0pyQEEfBJQsS;6xMxlbm62-?(K4cv%AGJ^V_gnnI5XJCYdWkjo+LFV*V$z4u~O)#iqipjJSNNbIXD}tN0`bn?#oq^>s22j_zs{>ymX)BD5_=Sfgpq+C4ChAZIgPq;^7&FNA zVXkEIM5?J;l>(L%W3sZuY#n7^-LFSAi_V=3j`MZb#^*1Gso+zq5FUiE0EFegq@hH^w1YZ<#wx89h1&01z>h_o z(!=?mxLn)01^BlzfOBDee0S>`1KijDBn>+aQgaU)i}i<90dIH&?Txv^F&veewF&{s z`kaMnq~9egz?-OOU0DIF1mdSwjU7!8XqJj8Pv^|;*fPoIM+=1Ykx>7^XR6$Y(lC@%iL(z*VXG2z3 z3=u1{+9vQ`L_eq{BV2elaE-YByL?6%GH5KS04}e=bP<3AniUpJa-m;;ZqW-?J_Zz# zF9VA2oYbCL%neAAHQrGF5UV#<5FPkouN2zq^!QP5x=bLrij-N6G;jA|>f+c-d~&zD zSgqM&{3D-@!X*;!wZB*(def4CLI9YYI&I;mhT4nl??8T(723XbzbSnLuOuY8v3HlL z_|PxLMae+2Lh%8~>Iy1Wm0|Uc(1LU>pX?si`NwqLYQc^V{1u90Q^5&9VfAv!CsMns z>cl)pT%j5@61VI*(A7sv{_q7=%#xGVc)K3Z?bv(~jA5B&!p(d|F7Pe<-y6UdQtKn{rMS z{aGhbGr5y9{xkB#Jj3ecQb&WHT~e%!JH$k93txF0`mVA zC&`rX{8kln{NU4jj)hwg)g_7iGPQuUSnDjvI!X^<`aOU)@CVbE(W+)TOKzGQ^$rSN z&3L`e7l$7`AW%s9Zm{==~hQ!KyB!p3@;(@VKL+F`S zF|yqK9W4vA^9O?jDEHA*foiNUT+?V)i|4eOd#n;HHQD;YyyD7$8AJ#uV15#iB zzmjrbc~BRd<5D(`QFw$dL+#q>xTkfh?yS(j>%{P{Ro>|AJ7-mJNygyxJ$;Rt?;4=o zS5vizwK=#_#vy@iQGDc_g9Ee)6{?8>zEelZ;>uFYN^L#ug9>M$5+(L7MlG_|0oToc z@;p4HuTa=9fWn&51HREXavB&aAms?x(x{tpNXf>Z5(8*?0r2Jj4J`wbj`+4#FK`j~EEVB(8XbY)y{b$y zI&Q&t^?)cc1vh%%oG(aZ5-grVl*WkqCg)`~cqpk|CgSEX2%uN-XwcjBUcMWJjfe zAP8@J_4F2Q+rwsvxY8~8x(jfi45|N*)qX-~fL-uMS8rx*kDZ(#_@OMaZ>EeGdxm53 z{{R93;TvuIkn*(NSkXbZUh;!j_ot06Ns{L`2QW;t>;VBb!Sb1nGzp)(LHVWv^gYwb 
zWN%PrukjB*Jl?AQr~@9<&!d{t6uHzwdj)CZ_b0iPc4ap%==0=7=PL)x&??ObYO7EobWsWq=eZ*33LnX~-LGEYbZE_5)rOeuD4RijaS zdc0XOYA@bqIzZMlOA^PRFjAzD!=277M1e}1TKA63Nj+xn7k!degt4S;E(x@PIuu@a zjDiOOS7_k9^iX=mERvRyZf=<)Avb`5Pv+V;)GbXWh%TC~GmFS7!qe{y@C%06$=q*1 zp3p|UXhTj9mRxDIw7$+^N!-VZufGrc+E1r%^#jB4T1Af^WMZ@# zZx(@ibDX8nOimue>h2QT3zX|TUi(MJ*af09y=vdhEJb!Z);bbC5yR}5<%V17Bk1$n zB0-pm?kJo$>dHlHcJiT+T^!pd<;Z*)Mf!u1t0LO@VG_Y%=Z|A!8rl1?Ity)yn(Tp7 zQS0}|Rl2`6Q8YB)$!N)RiTX>yDlG`r*-zou^)tLhyMue>uoedN>UQ~Pg7{T^PnKVf zS#mvT6Q9dEUbxZ?>fb0_A%?egrCy^4dUDN%w91Uk-kp-UUKaZD8~dZ5?;!VId{11UEr=dopgowsmkn7brvP@@;@0t1 zZ%EBmdx9?moL?a41NtSh2`|e?&R*^@NGVnV9j4-Of4(=svN4ZIc+m; zH8UV2qO*_EYrfN)Jt;&c#cCMDkBkrCJAM_VqqZN!<>}+Vk;c*L4Vv$$f(#~R?Hz7sKJoL()z96r(#$@_}!xz79`lv0M6&L}L_gj27;F@p&>6zKTd|6oxGQUcKH zZvW{#7|GAb3jTh6U`^`V@Bn@WUNRYFg-;xHNb;WZ}Q|=a$c!ES$h! zc^hZ;;^tRz=R0moZ*w+DD&OE5jftnv2+V=BJi!+6XuD@Cj$>CwD3K)kJ`!RVjA;*< zHnE2YOtT*iAaDYMH<4LC(}--}T_2J~b^z;DJHByPeu@tjpjnz71yIeh6i|Dzi(7D} za0TX_nrLpNOs_qPzcsdrptWmZI&sEhnCA&mB!nU(`<5&Ss^=mmbDEvn;L zk_3B^teAjM;>P9r58UHC5?I(q&`8mh9uqCadU#bPWv_G;R3d zLu0Anbz@rFMmd4t*sd^TKY}su(nqMnm8PXZGaa(6nT;&Hr6#HWV^Up}mrKS&U{xTr z)-s?}b@a_^JnPT4wE0zCC#t`i!NfsYaT?)8>cK$!^Fi0t{EO?w5=xaT^I^T?SG5k% z=X7VHxf$4kJL15+{;e~J**fKq-k`dF>J35xt78%#KI;Y$WbUX~M?k5V{d5gM2D00K zXfOgE^8am?{r}f4G;lGF8I*5|nFwTAXkuz5B4(h{{)`$|Pft&~_~DEZYCk)-gq#aS zhBFwO6GuGOWFfHqd6*im$xJu#T4TC0;knV|7?G!R#cNSzz0pZhq?i5`F*YZTARoUechixx|8QApbDX0=VK!ulWw%c@t6llAdJ9xT_%1D}Vb00OdVBhU6QflO31TIA9h^|7zm8rjTG6OEkQ39IN~7GEW5=nrY#%Kb6HA#r>O4k$PRzG zF=^6C>k$IMgZ6EispX(*=A8>dQovFz1j5)U6gI8FO>9a;h8IvP_yYrn6a0vjX!e`( z!N!IPHR1Boe7$3FS_UgcNO9xi560bFC1@#-T>;R=Bd8qK0sS-P&oNaxqIzD;Ml3aV zJ{b2l2M#QFdb`|6wzRfx7t(RLGs#`&M+I)z{jy6C2R>gn1(?%gKzK~DUGK@K2Ra?y z*u8&gU&ls#@;_X`RXI=gybV(S~BEW56}a~$wnwT=*}!sU~{AbGUa${>PAO) zkG+0%Y{`LF(T4UeYQ_Vj@qEDh|03)wqoQoXbw9g6q#Kl>5$P_GfuUiBl9rMZaOg%w zVt^SKx)Fx%?ob*eq(ee_0O^i_^Sbxi=j?Nq-#&j>{F?Q~^FGggUv>Xly^svEbudE8 zPR!g^(a7dhXWMixt6RGDVmoTv>pEp(4+$B6GZ~MleY-;I{)^c7B}`GTYtw7jffWIA 
zN;R61$cgaUzdz@Dru)w5x<0rJ?7e;MOAqQ&0ehSB_LaIENDf|<9M1JU2t89b)D6_= zdc1Tdg}W=#MPPQ+md48qzg(u2IA;a+zWB>$DE?knt~JI~pZz#F;W^jqARwnlAH3Ti z#fF*ZIp-wqV@_kA*x&KJiwlm=QIr`+4OVDFcmVjlI^TxF*+O7yY z1$2%}j5AVYlOiA}$XIitD^7UB`}xk`@gWazD({bYO1p|k6pUz5lbDqD?+Th;rktOU^bv4<-{M+GcYde~78h}(Rr}{bI^G8IF z*<1oN?R?thPkgQ}THiKU4LGx)m=(<9$2ms!suWV5`3V&jW#)0(4b$_54t$S6H5>IW zc5~irGA!15@F30MBz230AGIg?cd;d{r_yaXSdtYFT`uAsx!1V)~xEw>Zffg`2{wn z)4}D8&qM@ht^Ik?PobQIXf!!+mb+~>MdsV*WinIyYv*MxItynGs59^e;XzhQ34>8c z87DJPZ^$F4mJvH(7temIs{F38JMBJ5Y(TLov%*sK84sJRO$lDx8xIHHZ3WF}GK}sy zno+=8Q#)Fro-=2}8ih`j>~3`Z5n=Ln;in?ay5+?8y5#U#h*i`yo*L;!F$JN1g&sgn)gv!qOLX1TacGEw)$IxPm9*2L)KhmBsxksRTKxE|~E)xoF&4OPMb`m?m@wdTUIShhKkD&0LQb>nq z{nUyiNe8r{FKhf<@ei(@m;c1BUo_Aj-0L9=pxW%u@b|3Sz>Et0!k`;ml7M0#K&q>= z^(sfTzCbCwR@Kav8{ezGGB+Sv91iTgR3Ox0A*!K9lPE6Ak8IeBM1;l(()%mjF3Z7f zZUnrpF#R9h3wZb8na~>oS{A~!UercD$JQYH{O_R_TfKoM7#4ZDO~E(w!L#FekmOdPeGEV9IXav?EiyG8cqqdNdFKi?eLoGqj0P7sTyyd{g|ihwIH{bb?$2?uuo;^L*mUgaH_$j-4obMTV_f=_xzj znG7Spy3_~V-SIsgp%8JsIgl)h+VNQ#kss`9eM9KMem~^VzPa}!r1)!R9Ovu0bLkPG z7#;1rjX*s507Lz`4mkzu+J&mBctN)Z#SIn$+uplo!A~eHM&$2WlR)mu)LaAGndaQ+ z#Q2+opC!F6d+aFMnNV%?%OOWQ1`^SXb8;~`uw$$eXZ=AK8Mr# z?Orem1YtlWVo$VvjkV-i7)4`Xauy$zWg42kvwy1Ihrc+55%la zcR9SDBD@mr#A9=nB2O)HtBm2FdfRkX6#stKb1#;M$0cp+Dt;deiaNp1ra>!^TNq1# z6L1XE^)4Ya6Qm^Gk%N!&!8Lu3?}vojOK*>_{ZMC}Kp3!8M!p_bAs)y4$qgdC2#oq51ru#`kmBU5$!J^edm5}siZL)Bq8d=Lcx2aj< za|7FXt@d2Jy-~CM(%+BZnHf3t<*lPB9$Qm6>M+fxn*k;iEJIIS5ye)q!T6-I>`s45 z(1X~SHyOU-i@BO)xSRnat9ToJEj0LXOdLsqe6*L;W$^RS`KO_WBfCH8{WQ!+oaE$u z7U!>)yg3sZlZhu;7P0hd*PY#;yrd3;3GY*l9j0*kC=cJN=XRg8Qhs8Ri&mzJadjVb z-E%e}5)iyYe|i`f!Ssps{ptZq9IJ#O)Ijd7muh9^Wb_+JpBaAtY5{ z!rzMno;f7viH{ZVa-V-3O|l#s(a(=|9wPG5j!5&*&82Ws$sZkJZEkHe&{(qiULS`R zFbxBfBc&D3-#Bf>VFj3mUVYb|c~KQ^FEDW?NElB&?@77SbXI;pYfuqCBOJtJ7vW8R zLyH0Bd%RT=4^OLlUl*Jeu4Jz9{OYiv>%C^P6xFh4nxJL6x1SZVj2r8nUs51|{zQK| zO5We7jK*SEe9u!31W@+%n>Wyg6Q(2jCGjF28G}^jQvHh}gQ=cX(uaZ}gLp{O@N;K~HzPq>hzXyLX|hl|CUYyYchq$qp+Vftv+k{ck?1YP*(j?HCNRO86`9f09>aI@MR(ta;&3!{^IVw 
zYZG%A2VqKbKxQHQeTcYC9*2y7*tcB|w>Q{BYHs(|SA>d0vlc^u5Z$`{jh)ORcAFK* zofsG9s`Q)&O!+Xkcj@Si8R%a=ALT!v&zHixo*X6C+*K$|8d<=;V6Ocz8`-l#Y6HBo z;ar1ZL)>26wLj3X^n7xPMwljuo23#$x(K#nI(`{4TO`Y|TYV1CP97P3&yIremWMt0 zA^(X2VZh482ILBj%vy^U`r)SC!mNJCZht3*u*(kwN56KT?T>Fb6eVQmQx+}s`6*=9 zwOYgD_06&H>Q@doeEJ=YSSe6kd}H9v?n@4udQ|f5qoID8TbCCF%f5M?JTs|rqTFD^ z-H2CPz^8E(rfF~Idm(Tsn~~9}oM=CM)oF8faWmOT&Jt~Qv zd0%k||9vJ88U(*>M+bdAY61Yr%1Cz8lbxAQ^ODCZ3eR5KwnyxbVI^ZYysGU;>h#~- zo=pt*A#0Oej_EnBP`lz1Y&l|p-NPr9c7F6$F+`!r%=OL5#p&8~4wBIT-i15ewQLEr zBVwyuK$4d!EYByZdEapJ+Z|y@6m3V#b>o&>nps~RMD% zLEuDlSMWG@A_0IN%$$#v5eQxb&{zlfC`{)CO%0WZj;}lIFtZ^?SR7?`Vsml!FX4xe z3yIl;W=rXLPTqcobUa#8jTkI@ObTD7^!gV+>erV{fIKNv;MO0Y?&`PcsF1?mJH$(c zknqm_M=Xn=nt`1$;nxTZERWjx)$)&d-)pBRX2lT&>-t)S5g@y)0$+fL5VMZqk7R|~ zOyfmHLj|;ACXctJkiv;I?*il##FV2WLNFN>IeVjA%?`_-#<6GT;dDal!=512)yn3z ze+&^iA|l>&W6ugaGH?D(7sSR-o0`2M>hk*wA*5`d%Ft?jHB!^XKkKUZR@!N!2z^mM z=Ftx-o~2pf$hJl%3K?H=2=?B@0Z0q*7bDk)jFdgNpZ9Hadib4rgxq^8&KNjDxMnx2 z5-CKGP?11sbkLm8LeF2Z3ybs<_qX)L-FhDF?3j3NyRs2f{F;VI-t#4L?)+;`3oF~h zpafCTYws({gQ+grQQSqiFGr+(GveilvzyqwJ`2&C#~U{6P<=x-v|X8hN=B>l6sQ z5vvrc1ffBRBaSG^_vo;^W}^}pyCjPjIB8T?RYY-vhk)cYy;S?Lii2U9FihkE>nHJ+ zVT`|gXL$j~fHeL#&EogbOY)MEaHK$1kc&(g=W}^fBZbbb;$fyzu;sVzFTBG*1yp|; z*>Z{F5+n60bDZ&ux#{AAfI0xMSm@=4a61moQb^j80%imLd6sN@m8|i*`B8cTJ@Nxo zT#v5drLxW{n`AbNxxc>t6h1yC&IJNB@`%Yy#y3jji&7RMo4I>3n+rK4I0ZaH!Wva+ z_^A*HW@3REBW|pJ7OkdEBq+=;Ji~wIE87v~?d&x^UTQ4`JWmFHDR$a%lq7}Uph{c2 zH#je{7?4ef@f`R0g@6bi{L8X^IZp)eX49oC)SIrU?Zswya&1ZA<^(d1ukf_F(mp}7 zH%ppI8g&sM6`g9G;l`%JIF@Cd{e{}2IRcY!`_w9mJUGUptl{aIFfud+RsRz@!jI*P zuil}j*{R^pIJ=Xi`%Eg-04QY(i;}>#NPV5N+et9a^3%wGRhIg=E#_8L-5FG1{G8JX zjni}?f*+3UTvG`t-A0DoQ;BkOpI1d|%*!|`*5_4WtY;j<=>-?%H5vm7oYC_6&S+Y` zRjzL`k#tHUO|o*XX&WV6?db9`Cmq3@%+8^;s{y_j@P;T9xXAe3BgLB86S+!erlY~nq6COyqaZod}*t42FUyya1!Dr zc^ixYU)=KRl@YwdqZYQA$0~`V*}!Sfa}*<}@UQ~+Q!o)eH0rmK)!_~plLQg3lm@!N zm`2q*v%L*cZ-yVdcp!ZgInTi=3v{Av|w0RecmkBK12X^ zyt-fZpojU$yy4KN)*wf3>0MPY2q(FooT@7Ia|9sGY#xS#98PS7D{!(0dm0Y7VgAZl 
zaide{RjYh|OxCyis6WkcCu6S>;m&?q&VOiK1nxk}PR{7RC~n9HguXU79HIvq+CTyz zU6h}TtKU`%j3R7_%?JzZa=hIS3U^Sxc zSW%&owdcInCC>cL&-85bt@uC4ZCDwmW?b^Qt1W*F!Dulg@-s~@Zfa9XXAV6cyj<;? z#9iGLPI>ITNv@W3=a-u-$_f|hTm)xa-wX_=vfWt4E*|S4KD+CIf4oD`Y&=KJ5lTDC zou3?a`3C84pM6Q7v50)QgZ((`zIctRA!MUMRI1ysS*N5ph?mRx=Fxs+)va|=^5f$T zl4q}~W9}bk&8s7!j{C87dlM)ApwN&`7PfA|Laf@rDCB^zp4f3MYvG>uhyfk zkJA`(==f=zrHw}tn@-o}5$4pxvNT8?g z)%rwuf+(c^`ujDGj}Wp6K5l$te`1?2ssM>&C-maQXG+YYkiw0(BqVAlGM72!ivr0X z6Q@GUiDMpl^Eb}wrY3g>$@hZNyoue!PahD&Gj}d9_gAH^&x0aYRl%WsaqkJ+Fct%b z6&tE`bM%5}m^N_0uDuDtY+Qo6XOG1PwmoQFb$_Q-ofpVcGr6=dTBHa%`D_T}H?D}m zG)5D0<>~^P^m75X=}B7KRnY+DM4w0p>tUz z1QQxq)%dOEKkT!He%H)4gESG-uVu=Xqonw(*TwVjT&}G8B@#fCdH=IX>J=*~AY6n+}5Y}Vdmji_s0#O{LKfi?jV4#7G z++7$kK8r_=d~P=fQTznUR|iMf+1rq?5rjvv$NAk(&IXzsjV)jnaCju-Oy za(^U+RAsI&H_T$_{ft+^BxJsaJmz63mcvx+W;uXuV5;gXtb2Jh>@me6<``H?v#yB zIkVRsA@}2cU`2Vnx1LRY?sFx6_gft(XDDgVe z<%4dcbF&d6mP(&B%AZTp^AYmbznqn&QHX~59z!*h)>a$n1$UaFg{cJJW@P@-`xG#o z<|9@Uug6WrRtC1YGT=CJtt3-dR)&Op8K|fR3WJdJlX(vr26v;*pIkUnA^GKQ0254* ze}eo0)pdKbc9DIo{K~OB^NE28&{Xhc85-P~PWVDm+*P0>V*LDv zWiz|unk4a!Od^vCaIsbUlrW{61Ds!Pd4U%l>?ZzY(dq{i2&D<0J^Iz9avL$;F?^t? 
zs?wIRscqVWUt_vVp+HVS%enWEoGgV6B|6wz91c_i*&Lb+C&1t{;xOrAB~_I;QHw0t64p;;t8BX++HA1-9jFT0hnlyNnDTc?8_!z# zB2h3K3x()JqicMZDq*;L34*9FpS1?C0j!_w0A$1Hq!NTi0-&apMyFr(MdwzUU^=_k zaAV#rSdXImi)YF)n!yUgen}Gyo`4$C$>l-*09_|q$j^Vev5kWNh$<}Jr{ek5pa9$K zhN2@h!R<$;vr;crQ2IzQu>Q-+PwCGEP>SIWvM4Yu>GlPSDW9C_;t_p}RlGGr@bE%3 zi4AF;7gXQ-;n0J1$7ZJp$+y+YusUC{EGUrb!il2lJzz3bxr$Y2*TVoIhkheD5dt1{ z!CepJz~eC~u~d3HAYMUAKzIvQe8DGBF3NZ8&G#A5jMdF_OX{N}SRU?f&(RQu+N+V5 zt#&5VTI?&<2 z^#w{^k0(Zq$({P zVIu(gy17NLsRP}tR{|vsX}MbP3pUStHNJ~}?Xh-?s6%LOv~yU*&-wZVVYtBQj8HRI zq$>(s91|*ht`*)fgpj%D6S!*SYiR>oSZ?FYmBYGFW&SBT1(`ZSb4+&JfvT3<bOI%NHx8!JSka<#J4IeuAT2`z22`hf=BII4kL{joQic6kg@oAW!0`)h z1cO#qlEKh9ww4ggy{s2XwzdUe$nQ2OMttME%!y%=_b0xm#(g(y<>)+?0|v$cABA3x z{KyY$y&5+rT=IAt8>gu4d7h)}$fv3vkzA{WtJpCLG3)xwIgXAMAX)Z)BI?~1y&uww z@+S$Qhn|?6gXmSx07AQC@cMo5at(9An)5u^0rr+6~R!6ouw~VRp+*Kpqy2xN@jIUSfI(wiwUQO(Pe5k zTo8}*IsWPO@05GV29_Qr*J2i4CDVpSMn(4evF3Fr4DZaivv1#|cm@9u@zPP@S-{MU zfxV+f@y92xUcpP*?z+8GdJ#FsRA5`BS5$psls)J(UqKZ!#Rn=nRvy~E_8;mKv0N2f z6JHanxbhin+$n(Cz8$g=jb}y4&B6?%|E{Ne@Qiz*0}aEMhq|TE6pO%bQ<-E6*xbE$ zj`z#sP6xw{Y$e7%o4)vpuMt!!cKve8t(%dG_GeX_Oo^79t6aJ!!S}b;R21M@M{4{T za3VCa-Ji#AK2_OB^@fZdCdqA8(^iMEav6TC+H@fN7RTk{bQj55 zYrok*7u!y}A9ryv8AbWBE42%gHo zo{EOu+y!zOc%H0JDbuWbX<@6Y_<#M{-V$bB%GS3rvEGm&b!2yc+i%Qx@|&v8a*g^V zE8E?Bt!izEb;2_M9Jd~y_+tH6C2dqaSl{?O21@YmYH`JV9wKQslcsW%gY?QNUd@Da zYE{O@s>OJ|9$^fys5;!_-=HvLIckHA42MnYpHr$x=biZIcJL#>yFGBh9$|O#5{YZP zam9^Brh?)QaY|7A<@UG^RKzD6jF}CLnXrrAc|eY$pAHf-It|F|*yPE`6P!K!5;nNr z-NV6+(z!UL_~&5(bK7zno!3WrSj$l+0zb z`IVS@N^ZHt}2JwAl2!O ztC5OwJnIao%HxE-akHL&F5cmxyhf_nt040h(W%tLSbJ|~p_U(5u0u734k(twpsiK9 zcGCkZV&(HxVkg3U=w!Y9{H0&oGdO>~G}A|Pr;^dVtT%PzvKxeg)-WacspVyqa`naM zCjIjrD?7r+K5~_D+cFp9KGTdYX~2jEW4dsNjb7FL@0v@(j{E!g&(fc}%(t~dOQb&< zkT<>68#Nrs@K|xut*hHlc$fLW(yq5r&1TEJ$==L(S|>5=9@2HVGZ=JBrrnPLf%$RI zS^Q9GHa)geRAwOmYXWnIHt6i|>IYf9i#T5FA0 ze8=TkXWi^S2FE6gW9DC=L#d`-t80ebL?jNcIF>fBi z_#EG;k)jE+pfrZ5!m!W1V=Q4o7ouUB9}>(t83BOMkShLO63RE_@n^0e!HeOOWhA-89f<{`N6|_Z)H33;5}9( 
zxP8cM?7@XP-{g0%!zRxwcF)7)S8MqhDlCUpfl=jnDVFNB>jF?Y&n=jS2L2U$+9BV$ z2dV5du!;htLw>t`w%9nk!bjEnwLDvJmI_pk-pra%4A%)aycl{LMw}3#4b8<_M~A2s zHyE)1Q(W!_245Qha-*vKjd)5b{!Ge3VB0I?nywo zsKwBFXf(c=kK4p;#5%`L8|H&$a4N^_x>}D>;=aRJKf*wUs3s1jIx881?XgLAx zh^pDO>Jlz71lkobx0II$D(7*g#|n>|xyctjUAr@X!?W)JF{HKq*SZTOT2wUKZr{+% z4Mo#Hr(YS{w?5Bova)?^Km7u1Rkxa>g+s2^5e)(+i&n|< zQMxpS>v>Qbq%@5o+j2!3kU7fo(l`0 zXq|AyzXQ()eVEAi7M`ZMKmV?W_fJaXyr1toi8HLHxbK>cQYV?%v%eFx=1ec)(S#gLVCdd{$RsnX>n}r zaYjAIKQ#^pv&!8B8Z=C+PqELg8ruGeOXQ% z+yFSCozr#bWL^;5yrFU(c{6A>u<^HZG}&@kD0v6e9LP{aPqjNf>VP@}=95(*ccriH zb4ix)Ll#rqf4rn4Oo+r^1Kb$JKGueU(ncTdp5b78Pw(T5Lx>a=K58sO^GA4I0L};W z9+XD*E%A3w637r>=}R~$)1$Ey+v@trET!spx8KcbX!@w)cdcDBEoq&gc{bJjHrdg; zIeg%ikaJAH{@j>I>f~SZy1tG{KiytrjeX6EVgTYpLi(6v^?~IJ{L8xr2;&Esyg56d z!i1Y~ECs@%u#qskd$xNsVh|*K{pW7Uy6hvln3fUBibtEN6T$yG6ZlIlRi zA23`2*MR~w+yCcPsmZ~Y<67voj1ZCW)eh__N{Q>XL%Atl8aoI*-kSYbe5@@)#i{RftCu~S zLBhv-X;X0$I5Bd7=$`+3nE}_HDfQ1fy(6Zmz+wsD@Zt zuwz!3%VfNn8vAFG@N1W)j5t>K=*TQ0aN0&3*W}lJJYoFAWEtXW^ej2g8oO@wv48t( z;hECzddXG5vvB@>);YV3P>{v;QEl15*2L~OM@#3ikdd69+P(eI-rX#zYeDUz2Ewxd zroGO`AsB2o8>$aTa!7D`V5|fuQEfj5(lIs@{_TS0UH>c${o~a3f`RcHj{83vmIHx?!^5;vwd4?j-*R3` z#FD#+!KHeY8nbW1T1%pX9A~>=GDy=bZEk@GLgHujmyPZlfDblspqi{U=pH&ID!k-! zOv>~KUnm{OC|3+rRhEik1!O_^xAbbjFj$rqj)zTZb=!m6@LjFr5X@}Pv? 
z8gRnNrBwgpT`Yat^dd9(&{cTEM6ExfjZ{94b)?!imbRw92E#BB*f*-wC_?q&`-~tN zoP?x~X4hlEMPY-d?rNY?c{0V?5j8Xt6uXl7W=xa1IA#0mjA>q+V9#`XJ9N^QQ1XaXK_;B#>rDV>h*UzicTax>2G^XeriG$g(9A#HdNEvo454$ z4o|KhNG4DK6DtxWOB} zX_#bI_3S?kg{o_s24_YqQ4)vdF}}R7bIqVb?7Q`n( zEr{ks>kT77q$W4$XKu~D$9L}WaQOx*5qQNTz(Kw~6(Ik8+=(N4=b;D}I(PxO$oe6t z+Dm=Rw|QVtHtcg5#cTJ#^8v^0Xv+oh(eTkegG>uj3hxnfy!BHB1;S1n8x_6sV2}hT zyer70M#qNN(|rB1ZtbnbQ-}?3`Co${Bff%}fg0f3n9o7X+OCe@E?Xv|bkhB2llx-9 zJpaZzwq*L=x!2yo_2uKM&9s&P-AA#Db0>bvHrw9g|Md|0d$CQ1KTB6uj7|4KY=H(Y zb~JJNBj~~nQ&aHY9^)|ft6ija`9`-_e8rmwz3e$=*Qgpik7eg<(}C8k+N*4__ZPS5 zBZ42Wyc35XT8SdBk4CyRzK zqTemDvJP<>=}2u5G=^k@95S{h1#jbw+HPe|Zgr=@@q1T|!zpdIW!{?90o`G^C@}R0 zh_yfrkFa9Kqr`r=?+Q@L?DU>mORF;IThk#_#l?694fq}vkgHM)sMdb^dx6DF1f-k54OGrJ#V#f3jr^{3FUR7f zo<$Av!M`^VEG!vc))RI-i8eO9See+Cx?a#Y-JbWkga1D2l@vvH zKq0oodP!72fCIyVbc7L?e(O*7eB5|&S5z;lL=T?1#PMq)+UBDZx#38zwT(cMB;<6g zxmmX>mFMd6$|-G(To6}i2&<4qe0kubeNs>&qjF*y*K=%jf!}oEm`2QmO1+*U*b4#Q zq?@k52NaM%a@u#igfCD?KYmrGhplJZiO=asJWK>%r0!0fmtt%qS1O>GQVa@#V z6g-->3M3m8d{wt3ZY(#{c&V!IMb92L_j%Z3B2~pl_@O|gEX~=9*9q8=`k1Dm%i$zB zx@BptC;Yz^UjIB$ehL^$mulmwpvzQrf{Y#WzDF8F&`Ra@p*Hv5 z8ZBIlteXadEth*IyD8U4mYw@D^k@8%xELv^8criF=5pcT%`g4(9~lRW`j3cKjz|E- z`AUHv&6(_ZWp%ab`}e!a;)+qxHg$D%-Mzh^no*;TMqGFA-u*E&1mCVWpZoObCJC9M zUNP%U)ozdKY{Sc(iGMDf?`W%-(;4E)mnYxs}c59CW(l*P9Efo6<;<|cNUr+-oFnz#YQb&4cgshD- zZ*w}|HDje|zIVBWd}Ko66Y0){jDmEmrRZ6j=A!)Sxjw218MPzM*>#-=Z?sSnOgG=Q zL3s9nb#R3I^m8HKR+qYys;CYe&U3+961cG=47$t<1+&sZ3c}I=kP@tYW~yxHLl8T6 zq^m%{$}rBj!K&MiR%;*n7KNOwza~kYa(w+dHSR=BvYsw{zRvUc{jKtv)w;b`(n2dQb99HX&1t3zd4YezMnejZ>-3O6`@Qzrd5c zfcOcZnvA9(ES18d`hE=jgksQksIf8q)mhp#N&ESlFS< zR@a<#b0(F(D6+aoOr4 z@&&{Wm;dCLxU(}d=|iN#!he{J@;2n)~uJ$A9-YVA^Pn9C1UU;&ZrpDADex{B}u&j#FX)jpKd%uD3Mk1`BD!w~Dz1hmDoF3Pj`wp^}oElpXbNwD}i z55HdN=d79eBpcdi5s3O;UoYBtx&OU$O$I8{yB>ah8ot-TnUdI(~ z9?p%hN>+@|&Dqa*4N9EQ7%U%+!PHO ztQbB#Kw^vHt#Dkfxd6GmO~r-#b!hlqu&_R7iyPHx2@SuT)9I?4DDz8R#4?kR#D>w% z2uq>2|8X3FsiTuV;RSiv zGY!d^J9DU{_rMZ)`>k(U#yRSq2dbYY-_8>bd)q7t-=5;{(&^^iB8{(I{ZMC@LpPZx+-|$CvEN&)0 
z-6YK}xige)C=8vh<03d*GR*_v-t2S*)mJQuSy#|}Fk_MLA+=n)p651p2MD(xQRoDMER^8tg`Jv9+Y#*6~em)P}<597k;JsN)DBB8NM zKwU~g;IoW?@3<7wssmWWdb~;dPkS)~yrE5AoKPI{p8W()T^57P_vyaxsF5K z(}>LcB$$Yy@B5Tbd}Cc@D;+?=xjDGh*+9>0^K{m|>lWE>mv@HhG=@Hl^0p~f?X3kR z%^`DhMoXV;eV!rdUbpy4toA2air(jcs5)Y1H(O6Se;x~tlrPFuzi1_Ielz;I=X-zD z{Umg7f7RMh#HXEFE;{BMCXx+3zY=Lq^I%ZTT-|Iw| zP}UmIvmCtk?#e;hEfv8vH8lL)<~;6Q?-E?U$qak+@%*=?RA`Bh3T#bQL2F`a3Ne4V zDkPU!KVj@Qdurw3QHM^1?HD(mU<7b|X}GDh^sKCvnWhr!jqSPFhE!i5s96cDXDi zF+N#xYuFdQ$H|I zBPa3kO;QLgEp6vos_(fq9sxm`*Zi#$si(xL{5Aj(iGC1taHBDC8l0T*u zHE#q736aOd#Qe}>V~;4b0boeRrG)%}*?B~BtcLLJrUM4ITLA0Iy}9yz=jB2r6VU1^ zu$Rwb#hYb6D>A>GJAGE@iM1vA7G)y3m-Y_mByz{Ly&tYP&j=OHpAa=ah#IW1jk|#}N z@Flr-lchIo#|wG3aaL&r^FVFEKRNY*&np$ORcx$8i=pjKXntt%Xgu9&RY1c@9K(or)T ztmd?`wFdi|@FWjt@(Yb-m@8&=XA0C_ED)F-e`4zH>8}AM$!~sI#BLBnj7rGI#>Ujq zOXPdGoE#jFfK!x}m6h8FLUrdw+#{+wb$xv=pq(&^Q8`z)HV@^7Z@D%g2RA;~sG8b3 zDe|kz$=!VmR8OnHcQF3ReM|4arfY5@4B`?JrfT zuQWiUCe|I_APLp%P2sg1y;Da7=NA^Xnvt;_Nd7pdmY#i*D)(@sDTgrcZU539#n7v* zt*yYzp%bgmk#iS(mIGiY6g|~7P&6{ArKR<4bFX#ngmP0!QSpmQB)_Bj&PrLO5(1QF z`wwo*rE}7H@;aZp*q@VA=C(Q8Osf=Y*RLl~{)75^(B@M)8TsjC>+B!^

Gj_$xgi z|8yG&xqbeUF4~hyGL68i#sRQx{d|A;)gF%kUml5k3d_2G-fF9Db~b%f`?epyn~Obm zz9LDDLhn5*VMDy~Vr~Hj-=X=EgT}{!;M~h=x4TQy|IT-9Ch%mIlDe9`-a~drjfjQd zYnO20r;+9|8SO)2B~4GEpGwlRH(0m54l%Vy%_pJCPNr6+^mYs%{f?q`zE7?TZvWKU z6f~LctAjI_ZT$k0ei^cM-iKl{R-Y$kWXSY&4%YQBrzDwr+o#mPkKu^8T6%cn(wC-y*fr{8S8v->Hja%$U*t={iR+Mtt zd#1m4((NgL8Bn%cKqB7Oy*u80JLl#zwNdY!WLXBM~D&1rKY zNN-b3gU0gg7c@g@X=a~1wN*`0&fCzH6`+Dr6@o72Rv0os>&3!&U@+~Ei)^HFw}Z2z zof`s&rZoxDz}_p}O-VZHpTqhJu`H&0Q4KARZV)P3>z=)TyZkjF)goCxM5UQtIpYH`iNoEd z;hxr3>u@2P88FTC?R)J`e+k%p?x4IvTq4_$ZR->|@j(N4;i0+4!1O&p=2 zFc0(nDXtPeUUL`>c8lqio`L)0C}3Ps`+<@tV-s0~h)TodhJ>C??jsRhTp|FU4}lS& z63B2xk59fCqksUuDgtj*{w1wg{lJndC857-jtm1jlTIZ$z=5G14bQ@R@Fk-o;KQI< zW~PQN30u|c(=io*yk5AWXpWV?JzQ$A_8~Y7Pv#S)^~now<4M~;xEbb=N6#s=(>IOU zx324V-g@VWLx{@olA+vJrjDTi?z6}WYI7inB2!t zvVz>v!Sy^v5xKEpc070y7%byrLkw^?J7C~{NX%&wfIu&aA7zq3blmNM-t7rQr(5`t zCj|54f#knzv!)LKUf>LO1JRB)KH4m4>7b^RdKG*4(iI`@DB2m<31F-HZ!-QyL)lHJ zKgkij>0ZB_n4f7h7DIB64x5&zy z^x8fhSi?X>eljrmPb6-)iUhI%E;P2Mj|W-@E0ssp^!XhQ)%1M6zlj(w@-8M87n0(F z$j<(S0w#_GI7l>91}uY=yK;CrmWv@lhz@%~mH<~jdj0=l?$QBbji@Hw((|2cg&?*I1h1T13&`1axp)Ol7X^g7~E+l!Jt z4x>F;pe&jxbS7D|41a|G|DhxI%}F5Vx8kJZ?)lt$;e15!zu0@rsH)fQZB!Jcq*1y- zX`~k*9g7f<6r_}HS<)>^!&;PpAR!^$9Rf;82nZ~?L8U=jK;XUCezyDF&-38^pZA;( z=X^M0tPg827{B$qYu?wq=A73gmvF|4=49`d>Vll>Gb2gUeq)gJP|G*W@A{^`jJTKD z$I;7MA(|yKLJQ;OJ?{32_MwmMp}Gz;<|2HV z()oi!O*-9eYJ5B02sMdX2ZKEjsRYukzmHS`6=uusizChobmOQI*pGV=m4L3cBXmF= zW`X7Ekudq#I~Y&un0IR}=;tQa0m9;mDjYTUK>!b^rv7^X59na`zq~k(8wJ9|Zy-ts z>}CTXV5^7zG-4V&0^`CMpa8-kM#;@lca=&}MIOE1iStu0zwi1}UtHtgLQsp)p+`y}$QAORL9VY|pzWmprB!{e``V)jA8j~h=hNKjcb;4$oGIiFzwljr zG2BxCULEmHIYJzWIt|=gCXUR8;5h`>+&Cv?Yf>@q}S1`4?@idq2h#c7;1thYiFA77Cny*Sb$&d zEJoApR5d>DME#qYCqvErHw}3SLa?0PCRHf=s`uy2PZS)!aoEDio!z~RZNd3*2c!9U z;d5*VpA(e&=P*i<_rOQX{)ip~06E_64gn{MI1@-yT6r(zejO8B_c|z<-?W96$x;@b z&Iv`Q>fa5kD1HOsZEyJ}-yQ}{eMPMSTE>ckGzOsi{>__+8Sxy%0f~GTcdyegAGWfp z%cWT2WjM?#=T086ll1AqwekIM69eNEdOuORd{<;eL`Wt+i)QOxqmY#B8?bPr6JR9$ 
zs72Lyw#9%rE?b`AHd&cYqjXUP3j`$JrO=tbrtuzliS3BcnKF^0_=+t*Kg~9E_ApX| z$TbPj|B)&#VxbT;o&INJqdpaV+aefse`P# zI9Q;5`kWwo1#R6l&UDrGQmNEwEg+DE>_*L>9>D zd8w7a=!zi&BN440@XUsU$zy=!Sc(M^HMXa@U)vh2;({%69~ek}4q+=7)+GGyF<{tu zg<095F27Cv{gS`e8-0z9#*pgO>1Erzcm=X%q-8f@9*<_c+T;79pj9=^MdMhApLpDWFM_Rkl{04 ztPp_!J>29UaSwnb5NFU#;+yvZE8=HNlB(Q5SB-C&r=p_JXJ;yFjaC-1=1(CM+sYFn zX|hqZBDU()3p1c;8?K9hVt%7cuIFy^XQyK^#$y&h>m-v42T4+_=Dq;#BkKV_NBo4! zJ#8V+yWqV&f=@nA8o-BQx=VdWoiIZblzO>jJI1#cX_ng@*I4P=gUY|w0M=0clfcFN zRFpGt8DchL+P(ppXwtf`(Fk&VZi8f|zT2q_R<$`druU(+<#s>m(NLji^}D1)B|ewh zD|D1RYXteJJ#czYWHA-?RF75<4MFFluRjEnCl2`dH@x(|L|u6sE|_7`bt@Fo&2O{C zPdmYov0lotWZHFd2J(~sn04IB(Rt*7<^ATvt9wFJvh&YXT7Jlr^c#gDG`$8mq803e zjr_#~sJ|A_fOUWTN&7cLutN=DBdl%N9Si~VoSOzi!1!|rxixv%`zOZIy$6=_lr-x!*ARN;JU6`q!9{a zucxux{76;pj_6mbC3=B)`D3d>P)xOI4GFr^d~Cd&J3%ichi9$JNw)thKAh@x1fkV4mI37u? z?-z12v`Mj-^Fu!L4E^d*0v%%s5>i{8p8{#LWx~~)PZoF(48dMa$57uNi7Y+;XdKIL zt@>%=2V_M2V2MF!y_~ZhQj=M7H;@>xdfvro9QB9c5fI*7gP;ANYunPwm9euI+@=bv za^?@X+V*KrL`hWjO=R7z)q0^bZJTMnS5li?Nw{@ChGd3sKMlPY7RR8JVKt0yuHlD) z)|A-^LyX`6fDWL49DoV6&6endb6OGAcj4<{M;&c`K*I(y$P0fl#P6!Y5SL&5^OA=q zsWFJW^{NwmG(LA*KN)xstxFmL9js16ehxOjso93?X*i^tn-8y8MI&l>{l^Uo*K>+Mr;_53%vkWBTOa*QvhKgG zo96+AkwR~*S2Z;3Ft1&K9Iv|F-=8=nhnb0AhgX6Ak||YN^M-H;(NQ)bbX5FIsN+Z3 zxaQYgG=6Fy9Iv!9r&_N`@SC1tnJk5Mzgw6A}!)(X`j_ z2!z5;*guB3#v~9qtJuSJ=S%d6lgQ5GR>ncm`Lo_=IP>fCXN1sOt*D-cIr9*_ySD`p z2z$Q4p(6J^HTAn*%!9nU^Dg)qE8lOfpRc5W!}F?qK{f0jO)V<_jsTN^fXc%vpDt5T zp0YufW_+U>whf?5MC8A_D{6P!qYFx|_-LekYc%$wq=p*HJp6F|o}MQ$HXG@q+GgU7 zoP+eWw%3^MhAhUXO$ChF(|U#WF+n9Gx&3>#!oEV-OeXnSG_ID{>iejqgY9THbA!Y< zX_Y}lvGK$Sbcfed#nw@&nx9y~`@~5zJDaPXVtkdIiy3iEl+w|nf4Ajmbh&6ebV|<_ z#3u!?v$H%f(v)SFg)!!irzl6DDx<3XTu4MA$wf8`nytHHjI(tjae zZOyZnkU2ej@cDro=E~Kt3PV~h3`ai2QtrnMbF1Qvgp7w_kYf>E9QLTBM4{EW(f;zq z>LXXJFB4ITA{vVp$k!YO0|F#%d5B9!1v~Rr1@fgvQ$h7fvPM)=a9K?kMNXQ?kW!A& zETy>I%r~=<9}Yy^JzqyJD6|sg`Beshs^Mql&aAL6)SEI-c3H_qxzl~Ol)s0BIAhO# zTCgV4UJurPgt-7r=mm?CO+9}aC|;aT zO^iC=>RZon8eg=MQ5;C#wyfOrm(f64`(&243=l_~qVup;#9O#qrfoA^&h 
zOwuUqFi*Z7SWseUGju)V(YCUddU2MEazc_Mp3Hm5P(`%17T|=IvpVa)$2%`5Fl6(u zwhzyu9ZJ^R^_0Er4yS;w2nTiu=PPI^#?$jEGUl~Vwhz2*i&l!8>l`B_%t=0_NkA_+ zdYh`1{y}8!knCw!Vrt-e47rA8_uYJu-k^TJqqlT2Tlbwft3Yh(6?Zx zaogC3RE7t720!J)GI_uiI{^0pKFO!f9~Wk%qY;H*fn2Otk5rmeSrJ4|Dj@gRDd(@; z<9{Q4l8%wb@_!_z=l&)!ohfPxg`G~0bzPlFpz-NcnxM#a0ER*z|3Q^e>Y>S-joQmtfp#Gp!!u%@Q-Q9 zBv3_X9up515@=Z#W!)1L_2Z25-?ab$zn=bX*}7Cw-bP83+(%GPK-kIS3S1;=vj6`X z|DTNyqTXSJnU4h{s+Z@q5HIU<@ z*FI-9#%|*aEMx(QGy>D+e)|Y=)1O(EI7W|2{b3;kLF2A>%eVw8UzJ*>Nj!UTo3hO1 zob_%6z<-ZPpNn4Tg`?u|nE~mMTL@OfLXT)A$Tuax+O-5nm@AYB$`)bZLru19cPCX- zmc;z#OU4|tYvBGY70yswy`2LgN~hOoV*1mjx|=>KT^{#hwASfNXQ) zS-+SW`a+h)H}cr?ui-)&juR$kH=t*!02q+$dCgPPNbYXiKXuqj&nZ*-(n0XtH;w*7 zd3ZL1-ztL^SEC}jhA^nf%;dssA7E}%r@PebxOtb;bm@!g9=|sEpgLpZIs?Vf)37MR zyM|=55zno|ucMJG(t&Cx3g6#XAQQNOPMTgk>)RBA0RB=W+G^35XXP*m@!1;?TtM*x zfJpM)1CDggaXTY&IWd-hu+IhI-HA%lrjue4cgOHz>U~WbSFrg3d7~!t>3A{4NvxU-K+WNGT6=y$9>^wP?$OxZ^BnIGih0LK-C zDCF+oxz@aKJ#@;Q-o_#4ZY<3)XAo$(G?Om=WBLWA`_wD1o-MM)oJSKxV=3SK)>skp zxDv&`X{{H4x=;sSN}&}662$L!+}i21;49&Q)tP}|#{NgJ1BHV(x1^12aS`u+mRKcp zCa>>!ylUEU06ctF5P5zgmjEIGY7 zOGYRk*Yo7#%RnK?%v?_jwKtpZ*_VYIY3msHzS(kn6?z1KB8-eR_d*`Rhe?}u4tXDog7&lfsLa2{M( z%M>L!g`(KocdlQI^0NqqRf20Wj*HSBl^&t*e7O}7|4R#ic_-WKz~;-#K3PePsC|AK zsQ$3(_3C8Fl^A^(JcMyNKHd=W$4G}m`9)BkaPua?AU2zv!IXb;d7>kPFKa~owF2Dj zvt|W8t1HIM>D{Xwb-P%sF3zwGG+|8VY>jQF-)bKurG%gj_dcpy_?tCBdVsr*7UxP@pNq3 z_WK-Mxu0|Q>Tk$hS=1>OL0gPX!c5A>OV4G*;(t13?8?I)+0C%$`7jiFBXUW*<}qTi|9$EjKM;Pius?ieCv-%|ngDwdYOZ?i(G z2pVjHv`C1pVc~|v58i8(_~Fl#p#v?r3eA#i_1Eg}eD=hqA#@4cEpBy*JQpZu5_{9j zWM~L}>FCMJ;}M!<$WTz@ZHfq%mCx_%T=wr4>I?E!c3NJX0QD*g6|Sm33Hm%N5#Uc> z^E1pXt0USlOI_XQCY$n3$JN9O@FeisH-;NvdvRM7)~i1^(MbA?Z|6-teO!@4fsx|^ zw4*z@U2LBAEGbZw!&#D>u9IKY1XHmCgzO$m6MUTg;UAJ&LPjV@-tMH%6s<)nSOn}x z-PfN#LDDV_kY*V7|rY zsrF8y9`MlaI(QpoM6=B)^kX-EopLG$^# z?u$F~Ji3fB1tLb1VVOISp+Ve)j~GSvFB3Q01e|)xtiMjd=r6+icR>Ibh)%0#7uU@a z$dU^)cYN>&0(srJS5ni(%}CW`@|B`QS)+u_#|9cO9YhFO6a9!|v{ zlU*X;rzYLn-%EbFE_GeWC-re?fWK^GJg08()4blyXq}{bF618t_I(qF-#7iduU>&e 
z=RdKvtW!!b<2T^V#V3Ci62Ya-pFI)9}5ig)9UF+tBMb(cbrF%)rQ3 znr|t-t1rNAh!lA9keLVbJ0!1edwLBajPsc%{q`nV4CHOBXOKX2n8M4Tt`bBzt=r6t=%B$F0LbHr#%xH zbi$rj=#jlM2elo>b50dzT8asa5w^u;z3B^L5bABw?@`p}Y4cA}XG~=;8xb9e1C_^h z6xMoy4B8g%|B0^KUc02Y?n zW&nDTFN4;wfB5coZ?CENO_Ec2p3x4ty5oT4oJ_lcYb5s!&16IO_-PVE1TmtcAB1(A z*F4$`!Y3mXX009&Scw6NBm8mn?bYY^G7#@r;&$1Ro3sW8zA9y@vx}HDSe9aFCd?vs zn1z%!vg1(Kw7N5lA2}T%pBJfJu}Hd{@2PJ?HEo7`-_4$eks#}|&z~aOpz!u7q5KVH zhnU=AT*s_@2t^ls%Ah^6;|_Nd9gWU0T;*$)r8 zf7;i7-)~NPPe_}dvqW|!~)_hO}a-EZqw$$j@tCz_)sk+!l>1+Wp;%bs7oZJ3hs zclUh3-5sqhBEw+H`)29uVQm!azUwy!?@v+GNzb|8{c%Z&kJ4QeS-mdxpm~d#5H=lC z&PAJe_3QWR&aJ2zlMPkx_z!wi?#$7R-Frhw)Az*o76Wp7fy&)ewF z&|e*?tzV1SGf6bZxzBt4@KfD9AtSb8rpq6S7A8$|3H2%Q)sTJBO1-QQa~oTlK^3aL5kEQmL#@$? z27#77#2@4X2lv)}S`d4?X4)TxpsnH2(cp7}FIRDdt`BZAM#0a69}C`$1*Jhh(Wu0x zpS$6weZZNF42NJRi%!ipKH;RPUrGAEv1A~o<~$Hhrp{D?s1A zYe5t#045VONl)BWdw=rMG4 zp!pZ`;eOfrGRW|g5zYqxoB=cm{-frc6@m8x)c(VMSNo4@Q1{fBwg-V&=;VEAx{8uY z;QCKS=6});0M95lrVmF%E_5HFe1~QLF9FSE9Z;HGw8I{no3epX) zyf565HsGr_H9cP9-&J zQf@#F{rC^!_CG4LL5()_h8NRCT;ab;wAsS18}cti^lcNmq2EyWpEImbUD`$>*$xyg zBK4xyQMl9uD*(Vx!~Y>^saaN*w#2q|Gt@4be5F_Tpa3Y>+w(gLf( zv-ReVq~=R2Z>6Y+8B7R`#2M|I&C=SS~v0CaA*gSjpxxmA=&; z|GaQdt1#I(baJGRc69uI%Bupx2xek(7cZ3L7-^< zTC9IBCFx>luTVrdsO0z`L^z`N?~Pxm?{Nu>Nfa1hvu5d=)U+T;mhR&iE;wF~P!ZpM zKVJWr9${SAu3jPjUF zDS-O6ZaIuMEc4MFE9B$#RZ3~w(Pv1A$hn)nryd&ckvJars9t=n)9ImX=~`Ubk6IH} zgg2Vx5PEeuWH8jAdKaBHIVOyU`c;ey%sI9D=m80INzdR%7zhbaL>mKj3JcTWcvQmJ z*(DGwE^W>y)IS`hBZ!iCYB|{|#NC=Q%6a`Caw@@mVR`7asw!PJwD@9;4Cl0aArQ`n zxOP{n68GAGBFvc(S%@TrX~mnr=)P1P4&Ku6Q>$*wiMA$HA7?2&G-J49^5MyqG1rO6 zD=NOpn-N80h7vu9$#0YD(Y7+#5vRI*batzZ7pUk?@>FXq$`7ipm*LMDqe zX{I!t`_WX12x3fr5DC1Ihk>&7R`9}YL3t{8Bpu_gJngb(DWN|!Bk~p-9^LeQs5t~w zfi6d_Us#<#m(P4By=|ZrGxH`PX`_Rram9z+0!@|{1zHhu>VoXpv)(Ic{0CRlOUR`z zTXmPQGOSDQ!}*WHH%u-YwlW?TDjCN`+TB^DSUG*jG&m5p7eov)R-MtmPoiK>hs7mO zCFcT!aQm2mKB-2Vdf|D#2GurY)I})e8G6i%_W?vuno!==_|RolkG*;mvGzDB((fA3 zXjVd;uS9iks$6-g5nGJt%9;lvllsPdxgB1af3)AFX6aY<_dRT@x6ZFm 
zQYD1mHepBTq!q#V_1XvAX5vCJ1;ZGr`wL+tYYj=K7v@@eV?Z)9q zw)99YsZp7_;OzdJYW8Oj5GRgV%z}i_uREu8u1VCkUw8^wI*c}# zUK|5)S|8+-n@yWS^Sv{neGDF^yvw(X(4)G}ofG#)kL9n%JmnCL8(|qtd25ZUO*4FW zjj25=tKfB8RhQOYcKDq3crGDS2Tl=Jhe6>p=5dRl!r38x&1=^{bkpdIWP0{^tW&7V z+6|c2Eb;sBkKPa@6(vlIbwG2?VxlZ#5J7$w_JaW~P|0&pvbgloORQszJc_@x%t}N< zD>)x>jNAw7h10Pfd+|Q&kVXwp3>J4o{!92h78`7iEvD2EuIR2M20~H~hRDJYgIeT! zUZ!WV-U4BeLEAfenTVHF+1HZUT2wO;Ce$k~d22DBCI!yq7_Uw;=cns44Bv7J>*7~r z){8Nh@vV=b#W6Qcb1bsKpWC3IJrHP}B$CQc4RI2w+zvKpL;CQ!MVFT_x>9k)Fp=sO6DNwJ*GSbGNR@bzuEHRj=YD2P_J zKj%5Q5Vg_yml3>jTu@LmvR&Wvl4k%yKIY}alLMVs;dj{j%g70#Njq#td4t}(ZUrr> zv|_xE?eFn*SQNuJ7Vr9Q-LQViT+U)_HcbNBs;v=lA8d;%+8$=L!0vBqs)%L}JSSp- ztYi#paoLz9jW7;vo4$1>zgf-6;8cb8SybRG+G}t^*(${PG__7S1HNgvE}8#jDm=Jw zWb0U*T+g&Ryb@%jK!9j|(jFHU`!x^1Mi5$>o$0BMYi#Mw&xe*7l4lZf~Qx zx9z%T7kazu+7VDbX zCo^LfN=^iFK=?@AoX_yGeLT=qR^MAVef)@@cTNRZe#+y;fIVogm+) zl!zV!T*B{k5o7-4xNwSHDhiG)PY* z!}$1?cQ3U$YxC3y(wU9B(kLu08YjUE=FIDM--dcWq*C)thIbo6-e#B&j(HeQk$u*J z>T@nKd*{R#^0zapXp&}Eoj>siBaXdx-677h%w}``J{<$V*JdTpm%)3I{QLKnt zlPpU|7e+_|P(?>ON-X~3&L;A=e^##WM+2p|`o4Gi$jT=vsIn~X8_+Af>Z^qQJnEUo z&O5#7(eeyjJ8z4lM+D0~FMDz%$jhaiX&Bm$?kP#R_}Ck4eU_tDwp2Vbic4c#)}pw9 z?WQ6e?=XE|b*G|c^ZVWmQqkrHl)@)#;lZei*1T;jcaw9pyoj|?R=Iiy9Pw7SDt5#7 z-RPH(eSB!nn5K=;A}jJ3J-dg#*bZr1@27|;!``ZDzMD+9XhuF6blRmfk&zpBPcpi8 zdS;;Ez!xxfEw3?i=b6NKMp2*H#j6BBE1Ed2&d_DLMGE%U*F|V@2G|OBrO39Y~G7CmQD~oBG?=g(YD83I`pafLOw^2RWU0lLwTwbPC)D}r~8gjZ%{>~%uif#6UI@|sN z5zJ^&7^6U9l34;|9Z|o>fi^~kO5hxs{^fP7b5y%!OK{L z(CvV%3~4`~;>(%F491>j&tWad`_Wx0r-a@?NN87_r-(-kjf$-!M2@1J_vvGf(SG&} z2i*+#F;LF)v&I(HtK3{#H5DdfpJksK?j>JmHj&jFGFSQ%2%z{j;eGWifo}OhYD89% z=Da9;06*@nX_m`dMs$%!t^rNdc(dR z3xg6}+s4_ zu52HDwaCl2{g}GAFRvmGNqXh3wJa1m?wStn*}RS#!Mb-tJ}HT7!!DmaPCki^ll-=jOP`9Z ziL-KJ=1f#=>{ln^2Gf*j_yAd5p+j7QxY>?`wh>t#n%pI;`eL(?o%%wjTI`vu!>pN# zX(ErT0jNxpjmGt01o_eV7+mss|9GvX`KMfdjq%(=nMl{d{_q_5ok zJ?>!-ADm-Za_y_T7Z#uVuYs!Kh#lFOq1*@kA6+7!g}duyxdaTS9yw zNM54H5c;yATci=M+vv6|v9Pu8aKtPRRW5?ENb_5Cusm&^^ZJeLwK$%v!<}Iye_R$K 
z?sDd}k&rusCjv8vb|EXzO6_7XQ&L=fqc5Q$4-Jec6vd2Eei$>Em?viC=(7$mcrfVj zYcLHg8ZRB}pENq%dG=5-ullr-w#pFASy3#t$6Al5N{@lpE+tDvpM}BBCbc(ece_bG z!KpCRJMye?og9zsvbEc3i0mrO$kvWBZJh4|3v5ej(H{1!TRLS~wx3EBPji+11XH&@ zpR{oce#!)C?UD)!w6(f=kvyqO?LNOL`T9x(rdULGI^a-MFulI)N~^wU;#RykY zP($%KN>l!s*6@A@e&Zn?wt_mEl;lebG*CNAxGrE~5;~x(5pk@>7ixWLKo;H+L)I`L zyH0_F&4SIK5Jef8iA&qNtz=9>bk3ucN1ReZG{~9v*0TAuvW!iv#-F7o zeZ*vbFwo39&Xx7vxYSgHV8r5~^I$Um&ZTg0$7PCeO070wAB$d&bR8B>!hCQd0u8hm z9YJ5a@*zQ>bzg}i;jqPj?`!>s56e$(=q`#(yEJYpi6!_p#vcYsQ!}_HXllu|9n2Tb>mPuS-;9>1L@WmjO4I~7bGC|+m>t? z=q_LQ2@IEP!hyeeG~Zj#ulg?!qR;+>KQ+FVhE++gOR^sXk^N3fZymmiKaY2CM)Pyt zh+v%&gkbToE}ua~ zL!03U&H~pc_gd~9L3E~MgAfj7vBt@6xG3X~$O@cb>I_1M?n-ykEZo7{>`R;trv!T4-h-nNwsJ=7aK5fo z8_wpEsV`6O6)P7{oAr_$xLi?dt0^+h-e`+{VTCuUdwzChpddL7Q{WlF{l(;%Sgtl4 zEKBWpncftIdBNkSusagTq+HT%?tRLe3GdWytp!)T7{bY+i16SZiR7~=n!7743l9#L zFFo`dzIY6h=-90OMBG$220aE!|N0l~XP2zGACyd*_2`t;1on*9V;;PnST?mD$U+F% z>NZ@Ge=8IA3^U}vKayvJQI0SD3SF283DExd_)Duk3f3Ra?$ft0z13gg)^RV=)3_^& zd)cWnWhB#nN2CA3-N zCOwq6<&kIk>(zh#?!QLD5|7SAb)(06STXe1xXq*`ThwnMDExh+acrCLz#}`x}!rH7Grw!D*vIsoaOnzKw8NiJ4n!sdn1=T4k;q6MEof z59><=|6Fx6!Z9&0?G2WhYJY92zkgA><}b}kHJ_0v>Q>D8u(ngD$f#yVk9Q| zB}%4HF8rpT-zx>@Y-nYXdQJS~<@cgQvNgF1mm}jL55FKltPhfx0C*r*PNX&DuG*(SqST+xc z^o=^yeZ9HJH*<2-@8`g&U;SxWw-$a&CCRK`{GZo#@emY0)ia|Hvp z++)*1B&vn53Qq7;9zjKJYBN<^=CU@@i-0VI>^-I`on@w?zVmWb;m<==PN+bNNcG#| zQ%h{%TigA0TGpz33IF|={5tjv3UBUXL)jk`U(X@DJUIAPxw*v$wV|ySZ+U$ix3p^s zz}h}~?|)@u%rp{CA)l44{3Jv%%gJ^)UmJNj3COvi-9Y5FR)SZ`6>Zz9eFaX1O&puH z5^V?CiDAnCnoJ(z&kaRK33l4J<8t7y>HBp&QT9m^i|IZ~AftgSkm*5prD!)ARhdz&h1lWt^$bMXeyQ^|YTR9N=+eS@qHL#o0CmC>O z5F4U)AN!~Wa9fCdz$(w6?169~E7koFy;{T%F&}*q1&uZOkZs|USn){9K4y>as1GqrBJ0@~Nx0}za3^W>G&r*<8euF(AKd(q))*RM;2w{p@U zfA&Hgg13YoySK=6x+A`ncBiu_ElWZy{9h9*=?muIWm=^R$3=$HzeRy=HZb#T!N}yG z_L|*(y^6f!HQ_9SI^K~|R~OrXtUhrb)W&PM)-s^xcRb|C49-N)hMW zGW!(JQ2NJWBt8boW)9G*8vB&UXFuN(NMCy=`x-8V?#aPMChe*aeK`{+cMBmy?>Cj# z_Q+**LD%)M<@4{Y_%)f4;vxutVs`6@*lo2W{w`qMsP>Z_ed*nYiw!4NM~Wy<-p>Pp 
z!K$zu>=wkS%NW69ZK1^W$9pRuvXm3JKH$NYcl#wGee*Endh=pcMOXvG`yTwl4%k<~ zCJlUilmFURSSFSoUUOej)A`kqoUuU+`(ohhgOz~z8i=RV;j-5Q7K z1?Q0xW7@4*r+%>@tHDA&KGDM|Cz)L7d(v+nzH^5mouqR=B)+&mlr9sz{9>CPIY*MQ zQQ#P$kNLr-EZ1Uk0; zTnZ#DG@PuS4|rUtnWKt3a775u1aFhchP}#afI*|C4A+FffiWN*i}+T+m<&au@9FV2 z3~8E?_S&JIH6WG|0cqi?oqO)MH080BGT-@OARy#LNWhg)3RO(a5U#^%x3m=Rovv-+ zEif~(-4Fx$=9B5I^~FdD1QJGSqgZm#gYb?(kyn=6)(2?iSJ+`oI9v>HOaOZYknj2 zb}FV6qK;&9`GDdI{JVp8!kdV#|65kyT{9!^kJ2@qkRqy8-k2i^||cv=n3{b=F*Lpf!9x( z$Q`&4y(b|~m@1`jw_5Q;Ib`$N6mMJ^#Q*b%u}EI7&B*CJY(CYzIp$>)OBn1t*taqL z;n+=-_n%S*Ny>X0geHsGHiD>np`%g z#*Z6pml)hG(J#!cro$X+5obFI&0|1wh-Q`D$>d(G_l)Nnr+%4< z93)^YtE=qCl+A4?XiQ;!VN|bjVNjfT)NVHXFCCQkX=%c@( zJ^%61bwtlnuJKk!kmO$4`On+3xettehA9N?tU}DFNNaAkQ1-E*-+9OYX~#MJb$4IB zRqABf9q-QB;i0TES?STU&_mW0B|_~RQe``E=YN>Ghwn^^I1QRcbc(0R7NTwCbA-s&aa1D>HEkgG5`$n153@D8UlPp;^?Ok_AOm?Jb z#zwd!zt8a=K{9~PH#0~UhR7|G3HP59#0*9T|JWH+<~9%I&Pnt24B^X4Y)z@SJs+|i z!?gwQO*au7QUW?tvkXqd3S}!FuV;;&EJ>VOp94r}dL6(p-(<>`o0H6oCi+uqkVt|m z#~btz`XKd>m{OSZ)&Px1Z$1fWNCuj-aRgmIlJ|UFR*_vwHBi4_zVm%DtU2C^GDh>s z(=8)U_6OAw+xLlo1B33NZbO(R=~#~D(j?NO$A0l!kxoZ@XntI`evNCdon(LY3232( z4GrN(?i>xJossmBP00&WvQDIWoe6ubv?nmcP1Y@?QcB~w1Y3FBKsavCeU~C&P%*5J zlJUzzFj7x7gMlgNxeY4bCjXdqi$FDZ8Vfpm)aTYhC>tg+B@cO~p!~53O6yZ8U_bG4 zvqGTy6etK2o=YF8U(Ykm`Zm*D~6UEn&nwg8o`nL&S(Y<`cMTJYv*Lx;vlMg>LX)2m7ZEuHb=u|vprgWFq zZ}6Ps!swrgqEeU-(Q<}7-2@}dZH}ER0}#=+r19WktzAbX?Gxkk51Y~A z6cq@AHg#H>Q=jnsppu%`VH{#V=puOeZ&1^p7)1y0eQt1j(y)jSWiU5%;NyKV^Nu6^ z%aED`oW!ZtikA>UVCX0ycDgq-((HF$QVWNOBN!07Q_W}QevRk8qMNmg(y!_cCT!i< zsrEM}g;x3L6;zTTJm`6oPnyNrST)+_#g(40DvzsjUqTdMeZZ0etp=KXPcw`LHf3qr zZn@t}5fQKv2S}IGeJ&{fL5Y#zJZ2fL`bwdc<#^pv3Zpc9r8D3O;XpYwF0PIwjX+rZ;1gfJtO}hC@qFnA9@(Do#pPCfiOM7a+n`0v=vy^G zj{E>mIYIzi=Myc0m(C_Mzg*&XYFqo_VWrh$IHgc^G$S;~{RKqMkiG-uc;>=i`f{MZ z>3uCU3Bc2n1VdN7+DJ#{=m{=M>a64w`>9$UM44GvjDml#RiJGixV6Uv7?af^ygt8? 
z;=!BJ`0Gf%!dl;Jj$|LTt88Zs79tzW6-loc3%;4z$!QhwxEZPrX zxHmDv@silt*y2cy@vH%5MJakU2jZ?;A-5{ zpoBO+3b6}u{`=cI3T=K7Ru+4^X>hW&Y`xhzohUG+eC$@uhj@~(|FHQ2vIXVdz?cA3 z3c-OQEu_Cly|E$VTN*h9Hska=unxOjQt!3TPK;xrB&j zd zO)PZBf+pS{68{$(3=F&^JrH1xv))+7w|;aezRRumQw>r-|1Q-Pc5aH@b6)`OBDjr4 z2w_2yIk5TVvy-hF;2n&q2u+*Kp3EFdlO^5tC~w>wRGzE?DKz8GDE^wyQWdFjyrSl3voTSHGpb&$(r1Z&n}%ODS2kp!Du9tC_g^#-iS}h8 zMA+&l>#4s0?BBm&gaPt~fs1OM1Lc5EyD!4GA3ZMAtGG2dEC%+@K{tEJ*v{9UjrEFt z#6uY|uLIOsvnWaf$B4ouM*PRz3k6}fF6M*)uh5rnbpq&_bxunY-Er)U!_=&+x%RU5 zZwI*;6d~R4q(7}T3ma|J=BgXAicN3&`zCm4i9IK{$i~8q35u1O864)y0DZ+65i>SOgNx&^? zT@TDOgK-$vBZKg19VCvr)a(w{$4B0I>^d^5Z!L!~B0_q>?>y>>=d4Hr4)g(-I120a z;@X8UM;3s*gbfr@U-h|q_69R6$B$(x#zyS9Ltfe-b062tcow91d`na(R%}t1Hu{s6 zl#2uFlOf_nQFD>`fA7x#jvpBg6fiDWq=X1YjS=AJh*L?UKt_ld;Ko9aW$g~1H^dtG z>}z`gd|YTflC5fjY5izQ=ihd0=mQ3(TAz%3pOQszSWX z-;^{=T7~}W4ANl&RupqpB>SS*k$lW>-z+e*%64?ZzGnL7a}3F?+C@CkwZe*)t1ouV z)$C{9dCa#RO>g;<=8bvoEkB+;KRaxVj-B)kVcu#!Z$fSRgQKN1C`Ao0+H2dN4!p%5 zx1gV#%$_>{pQ;K$Ejlsx;YAU!5d+hJWwReC)SLf#^d(lUP&Gxwb=1(!+=qtp?dhWv z04F#Qhs&9f4|7yg)+6}(N_)7Qr@HGtw=ad?fsW_9-E{!*1T?0i%kiKC5i5Lu_*(X4?3eGqRf=m-)B*?~H@4Da_nQI~ ziY;ox%#$jEFUJ4sHb!>SAb=1SPnwRh8&5VHT*1j5W6en>DvYUg>#F``ru znh^f4nf;IZfvy>gL~-8gBVeq$+G9V`czpuo1Mw#U8G>hy15pn4YH*|-rX&PLkod61 z!Tt&cUL~+1U+O6xCYyl(`Sz(>+uo2+>fkYU4;(}B3(I)h7Ols)f*##w zt!QmzfcvuTS`v=zItYP;s9*}$K8Mgq<7SYhIeh|_E@(H``)CJtw=H>E;8?Hr5eQP% z2x|04+u#v~G{wp{*M^P7o0jO;aP?t~Ao-pXpF}qyoZ6ya#tKZ9bMKaeAAJODws4IK zqTm)kq+Ug;;v2gYjN;=s!8v0jskd06&my6-5Y}Ql+Wtg!a^EoFrO)9yjqBs)z9XLN)G`;DbP#=tko3oa zLLvq2XDr?}j;O!I{)&A)0FUF>d(r_Ho|MbjbZCjY1~zna?ArO0m%aMc+)KxS?gVIJn#Md?q{7PYWgw#>*POM3LLb(L%%BB3hRn+nt+cZ z{Xspt^GEz^g3nVZp5+bGALQA$*=rV9C7Iysv;LVBP!+l{>o;BG`X%3>qBFight#is zDIW!&*g%;bTr@meCF%7=JpK1y`PVVh2lA^1S!%y={r%ThZ$0;aQoS!Xs_HQ;ACb>^ zGE&wO5y88!xPOKqIX=yR4WAOU3a>>Z|J`;e~SH=TZ3-_=%%C;ur6bF zNEn|93M*ybi=SwJGGQ|L?C#qNx3bULR|`72!SUhw(kYgj1&_(^>6Vv1W78VB8_#9F z*nPnjNIipPP-E*0&7Z^qDuDS^ZoMgGN#?TcPb_CDX8?{8_8+jDv2WQJYB4s^1IGVj~S`8Cni2H*IK5+=Y&^UMEf7SxvC%N#d 
zYA4%LP??IeSU5FHDQ-_Jd$D2jrkcKH6Vr9$z1Qcb1x9x+-YJf7?lnx|{j?n~;a_ma zUe+J(=zjN2r>4HWd}D0x!X}hIdbGGieC@Mk=1n{N?80Jeo@s0H8HX;HhT*$frS@L= zFY|c}YbMThOlx(I7E*~WPRF0fd2I6OwSYH$;Ql=8D0zTq`bBtCkXKbuE@SLUU6{Mf z!QDIg`w56Juc}D-hI%i)x&*tv5U+;X8Kw*SCssTAFO#ObD|E_L<_A)2Mbz6Nw?w-1pAIm1)BwgL|%SIt8Ko z@p|UE6}EkQb)?Ce9tD*N&vxJgA39GN7O4MblWe=SY>TS-zi1z& zzxzplfY7OX_bk6;hBO9ofBDjl_9K+jN$mqxr#meyqMv{Tgtx z-U;Lw!^>V7@r&hXe$iq6LtoNq2pTTb=Li*VYU__(8mrAjoY&Rjj(v9duo~sY|H;-* zwiI&TRn=lYqpIAcZQO;cQ1Y{DV}XUg&Pb)QD zmK^w;U-IvJ#Xj|dP$EWb%Xwm#e|~nP0pjUITjS^FO`?8|kF*NE7Hg2m8A--^QJ$R8j#NerAJ{QI?uV4}8ZOkKgq!*w zQ}{NuPz)E3^WTLVN#5Y)+%6OGI`s?l@9jzESa?azH~`bjbZS3m{$kx6CND8mTd3b# z_%gP-p2kx5$yNe}n6z0yPF1MZ&y1g;eA3V}s{T^1eck+rfzT(Aaz?sdgVKSl9^H3# z4c3H+-xh3$wu2|Ljy0j&c~j3rHOIbR`=i??yDOZLgy)6yQDd64;*X529)JL^`Hn!x z^A`h&*@v74f(5$betrAWjVaYue04M6a|PxD)(~@eg}vJ3mTBiWvsArFPtb3{L3e`X z>78v^u_FDp!0{3ufe|v_tt1*8!}vLI+?|hmuwZTLhRC;$(r?RqN1F%Uu6ayribUe? z7#Mi}FkEINJUDZ2yKfvJ;vA|~IcU1`(2(@~?)XcqzPhYO@3V?YVrC^A%3rm;tK=ke zXUSKaqHxMP3iS!_k@4SXUi@O6YS>ZKn_(yZ+Ir&tqnxZr=3b}Ef#Lf0k$}eNQLn1& zm0Qj9bsReNESH*``^&Z7?9EpKHODi%Q6eM7j_Lc3kN+Y6F#U`-YD*Alqe5RDB%Qz_^!kW~GwZZCXQ1 zs#dLe@X*PAS#zXJfADu?{eK}aK2#Ec$sFNBNb>$xhu``zy+$NQmjn~it!QQ?jE!sYs{y~Nt-5J_(0}k zziM^5K8X}oNa}0u)KSc9eQAG|aP&z3%VVA3c>D{>*`9p)Sdr&5pUr#gk;T7GJ`#Cc z^5V_YILHyv+U`M@j}Ju=n8v2fVS$HfM=b4vt9<0PV3KHefn_1FpdXkL#hq3?D0$cS z!s;*jgp^mu7_gGU_0O52IuE{j^th3-F8};OtK9n{r&y)IRCl7-`h8HPuTL`tqs!UO zvctxPO4iHCtrhX{N^K-+_{esH5&E?q@;h>0$GoMzrz^+Muh%^FN3Z|SaQ*Esx0QXK zpzLYRFOc(AXWu%Kl^CBaH4;UZqoq@HdC{|s zbg|<}I>ZJ;3|;dN+pHt1&6mr`lk}~vt;S6h@8?Fxaf!^US{z&z7Od3^t5asOO(9;D zrmt`&v#U+xT;H#()bd&c^B24%^~%>k&F1k?9PtzwFIkNyVD3)y9VLeX{byXlJOn#j z_{?hQQza)!;kU=7lGV>-Ybw1-TlfJSuB7FQ|H+~dxgSCJTwd4_}uscu#%*3O^J zPRV$)rk>&Xhi57V6rK#*`^ez$ffpj?69*3F+#^?-44TUaG$H+OT<25ya;$i8k!I=493fGvB6+*F7osf}bP^{&pW&&#bEUFR@NuxOQ}H>GMo+@I?BInvp1k@}!M z_iD|%-r6=iH8mRf8YP-ozl8aPNVE*;JXl~WN+iaB-0$|TUG$w4?)QIy7WLSHQDT1upqW}ot!S3sy(PrmEII1kI@9UN}&v*Q{83r#bd 
zsB~SyHy_AMNawHPm&clMl+Q?>q>2jiI#^SU9zsRa%_ktGb44HMx5M^~W>Zw~yS1Z4 zicK9v7ml4I%<=Ini190i*$9QMR({Tv-<6gAyk(ca{+iZyVrlcWgEJ`tQ>zKJ!?&@y zEK&4?)5dI|_xI2CGi%d)@tknFwu)t#Nl^7Gx4u1Evr;GW4p-KSNTYjL6n?~<#n)&& zyb(jh?%Qo@Nw@e9Pug=+^1VkHWu#i_{G>$tbAl*|r3`2Ay& z+F@8m=*_ql&5*a<@$zB$j03NQ5T&hHVRnr)fg_LO=(zr-A1}jcXp7`tS^Q7&Y^96U z!wWo>UE;Lx5*+r)N1&IM&`)o%8s7i*qLTA;q?p#=wR+`!#WVR9|STuT|0N4|Kn);$yM1Eh3R{DxMuCtpBF{TV!NQxWaSc7gU@mnAk#1^0nILL1b1? zC=(Hs)lua8{LjoXA7mPi)gP^d z9m}+pJac{h+BGd>Tx>5uz-f<}bx>dP?AtY>O<~!NUfvOV0}6{75WSi%A8Zz`0c4AHi9^c^C&Y(I(5(_r2uUXkJg(n?5CXqT=kAYJ>`?Fc7*qT3Nn zRK>RmP{?1OU|7z0S1}4I&b{+p){E>1$bIJdxq@feY5&@~{2%a1>;^Fk8Ta8AWg`GU z4=H%@tap<$Q*|r?M;o7S5ybS&QH?Drr?_q2ouizo6RFq--x8ySH@L1-tE4SPFggeS zDpLcWyW!#3XpP{gtCwjR6St$9np4gqWa;fh2UU1iB%TcmI_^a)MaZOWmWXuuO^oBR zNWsAeS{cS7Z7Fn9V)$wflGi!xs4gx;J_>Z6yc~#dN@%u74O%91cE}WsP^5HR*4T2V z$iGDmhtu39pi##(bD(m4_d!x(BZ83>pU;A|5)X%!$RS=R;`sUnTcNY#XoEt4>HS!* zv+6S|pKJ|8?52>t;5Xa_^Pg#H&K=RBFL*7oMJF)D$92|Z*TBW*?p7YnYzlQ5ZSY1o zxK~tYR8z0`_Ha?d4ftMt6s4V}5Fe&-bCslmUx+@RS+Ez1n1me%iDIN4+b%_riubCeh=|(ba1KP1U{e(wVnTg!gNl=kO%C)6~x?Mn~iLg>M>N zPj(RNi?B_h-Tik-4_57`kTxo!?NgpZCm*AmcO9IM>V7F_!N{i_Difw&U*3qOiLI7< zuzKyu{)14V42hk0CP}55n`UR1fvgG?-K}NGRlq+N5R0>{rPQ= zot#_As7G(s_wM^;G`TZ2Ri(}X9kARPMcw124h?I~ApSyS`|QsAncuNfcE56Gf@&5} z4U$3^yW@HZ+h-?pPVYsNVqSah^Njqg2Lxq}i`nr^bFFa1!zw|7;)U-E6Jvy&n<>_c zLJ_}@Je{fXyG-%f2lnPlgtN8W&w77~e?2(6)TV;-Sp|r`&Ml5lP|y*d6ay<@IzA23 zs$BnS&5)&uTy9myl5jCI5czAgDbtHfbVh@L1z&3kv6=j$Hl5YnnX1V~Yqhq7r?0H8 z0G3NZcyuOk&`WUew75*{+NVEBx_{5l`9R``v98^vVvd4YDbDcsUL$tWk~wwNR`4IF!ZFR5u`<`OCrbZko6+EOe%-+q`*}r`?48`(=J9Zv9bNbUb1^S3mFl3@wGr zL*Zl$ASE~!I|Ra9ye2gCUk-de71Q7JT{oS0P$rUU5J4xoW*L;-DClS3b-%`;{P+Cac z(^tuCzqpPWsF1?C>V@yvAMn6EsdRJW-T1+Bs?qRyebkW9J2Cf=Pzr;z?!d*za>#%; z)2S>6L3=9SS*Yj_?QD;Ynyqv<5sX|Mun5Js?=)t~7rgs&ocVXZR=L*0?RztbnHJ4@ zu@!#il zaIESFAqM!&G(?(GPx}J_0cH-z<-6BTL!arsJ^aCcA|O6No$Yy*R>P{QEI< zoHlR64cyJkak~VUcwQy!KboRWh7Q-U;FxANIo09LF5lN#*^5asQqQ-S|DyHvGp_Ev z7gaYQllglX=TxLM@pWA1vM4LCx+tIZwYB(8ldA^v14VPq<)s 
zr@CjEdh|Y*528hU!9bxl$T;FRnx#|aa z@5p72)5r(u3r`80rb*e=Dx$_h(*kponmnF@YXPx$dWB-It4uoMMCaPcn1Z88lo~M{l)1BL;ealc=z9}sXOaF~szOOWsS5+&} z%D9OaCRjJu^~-kA(i<86tBgKx=*Z2#Gl3-Lg8UxgQbYr#t3}wI;);*nA?YDCO9PY# ziiN8Iyes*d;!2X9z`#^o@Aa{ocSn0XR+(;21sp``(opw!8Fa= z(Z7@bltKT`UHv2d^Wg^+q!?di$za2LW29#R#n5v*0;DfUn)~HR^@*GNeK4^fUdKtv z{Fl1_?<)B}j)~RLS(BC4SK17!dT0|v0pFLsKTQn(PFeipn2iAa2*J;Xgi%s3bUqRC zBtGcwU&~_Xr&gUu=K$?A3W!L2{g}b|Mki~(|2mfY$M`Q|1vhjk2TudiJ{1xi69}n0 zviq1)&ay5ai=3u8%r&%ya@S4nHAzip#J@BZE|_yJ{%$zU&Vna+y*>6BB-ecPK%;hDdo%qvBLV6w zgLc%=CqeLKaUR|TzsCGB%Q5wD-kW?hIz2Chp!fPh+V}Mj*c!&)+z>+)54dge^uY*lQW6*x zaX0uk82@gpoydYmzC@0wf!nBO;(q;Gm6G=b3QdTi(e^KZlF)aV3lw{?QaS z@xfIB&F(&c!q~`hK;qe$tKqx^AiJjJE;}Y)p605f6wFf`<5N5je==kH^{W5(PKoxH zp)dB@qGuYW zEG2(p-+5W1`PPs_@V8@Nsl0ZL z-%pQ?07-J#giAXHOf@!Wt_4_Uv;hgd{^njmQQG-0R{4{m2Tp%CLfCfEed>gL=mtuv zp8*|EQ1Y3DUCiQ2a!`&qj}8*=T;{Ezq=Il-{vhan^{ei_tYQHKOcl*O$@BUr?R8>XaBC#vA7S2O%%z@!4yAJBZ z!wFl;hCyYq9XU!dmm)yGb`yxM7rCfN%6$-mflpz>A(?z;-;02qDA;Jc+7W^H@uA5p z&Ft>m5?~C2OTO%m=fBt1aJfKN0g!`nV9&A!9E;sKc-hYZ9hL8Pu!?)~ZMB{aYNTHx z0oGG9#Arbo_}{j4@MUR|wf*kcSvT#{SJC+ZeA3ItrxXa|f<7(Kt71#eYX7T+sx1ON zj0H1A=wWQMm*ZGI=931D#PrmLH|8bJy3W-g>g)I<|KMI+DH=4br=m31>a4osiVTY~=0R7suw4VFXPf|pNtDU!KA{isact%E7nd zJ6sBYmk;e57%8{bMNC%Om4y5?iMWfN$2RLt{l39E=m->ngoipuIu~-h1|$ zSi0E|P~yPI(QsTY%)UM(_n{KnnBxD#Z^GvHkB{x|vI|C6veW7fvnzp@57O0ey85M^ z2hznjOV>`b9%KrIxg2lL(ZA7m`uWlcpu#`Z;M9&qg!z}D7?=T!n|paZ02mwt5X%ef z>$i$bBgIG1K19dsXapZ^O$cgynypM-GwX+!r(0oTpJoOd;QEVx7S#Z{*CLlKH88$3 z-mv;v!i@DBhZxX2y~w*!W_TZe!zE9nfmh;(h~p|TdnnFhoSUMUeha9N8H)^L$~Oxp z<@)f~1jo*=?QJZ*w=EwvaOcpIj+8ikHFPwd=8!DmSshNs)4W-CG_6P;?7vdIR`3;| zwl?0?Q`w$MHAl1lgyX;&U;)tv@O>RXWh4t`JXb$JY~@y)@^vf5f%~g@>KQOI$ao5@ zjTS~0&np7krA+x#J6y!9==r$Iuesn2z#`*0180f>#abOalIG_Cx%$?o@eF}<89-~C zhM#H4pQ@`1fBwsv_?IM@?N0ji;KaVSRp?$`_VvaAlCpKP{>+9_&_)jh&&Y1cM*<}O zXGQ%%zEy76L$L>#Y;icyP$-neWmQ-X2Wtui$60bH_h+U8)2n$j!Fn{P@-?mmXri($ z@2j)u+xM5r2=>=v?o-WeekvaX=v&^yx1TplEVXhpta!EdmisEes0#cN9EZfMQ4$gv 
z3m9(riztAK3LU=?XdR&A7ebu&2x?09u+;5$TpC%AL&Vuqr?XXP|1h1b6j1P6mJ*6v z5W(2Uu!in1Tu^NaC&STpMr!+S^zrY{ei_|BBmETsEBXn5S#zNy#b@xl-0gNnx=JJw zpv_6x1%XFVL-c&RC0#<-M)|A1IIjDeH-LNQQ0^y-3HauHAY`|o7I|F+&fc^h1?5|s z`@I;S;{HV!Q>8=CU=Szjr#NyLSdZfG7xzhCO`;tXa>lRK+52_Mt;n06SQ>+2(ALmn z$gD2i7JN#N6^Lq_ta1oHy`MJ+3|!-p-sh&G^C-q!m=DZFwVPoPtOJ=n$-;b8Twn^D z&R=6+PCPtv1h)^|2WDVsxC||xjE~lvc^|jX-dyyw1O{0Jk(OW@w{O5FaHnQ5UUwaE za0I?~-pk&$XOD!&Dd|1`$|D#U3r2dgFc-yJNE`&Kl2Nlj&q64|z-=Wn$G0g&gYkOt z?{P$7xXCIo4nG^OzFo}lW?E@p;N+;`irRvd5UiV6Kv_uWz&5|Zky|Xc+TlO-@G}x; z3%rhLn}!P~;1mf&FZF?%Ld~!vFtJZGTpn;yN_y1>r2BiS<#_+aqQiC_-J-N21vb2Q`ajbrQ@=`?BA(I&WHM-zSX&8HPh2Gt)pP<-w5p-_79NnnB{ z=a0}Sv%FJQrHH8s%td5o=t&>Y&I;m59Pd5T{%tI;(uo7B(cwX%x43!;he7RFJf}Xz z=0~v5253K&%sT^(y?I!gTlUqghXOE_&ojEy+tvNA8PA7$)*q=qVdD!RA^Vxr|reN{+*KQIAz}txDFu+ zrmkBJk5>i^$se6kPL$hxEn6D_W&aIa=C0Lj)t}9qZmIz!I6X7_GvhP5y>|B%2AVZo z8g6r{&XXO9<<@s|O_dH`-oBMcow`t_7QTG>M9mw|&H+rucD3D{HTTuY1#G9qoXiWJ zr&DoxazYvA2p^>iT8%;HnHF00I}Vd+`ydxpW!Ni1^OfI8*Ziv0Yt8ccf~JP=nC5>a z5W@9Z*74;rRUQ#oZm5aRlRkVYQDyV+#G_!_Rkz4Udi#a-K!nI@j`ra9=cgW91-5lu ze-VrZCZRi}m?*`2NE`+hj*#=_m<6LkSdsf?73+W(jF?GEkr5*4vTd|BRa4=(KE{I; zSA%DgfT1VSCL@Kx+ z03MD=nq{aAf8~#AEFY4ZA7?;C;9c8^H)SNCdrikgH!^>?kmT@?^iGV|(KDgux3blx zpzlfsF!~(=qc)1?c5600&bHK#=aop~?GH|aSfp;6l}O)LIHv7`)sAaW56J1VljH|8 z3L9?q8zr8=P$A>thq)KD?N5Y@X%4&wPNyr`!N)$j2$gWp-)A7&4DUc2F$uVqj^i|x zJ6Xu~fmvh8cc&@NVZkMjHh^sDy&bXuLWy!mSwKWk&%-GDp=vCJGB|6nh`bxEPbSrh zjHo(jy*>8RGBlXV=of-5c4=g8e7fNvFtKbDu@#EpzuhcMH%XrXTzQ=Bo$tCiGJpFL zN6fZ}nAE9%2|)y`rGs^t$EKB$BAMu8+<~mf(9f^t!Kn620AYqCbD40Nnp4w{WC@-c z;}?U;cNg)#K^oN#-c1SMQ#;OioWJ+I)iNMAbTUd=a;zVS{4(*l7+6HR-xCb^$V7e#M;2f_+ z8i`@r1kjzzpt^oSX(1VCyC{7G-A#Ruoi@AgFLd=dfDc|3hPfTEAw*j0%oPW!=f*VimyxiIl#kA z!%)09i?BMV%&f1=HPcBHgg^P-#iAADX{D8r;3j!SU8|kHR1eu~F@TK|E6^JOr-Naw z;2PGXisTFCLgVwCnpWsjs0UPiHV~T?rXORS>+A>wqv!Ipe9A1nk!60fjbpd_QFcf# zdH=`AoFlA@8rjX1Dx$b%=e0g8)}Om^pV9 zqCiwD3C!LyNbOB!)cEGS5uh?~>R~aENk+1w%Z8)6EqNcBBbDk#Uk%8|MX)=&)d7|1 
ztZ2B{5VG+L{UD0D4?KV(mQ%?t^C5N~(;d0e&=gZ5oaMSCKJy^i8~saQRW9G_73~8U zxE^6tLbua;-3|B`t-^>sx)fRy@`5{);^xVA3J`oMJ`162qv)vFo&wNBnXf35H(6y_iRj9GuB` zNHVs|IH!3ODtVWSg6}kBe|?y^u}S_BqDB-K-I%Jv5^@;NFbNGYJ%skm%-<=;p zx5#gEGE?c~EizG4Vcl3py=o^j_PK}eJbXt>54PN_$GjnR1Kv303_Ip@ui5R{j z$#9SN$+ot!3@8i{i?!o0BjF}jtU#PwN@{UJBoGQ#Lag^In3E=9kF(=da0W(+Ojxk+ zdaoxxkUaa@$G|oN4DM%hgx+MBuhi@MA)Ktfi{X?4W;fu4!1S7DKJozb40y*Ru}&d4 zRCqf+YZ$Dzo4g~A(rGW)^xzqH9u<1VYR&xyc50N}+yLG7lQ$fG%NcM+I+a=K?CT7; ze%R7>0XY*kH5^!LE6t6m0gRLE5JMDGGHwA&d&F6AwQ`YCD@}nNjxTViwoW_PU620V zN~By`n09giU4WusvPL@~nq-`OfYOn?SW(#k7P+uEi!eTmm1fwdTbNh58d*9m3ivJh zgVoOvUvvv?Cbbj^J4_8JumnxPC)-zZv)3a;Hz8_>qU^!*MK158Cr|w~)VG387N*-> z=A>zb`jKl$NmH6c=YV41Qt57c<6VR8V3OhMKMYJsL|9KwMJ5Mbh>l1QcX&2IRf(Gk zw~}qESF#dq)#v)P{l8y}l#7ysF8FQ-{|>2P6}P7GgJIpwC3(*0aal8S!70RwNUP@q z#zdDG{*c6$sv?3aGo*)|GDKhUxQTH7ol?QIfz%j++rGDbFmUcnN<%Qtm8B$?x1X>+GKFZ4KNnx-VS{DK|4o zsaO?^l1ic2gh2?tGh(S!lf@%@HppTAc63x|`Hh~!UT;zW!Wy#OT;Ewe=S-!CqJ_*u z9P(g?#sssSp5g5mk**ya0b~9?-zKA!3ls1t!zgXs5uK;Mk`8u&jYpzYN}H9T6YMC- zr`C1SIBVS-lxBo1UFah0y&%rpcXiv_geAo`RjEXQfzw&25&FUgxIQbdL)~NIbCIvX z-Z!u^@B@5YT>XsxI?q)e*ODmgYs}hh=^N4n#yj=4H*&bX6>|!uP1s z2iX4K8iYizK|;&&>7XM%@iC&@4C_w|BQVG0XNGv)R%M33r6LuI-uiH3st$0#UdRbC zd{ZilomFKgjQSEHm4)-!a)V5=%y#v5GoHWc{O!3!oO4*aVq_2_ zu2(XvC;expW{e4jyjiD@hu>42r^!tyE%v>@x`G=kcMJ;dK!u@;U{iO9JVy2@ZpC`- z4UlndvZ~MV%uFJnBA39u`62`&eAY7Or+d8#iiu6y544Nr`3^NXi)nL+Bj|!9+S#r zkpzL~6(S0%Sk$5Rk>!$e#HsJQer#Su_YW+d-uUpRY& znEzlb-gPc9uVTyVsOGt!nerS;0(plaYA*SavnI2tqIY|KU}gL1#FF>m(9L)Z`+Eo< z*B4Vo<&}hX7bWp7NJII!87=xMqdrCWe_9cI{U1p+_ctwmnFf(6FH=+oH3Qj`uKQTQ zT=mB$LXiUvO({5U7IgP4W?EII`?j~opWygm*hIF5OI>_Qw)PHWY$XgJ#2Q_ZMn?E6 z%4{&;ybY%H5*o0FakJ(`97mPbf;uad3{q}Wj-!EzGTi(XYcElFnuKmfDL&$k0nm3j zRYV8x>cw2Dk)_RR59mKdD})){|3p+krzetO*V5zrh8em#JJx#r0lFj`LXWKKYeNeSZ#BrP9hBFlg(sKuK>i_vxVyOEXM{#X}K0 z_>7n*+QDdJTeMfAyi+(b1B=O%MrLt)uEn1efgP;c8-%#kj`H+AU!XOensqwsjP$;J z5L3xw4@wU>WhdJ|%=p&5aV#9!q!&vP73V$lT>TyuPNg{#-)@7R?P~#&V}cu723s8uNGhD_I&;eEM85v=K1Mucj}^n; 
z(m+)rDjJ&mu?fYDNcE7?5aE>w$SJG%0ABEQ5$E?is(f%P;N82|n(qe%RX0@q zzTk{xls7)dbNjeLV*WhGBXKgr`TGer{oP`qmQpIDk<- zmHshZ03F8MQwF2(vA4R6bu?lwQOe;@eNh%5T5Q)!{8jFK>VC#ZMW5P8IeN=t)`iXw zIg=vk?UlhGH)c|+#~4Ov%!(dEm2g-DQa_wpRG^nOI$W3v6zRxo*nb+{Sqv?N{)X!0 zCJc9{*rMuI2y$LbP)l4sxvaQ07y7^@k5P_h;FyOsuLhK*nrB?@5HWeOk#C2Oq{(2D zqk0BjP9kDc8U=?8R&pdctZFb@AYmP_m>9T$zE|MFWB!}}DfEm<0>SSvGVKtQ+4;tb zh0}k-ASTNaU5D%Os)(flw3Ml`)Bl5wx|UyK+B?WPp)XJ%nfW7V4n$-!Y#F(eG#ZV2 zzK$5~OC3{hPJhh81NDkFUpLVgMlkv|d>%UZ|-P{=ww{S3`0&P9z2YC`Nk(~&n?^TP-Lq(q!_P*Y_C1p;B~(|j{(+j)eX8Z zLf`90w;n^cpzBDZxzjELMgHS}bh-QUmv6=bT~t>2H_OBlv8hCd(qw;h$Sd5Gm#e+F z6QMn^&u{-rk+F@IfX{&OG1?-Gbzap&kJwQ&gcrKw-lwcOv3)+!3I06>^YX5mBIOP+2=RF0;jrDub$qUrs7_lg&S= z{ij^q=yiMhiboS&OT$-2rP05TolrCDd}5QGP&-R3Dv3hE%4IdW;YxgwUsQcFNb#hi zTZ4w}u`A|D9ImrE4GJziAbX3~06Q=HOJThTzey`fuMkH1Og(j)Z4pbV%m0j>luzB@ zie=P#v`{x~SIWK1e~taB-Mvd6Ki53fm8EUGH{VqxKMuECL-MlWDzt39AT%y}_02(A z{RJ@Qsvk|e+rH@&pG`Uig@=;tv(-;o*b7%;c1!Kp3F|xTroy9lPdR^rQe zVPLjcRO&9p<6N|6wAgfF^}fm{);>X@D=;U{+h1~VUbBl@>G`PP)^wf5CdI!-%L>GUMtR+>VC@)+)@2_FC|+YhUtgW`INxhiA`T_~@<4t<*yf zeo)9QukUD?l+;^&vs?+CzT)8u+i9n?4Lj55^%5^L=#ozy(s!PE>nLe&$Fa!p*m5mC zUs$lutw(~>t8Lsg-^Ds5F8gWdnc;}S*)iT)i#{*Avs%VvPpo~mW>;H=_@?;FuDH0- z=7&|^?(AW~Ngt_mu4af8(dbP%#_q-ivy+CKgjc$AjCi|sOkA8_9ac1C)*qG?wmIcJ zA-$vV-PO&(m(3A~+8r0~FU2#Bx~`>Z!@qyUgR?I^@QfHdT6XRYIau{mD>BpLEZK0v z=drbl&0{Rocxx|+=n5?U{cdeQgEedNB{2|ec>2VT^zWVqFY4y1D+zn(o7G>PJu1!s5>1_RHFOw=}qU(shnyCRuq*OoStn-WXM%Ue0 z-EAS#<1De24dk`$C*I{6M81!2sNzAgZ{?1^dtkBq)`s`Vv!m57g$kdUEKg^HI`gUT zcfdFq^yq~Q#iG(CJ}!t8#w~T)1+@6tuEkn@4-8V|Ye%f^Moo8P7tO_YuH?;+UvwK@ z-rl58kKMt)0tR6PN#c?j-IixE8_cUQE=7U@D$Z1ML-WTOjgJcZN-Mmak6aC?;G8$c z1)RynL0N;PdG*OiP#A)$%n@1k>v3BGRqPpA#NB}$wKY^6!aU6(Lf>=v+4Gg-80(|M zWYP{8`*6-3xgPda4Yh3Cglh!1)Ys-pjVCis=BGu?z9%M{TVAJT>wxw0M|9TmUgh1e z?5jK`<`kxSQB_r-xBkXaPM@3fZaODvLHcCD#AG5`+s#Ohc*$Olwa)hu&WvDhDP{Bi=e$425 zkus~d&y~}9`nY`f>wjiqhy~Hu!bI4r2Z(q}AaoQ;0$t4?4SDHQSGzs!-%u-QCEbMM zEZ(S({aip7z~~HWo&DmB{PnbcH(6^A679Qh50T#_3po|bh-ZSHg&LkSSf0^I99Nut 
zrN(O-)n_jjKd@I+QqU=yw4{1DvC~b8ZhnU*hC3ilYOs&qNkTdOA6)QK6|tJ1rb z4~PS;^=gg1VqfcT&c30L&d`_35ms%6Nqe-?3^1rW1f6OPX~;xnC=(wP9*dhZzD%w8 zEwgXpK5$CUwxqehwp?GU0F`AkqgKNXuJRY}Fc)8ZCiF?=0^d)D-a;>wlgdu=lQL;8 zDeMS(*Ss&4dWEYhm_9j%gj9|F1nZ`Gcdt9WSvl`cz@P!_=8G^TkYZpDPIR43TiK1N4}y#OZ=jz^b4Z&DWORtd`~u-s(us zjtVt8^SvE`45uI1Um0W#pz6KKubAfT#g>lN(M7%L)MH%^by>{)6^6Kt*{IPi-@~lZ zooH3kVLqY?iS)9mj{(4cX$SUdiwA2Iq)OCz9b~e}o1ycn3@r3a=q$Aaz)Er|+TShk z9WF?;l)>?V+kJ#|UH~iR{~^=FkFYK!W&cHzl0&!Y99MJFBOabq^lJ=_eXyZu8N3rQMo^7W>H4N(tGonJoIhQzj@-( zSy5FP$To+_p9~2Ix?S==Jq7HGdg}b$#WEH3TBmG<0`WV*TeyS4)7Frm#_&6$aTjC+ zv&9_sQE-E7mNf^zQY+a#H^$s(+K5}yjnuU4I0S{b#PhdIzHd^Tx!1+G{F+=cx(N@v zv4dyaq0hxG*NBcb77u*YleKK{rFQ@sqv-mkEN&F}5WNz!2P90E0|Q*$9Z!a7y2n#q zXB+H=2SoEvZ7#M}>VS%!{YRuwJFNl{LrX`d5%v?f{e4sN^tIfCsiz$BMq=isHb5OZ z5z`+91z^Q`j92G^X(JYzaf`oeh_pK{*<{3tqSSE?vQ^Sj!_!7TE73a{EMaA2@8Ly< z2+Je}tZeF(MA-LU+p0;w8;G%J!K{G>^NNn^A~C*uh9LIQY2TY4Q_4CnLJ$(L=-Zg7 zW#9Ne{=W*qZRncH^AFyOkp2-x`+6|@aL?z~svoS4s?5b51I7FugKC=YJ*8&YvI?CS zsDr{@m=N21;D7H6phDzs-q`Lj5{R9&#?lioVQGjA%BgCXyij@r z@BMJqL>wVRBK>lS;s`SmOf=kq8chhIOhUdj=Sk&M1&(3Bu*8kx-{iq3z1{mM+4Lw5 zF=y5EG2km^Q^(qRz#y|A$#dMFJzO|twm_vZQt_@#*RH54YqcBtAni{~I4Rx|OnB9m zm3osi`4L1HVvajt7sp8jYYgrSxIr}Uf{Lt^)TXU0M=~)#h9=f3XxuaobNpq2p^>V5 zzg%~npm=i9Xrj(GfVtIRiJ?wO=wY_nu9x}LoaI8bS5t}^x?GYvIzbBKfko7o>wE^R z1iQ)>+0g|xr@;TCBc$x;KTyEicriaJKl~%XKqm?DIN#}5YE~R_-LXx6k9(FbK*-OX zb4|96#F_DVI0e5I2;2m`n>=bjkzPOq8}K>vB#N?s*PvMxd3w6%Q&lJQ0yVX>W(e2E zS79_0oO&JW+Q^HbT}7(GQR5GW@Jw!Pa)+E!b}G;qRRDzKTdLQo#ooS)+Shh!VkZ!w zvA*|MbOr28@kS_D{i{@l`OWm7Tg^%mn1wS&%w54=p$Vp{A`?)!h|d}JQ?Pyq-@&*= zkL=B!oo|a?PKaF$@DOfRE3(Nlfc@t$#9^1fsbe$|XpFP@q(+eq=;gFs12Y#rsoWT5T18dRz83Pf#wmK);4U=5B2)%N!g1M{@`rF?~dh12>M1GeevE4QjaqkR0imq$iuV z+pYh9{Mv^cv@rYBTzTOi<$KwXSd|8lr;Jo{3^K=!i&9dl82rADDuX1Pk#FwYjn|%o zXuZi6<0~Y+Z9|E}Ux-Isz<`zi0PCxe)Ukl%%xVXZANKX8YyA)yqut`Hhc6eweJUQ5 zBMMikn{W<-8ED5)GTYb`;*a596SiaLWe$L~GEec@hub2eb`l1H%-XXUvKR||a!fc6 zdhcL{@Pqfca8nu=u5x<9yj;cAhZ0A@xj+fN%8Xh~b);Nei+ljaf@8SMcUdZQE#$nH 
zphZ=CuaVx_WNmMEon-^0qCV=OP$Bp#Nu584MQMO8(+0SY39(^gl;OuH@xXK+?|D2I9K

UR}Qa*O9EgM4OB*#yd1J*Zppa&4wGv zeU=CyY{${{E8kCgKqVk#uDCD?w1q@$J%J2qQPprBSI%wHB7L`;6HgH^-y6H4$MAk|S3YxySbXCwco@%}Y zr*e?KDT*Ur{?fA~6&4``Zx1dC^nss>w_kT&!XSmRQ@U6B=YHv9^-~|HB+#bC8Gr7* zvVyC$koaK0#}1#z5b{)LAY%8#a$ZJnfQ-~*CG>SD^3e!JFK2t|gK|^xk|1I80$8O3 zH8;x=$}ylZR5r}PoLaycS9?H+1_6>aL>-DFy7OL-%7+~~=^GvU+~y71$CoHR(qER) z*ip|(%!?>Du)d{|5z+X+vYhWcns~QMQc4N_V`nJk3#DrWNFNE|!@|(lI+an7E+Rbz zWRtkxmjEMvX#`eeC9_MoYLO_bQkR%&y(b)J<0MPehVV4^7v|AGz3l)b#&)W@r(%Li z+S~6f4$n^k3xkrW@-rS7hLC`#onHBb{sNUtk@zRf10}?4Si8d5 zS;^gp))Na8C+o6bq`dERnNzZEwLqfTge7DqOESBy2AA~xs3K@+Dv?4Y%Sz$z)h|^B z#+<*V-CO=%WuF=rGDGY?MuSuq91(2w>7KO% zZB~u`I)B}())${;W`VGc&1lW=D$MrGdTTy9G$E09-;f9)ITEo>CTb|!t3N6++C3hf zf5+_Qy99leFToc()(%dSKMTGV=(|nShMRqjhHH|#r&Un@^j!69uq9tIax7WM_i_rW zFsOH|SQs;bH*UJuEL*?M4Hqb7hgp9WFa~1Aglz(-;8)Eb2_ZAh)<dECLAv4M_}+}2)IHL1K<9rb`wFP2y0%e8P!N$6l$J)ypj&Eaq;mkJySoGx z2^kusrMpYIrKE=eh7{@Uk-mrb6~FPj@vplUi=~Hi_I~!$`-#0j4;K6o^8nR(Byf3_ zK+Z6&-W8sH=y~{kpa3#6*Vd@DT#BgGW6mkV>7FsL`lhQLhHvvaTw##?Gw;2lhxz$l zF_p@v23Ao1!bYz)96lJl^aH(fK~ij7UATx>(R4=v+mS>5r9NagIh*gO&DhZGhdw+! 
zshZ2S%%O6>O315?YTLxM=|q`r+v68+8QMl;Ny(T*xd2cgYJH&Z=2pXJMnY>!4YVCB zvei{qPMtq5RJd7}91u#zPVWuPU(}%B7IPnIua?VDk?kklQs?7ASj+^u-O7AsS&@0y*{I+P?-8Lmwx$^gwJy1I~il z#V*BVsY%20{3)P}nB};YsFAX!rp8*JG$7|`^nZ|a^{?BE!nW*^4EIHmV7-Z2kHX0T zHZVx9Kiliw1ETE8_sMt{8mm*Tidte7yw;S>5HsMcYHk*%$*k_DVjB7ebfn}Cv=ILx2o zKm2|5FCs#2*PSo*zvhGYfD0hH01tfYell=SEO035w?Zd&yZLrZqGknLqVh~|Ucho0LOh$;U>AjT8J?OyS#rqGMaCo zJQ21Pdx1RqO;i*DQ)}y(yu2sJz&$gx*n+>lI66B^dwSM0Ym^9I-Qp6*Ws^)pON*w# zYx6gnTiF0odOHZ>UbF4hz!Cw?K&f74e!fiOZir16w$CroQp)b5zR%A@8-do4ULX`g zcYJ(Iy$9;O@E8dI{cuqf&1FzcPo6DW|WY<+!wN-8QeK(r-x3P>%K z0WAtE{EX%y*DCf3r_TV9A&87kij+X&S6_VqOf==lGGs9IxzW%e{>mu>f4zzO7!Wxs zR%&`*ve*r(zn#k|MxUkr zOMiF+uO4H3`+@u>PAQ<~a6Qa&9wPIOSTMj06*`P-fye+50n$|a8Ak|&wECNY@HYf;LhJ-V&U|ow&)kTJ2uJ}evc+(rwP~`8?4*;dPcB{1-`AwWk z09Hkh)g11>U=0Fb&CtPE`wOX}4*(sKN4bp*tM6wbKph;@-9Vm+TH@`TX}gZJJSsA4 z_{#uKzb+i>AhR}l6);nYI3qUln{VXb&M_>p^3Wr#zd7J)_R`oGd}OKjwZ*_8{U8%R z@RxOXg$X>=`n-p%xb)kBhataYzGEPv59bB2`ezQgJc@t86b-m*RGrVC%d+--ZOabf#!I(e|$T;}|u!*7` zZX+_FH&M?3Q?ZBtsSWu}$~OR{R7bX<^_SpL`K||LO zdpoy6C4BY}`PH;5Sj|}e_zQfSfNa!4am4^*1PFP?bE?cj{H5&3Lq2EJMV4& zg4G;=HSK6v(e5u;2>@6nYw)1AkO%Mw6a~A=%bC9f<=K@X*_XR<1L=E-Pgj(#iAqFA zqBQ!wSm3E`_kPV^^s@wJ(hpW5M}D&%;1tF0V0(F__50!ioWlNwjfLJHS$clpQF7mn zWlwpB{4kXecv$n1~*2LQ51yc_!&U+h2VqclcEhrC%*a!C6NrY8VQuez03kzsnT z4Pbw2-t#X&KXC(Z;<`xjJVsjm8#UlfV>SW$zi8TW1!xS-BC_smwgtot3-MrX|1XlP zU6JGh&mMVt%AM33Ex2+wk?Dc+Fb%*c&P_KyXM9V9jGwo{@2^~~t`A>`kssrbp~%U} z4G#`dYHDiE@4D(_-oYp3;Np@2gOfb;q@|@D6PYWKkblGX^Obj0Bh!!Z7f_TafJwEV zYre_I#1skmHgUX8c~MbOl6H2b{1?A|ZVdp(G=owGy@BD=_^f&-JU8amkoCWwj5zT@4^b0`drl=mMnO4%( zWu-VB?_2;H{bGO}PtVnLdiu8PF8w++m%gWq6DBFqT(h3aK))#6RdCny$~FD;=~K8^ zd(B_uMaKbH?j}*VGP2j^J9u{->gaK_0n|{OHpK!JNYWM-`Le*hULg!8-%SLmfu2N2 z{az2#uuJ<8KnPkE4pr~nLD4O*=;-K(*jW1NY3Bsb)5%%qVxSlE0lY_<7n%H&a~}b7 zMxj9(giQWY##hS9)D)gOzL*}m9-ADcP7q6=3&(c~NQ5Y9Ycri9wqt=tEnQ3LI-m)o z7pVASG&3`MLmds2m&|45l_SRCl2_-@Uxig8Qc~DZL0PST$zeEPgb5hc{oIieAl!Zz z@JDM`SFi3=;L7WgcsLQ0n7AR52Dn@`+g8x<5Q(1terM(J2`$O1-s|yzObFz@%O^Qy 
zARNp~#GPr9A(ME~_SZz{V*f{Ti_4<_V*rR@E&}a(|G1p2lx4z|ymX z;)6^T-7wJKw*a{1F+g83+N)bLe@Q6%dtE?p9DMUcK_=&?8@|B3d$p8Z5)XhYM-;c3 zF1;seo$~o!V1`(h<4Vr-wNiV8T47*dyitZun5F~nyNse@IO1s|lHmsIATBLH9>v&q zJ%5oW0}~2^Ndtnz57af3y>7cb0ol`omv1 zt-4aj>+TL!$n)T!-qfzkNlO#Cy0WbXDE~168d0`R8^sk;1i@F=OK}aG0`~_*BqdEF zaQ@3YQ%M6jO=GagxErX@ zRhB7}8Ris{wmPh&{y*w>(FdkxX5qZx{T027L%l^H-5xiN;3Q^bWRwE#>thD0ma9+k zDJZsEGg7_D|MCiNqKaI(OXjb8kX;MF-VIJntirh9C9mHV=r!qHZPU%n&Fxc%=!K4@ zu5RKn+MVy9K04$1%guP`1~^32+wt}izj26z;=8Auy$D@hdn;YVm_IzCM9cRAY+wGu zBU({ckv2QuA~U~SrhiT7!g4o#o|YQi+b2C%_QJj4@K?8VZeH$4I8grOvq;xYFKO&Q z$|c&Ab@-(Epq(){w~#0)uy@-YA0LlK25UpX-`$lf-(Yn&Qb~Ju&LvJwLg`IfzJ;1` zd`4#0*EfakWMoP9vb}{irjDm)=dmErrv*bNl8h`Gm)Lyz85oRpX>c66VH3XfE6Ol5 zCSmncqyTyKz&$N3?aYFDk@$FRlqX|rpA(WYt*FEcwyK=gx`>#zzs5`s!wFy2tmfuM zBt+&~ad&8HKPDrNVdvxwt}L%QI(pmFC#x`GUw#lz8(fJs3FqTokQI$gvWj${{N{COWGgyLORQm9QEscyPTC zalXfRyy-v2&TM-5lz;8;C#H8~WD={oE~kQ;Sxqd<=h5|XpUdvz#fjLNVMs|oc3Zlx zP~#lehvNzJ65%2pUcIG(c$h;0+Oydjg!K9IIrLROV;QllloMFDAW0dzQ#f&lroA z#~&$w>poOxiKg8*>3MG_Zry~22I`gXX}UZZ+q;-a7wA0Me>3x4%6??D*QjN~tZ08? zX0m{6+XFkM$LDOrj>olPDrMds0)FmOJLP$}_}pcQU~}rb-D#n+<8t!5y?1w#2X`1s zYz-D8cScDYlIefDvJHDYD#PfIA5 zDx8;W;Cqc{E2-Jj4ND4cGX^%b#<^1!?lrQ>Phn0Q+WPi+Wm=PC&d0lKKFh6X-KgH!4i)$~dwzkTX=Ne%pP3)IuI z_(75v$~1d=<|glE7N^J z#-%87F?h*zC3X>o(SvnEKqb$pk(RBY)apyMuC0%*y#LvuiA|*VKHDnBWR`Q%qPa85KY=A2X5sCh?km|d)j+jN6pDR%5((1vr|rWGx$Hj zB<1!j2VUFnytKCHST0L8_dF5=y!<>5*|x$0Pe{u-jVHqm7PSVZ_gcTX7!I^NZ|be! 
zVov4)Hy(8;NWXqr?wy963I=ti-zuO zy|OOsw6Nf;!e`OknqB%`p@lkYMDg@FBi2S8Rsl}A*Wu=7ncaxABNSX$rvB*i za1MVj5|lC;mM#U-_(eo&w2q#@!Ne9#qRtk{0pH!Io8C{?>$w%$^rb0tdddT_;1bx`al8X;jllzhp*RaUxw{!S6v+R|=G3Lm1bue8lOXBpt?&;=k@nvtr zUi*Hr8Q8-a`{V-}KX6x$4&Qdn*TMD~Q)*YscK_(pDT-M#N6p4N{m{3|RU~RXswre)Y@Gw%s{*|54Drx0|5W9n`zj3l01^b!th|)k595NV0ai z1v!7m+KXs5!Zhh3qX4tWjKWAHCvtKS2=WM)7iK1yk>HtUT%Jbr#CYD~jae+z#_O3)SoD z4tA_MiCH{7fvp#ZDR>^3$2oKxay|bhLW+5+>*kQ}xh~|hDk1udTxfhOMXOHBy2-8b z`)A&vkc09e9T=f5^!}dDi4S(&Q>PMOF2u2d4qNrKYqV?vdw&Re=@3R{r-Ipy-K;5x zZ5vPQ!}UXTL66{q2J1{&V!aKCFB4wQD=QdoCm+TZn!I;E=>bEWRlCeS=CYVmPotKz z%=jFFI^px{Gyn0K1XeBQFLK@$;F^`)J#4*;7nj?8+hwXL~XvnMwU zN!T?YTD*@Bw{>Hdxva^r2q^!AG0Nv+ZP{@5?2$El`HgIbQ(>V>y;^ zpW_qoR`L!cx#ElaNELQsTb7lFSN^k=F_|U#5&KJ#cZYTYsogiNFpC?@MMc{~_GXGi z{~@aC$@k;_%X2@-+LN&N*zeN8r;rtadU5a5)^#{5T z_8=j-_S12m0cH!=fidSg+#z-q%0`Sn=BDB+UD!%#t}AQ&Dx1Re6wu04QSf^=2I=@a+eS)YJi zscZL64cNn%*zdo-{(m9LEi@2XP26+wY`G!7rQW{k| ziyJ8CleEm8mIpV;E6Bx&J7m{!xLUeDpWTdAunxoG{h*d#Xt%JOXs`Jwx0Y^1Rx{7y zlsQpgx?1D7yEu8s>19=|lg3$MuKeC1*M6);93aQEn-q|9Oab@jlkkETayWwK=wy>`hAr*>3BLA{cf5B;elOQ>*FobtEc(OOvK z1Yti(Is*za&&-Wx{t%D|@idDd0`J5@jQiMuhW)oPT=S)2aUhd-1n(hACS)u2gN+KCsT~ev@>-H6j#+DcEb$9pMDvyqGvDY%5>biTC6Z_!2@^WyD9~_wR zPXC_5qx$^Cu+Ae+v}Cyn#PGVdV{JEJ*u;60W|p_UN*S*;#KgqU_XrePSt5MXYi|z_ zPBikBeSSG7iKtCzrRIHsbcx zU1rvjF=sd+N2~7Y+t#z`%7$A}kR9R$_M4vx)hgW=M^2BmH!$*Go zL_Nt}k7Cei>BOVxnQBs#dc@Cbp&|u@>c^|47r#TLlxt`ww?TyX6MkEmCJJ`zN9`R> zF4l)GY0gs`oApUKWc@Q7R`s1$9zMn6a4f>)UnDykpd=Pgn?*RqY#(nqf|P%aH}(8` z$P_Qu&rZTMzvYgi7n9XfW0fOY&EW>aqi$+veXa%N)3BM+575FFlXaRO<5{S@e~n{c zdDeT}x#XZ0P<@y5ChlofSE0e1?TV6S-U>Yrle+aTQctsGNCig*ox;QX+aHMCjcO<~ zG;AeABe`a#Sr;FSF5f@;shWJ-6n6jH>IvCQbNjsl(Oo(4Y_`(aF@_;Y;X0}&x`9Qz zD_{9!C0O%osT5K4c5$2i(-h@`$52}0t_Oq*QtnzVCe9YX74~In-ILlvw&d>^v$vnY zoU<*)jSi2B>fXE@v%}(=akGhKudX=s>iyD~qN0>>bKaE*Xw|LZ$fX0X=c=@Y^ub?R zBD5$cp@9bAr*W}Ki@X7aA)4CUT54M`k20r9lnG;5GPV)An6LV6uzO{kYCaNeA;b6p zAAiSiOvv}0C9rCmh+BmplW+4DS#W&j`(j}>QLFB2jkzp4Qc}+sRsOaUW>_}Zp2aZf 
zMOorhQLyh->d%LN05{6WtbLm;H>sg&m9)X<_;w*%iZ%OUiLlMPy=iw6DnRsv z1zbhuq7ixrQ+^x~@Q8_}O2wi(UcfQS@g&)JPckt9A&VSvx*e#+q+q*mr zw#DJVFjfL=zTnM(=eQ;iP8&#Zn;#HfmBg5Ht)fvrU|UuvB6>=S8DktAmqmNS169PgEYFh^8v)OE6uTH=e1tzl~4SzM$eYO6@~jfQIcye53|OaU2~ zuT|2|nOQ0Xxe1v*R)}0t`S`5$vLdmX=AEqQ^-Whqj75@*T9K3Wsdf>ItW8p?T2{3( zRJ+J&Mw9-tljpH^QHcCb%=&qu3)9$|>7rI=ev=XWv~bG$R;Y7TvUXy9d3kU2R;5bmjr!DR9wf_Dwe zA;uPZ#uiT1eXWw~Z>#MK-wB1WXv(oX-7~G3`LM<^RdV0e!ej}z%XoRaXLPS2{q9D5 z<;r21J>TIAl!*eCjQn@t7Z$x$nq(|dd878{Sf&k5k}^_uhm{_)DEJonNn}Yy({Qhi zOR^zw85*tgbb;t>@`Iufvl{1=iKw@})^V$5MM;|4CCic3A(}OuYjlGwOu#+&M1%p_ zHA@pkR#m0B;_oW5-Ud|)ceY~HIq#|!#k{flE+U92Pn>YRJy;#|Ir~F-t~<8JJ1h%z zX9bNKIhGS`fAi~PM9-Pu(v?RqHdy2qdgqjr+j^DmW%N6H^u&rRfI8* zj;#qr`jdD%T!@PQYzkgqrAUrsgwd`QBrz(WAdg6T^J52M6N+@{){FE;^V=AR`#G2p|whwQBx~==YNV4ZzC@Xkb^l_g6IkDz5Gaq^b$TWP^T} zaQKEZ?abN0dI!AEbp|l#Xlc8K(OxX%En+U#s>;OW6@mP(a~X3?`spE-Nm;6edv|SacB%XM3{&x1nLYlu-S9RxaTW;qTv%C>B^DOF(|cJDb`_KCl0iL-0h?a;g}TDYl8Z+OGtSaLm_D4RykF9_Pof z90(+i{TaUGD!X(_`{mkC{i7}@pDLjdCulFsd`ym5)^*c9B7d^}fYj?0aBT;s?e--F zocb|;<2g+Q&@9qyq@&1n1poZax9cjIeP$ts^_zdbR0E{8!EQuOIsYET56E&7uuI>) zrqR=@L^MWx=5ILs5+ ze7b1MOnHg?_<9$gMv6es?mBW~F7(lRutZw>M8hTxt;|~TB*W+wAAN;_IVCY5T4_I?D1v^^_s|)b?{yQZ%D-t*AQAwqqo-FQ^WSv@ zm;rzO+<2DAe~oG&y=hvaMEdb}e(8Tk1CX{q$9r|nP*1x=0L7i6mtpsy-7|PBfG}29N1U6>wQY@Uu^nwc zg?&S$_fH4UoDNUt3s_~I3LbfZG8(Fu^-MlqtU`qZ21Gj7lV2PalsW{3hDM^EVYjh1KC!qqc|IlRvRIhxoB}E1)dJ(!Zcm5r6x^#WKld8{VTBK8tw*HP zZNSJ0I0SVM1{kcc&9~qluI3ZaDJmhK4Ia1SJ}dhb4Y%T^?8^b7tuUd6Mfi@&dxbwa zkpHT@^BpG)$e8QmZPU_*XXcj55f8A-$S5ax&!#3tB*D#Kv}Bt}TW_3P6h5xknweyK zLm4*1WCriU-@kX2k_{qKkdDYOa;0Gr(2O{&bb`3IW!jE1e@-z-YFwHeA51$c9L~_2 zoaA*(sm=GvcD_IPY`ONsyW`bcu|_8_4C0iBKK+6`ikp$6Gt`W^QgF>Yfvlos&pG>F zQah*{f|aMGHBysO4OOw<)Ldq@PIs$_h|c_Z_{*x^AoNAjoRVFtV;7>tq@+epRVITg zqiGaGb|^(&MNDueu7)WuvMAY=iRUqCXErv9dL5EO=bHOjXpN#WOlTxuDwYut(#s5e zCXj-jAKkmV3@aM&?@?wcD$15;mjPSV2@3HZ)(kEBQ7PEh=bK!ZhJJ#%LmtIa<(gq^ znfxd)sql&6(N5sDE9kBa%#%JpElm}LYt^N6cdsR8Z9eAXQVjZ*C7D_koH#`@rq*h= 
zb%bcUb9!G&(=9X6%yR8ez1?qO#PAyw<44Zzlt0?(9w5!8gDRu}zon~``RP&loSSg4JRYjbRC)|vbrhpmGczv+xpx#KUK{pR^KBf`exV>Lb(PRLeCPFtMIsP)e8z_pu6(*B3y zkOo#L`{!m9B3 z7AyoX1)gWohzGHZ4}M`{0dN1zP<&atMP2?G29wXtHyhJ0nyjfW5YK})WNG=-%(uPm z-zv{oP-G43{VqTFw*O~q`x~s)yTQsdP0i^D5tm=&Pcm7(I1_nB82SoNT>LMk!VUsYUx>(MidnR`K+tjW5}PVGM@C zi@AIE?u*PEnlbP#%t*_-FC{3k#H$b<4XNR1V)UV7D_&aEmDwa2a-a=vt%VjW$Wkhf46qN?OpCI|9Ax{rC1pN}SG_GtI{@-VYz;}bnP zS?S1v#EbxMi6a;CA8-?^v=hgK4Y(q`_Do(3czZOzK`;SK>G4A=!#s!ETRKmtib{cy zrOzon+)|AAPgL!#FyK$%vizxddHBBB77fHn3kJ)jCo<=c&Z^i9uMTNVNmBJ zvLu8sm6>fY19@h)N1BYpQ9~@Xgh@E?1oq&| zHpgKXy?kVv{{AR##8eL;eT1R@sGKJOR;DPefJ=R)N zqbd4bvF92qg{e&A-Ep5h4O$zo48KPojC;hpc*qz>$zxHEN9avUW`Zp}jG5LD3I;mXvW+gZ2pi zI-Y6bc-0!nm{4rt(hiLmZC*-ihM6jZ;0)=~WZ`PyiMgzhPe)a6#@Zx+8>BQ1=oX|J z{X;Z)%&g@ek}y6j>T6Af$8%0J)<{&@ls^h5gipoGT*O+Xqy9A9nwRAgSyLjzjf}`z|MraP znKdt82jRP9A{htiiPBhL#ARs&xhQWAUdG_YzTu2|E(MSx!L!RR$ z$m|n^Hp2*gbXS24o!JcT6tK|>sKUJmI!dvp*5y6m+DbQbuOQF4sjHEl=p<;en9*ZO z*7xc;qu%qwL)(u;ut!#8o<<*;oS?#$Dv3D`77xWIbHZ+o=;R#y|S=y^T5~n z4ys1^r0`ivHh$%ze&0K&Gj}JC+vx+Cb&{!rW0!j@$-JV%c@FkVO&lZTL5TUwsNQuS z?m}v@$-@a;S5H%ybh%P4iwR>>o|RSObV4*0&)i``BXWItg|caPB7Jk&^qG#NRZ4q1iw4%r?lGZ@_6_j1|2b_a3( z%X__vn)Y%|CDR{Wl|w+aQt^dZE^mmkMEruG$4j1cDxAj44@wP^=|1eD$4~b(xEy5`2%f9ljc*rFY8>Ob*rm>PJ9sZpW&qa9Ms$~`y4UQV0pSAbE*=RJry#6utARp7PhxXA+3JiR? z%^_|+N9d*?R*3FdN~Dr|Nq5K^@fweYX>ua3^b$*dM1B!dROwG;+80b;AA|EDoba8_ z>cs}|>7t5Jh`fvY)^@Z|&F+&Lmc zrjAgXM-2%-C)8gE>9?f@EC6E|4cW9l^_$k3IIoY+CzI|HpdBcX9@JIT->Fg>Yf&?@+xe&6wwQYs&;3mGz^cr4&cE4;JG8X2NS4##ipQ2Z5XB54C zv$r3=yO>F)0AX{O3-lUYZr#Qrc!$_1$%Q_5$`~s2;gl!TOJ2^($HYDUGT`#{euZ8^ zPO()ii;a4(0XC__$OxA4W=;#t=9IOveKA0)62S<1H0f3otMn^|MZx8UjqIq`L_Lw#z{R-eCy zn{vfJ27?}daBxtDtfEtsm^m&qGlF9?WX4jt5hOo;6v%uP>V@sLrFyX%KW%7$MBjw8a150s4Mc7MA z{M9%NUfOh!p!c$C90wOuPPRNUGx`iHL zR$bsu9-~aBqwqPM=RdJx%XTD70Gj7Exi6b>>pNjCuZ-388{P8HaEHQ z3@0Z!%3M0*UU|NERhl=NdT&@`FXyXMnZ+o2O8uKX0qU4_~>cY2r5=F_G*e{hr>ZNFAD`)6kqW$ zUI(&TDFMN{Z77Ixy}{M{(wM49OdQ{z6mV4K5rIcbJ3*bxD7wyF*>8%;1|cho60K9{ zd4v@LpOl! 
zr&XLvpAYA3t4faDXom5{3z`%-C(~iCYbkrJ6Ee=b%F=gIiT4*lXlgw2cUw{4B z#`|@ayd5_@MDT8hqZ7RXd+*6?a##c3zzVMt%VQo;f;*e|SXIqd43*|-U_`T^D5ze= ztgF&XCWL=C*nl@v*ED&$SADrQcWsHeE5;D3{*9f5*;W)$3QKIE&1lyTuaL)&C1r_e zU3|QPn6=`GpnDh)zaFVSxy-9j$WzLF-0R%tIf_Dok9e}HwZT9j4IN!q{i8_3RX@!# z`&AS$KHX04J6=BC%JR}txp3q3hnDaPjeBf~UoLtj8a@pPb^NraA*I&KV;Yb=E;VQ5 zQ(Ba8NAutWgP`&K z`6z!KLQ;YAE;}=GbU9nc)x`<>)#q5D2MW^WmJoDH>$NZwZ)G}du2Iv^59^&WJlXjF z3fIJ3#Yc|U9Ja3GBc{F-QX9^q%S+MCoy}q0*`I6PwhA?@wd5Wnki$bMIYifcMl>azi1jS2 zjnjRNix~-$34(~aY6=JH5v_y9ki|Zc*uYx4iC>KAPHVM}rIlrVHDm%)4A>VtVNqnm zxSwmQqd^B%k+Tn3>;`vt_rDin$HqDRUHW%*g66wuM5+ zA1mkQ+JjW*7YnvjrXOEXlZn`AN@V#eA zh)x$t(qqgRxtLGi2_9vplZcc~5cm?_CLIT});Y%Wp3X>Mx9`PdP=hR?;e~oHbZ^mA zIiRrgSt&W3e5!A%R9VaO$|~e{k^J6Jwz~Gsv`Zt+sw1lybIEx@|J~WbEOt*|I!l_1 z@zY?z;)2xMAfwEXvU4yNgq$sQ2?q`2Kxov{n#ohk^Y_Ep{({CAVB&_*Hxb6x2&JxqpR8)z7_||MQ*q*mq1lG+85vwUd8W23`=(2 zES=$TtF>#RPd!Khmt{ex=yl0Cj~ao7rM^PQX9kWN>N?(2q-D^;{3MtUHCL}$t#)QPb)3j_d z!X-WI#k1qHYR3iI2Ec?*-v5p~Py>-))Aza>^4DIh9}`aATvuyw{0P?GOqM9Gs+RKi zP}f~uj`jD+j#8-vknV6xtZu526<& zyW2ghXA~Lwb2D@0Amjy5r_8)*4)hKaF^f25V}jM3^iu1SQdg|d@%9i|zKZ3H#N2WY z7m;#^Tm_GIhH3h(SREd@GJ8Lg9X!%QYb;q+`dV)SYKVLF-T#EVMd$%-&Z{kb9Tdi; z0dSU6nkqiVnHM`f87f6RqIX#F8HiYw9xBtxlCgBu`b#rZWqniQsmRumUqg?8=Y4zS zDF%%(vmlj=(_~)deGnJ&CR5+{P7!j~Y|%^9`4b<$CT**jki z?Hgs2br&SUH3B;6tbs~a1Z0)h&_px%d-h8l00YPDAHy0A^J>HPl>T(+4FZjHIYf76 z{}jHTV_r>LXdR{8X|XW3Wsk)MKe^m_IF%5F!)Eq~JdV@ilPuJ1M5L4h%2ijHX&I6m zvvl+iNqDT48fD`-we0ssXuBgHRxaay588GwR(U6Z1@Y5dL1zdC*36*O_Zj##?0NsU z)KgN5*oraz$F+HU#SvL~FUQ7nhbo=r*9d?bpBb6(u&2^W!~D;^jvgiz6?N6|<>*<( zHzy^8N~(%CR8%>lR&JQGSptq;9Czg_d(7dENLgFMu@S#?FfnQAArR!I;$EAk540DEh;#OlnKa$E&iZFMM_3)t1|RY63F)luukGFrsn0}!Tf((zCR$IhaTb9`3tUXp4XPIsU6PtT_P4+T7hXqT#+<>@uqWRA(`X z*g7^!BFO!y{nCBm5+w+HQZ((Tz3W7juP1pYD zG99vdf@t>_)|d#T{z*NZxhX1-QUVdVu2cc~%$m}^lxoO|0S3e$v4=;RwB5Acpfpjr zIesh@u=@|Hf7GI-@o?|StB%2DSd(gJ!nc6}9I9MKS=D8W*6S?MLvH}3KbmesuR{uH z!gJF=D6$vQAixn4NPA zCL>fI8vy@=gk&syw5l9V!&AdlCSI;|>pq~UPmBi>Z8Fy~tp5vGz5gZgt6SHUMe!XO 
zby5Ork0~pYyf$oloR|#wf{#`OPS3qw0<%4AQM_dR0ha-0V{aKcrmOl0h)B+54DMOO z5GiS8MOAmXkE>|a#uWYH163-M0dY`@&M9uqR$gYyopiB3)1m_mCEEP6@EYmOuK+ks zh-kRiW@&XiFd+T0blkkN?MGKi;BfZ2UlUjSyYz2Mgf)oN?1dWjnV5!;J)REK$4BZy zOeu9gm*(CnQvKmpQ~;BY=XTZp%$5Vpc8&=s`E9(S27pmz@JW+w-gsP<;hoq#YMGU> zCGb?1{?o~J8|dsKmh?Zr2JUUBu=<$hIo1yc<=*j7$12|Q^n8+u;2T_i=@&nMAzU!V zbp6uzGXb!swFXP)crIiYo-*Pm0=#*+PAK(Jufr&m}1Rhtgp{6i)d0 z#-BDDPaqZQ`LuN>VvgZn7tt?t00_JP35YYF8#iuTUP@3IVqYJ%w$X6MRcFj;`xHcS zFoSv1uK^)19hJkXR7I0N5yMiGu|kJ{vw3e%AbJtgh&|fb-a|Q0{7$Kb@t9y80*-uT za8x_h5ejFC?@Gh1cdPHJVL)ci$IwmHGs|`P&70Yx^v6-l*nSiC* zva6uGCe^6$ITv)O$1{Pbq8%i1wRS5F&zBr!xk<_gW3OyR>8*Y9Iol4-DJ;<&acj^e znj$}!-xPP%prN8Rkl-IU`slFRg)dIsC`MKa@AD&N>CkWRL>uWM7R zezGcTQ|!%pPjPslqy#7YtLcs_*V}6-Q94(ZH15i_yJ6PDEQEs@4`!+67w4@4NKAdXB3KV0<#m*(8#+@SrjebdW_vdspD?cHk z70#v+$?aSelXjWD9!Qu(Lw~PGD(9=HPbB#k_1~W?{+mJRbN6vO6m8wGn}|DxY^iQ^ zgtOsW#pTU+{aEwsn?Wf?za?YHk2hbIQGJ2t!_W~9%1{hFP+nBj(D-`rf+QTbF(P;# z{c`}fQuvCa<_D+poe&8X6{<9rK&nuGY2F~}%K;v3sXr4^028iqE!7*~QQrLL$Jc^V z*v=7akZuPzC>^62vl+w)@qDaAdi;DSCie-1SX<$T{UIy&w3=ZSYjb@Z;#kt}g70o7 zcNv`6E_dygqo#$N3ba)do03tmsmbD26C;F<4zkhjTv3tIc^s4^DbFCL27hy`i6R3X zl;jCY_hsxP`!lzh{M&icc!e7?f*4^p{`qMhM}I{c5~BPClE!_Ppxc5E8^8xw5D@c& zKUeKg6yMC|);f{-%3sLP1rNfKX}Tq+uw;Xo$!)%!*F5xM;7jKEy|mUy+mSp9N&%x} z1+IUpnTRtkyMb55HvO9OR|5 zgHBp3iDp;c<^<*cAjQS~kkf@);S`g)Fh7E4R}p~=x}{Phdc9C#lnY>`$IGp1*Oa)4 z_fP~N1v*5^=cOI{Y5=I=iVw;E!8_g)VDNR{&YvO{o#O13v1z4;>^iJWR8(oq1chnC-u31H94 z+ZXDST_0@^f36xnKBe|VHZ-+O3oCRia3u#dgNCqeYeiUdlOjC%EAPsDm$A#7klrPZ z2X>N}krlPOxb2~hg4brS>rQgvIool~c7$J=8EX{-Y>(His#rG( zeQv(SPnWV9jb8CEj!3F?%xatu%+e*RMd4_-uVdBx%9^N}`ASU6~*O_B|fs?!2 z8vgASRwmJ{8U_LPeF}%<>CV@fxnjJf*K_M5;52_&&VL zzQnq(&N9#3@n>C(%UM+~xNT4L=8hbt$!htQC-vJvpFh25zk&o&I*8?w8Y%a2v8kyLO>=9B z_mEyHrALEUOS|Mk;2o=^MQ*#TYSUF-YO^CeMu9Alt_gV$)OXDiHyv`YI8kN$g z!~|7ak_ei42H6Xch3u8OE8Q2u&8?tX!3CvH5zhVrnbMb03XpW^`bPGyeAi^cNC6;|EksMp!Vm)8c^O?uRRe(NF?jMzh`myv4|r8b-tU zoX#1)0r@lhc;5TLQ0e9@qgjwQ4DRdg?5Hj!loILaL_65^IYI5bS*-gW>CSO~#Hf@; 
zjo<^4YGW!vLHl2mBlCx*^EnOqC{A)9+MW%~cJbwxnToIbjcM8sNAClgRE(6Y-S@>M zzS(Z?J5~t0Dp=hGd*eR+lry0tAlA=)o^bTgmn9mAbz)j3s(R)9PhcEp|YLIo&mdbyP~T`ML;^m=YaG^ zZxM4+)p&1Cr$iA$4d#w;`8?dJA>d?v^#^P5p=k@lHaH~FA@gV2icv0+VyTPq@!lPy~$-IH}}qjj~nDT4ec?dfSzSRi#>I$0;}1F+MG zZmt3Jtx@@9yI3m?EkW~-jR;~;wq3zN?24UQ(9Y+VfGLfSgAKKZ!z#`3V(tu_C3K1L zJ&%kr__1*(Jcj*FIPU*q?ybY3T)#EoZ4)BWf`rV_CEbF+zznGj-QC>{8x>)Q!J!+0 zp`?_M4rxgTkPaQXr3D1Omwk@xed;^EbAA7O|LS$UJh7g&)_t#gtszG#bSa;-mQQL) z!{=e(C_|rRHEQud@+yLUVwN#6=lo~l?m{2uKH7W}@Vd&eV8(3s-wLPT<+Mm#orIDe z5Q)!2GqBx~iN;kxvJ;c{J1lI%AFh~wJuR>u$4|^BxRtCYOU?PzUny^-q7dpZOh58* zL%L=}JSDl}i8;1=wVVo}VR(w0GI<1M-eg)Z=b*^qdB`S~GBr5wI-~c)yErU5L-}fB zQ)Zz9h#b)O$@<6LHBd9p^BI^jM}jZ>LxGb7lVlgh@d+Xl?IHC{!>VdxXTPG%c)9e3 zu$E>CCHb9?&8M@mKbOqJZX3FV{==(Yzx1lycpEpaylN%jRTqL*SpVy_p)sQAaJrR| z4vOc)yC{YNd>II&VT33=bZVF$f6oyiuol1X{GFwzRtVVIx<}TquGsx+xrlvrOIbPe zr}?VZ^{V7RTfem2BrJ^iE6$WP76$I+AKHL_y+f>1`LJZBE2KhpNv8N$dol|{^n1^G zhnPv{8yy&&$O)z2uIB0WGr(n@$o~-fy2|Lzyr9o^92PMX~Q)1K0!|`yu}R z3qPv!JJvBBMCQL}R?0`a{e@q?P9WYNPmz7#b9Wy7)8B-Y>DOa0r%!1gBG=N8<>Hgx zKhVRp`1b%gYUKy?&#utceRIGrrEU-OUPT@K%$v4rsn5s0@P6?jlSAOTz?4-Wa=JZF zPrbc>(;@k9PDeG!iF!aq+kwD(wFW1@;-LvBbSTEEcdGII#1unzSIEyA4lr&j{>i?cr~KXY=k%K*&soW*({hHUMtR z>&-qp4!_nqylx|-KSE6V%a6zF0!6lJe9}2IZL_m>oPej9tO}&3#KoF`#~A`{Ua69C z1Y4j5QfM%&MhFhA(BiU5p&>M@c^D=tA5vOco3-gAImrv$*TO?htI>GiGKiE-GYCQ& z5(0fYw$8J;v$AawSLb^*7a1YHjw(492Y>_}rh($K|QC6bN>LUaf)u~wP zpMCRoSN?HT0SK0LNb&8fMAKORX5gS89n%^q~uQOI8#Fn{8nPi4KJqy9e*m} zA8ca44zB$-JLnE7>l~xH>%!kedrYhB!td#_py;##APF<84)7I5bF>GHKjXXRDV@G|QFL(}|Rd~+MAXVUtzuiOWC48x> zm}kEFcpeepFuvUr{4aC~Wr40HyT^V#nT z&m{2!>x_Lpx{UyQjVZ1!7zh!ze}xF?zi^S|&8B!P%MT?Vx!eB;1Y#0?C=lNfAD>eL_nAo<>;(LexLOA4?=zOW{L93YAFnXu zo0dnw@?IL>G)cs=;Jks>-UJSY`v1oVQ)cC2UM==xXdwE-#WgTcqku(cIzf`~kA$h3(m<-lxPRk?g#V<^_o43E;| zBch`d!MX_TJ~ShP-m?nr3$+|ytkzo!9b1=z9KA770HT?zY3sj`j1MpOI3JpJJT(HcgE#T0@E2(ur2X9; zRW@M2LI1J(h9*3RUiiiN3xHD*0gMCyBIL*; zD8OQ&nfhZSrNe%lhT@PyTLAuI;Eq#SZ(nF#o*58p|FC-o)~97BHg$W0Dt~(3Kl>*H 
z1?5RF$Fa^7;n4w*>%;WM=g7&|Ovx33wWSZb2CFUbmyKtOTh9}JDgO?SPp_-J0DMr~ zgWH~0e%$yS$;!sZ>?w}J{R8#HR1SEbs34OrIV1GK?hvyT1thVTiMsxauL?jsN%_lN zy;7jIL8JOHFUr@VpBr3IZj@R=pq5DmF^+yOVT?i?L_0|UUUcQv42w(N)aqV&q68$s zhrY%Y)!C)1EL`eMQM6Wkz^_m4f{ae)@bc?Ph)l>AtV(304sWCL`$8Cdgic;zSJnWM zazi?x*JVVIR?eVTm2!eR`37!Sl!~m@GtLHQFpnUG8{Rh2ter9`9H1*zf`c8(+fn^n zEs9Q?tya)yId!F5jl?X%35>&`sRINEAKO86{bOG#z7v8>}ZBBLa;VWWy%L&MxLr} zDTDC0croJVTc*4KWHN)wxZ!e*1xD_k2%$*%=xkW`-Y{WgpMa2}b!?a+AL0)Q;tLj~ ztqs|mu*kuKRi+LMz(}ooL{PtLOICcGm34r|sTDM5cr4@EdY1@lLmcA%sSmeko_J{? z=rc}796DLT<}R}$wT^?Q_}K>gcxhCc#h9K z4J#|9q!T;5y7dY~L>YgkYMpx_L{pYYXk=aZ>x73+H2XJ~-(M_5f8RL_eMT60zrfzSi05bv8HFtjw?r3xR2;miQR`k${FTw+pDwm_Ggr4!Nn%v3lV)A|fIp&wO~f@zs@k>AqA|ob`)yZyXuA zxMJ$$YTk^MWyeCai>VByS=SJ{tYuxN??CbqlthKw#2&bwg0Vkud z@76N0!w3h*@Ed9&s4XR9vaVdpGtb_?E4UBISc&er@Mt@q>I(DwUkBgtq&6!>KG=N1(E}ec{^028*iN=FK`GpPrs}m zgiZIv*NK7ne%ZV5Bx{Q=n*bhAfY|;AJt;INA_rNDH=;mO7@L?uCN;(s^aT}G5T>ps zY0C|rc^1~65L8cL;)rR?JI8!GV*OXrd+#lRUtQaC4}>Kqi9-n3DNLu9G-{SdqaqTke=<=Qyu-E{-=j+@P)TWADKHk=(KY@UySyt1?0W_e=e6@h4u` z*8WLI4lpos#LR+NCkBLFp+}4)hiVhua`Yn96?>dhcuRVZVGec$`}SDy`P&K4UnaL)`XmxbSnF0CX7}qr8teaWfDG_I2JqN;jetLU zUI)O3^9cCL9pzLy@c;t9b;}y+(RMBFYa`=z6Z&x1>IaCoted|aVp)-CTrkRX%9Dq>VqDn1MJ@D_YGfs6+Y9U|BQew9$Rg#m765WOPzp83 zQUz*})4P*(17Y3~tuK-ZlUGU~DB@A+sbn=B3$N)OWw9)CkuJ}f!mrE8b>8c|dzlmb z{5y_P0W4oi=gUF8D}VGr49I6cRN%FvfJE)B_vZ3tc>wCN&6Mq00U24~r?t))s!{;#3A*Q_M{gg|m@VJ>lT+$FchVx~Jfk>JkJ*e^H? 
z`9=7&(YR<~LA?q{&Ow;^{sZ~A`-v7q zluNeeDLA4*0Cy_MW)J?2!;+AfAz{8?FXQi!pbX@6)7?z_z~$h6PyG=JM0%osD@g*8 z{(RQEkybd3CT4f@1?;69I9k1vfwAvt%4jueB|Yp%!Fk)+d4{^n{xO-KU*iwhJ}NJ7 zy;Ad&rIQo{!jCoORqCl6Us1pNELE_kf)uJn0X=Krr|P$KB1_gr()v|E+z+0vLm4|L z9*hzZiPHJo8LTd|B2QR%uNbQqcL4J6&shDjeT4*!m4HojzG5}Lqe^}2?xH|?7?6ho z8@+-sCn)5U|h)a6G~=8NWwo`IRdnh0g;g??1w6?wQM+ zshVi{EarFJLZ1XUMY~qde!6VoI!pI!$B%$z$^8XKiCtY5n)T*w2@-^2kPErVB^v@yU0tlt@zi=VL|@+ zA1Jn;cLYBD#ZKU=&2dGAk`5LJKazf@^1IS(ou$D4I$ zzT{BPP?ECd>W85!X)+FG6$wjJhk(<=p?WBQf>p_Uv7B; z)aQSy{9SS^{6o_??xzPFNMFYG?nTeG`JGlV;c3_-9T1p^jEvG+U!3mu(E_RECAabF z$4=YTu?&X*bw+1s3V^X2LXD!Z*1|-9S%3lnLJFW)`lFs!8x3MvzHoR;{7y!%5hq=e zQ?bfP%U3S6hW@V=KKP$kDAu9+7P0Gp^>-P#c>&u&P5ihL0szF&qaP`xRe@)XjW@~! zO8T;EWwjgaB?L499%5!(Mi3eBQ11Xg@*h1A0K zePEjt%JaEI-m33ugV%MaZK85U4+om4pfTl+&QUp$Bj%&Rc%uwhk1IIQsRS5cUxD|d z->1~#&g8rWXXHxIB7B5HNGKy`i|1NC#0`Gz_cowGESMB(SazpQI9C!#Dsc#-Yb>(_ zW>0^3`8zoo83Dte_Zj`a-KnWFub*v4%@$2@+&%<}bLjtszD^1#>wDRB{MIbz3?m{& ze&{RSfr3A%y9KANo#EO`d*{3rq#0_+SyQ%8`Sm#4L%UK^)Tpyg^^%J4C{^08x!g=Z zP7EJ~s?OA(NUrt!q;?6MEiuOxumAVDv9_Q6%7=-veGPK|s<$2hJHoT;R-*T|9mVz2 z)4ejgv4V^y-JyG+PsY~vHis&Pp5%dNX#ak0)(3I~?q`_1iVWOzH;FvAOvq)#VH4sI z5Kk)M1r}tw=TjT<`mAao0ueYszo9pkf|-~JHY7qE`^bhxMA4D>H41j1wsCI4sWrap z%jJvbV|YGvEA`5ii$BboPSIPlXsT>++?Mk6Y&t%FU>C%K1iXl-7L)HZV9oZPch2_? 
z2;VP%kw-zJ{5yr>kaFRJ3 zJcq(X3!k{p)z|3zTq|T<)vj|E2#;7gHslWD%paib2vM~9GCdhq#a<1}U-B?(61}8{ zl9+pM5;kOXpft{ZEUa1?eHt*h<6K&-+m0c7@GpbI^_MMgJ!pr&7cuMCp9I{xME$*4 zIezaix|x4&{D6A@#Gh+Xez^72SCuW8TB5 z{n48SJ5OBdTeYrC9d9U%FuM58ys~E7wV{K*({2`IGH!Gr$GrMy-D}&8^NlCe5+5nJ zs!8U4t$A@5SxaE&NxIx+ftRQRJQ2y$eo52qPlV6)g(<|g{7LN~qpw{3oW_cNdJj(3 z=zGu_oegE0bs0n>YXP1d;J!0BNW|Y@tzZ0c?DDstd2k1C5iTv*tShfEocL$k_c&|f zF|t2Qqd>XjFf_WT0ut3o+Z|7`zo|-1z&vyWkROtXA3g%nD}H1=`m#0>js+s|MIVko zdxz?`@4AootR6WS2+nz8B<*{4tBuRH^`gl?xXRkHKL{9f8{UfDn` zQt`0eo1V#j>oKBIPg)gD-)$Vs9?(`bf7s2L`WBYcN?Gxv<*=*YrH4se{JKjkm+G2W zM;Bv{2&li7pEC(rlA&n#v?G8Pkvldds?ozDCy|(*jGy0k!(q(ltvx4s#A@pB>Kl>o zJYx!RwQmw)f}M#{4&YinM(wKh_Gu#3zznqs2iTkQ73!hE*C8RPewYhxfSBuO8MxH> zY;GlFW^k=cV>p>~Gt?-xXv}7!-+UM=d5m2N=u6`ngxns;tRIOiYocHrl$mpjv(9h} z6_3yqv9j~y!~f-n+d0rVu^3qQWfXHg-RkUY$fxY|Os}8=)iwWW*dhLSnr$#%S-6IX$Pu;u zM*G+ik+{sLlTo{JzIAfec=laid>4Y6J1QM5VC{OxULx#LK;YD*eJAzn{1JHaadp?V z)EteBWiHBx`H?2*eNMSf29=Y-xy6$J?1fS6-zCDh?RmQkvs=yS^(s_dYykEt>N{3x z(@FO8d(x`CB!eEfb9QgkYNv;q*8fyB!CCy^Y(l$HBEWg?)lu?0$5tGl>%SjF@S{tC z0mBRv{WW+n(II(I4Cy`MkqztL*;?N6SqNH(OoYn>iZ2LXY$rf=6Y1FXn|H#0SQ5vr5w*Q#a)7S3qtC0RjpG)UQV+Fe%J#R-C*Z z;j`XHK56ydRJ9Yg%72hw(~H)4HA(5##GzAZSiQ#7$7T2@^#q+fwFtGK+Nt%$_vn7C zr<||gN5>|`6Co>OlK{4`92I#nEiMmhl|)*;qiM`Uc-(?JDNd1yYjlUKo2oYVROG%f zpeRcT{tDV$lPynAtmD~n*@4g5Tub24XQZa+kq*q(8Ncuz6k{n8kM?QRoLet*%Mo~o zq@D}KD}06~wmsb>%&IE+964sy${|$rbdAL#r}!5&Rf;V^+plS94x@7N)G-GV1Ze~l z(TnqTlw7yXNy`vEBTM3mTEucmORG=sd*}0AdR77el+b+x9`9Ohn!^LdbBd~ztnqGR0j~}&YsE6ZMAcJvSn*uNTQm-xaUg}ZrPWxop#ra z?j-Yh1h}XKwu_0E8U+vUL*h^kQbj(B89B@AOVYwv~7PFGfuk&_P!p3Nd!52nRh!50fHs5iM z*R67m2&jM8{u~@k1+~+(7j#`I?@r2LmXL9?dr9=zPSa7a$`GzLbcftah7(DZy6(K{ z%6gCbydCx+>X*W^-Z+Z@fcgCV021b`bUwidur@veZH!f%AO(PInR!U`yl`Jxtv18o?_on6ov6}FUnH1W4Q5g_>(SUEvawdvMI z2ya3%(~I1o7l0@rwr1Cl!3C{q9RD@Zd1K+4K8KjyO%=FlIO5(A>}^2-OPf|ELK?+I zRhy4pnQwHGCxyR23E6-i&Nx>4Al1p2BlZp0-LMIqM{f<+G}1|fPbUZF%oc|T@@R)Z zr~sV@=vmIbeSp)|6{bX|A%RTt)TSuhJmIPwZT}|NhGTV6( 
zzmyzsF+X<4mO5bZlVqEN(DwafcPd8LjM#j_oTp0(14Z$?S^~DJIr%5m&kvqhf1-tSet5 z#9#~<1KePwrmy#zr8_&=Q83R1%i^bjqoQagGQ|ay6OC|qPEiCnvHlL{CdKnHcT$0< zQ01E`augjM=+pv!vI-v6&n#Tyuc!MonDA_z(YXVFZYsCsA5zGLi#oJ+Ywx8Vrfr6 zQCmFaIfc2j%qldzSnJi+Dkh-6pKAa{IJ8V%daHj4$JWIOp`?^@K<#_n(Alz{ zhMzxgfaEa_dNcU+0&IxfM<*99N*bp}NZdeX2g4bsw z*ol(JXXzI%LS57;8_FM_75+yQZ7i9++vfSj^S_LLDpPpqLQ5fMNMiCVfs#&{D4$55 z3X#CVise#-B#)*JjYJzYEcY%wiFap1N=%>C5mD}0Jr){7$?d2kRxrzz_4y=vqjQKZ zx^p+gVIf7Zn>NEX-)`~TE9G^^_~WV`bc>qK>F)X4ClXe%z!{u!eaHoEWhosH}E=ZNRwV1qyXj}PXI$7ZW7 zvKom9r%wf@D|haZNM#3%b)2|SJR=y1!@!DfBiP_B>Qm26&;w(%J_><@+*JZ9=}(y8 zPK3ye>CijXlEJwVn#t}jAAc55{}`o3O&0bCKzo2k5zY$aQfQBY^G9Yyt9@~cW3Ce8<|VYqs;;uX2QJkBlQ1s+34)n(x4#| zvF*dHnnipNa+dl&)GdM{)jhJy!qIl7BA#~)2+nq(2UX2&m#$e^(W?pTpf0o(>m(%v z5a{83m9p?^c4)S@I0%mP(53VS0E&u|QZv0>ACcCc>)0%c>Tj0ZXd#h&Q`H(B$+2q=~?NYHNA{rw3xYzprz5h2*-BMk+ce4ncyl6V2DJK zqT9*8^KRqmOE(SBsEyG^ZH0VL6uYWiw8Rf(`NvueL{hAI+W?(X&N5>5?fz8gJzL4X)o<-wn2LuTigTB!1?kRPV-R2XM)$T_{~ z*wte!(WC`2q|g4~1YSQnidbZSefeAizS|gGGY%ms$vH6Qmwl@>tRkNxQbLbrJ>e*S zQ)<)6hBkZ?4pw#%DzeT%-#wLl6Q9i9C93>{Uimm5qe#`C74cIy2euezZt=%TKn#e~0B%e33K z>GB=lD`*13PDatsV8ZHzRhqp)Q-n4VWR3S2_}b=olejQIjL!%(aou4y{aQ1*zkuD&N$jwn-@;^E|&s!Kg9vOGI&}q%2Tt;rZm-mHRqCLD)#g z@wL2>S2z0IX*+~Ez@{s<#-?qP*e3>zZ}z>QsI0U3$=*dWPFM%NmV#(SsIo!YoWi@h z0h7YD%G*7Q;d>xj295M?w)0p!=&SS@)|*()!V%f6^`XkR;}NhOD?Qgf&l zKVJzzP&?KJWKrWfwdOfG?+y8bYKaUdo4@M;GF~~W!UrcE+voD2CQrq7A-8s z;y?3qxGZQ8_5v_eRW~Ve`6?#mGLCcR)P1WIu4l`!J$RR;s>ft>E)1pxxh0<=$7f5& znZ}>eEQmX*$g1s|rqHN|!Y}*c&nHZ+dta%#R4#uz-HN#M-0zbl_tnh1)fa! 
z<r*=%i)+lR1*?0c>^{$VS?5>h5C3X2`5C%(ktbdjVJ=~r%oWgJ z#TN2<_O}iLeuUKBQPc<|yW&`HQ(xFi#>~oeCY!g4E5nFfOMUjO<#T4s z(mN@I>Ff4}$3YR)J+ui(m2v!EP2XrP+zhr&068ar_R7kH)$`#7t)_gj4?e3GW+3Ei zyojbiTcg+c*GB|r9$eNU%O_KmUQM7Xp7k~hxUlq1MAn8J!Q4@3EOq5UkXJN6Ga&H& zgys63x3*MX3VcgKGlic0RxojEUjIZ6rTVmYc<1qa@yHFfG@hxEyFCLvu^|_bVT-b{ z8)aQ~Amdi`hVHeM@AF!=)KC29gx<>n=J4@>3j-_me3K%`DLal_5wCdD>kyGM*Z5N{ zw$=WVe!SWoPsp|(8aa52NXm~P_p4Q?#4ex$hz?-0DNVp`P0{kBWG3mK7^Exmhzd#gDRANVmOsgXz9|Zqg-?7S_d_n^(x9#_ z{3}h&c5GU)zdXX5WVCqPeyLdG)B6iRRwnQs;+`~0=9Yn}{hNy69)DGONAU0k$AiXhd83Gq3c7^2Kl$zmuciHfDTpZ(70d+$ z6yk@v0{LT0$5jO>_`hu>1w3KAmf?Myie7X{xpub8a>lDVZZaQMm;~bT3dbC zmCU;oa*j55_qk9I7S5 z>Dq%1%%)I~R5o@LMmnUzUoF`E3)TtH0NkkPRU5X<*IbpM3e;7e`t1|tTB>8NFovf$ z6Z(&-0RAa8&Rus~ZhfM%np8trJE4O~f8ududu*mxNEP$aoT`TDDn?``(fxl!nM7K58=-HC5UO$vV1pZXA zy&ok@ZF5FLMoQ76qF55IByV#-AfJ`Mtja4`VXg1F#H~9;bpwRdjQB&R(Vqkl-!d5| zs#?MuGxDU)XE>b+E1kofZBE}1gr6=v^sU6ue}7ev$@kqp)&(V`$FPRAv{p~ZyD?m8 z;uQ2F24SpSZyNz|uz#sB)DfjYU83J4wI<##h<{GEJ;RF9uC4K#y79%WZ~@a@GB$hO zh*E#Z2)Fi*$`P0+g|g)|v0}`>m)DeDH%)}3Su02=>l8>owOidkTMUp47!|IQ#`a`H z3!nVrj2EOjP*c-FR>pB4+Wr1;TM+rm|02cnIXFgPfVMSVZ<|7@a-utTCk(6@*U>#{ zBza`Ur==O@J>oLFg4pd*^wwBXP!13=MGLAvYR70b?=L?uw6jf+pBpq_L#60eKQyUQ zSSlTe|AZ0)x`GWxMMwB)TXWwXeX)2f&0>cxR&+K7j+M_FyU|nLgw>o8&i4SyQLV2 zWFH?$fr3M$D8H9}_4B0i2r3U>?R#uID&pPbE>8ql`BXp; z^i<;K0)bku9Jr~RZ&!(pJF&}R?YV&9S9zi%u41EF0b?wS7@rZSpicQ6Z^`EodYc0P zXi5}omm3ac_Y1O)oY38oyJ9S3jy8^C;azJoU~l&f2Ijn!l&4fSXpm;dO2Co7PWJ7@ zB=Ap;u19&LWy7; zoZg%~k4lQYcpH`y#Y9zjacc2)FLN8vSpzEbZs97#f@m0@MOn+@CNbh+6uQQr2x>TP zK(pc*N6+k$gHt*MvE(4bP2z2cAcab!yVg4QnhRS#obA#~ukD&FID1|y-q_YWb3QJr z0m|+OVWn7{QghTfFP*r-(p*vqA06rE8rkfdYJGHn+?4>z&Ht+k_tIX(@*=|X8B?TVXL8#tfWRPGTE--PyA;gQO29~7(b^8x?89Y+uqzcUo3+fo~2c0NCR6#)c z{H8rz_^TQMi%jteJ@^LJV>qG7A#Mw0&77_u-`|e_CrONsYDlj)jC8m9E9%9*KMK1W z#_P(d4EN;JdEWyuf#6iI)n25-V|y&Cd(H-1o)BM4NMk76d?n?f90Dr<%WbfF#aoG5 z>b9-8uezkwJHtD36}RfgAcgIaEtlY-8Bhkw+xHe2Ri6+GiXGa#O@zriH6}rbC{BG_nSJJiz1+ zimh~F8!XRi7e`D6hlC(W2Oxh<5Qj2+BAfU+cXU1n2toS<7^w0})_LEp6DHw+m6Mlt 
zL>GJ_bm`l~C_>w$)fWS! ziH0myMli6+ghP}I`o65-ZkDOpwiSrmCZ;H=tj3Pw4xO;>l;1Cn@X3l29uixNzvUDC zU4!``Ywe*O!}a3f5x>s~w*nS(Y_1EO$XF*<%<+oW14Yogoro}6yM_Ada{(Hi%?NO-TK8r z>GuSR#$k%Ty$B+$JdDYEoXS{*HkD-&=ss}&LHpXHR70cGjVQkfdX9?3%bTsPTUj<4QtE&F9{MEYoA zWjPA1l@qOGgwhGwG5{CT_iJdXV!~x)fS?GapC712=;7UN-u9EViGo5EbZg@YPafznjm9#fB7r*DA$KBi!Z$ z5>+s~onb8!if0(#c&4|~u4+W6PAZZ<1Fz3hXsTFSJVon`TO{d15iYf@PfqsKBTdc> z4v(L#Vqv8PgL&am_w2~{`!h5^#_E;_U#Bq35{4&~7EKraqAUb|sZ4v6o43Vj?@*Vj zRC;N2H^gsiK8uE2BbP5J()Y<2g7g_2mp&Zu1^QzSi;C)@8D^pWR(_%Sf~-6$49?U4 zLW2~*t%l^7KBWMDz&yy^h~lS)u63oncMaF;o@fq*3E9pS=`sb%C@LVz3q*W9X;Q#b z{*Y(0b&_pK9O?dL&yJ1rNp%k{87|d9&X7!5?(%TR`pSy+v3{jU=2b()6{28kwI|YuNg&9t983 zk-a%P`)Vf5i99)B9Snse)J#7avP$d=GNGlk3alK|ntDNxJURU`_N?efC1r3H5j26t zXDj!1GI9?F5c}ipsNMBMca_9GJrs;>BhE!#J+05LBv*)cS)K&ys&Qs^6QEQw6gRCd z$SEmzj+?)pasIQTEbCJKdctE&cqM!+y>UGY5XoN4_|wb9HZ14Qxls={X+X!81*n{M z&)WB`lku?D+31(7$kM7ZK%zwf9qZAS6w<+3g95NtS4k%95h>t2rKQrxuDWV#w*FFs z0+H-WuX^p@yLnL~dG9K*?OB2i$EGy)n$bbeN$hh01#ZeWQf$}X-vs)r0WDxi#GQZk zC5TB*ho;vb5>5xIt1~POa<%t#xP5T!0Db1>cU?;jCOn?7v0If3P=45>kQ<277}9g$ zyIh$V5%9N<|LPID>=^}L@PU7nJkb=&xKO@@+;oCMi`aiqyx~*P1ZK#99Bgf%R@He= zJW|zj&xGEjCo>&%!cUC;)_nsST3kT=s%H9N9QSts2`*H5TbzA+@vFy;nY)Q~l!dGa zc$x&6&jjU-&G{KX6a6gmKM$F>(lTH6#S-hdqtT=m8XhYvalYR08zn(gm6o`g92sN( ztJKlsx?sHw-T+}ZPRt)*PA(@@H#mZEEu|jlj-#UidTh8p+2#U6f&kG4(9Liuu>3U` z*-~vdh=pA^h0UGq82J0Xy}2lGAk&Ts^4q;8?mFiXe5aG|N;v+e<>sXt`43u2h2PxX z)9WOc(#LH{f}Poq|07n+?ILh z)!pY21{;26=M%1qob?IP$+0VOLIVvMaJ;5V9yZn(Xu>`o`nN$HS7-PyHFZCY{#Erp zFQv9Tgr1G_x*%XkOnP#7YlA&0#iEP?t-&zRzeNYE{C_wpIQ6Xz)!JSxazU_Png)#4@WI`SQ?$- z*-iT0WV1#=`_0C)X^gRiQ-frHSIPp6nHTf7R|5Kvta|?W_+WqolLNJR{`NG0MjrA_ zjs(ia(bJ>2bKX@pzi~`lGvipS`|%T6&P|ktMI@jb(qxXdm=6Z(hqD_UVG)WCAm*-K z`7edxE@NM`BLG0)e2!&J19Z%rG|F<57FML+ri?DwInt`_U8SO(J%NkjR#T>?{2Vbm zIBagH7bP%PPaV{&P7KvbdKJAc_faw}0-C`>HD+}}s5xe6VAFw+aq!;j5FIbhIIosF zp0Q&HNYXvw8ST<57sX{C%Qu}LRch~oB3-v09+lXUKr<@@)5^}%AUdr(BS%(d2PgY~ zzNnns@IIN)dha%NEseb>VW-*kr1=aOb~9=w_Ad+{a5h8a7^?NIXn(%;Yol6Hx~y93 
zW(RyaXnQ^{4^@?qza8+43YjS$rb$F;+P!bVE6@+Q8+EwlkH&4G6zTOEaSAKa&07-(SgmYy7`%Ix+ywe7?{1jBb%p3^t#*6`EDvd#LHuk>@dW z|KUTY?1%4cdv*#(Bu^8n(@56X{c3l=52ffGpkn<3&MtlA#AKRn zwllSSL8DP+0YbV0IhD(n(rA}7J(B-m@>nq+LbcgdUvEf+8xvX_sZU~KUgeopRDa`M zA_@L7SN!skCMiX~jDoANn!Q1k)n(N*PTL`>pmZ*JGB0PPQX`x?9Ad70S3iRb!G@4A zwZDhwR{eBHp#A66>X6|5N&?_)=6SiSY+IB5~`k3r!XSK zEv-S4A2|4KtF;foSMrDyVxT7@@5bQNRrA;by5NWMz$8J(I4#5W;G9fsRdrA(AZ8-t zsU%}hbmHG_WyHsDmPCzLSuq<19f@#Id_cojmz2Gt2q$kMGwxx#xeB|0_L}ZG;nkaM@K4b|m$^P)M=-WTRAShiULR1QuR}BHwBxzLhWOS)WJkM(D3QT;9=P zOB&_x`HIxg_kHhT%)W!l-b9nA5I68kQF2hblQM)8!29M>C^-KdH*7e=C=zE8v!ZM| zv>W{^AbGE{YI8r~7^lnhQk2CPd^+W>aNcTn@}#!Hi5FfgAI+-U`GZTdDPkalkJuqT zx-YH+7TGJK*d<%(W%O!BBeV})#llrX=4LtV03ScDuL5}Dl$6qsJKVsSG--=@v7teG zG((?^=OnSa=l{drTgFxWb#3BOq9`D(lu9=u-6(N5Af1vT4N?-)Arglaq*FMD?hXkh z1nF+1yIcCd`8|*5ea}2I^PhP$FXk;DKIfe8{_ef@x~{d>b*aVIX^=(VBhn6fFiOJa zv4}mG6&|0$aD1))GUu zJ!^!ql5kFU`o+HRFldvXLMuAaYTw#<15eBD7xo7lRnzJ#1-&Eg%6)_O7)xoAvN4a} z^an|30V7H5klGElZGV|Ge&76#05< z0?2^DwXsN#@vYpp1-J@oQAma?Iq!7$;vS@yDUDG_PC}A5n;7|%o8R*?78*8MRLT&h zd6&tcgUVN8wLeKp8J7WDSy9$zH4)_WaHH|z#Ws4$$V9zJt^Jy6u1+nnxVX5)qvLqd z#KgqRoSa7VhpwJV8M2t9;kxpk1%ekm81qLafgRM;U_5`HPBvL6cyrUTTT=9^!D3AHI)lWpn!F7e$IXRgcu^HrGer=2&DL5Bx66*0rR_vA z=0B6$7F5r^lqnNlDiykJ&f(@!+A=-;Hc*k3JLT8cwuA`zh{g^Kw+|Yu;!n6P%*Fw) zTg=aIy?gdhezcH;%b1g}(-kN`=0z%-n&RTxwu18rH7B+>p5jqW znYIRaT0Oq$0Cj^2*J4+tG(~5?JU^Erk8XHtDP^VB(ak+Iwm9R)P{AgSv8i&SA>#{poS)3foa23l4qo~+YNVqb-k<8r^vGFeOZx_ z`&X@$jqGNvPoW>X4E?=&Y?pce=iMi5$l zXmoZqdKnxT-A$osM&`DVBHRo*`wKR17VIdc+=FPer%R+d9!8xdwyM!S#Z?QiKE9J> z7{Pu|Mfqh#i$8VF+|y#kq4rY`Qwli>O1GtH1dI}=Q5C9GQ9qyDJ7A+urAj))dO?m+ zJPfvPHA$`#GtZO zf9XEqFm4ee=yLec{Gh(aUouj-p#Fl-ecSutK6gCm)ExG(NGe!gU&ryWwzuaz+?))I z)^Wc5#bM)Vvb2m$bGAy(Hn+zfw|e2LZy#-SHk!c^4l~ehRn>vd2R1Oq%NVTuWw>J<$ApWI*^38Y+DV1a0xG!*_!vHpTgV2a0K(4Pfjii z#Ya^l8KW&5zL!-<4fJk2BcTg`ua$i7E$J3A7YmkxU--kf+>gE( zdA%eG?#8bvyp|HO!f8C5BuyldFULN-Lb`vouAfXh(m_g6`mxsNo`uEwzu9^wIvQ?S*fDB;qzJ-LJIEv$s9#(CryGCfIxmjZc;r-S1(?#KI9e5 
zQ~&a-Ka)vYpGy4|<;&Or9zOe3eC2%;v1EydE5&+^Hz#ab$SP-iNeUfzUiWkDJ|uut zpU;H~hIG4K*f!SHU0$5-JcHdFXcdyV{lJpfb}Efacao_oMR8+b6{LM#Vh?=egWpVM_>5pUlWf>vUk=#j*L)=N<($)%^eeKnzQwO8J zzsbDfL+pyd)F4IW_gG=zlA|X8t{0Mo5F;h$i2mBu3@d*A>&}vd)}CET@_V1TTgHW)(JA4Zu|i$-i%emWv2PEG?)b;Km> zaHGX|yBNFg6W5lnU@(`$jxEicg_fCJc~?4T3e!U@{Boifa8tJ1*B363I7b1L*~$~C z^=np2Wa4&$+^nQtwg8`_BRBc+WN2^Skbi2bVvYXZ7kNcxUoRv-s8WW#l7aNu273LY z*DDnBONlZl1RF>xJaI|0k_(;Db^FwmA!nQ+Li;J$Gbd{WExQZtkUXFir7W8y0875u z&+tO;;A`O!pB%0` z1$3n#KHh1|7Rcyv$AEoxdyynH#DdC)>kLQ>>*>OkpTHWbEke04D*r&5k( z8kI1u=sQuHh=K(B&iMJj{_=5B~D{GJ@bv<0oH= zr4$7X8|Uoj=2ze}Cnhkl2imp^Ulcfvy#z?_>b6j%COV%2))O4(th3Q7+l(x$dASxu z@7=w{PEi}1B7sV|+x*%Ivb4_5&My+-i;Ih2xrh@B-Oruz2?;}D+n^7FhKGmaO#-~U zkZj4_M6^{@3{e%QJ(bjfkUKW34rsb|s&3v*DRQ#R;_{A8GG%JxCn8I z&dTuK+|2A_Y@|=bd`hXyx2d)2q>8jOG>tXMwV0LEgOB@lLnhB_3uudjs6%d|*ZMB3 ze2;!1Rc*QU*dVQ&z*N~FwW9jeeBoU@foYbHI>#0t>}1SDktCf8o2aiB7owAOI@C7= zOA^XCv-!R~bx^`6rJzI1w}_|LXz_=_ebfhNJPMhJmg{C69@g6foaLWWoH8>jbzUjH z#oe{Fl~;`MD-Qo5yba6X>C=!96iggBT-+ORd40pD{WF#t782F z!R2D$q8Lb5GjMe|(hJjKC5Fx4H}S@{qzqI1R1y}iLVw`M(((qX7ybDB1^Hw4(kUe% zQLlYuXZ}!{bfwH>wcg#OB+DLwqazCnq{4g#sIC-*{U1NEX)2E7HB1Bq)8Nui96+Q= zt0pXKiq;;iNH#T}Xh*%f^MnyXO7x zhlrgFpfY4bLSfiD0G&&9tQ~y*l7T{|Z2?O2?;BU1^eQz2gM?fJE_E}(F45To+;42h z=0INavYcIklvjM7jNj=NmrK3CeAn0atJG{n4(=ED5O-YTskz4?%6$8W4)BsMWSzK= zGmj8;Z{NP{6kK_WDQP#2qi%7&Yuidu7D|}%NVq(l8=M@7gpy_TgsKum(#NZR)Nqv~ z?d4nQI0PzhtSF9Tmf{f|FTHDNJUMOktmG*#T`vpP-O3ZaNcS<)4Vp2@H8w*8K>$zo zM-6_Ta4zO|naPxW1Evq@Lp*I^lnd*7RUa&p&sf&*hw>U~ zbSNW(6?9)@2u)BNZZ+HR9%rVvn@=&6b;uWJ?U1OJggY{UC0pUy*L0y?ZTB{Su0+Kr z67}zUew%w?ga1O`L{Mn<)?yOred{(sv*=R=G4C2haa>bO(&TP_yQ#JbsQYPMnR=fn z|HI}eM^Mc(Ja#3g{HR4v;)?4NL817lGu&cz`0gMEu?}l%ee$K?*leo@x!Z6Ygr!0T zHCj}$CJnaTjt^Ry=mvvKx68k+;X{@PA?`X1__adE`q*I-mIX?ic%iMov_2KfI<>R% zUZy|Z0!N+84-Z@5j4H|9hCgJ8%b$nOW%acF*JIcH!}u;P>%S&DaXo>e4)R^(FV~9R zAAg~`);-%B-e`1Yz&1P3+i0a&Y%J$(@e{iTSz3OrRhI;trDORF;iYd@po?*Y!F!UZ zRHXrK$ijuC@x;B+r4RYe8bZ^qegp>^_JW(zzw1TTLZDp4mn`vV`1$WY-Qz@7jTf6A 
zA@iCGWDT!3s%X`{%zzqn&0jVo&(e9?jbJqQJ|Shh1gUgD!Yp0Hm8vlkrajmCWN&Z= zxs`9@A||$3K;BhtO)25|#9w&_%~j>=d1WQrH5K%Ny3lqCXI;pe$@*Ah%0$)acJ@+N z!z`LD^-kwkJmtR3<~sMS1$pU*WW{v0Oitm#YOrJXK|F0@e#n3I zXbPl0$q&MSU~_8bh3&V9X*2v)MwC7EP}0F z*8A%5@#niBan$_trHn%hdY!({-;8MmtgjDz+%kUNsN57by!O6-&|AkGrBAUWELO+A zBZz{E7TeeqD*Zc^qYM~Ww4U|yOK$3CwQT7fySZGm!+PYgBl>H3gZo%`_pjrF3nYCN zp5H~`zux8_mqA+an*TXIq-a5K_YIKSg53aeRzT-op;-ZPgp*+EPC$;sE{=Am$a<4M zIOg^{0uWjqAn3B(7CwVN-wzDjWy-I{;eAQ)coM;boK6vPQ_F36UC>JjvzwCJg-Z$} z`g7?vvbti4ZxI-j@b!Lp4V;ahfX)5@4BFcLMvGoMS5{TRIg)RnHu$##J_+9hrHfj4#@0=%)R^)8~#=Tazvv$7eB|;@30kaVu(GK&$RBSzfD<2Nh`2iUFDTn(%aa z8r1MG0O~?hFIByP{$6@5|SWr;unO-iOIz)?wTgP3W8;eZX8;tSkE!AAK zVOH+XMmCQ9x-NQnqRs-vLb{)nU-zz`;Kh}~c+r)@_{c7~6VUJr6N|}aiD;FVu4xX# zk)XIGP={nfMnVA5pv=xocLTVl!Gn9<`5E?5-Qyksc(IvmiZ-hB8}2$10jm(U4SGqcR%xyxWpwLzan zFDV0cI%PQiD;t7ciK?@G_o}_tuamU?=chjU>!*U{hTSb|k|F)k_)Y=UWQ~umKy-PN z|8~W=Wvb(@?b5eJ6Nu)K5RQ~vjHZK%7G;Psv8hybRF_+N@)&n=D@$J4r-X#}LtW?A zm2qKFicS9TUYfYlWNA|Rex!H8V_&dSvMh*zOGTI+i{6`O&$M(EOq@ksTp64NO^3MM1li0Zu3Dmt1xK{uF%JCTdW!x*3CxCe?%ox~wK|g@# zeYMn0z!v8oTaTAgFl%c50w+}5K?3zzdTTT?jHfNq1TS9l17Vo7We1~&@kCQt_mYqP zXQkD(^>05;jC+=NDEnHGq|Mm+6w~TLP?LEO$vOoD528k|-UWxLa5T?b7>DBRjJ=ei za{^lg0>J@3^SP??ZGor8&Tmc?!r$3wHTB+){$6l>^duk6fQ}mKiI0V%*MJWOs*iN# z@aH!8nm;MKfgWi1EpYn6QldaE;I4^BtWgmdg+{>2*V$Ja3^2 z$mm7T$Lm&6Th2S*U&FB8sC1rilO!QcN(Zuw+ZrxE-$TT-pqs?=OR7*ZIK=~RvgJsp*_{#jxcabJle=AK z0l%wzo~RKwnsJtpey0JTC&v40U_mXlPu>w~`G7O(;GU7*XzDSjo%JI6JM7+R^!AMe zk5MvPk06B5iA2Gp*5P@9cXOE{XwOsnMQA_p!B1RPn}J?8{L|3On;UK}VN43t?Tb?Z z4QIq4SbXE{@jh7`woYBSea!Meg@F6J3@K*iiFM^6G{N#GM$X=S!p=I(k~Ws-vZ_8= z-}*;w@A}CK)2wZ@O2q?J-|y!fK-$v!0A6x*-+|omo~#w^TlPi0nVL2EkNmpxdT~(G zk%0@sI6htc6g!2EW)Sv8MQur{olgwhvF?6pb1Gqc@+H+yokaLrVeX?y4Tx6FE~t*~ z*YVTz+2^7(vQCRcuR0o9TBut9laqXGV0h|#ky9}>kN^Rf7L0CcRghVw03pZcz{94d zNx}A%rKWuAvSDH6?7AS;laC16j^G&x#O%8&jXP%}8r0EoWQVdVI*x z$wh33*TPdRZ;+_D-nM0Ac?L!p2y36I=`uTzx{4@fl8HM@fi&_LFr#Y$E#!wrr4H+j zoBG0BSu2Z?x&h)U+!Vsb`J)xX8lk>X+=WL?D0FQv$!vjhK-ErLyo=4e0U6&H7eDQV 
zvS{5PnKjkB=5y%3pAiYypBvm8j-fZY|4e!^->MWOhuX|x?yKB+Y5r_2BErBa4RwsR z8tmqz`^^fDEJdIN4Fj~G;kSut&QgaEYfLpUy41gqK)g^UwD?+W_)t{UsA!+=hh3?H z;wLhJW6k|hIX%@MzLfxol;Hgbn|cJ5fh*!bJp9`F?7iqzWC|Q4Pevz-AkIl*HU<)` zXTtw5s7gKlsorJv5hid3rkGRN4i??e^^O!t@`{#_??Ysc8(-BFB5D~HfhvaeY(4oo zUeZ+g9olY=K=z1PRV&sb_f-ei?b_l0*}eTl=?Y6ebP9lPYrO_~`}k5oJrg9-P12;C zWgAU7F7-4x;w)J&O9)9=&T=K+KgD_aBOPwvIbOK5+&!~jswca$V1ow?6Nq%CjxKwK zN|Y`}9-+juoS}EHXtIVWh?}hSTl0bHNk)iGw#P zoeL!Qp99JLt7{m9Cqk%n`P|ndJ=;XCGB0IPQ;lMLDALki6_v_l;E3%Rq)(J}F|wW= zM5i_84Zbs76B7?KOF8Z#=JWT6?C}qF(jk(pp#uxNutao^TWeqGamY)+VUc6+=ZoSg z<{gtEgI9!{^}NwkD7`72ZTZ|CD5q#oPHHmFE@P7BpHc_d4t*B@oo*&UUJGj1@$@!u zx1)Rfw@IHQ2Xd{O>b{1Bmt_`5o22xo0m+rtuC}|Sa&|s|sAT)^yi5Co#HONtxs)O| zet|ln4*Joc;#q>$AGsoOm!>A!)lr*jFFN7J1e&unAzGG?sb(4-6FfNJ=^^qr6E@&A}xpZ)Eutk)4gFE4K`Gpe{rGkY1M{H|*oUz%MNrYTM z9ugKzLP|>NnMyMz*rqyLp8I?ES2cBj^I3m*J_}i_sUclQj&Hk#cW19Js5imFiJyci zMHigY;uA4pdDXtwCLel3*OcCw*4ibDdF)QZ@ejA=)=*5=vUWjH7F2$xe@=1DBrF!2 z5VonysU)x@7L{2>wC3#{woTU zVMzEIjrR}#2u_SZv^ubK^8uD_7{LAqkX?1p3dB|_ty3>vCYU)sqkx7;@%19%uxC*i zEfVX^@9-j`I&LgRT6`A=1%+P7j8A)%CZo;xq{OkFR#T2g+V?K+!yeYMl0N}X7KZwW6`U==}EE=r_nGZ2cD!Cy`kGYzsK z3^I_}0pxx4>Sx8_FLH6AuS+VV#wt{^9?w6)6{gM6a*m=(-Y5=MOO=sc>Sx&Wds1%9dxYi z(+YQY!6U_uT-ec=mYG?)Kjr<{Oa)Vhpi=EMf}5;B&Y9y+4f4sS zE08kZyHs|a;s?b8kW#52ZXg-}_kWeJ{|sc!z5fAH0J~%CLgO^o@l@+(!nx3y#yT+S zJ;U3g1*oPuEV)=B4U>5;8SD(>6`O1R=#|lxKKhk`ti%6G*mXX3*Z=wgpn5_Kp04*_gk9fX4JCZ@ z|Hja)aQ9KD4F)(5JpaJ$|0_Rb&5jT-mLvsR0-pLAtG$E$+;_WFqH~*12HSnhcK)g# zHGvJNPy_M&BgP#=_z$n};;-c@{C))|t{RQuy^9iv@r)84MM7_H??ujVX+=fc{brxv zJH-U_){!- z<#Y}3q`)7aLHs+~`^R+4#Fnye10NDb(>8bAAN%p^726_V+-P+j zJ0%~9Ob?Q|fI6FpsFV~&_M-ADn@)jTbw$CI6V$&;Y|)q7>=%guwkUzn`Y*8M>PO?} zXujeDQd7f${Rhk@TdQ7;D-jX)##zh>!X$e}Y6%w8#DDH@s;hYi)cdFoe->otc4_&$ zl-d&h9)&#U$HYXRvxztOi%U2Es}ve{=tw~4w-$gnDJIs*^q$i+$^Ii=dT|4HhVV~H z4ey-A*8cPUqSRKT(}wmdNzvr6MpR!T>2a9j>7OV;>C;-RUo<-U+TYUZ`JEH z<<3KX->d-ru!5KOYjW?$J2e1*>5b#X@ zf&ui@v#WoXDjE*`g8xLxd#oT{?@)B_UC;ZxU_>SlB_hvKxK~3Z-J}p8Mq*kE|G@&@ 
z#Sd_)E#c+>A4YlL#(Bk3>Hrrk+Xu&r!}}Pk8s#Y^m!3^wCa5O9Vt#z{OK(};J~h&} z84RXl)Og(Y`GqryP#;y?tcj%2NbivSjWSHu&$UaBCVt8uPlhp?9^5Q#k(6V8c95I% zq+nNZ#TZRdiT(L2CDKGqQYU5NmuvFFHJBN37Yr*mEPVA96{|aL2a7REi0^19cUF@c z`m{Gn@ddy?X0_b>{kFUwT$y(IVhOPRG3|Vf3ZQN8k)0las!pK@xuox<^x;=5_!Y227ya)5yZoPcJg%$t zy%SYR6w#U3sgs?Mb@B!9FsJK;BUtOhTx2-vV$ zK-8w7HTKJp<#$QX%-)hOMp!&~OGn8(g(_7zlSMN&>bjbjzY|*BO-f(+>uj>DxG5$)jE^< zsslvG|7|w^ifJ1uM=4mZ!qy36qo;q++NsW=T(Y>qCt;cOj(vKg@$dO#woQ2Bt8-HGBL? zIzWjh0rgJ^eu8!NG@kh1WI&6^En8ea;Gyax@tZ5t;5&SMda-q7;OD;h`i+)7_HMCp zET6>JM{hzL_9gKHFdpQ&D1&;&qeuxIsB&`{;8g6FA9FaoVGlQN-1XpUpx{)YOb(xR z(CnS`AZBq=i$$C($Fdb$l7o?Az=N%eXf7eG3+#Hzm%Wl*xyh8}9-U=_ND$Pah-zc24Y^J=4mK3>Y=4s<@YBnST1Dg0v*U%I6X)z47@x{LtcdTcDbr z?D0vZKV|f|uM#Hfv|WF@$<`#xK~%n&;fe+x+QQ*Tua)_8rE%F$5(Ua;yB%3024cfk zVN^eml0cM?)`86ZR_&oi7Exp{TFDfB0+GLeJ49^@YIih9Y{TI(WCUoo(DmG>qg_w$ za$Z%#|97iUjxb6;v0b}^gs)lOiKs9q$SHj=O3Xig(!T9ia>uP)0w!Vh?R^^w+ZR{w zRyctT5O*nPzZK{^#BH6r!-{(vPGuj`7Su!hk&ZIHRz|D^8bOpVI+)v@cG$%bCMmiy z_#Ou2(3nH0us_4-6k{_F3QpEr!lKJ0Qv(=Y$j}qO9E`v8$bCl}tazofU?@RHO`CzL zfAjzFf%3HH_ndlq4rx~7bR(6mHD?aSUeKabt7!YCAyXf>7b#PDNswWt`QGus?0un< zVY*k-Cl{$@&s@-EPO^@Yfqec#=ErlLMO>!1cPNXX80AhwjT!x*nytB)w<+vNpR)oM zBtt9(hRC_R^QK{R&i?+(h=_=w^&m80n~7-@sr{j$X0?zaY_|1<%*9{8GS{Yw_qtXn zf;zU3FOm8pH-t-lqmyP~+%rc`?3-tn=BF?n|AdaG zAWxD8qI&Ak>H5^=0C-A?$%#M?ym?>Ae4~=uNBiwthCU`mE7}r#G~T_BjGS@4W+c?d zUkDvDuGj+b5_Vh7tezE={S-?w-JnbECYVl0W>-^0ijpc-u=KRp#%NY}f|-Afe>x11 zgZ`UK38aiR@PJldfabsPtayV9IQ2i!ybS@wiv=1K){UO`08mn4ap8}5>9kje5oqQ2 z&8Z1NBg@6(DmG7~=)qh0%KhE@6+U4eIX&;LQbYL6z)f+AYAOs1k)MeYq$@Rmg(-?s z_lN6e5L1`@eD#^+$YVzISL+5hnvm_^V9)Cc>;)m8UN7#)?SH@?FtYkLuuaX@)GI%w z+_o&e;5mdF9{Hb%HzGr5Ty7(I-=6dw!4sFsqu9k@Y$tmcs}p7`Nl6xe)lJk0L-*Km zIXh0{wz2!bb0X~F!7a4w#q8|N@+S^{d;-1}*NAQe=Q@KEyK-zE?*&}Kzg6@oVDR1( z$NWR}K)CGz?F3*w@Q5HYJV`ms8o>VK@YPZsC6CI~ha{aGl zFH_6UtfKk*{J2{JRhtx)`P#j&bMlWO_y9AD&-j;^*jDU+5tv#4b403kgYa_K9DJnjMXF3sq`HT=?RrPzxVm5NQPg^7y)`f 
zF8JR8)>VJfjIqSLCSWpl9cy`)GzJRKEGOu93EPs8^6_c{ys0r4bM`fdWtd=E%}T01Cm_u4MfEOTKnIenPimxXd&msju*IX&XnP2M@=tDyAcV@=R z|7ZNdt3?34F@z8QM^6(b)6|QCEey-hod|)KVKz4P{w^7Xp?|0Rvk|}IUx`5=k-z&x z*72fhS#nQU%JrbpJ{DgkO-n~S;^d^xbbFlxzR>C4G2{PFs?9_OInQ=pmxEv1LG|YD z+Y*J3(Q8&o+rUfC(cr&)9#D{TO8@)S2T6c;!0|7V4UdpL+ze@TMh;a!i^GEmyOM(! zq|FK)nrETIyx=82A}fCmJP;urVEWOA0yq%@s`YxufvmGT0&>lxKJw}XixMp1yYbh- z5}O{!7WB0e_@(}tnsV}e4p&}7R1Wv7;II`aNPL8Kfc<5_BYQ{(j?p~eU1LqIcq@fp zu{g&7U$?R^;9N5dlCQ4%!{K3i*PRYGZli*zg7F^5TO|N~Uxa)PAAE}S|NE4yb@7h^ z<5i`pX+`L?p&X5RwrGhqMg`eCZEet&)P)lH+DZzBC%JA|>r+lQ6D{1V`6d*2TkHgv z6ONl>>N81wsCgy33ttSk`)W}WDQTPT+ zhOcJOA|+m-*7;8d1i(eVto8itL-$MUW4XUEv5MQF%QTzGgKoN#^-Z^P-<)AdxT3cwYWyWgniV)QcVqtAU`gQGc4T>TeGVUa>7k}lhnzUCQ-nBMufM`u zAM(;$O8D`xuTL>dEy5>`njl6^iI@Pt+?_UL-Ta=-z#a2sc7~;xeDuVM&jW?sj2-Vo z(1@lpJ0crhNTv@?p72OI%_rmiqp?zTZv!M5CU}cGf41c`6G&r9m@5BjDuUyJs#IDq~FY_b<5(qAnfMO)u>~?M<%8poiCC{01MD%Gfh#Y&y;pE#F%P1{(6J zFK-09X2mGoCt$+T^H_Cc)F7_z9kdK?<4s?CW*!xxKU4fNy-V4Qp*bu>f1{T$AtEB2 zD+iA})T$cGqUluTiUHn%a<(C5eBn6(M53oLxE6MlWVX>_;g<(h*t46OR-_Brqt|Yp zBFH;etx69!1^!$L9F+H%K?{ac&)`sKtyR+dvf*#5^nh?(pRoIsn*Y`^OpJU%!m#|V zYP<}&(jVtzeqKm& zO`*bM*f9y=2CJ{~Y7HLmQs?PWMfcO@{hFze{k-yG-{<5%sp3zobwZa`nI(E#A(jFM zj_jmdTZjYFYY?7GS#UIz#e>$TV$Ow}WK!m;YbvH>Fjdt9S+zj$Ofq~vmz)L#% z?r1yH2MW(-9b7O^tMN8ycIg;sV~x;daLpqCZeKSRZB}^u7FU_d28mDOv`2Z^$5)B0 zNAekqCzSqWRA3}gC&Uu;6^F@_VzxkSP^-57v)F_QWgS}}$D(_l%w+oG*Ff!^LQ7FQ zMTA6VyujVxH}j3#Vu|2X4dCqkf#*voS3mdM{NO@|A8R8Ek|7Y|FrvMLl25Pg9T3nE zPMTX&IVQ%Y-j}*Je88}AlhiUH0^dMGA0z1KeR`Ubf?7!_@?an%UGRIOViiRw7|<%O zM{*Mk`D|shY4*&9tIe3;)we#WPeV__seph_|5d~x7NYzG`<+30qNZNebE{pQ{&Z^Y zeGmbof3v+B{KjVEGGnD(Ll=7=M&?;5+1WAEBAwLBjgka{x}GSoei9yAcEJiL+jBy< zSu}`m^_1Wt%YUHzMZ#R#R3?jekUBGlEOAT`9xD0TV=k&R+)dCyDKuBp?6qmr*b7}F zHG=pmMIpwArYlkYvph=X&7xCTk+)dUYu+YH*v6V=SATc@0#!L`X^BgX%HC`U+$}PK zBeK8LpuS}1QJI{p&uq*ecA?VX;PWu}<_|adX)3_=d#%6PU3rQpN(8!HM=*a9uPc<` zdbCuJoghrv`Cgh;BgCAw{uZ~kZI+nP3!&s_p^uwUcJ^^k^EM5>3<%v%=q+NUUa0B; 
zZ4X3|@yhB*%L*E%7VBJP=6o7P4o%UrdNSPQCdcvtUnMih1N*j{_X`=ELf6ej%$2Zn z-FgdZ@%jVfzy7exHz96BYd1>K*zC_uutSJd5~`k?l8|4_&>vSV-^(Hqgk}uCg#$Ta z^Igu6IFj{|k`fY+Gnr2ar9fF-wY=^Y4IIcCYvSnaGE8?^KN^v}_9j@iy7g+~CpAY~f5FUkBE7G_OhY5WT4xJkUp$f%sgUdrQTZ z0l3X+z0e^+ihC-v5GGNVATaToLboO$=!VlTrnnm*i3T3Hqpt+0=$KukY{09SB*NZ*tm)tfme1zR>RLQ<5Po zUTt285v(7PglDzxS(C<#eb1!gu^I8*JX{uwHc4hSg;vhE5jbvL#t`L`EWPPZCfdl8 zU3beNkvmI8axk}R>TOoTpG}n!S1-%I75kEX@2A~Jt>eiL2^`2I4emyQH#-ayTvGz% ztRV?J-swXQ>J}k-9;9z)`iPnGV<*nyo2cIs{fQsg6eT?HW}@8rl#~XoRg^wkujVF z_Ze6y&Y$p+-f{bE94#5Hk51e>G{A`M4&RGjpXo}xWg+3*CQMD>6hj@Rvqu=wOx4GT zlkb-5whb-y{23o?%qMQb(YRpOUAgZ|Y#Q+?8pcZz#PL*qElgbB)XH`?t-kd2<)%&o zuVv_WdNE>ZFmi>dW<}q4u$9EQV!MA>S!kt(?Y{Sk4<@$v;)F+kqw?5S5yr*vJvFWB ztw$Gd-k#ony|!l&o}EwPcXSAy?^17~xk&8V2lij%nFfi z%O*-oZ{JVO+^vd9JL>$}*XQ?L_89*xpT8hjz7h7N2iLnx@(~k~-=rah4%Sv4cJCd4uiPol7^mjajVr5vXuW);GV$7dYruAZP-= zhev*TPtEwYXr&GJO=Ye@UxqcNW08j(w)@Fsf8n8{*a>9BM6%oRM1-14(hc4=+9u%C z=a$&TU#%syJ~bFz@pHMsy!)NR**io))5Mvw>#veQ^(vNP&Yy3vXvrOa0gK>qo7{4R z6L*J@iOXi}8On+&jzm9Sx9d^>4>o|%iGJQ~(iFe_%wv*JIRq2?ANStP;O}9we>WC? 
zn!e=oqHvgjrP!7o1i6${d5WVo7sn1-%O?tp0Xnv!3rnDonYe)#(*uA4IUtwcwSqJS8W4WXk^2LZ% zI_M+(6P1>^8h6Os&Fd*HqXtDu3t@15P{sF_QDnyaqu z=`=WsC)_&j@~g(1$0qk-y$9kOLVj(-LGDDI^4*Ivs@Plbqe;$|*1X*Hh)lMJPMcz< z>jYhSomlIxwRbMhb7+>AULmm$UPN_DwKUv$YqKG`;HGND`{@PxkaaOm2 zj2TlqygVO87G`8E&CzjOZDJrDC*aBwpU8@x8b~#kNoU#1a)LfxOM$|%nkmzVw#s8d z{N=OoUEu=XXFl2zMsWk2ZQ_xL3`V*x2_uISC;Q;u>icf$8xE)PwDv#tbvD9VDMS5r z2|g~(rP_*b^)rP9fh$&C=*!m*0+|hhJVSZ&dZ(L3=Yv=^zm~W-hIy&d4CdVl7w6O3 zYPuVuj=CGp;z+}Wl|BtJ%V53nNJDXXcbIyZi8*#D$#m>P2)z>nTeHe)3gARzS0$#G zK7YAxCutWgmU?~>CIRcP7e}N=bZtcp7%){mL64T)AMGP>qN8v3Z`wu+0W(7)-w1vm zKrhqh0C>jT#s31mp3c%mmdPVCiL6Gw5I5hNm$HDrKO@B_T0D)X+|9RFb{2b8I`9tG z1Y}xo1?}1YXpQuhDa%8qmQEYyrih9Md&fsanc-vE0N^1+$APMP-Kf3W^V+B9IC<^bb_iy+HsC)8aUYOqVvd+7x$@PKNrW;b4ZnXvh0;A-k^*VzOF{VX8KVcp}W< zN4Q@F&LU5*JH4)(`eeT;=YhF0#=iYHRo~26rarucgO~aT0!0}~UL%%uoqFs!1$4I4 zJyk#Q*0}kH&)S>)`3^Zy+mX?qR*K|FtYHxPr)nHziCv$H16(6o8AUhyO$IqJxYLK+ zxi#Vv9M2c9NJpoP6~Fn6Th>^t#M1BD(qa0mvEBYDLk2du0^MyJw|BCCS}7B+dCt4y|6n}V5Q|w1pc^9 zls?|x@FQ>JCU<4(nF_MIlRs99p_%{F|46HzmjQe7z)RmF8HG$yH0@d6(5MA&@AQ7F zZZWoCP{PDq>sRU~SMkJ;7?f`J66vv%%kpJMb6q3a8kfaB5o%2Q{3+}?^qsu4Y+p)> zig5xLhFH&l;{(gPte6NliS*`Mp1ga3!6H)K{#x11F>=|cJSjgj_-Ny(pUW>vN4LU} z{KgS61Iyo_&fb;Uu|G4Mg-@2GWOZ&XLK#GXsP!oqd7#pVj6@&)wl{4d>qnvlhlYS}??qF5fDjKYUWv&7A}Nrn4DEm-X&n=(;`gs@9*_Bk$j z9kL6uKREWuQZK8!!8lC#>R1Y@K>O6Y{T`BG*3o2$MWVz28NZ3RN2q2~qSb_dM9ENS zpR#93&QqH~Ov6l_81MHD%N;f#C&9+#zGi?V?*WG9_S1XoH-D$MBCqlM{dg29hThfn2WHPcqiC}RI;cs%;<<9u?+y-fJ=LT#T z9k!OaQrjPw#R@1EtY`%uQd4u@-Xk*BPZJ}~Q`de)k@^(g(`|b|-CI9A&4xKqWKO@tVDx>-V9#q#7WOaYEGy56of=WD~j2qLqd0F z>UVAN4??eG2X7HUSE6aK1rMH;24Dp1$eX70(J^a36r+XYJq+V3#6dnpQZjEFTYEx} z%9$$Ge;dnxV9&Y6$o1wNnKlJ>BK5a$ClSlkKqh+0PbR5%-F|pqlIrD8M0z-X-+%Qm zi!Er-rc@$IX%kv2cVHM@4LKR@Ac|#lPG03=c>ev=lV8J)`>FVr3Jyb@a~Fj|1aWY> zEj1(mBqDs^I6Utl$j(xHtnkHbgED?;1=jRFlyNn!OGX;5wEC?|rW7{0h*vJERaOvH zN(Ie{emyUukmdgb#*v-<nS$;7C)OPr^+w0mS zCIu#9ZNzIjcK*)RfQd6E%HaJJyP8x&if`W@o)6L@Lq|1T5-BzXXpmp{h`1Vh4lQ^h 
zbQKQ?4FXVm&@Zd30;}yzl~S6hPf+*0$^*25WtC>k7p0Q%uwWvJIJ{J$vm~$xs=lFt zC*jB8^r@0)3P~N&q*ZEVi;);fpqh>M^f z9kOgIKNtP!;#lch30XSDE%T+;M+R>zh8b2kS?@Xu%O)4j*yWK;ZlF*ktL(S?_*eF| zMxP9FKX|!jS z0?!U_f8JO;(FQtd8ii<3&JHH8zgCQp6ay;@<6&6GyACo9IFi6C%|Mxa#oIX8P639< zD%^jJkW0>N1M=wQF`3aR113Lqm(JW_h^L}yK0-Y*O@6&|3u)MCD!1s~g(47rY}=5~ z7!y`O`I=v+qm1wHpiil`Bfjh=Z?3oSOTdZ~yU=U74|EP7#7>+HGK=yPO93K%CM2~) z2WdxHv9uJfoQm?pc*jD6E?v~SlJIw>AVZ2WiRk`Z$y0ki-!~s8mfB{YN!J$93eR0v zJ{GXEYBCF1?tUFisNytV=ps`sxRH_NwC_MWuS)PUocxJkvzVlzw6F?I*1+6TcVzOa zZaeb(L*Ya6!;!U@M=4$JD5ZQX(#0IJV)~uJQc>=q41m^&htihU>D9;kS5-ZI(vd%s zb*Qu->1OG^u@$*ohY`vvzWzo-==C3N%K?gI#?PPZPe}iF<)jzFR~y@^V?_Tyv%8oT z@Kr`9cc%}oi@iaCSFPYrY$&f2KpgNt5;tb9TL`^AfEr`OwrCU9bxq~J>5xr-hy)6z z4=;9sl}|gr^_lXVyi7v{>s+17W`p1V$JkpyRkgK^!Use_K|&CuOHewcQ>06}K|xBo zbAx~g(%qp_(jXn14y8-Fk?xL7+_}#Y-^2U;|NZV5XLuNk#az#P=F>H;CRi^R1Ni^s z9JvbNZp}z@XW$<~@y7uMyF~aeo8K|FS4LPbZ4UH81$^u4&T!?DOxtE>$JVd1n*UjF%0?e!nq`X$&tW^pj5#Q<4Tv*GI3RRXg6d)fL&S{`Y3O zSG4mcFE2}-U%mPZOt8J|;9BVo%CcUhwW7xj4ChsP$OR@r>*(lkIjlu->w0(yyai)k zqN5F6zg}2cShR*yiU-ltGBUQ^mS%cK)y?WlH(T)Mxc~jb-<_9O;=YhSZlMGhBA7)& z&^VJ9ZK^aP6V*`dMKBc97Ze&Ew{@jFC8Uo?hb=!RAzU?+6qNzzRT&Nc+4_Q-+ zC?Cl(<-KG-UU~E|M^b{5hUoO8iZfR z{)v)42yFoE=iP6cBkMgM_@NVf|9l(#z13p02qBTf;<=$=M!2{V_g683i^D?CR`}?q zNbju#vw}voy1j?kU}}mYy5B8%zFnlP+I>0faQjY%idp!qL@=lo^giBn}55 z^4sttNBsh{ZjQzbB%wcD$u|DGK;i$Z8XcV||Cw+81+ffPm6EEqKXIY7@DX<2FM>`H$c7Z_NV%~0xoY73@`@^tloNMIjd`PRwUoTpI& z?!2U>^0;ox-+eM0BY2P7Ivw|Zcm7JrIMw})F@b!Y;zymhYN`KwE%$+Bb(M$9=z?BE zL`0V!im_2TB^_Jy4AIflqV3%GaLSdHm3I@H=wo7Lq{IU5kC^ICD9SCT0>D7M+3NLz zX15vGbeP4(ecB-VnxrHmb$TV3w4oT$Y4)E{0u=HM+_O!jscsK>kBLR?w6d~ER|Gq+ zDi<}et2X|Z@ox=5)St39%B^NX^sdfEPP(l;d<;8d=D@gBW`V2I-qgrWvR|GNM`D9h zrnE$m)7ZzsJ`q=DYrGIJU#sGUCXuP$!J^(lZCdh7eg0~Ny-uMx-c);i>K5<#{Ggc! 
zZ2SVYQbW?q+#e-mJ4Qo8<7E)yNMHV2J_@m;j1tN5?d(K4pS5msX-<&Ur}d;PUNU*0 zkU}pqOYUp`5n*IrNbugozkC`P|FB7yIb!5PaavjJh^2z6D*PvMNz7QXP$;x$VpBp| zTEx+jV{d=I_*EmKvYJ|AFYQY)#&A%B?U9g}U^Cl*z?Cegl$HJM+rSI)hYug#mdn={ zU*|3>FQ5PQBXVV9Ysg+G86E{mPnW{1cO?b9+Prg?AeT&H4c1W~4@Z5Z6Gp1no!m)~U}Te%3p)!lh+p_`6@8 zi=|nzva-HZSNBI{to(~kHIF%5W^r+meDAU1yhLyA1GA9=9p|m8Wtphn^V5ABFeFxF zMp{Y=jbyVXb6Kf4?P8|FiZsfroSO)pGLijuYW=q6w&Y^0$FSKwUFDZhFzU{EA(*qs zq@QA?oVNWS0nASF0rT(-cIR82uV6D))6&w?K2FO#zoocMqRnh%yFqPwhxTw_50~(w3W!M7gmA~Nm9ES7!$0tE8E+aM5 zzP;{CY+`38W-O*NAtd|(spr3*I4?%4z0}hS?dp=jW>Hp9c>Trq{6ek2%yCoG{d|Fx zbNh_j*#%%}H%sv1t{`lz*Shr~7z!E4$l5OlJKpA0l%h;5s9GWzL=NUj;CFM}+0ykjFdQ>&x|GD z#q^|-72oFIe7|?0xSgb#t(VWBC(|JuXb~$4x4peRnS+o_FJ9YU{vW^heKSL5GrVy z9v)BthUDsRRy)Me(X6t6`Mziyzmg4gJJvtmngSzwf2!#PYgty^ru}kW7Ie_0Ld1VK zq;%3B>KT+F9rH*yGkcr8TGBB78<D(DU&+)Y{MY?mQ2MGEeS)w@_z`+M@QVspMG)y3$y+iJQl z8R3s<^^$rJN-HXaDXmB#2MUyLOXd2E1z{TJ3xbzv*_7!HVfJ8z>7FZ;`kSOutLd*^ z_>5S^ZXL%l!^(XhC;jg6XPMIx9V8E*0~^8lWX^9F$U5xmZQki!^uGt4p3Dcbm>tnS z{&7_*r|6_s5iUj>caSl1Fo)s!DD~=mIT%fGEj>E)>$NzgT|%(_znYgEE8a2P>R z7&PC3>3Y^+cH;c7men%_4vn!f30%nq?8aZw2_l;EV0tqN>lZOIGRRI3?`|uTJA&Qg zYH{BHMqr+3#!65Ivp-9|M;^T6a$GYLEbp-w*;i_JrjZfONE=*kG4Y|D1cD~dclutE z?$`^%{Vyk}RCbl*0EVkCBdhj2yT2mk=K=9&gS&*{FORIQhF@TuDh;phrg)s0p3fo+ zGApQRF0YVY&XHRGpo$fuiV#yd)oTowy4?RWit-D5+{yTR7xzE=m4NdGy8=n*pZ(T8 zhVM5mc+!%yCyCqOeeb&$yItYc{CL{dULQ9kB_;39T?Oi$BhFkNRoJZbkx!m)CGM{F zx)UUf7gSb`n)7Oa%NLeTb{9^jwmtkt^gM|Z3bw#x(fW@E{r;rECik{NJ+44FIF3gm zp=Yb0W~-^2bH=>u*4gqo@FHV z@;;7Wzn>&U6+G1MBx{<*q{GfCx!7NzQxlZl|1~)kkcjKd|{ML2__^^#Oed`&-F+_!!bp;o^T76AL&uR83EdF#Z6F4X>5O(wvOBK9yH( z4>kqtmylklkY-34$@x!NX!0Z zAODrBdMUwdr>qciznA|wDLWOYT4TjUQ>6X~D(RQX%Ix$5-vv#T5h16YdTpId^=LIy zmPZ&;yGgWoEiX7o!Fa$2@PkEhKBCRZIB=Oqf4legWU0R2S*RzxQK zu>Ko%OipkNHX^;)e_Sk?2;gEpZnWY2Lv!(>Wc@7)($c74===R>7TRFt>90Y%;w0H` z`o{&TSDb*M@b2f^)?m(Q|4{c|lwfLlb2_-_DNBhv{aG}~|45Npz((5zKFGVzW5+Q$ z_@0+ViuGX1MjtzQ9y=KcMr(p=;4>6dRfoUJ-by`8VCRusR1*`sqgtT-(GynT*_@}DjwRvSaIKT?FSUnEJz 
zT1=G3KP3uo$u2D9jF2!p1GUkQ$fHtIt=MtF%ZGTWL zt-k_;T~o@<$GmH{>j>?Zy6~%bubVc7v;hUR-sYA6bKw*7ZEy@q@7i_#q!EqK;k2KS zu+v{&;)6gAfJ~1NGWb$Y!+I(-yQ1<-k`t#2ESL?(1)Ez6idKa`DR?|U1;)Ls< z!^S-616v>XD-zIzSXQr6o8(i85&4RK|L58fP%aE|uP6P5m;aOu4a)YdyuRA27dRH4 zgscbuZ-W*e^T5wQvrs~`Pf4Etw(P}G-7&PcmR)p0LLp7&aBk?h{t7bbGy_@L+zgh; z;fWTTIbY)Zj3~KIZCcG2t^PL)d#2N@!}%^fU{=zlrYGV^)Qsfof!MnDAl^V}SGN~k_zCbH-{io8 zNe=JB=bE7sy~jSJ=YMH-!VMk*X?cOu#~#3nB;YPH9i)aUe`<&14yvm!WpN&t`>BaT zj>s4!?Cr3tFyB6l#SJ1s@jP z7scTvlL)6!sg8Il%JVrEQxg$k;K${(@f=AHS|qbE>kk>$-loXGBVD(Ah|vOXJnU)ApZR-8Nyv0#Czg$NOvVb2;uKPMnL43<`$ET zeA6zDX9PWf@Q#6g%t7-|Vatv1*vc#Eq1}A0J#O2{5GYi3x|G=dN!TY~R3rhqUQol> z!@Zm|KvKs$`|j2}I4|q4y%1>Rnzz#0AzCT_L7&L(tPe&TU16=F;;uz$znG{Hg$}Xk z+*j(Q5~Rlf{%gSDKYlVpc}nDPU)(GlpaK5i6~dD~rT1xV$_`GGHsa^<_%o*mi1Z2PjhPif^~-*$(q(mhv{u&JZ^hF?J7PWqmr7ud=(<_h z&_Z?t_0SYoEQ%WMc=h?1`In!cjnp%?hiir^yO9D7AQez!7^9I{P_PlpPLnKW4BX+x zljRh*g3?o9U(jzB9NQ{#y%k> zc6{R`EKfNF!!g5HhWUOu-c1}7rb-C4OdKYPjK4US!%3~xOr6cqm@ys#aux;*$D(}I zsJJb`ENa0T(Scn>CvD*?$?@u77Vb0C) zM5hNM+WH@<0^)olO#lJVZ#^EDA(bd`8kFU{jX2guXr-Cx&-9NDc61%n{+i-5*G?9iUg%tkUzq$ zJ1vzq3jzLY_A7l6ZI;2$hM&R+FMd4{{0MGN$W4FWoLvg23l|S#L_FvWoRe^ z`YTv{V2NLp&4!KroG)J6oFA#~dzu!)H3B|o>)fNcrxazARI%C*X09%e1J_N!aQ46B za;*w4+@5kJSpq*d;v2dG3f_RGS^h)1yrSn=RhaFV!NBHSosXi(m=;DRR(}5##16}= zm%b{bb$H0vpZzS<2PQwG_oMb<(-TeN4Z?rUUx*WP)@n8Vhm!tU>FVxTSA4!^(dd`J zmIinAKZGqLPXW*wDb`$mjp06nyFo6$emI}(W@(Q5l^E{~NVn~YAh!oJ$5@=!vN2K1 zC8;AN1JjAGpzdr<-QlN-bU+1=k~<7PLBR#~RtjvpUJz5(dVA7s8!GDFs8M=1|GDt> zEP(U9PCSG=#U>YrBN&sSflPJIwcwG>ZYy4_odk{FijR=}cXB@@{h=wR8RD#DM@_8H z98>m#Tx(y`0&e-g!kZZ^D1>a9iE=$hjq@9BJf=?%?{@7YHX|AOTI+k} zV;}gBCrc_na5;8F(PZ$a_lO8Jxx8;iK?rnYgboc9cl8n5DM&2==d`fqS^ABBxTKsy!U?H%L#nO27A5<)y+){I8S8N^&FSS*5iVe@ zUjorg&MQ|(P}sQ4t=Gf!Lk=wbsnV?yRfJzPYP>gLcib2ia$lRN_mn*R@b3MJcVCka zy8~Q4KbK2x3h7nVby=iXE4spBq(&kY5VsfRd;aJ-+B{60#O1LixGQ5mk0OI;ULl#W zz=CiUwF$SkGE9P@T|nGl$YBm|q|ivJ`bnxgD~bC~BgLbBKQ%o_*@2hYAO;Cn^hz%X zHfW|!@5%58oV}NLX43bG!ZE+3hewp&jQcy@Kb+}h`Iy;ot(5C(V10hca|%zMANKro 
zLX&g3=y)3owoz?yF>L%t8ud{E@putBcd!3Nx|xTcLc> zP5p_`^EP9QKuW>V-bB(9sXtof;4h$7_+Md~dtYPrUrVA=BlxD>w$t8>{w<}2sXtkl zl7$vPB7e*6Pn{^WXXC%ffai^?`Lx4BC=hK$uswTijRz8QE!rC?!5_^AXY)t=^Mrl{ z9477oR=MZ(r|j=~iKjdw3>7?;k=B8n9em7)CBhN$qkq^Oqm{X6r+3X#WL{zb+F*8f zhHL42E+jN(;_--T;_2@vLKWCt+g2lORk1oV10k2FOhkl1>I~(Nsw~YbTEDBKphjHF zJA}KXa0z(F--_zDc3Rm5fVrMyjtO03+7;1@^x|yeWQ(cC(zJu()>*5ZR%j5^X_|Jr zq#ga9IKvxv^6L*%0$hm^)cx7E2iYfUcEO+si>PZ{P22@a4~9AV*FVcQxJv@y>xeiA ze{W~yC;a<|I}`G?`o0{WoXv4|pz-w1OJM1_8XLLovxpxa znVMQn6~(ht@>cxj5?d@GR&J9~+Mn(}WGfq$Sjf`5l;Lr1SLb zP>bu%!S=}LB&*eo$^JT4xBmBX#;B{qURPK}w$r%%qhATHLL9zK3*7ypSMQOrRo$q} z1_*`KlecX68+2$waNSo_XI5i8aoySI$uP4)`OAXaT+C0__Qar8r4?*u-lrW*iAB6{Tg^ zhfPtt-XaUopn85KQzq2>lEbBQ{jKsG&JkvJ>FDIIvE0&bE|z&tngw&&+`T-5s_HSW z*@;KP*8B0?s%sV}d1*{fWK$kwU9L7&&VSHa=zr2)MsRH9epJM@CjzrCr47C2Kq18a z_wSne`5HGwua!~U%-zz5A$NX}h)-CQ>u!{E4)1zFoM?pUyj8IETo_W^Y@W+FpGM0$ z#J^ivqbtOeW-NA=ZisFe@5G+@dc3HnyQvHfM0he4o-ikZi-Vl1F7*Kai2Yte{#Sji^%MZk*Cxgl~75~$xD(po{U=wtljNmN6DB5J8mwU^&+ z_4=dVIy(-X>tw?2WMr#cM0-C|+*|i@sVO%yK@S_L9ShTsNsv7TzgAfqI>@$jc~H6B zslC#bKo%AnR;@o=W8X7ZjY;IW7Qeh#xBpfqg{SPYvblG%{SITbM0Tv3h7R>If7Vdq zvE-InH|{8-fOUV97kH|h$WE2=h?lWaoV|XS0=FBs@W2O`&hB+_UOi*suSwVQRhz|> z;d9NvN0r|xij|K%Pm}7c3sAL5-PYo5>rEZ=6_C*?V~eK=d=0mrg$l5gPvRx$m<@G8 zoOTp_Y_}M7TDS$$to57^J{osGK97~Pol2u?@~!Jod_dR8iQYNgKwVfUev(Ra?AwE4 zC(Ts#qU!h?HGyx7v28wo%)$q*u?>PHT@Gi{H67Lu^w2kSuS2R}ErY$6;-PQlRulLP zHP2Vi>GX%{w?aQ+S5?$%;Dk)SnWMXwbgt+?e5SDBmcO_bbtE#M@~wyJubPTaKbW=F z)mUQt)mnu<818R`Ap}{e6xw*>rCU^a#5bnWv7Uu92#TA$1EWm24BR@1BU` z`E~AzoG8@Ni<%Q2xxohO-5e&|31pYD71Jt9h-Qe(vTV029(o-USByXAm@s+KnT-+w z@{2PzPCYYE+66}TT*URrc*rQWN)I!O=c>(0la4e4UXBExsZAdD8SHl1EocdBub?9N zFDgDBHBFQ?Xi3N|P5Mw|(=A&OTeUM8$W-P&-H#_tU#Ff|G&yU&ZM3MXn(Fr#7#38~ zMj09!yz8Q8R*t4?`!cVz$+oyzm9i6>CqRm|ySFZu#NNOp!F@j#}RE8=rS+q)yn`_XuZr>l)VBi>gwMp@G zczMQl8DgVg+t59P|Je*$rpj;_qHUG0D%b8gdA42oYn$`o@~aZFM~x}4(>v>VDZOSm z!nKy$Kd&ko7;7Bu2aj*}i2s5zTu}?0cI}^5KNImyODOu|haEhI+g+yY=7HCC7k^BH 
z*84OMh(R;jd#GE5#r*eM#|%yV`}jCee$g~L#ww0W$IblYwa$P}jE`2Hb1S2^gNITL zNm3_=2P>XaT5HgyOi!nr=`uA!k|&VWG#1;bXIdVomoM&7g99x%>C6~um93bW$#&(` zbvitmzMJY_rlyxo^=za$lwZYrY+1)qWYVLzIXTBEv3odpJJnhykKMfr*KY`FKoVGQ z+;`eUcZGi(Z$AE{fgE~yad!W${qS}xnQ5;8LGN-J-_m|ozWvRNy3w!Z9lKh)$SZ7+Lh07 z5JY(Yc)jhZM&$VmA%6|J(?Q58aoJnxc%ZY2V#;_TKz4h4@dG2VIK;Y;i*`gq#~LdM z(*M)98MGQG4S4*VarosjOJP;%EpRjkS@?9}WX65Fq62?f4?k9$jaF>N##3@}bk7i3 z7fsW=U)`bAZ~+^+p7&_R02ae$-E@9ADQ%ZB6@j=D2j8^*DivPnGc|nNcA{#}+Faqd z=CR{kHYPcirSi&kucg*CLqB@_^>>mOxZ?lx6&h-M>`nCj+Tc1auRDV~ zh|KIyY^+X}A5;K37*W z;kYeP>bgnem

sSLc=iwt3auVr+$P!P!7e*iaEH?qp-aDk#a-e~$taB$6 zfU_OY@eRrCP(-LEEW zlDV1170R+(xWoE;5hc%a-pr5rF>tk>{Y)G_mLXkym*9*69U1j`0Eiu%*2Ar&kDbM| zGo1R=B6d)DuV|b=NX}O#ymgaa!$eu&3y3s(+{Y2JQ15ofYqoKgbP?-qBh*nYOG$fG z<*eu!X!vTS+OP0_kagM9mJyni&WFMvi7n0Mt1`g9u_tKmX;(&IRp8qp#w&_YVWEvey&4Vf>UnJeF~i$^}Sc+Irq$GF_|yq0|&tHpj>W!3sV zU%UoPy~*wC04ERR<~uNLEt~C_Ltg<|5nD8`yKbQ4qI+&j3&-$zKgAEH8F)NY#Mf}?0aA=wbH~{YKDL~&wLz2(lVK`(7WZudpZvwS7tW+ECF>@q@J{PX{D$9Gs73D z-&%5!&%z@z1Dsi3*Rt=>GOdt!%SsSRP)NK~E5v!rxO0%o`DvVmTnfF({p+h(!Z~zc zzIBh#rd?ZH5{mDDGG8T6y`S_-Kl_g0k&#LM))e~g;ZC)dM9*^3a28)C{5)64BP;wF zH^s~aNueZsqo~jz z?y8T0o84?D^d{<;2pa}zP5$s;Ms}Px_Yf*8Vx`r}A3I96hh_~ltsUZWL>PuxBSoX4LdtH~Wa!fjT`-O+zJMr6EC*6b%|4^a@Tdtr@CtOB_4{*_5*0ek5djQlGG?_kHGS0Jk)H#Xt=>=i_CEPCwF15 z_+CGm;S)F1uObKGc}aA!jQth+YV|0MeA39G{`7Hcjh<#|(O^aVH|_2R{q7;$s{Yzr zx#X@{qoPQDzNM!@3C8^z_N;ZFrJh85BU<`MY#LJ8Qe2eAmb~tg_(eQs2i}VfRE=b( zJvgXFsSgaZCq9c4dM^)SNd-{uTxrC%C$v6I5U__8PosNrnHs=F8l2ab&A$lW4p)Iy z1gNu;t>2TDR=iI`vCf;gJRTV{$fXX_nK!SZ;dBpH==@1!zZ6Xl^|R5H5TKy(st-QB zep^NMaaXCSZmjd=Ldy@LE|2|+n+cVJ#g_SYza-F}=?>mYl~}n!czH9>CPP9%e+pz~ z`j1d1n?}i!U#?-_dXjUElW$aUjvXXG-nN%~oKx~>55OTlWvL$dnmaJ%Ur6?9CF!=* zokXvzie!GTa>9OfJmJBMI?n&V=jJ?%bCjak+r`%uJFQ8O|HRkd|kw^P-HArS~oLhUGdJU8fhd*qc6UvIiR# z(qN=Ed)VI{7TiOtGUbWQN@;dQN>lboDt$9JCHyoy;x-41;%5f@tXG~Ik6(yzULs|j zl}O(7vw0<%T*|yWrP_#eOUk)WDyNjl_py$8^kMFb(;|1Jj%!(|Cx`Ugv;h9)me(f1 z@#2Yb0Q;p-X%pmwHcwdAL{B^7$~h=ZwC~>yrcw-J9H)HROdULrNh_IIN4Pz2=}&mA zm8|uku*K0O)>HejK-p~q+AxdD6{vHM-jqh*MRM9d#gFCGvL7>=Q#EK12$oH+yLP5O z>7bNXo?pZ#>2@v6)9ldOxmiMQZq42(h%AlR6Oa@bpt=7wWcB0a`@1bsX-%FV&&3P=- zwM|Hg^Jc^=c}+WjYexO^p%`mZJrlW$6-bhiT)F#T_vYw48na%rfV9CwHRr86ZS6KZ zlF83l%;vrk>b6>?XH-bDj95&1=wW*%#f;FVwsqRPc@2(>oik8dL!*^Tr;PD_@+{e* zLMc5*-=j#|a~6$ZJ3#kQc7DjTCK*KYLfj-#RlUy!dw3?M{($@a+S74evhSpI<79nN zC)Ng3C~^N*o!#F&+Q&bN7Q?AZB8?Jhc9M`#)@aXL_^`gE9ROrt)Rixz@-uu%8cDp4 z(fv#h%fX`TLvrpGkGfQHI+K_=+7|30db;)W?s+l^V3OV7^`Z5~M)!$5STYFi@OA(x zCH~qkQLoQabrKpUv7#qo#nCp70JD&;M8sij;yl^A@d{Zf2?7xVji+rj6>2I$j8x)IW4XRG#Ym$UrS4gB}S&U;VyWSf`+6B{w8M$ 
z>W1*LuJL=t#NnJ-1?;12`XV&v6*2X^9G+8scqB=Mz*?3n5!*LWzhp6FGv+uKa*8J& zFaBcPIbHNRJTs}KL~{x1S)NBj7f=hk00ppGO2B>!pwKZcYr7_~rgu$;jpl2+?DeUA8_AHx+RKwgY)G#RUv`&C9K@wtT3@Eq z(z8hB`xrN(zsnQ%VQiq~^4=c`G2uFb7vTx5)qLyn73s}GEm&&@Icx&_(YQ8+MnVGZ zHx(9La*c2FN{B%*Lo%7ahll0u8w9+=7f>m&+x4LwH@h_LM{WH@pIN#9P3%2DG^7A$ zD_c6(yS!dHcV3%~no2X0hI`6wMNfJ5EUWAr-Eeh~rTxehz|ZMtZI8qZaDt#PFH|xr zlVGOxPpxEv72Q%2K0+rS>5LIFAtW|Fob|;kqk1`6EPZ^GS%AWX?MHX{}9o-8{ z^a2Zgn4C6+6ej94x=$%1nU%()3+NkGGVrJ?s}oF;|jK;}B)Ivn%RGCY*n(%Wkkz%$c*t^ZPe z5k~E7E}^+)I@Rk@kh+>ALw48wvH|wibOW_Rp#RxPeY-k>WpOzFbO4vMbNE~UCjLVF zHTqOaJoIJ)ZFs4rhl1sY=%+1=!Hk4=NyDAn*eP&kR4eDVn0>2fD5Knv+$HFw7}ZV` z077*fD@tUO(wrt=Kh-b zrjf}@13hncFL}@Gyqj=T6BvOXz+YOuYnSGmD&rdny9*(rw?-wIFVsgPnbTq}*gCtc zl%l8I)_FR#$%+4c4`CTJA+7UBjTF|;7i7tJ93tC92d*q(9YRyDUn3nLct(oo=eo3~ zE5HiteOVS@V}Nc)Km|KBJtZ18V3-i-chhV25WuQ86@Fi|LcO>381?|2eee{6f@1zo$ClsC#+nX!mTxHIOX}Q#2fAktLev>J9pX`Z36btSD%VhmWfP|BTR9{r^0bTGwF?{pN)9Cg@VGBWe#}_| z(yFzeZkAo|NU(V1Zw)iHSFRQIA;pk77mvyKexhZ7>hsK@(tES}T0yIpwp7L=PK%kr z&fSE*-5@(t^%31BZnuZ_a@xFT?Lpn8e#YqqrwG@pDflW`D%S6HJpC~V&St@w0@PM+ zDm&Dko4c(6XNRjTSS3;{BrLb#FNji!U(tDFIh#>^c$S~Ik8#@bT3=DknRTSqGg`bW z=xjp`WN(Gr?mS}UYqg{feumm}J+W zIO=q~&jea|(wWt;-f|9WD(K`qf@|Y$?4Z$K-UwLNkKrWdZMJpk^2KLDJ8xZD`JMl0 zvo{pzy*Owfnu?)AxcnQbv~iqnIE!|G?nz(>`#NbZ#!^cy7FQ+ft6Xl!cX1s_C4{JY zd$~)C{>(Itn@nyky6pCS?^g+9M@bwL{^ba5&eIY=0$nd8OFVDEZRbrp4)du}JL)5LuTos;ytoF8IIzj8Xq;j>}H zTie!Xcc1vQ-H@zyq{&2qr$PdA>N0nvYY^4V10;pz;{Z6TrCrUXIlOqOVevJ$p2Ryz zfiO%yN4P$$hW7CU6@r)Mo(iTBy$N5$e7ykJ`>^_p1n9X~Z#};^bL2B4 z9W%Um=eqNg4lI*`(vwA1Hj=Bigv6uX3L{yO+3gn>o?%Rpnk45^h1H_I*NxmV)ENn9 z&p*$jbMMAP`@@82WDQK#Hj*^E)}-KPxT$^TI7LHaf)an&poij5ZsVOTI#ws_p{iEdT}}00 zs!xD1<8mjv2D5vnRVr72a=#Blg7euvd!Q|6Fs?xi4BM3ul%s znwBo&j%hKfC~?90Gp-|!gVep zKt{dW!^HaBIYV04L<{Z<1Gvp@tY2f-4TGkm3St*6WoFMV`*6itk(bn6cu*1ZsnF<# zZSQn_)O5bnqZBVGVfyi`O3Nr7+lP}1jgdV1(LR!W^-=NWSqhz$3(3)BT;*uPWzkzg zQx3CQqvHTgU^nZ|EBeB<@KNrey{sp--->5cHxjai3~>&$tJ?h3rky-^G1#t!X}s;+ zs`N0iGV43NbEf0nY5|d$)ldQN3bD)BgKuUPXKm$)^=eIqidOXoznAz 
z%Szvtc-U|YyW@J#r%6k*&Y6BLXT5abF~9&tSlylidE0mYc`HBNj$f?MEf0bfUsrA| z9H1=sCnX?!b0tMJnx`k355!RB1I?=KKc)N6EC__A)Tw>~BLX>4UGwcdyPLwJ48W-3 zk1Ix@A`m(vI$5q`cjpR(mqOlKnfy-y@l*foV#a!LF?jQtL6p@kf}jYye?Eh3Mg zDj=7MGn7auH-9pPR}OEF4Pg#S!+YD$Ow9cIWKt8Lp3{FZtCC0da1vjyEsR5wfrQ6Y zl_0QcyD(c4RNgpK5<0U0GZ6+!LfuQI-v7GMNLT>n4UyvlHqdN<7@ zNFPE$)rq;EU*0`Czj&N$KMMg#ILeGH{v#~#BW(&CUu8_qVs>msT!vyISJFB2EVWYJP+L!1}jAWx?fG%I6tNsw}tuR64o~w zc4|Vva`kD3a+MnrK}!q1XZVz3t&D@NKnMIY?!?b~g)(KF(54#${efqJn++6emH_Y; z>>#t^Q63bim>ba9Th;@`7+qip-stRN_E z$QVPKf)wcWT70)_+o1cMD)2`pUV-v2VxYqOx#r5af4d7R1af`42?7uygNm>pS=M4; ze0WA`erXjXnM^}v*nvDlx9>q=xmkYb9#A7VzXodmp0|q(yQU-l$m%_j6(F`oD24gO z1ScrhtelGT!t;~i8I<^6&*U%@S(U?J!bRlowzrUlt?;bQT(rjw*pl$ zR_<2j&bXi(1$5Qz=5rV5_|DtrYNB7oKL@h5Bh`x;I_3)-;2g#2&k zO2O%k0AP#j#mx`1|9$d7C^C;rRtzg0tFjJvIsFLHlXAu2;E_qQcagoGW2KwOPP4L=N|8`&h_)(kf zyf$S5l*-Bhz~`X~#R>s zPRJPnNu@?0re*>72Eg6;k_ld%(_-_m#eYcTpP!6@mw>(<)1bAa<{f@tn1oJsJOS;k z2B@&o0Tl9011Nf?*nD58HHDJIaUJIhC?E(Eadp~U>}Q9+Lh;XrK`v9hi5f)iS{=?C z>wT!kUsMHpV$u^i(g)?(30K02<=fr22`%5 zKYtke?@Z$?+HsSWE>KrZ4oMV(;M;1xdWPZpp;%S10ONq)%hmteISwPx!C)p|`GFc; z5x4+C6rO(|&&ahH*LI7ob?i5~$5W(0({hMl^@`NTf~XZz}qK_*9KMY=Q~&0xh2cwJ!dkgp8WP z@pJHCo|;X!q#kIM7zFitTaVePpa)g!irWGa%Xi$M#cJb$#kNz6%KZZ!=5OQx>RdWS z_DrFsMpnRi_8#g28U`R7+M@V3Sl{Kl6U2A#fl?!<*;nCrN9?)E#LsifmfH7SHtvZHpxKkFX3@Or}q3;q|vfdj*a=>0VnK(oEh z&ftvuZ_hv!4Hq5iMp6Qrx=i)nmt##`1h6IQRr*>9pz8eXl1TxmEpLK4gZSOTop1ps z+GYy`tTlNq@xLq%EV(PQ#!I-^sE)>B-&tZtki%H5OCn#(IEQ7kB>0-5{ z`tUAs>czzx_ri{c%*KnFNYdgGdKj8#3Papd^qrS`N3OA!_|!CyHMvBZ8b_i=oE!Iq z<_7vV+E*RnqNh%y|Lw1VQjgi|-;EvDyOadon~Jl|dP{}b4^GYZ;Jn}FkSFp!na0+1%%4>BL> ztEuPNt#< zR61>8uOcl}&ZfOoE4RxeGG!e&{#|kbI~X3__*=bCc3aA-@~U4|%H5aWWlS#uOTe(2 z?nc~nFHhk?d|mv@>Y)RZ@vN--Z`d^IRQAuL5xz_>N;e^9_IvNHoV<$rlEm92X?Y>6 z5ER%G^MFQV>Qyd|hW6M&B;Rh#s1<08Vmk2@fR<{zhGJr3C6tflgPmDZbref13P)dx zg~sqZjWuhx&a!qNw(Q31Bs`nc?BaSS6Uo!vv}vlEt1DSDo|m7Ou%++%b}nH>gWMwh zL1ui=%;(P%!|hvh5{_>gWk@1pxHZb{3NNxgl%VXl-T}4^UT41FNWIg_!eGjFSB2w- 
zVYw4eim>4xQK@g_RbJfAr?}23llarEpe3H^V4nWfpv>>;)9J1Im1@%Y6KO@HN2@_1 z8QWl2yUL#jU-Ssk?eoeE2Oimr4}TKQSYD`>oN?p%Hg{h=@!9q#AQQe+Ul_`YDILlh zvXYF!-Ca!_$}A#vk1;7XF30vN9Q`Uny6MQe&Xdt^e8Je7PNtx$jMy__;o_4$7?sB< zv|s+Jkwwn}JOng0+pCV=f3lO_tFQ$Sc4{HV{WW4gk+-ttpfltUs2DU0x^ehzg*We& zJp;>kkj!sb{|q;Ob8~YI&o-+)_6A7xA3qubxG&YOT7; zdkgp#vS?QDDfvJx+qMA|Sb0@uUwQ}wVMVt)!P0*AG@@qSFT|X zf;q!bR)5!bGW@P5oiS!W8O##~MZi~I`LO_TRHgxdK&U?>?E|2!gKYr2H2_%2O-v)n zqP_gsgSNvR_al4Xpcc{PyJjkgdyW^$BL$S*x6-u+p|Ut~ku)a1YuQ4nH{xQLx8QHz|bZ7-IesL+s%~sBeE*DAc>!7cdHfLYF>=WBBHvr_-+pk}? z`n8JVk686T87eE_V>4L#TQPue4hp#K#vCv<=~yG}~=w>cZi zFZ@^4N9|`c2n``%F*{%cJ~L3;-zc+qCmU@HGI`A%WE=Y5Y?X^|0qvt7>R`6U;_oio zdAZ$v3&dX@e4*$YQ3aS-#j%H*?~)CiUAi~cL26wd2F^wA8s@61jB!lR2qUW>QsNlC z^%`uE9W~uJ)QJTh^I#8J%PPO^0lBOTpt({Y2TYFL^3Zz~-#O2sa+~=cxZ?VK7Q-&H~wmA*r{oTcEM} zIMC6+#B^<Fdx4c#b?i%I_ja9=yM|!umdSAoxv0%zb z0Oi(}&d*da3U%S-IAT6h)zQ%@7=`d?g$e>3CGUcdkneVHPq7PmKR*DFNzcD)t-owaZSJ&+lN(_eOxDd_le4IEYuR z31xo~UQ#yUz`w6YVY-~7A7 zCnFlah%S|TsaC;Cor;gi4*HW4RI4B)+Oq>GNfqM7muL}3SNWu=c*@0pC)p`9bSZQx zV;pp;_x9+u1hoqLgSRb}Si~%qmP9Q#N71F)tUl+G%e;U5B7QowDH}o|&dK>YEm?@Aqrh*NZQ< zm&bocj=5~KV|UiRiWwV2@#(sRWr&D;FQ0wf6Gaq3Drv2&^tgMU0d)R)9A;4OONt%Y zRlz(IGdtDYFh`_zof3)PZeV0&l)M%=f?FXZo2t=PhjF?M|KSR-ovp*y&ka~NVc#? 
zbQ(?9uY~M>GS^y+;(s;5>-aXlI+l8Ux8W0d(5jXB` zSbvD40gsineJ7Fs^i~v--MBov;;XcSh_OgP9-CA43S)chgRj-k%Whoveej`3q0j8W ztSh=yNcVL9?=M3X149~9YC-g1c>M55xE@!g^Q|2Xb7k_O1(&`Z2j41z@GWH}CBiDZ zQiR2K{Xz@g4?+D$-HJ&~>c-_(USI7hn$)gSJHoX0AOSX{p-eOj56t(0)rsxe#r!7SjaI zv(7Q~G^SaX)5U@Y3Edn_LcKeF6c)B-D$Al;Ru{X`FrlKA$*OoiWa7!E42y| zwK#t_D^9G8xD}iw7QV8tEzI>=ANC&4mt)8qsD9`Wb)~XVOwDf*Jj-0RWK%v|q%vJZ zdalSMX)fV9B_gXf&B}MAppTnK=D0B6qZlY|4!1P!fj6t}+WpIyFWc^Y<`?|2K_vQA zZ8re&GX>^Rw-cBE`;QZ7c}%3NXz6vpkL@i)9$>K{lFxVk^sPr=C`wCP7XM@@CpH*L z;8#rjPbC;EOKoDMe3i%b{hLr$^&!xgR0jYe6{5%m8&x*Qe;9TPC8uf?$g69jD1K+N z@)r@di+zKDVZ|3;rK3}e4ui0tOgfdsx|Lpj0He=j9Gz<)k z2db)Z6;I^a^8tK0vVdr^ODit$~=sAjrxl%DgQr zTjYf&&{v+dVjUe}pi6br zP5eCBzwP%09|E944nj>R4Sj+6n|t&<_6TcI0a%mqTNOXngD7K_|GXuPK|zs9!q3di zEHDU+=EafHd~Wj*zK3s+#OPAb<)2Xgb!I<*5@r04I4#V4SeOTr+<*p@tpE72#G)hO zr)_mb(PM%kcxVad4+nfz)KcR9&t*g~h*~OL$i{B+{`sr_*@>+txTOv)UD+RW{*Vz2 zWo9SE4E$|qp#~UQq$P%;JpLXcjevoPDKAr`she2KMF$cPJVWu4IE#b%@3@V6uKdsY z10S5di;O=p0X_DmFhJ=Gt_$PPTLKwqxd5JD;{R%<($dnRr!y_` z8W3dbZJqsZ>pZp#N{}a^fQ^71hC2VYP2_LBn4hPu2$ZNK(QgDNr{u&0)+fg<^zQcq zg}o*O3R-Hd$q8p#$OaGppmIBk|M!Ov$Ljq&!Eli|Pc^toclIbxwXHh;~8xaB5!T>jlZu=HfC-rkAZ2n8|CtQmcaqD=n5LxCpjuNy+5 z*hlF`0c7|m-=4Y1(Bc0wL*e6NlcrswnXdHdyS9*=rQ<5p;; zn+y!)hmB*xMj`d=gr10SuA7bwMDSSXFq(a2dlzAyE zm{zrhoJi@rj*Cb5hf@+kEG?O~e)#a*E^g9NQBg4?7eC}J#VF z{Ow9a<4Dl`{QT}|txHBLO|}XqZ_3%Ssaxl>I@b@ag%s=O>_#+>1~LbNMm_+3=(?Y{A3YRjn?gG*MYKL_*G?Y^jZaF|2*N}mwO7qiDif^i^5%GaXug! 
z&NB~DAu)0gh+RTr{5VNFrnnYbWs<_R35F!)VK~k5W?+cw9ZMxFArZg7$o=O}y+jBg zyaxNV+{vR+v)l_kN&bb#pITnNay`Goq)7-TA|f1U3~3d&EZ`RO?@I9tJRw2RQNr8O z(s{|O4H{};Uz6&Ze`grxBmNGjwPSHa#^Q=s* z!_pbkM=E3_-ONhL6{eWSH-m@6z+gP#KJj)UMW&wuqHL219YtkUb=QG`_DJ4g5Yl`C z3>T|{!Ldr`Vc-NKJ50IqFx&&kF)Yz1#DCxjMhM`x96N;afYXNyO$>k(uL>mN?)X!93wkAy-N=Ykfvpab~Dbr}!sQuW_7c*J7D__wSo9Y8bk)HSJ zfvX%s1tWcdfu5dzD2wTZvKeqfCm3qbXu`?p#KT?xBia61O%nQ5xfs(6^s)hc#z+BN zg;(L>MnEa--c54~uGGmi+jmNx*@N2x5=%GEKRJq12q)SuzS3~tTbY?jo%X%g3-zOg zEj$whoQ?Q7emnJ)4`^7psD1Wu-fT6U#VS=(Nl712;WUzRwHOzLdwjG{u&+4I$R%8r z$uuk04DBh?XcuZ`;$!kmU=n?N;jg9s{7KXi<2(_W(YHvN*m1*FV|-Mo>l2cMj-w{e zA5=F(X!xSH=A}zkLIK5rOI5<=Q@671is*@y?MKY0-`~)}3(-2ygm{|ah&OubeHjI~ zt<@ zT_#aM#V75}`6s1I8tV!hA5%dEYouB0kaCgH%vVLJ+MpkslBYl{z))W|Hq4fX3hh8N zWvDHjE1{l$!sE;maKYvk6cqfIYXnC^M8w25bjdTJJ-v#}mhguHaSCmQAO$tSz{}hJ zAX}4MJbwP;J1Jjl5TYIi=y(Q@%Lgcn7_|yCuLm*z!2YGFf@>8zlD9$&%|kk^3-Wpu z)@XkWdo2}L=9v(}tTOb|WS9h5OJm8vlNl{w>-S%#wW?J>=v$~73@$$KO-oSp{p&37 zfCBVXd;P(0v;^cu9?lgWzK4rMQt7NF%L{pwRI3svDmcOdSRt-n-b6pi=88H=K z+Oo=vX{lo&7zBoHt*2Rc{9z~>nDx9z8{3arb-_NF&gxtLKJpaWdoWD_`pHs_W5G(7 z%b$>O{JvCqX7psaSaUg{bqSjc5S6k6NfAT91#LDzKi|FEQpLc;lC!jB!fG4d5oW^;*XAFiairm0HmNtkMYRNh#VkyDawHpllAQvJ-(*NRrVLV^5{jxzqiDBkp#5X7+1Pbh1ppGAg zuKHTwY9GAlJUNOSjW_|%1XY$N;QouOrf;>jw))!v%kzTCXjvh*>A3rFyNr3y zb0Q_r(}m*XT9?s25s;aUJGDDVS}Jj6oss&LJfIugx(BW-M#EC}7lpj-%Qdz5 ze`0CMkTt6O@~0Bf0IrFyBSQEOuAnVxa7&sR=CT)0o;Gy(!8jj`{&e}!(@xePKpkso zrx2q!U;$%D`#%hYDT&6}SoVVGe>tdUW#Qub$jQmcl=D8?JuxMxCa*ucFvB*zzQ1Gq z{lgz8!r%{fr}o<6hvE0jrbJ6I*vgkU%RvA%p_B306KJ+Hkc%!AjQzt)lcEMmjYTKR z>%aWsF)I^|CV;Tam%h)%26fN_Hx)=yGdS4Y%0D*g!E^EZh10;v%!((7erMGlqO8D7 zPV*rSym?}R8Jz{E5bW1+#|mKF0aetbzop6%M(shu`PAEl9YHAA>{% zpxIo%d*{ycKeD_Q;Q4@jdV#zn+S%3Xw+iw>!o(m3@gw+QwO9e?X?BtZmpe6X1Ap*A zd5B^m0O)^RjN9(Kq0Gi?`!Z-nmq@#j!k(ZNb$nP#%5t=Zq;=AuCJc= z_>N5glw1HBw5S;y8~+!7288*401&GK`-&{a$Jolg&dpKAJ_J~3RZ z$)!m^_9kH;2t$^~#=wo{x>&;Nd5qV3R7$9`|xlW>@}Js4Yl zVQT01p{LM@&V#XVQp%H|6M{s5fu-c+bUn4UHtWk!E7E=M_hrB5<3}^lVX7De$B!LX 
zZx{Yoexip|^5nG&nwl}0YWa<9Ag&V0p?53sn$xBs^pZ1N_qP9w00d5Fv`MKS+7#mW z8h9QS8iyRL2e}tOdJFb+Q&2I$mWloUto1(>!~K87S|>n(dOql@aVysJWAd^5`x}k^ zziFY^&jBV1;W9NVYy07DfAtV>R%STwto;72PStkX;b{IrkieAQ)_{ofW00I;TmYqH z41#E5rv2(z@As)ac0rSps6rP(@%L?>_@hOqFAy)(8aEr*C%~PSZeQwwDHo|A#iY zd`iCy<5LI=NYnl3&3!Vh62zP1pSt{MTJ1MzI2T!RfJ!L23VN_U2dT&X0ybZ?frldK zUs>$dV*fB}@rW*fhqo)pr@r}Y0N8UGfCWOY$`HOmMk4|1 zn4CNW+DjRp!Ne;78A*TEEbHpTTU%Q4c*k8nhO%jpEN9$*C(8rk4onALFAb;59mFgP zaLC+zepmLD-qe5^4kM_COnf^tOBu*A2FcNQ`75L4Qj335F5kBuG5VoaL&_DqPdBIm>Y?b=cd04;1Sd0Lkp|B9_RxHQ>Ew`6|? zS5BzwnBr0#!BFni2uq2R*_UJi-PnuopJ>a!5>K2s7)zu(ee-0f%R5MwHquN00>ybT zvlvQk>;7)L(EH&Qip=Qv*8g8LSil-(BCdacdSTtx0fkJEN8%xfL_rpfnB{Aj{yz-5 zPwvP59#ALs(M^Z;B)u|&n`{se<$iIfR+5^$wx3j!j@B-=MR%o$tzBPoVQ2~kZ;+Y#$z@# z05ZDG!^6XYeYyiZHA13`s@uDTz(-^IEp2K0+Yh^OOAQd6h?Z-|&-m%ziS8BxM9v1F z-jd7I!os6~)jo#O!hpWT`a8K(@eP^RGXF;8z#+p#2n-V5C6N64yeKRx$`sTgb1I^N|I?g%NI19$5Ye;^Ln zYe_JcMY`gL%AW@eiIDc^_J11MD-W)N#7T#b1zJ1ee~940dR-88@ihM5RX)%$GJfc# zPRc%!5P1T{7xL@GE$|P6)XtccWRe>O<*K&%RYVhh-$_iI8lM+t87E}$EB#pH%#aj{Ukg}k*Qw_v;3^ts(5l~gmG9#Hr_AKlnW)d#H!reNiODhY~>$Y zv8fIy1h34l6|TEJEbq`%Zf(%K#uF#(7khc)=y18MHFs-n7eEGahA2OT8_Q? 
z;&P^K`h%IV2V!CfeSL#r!F-Vd8TBI7ciYRE_Ce=NV*9d2hM86|zkb~?n_1v4`nJJt z`rVI+C&yiDIKi6Uq(3=u#sPCn(L!alBSOz;c}VsDQ1;$oO>N)WsNzO23PA-?s@Uj) zN)1(_ROyK*NE4MNO*#o6B}7F6iXhTr=!B~D5)e>96hd#ID>XoXKmvqv7u(;t_uI#P z&iCAZ)}v%)u9eZ|oMVjnz7>~K3hXOX_S3dgBl-owd;NL02h$AI!2PwBNqut20S@PR z;$(!*Sldq7AnDr+c%DnAJr?aV984VTtFu&cHb(8~Yp>k|awH{Od1h$2iu=7h&eMze zp3Urifzg*~(+I%<&3a;hy;p_6=wYs5!cN2E@9p2pZcADWbX-SRkmDsKgPrRLyX}nK zm-hsRmY=F1?q<6XTW(;ZX=cEBw{EP2SMG5!&}KRUB=iWgrH9#g9_$l96aYD*|I30G zXdOO8>wto9I$M-F*k&g`dBG;2JOF{&qJjRm8~5z29}kp%TMC%%@vB*O^zfViX>~SsZaQB%6oS)6CGD<`mXlN?5mY?On&PEqaOZ zdG=yem1{oA%T61P)OZQT>`&CCSny2XW3+(sFir4dG~1fzV5MV8TcQ$isdCSjyx9D4 zZ2Y>qBL<)+EY6qoEqLpS*1afLSgMHq%ziH^$IegZ)9+ z+?}Y?kk3HZ$OBUfqNUGm;Pj~P(2AA?cW|(e4`zp4$k1r%Cl$pmec!nD5rRSZlJYU* z+FWo&fqUXITL2Ge8nk0i@bs1Y;h7m3|CgG&J3tOX+CNxY;4%{o4+8jY1_wUa75pD- zT)?@k;xx_1yz&T?l+D~W(*@#8>Pg6EnZ5V6+TJ`_?q{aiV)*&1i@)qJYxSju6~D5+ z^#M>Swh5cE}qSx`C zRC0Ac##s@(rq=h4#i^Y~Q|z0g)s$gTaQVfdb@F0n|5&NLbx_+VGPt}V`?`QuF^ z{?SD${}4tKd<3NlaNu9M*8aTAXzUjZSZ3vw56EEAigH<%l@G^@Gv}WS+C51gBqX`C z58aQ=+HM+UxEESy6Bb@l_ytwYiz8lgZ6bj^IE0;zJzMw7>#)y`)}X8FDzA*ZV z><5U@bc<>ej!Q$fq#afjy!Gv7)3d1nu4bTCB>4sH?A#|Y$PreY7vrG{Ylsk!pU=^ZTM4Io-c+I4oT)nOb6WqAKi`x#un2n?viGIB{Vu~{Seqd-$4&j&La%xQl5KyTYj>~rfCbVa zA84xDlzylm+O{-Z{mSinL!Y#8EX1F;rYOQL*6a{AEZ4u}5sunlqfXF0E6H7Uni3!j&Yyz728ov$s2+;^o7Q-<(f~wd;4feZ8uOWp=M_G2XwL4 zdbdZdoKOOf({^G>s(8TIx}#vQnElHm7jnN(i@GQTK6#4K;j5;&^MJRG64UX`a_hA2 zUVgh3e{5aP*I#+9*m_G})6-7N97S@Fm6j$K?7^M%==p1qOKgJKa+$RF!} zFrx#F=10EgSKS(yzA)4-HmG4=PFrMj*Ba{cFpOL;E*X{`VJpe>$t*AMv3Sx+>Disa zEt2_s%VzKOW|dklD)ov>u2qecwRMqte15%Ws9mg-TmMP=aAYzia+}2I-{j1wo&IUxpI<@m~+lxKq=1!wa(WI$0Avk3ru-|eD$(t zH)9fNczInnmi~xXh7b%?Uic|&b+C+Rbx319YUr_5<*4goq-r(;APsqQ@4+SKvgpP% z4Y$Z3zTN0P$NFkRE+c1bqGz6;XlkThzYgR1o&l-Dv-)Nv44hX{q2r?GB1goEGNYJx z`3#RVpR<;P98Rf-U_u7BYxw~4Z@ABuALvx2jif#M_tH9wR*TL#3`$R9_iz%4>^{AE9Ww zio%qFI2P$|qoRqFhNloV_{9(H=$$TS=CcJBWouQJ%`Fo5m}{o@p80Az2gQ8Riw|#i z@wSn7>{2)uoxyI(Eas@%tfz$Sze#<>$#Phr*3Uj4A|)HFxuW{9wi_;FG%ruXv3J4+ zE=%0ozRRl6TSPt8wU*W 
zyC<={7_`ACc@`$GCR1?-T15z1T{xQQ`5IB7k zU9S2j&L9@M_NFa%RzN$PVQTeYwtOpA36D_SUX zblR`2_qO(`el{ueeId^pcN|pD>F!W?mRb1Q=;ml&n6Zn6C-av)#U78;sU--@U6d|x^%yIF*IWvE5a2g<#me#|7*mRs ztP@;>Vl8We?EIoJ;>jfz-|{~A7Ux;U%PVxzN>q|NR%VlX3waSWrgvWbqRAWVL`yOP zP;tjFUYD!6jC5=Un6t?ln?4Up`bgPBR;FOrUxl{o<$kPL<#Bz5 z1z`KWK1fD3O1g=$3r*_dcuYMNbfPiTYJ|S58%=oJ9*Uy1^16{MN2de)Pv=&>j{x&O zK7`zmF01N_Y~<$07SPvdWTYTnB~Z!ixeNk2u5?njS4u%-5u^w}eb|gW3MudmThw>Xoxkr(*G@Ozok;l!9M{sqLl(L2NAU(~pBBjV)s&~KNv>`} zM%h!6=2_wP#SnvDooX~1!o58|g9|ec=`fY5T?jT-Bi7xc!3UxtiKKwj?v1a0KF-+d zB`{rJEApYH&Z} zH)ZLghBxVl_T50#%CL0 zJF6`KaVV0G?rKnd1N)|%@|r&5SyVgSG~HIZiIt#LX(2-q;3Hm`JQZnU*)J^-YegD@EXRRZ8OV1(Q9hVBPU6BGxAlj&V}9RNY0C!uBvC8>@@zBVoG`{UurzGYW4Q8zv3o|04qRS} z;f-MIfT3J3Lgi7XMykD5Y0?ru-laL8&t5&OMXo&=7{Ve0R#_=U1RW0zy-<%)srEif zJwwFvOYV0oL+64na-Q&B8W`Q8FS%&FJi|48!Nj8fp0Zn= z+IMs+qL*|}x6)eGytsvwXYK123u2Pt8=)I2^g$3dIG>$~(CEcBUo}<(VSW`=q$bOg z8ln(2-?~}SO6OeOBdd`NBZ|G6La%|!(yV*_#l6f=+Lwr%giN2HA&=&bZJEG)QwD9q zh1iB{bFJN}$5{$oy&nu#jXo;$xj9m89pMxK?yk6S&!Ya95Qk~fk(p`sM2cpl#-%b_ z6<#ZMDU>_N%Uexk**NZbL=Smp0uvfZ=iY3IBvYYdG;xsvdnUP$|(= zOQ>MTRawDlo=Xb--w+CSkv85H3Nu{a`4*d zrs@TfMNm|#|6E+hqi99sE!Ofo_qEN z|5qM4gM(jL1dMXK&2i5l!M%gq8NP*eE2At{xy6^g3I(M7jQof>hv;;tS6?QiXxM<2 zhm=~?Ap^S2dzYl!syf@RInIeZBh~uru^#0-Jr%H#fj`I}4RMI(m;b8ty#{StC)3}4 z3@q@301YbkjmQho;X01>*J79mvKtBw98rBUM91)0-{YUC7bo1puH0iC7{w}$h1Fa( z&Pjj4)7IJZr3fu1%{i1iuhDNM6dTIP-z*jERlUAKnSUHK2t9cwH_SR>1#})BC7N$^ z;A*QrUz(SHIzZUBEnS4oSWZl zZ;NiMPI_w1qA#quNvu3yluChcsoSEXVuYO97SCNiS!hLhk|q1 z_$z&-k#p-6NY&KrMp-hcCcA7!_e5FIy4B6B14aiIzG(>_pAPCQkN4Za9Wd>+O%&Fe z$(p~w43gSf2^zOGa#p*HDi1`ezFR>HoPg}=SyPLUyorZqUuD~efBN~>K1dCAId{jw zIH~gUjGjqMTGi|J%}%gJ%>J9#{4~YA&d{nTw8F(c&fBiOyiU$IYm7rhv~(+q$n<%L z-s<*zUn3Zcck+tz9+COWoUT$m@LY>epwWoA9dt(bGX9NmO=qBT;8!YcNax`5)NA;A zZ}LKKF(*{hIkj6QfpxO&Ku{1xRC;ut-tt!Uak5xej~o%Q_)D7Qb*fJAyC(&NDdKrJ zZwjOP+Ttf$Xp^mjYrcDw07C4q)NHK#txA?rq+9&X6r~?g+}Ens{pISg#=)Y{Gi-6$ z2MoBZv;7q^&q{$X${fny(fIC_e41{0_Rg9244NcWcZcxh>B<&>9`5Gkm 
z3HnhWNM$eRJ8W;^tx&RALXK1Q77w3jiy8B@cv0q>j%+&n!rqF7&FKdMN##TV#f zkuEv;XL{NOSbqSaq^Pg=%63cawFL}|z!Cm;TY8(uyKiap%Vy0JFW*|N3lVSo--!_j zJ&%%34E{M3tk1^ZFn>WHctEJ!Y)pl4s6>DE9vr=(B~Z(U*s#Ufc^Xo-gBqYIM=x1V z1)I8!eyFx*<$xW@z-4EttUo>t(P@&lTt$!huWJ~BjaQLDxLl!Hhe@vMKQ*ID2?S_) znQwAC&Ha}w^MNp>(J~YfemTGQXw5*veH%l{dkQxsjDp<_ zXSwbSWvSZzXnin@=}qtseN_5aTtu2_c1dUwDpHLu7Q^B)nr5gWooVD;$tX3{JIZwo zbOAWd<=D*6&m^8Qv$sPX!`-))qJ;AUU^WpHQl+C_PX1;~6NrCYs(qu*_c4Vxl$%-0 zPqCbF`oxi$A(TD;@1XH3d@@+ZxRi0D)IkQE+72xmaQS0{2zAz5u#xxe+w|!lRP~F%6i&FRhD}%GD7F8mBYWC&jTAx-$i>1#bX$XTzCPI|OaPmKtb*7Z@7h)54pZ z*i&5S$_~||WI0zKazlef3PVpN!$=wiQE-}RNYqFG5|>I|UD2yzl5IulPocevl&ZMG zDzRbC+o|63bdAbvYR9IySBJ9BPcZLPouo$G@S!>@+%x=~!RoTMi%DTrZKzHhT!&4z z-@rKM?A>3Bcj2CU)#M;>=herbI|8BJiFglA_BU2yC-`!mv^@>l{5EBkh7KL%&(lCL z2dP-C9k}Jn;x9WGy)*2U>?HWs5@C0G$Q=w8<9wNuW=MScc5E%?JLQX%iqTe+qCI+^ zZ=@T4R-)?HuW3K;$z-DyYK)q*47UB5Ej08w4R>IE#on5=iI^Lz19t@VO*Xv32gvMU zlz%};hi))m#DCu0J)%Xzv0#cvcVFf@xMy%U=Ty|k5?soupet;28kQv+WxK%Hc#k$S zhQ^EBb-H_!%r+K`?-!q ztNa>wuNP4^Ti$730J|(TV+Rbk69ob~+{q*7o5c0h=#2F+o8*b>og%?j3u=CXMefc& ziDAgl^b_1h@Qgg9Ujxuc*|A??oR!~ip%uW%wA#dOwJ-td%ve7MVzN7q*mxwLv zIWR3oJtQK+*W_Z%^n$<7G`Vh6sgq!?k+J~dgL(9HDan-?^n&Mq!ODmwqt6SRgxK`y z&12hFkCkOD4*L)f3DtHzSlAK|Vr$4^?~_%%7q1}+Kkl&8`3{Xz|JCF#h#uF568RVS z*Xx%-$Jpw#gg(pY)J}^U+m}T6nVin7AMTGqP?Zym{mQ;tlZ%$5c7Fav{&g@cV%!v+ zA92XkyFoB$omwo|R&}jA&jWUm6O1uftDhd5jk^uP>@qsMC>)myQ#+O|<`>}aKiYI= z(-M4R>X%Zge%2n^ZMl-}GOK#X1@XnW&T_5B8dWSh2rX3{y)^9 zvmXP1MB6QJHwN5$HL^Brqcb~|D%7aDj3o^LH3bf_Vy|0xNC$^APrFMf*gv!CLFq8E z*%BA34Y!3zc~3hhRKxT716bBHYQ8|r&lTP2suck(!fbsu*X7Z=#k&Y6)x;7FuEfw+ z5=9IClbhU&UzL`Q^jpEfc!hw*yx7#B2Jy$W^R~~>l2H1f5{%}M%8h#Nsgi7xy=_u- zIrh1jEvzad#?!aq6_YU=**g+#0<2OsPD6~FIPi9Aqnc%lN=v~Cu)@#c7#JK!jX{ge zwkyqK12w0j4xb+X?JP68LyNTK_Z{>xTKN{$=y}GcTMe%o3(~&|@Zjw@zdEQEh%0l# zruxnnT~MO_ItdlYd#@dmH=SzoB4^nII^gK1#AfR==q-=v)vxbhm*Gpt_xQ*K8{G^| z?^l-YcJ~8aK1^HWFSHG=2kKJVkz#mGqzIsqbh#(pi@vDxo_a$!-f>(;#GRfiQgkOd zl$%?Cv|l3nGUyC=+2-SOrTBX=b 
zFIZ?Q%t~l2YfN0F+L!u%z0@cqQ*J%eY+}wARFC2JO`!N@unpw}qr%b;1_ngn%T2fW z%>CBeUc@Y8ZUZgaPX@f~xC>w(& z`VP6Cj>5~SL~>4xzOh=QdK-I`?rhDCTF=?ifR4&~sJjesh?GIE-*ZW+vm+2h(xDvou6-WP}~>f_2_Ff1y~iCc0jGJuV~KJbzosX_0>O=>r}JMJuw z&J}vA5KPUWm&(?34@lyz?p6e)`-%kfm8TnY+7Uw4V@LMBg{5CsHT-c!U~PLo|C)wy z9mHDp!D|fEQ&JS>Fq5)`2USrzg^_;wT(;VZ%%(hvx_WhB=kf-q6uiViVZT?!=htQ7ixT%f-kG5BT`^$z#2ne7c0A)8J< zHP8ZQPA+s;yGYk~>h&qM=FE;q`(>MnKAF#|J_6dD+2agm_Er}-Oukw2 zlM|)pBXakSKp*i>QLoSOA&sNLKtVC)VQD#p? zKX(~ffi`@&`9z;}a0FHaOku=KVb`-2b9&0lauN+ZA6fc0sWmN!;A6TvLRRZ-R^uj+ zuq)mTMZrr=8{Sldos8uw`pgfukle--XLdJ(+mG;5c~T7N|}15>1~J-CM>dAH>C4Uw?yd@+n`CQwW$=O!8V+ zYwQRG9p>y4Q`oUt(z4x9WVQe~!Qkmmd3~dWGG15!z=52uu0fx+wE7S~U2i7R*(na@ zf;bgkDPv*d&3!?*1x9NjWFs{uvsC3xzulz2nl|qu;1CNhjs^=xWON%jUF|P^ephkO z)gTwBdK>RByYNwnjg+c6&RM zyxwGVGnbf?W+X%#BQTXZ+6l@PJNLV%ByM(KgQzI=C-1D!w_VJNvEJ5!}%!6nNu^km+*4iOxI3}CP zI#x=0Yd9MV*NH`1tYw8iWcq0Va(Z^MWILg4)y8J)8WHVo1V4tT%!DlWdK4&T8K_P_Gh!1uDtQ&wOZ;MHNroRg z3S}O+SQgT!FJq4KKm#%5*dWibiLY(GZ`=nE&`$YdNlSZ(vA zSt62aC}w)XScJnG7LpF)Bp%mT&Z+W~b7@~KcBD6S9>%`ao9IfRtDV4q;k+mI9+*eX zW!|J{Kv57XkboVk9m=3>qV)<{se>ZRJ?^22j7xPnJF`|Dvc@;jL-VKH<8%=x>b z;U;(T4(K+#8p4U8WG;-NscP&lx2QUfg+g$IXu@f798`EC`6h_dn&wO|9z%Z2bf zJ>?O6VX%1p(N>|51f4F_<8z8{PVvclTSdfED0Ct>RLj;UK3D6*Ixvkl&%M;Z-! zr^lj&_D@I~*VY6%ay}zN@v)lE24 zbgnx{PwHaWd3f$lGz+OorL=XqSSvE9zNe^7jZ&*LoCI>^k|?{izL}(~V6uUXF%y`t z!cCkkb4XBD@pRoKsP`XiXuw9knD zqxbATeYPQx%SPDH^WJjK8#!l2KNKiB1bIw5;!fB|Q4aKe>=DNGOF!W$G=N_OVR}S! 
z4Yc5y4<62JA$h|zSX}zszfhHb`O9`(D%vobagOZik$!MjZ7yeh9gJD#AAyhG4wwp|;dG`EesSRuCrsTl zoQ`<^MV8uu0a{##@j;!-4^?Wz_CBg%1tEK~ zpKsp{OaDnoy9vK6wdt=qNrH@u$FS4wjLP(&SfVEUaw2*9vYPNtkeXnkep7ZX)7ihY z1T-Bm!pvGLtqC9&E&InuG06JNuym?Q>D;LI46tQ$D0Qe4ZgbA`e~l3gt^`N+s^yzq zBx-=^g&oCSsw`2E*51v6_tW&ZRuh!xD0Lfzw{{rEt}<1Y6Ls% zA_`x_PPJ_)omKYQHXGZf7=?JYEbr8wbIfN!vMmq=Tnq0JAh5fZ8zgHK`<9>4YPp$% zCE~kFgU!>556@2qEiU>Jh+|Y+5BFb-Fep72WY|;%_jS)`Rb&1LMx0G!Xw!RwRByd2 zEQ*ki?fqhAZ9lTJYgC*&u6_NPYW94=1F+anC|&fa-x&5FUSrs|!%Co#kGFhtVtZ3{ zZ!o>Qg5^C=>#evuxw&o7J8$Areo`%5j#0>cWu7^(t}5qerLQALJ-=Jhhay$gdY;r5 z3Gc1=PAI19ilMaT6U-2%%%-m^?vBhP5_N5pBkqd-6fEAx&)?+|bE12cLpw+<>4CLHL z>6e+RN~0C8mKe8t+uP$`H@p%%7VO^WF|ljL=u5K)wu)4!NYTW`+O1B$`}lDa_YB*= zdT+xgdYnlFywENuL_DEe`N5NpYd0r8hGrNYoZrP8bZ_4TMafc<8lhqsAD_6oD$Mw7 z(xv==fz_K&+27K_BymDR0(4xjK=9H<${W5@kip)~JL63#$rXc6dt`#7zGjgTw87Yy za`rZQMc;RL(K95U9h%LrLX#%X#6fr(b#bqmIK3U59K^j%vMQW^LhagyfpnxbZEuYs z9PI3#v3yBMxZjn)sFTKivjJb`Y;?fHeO2W%9YCX{c`oVk_krYkjEm}P&D>>j_EJZW zBeiEgo4IGScgtF3y6>$CFJc8Q9b3cobHW@>%fZWjSeFjC*3d(weRTxE<(C;$3UsC( z62Nu2b}9tlFGw7Ej=e*_s(5SWB%)x#6&dk%i&_Og&X;ogp2x2^UZ9-XUcNNypD*^R zi%W~=YF;LqNq*GhZQ!_&M!y07jsh4=1%E*ocPX3D4GJ>$7mVITo+s#)}fE-3aVz_hB z=KDT|J(q;YMkk*sm75EFM9@FS{iB(FPAjx&R&-nBZowN0v{4oVe9=0MB$YPHROG1JlnITK|=HK<|2_Mh^fi8!+(ZA^wrE>@!+sYaA7spjzhb zJuLo8N{?Aohs{t2!qP-{la6b}KJDFe=(C?n&jmjlXXKm6@J$VFOYG?xWeP)HXWwzlCL~M=x)6>j*zGD=9V5b_{ z@Jfv&dm}TIT-_YJ6MId=eWBJW#7|#(sXzN|%EV%DxEa}TxAHapfx^IF+j_*vwNU}r za%by@i1Ytb$9Ip_e&X73Km_isytdKpv*_Zx_4xcW|7%1z`+fMD#JcUbM}eiEdW`Wp zg8&x7N7~mY9+LN7-T5DPKXxjiqG@Nf`Qzt}`Z!Dzi4IjPO7^Kw?w`J`vwx}dCk6x*p@&3Mlcs!} zCxxPz&bA%N#4uz@)1xM3uORI}i6x`sF5%N5RW>f5&?D%_vJ)e{og9I7CiilIBrBvo zaJ(FufcDPqfP4@8%_?|xjM^&AYxhe(u-^19rOWYxF_4T*f4|j$oLh)7(w)6gl}7t5 zgkOfx;vZAiKl#C0 z1Q?suai5r~(pVe-dRVr*N_H)mQg(14(?3jy0?#p0hhK}_-Zjq2!u^6-W=@3Rj4~Hqh>CUa48iCK zZYSHXSLY+kU-S7<{+S&{!D9|Z0i7J?^U^Tpj`q;=ZI0<;Z@8@juX$+}S^b=uZN?iI zp2?w_Lmx`gi(YeSCB|y=FV3Ism-bUpR#x)c+tGN4#>Q%*C10cq8t%wS=WWwl(9EdRv7Be 
zyW+)H6u_E3FU!_D&_haUhtg5ppTo=5c8j6NXI~p|0Z^&{T;Cn0oS%SaxK{7f3-lnl zrokpQUaMEMB5<8P%1TGCXE$bPrTTqMU75IGhVa%&1XFW9T^^(~KNOg3qYBMd`@hFk zl>%n)!U(QKtlfCQB3d=E4QtCJ<0tsMqaFQZ;8JA+K5Y+kUMRp@MPC|e=c#L6;tzDi z+lC#E;4ln{6DyOr(Lsv7ef_&;y}do+S?2sTR2ffvVTqZE=!VhLf|;0z`0ixY#Pd!K z!q|9mg|1;Wo<0Mv!03%(gArt1;aX|=#5O0A{O&^wiFAC_3V%*v3&?`-;6FSk?qtkE zRnBudh|6?(8Xim-3i^gmdfm`@Qebp_<%M{enhN&U*}bCL7oHm(feSH*XcbL_6NZbF zl8buvvI-_}l2g6ZBSsy(fVPVk+*IY|R1@E+Drueh&<#}8^XyL-8c`eO?iR%0l^cu` z#||ZW8c;iPTXlu%hI;&pXkFV6c~-q}EKgasO70xSk9Gw^PY{69UFOL<|8gp4{l@q{ zYN1Z-rJMGozNV+&FW3A4*bbaWD%ilBJ0DL-y+97&{$J3N=lbxC^c5gWt1r2|;3=58 zkZxofGn3LX=eVemoi!t7D{_I&(sx&iSbiPh|Ij!|&UQ(NjUVU(r1;r0KwvI8l>3D; zf6t=Qxf75O`r3=Ac790sPt%yI>vibHVv8oEh;o}}#*yJ(T;8}Y_4BM0*-+u?HP~8) zd9vB^4$MI?c=ax;39zXkQ{~qw-QqtO zCHL3oJ3_YVl5+75`4m0Rwa1dnwkx$zOE|kHx5TlKHyHgHygRh%06TRvij#F#iVkP? zpq!5;0TZ>`d`d^QfN!bU4KAYS+SKaQ#2q+ucN1FB)V_3DWbiIJA3}SWuPcBO(GsOoYM4<|>9#`XJ* zH6K-pK@RX?Mg(r^L7sD3yFWbNp*`hh9f1!#!l~}Gp$OM?(3bR06C95d%H~641mmwe zKtUJB)?Y}$FXleEN$LWciR(Zd-cej@4zrh;kMT?z@c*g~0tw4X+*YG-noZrFyUAsj z7^ayrgbl0$Iwc^7%ZQ@_Cq!90Yc=O@sSQ9QN6(CGE8AtYUK%-ymo_<_ZUns^JM?b) zs|o7abV8>`t7AM13}h_(W^&*a3u-=pI>@CrH+beybU3ZM&a%cPAD?Y03CiRAI-i)M z;#7B*4LHELks@hj!4=VgAcfWW2Mq%>47fHrePa3vp-YPkif=KgQF6VPWgXcl-TaD4 z6)r9+Li%`-)T9q?*$xcc-tO653-WHvGj|x-iV}L@*H4iuLolPZES>A!4*N4tY746B z_sW4pe99cy1=)_~B8=k9?*`?At_%UtPyHFrr_&4=w@;^+=So|6Iyug&z$--_7}vn# z(wRh<<#@78dNuK_CfaEntIzGNk3qTclDUfx19E|84`WA*gpRJ4OdD@epxqh|yNv3E zdkeXGM7H~4e0j`Gh{A3BnNH$mLd-9&fCFT_vLc2|Yr11XcnX)e_AK?}6x&b5M-xiR(ZQPHg*iTpq@h_==!X3b!Fu*)=~i zr=udk@Ps1!_e*z~OF$W3sR57J=pYqTAE=^i);z~#P6zyzYkZClG?lK+dy|o)`Sy(n znZ4J*?XN*PT(@$!3>F`j3f4ECAE^(ha~LpfGt8c8F}8wNm(0MtI(pXtoR~zLL5V+_ z!p<6vF&9=%^O*W^S_9BJ3jnrCDFDzjD4?Z?V3XKyz7Om?LAOW&ngYk@40 z**o)GNAZ0|A~>v+DrLCiNba~Me%blFoy!ssR%VhRuJ`K)ByyHap7 zWrHX57XR`*AN(24K-D|KN&BPZ=<`}{=Mc1%B-H*REn62=U=b*&NTN+py4{~hBKLS& z^?HxvABim`N1@tH;1wcIjYZPcMRKp3czxqMkkfVDQ)$YdHDhPaulFu~99nONio>Ur z8btQZ)joOhq-oO$lR5b`=WO_S#lfoh#KaTrRU7y0f(&ljs?Gm^-+Y$r{jy*?nM=e$ 
zKu~dZd#bW;#`<--7_&@%3PxptM1;eI@HdRm{R^B z?LF(P6i|ebC+0jBlk|X*B7uvn|m#RhDu%Tx-7ppB6XH~vx7jK5-m5c z#|8;UPnO5?V&KggKSc+x3(D2GFy>+X_M8WR_L+n6lUTvCSKV^%f+1jq7iHvsR6#lwUGN!4?9xn=vis+g><4j z9r8H3ISWArZusY<%$bX6_Ym+1W0LDH>fC4ICJAFo52#zR->XeMQ$1S!HCOb@qB*0d z3|tsATsEEDlOS*ulIjz8BK0D~prC&}KC@9`q+}?ImPgNXakTh& zmyC-VBD7zhNu!I-zR4VF+bi)Ebbs<@U9cIu7QUtd(mA_^QJ>XtIPzD_Io*>ST)AQi zIpcy+=Pa4J-Wa^Oe3PjQb=ETA?KxR>R-LmKGWvJau);qcxq0doUPkz0SG<=-hZn99 zYwca5fA|Iy!_*klzDd-Eb@j6eVnub;OA43oDO?IFpCx3QBK4^Qqmy)O1h@ky?Wrs6~>3>AQ4sOnvPxBH7C-xGd1@%8GWhn;fGre*QndBY*!dxPh0t3I?ft9B$tA*=+OPFh7;Rx4Qy zs3Q&3y0{}eYJSMqtPqEVYhSNNVixo4UI-mOcxcDIo-11EbDf9lOmPH*n9^OLWGX6ivQo2lid!9Quagf#|i z#kWjaAuVyfV~57}L#M1^$T^d4v87`JL#mXFEzm{D&MRfQf_FZ;RP`yA-<)wUb}x5` zytzm8ne&^JD=nKHy4N%ISS5N;Hfd1`Vfpc8(f2~pKF_wVXpt|BV)+=l zey;rN9@gDqtQ}E7ctyffQ+R)^ys^t>!?dpC4+#G$$M?rsInt!F^(J2Hi?t)c-eqr0 z-C<_U50rjO0cpv!&c&zy5?%$MG~GX~65zGl0nicg0O&ql9W}m`?P?Qc$5X;@UYP1s z{UJ8s=e#i)2Rc2#`}~qYZ(SxIOeH=3s51+&Va);%)IotR*wcKyw>$d9|CV_1o%gDt z>HeNGU&Mw*Cjj=kqx<9xLG}RtY7jtLa$cjq(2`2lUHLqo!`Zq7{b#J&w+fLH0G2s# zBS31KU`+zR*W6o#(BUCTz)6eV3E@aK%<5-VR1 zgX5+uZMO6LF+K(9Qs@55^@W^<$1u9M1=T;|qI$@=kZuDd$?YZI`tL=H0O70xz-H7G zBLr;(NEKJf&J5rc%R}p|{(i-fFZ&OBSYRfn&c_g6zwZun@}Ot`srnlL1_HpT_)C%% zvyTem{)T(y`ENU%5;np%7+3KX0GH$=4BQ{z)z|$ro}-iE3f?ycqxY{^PxHyXJ~?Om zFDjY31lXRd{pa1^pA{nyhNVyOR-yiB@qa%-o$aq<7^kPxdcOs*k;puMULjE}IB1{N zv2t&h(WFGFjaAc_Rvrd`6*ZbM{C{f)H~&8g4S3W!j(YfS6&z&k(Q=modO)QC#4LcR zL^vEcLa$O(OfIa60JfSEYfB?a|GN(U`*1)N15l$ha$_05oSLm|?7%^r;AWOw0L5vn zwb={d!&;d}K7xO`xB4r<^^ZRr6Z_i=I}ZFce@y{kgdRB2&ef_^0wvX`tOsY zdWiW^z#8 zh3~)wJVbx|0^-@9w|?pua__rqj{-PD73;|Wau3=RAbVb@rMBC~!Eb*nDq4(19i2wM zvqlO$x|RB-&WV&-WcFL5qJTVPr%EUve!KHOUUpr?9iVuQ0LZ%J>sdn6_X)>rJKkT% zS=R?`u9pP@PGND~y6Ep8^65UH*e|D4P5&PCpO+kHh?$lD#oFC1^5!W3yp3OO*rut& zK2sIodSn0pGmyhmKm=^d8G{e^V`9ihE@=fZlBr$upT2&O8GjCN8Lo>J|A`$=z&GKZ zPsINWrs^SA_s8izX;<<0I6N7EqQjJ3e9Zq21pl~*1c%%$d4w7)v7b?DjmiN41yS(@ z=BQI`Ni+TZwC4@RF3f?1-!>4*JTI=02#{JS%w@zjE>?`Cjt%bH_Dya}&H(yS%D!#4 
zJ@F9zf*D5tX;JS#-46k1u+%01BGa}a2U5AqfKy&UqgY=42>>D_+1i=}I7S)Q+WK!> zkT{hSJ?=1C>p$^bs{RW~}>hC29cqpIfc-o;Gy$ zw^G{wC6s&>w{h?BBNk|7?YlsS-`|l|-py*C+!tUJQ)&d5I^H!~0PO0G0GXR$0?UUt zk)C{UCZs^of4JG-IicBQ?L}DtjD=d(RV#e*4@%<$9+O5ag5R!@vI; z@)1yhSdHm@^q;;P@>K|st3rFa*57kxN9n$v`!;C~|DuA|fYjn%4)OmMsF6LOXKzhM z=${etDH96#48^1RpRi}UKij^fK9KsCK@SVN0~oactkwj0GiV>&cj8y9A-mFIsp%d6 zsW>XIQg|>Cx@kMjjYXe{70xUgDxV?$9>$>+c!>Vy0%GCMH&9mr-2*VDCIHCsRtgRX zgA;&_90mB-TxLlXZhtPTK8-tkKoS2G&q3ySqwzNRv1624szKcsph}G@>oXT$TNu2y z4}@D^R}QRzoWy7U^GbIlY}`u+8twmlp{Go2M!;NjFjb}|f8T(O%7K@Eh;Uj+_^r)< zd_x@$XaS%03&^_d+b&EW_@oAV^8ffFPj=B8^B&H-f|rB^@Fy3@w7Bblm--=l6NOpgkulWD%hy3YMpTd+IlVDynA!rWPS}@z+$gUJl z1csC2pFTs>3LHOp+X%tlF4iw)l*~w(hT`48ov*lWYhA5EAxFn`uG0z<1%%e zvHWQOVh1=^<4;S>}IOZ8}D3zn#3FXNOo0I#~%9s{3ljRp#bkx)-e3zQT7DC8(|pn z|M~aR0j`CYRqp=*pGyVu>E~pq1tWv z4>95yrBFVv6Zik$JbKSxcpwIuyjjwIayTpjR6vwxHi!Sg^AjMhfdzpp+ozq}nA`wl zY3F9{{QJZ`!LbDEipbz>w*TpyZ-C`F<1w;S`g47Le}~2q&h3m__@E1D=c7`OjXB6% znIWm~z&aiwbV3_6jc$&F|9lr46w=-6N)QB1O&u4rGf`k1xiRa{{{E9bB zi{!xA#EtoGQqXad5b%`hJHVK~nivM~mZtV_*&zs^&)LD~DFCLqUylC)8f#pDT6+bl z@K;@^D+ludu%OvKv4I09B(3gK{<9UQ;Wwm%EV&Grx-56ixtxqHuXb-S6>5K-^FoTA zF*#6if+^1pc0ns_L&Q$Ek6{9keQsQ`8Xyq`9oJ_&l_de&enPp-(IW=3DJWqVJp_dYbp+gAG~ zGjX^wdP(gMb~*%@ero9hwUb-%L$PYGH1vWwbwD(I8v~pmrLj8}DTO)1U`qy_ww|&F zgAlPKBo0Rw@BiVhLU5Jo8C731TB#*bW%$YzWzqWljDeqkHmgwhvb8h}5jti=g=S`( z3%$GFe*slEm|V^I|0Wgu`tXzp(Ye`C7$X0)ZVsv+Xop|8F69cii-BI}Ur_iTx9JBb ze0d4LY&&Iw@C;Q(li8!Hzj**&Zh)HJK7H_y*Z*46MY$A2gQh*!U+~^usmQJISCq{I zSG(pnG`+K-xHg6`Q1Y~>!nCuK^|K=hGs8~g8KELtMbt-dJ*th4ovoWAD;q~7i zC^6_VOAlnq|KV2t-|?pwo)X)PyuBI!t^K8bL60TLwO#b&a!S*D!!Iz2&`$9PrAfW^ z?oR4H`~iyk{Z`gudrWoWZ#e%S$H4jkC!H|7umgE6N|x}p%sGa+&LnD@uH=awKWX`& zPhn9v#WKD;RW&F))n(-<)vl$>_49ksXI670^52i8#}Tg!kXeqP+5a4Aic$nK_HC3J z0U)Hj7bfa_eY}mNUke|$3m$GfHJf|cjj7QS{+6RR`il^_32W3`G`If5{cxdIvh>>+ zU#ZP#epmSDWL6iK-Hm@-_sM3quEZHMdSo=oV*rg`A(31v6m*=dhv-yW*Fz7kE?#l>5nQ75~HzWT3J@(qSOviZx4BMt-NhDP!Ag|4n;fQE{GyZi^U 
zL##WQ79m5B?6rRL2&P|ddv0%QnE`TYBjtASCMG)Sv7^=v2OegNJEWKn0U5%L`Hpa` zrcs&wFZx%+*w3mnKIk(~Dd}?c52D1!KZw25BmbMx|J_xj$d%zb3vtH@Ns#D(*d1bt zU{SjBe{Zg6LXihwcuTE^uavHQ5zg&N_hih-e=y7N*&I-te5+pB|6Z9CIXhfuvxXXv z`$=g6&E8}*U5N}d&8w}~emz$>tu-@`zd?lo8?okKhQj<6rZ@bfX$_hj*t9==*hjpq zOL1bJRs<1QQgL8OBW=QYrQo|&jzK54G0PKq{mp~VQrdK75iSg6-#%R~wI0t<3(9H_ z|1!xHwW|4t)kNbYdvEFO9Bg!N;P6dd7#**EM9MVs2$rW`(N<6p!AVHlLAo*Dh2PKV zO@I1r!JMQr=JPpXG)1Vy2H(E-_G)3mRT)iEF87NbBVHjw64(3a7 zu1?_HOY*RCUu`(MQ4Ivx%V;oRo%!+B((4oEXG*->C+RI!-=6{%F0xACf-zu*eY{CO zu+)Vu%$|WiJ{t%CtH@qg>ytFoH+J3So*HRMh7Wm$Dm@ELWBeTw10C6s@*D_4^c3RK z5UUT$JY1^MvK}qEO%Mr-T72;O7Y9ryY<0^W319TP?baN8UcN1>Rbr6))B~XAuL;5+jWEz@|WCmx# z!vrVq_oL;|vdVc7h(rw3@#$BP*i9Q=x07^znyNY9^F}-V@$11XjVGk`9ic8{;S6H@ zx4=Y}<1{ch6BnI!CvUNWLZS{+kOTmB=*};C?f3nwf%6|)sV=iCQ8EZjYej)p>w%|E zpP3mP+>SD|H`Aq){VL_sHX~v?4(tF1GNfocLU~>l<>;F~#?Oxaqy;7)kOyB=?A$Tg z*co_ZX?<>9(1tSTnNjMu5N=>!Yp#9hxW8G(_~5dejU$4XSL`P#{u7`6+GC5#DTe-O zu_Wn^!K>CsV^wC~JMNopl?qG`zAR+z3V4b0_Wc#>zim(guDN65*fp?+h?lRp65CyQ zNRT&L7NLFJNhXc%!od9M!`ViSF>oZ!$?N{WRVlM-$Taj$1*Gp;`y_m`{<8GiuPtpC zSIJ4d8gGqs*1nz_YQq0>Z-`ttHFqf_UGkO<6$>mft|f;(Co)bz|M)U<$Jh>Am_Y5$c|$Ns z2>be8(#P$)pwBjm?D1~50w6N~2Wrh4)}X2R>hL6M1J-`=s_B1sw9@~38r2ME5}_{p8w&M9^f;jkm^67i*+BsKdO71o~yr~n4iH){doWr z3vhpWUMdi=wtoA^Uj7AsSOGCf{_ENQ1wPIA?7u$>BIb=x|N40(2w-Z+{sf)C&wnLe zG*}_NlkE>nN)SlMU&#t0L4TgH$YgwbF~7lwc*mu#fM zAsPJ&3Y;G<^ss5Kyet{p=$2(k@>>763Z!9yAUk*2_QQ|QX8o#S&>0obtKbB)M0WOo zGrj%f8Xi>)5!(=m2A*8ZC%*WM=Ujs1NgR_^1ORT>SFgVz_HRqyINFng!BMG88~nz+3=y+^Y8Z?R`glA?(W zImxj3!lk}+m34y$d1X*4SSvXz#GFf-_UE~ISwIO8vZIP1xyrR7)v3Vx4CBg%!*ezjMOu1BU6u z9UrW^x>DKKan-p+{n80(XNQw1;tEGPW8!X(BUF0zUyl5z0 zV+oXkJ%uf@A$=Ies)mn+-9iB!Fk(?akrldz+p<6Fe#7*7J0@X z7L(-py$V1bmbX0?U~3Q_?r*CrWP?q>kGx^4a+yL~1Xq=)ta;TTP-!VZ32vodYW;k9 z>VYq>tE1v%bC1vV7f%NWDq&8ON=5pMq296^wxiP1-hTrih!7&WbL7rTjri;@W59fF z2GKD`OwSw3Q1MIzCRSj(L$#`Bq9={F0AHXyp!Yw@qm=68luBXKg=!wt;5Jx0nNtws1vn)kiE$<1rtS_%eS zjAr)*t6$uBI%E6&{M<6uHIUt*X6^b3&;*0+V^CT9&0@9BQB9-X+vB&JZdFZ2UlD(N 
ze7KDPnPh56IkLLhub#O&-yz&h1lIaqTz?j5D3N`hPzhC%wz%-}wih4LaPOkhQJdo!c$s zwy=PSp`ZU!=w_8dUaal84QI_s=*vhN2p3hLi5;bB0lOAE8Y#aFslDR35 zL$Ff-qL*)$&Bhp@K(0+-2`E{<-fLQRF)USJ$!?yK*ig{e#W;F*oGSx1CnE=rnt5ld zmRa%TF%O;-fxoPhRn^PC0sRHt>j*}8nvqQ<4qVyXeb}@%czCl2LsCP z>&l1cfhx}9V?z2bZ9SvjBNsWr6T>EI)r@UZVDw}}4d&O`LL7T*TQlw~+t z0XLa#{*>)omQ0)djUoY@vp#C|8_CB9oPr(WnbWew$?8W^g54CDV%L=~(s)Gi_V4e< z5n9gz4tLS(W(`Vk8jMaMD5zOire6;$$EStz<#Sf%;T)(SPq4ZAy%YkKR0zXik_+r+ zy=R@mGdYtF)V;i9@g$Ock34K%35K#-L75W+hYzj(qP_S#T;o{-L;x{UwT6Z$BXgmi5vAfO+g1MVn^Hj9dzv!oUzqJTE098fqm3V~b*=RP59?1T z00S#zp94?DnXf-hddm!tT2^nNq{c`KDW4)YiOzgo@>g(+EgKPQ7OUyryzKGoIi)>N zH*%Xxv5DX+j9L)T#>9yB!f-+J`NT&g@pk2UIafZ8v@ySC$SRaJR$o=DRJcgw{f58_ z@==T{EiAqX@W(MUjN~tSlu%{tmG0Z)kD^XdUoUMbDkAoN+ZWsuh@HtgKQ+hD63MrT zoNnhh-wjWJS;Gw~Un>pJHuQoPV`M&CCCTX_?C5OZyDu7gE|+DWhep2MNTq0?Q-E2i zxLK_F6Y|4!44tkO-?fL!m#`e~wD`?85uT!M@xTy^y)_NhumSE#CDnjt?*I&CQDb8{ z#WYXW&0CW?!b=zlYp(RE-TttWo=j=en+eP~|G~0@uL`wdsdvxSvv_E!E=^0qK??2L zKtSO_cX=^Ku8C^!0<|u#f0LP>rCsF-6KY!Y*UETJ5C|1~NubHVM`sW0q}LiL`}b}C zhe!o1Pvvt(7`jFhSBV#d^tL-+)qUyrw!g8AJks4Fb86b3@q`=ehMea7OgVM%&E!~_ znyce!W1(f>IY>-L@U(c+daTkZyBJBph9-xn)E*zM1QCl4vC7&qogz8Brz@6|&K#;R zt9el&n!*Lg@+wus8BDGd=9yr~e1}*mWYvTaOl)T{Ipp-twdzhO6KQI{$9I$ zvVOPbYeu|Q9julF@c5361It7;*=6?Cu)b`wvNYc5;{35MFjv>+gz9WKd^3$P)L=4WO zb`?haD3&T+?2BrkQgkT-^*RM~FJ@$z#rVa5(Cw4Mh~#=JyNeK2TT@P^my64MM`zPw z;+VS6dej(4yzPsv_M+x^%c^(z@8K=A#TUQmwl|sbQ~;IaoVlw8{~6IL*~72uRj*8Z zBt|btPbtLfHiaaY+7Y?Gu*~htF0|9G!@94gCen?}Och?O0$K#E~aYHTRx^K2#m zR8K$^*n~2NLU=ayUzJXezj>>tAb{9pT&H*wy+E1IgGc$-~Wl zm!k84h$vN==RRXky=S~b+~V5CojGH!t;-LZg8*1nX-L#TAhuR@!Vg`>-C>{%GEw!l zAUGa})3?eDIi5o?>T1GDeIUIN z1rG#8yzr-Nombp~H|9(2qZPAeQVHfE<%Q6Nn|4~5amy(--?>V&^^|zv zrVyEO3!vZ*I2b`^#_XKX9D%(?nYFaIP`2Hy1@qV|U8f2%7eggp&~<$WM_f(Y5)rIf z*390Xnd}qSbqFCsMDMi$ibJzXx$rzk?3hLa6()1Fl|AAuOH&Uwabo3&%{yxQMVAx2 z04I29&;*9nQ>!P${~?9x%B>MS8N7-V4BEX1w^lUO93;ea6_KKmy2DiF(JBP` zv7TAQNV8>TpB*gFADOk)`vz+# zfZI^XUD$IgLC}7zg`3qvTnr~(m;2=$zIkxDuFy+k^r~Hl=jJ<;LpbsMmPiMnoVbDM 
zwoGMUBcFd^_C?n`TKjRA;6u&?M|pyeup!-IGXjeNL~yhYx0FRZ%faMXZYuK|vS=C9 zJIt8?-Kv);l3T=72P~L@4^)^BvPrIY2zKf8Biw|n%lGJfgK1vKm%$dip7|Bx`J;wt(JlhtIPvRlxG%CWdn2o0*L z(shkO8IHP*+PMrGd)j^2e989J{F(-TS4323v{V%slg>pPR9>3cnIOT_;%HR_2eQ03 zhPf|4Hig6*5()LGm{Y5M2XlLs6_SN;^%F9=Ap|6lS0&QOgekTwYNENXTRxS#NT)2I zMxV6z86*uw66i5$O_BGr?Jm`H8$BA29tEe|#fd+44CH#FL$Uo@6#_@6S||eEh%kNu z@l66M%OX|^INFC@~NJu~F< zH``p7hJ=U*S_Hj@(T+YvZ@60AfRXgxuAWeBd2yIbZQz4xoGF_wKY|x_vz${v^Oai9*`lM zM)yxx{;v-pDFaMa&%34NlRHatAb-b}nu{FN${+Qx}KPbB2FgAE$2I;$-~ zM7`Zess7e%i0)2!PMXi*p7whIxOrCa8~Lw~>h_I-T~@&`A-cuzkc+5j3s?zVQlL1C zMsTWPj;Y?GaaSI@h-jh~B~QeRAd!(-lP1y4J~CIL2qjk{g`wU$I@7nQ!|(^2K+JiI zS^@zVVq_eUBGP$Of9fov+{4F6w9!D6Y77dMQ7&n3P&pU(t`Cq$YwKOsIUM3H&=r=?PLpRVl$9>y$<#oja} z`Xl}`HoN6nrXg6fl)a}dAcPe2_MVrV_~?L%60&Fx6tC01qO&Q{wfhpqb=O+qgj(^7 z!-s5&!oa|i)E@i~fc6dqXk$G&;yHuOW}pxNCBiP1^9?D7N(rbBV(XwIiNMasN@f-# zv9mC4z?dY3ks&3#tpg<7j30kZ7uh_gFiD;uC`)>^yLgWFJrH~KF)9_Ua}aatFfs}i zkTg*6cRWxuDAMP|(~kb}yzuE?^B@4h$7LttX{k}g6;&~CC-$Sn)RQl+URyv%+7?am zASIWo-M_Ppq?r(PE6HJM)H9QuZRn)#I_kRQUB=9GbDj+kl?XAPRVOfXoABN)<-ena zx7EHz!XhJu)Fdi-1V#g9lvkz#0UBq9oWp>&j;c|_z%_10bGj=iycxP-*_N9cjjJxD zALXr^R()~p6U0zu7GH8fnc1M=CL`M{q`)rx^7tDPqImG^aQ%1*r+d5yn|V;>(QAfnhvDq?bO!$;3X^Ojh35$J@yha(AX6{7o+ zk2W)@=ExE>#xd;LDx2|D$QbR{@;5LC*E&K=LGnr`OZ@x+Yl?X-3!2rRAtxcrsx8MS zWdA11?i7kqg;Aa#s4abOL~AyUT1~XDO=%?$t25^CX|MIr-U2qR;?tS^Jlo`&7zuxI z6hIB^oo0&BWPX&?>JW3}Y?^dFCrNbtVL#$|)7VXfmF5S`m;xiEeTQ)49z&?za+u>p@ZkSD+L{CLN4lYkF77y}lQw5J5-|vt*;xR*J?`ayk=ERV+lrgk`x(8+{L9 ztvTE1GfYZtiMU8^?i5Bw;|xYtza1K0IUy-xfVfn@ukrG~`FFV}N>)HE`j$hYSnV>w zC0a8Wp+XqB6_i=Xo(gmfz+z&jgLf@qqB-x$h{(2X#fl4cgkfUIc{RA@E9KJL0MbjC z`;gj((m;9JR?Wt6M`qcwY=&tBum-UPu1*HChl8l78$aA;tGwb4QO*c|-bZ*|3sP_q z5XB?_(|gb-g`&uqr;E6~G*3xm^*@EX7zDpS>;ywr{s%F375kc=J$B$-0thqY#g?8lZ1`3jikcw0vOmbP?H^WbagL=9&Y zvdA@-fx=aVNk3fTVHWB*I%GnSeLaKBWDA2joADs264j7ER8qo&oz z7koHkS1}#~zY4RB^;|fSmD>V6T%Gf=WtZzq?JSMg@{{P!V@O#{+~D>}QwhmrE~mhS z9+N^eYIrC%QDO!(sWFjvqxk|g5S&8Eps@!+naR18ja!4Y()30t81pVdyno%2LN*PK 
zQhqaq6by{FB3fx41hE7?Q2W&=Ix4bmMED-Cj{!)SO%cpCnFTZd{-!Mk)CD5X3Ez90 z5mfv8bVxca&$JZExM-b`{4&fwyP;LYpy9gx&DI|OMHr+T9JshB|#l=-Atv4 z!i9d#8!In(UDh5XA`sUUhm8=94T-T0xEGO>gq8}W0QA>EQqiz4tC@d8{# zN0Z_;YP+Aske}2*-ptLX_7CMh*3LMAGKj_mpOLTFA7aL&9d?a^b`Tj(59cLRfGH6K zaj}cATBpbZ0-yP&*(ogfsBkig%O^6%BEihgI>2!tqAz)@&lGAVOf1aka${;FkXH#u z`}nbVY$OWb*RJ4D=RB-n7IZzK?c_a@sK=&oZb%n=6*g$_#-_PH%8klL4v43X+T0`k02#%bk(gJS1me%6eM|ZsS!~cH`hPiR!^6(qYHm# zwgxk_7BGH$d7;qxLeDFM*a}VO&GE;)sc6I4& z!csqd2bBj@j)HILzqyUO4iYqwU+vjho4$=lX-2?-%Zn?G_?r8EF-LD(YqfDq11Jv6 zFR{m_ItLTLQG4G2ZXSu*T9}3~9h6sntIUHY_a{fW@DQ1;htW!5#fMP^=)X|YYzx4s zZJ?nPZOYYvKC&mJ9!o_n9KLfxZ8TCvUXy-j`vm>nU22Hpm15n%cNr4d;%9NKiSk;? z>Q{Z2UZpO^-U|^OK1(<%#CSgS3()k&kDRZ!a914JHAskGl$_XABS<3H?|N`QxhliH z?r@9hGx?4XCrv>^99)3+wadp7)mF+D$aR;AmHtwd{?}T7($xk+4Sa>!8|8wHbYLI* zmf4Mp8YKGOg$cp6VgnXH#!h8G2oX<4BY2l-gNZ!YF(A1->y}w-g{=<*N`XBx^@Y_d zP=#AzF-xjYVL2eC*VD*&qf|Xo-db@Klu&u(Sg|>#7@;KD9lMKf6DuL%FACED6Q1=jQA{DnGCEMSy{L;CWn8 zP>yqxnzP`;mcG8yUl>}yQ_Ey$$c@QLSLthEB%;H2qY!KIPoCq64Ig=25Crn=FUqIR zxDX3PkYiz~pFkSy1z6630lkW;2BV|BTG>*-vo1pq$JOBL? 
zGnPbKHjrA%cRTSB<^XS_U*=3>l}pyk2BZxfRcn!-o(m6=p@5N_YLH`UqyrVDJ-2RA zE7`VWmgAQ%u%uEgGXXN`esB%&exgk3pIdyJ;hDlykq;2+iAj<3(I$q*`q4&|vEHgD7o zl`<3E^mnf2GJdNv?dAAyh#(a+t?=$(GNyWFjN9b*) zD~(RdjR?Gpd%Y&LDQ8MEwZO1s8AonC*Iq_ zGbJUj-kiD(dk$L^PqGM?;P?^d@{&)FdJsgjv3r<3AZd_ZA`4(gRZSmI3nVz;u)}-k zLcRlBtTvwk(xcqGZ)6!|=TY^MF30E%fxygUk1Ka5rJtkrrsuul5_9-ad>t{!qT>?t z*e1b1T0_UH^!5?SX3CLGg8qR%$dO2K?M|;_-NjjEnttlhHsbjDUfTp;on~Zk}QGVCu0(l7U1Gbl`S3=5R z>6Jp)Aq9PfE?bEfK?zJuw>#lh!G}Hlg^|1m`s&tNx}8zWLF!g69f1acnnW&2SA0#kx{x9rvZ{7!~d7#X%xqK(?)ri%8 zF4NxiRkRP(m!(^)&SJu#5#;-@!~X7LdY7pRX5Xz=el^(A)?*naJ*A#%u;vr3_(XBuJ ze6O|!HP35{@fpf@=ibELV>~^dExrTyNt`&bo+7pYfup>SmQJX{f0Q7JX!wnQy7bM$ zeat$i_LUOSSSxR{7(DtYw?3d2A|waeh-s=KCyZoi08`MG2JQ3&B=K0 zKIZu5MIc)<0gtkE1a{b3_MKF=+NfR`Ip(uw(F?w@)i@vkL*~3$x~J{&)(Ea|{c3(# zewJieOAkyBai!gB4HHAH@Z^3mV}6=zWIv?+s6pgG#cBq#LNzwjPIV;x4d;Vb%Bi+k z{gm;zJ2%YQ6SR;ULcLqBZr2g@XQ;pHEt5Mgb_<=aG*9B>wQ=q$UgX$o*TJGAJxdKJ z4Mv(`L_IoU+}lNO9eFwX$Yk1jYqONZSC@t=l{+pGHMCwfj6zNXKjSUS-;mTTSq#-Fs#lzdS=G$5s~##zS{8YkktxtX}c%!vX-AtRJ!Vc(F2zGWwfZLL%ryO zS3ZMx8(vPFWhNYLivE_%u@bQv;A6FHwIWg4O_!LkB##FA_>+Nhl<C6hx8uj+NnfgBVFBgyKuu-J-`PlStpKGDh*>_m1GR`+ z!7SriwV=(XtojvaRhqrl1d{`1;R-s!LHI@_s3r4NS&_JW|J4Nk+}UaG%zNEQ+4!~| z0@Ztq?=YOnURqIB{befErQL?6yfw6r2T*y(&QeLY%_siB27t8DP-Z+iEi7dQI3{9H z{OcWtA?`Z2yq?s&0vp6xDT4rgCEzW1a7hNfimzOTYPl!%Cm%gN<#$iYEg!I$gURfSPtkjV3On>dZ`!;5CW6uLRr5)H+YEF5(;vbfJga+N1BkVi57gy( zU1Ff%*kn(7j(++>?1N`jxGYdqa%7NLUTlYNwLRf`yeT;*U8_!(;uHqbe@eh@Gvi0d z9=VUa^d^67$wKqdxZCLnNWn+}#b{ohb0Ol4(nT}3ztyw9*=o6KU{yo_APkDPFSq>mZm+d-4&=Io4gg`ICe~3h=;X2t2{#~B-21Pwc?FB?Voqm4k)Y3N z0LzR6$%a0fV#FJ89=9(mYLB*;k6OjsoyayxBxWKBw57(#J}kTm#c42cl|Ag@9Itx* z0Vq9#PLqhIpffu_o7h)%2N%a#J7v<7epq@$j1kSvt%4wqymljT1wpqJY&!iyg&#jK z7@7Rg#5n#SVNuZb{wvlZM~UNujoa4fu;s-wT7gMJef-O`{D9DSOu%NN)d{ea;8TG? 
z$8xf`gE_Ifj}u-6_KuBVUiMb~_s~_gSgAzN{RRst8>KM&+6_;L5e)(*2X7;T++I(F z#_!eb4HVdTMR}!#i4uUTb=dQUdU0qyl29@%fx0regGDP*$okCQ($)FM7GG7jl1*&~ z9FXHHQsA-%Yy?JQC-8k(Dxaz?G@(1Uq5dh5(h3Xc{ch2`&+Pf7DjF)V26?V+Ajo_B z&1`9?2zrw{K<&!Q6uOk~Vhy(If31y99El$0TaMnDe_l%=ziNCF)>bCygbt+Yq2OPY z)Qm!IH?R0?S98a*M^1qe0pVmQw|5LZR^F2ixP#|>h22Tx#2y*%tfrCPF7yGaQ>%FG z{WaRU1Irw|q_aLhri(I|n4gdb2RMi=$2n>dNbje(ALNugqO>ojZqU#lSD zC9MPVd=B%FYV5dJ16?^Zp zcKv?XNSIiQ3#7c_|AL#_W1k!R(TNxQylgZq*}1!k;aL^Lq|rK@zhf=-rLv=mA-&2y zt#o90J#lEWvW~Dle^r_1sigpG`%(5VQNFUlmZYmC46@QDS-xK|d`Q z#{TvEfvMycIruh)db6^xI_Pu|__Osx;z@3To4ZWp111lDe zFQKwm{xbr4B~`j*&qGhC^P?&*FY@e%4`^0}3ec8#bH!kv}|lRJU`OdQY!`(Jx{3{=XR=d3cp}xCqoG)}_3@H`b?ZnO~RnbkQ zAH7^Z=DO$d?;Ux7iSgy#AT_X8bL!9BL7ASG7OBpYOQYqgb+U4#(i$>%yzAIVY!r}o zEw#9z_xxG#9}-YD&PWu#cr5!I{5vru{hKODgU4^m<^|lF<(&ortvsGgSS#oFl%NPw z)@HN&)$9NrNs>CN%w~_m*QhUEa7mKPqDD?Ig52pb5rN2czo|(s5=qcxCU<74=Ikz} zY|5G(Ne%v1uDJe;Y14CRB}4o>jI(_~$XxNZbW%-C>3dDRZ?;?C@Av{)p++h3Z|w4- z=95KoRA^S04uJ{*njMXcnl@WV0B*CG+0(#BRiLw=2r=8}A?xf^voDfBpupi2eons; zP(^JfDdOVhbfPFBbMZ7=A#wc+W{#z;4U^W-0~($;kn4caPYodJ%lBA4H{#ggdRoyz z|KDYSC!|u_E!Oa}EPz_4FsHs*#zA2<^+~+eOV{`CM?Y@vty21y`=o2Pm)J-wXxv8b zXOgw_lQ`XYD{|BBHIrMS8z)NPK3b1Me9i+)F>i1q>g%Obj-bU^RLjVN0;W4gt^+*< zCUpc+DB9Zmu_R%S%arLYPG9?Re?Wq&{>EO$pit2MuA&}vMvLKS>3Tezp&DNw_`}I3 zg={|O#=Bj~)vZy7AaODEFb?T^THonzAjQw#c%0n6)pTLmiR1;@ydKP{vfb7-ngz6U zVFA-nnhq2fhY2kfqQjp)t$RIF{a%GJiMLV08fT?tQ`!I&JtU7V%r-Rgs z2ecpG#o0Cf@HpLVT1uAM7<2gu6asRLE`IU9i}v)PT@au_wgY2DH5GKBZ(?h)ynTh{ zI8U*ZkZ)Y?1qFj-HimBWIXmUJ1gTk-%p>et)y`m zNHh9E>k$El@Noc~X6*fApx-xneTC!;Pw|6i6D49_+|zQ#4a_Bvt~(~KO{~^bIEAe} zx(9siSBE(YfvhQlqr*UQ5X?_GU%=<#-1NRMI9l5da}g4m+e~GlV%E_ zr^!PLEQEtO6CY~rwdf~YkcPTw9V)~y_|6uuM%+K@?e5gHk9CbO>716$wJC9=T!qqs z1f7*t^|FP#N<7D=)1mVR0a3meHc0pV;_YZekDA5$!VHdjhsq2F$trSHX%`R6W*Hfs z*aH{@4lt5xNl6JpkAAmze(O-9vyHdQqIXyv8N$Z9A7)8Y!$x-6l_Q~nzc`H zz8h3F(`4M^O7Pwq{z_Q~;n^K1p-%ONS0GUTv(Mt*yjutU3FWV zk#X&-PAzP5=Pk=s>SRQxj9KR%=;Og>OLO`y%h77kN3v~a<2DKLDa!2xb(#B;tUe*p 
z%9Ts~Ewi>Z6Tgwtf9k|!$)`cbcnIP$b+fYr@(2pP2iRsQv9IU)0#lx*UZe{T#){H0 z`(yb;C%iTgcv@DGc~S?gh7r2Kz=>Q`_rMNs@bkihm;euE$e)LheMELlXxgEuAn z_I%`2;g;QAvV?`{HEDCOv1oJZj$h< z_va(u*zkl+Fi#2b>C%BF4rx@mMnHFsO$}mJwg~qHx+`J`ybgI+j4Nbzg*Gpk=*zx% zaDYX-GvXS#Qnz9|i+KaIaFrNYOtd9lWn zyt>1r|Eq_L^%T@{47JEOmt~gEimEcumqUBW6CF2;+ScXT;VbQ20i`f#(Acr> zLd|@FiyGH8$H23GuxKUs38xmO`_^SpOzp+q<*GQ;z<9HrfjRQXrGu`_e&9-}%?Ee? z&A7P!0$Xzrcv1`vbM$#}b2PT9$Un$$aHQ=S3n?)v$t&QJm^(3T5OJ zP|aa3073UX=a?S@jUyUli24)c#^1ZDV8(l|?{6*NUQoEP397?UAg+_Wl^`ypy+n42 z+)ZP+AJkfl*fI}rfFz)E3h~^B*PtET&MJoVG^SCAaRN#xQi(GGd`LYfeqJ09^QwKT zo;TvPm}=V z&{Trj>q{5Q4uLz$SPe=g!f#o9+dYgvvg%5BjZ*_v*nJ4Eb(9(A#IQIspIeU&w-kRn2n0B; z70oh|5bv^$SE%i=0DG?50&6Wo+Q(YJ}jI%f-Z#-5q6=R;gm|1MU)-la8? z?f#I0yu3dj&6bi}W9|J{WMMw!1jR``!IRI)96&QGKaWuHJMu!|0S{s}d_jZ9AV{6+ zL6JdzU};8AZH@bs!LC$ueH^S&U+tmnf6Nbf{#>NcV*InYjo(Gw#Y^Jvm-^b#t0qM1 z`{I|nobOeLN$V9oI3QV zsX1^t0;P4KarR%IC({ZbaL9|vde*L8q|{bL;PmpJK?G>xm3CeE76^12LU@jP$uYEy zYghbJz!|&*IRlMro8WQF;<*QuGf<6`KZs}X2@l~UihZyMl@^^cEinGj=aI3(I0*XW z^l#zHLe&66AbYYQlW-4R3bMXIz!>l>hlcR_E99R=^UGhT{cPyc`H+cR?jPNl<#wUg zu-#RKzmqlyvaENL!S!=fp#mge;k;~)frOX9cdtKLtBHJo1u(%KY(b1K;J4^d&lZ`{ zq-b*cbpXI5@nE?4h(rx(MZhCZCVI@h=C)4x04nD)7tg~{#RN3yfHk3q*5}^2!{}kj zuxqeTnc(Q_ft(N}Mxe1kI=Te(H}`!m_@DDXlam5tDPD}tooIiYtLE5PMDbX|=^-Gp z!2A&ZG^jBjH7aM?9pBy9XLlCCttj4DB>0R$q1GuE@s*619EZHT_e@Jk8~hz&fj^aG z)EM;Fy6*`;5%0}(3O)+d#8rX{eYHW^lpN_XPz=8e=qAgFPbAwg!Mr+l$X#jDb>$Q|Y_M{bV z6R+nvMTOYVMfO`I-;ge?LgczyPR)Kg6JNLJPX~D@$ylW^!AuA^6J=3_ekf~M?+CWJ6Wk<1^Fp7 zi`tRGkn=mfjnTfdwXyBNZRg~<;2OmsgL&#xp&14Z)nCTRKskQEsYUHQ$72|ldAFfE zFv+zg*a1L;HpMuod<8jU`x~jPkk%Uh7S>RVBm=rIwr{YatZ?`e`zb23A%GI8A+~^O zhtXr^`A4VB#76|6pI$N2^l2gBuy$2JBal=&oXE(hi*L|KZ>^__CIs^;`udKFH;_Cn z=mgb3tqv7eATkaFjP&JB&6j1#iD{5TF;S(9x<#!nJGL!6`S7;WHR{j4M;pmh{F>>_ zJxIiJOTwh>F>m!8srSGY-nQ(8sBZa!+&o(Ymfyu2DK*ez{pK~q@S9ax$7yOH*B0y8 zP`KPxOoJvu38n32ay0Zp5g6QyF2oktK((u~%E!2A&E&L39jwA?`*qaj_iF@(G`Ps)v>>3Urqr zb&Uz$`=0QS5pl|rjc(L2muT1xQszqdXKke>LX{C;-#HpAfqGF1KS2(`E35UVZ(pNP 
zLlU|n$efs{&S--wglKaJ4-y1tk-fPT;t?`L+Q7ob>Ye)d*($$5Z1Pv22UFL+NhEpW zNt2;6oyPD>;-Z$Sy`BHl*q6scy}#{CL{p)%Rk9>yP1%xtA3KAwMpSmP?;%7*L`7xE zGS*=bhHME(i|o6xr?Q7^A^h$+&-0w7^L<{=>v#U@jPjY!d%2hEzV7RueAGy+bw-p< z4wQpL!4igThZ~)5y>NoX`q+Qfi2n--00CG~HyFKA`u=ylNJ|Ogzb$C5d8vT7^~B6=Z_Hr)#IUux{94wql-hV=BZf_`(v{3 zsOE7yz)5(dtbU4h}1P;mZbWhbMM4E>o?GXp1BA8|}$OE0dnhR{qLZG7V< z11WDEeIfk;BrMFBUOWR@is-7uK}${Fi>6*1{R{|pR@34t~ z;sx)a3!Ngp*YC<1oF`(Nh!#l@&ZG(l4uH=C=NFUI7-B0?M{t|)$^{fp)2n4`*D{JE zKGHD$BCt-5d@~qC{c~nQvd=*n@1~e~>c?S}p}3}r^*NyBgA32Oa?@8!D-;8I01;!m z>lK@uk-S5+bGZc^tIU_KEBViZ=7G3GTe^s6#fV<-p?Kw0D-#W#FwSqz*GE~gZZ_>N zPhYedeK{XR_n^bNVj!3|n3)j0969+EILIgXouXDmhz&n2 zV@HuJaoF(Ob)JF)ytk@|9)bz&?ic z+YB+XYq(=f+2m>~U|MFJfzvyHgdp`0QH5m(>ep6N%%(A};l(z9+G?$cEUVTX?BF$H z76Nt@uNRCMJN10yez6Ud-ts~pAirH{2qU{k7J<0Xz!A#1{?SdCwhJQMmlldy9YsNF z`3Cj+DTo|6I2|*&%)_tRA&L`{78+*fVD7bS5j0@FYW;p40Eq~q(>>CSWB_89Gb6E3 zrd}jEOg+LH(>Q*q{pdpH#x-DL*fT3^-Ew{Uaf?K36O=`v-Q0Joh4nZ0MdWA*JTZMX zSA+nxc0KA>{Gmj`M{B%(Qu^%H#vdDS)b+VDSFpHM)IfP}S0tgOL$K*^y_rs>-a^#VvSm z5i2@G$R>i=uZO;Hlyw=pt`r+D8L-qJV+7;(UUS;}-3i;wrTgqNL?ewIPS**S`BXAx z3udPLu)t5S8aW)Zc0k0VE}&jy4lOPTpG@1DfgBBSSM{o-lg2e(Gx%w~_<;n7IK>vI zkXs-u@*;7L^d2!2@6n{xpvLcebdDSD(Ip)k`4*m>d z0l7z1da8|yq>ngpL$cwCQ@Z-sSVfwHEpAHa;|$z~-O9^~IRZ}iUcO5``7T;R9x<^$ zo|}Sq^~J>yM7H$$0O#QoLT3wV*9i;}*ZHhnJ%JaQy&3WR85M?~HdtXDpr z?c7J2R}GwsDsjd%B47PZszvF$s1vS-hRkR7yo zX&Zaura>CH(7{Lo!og9%K{g}NUeZ1~w&++|_5JJQij=)s?ZOZiogZ^BronOs zj_tZ_IyMEut>K~C6bHwe5937fl{W`=22qw-caYx_(OHZU{xp=Q6Bk#(K95ep%+W4C z#AeLK{MztZmSSlvv9o!z_AAoWAXaFRWNM#l5H%f33S0XF^qD(k|2>!Msa(h}7mtqsk)p>lR}UbeLHTZl^C7;}hS`^)go>|Y7~TivPJ`D|gSPVQZ^iL?7}RHC zd0Umc9P!oqt{IHJhK@9MEF!gEUbF9H~TYFIm0lg zCB2_xn{MK~>~LEi-LGyStD&E|COWMF_v`(Sub&nm7Jeldy6%C90fYHv3adVIlLoJQ zc(J)R)An!Tcmr>yZK}}9{wg-&PiZEc3F8!y+c+uV zncJ?e*CB4(#!|!|0EGcu@ZN4;w$`as4f+!aIEz{W1i99PB`nd)|+4(1`nk&K`u? 
zy#coga1;(_mT2%;tlJBs6t+=Km3pJ1^M>V}Y1>bbt844RGz9jxMW5o+m!0u{rbOQX*8Z;$wl`F&Ue zBHa+Q1QMg}634>Icf=HJ&-?tEaDAuRCAVWFBj!*%a<268kHVEnj=klSR}<>Xihhy= z-cf%oI`kok+-*s*P^wCZoQG0&DCU`k|Hoj%=NW!V4m*q|T$L;~Zdx!|yJo7h;B&Y8 z>vpX{%yvzhTO7!>`TSyHVlGz@TIII}9WW}iI5 zo7FCu`?fuJgFJvt)LLWPTW0fyZiMFwnbD^%SHZpA^%<)4$f-!HhHfPsXu}%f9eQgt zTt=#Tr%u&E_q>?fXQjE(+Uk3niDsF#%7*8^22P6$-dvjbOGo-*Pi?j5%=IuT+I2?< za~5j$f37n-T`f5UvUA(X_{2Y}B6pBUoQ)Ue?dH0x+-nHeyHszqq~psi_hBH;DvD7+ zh>{vR0Jk&y5 zMFX$hCU46k-oPYPq4VXQbcMeH#t`*-`uYpB!9T;OY!agkberPJ)9GeYd`4d_Prf&U zC3Aw}z&dFAG}y}dWufKr=9&lULfcjQar<3Phf^YtoDy3d(|LZeUXdTIW?!zqI9j_9 zGvQ-Ag&qce5Yrt*HkWp7eE6mB4KW`E>T zX?UPu^SCN*wg;%d8$A()=zPObcO8LmJ&fAheLD&K7T5K2wD6Fg2Mk@tf8gU6f7}Y$ zeogpG-qMqoZ_NkP0hF)++WkK&4G|{m3q!VuLXdsC5zmYMAvmJ{{y`{aZLWU^Gfhv=$u8~Z%0LsKL}U%9iH%ljPsWoJT9Y8dtr{ll57 zl-HqRw2?a3hjY|Q6uB~csU=l*Z2u+nqda#_51hz*we7C*r%>UMGz;<*_pzSzLOd6BJ2L3x>ovYC z14NT(n$?;p`Uyl^CapA>S!VpJyy3M96v7d4Y?9ySG7iVkCxBMi`}}N+;2keM{p^QS zW{B7Hi8Hp8QlQke6nVC`@1>}0E-0{iL;5|iG|~77JKT9C->9e@df!G*!;9X0X72sM zYr3O8@TLjxxp(UaHVlh1JBX}Kh)RX?^>bntca-W;-rnUiRWNv}gz5?+oW3PbPn|*A z)Mvm!XC3j=U|&|6W_uY7pRu@fHXtP@R*gX&j+9udyWX9wG>v`wU(y40UP=wJ2LE!C z)tPu=!#`8qAZV`PJV&44M!INntKfB&+e#hxD4_cwJXt_T`p4MKKAk zzs)WlSDl(oL}Y@##eNr_97`3n3bp`YRy89Pm}iO<_~I!|c~L?e?zZ#rsC0l9gBfrexU4RZ!da=uLxqk*mN~p~FQ=4d^gh zGtmi(3*MWvT$fLhwk?+iF$$SRd$`xPGCNe|lfDxR3D3}7kaSYi;W^<2{?^FWYvBlpW@el_VA_I45x-v26-xf9BARd>tSSnx zJC_!jYu>S2U|#LLYr(8^cf)9|G)v=|KmHRqAtwkuEk-hQF~Is<|8#A)3$$`S7d5sg zPgg5205a2cBOKeYskmYq~ti9kv0mq zs4C};Dnm+}?O5DWQ&W=(R^Mj=!gjaotYR5Pg)c7=SPUe124DzGxxm%e9UFO5g|f%1 z_JA`d?TL_W^!X>YfQ{e|f+=L5gAWw^-W9SCazJdS1((s$840!s|DI!-rMBAZy7-S1 z!;QPo)*bkAFxHI9)3C4gJ8Jt9C3|8BhL>ZcqF52vNAJ(Asivj1R(aWB zCL5n`sH`fU;bj=u7?Vs+>c~l{)zt;JY^@{wo z5La$8iY|jJ8${aQ1XEtx49kB~o2~FHMbc->12uqQbVtNDxFXZz9^$pb-N2)JQH3KB ztq4gGTF^s7IMR>uJJPa}B zSxC-j;x*gD^MkWSg>>PrGqQ{GexrK3a8&M}kGVD33mEmpF?CM;>8=@Rb!me0CnvTM zT&r1%XgL|l^VHsFXki*#AiV*%0nanzHud-MZH~_v{cimKd&u=P=AZ;svc=`QHK!PsqjC?&bcOO_l=EJQ1+QtFy|S-Z820F(IbfwX 
zGoWW!p>8tQ|0dCV2yb{)-}MUVz5M5)|Gjc)b`&MFUkUg3D*Cbov?^+;LJJ+_-=V~E zCDeJ?Y)_v$Z}u#kr?1S0lJ8o%XXkM7{la6SkP1F5q4_rOZgr>JeT6lt&L#(U#Iz^N z+UBdXsEq0;b<7WO-;n=U*WaEE#hW%hopn%_0=XUPCY4x!lJ!W~D=FOUIg{#$kA1oR zm1$eKP8aX>XnP!U^zF^qw3x4td+zQ*OJzTlo_5+iLCd11m9b#Ney(DnB6)mgsUr=Y zb<6qGSxW1x@e3t}=*DZyr~h#V|GD0D_jyCY44+7SUI$Y(3YR5!v)UzGNWHIps&}XQ@bOaS7(tpF=DLCj^FyKYcrZn=x>E`erWe ztIpUgTEI~tY!x@89Dm!@AU^Z%XV1ye-|zdMf09cluasAPTIE$FaPZryz|933v()l2 z@A;2ylGixmzOnM`z0NiCa~#Xf5Y0`ngx?&c}xnGW7H&VQlzj?hzNxp4BVSrogJIt}|3(mV3+#R#!u)a~uukf$m6 zS*>aIW8{Jv!^2s%Pc1aqNA4>LxJqO^?_Wbr@0^&JZ6sF)#6Z% zv~#W2)z=F(qx+-&2yH?d<=6f;JmZ$yOYx5P!qhVxkERSXw2Ky^+Nwk}Ua|S$zIwzq zY_vCRc(B57(kE7>p?U>Z#%zAlaL8$gnVa zwNYBUc-{7{ZS;2A;-@1duI|2Ah&SBVJ@X+E=C4m&wV!#ng<-7goiKj#?XxqVx&#D- zhe|@-3vN7On~hSZTA1xA(ZHfUMsjE3#zo?uTFTwyCEJXCp%1j zyCvP2zvM=8I5PC*7?4-w`Q5oM_anZ4SZ}O4eq7z`p2|n>jkhX>GYUoGglF+LlDamA zn8ut+mFI+&b`5xo2elC7z#1?-K2B)-*-ky4~-J;b^F@oBTmKR&(7)Pg;}Z?sQC3$o(RQ z*Y}TwMD%_#*>QM)Dmj@DBDahv7_E6Pc3|%n2NlWfV2Zr6tG(rVu|svw=avH@$gIve z)q4JlT|ns>_qeo0Q-!OVv0;Cih0Q*Y$D@=TQ8}T#d3t@Huit3nQGP{%oS+FW6{n6l zC!k%rNM10{Zhr_{8mfTf48~u(B3HGVL0_F|V(n-y_)&ze*{pxLBAtVCjO1AS+n2E; zDBZ)}PV2T!SUTz4y4YY_-D>)(F`;nGdb{+|!)4SrlmzuX?78=5Nn2{(eo0uv%hV6` zn%^^hiis2zC$N>_J`E%-EX=;yy!EX0T4{Hek1ZP|BO8TsJb7|9zS+;UC*4gsKjG@T zCl%L^PG zXO(EtmwkF+X(?2s8}T1shAx`CS@s*QjXKgs$jL%i{K>2Baa6dlgS?S6SA8O-RUB`$ zX}?DKNrvN*0v8cSqNK$UOc@Wk&rj=L_FEQ9s+5ve9TN)sflG5SA4N4cQ|#=V?AI)l zan2{C#fYv1H69|lE+mKbzqW)ukwWC0AU|H6Th1`lFt|cz?1suP*-YWNtg*lJ$ zF2V3}cfZs^TR|FNIH!Mi(h+-WC}*9uiPt{|gAFF3~Y z{Y4e8!VtRdi4H7BRo?AlJaxP*Cn{psb0Q;b5p{wkd~PT;<%VR~Eau42#^9U%R~QXk zO;XZs=DX{Ka#7Xp{SicuLtPfo+pg|zX8?W6y%$V1)YVHQyk?C71q58PYS-~~Dd$`C zAJ;HM__%JICZUaA@G;6w^Y3t3W^@*+lFX;-dXVPs;m&C;OSi)Xbmd?kCFkz^&v z$eAL@>N~@iSq3RdtdcTsnxzd)E?nr%LDvwy5lf;~V@a856n3*mR@P4=cwXw?Va;^px_pg$Z zEX^S;cGEBn^py{kI}gpnJK*E_j0k{W;f#tb`T)*5v~aYT?T?#~W=ugH6B(Y8L=t4t zjgWVqWWiml8|6#40L^fiI&g0Un2CV=?jRrZl3?F=-uj)BbM<^FeK8^Gw>5iPNypth zJ`K-JNJ?^)^!dJ8R2#67Xv1x7ZEd2d`7r*1IpNr`W48@a>gthP=gw_yPsproCMUBD 
zy$P&tH$N#GR1*l}3U(dw?Q_GGrJ_y)9l?FJPaZ!mxZV`W$i~KIab_{V#wL44VY%}!+kSh=->*SEsAh%pn%$ADWSOGo9GE7sY5V!}=iAORGc&uA_-;Nl_USdF zx!kLjB{N$Ri%Q5_#r-?=Ri13-I@+d~@3pGQ zz>JC0@axXS^nSvPq9 z@M;XO3q+*cEWN$yo+Rm0SVyz}Y`7%Ah9!&wrpCbZ98ufoHT8@k7ikT6EbrvxchP6J)Y9+srLs23TJOH8I zfYH9|#VY9^;8w3-1L+fTKdYi;5XyvdXGaj@uYfVJCNNy#_HhTD4d4V1A=v_N!Z?Yn zYdRtK6aZ#J+-%_Gpj1P60;Dn|ONWydP?Fhw^>tWz5|_a`oZN zE<+&%sY}TaZ1A7c-rsM8srn+MJumdmFg`t$W z4HZAF`lNN-#vyEZS2TVK#tIUkP&}2|JIV&-hRgkHkC>bos$7HqNQ0T4g0-St6CXTG zvX07}f1j^ZB2TT{s__8rIr?>OGL&7qoDyz~o|}hV&)0)!yuA9&r}oA0`WT$ZbcCV3 zOsoaDv&enXp_p@8iNjBwLHdpZ+mO}JZMq3-fY_Bn^%|L(@W1aUC8@v4+Xh(n3ymyULO|7hNaP)&Jgp% zzSKJ;gHP;!3AgfdKu=I1h;Zz6RdMb=d4gM0pKKKxExYUZ#!+A6O-Fd>^H4ZD}KPM-&@jNuG@qI-U!m;SgWS;}Px-4Wr6)VHz z@PKcUBx^(cR0?3k#~Uq!mqm{fjDXm;uTYI3*@FD0i}eR;>CR}%5Kfm7)3Sp+M* z@gj|?bJM-~&#qnLh{fK6E+Ht}Xs>nfUY!HF=0vJK9X%_euxD3W1$sp?>Tu&49I@X8 z`NH*}V824A=@H%2rL!8fr-)gXNZDYcHD(_Es{YWIbLSaxgy z=P<;FhI$4yy^xy%kc!de%MAkZA>OdTmhA6w}r_ zg=fN{#~2Rt6ze?~3;N}|U9`6)=<>p6ChHl2Sgr_pVoLZbTlStdvZ?AEXy=))UdWu}Q9kxs1n zZj0s5@SlBODCaKA?_`(#&f81mQxf7`2!TnUUN0GEGZtm^>MuS zdJ-3a{f(zQ#AsZdh9BCI*SL(gg%f+lR<&0xQ6n?Kg=oYX{fXqtg&f(Pyn71RpAQbl z7@R=Y1qBv~=`qtsXqaMR8n@1G6t^qt8ROl*su#_p7@nB6*atnPY4stFRk^2u_1C-4 zNrmr@I-K$Ai^6VJxi7H$i-CSLIHkHaKiC~X%XSOOaVDeZMUXtw(jyz$vAjGyA2+f) z_zLWrC0{U+xEOZb45FJ}owQvc`IMgDRV@l+D9An|peIXNARSUG;a?cq(7abo^e;We63X8?}+(pwNm@ni4 z^Y~sAsl}^fxx4t!;(MbG$MkF!LVa*|8@L)3Z~YXiUK2j-fV2W0VE)aW#j3dra|C>T zZ@y9cB2LT;#sw{_pRN7gksiiV+=XYjmC};#WgqkOn}WUJQ=2o=N6WUlEztFucQo7e zITB6h?WGg#Gi&XoCGPG29S*{usNr=L4!+?eiG1Z=$`LDg4@I8AJXEazV6jygl#MnW zTV=ZzdrF)k_+?!JW8P@Xv@>rjyE2>HeI&-99y6}Op!DgP;0A)Dl>>{ysEo=$l$Cuy*IsBHGRVKena<9MSy zxz9p;o45lysY|+A!jV7_+K?JlbY5Ox1jn;8;<5nk==bs#Mupp@Zv5!x&nz~b1$x}G zdmd0AD}`%22!fk)y+&Kof1YMK1RYEGu|MscR7eIQM#Rh(>d~X>3ouu^q$!Fipee4f z$W^8H!j@kC(Q+NX)lrdc3N@XL9^pL*BVfFKR~ATrB{B_td~dpq?B*5`NosE z`G`|4;tP?S;bsoe8_-zvS|0iVq{JNtOTMb%E=j53cIlY|K~-$MOGD5EtWZtHH1z2` zXusICi~g87dydi$2v2*CV4d+_zI?&hNQd`N{^_Kz5}ow+{NNQ5Cml|FqgC+iz5ybX 
z;)rVo`#gvqb*e2T&^uNEQ*1f~cWu?s5sGYhqTZY>3ky#!xEDXA#5|(?rPW6yxjki5 zoI^9jnpWCt>+VK!w$kZc#F)rWu>QCT6clbH_OfEO zJ#Oi8;4j(j6Sr!haCv2vqkVd8(Isf*luK#OwiNU#TR3Nm`RYioJ*{Yw{I0yoMe@z5 zpQ^(99NQci6e0QaJrj7;X(ExL7AWPTyx*bq@yB>JIAp_88!fPe;~Bat_pp{Am!g%5 z^aNv*>gJl6-~{7eDx+Dhbcs0=z6d{nzBfjFaStIRBv2J@-9D#nImOkrwD3e`#1kEv znf_fTJeIbupgQmr(J{@sj8E%#8vRA+{J!N2O^c)nIAEl7Eb$#w?d2qJgZRt0FMwF*j-q^sy(Z#; zWVi$zo6M2G4Rga+*OiNIA6zW*pk)!ad?fYbek~A?yqCu1tAR0ls6ZTSebGD=!q?O@ zph!;uRvD3J1njw_C3n9uVId*S(mK8Q$#WN*4Og-B)mn*_(!3JD;6l{s`$0h3uKQLu^R zzC)J%g^69fS+JpECuUFA0gx>+*<4bxch?L%dgSwvJ zXX?T1$s?jCSLQWm*j>Z1ASu2_^ba;dy4}Sz5E0{?7uJE_I5es>*BcY^?%(!6Jeg=nz+W3$x;{-vV5CqPBY-xOGck!s@OgIC{XqpbABBl9I@nua z(!KhU@G4LFrNVLa7^vF{@0vbBdnA-GrsR9ww6zVe%=DVZW23=)$`oxK++$A0jjLMe z`1@B86P3S>t|LA!->jmLi0s%h@X+at#*yP7X?)@?dE{icik|+V zDwtH%lK+IB0%!cH!FDbeJ^Z;TDsVK0H}LV3C(^|z<=BQK00g$@F~lsMXpmiGeSigK zO!TS*!pxTGGxDbp{rE9-hEg6XxV`y<^_ac4-lqfs9C#ydP;oK^Tw?X;-a~Q@O1;E2 zOt~|b&>3VPou$iP@C`Mc@irSWywN^w4kO>2Ha7nFC%n%MI7EGyEcL&3L7MKP`KfwI zSU+2HnenzmQ-7JgKJ8zgTH%Hh>BT@L}_A-8{d$df2QVAQ;C1LFYITHbN z1;1}ZAC75o#4e=tZ9*LL`dN$UoLV{x;-$v!pE1ATIrtL`Cqxp554!vj^+O(A<24Lv z_jfe%JYPm626S3_ZE5nb$og2`n+B&d$89<@lr1_jb3DaFnldr^=Cf)DhyfqPmvG=P z^LU*RUUT;b066R5p$DK1;xld(l58^Ng|9?3s2YaN{Rrg@G@R2F7Y^L=+|yGnweLPh zi>7CsDMi?p*@UBkeOVfFT9TDvX@0bJNbw?C&|Wa{NhaQr5d{d-F4tB&0!u&6Z(h~Y zkBLkL&uEM@#m8KcO}>^{tS0{?R5JV~;vxNoyjj(F-K`oMro5DtK672y%<^D|nVbZ# z#Qq*EnlCjVQ?7u${RU&>DN-A%d!3jZ%wudPNj{~&kakfyBpmA`5;k>{Hk^6lA?B`7 zJ%47S4dSln5p!SznP-!!Oywyj6UMSKGkqn+M)U=K^u%b#YY^F#9tecBrvTov+sjL9 zM6q7|Yjgb0?v~RR{&SRqTU}GDS=)BOTg}bQKgN;i&lDI&B!BV3?vM5{$jn%5)vTvE zu%q|vb|}LyZ>6;vosvMS^Kf@pc|Q7p55L^8MbBz~t!fuKgl-2Cl16F!bJ`tK-&YqA zHhJ#&gD}@^XNS{2VFV5le@7gtZ17t8gW8~8PF9cl7_>Xj`a5L^@i<3!fD?hDFXL84 z4R~a%`-BTzJRC+}kejH=pt$T-FFHAi*#3Z8z9dMuMKdRJgV+eaE{UUNT)x<98w zeNNIu5X~P=i+!b1^Q%PEWpsFGme|l^fkG>3ObU@KEW2w6xM=e<+5~4(Kw0kNaq3xY zO|v7&fXIy5!L&e`uo)PCRN3yV*L3{PSl^Wh@l4JvNs``txpApux@%7fi+^WVM8dq; zrZmuv#cuy}%?B$VBJiUPv+&A*^xue65u+cn7iOgQfs9;PAnk>&%pSMh4Ign|T!$TC 
zp5^{)4j>|wNpuyqCy5;%*FOj<95n!dHXxN#{<-Q?=+8i49e0gNsHYOI*+xLm&Rt1|+# z2=Xt|q-)8o3#>{x#!)4br2sm#ncPZr&tq){l|)B+D9i9L=ofgW5dNmi0*nZw$fBd8 zzeg8Kkp8Osys*Ey{w(|lNEeV!{7MM1Rscnbh2hF>As-u4O{a(PJo@h*KYq+lq)B6* zR3Eb?Jyc}mNTQE0UPAowi~d!S9scUmeNs8~%Si z?x`IyIobI+DD}r?lVAhHf9R$Jz-c}G zYMkN!dffkt$oy-a0FgQhU!9Y&M&aMzg!Gx}Wnj=A1pRU%F=(Ve|F1U^CnnK Date: Mon, 13 Apr 2026 20:50:38 +0800 Subject: [PATCH 20/76] [Bugfix] Update Flux2-dev & Dynin_omni L4 e2e test (#2723) Signed-off-by: Didan Deng <33117903+wtomin@users.noreply.github.com> --- tests/dfx/perf/tests/test_qwen_image_vllm_omni.json | 2 +- tests/e2e/online_serving/test_dynin_omni_expansion.py | 6 +++--- tests/e2e/online_serving/test_flux_2_dev_expansion.py | 4 +--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index 387e874ad5f..97c1bbfb3c7 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -72,7 +72,7 @@ "enable-negative-prompt": true, "baseline": { "throughput_qps": 0.1, - "latency_mean": 2.34, + "latency_mean": 2.7, "peak_memory_mb_mean": 61000 } }, diff --git a/tests/e2e/online_serving/test_dynin_omni_expansion.py b/tests/e2e/online_serving/test_dynin_omni_expansion.py index 39b6dc8e212..710c480f08d 100644 --- a/tests/e2e/online_serving/test_dynin_omni_expansion.py +++ b/tests/e2e/online_serving/test_dynin_omni_expansion.py @@ -120,7 +120,7 @@ def _build_i2i_messages(prompt: str) -> list[dict]: @pytest.mark.advanced_model @pytest.mark.omni -@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}) @pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True) def test_send_i2i_request_001(omni_server, openai_client) -> None: request_config = { @@ -136,7 +136,7 @@ def test_send_i2i_request_001(omni_server, openai_client) -> None: @pytest.mark.advanced_model @pytest.mark.omni 
-@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}) @pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True) def test_send_t2i_request_001(omni_server, openai_client) -> None: request_config = { @@ -149,7 +149,7 @@ def test_send_t2i_request_001(omni_server, openai_client) -> None: @pytest.mark.core_model @pytest.mark.omni -@hardware_test(res={"cuda": "L4", "rocm": "MI325"}) +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}) @pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True) def test_send_t2s_request_001(omni_server, dynin_t2s_openai_client) -> None: request_config = { diff --git a/tests/e2e/online_serving/test_flux_2_dev_expansion.py b/tests/e2e/online_serving/test_flux_2_dev_expansion.py index 9d96a48c0c8..f7477ed803e 100644 --- a/tests/e2e/online_serving/test_flux_2_dev_expansion.py +++ b/tests/e2e/online_serving/test_flux_2_dev_expansion.py @@ -27,7 +27,7 @@ NEGATIVE_PROMPT = "low quality, blurry, distorted, deformed, watermark" SINGLE_CARD_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}) -PARALLEL_FEATURE_MARKS = hardware_marks(res={"cuda": "L4"}, num_cards=2) +PARALLEL_FEATURE_MARKS = hardware_marks(res={"cuda": "H100"}, num_cards=2) def _get_flux_2_dev_feature_cases(model: str): @@ -48,8 +48,6 @@ def _get_flux_2_dev_feature_cases(model: str): OmniServerParams( model=model, server_args=[ - "--cache-backend", - "cache_dit", "--enable-cpu-offload", "--cfg-parallel-size", "2", From c9e2e3e8d764875764ab89c1bfbb294314959e44 Mon Sep 17 00:00:00 2001 From: Chen-Yo Sun Date: Mon, 13 Apr 2026 10:53:35 -0700 Subject: [PATCH 21/76] [Voxtral TTS] Correct decode steps param in Voxtral TTS (#2524) Signed-off-by: Chen-Yo Sun --- .../voxtral_tts/test_cuda_graph_acoustic_transformer.py | 8 ++++++++ .../models/voxtral_tts/configuration_voxtral_tts.py | 9 +++++++++ .../cuda_graph_acoustic_transformer_wrapper.py | 4 ++-- .../models/voxtral_tts/voxtral_tts_audio_generation.py | 6 +++--- 4 
files changed, 22 insertions(+), 5 deletions(-) diff --git a/tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py b/tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py index 6f072944d9a..847adae06fa 100644 --- a/tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py +++ b/tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py @@ -78,6 +78,13 @@ AudioSpecialTokens = _mod2.AudioSpecialTokens +class SyntheticAcousticTransformerArgs: + """Mimics AcousticTransformerArgs interface.""" + + def __init__(self): + self.n_decoding_steps = 7 + + class SyntheticModelArgs: """Mimics MultimodalAudioModelArgs interface.""" @@ -96,6 +103,7 @@ class SyntheticAcousticTransformer(nn.Module): def __init__(self): super().__init__() self.model_args = SyntheticModelArgs() + self.acoustic_transformer_args = SyntheticAcousticTransformerArgs() self.acoustic_embeddings_levels = ACOUSTIC_EMBEDDINGS_LEVELS # semantic_codebook_output: hidden_dim -> padded_codebook_size diff --git a/vllm_omni/model_executor/models/voxtral_tts/configuration_voxtral_tts.py b/vllm_omni/model_executor/models/voxtral_tts/configuration_voxtral_tts.py index d32a882e786..0f22c764a02 100644 --- a/vllm_omni/model_executor/models/voxtral_tts/configuration_voxtral_tts.py +++ b/vllm_omni/model_executor/models/voxtral_tts/configuration_voxtral_tts.py @@ -48,6 +48,15 @@ def _remap_mistral_audio_args(self, config_dict: dict) -> dict: audio_tokenizer_args = config_dict["multimodal"].pop("audio_tokenizer_args", None) audio_config = {} if encoder_args is not None: + # Default n_decoding_steps if not provided + acoustic_args = encoder_args.get("acoustic_transformer_args", {}) + if acoustic_args.get("n_decoding_steps") is None: + logger.warning( + "n_decoding_steps not provided in acoustic_transformer_args, defaulting to 7. " + "Please add 'n_decoding_steps' to params.json under acoustic_transformer_args." 
+ ) + acoustic_args["n_decoding_steps"] = 7 + audio_config = { "sampling_rate": encoder_args["audio_encoding_args"]["sampling_rate"], "codec_args": audio_tokenizer_args, diff --git a/vllm_omni/model_executor/models/voxtral_tts/cuda_graph_acoustic_transformer_wrapper.py b/vllm_omni/model_executor/models/voxtral_tts/cuda_graph_acoustic_transformer_wrapper.py index a4d58df5b15..ff053342dbe 100644 --- a/vllm_omni/model_executor/models/voxtral_tts/cuda_graph_acoustic_transformer_wrapper.py +++ b/vllm_omni/model_executor/models/voxtral_tts/cuda_graph_acoustic_transformer_wrapper.py @@ -49,7 +49,7 @@ def __init__( self.acoustic_embeddings_levels = self.acoustic_transformer.acoustic_embeddings_levels self.cfg_alpha = 1.2 - self.n_steps = 8 + self.n_steps = self.acoustic_transformer.acoustic_transformer_args.n_decoding_steps # Graph storage self.graphs: dict[int, CUDAGraph] = {} @@ -73,7 +73,7 @@ def _warmup_and_capture(self, device: torch.device, dtype: torch.dtype, hidden_d ) # Pre-create persistent buffers - self.timesteps = torch.linspace(0, 1, self.n_steps, device=device, dtype=dtype) + self.timesteps = torch.linspace(0, 1, self.n_steps + 1, device=device, dtype=dtype) self.fake_eos_one = torch.tensor(1.0, dtype=dtype, device=device) self.fake_eos_zero = torch.tensor(0.0, dtype=dtype, device=device) diff --git a/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py b/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py index b5d11617337..4041a53e55a 100644 --- a/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py +++ b/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py @@ -108,6 +108,7 @@ class AcousticTransformerArgs: use_biases: bool = False norm_eps: float = 1e-5 sigma: float = 1e-5 # was 0.01 in beta version + n_decoding_steps: int | None = None # Number of Euler ODE steps for flow matching @dataclass @@ -436,14 +437,13 @@ def __init__( self._empty_audio_token_id = 
AudioSpecialTokens.id(AudioSpecialTokens.empty_audio) # Flow matching constants - # TODO(chenyo): hardcoded, need to fix - self._acoustic_decode_iters = 8 + self._n_steps = args.n_decoding_steps # TODO(chenyo): hardcoded, need to fix self._cfg_alpha = 1.2 self._noise_scale = 1.0 self.register_buffer( "_timesteps", - torch.linspace(0, 1, self._acoustic_decode_iters), + torch.linspace(0, 1, self._n_steps + 1), persistent=False, ) From 14f79109000f64f61ca78045abdf5518c0b4fceb Mon Sep 17 00:00:00 2001 From: Sy03 <1370724210@qq.com> Date: Tue, 14 Apr 2026 05:16:47 +0800 Subject: [PATCH 22/76] [Perf]: Speedup VoxCPM2 TTS performance and Support PagedAttention (#2690) Signed-off-by: Sy03 <1370724210@qq.com> Signed-off-by: Yueqian Lin Co-authored-by: Yueqian Lin Co-authored-by: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> --- examples/offline_inference/voxcpm2/README.md | 6 +- examples/offline_inference/voxcpm2/end2end.py | 6 +- .../entrypoints/openai/serving_speech.py | 17 + .../models/voxcpm2/minicpm4_hf_compat.py | 114 ++ .../models/voxcpm2/minicpm4_paged.py | 448 +++++++ .../models/voxcpm2/voxcpm2_talker.py | 1162 +++++++++++------ .../model_executor/stage_configs/voxcpm2.yaml | 8 +- vllm_omni/worker/gpu_ar_model_runner.py | 17 +- vllm_omni/worker/gpu_model_runner.py | 1 + 9 files changed, 1332 insertions(+), 447 deletions(-) create mode 100644 vllm_omni/model_executor/models/voxcpm2/minicpm4_hf_compat.py create mode 100644 vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py diff --git a/examples/offline_inference/voxcpm2/README.md b/examples/offline_inference/voxcpm2/README.md index df48a85f569..e9827307997 100644 --- a/examples/offline_inference/voxcpm2/README.md +++ b/examples/offline_inference/voxcpm2/README.md @@ -58,12 +58,12 @@ The script accepts the following arguments: ## Performance -Measured on a single H20 GPU (80 GB), voxcpm 0.0.0, PyTorch 2.10.0+cu128: +Measured on a single H20 GPU (80 GB): | Input length | RTF | Sample rate | 
|---|---|---| -| Short (~6 words) | ~0.81 | 48 kHz | -| Long (~50 words) | ~0.72 | 48 kHz | +| Short (~10 tokens) | ~0.28 | 48 kHz | +| Long (~100 tokens) | ~0.34 | 48 kHz | RTF < 1.0 means faster than real time. diff --git a/examples/offline_inference/voxcpm2/end2end.py b/examples/offline_inference/voxcpm2/end2end.py index 2dce7508975..ce404bf962d 100644 --- a/examples/offline_inference/voxcpm2/end2end.py +++ b/examples/offline_inference/voxcpm2/end2end.py @@ -71,10 +71,10 @@ def parse_args(): def extract_audio(multimodal_output: dict) -> torch.Tensor: """Extract the final complete audio tensor from multimodal output. - The output processor accumulates per-step full audio under ``audio`` - as a list. The last element is the complete waveform. + The output processor concatenates per-step delta tensors under + ``model_outputs``. Falls back to ``audio`` for backwards compat. """ - audio = multimodal_output.get("audio") or multimodal_output.get("model_outputs") + audio = multimodal_output.get("model_outputs") or multimodal_output.get("audio") if audio is None: raise ValueError(f"No audio key in multimodal_output: {list(multimodal_output.keys())}") diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index a95fa695156..3dc5f595d0a 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -49,12 +49,14 @@ _FISH_TTS_MODEL_STAGES = {"fish_speech_slow_ar"} _COSYVOICE3_TTS_MODEL_STAGES = {"cosyvoice3_talker"} _OMNIVOICE_TTS_MODEL_STAGES = {"omnivoice_generator"} +_VOXCPM2_TTS_MODEL_STAGES = {"latent_generator"} _TTS_MODEL_STAGES: set[str] = ( _VOXTRAL_TTS_MODEL_STAGES | _QWEN3_TTS_MODEL_STAGES | _FISH_TTS_MODEL_STAGES | _COSYVOICE3_TTS_MODEL_STAGES | _OMNIVOICE_TTS_MODEL_STAGES + | _VOXCPM2_TTS_MODEL_STAGES ) _TTS_LANGUAGES: set[str] = { "Auto", @@ -290,6 +292,8 @@ def _detect_tts_model_type(self) -> str | None: return "cosyvoice3" if model_stage in 
_OMNIVOICE_TTS_MODEL_STAGES: return "omnivoice" + if model_stage in _VOXCPM2_TTS_MODEL_STAGES: + return "voxcpm2" return None def _compute_max_instructions_length(self) -> int: @@ -787,6 +791,8 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non return self._validate_fish_tts_request(request) if self._tts_model_type == "cosyvoice3": return self._validate_cosyvoice3_request(request) + if self._tts_model_type == "voxcpm2": + return None # VoxCPM2 accepts any text input return self._validate_qwen_tts_request(request) def _validate_ref_audio_format(self, ref_audio: str) -> str | None: @@ -1430,6 +1436,15 @@ async def _prepare_speech_generation( prompt["lang"] = request.language if request.instructions: prompt["instruct"] = request.instructions + elif self._tts_model_type == "voxcpm2": + tts_params = {} + additional: dict[str, Any] = {} + if request.ref_audio is not None: + wav_list, sr = await self._resolve_ref_audio(request.ref_audio) + additional["reference_audio"] = [[wav_list, sr]] + prompt = {"prompt": request.input} + if additional: + prompt["additional_information"] = additional elif self._is_tts: validation_error = self._validate_tts_request(request) if validation_error: @@ -1466,6 +1481,8 @@ async def _prepare_speech_generation( model_type = "voxtral_tts" elif self._tts_model_type == "cosyvoice3": model_type = "cosyvoice3" + elif self._tts_model_type == "voxcpm2": + model_type = "voxcpm2" elif self._is_tts: model_type = tts_params.get("task_type", ["unknown"])[0] else: diff --git a/vllm_omni/model_executor/models/voxcpm2/minicpm4_hf_compat.py b/vllm_omni/model_executor/models/voxcpm2/minicpm4_hf_compat.py new file mode 100644 index 00000000000..cb3101b16ac --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm2/minicpm4_hf_compat.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""fp32 RoPE + MLP matching native VoxCPM2 numerics. 
+ +Exports: _MiniCPMLongRoPE, _MiniCPMMLP, _apply_rotary_pos_emb +""" + +from __future__ import annotations + +import math +from typing import Any + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# =================================================================== +# Primitives +# =================================================================== + + +def _rotate_half(x: torch.Tensor) -> torch.Tensor: + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + + +def _apply_rotary_pos_emb( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + """Apply rotary embeddings in float32.""" + orig_dtype = q.dtype + q, k = q.to(torch.float32), k.to(torch.float32) + q_embed = (q * cos) + (_rotate_half(q) * sin) + k_embed = (k * cos) + (_rotate_half(k) * sin) + return q_embed.to(orig_dtype), k_embed.to(orig_dtype) + + +# =================================================================== +# LongRoPE — must match native computation order exactly +# =================================================================== + + +class _MiniCPMLongRoPE(nn.Module): + """LongRoPE matching native computation order.""" + + def __init__( + self, + hidden_size: int, + num_attention_heads: int, + kv_channels: int | None, + rope_theta: float, + max_position_embeddings: int, + rope_scaling: dict[str, Any], + ) -> None: + super().__init__() + self.dim = kv_channels if kv_channels else hidden_size // num_attention_heads + self.base = rope_theta + self.max_position_embeddings = max_position_embeddings + self.short_factor = rope_scaling["short_factor"] + self.long_factor = rope_scaling["long_factor"] + self.original_max_position_embeddings = rope_scaling["original_max_position_embeddings"] + + scale = self.max_position_embeddings / self.original_max_position_embeddings + self.scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) + + inv_freq = 1.0 / 
(self.base ** (torch.arange(0, self.dim, 2).float() / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + self.max_seq_len_cached = 0 + self.register_buffer("cos_cached", torch.empty(0), persistent=False) + self.register_buffer("sin_cached", torch.empty(0), persistent=False) + self._set_cos_sin_cache(self.max_position_embeddings, self.inv_freq.device, torch.float32) + + def _set_cos_sin_cache(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> None: + self.max_seq_len_cached = seq_len + t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype) + + ext_factors = torch.tensor( + self.long_factor if seq_len > self.original_max_position_embeddings else self.short_factor, + dtype=torch.float32, + device=device, + ) + + freqs = torch.mul( + torch.outer(t, 1.0 / ext_factors).to(device=device), + self.inv_freq.to(device=device).to(dtype), + ) + emb = torch.cat((freqs, freqs), dim=-1) + self.cos_cached = emb.cos().to(dtype) * self.scaling_factor + self.sin_cached = emb.sin().to(dtype) * self.scaling_factor + + def forward(self, position_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + return self.cos_cached[position_ids], self.sin_cached[position_ids] + + +# =================================================================== +# MLP +# =================================================================== + + +class _MiniCPMMLP(nn.Module): + """SiLU-gated MLP matching native MiniCPMMLP.""" + + def __init__(self, hidden_size: int, intermediate_size: int) -> None: + super().__init__() + self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x)) diff --git a/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py 
b/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py new file mode 100644 index 00000000000..7ea5bc229dc --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py @@ -0,0 +1,448 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""MiniCPM4 with PagedAttention + fp32 RoPE/RMSNorm for VoxCPM2. + +Uses vllm Attention for KV cache, keeps fp32 precision ops from +minicpm4_hf_compat.py to match native VoxCPM2 numerics. +""" + +from __future__ import annotations + +import math +from collections.abc import Iterable +from typing import Any + +import torch +import torch.nn as nn +from vllm.config import CacheConfig, VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.utils import make_empty_intermediate_tensors_factory +from vllm.sequence import IntermediateTensors + +from .minicpm4_hf_compat import ( + _apply_rotary_pos_emb, + _MiniCPMLongRoPE, + _MiniCPMMLP, +) + +logger = init_logger(__name__) + + +def _resolve_lm_cfg(config: Any) -> Any: + """Extract lm_config from VoxCPM2Config, converting dict to namespace if needed.""" + lm_cfg = getattr(config, "lm_config", config) + if isinstance(lm_cfg, dict): + + class _Cfg: + pass + + c = _Cfg() + for k, v in lm_cfg.items(): + setattr(c, k, v) + return c + return lm_cfg + + +# =================================================================== +# Attention with vllm PagedAttention backend +# =================================================================== + + +class _PagedMiniCPM4Attention(nn.Module): + """PagedAttention + fp32 RoPE with separate q/k/v projections.""" + + def __init__( + self, + hidden_size: int, + num_attention_heads: int, + num_key_value_heads: int, + kv_channels: int | None, + layer_idx: int, + 
cache_config: CacheConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.layer_idx = layer_idx + self.hidden_size = hidden_size + self.num_heads = num_attention_heads + self.head_dim = kv_channels if kv_channels else hidden_size // num_attention_heads + self.num_kv_heads = num_key_value_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + + self.q_proj = nn.Linear(hidden_size, self.q_size, bias=False) + self.k_proj = nn.Linear(hidden_size, self.kv_size, bias=False) + self.v_proj = nn.Linear(hidden_size, self.kv_size, bias=False) + self.o_proj = nn.Linear(self.q_size, hidden_size, bias=False) + self._fused_qkv_weight: torch.Tensor | None = None + + self.attn = Attention( + self.num_heads, + self.head_dim, + scale=self.head_dim**-0.5, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + prefix=f"{prefix}.attn", + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + rope_emb: _MiniCPMLongRoPE | None = None, + ) -> torch.Tensor: + """Forward: fused QKV → fp32 RoPE → PagedAttention → o_proj.""" + if self._fused_qkv_weight is None: + self._fused_qkv_weight = torch.cat( + [ + self.q_proj.weight, + self.k_proj.weight, + self.v_proj.weight, + ], + dim=0, + ).detach() + qkv = nn.functional.linear(hidden_states, self._fused_qkv_weight) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + if rope_emb is not None: + cos, sin = rope_emb(positions) + bsz = q.shape[0] + q_r = q.view(bsz, self.num_heads, self.head_dim) + k_r = k.view(bsz, self.num_kv_heads, self.head_dim) + q_r = q_r.unsqueeze(0).transpose(1, 2) # [1, heads, n_tokens, dim] + k_r = k_r.unsqueeze(0).transpose(1, 2) # [1, kv_heads, n_tokens, dim] + q_r, k_r = _apply_rotary_pos_emb(q_r, k_r, cos, sin) + q = q_r.transpose(1, 2).squeeze(0).reshape(bsz, -1) # [n_tokens, q_size] + k = k_r.transpose(1, 2).squeeze(0).reshape(bsz, -1) # [n_tokens, kv_size] + + attn_output = 
self.attn(q, k, v) + + output = self.o_proj(attn_output) + return output + + +# =================================================================== +# Decoder Layer +# =================================================================== + + +class _PagedMiniCPM4DecoderLayer(nn.Module): + """Decoder layer: PagedAttention + fp32 RMSNorm + muP scale_depth.""" + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + num_attention_heads: int, + num_key_value_heads: int, + kv_channels: int | None, + rms_norm_eps: float, + layer_idx: int, + num_hidden_layers: int, + use_mup: bool, + scale_depth: float, + cache_config: CacheConfig | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self.self_attn = _PagedMiniCPM4Attention( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + num_key_value_heads=num_key_value_heads, + kv_channels=kv_channels, + layer_idx=layer_idx, + cache_config=cache_config, + prefix=f"{prefix}.self_attn", + ) + self.mlp = _MiniCPMMLP(hidden_size, intermediate_size) + self.input_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps) + self.post_attention_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps) + + self.use_mup = use_mup + self.scale_depth = scale_depth + self.num_hidden_layers = num_hidden_layers + + def _residual_scale(self) -> float: + if self.use_mup: + return self.scale_depth / math.sqrt(self.num_hidden_layers) + return 1.0 + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + rope_emb: _MiniCPMLongRoPE | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + # Pre-norm + attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn(positions, hidden_states, rope_emb) + + scale = self._residual_scale() + if scale != 1.0: + hidden_states = residual + hidden_states * scale + else: + hidden_states = residual + hidden_states + + # Pre-norm + FFN + residual = 
hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + + if scale != 1.0: + hidden_states = residual + hidden_states * scale + else: + hidden_states = residual + hidden_states + + return hidden_states, None + + +# =================================================================== +# Full Model +# =================================================================== + + +class MiniCPM4PagedForVoxCPM2(nn.Module): + """PagedAttention base_lm (28 layers) for VoxCPM2 scaffold.""" + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + self.config = config + + lm_cfg = _resolve_lm_cfg(config) + + hidden_size = lm_cfg.hidden_size + num_hidden_layers = lm_cfg.num_hidden_layers + kv_channels = getattr(lm_cfg, "kv_channels", None) + + self.vocab_size = lm_cfg.vocab_size + self.embed_tokens = nn.Embedding(self.vocab_size, hidden_size) + + rope_scaling = getattr(lm_cfg, "rope_scaling", None) + if isinstance(rope_scaling, dict): + rope_scaling_dict = rope_scaling + elif hasattr(rope_scaling, "__dict__"): + rope_scaling_dict = { + "short_factor": rope_scaling.short_factor, + "long_factor": rope_scaling.long_factor, + "original_max_position_embeddings": rope_scaling.original_max_position_embeddings, + } + else: + rope_scaling_dict = {} + + no_rope = getattr(lm_cfg, "no_rope", False) + if not no_rope: + self.rope_emb = _MiniCPMLongRoPE( + hidden_size=hidden_size, + num_attention_heads=lm_cfg.num_attention_heads, + kv_channels=kv_channels, + rope_theta=getattr(lm_cfg, "rope_theta", 10000.0), + max_position_embeddings=getattr(lm_cfg, "max_position_embeddings", 32768), + rope_scaling=rope_scaling_dict, + ) + else: + self.rope_emb = None + + self.layers = nn.ModuleList( + [ + _PagedMiniCPM4DecoderLayer( + hidden_size=hidden_size, + intermediate_size=lm_cfg.intermediate_size, + 
num_attention_heads=lm_cfg.num_attention_heads, + num_key_value_heads=lm_cfg.num_key_value_heads, + kv_channels=kv_channels, + rms_norm_eps=lm_cfg.rms_norm_eps, + layer_idx=i, + num_hidden_layers=num_hidden_layers, + use_mup=getattr(lm_cfg, "use_mup", False), + scale_depth=getattr(lm_cfg, "scale_depth", 1.0), + cache_config=cache_config, + prefix=f"{prefix}.layers.{i}", + ) + for i in range(num_hidden_layers) + ] + ) + + self.norm = RMSNorm(hidden_size, eps=lm_cfg.rms_norm_eps) + + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], hidden_size + ) + + use_mup = getattr(lm_cfg, "use_mup", False) + self._scale_emb = getattr(lm_cfg, "scale_emb", 1.0) if use_mup else 1.0 + self._compiled_layers: set[int] = set() + + def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor: + return self.embed_tokens(input_ids) * self._scale_emb + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: Any, + ) -> torch.Tensor | IntermediateTensors: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + + residual = None + for layer in self.layers: + hidden_states, residual = layer( + positions, + hidden_states, + residual, + self.rope_emb, + ) + + hidden_states = self.norm(hidden_states) + return hidden_states + + def compile_selective(self) -> list[str]: + """Compile MLP + o_proj; keep RMSNorm/RoPE eager for precision.""" + compiled: list[str] = [] + for i, layer in enumerate(self.layers): + if i in self._compiled_layers: + continue + try: + layer.mlp = torch.compile( + layer.mlp, + mode="default", + fullgraph=True, + ) + layer.self_attn.o_proj = torch.compile( + layer.self_attn.o_proj, + mode="default", + fullgraph=True, + ) + layer.self_attn._fused_qkv_weight = None + 
self._compiled_layers.add(i) + if i == 0: + compiled.append(f"layers.*.mlp (×{len(self.layers)})") + compiled.append(f"layers.*.self_attn.o_proj (×{len(self.layers)})") + except Exception as e: + logger.warning("compile_selective: layer %d failed: %s", i, e) + break + return compiled + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load weights from native checkpoint (base_lm. prefix pre-stripped).""" + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded: set[str] = set() + + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + param = params_dict.get(name) + if param is None: + continue + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded.add(name) + + return loaded + + +# =================================================================== +# Residual LM with PagedAttention (no RoPE, 8 layers) +# =================================================================== + + +class MiniCPM4PagedResidualLM(nn.Module): + """PagedAttention residual LM (8 layers, no RoPE) for VoxCPM2.""" + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + self.config = config + + lm_cfg = _resolve_lm_cfg(config) + + hidden_size = lm_cfg.hidden_size + num_hidden_layers = getattr(config, "residual_lm_num_layers", 8) + kv_channels = getattr(lm_cfg, "kv_channels", None) + + self.rope_emb = None + + self.layers = nn.ModuleList( + [ + _PagedMiniCPM4DecoderLayer( + hidden_size=hidden_size, + intermediate_size=lm_cfg.intermediate_size, + num_attention_heads=lm_cfg.num_attention_heads, + num_key_value_heads=lm_cfg.num_key_value_heads, + kv_channels=kv_channels, + rms_norm_eps=lm_cfg.rms_norm_eps, + layer_idx=i, + num_hidden_layers=num_hidden_layers, + use_mup=getattr(lm_cfg, "use_mup", False), + 
scale_depth=getattr(lm_cfg, "scale_depth", 1.0), + cache_config=cache_config, + prefix=f"{prefix}.layers.{i}", + ) + for i in range(num_hidden_layers) + ] + ) + + self.norm = RMSNorm(hidden_size, eps=lm_cfg.rms_norm_eps) + self._compiled_layers: set[int] = set() + + def forward( + self, + positions: torch.Tensor, + inputs_embeds: torch.Tensor, + ) -> torch.Tensor: + hidden_states = inputs_embeds + residual = None + for layer in self.layers: + hidden_states, residual = layer( + positions, + hidden_states, + residual, + self.rope_emb, + ) + hidden_states = self.norm(hidden_states) + return hidden_states + + def compile_selective(self) -> list[str]: + """Compile MLP + o_proj (same as base_lm).""" + compiled: list[str] = [] + for i, layer in enumerate(self.layers): + if i in self._compiled_layers: + continue + try: + layer.mlp = torch.compile(layer.mlp, mode="default", fullgraph=True) + layer.self_attn.o_proj = torch.compile(layer.self_attn.o_proj, mode="default", fullgraph=True) + layer.self_attn._fused_qkv_weight = None + self._compiled_layers.add(i) + if i == 0: + compiled.append(f"layers.*.mlp (×{len(self.layers)})") + compiled.append(f"layers.*.self_attn.o_proj (×{len(self.layers)})") + except Exception as e: + logger.warning("compile_selective: residual layer %d failed: %s", i, e) + return compiled + + def load_weights_from_native(self, native_residual_lm: nn.Module) -> int: + """Load weights from native residual_lm. 
Returns param count.""" + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded = 0 + for name, param in native_residual_lm.named_parameters(): + if "rotary_emb" in name: + continue + target = params_dict.get(name) + if target is None: + continue + weight_loader = getattr(target, "weight_loader", default_weight_loader) + weight_loader(target, param.data) + loaded += 1 + return loaded diff --git a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py index b9faf9fa3b8..0898ca59ae4 100644 --- a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py +++ b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py @@ -1,33 +1,27 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""VoxCPM2 native AR talker — uses native MiniCPM4 base_lm directly. - -Uses native VoxCPM2 modules (no PagedAttention, manual KV cache). -Each AR decode step: - feat_encoder → base_lm → FSQ → residual_lm → LocDiT → stop - -TODO(PagedAttention): The base_lm is a MiniCPM4 variant (GQA + LongRoPE, -use_mup=False). vllm's MiniCPMModel already supports the architecture -(LongRoPE via Phi3LongRoPEScaledRotaryEmbedding, muP via config), but -two issues block replacing the native base_lm with a vllm MiniCPM4Model: - 1. Per-request state isolation — residual_lm and LocDiT diffusion use - shared native KV caches; concurrent requests clobber each other. - Fix: save/restore residual_lm cache per request, or pool N instances. - 2. Streaming audio — make_omni_output re-decodes all patches each step. - Fix: sliding-window VAE decode (decode_pad pattern from nanovllm). +"""VoxCPM2 AR talker — PagedAttention pipeline with per-request state. 
+ +Architecture: + MiniCPM4PagedForVoxCPM2 (base_lm, 28 layers, PagedAttention + fp32 RoPE) + → FSQ → MiniCPM4PagedResidualLM (8 layers, PagedAttention, no RoPE) + → LocDiT (CFM solver) → AudioVAE → 48kHz waveform """ from __future__ import annotations +import dataclasses +import os +import time from collections.abc import Iterable from typing import Any import librosa import torch import torch.nn as nn +from einops import rearrange from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.model_executor.models.minicpm import MiniCPMModel from vllm.model_executor.models.utils import ( AutoWeightsLoader, WeightsMapper, @@ -37,10 +31,13 @@ from vllm_omni.model_executor.models.output_templates import OmniOutput +from .minicpm4_paged import MiniCPM4PagedForVoxCPM2, MiniCPM4PagedResidualLM from .voxcpm2_import_utils import import_voxcpm2_core logger = init_logger(__name__) +_ENABLE_PROFILING = os.environ.get("VOXCPM2_PROFILE", "0") == "1" + def _encode_raw_audio( tts: nn.Module, @@ -51,34 +48,21 @@ def _encode_raw_audio( """Encode raw audio samples using the native VoxCPM2 AudioVAE. Mirrors ``VoxCPM2Model._encode_wav`` but accepts in-memory samples - instead of a file path. This is needed for the OpenAI speech API - where ``_resolve_ref_audio`` returns decoded audio data. - - Args: - tts: Native VoxCPM2 tts_model instance. - samples: Audio samples (mono, float32). - sr: Sample rate of the input audio. - padding_mode: "right" (default) or "left" padding. - - Returns: - audio_feat: (T, P, D) tensor of latent patches. + instead of a file path (needed for the OpenAI speech API). 
""" if isinstance(samples, list): audio = torch.tensor(samples, dtype=torch.float32) else: audio = samples.float() - if audio.ndim == 1: audio = audio.unsqueeze(0) - # Resample to the model's expected encoding sample rate encode_sr = tts._encode_sample_rate if sr != encode_sr: audio_np = audio.squeeze(0).numpy() audio_np = librosa.resample(audio_np, orig_sr=sr, target_sr=encode_sr) audio = torch.from_numpy(audio_np).unsqueeze(0) - # Pad to patch boundary patch_len = tts.patch_size * tts.chunk_size if audio.size(1) % patch_len != 0: padding_size = patch_len - audio.size(1) % patch_len @@ -89,48 +73,301 @@ def _encode_raw_audio( return feat.view(tts.audio_vae.latent_dim, -1, tts.patch_size).permute(1, 2, 0) -class VoxCPM2TalkerForConditionalGeneration(nn.Module): - """VoxCPM2 talker using native MiniCPM4 base_lm. +# =================================================================== +# Per-request state +# =================================================================== + + +@dataclasses.dataclass +class _RequestState: + request_id: str + curr_embed_for_next: torch.Tensor | None = None + prev_feat_embed: torch.Tensor | None = None + curr_prefix_feat_cond: torch.Tensor | None = None + last_audio_patch_gpu: torch.Tensor | None = None + precomputed_stop_logits: torch.Tensor | None = None + accumulated_patches: list[torch.Tensor] = dataclasses.field(default_factory=list) + decode_step_count: int = 0 + request_start_time: float = 0.0 + prefill_completed: bool = False + prefill_text: str = "" + prompt_cache: dict | None = None + prefill_masks: tuple | None = None + is_stopping: bool = False + last_decoded_audio: torch.Tensor | None = None + + +# =================================================================== +# Profiling timer +# =================================================================== + + +class _PerfTimer: + __slots__ = ("_enabled", "_timers", "_counts", "_starts", "_pairs") + + def __init__(self, enabled: bool = False): + self._enabled = enabled + 
self._timers: dict[str, float] = {} + self._counts: dict[str, int] = {} + self._starts: dict[str, torch.cuda.Event] = {} + self._pairs: list[tuple[str, torch.cuda.Event, torch.cuda.Event]] = [] + + def start(self, name: str) -> None: + if not self._enabled: + return + evt = torch.cuda.Event(enable_timing=True) + evt.record() + self._starts[name] = evt + + def stop(self, name: str) -> None: + if not self._enabled or name not in self._starts: + return + start_evt = self._starts.pop(name) + end_evt = torch.cuda.Event(enable_timing=True) + end_evt.record() + self._pairs.append((name, start_evt, end_evt)) + + def _resolve(self) -> None: + if not self._pairs: + return + torch.cuda.synchronize() + for name, s, e in self._pairs: + self._timers[name] = self._timers.get(name, 0.0) + s.elapsed_time(e) + self._counts[name] = self._counts.get(name, 0) + 1 + self._pairs.clear() + + def breakdown(self) -> str: + if not self._enabled: + return "" + self._resolve() + if not self._timers: + return "" + total = self._timers.get("decode_step", sum(self._timers.values())) + lines = [ + "=== VoxCPM2 Decode Step Breakdown ===", + f"{'Component':<30} | {'ms':>10} | {'%':>6} | {'N':>5} | {'avg':>8}", + "-" * 70, + ] + for name in sorted(self._timers): + t, c = self._timers[name], self._counts[name] + lines.append(f"{name:<30} | {t:>10.2f} | {t / total * 100:>5.1f}% | {c:>5} | {t / c:>8.3f}") + lines.append(f"{'TOTAL':<30} | {total:>10.2f} |") + return "\n".join(lines) + + def reset(self) -> None: + self._timers.clear() + self._counts.clear() + self._starts.clear() + self._pairs.clear() + + +# =================================================================== +# CFM pre-allocated buffers + optimized Euler solver +# =================================================================== + + +class _CFMBufferManager: + def __init__( + self, + device: torch.device, + dtype: torch.dtype, + feat_dim: int, + patch_size: int, + dit_hidden_size: int, + max_batch_size: int = 1, + sway_sampling_coef: 
float = 1.0, + ): + n = 2 * max_batch_size # CFG doubles the batch + self.x_in = torch.zeros(n, feat_dim, patch_size, device=device, dtype=dtype) + self.mu_in = torch.zeros(n, dit_hidden_size, device=device, dtype=dtype) + self.t_in = torch.zeros(n, device=device, dtype=dtype) + self.dt_in = torch.zeros(n, device=device, dtype=dtype) + self.cond_in = torch.zeros(n, feat_dim, patch_size, device=device, dtype=dtype) + self.noise = torch.zeros(max_batch_size, feat_dim, patch_size, device=device, dtype=dtype) + self._sway_coef = sway_sampling_coef + self._device = device + self._dtype = dtype + self.t_span_10 = self._make_t_span(10) + + def _make_t_span(self, n: int) -> torch.Tensor: + t = torch.linspace(1, 0, n + 1, device=self._device, dtype=self._dtype) + return t + self._sway_coef * (torch.cos(torch.pi / 2 * t) - 1 + t) + + def get_t_span(self, n: int) -> torch.Tensor: + return self.t_span_10 if n == 10 else self._make_t_span(n) + + +def _optimized_solve_euler( + cfm_module: nn.Module, + mu: torch.Tensor, + patch_size: int, + cond: torch.Tensor, + n_timesteps: int, + cfg_value: float, + buffers: _CFMBufferManager, + use_cfg_zero_star: bool = True, + cfg_cutoff_ratio: float = 1.0, + perf: _PerfTimer | None = None, +) -> torch.Tensor: + estimator = cfm_module.estimator + mean_mode = getattr(cfm_module, "mean_mode", False) + b = mu.size(0) + + buffers.noise[:b].normal_() + x = buffers.noise[:b].clone() + + t_span = buffers.get_t_span(n_timesteps) + t, dt = t_span[0], t_span[0] - t_span[1] + zero_init_steps = max(1, int(len(t_span) * 0.04)) + cfg_cutoff_step = max(zero_init_steps + 1, int(len(t_span) * cfg_cutoff_ratio)) + + for step in range(1, len(t_span)): + if use_cfg_zero_star and step <= zero_init_steps: + dphi_dt = torch.zeros_like(x) + elif step <= cfg_cutoff_step: + buffers.x_in[:b].copy_(x) + buffers.x_in[b : 2 * b].copy_(x) + buffers.mu_in[:b].copy_(mu) + buffers.mu_in[b : 2 * b].zero_() + buffers.t_in[:b].fill_(t.item()) + buffers.t_in[b : 2 * 
b].fill_(t.item()) + if mean_mode: + buffers.dt_in[:b].fill_(dt.item()) + buffers.dt_in[b : 2 * b].fill_(dt.item()) + else: + buffers.dt_in.zero_() + buffers.cond_in[:b].copy_(cond[:b]) + buffers.cond_in[b : 2 * b].copy_(cond[:b]) + + if perf: + perf.start(" cfm.estimator_cfg") + raw_out = estimator( + buffers.x_in[: 2 * b], + buffers.mu_in[: 2 * b], + buffers.t_in[: 2 * b], + buffers.cond_in[: 2 * b], + buffers.dt_in[: 2 * b], + ) + if perf: + perf.stop(" cfm.estimator_cfg") + + dphi_dt, cfg_dphi_dt = raw_out[:b], raw_out[b : 2 * b] + if use_cfg_zero_star: + pos = dphi_dt.reshape(b, -1) + neg = cfg_dphi_dt.reshape(b, -1) + st = torch.sum(pos * neg, 1, keepdim=True) / (torch.sum(neg**2, 1, keepdim=True) + 1e-8) + st = st.view(b, *([1] * (len(dphi_dt.shape) - 1))) + else: + st = 1.0 + dphi_dt = cfg_dphi_dt * st + cfg_value * (dphi_dt - cfg_dphi_dt * st) + else: + buffers.x_in[:b].copy_(x) + buffers.mu_in[:b].copy_(mu) + buffers.t_in[:b].fill_(t.item()) + if mean_mode: + buffers.dt_in[:b].fill_(dt.item()) + else: + buffers.dt_in[:b].zero_() + buffers.cond_in[:b].copy_(cond[:b]) + if perf: + perf.start(" cfm.estimator_nocfg") + dphi_dt = estimator( + buffers.x_in[:b], buffers.mu_in[:b], buffers.t_in[:b], buffers.cond_in[:b], buffers.dt_in[:b] + ) + if perf: + perf.stop(" cfm.estimator_nocfg") - Loads the full VoxCPM2 model natively and decomposes the AR loop: - each vllm decode step runs one iteration of the native generate loop. 
- """ + x = x - dt * dphi_dt + t = t - dt + if step < len(t_span) - 1: + dt = t - t_span[step + 1] + return x + +# =================================================================== +# Main talker model +# =================================================================== + + +class VoxCPM2TalkerForConditionalGeneration(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.vllm_config = vllm_config self.config = vllm_config.model_config.hf_config - # Flags for OmniGPUModelRunner self.have_multimodal_outputs = True self.has_preprocess = True self.has_postprocess = True - self._accumulated_patches: list[torch.Tensor] = [] - # vllm MiniCPMModel scaffold — needed for warmup/profiling/KV cache - # sizing. Not used for actual computation (native modules are used). - self.model = MiniCPMModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + self.model = MiniCPM4PagedForVoxCPM2( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), + ) + self.residual_model = MiniCPM4PagedResidualLM( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "residual_model"), + ) self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors - # Placeholder — actual native model loaded in load_weights self._tts: nn.Module | None = None self._device = "cuda" self._side_dtype = torch.bfloat16 - # Config values self._patch_size = getattr(self.config, "patch_size", 4) self._feat_dim = getattr(self.config, "feat_dim", 64) + self._sample_rate = getattr(self.config, "sample_rate", 48000) + self._inference_timesteps = 10 self._cfg_value = 2.0 - - # TODO: implement sliding-window VAE decode (nanovllm pattern) - # for O(1) per-step streaming. Current impl re-decodes all patches. 
+ self._cfg_cutoff_ratio = 1.0 + self._vae_decode_interval = 5 + self._enable_torch_compile = True + self._compile_vae = True + self._max_decode_steps = 2000 + self._max_batch_size = getattr(vllm_config.scheduler_config, "max_num_seqs", 4) + + self._perf = _PerfTimer(enabled=_ENABLE_PROFILING) + self._cfm_buffers: _CFMBufferManager | None = None + + self._active_states: dict[str, _RequestState] = {} + self._current_request_id: str | None = None + self._pending_requests: list[tuple[str, bool, torch.Tensor | None, int]] = [] + self._results_queue: list[tuple[str, torch.Tensor | None]] = [] + self._audio_queue: list[tuple[str, Any]] = [] + self._deferred_cleanup_ids: set[str] = set() @property def tts(self) -> nn.Module: assert self._tts is not None, "Model not loaded yet" return self._tts + # -------------------- request state management -------------------- + + def _get_or_create_state(self, request_id: str) -> _RequestState: + if request_id not in self._active_states: + self._active_states[request_id] = _RequestState(request_id=request_id) + return self._active_states[request_id] + + def _switch_to_request(self, request_id: str) -> _RequestState: + if request_id != self._current_request_id: + self._current_request_id = request_id + return self._get_or_create_state(request_id) + + def _cleanup_request(self, request_id: str) -> None: + self._active_states.pop(request_id, None) + if self._current_request_id == request_id: + self._current_request_id = None + + def on_requests_finished(self, finished_req_ids: set[str] | list[str]) -> None: + # Defer cleanup: on_requests_finished is called before forward(), + # so we must not delete state that the current step may still need. 
+ self._deferred_cleanup_ids.update(finished_req_ids) + + def _flush_deferred_cleanup(self) -> None: + for req_id in self._deferred_cleanup_ids: + self._cleanup_request(req_id) + self._deferred_cleanup_ids.clear() + def _build_prompt_cache( self, ref_audio: Any = None, @@ -141,20 +378,19 @@ def _build_prompt_cache( The OpenAI speech API sends decoded audio as [samples_list, sr] via ``_resolve_ref_audio``, while offline usage sends file paths. - This method detects the format and routes accordingly. """ tts = self.tts def _is_raw_audio(v: Any) -> bool: - """Check if value is [samples, sr] from serving_speech.""" + import numbers + return ( isinstance(v, (list, tuple)) and len(v) == 2 - and isinstance(v[1], int) + and isinstance(v[1], numbers.Integral) and isinstance(v[0], (list, torch.Tensor)) ) - # If all inputs are file paths (or None), use native build_prompt_cache if not _is_raw_audio(ref_audio) and not _is_raw_audio(prompt_audio): return tts.build_prompt_cache( prompt_text=prompt_text, @@ -162,39 +398,21 @@ def _is_raw_audio(v: Any) -> bool: reference_wav_path=ref_audio, ) - # Raw audio path: encode directly cache: dict[str, Any] = {} - if ref_audio is not None: if _is_raw_audio(ref_audio): samples, sr = ref_audio - cache["ref_audio_feat"] = _encode_raw_audio( - tts, - samples, - sr, - padding_mode="right", - ) + cache["ref_audio_feat"] = _encode_raw_audio(tts, samples, sr) else: - cache["ref_audio_feat"] = tts._encode_wav( - ref_audio, - padding_mode="right", - ) + cache["ref_audio_feat"] = tts._encode_wav(ref_audio, padding_mode="right") if prompt_audio is not None and prompt_text is not None: cache["prompt_text"] = prompt_text if _is_raw_audio(prompt_audio): samples, sr = prompt_audio - cache["audio_feat"] = _encode_raw_audio( - tts, - samples, - sr, - padding_mode="left", - ) + cache["audio_feat"] = _encode_raw_audio(tts, samples, sr, padding_mode="left") else: - cache["audio_feat"] = tts._encode_wav( - prompt_audio, - padding_mode="left", - ) + 
cache["audio_feat"] = tts._encode_wav(prompt_audio, padding_mode="left") has_ref = "ref_audio_feat" in cache has_prompt = "audio_feat" in cache @@ -207,12 +425,95 @@ def _is_raw_audio(v: Any) -> bool: return cache + # -------------------- compile setup -------------------- + + def _setup_cfm_buffers(self) -> None: + if self._cfm_buffers is not None: + return + tts = self.tts + dit_hidden = tts.lm_to_dit_proj.out_features + tts.res_to_dit_proj.out_features + self._cfm_buffers = _CFMBufferManager( + device=torch.device(self._device), + dtype=self._side_dtype, + feat_dim=self._feat_dim, + patch_size=self._patch_size, + dit_hidden_size=dit_hidden, + max_batch_size=self._max_batch_size, + ) + + def _setup_torch_compile(self) -> None: + if not self._enable_torch_compile: + return + tts = self.tts + estimator = tts.feat_decoder.estimator + if hasattr(estimator, "_compiled"): + return + + targets: list[str] = [] + + try: + tts.feat_decoder.estimator = torch.compile(estimator, mode="reduce-overhead", fullgraph=False) + tts.feat_decoder.estimator._compiled = True + targets.append("LocDiT") + except Exception as e: + logger.warning("torch.compile LocDiT failed: %s", e) + + try: + if not hasattr(tts.feat_encoder, "_compiled"): + tts.feat_encoder = torch.compile(tts.feat_encoder, mode="reduce-overhead", fullgraph=False) + tts.feat_encoder._compiled = True + targets.append("feat_encoder") + except Exception as e: + logger.warning("torch.compile feat_encoder failed: %s", e) + + if self._compile_vae: + try: + if not hasattr(tts.audio_vae, "_compiled"): + tts.audio_vae.decode = torch.compile(tts.audio_vae.decode, mode="reduce-overhead", fullgraph=False) + tts.audio_vae._compiled = True + targets.append("AudioVAE") + except Exception as e: + logger.warning("torch.compile AudioVAE failed: %s", e) + + if not getattr(self.model, "_selective_compiled", False): + try: + targets.extend(f"scaffold.{t}" for t in self.model.compile_selective()) + self.model._selective_compiled = True + 
except Exception as e: + logger.warning("scaffold compile failed: %s", e) + + if not getattr(self.residual_model, "_selective_compiled", False): + try: + targets.extend(f"residual.{t}" for t in self.residual_model.compile_selective()) + self.residual_model._selective_compiled = True + except Exception as e: + logger.warning("residual compile failed: %s", e) + + if not getattr(self, "_projections_compiled", False): + try: + self._compiled_dit_proj = torch.compile(self._dit_proj_fn, mode="default", fullgraph=True) + self._compiled_stop_fn = torch.compile(self._stop_fn, mode="default", fullgraph=True) + self._projections_compiled = True + targets.append("projections") + except Exception as e: + self._compiled_dit_proj = self._compiled_stop_fn = None + logger.warning("projections compile failed: %s", e) + + if targets: + logger.info("VoxCPM2: torch.compile applied to: %s", ", ".join(targets)) + + def _dit_proj_fn(self, lm_h: torch.Tensor, res_h: torch.Tensor) -> torch.Tensor: + tts = self.tts + return torch.cat([tts.lm_to_dit_proj(lm_h), tts.res_to_dit_proj(res_h)], dim=-1) + + def _stop_fn(self, lm_h: torch.Tensor) -> torch.Tensor: + tts = self.tts + return tts.stop_head(tts.stop_actn(tts.stop_proj(lm_h))) + # -------------------- vllm hooks -------------------- def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor: - """Embed input IDs using native base_lm with scale_emb.""" - embeds = self.tts.base_lm.embed_tokens(input_ids) - return embeds * self.tts.config.lm_config.scale_emb + return self.model.embed_input_ids(input_ids) def forward( self, @@ -222,8 +523,9 @@ def forward( inputs_embeds: torch.Tensor | None = None, **kwargs: Any, ) -> torch.Tensor | IntermediateTensors: - """Full VoxCPM2 AR step: base_lm → FSQ → residual_lm → diffusion.""" - # Always run scaffold model to keep FlashInfer/attention happy + self._perf.start("forward_total") + dev = input_ids.device + model_output = self.model(input_ids, positions, intermediate_tensors, 
inputs_embeds) if isinstance(model_output, IntermediateTensors): return model_output @@ -231,368 +533,315 @@ def forward( if isinstance(scaffold_hidden, tuple): scaffold_hidden = scaffold_hidden[0] - # Real computation: use native modules - has_infos = bool(getattr(self, "_current_step_infos", None)) - is_prefill = scaffold_hidden.shape[0] > 1 - - if is_prefill and has_infos: - self._forward_prefill(inputs_embeds, scaffold_hidden.device) - # Return scaffold output (right shape for engine) — our side - # computation results are stored in instance state - return scaffold_hidden - - if not is_prefill and hasattr(self, "_prev_feat_embed"): - self._forward_decode(inputs_embeds, scaffold_hidden.device) - return scaffold_hidden - + # Phase 1: per-request FSQ + residual input + token_offset = 0 + residual_inputs: list[torch.Tensor] = [] + residual_positions: list[torch.Tensor] = [] + req_metas: list[tuple] = [] + + for req_id, is_prefill, _req_embeds, n in self._pending_requests: + state = self._switch_to_request(req_id) + req_hidden = scaffold_hidden[token_offset : token_offset + n] + req_pos = positions[token_offset : token_offset + n] + + if is_prefill: + res_input, meta = self._prepare_residual_prefill(state, req_hidden, dev) + elif state.prefill_completed: + res_input, meta = self._prepare_residual_decode(state, req_hidden, dev) + else: + token_offset += n + self._results_queue.append((req_id, None)) + self._audio_queue.append((req_id, None)) + continue + + residual_inputs.append(res_input) + residual_positions.append(req_pos) + req_metas.append((state, is_prefill, meta)) + token_offset += n + + # Phase 2: batch residual_model forward + if residual_inputs: + batch_in = torch.cat(residual_inputs, dim=0) + batch_pos = torch.cat(residual_positions, dim=0) + batch_out = self.residual_model(batch_pos, batch_in) + + # Phase 3: per-request LocDiT + update + offset = 0 + for idx, (state, is_prefill, meta) in enumerate(req_metas): + n = residual_inputs[idx].shape[0] + res_out 
= batch_out[offset : offset + n] + offset += n + + if is_prefill: + self._finish_prefill(state, meta, res_out, dev) + else: + self._finish_decode(state, meta, res_out, dev) + + self._results_queue.append((state.request_id, state.precomputed_stop_logits)) + self._audio_queue.append((state.request_id, self._collect_audio(state))) + + self._pending_requests.clear() + self._flush_deferred_cleanup() + self._perf.stop("forward_total") return scaffold_hidden - def _build_prefill_inputs(self, text: str, dev: Any): - """Build text_token / audio_feat / masks like native _generate_with_prompt_cache. + # -------------------- prefill / decode helpers -------------------- - Returns a dict with keys: text_token, audio_feat, text_mask, audio_mask, - prefix_feat_cond. Handles zero-shot, reference (voice clone), continuation, - and ref_continuation modes. - """ + def _prepare_residual_prefill(self, state: _RequestState, base_lm_out: torch.Tensor, dev: Any): tts = self.tts - dtype = self._side_dtype - cache = getattr(self, "_prompt_cache", None) - mode = cache.get("mode", "continuation") if cache else "zero_shot" - - if cache is not None and mode in ("continuation", "ref_continuation"): - full_text = cache.get("prompt_text", "") + text - else: - full_text = text - - text_token = torch.LongTensor(tts.text_tokenizer(full_text)) - text_token = torch.cat( - [ - text_token, - torch.tensor([tts.audio_start_token], dtype=torch.int32, device=text_token.device), - ], - dim=-1, - ) - text_length = text_token.shape[0] - latent_dim = tts.audio_vae.latent_dim - patch_size = tts.patch_size - - if mode in ("zero_shot", "continuation"): - prompt_audio_feat = ( - cache["audio_feat"] if cache else torch.empty((0, patch_size, latent_dim), dtype=torch.float32) - ) - audio_length = prompt_audio_feat.size(0) - text_pad_token = torch.zeros(audio_length, dtype=torch.int32) - text_pad_feat = torch.zeros((text_length, patch_size, latent_dim), dtype=torch.float32) - text_token = torch.cat([text_token, 
text_pad_token]) - audio_feat = torch.cat([text_pad_feat, prompt_audio_feat], dim=0) - text_mask = torch.cat( - [ - torch.ones(text_length, dtype=torch.int32), - torch.zeros(audio_length, dtype=torch.int32), - ] + text_mask, feat_mask, feat, feat_embed = state.prefill_masks + state.prefill_masks = None + + tts_len = text_mask.shape[1] + scaffold_len = base_lm_out.shape[0] + + if scaffold_len < tts_len: + # Voice clone / continuation: scaffold only processed vllm tokens. + # Pad to match TTS sequence length (extra positions are masked out). + pad = torch.zeros( + tts_len - scaffold_len, + base_lm_out.shape[-1], + device=base_lm_out.device, + dtype=base_lm_out.dtype, ) - audio_mask = torch.cat( - [ - torch.zeros(text_length, dtype=torch.int32), - torch.ones(audio_length, dtype=torch.int32), - ] - ) - elif mode == "reference": - ref_audio_feat = cache["ref_audio_feat"] - ref_tokens, ref_feats, ref_t_mask, ref_a_mask = tts._make_ref_prefix(ref_audio_feat, text_token.device) - text_pad_feat = torch.zeros((text_length, patch_size, latent_dim), dtype=torch.float32) - text_token = torch.cat([ref_tokens.cpu(), text_token]) - audio_feat = torch.cat([ref_feats.cpu(), text_pad_feat], dim=0) - text_mask = torch.cat([ref_t_mask.cpu(), torch.ones(text_length, dtype=torch.int32)]) - audio_mask = torch.cat([ref_a_mask.cpu(), torch.zeros(text_length, dtype=torch.int32)]) + enc_out = torch.cat([base_lm_out, pad], dim=0).unsqueeze(0) else: - # ref_continuation - ref_audio_feat = cache["ref_audio_feat"] - prompt_audio_feat = cache["audio_feat"] - prompt_audio_length = prompt_audio_feat.size(0) - ref_tokens, ref_feats, ref_t_mask, ref_a_mask = tts._make_ref_prefix(ref_audio_feat, text_token.device) - prompt_pad_token = torch.zeros(prompt_audio_length, dtype=torch.int32) - text_pad_feat = torch.zeros((text_length, patch_size, latent_dim), dtype=torch.float32) - text_token = torch.cat([ref_tokens.cpu(), text_token, prompt_pad_token]) - audio_feat = torch.cat([ref_feats.cpu(), 
text_pad_feat, prompt_audio_feat], dim=0) - text_mask = torch.cat( - [ - ref_t_mask.cpu(), - torch.ones(text_length, dtype=torch.int32), - torch.zeros(prompt_audio_length, dtype=torch.int32), - ] - ) - audio_mask = torch.cat( - [ - ref_a_mask.cpu(), - torch.zeros(text_length, dtype=torch.int32), - torch.ones(prompt_audio_length, dtype=torch.int32), - ] - ) - - return { - "text_token": text_token.unsqueeze(0).to(dev), - "audio_feat": audio_feat.unsqueeze(0).to(dev).to(dtype), - "text_mask": text_mask.unsqueeze(0).to(dev), - "audio_mask": audio_mask.unsqueeze(0).to(dev), - } + enc_out = base_lm_out.unsqueeze(0) - def _forward_prefill(self, inputs_embeds: torch.Tensor, dev: Any) -> torch.Tensor: - """Prefill: build combined embeds, run base_lm + residual_lm + first diffusion. - - Uses the same path as native ``VoxCPM2Model._inference`` so zero-shot, - voice cloning (reference), continuation, and ref_continuation modes - all share the same code. - """ - tts = self.tts - dtype = self._side_dtype - text = getattr(self, "_prefill_text", None) - if text is None: - # Fallback (should not hit at runtime; preprocess sets this) - text = "" - - inputs = self._build_prefill_inputs(text, dev) - text_token = inputs["text_token"] - feat = inputs["audio_feat"] - text_mask = inputs["text_mask"] - feat_mask = inputs["audio_mask"] - - # Compose combined_embed exactly like native _inference - feat_embed = tts.feat_encoder(feat) - feat_embed = tts.enc_to_lm_proj(feat_embed) - scale_emb = tts.config.lm_config.scale_emb if tts.config.lm_config.use_mup else 1.0 - text_embed = tts.base_lm.embed_tokens(text_token) * scale_emb - combined_embed = text_mask.unsqueeze(-1) * text_embed + feat_mask.unsqueeze(-1) * feat_embed - - # last audio patch becomes initial prefix_feat_cond (zeros for zero-shot, - # last reference/prompt patch for voice clone / continuation) prefix_feat_cond = ( feat[:, -1, ...] 
if feat.shape[1] > 0 - else torch.zeros(1, tts.patch_size, tts.feat_dim, device=dev, dtype=dtype) + else torch.zeros(1, self._patch_size, self._feat_dim, device=dev, dtype=self._side_dtype) ) - - # Base LM prefill - tts.base_lm.setup_cache(1, 4096, dev, dtype) - enc_out, enc_kv = tts.base_lm(inputs_embeds=combined_embed, is_causal=True) - tts.base_lm.kv_cache.fill_caches(enc_kv) - - # FSQ: identity on text positions, quantized on audio positions enc_outputs = tts.fsq_layer(enc_out) * feat_mask.unsqueeze(-1) + enc_out * text_mask.unsqueeze(-1) - lm_hidden = enc_outputs[:, -1, :] # [1, H] - - logger.info( - "PREFILL: enc shape=%s last_norm=%.4f", - enc_outputs.shape, - lm_hidden.norm().item(), - ) + lm_hidden = enc_outputs[:, -1, :] - # Residual LM prefill - tts.residual_lm.setup_cache(1, 4096, dev, dtype) residual_input = tts.fusion_concat_proj(torch.cat([enc_outputs, feat_mask.unsqueeze(-1) * feat_embed], dim=-1)) - res_out, res_kv = tts.residual_lm(inputs_embeds=residual_input, is_causal=True) - tts.residual_lm.kv_cache.fill_caches(res_kv) - residual_hidden = res_out[:, -1, :] # [1, H] - - # Precompute stop logits for first compute_logits call - stop_logits = tts.stop_head(tts.stop_actn(tts.stop_proj(lm_hidden))) - self._precomputed_stop_logits = stop_logits.detach() - logger.info("PREFILL stop: %s", stop_logits[0].tolist()) - - # First diffusion step - dit_h = torch.cat( - [ - tts.lm_to_dit_proj(lm_hidden), - tts.res_to_dit_proj(residual_hidden), - ], - dim=-1, - ) - pred_feat = tts.feat_decoder( + meta = {"lm_hidden": lm_hidden, "prefix_feat_cond": prefix_feat_cond} + return residual_input.squeeze(0), meta + + def _prepare_residual_decode(self, state: _RequestState, base_lm_out: torch.Tensor, dev: Any): + tts = self.tts + state.decode_step_count += 1 + + if state.decode_step_count >= self._max_decode_steps: + logger.warning("MAX_DECODE_STEPS for %s (%d), forcing stop", state.request_id, state.decode_step_count) + state.is_stopping = True + + h = 
base_lm_out.unsqueeze(0) if base_lm_out.ndim == 1 else base_lm_out + lm_h = tts.fsq_layer(h) + if lm_h.ndim == 1: + lm_h = lm_h.unsqueeze(0) + + prev = state.prev_feat_embed.to(self._side_dtype) + if prev.ndim == 1: + prev = prev.unsqueeze(0) + res_input = tts.fusion_concat_proj(torch.cat([lm_h, prev], dim=-1)) + return res_input, {"new_lm_hidden": lm_h} + + def _run_cfm(self, dit_h: torch.Tensor, cond: torch.Tensor) -> torch.Tensor: + if self._cfm_buffers is not None: + return _optimized_solve_euler( + self.tts.feat_decoder, + dit_h, + self._patch_size, + cond, + self._inference_timesteps, + self._cfg_value, + self._cfm_buffers, + cfg_cutoff_ratio=self._cfg_cutoff_ratio, + perf=self._perf, + ).transpose(1, 2) + return self.tts.feat_decoder( mu=dit_h, - patch_size=tts.patch_size, - cond=prefix_feat_cond.transpose(1, 2).contiguous(), + patch_size=self._patch_size, + cond=cond, n_timesteps=self._inference_timesteps, cfg_value=self._cfg_value, - ).transpose(1, 2) # [1, P, D] + ).transpose(1, 2) + + def _finish_prefill(self, state: _RequestState, meta: dict, res_out: torch.Tensor, dev: Any): + tts = self.tts + lm_hidden = meta["lm_hidden"] + prefix_feat_cond = meta["prefix_feat_cond"] + residual_hidden = res_out[-1:, :] + + state.precomputed_stop_logits = tts.stop_head(tts.stop_actn(tts.stop_proj(lm_hidden))).detach() + dit_h = torch.cat([tts.lm_to_dit_proj(lm_hidden), tts.res_to_dit_proj(residual_hidden)], dim=-1) + + self._setup_cfm_buffers() + if self._enable_torch_compile: + self._setup_torch_compile() + + pred_feat = self._run_cfm(dit_h, prefix_feat_cond.transpose(1, 2).contiguous()) with torch.no_grad(): curr_embed = tts.enc_to_lm_proj(tts.feat_encoder(pred_feat.unsqueeze(1))).squeeze(1) - # Store state for decode steps - self._curr_embed_for_next = curr_embed.detach() - self._prev_feat_embed = curr_embed.detach() - self._curr_prefix_feat_cond = pred_feat[0].detach() - self._last_audio_patch = pred_feat.reshape(1, -1).detach().cpu().float() + 
state.curr_embed_for_next = curr_embed.detach() + state.prev_feat_embed = curr_embed.detach() + state.curr_prefix_feat_cond = pred_feat[0].detach() + state.last_audio_patch_gpu = pred_feat.detach() + state.decode_step_count = 0 + state.request_start_time = time.perf_counter() + state.prefill_completed = True - logger.info( - "PREFILL patch: norm=%.4f first3=%s", - pred_feat.norm().item(), - pred_feat[0, 0, :3].tolist(), - ) + logger.info("PREFILL[%s]: patch norm=%.4f", state.request_id, pred_feat.norm().item()) + self._perf.reset() - return lm_hidden.to(dtype) - - def _forward_decode(self, inputs_embeds: torch.Tensor | None, dev: Any) -> torch.Tensor: - """Decode step: base_lm → FSQ → residual_lm → diffusion.""" + def _finish_decode(self, state: _RequestState, meta: dict, res_out: torch.Tensor, dev: Any): + self._perf.start("decode_step") tts = self.tts - dtype = self._side_dtype - # 1. Base LM step with curr_embed from previous diffusion - curr_embed = self._curr_embed_for_next.to(dev, dtype=dtype) - if curr_embed.ndim == 2: - curr_embed_3d = curr_embed.unsqueeze(0) # [1, 1, H] - else: - curr_embed_3d = curr_embed - - step_pos = torch.tensor([tts.base_lm.kv_cache.step()], device=dev) - new_hidden = tts.base_lm.forward_step(curr_embed_3d[:, 0, :], step_pos).clone() - - # 2. FSQ - new_lm_hidden = tts.fsq_layer(new_hidden) - if new_lm_hidden.ndim == 1: - new_lm_hidden = new_lm_hidden.unsqueeze(0) - - # 3. Residual LM step - prev_fe = self._prev_feat_embed.to(dtype) - if prev_fe.ndim == 1: - prev_fe = prev_fe.unsqueeze(0) - res_input = tts.fusion_concat_proj(torch.cat([new_lm_hidden, prev_fe], dim=-1)) - res_step_pos = torch.tensor([tts.residual_lm.kv_cache.step()], device=dev) - new_res_hidden = tts.residual_lm.forward_step(res_input, res_step_pos).clone() - if new_res_hidden.ndim == 1: - new_res_hidden = new_res_hidden.unsqueeze(0) - - # 4. 
Diffusion - p = self._patch_size - pfc = self._curr_prefix_feat_cond.to(dtype).unsqueeze(0) - - dit_h = torch.cat( - [ - tts.lm_to_dit_proj(new_lm_hidden), - tts.res_to_dit_proj(new_res_hidden), - ], - dim=-1, - ) - pred_feat = tts.feat_decoder( - mu=dit_h, - patch_size=p, - cond=pfc.transpose(1, 2).contiguous(), - n_timesteps=self._inference_timesteps, - cfg_value=self._cfg_value, - ).transpose(1, 2) # [1, P, D] + lm_h = meta["new_lm_hidden"] + res_h = res_out.unsqueeze(0) if res_out.ndim == 1 else res_out - # 5. feat_encoder → curr_embed - with torch.no_grad(): - curr_embed = tts.enc_to_lm_proj(tts.feat_encoder(pred_feat.unsqueeze(1))).squeeze(1) + dit_proj = getattr(self, "_compiled_dit_proj", None) or self._dit_proj_fn + stop_fn = getattr(self, "_compiled_stop_fn", None) or self._stop_fn + + dit_h = dit_proj(lm_h, res_h) + pfc = state.curr_prefix_feat_cond.to(self._side_dtype) + if pfc.ndim == 2: + pfc = pfc.unsqueeze(0) + + pred_feat = self._run_cfm(dit_h, pfc.transpose(1, 2).contiguous()) + next_embed = tts.enc_to_lm_proj(tts.feat_encoder(pred_feat.unsqueeze(1))).squeeze(1) + + state.precomputed_stop_logits = stop_fn(lm_h).detach() + state.curr_embed_for_next = next_embed.detach() + state.prev_feat_embed = next_embed.detach() + state.curr_prefix_feat_cond = pred_feat[0].detach() + state.last_audio_patch_gpu = pred_feat.detach() + + self._perf.stop("decode_step") + if _ENABLE_PROFILING and state.decode_step_count % 20 == 0: + logger.info("Step %d[%s]:\n%s", state.decode_step_count, state.request_id, self._perf.breakdown()) - # 6. Stop logits - stop_logits = tts.stop_head(tts.stop_actn(tts.stop_proj(new_lm_hidden))) - self._precomputed_stop_logits = stop_logits.detach() + # -------------------- audio collection -------------------- - # 7. 
Store state - self._curr_embed_for_next = curr_embed.detach() - self._prev_feat_embed = curr_embed.detach() - self._curr_prefix_feat_cond = pred_feat[0].detach() - self._last_audio_patch = pred_feat.reshape(1, -1).detach().cpu().float() + def _collect_audio(self, state: _RequestState) -> torch.Tensor | None: + patch = state.last_audio_patch_gpu + if patch is not None: + state.last_audio_patch_gpu = None + state.accumulated_patches.append(patch.reshape(1, -1).float()) + + if not state.accumulated_patches: + return None + + n = len(state.accumulated_patches) + if n <= 1 or n % self._vae_decode_interval == 0 or state.is_stopping: + self._perf.start("vae_decode") + all_p = torch.cat(state.accumulated_patches, dim=0) + state.accumulated_patches = [all_p] + feat = rearrange(all_p.reshape(1, -1, self._feat_dim), "b t d -> b d t") + with torch.no_grad(): + audio = self.tts.audio_vae.decode(feat.to(self._device)).reshape(-1).cpu().float() + self._perf.stop("vae_decode") + state.last_decoded_audio = audio + return audio + return state.last_decoded_audio - return new_lm_hidden[-1:].detach() + # -------------------- compute_logits -------------------- def compute_logits( - self, - hidden_states: torch.Tensor | OmniOutput, - sampling_metadata: Any = None, + self, hidden_states: torch.Tensor | OmniOutput, sampling_metadata: Any = None ) -> torch.Tensor | None: if isinstance(hidden_states, OmniOutput): hidden_states = hidden_states.text_hidden_states if hidden_states is None: return None - precomputed = getattr(self, "_precomputed_stop_logits", None) - if precomputed is not None: - self._precomputed_stop_logits = None - raw_logits = precomputed[: hidden_states.shape[0]] - else: - # Fallback for warmup - bsz = hidden_states.shape[0] - raw_logits = torch.zeros(bsz, 2, device=hidden_states.device) - raw_logits[:, 0] = 1.0 # continue - - bsz = raw_logits.shape[0] - full_logits = torch.full( - (bsz, self.config.vocab_size), - float("-inf"), - device=raw_logits.device, - 
dtype=raw_logits.dtype, + bsz = hidden_states.shape[0] + logits = torch.full( + (bsz, self.config.vocab_size), float("-inf"), device=hidden_states.device, dtype=hidden_states.dtype ) - full_logits[:, 0] = raw_logits[:, 0] # continue - full_logits[:, 1] = raw_logits[:, 1] # stop - return full_logits - # -------------------- Omni output -------------------- + if self._results_queue: + for i, (req_id, stop_logits) in enumerate(self._results_queue): + if i >= bsz: + break + state = self._active_states.get(req_id) + if stop_logits is not None: + if state is not None and state.is_stopping: + logits[i, 0] = 0.0 + logits[i, 1] = 1.0 + state.precomputed_stop_logits = None + else: + logits[i, 0] = stop_logits[0, 0] + logits[i, 1] = stop_logits[0, 1] + if state is not None: + state.is_stopping = bool(stop_logits[0, 1] > stop_logits[0, 0]) + state.precomputed_stop_logits = None + elif state and state.prefill_completed: + logits[i, 1] = 1.0 + else: + logits[i, 0] = 1.0 + self._results_queue.clear() + else: + logits[:, 0] = 1.0 + return logits + + # -------------------- omni output -------------------- def make_omni_output(self, model_outputs: torch.Tensor | OmniOutput, **kwargs: Any) -> OmniOutput: if isinstance(model_outputs, OmniOutput): return model_outputs - hidden = model_outputs - patch = getattr(self, "_last_audio_patch", None) mm: dict[str, Any] = {} + if self._audio_queue: + audio_by_req = {rid: audio for rid, audio in self._audio_queue} + order = [r for r, _ in self._audio_queue] + mm["model_outputs"] = [audio_by_req.get(r) for r in order] + mm["sr"] = [torch.tensor(self._sample_rate, dtype=torch.int32) for _ in order] + self._audio_queue.clear() - if patch is not None: - self._last_audio_patch = None - self._accumulated_patches.append(patch.clone()) - - # Decode all accumulated patches → full audio waveform. - # TODO: implement sliding-window VAE decode (nanovllm pattern) - # for O(1) per-step streaming instead of O(N) re-decode. 
- if self._accumulated_patches: - all_p = torch.cat(self._accumulated_patches, dim=0) - d = self._feat_dim - from einops import rearrange - - feat = rearrange(all_p.float().reshape(1, -1, d), "b t d -> b d t") - with torch.no_grad(): - audio = self.tts.audio_vae.decode(feat.to(self._device)).reshape(-1).detach().cpu().float() - - mm["model_outputs"] = [audio] - mm["sr"] = [torch.tensor(48000, dtype=torch.int32)] - - return OmniOutput( - text_hidden_states=hidden, - multimodal_outputs=mm, - ) + return OmniOutput(text_hidden_states=model_outputs, multimodal_outputs=mm) # -------------------- preprocess / postprocess -------------------- def preprocess( - self, - input_ids: torch.Tensor, - input_embeds: torch.Tensor | None, - **info_dict: Any, + self, input_ids: torch.Tensor, input_embeds: torch.Tensor | None, **info_dict: Any ) -> tuple[torch.Tensor, torch.Tensor, dict[str, Any]]: - additional_information = info_dict.get("additional_information") - if isinstance(additional_information, dict): + additional = info_dict.get("additional_information") + if isinstance(additional, dict): merged = {k: v for k, v in info_dict.items() if k != "additional_information"} - for k, v in additional_information.items(): + for k, v in additional.items(): merged.setdefault(k, v) info_dict = merged span_len = int(input_ids.shape[0]) dev = input_ids.device - - if span_len > 1: - # ---- Prefill ---- - # Decode the text from input_ids for native-matching tokenization. - # Speech API tokenizes with BOS; we use the detokenized string so - # native's ``text_tokenizer`` produces the exact same tokens as - # ``generate()``. - ids = input_ids.tolist() - if ids and ids[0] == self.config.bos_token_id: - ids = ids[1:] - text = self.tts.text_tokenizer.tokenizer.decode(ids, skip_special_tokens=True) - self._prefill_text = text - - # Voice clone / continuation: build prompt cache from info_dict. 
+ req_id = info_dict.get("request_id", "default") + is_prefill = span_len > 1 + + if is_prefill: + # Evict stale states + pending_ids = {rid for rid, *_ in self._pending_requests} + pending_ids.add(req_id) + if self._current_request_id: + pending_ids.add(self._current_request_id) + for rid in [r for r, s in self._active_states.items() if r not in pending_ids and s.prefill_completed]: + self._cleanup_request(rid) + + # VoxCPM2Tokenizer does char-level Chinese splitting, so use input_ids directly + token_ids = input_ids.tolist() + if token_ids and token_ids[0] == self.config.bos_token_id: + token_ids = token_ids[1:] + + state = self._get_or_create_state(req_id) + state.prefill_text = "" + state.accumulated_patches = [] + state.prefill_completed = False + state.decode_step_count = 0 + state.precomputed_stop_logits = None + state.last_audio_patch_gpu = None + state.curr_embed_for_next = None + state.prev_feat_embed = None + state.curr_prefix_feat_cond = None + state.is_stopping = False + state.last_decoded_audio = None + + # Voice clone / continuation ref_audio = info_dict.get("reference_audio") or info_dict.get("ref_audio") prompt_audio = info_dict.get("prompt_audio") prompt_text = info_dict.get("prompt_text") @@ -603,68 +852,111 @@ def preprocess( if isinstance(prompt_text, list): prompt_text = prompt_text[0] if prompt_text else None - self._prompt_cache = None + state.prompt_cache = None if ref_audio or (prompt_audio and prompt_text): try: - self._prompt_cache = self._build_prompt_cache( + state.prompt_cache = self._build_prompt_cache( ref_audio=ref_audio, prompt_audio=prompt_audio, prompt_text=prompt_text, ) except Exception as e: - logger.warning("build_prompt_cache failed: %s; falling back to zero-shot", e) - self._prompt_cache = None - - # Reset per-request state (fresh generation) - self._accumulated_patches = [] - if hasattr(self, "_prev_feat_embed"): - del self._prev_feat_embed - if hasattr(self, "_curr_embed_for_next"): - del self._curr_embed_for_next - - # 
Store info for forward - self._current_step_infos = [{"is_prefill": True}] - - # The scaffold model still needs embeddings sized to span_len for - # its warmup/attention bookkeeping. Native modules use the full - # (potentially longer) sequence internally. Pass zeros — scaffold - # output is discarded. - embeds = torch.zeros( - span_len, - self.config.hidden_size, - device=dev, - dtype=self._side_dtype, - ) - - return input_ids, embeds, {} - - # ---- Decode ---- - curr_embed = getattr(self, "_curr_embed_for_next", None) - if curr_embed is not None: - inputs_embeds = curr_embed.to(dev, dtype=self._side_dtype).reshape(1, -1) + logger.warning("build_prompt_cache failed: %s", e) + + inputs = self._build_prefill_inputs(token_ids, dev, req_id) + tts = self.tts + feat_embed = tts.enc_to_lm_proj(tts.feat_encoder(inputs["audio_feat"])) + text_embed = self.model.embed_input_ids(inputs["text_token"].to(dev)) + text_mask, feat_mask = inputs["text_mask"], inputs["audio_mask"] + embeds = (text_mask.unsqueeze(-1) * text_embed + feat_mask.unsqueeze(-1) * feat_embed).squeeze(0) + state.prefill_masks = (text_mask, feat_mask, inputs["audio_feat"], feat_embed) else: - inputs_embeds = torch.zeros( - 1, - self.config.hidden_size, - device=dev, - dtype=self._side_dtype, - ) + state = self._active_states.get(req_id) + curr = state.curr_embed_for_next if state else None + if curr is not None: + embeds = curr.to(dev, dtype=self._side_dtype).reshape(1, -1) + else: + embeds = torch.zeros(1, self.config.hidden_size, device=dev, dtype=self._side_dtype) - self._current_step_infos = [{}] - return input_ids, inputs_embeds, {} + self._pending_requests.append((req_id, is_prefill, embeds, span_len)) + return input_ids, embeds, {} def postprocess(self, hidden_states: torch.Tensor, **info: Any) -> dict[str, Any]: + req_id = info.get("request_id", self._current_request_id or "default") + if _ENABLE_PROFILING: + state = self._active_states.get(req_id) + if state and state.decode_step_count > 0: + 
logger.info( + "REQUEST DONE[%s]: %d steps, %.2fs\n%s", + req_id, + state.decode_step_count, + time.perf_counter() - state.request_start_time, + self._perf.breakdown(), + ) return {} - # -------------------- Weight loading -------------------- + # -------------------- build prefill inputs -------------------- + + def _build_prefill_inputs(self, token_ids: list[int], dev: Any, req_id: str = "default") -> dict: + tts = self.tts + dtype = self._side_dtype + state = self._active_states.get(req_id) + cache = state.prompt_cache if state else None + mode = cache.get("mode", "continuation") if cache else "zero_shot" + + if cache and mode in ("continuation", "ref_continuation"): + prompt_text = cache.get("prompt_text", "") + prompt_ids = list(tts.text_tokenizer(prompt_text)) if prompt_text else [] + all_ids = prompt_ids + token_ids + else: + all_ids = token_ids + + text_token = torch.tensor(all_ids, dtype=torch.int32) + text_token = torch.cat([text_token, torch.tensor([tts.audio_start_token], dtype=torch.int32)], dim=-1) + text_len = text_token.shape[0] + latent_dim = tts.audio_vae.latent_dim + ps = self._patch_size + + if mode in ("zero_shot", "continuation"): + audio_feat = cache["audio_feat"] if cache else torch.empty((0, ps, latent_dim), dtype=torch.float32) + a_len = audio_feat.size(0) + text_token = torch.cat([text_token, torch.zeros(a_len, dtype=torch.int32)]) + audio_feat = torch.cat([torch.zeros((text_len, ps, latent_dim), dtype=torch.float32), audio_feat]) + text_mask = torch.cat([torch.ones(text_len, dtype=torch.int32), torch.zeros(a_len, dtype=torch.int32)]) + audio_mask = torch.cat([torch.zeros(text_len, dtype=torch.int32), torch.ones(a_len, dtype=torch.int32)]) + elif mode == "reference": + ref = cache["ref_audio_feat"] + rt, rf, rtm, ram = tts._make_ref_prefix(ref, text_token.device) + text_token = torch.cat([rt.cpu(), text_token]) + audio_feat = torch.cat([rf.cpu(), torch.zeros((text_len, ps, latent_dim), dtype=torch.float32)]) + text_mask = 
torch.cat([rtm.cpu(), torch.ones(text_len, dtype=torch.int32)]) + audio_mask = torch.cat([ram.cpu(), torch.zeros(text_len, dtype=torch.int32)]) + else: # ref_continuation + ref = cache["ref_audio_feat"] + prompt = cache["audio_feat"] + p_len = prompt.size(0) + rt, rf, rtm, ram = tts._make_ref_prefix(ref, text_token.device) + text_token = torch.cat([rt.cpu(), text_token, torch.zeros(p_len, dtype=torch.int32)]) + audio_feat = torch.cat([rf.cpu(), torch.zeros((text_len, ps, latent_dim), dtype=torch.float32), prompt]) + ones_t = torch.ones(text_len, dtype=torch.int32) + zeros_p = torch.zeros(p_len, dtype=torch.int32) + zeros_t = torch.zeros(text_len, dtype=torch.int32) + ones_p = torch.ones(p_len, dtype=torch.int32) + text_mask = torch.cat([rtm.cpu(), ones_t, zeros_p]) + audio_mask = torch.cat([ram.cpu(), zeros_t, ones_p]) + + return { + "text_token": text_token.unsqueeze(0).to(dev), + "audio_feat": audio_feat.unsqueeze(0).to(dev).to(dtype), + "text_mask": text_mask.unsqueeze(0).to(dev), + "audio_mask": audio_mask.unsqueeze(0).to(dev), + } + + # -------------------- weight loading -------------------- - # Weight mapping for vllm scaffold hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"base_lm.": "model."}) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - """Load scaffold weights via vllm + native model for computation.""" - - # Filter: only pass base_lm weights to the scaffold def _base_lm_only(ws): for name, tensor in ws: if name.startswith("base_lm."): @@ -673,21 +965,27 @@ def _base_lm_only(ws): loader = AutoWeightsLoader(self) loaded = loader.load_weights(_base_lm_only(weights), mapper=self.hf_to_vllm_mapper) - # Load the full native model for actual computation model_path = self.vllm_config.model_config.model VoxCPM = import_voxcpm2_core() native = VoxCPM.from_pretrained(model_path, load_denoiser=False, optimize=False) self._tts = native.tts_model.to("cuda") self._side_dtype = self._tts.fusion_concat_proj.weight.dtype 
self._device = "cuda" - self._patch_size = self._tts.patch_size self._feat_dim = self._tts.feat_dim + n = self.residual_model.load_weights_from_native(self._tts.residual_lm) + for name, _ in self.residual_model.named_parameters(): + loaded.add(f"residual_model.{name}") + logger.info("VoxCPM2: loaded %d params into paged residual_model", n) + + del self._tts.base_lm + self._tts.base_lm = None + del self._tts.residual_lm + self._tts.residual_lm = None + torch.cuda.empty_cache() + logger.info( - "Loaded native VoxCPM2 (patch_size=%d, feat_dim=%d, dtype=%s)", - self._patch_size, - self._feat_dim, - self._side_dtype, + "Loaded VoxCPM2 (patch=%d, feat_dim=%d, dtype=%s)", self._patch_size, self._feat_dim, self._side_dtype ) return loaded diff --git a/vllm_omni/model_executor/stage_configs/voxcpm2.yaml b/vllm_omni/model_executor/stage_configs/voxcpm2.yaml index de15c88de4e..7cc93d6b267 100644 --- a/vllm_omni/model_executor/stage_configs/voxcpm2.yaml +++ b/vllm_omni/model_executor/stage_configs/voxcpm2.yaml @@ -1,13 +1,13 @@ -# VoxCPM2 native AR single-stage pipeline. -# Uses native MiniCPM4 base_lm + native VAE decode in one stage. -# All computation (base_lm, residual_lm, diffusion, VAE) in forward(). +# VoxCPM2 AR pipeline with per-request state batching. +# Uses native MiniCPM4 base_lm + per-request StaticKVCache. +# max_batch_size > 1 supported via KV cache save/restore. stage_args: - stage_id: 0 stage_type: llm is_comprehension: true runtime: devices: "0" - max_batch_size: 1 + max_batch_size: 4 engine_args: dtype: bfloat16 model_stage: latent_generator diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 868140d265b..4f3f843e658 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -262,6 +262,10 @@ def execute_model( # Update persistent batch states. 
deferred_state_corrections_fn = self._update_states(scheduler_output) + # Notify model of finished requests for state cleanup + if scheduler_output.finished_req_ids and hasattr(self.model, "on_requests_finished"): + self.model.on_requests_finished(scheduler_output.finished_req_ids) + if has_ec_transfer() and not get_ec_transfer().is_consumer: with self.maybe_get_ec_connector_output( scheduler_output, @@ -793,11 +797,14 @@ def propose_draft_token_ids(sampled_token_ids): elif isinstance(v, dict): mm_payload[k] = {sk: sv[start:end].contiguous() for sk, sv in v.items()} elif isinstance(v, list): - element = v[idx] if idx < len(v) else v[0] - # Clone tensors to avoid cross-request aliasing - if isinstance(element, torch.Tensor): - element = element.clone() - mm_payload[k] = element + if idx < len(v): + element = v[idx] + if element is not None: + if isinstance(element, torch.Tensor): + element = element.clone() + mm_payload[k] = element + # Skip None elements: msgspec cannot serialize None + # in dict[str, torch.Tensor] typed fields. elif isinstance(v, torch.Tensor): # List-derived tensor payloads are request-invariant; clone to # avoid accidental cross-request aliasing on downstream mutation. 
diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 1f678b579fa..5ff62c11b40 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -1241,6 +1241,7 @@ def _preprocess( span_len = int(e) - int(s) # call the custom process function + req_infos["request_id"] = req_id embed_slice = inputs_embeds[s:e] if inputs_embeds is not None else None req_input_ids, req_embeds, update_dict = self.model.preprocess( input_ids=input_ids[s:e], input_embeds=embed_slice, **req_infos From dd1389173b4e2893d21cf742979c89ab0255a5d5 Mon Sep 17 00:00:00 2001 From: Chen-Yo Sun Date: Mon, 13 Apr 2026 15:37:45 -0700 Subject: [PATCH 23/76] [Voxtral TTS] Fix Voxtral TTS input with text and ref_audio (#2750) Signed-off-by: Chen-Yo Sun --- .../voxtral_tts_audio_generation.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py b/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py index 4041a53e55a..cd67e4f0740 100644 --- a/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py +++ b/vllm_omni/model_executor/models/voxtral_tts/voxtral_tts_audio_generation.py @@ -864,6 +864,29 @@ def get_replacement(item_idx: int): ), ] + def _apply_hf_processor_mm_only( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + tokenization_kwargs: Mapping[str, object], + ) -> BatchFeature: + """ + Apply the HF processor on the multi-modal data only. + + Issue: Voxtral TTS use Mistral Tokenizer with custom audio encoder. It doesn't + inherit Transformers ProcessorMixin and can't use call_hf_processor_mm_only. + + Solution: Override this method to call _apply_hf_processor_text_mm directly. 
+ """ + mm_counts = mm_items.get_all_counts() + _, mm_processed_data, _ = self._apply_hf_processor_text_mm( + prompt_text=self.dummy_inputs.get_dummy_text(mm_counts), + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + tokenization_kwargs=tokenization_kwargs, + ) + return mm_processed_data + def _cached_apply_hf_processor( self, inputs: ProcessorInputs, From 8d23549b29ca408b4c5176bb85a87bfd4dff0b83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zeyu=20Huang=20=7C=20=E9=BB=83=E6=BE=A4=E5=AE=87?= <11222265+fhfuih@users.noreply.github.com> Date: Tue, 14 Apr 2026 11:35:56 +0800 Subject: [PATCH 24/76] [CI] Qwen image edit performance benckmark (#2216) Signed-off-by: Huang, Zeyu <11222265+fhfuih@users.noreply.github.com> --- .buildkite/test-nightly-diffusion.yml | 19 +- .../diffusion/diffusion_benchmark_serving.py | 28 ++- .../perf/scripts/run_diffusion_benchmark.py | 170 ++++++++++++++++-- .../test_qwen_image_edit_2509_vllm_omni.json | 167 +++++++++++++++++ .../tests/test_qwen_image_edit_vllm_omni.json | 161 +++++++++++++++++ .../perf/tests/test_qwen_image_vllm_omni.json | 2 - tools/nightly/generate_nightly_perf_excel.py | 120 +++++++++---- 7 files changed, 608 insertions(+), 59 deletions(-) create mode 100644 tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json create mode 100644 tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json diff --git a/.buildkite/test-nightly-diffusion.yml b/.buildkite/test-nightly-diffusion.yml index 04b99c0a837..a520ca4356d 100644 --- a/.buildkite/test-nightly-diffusion.yml +++ b/.buildkite/test-nightly-diffusion.yml @@ -325,10 +325,23 @@ steps: if: *nightly_or_pr_label commands: - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results + - export DIFFUSION_ATTENTION_BACKEND=FLASH_ATTN - export CACHE_DIT_VERSION=1.3.0 - - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json - - buildkite-agent artifact upload 
"tests/dfx/perf/results/benchmark_results_*.json" - - buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" + # [HACK]: run upload in the same command block as pytest. + # Because `exit` aborts the entire commands list. + - | + set +e + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json + EXIT1=$$? + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json + EXIT2=$$? + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json + EXIT3=$$? + if [ $$EXIT1 -eq 0 ] || [ $$EXIT2 -eq 0 ] || [ $$EXIT3 -eq 0 ]; then + buildkite-agent artifact upload "tests/dfx/perf/results/diffusion_result_*.json" + buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" + fi + exit $$((EXIT1 | EXIT2 | EXIT3)) agents: queue: "mithril-h100-pool" plugins: diff --git a/benchmarks/diffusion/diffusion_benchmark_serving.py b/benchmarks/diffusion/diffusion_benchmark_serving.py index aad955b0d1d..32ec48a698f 100644 --- a/benchmarks/diffusion/diffusion_benchmark_serving.py +++ b/benchmarks/diffusion/diffusion_benchmark_serving.py @@ -558,6 +558,7 @@ def __init__(self, args, api_url: str, model: str, enable_negative_prompt: bool super().__init__(args, api_url, model) self.num_prompts = args.num_prompts self.enable_negative_prompt = enable_negative_prompt + self.num_input_images = max(1, args.num_input_images) self.random_request_config = getattr(args, "random_request_config", None) if self.random_request_config: self.random_request_config = json.loads(self.random_request_config) @@ -580,11 +581,7 @@ def __init__(self, args, api_url: str, model: str, enable_negative_prompt: bool # Random image generate if self.args.task in ["i2v", "ti2v", "ti2i", "i2i"]: - img = Image.new("RGB", (512, 512), (255, 255, 255)) - - image_path = 
os.path.join(tempfile.gettempdir(), "diffusion_benchmark_random_image.png") - self._random_image_path = [image_path] - img.save(image_path) + self._random_image_path = self._generate_random_image_paths() else: self._random_image_path = None @@ -619,6 +616,18 @@ def __getitem__(self, idx: int) -> RequestFuncInput: def get_requests(self) -> list[RequestFuncInput]: return [self[i] for i in range(len(self))] + def _generate_random_image_paths(self) -> list[str]: + image_paths: list[str] = [] + for image_idx in range(self.num_input_images): + img = Image.new("RGB", (512, 512), (255, 255, 255)) + image_path = os.path.join( + tempfile.gettempdir(), + f"diffusion_benchmark_random_image_{image_idx}.png", + ) + img.save(image_path) + image_paths.append(image_path) + return image_paths + def _compute_expected_latency_ms_from_base(req: RequestFuncInput, args, base_time_ms: float | None) -> float | None: """Compute expected execution time (ms) based on a base per-step-per-frame unit time. @@ -1115,6 +1124,15 @@ async def limited_request_func(req, session, pbar): '{"width":768,"height":768,"num_inference_steps":20,"weight":0.85}]' ), ) + parser.add_argument( + "--num-input-images", + type=int, + default=1, + help=( + "Number of synthetic input images to attach for image-conditioned tasks " + "(i2v, ti2v, ti2i, i2i) when using random dataset." 
+ ), + ) args = parser.parse_args() diff --git a/tests/dfx/perf/scripts/run_diffusion_benchmark.py b/tests/dfx/perf/scripts/run_diffusion_benchmark.py index 1bd9bf1a143..123f21405e8 100644 --- a/tests/dfx/perf/scripts/run_diffusion_benchmark.py +++ b/tests/dfx/perf/scripts/run_diffusion_benchmark.py @@ -27,13 +27,14 @@ import time from datetime import datetime from pathlib import Path -from typing import Any +from typing import Any, cast import psutil import pytest os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" +os.environ.setdefault("DIFFUSION_ATTENTION_BACKEND", "FLASH_ATTN") # --------------------------------------------------------------------------- # Paths @@ -50,6 +51,7 @@ # Populated lazily after CONFIG_FILE_PATH is resolved. _SESSION_TIMESTAMP = datetime.now().strftime("%Y%m%d-%H%M%S") _RESULT_LOCK = threading.Lock() +_BRANCHPOINT_COMMIT_SHA: str | None = None def _get_config_file_from_argv() -> str | None: @@ -110,7 +112,7 @@ def load_configs(config_path: str) -> list[dict[str, Any]]: BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) _config_stem = Path(CONFIG_FILE_PATH).stem # e.g. 
"test_qwen_image_vllm_omni" -AGGREGATED_RESULT_FILE = BENCHMARK_RESULT_DIR / f"benchmark_results_{_config_stem}_{_SESSION_TIMESTAMP}.json" +AGGREGATED_RESULT_FILE = BENCHMARK_RESULT_DIR / f"diffusion_result_{_config_stem}_{_SESSION_TIMESTAMP}.json" def _append_to_aggregated_file(record: dict[str, Any]) -> None: @@ -232,13 +234,13 @@ class DiffusionServer: def __init__( self, - model: str, - serve_args: list[str], + server_cfg: dict[str, Any], *, port: int | None = None, ) -> None: - self.model = model - self.serve_args = serve_args + self.server_cfg: dict[str, Any] = server_cfg + self.model = server_cfg["model"] + self.serve_args = server_cfg["serve_args"] self.host = "127.0.0.1" self.port = port if port is not None else _get_open_port() self.proc: subprocess.Popen | None = None @@ -299,6 +301,95 @@ def _build_serve_args(serve_args_dict: dict[str, Any]) -> list[str]: return args +def _get_branchpoint_commit_sha() -> str: + """Return the branch-point commit SHA against main. + + Uses git command: ``git merge-base HEAD origin/main``. 
+ """ + global _BRANCHPOINT_COMMIT_SHA + if _BRANCHPOINT_COMMIT_SHA is not None: + return _BRANCHPOINT_COMMIT_SHA + + repo_root = Path(__file__).parent.parent.parent.parent + try: + sha = ( + subprocess.check_output( + ["git", "merge-base", "HEAD", "origin/main"], + cwd=str(repo_root), + stderr=subprocess.STDOUT, + text=True, + ) + .strip() + .splitlines()[0] + ) + _BRANCHPOINT_COMMIT_SHA = sha + except Exception as e: + print(f"Warning: failed to get branch-point commit SHA: {e}") + _BRANCHPOINT_COMMIT_SHA = "" + return _BRANCHPOINT_COMMIT_SHA + + +def _to_resolution_string(params: dict[str, Any]) -> str: + width = params.get("width", "unknown width") + height = params.get("height", "unknown height") + return f"{width}x{height}" + + +def _to_parallelism_string(framework: str, serve_args_dict: dict[str, Any]) -> str: + parts: list[str] = [] + if framework == "vllm-omni": + keys = [ + "num-gpus", + "usp", + "ulysses-degree", + "ring", + "ring-degree", + "cfg-parallel-size", + "vae-patch-parallel-size", + "vae-use-tiling", + "tensor-parallel-size", + ] + for key in keys: + if key in serve_args_dict: + parts.append(f"{key}={serve_args_dict[key]}") + return ",".join(parts) if parts else "none" + + +def _to_cache_string(framework: str, serve_args_dict: dict[str, Any]) -> str: + if framework == "vllm-omni": + if "cache-backend" in serve_args_dict: + return str(serve_args_dict["cache-backend"]) + return "disabled" + + +def _to_offload_string(framework: str, serve_args_dict: dict[str, Any]) -> str: + selected: list[str] = [] + if framework == "vllm-omni": + offload_keys = [ + "enable-cpu-offload", + "enable-layerwise-offload", + ] + for key in offload_keys: + if key in serve_args_dict: + selected.append(key) + return f"enabled({';'.join(selected)})" if selected else "disabled" + + +def _to_compile_value(framework: str, serve_args_dict: dict[str, Any]) -> str: + if framework == "vllm-omni": + if "enforce-eager" in serve_args_dict: + return "disabled" + return "enabled" + 
return "disabled" + + +def _to_quantization_value(framework: str, serve_args_dict: dict[str, Any]) -> str: + if framework == "vllm-omni": + quant = serve_args_dict.get("quantization") + return str(quant) if quant else "disabled" + return "disabled" + + def _unique_server_params(configs: list[dict[str, Any]]) -> list[dict[str, Any]]: """Return one server-config dict per unique test_name.""" seen: set[str] = set() @@ -310,12 +401,14 @@ def _unique_server_params(configs: list[dict[str, Any]]) -> list[dict[str, Any]] seen.add(test_name) if cfg.get("server_type", "vllm-omni") != "vllm-omni": raise ValueError(f"Unsupported server_type in config: {cfg.get('server_type')}") + serve_args_dict = cfg["server_params"].get("serve_args", {}) result.append( { "test_name": test_name, "server_type": "vllm-omni", "model": cfg["server_params"]["model"], - "serve_args": _build_serve_args(cfg["server_params"].get("serve_args", {})), + "serve_args_dict": serve_args_dict, + "serve_args": _build_serve_args(serve_args_dict), "benchmark_backend": "vllm-omni", "server_params": cfg["server_params"], } @@ -334,9 +427,7 @@ def _test_param_mapping(configs: list[dict[str, Any]]) -> dict[str, list[dict]]: def _make_server(server_cfg: dict[str, Any]) -> DiffusionServer: """Factory: return a vLLM-Omni diffusion server instance for the config.""" - model = server_cfg["model"] - serve_args = server_cfg["serve_args"] - return DiffusionServer(model=model, serve_args=serve_args) + return DiffusionServer(server_cfg=server_cfg) # --------------------------------------------------------------------------- @@ -364,7 +455,6 @@ def diffusion_server(request): print(f"\nStarting {server_type} server for test: {test_name}") with _make_server(server_cfg) as server: server.test_name = test_name - server.server_params = server_cfg["server_params"] print(f"{server_type} server started successfully") yield server print(f"{server_type} server stopping…") @@ -402,16 +492,18 @@ def run_benchmark( params: dict[str, Any], 
test_name: str, backend: str = "vllm-omni", - server_params: dict[str, Any] | None = None, + server_cfg: dict[str, Any] | None = None, + source_file: str = "", ) -> dict[str, Any]: """Run diffusion_benchmark_serving.py as a subprocess and return parsed metrics. The raw metrics are written to a temporary file by the subprocess. After the run completes the metrics are merged with full metadata (test_name, - backend, benchmark_params, timestamp) and appended to the session-wide - aggregated JSON file (AGGREGATED_RESULT_FILE). The temporary file is - removed afterwards. Subprocess stdout/stderr are tee'd to a .log file - under BENCHMARK_RESULT_DIR/logs/; its path is stored in the record. + backend, benchmark_params, timestamp, flat reporting fields) and appended + to the session-wide aggregated JSON file (AGGREGATED_RESULT_FILE). The + temporary file is removed afterwards. Subprocess stdout/stderr are tee'd + to a .log file under BENCHMARK_RESULT_DIR/logs/; its path is stored in + the record. 
""" timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") @@ -495,14 +587,55 @@ def run_benchmark( finally: tmp_result_file.unlink(missing_ok=True) + server_cfg = server_cfg or {} + serve_args_dict = server_cfg.get("serve_args_dict", {}) + if not isinstance(serve_args_dict, dict): + serve_args_dict = {} + + completed = metrics.get("completed_requests", metrics.get("completed", 0)) + failed = metrics.get("failed_requests", metrics.get("failed", 0)) + record: dict[str, Any] = { "test_name": test_name, "backend": backend, "timestamp": timestamp, - "server_params": server_params, + "server_params": server_cfg.get("server_params"), "benchmark_params": params, "result": metrics, "log_file": str(log_file), + "Model": model, + "Framework": backend, + "Hardware": "", + "Deployment": "", + "Task": params.get("task", "t2i"), + "Dataset": params.get("dataset", "random"), + "resolution": _to_resolution_string(params), + "Parallelism": _to_parallelism_string(backend, serve_args_dict), + "max_concurrency": params.get("max-concurrency", ""), + "Cache": _to_cache_string(backend, serve_args_dict), + "Quantization": _to_quantization_value(backend, serve_args_dict), + "offload": _to_offload_string(backend, serve_args_dict), + "compile": _to_compile_value(backend, serve_args_dict), + "Attn_backend": os.environ.get("DIFFUSION_ATTENTION_BACKEND", ""), + "num_inference_steps": params.get("num-inference-steps", ""), + "completed": completed, + "failed": failed, + "throughput_qps": metrics.get("throughput_qps"), + "latency_mean": metrics.get("latency_mean"), + "latency_median": metrics.get("latency_median"), + "latency_p99": metrics.get("latency_p99"), + "latency_p95": metrics.get("latency_p95"), + "latency_p50": metrics.get("latency_p50"), + "peak_memory_mb_max": metrics.get("peak_memory_mb_max"), + "peak_memory_mb_mean": metrics.get("peak_memory_mb_mean"), + "peak_memory_mb_median": metrics.get("peak_memory_mb_median"), + "stage_durations_mean": metrics.get("stage_durations_mean"), + 
"stage_durations_p50": metrics.get("stage_durations_p50"), + "stage_durations_p99": metrics.get("stage_durations_p99"), + "commit_sha": _get_branchpoint_commit_sha(), + "build_id": os.environ.get("BUILDKITE_BUILD_ID", ""), + "build_url": os.environ.get("BUILDKITE_BUILD_URL", ""), + "source_file": source_file, } _append_to_aggregated_file(record) print(f"\n Result appended to: {AGGREGATED_RESULT_FILE}") @@ -565,7 +698,8 @@ def test_diffusion_performance_benchmark(diffusion_server, benchmark_params): params=params, test_name=test_name, backend=backend, - server_params=diffusion_server.server_params, + server_cfg=getattr(diffusion_server, "server_cfg", {}), + source_file=cast(str, CONFIG_FILE_PATH), ) print(f"\n{'=' * 60}") diff --git a/tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json new file mode 100644 index 00000000000..7d1fbbfa704 --- /dev/null +++ b/tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json @@ -0,0 +1,167 @@ +[ + { + "test_name": "test_qwen_image_edit_2509_single_device", + "description": "Single-device baseline (two input images)", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Edit-2509", + "serve_args": { + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "512x512_steps20_i2i_2img", + "dataset": "random", + "task": "i2i", + "width": 512, + "height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "num-input-images": 2, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.05, + "latency_mean": 18, + "peak_memory_mb_max": 78500, + "peak_memory_mb_mean": 78500 + } + }, + { + "name": "1536x1536_steps35_i2i_2img", + "dataset": "random", + "task": "i2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "num-input-images": 2, + "enable-negative-prompt": true, + "baseline": { + 
"throughput_qps": 0.01, + "latency_mean": 70, + "peak_memory_mb_max": 81000, + "peak_memory_mb_mean": 81000 + } + } + ] + }, + { + "test_name": "test_qwen_image_edit_2509_ulysses2_cfg2_vae_patch4", + "description": "Ulysses SP=2 + CFG=2 + VAE patch parallel=4", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Edit-2509", + "serve_args": { + "ulysses-degree": 2, + "cfg-parallel-size": 2, + "vae-patch-parallel-size": 4, + "vae-use-tiling": true, + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "512x512_steps20_i2i_2img", + "dataset": "random", + "task": "i2i", + "width": 512, + "height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "num-input-images": 2, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.1, + "latency_mean": 12, + "peak_memory_mb_max": 69000, + "peak_memory_mb_mean": 69000 + } + }, + { + "name": "1536x1536_steps35_i2i_2img", + "dataset": "random", + "task": "i2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "num-input-images": 2, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.03, + "latency_mean": 28, + "peak_memory_mb_max": 69000, + "peak_memory_mb_mean": 69000 + } + } + ] + }, + { + "test_name": "test_qwen_image_edit_2509_ulysses2_cfg2_cache_dit", + "description": "Ulysses SP=2 + CFG=2 + CacheDiT", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Edit-2509", + "serve_args": { + "ulysses-degree": 2, + "cfg-parallel-size": 2, + "cache-backend": "cache_dit", + "cache-config": { + "Fn_compute_blocks": 1, + "Bn_compute_blocks": 0, + "max_warmup_steps": 4, + "residual_diff_threshold": 0.24, + "max_continuous_cached_steps": 3, + "enable_taylorseer": false, + "taylorseer_order": 1, + "scm_steps_mask_policy": null, + "scm_steps_policy": "dynamic" + }, + "enable-diffusion-pipeline-profiler": true + } + }, + 
"benchmark_params": [ + { + "name": "512x512_steps20_i2i_2img", + "dataset": "random", + "task": "i2i", + "width": 512, + "height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "num-input-images": 2, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.10, + "latency_mean": 12, + "peak_memory_mb_max": 73000, + "peak_memory_mb_mean": 73000 + } + }, + { + "name": "1536x1536_steps35_i2i_2img", + "dataset": "random", + "task": "i2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "num-input-images": 2, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.05, + "latency_mean": 20, + "peak_memory_mb_max": 81000, + "peak_memory_mb_mean": 81000 + } + } + ] + } +] diff --git a/tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json new file mode 100644 index 00000000000..f68201db5f5 --- /dev/null +++ b/tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json @@ -0,0 +1,161 @@ +[ + { + "test_name": "test_qwen_image_edit_single_device", + "description": "Single-device baseline", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Edit", + "serve_args": { + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "512x512_steps20_i2i", + "dataset": "random", + "task": "i2i", + "width": 512, + "height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.05, + "latency_mean": 15.0, + "peak_memory_mb_max": 72500, + "peak_memory_mb_mean": 72500 + } + }, + { + "name": "1536x1536_steps35_i2i", + "dataset": "random", + "task": "i2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.01, + "latency_mean": 
65.6, + "peak_memory_mb_max": 80777, + "peak_memory_mb_mean": 80777 + } + } + ] + }, + { + "test_name": "test_qwen_image_edit_ulysses2_cfg2_vae_patch4", + "description": "Ulysses SP=2 + CFG=2 + VAE patch parallel=4", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Edit", + "serve_args": { + "ulysses-degree": 2, + "cfg-parallel-size": 2, + "vae-patch-parallel-size": 4, + "vae-use-tiling": true, + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "512x512_steps20_i2i", + "dataset": "random", + "task": "i2i", + "width": 512, + "height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.10, + "latency_mean": 7.2, + "peak_memory_mb_max": 68100, + "peak_memory_mb_mean": 68100 + } + }, + { + "name": "1536x1536_steps35_i2i", + "dataset": "random", + "task": "i2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.03, + "latency_mean": 24.0, + "peak_memory_mb_max": 68100, + "peak_memory_mb_mean": 68100 + } + } + ] + }, + { + "test_name": "test_qwen_image_edit_ulysses2_cfg2_cache_dit", + "description": "Ulysses SP=2 + CFG=2 + CacheDiT", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Edit", + "serve_args": { + "ulysses-degree": 2, + "cfg-parallel-size": 2, + "cache-backend": "cache_dit", + "cache-config": { + "Fn_compute_blocks": 1, + "Bn_compute_blocks": 0, + "max_warmup_steps": 4, + "residual_diff_threshold": 0.24, + "max_continuous_cached_steps": 3, + "enable_taylorseer": false, + "taylorseer_order": 1, + "scm_steps_mask_policy": null, + "scm_steps_policy": "dynamic" + }, + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "512x512_steps20_i2i", + "dataset": "random", + "task": "i2i", + "width": 512, + 
"height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.1, + "latency_mean": 6.5, + "peak_memory_mb_max": 72600, + "peak_memory_mb_mean": 72600 + } + }, + { + "name": "1536x1536_steps35_i2i", + "dataset": "random", + "task": "i2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.05, + "latency_mean": 16.0, + "peak_memory_mb_max": 81000, + "peak_memory_mb_mean": 81000 + } + } + ] + } +] diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index 97c1bbfb3c7..1f3a2bbf77e 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -44,7 +44,6 @@ } ] }, - { "test_name": "test_qwen_image_ulysses2_cfg2_vae_patch4", "description": "Ulysses SP=2 + CFG-parallel=2 + VAE Patch Parallel=4", @@ -94,7 +93,6 @@ } ] }, - { "test_name": "test_qwen_image_ulysses2_cfg2_cache_dit", "description": "Ulysses SP=2 + CFG-parallel=2 + CacheDiT acceleration", diff --git a/tools/nightly/generate_nightly_perf_excel.py b/tools/nightly/generate_nightly_perf_excel.py index 817f37f664e..5f9eb428bca 100644 --- a/tools/nightly/generate_nightly_perf_excel.py +++ b/tools/nightly/generate_nightly_perf_excel.py @@ -23,6 +23,22 @@ GREY_BLOCK_FILL = PatternFill(start_color="D3D3D3", fill_type="solid") # Diffusion sheet columns (Qwen-Image diffusion benchmark). +# Per-stage latency metrics. Unpack from stage_durations_mean/p50/p99 dicts +DIFFUSION_STAGE_LATENCY_COLUMNS: tuple[str, ...] 
= ( + # "vae.encode_mean", + # "vae.encode_p50", + # "vae.encode_p99", + "vae.decode_mean", + "vae.decode_p50", + "vae.decode_p99", + "diffuse_mean", + "diffuse_p50", + "diffuse_p99", + "text_encoder.forward_mean", + "text_encoder.forward_p50", + "text_encoder.forward_p99", +) + DIFFUSION_BENCHMARK_COLUMNS: tuple[str, ...] = ( "duration", "completed_requests", @@ -36,7 +52,7 @@ "peak_memory_mb_mean", "peak_memory_mb_median", "slo_attainment_rate", -) +) + DIFFUSION_STAGE_LATENCY_COLUMNS DIFFUSION_NUMERIC_FORMAT_COLUMNS: tuple[str, ...] = DIFFUSION_BENCHMARK_COLUMNS @@ -63,7 +79,7 @@ "build_id", "build_url", "source_file", -) +) + DIFFUSION_STAGE_LATENCY_COLUMNS # Benchmark metric columns: grey the latest row's cell when value changed vs previous date. BENCHMARK_COLUMNS: tuple[str, ...] = ( @@ -106,7 +122,7 @@ _COLUMNS_FILENAME = "nightly_perf_summary_columns.txt" _RESULT_JSON_PREFIX = "result_test_" -_DIFFUSION_JSON_PREFIX = "diffusion_perf_" +_DIFFUSION_RESULT_PREFIX = "diffusion_result_" DEFAULT_INPUT_DIR = os.getenv("DEFAULT_INPUT_DIR") if os.getenv("DEFAULT_INPUT_DIR") else "tests" DEFAULT_OUTPUT_DIR = os.getenv("DEFAULT_OUTPUT_DIR") if os.getenv("DEFAULT_OUTPUT_DIR") else "tests" DEFAULT_DIFFUSION_INPUT_DIR = os.getenv("DIFFUSION_BENCHMARK_DIR") @@ -252,7 +268,7 @@ def parse_args() -> argparse.Namespace: type=str, default=None, help=( - "Directory containing diffusion_perf_*.json files; default is " + "Directory containing diffusion_result_*.json files; default is " "DIFFUSION_BENCHMARK_DIR, fallback to --input-dir." 
), ) @@ -286,7 +302,7 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def _load_json_file(path: str) -> dict[str, Any] | None: +def _load_json_file(path: str) -> dict[str, Any] | list[Any] | None: """Safely load a single JSON file; return None and log a warning on failure.""" try: with open(path, encoding="utf-8") as f: @@ -295,8 +311,8 @@ def _load_json_file(path: str) -> dict[str, Any] | None: LOGGER.warning("failed to load json '%s': %s", path, exc) return None - if not isinstance(data, dict): - LOGGER.warning("json root in '%s' is not an object, skip", path) + if not isinstance(data, (dict, list)): + LOGGER.warning("json root in '%s' is not a dict or list, skip", path) return None return data @@ -396,27 +412,29 @@ def _iter_omni_json_records(input_dir: str) -> Iterable[dict[str, Any]]: yield record -def _parse_diffusion_from_filename(filename: str) -> dict[str, Any]: - """Parse diffusion test_name/date from filename: diffusion_perf__.json""" +def _parse_diffusion_result_from_filename(filename: str) -> dict[str, Any]: + """Parse test_name/date from filename: diffusion_result__.json""" name, ext = os.path.splitext(filename) - if ext != ".json" or not name.startswith(_DIFFUSION_JSON_PREFIX): + if ext != ".json" or not name.startswith(_DIFFUSION_RESULT_PREFIX): return {} - core = name[len(_DIFFUSION_JSON_PREFIX) :] + core = name[len(_DIFFUSION_RESULT_PREFIX) :] parts = core.split("_") if len(parts) < 2: return {} timestamp = parts[-1] - test_name = "_".join(parts[:-1]) if parts[:-1] else "" parsed: dict[str, Any] = {} if len(timestamp) >= 15: parsed["date"] = timestamp - if test_name: - parsed["test_name"] = test_name return parsed -def _iter_diffusion_json_records(input_dir: str) -> Iterable[dict[str, Any]]: - """Iterate over diffusion_perf_*.json files and yield normalized diffusion records.""" +def _iter_diffusion_records(input_dir: str) -> Iterable[dict[str, Any]]: + """Iterate over diffusion_result_*.json files and yield normalized 
records. + + Unlike omni format where each JSON file contains one test case, diffusion format + produces a single JSON file containing a list of all test case records. + Test params (feature toggles) are NOT embedded in the filename. + """ if not os.path.isdir(input_dir): LOGGER.warning("diffusion input dir '%s' does not exist or is not a directory", input_dir) return @@ -424,7 +442,7 @@ def _iter_diffusion_json_records(input_dir: str) -> Iterable[dict[str, Any]]: for entry in sorted(os.listdir(input_dir)): if not entry.endswith(".json"): continue - if not entry.startswith(_DIFFUSION_JSON_PREFIX): + if not entry.startswith(_DIFFUSION_RESULT_PREFIX): continue full_path = os.path.join(input_dir, entry) if not os.path.isfile(full_path): @@ -434,23 +452,63 @@ def _iter_diffusion_json_records(input_dir: str) -> Iterable[dict[str, Any]]: if data is None: continue - record: dict[str, Any] = dict(data) - filename_meta = _parse_diffusion_from_filename(os.path.basename(full_path)) - if "date" not in record or not record.get("date"): - record["date"] = filename_meta.get("date") or datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") - if "test_name" not in record or not record.get("test_name"): - if "test_name" in filename_meta: - record["test_name"] = filename_meta["test_name"] - record["source_file"] = os.path.basename(full_path) - yield record + filename_meta = _parse_diffusion_result_from_filename(os.path.basename(full_path)) + if not isinstance(data, list): + LOGGER.warning("diffusion result file '%s' root is not a list, skip", full_path) + continue -def _collect_records(input_dir: str) -> list[dict[str, Any]]: + for record in data: + if not isinstance(record, dict): + continue + record = dict(record) + if "date" not in record or not record.get("date"): + record["date"] = filename_meta.get("date") or datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") + record["source_file"] = os.path.basename(full_path) + yield record + + +def _collect_omni_records(input_dir: str) -> 
list[dict[str, Any]]: return list(_iter_omni_json_records(input_dir)) def _collect_diffusion_records(diffusion_input_dir: str) -> list[dict[str, Any]]: - return list(_iter_diffusion_json_records(diffusion_input_dir)) + """Collect diffusion records from diffusion_result_*.json files. + Their format is different from omni JSON files. + """ + return [_process_diffusion_record(r) for r in _iter_diffusion_records(diffusion_input_dir)] + + +def _flatten_stage_durations(record: dict[str, Any]) -> dict[str, Any]: + """Flatten stage_durations dict into individual columns matching DIFFUSION_STAGE_LATENCY_COLUMNS.""" + result = dict(record) + + for prefix in ("stage_durations_mean", "stage_durations_p50", "stage_durations_p99"): + durations = result.pop(prefix, None) + if not isinstance(durations, dict): + continue + + suffix = prefix.replace("stage_durations_", "") # "mean", "p50", "p99" + + for stage_key, value in durations.items(): # e.g., "SomePipeline.vae.decode_mean": 100.0 + stage_key = stage_key.split(".", 1)[-1] # "decode_mean" + col_name = f"{stage_key}_{suffix}" + if col_name not in DIFFUSION_STAGE_LATENCY_COLUMNS: + print(f"skipping stage_key: {col_name}") + continue + result[col_name] = value + + return result + + +def _process_diffusion_record(record: dict[str, Any]) -> dict[str, Any]: + """Normalize a diffusion record by merging `result` and flattening stage metrics.""" + flat = record.copy() + flat.update(flat.pop("result", {})) + flat = _flatten_stage_durations(flat) + flat.pop("benchmark_params", None) + flat.pop("server_params", None) + return flat def _apply_build_metadata_to_latest_only( @@ -493,7 +551,7 @@ def _apply_build_metadata_to_latest_only( def _sort_records_for_summary(records: list[dict[str, Any]]) -> list[dict[str, Any]]: """Sort so that same test configuration is grouped, newest date first within each group.""" - by_date_desc = sorted(records, key=lambda r: (r.get("date") or ""), reverse=True) + by_date_desc = sorted(records, key=lambda r: 
r.get("date") or "", reverse=True) return sorted( by_date_desc, key=_omni_group_key, @@ -501,7 +559,7 @@ def _sort_records_for_summary(records: list[dict[str, Any]]) -> list[dict[str, A def _sort_diffusion_records_for_summary(records: list[dict[str, Any]]) -> list[dict[str, Any]]: - by_date_desc = sorted(records, key=lambda r: (r.get("date") or ""), reverse=True) + by_date_desc = sorted(records, key=lambda r: r.get("date") or "", reverse=True) return sorted(by_date_desc, key=_diffusion_group_key) @@ -678,7 +736,7 @@ def generate_excel_report( script_dir = os.path.dirname(os.path.abspath(__file__)) omni_summary_columns = _ensure_omni_summary_columns(_load_summary_columns(script_dir)) - omni_records = _collect_records(input_dir) + omni_records = _collect_omni_records(input_dir) diffusion_records = _collect_diffusion_records(diffusion_input_dir) if not omni_records: From a5b38b5d0d612d4be0b452dfd29c552f2dfa94a3 Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Tue, 14 Apr 2026 13:32:00 +0800 Subject: [PATCH 25/76] [BugFix] Remove stage_configs_path validation (#2741) Signed-off-by: amy-why-3459 --- tests/engine/test_arg_utils.py | 7 ------- vllm_omni/engine/arg_utils.py | 5 ----- 2 files changed, 12 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index a1fc18f8456..35d55f1cc4e 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -118,13 +118,6 @@ def test_qwen3_tts_codec_frame_rate_patching(): assert omni_config.codec_frame_rate_hz == 12.3 -def test_stage_configs_path_blocks_create_model_config(): - """create_model_config() should raise when stage_configs_path is set.""" - args = OmniEngineArgs(stage_configs_path="/some/path.yaml") - with pytest.raises(RuntimeError, match="stage_configs_path"): - args.create_model_config() - - def test_from_cli_args_picks_up_stage_configs_path(): """from_cli_args should pick up stage_configs_path from namespace.""" ns = argparse.Namespace( diff --git 
a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index 4e2ad9b257c..d61102c7e13 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -194,11 +194,6 @@ def create_model_config(self) -> OmniModelConfig: Returns: OmniModelConfig instance with all configuration fields set """ - if self.stage_configs_path is not None: - raise RuntimeError( - "create_model_config() should not be called when stage_configs_path is set. " - "Per-stage model configs are resolved from the stage config YAML." - ) # register omni models to avoid model not found error self._ensure_omni_models_registered() From 644edac0b6e29b153380a2a3796c328918c2d614 Mon Sep 17 00:00:00 2001 From: Samit <285365963@qq.com> Date: Tue, 14 Apr 2026 14:20:38 +0800 Subject: [PATCH 26/76] [Perf] Optimize MP4 encoding latency in video generation (#2735) Signed-off-by: samithuang <285365963@qq.com> --- .../openai_api/test_video_server.py | 144 +++++++++++++----- vllm_omni/diffusion/utils/media_utils.py | 7 +- vllm_omni/entrypoints/openai/api_server.py | 27 +--- vllm_omni/entrypoints/openai/serving_video.py | 16 +- .../entrypoints/openai/video_api_utils.py | 29 +++- 5 files changed, 158 insertions(+), 65 deletions(-) diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py index fd7d4df60da..82c34f87e8f 100644 --- a/tests/entrypoints/openai_api/test_video_server.py +++ b/tests/entrypoints/openai_api/test_video_server.py @@ -69,7 +69,7 @@ def set_stage_configs_if_missing(self, stage_configs): if self.stage_configs is None: self.stage_configs = stage_configs - async def generate_videos(self, request, reference_id, *, reference_image=None): + async def generate_video_bytes(self, request, reference_id, *, reference_image=None): self.started.set() try: await asyncio.Future() @@ -137,15 +137,81 @@ def _wait_until(predicate, timeout_s: float = 2.0, interval_s: float = 0.02): raise AssertionError("Timed out waiting for 
condition") +def test_async_video_generation_bypasses_base64(test_client, mocker: MockerFixture): + """Regression test: Ensure async video generation saves raw bytes directly + without bouncing through base64 encoding.""" + # We mock _encode_video_bytes (the correct path) + mocker.patch( + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"raw-mp4-bytes", + ) + + # We assert that encode_video_base64 is never called + mock_base64 = mocker.patch( + "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", + side_effect=RuntimeError("Regression: async video path should not base64 encode"), + ) + + response = test_client.post( + "/v1/videos", + data={"prompt": "A base64 test."}, + ) + assert response.status_code == 200 + video_id = response.json()["id"] + + # Wait for completion. If it used base64, the RuntimeError would fail the task + _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value) + mock_base64.assert_not_called() + + +def test_async_video_generation_with_audio_bypasses_base64(test_client, mocker: MockerFixture): + """Regression test: Ensure async video generation passes audio through + generate_video_bytes without bouncing through base64 encoding.""" + mock_encode = mocker.patch( + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"raw-mp4-bytes", + ) + + mock_base64 = mocker.patch( + "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", + side_effect=RuntimeError("Regression: async video path should not base64 encode"), + ) + + engine = test_client.app.state.openai_serving_video._engine_client + + async def _generate(prompt, request_id, sampling_params_list): + engine.captured_prompt = prompt + engine.captured_sampling_params_list = sampling_params_list + yield MockVideoResult([object()], audios=[object()], sample_rate=48000) + + engine.generate = _generate + + response = test_client.post( + "/v1/videos", + data={"prompt": "A base64 test with 
audio."}, + ) + assert response.status_code == 200 + video_id = response.json()["id"] + + _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value) + mock_base64.assert_not_called() + + mock_encode.assert_called_once() + kwargs = mock_encode.call_args.kwargs + assert "audio" in kwargs + assert kwargs["audio"] is not None + assert kwargs["audio_sample_rate"] == 48000 + + def test_t2v_video_generation_form(test_client, mocker: MockerFixture): fps_values = [] - def _fake_encode(video, fps): + def _fake_encode(video, fps, audio=None, audio_sample_rate=None, **kwargs): fps_values.append(fps) - return "Zg==" + return b"fake-video" mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", side_effect=_fake_encode, ) response = test_client.post( @@ -177,8 +243,8 @@ def test_i2v_video_generation_form(test_client, mocker: MockerFixture): image_bytes = _make_test_image_bytes((48, 32)) mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -203,8 +269,8 @@ def test_i2v_video_generation_resizes_input_to_requested_dimensions(test_client, image_bytes = _make_test_image_bytes((48, 32)) mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -229,8 +295,8 @@ def test_i2v_video_generation_resizes_input_to_requested_dimensions(test_client, def test_i2v_video_generation_with_image_reference_form(test_client, mocker: MockerFixture): mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + 
"vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -254,12 +320,12 @@ def test_i2v_video_generation_with_image_reference_form(test_client, mocker: Moc def test_seconds_defaults_fps_and_frames(test_client, mocker: MockerFixture): fps_values = [] - def _fake_encode(video, fps): + def _fake_encode(video, fps, audio=None, audio_sample_rate=None, **kwargs): fps_values.append(fps) - return "Zg==" + return b"fake-video" mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", side_effect=_fake_encode, ) response = test_client.post( @@ -283,8 +349,8 @@ def _fake_encode(video, fps): def test_size_param_sets_width_height(test_client, mocker: MockerFixture): mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -305,8 +371,8 @@ def test_size_param_sets_width_height(test_client, mocker: MockerFixture): def test_sampling_params_pass_through(test_client, mocker: MockerFixture): mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -337,10 +403,10 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture): def test_audio_sample_rate_comes_from_model_config(test_client, mocker: MockerFixture): audio_sample_rates = [] - def _fake_encode(video, fps, audio=None, audio_sample_rate=None): - del video, fps, audio + def _fake_encode(video, fps, audio=None, audio_sample_rate=None, video_codec_options=None): + del video, fps, audio, video_codec_options audio_sample_rates.append(audio_sample_rate) - 
return "Zg==" + return b"fake-video" engine = test_client.app.state.openai_serving_video._engine_client engine.model_config = SimpleNamespace( @@ -354,12 +420,14 @@ def _fake_encode(video, fps, audio=None, audio_sample_rate=None): async def _generate(prompt, request_id, sampling_params_list): engine.captured_prompt = prompt engine.captured_sampling_params_list = sampling_params_list - yield MockVideoResult([object()], audios=[object()]) + import numpy as np + + yield MockVideoResult([np.zeros((1, 64, 64, 3), dtype=np.uint8)], audios=[object()]) engine.generate = _generate mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", side_effect=_fake_encode, ) response = test_client.post( @@ -387,8 +455,8 @@ async def _generate(prompt, request_id, sampling_params_list): engine.generate = _generate mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post("/v1/videos", data={"prompt": "profile me"}) @@ -457,8 +525,8 @@ def test_invalid_seconds_returns_422(test_client): def test_negative_prompt_and_seed_pass_through(test_client, mocker: MockerFixture): mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -531,8 +599,8 @@ def test_video_request_validation(): def test_list_videos_supports_order_after_and_limit(test_client, mocker: MockerFixture): mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) ids = [] for i in range(3): @@ -600,8 +668,8 @@ def 
test_list_videos_supports_order_after_and_limit(test_client, mocker: MockerF def test_delete_completed_job_removes_file_and_metadata(test_client, mocker: MockerFixture): mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) create_resp = test_client.post("/v1/videos", data={"prompt": "Delete this video"}) assert create_resp.status_code == 200 @@ -672,8 +740,8 @@ def test_video_response_file_extension_is_robust(): def test_extra_params_merged_into_extra_args(test_client, mocker: MockerFixture): """extra_params JSON object is merged into sampling_params.extra_args.""" mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) extra_params = { "is_enable_stage2": True, @@ -703,8 +771,8 @@ def test_extra_params_merged_into_extra_args(test_client, mocker: MockerFixture) def test_extra_params_none_by_default(test_client, mocker: MockerFixture): """When extra_params is omitted, extra_args stays empty.""" mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -744,8 +812,8 @@ def test_extra_params_invalid_json(test_client): def test_extra_params_merged_with_existing_extra_args(test_client, mocker: MockerFixture): """extra_params is merged on top of existing extra_args (e.g. 
flow_shift).""" mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", @@ -769,8 +837,8 @@ def test_extra_params_merged_with_existing_extra_args(test_client, mocker: Mocke def test_sample_solver_forwarded_via_extra_params(test_client, mocker: MockerFixture): """sample_solver can be passed through existing extra_params for Wan2.2 online serving.""" mocker.patch( - "vllm_omni.entrypoints.openai.serving_video.encode_video_base64", - return_value="Zg==", + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", ) response = test_client.post( "/v1/videos", diff --git a/vllm_omni/diffusion/utils/media_utils.py b/vllm_omni/diffusion/utils/media_utils.py index f96a28fbd71..a09cd459539 100644 --- a/vllm_omni/diffusion/utils/media_utils.py +++ b/vllm_omni/diffusion/utils/media_utils.py @@ -20,6 +20,7 @@ def mux_video_audio_bytes( video_codec: str = "h264", audio_codec: str = "aac", crf: str = "18", + video_codec_options: dict[str, str] | None = None, ) -> bytes: """Mux video frames and optional audio waveform into MP4 bytes. 
@@ -42,7 +43,11 @@ def mux_video_audio_bytes( v_stream.width = video_frames.shape[2] v_stream.height = video_frames.shape[1] v_stream.pix_fmt = "yuv420p" - v_stream.options = {"crf": crf} + + options = {"crf": str(crf)} + if video_codec_options: + options.update(video_codec_options) + v_stream.options = options a_stream = None if audio_waveform is not None: diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index defaa9822cc..6a65f443322 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1955,18 +1955,6 @@ def video_response_from_request(model_name: str, req: VideoGenerationRequest) -> return resp -async def decode_and_save_video_output(output: Any, file_name: str) -> str: - if not output.b64_json: - raise RuntimeError(f"Video output for {file_name} did not include b64_json content.") - - try: - video_bytes = base64.b64decode(output.b64_json) - except Exception as decode_exc: - raise RuntimeError(f"Failed to decode generated video payload for {file_name}") from decode_exc - - return await STORAGE_MANAGER.save(video_bytes, file_name) - - def _cleanup_video(video_id: str, output_path: str | None): try: if output_path is not None: @@ -1990,15 +1978,12 @@ async def _run_video_generation_job( started_at = time.perf_counter() output_path = None try: - response = await handler.generate_videos(request, video_id, reference_image=reference_image) - if not response.data: - raise RuntimeError("Video generation completed but returned no outputs.") - - if (video_count := len(response.data)) > 1: - logger.warning("Video request %s generated %s outputs but we only expected one.", video_id, video_count) + video_bytes, stage_durations, peak_memory_mb = await handler.generate_video_bytes( + request, video_id, reference_image=reference_image + ) file_name = f"{video_id}.{job.file_extension}" - output_path = await decode_and_save_video_output(response.data[0], file_name) + 
output_path = await STORAGE_MANAGER.save(video_bytes, file_name) logger.info("Video request %s persisted %s output file.", video_id, output_path) await VIDEO_STORE.update_fields( @@ -2009,8 +1994,8 @@ async def _run_video_generation_job( "file_name": file_name, "completed_at": int(time.time()), "inference_time_s": time.perf_counter() - started_at, - "stage_durations": response.stage_durations, - "peak_memory_mb": response.peak_memory_mb, + "stage_durations": stage_durations, + "peak_memory_mb": peak_memory_mb, }, ) except Exception as exc: diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py index 3e05a1eedd0..0001fa65f89 100644 --- a/vllm_omni/entrypoints/openai/serving_video.py +++ b/vllm_omni/entrypoints/openai/serving_video.py @@ -178,17 +178,24 @@ async def generate_videos( reference_image: ReferenceImage | None = None, ) -> VideoGenerationResponse: artifacts = await self._run_and_extract(request, reference_id, reference_image=reference_image) + + video_codec_options = {"preset": "ultrafast", "threads": "0"} + if request.extra_params is not None and isinstance(request.extra_params, dict): + if "video_codec_options" in request.extra_params: + video_codec_options = request.extra_params["video_codec_options"] + _t_encode_start = time.perf_counter() video_data = [ VideoData( b64_json=( - encode_video_base64(video, fps=artifacts.output_fps) + encode_video_base64(video, fps=artifacts.output_fps, video_codec_options=video_codec_options) if artifacts.audios[idx] is None else encode_video_base64( video, fps=artifacts.output_fps, audio=artifacts.audios[idx], audio_sample_rate=artifacts.audio_sample_rate, + video_codec_options=video_codec_options, ) ) ) @@ -219,11 +226,18 @@ async def generate_video_bytes( len(artifacts.videos), ) audio = artifacts.audios[0] + + video_codec_options = {"preset": "ultrafast", "threads": "0"} + if request.extra_params is not None and isinstance(request.extra_params, dict): + if 
"video_codec_options" in request.extra_params: + video_codec_options = request.extra_params["video_codec_options"] + _t_encode_start = time.perf_counter() video_bytes = _encode_video_bytes( artifacts.videos[0], fps=artifacts.output_fps, **({"audio": audio, "audio_sample_rate": artifacts.audio_sample_rate} if audio is not None else {}), + video_codec_options=video_codec_options, ) _t_encode_ms = (time.perf_counter() - _t_encode_start) * 1000 logger.info("Video response encoding (MP4 bytes): %.2f ms", _t_encode_ms) diff --git a/vllm_omni/entrypoints/openai/video_api_utils.py b/vllm_omni/entrypoints/openai/video_api_utils.py index 69178fb3d3d..19354697928 100644 --- a/vllm_omni/entrypoints/openai/video_api_utils.py +++ b/vllm_omni/entrypoints/openai/video_api_utils.py @@ -202,7 +202,13 @@ def _coerce_audio_to_numpy(audio: Any) -> np.ndarray: return arr.astype(np.float32) -def _encode_video_bytes(video: Any, fps: int, audio: Any | None = None, audio_sample_rate: int | None = None) -> bytes: +def _encode_video_bytes( + video: Any, + fps: int, + audio: Any | None = None, + audio_sample_rate: int | None = None, + video_codec_options: dict[str, str] | None = None, +) -> bytes: """Encode a video payload into MP4 bytes, optionally muxing audio.""" from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes @@ -213,7 +219,13 @@ def _encode_video_bytes(video: Any, fps: int, audio: Any | None = None, audio_sa frames_np = np.stack(frames, axis=0) if frames_np.ndim == 4 and frames_np.shape[-1] == 4: frames_np = frames_np[..., :3] - frames_u8 = (np.clip(frames_np, 0.0, 1.0) * 255).round().clip(0, 255).astype(np.uint8) + + if frames_np.dtype == np.uint8: + frames_u8 = frames_np + else: + frames_np = np.clip(frames_np, 0.0, 1.0) + frames_np *= 255.0 + frames_u8 = np.round(frames_np).astype(np.uint8) audio_np = _coerce_audio_to_numpy(audio) if audio is not None else None @@ -222,10 +234,19 @@ def _encode_video_bytes(video: Any, fps: int, audio: Any | None = None, audio_sa 
audio_np, fps=float(fps), audio_sample_rate=audio_sample_rate or 24000, + video_codec_options=video_codec_options, ) -def encode_video_base64(video: Any, fps: int, audio: Any | None = None, audio_sample_rate: int | None = None) -> str: +def encode_video_base64( + video: Any, + fps: int, + audio: Any | None = None, + audio_sample_rate: int | None = None, + video_codec_options: dict[str, str] | None = None, +) -> str: """Encode a video (frames/array/tensor) to base64 MP4.""" - video_bytes = _encode_video_bytes(video, fps=fps, audio=audio, audio_sample_rate=audio_sample_rate) + video_bytes = _encode_video_bytes( + video, fps=fps, audio=audio, audio_sample_rate=audio_sample_rate, video_codec_options=video_codec_options + ) return base64.b64encode(video_bytes).decode("utf-8") From 48c30bc399b40cadb550b106f5846f0b3354bddd Mon Sep 17 00:00:00 2001 From: iancarrasco-b10 Date: Tue, 14 Apr 2026 02:49:13 -0400 Subject: [PATCH 27/76] [Qwen3-TTS] Remove hardcoded `distributed_executor_backend` to improve single-GPU performance (#2604) Signed-off-by: Ian Carrasco --- examples/online_serving/qwen3_tts/README.md | 48 +++++++++ .../stage_configs/qwen3_tts_uniproc.yaml | 97 +++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100644 vllm_omni/model_executor/stage_configs/qwen3_tts_uniproc.yaml diff --git a/examples/online_serving/qwen3_tts/README.md b/examples/online_serving/qwen3_tts/README.md index e53fa7392bc..b48db9cf453 100644 --- a/examples/online_serving/qwen3_tts/README.md +++ b/examples/online_serving/qwen3_tts/README.md @@ -378,6 +378,54 @@ Server -> Client: {"type": "session.done", "total_sentences": 1} ``` +## Choosing an Execution Backend: Uniproc vs Multiprocessing + +Qwen3-TTS stage configs support two execution backends controlled by the +`distributed_executor_backend` engine arg. 
The performance tradeoff between +them is **both hardware- and task-dependent**, so there is no single best +default (see [#2603](https://github.com/vllm-project/vllm-omni/issues/2603), +[#2604](https://github.com/vllm-project/vllm-omni/pull/2604) for the full +investigation). + +| Backend | Stage config setting | Behaviour | +| ------- | -------------------- | --------- | +| **Uniproc** (default, world_size=1) | `distributed_executor_backend` omitted | Both stages run inside the orchestrator process. Avoids IPC serialisation, D2H copies, and msgpack overhead between stages. | +| **Multiprocessing** | `distributed_executor_backend: "mp"` | Each stage runs in its own subprocess. The Talker can continue decoding while Code2Wav runs the vocoder in parallel, improving pipeline utilisation under concurrency. | + +> **Note:** When `distributed_executor_backend` is omitted and `world_size=1`, +> vLLM [automatically uses the uniproc executor](https://github.com/vllm-project/vllm/blob/main/vllm/config/parallel.py#L825). +> When `world_size > 1`, it defaults to `mp`. + +### When uniproc wins + +The uniproc path eliminates inter-process data transfer (D2H copies, +msgpack serialisation/deserialisation, tensor detaching). This matters most +when per-request processing is heavy relative to autoregressive decode. + +The Base cloning task involves reference-audio encoding on every request, making IPC +overhead a larger fraction of total cost. Qwen3-Omni shows a similar pattern. + +### When multiprocessing (`mp`) wins + +For lighter per-request workloads, process-level parallelism between the +Talker and Code2Wav stages dominates. + +CustomVoice is lighter per-request (no reference audio encoding), so the +process-level parallelism of `mp` outweighs its serialisation cost at +concurrency ≥ 4. 
+ +### How to switch + +To use the uniproc executor on a single-GPU setup, pass the +`qwen3_tts_uniproc.yaml` stage config: + +```bash +vllm serve Qwen/Qwen3-TTS-12Hz-1.7B-Base \ + --omni \ + --stage-configs-path vllm_omni/model_executor/stage_configs/qwen3_tts_uniproc.yaml \ + --port 8091 +``` + ## Limitations - **Single request**: Batch processing is not yet optimized for online serving. diff --git a/vllm_omni/model_executor/stage_configs/qwen3_tts_uniproc.yaml b/vllm_omni/model_executor/stage_configs/qwen3_tts_uniproc.yaml new file mode 100644 index 00000000000..d2e920806d2 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/qwen3_tts_uniproc.yaml @@ -0,0 +1,97 @@ +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + is_comprehension: true + runtime: + devices: "0" + engine_args: + model_stage: qwen3_tts + max_num_seqs: 10 + model_arch: Qwen3TTSTalkerForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: false + trust_remote_code: true + async_scheduling: true + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.3 + max_num_batched_tokens: 512 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_tts.talker2code2wav_async_chunk + # Use named connector to apply runtime.connectors.extra. 
+ output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.05 + stop_token_ids: [2150] + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + engine_args: + model_stage: code2wav + max_num_seqs: 1 + model_arch: Qwen3TTSCode2Wav + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: true + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.3 + # Must be divisible by num_code_groups and cover (left_context + chunk). + # Prefill length is Q * num_frames (e.g. 16 * 2148 = 34368); keep headroom past 32k. + max_num_batched_tokens: 65536 + # async_chunk appends windows per step; max_model_len must cover accumulated flat codec stream. + max_model_len: 65536 + engine_input_source: [0] + final_output: true + final_output_type: audio + # Distributed connector configuration + input_connectors: + from_stage_0: connector_of_shared_memory + tts_args: + max_instructions_length: 500 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: true + repetition_penalty: 1.0 + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + # Frame-aligned codec streaming transport. + codec_streaming: true + # Connector polling / timeout (unit: loop count, sleep interval in seconds). + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + # Match the decoder sliding attention window to avoid chunk-boundary noise. 
+ codec_chunk_frames: 25 + codec_left_context_frames: 72 + + edges: + - from: 0 + to: 1 + window_size: -1 From 17acd0589a26a84bd30733496d9ffedee7f8cb67 Mon Sep 17 00:00:00 2001 From: Zhang Jian Date: Tue, 14 Apr 2026 15:05:12 +0800 Subject: [PATCH 28/76] [Test] Add Stable Audio offline e2e TeaCache Test (#2377) Signed-off-by: Zhang Signed-off-by: Zhang Jian Co-authored-by: Claude Opus 4.6 (1M context) --- .buildkite/test-amd-merge.yml | 2 +- .buildkite/test-amd-ready.yaml | 2 +- .buildkite/test-merge.yml | 18 ---- .buildkite/test-ready.yml | 2 +- docs/contributing/ci/CI_5levels.md | 2 +- docs/contributing/ci/tests_style.md | 2 +- docs/user_guide/diffusion_features.md | 2 +- .../offline_inference/text_to_audio/README.md | 2 + .../text_to_audio/text_to_audio.py | 26 +++++ pyproject.toml | 1 + tests/conftest.py | 28 ++++-- .../test_stable_audio_expansion.py | 99 +++++++++++++++++++ .../test_stable_audio_model.py | 63 ------------ 13 files changed, 156 insertions(+), 93 deletions(-) create mode 100644 tests/e2e/offline_inference/test_stable_audio_expansion.py delete mode 100644 tests/e2e/offline_inference/test_stable_audio_model.py diff --git a/.buildkite/test-amd-merge.yml b/.buildkite/test-amd-merge.yml index b6f2037d18a..ac52f60b35b 100644 --- a/.buildkite/test-amd-merge.yml +++ b/.buildkite/test-amd-merge.yml @@ -54,7 +54,7 @@ steps: # - export GPU_ARCHS=gfx942 # - export VLLM_LOGGING_LEVEL=DEBUG # - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model - label: "Diffusion Cache Backend Test" agent_pool: mi325_1 diff --git a/.buildkite/test-amd-ready.yaml b/.buildkite/test-amd-ready.yaml index ced91635c25..30bbc769412 100644 --- a/.buildkite/test-amd-ready.yaml +++ b/.buildkite/test-amd-ready.yaml @@ -69,7 +69,7 @@ steps: # - export 
GPU_ARCHS=gfx942 # - export VLLM_LOGGING_LEVEL=DEBUG # - export VLLM_WORKER_MULTIPROC_METHOD=spawn -# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py +# - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model - label: "Diffusion Cache Backend Test" agent_pool: mi325_1 diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 24fc6dd3dc2..2a6cb6488a0 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -76,24 +76,6 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Audio Generation Model Test" - timeout_in_minutes: 20 - depends_on: upload-merge-pipeline - commands: - - pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py - agents: - queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - environment: - - "HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Diffusion Cache Backend Test" timeout_in_minutes: 15 depends_on: upload-merge-pipeline diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 13a812a62f3..2f749f0ee9f 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -123,7 +123,7 @@ steps: - label: "Audio Generation Model Test" depends_on: upload-ready-pipeline commands: - - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py + - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_expansion.py -m "advanced_model and diffusion and L4" --run-level advanced_model agents: queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU plugins: diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index 74ae1a38eb8..93060357385 100644 --- 
a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -242,7 +242,7 @@ vllm_omni/ tests/ ├── test_zimage_tensor_parallel.py ├── test_cache_dit.py ├── test_teacache.py - ├── test_stable_audio_model.py + ├── test_stable_audio_expansion.py ├── test_diffusion_cpu_offload.py ├── test_diffusion_layerwise_offload.py ├── test_diffusion_lora.py diff --git a/docs/contributing/ci/tests_style.md b/docs/contributing/ci/tests_style.md index 8b10cf4cc1c..69d5b16d7a5 100644 --- a/docs/contributing/ci/tests_style.md +++ b/docs/contributing/ci/tests_style.md @@ -147,7 +147,7 @@ vllm_omni/ tests/ ├── test_zimage_tensor_parallel.py ├── test_cache_dit.py ├── test_teacache.py - ├── test_stable_audio_model.py + ├── test_stable_audio_expansion.py ├── test_diffusion_cpu_offload.py ├── test_diffusion_layerwise_offload.py ├── test_diffusion_lora.py diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index ac140ff84a0..31cd1500fa0 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -147,7 +147,7 @@ The following tables show which models support each feature: | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | |-------|:----------:|:-----------:|:---------------------:|:--------------:|:-----------------:|:------:|:------------------------:|:--------------------:|:--------------:|:----------------:| -| **Stable-Audio-Open** | ❌ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | +| **Stable-Audio-Open** | ✅ | ❌ | ❓ | ❓ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ## Feature Compatibility diff --git a/examples/offline_inference/text_to_audio/README.md b/examples/offline_inference/text_to_audio/README.md index 7edc38092ad..50bab3e2f2d 100644 --- a/examples/offline_inference/text_to_audio/README.md +++ b/examples/offline_inference/text_to_audio/README.md @@ -23,6 +23,7 @@ python 
text_to_audio.py \ --guidance-scale 7.0 \ --audio-length 10.0 \ --num-inference-steps 100 \ + --cache-backend tea_cache \ --output stable_audio_output.wav ``` @@ -34,4 +35,5 @@ Key arguments: - `--guidance-scale`: classifier-free guidance scale. - `--audio-length`: audio duration in seconds. - `--num-inference-steps`: diffusion sampling steps.(more steps = higher quality, slower). +- `--cache-backend`: cache acceleration backend. Stable Audio currently supports `tea_cache`. - `--output`: path to save the generated WAV file. diff --git a/examples/offline_inference/text_to_audio/text_to_audio.py b/examples/offline_inference/text_to_audio/text_to_audio.py index a6968c419f6..3adb3ad53a5 100644 --- a/examples/offline_inference/text_to_audio/text_to_audio.py +++ b/examples/offline_inference/text_to_audio/text_to_audio.py @@ -11,6 +11,7 @@ python text_to_audio.py --prompt "The sound of a dog barking" python text_to_audio.py --prompt "A piano playing a gentle melody" --audio-length 10.0 python text_to_audio.py --prompt "Thunder and rain sounds" --negative-prompt "Low quality" + python text_to_audio.py --prompt "A soft synth pad" --cache-backend tea_cache """ import argparse @@ -90,6 +91,23 @@ def parse_args() -> argparse.Namespace: default=44100, help="Sample rate for output audio (Stable Audio uses 44100 Hz).", ) + parser.add_argument( + "--cache-backend", + type=str, + default=None, + choices=["tea_cache"], + help=( + "Cache backend to use for acceleration. " + "Stable Audio currently supports 'tea_cache'. " + "Default: None (no cache acceleration)." 
+ ), + ) + parser.add_argument( + "--tea-cache-rel-l1-thresh", + type=float, + default=0.2, + help="[tea_cache] Threshold for accumulated relative L1 distance.", + ) parser.add_argument( "--enable-diffusion-pipeline-profiler", action="store_true", @@ -124,6 +142,11 @@ def save_audio(audio_data: np.ndarray, output_path: str, sample_rate: int = 4410 def main(): args = parse_args() generator = torch.Generator(device=current_omni_platform.device_type).manual_seed(args.seed) + cache_config = None + if args.cache_backend == "tea_cache": + cache_config = { + "rel_l1_thresh": args.tea_cache_rel_l1_thresh, + } print(f"\n{'=' * 60}") print("Stable Audio Open - Text-to-Audio Generation") @@ -134,12 +157,15 @@ def main(): print(f" Audio length: {args.audio_length}s") print(f" Inference steps: {args.num_inference_steps}") print(f" Guidance scale: {args.guidance_scale}") + print(f" Cache backend: {args.cache_backend if args.cache_backend else 'None (no acceleration)'}") print(f" Seed: {args.seed}") print(f"{'=' * 60}\n") # Initialize Omni with Stable Audio model omni = Omni( model=args.model, + cache_backend=args.cache_backend, + cache_config=cache_config, enable_diffusion_pipeline_profiler=args.enable_diffusion_pipeline_profiler, ) diff --git a/pyproject.toml b/pyproject.toml index e49aa6e3251..57a4b474fd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -182,6 +182,7 @@ markers = [ "H100: Tests that require H100 GPU", "L4: Tests that require L4 GPU", "MI325: Tests that require MI325 GPU (AMD/ROCm)", + "B60: Tests that require Intel Arc Pro B60 XPU", "S5000: Tests that require S5000 GPU (Moore Threads/MUSA)", "A2: Tests that require A2 NPU", "A3: Tests that require A3 NPU", diff --git a/tests/conftest.py b/tests/conftest.py index 9c739533b83..e41d15bdf56 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -167,7 +167,6 @@ def assert_audio_diffusion_response( Validate audio diffusion response. 
""" raise NotImplementedError("Audio validation is not implemented yet") - # consider using assert_audio_valid defined above def _maybe_int(value: Any) -> int | None: @@ -277,15 +276,32 @@ def assert_video_valid( pass -def assert_audio_valid(path: Path, *, sample_rate: int, channels: int, duration_s: float) -> None: - """Assert the WAV has the expected sample rate, channel count, and duration.""" +def assert_audio_valid( + audio_or_path: Path | np.ndarray, + *, + sample_rate: int, + channels: int, + duration_s: float, +) -> None: + """Assert WAV file or (batch, channels, samples) ndarray matches expected audio format.""" + expected_samples = int(duration_s * sample_rate) + if isinstance(audio_or_path, np.ndarray): + audio = audio_or_path + assert audio.ndim == 3, f"Expected audio ndim=3 (batch, channels, samples), got shape {audio.shape}" + assert audio.shape[0] == 1, f"Expected batch size 1, got {audio.shape[0]}" + assert audio.shape[1] == channels, f"Expected {channels} channels, got {audio.shape[1]}" + assert audio.shape[2] == expected_samples, ( + f"Expected {expected_samples} samples ({duration_s}s @ {sample_rate} Hz), got {audio.shape[2]}" + ) + return + + path = audio_or_path assert path.exists(), f"Audio not found: {path}" info = sf.info(str(path)) assert info.samplerate == sample_rate, f"Expected sample_rate={sample_rate}, got {info.samplerate}" assert info.channels == channels, f"Expected {channels} channel(s), got {info.channels}" - expected_frames = int(duration_s * sample_rate) - assert info.frames == expected_frames, ( - f"Expected {expected_frames} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}" + assert info.frames == expected_samples, ( + f"Expected {expected_samples} frames ({duration_s}s @ {sample_rate} Hz), got {info.frames}" ) diff --git a/tests/e2e/offline_inference/test_stable_audio_expansion.py b/tests/e2e/offline_inference/test_stable_audio_expansion.py new file mode 100644 index 00000000000..54c1799e145 --- /dev/null +++ 
b/tests/e2e/offline_inference/test_stable_audio_expansion.py @@ -0,0 +1,99 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Stable Audio offline e2e: real weights, FP8 + TeaCache (single job to save GPU). + +NOTE: This test instantiates Omni directly instead of using the omni_runner +fixture (introduced in PR #2711) because the fixture's parametrize interface +only accepts (model, stage_config_path) and does not support extra kwargs like +quantization, cache_backend, or cache_config. +""" + +from __future__ import annotations + +import numpy as np +import pytest +import torch + +from tests.conftest import assert_audio_valid +from tests.utils import hardware_test +from vllm_omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput +from vllm_omni.platforms import current_omni_platform + +_SAMPLE_RATE = 44100 +_CLIP_DURATION_S = 2.0 + + +def generate_stable_audio_short_clip( + omni: Omni, + *, + audio_start_in_s: float = 0.0, + audio_end_in_s: float = 2.0, + num_inference_steps: int = 4, + seed: int = 42, +) -> np.ndarray: + """Run a minimal Stable Audio generation and return audio as (batch, channels, samples).""" + outputs = omni.generate( + prompts={ + "prompt": "The sound of a dog barking", + "negative_prompt": "Low quality.", + }, + sampling_params_list=OmniDiffusionSamplingParams( + num_inference_steps=num_inference_steps, + guidance_scale=7.0, + generator=torch.Generator(current_omni_platform.device_type).manual_seed(seed), + num_outputs_per_prompt=1, + extra_args={ + "audio_start_in_s": audio_start_in_s, + "audio_end_in_s": audio_end_in_s, + }, + ), + ) + + assert outputs is not None + first_output = outputs[0] + # Outer OmniRequestOutput.final_output_type comes from get_stage_metadata. + # The nested request_output is the worker OmniRequestOutput + # (e.g. final_output_type="audio") and holds the multimodal payload. 
+ # Follow-up: add StableAudioPipeline stage YAML, and pass model into + # _create_default_diffusion_stage_cfg so default diffusion metadata can set + # final_output_type to "audio" for future audio pipelines without YAML. + assert first_output.final_output_type == "image" + assert hasattr(first_output, "request_output") and first_output.request_output + + req_out = first_output.request_output + assert isinstance(req_out, OmniRequestOutput) + assert req_out.final_output_type == "audio" + assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output + audio = req_out.multimodal_output.get("audio") + assert isinstance(audio, np.ndarray) + return audio + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@pytest.mark.cache +@hardware_test(res={"cuda": "L4", "xpu": "B60"}) +def test_stable_audio_quantization_and_teacache() -> None: + """Stable Audio Open on real Hub weights with FP8 + TeaCache (covers former L2 smoke + L4 features). + + CI should provide ``HF_TOKEN`` if the checkpoint is gated. 
+ """ + m = Omni( + model="stabilityai/stable-audio-open-1.0", + quantization="fp8", + cache_backend="tea_cache", + cache_config={"rel_l1_thresh": 0.2}, + ) + try: + audio = generate_stable_audio_short_clip(m) + assert_audio_valid( + audio, + sample_rate=_SAMPLE_RATE, + channels=2, + duration_s=_CLIP_DURATION_S, + ) + finally: + m.close() diff --git a/tests/e2e/offline_inference/test_stable_audio_model.py b/tests/e2e/offline_inference/test_stable_audio_model.py deleted file mode 100644 index 21d75aad52a..00000000000 --- a/tests/e2e/offline_inference/test_stable_audio_model.py +++ /dev/null @@ -1,63 +0,0 @@ -import numpy as np -import pytest -import torch - -from tests.utils import hardware_test -from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from vllm_omni.outputs import OmniRequestOutput -from vllm_omni.platforms import current_omni_platform - -# Use random weights model for CI testing (small, no authentication required) -models = ["linyueqian/stable_audio_random"] - -# omni_runner expects (model, stage_configs_path); single-stage diffusion has no YAML. 
-test_params = [(m, None) for m in models] - - -@pytest.mark.core_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "L4", "xpu": "B60"}) -@pytest.mark.parametrize("omni_runner", test_params, indirect=True) -def test_stable_audio_model(omni_runner): - # Use minimal settings for testing - # Generate a short 2-second audio clip with minimal inference steps - audio_start_in_s = 0.0 - audio_end_in_s = 2.0 # Short duration for fast testing - sample_rate = 44100 # Stable Audio uses 44100 Hz - - outputs = omni_runner.omni.generate( - prompts={ - "prompt": "The sound of a dog barking", - "negative_prompt": "Low quality.", - }, - sampling_params_list=OmniDiffusionSamplingParams( - num_inference_steps=4, # Minimal steps for speed - guidance_scale=7.0, - generator=torch.Generator(current_omni_platform.device_type).manual_seed(42), - num_outputs_per_prompt=1, - extra_args={ - "audio_start_in_s": audio_start_in_s, - "audio_end_in_s": audio_end_in_s, - }, - ), - ) - - # Extract audio from OmniRequestOutput - assert outputs is not None - first_output = outputs[0] - assert first_output.final_output_type == "image" - assert hasattr(first_output, "request_output") and first_output.request_output - - req_out = first_output.request_output - assert isinstance(req_out, OmniRequestOutput) - assert req_out.final_output_type == "audio" - assert hasattr(req_out, "multimodal_output") and req_out.multimodal_output - audio = req_out.multimodal_output.get("audio") - assert isinstance(audio, np.ndarray) - # audio shape: (batch, channels, samples) - # For stable-audio-open-1.0: sample_rate=44100, so 2 seconds = 88200 samples - assert audio.ndim == 3 - assert audio.shape[0] == 1 # batch size - assert audio.shape[1] == 2 # stereo channels - expected_samples = int((audio_end_in_s - audio_start_in_s) * sample_rate) - assert audio.shape[2] == expected_samples # 88200 samples for 2 seconds From 6d01a8b506a2a28a7aedc1ffd5c989a407b0bd70 Mon Sep 17 00:00:00 2001 From: NATURE Date: Tue, 14 Apr 2026 
16:06:37 +0800 Subject: [PATCH 29/76] [Omni Connector] Omni Transfer Engine Connector: Enable 1-receiver-to-N-senders to support Bagel TP/CFG parallel (#2731) Signed-off-by: natureofnature --- .../omni_connectors/test_shm_connector.py | 184 ++++++++++++++++++ .../omni_connectors/connectors/base.py | 10 +- .../connectors/mooncake_store_connector.py | 19 +- .../mooncake_transfer_engine_connector.py | 178 +++++++++++------ .../connectors/shm_connector.py | 113 ++++++++--- .../omni_connectors/utils/initialization.py | 5 + 6 files changed, 422 insertions(+), 87 deletions(-) create mode 100644 tests/distributed/omni_connectors/test_shm_connector.py diff --git a/tests/distributed/omni_connectors/test_shm_connector.py b/tests/distributed/omni_connectors/test_shm_connector.py new file mode 100644 index 00000000000..e702318e3f3 --- /dev/null +++ b/tests/distributed/omni_connectors/test_shm_connector.py @@ -0,0 +1,184 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for SharedMemoryConnector focusing on TP / CFG / metadata fallback.""" + +import pytest + +from vllm_omni.distributed.omni_connectors.connectors.shm_connector import ( + SharedMemoryConnector, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +@pytest.fixture() +def connector(): + c = SharedMemoryConnector({"shm_threshold_bytes": 64}) + yield c + c.close() + + +# ── Key-based read (the fundamental SHM path) ──────────────────────── + + +class TestKeyBasedReadWrite: + def test_put_then_get_by_key(self, connector): + data = {"hello": "world", "n": 42} + ok, size, meta = connector.put("s0", "s1", "test_key_1", data) + assert ok + assert size > 0 + assert "shm" in meta + assert "test_key_1" in connector._pending_keys + + result = connector.get("s0", "s1", "test_key_1", metadata=None) + assert result is not None + obj, rsize = result + assert obj == data + assert rsize == size + assert "test_key_1" not in 
connector._pending_keys + + def test_get_nonexistent_key_returns_none(self, connector): + result = connector.get("s0", "s1", "no_such_key_xyz", metadata=None) + assert result is None + + def test_rank_aware_keys_independent(self, connector): + """Each TP rank writes/reads its own key — simulates homogeneous TP.""" + payloads = {} + for rank in range(4): + key = f"req1_s0_0_{rank}_{rank}" + data = {"rank": rank, "values": list(range(rank, rank + 3))} + ok, _, _ = connector.put("s0", "s1", key, data) + assert ok + payloads[rank] = data + + for rank in range(4): + key = f"req1_s0_0_{rank}_{rank}" + result = connector.get("s0", "s1", key, metadata=None) + assert result is not None + obj, _ = result + assert obj == payloads[rank] + + +# ── Metadata fallback behaviour ────────────────────────────────────── + + +class TestMetadataFallback: + def test_rdma_style_metadata_falls_back_to_key(self, connector): + """source_host/source_port metadata should be ignored; key read used.""" + data = {"payload": True} + connector.put("s0", "s1", "fb_key_1", data) + + rdma_meta = {"source_host": "10.0.0.1", "source_port": 12345} + result = connector.get("s0", "s1", "fb_key_1", metadata=rdma_meta) + assert result is not None + obj, _ = result + assert obj == data + + def test_non_dict_metadata_falls_back_to_key(self, connector): + data = {"val": 99} + connector.put("s0", "s1", "fb_key_2", data) + + result = connector.get("s0", "s1", "fb_key_2", metadata="not_a_dict") + assert result is not None + obj, _ = result + assert obj == data + + def test_empty_dict_metadata_falls_back_to_key(self, connector): + data = {"x": 1} + connector.put("s0", "s1", "fb_key_3", data) + + result = connector.get("s0", "s1", "fb_key_3", metadata={}) + assert result is not None + obj, _ = result + assert obj == data + + def test_shm_handle_metadata_still_works(self, connector): + """When metadata contains a proper 'shm' handle, use it directly.""" + data = {"direct": True} + ok, size, meta = connector.put("s0", 
"s1", "shm_direct_1", data) + assert ok + result = connector.get("s0", "s1", "shm_direct_1", metadata=meta) + assert result is not None + obj, _ = result + assert obj == data + + def test_metadata_keyed_by_request_id(self, connector): + """Metadata wrapped as {get_key: actual_meta} should be unwrapped.""" + data = {"wrapped": True} + ok, size, meta = connector.put("s0", "s1", "wrap_key", data) + assert ok + wrapped = {"wrap_key": meta} + result = connector.get("s0", "s1", "wrap_key", metadata=wrapped) + assert result is not None + obj, _ = result + assert obj == data + + +# ── Heterogeneous TP multi-key read ────────────────────────────────── + + +class TestHeteroTPMultiKey: + def test_receiver_reads_multiple_sender_keys(self, connector): + """Simulates from_tp=2 -> to_tp=1: receiver reads 2 keys and merges.""" + for sender_rank in range(2): + key = f"req1_s0_0_{sender_rank}_0" + data = {"sender": sender_rank, "shard": [sender_rank * 10]} + connector.put("s0", "s1", key, data) + + shards = [] + for sender_rank in range(2): + key = f"req1_s0_0_{sender_rank}_0" + result = connector.get("s0", "s1", key, metadata=None) + assert result is not None + obj, _ = result + shards.append(obj) + + assert len(shards) == 2 + assert shards[0]["sender"] == 0 + assert shards[1]["sender"] == 1 + + def test_sender_writes_multiple_receiver_keys(self, connector): + """Simulates from_tp=1 -> to_tp=2: sender writes 2 sliced keys.""" + for recv_rank in range(2): + key = f"req1_s0_0_0_{recv_rank}" + data = {"target": recv_rank, "slice": list(range(recv_rank, recv_rank + 2))} + connector.put("s0", "s1", key, data) + + for recv_rank in range(2): + key = f"req1_s0_0_0_{recv_rank}" + result = connector.get("s0", "s1", key, metadata=None) + assert result is not None + obj, _ = result + assert obj["target"] == recv_rank + + +# ── Cleanup ────────────────────────────────────────────────────────── + + +class TestCleanup: + def test_cleanup_removes_unconsumed_segment(self, connector): + data = 
{"leak": True} + connector.put("s0", "s1", "cleanup_req_42", data) + assert "cleanup_req_42" in connector._pending_keys + + connector.cleanup("req_42") + assert "cleanup_req_42" not in connector._pending_keys + + result = connector.get("s0", "s1", "cleanup_req_42", metadata=None) + assert result is None + + def test_cleanup_noop_for_consumed_segment(self, connector): + data = {"consumed": True} + connector.put("s0", "s1", "consumed_req_99", data) + connector.get("s0", "s1", "consumed_req_99", metadata=None) + + connector.cleanup("req_99") + assert "consumed_req_99" not in connector._pending_keys + + def test_close_cleans_all_pending(self, connector): + for i in range(3): + connector.put("s0", "s1", f"close_test_{i}", {"i": i}) + + assert len(connector._pending_keys) == 3 + connector.close() + assert len(connector._pending_keys) == 0 diff --git a/vllm_omni/distributed/omni_connectors/connectors/base.py b/vllm_omni/distributed/omni_connectors/connectors/base.py index 83edb2ab0ae..0df428f2ff5 100644 --- a/vllm_omni/distributed/omni_connectors/connectors/base.py +++ b/vllm_omni/distributed/omni_connectors/connectors/base.py @@ -34,13 +34,21 @@ def put(self, from_stage: str, to_stage: str, put_key: str, data: Any) -> tuple[ pass @abstractmethod - def get(self, from_stage: str, to_stage: str, get_key: str, metadata=None) -> tuple[Any, int] | None: + def get( + self, from_stage: str, to_stage: str, get_key: str, metadata: dict[str, Any] | None = None + ) -> tuple[Any, int] | None: """Retrieve Python object and payload size (bytes). Args: from_stage: Source stage identifier to_stage: Destination stage identifier get_key: Unique request identifier + metadata: Optional transport-specific metadata. When provided, + the connector uses it directly (e.g. source_host, source_port, + data_size) instead of querying the sender. 
For heterogeneous + TP the manager may supply partial metadata (host/port only); + the connector will query the sender at that address to fill + in data_size. Returns: Tuple of (Python object, serialized byte size) if found, None otherwise diff --git a/vllm_omni/distributed/omni_connectors/connectors/mooncake_store_connector.py b/vllm_omni/distributed/omni_connectors/connectors/mooncake_store_connector.py index c672e35f793..fa1fc3286db 100644 --- a/vllm_omni/distributed/omni_connectors/connectors/mooncake_store_connector.py +++ b/vllm_omni/distributed/omni_connectors/connectors/mooncake_store_connector.py @@ -78,7 +78,24 @@ def put(self, from_stage: str, to_stage: str, put_key: str, data: Any) -> tuple[ try: serialized_data = self.serialize_obj(data) key = self._make_key(put_key, from_stage, to_stage) - self.store.put(key, serialized_data, self.pin) + put_rc = self.store.put(key, serialized_data, self.pin) + + if isinstance(put_rc, bool): + put_ok = put_rc + else: + put_ok = put_rc is None or put_rc == 0 + + if not put_ok: + self._metrics["errors"] += 1 + logger.error( + "MooncakeStoreConnector put failed for %s (%s -> %s), rc=%r, %d bytes", + key, + from_stage, + to_stage, + put_rc, + len(serialized_data), + ) + return False, 0, None self._metrics["puts"] += 1 self._metrics["bytes_transferred"] += len(serialized_data) diff --git a/vllm_omni/distributed/omni_connectors/connectors/mooncake_transfer_engine_connector.py b/vllm_omni/distributed/omni_connectors/connectors/mooncake_transfer_engine_connector.py index 96a528963f4..bd4160f3e63 100644 --- a/vllm_omni/distributed/omni_connectors/connectors/mooncake_transfer_engine_connector.py +++ b/vllm_omni/distributed/omni_connectors/connectors/mooncake_transfer_engine_connector.py @@ -230,16 +230,19 @@ class MooncakeTransferEngineConnector(OmniConnectorBase): sender immediately cleans up the buffer (``cleanup()``), so only the first receiver to pull a given key will succeed. 
Broadcast / multicast (1 sender → N receivers sharing the same data) is not yet supported. - - **1 receiver → 1 sender**: ``update_sender_info()`` stores a single - ``(sender_host, sender_zmq_port)`` pair, so a receiver can only query - metadata from one sender at a time. + - **1 receiver → N senders**: Supported via partial metadata. The + manager constructs metadata with the target sender's + ``source_host`` / ``source_port`` (computed from ``from_rank``) + and passes it to ``get(metadata=...)``. The connector detects + that ``data_size`` is missing, queries the specified sender at + the given address to fill it in, then performs the RDMA pull. + This enables heterogeneous TP (sender TP > receiver TP) where a + single receiver must pull KV shards from multiple sender ranks. Future work: - Support 1 sender → N receivers (e.g. reference-counted buffers, or explicit ``retain()`` / ``release()`` semantics so the buffer survives multiple pulls). - - Support 1 receiver → N senders (e.g. a sender registry mapping - ``get_key`` prefixes to different sender endpoints). """ # RDMA connector copies raw bytes/tensor directly to the memory pool @@ -267,6 +270,7 @@ def __init__(self, config: dict[str, Any]): self._req_local = threading.local() self._worker_local = threading.local() self._last_ttl_check: float = _time_mod.monotonic() + self._sender_endpoints: dict[int, tuple[str, int]] = {} self._metrics = { "puts": 0, @@ -408,16 +412,38 @@ def get_connection_info(self) -> dict[str, Any]: "can_put": self.can_put, } - def update_sender_info(self, sender_host: str, sender_zmq_port: int) -> None: - """ - Inject the sender's ZMQ endpoint into the receiver connector. - Used for NO METADATA GET calls.(E.g: KV-cache transfer path) - Must be called before using get() without metadata! - Otherwise, get() will raise an error. 
+ def update_sender_info( + self, + sender_host: str, + sender_zmq_port: int, + sender_rank: int | None = None, + ) -> None: + """Inject a sender's ZMQ endpoint into the receiver connector. + + When ``sender_rank`` is ``None`` (default), sets the single default + sender used by ``get()`` when no rank is specified — this preserves + backward-compatible 1:1 semantics. + + When ``sender_rank`` is an integer, the endpoint is stored in a + per-rank registry for internal use (e.g. by + ``_query_metadata_from_sender(sender_rank=R)``). """ - self.sender_host = sender_host - self.sender_zmq_port = sender_zmq_port - logger.info(f"Sender info updated: host={sender_host!r}, zmq_port={sender_zmq_port}") + if sender_rank is not None: + self._sender_endpoints[sender_rank] = (sender_host, sender_zmq_port) + logger.info( + "Sender info updated for rank %s: host=%r, zmq_port=%s", + sender_rank, + sender_host, + sender_zmq_port, + ) + else: + self.sender_host = sender_host + self.sender_zmq_port = sender_zmq_port + logger.info( + "Sender info updated (default): host=%r, zmq_port=%s", + sender_host, + sender_zmq_port, + ) def _get_local_ip(self) -> str: """ @@ -657,56 +683,75 @@ def put(self, from_stage: str, to_stage: str, put_key: str, data: Any) -> tuple[ logger.error(f"RDMA Put failed for {put_key}: {e}", exc_info=True) return False, 0, None - def _query_metadata_from_sender(self, get_key: str) -> dict[str, Any] | None: - """Query metadata from sender via ZMQ (fallback when ``metadata=None``). - - ``get()`` supports two metadata resolution paths:: - - get(metadata=?) 
- ├── metadata provided (adapter path) - │ → use metadata directly (source_host/port/data_size) - │ → RDMA pull - └── metadata=None (KV-transfer polling path) - → _query_metadata_from_sender(get_key) ← this method - │ - ├── sender_host resolved (via update_sender_info) - │ → ZMQ query → get data_size/is_fast_path - │ → construct metadata → RDMA pull - └── sender_host unresolved ("auto" / None) - → return None → caller retries or times out + def _resolve_sender_endpoint(self, sender_rank: int | None = None) -> tuple[str, int] | None: + """Return ``(host, zmq_port)`` for *sender_rank*. - For the second path, the caller must call - :meth:`update_sender_info` before ``get()`` to resolve the sender's ZMQ endpoint. - Support the two paths in case that the orchestrator pushes the request info - to different stages at the same time knowing metadata or not. + Resolution order: + 1. Per-rank registry (``_sender_endpoints[sender_rank]``) + 2. Default sender (``sender_host`` / ``sender_zmq_port``) + 3. ``None`` if nothing is configured. + """ + if sender_rank is not None and sender_rank in self._sender_endpoints: + return self._sender_endpoints[sender_rank] + host = getattr(self, "sender_host", None) + port = getattr(self, "sender_zmq_port", None) + if host and port and str(host).lower() != "auto": + return (host, int(port)) + return None + + def _query_metadata_at(self, get_key: str, host: str, port: int) -> dict[str, Any] | None: + """Query metadata from a sender endpoint via ZMQ. + + Returns ``{source_host, source_port, data_size, is_fast_path}`` + or ``None`` when the key is not found / the query fails. 
""" - zmq_addr = f"tcp://{self.sender_host}:{self.sender_zmq_port}" + zmq_addr = f"tcp://{host}:{port}" req_socket = self._get_req_socket(zmq_addr, timeout_ms=5000) - try: - # Send query request - query = QueryRequest(request_id=get_key) - req_socket.send(QUERY_INFO + msgspec.msgpack.encode(query)) + req_socket.send(QUERY_INFO + msgspec.msgpack.encode(QueryRequest(request_id=get_key))) resp = req_socket.recv() - if resp == INFO_NOT_FOUND: return None - - # Parse response query_resp = msgspec.msgpack.decode(resp, type=QueryResponse) return { - # source_host/source_port are used for verification - "source_host": self.sender_host, - "source_port": self.sender_zmq_port, + "source_host": host, + "source_port": port, "data_size": query_resp.data_size, "is_fast_path": query_resp.is_fast_path, } except Exception as e: - # Socket may be stuck in bad state after timeout; discard it self._invalidate_req_socket(zmq_addr) - logger.debug(f"Failed to query metadata for {get_key}: {e}") + logger.debug("Failed to query metadata at %s for %s: %s", zmq_addr, get_key, e) return None + def _query_metadata_from_sender(self, get_key: str, sender_rank: int | None = None) -> dict[str, Any] | None: + """Query metadata from sender via ZMQ (fallback when ``metadata=None``). + + ``get()`` supports three metadata resolution paths:: + + get(metadata=?) 
+ ├── Path 1: metadata has data_size (adapter path) + │ → use metadata directly → RDMA pull + ├── Path 2: metadata has source_host/port but no data_size + │ → _query_metadata_at(host, port) → get data_size → RDMA pull + └── Path 3: metadata=None (KV-transfer polling path) + → _query_metadata_from_sender(get_key) ← this method + │ + ├── sender endpoint resolved (via update_sender_info) + │ → ZMQ query → get data_size/is_fast_path + │ → construct metadata → RDMA pull + └── sender endpoint unresolved + → return None → caller retries or times out + + When *sender_rank* is provided, the query is routed to that + rank's endpoint (registered via ``update_sender_info(rank=...)``). + Otherwise the default sender is used. + """ + endpoint = self._resolve_sender_endpoint(sender_rank) + if endpoint is None: + return None + return self._query_metadata_at(get_key, *endpoint) + def get( self, from_stage: str, @@ -714,12 +759,18 @@ def get( get_key: str, metadata: dict[str, Any] | None = None, ) -> tuple[Any, int] | None: - """ - Consumer Side. - Allocates from local pool and pulls data via RDMA. + """Consumer Side. Allocates from local pool and pulls data via RDMA. + + Metadata resolution: - If metadata is not provided, will attempt to query it from sender - using configured sender_host/sender_zmq_port. + 1. ``metadata`` provided **with** ``data_size`` → use directly (RDMA pull). + 2. ``metadata`` provided with ``source_host``/``source_port`` but + **without** ``data_size`` → query that specific sender for + ``data_size`` / ``is_fast_path``, then RDMA pull. This is the + heterogeneous-TP path where the manager knows the target sender + endpoint but not the payload size. + 3. ``metadata=None`` → query the default sender (set via + ``update_sender_info()``) for the full metadata. Returns: ``(data, size)`` on success, ``None`` on failure. @@ -727,9 +778,6 @@ def get( - **is_fast_path=True** (tensor *or* bytes payload): Returns ``(ManagedBuffer, size)``. 
**CALLER MUST call ``ManagedBuffer.release()`` after consuming.** - Note: even if the producer ``put()`` raw ``bytes``, the consumer - receives a ``ManagedBuffer`` — use ``buf.to_bytes()`` to obtain - a ``bytes`` copy, or ``buf.tensor`` for zero-copy access. - **is_fast_path=False** (serialized Python object): Returns ``(DeserializedObject, size)``. Buffer is auto-released internally after deserialization. @@ -741,9 +789,8 @@ def get( _t0 = _time_mod.perf_counter() - # If no metadata provided, try to query from sender if not metadata: - # Must insert sender info before using get() without metadata. + # Path 3: no metadata at all — query default sender if not self.sender_host or not self.sender_zmq_port or str(self.sender_host).lower() == "auto": raise RuntimeError( f"get(metadata=None) requires sender info to be resolved, " @@ -753,6 +800,21 @@ def get( metadata = self._query_metadata_from_sender(get_key) if not metadata: return None + elif "data_size" not in metadata: + # Path 2: partial metadata (host/port only) — query that sender + partial_host = metadata.get("source_host") + partial_port = metadata.get("source_port") + if not partial_host or not partial_port: + logger.warning( + "get(%s): partial metadata missing source_host/source_port, cannot resolve data_size. 
metadata=%s", + get_key, + metadata, + ) + return None + queried = self._query_metadata_at(get_key, str(partial_host), int(partial_port)) + if not queried: + return None + metadata = queried _t1 = _time_mod.perf_counter() _query_ms = (_t1 - _t0) * 1000 diff --git a/vllm_omni/distributed/omni_connectors/connectors/shm_connector.py b/vllm_omni/distributed/omni_connectors/connectors/shm_connector.py index 5c7384c1f8b..6cf5c2f15b5 100644 --- a/vllm_omni/distributed/omni_connectors/connectors/shm_connector.py +++ b/vllm_omni/distributed/omni_connectors/connectors/shm_connector.py @@ -15,9 +15,13 @@ class SharedMemoryConnector(OmniConnectorBase): - """ - Connector that uses SharedMemory for large objects and inline data for small objects. - Acts as a unified replacement for the legacy IPC fallback logic. + """Key-addressed local shared-memory connector. + + SHM is a local-only transport: it reads/writes POSIX shared memory + segments identified purely by *key*. It does **not** understand + remote-transport metadata such as ``source_host`` / ``source_port`` + (that is the RDMA connector's job). When such metadata is passed in, + the connector silently falls back to key-based lookup. 
""" def __init__(self, config: dict[str, Any]): @@ -25,6 +29,7 @@ def __init__(self, config: dict[str, Any]): self.stage_id = config.get("stage_id", -1) self.device = config.get("device", "cuda:0") self.threshold = int(config.get("shm_threshold_bytes", 65536)) + self._pending_keys: set[str] = set() self._metrics = { "puts": 0, "gets": 0, @@ -59,6 +64,7 @@ def put( # meta contains {'name': ..., 'size': ...} metadata = {"shm": meta, "size": size} + self._pending_keys.add(put_key) self._metrics["shm_writes"] += 1 else: # Inline - pass bytes directly to avoid double serialization of the object @@ -93,6 +99,28 @@ def _get_data_with_lock(self, lock_file: str, shm_handle: dict): if obj and os.path.exists(lock_file): os.remove(lock_file) + def _get_by_key(self, get_key: str) -> tuple[Any, int] | None: + """Read a SHM segment addressed purely by *get_key*.""" + shm = None + try: + shm = shm_pkg.SharedMemory(name=get_key) + if shm is None or shm.size == 0: + return None + lock_file = f"/dev/shm/shm_{get_key}_lockfile.lock" + shm_handle = {"name": get_key, "size": shm.size} + result = self._get_data_with_lock(lock_file, shm_handle) + if result is not None: + self._pending_keys.discard(get_key) + return result + except FileNotFoundError: + return None + except Exception: + logger.debug("_get_by_key: unexpected error reading SHM segment %s", get_key, exc_info=True) + return None + finally: + if shm: + shm.close() + def get( self, from_stage: str, @@ -101,16 +129,16 @@ def get( metadata=None, ) -> tuple[Any, int] | None: if metadata is not None: - # Some callers may wrap metadata by request id. 
if isinstance(metadata, dict) and get_key in metadata: metadata = metadata.get(get_key) if not isinstance(metadata, dict): - return None + return self._get_by_key(get_key) if "inline_bytes" in metadata: try: obj = self.deserialize_obj(metadata["inline_bytes"]) + self._pending_keys.discard(get_key) return obj, int(metadata.get("size", 0)) except Exception as e: logger.error(f"SharedMemoryConnector inline get failed for req {get_key}: {e}") @@ -119,33 +147,64 @@ def get( if "shm" in metadata: shm_handle = metadata["shm"] lock_file = f"/dev/shm/shm_{shm_handle['name']}_lockfile.lock" - return self._get_data_with_lock(lock_file, shm_handle) + result = self._get_data_with_lock(lock_file, shm_handle) + if result is not None: + self._pending_keys.discard(get_key) + return result - return None - shm = None - try: - shm = shm_pkg.SharedMemory(name=get_key) - if shm is None or shm.size == 0: - return None - lock_file = f"/dev/shm/shm_{get_key}_lockfile.lock" - shm_handle = {"name": get_key, "size": shm.size} - return self._get_data_with_lock(lock_file, shm_handle) - except Exception: - return None - finally: - if shm: - shm.close() + # Metadata is a dict but has no SHM-specific handle (e.g. RDMA- + # style source_host/source_port). Fall back to key-based read. + return self._get_by_key(get_key) + + return self._get_by_key(get_key) def cleanup(self, request_id: str) -> None: - # SHM segments are automatically unlinked during 'get' (shm_read_bytes). - # If 'get' is never called (e.g. error flow), the SHM segment might leak. - # A robust implementation might track created segments and unlink them here - # if they haven't been consumed. - # For now, we rely on the consumer to read and unlink. - pass + """Best-effort cleanup of unconsumed SHM segments for *request_id*. + + Matches pending keys where *request_id* appears as the full key, + as a ``_``-delimited prefix, or as a ``_``-delimited suffix. + If ``get()`` was never called, we unlink it here so /dev/shm + doesn't leak. 
+ """ + stale = [ + k + for k in self._pending_keys + if k == request_id or k.startswith(request_id + "_") or k.endswith("_" + request_id) + ] + for key in stale: + self._pending_keys.discard(key) + try: + seg = shm_pkg.SharedMemory(name=key) + seg.close() + seg.unlink() + logger.debug("cleanup: unlinked unconsumed SHM segment %s", key) + except FileNotFoundError: + pass + except Exception as e: + logger.debug("cleanup: failed to unlink SHM segment %s: %s", key, e) + lock_file = f"/dev/shm/shm_{key}_lockfile.lock" + if os.path.exists(lock_file): + try: + os.remove(lock_file) + except OSError: + pass def close(self) -> None: - pass + """Unlink all remaining tracked SHM segments.""" + for key in list(self._pending_keys): + try: + seg = shm_pkg.SharedMemory(name=key) + seg.close() + seg.unlink() + except Exception: + pass + lock_file = f"/dev/shm/shm_{key}_lockfile.lock" + if os.path.exists(lock_file): + try: + os.remove(lock_file) + except OSError: + pass + self._pending_keys.clear() def health(self) -> dict[str, Any]: return {"status": "healthy", "threshold": self.threshold, **self._metrics} diff --git a/vllm_omni/distributed/omni_connectors/utils/initialization.py b/vllm_omni/distributed/omni_connectors/utils/initialization.py index 37b7d0d7f83..0497bbb3a23 100644 --- a/vllm_omni/distributed/omni_connectors/utils/initialization.py +++ b/vllm_omni/distributed/omni_connectors/utils/initialization.py @@ -23,6 +23,11 @@ # collide with request-forwarding endpoints that share the same base port. KV_TRANSFER_PORT_OFFSET = 100 +# Port stride between TP ranks so each worker binds a unique ZMQ port +# when TP > 1. Must be larger than the maximum number of pipeline stages. 
+# Formula: zmq_port = base + KV_TRANSFER_PORT_OFFSET + rank * STRIDE + stage +KV_RANK_PORT_STRIDE = 16 + def initialize_connectors_from_config( config_path: str | Path | None = None, From 3229bae331cb7ad37a71bb19853dae62fff9b4ec Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Tue, 14 Apr 2026 18:33:31 +0800 Subject: [PATCH 30/76] [skip ci] fix docs, gdown remove --id param (#2787) Signed-off-by: rongfu.leng --- benchmarks/build_dataset/download_process_data_seedtts.md | 4 ++-- benchmarks/qwen3-omni/README.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/build_dataset/download_process_data_seedtts.md b/benchmarks/build_dataset/download_process_data_seedtts.md index ec16f64424a..faf072303b8 100644 --- a/benchmarks/build_dataset/download_process_data_seedtts.md +++ b/benchmarks/build_dataset/download_process_data_seedtts.md @@ -27,7 +27,7 @@ pip install gdown Download the dataset from Google Drive: ```bash -gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP +gdown 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP ``` ### 4. 
Extract the Dataset @@ -74,7 +74,7 @@ rm meta.lst # Full setup and benchmark cd benchmarks/build_dataset pip install gdown -gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP +gdown 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP tar -xf seedtts_testset.tar cp seedtts_testset/en/meta.lst meta.lst python extract_tts_prompts.py -i meta.lst -o top100.txt -n 100 diff --git a/benchmarks/qwen3-omni/README.md b/benchmarks/qwen3-omni/README.md index de27c05c2c4..dc282d0525f 100644 --- a/benchmarks/qwen3-omni/README.md +++ b/benchmarks/qwen3-omni/README.md @@ -9,7 +9,7 @@ cd benchmarks/build_dataset pip install gdown # Download SeedTTS test set from Google Drive -gdown --id 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP +gdown 1GlSjVfSHkW3-leKKBlfrjuuTGqQ_xaLP # Extract tar -xf seedtts_testset.tar From 159d6558ea55ef59b3c57cf512e8114b62cd881e Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Tue, 14 Apr 2026 19:36:02 +0800 Subject: [PATCH 31/76] [Tests][Qwen3-Omni]Add test cases for long videos and long audios. (#2598) Signed-off-by: amy-why-3459 --- .../test_qwen3_omni_expansion.py | 159 ++++++------------ 1 file changed, 54 insertions(+), 105 deletions(-) diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index 1637627695e..3065439084a 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -29,6 +29,16 @@ IMAGE_KEY = ["square", "quadrate", "rectangle"] VIDEO_KEY = ["sphere", "globe", "circle", "round", "ball"] +# Heavier synthetic inputs than the default expansion cases (longer timeline / more pixels). +# Long video: 120s @ 30fps => 3600 frames (generate_synthetic_video in tests/conftest.py). +# Use 224² spatial size to bound RAM (~W*H*num_frames*3) vs. 288² at this frame count. 
+LONG_VIDEO_WIDTH = 224 +LONG_VIDEO_HEIGHT = 224 +LONG_VIDEO_FRAMES = 3600 +LARGE_IMAGE_WIDTH = 1920 +LARGE_IMAGE_HEIGHT = 1080 +LONG_AUDIO_DURATION_SEC = 120 + def get_chunk_config(default_path): path = modify_stage_config( @@ -37,7 +47,8 @@ def get_chunk_config(default_path): "async_chunk": True, "stage_args": { 0: { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk" + "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk", + "default_sampling_params.max_tokens": 2048, }, 1: { "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk" @@ -167,88 +178,17 @@ def test_text_to_text_audio_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_image_to_text_001(omni_server, openai_client) -> None: - """ - Input Modal: image - Output Modal: text - Input Setting: stream=True - Datasets: single request - """ - image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" - messages = dummy_messages_from_mix_data(image_data_url=image_data_url) - - request_config = { - "model": omni_server.model, - "messages": messages, - "modalities": ["text"], - "stream": True, - "key_words": {"image": IMAGE_KEY}, - } - - openai_client.send_omni_request(request_config) - - -@pytest.mark.advanced_model -@pytest.mark.omni -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_image_to_audio_001(omni_server, openai_client) -> None: - """ - Input Modal: image - Output Modal: audio - Input Setting: stream=False - Datasets: single request - """ - image_data_url = 
f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" - messages = dummy_messages_from_mix_data(image_data_url=image_data_url) - - request_config = { - "model": omni_server.model, - "messages": messages, - "modalities": ["audio"], - "key_words": {"image": IMAGE_KEY}, - } - - openai_client.send_omni_request(request_config) - - -@pytest.mark.advanced_model -@pytest.mark.omni -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_image_to_text_audio_001(omni_server, openai_client) -> None: - """ - Input Modal: image - Output Modal: text, audio - Input Setting: stream=False - Datasets: few requests - """ - image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(1280, 720)['base64']}" - - messages = dummy_messages_from_mix_data(image_data_url=image_data_url) - - request_config = { - "model": omni_server.model, - "messages": messages, - "key_words": {"image": IMAGE_KEY}, - } - - openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) - - -@pytest.mark.advanced_model -@pytest.mark.omni -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_video_to_text_001(omni_server, openai_client) -> None: +def test_text_video_to_text_001(omni_server, openai_client) -> None: """ - Input Modal: video + Input Modal: long synthetic video (120s @ 30fps, LONG_VIDEO_FRAMES frames) Output Modal: text Input Setting: stream=False Datasets: single request """ - video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}" - messages = dummy_messages_from_mix_data(video_data_url=video_data_url) + video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(LONG_VIDEO_WIDTH, LONG_VIDEO_HEIGHT, LONG_VIDEO_FRAMES)['base64']}" + messages = dummy_messages_from_mix_data( + video_data_url=video_data_url, 
system_prompt=get_system_prompt(), content_text=get_prompt("text_video") + ) request_config = { "model": omni_server.model, @@ -257,28 +197,29 @@ def test_video_to_text_001(omni_server, openai_client) -> None: "key_words": {"video": VIDEO_KEY}, } - openai_client.send_omni_request(request_config) + openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) @pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_video_to_audio_001(omni_server, openai_client) -> None: +@pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True) +def test_text_audio_to_text_audio_001(omni_server, openai_client) -> None: """ - Input Modal: video - Output Modal: audio + Input Modal: text, audio + Output Modal: text, audio Input Setting: stream=False Datasets: single request """ - video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}" - messages = dummy_messages_from_mix_data(video_data_url=video_data_url) + audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(5, 1)['base64']}" + messages = dummy_messages_from_mix_data( + audio_data_url=audio_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_audio") + ) request_config = { "model": omni_server.model, "messages": messages, - "modalities": ["audio"], - "key_words": {"video": VIDEO_KEY}, + "key_words": {"audio": AUDIO_KEY}, } openai_client.send_omni_request(request_config) @@ -287,22 +228,25 @@ def test_video_to_audio_001(omni_server, openai_client) -> None: @pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_video_to_text_audio_001(omni_server, openai_client) -> None: +@pytest.mark.parametrize("omni_server", test_params + test_token_params, 
indirect=True) +def test_text_audio_to_text_audio_002(omni_server, openai_client) -> None: """ - Input Modal: video + Input Modal: text, long-duration audio (~LONG_AUDIO_DURATION_SEC s WAV) Output Modal: text, audio Input Setting: stream=False - Datasets: few requests + Datasets: single request """ - video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}" - - messages = dummy_messages_from_mix_data(video_data_url=video_data_url) + audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(LONG_AUDIO_DURATION_SEC, 1)['base64']}" + messages = dummy_messages_from_mix_data( + audio_data_url=audio_data_url, + system_prompt=get_system_prompt(), + content_text=get_prompt("text_audio"), + ) request_config = { "model": omni_server.model, "messages": messages, - "key_words": {"video": VIDEO_KEY}, + "key_words": {"audio": AUDIO_KEY}, } openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) @@ -312,22 +256,23 @@ def test_video_to_text_audio_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True) -def test_text_audio_to_text_audio_001(omni_server, openai_client) -> None: +def test_text_image_to_text_audio_001(omni_server, openai_client) -> None: """ - Input Modal: text, audio + Input Modal: text, image Output Modal: text, audio Input Setting: stream=False Datasets: single request """ - audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(5, 1)['base64']}" + image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" + messages = dummy_messages_from_mix_data( - audio_data_url=audio_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_audio") + image_data_url=image_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_image") ) request_config = { "model": 
omni_server.model, "messages": messages, - "key_words": {"audio": AUDIO_KEY}, + "key_words": {"image": IMAGE_KEY}, } openai_client.send_omni_request(request_config) @@ -337,17 +282,21 @@ def test_text_audio_to_text_audio_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True) -def test_text_image_to_text_audio_001(omni_server, openai_client) -> None: +def test_large_image_to_text_audio_001(omni_server, openai_client) -> None: """ - Input Modal: text, image + Input Modal: text, high-resolution image (1080p-class JPEG) Output Modal: text, audio Input Setting: stream=False Datasets: single request """ - image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" + image_data_url = ( + f"data:image/jpeg;base64,{generate_synthetic_image(LARGE_IMAGE_WIDTH, LARGE_IMAGE_HEIGHT)['base64']}" + ) messages = dummy_messages_from_mix_data( - image_data_url=image_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_image") + image_data_url=image_data_url, + system_prompt=get_system_prompt(), + content_text=get_prompt("text_image"), ) request_config = { @@ -356,7 +305,7 @@ def test_text_image_to_text_audio_001(omni_server, openai_client) -> None: "key_words": {"image": IMAGE_KEY}, } - openai_client.send_omni_request(request_config) + openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) @pytest.mark.advanced_model From f87674aa447b24fb305f3eafcab1e51b30e0d9a6 Mon Sep 17 00:00:00 2001 From: Hongsheng Liu Date: Tue, 14 Apr 2026 20:26:27 +0800 Subject: [PATCH 32/76] [skip ci]add skills (#2710) Signed-off-by: hsliuustc0106 --- .claude/skills/add-diffusion-model/SKILL.md | 534 ++++++++++++++++ .../references/cache-dit-patterns.md | 254 ++++++++ .../references/custom-model-patterns.md | 273 +++++++++ .../references/parallelism-patterns.md | 571 
++++++++++++++++++ .../references/transformer-adaptation.md | 218 +++++++ .../references/troubleshooting.md | 178 ++++++ .claude/skills/add-tts-model/SKILL.md | 284 +++++++++ .claude/skills/readme.md | 34 ++ .gitignore | 14 +- 9 files changed, 2359 insertions(+), 1 deletion(-) create mode 100644 .claude/skills/add-diffusion-model/SKILL.md create mode 100644 .claude/skills/add-diffusion-model/references/cache-dit-patterns.md create mode 100644 .claude/skills/add-diffusion-model/references/custom-model-patterns.md create mode 100644 .claude/skills/add-diffusion-model/references/parallelism-patterns.md create mode 100644 .claude/skills/add-diffusion-model/references/transformer-adaptation.md create mode 100644 .claude/skills/add-diffusion-model/references/troubleshooting.md create mode 100644 .claude/skills/add-tts-model/SKILL.md create mode 100644 .claude/skills/readme.md diff --git a/.claude/skills/add-diffusion-model/SKILL.md b/.claude/skills/add-diffusion-model/SKILL.md new file mode 100644 index 00000000000..a7e0bbf9a57 --- /dev/null +++ b/.claude/skills/add-diffusion-model/SKILL.md @@ -0,0 +1,534 @@ +--- +name: add-diffusion-model +description: Add a new diffusion model (text-to-image, text-to-video, image-to-video, text-to-audio, image editing) to vLLM-Omni, including Cache-DiT acceleration and parallelism support (TP, SP/USP, CFG-Parallel, HSDP). Use when integrating a new diffusion model, porting a diffusers pipeline or a custom model repo to vllm-omni, creating a new DiT transformer adapter, adding diffusion model support, or enabling multi-GPU parallelism and cache acceleration for an existing model. +--- + +# Adding a Diffusion Model to vLLM-Omni + +## Overview + +This skill guides you through adding a new diffusion model to vLLM-Omni. The model may come from HuggingFace Diffusers (structured pipeline) or from a private/custom repo. The workflow differs significantly depending on the source. + +## Prerequisites + +Before starting, determine: + +1. 
**Model category**: Text-to-Image, Text-to-Video, Image-to-Video, Image Editing, Text-to-Audio, or Omni +2. **Reference source**: Diffusers pipeline, custom repo, or a combination +3. **Model HuggingFace ID** or local checkpoint path +4. **Architecture**: Scheduler, text encoder, VAE, transformer/backbone + +## Step 0: Classify the Migration Path + +Check the model's HF repo for `model_index.json`. This determines your path: + +| Scenario | How to identify | Migration path | +|----------|----------------|----------------| +| **Already supported** | `_class_name` in `model_index.json` matches a key in `_DIFFUSION_MODELS` in `registry.py` | Skip to Step 5 (test) and Step 7 (docs) | +| **Diffusers-based** | Has standard `model_index.json` with `_diffusers_version`, subfolders for `transformer/`, `vae/`, etc. | Follow **Path A** below | +| **Custom/private repo** | No diffusers `model_index.json`, weights in non-standard format, custom model code in a separate git repo | Follow **Path B** below | +| **Hybrid** | Has some diffusers components (VAE) but custom transformer/fusion | Mix of Path A and Path B | + +## Path A: Diffusers-Based Model + +For models with a standard diffusers layout. See [references/transformer-adaptation.md](references/transformer-adaptation.md) for detailed code patterns. + +### A1. Analyze `model_index.json` + +Identify components: `transformer`, `scheduler`, `vae`, `text_encoder`, `tokenizer`. + +### A2. Create model directory + +``` +vllm_omni/diffusion/models/your_model_name/ +├── __init__.py +├── pipeline_your_model.py +└── your_model_transformer.py +``` + +### A3. Adapt transformer + +1. Copy from diffusers source. Remove mixins (`ModelMixin`, `ConfigMixin`, `AttentionModuleMixin`). +2. Replace attention with `vllm_omni.diffusion.attention.layer.Attention` (QKV shape: `[B, seq, heads, head_dim]`). +3. Add `od_config: OmniDiffusionConfig | None = None` to `__init__`. +4. 
Add `load_weights()` method mapping diffusers weight names to vllm-omni names. +5. Add class attributes: `_repeated_blocks`, `_layerwise_offload_blocks_attr`. + +### A4. Adapt pipeline + +Inherit from `nn.Module`. The key contract: + +```python +class YourPipeline(nn.Module): + def __init__(self, *, od_config: OmniDiffusionConfig, prefix: str = ""): + # Load VAE, text encoder, tokenizer via from_pretrained() + # Instantiate transformer (weights loaded later via weights_sources) + self.weights_sources = [ + DiffusersPipelineLoader.ComponentSource( + model_or_path=od_config.model, subfolder="transformer", + prefix="transformer.", fall_back_to_pt=True)] + + def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: + # Encode prompt → prepare latents → denoise loop → VAE decode + return DiffusionOutput(output=output) + + def load_weights(self, weights): + return AutoWeightsLoader(self).load_weights(weights) +``` + +Add post/pre-process functions in the same pipeline file. Register them in `registry.py`. + +### A5. Register, test, docs → continue at Step 4 below. + +--- + +## Path B: Custom/Private Repo Model + +For models without a diffusers pipeline — weights in custom format, model code in a private repo. Real examples: DreamID-Omni, BAGEL, HunyuanImage3. + +### B1. Understand the reference repo + +Study the original model's code to identify: +- **Model architecture files** (transformers, fusion modules, embeddings) +- **Weight format** (safetensors, `.pth`, custom checkpoint structure) +- **Weight loading helpers** (custom init functions, checkpoint loaders) +- **Pre/post-processing** (image/audio transforms, tokenization, VAE encode/decode) +- **External dependencies** (packages not on PyPI) +- **Config format** (JSON config files, hardcoded dicts) + +### B2. Decide what lives WHERE + +This is the key design decision for custom models. 
Follow these placement rules: + +| Code type | Where to place | Example | +|-----------|---------------|---------| +| **Pipeline orchestration** (init, forward, denoise loop) | `vllm_omni/diffusion/models/your_model_name/pipeline_your_model.py` | Always required | +| **Custom transformer/backbone** (ported and adapted to vllm-omni) | `vllm_omni/diffusion/models/your_model_name/your_model_transformer.py` or similar | `wan2_2.py`, `fusion.py`, `bagel_transformer.py` | +| **Custom sub-models** (VAE, fusion, autoencoder) | `vllm_omni/diffusion/models/your_model_name/` as separate files | `autoencoder.py`, `fusion.py` | +| **External dependency code** (original repo utilities) | **External repo**, installed via download script or pip | `dreamid_omni` package via git clone | +| **Hardcoded model configs** | Module-level dicts in pipeline file | `VIDEO_CONFIG`, `AUDIO_CONFIG` dicts | +| **Download/setup script** | `examples/offline_inference/your_model_name/download_your_model.py` | `download_dreamid_omni.py` | +| **Custom `model_index.json`** | Generated by download script, placed at model root | Minimal: `{"_class_name": "YourPipeline", ...}` | + +### B3. Handle external dependencies + +If the model's code lives in a separate git repo: + +**Option 1: Import with graceful fallback** (recommended for models with external utils) + +```python +try: + from external_model.utils import init_vae, load_checkpoint +except ImportError: + raise ImportError( + "Failed to import from dependency 'external_model'. " + "Please run the download script first." + ) +``` + +**Option 2: Port the code directly** (preferred when feasible) + +Copy the essential model files into `vllm_omni/diffusion/models/your_model_name/` and adapt them. This avoids external dependencies. BAGEL does this — `autoencoder.py` and `bagel_transformer.py` are ported directly. + +**Decision criteria**: Port if the code is self-contained and won't diverge. Use external deps if the model repo is actively maintained and the code is complex. + +### B4. 
Handle custom weight loading + +Custom models have two patterns for weight loading: + +**Pattern 1: Bypass standard loader** (DreamID-Omni style) + +When the original model has complex custom init functions that load weights in `__init__`: + +```python +class CustomPipeline(nn.Module): + def __init__(self, *, od_config, prefix=""): + super().__init__() + model = od_config.model + # Load everything eagerly in __init__ using custom helpers + self.vae = custom_init_vae(model, device=self.device) + self.text_encoder = custom_init_text_encoder(model, device=self.device) + self.transformer = CustomFusionModel(CONFIG) + load_custom_checkpoint(self.transformer, + checkpoint_path=os.path.join(model, "model.safetensors")) + # NO weights_sources defined — bypasses standard loader + + def load_weights(self, weights): + pass # No-op — all weights loaded in __init__ +``` + +**Pattern 2: Use standard loader with custom `load_weights`** (BAGEL style) + +When weights are in safetensors format but need name remapping: + +```python +class CustomPipeline(nn.Module): + def __init__(self, *, od_config, prefix=""): + super().__init__() + # Instantiate model architecture without weights + self.bagel = BagelModel(config) + self.vae = AutoEncoder(ae_params) + + # Point loader at the safetensors in the model root + self.weights_sources = [ + DiffusersPipelineLoader.ComponentSource( + model_or_path=od_config.model, + subfolder=None, # weights at root, not in subfolder + prefix="", + fall_back_to_pt=False, + ) + ] + + def load_weights(self, weights): + # Custom name remapping for non-diffusers weight names + params = dict(self.named_parameters()) + loaded = set() + for name, tensor in weights: + # Remap original weight names to vllm-omni module names + name = self._remap_weight_name(name) + if name in params: + default_weight_loader(params[name], tensor) + loaded.add(name) + return loaded +``` + +### B5. 
Create the `model_index.json` + +Custom models need a `model_index.json` at the model root for vllm-omni to discover them. For custom models, this is minimal: + +```json +{ + "_class_name": "YourModelPipeline", + "custom_key": "path/to/custom_weights.safetensors" +} +``` + +The `_class_name` must match a key in `_DIFFUSION_MODELS` in `registry.py`. Additional keys are model-specific (accessed via `od_config.model_config`). + +If the model's weights come from multiple HF repos, write a **download script** that: +1. Downloads from each repo +2. Assembles into a single directory +3. Generates `model_index.json` +4. Installs any external dependencies (git clone + `.pth` file) + +Place at: `examples/offline_inference/your_model_name/download_your_model.py` + +### B6. Handle multi-modal inputs + +If the model accepts images, audio, or other multi-modal inputs, implement the protocol classes from `vllm_omni/diffusion/models/interface.py`: + +```python +from vllm_omni.diffusion.models.interface import SupportImageInput, SupportAudioInput + +class MyPipeline(nn.Module, SupportImageInput, SupportAudioInput): + # Protocol markers — the engine uses these to enable proper input routing + pass +``` + +Preprocessing for custom models is typically done **inside `forward()`** rather than via registered pre-process functions, since the logic is often tightly coupled to the model. + +### B7. Continue at Step 4 below. + +--- + +## Common Steps (Both Paths) + +### Step 4: Register Model in registry.py + +Edit `vllm_omni/diffusion/registry.py`: + +```python +_DIFFUSION_MODELS = { + "YourModelPipeline": ("your_model_name", "pipeline_your_model", "YourModelPipeline"), +} +_DIFFUSION_POST_PROCESS_FUNCS = { + "YourModelPipeline": "get_your_model_post_process_func", # if applicable +} +_DIFFUSION_PRE_PROCESS_FUNCS = { + "YourModelPipeline": "get_your_model_pre_process_func", # if applicable +} +``` + +The registry key is the `_class_name` from `model_index.json`. 
The tuple is `(folder_name, module_file, class_name)`. + +Create `__init__.py` exporting the pipeline class and any factory functions. + +### Step 5: Run, Test, Debug + +Use the appropriate existing example script: + +| Category | Script | +|----------|--------| +| Text-to-Image | `examples/offline_inference/text_to_image/text_to_image.py` | +| Text-to-Video | `examples/offline_inference/text_to_video/text_to_video.py` | +| Image-to-Video | `examples/offline_inference/image_to_video/image_to_video.py` | +| Image-to-Image | `examples/offline_inference/image_to_image/image_edit.py` | +| Text-to-Audio | `examples/offline_inference/text_to_audio/text_to_audio.py` | + +For custom/Omni models that don't fit these categories, create a dedicated example script. + +**Validation**: No errors, output is meaningful, quality matches reference implementation. + +See [references/troubleshooting.md](references/troubleshooting.md) for common errors. + +### Step 6: Add Example Scripts + +For Omni or custom models, create: +- `examples/offline_inference/your_model_name/` — offline script + README +- `examples/online_serving/your_model_name/` — server script + client +- Download script if weights require assembly from multiple sources + +### Step 7: Update Documentation + +Required updates: +1. `docs/user_guide/diffusion/parallelism_acceleration.md` — parallelism support table +2. `docs/user_guide/diffusion/teacache.md` — if TeaCache supported +3. `docs/user_guide/diffusion/cache_dit_acceleration.md` — if Cache-DiT supported +4. `examples/offline_inference/xxx/README.md` — offline example docs +5. `examples/online_serving/xxx/README.md` — online serving docs + +### Step 8: Add E2E Tests (Recommended) + +Create `tests/e2e/online_serving/test_your_model_expansion.py`. + +### Step 9: Add Cache-DiT Acceleration + +Cache-DiT accelerates inference by caching intermediate computation results across denoising steps. After your model is working correctly on a single GPU, add cache-dit support. 
+ +See [references/cache-dit-patterns.md](references/cache-dit-patterns.md) for detailed code patterns. + +#### 9a. Determine your model type + +| Model Type | Description | Action | +|------------|-------------|--------| +| **Standard single-transformer** | One transformer with one `ModuleList` of blocks | No code needed — `CacheDiTBackend` auto-detects via `enable_cache_for_dit()` | +| **Multi-block-list** | One transformer with multiple block lists (e.g., `transformer_blocks` + `single_transformer_blocks`) | Write custom enabler with `BlockAdapter` | +| **Dual-transformer** | Two transformers (e.g., high-noise + low-noise) | Write custom enabler with `BlockAdapter` wrapping both | + +#### 9b. Standard models — verify automatic support + +For standard single-transformer models, test directly: + +```python +omni = Omni( + model="your-model-name", + cache_backend="cache_dit", + cache_config={ + "Fn_compute_blocks": 1, + "Bn_compute_blocks": 0, + "max_warmup_steps": 4, + } +) +``` + +Check logs for "Cache-dit enabled successfully on xxx". If it works, skip to Step 9e. + +#### 9c. 
Custom architectures — write a custom enabler + +For multi-block-list or dual-transformer models, write a custom enabler function: + +```python +from cache_dit import BlockAdapter, ForwardPattern, ParamsModifier, DBCacheConfig + +def enable_cache_for_your_model(pipeline, cache_config): + db_cache_config = DBCacheConfig( + num_inference_steps=None, + Fn_compute_blocks=cache_config.Fn_compute_blocks, + Bn_compute_blocks=cache_config.Bn_compute_blocks, + max_warmup_steps=cache_config.max_warmup_steps, + max_cached_steps=cache_config.max_cached_steps, + max_continuous_cached_steps=cache_config.max_continuous_cached_steps, + residual_diff_threshold=cache_config.residual_diff_threshold, + ) + + cache_dit.enable_cache( + BlockAdapter( + transformer=pipeline.transformer, + blocks=[ + pipeline.transformer.transformer_blocks, + pipeline.transformer.single_transformer_blocks, + ], + forward_pattern=[ForwardPattern.Pattern_1, ForwardPattern.Pattern_1], + params_modifiers=[ParamsModifier(...)], + ), + cache_config=db_cache_config, + ) + + def refresh_cache_context(pipeline, num_inference_steps, verbose=True): + cache_dit.refresh_context( + pipeline.transformer, num_inference_steps=num_inference_steps, verbose=verbose + ) + return refresh_cache_context +``` + +#### 9d. Register the custom enabler + +Add your enabler to `CUSTOM_DIT_ENABLERS` in `vllm_omni/diffusion/cache/cache_dit_backend.py`: + +```python +CUSTOM_DIT_ENABLERS = { + "Wan22Pipeline": enable_cache_for_wan22, + "LongCatImagePipeline": enable_cache_for_longcat_image, + "YourModelPipeline": enable_cache_for_your_model, # Add here +} +``` + +#### 9e. 
Test Cache-DiT + +```python +omni = Omni( + model="your-model-name", + cache_backend="cache_dit", + cache_config={ + "Fn_compute_blocks": 1, "Bn_compute_blocks": 0, + "max_warmup_steps": 4, "residual_diff_threshold": 0.24, + } +) +images = omni.generate("a beautiful landscape", + OmniDiffusionSamplingParams(num_inference_steps=50)) +``` + +**Verify**: 1) logs show cache enabled, 2) 1.5-2x speedup, 3) output quality acceptable vs baseline. + +If quality degrades, lower `residual_diff_threshold` (try 0.12-0.18) or increase `max_warmup_steps` (try 6-8). + +--- + +### Step 10: Add Parallelism Support + +After the model works on a single GPU, add multi-GPU parallelism. Add each type incrementally, testing after each addition. + +See [references/parallelism-patterns.md](references/parallelism-patterns.md) for detailed code patterns and API reference. + +**Recommended order**: TP → SP/USP → CFG Parallel → HSDP + +#### 10a. Tensor Parallelism (TP) + +Shards DiT linear layers across GPUs. Requires code changes in the transformer. + +**What to change in the transformer**: +1. Replace `nn.Linear` with `ColumnParallelLinear` / `RowParallelLinear` / `QKVParallelLinear` +2. Update `load_weights()` to handle QKV fusion with `stacked_params_mapping` +3. Use `self.to_qkv.num_heads` (local heads) instead of total heads for split sizes + +```python +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, RowParallelLinear, ColumnParallelLinear, +) + +# Attention: QKV → RowParallel output +self.to_qkv = QKVParallelLinear(dim, head_dim, num_heads, num_kv_heads) +self.to_out = RowParallelLinear(dim, dim, input_is_parallel=True) + +# FFN: ColumnParallel → RowParallel +self.w1 = ColumnParallelLinear(dim, ffn_dim) +self.w2 = RowParallelLinear(ffn_dim, dim, input_is_parallel=True) +``` + +**Constraints**: `num_heads % tp_size == 0` and `num_kv_heads % tp_size == 0`. + +**Test**: `--tensor-parallel-size 2` + +#### 10b. 
Sequence Parallelism (SP / USP) + +Splits sequence tokens across GPUs. Non-intrusive via `_sp_plan` on the transformer class — no changes to `forward()`. + +**What to change in the transformer**: + +Add `_sp_plan` class attribute: + +```python +from vllm_omni.diffusion.distributed.sp_plan import ( + SequenceParallelInput, SequenceParallelOutput, +) + +class YourTransformer(nn.Module): + _sp_plan = { + "blocks.0": { + "hidden_states": SequenceParallelInput(split_dim=1, expected_dims=3), + }, + "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), + } +``` + +If inline tensor ops (e.g., `torch.cat`) exist between shard/gather points, extract them into `nn.Module` submodules so hooks can intercept them. + +For RoPE that needs splitting, add an entry for the RoPE module with `split_output=True`. + +**Test**: `--ulysses-degree 2` (offline) or `--usp 2` (online serving) + +#### 10c. CFG Parallel + +Distributes positive/negative CFG branches across 2 GPUs. Requires the pipeline to inherit `CFGParallelMixin`. + +**What to change in the pipeline**: + +```python +from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin + +class YourPipeline(nn.Module, CFGParallelMixin): + def diffuse(self, ...) -> torch.Tensor: + for i, t in enumerate(timesteps): + positive_kwargs = {...} + negative_kwargs = {...} if do_true_cfg else None + noise_pred = self.predict_noise_maybe_with_cfg( + do_true_cfg=do_true_cfg, true_cfg_scale=cfg_scale, + positive_kwargs=positive_kwargs, negative_kwargs=negative_kwargs, + ) + latents = self.scheduler_step_maybe_with_cfg( + noise_pred, t, latents, do_true_cfg + ) + return latents +``` + +Override `predict_noise()` if your transformer call is non-standard. Override `combine_cfg_noise()` for multi-output models (e.g., video + audio). + +**Constraint**: Exactly 2 GPUs. Only for models using classifier-free guidance. + +**Test**: `--cfg-parallel-size 2` + +#### 10d. 
HSDP (Hybrid Sharded Data Parallel) + +Shards transformer weights via PyTorch FSDP2 to reduce per-GPU VRAM. No code changes to the forward pass — just add a class attribute. + +**What to change in the transformer**: + +```python +class YourTransformer(nn.Module): + @staticmethod + def _is_transformer_block(name: str, module) -> bool: + return "blocks" in name and name.split(".")[-1].isdigit() + + _hsdp_shard_conditions = [_is_transformer_block] +``` + +**Constraint**: Cannot combine with TP. For standalone HSDP, set `hsdp_shard_size` explicitly. + +**Test**: `--use-hsdp` or `DiffusionParallelConfig(use_hsdp=True)` + +#### 10e. Update parallelism documentation + +After adding parallelism support, update: +1. `docs/user_guide/diffusion/parallelism_acceleration.md` — add your model to the support table +2. Record which parallelism methods are supported (USP, Ring, CFG, TP, HSDP, VAE-Patch) + +--- + +## Iterative Development Tips + +1. **Start minimal**: Basic generation first, no parallelism/caching +2. **Use `--enforce-eager`**: Disable torch.compile during debugging +3. **Use small models**: Test with smaller variants first +4. **Check tensor shapes**: Most errors are reshape mismatches in attention +5. **Add features incrementally**: Single GPU → TP → SP → CFG → HSDP → Cache-DiT +6. **For custom models**: Get the model running with the original code first, then progressively replace components with vllm-omni equivalents +7. **Cache-DiT before parallelism tuning**: Cache-DiT is lossy — verify quality at baseline before combining with parallelism +8. 
**Combine lossless + lossy**: e.g., TP + SP + Cache-DiT for maximum throughput + +## Reference Files + +- [Transformer Adaptation](references/transformer-adaptation.md) — porting transformers from diffusers +- [Custom Model Patterns](references/custom-model-patterns.md) — patterns for non-diffusers models +- [Parallelism Patterns](references/parallelism-patterns.md) — TP, SP/USP, CFG parallel, HSDP implementation details +- [Cache-DiT Patterns](references/cache-dit-patterns.md) — cache-dit acceleration for standard and custom architectures +- [Troubleshooting](references/troubleshooting.md) — common errors and fixes diff --git a/.claude/skills/add-diffusion-model/references/cache-dit-patterns.md b/.claude/skills/add-diffusion-model/references/cache-dit-patterns.md new file mode 100644 index 00000000000..d34ce0e0f43 --- /dev/null +++ b/.claude/skills/add-diffusion-model/references/cache-dit-patterns.md @@ -0,0 +1,254 @@ +# Cache-DiT Patterns Reference + +## Overview + +Cache-DiT accelerates Diffusion Transformers by caching intermediate computation results across denoising steps. Adjacent steps produce similar features, so redundant computations can be skipped. + +Three caching strategies: +- **DBCache**: Dynamic block-level caching — selectively computes or caches transformer blocks based on residual differences +- **TaylorSeer**: Calibration-based prediction using Taylor expansion to estimate block outputs +- **SCM** (Step Computation Masking): Dynamic step skipping based on configurable policies + +**Typical speedup**: 1.5-2.5x depending on model and configuration. 
+ +**Official docs**: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/cache_dit + +## Architecture + +vLLM-Omni integrates cache-dit through `CacheDiTBackend`: + +| Component | Purpose | +|-----------|---------| +| `CacheDiTBackend` | Unified backend — auto-selects enabler (standard or custom) | +| `enable_cache_for_dit()` | Default enabler for standard single-transformer models | +| `CUSTOM_DIT_ENABLERS` dict | Registry of custom enablers keyed by pipeline class name | +| `BlockAdapter` | Wraps complex architectures (multi-block-list or multi-transformer) | +| `ForwardPattern` | Specifies block forward signature: `Pattern_0`, `Pattern_1`, `Pattern_2` | +| `ParamsModifier` | Per-transformer or per-block-list config customization | +| `DBCacheConfig` | Configuration for DBCache parameters | +| `cache_dit.refresh_context()` | Updates cache context when `num_inference_steps` changes | + +**Source files**: +- `vllm_omni/diffusion/cache/cache_dit_backend.py` — `CacheDiTBackend`, enablers, `CUSTOM_DIT_ENABLERS` +- `vllm_omni/diffusion/cache/` — cache backend implementations + +## Standard Models: Automatic Support + +Most DiT models follow this pattern: +- Single transformer with one `nn.ModuleList` of blocks +- Standard forward signature +- Compatible with cache-dit's automatic detection + +**Examples**: Qwen-Image, Z-Image, FLUX + +No code changes needed. 
`CacheDiTBackend` automatically uses `enable_cache_for_dit()`: + +```python +from vllm_omni import Omni + +omni = Omni( + model="Qwen/Qwen-Image", + cache_backend="cache_dit", + cache_config={ + "Fn_compute_blocks": 1, + "Bn_compute_blocks": 0, + "max_warmup_steps": 4, + } +) +``` + +What happens automatically: + +```python +def enable_cache_for_dit(pipeline, cache_config): + db_cache_config = DBCacheConfig( + num_inference_steps=None, + Fn_compute_blocks=cache_config.Fn_compute_blocks, + Bn_compute_blocks=cache_config.Bn_compute_blocks, + max_warmup_steps=cache_config.max_warmup_steps, + max_cached_steps=cache_config.max_cached_steps, + max_continuous_cached_steps=cache_config.max_continuous_cached_steps, + residual_diff_threshold=cache_config.residual_diff_threshold, + ) + + cache_dit.enable_cache(pipeline.transformer, cache_config=db_cache_config) + + def refresh_cache_context(pipeline, num_inference_steps, verbose=True): + cache_dit.refresh_context( + pipeline.transformer, num_inference_steps=num_inference_steps, verbose=verbose + ) + return refresh_cache_context +``` + +## Custom Architectures: Writing Custom Enablers + +### When you need a custom enabler + +- Model has multiple block lists in one transformer (e.g., `transformer_blocks` + `single_transformer_blocks`) +- Model has two transformers (e.g., high-noise + low-noise like Wan2.2) +- Model uses non-standard block forward signature + +### Pattern 1: Multi-Block-List (LongCat-Image style) + +Single transformer with two block lists: + +```python +import cache_dit +from cache_dit import BlockAdapter, ForwardPattern, ParamsModifier, DBCacheConfig + +def enable_cache_for_your_model(pipeline, cache_config): + db_cache_config = DBCacheConfig( + num_inference_steps=None, + Fn_compute_blocks=cache_config.Fn_compute_blocks, + Bn_compute_blocks=cache_config.Bn_compute_blocks, + max_warmup_steps=cache_config.max_warmup_steps, + max_cached_steps=cache_config.max_cached_steps, + 
max_continuous_cached_steps=cache_config.max_continuous_cached_steps, + residual_diff_threshold=cache_config.residual_diff_threshold, + ) + + cache_dit.enable_cache( + BlockAdapter( + transformer=pipeline.transformer, + blocks=[ + pipeline.transformer.transformer_blocks, + pipeline.transformer.single_transformer_blocks, + ], + forward_pattern=[ForwardPattern.Pattern_1, ForwardPattern.Pattern_1], + params_modifiers=[ParamsModifier(...)], + ), + cache_config=db_cache_config, + ) + + def refresh_cache_context(pipeline, num_inference_steps, verbose=True): + cache_dit.refresh_context( + pipeline.transformer, num_inference_steps=num_inference_steps, verbose=verbose + ) + return refresh_cache_context +``` + +For single transformer with multiple block lists, `refresh_context` works the same as standard models — call it once on the transformer. + +### Pattern 2: Dual-Transformer (Wan2.2 style) + +Two transformers with separate configs: + +```python +def enable_cache_for_dual_transformer(pipeline, cache_config): + db_cache_config = DBCacheConfig(...) + + cache_dit.enable_cache( + BlockAdapter( + transformer=[pipeline.transformer, pipeline.transformer_2], + blocks=[pipeline.transformer.blocks, pipeline.transformer_2.blocks], + forward_pattern=[ForwardPattern.Pattern_2, ForwardPattern.Pattern_2], + params_modifiers=[ + ParamsModifier(...), # Config for transformer 1 + ParamsModifier(...), # Config for transformer 2 + ], + ), + cache_config=db_cache_config, + ) + + def refresh_cache_context(pipeline, num_inference_steps, verbose=True): + high_steps, low_steps = _split_inference_steps(num_inference_steps) + cache_dit.refresh_context( + pipeline.transformer, num_inference_steps=high_steps, verbose=verbose + ) + cache_dit.refresh_context( + pipeline.transformer_2, num_inference_steps=low_steps, verbose=verbose + ) + return refresh_cache_context +``` + +Key difference: `refresh_context` must be called on **each transformer separately** with its own step count. 
+ +### Choosing the ForwardPattern + +| Pattern | Block forward signature | Example models | +|---------|------------------------|----------------| +| `Pattern_0` | `block(hidden_states, **kwargs)` → residual added inside block | Default | +| `Pattern_1` | `block(hidden_states, **kwargs)` → returns `(hidden_states, ...)` tuple | FLUX-style single blocks | +| `Pattern_2` | `block(hidden_states, **kwargs)` → `(hidden_states, ...)` with different residual pattern | Wan2.2 blocks | + +Inspect your block's `forward()` return type and residual connection pattern to choose the right one. See [Cache-DiT API Reference](https://cache-dit.readthedocs.io/en/latest/user_guide/CACHE_API/) for details. + +## Registering Custom Enablers + +Add your enabler to `CUSTOM_DIT_ENABLERS` in `vllm_omni/diffusion/cache/cache_dit_backend.py`: + +```python +CUSTOM_DIT_ENABLERS = { + "Wan22Pipeline": enable_cache_for_wan22, + "LongCatImagePipeline": enable_cache_for_longcat_image, + "YourModelPipeline": enable_cache_for_your_model, +} +``` + +The key must match `pipeline.__class__.__name__`. 
+ +## Configuration Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `Fn_compute_blocks` | 1 | Number of blocks to always compute at the front | +| `Bn_compute_blocks` | 0 | Number of blocks to always compute at the back | +| `max_warmup_steps` | 4 | Steps to run without caching at the beginning | +| `max_cached_steps` | — | Max total cached steps | +| `max_continuous_cached_steps` | — | Max consecutive cached steps | +| `residual_diff_threshold` | 0.24 | Threshold for deciding whether to cache a block | + +### Tuning for quality vs speed + +| Goal | Adjustments | +|------|-------------| +| **More speed, acceptable quality loss** | Higher `residual_diff_threshold` (0.24-0.4), lower `max_warmup_steps` (2-4) | +| **Better quality, less speed** | Lower `residual_diff_threshold` (0.12-0.18), higher `max_warmup_steps` (6-8), lower `max_continuous_cached_steps` (2) | + +## Testing + +```python +from vllm_omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams + +omni = Omni( + model="your-model-name", + cache_backend="cache_dit", + cache_config={ + "Fn_compute_blocks": 1, + "Bn_compute_blocks": 0, + "max_warmup_steps": 4, + "residual_diff_threshold": 0.24, + } +) +images = omni.generate( + "a beautiful landscape", + OmniDiffusionSamplingParams(num_inference_steps=50), +) +``` + +CLI (online serving): + +```bash +vllm serve your-model --omni --port 8098 \ + --cache-backend cache_dit \ + --cache-config '{"Fn_compute_blocks": 1, "Bn_compute_blocks": 0, "max_warmup_steps": 4}' +``` + +**Verification checklist**: +1. Logs show "Cache-dit enabled successfully on xxx" +2. Performance: 1.5-2x speedup vs no cache +3. Quality: compare output with `cache_backend=None` + +## Excluded Models + +Models listed in `_NO_CACHE_ACCELERATION` in `vllm_omni/diffusion/registry.py` do not support cache-dit (e.g., `NextStep11Pipeline`, `StableDiffusionPipeline`). Check this set before attempting to enable cache-dit. 
+ +## Reference Implementations + +| Model | Path | Notes | +|-------|------|-------| +| Standard DiT | `cache_dit_backend.py::enable_cache_for_dit` | Default enabler, automatic | +| Wan2.2 | `cache_dit_backend.py::enable_cache_for_wan22` | Dual-transformer, auto-detects mode | +| LongCat | `cache_dit_backend.py::enable_cache_for_longcat_image` | Multi-block-list | +| BAGEL | `cache_dit_backend.py::enable_cache_for_bagel` | Complex omni model | diff --git a/.claude/skills/add-diffusion-model/references/custom-model-patterns.md b/.claude/skills/add-diffusion-model/references/custom-model-patterns.md new file mode 100644 index 00000000000..2434e0b5da0 --- /dev/null +++ b/.claude/skills/add-diffusion-model/references/custom-model-patterns.md @@ -0,0 +1,273 @@ +# Custom Model Patterns Reference + +Patterns for adding models that don't come from the standard diffusers pipeline format. + +## Directory Structure Comparison + +### Diffusers-based model (e.g., Wan2.2) + +``` +vllm_omni/diffusion/models/wan2_2/ +├── __init__.py # Exports pipeline + transformer + helpers +├── pipeline_wan2_2.py # Pipeline: loads components via from_pretrained() +├── pipeline_wan2_2_i2v.py # Variant pipeline for image-to-video +└── wan2_2_transformer.py # Transformer: ported from diffusers, uses Attention layer +``` + +The transformer is loaded separately via `weights_sources` + `load_weights()`. Non-transformer components (VAE, text encoder) are loaded in `__init__` via `from_pretrained()`. 
+ +### Custom model with external deps (e.g., DreamID-Omni) + +``` +vllm_omni/diffusion/models/dreamid_omni/ +├── __init__.py # Exports pipeline only +├── pipeline_dreamid_omni.py # Pipeline: loads ALL weights in __init__ via custom helpers +├── fusion.py # Custom fusion architecture (video + audio cross-attention) +└── wan2_2.py # Re-implemented Wan backbone with split API + +examples/offline_inference/x_to_video_audio/ +└── download_dreamid_omni.py # Downloads weights from 3 HF repos + clones code repo +``` + +All weights loaded eagerly in `__init__`. `load_weights()` is a no-op. External dependency (`dreamid_omni` package) imported with try/except. + +### Custom model with ported code (e.g., BAGEL) + +``` +vllm_omni/diffusion/models/bagel/ +├── __init__.py +├── pipeline_bagel.py # Pipeline: instantiates models, uses weights_sources +├── bagel_transformer.py # Full LLM backbone (Qwen2-MoT) ported into vllm-omni +└── autoencoder.py # Custom VAE ported from original repo +``` + +Model code is fully ported (no external dependency). Uses `weights_sources` and `load_weights()` with custom name remapping to handle non-diffusers safetensors format. 
+ +## Weight Loading Patterns + +### Pattern 1: Standard diffusers flow (Wan2.2, Z-Image, FLUX) + +``` +init → create transformer (empty) → set weights_sources → [loader calls load_weights()] +``` + +- `weights_sources` points to safetensors in HF subfolder (e.g., `transformer/`) +- `load_weights()` receives `(name, tensor)` pairs from the loader +- Name remapping handles diffusers→vllm-omni differences (QKV fusion, Sequential index removal) + +### Pattern 2: Custom safetensors at root (BAGEL) + +``` +init → create all models (empty) → set weights_sources(subfolder=None) → [loader calls load_weights()] +``` + +- `weights_sources` points to **root** of model directory, not a subfolder +- Weights have non-diffusers names (e.g., `bagel.language_model.model.layers.0.self_attn.q_proj.weight`) +- `load_weights()` does heavy name normalization + +```python +self.weights_sources = [ + DiffusersPipelineLoader.ComponentSource( + model_or_path=od_config.model, + subfolder=None, # root directory + prefix="", # no prefix stripping + fall_back_to_pt=False, + ) +] +``` + +### Pattern 3: Fully custom loading (DreamID-Omni) + +``` +init → load ALL weights eagerly via custom helpers → load_weights() = no-op +``` + +- No `weights_sources` attribute — standard loader finds nothing to iterate +- Custom init functions (e.g., `init_wan_vae_2_2()`, `load_fusion_checkpoint()`) handle downloading and loading +- `load_weights()` is `pass` +- Weights may come from multiple HF repos in different formats (`.pth`, `.safetensors`) + +Use this when: +- The original model has complex, well-tested loading code you don't want to rewrite +- Weights span multiple HF repos +- Weight format is non-standard (e.g., a single `.pth` file, not sharded safetensors) + +## model_index.json for Custom Models + +Standard diffusers `model_index.json`: +```json +{ + "_class_name": "WanPipeline", + "_diffusers_version": "0.35.0.dev0", + "scheduler": ["diffusers", "UniPCMultistepScheduler"], + "transformer": 
["diffusers", "WanTransformer3DModel"], + "vae": ["diffusers", "AutoencoderKLWan"] +} +``` + +Custom model `model_index.json` (minimal): +```json +{ + "_class_name": "DreamIDOmniPipeline", + "fusion": "DreamID-Omni/dreamid_omni.safetensors" +} +``` + +The only **required** field is `_class_name` — it must match a key in `_DIFFUSION_MODELS` in `registry.py`. Other fields are model-specific and accessible via `od_config.model_config` dict. + +## External Dependency Management + +### Git clone + .pth injection (DreamID-Omni pattern) + +```python +def download_dependency(): + CACHE_DIR.mkdir(parents=True, exist_ok=True) + with open(LOCK_FILE, "w") as f: + fcntl.flock(f, fcntl.LOCK_EX) + if not DEPENDENCY_DIR.exists(): + subprocess.run([ + "git", "clone", "--depth", "1", + REPO_URL, "--branch", BRANCH, + str(DEPENDENCY_DIR) + ], check=True) + fcntl.flock(f, fcntl.LOCK_UN) + + # Add to Python path via .pth file + site_packages = Path(site.getsitepackages()[0]) + pth_file = site_packages / "vllm_omni_dependency.pth" + pth_file.write_text(str(DEPENDENCY_DIR)) +``` + +### Direct port (BAGEL pattern) + +Copy essential files from the original repo into `vllm_omni/diffusion/models/your_model_name/`. Adapt imports to use vllm-omni utilities. Benefits: no external dependency, no git clone step. Drawback: must maintain the ported code. + +## Multi-Modal Input/Output Protocols + +Custom models that handle images, audio, or video I/O should implement protocol classes: + +```python +from vllm_omni.diffusion.models.interface import ( + SupportImageInput, # Model accepts image input + SupportAudioInput, # Model accepts audio input + SupportAudioOutput, # Model produces audio output +) + +class MyPipeline(nn.Module, SupportImageInput, SupportAudioInput, SupportAudioOutput): + pass # Protocol markers enable proper engine routing +``` + +The engine checks `isinstance(pipeline, SupportImageInput)` at startup to configure input validation and warmup behavior.
+ +## Hardcoded Config vs Config Files + +Diffusers models use `config.json` in each subfolder. Custom models often use: + +**Module-level config dicts** (DreamID-Omni): +```python +VIDEO_CONFIG = { + "patch_size": [1, 2, 2], "model_type": "ti2v", + "dim": 3072, "ffn_dim": 14336, "num_heads": 24, "num_layers": 30, ... +} +``` + +**Loaded from custom JSON** (BAGEL): +```python +cfg_path = os.path.join(model_path, "config.json") +with open(cfg_path) as f: + bagel_cfg = json.load(f) +vae_cfg = bagel_cfg.get("vae_config", {}) +``` + +## Custom Architecture Patterns + +### Split forward API (DreamID-Omni) + +When a fusion model needs to interleave blocks from two backbones: + +```python +class WanModel(nn.Module): + def prepare_transformer_block_kwargs(self, x, t, context, ...): + # Patch embed, time embed, text embed, RoPE + return x, e, kwargs + + def post_transformer_block_out(self, x, grid_sizes, e): + # Output projection, unpatchify + return output + + def forward(self, *args, **kwargs): + raise NotImplementedError # Fusion model handles block iteration +``` + +The `FusionModel` then iterates blocks in lock-step: +```python +for video_block, audio_block in zip(self.video_model.blocks, self.audio_model.blocks): + video_out = video_block(video_hidden, ...) + audio_out = audio_block(audio_hidden, ...) + # Cross-attend between modalities + video_out = cross_attention(video_out, audio_out) + audio_out = cross_attention(audio_out, video_out) +``` + +### LLM-as-denoiser (BAGEL) + +When the backbone is a language model that also does diffusion: + +```python +class BagelModel(nn.Module): + def __init__(self): + self.language_model = Qwen2MoTForCausalLM(config) + self.vit_model = SiglipVisionModel(vit_config) +``` + +The LLM processes both text tokens and latent image tokens in a single forward pass, using KV caching for the text portion. 
+ +## Pre/Post Processing for Custom Models + +Custom models typically handle pre/post processing **inside `forward()`** rather than via registered functions, because the logic is tightly coupled: + +```python +def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: + # Inline preprocessing + image = self._load_and_resize_image(req.prompts[0].get("multi_modal_data", {}).get("image")) + image_latent = self._vae_encode(image) + + # ... denoising loop ... + + # Inline postprocessing + pil_image = self._decode_to_pil(latents) + return DiffusionOutput(output=[pil_image]) +``` + +If pre/post functions are not registered in `_DIFFUSION_PRE_PROCESS_FUNCS` / `_DIFFUSION_POST_PROCESS_FUNCS`, the engine simply skips those steps. + +## Download Script Template + +```python +# examples/offline_inference//download_.py +from huggingface_hub import snapshot_download +import json, os + +def main(output_dir): + # Download model weights from HF + snapshot_download(repo_id="org/model-weights", local_dir=os.path.join(output_dir, "weights")) + + # Download additional components if from separate repos + snapshot_download(repo_id="org/vae-weights", local_dir=os.path.join(output_dir, "vae"), + allow_patterns=["*.safetensors"]) + + # Generate model_index.json + config = {"_class_name": "YourPipeline", "custom_key": "weights/model.safetensors"} + with open(os.path.join(output_dir, "model_index.json"), "w") as f: + json.dump(config, f, indent=2) + + # Install external code dependency (if needed) + download_dependency() + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--output-dir", default="./your_model") + args = parser.parse_args() + main(args.output_dir) +``` diff --git a/.claude/skills/add-diffusion-model/references/parallelism-patterns.md b/.claude/skills/add-diffusion-model/references/parallelism-patterns.md new file mode 100644 index 00000000000..933e2d23204 --- /dev/null +++ 
b/.claude/skills/add-diffusion-model/references/parallelism-patterns.md @@ -0,0 +1,571 @@ +# Parallelism Patterns Reference + +## Overview + +vLLM-Omni supports multiple parallelism strategies for diffusion models. Each targets a different bottleneck: + +| Strategy | Splits | Best For | Constraint | +|----------|--------|----------|------------| +| Tensor Parallel (TP) | Model layers across GPUs | Latency reduction, large models | Requires fast GPU interconnect, `num_heads % tp == 0` | +| Sequence Parallel (SP/USP) | Sequence tokens across GPUs | Long sequences (video, high-res) | Near-linear scaling | +| CFG Parallel | Positive/negative CFG branches | Models using classifier-free guidance | Exactly 2 GPUs | +| HSDP | Weight shards via FSDP2 | VRAM reduction | Cannot combine with TP | +| VAE Patch Parallel | VAE decode spatial tiles | Large VAE outputs | Auto-enables tiling | + +**Recommended integration order**: TP → SP → CFG Parallel → HSDP + +**Official design docs**: +- TP: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/tensor_parallel +- SP: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/sequence_parallel +- CFG: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/cfg_parallel +- HSDP: https://docs.vllm.ai/projects/vllm-omni/en/latest/design/feature/hsdp + +--- + +## Tensor Parallelism (TP) + +Replace standard `nn.Linear` with vLLM's parallel linear layers. This is the most invasive change but provides direct VRAM savings and compute speedup. 
+ +### Layer replacement rules + +| Pattern | vLLM Layer | When to Use | +|---------|-----------|-------------| +| Fan-out (first in FFN) | `ColumnParallelLinear` | Projection that splits output across ranks | +| Fan-in (second in FFN) | `RowParallelLinear` | Projection that gathers across ranks | +| QKV projection | `QKVParallelLinear` | Fused Q/K/V for self-attention | +| Single Q or K or V | `ColumnParallelLinear` | Separate projections (cross-attention) | +| Attention output | `RowParallelLinear` | Output projection after attention | +| Must not shard | `ReplicatedLinear` | Layers that must stay replicated | + +### MLP Block (Up-Down Pattern) + +```python +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, RowParallelLinear, +) + +class TPFeedForward(nn.Module): + def __init__(self, dim, ffn_dim): + super().__init__() + self.fc1 = ColumnParallelLinear(dim, ffn_dim, bias=False, return_bias=False) + self.fc2 = RowParallelLinear( + ffn_dim, dim, bias=False, + input_is_parallel=True, # Input already sharded from fc1 + return_bias=False, + ) + + def forward(self, x): + x, _ = self.fc1(x) + x = torch.nn.functional.gelu(x) + x, _ = self.fc2(x) + return x +``` + +### Attention Block (QKV-Out Pattern) + +```python +from vllm.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear +from vllm_omni.diffusion.attention.layer import Attention + +class TPSelfAttention(nn.Module): + def __init__(self, dim, num_heads, num_kv_heads=None): + super().__init__() + num_kv_heads = num_kv_heads or num_heads + self.head_dim = dim // num_heads + + self.to_qkv = QKVParallelLinear( + hidden_size=dim, + head_size=self.head_dim, + total_num_heads=num_heads, + total_num_kv_heads=num_kv_heads, + bias=False, + return_bias=False, + ) + self.to_out = RowParallelLinear( + dim, dim, bias=False, + input_is_parallel=True, + return_bias=False, + ) + self.attn = Attention( + num_heads=self.to_qkv.num_heads, # Local heads per GPU + head_size=self.head_dim, + 
softmax_scale=1.0 / (self.head_dim ** 0.5), + causal=False, + num_kv_heads=self.to_qkv.num_kv_heads, # Local KV heads per GPU + ) + + def forward(self, x): + qkv = self.to_qkv(x) # return_bias=False: the layer returns only the output tensor + q, k, v = qkv.split( + [self.to_qkv.num_heads * self.head_dim, + self.to_qkv.num_kv_heads * self.head_dim, + self.to_qkv.num_kv_heads * self.head_dim], + dim=-1, + ) + B, S, _ = x.shape + q = q.view(B, S, self.to_qkv.num_heads, self.head_dim) + k = k.view(B, S, self.to_qkv.num_kv_heads, self.head_dim) + v = v.view(B, S, self.to_qkv.num_kv_heads, self.head_dim) + out = self.attn(q, k, v) + out = out.reshape(B, S, -1) + out = self.to_out(out) + return out +``` + +### QKV Fusion in load_weights + +When you fuse separate Q/K/V into `QKVParallelLinear`, map diffusers' separate weight names: + +```python +stacked_params_mapping = [ + ("to_qkv", "to_q", "q"), + ("to_qkv", "to_k", "k"), + ("to_qkv", "to_v", "v"), +] + +def load_weights(self, weights): + params = dict(self.named_parameters()) + loaded = set() + for name, tensor in weights: + for fused_name, orig_name, shard_id in stacked_params_mapping: + if orig_name in name: + name = name.replace(orig_name, fused_name) + param = params[name] + param.weight_loader(param, tensor, shard_id) + loaded.add(name) + break + else: + if name in params: + param = params[name] + if hasattr(param, "weight_loader"): + param.weight_loader(param, tensor) + else: + default_weight_loader(param, tensor) + loaded.add(name) + return loaded +``` + +### RMSNorm with TP + +When RMSNorm sits between TP-sharded dimensions, use `DistributedRMSNorm` — it computes global RMS via all-reduce across TP ranks. See the Wan2.2 implementation for the pattern. 
+ +### TP Constraints + +- `num_heads % tp_size == 0` +- `num_kv_heads % tp_size == 0` +- Use `self.to_qkv.num_heads` (local per-GPU count), not total heads, for split sizes + +### Testing TP + +```bash +python text_to_image.py --model Your-org/your-model \ + --tensor-parallel-size 2 --output "tp_test.png" +``` + +**Verify**: speedup, memory reduction proportional to TP size, quality matches single-GPU. + +### Reference implementations + +| Model | Path | +|-------|------| +| Z-Image | `vllm_omni/diffusion/models/z_image/z_image_transformer.py` | +| FLUX | `vllm_omni/diffusion/models/flux/flux_transformer.py` | +| Qwen-Image | `vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py` | + +--- + +## Sequence Parallelism (SP / USP) + +SP splits sequence tokens across GPUs using Ulysses (all-to-all) or Ring (P2P) communication. It is applied non-intrusively via the `_sp_plan` dict — no changes to `forward()` logic. + +### Approach 1: Non-Intrusive `_sp_plan` (Recommended) + +The framework automatically registers hooks to shard inputs and gather outputs at `nn.Module` boundaries. + +#### Step 1: Identify module boundaries + +Find where tensors need sharding/gathering: + +```python +class MyTransformer(nn.Module): + def __init__(self): + self.patch_embed = PatchEmbed() # Before blocks + self.pos_embed = RoPE() # RoPE may need splitting + self.blocks = nn.ModuleList([...]) # Blocks process sharded x + self.norm_out = LayerNorm() + self.proj_out = Linear() # Gather after this + + def forward(self, x): + x = self.patch_embed(x) + pos = self.pos_embed(x) + for block in self.blocks: + x = block(x, pos) + x = self.norm_out(x) + return self.proj_out(x) +``` + +#### Step 2: Handle inline operations + +`_sp_plan` hooks only work at `nn.Module` boundaries. 
Inline ops like `torch.cat()` must be extracted into submodules: + +```python +# BAD: Inline — hooks can't intercept +unified = torch.cat([x, cap_feats], dim=1) + +# GOOD: Extract into submodule +class UnifiedPrepare(nn.Module): + def forward(self, x, cap_feats): + return torch.cat([x, cap_feats], dim=1) + +self.unified_prepare = UnifiedPrepare() +unified = self.unified_prepare(x, cap_feats) +``` + +Common cases: `torch.cat()`, `pad_sequence()`, `tensor.reshape()`, complex preprocessing. + +#### Step 3: Write `_sp_plan` + +**Pattern 1: Shard at first block, gather at output** (most common) + +```python +from vllm_omni.diffusion.distributed.sp_plan import ( + SequenceParallelInput, SequenceParallelOutput, +) + +class StandardTransformer(nn.Module): + _sp_plan = { + "blocks.0": { + "hidden_states": SequenceParallelInput(split_dim=1, expected_dims=3), + }, + "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), + } +``` + +**Pattern 2: Shard RoPE outputs separately** + +```python +class TransformerWithRoPE(nn.Module): + _sp_plan = { + "rope": { + 0: SequenceParallelInput(split_dim=1, expected_dims=4, split_output=True), + 1: SequenceParallelInput(split_dim=1, expected_dims=4, split_output=True), + }, + "blocks.0": { + "hidden_states": SequenceParallelInput(split_dim=1, expected_dims=3), + }, + "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), + } +``` + +**Pattern 3: Dual-stream (shard image, replicate text)** + +```python +class DualStreamTransformer(nn.Module): + _sp_plan = { + "rope_preparer": { + 2: SequenceParallelInput(split_dim=0, expected_dims=2, split_output=True), + 3: SequenceParallelInput(split_dim=0, expected_dims=2, split_output=True), + }, + "transformer_blocks.0": { + "hidden_states": SequenceParallelInput(split_dim=1, expected_dims=3), + }, + "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), + } +``` + +### API Reference + +**SequenceParallelInput**: + +| Parameter | Type | Description | 
+|-----------|------|-------------| +| `split_dim` | int | Dimension to split (usually 1 for sequence) | +| `expected_dims` | int/None | Expected tensor rank for validation | +| `split_output` | bool | `False`: shard input params; `True`: shard output tensors | +| `auto_pad` | bool | Auto-pad if sequence not divisible by world_size | + +**SequenceParallelOutput**: + +| Parameter | Type | Description | +|-----------|------|-------------| +| `gather_dim` | int | Dimension to gather (usually 1 for sequence) | +| `expected_dims` | int/None | Expected tensor rank for validation | + +**Module naming**: + +| Key | Meaning | +|-----|---------| +| `"blocks.0"` | First element of ModuleList | +| `"blocks.*"` | All elements of ModuleList | +| `"rope"` | Named submodule | + +**Dictionary value types**: + +| Key type | split_output | Description | +|----------|-------------|-------------| +| `"param_name"` (str) | False | Shard input parameter by name | +| `0, 1, ...` (int) | True | Shard output tuple by index | + +### Approach 2: Intrusive Modification (Complex Cases) + +For dynamic sharding logic that can't be expressed via `_sp_plan`: + +```python +from vllm_omni.diffusion.distributed.sp_sharding import sp_shard, sp_gather + +def forward(self, hidden_states, ...): + if self.parallel_config.sequence_parallel_size > 1: + hidden_states = sp_shard(hidden_states, dim=1) + for block in self.blocks: + hidden_states = block(hidden_states) + if self.parallel_config.sequence_parallel_size > 1: + hidden_states = sp_gather(hidden_states, dim=1) + return hidden_states +``` + +Use intrusive modification as a last resort — `_sp_plan` is preferred for maintainability. + +### UAA Mode (Experimental) + +`ulysses_mode="advanced_uaa"` handles arbitrary sequence lengths and head counts that aren't divisible by `ulysses_degree`. Uses variable all-to-all split sizes and temporary head padding. 
+ +### Combining SP methods + +Ulysses and Ring can be combined: `ulysses_degree × ring_degree = total SP GPUs`. + +```python +DiffusionParallelConfig(ulysses_degree=2, ring_degree=2) # 4 GPUs total +``` + +### Testing SP + +```bash +# Offline +python text_to_image.py --model Your-model --ulysses-degree 2 + +# Online serving +vllm serve Your-model --omni --usp 2 +``` + +### Reference implementations + +| Model | Path | +|-------|------| +| Qwen-Image | `vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py` | +| Wan2.2 | `vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py` | +| Z-Image | `vllm_omni/diffusion/models/z_image/z_image_transformer.py` | + +--- + +## CFG Parallelism + +Distributes positive/negative Classifier-Free Guidance branches across 2 GPUs. + +### Implementation + +Inherit `CFGParallelMixin` and implement `diffuse()`: + +```python +from vllm_omni.diffusion.distributed.cfg_parallel import CFGParallelMixin + +class YourPipeline(nn.Module, CFGParallelMixin): + def diffuse(self, latents, timesteps, prompt_embeds, negative_embeds, + do_true_cfg, true_cfg_scale, **kwargs): + for i, t in enumerate(timesteps): + positive_kwargs = { + "hidden_states": latents, + "encoder_hidden_states": prompt_embeds, + "timestep": t, + } + negative_kwargs = { + "hidden_states": latents, + "encoder_hidden_states": negative_embeds, + "timestep": t, + } if do_true_cfg else None + + noise_pred = self.predict_noise_maybe_with_cfg( + do_true_cfg=do_true_cfg, + true_cfg_scale=true_cfg_scale, + positive_kwargs=positive_kwargs, + negative_kwargs=negative_kwargs, + ) + latents = self.scheduler_step_maybe_with_cfg( + noise_pred, t, latents, do_true_cfg + ) + return latents +``` + +### Customization hooks + +| Method | Override when | +|--------|-------------| +| `predict_noise()` | Non-standard transformer call (e.g., dual-transformer like Wan2.2) | +| `cfg_normalize_function()` | Custom normalization (e.g., LongCat with clamping) | +| `combine_cfg_noise()` | 
Multi-output models (e.g., video + audio: CFG on video, positive-only on audio) | + +**Custom predict_noise** (Wan2.2 — selects active transformer): + +```python +def predict_noise(self, current_model=None, **kwargs): + if current_model is None: + current_model = self.transformer + return current_model(**kwargs)[0] +``` + +**Custom combine_cfg_noise** (multi-output): + +```python +def combine_cfg_noise(self, positive_pred, negative_pred, scale, normalize): + video_pos, audio_pos = positive_pred + video_neg, audio_neg = negative_pred + video_combined = super().combine_cfg_noise(video_pos, video_neg, scale, normalize) + return (video_combined, audio_pos) +``` + +### Composite scheduler for multi-output + +When each output has its own schedule: + +```python +class VideoAudioScheduler: + def __init__(self, video_scheduler, audio_scheduler): + self.video_scheduler = video_scheduler + self.audio_scheduler = audio_scheduler + + def step(self, noise_pred, t, latents, return_dict=False, generator=None): + video_out = self.video_scheduler.step( + noise_pred[0], t[0], latents[0], return_dict=False, generator=generator + )[0] + audio_out = self.audio_scheduler.step( + noise_pred[1], t[1], latents[1], return_dict=False, generator=generator + )[0] + return ((video_out, audio_out),) +``` + +### Testing CFG Parallel + +```bash +python text_to_image.py --model Your-model \ + --cfg-parallel-size 2 --cfg-scale 4.0 \ + --negative-prompt "ugly, unclear" +``` + +**Constraint**: `guidance_scale > 1.0` and negative prompt must be provided. + +### Reference implementations + +| Model | Path | +|-------|------| +| Qwen-Image | `vllm_omni/diffusion/models/qwen_image/cfg_parallel.py` | +| Wan2.2 | `vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py` | +| Mixin base | `vllm_omni/diffusion/distributed/cfg_parallel.py` | + +--- + +## HSDP (Hybrid Sharded Data Parallel) + +Shards model weights across GPUs using PyTorch FSDP2. Reduces per-GPU VRAM without changing computation. 
+ +### Implementation + +Add `_hsdp_shard_conditions` to the transformer class: + +```python +class YourTransformer(nn.Module): + @staticmethod + def _is_transformer_block(name: str, module) -> bool: + return "blocks" in name and name.split(".")[-1].isdigit() + + _hsdp_shard_conditions = [_is_transformer_block] +``` + +For MoE models, add additional conditions: + +```python +class MoETransformer(nn.Module): + @staticmethod + def _is_transformer_block(name, module): + return "blocks" in name and name.split(".")[-1].isdigit() + + @staticmethod + def _is_moe_expert(name, module): + return "experts" in name and name.split(".")[-1].isdigit() + + _hsdp_shard_conditions = [_is_transformer_block, _is_moe_expert] +``` + +A module is sharded if **any** condition returns `True`. + +### Constraints + +- Cannot combine with Tensor Parallelism +- For standalone HSDP (no other parallelism), `hsdp_shard_size` must be specified explicitly +- Can combine with SP: HSDP reduces memory while SP distributes sequence + +### Testing HSDP + +```python +from vllm_omni.diffusion.data import DiffusionParallelConfig + +parallel_config = DiffusionParallelConfig(use_hsdp=True, hsdp_shard_size=8) +omni = Omni(model="your-model", parallel_config=parallel_config) +``` + +Or CLI: + +```bash +vllm serve Your-model --omni --use-hsdp +``` + +**Verify**: logs show "HSDP Inference: replicate_size=..., shard_size=..." and "Sharded N modules + root". Check VRAM reduction. + +### Reference implementations + +| Model | Path | +|-------|------| +| Wan2.2 | `vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py` | +| HSDP Core | `vllm_omni/diffusion/distributed/hsdp.py` | + +--- + +## VAE Patch Parallelism + +Shards VAE decode spatially across ranks using tiling: + +```bash +python text_to_image.py --model Your-model --vae-patch-parallel-size 4 +``` + +Auto-enables `--vae-use-tiling`. Uses `DistributedAutoencoderKLWan` or similar distributed VAE. Set `vae_patch_parallel_size` in `DiffusionParallelConfig`. 
+ +--- + +## Combining Parallelism Methods + +Common multi-GPU recipes: + +```bash +# 4 GPUs: CFG (2) × Ulysses (2) +python text_to_image.py --model Qwen/Qwen-Image \ + --cfg-parallel-size 2 --ulysses-degree 2 + +# 8 GPUs: Ulysses (4) × Ring (2) + VAE patch (8) +python text_to_video.py --model Wan-AI/Wan2.2-T2V-A14B-Diffusers \ + --ulysses-degree 4 --ring-degree 2 --vae-patch-parallel-size 8 + +# 2 GPUs: HSDP + Ulysses (cannot combine HSDP with TP) +vllm serve Your-model --omni --use-hsdp --usp 2 +``` + +## Discovering Parallelism Support + +Check which parallelism methods a model supports: + +| Check | How | +|-------|-----| +| **Ulysses / Ring SP** | Transformer defines `_sp_plan`. Search: `grep -r '_sp_plan' vllm_omni/diffusion/models/` | +| **CFG Parallel** | Pipeline inherits `CFGParallelMixin`. Search: `grep -r 'CFGParallelMixin' vllm_omni/diffusion/models/` | +| **TP** | Uses `ColumnParallelLinear` / `QKVParallelLinear`. Search: `grep -r 'ParallelLinear\|QKVParallel' vllm_omni/diffusion/models/` | +| **HSDP** | Transformer defines `_hsdp_shard_conditions`. Search: `grep -r '_hsdp_shard_conditions' vllm_omni/diffusion/models/` | + +The canonical per-model support table is in `docs/user_guide/diffusion/parallelism_acceleration.md`. diff --git a/.claude/skills/add-diffusion-model/references/transformer-adaptation.md b/.claude/skills/add-diffusion-model/references/transformer-adaptation.md new file mode 100644 index 00000000000..6e344b6a66e --- /dev/null +++ b/.claude/skills/add-diffusion-model/references/transformer-adaptation.md @@ -0,0 +1,218 @@ +# Transformer Adaptation Reference + +## Adapting a Diffusers Transformer to vLLM-Omni + +### Step-by-step Checklist + +1. Copy the transformer class from diffusers source +2. Remove all mixin classes — inherit only from `nn.Module` +3. Replace attention dispatch with `vllm_omni.diffusion.attention.layer.Attention` +4. Replace logger with `vllm.logger.init_logger` +5. 
Add `od_config: OmniDiffusionConfig | None = None` to `__init__` +6. Remove training-only code (gradient checkpointing, dropout) +7. Add `load_weights()` method for weight loading from safetensors +8. Add class-level attributes for acceleration features + +### Mixin Removal + +Remove these diffusers mixins (and their imports): + +```python +# Remove all of these: +from diffusers.models.modeling_utils import ModelMixin +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models.attention_processor import AttentionModuleMixin +from diffusers.loaders import PeftAdapterMixin, FromOriginalModelMixin + +# Replace: +class MyTransformer(ModelMixin, ConfigMixin, AttentionModuleMixin): +# With: +class MyTransformer(nn.Module): +``` + +Also remove `@register_to_config` decorators from `__init__`. + +### Attention Replacement + +The vLLM-Omni `Attention` layer wraps backend selection (FlashAttention, SDPA, SageAttn, etc.) and supports sequence parallelism hooks. 
+ +**QKV tensor shape must be `[batch, seq_len, num_heads, head_dim]`.** + +#### Self-Attention Pattern + +```python +from vllm_omni.diffusion.attention.layer import Attention +from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata + +class SelfAttentionBlock(nn.Module): + def __init__(self, dim, num_heads): + super().__init__() + self.num_heads = num_heads + self.head_dim = dim // num_heads + + self.to_q = nn.Linear(dim, dim) + self.to_k = nn.Linear(dim, dim) + self.to_v = nn.Linear(dim, dim) + self.to_out = nn.Linear(dim, dim) + + self.attn = Attention( + num_heads=num_heads, + head_size=self.head_dim, + softmax_scale=1.0 / (self.head_dim ** 0.5), + causal=False, + num_kv_heads=num_heads, + ) + + def forward(self, x, attn_mask=None): + B, S, _ = x.shape + q = self.to_q(x).view(B, S, self.num_heads, self.head_dim) + k = self.to_k(x).view(B, S, self.num_heads, self.head_dim) + v = self.to_v(x).view(B, S, self.num_heads, self.head_dim) + + attn_metadata = AttentionMetadata(attn_mask=attn_mask) + out = self.attn(q, k, v, attn_metadata=attn_metadata) + out = out.reshape(B, S, -1) + return self.to_out(out) +``` + +#### Fused QKV with TP (Advanced) + +For tensor parallelism, use vLLM's parallel linear layers: + +```python +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, RowParallelLinear +) + +class TPSelfAttention(nn.Module): + def __init__(self, dim, num_heads): + super().__init__() + self.num_heads = num_heads + self.head_dim = dim // num_heads + + self.to_qkv = QKVParallelLinear( + hidden_size=dim, + head_size=self.head_dim, + total_num_heads=num_heads, + total_num_kv_heads=num_heads, + ) + self.to_out = RowParallelLinear(dim, dim) + + self.attn = Attention( + num_heads=self.to_qkv.num_heads, # Local heads per GPU, not the total + head_size=self.head_dim, + softmax_scale=1.0 / (self.head_dim ** 0.5), + causal=False, + num_kv_heads=self.to_qkv.num_kv_heads, # Local KV heads per GPU + ) +``` + +### Logger Replacement + +```python +# Replace: +from diffusers.utils import logging +logger = 
logging.get_logger(__name__) + +# With: +from vllm.logger import init_logger +logger = init_logger(__name__) +``` + +### Custom Layers from vLLM-Omni + +Available utility layers: + +```python +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm_omni.diffusion.layers.rope import RotaryEmbedding +from vllm_omni.diffusion.layers.adalayernorm import AdaLayerNorm +``` + +### Config Support + +```python +from vllm_omni.diffusion.data import OmniDiffusionConfig + +class MyTransformer(nn.Module): + def __init__(self, *, od_config=None, num_layers=28, hidden_size=3072, **kwargs): + super().__init__() + self.od_config = od_config + self.parallel_config = od_config.parallel_config if od_config else None + # ... build layers +``` + +The transformer config values come from `model_index.json` → `config.json` in the transformer subfolder. The pipeline uses `get_transformer_config_kwargs(od_config.tf_model_config, TransformerClass)` to filter config keys to match the `__init__` signature. + +### Weight Loading + +The `load_weights` method receives an iterable of `(name, tensor)` from safetensors files, with the prefix (e.g., `"transformer."`) already stripped by the loader. 
+ +```python +from vllm.model_executor.model_loader.weight_utils import default_weight_loader + +class MyTransformer(nn.Module): + def load_weights(self, weights): + params = dict(self.named_parameters()) + loaded = set() + for name, tensor in weights: + # Optional: remap names from diffusers to vllm-omni naming + # e.g., "ff.net.0.proj" -> "ff.net_0.proj" + + if name in params: + param = params[name] + if hasattr(param, "weight_loader"): + param.weight_loader(param, tensor) + else: + default_weight_loader(param, tensor) + loaded.add(name) + return loaded +``` + +#### QKV Fusion in load_weights + +If you fused separate Q/K/V into a `QKVParallelLinear`, you need to map diffusers' separate weight names: + +```python +stacked_params_mapping = [ + ("to_qkv", "to_q", "q"), + ("to_qkv", "to_k", "k"), + ("to_qkv", "to_v", "v"), +] + +def load_weights(self, weights): + params = dict(self.named_parameters()) + loaded = set() + for name, tensor in weights: + for fused_name, orig_name, shard_id in stacked_params_mapping: + if orig_name in name: + name = name.replace(orig_name, fused_name) + param = params[name] + param.weight_loader(param, tensor, shard_id) + loaded.add(name) + break + else: + # Normal loading + ... 
+ return loaded +``` + +### Class-Level Attributes for Features + +```python +class MyTransformer(nn.Module): + # torch.compile: list block class names that repeat and can be compiled + _repeated_blocks = ["MyTransformerBlock"] + + # CPU offload: attribute name of the nn.ModuleList containing blocks + _layerwise_offload_blocks_attr = "blocks" + + # LoRA: mapping of fused param names to original param names + packed_modules_mapping = {"to_qkv": ["to_q", "to_k", "to_v"]} + + # Sequence parallelism plan (advanced — add after basic impl works) + _sp_plan = { + "blocks.0": SequenceParallelInput(split_dim=1), + "proj_out": SequenceParallelOutput(gather_dim=1), + } +``` diff --git a/.claude/skills/add-diffusion-model/references/troubleshooting.md b/.claude/skills/add-diffusion-model/references/troubleshooting.md new file mode 100644 index 00000000000..27acdd8d154 --- /dev/null +++ b/.claude/skills/add-diffusion-model/references/troubleshooting.md @@ -0,0 +1,178 @@ +# Troubleshooting Reference + +## Common Errors When Adding a Diffusion Model + +### ImportError / ModuleNotFoundError + +**Cause**: Missing or incorrect registration. + +**Fix checklist**: +1. Model registered in `vllm_omni/diffusion/registry.py` `_DIFFUSION_MODELS` dict +2. `__init__.py` exports the pipeline class +3. Pipeline file exists at the correct path: `vllm_omni/diffusion/models/{folder}/{file}.py` +4. Class name in registry matches the actual class name in the file + +### Shape Mismatch in Attention + +**Symptom**: `RuntimeError: shape mismatch` or `expected 4D tensor` + +**Cause**: QKV tensors not reshaped to `[batch, seq_len, num_heads, head_dim]`. 
+ +**Fix**: Before calling `self.attn(q, k, v, ...)`, ensure: +```python +q = q.view(batch, seq_len, self.num_heads, self.head_dim) +k = k.view(batch, kv_seq_len, self.num_kv_heads, self.head_dim) +v = v.view(batch, kv_seq_len, self.num_kv_heads, self.head_dim) +``` + +After attention, reshape back: +```python +out = out.reshape(batch, seq_len, -1) +``` + +### Weight Loading Failures + +**Symptom**: `RuntimeError: size mismatch for parameter ...` or missing keys + +**Debugging**: +1. Print diffusers weight names: `safetensors.safe_open(path, "pt").keys()` +2. Print model parameter names: `dict(model.named_parameters()).keys()` +3. Compare and add name remappings in `load_weights()` + +**Common remappings needed**: +- `ff.net.0.proj` → `ff.net_0.proj` (PyTorch Sequential indexing) +- `.to_out.0.` → `.to_out.` (Sequential unwrapping) +- `scale_shift_table` → moved to a wrapper module + +### Black/Blank/Noisy Output + +**Possible causes**: +1. **Wrong latent normalization**: Check VAE expects latents scaled by `vae.config.scaling_factor` +2. **Wrong scheduler**: Using the wrong scheduler class or wrong `flow_shift` +3. **Missing CFG**: Some models require `guidance_scale > 1.0` with negative prompt +4. **Wrong timestep format**: Some schedulers expect float, others expect int/long +5. **Missing post-processing**: Raw VAE output may need denormalization + +**Quick test**: Run with diffusers directly using the same seed and compare latents at each step. + +### OOM (Out of Memory) + +**Solutions** (in order of preference): +1. `--enforce-eager` to disable torch.compile (saves compile memory) +2. `--enable-cpu-offload` for model-level offload +3. `--enable-layerwise-offload` for block-level offload (better for large models) +4. `--vae-use-slicing --vae-use-tiling` for VAE memory reduction +5. Reduce resolution: `--height 480 --width 832` +6. Use TP: `--tensor-parallel-size 2` + +### Different Output vs Diffusers Reference + +**Common causes**: +1. 
**Attention backend difference**: FlashAttention vs SDPA may produce slightly different results. Set `DIFFUSION_ATTENTION_BACKEND=TORCH_SDPA` to match diffusers +2. **Float precision**: vLLM-Omni may use bfloat16 where diffusers uses float32 for some operations +3. **Missing normalization**: Check all LayerNorm/RMSNorm are preserved +4. **Scheduler rounding**: Some schedulers have numerical sensitivity + +### Tensor Parallel Errors + +**Symptom**: `AssertionError: not divisible` or incorrect output with TP>1 + +**Fix**: +1. Verify `num_heads % tp_size == 0` and `num_kv_heads % tp_size == 0` +2. Ensure `ColumnParallelLinear` / `RowParallelLinear` are used correctly +3. Check that norms between parallel layers use distributed norm if needed +4. Verify `load_weights` handles TP sharding for norm weights +5. Use `self.to_qkv.num_heads` (local heads per GPU) for QKV split sizes, not total heads + +**Missing `input_is_parallel=True`**: + +`RowParallelLinear` expects sharded input from `ColumnParallelLinear`: +```python +self.w1 = ColumnParallelLinear(dim, hidden_dim, return_bias=False) +self.w2 = RowParallelLinear(hidden_dim, dim, input_is_parallel=True, return_bias=False) +``` + +### Sequence Parallel Errors + +**Symptom**: Incorrect output or crashes with `--ulysses-degree N` or `--usp N` + +**Possible causes**: +1. **Inline operations between shard/gather points**: `torch.cat()`, `pad_sequence()` etc. not at `nn.Module` boundaries. Fix: extract into submodule. +2. **Wrong `split_dim`**: Check the tensor shape at the shard point. Sequence dimension is typically `dim=1` for `[B, S, D]` tensors. +3. **RoPE not sharded**: If RoPE is computed separately, add it to `_sp_plan` with `split_output=True`. +4. **Sequence not divisible by SP degree**: Use `auto_pad=True` in `SequenceParallelInput` or switch to `ulysses_mode="advanced_uaa"`. + +**Debugging**: Add `expected_dims=N` to `SequenceParallelInput`/`Output` for shape validation at runtime. 
+ +### CFG Parallel Errors + +**Symptom**: CFG parallel not activating, no speedup + +**Fix checklist**: +1. Pipeline inherits `CFGParallelMixin` +2. `guidance_scale > 1.0` +3. Negative prompt provided (even if empty string) +4. `--cfg-parallel-size 2` specified +5. `diffuse()` method calls `predict_noise_maybe_with_cfg()` and `scheduler_step_maybe_with_cfg()` + +**Symptom**: Different output with CFG parallel vs sequential + +**Possible cause**: Non-deterministic scheduler. Fix: pass `generator=torch.Generator(device).manual_seed(seed)` to `scheduler_step_maybe_with_cfg()`. + +### HSDP Errors + +**Symptom**: HSDP not activating or errors during weight loading + +**Fix checklist**: +1. Transformer defines `_hsdp_shard_conditions` class attribute +2. Shard condition functions return `True` for correct modules (test with `model.named_modules()`) +3. Not combining with TP (HSDP and TP are incompatible) +4. For standalone HSDP, `hsdp_shard_size` is specified explicitly + +**Verify**: Check logs for "HSDP Inference: replicate_size=..., shard_size=..." and "Sharded N modules + root". + +### Cache-DiT Not Applied + +**Symptom**: No speedup, no cache-related log messages + +**Fix checklist**: +1. Model not in `_NO_CACHE_ACCELERATION` in `registry.py` +2. Pipeline class name matches `CUSTOM_DIT_ENABLERS` key (if using custom enabler) +3. `cache_backend="cache_dit"` specified +4. Check logs for "Cache-dit enabled successfully on xxx" + +**Verify pipeline name**: `print(pipeline.__class__.__name__)` — must match registry key. + +### Cache-DiT Quality Degradation + +**Symptom**: Artifacts or lower quality with cache-dit + +**Fix**: Reduce aggressiveness: +```python +cache_config={ + "residual_diff_threshold": 0.12, # Lower from 0.24 + "max_warmup_steps": 6, # Increase from 4 + "max_continuous_cached_steps": 2, # Reduce if higher +} +``` + +If quality is still poor, the model may need a custom enabler with per-block-list `ParamsModifier` tuning. 
+ +### Model Not Detected / Wrong Pipeline Class + +**Symptom**: `ValueError: Model class ... not found in diffusion model registry` + +**Cause**: The model's `model_index.json` has a `_class_name` for the pipeline that doesn't match registry keys. + +**Fix**: The registry key must match the diffusers pipeline class name from `model_index.json`. If using a different name, map it in the registry: +```python +"DiffusersPipelineClassName": ("your_folder", "your_file", "YourVllmClassName"), +``` + +## Debugging Workflow + +1. **Add verbose logging**: Use `logger.info()` to print tensor shapes at each stage +2. **Compare step-by-step**: Run diffusers and vllm-omni side by side, comparing tensors after each major operation +3. **Use small configs**: Reduce `num_inference_steps=2`, small resolution for fast iteration +4. **Test transformer isolation**: Feed the same input to both diffusers and vllm-omni transformers, compare outputs +5. **Binary search for bugs**: Comment out blocks/layers to isolate where divergence starts diff --git a/.claude/skills/add-tts-model/SKILL.md b/.claude/skills/add-tts-model/SKILL.md new file mode 100644 index 00000000000..e64e7e763ef --- /dev/null +++ b/.claude/skills/add-tts-model/SKILL.md @@ -0,0 +1,284 @@ +--- +name: add-tts-model +description: "Integrate a new text-to-speech model into vLLM-Omni from HuggingFace reference implementation through production-ready serving with streaming and CUDA graph acceleration. Use when adding a new TTS model, wiring stage separation for speech synthesis, enabling online voice generation serving, debugging TTS integration behavior, or building audio output pipelines." +--- + +# TTS Model Integration Workflow + +## Overview + +``` +HF Reference -> Stage Separation -> Online Serving -> Async Chunk -> CUDA Graph + (Phase 1) (Phase 2) (Phase 3) (Phase 4) (Phase 5) +``` + +## Phase 1: HuggingFace Reference + +**Goal**: Understand the reference implementation and verify it produces correct audio. 
+ +### Steps + +1. **Run the reference model** end-to-end using the official HuggingFace / GitHub code +2. **Document the architecture**: + - What are the sub-models? (AR decoder, codec decoder, vocoder, etc.) + - What is the token vocabulary? (semantic codes, RVQ codebooks, special tokens) + - What is the output format? (sample rate, channels, codec type) +3. **Capture reference outputs** for comparison during integration +4. **Identify the config structure**: `config.json` fields, `model_type`, sub-model configs + +### Key Questions + +- How many codebooks? What are the codebook sizes? +- What special tokens exist? (`<|voice|>`, `<|audio_start|>`, `<|im_end|>`, etc.) +- What is the token-to-ID mapping for codec codes? +- What is the hop length / frame rate of the codec? +- Does the model support voice cloning? How? (reference audio encoding, speaker embeddings, etc.) + +### Deliverables + +- Working reference script that produces audio +- Architecture diagram / notes +- Token vocabulary mapping +- Reference audio samples for regression testing + +## Phase 2: Stage Separation (Offline Inference) + +**Goal**: Split the model into vLLM-Omni stages and get offline inference working. + +### Steps + +1. **Register the model** in `vllm_omni/model_executor/models/registry.py` +2. **Create config classes** (`configuration_{model_name}.py`) with `model_type` registration +3. **Implement Stage 0** (AR model): + - Subclass appropriate base (e.g., wrap Qwen3 decoder layers) + - Implement `forward()` for autoregressive token generation + - Handle special token logic (start/stop tokens, codec token mapping) + - If dual-AR (like Fish Speech), implement Fast AR as a nested module +4. **Implement Stage 1** (Decoder): + - Load codec weights (may need lazy loading from separate checkpoint) + - Implement `forward()`: codec codes -> audio waveform + - Return `OmniOutput` with `multimodal_outputs` +5. **Create stage config YAML** defining both stages, memory allocation, and model paths +6. 
**Create stage input processor** for prompt building +7. **Write end2end.py** test script + +### Critical Parameters to Get Right + +| Parameter | Impact if Wrong | +|-----------|----------------| +| Hop length | Audio duration wrong, streaming noise | +| Token ID mapping | Garbage codes -> noise output | +| Codebook count/size | Shape mismatch crashes | +| Stop token | Generation never stops or stops too early | +| dtype / autocast | Numerical issues, silent quality degradation | +| Repetition penalty | Must match reference (often 1.0 for TTS) | + +### Debugging Priority (from experience) + +When audio output is wrong, check in this order: + +1. **RoPE / attention**: Are position encodings correct? Is the attention mask right? +2. **Normalization**: RMSNorm epsilon, layer norm placement (pre vs post) +3. **Hop length**: Product of all upsample rates in the codec decoder +4. **Token mapping**: Are codec IDs correctly offset from the vocabulary base? +5. **Sampling parameters**: Temperature, top_k, top_p, repetition_penalty +6. **Tensor layout**: Codebook-major vs frame-major ordering +7. **dtype**: Float32 for codec decoders (autocast can corrupt audio) + +### Deliverables + +- Model files in `vllm_omni/model_executor/models/<model_name>/` +- Stage config YAML +- Working `end2end.py` with correct audio output +- README.md in the example directory + +## Phase 3: Online Serving + +**Goal**: Expose the model via `/v1/audio/speech` API endpoint. + +### Steps + +1. **Register in `serving_speech.py`**: + - Add model stage name to `_TTS_MODEL_STAGES` set + - Add model detection flag (e.g., `_is_fish_speech`) + - Implement prompt builder method (e.g., `_build_fish_speech_prompt()`) +2. **Handle model-specific parameters**: + - Voice cloning: `ref_audio` encoding and prompt injection + - `max_new_tokens` override in sampling params + - Model-specific default values +3. **Create client scripts**: `speech_client.py`, `run_server.sh` +4. 
**Test all response formats**: wav, mp3, flac, pcm +5. **Add Gradio demo**: Interactive web UI with streaming support + +### Voice Cloning Pattern + +```python +import base64 +from pathlib import Path + +def build_voice_clone_prompt(ref_audio_path: str, text: str, codec) -> list: + """Build prompt with reference audio for voice cloning in serving_speech.py.""" + audio_bytes = Path(ref_audio_path).read_bytes() + codes = codec.encode(audio_bytes) # Encode on CPU using model's codec (e.g., DAC) + token_ids = [code + codec.vocab_offset for code in codes.flatten().tolist()] + return [ + {"role": "system", "content": f"<|voice|>{''.join(chr(t) for t in token_ids)}"}, + {"role": "user", "content": text}, + ] +``` + +### Deliverables + +- Updated `serving_speech.py` with model-specific prompt builder +- Client scripts and server launcher +- Gradio demo with streaming and voice cloning UI +- Documentation (offline + online serving docs) + +## Phase 4: Async Chunk (Streaming) + +**Goal**: Enable inter-stage streaming so audio chunks are produced while AR generation continues. + +### Steps + +1. **Update stage config YAML**: + ```yaml + async_chunk: true + codec_chunk_frames: 25 # frames per chunk + codec_left_context_frames: 25 # overlap for smooth boundaries + ``` +2. **Implement chunk handling in Stage 1**: + - Accept partial input (chunk of codec codes) + - Handle left context for smooth audio boundaries + - Return partial audio in `OmniOutput` +3. **Test streaming**: + - Verify audio quality matches non-streaming output + - Check for artifacts at chunk boundaries + - Measure TTFA (time to first audio) +4. **Update online serving** to support `stream=true` with PCM output + +### Streaming Architecture + +``` +Stage 0 (AR) Stage 1 (Decoder) + | | + |-- chunk 0 (25 frames) ------> decode -> audio chunk 0 -> client + |-- chunk 1 (25 frames) ------> decode -> audio chunk 1 -> client + |-- chunk 2 (25 frames) ------> decode -> audio chunk 2 -> client + ... 
+``` + +### Key Considerations + +- **Left context overlap**: Prevents audible artifacts at chunk boundaries +- **Hop length matters**: `context_audio_samples = context_frames * hop_length` +- **First chunk latency**: Can use larger initial chunk for better quality, then smaller chunks + +### Deliverables + +- Updated stage config with async_chunk enabled +- Smooth streaming audio without boundary artifacts +- TTFA metrics + +## Phase 5: CUDA Graph Acceleration + +**Goal**: Capture the AR loop as a CUDA graph for significant speedup. + +### Steps + +1. **Identify the hot loop**: The AR decoding loop that runs N steps per token +2. **Create static buffers**: + - KV caches with fixed max sequence length + - Pre-built causal masks and position tensors per step + - Static input/output tensors +3. **Implement graph capture**: + - Warm up with real data + - Capture the forward pass + - Replay with updated inputs +4. **Handle constraints**: + - Use `torch.argmax` instead of `torch.multinomial` (graph-safe) + - Fixed batch size (fall back to eager for other sizes) + - No dynamic control flow inside the graph + +### Example: Code Predictor CUDA Graph (Qwen3-TTS) + +```python +import torch + +class CodePredictorGraph: + """Captures the 16-step code predictor AR loop as a single CUDA graph.""" + + def setup_graph(self, device: torch.device, kv_heads: int = 4, head_dim: int = 64): + self.num_steps = 16 + self.kv_cache = torch.zeros(1, kv_heads, self.num_steps, head_dim, device=device) + self.positions = torch.arange(self.num_steps, device=device) + self.causal_mask = torch.tril(torch.ones(self.num_steps, self.num_steps, device=device)) + self.input_buf = torch.zeros(1, 1, kv_heads * head_dim, device=device) + self.output_buf = torch.zeros(1, self.num_steps, device=device, dtype=torch.long) + # Warm up, then: self.graph = torch.cuda.CUDAGraph(); self.graph.capture(...) 
+ + def run_graph(self, initial_input: torch.Tensor) -> torch.Tensor: + self.input_buf.copy_(initial_input) + self.graph.replay() + return self.output_buf.clone() +``` + +### Performance Expectations + +Based on Qwen3-TTS code predictor experience: +- **3-5x speedup** for the graphed component +- Only effective for fixed batch sizes (typically batch_size=1) +- Falls back to eager mode for unsupported configurations + +### Deliverables + +- CUDA graph implementation for the AR hot loop +- Benchmark script comparing eager vs graph performance +- Documentation of constraints and fallback behavior + +## Integration Checklist + +Use this checklist when integrating a new TTS model: + +### Phase 1: HF Reference +- [ ] Reference model runs and produces correct audio +- [ ] Architecture documented (stages, codebooks, tokens, sample rate) +- [ ] Reference audio samples saved for comparison + +### Phase 2: Stage Separation +- [ ] Model registered in `registry.py` +- [ ] Config classes created with `model_type` registration +- [ ] Stage 0 (AR) implemented and generates correct tokens +- [ ] Stage 1 (Decoder) produces correct audio from tokens +- [ ] Stage config YAML created +- [ ] `end2end.py` produces audio matching reference quality +- [ ] README.md written + +### Phase 3: Online Serving +- [ ] Model added to `serving_speech.py` +- [ ] Prompt builder handles text input correctly +- [ ] Voice cloning works (if supported) +- [ ] All response formats work (wav, mp3, flac, pcm) +- [ ] Client scripts and server launcher created +- [ ] Gradio demo working +- [ ] Documentation added (offline + online docs, nav, supported models) + +### Phase 4: Async Chunk +- [ ] Stage config updated with `async_chunk: true` +- [ ] Stage 1 handles partial chunks correctly +- [ ] No audio artifacts at chunk boundaries +- [ ] Streaming via API (`stream=true`) works +- [ ] TTFA measured and acceptable + +### Phase 5: CUDA Graph +- [ ] Hot loop identified and profiled +- [ ] Static buffers allocated 
+- [ ] Graph captured and replays correctly +- [ ] Benchmark shows meaningful speedup +- [ ] Fallback to eager works for unsupported configs + +## References + +- [TTS audio skill](../vllm-omni-audio-tts/SKILL.md) -- supported models and usage +- [Fish Speech integration](../vllm-omni-audio-tts/references/fish-speech.md) -- complete example of Phases 1-3 +- [Qwen3-TTS reference](../vllm-omni-audio-tts/references/qwen-tts.md) -- complete example of all 5 phases +- [Adding a TTS model (developer guide)](https://github.com/vllm-project/vllm-omni/blob/main/docs/contributing/model/adding_tts_model.md) diff --git a/.claude/skills/readme.md b/.claude/skills/readme.md new file mode 100644 index 00000000000..b66f2ecd131 --- /dev/null +++ b/.claude/skills/readme.md @@ -0,0 +1,34 @@ +# Claude Skills for vLLM-Omni + +This directory contains Claude Code skills maintained for the `vllm-omni` +repository. These skills capture repeatable workflows for common contributor +tasks such as model integration, pull request review, and release note +generation. + +## Directory Structure + +Each skill lives in its own directory under `.claude/skills/`. A skill may +include: + +- `SKILL.md`: the main workflow and operating instructions +- `references/`: focused reference material used by the skill +- `scripts/`: small helper scripts used by the skill + +## Available Skills + +- `add-diffusion-model`: guides integration of a new diffusion model into + `vllm-omni` +- `add-omni-model`: covers addition of new omni-modality model support +- `add-tts-model`: covers integration of new TTS models and related serving + workflows +- `generate-release-note`: helps prepare release notes for repository changes +- `review-pr`: provides a structured workflow for reviewing pull requests + +## Maintenance Guidelines + +- Keep skill names short and task-oriented. +- Prefer repository-local paths, commands, and examples. 
+- Avoid hardcoding fast-changing support matrices unless the skill is actively + maintained alongside those changes. +- Treat skills as contributor tooling: optimize for clarity, actionability, and + low maintenance overhead. diff --git a/.gitignore b/.gitignore index 7f101a784c9..c0ee968064c 100644 --- a/.gitignore +++ b/.gitignore @@ -158,7 +158,19 @@ cython_debug/ # Claude CLAUDE.md -.claude/ +/.claude/* +!.claude/skills/ +!.claude/skills/readme.md +!.claude/skills/add-diffusion-model/ +!.claude/skills/add-diffusion-model/SKILL.md +!.claude/skills/add-diffusion-model/references/ +!.claude/skills/add-diffusion-model/references/*.md +!.claude/skills/add-tts-model/ +!.claude/skills/add-tts-model/SKILL.md +!.claude/skills/review-pr/ +!.claude/skills/review-pr/SKILL.md +!.claude/skills/review-pr/references/ +!.claude/skills/review-pr/references/*.md # Codex AGENTS.md From bcd5f16321df6bbc6f997a3906d16a23c8bb489e Mon Sep 17 00:00:00 2001 From: n1ptune Date: Tue, 14 Apr 2026 20:41:23 +0800 Subject: [PATCH 33/76] [Misc] clean Temporary CI Configs (#2784) Signed-off-by: neptune Co-authored-by: neptune --- tests/conftest.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e41d15bdf56..adb87cbd728 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,4 @@ +import atexit import base64 import datetime import io @@ -1362,9 +1363,10 @@ def delete_by_path(config_dict: dict, path: str) -> None: continue # Delete specified paths in this stage - for path in delete_paths: - if path: # Skip empty paths - delete_by_path(target_stage, path) + # Avoid shadowing the original YAML Path used for the output filename below. + for delete_path in delete_paths: + if delete_path: # Skip empty paths + delete_by_path(target_stage, delete_path) elif "." 
in key: # Delete using dot-separated path delete_by_path(config, key) @@ -1394,15 +1396,15 @@ def delete_by_path(config_dict: dict, path: str) -> None: raise KeyError(f"Stage ID {stage_id} not found, available: {available_ids}") # Apply updates to this stage - for path, val in stage_updates.items(): + for update_path, val in stage_updates.items(): # Check if this is a simple key (not dot-separated) # Example: 'engine_input_source' vs 'engine_args.max_model_len' - if "." not in path: + if "." not in update_path: # Direct key assignment (e.g., updating a list value) - target_stage[path] = val + target_stage[update_path] = val else: # Dot-separated path (e.g., nested dict access) - apply_update(target_stage, path, val) + apply_update(target_stage, update_path, val) elif "." in key: # Apply using dot-separated path apply_update(config, key, value) @@ -1414,13 +1416,14 @@ def delete_by_path(config_dict: dict, path: str) -> None: # within the same second (e.g. test_qwen3_omni_expansion imports both # get_chunk_config and get_batch_token_config). int(time.time()) would collide # and the later write would overwrite the earlier YAML on disk. - base_name = yaml_path.rsplit(".", 1)[0] if "." in yaml_path else yaml_path - output_path = f"{base_name}_{time.time_ns()}.yaml" + # Keep generated configs outside the repo and delete them when pytest exits. 
+ output_fd, output_path = tempfile.mkstemp(prefix=f"{path.stem}_", suffix=".yaml") + atexit.register(Path(output_path).unlink, missing_ok=True) - with open(output_path, "w", encoding="utf-8") as f: + with os.fdopen(output_fd, "w", encoding="utf-8") as f: yaml.dump(config, f, default_flow_style=None, sort_keys=False, allow_unicode=True, indent=2) - return output_path + return str(output_path) class OmniServer: From 5ce0a434920590e090d7080f9f67e03c4c300d82 Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Tue, 14 Apr 2026 20:48:04 +0800 Subject: [PATCH 34/76] [CI][Bugfix] Update thresholds for accuracy tests (#2725) Signed-off-by: wangyu <410167048@qq.com> --- tests/e2e/accuracy/test_gedit_bench_h100_smoke.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py b/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py index ac5f2cb3cfd..960ea57960c 100644 --- a/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py +++ b/tests/e2e/accuracy/test_gedit_bench_h100_smoke.py @@ -106,9 +106,9 @@ def test_gedit_bench_h100_smoke( group_summary = language_summary["by_group"][group] assert set(group_summary) == {"count", "Q_SC", "Q_PQ", "Q_O"} - assert summary["languages"]["en"]["overall"]["Q_SC"] >= 7.0 + assert summary["languages"]["en"]["overall"]["Q_SC"] >= 6.95 assert summary["languages"]["en"]["overall"]["Q_PQ"] >= 5.8 - assert summary["languages"]["en"]["overall"]["Q_O"] >= 6.2 + assert summary["languages"]["en"]["overall"]["Q_O"] >= 6.15 assert summary["languages"]["cn"]["overall"]["Q_SC"] >= 6.9 assert summary["languages"]["cn"]["overall"]["Q_PQ"] >= 5.7 assert summary["languages"]["cn"]["overall"]["Q_O"] >= 6.1 From cf1fcd5acf9ec0c7d74daf550a922f6fd3d716ca Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Tue, 14 Apr 2026 06:49:57 -0600 Subject: [PATCH 35/76] [CI/BugFix] Fix Flaky Test for Qwen Omni Perf (#2754) Signed-off-by: Alex Brooks --- 
vllm_omni/benchmarks/patch/patch.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm_omni/benchmarks/patch/patch.py b/vllm_omni/benchmarks/patch/patch.py index 343655df20e..17d7498ba26 100644 --- a/vllm_omni/benchmarks/patch/patch.py +++ b/vllm_omni/benchmarks/patch/patch.py @@ -143,7 +143,11 @@ async def async_request_openai_chat_omni_completions( if response.status == 200: handler = StreamedResponseHandler() async for chunk_bytes in response.content.iter_any(): - chunk_bytes = chunk_bytes.strip() + # NOTE: Do NOT strip() here; TCP may fragment the SSE messages, + # so stripping here can cause problems depending on how it is split. + # + # Simple example: [b'data: ', b'{json}\n\n'] <- stripping the first + # chunk will break SSE parsing because the space after 'data:' is required. if not chunk_bytes: continue From 4fb078a03166fc749e889a1934b6a59b483d5e18 Mon Sep 17 00:00:00 2001 From: Bvicii <98971614+scyyh11@users.noreply.github.com> Date: Tue, 14 Apr 2026 05:53:06 -0700 Subject: [PATCH 36/76] [Bugfix] Reject /v1/audio/speech for Qwen omni models (#2763) Signed-off-by: Bvicii --- .../openai_api/test_serving_speech.py | 26 +++++++++++++++++++ .../entrypoints/openai/serving_speech.py | 18 +++++++++++++ 2 files changed, 44 insertions(+) diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index c8841206207..b388b18606b 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -684,6 +684,32 @@ def test_is_tts_detection_with_tts_stage(self, mocker: MockerFixture): assert server._is_tts is True assert server._tts_stage is mock_stage + def test_prepare_speech_rejects_non_tts_omni_model(self, mocker: MockerFixture): + """Multi-stage omni models (e.g. 
Qwen3-Omni) must not use /v1/audio/speech.""" + mock_engine_client = mocker.MagicMock() + mock_engine_client.errored = False + mock_engine_client.tts_max_instructions_length = None + + # Simulate Qwen3-Omni: multiple stages, none in _TTS_MODEL_STAGES + thinker = SimpleNamespace(engine_args=SimpleNamespace(model_stage="thinker"), tts_args={}) + talker = SimpleNamespace(engine_args=SimpleNamespace(model_stage="talker"), tts_args={}) + code2wav = SimpleNamespace(engine_args=SimpleNamespace(model_stage="code2wav"), tts_args={}) + mock_engine_client.stage_configs = [thinker, talker, code2wav] + + mock_models = mocker.MagicMock() + mock_models.is_base_model.return_value = True + server = OmniOpenAIServingSpeech( + engine_client=mock_engine_client, + models=mock_models, + request_logger=mocker.MagicMock(), + ) + assert server._is_tts is False + + request = OpenAICreateSpeechRequest(input="Hello world") + with pytest.raises(ValueError, match="only supported for dedicated TTS models"): + asyncio.run(server._prepare_speech_generation(request)) + server.shutdown() + def test_estimate_prompt_len_fallback(self, speech_server): """Test prompt length estimation falls back to 2048 when model is unavailable.""" tts_params = {"text": ["Hello"], "task_type": ["CustomVoice"]} diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 3dc5f595d0a..1d9754853f3 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -1471,6 +1471,24 @@ async def _prepare_speech_generation( ph_len = await self._estimate_prompt_len_async(tts_params) prompt = {"prompt_token_ids": [1] * ph_len, "additional_information": tts_params} else: + # Qwen omni models (Qwen3-Omni, Qwen2.5-Omni) use a "talker" + # stage whose preprocess requires chat-templated tokens. 
The + # async-chunk orchestrator prewarms the talker via + # compute_talker_prompt_ids_length(), which scans for Qwen + # chat-template markers (im_start_token_id 151644). A raw-text + # prompt produces a 1-token placeholder that crashes the talker's + # prefill/decode handoff. Reject early with an actionable message. + stage_names = { + getattr(getattr(s, "engine_args", None), "model_stage", None) for s in self.engine_client.stage_configs + } + if "talker" in stage_names: + raise ValueError( + "The /v1/audio/speech endpoint is only supported for " + "dedicated TTS models (e.g., Qwen3-TTS, Voxtral, Fish " + "Speech, CosyVoice3, OmniVoice, VoxCPM2). For omni " + "models like Qwen3-Omni, use /v1/chat/completions with " + '\'"modalities": ["audio"]\' instead.' + ) tts_params = {} prompt = {"prompt": request.input} From 53a9cf49a6a2ee8dbacb7985458390ffb804ddbe Mon Sep 17 00:00:00 2001 From: "Yiyang \"Ian\" Liu" Date: Tue, 14 Apr 2026 06:52:32 -0700 Subject: [PATCH 37/76] fix: do not apply FP8 quant config to vision/audio encoders for pre-quantized checkpoints (#2702) Signed-off-by: Yiyang Liu <37043548+ianliuy@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../models/test_encoder_quant_config.py | 77 +++++++++++++++++++ .../qwen2_5_omni/qwen2_5_omni_thinker.py | 12 ++- .../qwen3_omni/qwen3_omni_moe_thinker.py | 26 ++++--- vllm_omni/quantization/component_config.py | 25 ++++++ 4 files changed, 129 insertions(+), 11 deletions(-) create mode 100644 tests/model_executor/models/test_encoder_quant_config.py diff --git a/tests/model_executor/models/test_encoder_quant_config.py b/tests/model_executor/models/test_encoder_quant_config.py new file mode 100644 index 00000000000..80201849863 --- /dev/null +++ b/tests/model_executor/models/test_encoder_quant_config.py @@ -0,0 +1,77 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Regression test for #2686: 
pre-quantized methods must not apply +quant config to vision / audio encoders. + +For modelopt FP8/FP4/MXFP8 checkpoints the Thinker LM is the only +quantized component. Vision and audio encoder weights are BF16 with no +FP8 scale tensors — passing quant_config to them causes FP8 kernels to +run on BF16 weights, producing garbage embeddings. +""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +import pytest + +from vllm_omni.quantization.component_config import ( + PRE_QUANTIZED_METHODS, + ComponentQuantizationConfig, + resolve_encoder_quant_config, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + +# --------------------------------------------------------------------------- +# resolve_encoder_quant_config — the core routing logic for encoder quant +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("method", sorted(PRE_QUANTIZED_METHODS)) +def test_pre_quantized_returns_none(method: str) -> None: + """visual_quant_config and audio_quant_config must be None for + pre-quantized methods (modelopt, modelopt_fp4, modelopt_mxfp8).""" + mock_config = MagicMock() + mock_config.get_name.return_value = method + + assert resolve_encoder_quant_config(mock_config) is None + + +@pytest.mark.parametrize("method", ["fp8", "awq", "gptq", "bitsandbytes"]) +def test_non_pre_quantized_preserves_config(method: str) -> None: + """Non-pre-quantized methods should pass through the original config.""" + mock_config = MagicMock() + mock_config.get_name.return_value = method + + assert resolve_encoder_quant_config(mock_config) is mock_config + + +def test_none_input_returns_none() -> None: + """No quantization → None for encoders.""" + assert resolve_encoder_quant_config(None) is None + + +def test_component_config_passed_through() -> None: + """ComponentQuantizationConfig should be returned as-is so the caller + can call .resolve() with the appropriate prefix.""" + inner = MagicMock() + 
inner.get_name.return_value = "modelopt" # would be None if not Component + component = ComponentQuantizationConfig( + component_configs={"language_model": inner}, + default_config=None, + ) + + result = resolve_encoder_quant_config(component) + assert result is component + + +# --------------------------------------------------------------------------- +# PRE_QUANTIZED_METHODS constant — exhaustiveness check +# --------------------------------------------------------------------------- + + +def test_pre_quantized_methods_contains_expected() -> None: + """Guard against accidental removal of a known pre-quantized method.""" + expected = {"modelopt", "modelopt_fp4", "modelopt_mxfp8"} + assert PRE_QUANTIZED_METHODS == expected diff --git a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py index 0307034089c..617f0f9e325 100644 --- a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py +++ b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py @@ -64,6 +64,10 @@ ) from vllm.sequence import IntermediateTensors +from vllm_omni.quantization.component_config import ( + resolve_encoder_quant_config, +) + try: import flash_attn except (ImportError, ModuleNotFoundError): @@ -359,6 +363,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.quant_config = quant_config + # Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) only quantize + # the Thinker LM. Vision encoder weights remain in BF16 with no FP8 + # scale tensors; passing quant_config causes FP8 kernels to run on + # BF16 weights, producing garbage embeddings. Keep None for encoders. 
+ visual_quant_config = resolve_encoder_quant_config(quant_config) + with self._mark_tower_model(vllm_config, "audio"): if multimodal_config.get_limit_per_prompt("audio"): self.audio_tower = Qwen2_5OmniAudioEncoder(thinker_config.audio_config) @@ -370,7 +380,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.visual = Qwen2_5_VisionTransformer( vision_config=thinker_config.vision_config, norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6), - quant_config=quant_config, + quant_config=visual_quant_config, prefix=maybe_prefix(prefix, "visual"), ) else: diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py index 671ffb6cb16..d03a96fd85a 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py @@ -119,7 +119,10 @@ from vllm_omni.model_executor.models.qwen2_5_omni.qwen2_5_omni_thinker import ( Qwen2_5OmniConditionalGenerationMixin, ) -from vllm_omni.quantization.component_config import ComponentQuantizationConfig +from vllm_omni.quantization.component_config import ( + PRE_QUANTIZED_METHODS, + ComponentQuantizationConfig, +) try: import flash_attn @@ -1114,21 +1117,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.multimodal_config = multimodal_config self.quant_config = quant_config - # Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) quantize the - # entire thinker — audio tower, visual encoder, and language model - # all share the same quant method. Dynamic quantization methods - # (e.g. --quantization fp8) should only target the language model. - _PRE_QUANTIZED_METHODS = {"modelopt", "modelopt_fp4", "modelopt_mxfp8"} + # Pre-quantized checkpoints (modelopt NVFP4/FP8/MXFP8) only quantize + # the Thinker LM (language model). 
Vision and audio encoder weights + # remain in BF16 and have no corresponding scale tensors in the + # checkpoint. Dynamic quantization methods (e.g. --quantization fp8) + # should also only target the language model. if isinstance(quant_config, ComponentQuantizationConfig): audio_quant_config = quant_config.resolve("audio_tower") visual_quant_config = quant_config.resolve("visual") language_quant_config = quant_config.resolve("language_model") elif quant_config is not None: - if quant_config.get_name() in _PRE_QUANTIZED_METHODS: - # Pre-quantized: pass quant_config to all subcomponents. - audio_quant_config = quant_config - visual_quant_config = quant_config + if quant_config.get_name() in PRE_QUANTIZED_METHODS: + # Pre-quantized: only the Thinker LM is quantized. + # Vision/audio encoder weights are BF16 with no FP8 scales; + # passing quant_config to them causes FP8 kernels to run on + # BF16 weights (producing garbage embeddings). Keep None. + audio_quant_config = None + visual_quant_config = None language_quant_config = quant_config else: # Dynamic quantization: scope to language_model only. diff --git a/vllm_omni/quantization/component_config.py b/vllm_omni/quantization/component_config.py index 7986da8850b..f9286079be1 100644 --- a/vllm_omni/quantization/component_config.py +++ b/vllm_omni/quantization/component_config.py @@ -23,6 +23,31 @@ ) +# Pre-quantized checkpoints (modelopt FP8/FP4/MXFP8) only quantize the +# Thinker LM. Vision and audio encoder weights remain in BF16 with no +# corresponding scale tensors in the checkpoint. +PRE_QUANTIZED_METHODS: frozenset[str] = frozenset({"modelopt", "modelopt_fp4", "modelopt_mxfp8"}) + + +def resolve_encoder_quant_config( + quant_config: QuantizationConfig | None, +) -> QuantizationConfig | None: + """Resolve quantization config for vision / audio encoders. + + Returns *None* for pre-quantized methods so that FP8 kernels are never + applied to BF16 encoder weights (which lack scale tensors). 
All other + configs — including ``ComponentQuantizationConfig`` and ``None`` — are + returned as-is so the caller can handle them. + """ + if ( + quant_config is not None + and not isinstance(quant_config, ComponentQuantizationConfig) + and quant_config.get_name() in PRE_QUANTIZED_METHODS + ): + return None + return quant_config + + class ComponentQuantizationConfig(QuantizationConfig): """Routes quantization to different configs by layer prefix.""" From f03ab38783cb6ed5f110540966aae54fec06828d Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Tue, 14 Apr 2026 22:26:55 +0800 Subject: [PATCH 38/76] [BugFix] Fix NoneType' object has no attribute 'detach' (#2797) Signed-off-by: amy-why-3459 --- tests/e2e/online_serving/test_qwen3_omni.py | 2 +- vllm_omni/worker/gpu_ar_model_runner.py | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py index fcda20ba388..f4aabb8b957 100644 --- a/tests/e2e/online_serving/test_qwen3_omni.py +++ b/tests/e2e/online_serving/test_qwen3_omni.py @@ -120,7 +120,7 @@ def test_mix_to_text_audio_001(omni_server, openai_client) -> None: } # Test single completion - openai_client.send_omni_request(request_config) + openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) @pytest.mark.advanced_model diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 4f3f843e658..62a0c857164 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -797,12 +797,11 @@ def propose_draft_token_ids(sampled_token_ids): elif isinstance(v, dict): mm_payload[k] = {sk: sv[start:end].contiguous() for sk, sv in v.items()} elif isinstance(v, list): - if idx < len(v): - element = v[idx] - if element is not None: - if isinstance(element, torch.Tensor): - element = element.clone() - mm_payload[k] = element + element = v[idx] if idx < len(v) else v[0] 
+ if element is not None: + if isinstance(element, torch.Tensor): + element = element.clone() + mm_payload[k] = element # Skip None elements: msgspec cannot serialize None # in dict[str, torch.Tensor] typed fields. elif isinstance(v, torch.Tensor): From bc4a659f03f7d28892fa1a52a1cceaa55ddac0ba Mon Sep 17 00:00:00 2001 From: "Yiyang \"Ian\" Liu" Date: Tue, 14 Apr 2026 07:41:28 -0700 Subject: [PATCH 39/76] [Bugfix] Make mrope kwargs optional in HunyuanImage3 get_mrope_input_positions (#2654) Signed-off-by: Yiyang Liu Co-authored-by: SYLAR <125541396+lishunyang12@users.noreply.github.com> --- .../model_executor/models/hunyuan_image3/hunyuan_image3.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py index 6d25274f901..5c280ddcf48 100644 --- a/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py +++ b/vllm_omni/model_executor/models/hunyuan_image3/hunyuan_image3.py @@ -1507,9 +1507,9 @@ def get_mrope_input_positions( input_tokens: list[int], mm_features: list[MultiModalFeatureSpec] | None = None, *, - hf_config: PretrainedConfig, - image_grid_thw: list[list[int]] | torch.Tensor, - video_grid_thw: list[list[int]] | torch.Tensor, + hf_config: PretrainedConfig | None = None, + image_grid_thw: list[list[int]] | torch.Tensor | None = None, + video_grid_thw: list[list[int]] | torch.Tensor | None = None, second_per_grid_ts: list[float] | None = None, context_len: int = 0, seq_len: int | None = None, From 9e46a79c17d4f0153f8347a17fc18710e10a8298 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Wed, 15 Apr 2026 08:52:32 +0800 Subject: [PATCH 40/76] [Bugfix] Handle numpy array outputs when generate image (#1680) Signed-off-by: rongfu.leng --- .../openai_api/test_image_server.py | 88 +++++++++++++++++++ vllm_omni/entrypoints/openai/api_server.py | 35 +++++++- 2 files changed, 122 insertions(+), 1 deletion(-) diff 
--git a/tests/entrypoints/openai_api/test_image_server.py b/tests/entrypoints/openai_api/test_image_server.py index c91c5a5c751..4b38692da33 100644 --- a/tests/entrypoints/openai_api/test_image_server.py +++ b/tests/entrypoints/openai_api/test_image_server.py @@ -1165,3 +1165,91 @@ def test_image_edit_with_seed_zero_single_stage(test_client): f"Expected seed=0, but got seed={captured_sampling_params.seed}. " "This indicates the bug where seed=0 is treated as falsy." ) + + +def test_normalize_image(): + """Test _normalize_image with various input types""" + import numpy as np + + from vllm_omni.entrypoints.openai.api_server import _normalize_image + + # Test PIL Image input + img = Image.new("RGB", (64, 64), color="red") + result = _normalize_image(img) + assert isinstance(result, Image.Image) + assert result.size == (64, 64) + + # Test uint8 numpy array + arr = np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8) + result = _normalize_image(arr) + assert isinstance(result, Image.Image) + assert result.size == (64, 64) + + # Test float [0, 1] numpy array + arr = np.random.rand(64, 64, 3).astype(np.float32) + result = _normalize_image(arr) + assert isinstance(result, Image.Image) + assert result.size == (64, 64) + + # Test float [-1, 1] numpy array + arr = np.random.rand(64, 64, 3).astype(np.float32) * 2 - 1 + result = _normalize_image(arr) + assert isinstance(result, Image.Image) + assert result.size == (64, 64) + + # Test batch dimensions (1, 1, H, W, C) + arr = np.random.randint(0, 255, (1, 1, 64, 64, 3), dtype=np.uint8) + result = _normalize_image(arr) + assert isinstance(result, Image.Image) + assert result.size == (64, 64) + + +def test_extract_images_from_result(): + """Test _extract_images_from_result with various result formats""" + import numpy as np + + from vllm_omni.entrypoints.openai.api_server import _extract_images_from_result + + # Test empty result + class EmptyResult: + pass + + result = EmptyResult() + images = 
_extract_images_from_result(result) + assert images == [] + + # Test nested batch: [np.array(shape=(3, 64, 64, 3))] + batch = np.random.randint(0, 255, (3, 1, 64, 64, 3), dtype=np.uint8) + + class BatchResult: + def __init__(self): + self.images = [batch] + + result = BatchResult() + images = _extract_images_from_result(result) + assert len(images) == 3 + assert all(isinstance(img, Image.Image) for img in images) + assert all(img.size == (64, 64) for img in images) + + # Test dict path: result.request_output["images"] + class DictRequestOutput: + def __init__(self): + self.request_output = {"images": [np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8)]} + + result = DictRequestOutput() + images = _extract_images_from_result(result) + assert len(images) == 1 + assert isinstance(images[0], Image.Image) + + # Test attribute path: result.request_output.images + class AttrRequestOutput: + def __init__(self): + self.request_output = type( + "obj", (), {"images": [np.random.randint(0, 255, (32, 32, 3), dtype=np.uint8)]} + )() + + result = AttrRequestOutput() + images = _extract_images_from_result(result) + assert len(images) == 1 + assert isinstance(images[0], Image.Image) + assert images[0].size == (32, 32) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 6a65f443322..d847a96db66 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -18,6 +18,7 @@ from typing import Annotated, Any, Literal, cast import httpx +import numpy as np import vllm.envs as envs from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, Request, UploadFile, WebSocket from fastapi.responses import FileResponse, JSONResponse, Response, StreamingResponse @@ -1767,6 +1768,34 @@ def _update_if_not_none(object: Any, key: str, val: Any) -> None: setattr(object, key, val) +def _normalize_image(image: Any) -> Any: + """Normalize a single image output to a PIL-compatible 
format.""" + if isinstance(image, Image.Image): + return image + if not isinstance(image, np.ndarray): + raise ValueError(f"Unsupported image type: {type(image)}") + if not np.issubdtype(image.dtype, np.integer) and not np.issubdtype(image.dtype, np.floating): + raise ValueError(f"Unsupported dtype: {image.dtype}") + if isinstance(image, np.ndarray): + while image.ndim > 3: + image = image[0] + if image.min() < 0: + if image.min() < -1.01 or image.max() > 1.01: + logger.warning( + f"Image float range [{image.min():.2f}, {image.max():.2f}] outside expected [-1, 1]. " + f"Clipping to [-1, 1] before normalization." + ) + image = np.clip(image, -1.0, 1.0) * 0.5 + 0.5 + elif image.max() > 1.01: + logger.warning( + f"Image float range [{image.min():.2f}, {image.max():.2f}] outside expected [0, 1]. " + f"Clipping to [0, 1] before normalization." + ) + image = (np.clip(image, 0.0, 1.0) * 255).astype(np.uint8) + image = Image.fromarray(image) + return image + + def _extract_images_from_result(result: Any) -> list[Any]: images = [] if hasattr(result, "images") and result.images: @@ -1777,6 +1806,10 @@ def _extract_images_from_result(result: Any) -> list[Any]: images = request_output["images"] elif hasattr(request_output, "images") and request_output.images: images = request_output.images + # Handle when generate more than one image + if images and isinstance(images[0], np.ndarray) and images[0].shape[0] > 1 and images[0].ndim == 5: + # Unwrap batch: (N, T, H, W, C) -> [img1, img2, ...] + images = list(images[0]) # Flatten nested lists (e.g., from layered models like Qwen-Image-Layered). # Note: This only flattens one level deep. Deeper nesting is not supported. 
flattened = [] @@ -1785,7 +1818,7 @@ def _extract_images_from_result(result: Any) -> list[Any]: flattened.extend(img) else: flattened.append(img) - return flattened + return [_normalize_image(img) for img in flattened] async def _load_input_images( From 02e5dc747d028ab75a136988985b32dc83d33557 Mon Sep 17 00:00:00 2001 From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Date: Tue, 14 Apr 2026 21:58:25 -0400 Subject: [PATCH 41/76] [Perf] VoxCPM2: streaming VAE + compile optimization (45% RTF reduction) (#2758) Signed-off-by: Yueqian Lin --- examples/offline_inference/voxcpm2/end2end.py | 10 +- .../online_serving/voxcpm2/gradio_demo.py | 602 ++++++++++++++++++ tests/e2e/offline_inference/test_voxcpm2.py | 10 +- vllm_omni/engine/output_processor.py | 7 +- .../models/voxcpm2/minicpm4_paged.py | 71 +-- .../models/voxcpm2/voxcpm2_talker.py | 95 ++- 6 files changed, 716 insertions(+), 79 deletions(-) create mode 100644 examples/online_serving/voxcpm2/gradio_demo.py diff --git a/examples/offline_inference/voxcpm2/end2end.py b/examples/offline_inference/voxcpm2/end2end.py index ce404bf962d..687e596018c 100644 --- a/examples/offline_inference/voxcpm2/end2end.py +++ b/examples/offline_inference/voxcpm2/end2end.py @@ -74,16 +74,20 @@ def extract_audio(multimodal_output: dict) -> torch.Tensor: The output processor concatenates per-step delta tensors under ``model_outputs``. Falls back to ``audio`` for backwards compat. 
""" - audio = multimodal_output.get("model_outputs") or multimodal_output.get("audio") + audio = multimodal_output.get("model_outputs") + if audio is None: + audio = multimodal_output.get("audio") if audio is None: raise ValueError(f"No audio key in multimodal_output: {list(multimodal_output.keys())}") if isinstance(audio, list): - # Take the last valid tensor (most complete audio) + # Defensive: usually the output processor consolidates into a single + # tensor at request completion, but concatenate here too in case the + # caller consumes intermediate (pre-consolidation) outputs. valid = [torch.as_tensor(a).float().cpu().reshape(-1) for a in audio if a is not None] if not valid: raise ValueError("Audio list is empty or all elements are None.") - return valid[-1] + return torch.cat(valid, dim=0) if len(valid) > 1 else valid[0] return torch.as_tensor(audio).float().cpu().reshape(-1) diff --git a/examples/online_serving/voxcpm2/gradio_demo.py b/examples/online_serving/voxcpm2/gradio_demo.py new file mode 100644 index 00000000000..a33a2d9245f --- /dev/null +++ b/examples/online_serving/voxcpm2/gradio_demo.py @@ -0,0 +1,602 @@ +"""Gradio demo for VoxCPM2 TTS with gapless streaming audio playback. + +Uses a custom AudioWorklet-based player for gap-free streaming +(adapted from the Qwen3-TTS demo). Audio is streamed from the vLLM +server through a same-origin proxy and played via the Web Audio API's +AudioWorklet, which maintains a FIFO buffer queue and plays samples at +the audio clock rate. 
+ +Usage: + # Start the vLLM server first: + python -m vllm_omni.entrypoints.openai.api_server \ + --model openbmb/VoxCPM2 \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm2.yaml \ + --host 0.0.0.0 --port 8000 + + # Then launch the demo: + python gradio_demo.py --api-base http://localhost:8000 +""" + +from __future__ import annotations + +import argparse +import base64 +import io +import json +import logging + +import gradio as gr +import httpx +import numpy as np +import soundfile as sf +from fastapi import FastAPI, Request +from fastapi.responses import Response, StreamingResponse + +logger = logging.getLogger(__name__) + +SAMPLE_RATE = 48000 + +# ── AudioWorklet processor (loaded in browser via Blob URL) ────────── +WORKLET_JS = r""" +class TTSPlaybackProcessor extends AudioWorkletProcessor { + constructor() { + super(); + this.queue = []; + this.buf = null; + this.pos = 0; + this.playing = false; + this.played = 0; + this.port.onmessage = (e) => { + if (e.data && e.data.type === 'clear') { + this.queue = []; this.buf = null; this.pos = 0; this.played = 0; + if (this.playing) { this.playing = false; this.port.postMessage({type:'stopped'}); } + return; + } + this.queue.push(e.data); + }; + } + process(inputs, outputs) { + const out = outputs[0][0]; + for (let i = 0; i < out.length; i++) { + if (!this.buf || this.pos >= this.buf.length) { + if (this.queue.length > 0) { + this.buf = this.queue.shift(); this.pos = 0; + } else { + for (let j = i; j < out.length; j++) out[j] = 0; + if (this.playing) { this.playing = false; this.port.postMessage({type:'stopped', played:this.played}); } + return true; + } + } + out[i] = this.buf[this.pos++] / 32768; + this.played++; + } + if (!this.playing) { this.playing = true; this.port.postMessage({type:'started'}); } + return true; + } +} +registerProcessor('tts-playback-processor', TTSPlaybackProcessor); +""" + +PLAYER_HTML = """ +

+""" + + +def _build_player_js() -> str: + return f""" + +""" + + +def _encode_audio(audio_data: tuple) -> str: + sr, audio_np = audio_data + if audio_np.dtype in (np.float32, np.float64): + audio_np = np.clip(audio_np, -1.0, 1.0) + audio_np = (audio_np * 32767).astype(np.int16) + elif audio_np.dtype != np.int16: + audio_np = audio_np.astype(np.int16) + buf = io.BytesIO() + sf.write(buf, audio_np, sr, format="WAV") + return f"data:audio/wav;base64,{base64.b64encode(buf.getvalue()).decode()}" + + +def create_app(api_base: str): + app = FastAPI() + _pending: dict[str, dict] = {} + + @app.post("/proxy/v1/audio/speech") + async def proxy_speech(request: Request): + body = await request.json() + req_id = body.get("_req_id") + if req_id and req_id in _pending: + body = _pending.pop(req_id) + logger.info("Proxy: %s", {k: (f"<{len(str(v))} chars>" if k == "ref_audio" else v) for k, v in body.items()}) + try: + client = httpx.AsyncClient(timeout=300) + resp = await client.send( + client.build_request( + "POST", + f"{api_base}/v1/audio/speech", + json=body, + headers={"Authorization": "Bearer EMPTY", "Content-Type": "application/json"}, + ), + stream=True, + ) + except Exception as exc: + logger.exception("Proxy connection error") + await client.aclose() + return Response(content=str(exc), status_code=502) + if resp.status_code != 200: + content = await resp.aread() + await resp.aclose() + await client.aclose() + return Response(content=content, status_code=resp.status_code) + + async def relay(): + try: + async for chunk in resp.aiter_bytes(): + yield chunk + finally: + await resp.aclose() + await client.aclose() + + return StreamingResponse(relay(), media_type="application/octet-stream") + + css = """ + #generate-btn button { width: 100%; } + #streaming-player { border: 1px solid var(--border-color-primary) !important; border-radius: var(--block-radius) !important; padding: var(--block-padding) !important; } + """ + theme = gr.themes.Default( + primary_hue=gr.themes.Color( 
+ c50="#f0f5ff", + c100="#dce6f9", + c200="#b8cef3", + c300="#8eb2eb", + c400="#6496e0", + c500="#4A90D9", + c600="#3a7bc8", + c700="#2d66b0", + c800="#1f4f8f", + c900="#163a6e", + c950="#0e2650", + ), + ) + + with gr.Blocks(title="VoxCPM2 TTS Demo") as demo: + gr.HTML(f""" + + """) + + gr.Markdown( + "**Three modes:** " + "**Voice Design** (control instruction only) · " + "**Controllable Cloning** (ref audio + optional style control) · " + "**Ultimate Cloning** (ref audio + transcript for audio continuation)" + ) + + with gr.Row(): + with gr.Column(scale=3): + text_input = gr.Textbox( + label="Target Text", + placeholder="Enter text to synthesize...", + lines=4, + ) + control_instruction = gr.Textbox( + label="Control Instruction (optional)", + placeholder="e.g. A warm young woman / Excited and fast-paced", + lines=2, + info="Describe voice style, emotion, pace. Works for both Voice Design and Controllable Cloning.", + ) + + with gr.Accordion("Voice Cloning", open=False): + ref_audio = gr.Audio( + label="Reference Audio (upload for cloning)", + type="numpy", + sources=["upload", "microphone"], + ) + ref_audio_url = gr.Textbox( + label="or Reference Audio URL", + placeholder="https://example.com/reference.wav", + ) + ultimate_clone = gr.Checkbox( + label="Ultimate Cloning Mode", + value=False, + info="Provide transcript of ref audio for audio continuation (disables control instruction)", + ) + prompt_text = gr.Textbox( + label="Reference Audio Transcript", + placeholder="Transcript of your reference audio (for ultimate cloning)", + lines=2, + visible=False, + ) + + with gr.Row(): + stream_checkbox = gr.Checkbox( + label="Stream (gapless)", + value=True, + info="AudioWorklet streaming", + ) + with gr.Row(): + generate_btn = gr.Button( + "Generate Speech", + variant="primary", + size="lg", + elem_id="generate-btn", + scale=3, + ) + reset_btn = gr.Button("Reset", variant="secondary", size="lg", scale=1) + + with gr.Column(scale=2): + player_html = gr.HTML( + 
value=PLAYER_HTML, + visible=True, + label="streaming player", + elem_id="streaming-player", + ) + audio_output = gr.Audio( + label="generated audio", + interactive=False, + autoplay=True, + visible=False, + ) + gr.Examples( + examples=[ + ["Hello, this is a VoxCPM2 demo running on vLLM-Omni.", ""], + [ + "I have a dream that my four little children will one day live in a nation " + "where they will not be judged by the color of their skin but by the content " + "of their character.", + "", + ], + [ + "I never asked you to stay. It's not like I care or anything. " + "But why does it still hurt so much now that you're gone?", + "A young girl with a soft, sweet voice. Speaks slowly with a melancholic tone.", + ], + ], + inputs=[text_input, control_instruction], + label="examples", + ) + gr.HTML(""" +
+ + vLLM-Omni + +
+ """) + + hidden_payload = gr.Textbox(visible=False, elem_id="tts-payload") + + def on_ultimate_toggle(checked): + return ( + gr.update(visible=checked), # prompt_text + gr.update(interactive=not checked), # control_instruction + ) + + ultimate_clone.change( + fn=on_ultimate_toggle, + inputs=[ultimate_clone], + outputs=[prompt_text, control_instruction], + ) + + def on_stream_change(stream: bool): + if stream: + return gr.update(visible=True), gr.update(visible=False) + return gr.update(visible=False), gr.update(visible=True) + + stream_checkbox.change( + fn=on_stream_change, + inputs=[stream_checkbox], + outputs=[player_html, audio_output], + ) + + def on_reset(): + return "", "", None, "", False, "", PLAYER_HTML + + reset_btn.click( + fn=on_reset, + outputs=[ + text_input, + control_instruction, + audio_output, + hidden_payload, + ultimate_clone, + prompt_text, + player_html, + ], + js="() => { if (window.ttsStop) window.ttsStop(); }", + ) + + def on_generate(stream_enabled, text, ctrl_instr, ref_a, ref_url, ult_clone, p_text): + import time as _time + + if not text or not text.strip(): + raise gr.Error("Please enter text to synthesize.") + + # VoxCPM2 uses "(instruction)text" format for control + ctrl = ctrl_instr.strip() if ctrl_instr and not ult_clone else "" + final_text = f"({ctrl}){text.strip()}" if ctrl else text.strip() + + payload: dict = { + "input": final_text, + "voice": "default", + "response_format": "pcm" if stream_enabled else "wav", + "stream": stream_enabled, + } + + # Reference audio for cloning + ref_url_s = ref_url.strip() if ref_url else "" + if ref_url_s: + payload["ref_audio"] = ref_url_s + elif ref_a is not None: + payload["ref_audio"] = _encode_audio(ref_a) + + # Ultimate cloning: prompt_audio + prompt_text for continuation + if ult_clone and p_text and p_text.strip(): + if ref_url_s: + payload["prompt_audio"] = ref_url_s + elif ref_a is not None: + payload["prompt_audio"] = payload.get("ref_audio", "") + payload["prompt_text"] = 
p_text.strip() + + if stream_enabled: + if ref_a is not None and not ref_url_s: + req_id = f"req-{int(_time.time() * 1000)}" + _pending[req_id] = payload + browser_payload = {"_req_id": req_id, "_nonce": int(_time.time() * 1000)} + return json.dumps(browser_payload), gr.update() + payload["_nonce"] = int(_time.time() * 1000) + return json.dumps(payload), gr.update() + else: + try: + with httpx.Client(timeout=300.0) as client: + resp = client.post( + f"{api_base}/v1/audio/speech", + json=payload, + headers={"Content-Type": "application/json", "Authorization": "Bearer EMPTY"}, + ) + except httpx.ConnectError: + raise gr.Error(f"Cannot connect to server at {api_base}.") + if resp.status_code != 200: + raise gr.Error(f"Server error ({resp.status_code}): {resp.text[:200]}") + audio_np, sr = sf.read(io.BytesIO(resp.content)) + if audio_np.ndim > 1: + audio_np = audio_np[:, 0] + return "", (sr, audio_np.astype(np.float32)) + + generate_btn.click( + fn=on_generate, + inputs=[ + stream_checkbox, + text_input, + control_instruction, + ref_audio, + ref_audio_url, + ultimate_clone, + prompt_text, + ], + outputs=[hidden_payload, audio_output], + ).then( + fn=lambda p: p, + inputs=[hidden_payload], + outputs=[hidden_payload], + js="(p) => { if (p && p.trim()) { const d = JSON.parse(p); delete d._nonce; window.ttsGenerate(d); } return p; }", + ) + + demo.queue() + + return gr.mount_gradio_app(app, demo, path="/", css=css, theme=theme, head=_build_player_js()) + + +def main(): + parser = argparse.ArgumentParser(description="VoxCPM2 streaming Gradio demo") + parser.add_argument("--api-base", default="http://localhost:8000", help="vLLM API server URL") + parser.add_argument("--host", default="0.0.0.0", help="Gradio server host") + parser.add_argument("--port", type=int, default=7860, help="Gradio server port") + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO) + print(f"Connecting to vLLM server at: {args.api_base}") + + import uvicorn + + 
uvicorn.run(create_app(args.api_base), host=args.host, port=args.port) + + +if __name__ == "__main__": + main() diff --git a/tests/e2e/offline_inference/test_voxcpm2.py b/tests/e2e/offline_inference/test_voxcpm2.py index 4e4f635d5c4..6ec4630a45e 100644 --- a/tests/e2e/offline_inference/test_voxcpm2.py +++ b/tests/e2e/offline_inference/test_voxcpm2.py @@ -33,14 +33,16 @@ def _extract_audio(multimodal_output: dict) -> torch.Tensor: """Extract the final complete audio tensor from multimodal output.""" assert isinstance(multimodal_output, dict), f"Expected dict, got {type(multimodal_output)}" - # Output processor accumulates per-step full audio under "audio". - audio = multimodal_output.get("audio") or multimodal_output.get("model_outputs") + # Output processor accumulates per-step audio chunks under "audio". + audio = multimodal_output.get("audio") + if audio is None: + audio = multimodal_output.get("model_outputs") assert audio is not None, f"No audio key, got {list(multimodal_output.keys())}" if isinstance(audio, list): - valid = [x for x in audio if isinstance(x, torch.Tensor) and x.numel() > 100] + valid = [torch.as_tensor(x).float().cpu().reshape(-1) for x in audio if x is not None] assert valid, "No valid audio tensors in output list" - audio = valid[-1] + audio = torch.cat(valid, dim=0) if len(valid) > 1 else valid[0] assert isinstance(audio, torch.Tensor), f"Expected Tensor, got {type(audio)}" return audio diff --git a/vllm_omni/engine/output_processor.py b/vllm_omni/engine/output_processor.py index 43d02e85b84..badd799fc94 100644 --- a/vllm_omni/engine/output_processor.py +++ b/vllm_omni/engine/output_processor.py @@ -118,9 +118,10 @@ def _consolidate_multimodal_tensors(self) -> None: if isinstance(v, list) and v and isinstance(v[0], torch.Tensor): try: if k == "audio": - # When the audio tensor shape is inconsistent, torch.cat will fail. - # We need to use torch.cat in -1 dimension. - continue + # Concatenate delta audio chunks (1-D) into the full waveform. 
+ # Each entry is a per-step slice; flatten to -1 so chunks with + # inconsistent leading dims can still be joined on the sample axis. + self.mm_accumulated[k] = torch.cat([t.reshape(-1) for t in v], dim=0) elif k == "sr": # Sample rate is a constant scalar, keep last value. self.mm_accumulated[k] = v[-1] diff --git a/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py b/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py index 7ea5bc229dc..40bacfff6c7 100644 --- a/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py +++ b/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py @@ -308,31 +308,28 @@ def forward( return hidden_states def compile_selective(self) -> list[str]: - """Compile MLP + o_proj; keep RMSNorm/RoPE eager for precision.""" - compiled: list[str] = [] - for i, layer in enumerate(self.layers): - if i in self._compiled_layers: - continue - try: - layer.mlp = torch.compile( - layer.mlp, - mode="default", - fullgraph=True, - ) - layer.self_attn.o_proj = torch.compile( - layer.self_attn.o_proj, - mode="default", - fullgraph=True, - ) - layer.self_attn._fused_qkv_weight = None - self._compiled_layers.add(i) - if i == 0: - compiled.append(f"layers.*.mlp (×{len(self.layers)})") - compiled.append(f"layers.*.self_attn.o_proj (×{len(self.layers)})") - except Exception as e: - logger.warning("compile_selective: layer %d failed: %s", i, e) - break - return compiled + """Compile the full model forward as one graph. + + Earlier versions compiled ``layer.mlp`` + ``layer.self_attn.o_proj`` + (PR #2690) and then the whole ``layer`` (perf/voxcpm2-streaming-vae). + Both still paid one Dynamo dispatch per layer per decode step. + V3 profiling showed 1,332 per-layer dispatches (~28 layers × ~47 + decode steps) costing ~726 ms of CPU self-time for a long prompt. + + Compiling ``forward`` at the model level lets Dynamo unroll the + 28-layer Python loop inside the graph. 
Graph breaks at + PagedAttention produce sub-graphs but Dynamo memoises the whole + trace once, so the per-step dispatch drops from 28 to just a few. + """ + if self._compiled_layers: + return [] + # Null the fused-qkv caches so the compile sees the real weight layout. + for layer in self.layers: + layer.self_attn._fused_qkv_weight = None + self.forward = torch.compile(self.forward, mode="default", fullgraph=False) + # Mark every layer as compiled so idempotent callers don't double-wrap. + self._compiled_layers.update(range(len(self.layers))) + return ["forward (whole model)"] def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: """Load weights from native checkpoint (base_lm. prefix pre-stripped).""" @@ -415,22 +412,14 @@ def forward( return hidden_states def compile_selective(self) -> list[str]: - """Compile MLP + o_proj (same as base_lm).""" - compiled: list[str] = [] - for i, layer in enumerate(self.layers): - if i in self._compiled_layers: - continue - try: - layer.mlp = torch.compile(layer.mlp, mode="default", fullgraph=True) - layer.self_attn.o_proj = torch.compile(layer.self_attn.o_proj, mode="default", fullgraph=True) - layer.self_attn._fused_qkv_weight = None - self._compiled_layers.add(i) - if i == 0: - compiled.append(f"layers.*.mlp (×{len(self.layers)})") - compiled.append(f"layers.*.self_attn.o_proj (×{len(self.layers)})") - except Exception as e: - logger.warning("compile_selective: residual layer %d failed: %s", i, e) - return compiled + """Compile the full residual model forward as one graph (same strategy as base_lm).""" + if self._compiled_layers: + return [] + for layer in self.layers: + layer.self_attn._fused_qkv_weight = None + self.forward = torch.compile(self.forward, mode="default", fullgraph=False) + self._compiled_layers.update(range(len(self.layers))) + return ["forward (whole residual)"] def load_weights_from_native(self, native_residual_lm: nn.Module) -> int: """Load weights from native residual_lm. 
Returns param count.""" diff --git a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py index 0898ca59ae4..94f06589046 100644 --- a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py +++ b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py @@ -11,6 +11,7 @@ from __future__ import annotations import dataclasses +import logging import os import time from collections.abc import Iterable @@ -19,7 +20,6 @@ import librosa import torch import torch.nn as nn -from einops import rearrange from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.models.utils import ( @@ -86,7 +86,11 @@ class _RequestState: curr_prefix_feat_cond: torch.Tensor | None = None last_audio_patch_gpu: torch.Tensor | None = None precomputed_stop_logits: torch.Tensor | None = None - accumulated_patches: list[torch.Tensor] = dataclasses.field(default_factory=list) + # Rolling tail of previously-decoded latents used as VAE receptive-field context. + # Shape (n_pad_frames, feat_dim) on GPU. None before first decode. + decode_pad: torch.Tensor | None = None + # Audio chunks already emitted (CPU float32), concatenated for cumulative output. + audio_chunks: list[torch.Tensor] = dataclasses.field(default_factory=list) decode_step_count: int = 0 request_start_time: float = 0.0 prefill_completed: bool = False @@ -229,11 +233,11 @@ def _optimized_solve_euler( buffers.x_in[b : 2 * b].copy_(x) buffers.mu_in[:b].copy_(mu) buffers.mu_in[b : 2 * b].zero_() - buffers.t_in[:b].fill_(t.item()) - buffers.t_in[b : 2 * b].fill_(t.item()) + # Broadcast the 0-dim GPU scalar directly instead of + # ``.fill_(t.item())`` — ``.item()`` forces a GPU->CPU sync. 
+ buffers.t_in[: 2 * b].copy_(t) if mean_mode: - buffers.dt_in[:b].fill_(dt.item()) - buffers.dt_in[b : 2 * b].fill_(dt.item()) + buffers.dt_in[: 2 * b].copy_(dt) else: buffers.dt_in.zero_() buffers.cond_in[:b].copy_(cond[:b]) @@ -263,9 +267,10 @@ def _optimized_solve_euler( else: buffers.x_in[:b].copy_(x) buffers.mu_in[:b].copy_(mu) - buffers.t_in[:b].fill_(t.item()) + # Broadcast the 0-dim GPU scalar; ``.fill_(t.item())`` would sync. + buffers.t_in[:b].copy_(t) if mean_mode: - buffers.dt_in[:b].fill_(dt.item()) + buffers.dt_in[:b].copy_(dt) else: buffers.dt_in[:b].zero_() buffers.cond_in[:b].copy_(cond[:b]) @@ -320,7 +325,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._inference_timesteps = 10 self._cfg_value = 2.0 self._cfg_cutoff_ratio = 1.0 - self._vae_decode_interval = 5 + # Number of trailing latent frames to keep as VAE receptive-field context + # for sliding-window streaming decode. 12 matches the nanovllm reference + # implementation and covers the longest VAE decoder receptive field. + self._n_decode_pad_frames = 12 self._enable_torch_compile = True self._compile_vae = True self._max_decode_steps = 2000 @@ -686,7 +694,9 @@ def _finish_prefill(self, state: _RequestState, meta: dict, res_out: torch.Tenso state.request_start_time = time.perf_counter() state.prefill_completed = True - logger.info("PREFILL[%s]: patch norm=%.4f", state.request_id, pred_feat.norm().item()) + if logger.isEnabledFor(logging.DEBUG): + # Only compute the norm (which forces a GPU->CPU sync) if we will log it. 
+ logger.debug("PREFILL[%s]: patch norm=%.4f", state.request_id, pred_feat.norm().item()) self._perf.reset() def _finish_decode(self, state: _RequestState, meta: dict, res_out: torch.Tensor, dev: Any): @@ -720,26 +730,54 @@ def _finish_decode(self, state: _RequestState, meta: dict, res_out: torch.Tensor # -------------------- audio collection -------------------- def _collect_audio(self, state: _RequestState) -> torch.Tensor | None: - patch = state.last_audio_patch_gpu - if patch is not None: - state.last_audio_patch_gpu = None - state.accumulated_patches.append(patch.reshape(1, -1).float()) + """Per-step sliding-window VAE decode (nanovllm pattern). - if not state.accumulated_patches: + Each decode step feeds ``[decode_pad, new_patch]`` through the VAE + and slices out only the audio region corresponding to the new patch. + The pad buffer (last ``_n_decode_pad_frames`` latent frames) provides + the receptive-field context needed by the VAE's transposed convolutions, + eliminating boundary artifacts between chunks. + + Returns the delta audio chunk (not cumulative) so the output processor + can stream each chunk to the client independently. 
+ """ + patch = state.last_audio_patch_gpu + if patch is None: return None + state.last_audio_patch_gpu = None + + # patch shape: (patch_size, feat_dim) or (1, patch_size, feat_dim) + new_latent = patch.reshape(-1, self._feat_dim).to(torch.float32) + n_new = new_latent.shape[0] # = patch_size (typically 4) + + self._perf.start("vae_decode") + + # Build VAE input: [pad_frames | new_latent] + if state.decode_pad is not None: + vae_input = torch.cat([state.decode_pad, new_latent], dim=0) + pad_frames = state.decode_pad.shape[0] + else: + vae_input = new_latent + pad_frames = 0 + + # VAE decode: (1, feat_dim, T_frames) -> (1, 1, T_samples) + feat = vae_input.unsqueeze(0).transpose(1, 2).contiguous() + with torch.no_grad(): + audio = self.tts.audio_vae.decode(feat.to(self._device)).reshape(-1) + + # Slice out only the new audio (after the pad region). + # Each latent frame maps to decoder_chunk_size audio samples. + dcs = int(getattr(self.tts.audio_vae, "decode_chunk_size", audio.numel() // vae_input.shape[0])) + new_audio = audio[pad_frames * dcs : (pad_frames + n_new) * dcs].detach().cpu().float() + + # Roll the pad buffer: keep last N latent frames as context for next step. 
+ all_latents = vae_input # [pad + new] + state.decode_pad = all_latents[-self._n_decode_pad_frames :].detach() - n = len(state.accumulated_patches) - if n <= 1 or n % self._vae_decode_interval == 0 or state.is_stopping: - self._perf.start("vae_decode") - all_p = torch.cat(state.accumulated_patches, dim=0) - state.accumulated_patches = [all_p] - feat = rearrange(all_p.reshape(1, -1, self._feat_dim), "b t d -> b d t") - with torch.no_grad(): - audio = self.tts.audio_vae.decode(feat.to(self._device)).reshape(-1).cpu().float() - self._perf.stop("vae_decode") - state.last_decoded_audio = audio - return audio - return state.last_decoded_audio + state.audio_chunks.append(new_audio) + state.last_decoded_audio = new_audio + self._perf.stop("vae_decode") + return new_audio # -------------------- compute_logits -------------------- @@ -830,7 +868,8 @@ def preprocess( state = self._get_or_create_state(req_id) state.prefill_text = "" - state.accumulated_patches = [] + state.decode_pad = None + state.audio_chunks = [] state.prefill_completed = False state.decode_step_count = 0 state.precomputed_stop_logits = None From a782ae47805d9761f446e4e715530af0f54859ab Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Wed, 15 Apr 2026 10:28:28 +0800 Subject: [PATCH 42/76] [Perf] Enhance benchmark script to support baseline thresholds and proved result handling (#2789) --- tests/dfx/perf/scripts/run_benchmark.py | 98 +++++++++++++++++-- .../scripts/test_benchmark_stability.py | 2 + tools/nightly/generate_nightly_perf_excel.py | 49 +++++++--- tools/nightly/generate_nightly_perf_html.py | 45 +++++++-- 4 files changed, 162 insertions(+), 32 deletions(-) diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index b64cc0d9503..67dedcd0480 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -56,16 +56,41 @@ def omni_server(request): print("OmniServer stopped") +def 
_safe_filename_token(value: Any | None, *, default: str = "na") -> str: + """Make a single path segment safe for result filenames on common filesystems.""" + if value is None: + return default + s = str(value).strip() + for bad in ("/", "\\", ":", "*", "?", '"', "<", ">", "|"): + s = s.replace(bad, "_") + return s if s else default + + def run_benchmark( args: list, test_name: str, flow, dataset_name: str, num_prompt, + *, + baseline_config: dict[str, Any] | None = None, + sweep_index: int | None = None, + request_rate: Any | None = None, + max_concurrency: Any | None = None, + random_input_len: Any | None = None, + random_output_len: Any | None = None, ) -> Any: - """Run a single benchmark iteration and return the parsed result JSON.""" + """Run a single benchmark iteration and return the parsed result JSON. + + After ``vllm bench`` writes the JSON, ``result["baseline"]`` holds the same + per-metric resolved thresholds as ``assert_result`` (via ``_baseline_thresholds_for_step``). + When ``random_input_len`` / ``random_output_len`` are set, they are also written into the result JSON; + omitted keys when not configured. 
+ """ current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") - result_filename = f"result_{test_name}_{dataset_name}_{flow}_{num_prompt}_{current_dt}.json" + ri = _safe_filename_token(random_input_len) + ro = _safe_filename_token(random_output_len) + result_filename = f"result_{test_name}_{dataset_name}_{flow}_{num_prompt}_in{ri}_out{ro}_{current_dt}.json" if "--result-filename" in args: print(f"The result file will be overwritten by {result_filename}") command = ( @@ -97,8 +122,26 @@ def run_benchmark( else: result_dir = "./" - with open(os.path.join(result_dir, result_filename), encoding="utf-8") as f: + result_path = os.path.join(result_dir, result_filename) + with open(result_path, encoding="utf-8") as f: result = json.load(f) + + if baseline_config: + result["baseline"] = _baseline_thresholds_for_step( + baseline_config, + sweep_index=sweep_index, + request_rate=request_rate, + max_concurrency=max_concurrency, + ) + else: + result["baseline"] = {} + if random_input_len is not None: + result["random_input_len"] = random_input_len + if random_output_len is not None: + result["random_output_len"] = random_output_len + with open(result_path, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + return result @@ -164,10 +207,33 @@ def _resolve_baseline_value( f"or request_rate={request_rate!r}; keys={list(baseline_raw.keys())!r}" ) if isinstance(baseline_raw, (list, tuple)): + if sweep_index is None: + raise ValueError("list baseline requires sweep_index") + if not (0 <= sweep_index < len(baseline_raw)): + raise IndexError(f"baseline list len={len(baseline_raw)} has no index {sweep_index}") return baseline_raw[sweep_index] return baseline_raw +def _baseline_thresholds_for_step( + baseline_data: dict[str, Any], + *, + sweep_index: int | None = None, + max_concurrency: Any = None, + request_rate: Any = None, +) -> dict[str, Any]: + """Resolve ``test.json`` ``baseline`` block to one threshold per metric (same as ``assert_result``).""" + 
return { + metric_name: _resolve_baseline_value( + baseline_raw, + sweep_index=sweep_index, + max_concurrency=max_concurrency, + request_rate=request_rate, + ) + for metric_name, baseline_raw in baseline_data.items() + } + + def assert_result( result, params, @@ -179,14 +245,14 @@ def assert_result( ) -> None: assert result["completed"] == num_prompt, "Request failures exist" baseline_data = params.get("baseline", {}) - for metric_name, baseline_raw in baseline_data.items(): + thresholds = _baseline_thresholds_for_step( + baseline_data, + sweep_index=sweep_index, + max_concurrency=max_concurrency, + request_rate=request_rate, + ) + for metric_name, baseline_value in thresholds.items(): current_value = result[metric_name] - baseline_value = _resolve_baseline_value( - baseline_raw, - sweep_index=sweep_index, - max_concurrency=max_concurrency, - request_rate=request_rate, - ) if "throughput" in metric_name: if current_value <= baseline_value: print( @@ -258,6 +324,12 @@ def to_list(value, default=None): flow=qps, dataset_name=dataset_name, num_prompt=num_prompt, + baseline_config=params.get("baseline"), + sweep_index=i, + request_rate=qps, + max_concurrency=None, + random_input_len=params.get("random_input_len"), + random_output_len=params.get("random_output_len"), ) assert_result( result, @@ -276,6 +348,12 @@ def to_list(value, default=None): flow=concurrency, dataset_name=dataset_name, num_prompt=num_prompt, + baseline_config=params.get("baseline"), + sweep_index=i, + request_rate=None, + max_concurrency=concurrency, + random_input_len=params.get("random_input_len"), + random_output_len=params.get("random_output_len"), ) assert_result( result, diff --git a/tests/dfx/stability/scripts/test_benchmark_stability.py b/tests/dfx/stability/scripts/test_benchmark_stability.py index e8568652d18..a9faae8ab84 100644 --- a/tests/dfx/stability/scripts/test_benchmark_stability.py +++ b/tests/dfx/stability/scripts/test_benchmark_stability.py @@ -112,6 +112,8 @@ def 
_run_one_benchmark_batch( flow=flow, dataset_name=dataset_name, num_prompt=num_prompts, + random_input_len=params.get("random_input_len"), + random_output_len=params.get("random_output_len"), ) return result except (FileNotFoundError, OSError) as e: diff --git a/tools/nightly/generate_nightly_perf_excel.py b/tools/nightly/generate_nightly_perf_excel.py index 5f9eb428bca..4bb77853171 100644 --- a/tools/nightly/generate_nightly_perf_excel.py +++ b/tools/nightly/generate_nightly_perf_excel.py @@ -319,10 +319,10 @@ def _load_json_file(path: str) -> dict[str, Any] | list[Any] | None: def _parse_from_filename(filename: str) -> dict[str, Any]: - """Parse test-related metadata from a result JSON filename. + """Parse test-related metadata from a ``result_test_*.json`` filename. - Expected pattern (after prefix/suffix stripped): - ____ + Matches ``tests/dfx/perf/scripts/run_benchmark.py`` naming, including optional + ``_in{X}_out{Y}_`` before the timestamp (``na`` when unset). """ name, ext = os.path.splitext(filename) if ext != ".json" or not name.startswith(_RESULT_JSON_PREFIX): @@ -331,22 +331,42 @@ def _parse_from_filename(filename: str) -> dict[str, Any]: core = name[len(_RESULT_JSON_PREFIX) :] parts = core.split("_") if len(parts) < 5: - LOGGER.warning("filename '%s' does not match expected pattern, skip parsing test metadata", filename) + LOGGER.warning( + "filename '%s' does not match expected pattern (need >= 5 segments), skip parsing", + filename, + ) return {} - timestamp = parts[-1] - num_prompts_str = parts[-2] - max_concurrency_str = parts[-3] - dataset_name = parts[-4] - test_name = "_".join(parts[:-4]) if parts[:-4] else "" + idx = len(parts) - 1 + timestamp = parts[idx] + idx -= 1 parsed: dict[str, Any] = {} - if len(timestamp) >= 15: parsed["date"] = timestamp - if dataset_name in DATASET_NAME_ALLOWED: - parsed["dataset_name"] = dataset_name + if idx >= 0 and parts[idx].startswith("out"): + parsed["random_output_len"] = parts[idx][3:] + idx -= 1 + if idx >= 
0 and parts[idx].startswith("in"): + parsed["random_input_len"] = parts[idx][2:] + idx -= 1 + + if idx < 3: + LOGGER.warning( + "filename '%s' has too few segments after timestamp / optional in-out (idx=%s)", + filename, + idx, + ) + return parsed + + num_prompts_str = parts[idx] + idx -= 1 + flow_str = parts[idx] + idx -= 1 + dataset_name = parts[idx] + idx -= 1 + test_name = "_".join(parts[: idx + 1]) if idx >= 0 else "" try: parsed["num_prompts"] = int(num_prompts_str) @@ -354,13 +374,16 @@ def _parse_from_filename(filename: str) -> dict[str, Any]: pass try: - parsed["max_concurrency"] = int(max_concurrency_str) + parsed["max_concurrency"] = int(flow_str) except (TypeError, ValueError): pass if test_name: parsed["test_name"] = test_name + if dataset_name in DATASET_NAME_ALLOWED: + parsed["dataset_name"] = dataset_name + return parsed diff --git a/tools/nightly/generate_nightly_perf_html.py b/tools/nightly/generate_nightly_perf_html.py index 05dc48d717c..dd5ece79074 100644 --- a/tools/nightly/generate_nightly_perf_html.py +++ b/tools/nightly/generate_nightly_perf_html.py @@ -67,6 +67,7 @@ def _load_json_file(path: str) -> dict[str, Any] | None: def _parse_from_filename(filename: str) -> dict[str, Any]: + """Parse ``result_test_*.json`` filenames; same rules as ``generate_nightly_perf_excel``.""" name, ext = os.path.splitext(filename) if ext != ".json" or not name.startswith(_RESULT_JSON_PREFIX): return {} @@ -75,32 +76,58 @@ def _parse_from_filename(filename: str) -> dict[str, Any]: parts = core.split("_") if len(parts) < 5: LOGGER.warning( - "filename '%s' does not match expected pattern, skip parsing test metadata", + "filename '%s' does not match expected pattern (need >= 5 segments), skip parsing", filename, ) return {} - timestamp = parts[-1] - num_prompts_str = parts[-2] - max_concurrency_str = parts[-3] - dataset_name = parts[-4] - test_name = "_".join(parts[:-4]) if parts[:-4] else "" + idx = len(parts) - 1 + timestamp = parts[idx] + idx -= 1 parsed: 
dict[str, Any] = {} if len(timestamp) >= 15: parsed["date"] = timestamp - if dataset_name in ("random", "random-mm"): - parsed["dataset_name"] = dataset_name + + if idx >= 0 and parts[idx].startswith("out"): + parsed["random_output_len"] = parts[idx][3:] + idx -= 1 + if idx >= 0 and parts[idx].startswith("in"): + parsed["random_input_len"] = parts[idx][2:] + idx -= 1 + + if idx < 3: + LOGGER.warning( + "filename '%s' has too few segments after timestamp / optional in-out (idx=%s)", + filename, + idx, + ) + return parsed + + num_prompts_str = parts[idx] + idx -= 1 + flow_str = parts[idx] + idx -= 1 + dataset_name = parts[idx] + idx -= 1 + test_name = "_".join(parts[: idx + 1]) if idx >= 0 else "" + try: parsed["num_prompts"] = int(num_prompts_str) except (TypeError, ValueError): pass + try: - parsed["max_concurrency"] = int(max_concurrency_str) + parsed["max_concurrency"] = int(flow_str) except (TypeError, ValueError): pass + if test_name: parsed["test_name"] = test_name + + if dataset_name in ("random", "random-mm"): + parsed["dataset_name"] = dataset_name + return parsed From 227bab3038a10ba1bde4c2c9154be428b496a7e3 Mon Sep 17 00:00:00 2001 From: amy-why-3459 Date: Wed, 15 Apr 2026 11:07:35 +0800 Subject: [PATCH 43/76] [Benchmark]Omni-modality model accuracy benchmark(Daily-Omni & seed-tts-eval) (#2558) Signed-off-by: amy-why-3459 --- pyproject.toml | 11 + .../data_modules/daily_omni_dataset.py | 887 ++++++++++++++++++ .../data_modules/daily_omni_eval.py | 406 ++++++++ .../data_modules/daily_omni_text_audio.py | 255 +++++ .../data_modules/seed_tts_dataset.py | 272 ++++++ .../benchmarks/data_modules/seed_tts_eval.py | 729 ++++++++++++++ vllm_omni/benchmarks/patch/__init__.py | 3 + vllm_omni/benchmarks/patch/patch.py | 332 ++++++- vllm_omni/benchmarks/serve.py | 12 + vllm_omni/entrypoints/cli/benchmark/serve.py | 143 ++- 10 files changed, 3041 insertions(+), 9 deletions(-) create mode 100644 vllm_omni/benchmarks/data_modules/daily_omni_dataset.py create mode 100644 
vllm_omni/benchmarks/data_modules/daily_omni_eval.py create mode 100644 vllm_omni/benchmarks/data_modules/daily_omni_text_audio.py create mode 100644 vllm_omni/benchmarks/data_modules/seed_tts_dataset.py create mode 100644 vllm_omni/benchmarks/data_modules/seed_tts_eval.py diff --git a/pyproject.toml b/pyproject.toml index 57a4b474fd5..753e0e39817 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,17 @@ demo = [ "gradio>=6.7.0", ] +# Seed-TTS serve benchmark WER (BytedanceSpeech/seed-tts-eval run_wer.py protocol). +seed-tts-eval = [ + "jiwer>=3.0.0", + "zhon>=2.0.0", + "zhconv>=1.4.2", + "scipy>=1.10.0", + "soundfile>=0.12.0", + "transformers>=4.36.0", + "funasr>=1.0.0", +] + docs = [ "mkdocs>=1.5.0", "mkdocs-api-autonav", diff --git a/vllm_omni/benchmarks/data_modules/daily_omni_dataset.py b/vllm_omni/benchmarks/data_modules/daily_omni_dataset.py new file mode 100644 index 00000000000..01b86d0fd1e --- /dev/null +++ b/vllm_omni/benchmarks/data_modules/daily_omni_dataset.py @@ -0,0 +1,887 @@ +"""Daily-Omni Dataset loader for benchmark. + +Daily-Omni is an audio-visual reasoning benchmark with 684 videos +and 1,197 multiple-choice QA pairs across 6 major task types. + +Dataset source: https://huggingface.co/datasets/liarliar/Daily-Omni + +Supports loading QA metadata from: +- Local JSON file (``qa_json_path``): recommended for offline/air-gapped environments +- HuggingFace datasets (``dataset_path``): legacy online mode + +The videos must be separately downloaded and extracted from Videos.tar. + +Why ``BenchmarkDataset`` instead of ``HuggingFaceDataset``? + vLLM's ``HuggingFaceDataset`` is a thin wrapper whose ``__init__`` always ends by calling + ``load_data()`` → ``datasets.load_dataset(...)`` with a required Hub id and split. That + contract fits "Hub-only" benches, but Daily-Omni also needs **offline QA metadata** from a + local ``qa.json`` without touching the network. 
Subclassing ``HuggingFaceDataset`` would + mean fighting the parent constructor (fake ``dataset_path``, reordering ``load_data``, or + duplicating half the parent) and would still imply ``datasets`` is always relevant. + + This class therefore inherits only ``BenchmarkDataset`` (minimal: ``dataset_path``, + ``random_seed``, ``self.data``) and implements **two explicit loaders**: + ``_load_from_local_json`` (default path for air-gapped runs) and ``_load_from_huggingface`` + (optional legacy path for users who prefer ``datasets`` + Hub cache). The latter is **not** + inheritance; it is the same Hub rows as before, factored into a helper so one class can + serve both deployment modes without mandatory ``datasets`` when using ``qa_json_path``. + +Usage: + from vllm_omni.benchmarks.data_modules.daily_omni_dataset import DailyOmniDataset + + # Local JSON mode (recommended) + dataset = DailyOmniDataset( + qa_json_path="/path/to/qa.json", + video_dir="/path/to/Videos", + random_seed=42, + ) + + # HuggingFace mode (legacy, requires network) + dataset = DailyOmniDataset( + dataset_path="liarliar/Daily-Omni", + dataset_split="train", + random_seed=42, + ) + requests = dataset.sample( + tokenizer=tokenizer, + num_requests=100, + output_len=256, + ) +""" + +import base64 +import json +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Literal + +try: + from vllm.benchmarks.datasets import BenchmarkDataset, SampleRequest +except ImportError: + # Fallback: if BenchmarkDataset not available, use base class from same module + from vllm.benchmarks.datasets import HuggingFaceDataset as BenchmarkDataset + from vllm.benchmarks.datasets import SampleRequest +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.hf import get_cached_tokenizer + +try: + from datasets import load_dataset +except ImportError: + load_dataset = None + +logger = logging.getLogger(__name__) + + +class _ListDatasetIterator: + """Simple iterator wrapper 
around a list to mimic HuggingFace streaming dataset behavior.""" + + def __init__(self, data: list[dict[str, Any]]) -> None: + self._data = data + self._index = 0 + + def __iter__(self): + self._index = 0 + return self + + def __next__(self) -> dict[str, Any]: + if self._index >= len(self._data): + raise StopIteration + item = self._data[self._index] + self._index += 1 + return item + + def __len__(self) -> int: + return len(self._data) + + def __getitem__(self, idx: int | slice) -> dict[str, Any] | list[dict[str, Any]]: + return self._data[idx] + + +# Aligns with Lliar-liar/Daily-Omni CLI ``--input_mode`` (test_model/*/testmodel.py). +DailyOmniInputMode = Literal["all", "visual", "audio"] + +# ``build_conversation()`` in Daily-Omni ``test_model/Qwen2.5-Omni/testmodel.py`` (verbatim). +DAILY_OMNI_SYSTEM_TEXT = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, " + "capable of perceiving auditory and visual inputs, as well as generating text and speech." +) + + +@dataclass +class DailyOmniSampleRequest(SampleRequest): + """``SampleRequest`` with Daily-Omni gold labels for post-run accuracy scoring.""" + + daily_omni_gold_answer: str = "" + daily_omni_video_id: str = "" + daily_omni_task_type: str = "" + #: Official qa.json ``video_duration`` (e.g. ``30s``, ``60s``) for leaderboard-style breakdown. + daily_omni_video_duration: str = "" + #: Official ``video_category`` (YouTube-style category string) for per-category accuracy. + daily_omni_video_category: str = "" + #: Extra JSON fields merged into chat-completions ``extra_body`` (e.g. ``mm_processor_kwargs``). + omni_extra_body: dict[str, Any] | None = None + #: Full OpenAI ``messages`` (system + user) mirroring upstream Daily-Omni conversation. + omni_chat_messages: list[dict[str, Any]] | None = None + #: Used only when ``omni_chat_messages`` is None (non-Daily-Omni-style requests). 
+ omni_chat_mm_position: Literal["first", "last"] = "last" + + +class DailyOmniDataset(BenchmarkDataset): + """Daily-Omni audio-visual QA dataset for benchmarking. + + Inherits ``BenchmarkDataset`` only (not ``HuggingFaceDataset``): see module docstring for why + Hub loading lives in ``_load_from_huggingface`` instead of subclassing the HF base class. + + The dataset includes: + - 684 videos from daily life scenarios (available in Videos.tar) + - 1,197 multiple-choice QA pairs in qa.json + - 6 major task categories + + QA metadata can be loaded from: + - Local JSON file (``qa_json_path``): recommended for offline/air-gapped environments + - HuggingFace datasets (``dataset_path``): legacy online mode + + The videos must be separately downloaded and extracted from Videos.tar. + + Args: + qa_json_path: Path to local qa.json file (offline mode, preferred). When provided, + ``dataset_path`` and ``dataset_split`` are ignored. + dataset_path: HuggingFace dataset path (e.g., "liarliar/Daily-Omni"). Used only if + ``qa_json_path`` is not provided (legacy online mode). + dataset_split: Dataset split to use (default: "train"). Used only in online mode. + random_seed: Random seed for shuffling + video_dir: Directory containing extracted video files (default: None) + input_mode: Which modalities to send, matching upstream Daily-Omni ``--input_mode``: + ``all`` — video + WAV (default; official audio-visual protocol); + ``visual`` — video only; + ``audio`` — extracted WAV only (requires ``{video_id}/{video_id}_audio.wav`` under ``video_dir``). + max_duration_seconds: Reserved for future ffprobe-based filtering; currently **not applied** + when building requests (metadata ``video_duration`` is still passed through for eval). + dataset_subset: Optional HuggingFace subset name (``load_dataset(..., name=...)``); used by bench + ``--hf-subset`` / patch. + no_stream: If True, load the Hub split non-streaming (matches bench ``--no-stream``). 
+ inline_local_video: If True, embed local MP4 as ``data:video/mp4;base64,...`` in requests so + the API server does not need ``--allowed-local-media-path`` (large JSON; use for small runs). + When ``input_mode`` is ``audio`` or ``all``, local WAV is embedded the same way + (``data:audio/wav;base64,...``). + trust_remote_code: Whether to trust remote code when loading HuggingFace dataset + (online mode only). + """ + + SUPPORTED_DATASET_PATHS: set[str] = { + "liarliar/Daily-Omni", + } + #: Default Hub id for synthetic video URLs when ``qa_json_path`` is used (``dataset_path`` None). + DEFAULT_HF_DATASET_ID = "liarliar/Daily-Omni" + IS_MULTIMODAL = True + DEFAULT_OUTPUT_LEN = 256 + + def __init__( + self, + qa_json_path: str | None = None, + dataset_path: str | None = None, + dataset_split: str = "train", + random_seed: int = 0, + video_dir: str | None = None, + input_mode: DailyOmniInputMode = "all", + inline_local_video: bool = False, + trust_remote_code: bool = False, + max_duration_seconds: float | None = None, + dataset_subset: str | None = None, + no_stream: bool = False, + **kwargs, + ) -> None: + if input_mode not in ("all", "visual", "audio"): + raise ValueError(f"input_mode must be 'all', 'visual', or 'audio', got {input_mode!r}") + + # Validate arguments: need either local JSON or HF path + if qa_json_path is None and dataset_path is None: + raise ValueError( + "Either 'qa_json_path' (local JSON) or 'dataset_path' (HuggingFace) must be provided. " + "For offline/air-gapped environments, download qa.json and use qa_json_path." + ) + + # Store configuration + self.qa_json_path = Path(qa_json_path) if qa_json_path else None + self.dataset_path = dataset_path + self.dataset_split = dataset_split + self.dataset_subset = dataset_subset + #: Match vLLM ``HuggingFaceDataset`` / bench CLI ``--no-stream``. 
+ self._hf_streaming = not no_stream + self.video_dir = Path(video_dir) if video_dir else None + self.inline_local_video = inline_local_video + self.input_mode: DailyOmniInputMode = input_mode + self.max_duration_seconds = max_duration_seconds + self.trust_remote_code = trust_remote_code + + #: In-process cache of ffprobe durations only (no disk persistence). + self._video_durations: dict[str, float] = {} + + # Initialize parent BenchmarkDataset + super().__init__( + dataset_path=dataset_path if qa_json_path is None else None, + random_seed=random_seed, + **kwargs, + ) + + # Load data based on mode + self.load_data() + + # Verify dataset info + logger.info( + "Loaded Daily-Omni dataset: mode=%s, source=%s, random_seed=%d, input_mode=%s, max_duration=%s", + "local_json" if self.qa_json_path else "huggingface", + str(self.qa_json_path) if self.qa_json_path else f"{dataset_path}/{dataset_split}", + random_seed, + input_mode, + f"{max_duration_seconds}s" if max_duration_seconds else "unlimited", + ) + + def load_data(self) -> None: + """Populate ``self.data`` from either local JSON or the Hub. + + See module docstring: we do not subclass ``HuggingFaceDataset`` because Daily-Omni needs + a first-class offline path; Hub loading is an optional branch implemented below. 
+ """ + if self.qa_json_path is not None: + self._load_from_local_json() + else: + self._load_from_huggingface() + + def _load_from_local_json(self) -> None: + """Load QA data from local JSON file.""" + if not self.qa_json_path.exists(): + raise FileNotFoundError(f"QA JSON file not found: {self.qa_json_path}") + + with open(self.qa_json_path, encoding="utf-8") as f: + data = json.load(f) + + # Support both list format and dict with "train"/"test" splits + if isinstance(data, dict): + # Try to get the requested split, fallback to first available + split_data = data.get(self.dataset_split) + if split_data is None: + available = list(data.keys()) + if available: + logger.warning( + "Split '%s' not found in %s, using '%s' instead", + self.dataset_split, + self.qa_json_path, + available[0], + ) + split_data = data[available[0]] + else: + split_data = [] + data = split_data + + if not isinstance(data, list): + raise ValueError(f"Expected list of QA items in JSON, got {type(data).__name__}") + + # Shuffle if requested + if not getattr(self, "disable_shuffle", False) and self.random_seed is not None: + import random + + rng = random.Random(self.random_seed) + shuffled = data[:] + rng.shuffle(shuffled) + data = shuffled + + # Create an iterator-like wrapper for compatibility + self.data = _ListDatasetIterator(data) + + def _load_from_huggingface(self) -> None: + """Load QA rows via ``datasets.load_dataset`` (legacy / convenience path). + + Kept for backward compatibility: callers can still pass ``dataset_path=liarliar/Daily-Omni`` + and get the same parquet-backed rows as the Hub dataset card, with streaming (or + non-streaming if ``no_stream=True``) and shuffle. 
+ + This is intentionally **not** implemented by subclassing ``HuggingFaceDataset``: that base + always runs Hub ``load_dataset`` from its constructor and expects a Hub id as the primary + API; Daily-Omni instead chooses the source in ``load_data()`` (JSON vs Hub) while sharing + one ``sample()`` / request-building implementation for both. + """ + if load_dataset is None: + raise ImportError( + "datasets library is required for HuggingFace mode. " + "Install with: pip install datasets, or use local JSON mode instead." + ) + + ds = load_dataset( + self.dataset_path, + name=self.dataset_subset, + split=self.dataset_split, + streaming=self._hf_streaming, + trust_remote_code=self.trust_remote_code, + ) + if not getattr(self, "disable_shuffle", False): + ds = ds.shuffle(seed=self.random_seed) + self.data = ds + + def get_task_statistics(self) -> dict[str, int]: + """Get distribution of task types in the dataset. + + Returns: + Dict mapping task type to count + """ + stats: dict[str, int] = {} + for item in self.data: + row = self._coerce_row(item) + fields = self._normalize_qa_fields(row) + task_type = fields["task_type"] or "unknown" + stats[task_type] = stats.get(task_type, 0) + 1 + return stats + + @staticmethod + def _coerce_row(item: Any) -> dict[str, Any]: + """Turn a dataset row into a plain dict (Arrow / Mapping).""" + if isinstance(item, dict): + return item + if hasattr(item, "as_py"): + return dict(item.as_py()) # pyarrow Row + try: + return dict(item) + except (TypeError, ValueError): + return {k: item[k] for k in item} # type: ignore[misc] + + @staticmethod + def _normalize_qa_fields(row: dict[str, Any]) -> dict[str, Any]: + """Map official Daily-Omni qa.json / Hub schema to internal fields. + + Official fields (see liarliar/Daily-Omni ``qa.json``): ``Question``, ``Choice`` (list), + ``Answer``, ``video_id``, ``Type``, ``video_duration`` (``30s`` / ``60s``), ``video_category``, + plus other category columns. 
Legacy aliases (lowercase / older loaders) are still accepted. + """ + out: dict[str, Any] = {} + + out["question"] = str(row.get("Question") or row.get("question") or "").strip() + vid = row.get("video_id") if row.get("video_id") is not None else row.get("video") + out["video_id"] = str(vid).strip() if vid is not None else "" + out["task_type"] = str(row.get("Type") or row.get("task_type") or row.get("type") or "").strip() + vc = row.get("video_category") if row.get("video_category") is not None else row.get("videoCategory") + out["video_category"] = str(vc).strip() if vc is not None else "" + vd = row.get("video_duration") if row.get("video_duration") is not None else row.get("videoDuration") + out["video_duration"] = str(vd).strip() if vd is not None else "" + out["answer"] = str(row.get("Answer") or row.get("answer") or "").strip() + vu = row.get("video_url") if row.get("video_url") is not None else row.get("Video_URL") + out["video_url"] = str(vu).strip() if vu is not None and str(vu).strip() else None + + choice = row.get("Choice") + if choice is None: + choice = row.get("options") or row.get("choice") + out["choice"] = choice + + return out + + def sample( + self, + tokenizer: TokenizerLike, + num_requests: int, + output_len: int | None = None, + request_id_prefix: str = "", + no_oversample: bool = False, + **kwargs, + ) -> list[SampleRequest]: + """Sample requests from Daily-Omni dataset. 
+ + Args: + tokenizer: Tokenizer for computing prompt length + num_requests: Number of requests to sample + output_len: Target output length in tokens (default: 256) + request_id_prefix: Prefix for request IDs + no_oversample: If True, do not oversample if fewer examples available + **kwargs: Additional arguments (ignored) + + Returns: + List of SampleRequest objects with video URLs and prompts + """ + if output_len is None: + output_len = self.DEFAULT_OUTPUT_LEN + + sampled_requests: list[SampleRequest] = [] + ind = 0 + cached_tokenizer = get_cached_tokenizer(tokenizer) + + # Iterate over shuffled dataset + for item in self.data: + if len(sampled_requests) >= num_requests: + break + + request = self._create_sample_request( + self._coerce_row(item), cached_tokenizer, output_len, request_id_prefix, ind + ) + if request: + sampled_requests.append(request) + ind += 1 + + logger.info("Created %d sample requests from Daily-Omni dataset", len(sampled_requests)) + + # Handle oversampling if needed + self.maybe_oversample_requests(sampled_requests, num_requests, request_id_prefix, no_oversample) + + return sampled_requests + + def _create_sample_request( + self, + qa_item: dict[str, Any], + tokenizer: TokenizerLike, + output_len: int, + request_id_prefix: str, + index: int, + ) -> SampleRequest | None: + """Create a SampleRequest from a QA item. 
+ + Args: + qa_item: QA pair from the dataset + tokenizer: Tokenizer + output_len: Target output length + request_id_prefix: Prefix for request ID + index: Request index + + Returns: + SampleRequest or None if invalid + """ + fields = self._normalize_qa_fields(qa_item) + video_id = fields["video_id"] + question = fields["question"] + choice = fields["choice"] + task_type = fields["task_type"] + video_url = fields["video_url"] + video_duration = fields.get("video_duration") or "" + video_category = fields.get("video_category") or "" + + if not video_id and not video_url: + logger.warning("Skipping item: no video_id / video_url") + return None + + if not question: + logger.warning("Skipping item: no question found") + return None + + # Official layout after extracting Videos.tar (see Lliar-liar/Daily-Omni test_model): + # {video_base_dir}/{video_id}/{video_id}_video.mp4 + mm_payload, omni_extra, mm_pos = self._compose_daily_omni_multimodal(video_id, video_url) + if not mm_payload: + return None + + messages = self._build_daily_omni_openai_messages(mm_payload, question, choice) + user_text = self._official_daily_omni_user_prompt(question, choice) + # Text-only length estimate (same as before: no MM token count in bench). 
+ prompt_len = len(tokenizer.encode(f"{DAILY_OMNI_SYSTEM_TEXT}\n{user_text}")) + + return DailyOmniSampleRequest( + prompt=user_text, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=None, + request_id=f"{request_id_prefix}{index}", + daily_omni_gold_answer=fields["answer"], + daily_omni_video_id=video_id, + daily_omni_task_type=task_type, + daily_omni_video_duration=video_duration, + daily_omni_video_category=video_category, + omni_extra_body=omni_extra, + omni_chat_messages=messages, + omni_chat_mm_position=mm_pos, + ) + + @staticmethod + def _official_video_relpath(video_id: str) -> str: + """Relative path inside extracted ``Videos/`` per upstream Daily-Omni scripts.""" + return f"{video_id}/{video_id}_video.mp4" + + @staticmethod + def _official_audio_relpath(video_id: str) -> str: + """Relative path for extracted WAV per upstream ``get_audio_path``.""" + return f"{video_id}/{video_id}_audio.wav" + + def _resolve_local_video_path(self, video_id: str) -> Path | None: + """Pick an existing file under ``video_dir`` (official layout + flat fallback).""" + if not self.video_dir or not video_id: + return None + + candidates = [ + self.video_dir / self._official_video_relpath(video_id), + self.video_dir / f"{video_id}.mp4", # flat layout (custom mirrors / outdated docs) + ] + seen: set[Path] = set() + for p in candidates: + rp = p.resolve() + if rp in seen: + continue + seen.add(rp) + if p.exists(): + return p + return None + + def _resolve_local_audio_path(self, video_id: str) -> Path | None: + """Pick an existing WAV under ``video_dir`` (official layout + flat fallback).""" + if not self.video_dir or not video_id: + return None + candidates = [ + self.video_dir / self._official_audio_relpath(video_id), + self.video_dir / f"{video_id}.wav", + ] + seen: set[Path] = set() + for p in candidates: + rp = p.resolve() + if rp in seen: + continue + seen.add(rp) + if p.exists(): + return p + return None + + def 
_local_file_to_video_url_payload(self, video_path: Path) -> dict[str, Any]: + """Build OpenAI-style video_url part for a resolved local file. + + vLLM rejects ``file://`` unless the server was started with + ``--allowed-local-media-path`` set to a directory that **contains** the file + (typically the extracted ``Videos`` root). Use ``inline_local_video=True`` to + send base64 data URLs instead (no server path allowlist; larger requests). + """ + path = video_path.expanduser().resolve() + if self.inline_local_video: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + return { + "type": "video_url", + "video_url": {"url": f"data:video/mp4;base64,{b64}"}, + } + return { + "type": "video_url", + "video_url": {"url": path.as_uri()}, + } + + def _local_file_to_audio_url_payload(self, audio_path: Path) -> dict[str, Any]: + """Build OpenAI-style ``audio_url`` part for a resolved local WAV file.""" + path = audio_path.expanduser().resolve() + if self.inline_local_video: + raw = path.read_bytes() + b64 = base64.b64encode(raw).decode("ascii") + return { + "type": "audio_url", + "audio_url": {"url": f"data:audio/wav;base64,{b64}"}, + } + return { + "type": "audio_url", + "audio_url": {"url": path.as_uri()}, + } + + def _get_video_content( + self, + video_id: str, + video_url: str | None, + ) -> dict[str, Any] | None: + """Resolve video for OpenAI-style ``video_url`` content. + + Upstream uses ``get_video_path(video_id, base) -> base/video_id/video_id_video.mp4``. + The Hub repo only publishes ``Videos.tar``; use ``--daily-omni-video-dir`` pointing + at the extracted ``Videos`` folder (parent of per-``video_id`` subdirs). + + For ``file://`` URLs, start ``vllm serve`` with e.g. + ``--allowed-local-media-path /same/path/as/daily-omni-video-dir``. 
+ """ + if video_url: + url = video_url + if not url.startswith(("http://", "https://", "file://")): + url = f"https://{url.lstrip('/')}" + return {"type": "video_url", "video_url": {"url": url}} + + if self.video_dir and video_id: + video_path = self._resolve_local_video_path(video_id) + if video_path is not None: + return self._local_file_to_video_url_payload(video_path) + logger.warning( + "Video not found under video_dir=%s for video_id=%r (expected %s or %s)", + self.video_dir, + video_id, + self._official_video_relpath(video_id), + f"{video_id}.mp4", + ) + + if video_id: + repo = self.dataset_path or self.DEFAULT_HF_DATASET_ID + rel = self._official_video_relpath(video_id) + hf_video_url = f"https://huggingface.co/datasets/{repo}/resolve/main/Videos/{rel}" + logger.debug( + "Using HF video URL (likely 404 — Hub ships Videos.tar only): %s", + hf_video_url, + ) + return {"type": "video_url", "video_url": {"url": hf_video_url}} + + logger.error("Could not determine video source for video_id=%r", video_id) + return None + + def _get_audio_content(self, video_id: str) -> dict[str, Any] | None: + """Resolve extracted WAV for OpenAI-style ``audio_url`` (local files only).""" + if not self.video_dir or not video_id: + logger.warning( + "Daily-Omni input_mode %r requires --daily-omni-video-dir with %s", + self.input_mode, + self._official_audio_relpath(video_id), + ) + return None + audio_path = self._resolve_local_audio_path(video_id) + if audio_path is not None: + return self._local_file_to_audio_url_payload(audio_path) + logger.warning( + "Audio not found under video_dir=%s for video_id=%r (expected %s or %s)", + self.video_dir, + video_id, + self._official_audio_relpath(video_id), + f"{video_id}.wav", + ) + return None + + def _compose_daily_omni_multimodal( + self, + video_id: str, + video_url: str | None, + ) -> tuple[dict[str, Any] | list[dict[str, Any]] | None, dict[str, Any] | None, Literal["first", "last"]]: + """Build ``multi_modal_data`` and request extras 
for the active ``input_mode``. + + Mirrors upstream Daily-Omni: separate video + WAV with ``use_audio_in_video=False``. + """ + extra: dict[str, Any] = {"mm_processor_kwargs": {"use_audio_in_video": False}} + mode = self.input_mode + + if mode == "visual": + v = self._get_video_content(video_id, video_url) + return v, extra, "last" + + if mode == "audio": + a = self._get_audio_content(video_id) + return a, extra, "first" + + v = self._get_video_content(video_id, video_url) + a = self._get_audio_content(video_id) + if not v or not a: + return None, None, "first" + return [v, a], extra, "first" + + @staticmethod + def _media_desc_for_official_prompt(mode: DailyOmniInputMode) -> str: + """``media_desc`` in upstream ``build_conversation``.""" + if mode == "audio": + return "given audio" + if mode == "all": + return "given video and audio together" + return "given video" + + @staticmethod + def _choices_repr_for_official_prompt(choice: Any) -> str: + """Format ``Choice`` from qa.json for the model (one option per line when possible). + + Using ``str(list)`` embeds Python list brackets and quotes, which is poor for MCQ + reading; lists/tuples are joined with newlines instead. Other shapes fall back to + ``str(choice)`` for parity with exotic upstream payloads. + """ + if choice is None: + return "" + if isinstance(choice, (list, tuple)): + lines = [str(x).strip() for x in choice if str(x).strip()] + return "\n".join(lines) + if isinstance(choice, dict): + return "\n".join(f"{k}. {v}" for k, v in choice.items()) + return str(choice) + + def _official_daily_omni_user_prompt(self, question: str, choice: Any) -> str: + """User text block from Daily-Omni ``build_conversation`` (after media parts).""" + task_prompt = self._media_desc_for_official_prompt(self.input_mode) + choices = self._choices_repr_for_official_prompt(choice) + # Single f-string with explicit newlines avoids accidental implicit concatenation + # gluing sentences (e.g. 
``...media_desc.Select...``) when editing. + return ( + "Your task is to accurately answer multiple-choice questions " + f"based on the {task_prompt}.\n" + "Select the single most accurate answer from the given choices.\n" + f"Question: {question}\n" + f"Choices: {choices}\n" + "Your answer should be a capital letter representing your choice: " + "A, B, C, or D. Don't generate any other text.\n" + ) + + def _build_daily_omni_openai_messages( + self, + mm_payload: dict[str, Any] | list[dict[str, Any]], + question: str, + choice: Any, + ) -> list[dict[str, Any]]: + """Map upstream conversation to OpenAI Chat Completions ``messages`` (video_url / audio_url parts).""" + user_text = self._official_daily_omni_user_prompt(question, choice) + mm_list: list[dict[str, Any]] = mm_payload if isinstance(mm_payload, list) else [mm_payload] + user_content: list[dict[str, Any]] = [*mm_list, {"type": "text", "text": user_text}] + return [ + {"role": "system", "content": [{"type": "text", "text": DAILY_OMNI_SYSTEM_TEXT}]}, + {"role": "user", "content": user_content}, + ] + + def sample_by_task_type( + self, + tokenizer: TokenizerLike, + task_type: str, + num_samples: int, + output_len: int | None = None, + request_id_prefix: str = "", + **kwargs, + ) -> list[SampleRequest]: + """Sample requests filtered by task type. 
+ + Args: + tokenizer: Tokenizer + task_type: Task type to filter by + num_samples: Number of samples + output_len: Target output length + request_id_prefix: Prefix for request IDs + **kwargs: Additional sampling arguments + + Returns: + List of SampleRequest objects matching the task type + """ + if output_len is None: + output_len = self.DEFAULT_OUTPUT_LEN + + filtered = [ + item for item in self.data if self._normalize_qa_fields(self._coerce_row(item))["task_type"] == task_type + ] + + available = len(filtered) + if available < num_samples: + logger.warning( + "Only %d samples available for task type '%s', requested %d", + available, + task_type, + num_samples, + ) + num_samples = available + + sampled_requests: list[SampleRequest] = [] + cached_tokenizer = get_cached_tokenizer(tokenizer) + + for i, item in enumerate(filtered[:num_samples]): + request = self._create_sample_request(item, cached_tokenizer, output_len, request_id_prefix, i) + if request: + sampled_requests.append(request) + + return sampled_requests + + def __repr__(self) -> str: + return ( + f"DailyOmniDataset(" + f"dataset_path={self.dataset_path!r}, " + f"dataset_split={self.dataset_split!r}, " + f"video_dir={self.video_dir!r}, " + f"input_mode={self.input_mode!r}, " + f"inline_local_video={self.inline_local_video!r}, " + f"max_duration_seconds={self.max_duration_seconds}, " + f"random_seed={self.random_seed}" + f")" + ) + + +def load_daily_omni_dataset( + qa_json_path: str | None = None, + dataset_path: str | None = None, + dataset_split: str = "train", + random_seed: int = 0, + video_dir: str | None = None, + input_mode: DailyOmniInputMode = "all", + max_duration_seconds: float | None = None, + dataset_subset: str | None = None, + no_stream: bool = False, + **kwargs, +) -> DailyOmniDataset: + """Convenience function to load Daily-Omni dataset. + + Args: + qa_json_path: Path to local qa.json file (recommended for offline/air-gapped environments). + When provided, ``dataset_path`` is ignored. 
+ dataset_path: HuggingFace dataset path (default: liarliar/Daily-Omni). Used only if + ``qa_json_path`` is not provided (legacy online mode). + dataset_split: Dataset split to use (default: "train") + random_seed: Random seed for shuffling + video_dir: Directory containing extracted ``Videos/`` tree (MP4 and, for ``all``/``audio``, WAV) + input_mode: ``visual`` | ``audio`` | ``all`` (same semantics as upstream Daily-Omni) + max_duration_seconds: Maximum video duration in seconds (e.g., 30 for 30s subset, 60 for 60s subset); + uses ffprobe on local files under ``video_dir`` (in-memory cache only for this process). + **kwargs: Additional arguments passed to DailyOmniDataset + + Returns: + DailyOmniDataset instance + + Example: + >>> from vllm_omni.benchmarks.data_modules.daily_omni_dataset import load_daily_omni_dataset + + # Local JSON mode (recommended for offline) + >>> dataset = load_daily_omni_dataset( + ... qa_json_path="/path/to/qa.json", + ... video_dir="/path/to/Daily-Omni/Videos", + ... random_seed=42, + ... max_duration_seconds=30, + ... ) + + # HuggingFace mode (legacy online) + >>> dataset = load_daily_omni_dataset( + ... dataset_path="liarliar/Daily-Omni", + ... video_dir="/path/to/Daily-Omni/Videos", + ... random_seed=42, + ... ) + >>> requests = dataset.sample(tokenizer, num_requests=100) + """ + return DailyOmniDataset( + qa_json_path=qa_json_path, + dataset_path=dataset_path, + dataset_split=dataset_split, + random_seed=random_seed, + video_dir=video_dir, + input_mode=input_mode, + max_duration_seconds=max_duration_seconds, + dataset_subset=dataset_subset, + no_stream=no_stream, + **kwargs, + ) + + +def get_daily_omni_statistics( + qa_json_path: str | None = None, + dataset_path: str | None = DailyOmniDataset.DEFAULT_HF_DATASET_ID, + dataset_split: str = "train", +) -> dict[str, Any]: + """Get statistics about the Daily-Omni dataset. + + Args: + qa_json_path: Path to local qa.json file (recommended for offline/air-gapped environments). 
+ When provided, ``dataset_path`` is ignored. + dataset_path: HuggingFace dataset path. Defaults to ``DailyOmniDataset.DEFAULT_HF_DATASET_ID`` + when ``qa_json_path`` is omitted. Pass ``None`` only together with ``qa_json_path``. + dataset_split: Dataset split to use (default: "train") + + Returns: + Statistics dict with task type distribution and other info + + Example: + >>> from vllm_omni.benchmarks.data_modules.daily_omni_dataset import get_daily_omni_statistics + + # Local JSON mode + >>> stats = get_daily_omni_statistics(qa_json_path="/path/to/qa.json") + + # HuggingFace mode + >>> stats = get_daily_omni_statistics(dataset_path="liarliar/Daily-Omni") + >>> print(f"Total QA pairs: {stats['total_qa_pairs']}") + >>> print(f"Task distribution: {stats['task_distribution']}") + """ + dataset = DailyOmniDataset( + qa_json_path=qa_json_path, + dataset_path=dataset_path, + dataset_split=dataset_split, + ) + task_stats = dataset.get_task_statistics() + + source = str(qa_json_path) if qa_json_path else f"{dataset_path}/{dataset_split}" + return { + "source": source, + "total_qa_pairs": len(list(dataset.data)), + "task_distribution": task_stats, + } diff --git a/vllm_omni/benchmarks/data_modules/daily_omni_eval.py b/vllm_omni/benchmarks/data_modules/daily_omni_eval.py new file mode 100644 index 00000000000..ecc9edc8445 --- /dev/null +++ b/vllm_omni/benchmarks/data_modules/daily_omni_eval.py @@ -0,0 +1,406 @@ +"""Daily-Omni multiple-choice accuracy scoring for vLLM-Omni bench serve. + +Compares model ``generated_text`` to dataset ``Answer`` (A/B/C/D). + +**Alignment with open-source** (`Lliar-liar/Daily-Omni` ``test_model/.../testmodel.py``): + +- Answer extraction defaults to the same rules as ``extract_choice_letter`` (strip after an + ``assistant`` marker, then leading ``A``–``D``, else first ``\\b[A-D]\\b``). Set env + ``DAILY_OMNI_EXTRACT_MODE=relaxed`` to use the older vLLM-Omni heuristics (last ``answer:``, + tail scan, etc.). 
+- Overall accuracy comparable to the official script uses **successful HTTP responses only** as + the denominator (their ``valid_questions = total - failed`` excludes inference / I/O skips). + We also report ``daily_omni_accuracy_incl_http_fail`` where each failed request counts as a + wrong answer in the denominator (stricter throughput-bench view). +- **By video length:** mirrors upstream ``--- Accuracy by Video Duration ---`` for ``30s`` / + ``60s`` (``qa.json`` ``video_duration``): ``daily_omni_per_duration*`` metrics and a printed block. +- **By video category:** mirrors ``--- Accuracy by Video Category ---`` using ``video_category`` + from ``qa.json`` (``daily_omni_per_category*``; empty category is bucketed as ``unknown``). +- **Correctness:** uses the same ``evaluate_answer`` rule as upstream (truthy extracted letter vs + raw ``Answer`` string, both ``strip().upper()``). Rows with empty ``Answer`` are skipped + (``no_gold``), matching missing-field skips in the official loop. +""" + +from __future__ import annotations + +import os +import re +from typing import Any + +from vllm.benchmarks.lib.endpoint_request_func import RequestFuncOutput + +from vllm_omni.benchmarks.data_modules.daily_omni_dataset import DailyOmniSampleRequest + +_VALID = frozenset("ABCD") + +# Official ``testmodel.py`` buckets (``qa.json`` ``video_duration``). +DAILY_OMNI_DURATION_KEYS: tuple[str, ...] 
= ("30s", "60s") + + +def extract_choice_letter_official(text: str | None) -> str | None: + """Port of Daily-Omni ``extract_choice_letter`` (first A–D, assistant-tail semantics).""" + if not text: + return None + raw = str(text).strip() + if not raw: + return None + match = re.search(r"assistant\s*([\s\S]*)$", raw, flags=re.IGNORECASE) + candidate = match.group(1).strip() if match else raw + direct = re.match(r"(?i)^\s*([A-D])(?:[\s\.\)::]|$)", candidate) + if direct: + return direct.group(1).upper() + fallback = re.search(r"\b([A-D])\b", candidate.upper()) + if fallback: + return fallback.group(1) + return None + + +def evaluate_answer_official(model_answer: str | None, correct_answer: str) -> bool: + """Port of Daily-Omni ``evaluate_answer`` (strict string match after strip/upper).""" + if not model_answer: + return False + return model_answer.strip().upper() == (correct_answer or "").strip().upper() + + +def normalize_gold_answer(gold: str) -> str | None: + """Best-effort single letter from ``Answer`` (for ``gold_normalized`` in saved items only).""" + g = (gold or "").strip().upper() + if len(g) == 1 and g in _VALID: + return g + m = re.search(r"([ABCD])\b", g) + if m: + return m.group(1).upper() + return None + + +def _extract_predicted_choice_relaxed(text: str) -> str | None: + """Legacy vLLM-Omni heuristics (last ``answer:`` patterns, tail scan).""" + if not text or not str(text).strip(): + return None + t = str(text).strip() + + strong_patterns = [ + r"(?i)\*\*answer\*\*\s*[::]?\s*\(?([ABCD])\)?", + r"(?i)\banswer\s*[::]?\s*\(?([ABCD])\)?", + r"(?i)\bfinal\s+answer\s*[::]?\s*\(?([ABCD])\)?", + r"(?i)\bcorrect\s+(?:answer|option)\s*[::]?\s*\(?([ABCD])\)?", + r"(?i)\bthe\s+(?:correct\s+)?option\s+(?:is|would\s+be)\s*\(?([ABCD])\)?", + r"(?i)\bI\s+(?:would\s+)?(?:choose|select|pick)\s*\(?([ABCD])\)?", + ] + last_letter: str | None = None + for pat in strong_patterns: + for m in re.finditer(pat, t): + last_letter = m.group(1).upper() + if last_letter: + return 
last_letter + + # Weaker phrases: first match can be spurious; still prefer last occurrence. + weak_patterns = [ + r"(?i)\boption\s*[::]?\s*\(?([ABCD])\)?", + r"(?i)\bchoice\s*[::]?\s*\(?([ABCD])\)?", + ] + for pat in weak_patterns: + for m in re.finditer(pat, t): + last_letter = m.group(1).upper() + if last_letter: + return last_letter + + paren = list(re.finditer(r"\(([ABCD])\)", t)) + if paren: + return paren[-1].group(1).upper() + + # First line sometimes is just "B" or "B." — allow if whole output is short + one_line = t.split("\n", 1)[0].strip() + if len(t) < 120 and len(one_line) <= 6: + m0 = re.match(r"^([ABCD])\s*[.:\)]?\s*$", one_line, re.I) + if m0: + return m0.group(1).upper() + + # Tail-only: avoids matching echoed "A. ..." option blocks at the start + tail_len = min(500, len(t)) + tail = t[-tail_len:] + # ``\b`` after the letter avoids "Because"/"Definitely" false positives + m = re.search(r"(?:^|[^\w])([ABCD])\b", tail, re.I) + if m: + return m.group(1).upper() + + return None + + +def extract_predicted_choice(text: str | None) -> str | None: + """Parse model output to A–D (official Daily-Omni rules by default).""" + if not text or not str(text).strip(): + return None + mode = os.environ.get("DAILY_OMNI_EXTRACT_MODE", "official").strip().lower() + if mode in ("relaxed", "heuristic", "legacy"): + return _extract_predicted_choice_relaxed(str(text)) + return extract_choice_letter_official(text) + + +def compute_daily_omni_accuracy_metrics( + input_requests: list[Any], + outputs: list[RequestFuncOutput], + *, + include_per_item: bool = False, +) -> dict[str, Any] | None: + """If all requests are :class:`DailyOmniSampleRequest`, compute accuracy stats. + + Rows with empty ``Answer`` (after strip) are skipped as ``no_gold``, like upstream missing + ``correct_answer``. 
+ + **Denominators:** The open-source script excludes items that hit inference / I/O failures + from ``valid_questions``; we mirror that with ``daily_omni_accuracy`` (= correct / + successful responses). Failed HTTP requests are also tracked and used in + ``daily_omni_accuracy_incl_http_fail`` (each failure counts as incorrect in the + denominator). + """ + if not input_requests or len(input_requests) != len(outputs): + return None + if not all(isinstance(r, DailyOmniSampleRequest) for r in input_requests): + return None + + # total / correct: all rows with gold (incl. HTTP fail in total) + # total_ok / correct_ok: successful HTTP only (GitHub-style per-type denominator) + per_task: dict[str, dict[str, int]] = {} + per_category: dict[str, dict[str, int]] = {} + per_duration: dict[str, dict[str, int]] = { + k: {"correct": 0, "total": 0, "correct_ok": 0, "total_ok": 0} for k in DAILY_OMNI_DURATION_KEYS + } + items: list[dict[str, Any]] = [] + correct = 0 + evaluated = 0 + no_gold = 0 + request_failed = 0 + parse_failed = 0 # success but could not extract A–D + + for req, out in zip(input_requests, outputs, strict=True): + assert isinstance(req, DailyOmniSampleRequest) + gold_raw = (req.daily_omni_gold_answer or "").strip() + gold_norm = normalize_gold_answer(req.daily_omni_gold_answer) + tt = (req.daily_omni_task_type or "unknown").strip() or "unknown" + dur_key = (req.daily_omni_video_duration or "").strip() + dur_active = dur_key in per_duration + cat_key = (req.daily_omni_video_category or "").strip() or "unknown" + if tt not in per_task: + per_task[tt] = {"correct": 0, "total": 0, "correct_ok": 0, "total_ok": 0} + if cat_key not in per_category: + per_category[cat_key] = {"correct": 0, "total": 0, "correct_ok": 0, "total_ok": 0} + + if not gold_raw: + no_gold += 1 + items.append( + { + "request_id": req.request_id, + "skipped": True, + "reason": "no_gold", + "task_type": tt, + "video_id": req.daily_omni_video_id, + "video_duration": dur_key or None, + 
"video_category": cat_key if cat_key != "unknown" else None, + } + ) + continue + + if not out.success: + request_failed += 1 + evaluated += 1 + per_task[tt]["total"] += 1 + per_category[cat_key]["total"] += 1 + if dur_active: + per_duration[dur_key]["total"] += 1 + # GitHub: failed inference not in valid_questions — do not increment total_ok + items.append( + { + "request_id": req.request_id, + "gold": gold_raw, + "gold_normalized": gold_norm, + "predicted": None, + "correct": False, + "task_type": tt, + "video_id": req.daily_omni_video_id, + "video_duration": dur_key or None, + "video_category": cat_key if cat_key != "unknown" else None, + "error": (out.error or "")[:500], + } + ) + continue + + pred = extract_predicted_choice(out.generated_text) + evaluated += 1 + per_task[tt]["total"] += 1 + per_task[tt]["total_ok"] += 1 + per_category[cat_key]["total"] += 1 + per_category[cat_key]["total_ok"] += 1 + if dur_active: + per_duration[dur_key]["total"] += 1 + per_duration[dur_key]["total_ok"] += 1 + if pred is None: + parse_failed += 1 + is_correct = evaluate_answer_official(pred, req.daily_omni_gold_answer) + if is_correct: + correct += 1 + per_task[tt]["correct"] += 1 + per_task[tt]["correct_ok"] += 1 + per_category[cat_key]["correct"] += 1 + per_category[cat_key]["correct_ok"] += 1 + if dur_active: + per_duration[dur_key]["correct"] += 1 + per_duration[dur_key]["correct_ok"] += 1 + + items.append( + { + "request_id": req.request_id, + "gold": gold_raw, + "gold_normalized": gold_norm, + "predicted": pred, + "correct": is_correct, + "parse_failed": pred is None, + "task_type": tt, + "video_id": req.daily_omni_video_id, + "video_duration": dur_key or None, + "video_category": cat_key if cat_key != "unknown" else None, + } + ) + + evaluated_ok = evaluated - request_failed + accuracy_github = (correct / evaluated_ok) if evaluated_ok else None + accuracy_incl_fail = (correct / evaluated) if evaluated else None + + per_task_accuracy: dict[str, float | None] = {} + 
per_task_accuracy_github: dict[str, float | None] = {} + for name, st in per_task.items(): + tot = st["total"] + per_task_accuracy[name] = (st["correct"] / tot) if tot else None + tok = st["total_ok"] + per_task_accuracy_github[name] = (st["correct_ok"] / tok) if tok else None + + per_category_accuracy: dict[str, float | None] = {} + per_category_accuracy_github: dict[str, float | None] = {} + for name, st in per_category.items(): + tot = st["total"] + per_category_accuracy[name] = (st["correct"] / tot) if tot else None + tok = st["total_ok"] + per_category_accuracy_github[name] = (st["correct_ok"] / tok) if tok else None + + per_duration_accuracy: dict[str, float | None] = {} + per_duration_accuracy_github: dict[str, float | None] = {} + for name, st in per_duration.items(): + tot = st["total"] + per_duration_accuracy[name] = (st["correct"] / tot) if tot else None + tok = st["total_ok"] + per_duration_accuracy_github[name] = (st["correct_ok"] / tok) if tok else None + + out: dict[str, Any] = { + # Comparable to GitHub testmodel.py: correct / successful inferences + "daily_omni_accuracy": accuracy_github, + "daily_omni_accuracy_incl_http_fail": accuracy_incl_fail, + "daily_omni_correct": correct, + "daily_omni_evaluated": evaluated, + "daily_omni_evaluated_ok": evaluated_ok, + "daily_omni_no_gold": no_gold, + "daily_omni_request_failed": request_failed, + "daily_omni_parse_failed": parse_failed, + "daily_omni_per_task": {k: dict(v) for k, v in per_task.items()}, + "daily_omni_per_task_accuracy": per_task_accuracy, + "daily_omni_per_task_accuracy_github_style": per_task_accuracy_github, + "daily_omni_per_category": {k: dict(v) for k, v in per_category.items()}, + "daily_omni_per_category_accuracy": per_category_accuracy, + "daily_omni_per_category_accuracy_github_style": per_category_accuracy_github, + "daily_omni_per_duration": {k: dict(v) for k, v in per_duration.items()}, + "daily_omni_per_duration_accuracy": per_duration_accuracy, + 
"daily_omni_per_duration_accuracy_github_style": per_duration_accuracy_github, + } + if include_per_item: + out["daily_omni_eval_items"] = items + return out + + +def print_daily_omni_accuracy_summary(metrics: dict[str, Any]) -> None: + """Pretty-print accuracy block (stdout).""" + acc = metrics.get("daily_omni_accuracy") + acc_fail = metrics.get("daily_omni_accuracy_incl_http_fail") + if acc is None and acc_fail is None and metrics.get("daily_omni_evaluated", 0) == 0: + return + print("{s:{c}^{n}}".format(s=" Daily-Omni accuracy (MCQ) ", n=50, c="=")) + ok = int(metrics.get("daily_omni_evaluated_ok", 0) or 0) + cor = int(metrics.get("daily_omni_correct", 0) or 0) + if ok > 0 and acc is not None: + print(f"Overall Accuracy: {cor}/{ok} = {acc:.2%}") + elif int(metrics.get("daily_omni_evaluated", 0) or 0) > 0: + print("Overall Accuracy: 0/0 = N/A (no successful HTTP responses)") + print( + "{:<40} {:<10}".format( + "Submitted (gold present):", + metrics.get("daily_omni_evaluated", 0), + ) + ) + print( + "{:<40} {:<10}".format( + "Successful HTTP (GitHub denom.):", + metrics.get("daily_omni_evaluated_ok", 0), + ) + ) + print("{:<40} {:<10}".format("Correct:", metrics.get("daily_omni_correct", 0))) + if acc is not None: + print("{:<40} {:<10.4f}".format("Accuracy (ratio, same as above):", acc)) + if acc_fail is not None and metrics.get("daily_omni_request_failed", 0): + print( + "{:<40} {:<10.4f}".format( + "Accuracy (incl. HTTP as wrong):", + acc_fail, + ) + ) + print("{:<40} {:<10}".format("Skipped (no gold):", metrics.get("daily_omni_no_gold", 0))) + print( + "{:<40} {:<10}".format( + "HTTP failed (excl. 
from GitHub acc.):", + metrics.get("daily_omni_request_failed", 0), + ) + ) + print( + "{:<40} {:<10}".format( + "Parsed OK but no A–D found:", + metrics.get("daily_omni_parse_failed", 0), + ) + ) + pt = metrics.get("daily_omni_per_task") or {} + pta = metrics.get("daily_omni_per_task_accuracy_github_style") or {} + if pta: + print("\n--- Accuracy by QA Type ---") + for name in sorted(pta.keys()): + a = pta[name] + st = pt.get(name) or {} + tok = int(st.get("total_ok", 0) or 0) + cok = int(st.get("correct_ok", 0) or 0) + if tok and a is not None: + print(f"{name}: {cok}/{tok} = {a:.2%}") + else: + print(f"{name}: 0/0 = N/A") + + pc = metrics.get("daily_omni_per_category") or {} + ptc = metrics.get("daily_omni_per_category_accuracy_github_style") or {} + if ptc: + print("\n--- Accuracy by Video Category ---") + for name in sorted(ptc.keys()): + a = ptc[name] + st = pc.get(name) or {} + tok = int(st.get("total_ok", 0) or 0) + cok = int(st.get("correct_ok", 0) or 0) + if tok and a is not None: + print(f"{name}: {cok}/{tok} = {a:.2%}") + else: + print(f"{name}: 0/0 = N/A") + + pdf = metrics.get("daily_omni_per_duration_accuracy_github_style") or {} + if pdf: + print("\n--- Accuracy by Video Duration ---") + for name in DAILY_OMNI_DURATION_KEYS: + a = pdf.get(name) + st = (metrics.get("daily_omni_per_duration") or {}).get(name) or {} + tok = int(st.get("total_ok", 0) or 0) + cor = int(st.get("correct_ok", 0) or 0) + if tok and a is not None: + print(f"{name} Duration: {cor}/{tok} = {a:.2%}") + else: + print(f"{name} Duration: 0/0 = N/A") + print("=" * 50) diff --git a/vllm_omni/benchmarks/data_modules/daily_omni_text_audio.py b/vllm_omni/benchmarks/data_modules/daily_omni_text_audio.py new file mode 100644 index 00000000000..69fbe026bd8 --- /dev/null +++ b/vllm_omni/benchmarks/data_modules/daily_omni_text_audio.py @@ -0,0 +1,255 @@ +"""Daily-Omni: optional consistency check between text stream and generated speech. 
+ +The benchmark MCQ accuracy uses ``generated_text`` only. When the omni server also +streams ``modality=audio`` (TTS), this module can transcribe the concatenated WAV +with Whisper and compare the inferred option letter to the one parsed from text. + +Requires ``openai-whisper`` (``pip install openai-whisper``). Enable via env +``DAILY_OMNI_TEXT_AUDIO_CONSISTENCY=1`` or CLI ``--daily-omni-text-audio-consistency``. + +Whisper model name defaults to ``tiny`` (override with ``DAILY_OMNI_WHISPER_MODEL``). +""" + +from __future__ import annotations + +import logging +import os +import re +import threading +from typing import Any + +from vllm_omni.benchmarks.data_modules.daily_omni_dataset import DailyOmniSampleRequest +from vllm_omni.benchmarks.data_modules.daily_omni_eval import extract_predicted_choice + +logger = logging.getLogger(__name__) + +_whisper_model = None +_whisper_model_name: str | None = None +_whisper_lock = threading.Lock() + + +def env_text_audio_check_enabled() -> bool: + return os.environ.get("DAILY_OMNI_TEXT_AUDIO_CONSISTENCY", "").lower() in ( + "1", + "true", + "yes", + ) + + +def extract_choice_from_asr_transcript(transcript: str) -> str | None: + """Parse A–D from ASR text; extends :func:`extract_predicted_choice` with spoken Chinese phrases.""" + c = extract_predicted_choice(transcript) + if c: + return c + t = transcript or "" + for pat in ( + r"(?i)选项\s*([ABCD])\b", + r"(?i)选\s*([ABCD])\b", + r"(?i)答案\s*是\s*([ABCD])\b", + r"(?i)答案\s*([ABCD])\b", + ): + m = re.search(pat, t) + if m: + return m.group(1).upper() + return None + + +def _get_whisper_model(model_name: str): + global _whisper_model, _whisper_model_name + with _whisper_lock: + if _whisper_model is None or _whisper_model_name != model_name: + import whisper + + logger.warning( + "Loading Whisper model %r for Daily-Omni text/audio consistency (one-time)...", + model_name, + ) + _whisper_model = whisper.load_model(model_name) + _whisper_model_name = model_name + return _whisper_model 
+ + +def transcribe_wav_bytes( + wav_bytes: bytes, + *, + language: str | None = None, + model_name: str | None = None, +) -> tuple[str | None, str | None]: + """Transcribe WAV bytes. Returns ``(transcript, error)`` — one of them is set. + + Args: + wav_bytes: RIFF WAV file bytes. + language: Optional Whisper language code (e.g. ``en``, ``zh``); improves accuracy/latency. + model_name: Override model id; else ``DAILY_OMNI_WHISPER_MODEL`` or ``tiny``. + """ + if not wav_bytes: + return None, "empty_wav" + if model_name is None or not str(model_name).strip(): + model_name = os.environ.get("DAILY_OMNI_WHISPER_MODEL") or "tiny" + model_name = str(model_name).strip() or "tiny" + path: str | None = None + try: + import tempfile + + model = _get_whisper_model(model_name) + fd, path = tempfile.mkstemp(suffix=".wav") + with os.fdopen(fd, "wb") as fp: + fp.write(wav_bytes) + kwargs: dict = {} + if language: + kwargs["language"] = language + result = model.transcribe(path, **kwargs) + text = (result.get("text") or "").strip() + return (text if text else None), None + except ImportError: + return None, "openai-whisper is not installed (pip install openai-whisper)" + except Exception as e: + return None, str(e)[:500] + finally: + if path: + try: + os.unlink(path) + except OSError: + pass + + +def compute_daily_omni_text_audio_consistency_metrics( + input_requests: list[Any], + outputs: list[Any], + *, + include_per_item: bool = False, +) -> dict[str, Any] | None: + """Compare option letter from ``generated_text`` vs Whisper transcript of output audio. + + Only considers requests where ``outputs[i]`` has ``generated_audio_wav_bytes`` set + (populated by the omni benchmark when TA check is enabled). 
+ """ + if not input_requests or len(input_requests) != len(outputs): + return None + if not all(isinstance(r, DailyOmniSampleRequest) for r in input_requests): + return None + + ta_no_wav = 0 + ta_asr_failed = 0 + ta_text_unparsed = 0 + ta_audio_unparsed = 0 + ta_consistent = 0 + ta_mismatch = 0 + ta_both_parsed = 0 + items: list[dict[str, Any]] = [] + + for req, out in zip(input_requests, outputs, strict=True): + assert isinstance(req, DailyOmniSampleRequest) + rid = req.request_id + if not getattr(out, "success", False): + if include_per_item: + items.append( + { + "request_id": rid, + "skipped": True, + "reason": "request_not_success", + } + ) + continue + + wav = getattr(out, "generated_audio_wav_bytes", None) + if not wav: + ta_no_wav += 1 + if include_per_item: + items.append( + { + "request_id": rid, + "skipped": False, + "reason": "no_output_audio", + "text_choice": extract_predicted_choice(getattr(out, "generated_text", "") or ""), + } + ) + continue + + transcript, asr_err = transcribe_wav_bytes(wav) + if asr_err: + ta_asr_failed += 1 + if include_per_item: + items.append( + { + "request_id": rid, + "asr_error": asr_err, + "text_choice": extract_predicted_choice(getattr(out, "generated_text", "") or ""), + } + ) + continue + + text_choice = extract_predicted_choice(getattr(out, "generated_text", "") or "") + audio_choice = extract_choice_from_asr_transcript(transcript or "") + + if text_choice is None: + ta_text_unparsed += 1 + if audio_choice is None: + ta_audio_unparsed += 1 + + if text_choice is not None and audio_choice is not None: + ta_both_parsed += 1 + if text_choice == audio_choice: + ta_consistent += 1 + else: + ta_mismatch += 1 + + if include_per_item: + consistent: bool | None + if text_choice is None or audio_choice is None: + consistent = None + else: + consistent = text_choice == audio_choice + items.append( + { + "request_id": rid, + "text_choice": text_choice, + "audio_choice": audio_choice, + "asr_transcript": (transcript or "")[:500], 
+ "text_audio_consistent": consistent, + } + ) + + comparable = ta_consistent + ta_mismatch + rate = (ta_consistent / comparable) if comparable else None + + out: dict[str, Any] = { + "daily_omni_ta_enabled": True, + "daily_omni_ta_no_output_audio": ta_no_wav, + "daily_omni_ta_asr_failed": ta_asr_failed, + "daily_omni_ta_text_unparsed": ta_text_unparsed, + "daily_omni_ta_audio_unparsed": ta_audio_unparsed, + "daily_omni_ta_both_parsed": ta_both_parsed, + "daily_omni_ta_consistent": ta_consistent, + "daily_omni_ta_mismatch": ta_mismatch, + "daily_omni_ta_consistency_rate": rate, + } + if include_per_item: + out["daily_omni_ta_items"] = items + return out + + +def print_daily_omni_text_audio_summary(metrics: dict[str, Any]) -> None: + if not metrics.get("daily_omni_ta_enabled"): + return + print("{s:{c}^{n}}".format(s=" Daily-Omni text vs audio (ASR) ", n=50, c="=")) + print("{:<40} {:<10}".format("No output audio captured:", metrics.get("daily_omni_ta_no_output_audio", 0))) + print("{:<40} {:<10}".format("ASR failed:", metrics.get("daily_omni_ta_asr_failed", 0))) + print("{:<40} {:<10}".format("Both text+audio letter parsed:", metrics.get("daily_omni_ta_both_parsed", 0))) + print("{:<40} {:<10}".format("Consistent (same letter):", metrics.get("daily_omni_ta_consistent", 0))) + print("{:<40} {:<10}".format("Mismatch:", metrics.get("daily_omni_ta_mismatch", 0))) + r = metrics.get("daily_omni_ta_consistency_rate") + if r is not None: + print("{:<40} {:<10.4f}".format("Consistency rate (of both parsed):", r)) + print( + "{:<40} {:<10}".format( + "Text unparsed (among w/ audio):", + metrics.get("daily_omni_ta_text_unparsed", 0), + ) + ) + print( + "{:<40} {:<10}".format( + "Audio unparsed (among w/ audio):", + metrics.get("daily_omni_ta_audio_unparsed", 0), + ) + ) diff --git a/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py b/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py new file mode 100644 index 00000000000..ca6de4cb202 --- /dev/null +++ 
b/vllm_omni/benchmarks/data_modules/seed_tts_dataset.py @@ -0,0 +1,272 @@ +"""Seed-TTS zero-shot evaluation-style prompts for ``vllm bench serve``. + +Loads rows from the `meta.lst` format used in `BytedanceSpeech/seed-tts-eval`_ (or any +HuggingFace dataset repo with the same layout):: + + utt_id|prompt_transcript|prompt_wav_relative_path|text_to_synthesize + +Each benchmark request supplies target text plus ``ref_text`` / ``ref_audio`` (Qwen3-TTS ``Base`` / +voice clone), merged into the JSON body. By default ``ref_audio`` is an inline ``data:`` URL so +the server does not need ``--allowed-local-media-path``. Use ``--seed-tts-file-ref-audio`` for +``file://`` (smaller bodies; requires that flag). Use ``--backend openai-audio-speech`` +(``/v1/audio/speech``) or ``--backend openai-chat-omni`` (``/v1/chat/completions`` with the same +fields on the body plus a Qwen3-Omni-style ``system`` message and the target text as ``user`` content). + +.. _BytedanceSpeech/seed-tts-eval: https://github.com/BytedanceSpeech/seed-tts-eval +""" + +from __future__ import annotations + +import base64 +import logging +import random +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from vllm.benchmarks.datasets import BenchmarkDataset, SampleRequest +from vllm.tokenizers import TokenizerLike +from vllm.tokenizers.hf import get_cached_tokenizer + +logger = logging.getLogger(__name__) + +# Matches Qwen3-Omni serving examples (``openai_chat_completion_client_for_multimodal_generation`` / +# ``qwen3_omni/gradio_demo``) plus explicit TTS / voice-clone instructions for chat completions. 
+SEED_TTS_DEFAULT_OMNI_SYSTEM_PROMPT = ( + "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, " + "capable of perceiving auditory and visual inputs, as well as generating text and speech.\n" + "For this request you act as a text-to-speech engine with zero-shot voice cloning: " + "the API provides reference audio and its transcript (ref_audio, ref_text) and task_type Base. " + "The user message is the exact text you must speak. " + "Synthesize natural speech in the same language as that user text, " + "matching the timbre, prosody, and speaking style of the reference audio while reading the new content clearly." +) + + +@dataclass +class SeedTTSSampleRequest(SampleRequest): + """``SampleRequest`` with per-row fields merged into ``/v1/audio/speech`` JSON.""" + + #: Shallow-merged into ``RequestFuncInput.extra_body`` (ref_audio, ref_text, task_type, …). + seed_tts_speech_extra: dict[str, Any] | None = None + seed_tts_utterance_id: str = "" + seed_tts_locale: str = "" + #: For ``openai-chat-omni``: becomes the chat ``system`` message (Qwen3-Omni + TTS behavior). + seed_tts_system_prompt: str = "" + #: Local path to reference prompt WAV (for SIM vs. synthesized PCM in ``seed_tts_eval``). 
+ seed_tts_ref_wav_path: str = "" + + +@dataclass +class _SeedTTSRow: + utterance_id: str + ref_text: str + prompt_wav_rel: str + target_text: str + + +def _parse_meta_line(line: str) -> _SeedTTSRow | None: + line = line.strip() + if not line or line.startswith("#"): + return None + parts = line.split("|") + if len(parts) < 4: + logger.warning("Skipping malformed meta.lst line (need 4 '|'-fields): %r", line[:120]) + return None + utt_id, ref_text, wav_rel, target = parts[0], parts[1], parts[2], parts[3] + if not target.strip(): + return None + return _SeedTTSRow( + utterance_id=utt_id.strip(), + ref_text=ref_text.strip(), + prompt_wav_rel=wav_rel.strip(), + target_text=target.strip(), + ) + + +def _load_meta_rows(meta_file: Path) -> list[_SeedTTSRow]: + text = meta_file.read_text(encoding="utf-8") + rows: list[_SeedTTSRow] = [] + for line in text.splitlines(): + r = _parse_meta_line(line) + if r is not None: + rows.append(r) + return rows + + +def resolve_seed_tts_root(dataset_path: str | None, *, explicit_root: str | None) -> Path: + """Return directory containing ``{locale}/meta.lst`` and ``{locale}/prompt-wavs/``.""" + if explicit_root: + root = Path(explicit_root).expanduser().resolve() + if not root.is_dir(): + raise FileNotFoundError(f"--seed-tts-root is not a directory: {root}") + return root + + if not dataset_path: + raise ValueError("Seed-TTS requires --dataset-path (HF repo id or local root) or --seed-tts-root.") + + p = Path(dataset_path).expanduser() + if p.exists() and p.is_dir(): + return p.resolve() + + repo_id = dataset_path.strip() + try: + from huggingface_hub import snapshot_download + except ImportError as e: + raise ImportError( + "Install huggingface_hub to download Seed-TTS from the Hub, or clone the dataset " + "locally and pass --dataset-path / --seed-tts-root to that directory." 
class SeedTTSDataset(BenchmarkDataset):
    """Seed-TTS-style zero-shot TTS rows for throughput/latency benchmarking.

    Construction resolves the dataset root, parses ``{root}/{locale}/meta.lst``
    and (unless shuffling is disabled) shuffles the rows with a seeded RNG.
    :meth:`sample` then turns rows into :class:`SeedTTSSampleRequest` objects.

    Args:
        dataset_path: HuggingFace dataset repo id (``org/dataset``) or local directory with
            ``en/meta.lst`` (and ``zh/meta.lst`` if using zh).
        random_seed: Seed for the row shuffle performed in :meth:`load_data`.
        locale: ``en`` or ``zh`` — which subfolder under the root to read.
        inline_ref_audio: If True (default), embed prompt WAV as ``data:audio/wav;base64,...``
            so Qwen3-TTS / ``/v1/audio/speech`` works without server
            ``--allowed-local-media-path``. If False, use ``file://`` (smaller
            requests; server must set ``--allowed-local-media-path`` to the dataset root).
        seed_tts_root: Optional override for the root directory (same layout as HF dataset).
        system_prompt: Optional override for the chat system message when using
            ``--backend openai-chat-omni``; defaults to :data:`SEED_TTS_DEFAULT_OMNI_SYSTEM_PROMPT`.
        disable_shuffle: When True, rows keep their ``meta.lst`` order.
        **kwargs: Forwarded unchanged to ``BenchmarkDataset.__init__``.

    Raises:
        ValueError: If ``locale`` is not ``en``/``zh`` or ``meta.lst`` has no valid rows.
        FileNotFoundError: If ``{root}/{locale}/meta.lst`` does not exist.
    """

    # NOTE(review): presumably read by the benchmark framework; not referenced in this class.
    IS_MULTIMODAL = False
    # Used by sample() when the caller passes output_len=None.
    DEFAULT_OUTPUT_LEN = 2048

    def __init__(
        self,
        dataset_path: str,
        random_seed: int = 0,
        locale: str = "en",
        inline_ref_audio: bool = True,
        seed_tts_root: str | None = None,
        system_prompt: str | None = None,
        disable_shuffle: bool = False,
        **kwargs: Any,
    ) -> None:
        """Validate arguments, initialise the base dataset, then load and shuffle rows."""
        if locale not in ("en", "zh"):
            raise ValueError("locale must be 'en' or 'zh'")
        self.locale = locale
        self.inline_ref_audio = inline_ref_audio
        self._explicit_root = seed_tts_root
        # A missing or whitespace-only override falls back to the module default prompt.
        sp = (system_prompt or "").strip()
        self._system_prompt = sp if sp else SEED_TTS_DEFAULT_OMNI_SYSTEM_PROMPT
        super().__init__(
            dataset_path=dataset_path,
            random_seed=random_seed,
            disable_shuffle=disable_shuffle,
            **kwargs,
        )
        # Root resolution reads self.dataset_path, so it must run after the base
        # initialiser (presumably the base class stores dataset_path — verify).
        self._root = resolve_seed_tts_root(self.dataset_path, explicit_root=self._explicit_root)
        self._rows: list[_SeedTTSRow] = []
        self.load_data()

    def load_data(self) -> None:
        """Parse ``{root}/{locale}/meta.lst`` into ``self._rows`` and expose it as ``self.data``.

        Raises:
            FileNotFoundError: If the meta file is missing.
            ValueError: If parsing yields no rows.
        """
        meta = self._root / self.locale / "meta.lst"
        if not meta.is_file():
            raise FileNotFoundError(
                f"Seed-TTS meta not found: {meta}. "
                f"Expected layout from seed-tts-eval (e.g. {self._root}/{self.locale}/meta.lst)."
            )
        self._rows = _load_meta_rows(meta)
        if not self._rows:
            raise ValueError(f"No valid rows in {meta}")
        if not self.disable_shuffle:
            # Dedicated RNG instance: the shuffle is reproducible for a given seed
            # and does not touch the global ``random`` state.
            rng = random.Random(self.random_seed)
            rng.shuffle(self._rows)
        self.data = self._rows
        logger.info(
            "Loaded Seed-TTS: root=%s locale=%s rows=%d inline_ref_audio=%s",
            self._root,
            self.locale,
            len(self._rows),
            self.inline_ref_audio,
        )

    def sample(
        self,
        tokenizer: TokenizerLike,
        num_requests: int,
        output_len: int | None = None,
        request_id_prefix: str = "",
        no_oversample: bool = False,
        **kwargs: Any,
    ) -> list[SampleRequest]:
        """Build up to ``num_requests`` benchmark requests from the loaded rows.

        Rows whose prompt WAV is missing on disk are skipped with a warning, so
        fewer than ``num_requests`` items may be built before the inherited
        ``maybe_oversample_requests`` hook is invoked on the list.

        Args:
            tokenizer: Tokenizer used only to compute ``prompt_len``.
            num_requests: Maximum number of requests to build from rows.
            output_len: Expected output length, also sent as ``max_new_tokens``;
                ``None`` selects :attr:`DEFAULT_OUTPUT_LEN`.
            request_id_prefix: Prepended to each request id.
            no_oversample: Passed through to ``maybe_oversample_requests``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The list of :class:`SeedTTSSampleRequest` objects.
        """
        if output_len is None:
            output_len = self.DEFAULT_OUTPUT_LEN

        tok = get_cached_tokenizer(tokenizer)
        out: list[SampleRequest] = []
        for i, row in enumerate(self._rows):
            if len(out) >= num_requests:
                break
            wav_path = (self._root / self.locale / row.prompt_wav_rel).resolve()
            if not wav_path.is_file():
                logger.warning("Missing prompt wav for %s: %s", row.utterance_id, wav_path)
                continue

            target = row.target_text
            # prompt_len counts the system prompt plus the target text, joined by a newline.
            prompt_len = len(tok.encode(f"{self._system_prompt}\n{target}"))
            lang = "English" if self.locale == "en" else "Chinese"
            ref_uri = _ref_audio_payload(wav_path, inline=self.inline_ref_audio)
            speech_extra: dict[str, Any] = {
                "ref_audio": ref_uri,
                "ref_text": row.ref_text,
                "task_type": "Base",
                "language": lang,
                "max_new_tokens": output_len,
            }

            out.append(
                SeedTTSSampleRequest(
                    prompt=target,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=None,
                    # ``i`` is the row index, so ids have gaps when rows were skipped above.
                    request_id=f"{request_id_prefix}{i}",
                    seed_tts_speech_extra=speech_extra,
                    seed_tts_utterance_id=row.utterance_id,
                    seed_tts_locale=self.locale,
                    seed_tts_system_prompt=self._system_prompt,
                    seed_tts_ref_wav_path=str(wav_path),
                )
            )

        logger.info("Seed-TTS: built %d requests (asked %d)", len(out), num_requests)
        # The return value of the hook is ignored; presumably it extends ``out`` in place — verify.
        self.maybe_oversample_requests(out, num_requests, request_id_prefix, no_oversample)
        return out
dataset_path: str, + random_seed: int = 0, + locale: str = "en", + inline_ref_audio: bool = True, + seed_tts_root: str | None = None, + system_prompt: str | None = None, + **kwargs: Any, +) -> SeedTTSDataset: + return SeedTTSDataset( + dataset_path=dataset_path, + random_seed=random_seed, + locale=locale, + inline_ref_audio=inline_ref_audio, + seed_tts_root=seed_tts_root, + system_prompt=system_prompt, + **kwargs, + ) diff --git a/vllm_omni/benchmarks/data_modules/seed_tts_eval.py b/vllm_omni/benchmarks/data_modules/seed_tts_eval.py new file mode 100644 index 00000000000..d5f1b64709f --- /dev/null +++ b/vllm_omni/benchmarks/data_modules/seed_tts_eval.py @@ -0,0 +1,729 @@ +"""Seed-TTS WER aligned with Bytedance ``seed-tts-eval`` / ``run_wer.py``. + +Matches the published protocol (see Hugging Face dataset card and +https://github.com/BytedanceSpeech/seed-tts-eval): + +- **EN**: ``openai/whisper-large-v3`` via ``transformers``, audio resampled to **16 kHz** + (same as ``run_wer.py``). +- **ZH**: ``funasr`` **paraformer-zh**, hypothesis converted with **zhconv** to zh-cn. +- **WER**: ``jiwer`` after punctuation stripping (``zhon.hanzi.punctuation`` + ``string.punctuation``, + preserving ``'``) and EN lowercasing / ZH per-character spacing. Supports jiwer 3.x + (``compute_measures``) and 4.x (``process_words``). + +- **SIM** (speaker similarity proxy): cosine similarity of L2-normalized mean-pooled **WavLM** + embeddings (reference prompt WAV vs. synthesized PCM), 16 kHz. Official ``cal_sim.sh`` uses + UniSpeech ``verification_pair_list_v2.py`` with a **fine-tuned** WavLM SV checkpoint — set + ``SEED_TTS_WAVLM_MODEL`` to another HF id if you need closer parity. Disable with + ``SEED_TTS_SIM_EVAL=0``. Optional: ``SEED_TTS_SIM_DEVICE`` (e.g. ``cpu``) to avoid GPU + issues when Whisper already uses CUDA; ``SEED_TTS_WAVLM_MIN_SAMPLES`` pads very short + waveforms so the WavLM CNN front-end does not fail. 
+ +- **UTMOS** (predicted MOS from TorchScript): default ``balacoon/utmos`` → ``utmos.jit`` + (Sarulab-style demo export). Uses ``torch`` + ``huggingface_hub`` only. Aggregate metrics + are over **all requests with captured PCM** (independent of ASR/WER). Non-finite scores are + dropped and counted as failures. Override repo/file via ``SEED_TTS_UTMOS_HF_REPO`` / + ``SEED_TTS_UTMOS_JIT_FILE``. **Device**: defaults to **CPU** when ``SEED_TTS_UTMOS_DEVICE`` + is unset; set ``SEED_TTS_UTMOS_DEVICE=cuda:0`` (or ``cuda:1`` etc.) to run on GPU. The JIT + model is loaded directly onto the target device via ``map_location`` to avoid cross-device + issues (some PyTorch builds/Windows have problems moving TorchScript modules after load). + Forward uses **float32** waveform in ``[-1, 1]`` (same as the WER resampled array) so + tensor dtypes match JIT weights; using int16 triggers + ``RuntimeError: input type and weight type should be same`` on common exports. Disable + with ``SEED_TTS_UTMOS_EVAL=0``. + +Enable with ``SEED_TTS_WER_EVAL=1`` or ``--seed-tts-wer-eval``. Install optional deps:: + + pip install 'vllm-omni[seed-tts-eval]' + +Env: ``SEED_TTS_EVAL_DEVICE`` (e.g. ``cuda:0``, ``cpu``); ``SEED_TTS_HF_WHISPER_MODEL`` +defaults to ``openai/whisper-large-v3`` (override for debugging only). 
def pcm_s16le_mono_to_wav_bytes(pcm: bytes, *, sample_rate: int = 24000) -> bytes:
    """Wrap raw mono 16-bit PCM in a WAV container and return the file bytes.

    Args:
        pcm: Raw sample data, written to the file as-is.
        sample_rate: Frame rate recorded in the WAV header.

    Returns:
        The complete in-memory WAV file (header followed by ``pcm``).
    """
    sink = io.BytesIO()
    writer = wave.open(sink, mode="wb")
    try:
        writer.setnchannels(1)  # mono
        writer.setsampwidth(2)  # 2 bytes per sample
        writer.setframerate(sample_rate)
        writer.writeframes(pcm)
    finally:
        # close() finalises the header; it leaves ``sink`` open because the
        # writer did not open that stream itself.
        writer.close()
    return sink.getvalue()
+ """ + try: + from jiwer import compute_measures + + return float(compute_measures(reference, hypothesis)["wer"]) + except ImportError: + import jiwer + + out = jiwer.process_words(reference, hypothesis) + return float(out.wer) + + +def process_one_official(hypo: str, truth: str, lang: str) -> tuple[float, str, str]: + """Same normalization + ``jiwer`` call as ``run_wer.process_one`` (hypo=ASR, truth=reference).""" + raw_truth = truth + raw_hypo = hypo + truth_n = truth + hypo_n = hypo + for x in _punctuation_all(): + if x == "'": + continue + truth_n = truth_n.replace(x, "") + hypo_n = hypo_n.replace(x, "") + truth_n = truth_n.replace(" ", " ") + hypo_n = hypo_n.replace(" ", " ") + if lang == "zh": + truth_n = " ".join([x for x in truth_n]) + hypo_n = " ".join([x for x in hypo_n]) + elif lang == "en": + truth_n = truth_n.lower() + hypo_n = hypo_n.lower() + else: + raise ValueError(f"unsupported lang {lang!r}") + wer = _jiwer_wer(truth_n, hypo_n) + return wer, raw_truth, raw_hypo + + +def _pcm_s16le_to_f32_16k(pcm: bytes, pcm_sample_rate: int = 24000) -> np.ndarray: + import scipy.signal + + if not pcm: + return np.zeros(0, dtype=np.float32) + raw = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0 + target_len = int(len(raw) * 16000 / pcm_sample_rate) + if target_len <= 0: + return np.zeros(0, dtype=np.float32) + return scipy.signal.resample(raw, target_len).astype(np.float32) + + +def _eval_submetric_enabled(env_name: str, *, default: bool = True) -> bool: + raw = os.environ.get(env_name, "").strip().lower() + if raw in ("0", "false", "no", "off"): + return False + if raw in ("1", "true", "yes", "on"): + return True + return default + + +def _audio_path_to_f32_16k(path: str) -> np.ndarray: + import scipy.signal + import soundfile as sf + + data, sr = sf.read(path, dtype="float32", always_2d=True) + mono = np.mean(data, axis=1).astype(np.float32) + if int(sr) == 16000: + return mono + target_len = max(1, int(len(mono) * 16000 / int(sr))) + return 
scipy.signal.resample(mono, target_len).astype(np.float32) + + +def _ensure_wavlm_sim() -> None: + global _wavlm_model, _wavlm_processor, _wavlm_device + with _lock: + if _wavlm_model is not None: + return + from transformers import AutoFeatureExtractor, AutoModel + + mid = os.environ.get("SEED_TTS_WAVLM_MODEL", "microsoft/wavlm-base-plus").strip() or "microsoft/wavlm-base-plus" + _wavlm_device = os.environ.get("SEED_TTS_SIM_DEVICE", "").strip() or _get_eval_device() + logger.warning( + "Loading WavLM %r on %s for Seed-TTS SIM (embedding cosine; not identical to " + "seed-tts-eval UniSpeech SV checkpoint).", + mid, + _wavlm_device, + ) + _wavlm_processor = AutoFeatureExtractor.from_pretrained(mid) + _wavlm_model = AutoModel.from_pretrained(mid).to(_wavlm_device) + _wavlm_model.eval() + + +def _wavlm_prepare_waveform(wav: np.ndarray) -> np.ndarray: + """Trim, pad to a minimum length WavLM/Wav2Vec2 CNN stack accepts, float32 mono.""" + max_sec = float(os.environ.get("SEED_TTS_WAVLM_MAX_SECONDS", "30")) + cap = int(max_sec * 16000) + w = np.asarray(wav, dtype=np.float32).reshape(-1) + if len(w) == 0: + return w + if len(w) > cap: + w = w[:cap].copy() + # Very short clips make the strided conv front-end fail (shape / empty time dim). + min_samples = int(os.environ.get("SEED_TTS_WAVLM_MIN_SAMPLES", "4000")) + if len(w) < min_samples: + w = np.pad(w, (0, min_samples - len(w)), mode="constant") + return w + + +def _wavlm_mean_embedding_f32_16k(wav: np.ndarray) -> np.ndarray | None: + import torch + + _ensure_wavlm_sim() + w = _wavlm_prepare_waveform(wav) + if len(w) == 0: + return None + assert _wavlm_processor is not None and _wavlm_model is not None and _wavlm_device is not None + # Single utterance: avoid padding=True (adds zeros that distort mean pooling). Still pass + # attention_mask when the extractor provides it (sample-level; do not mix with hidden length). 
def _ensure_utmos_jit_model() -> Any | None:
    """Load UTMOS as TorchScript (``balacoon/utmos`` style): no ``import utmos`` / fairseq.

    The module is loaded at most once per process and cached in module globals.
    A failed load is remembered, so later calls return ``None`` without
    retrying the download.

    Device selection: ``SEED_TTS_UTMOS_DEVICE`` is honoured when set (for
    example ``cpu`` or ``cuda:1``). When it is unset the previous behaviour is
    kept: ``cuda:0`` if CUDA is available, otherwise CPU. A CUDA device is only
    used when ``torch.cuda.is_available()``; any other value selects CPU. If
    loading onto CUDA fails, the load is retried on CPU.

    Returns:
        The cached TorchScript module, or ``None`` when it cannot be loaded.
    """
    global _utmos_jit_model, _utmos_jit_device, _utmos_jit_load_failed
    with _lock:
        if _utmos_jit_load_failed:
            return None
        if _utmos_jit_model is not None:
            return _utmos_jit_model
        try:
            import torch
            from huggingface_hub import hf_hub_download

            repo = os.environ.get("SEED_TTS_UTMOS_HF_REPO", "balacoon/utmos").strip() or "balacoon/utmos"
            fname = os.environ.get("SEED_TTS_UTMOS_JIT_FILE", "utmos.jit").strip() or "utmos.jit"
            logger.warning(
                "Loading UTMOS TorchScript from Hugging Face %r file %r (one-time download/cache)...",
                repo,
                fname,
            )
            path = hf_hub_download(repo_id=repo, filename=fname, repo_type="model")

            # TODO The model weights in UTMOS must be loaded in cuda:0; otherwise, the model execution will fail.
            # Because of the TODO above, cuda:0 stays the default; an explicit
            # SEED_TTS_UTMOS_DEVICE is an opt-in override.
            want = os.environ.get("SEED_TTS_UTMOS_DEVICE", "").strip().lower() or "cuda:0"
            if want.startswith("cuda") and torch.cuda.is_available():
                idx = want.split(":")[-1] if ":" in want else "0"
                target_dev = f"cuda:{idx}"
            else:
                target_dev = "cpu"

            try:
                # map_location places the weights on the target device at load
                # time instead of moving the scripted module afterwards.
                m = torch.jit.load(path, map_location=target_dev)
                m.eval()
                _utmos_jit_device = target_dev
            except Exception as load_e:
                if target_dev.startswith("cuda"):
                    logger.warning(
                        "UTMOS JIT load on %s failed (%s), retrying on CPU...",
                        target_dev,
                        load_e,
                    )
                    m = torch.jit.load(path, map_location="cpu")
                    m.eval()
                    _utmos_jit_device = "cpu"
                else:
                    raise
            _utmos_jit_model = m
        except Exception as e:
            # Best effort: UTMOS is an optional sub-metric, so any failure
            # disables it for the rest of the process rather than aborting.
            logger.warning(
                "UTMOS JIT unavailable (install torch + huggingface_hub; check HF access): %s",
                e,
            )
            _utmos_jit_load_failed = True
            return None
        return _utmos_jit_model
def _ensure_en_asr() -> None:
    """Load the English Whisper processor and model once and cache them in module globals.

    The model id comes from ``SEED_TTS_HF_WHISPER_MODEL`` (default
    ``OFFICIAL_WHISPER_HF_ID``) and the device from ``_get_eval_device()``.

    Both objects are loaded into locals and published together only after the
    model has been moved to the device. If loading raises part-way (download
    error, out of memory, ...), nothing is cached and the next call retries.
    Previously the processor was cached first, so a failed model load left
    ``_en_model`` as ``None`` while every later call returned early.
    """
    global _en_processor, _en_model, _device
    with _lock:
        if _en_processor is not None and _en_model is not None:
            return
        from transformers import WhisperForConditionalGeneration, WhisperProcessor

        device = _get_eval_device()
        mid = os.environ.get("SEED_TTS_HF_WHISPER_MODEL", OFFICIAL_WHISPER_HF_ID).strip() or OFFICIAL_WHISPER_HF_ID
        logger.warning(
            "Loading Seed-TTS eval Whisper HF model %r on %s (one-time, seed-tts-eval protocol)...",
            mid,
            device,
        )
        processor = WhisperProcessor.from_pretrained(mid)
        model = WhisperForConditionalGeneration.from_pretrained(mid).to(device)
        model.eval()

        # Publish only after everything succeeded, so the globals are never
        # left half-initialised.
        _device = device
        _en_processor = processor
        _en_model = model
_en_processor.get_decoder_prompt_ids(language="english", task="transcribe") + predicted_ids = _en_model.generate(input_features, forced_decoder_ids=forced) + except Exception: + predicted_ids = _en_model.generate( + input_features, + language="english", + task="transcribe", + ) + text = _en_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] + return (text or "").strip() + + +def _transcribe_zh_wav_path(wav_path: str) -> str: + import zhconv + + _ensure_zh_asr() + with _lock: + assert _zh_paraformer is not None + res = _zh_paraformer.generate(input=wav_path, batch_size_s=300) + transcription = res[0]["text"] if res else "" + return zhconv.convert(transcription, "zh-cn").strip() + + +def _missing_deps_message(lang: str) -> str | None: + try: + import jiwer # noqa: F401 + from zhon.hanzi import punctuation # noqa: F401 + except ImportError as e: + return f"Seed-TTS WER eval needs jiwer and zhon ({e!s}). Install: pip install 'vllm-omni[seed-tts-eval]'" + try: + import scipy.signal # noqa: F401 + import soundfile # noqa: F401 + except ImportError as e: + return f"Seed-TTS WER eval needs scipy and soundfile ({e!s})." + if lang == "en": + try: + import torch # noqa: F401 + from transformers import WhisperForConditionalGeneration # noqa: F401 + except ImportError as e: + return f"English WER needs torch and transformers ({e!s}). Install: pip install 'vllm-omni[seed-tts-eval]'" + else: + try: + import zhconv # noqa: F401 + from funasr import AutoModel # noqa: F401 + except ImportError as e: + return f"Chinese WER needs funasr and zhconv ({e!s}). 
Install: pip install 'vllm-omni[seed-tts-eval]'" + return None + + +def compute_seed_tts_wer_metrics( + input_requests: list[SampleRequest], + outputs: list[Any], + *, + include_per_item: bool = False, +) -> dict[str, Any] | None: + """If all requests are :class:`SeedTTSSampleRequest`, run seed-tts-eval-style WER.""" + global _utmos_forward_warned + if not input_requests or len(input_requests) != len(outputs): + return None + if not all(isinstance(r, SeedTTSSampleRequest) for r in input_requests): + return None + + first = input_requests[0] + assert isinstance(first, SeedTTSSampleRequest) + lang = "zh" if (first.seed_tts_locale or "en").lower().startswith("zh") else "en" + + setup_err = _missing_deps_message(lang) + if setup_err: + logger.error("%s", setup_err) + return { + "seed_tts_eval_setup_error": setup_err, + "seed_tts_eval_protocol": "seed-tts-eval", + "seed_tts_content_evaluated": 0, + "seed_tts_content_error_mean": None, + "seed_tts_content_error_median": None, + "seed_tts_request_failed": 0, + "seed_tts_no_pcm": 0, + "seed_tts_asr_failed": 0, + "seed_tts_content_metric": "wer", + } + + import soundfile as sf + + errs: list[float] = [] + items: list[dict[str, Any]] = [] + asr_failed = 0 + no_pcm = 0 + request_failed = 0 + sim_values: list[float] = [] + utmos_values: list[float] = [] + sim_failed = 0 + sim_skipped_no_ref = 0 + utmos_failed = 0 + utmos_on = _eval_submetric_enabled("SEED_TTS_UTMOS_EVAL", default=True) + + for req, out in zip(input_requests, outputs, strict=True): + assert isinstance(req, SeedTTSSampleRequest) + ref = req.prompt + locale = req.seed_tts_locale or "en" + row_lang = "zh" if locale.lower().startswith("zh") else "en" + utmos_v: float | None = None + + if not out.success: + request_failed += 1 + if include_per_item: + items.append( + { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "error": "request_failed", + "detail": (out.error or "")[:500], + } + ) + continue + + pcm = getattr(out, "tts_output_pcm_bytes", 
None) + if not pcm: + no_pcm += 1 + if include_per_item: + items.append( + { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "error": "no_pcm", + } + ) + continue + + wav_16k = _pcm_s16le_to_f32_16k(pcm) + if len(wav_16k) == 0: + asr_failed += 1 + if include_per_item: + items.append( + { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "error": "empty_audio", + } + ) + continue + + # UTMOS scores synthesized audio only; do not gate on ASR/WER (those can fail independently). + if utmos_on: + try: + utmos_v = _utmos_predict_f32_16k(wav_16k) + if utmos_v is not None: + utmos_values.append(utmos_v) + elif not _utmos_jit_load_failed: + utmos_failed += 1 + except Exception: + if not _utmos_forward_warned: + _utmos_forward_warned = True + logger.warning( + "UTMOS JIT forward failed (first utterance=%s; set logging DEBUG for " + "full trace). Check sample rate (16 kHz), input shape, or " + "SEED_TTS_UTMOS_DEVICE.", + req.seed_tts_utterance_id, + exc_info=True, + ) + else: + logger.debug( + "UTMOS forward failed for %s", + req.seed_tts_utterance_id, + exc_info=True, + ) + utmos_failed += 1 + + try: + if row_lang == "en": + hyp = _transcribe_en_f32_16k(wav_16k) + else: + fd, tmp_wav = tempfile.mkstemp(suffix=".wav") + os.close(fd) + try: + sf.write(tmp_wav, wav_16k, 16000, subtype="PCM_16") + hyp = _transcribe_zh_wav_path(tmp_wav) + finally: + try: + os.unlink(tmp_wav) + except OSError: + pass + except Exception as e: + logger.exception("Seed-TTS ASR failed for %s", req.seed_tts_utterance_id) + asr_failed += 1 + if include_per_item: + items.append( + { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "error": "asr_exception", + "detail": str(e)[:500], + } + ) + continue + + if not hyp: + asr_failed += 1 + if include_per_item: + items.append( + { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "error": "empty_asr", + } + ) + continue + + try: + wer, raw_truth, raw_hypo = process_one_official(hyp, ref, 
row_lang) + except Exception as e: + logger.warning("jiwer/normalize failed for %s: %s", req.seed_tts_utterance_id, e) + asr_failed += 1 + if include_per_item: + items.append( + { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "error": "wer_compute_failed", + "detail": str(e)[:500], + } + ) + continue + + errs.append(wer) + sim_v: float | None = None + + if _eval_submetric_enabled("SEED_TTS_SIM_EVAL", default=True): + ref_path = getattr(req, "seed_tts_ref_wav_path", "") or "" + if ref_path and os.path.isfile(ref_path): + try: + ref_wav = _audio_path_to_f32_16k(ref_path) + e_ref = _wavlm_mean_embedding_f32_16k(ref_wav) + e_hyp = _wavlm_mean_embedding_f32_16k(wav_16k) + if e_ref is not None and e_hyp is not None: + sim_v = _cosine_similarity_unit_vectors(e_ref, e_hyp) + sim_values.append(sim_v) + except Exception as e: + logger.warning( + "SIM embedding failed for utterance=%s: %s: %s", + req.seed_tts_utterance_id, + type(e).__name__, + e, + ) + sim_failed += 1 + else: + sim_skipped_no_ref += 1 + + if include_per_item: + row: dict[str, Any] = { + "utterance_id": req.seed_tts_utterance_id, + "locale": locale, + "wer": wer, + "reference_raw": raw_truth, + "asr_raw": raw_hypo, + } + if sim_v is not None: + row["sim"] = sim_v + if utmos_v is not None: + row["utmos"] = utmos_v + items.append(row) + + result: dict[str, Any] = { + "seed_tts_eval_protocol": "seed-tts-eval", + "seed_tts_content_evaluated": len(errs), + "seed_tts_content_error_mean": statistics.fmean(errs) if errs else None, + "seed_tts_content_error_median": statistics.median(errs) if errs else None, + "seed_tts_request_failed": request_failed, + "seed_tts_no_pcm": no_pcm, + "seed_tts_asr_failed": asr_failed, + "seed_tts_content_metric": "wer", + "seed_tts_sim_evaluated": len(sim_values), + "seed_tts_sim_mean": statistics.fmean(sim_values) if sim_values else None, + "seed_tts_sim_median": statistics.median(sim_values) if sim_values else None, + "seed_tts_sim_failed": sim_failed, + 
def print_seed_tts_wer_summary(metrics: dict[str, Any]) -> None:
    """Print the Seed-TTS evaluation table (WER, SIM, UTMOS) to stdout.

    If ``seed_tts_eval_setup_error`` is set, only the banner and that message
    are printed. If the six evaluated/failed counters are all zero, nothing is
    printed. Mean/median rows appear only when their value is not ``None``.
    The SIM and UTMOS sections appear only when they have something to report.

    Args:
        metrics: Mapping holding the ``seed_tts_*`` keys.
    """
    banner = "{s:{c}^{n}}".format(s=" Seed-TTS eval (seed-tts-eval protocol) ", n=50, c="=")

    setup_error = metrics.get("seed_tts_eval_setup_error")
    if setup_error:
        print(banner)
        print(setup_error)
        return

    def as_count(key: str) -> int:
        # Missing keys and None both count as zero.
        return int(metrics.get(key, 0) or 0)

    def show(label: str, value: Any) -> None:
        print("{:<40} {:<10}".format(label, value))

    def show_score(label: str, key: str) -> None:
        score = metrics.get(key)
        if score is not None:
            print("{:<40} {:<10.4f}".format(label, float(score)))

    evaluated = as_count("seed_tts_content_evaluated")
    request_failed = as_count("seed_tts_request_failed")
    no_pcm = as_count("seed_tts_no_pcm")
    asr_failed = as_count("seed_tts_asr_failed")
    sim_evaluated = as_count("seed_tts_sim_evaluated")
    utmos_evaluated = as_count("seed_tts_utmos_evaluated")
    if not (evaluated or request_failed or no_pcm or asr_failed or sim_evaluated or utmos_evaluated):
        return

    print(banner)
    show("Evaluated (WER, lower is better):", evaluated)
    show_score("Mean WER:", "seed_tts_content_error_mean")
    show_score("Median WER:", "seed_tts_content_error_median")
    # These rows print the raw metric values, not the int-coerced counters above.
    show("Request failed:", metrics.get("seed_tts_request_failed", 0))
    show("No PCM captured:", metrics.get("seed_tts_no_pcm", 0))
    show("ASR / WER failed:", metrics.get("seed_tts_asr_failed", 0))

    if sim_evaluated or metrics.get("seed_tts_sim_skipped_no_ref") or metrics.get("seed_tts_sim_failed"):
        show("SIM evaluated (higher ~ closer):", sim_evaluated)
        show_score("Mean SIM:", "seed_tts_sim_mean")
        show_score("Median SIM:", "seed_tts_sim_median")
        show("SIM skipped (no ref path):", metrics.get("seed_tts_sim_skipped_no_ref", 0))
        show("SIM embedding errors:", metrics.get("seed_tts_sim_failed", 0))

    if utmos_evaluated or metrics.get("seed_tts_utmos_failed"):
        show("UTMOS evaluated (JIT MOS, higher better):", utmos_evaluated)
        show_score("Mean UTMOS:", "seed_tts_utmos_mean")
        show_score("Median UTMOS:", "seed_tts_utmos_median")
        show("UTMOS errors:", metrics.get("seed_tts_utmos_failed", 0))

    print("=" * 50)
import patch as _patch_module # noqa: F401 diff --git a/vllm_omni/benchmarks/patch/patch.py b/vllm_omni/benchmarks/patch/patch.py index 17d7498ba26..41aed094235 100644 --- a/vllm_omni/benchmarks/patch/patch.py +++ b/vllm_omni/benchmarks/patch/patch.py @@ -6,6 +6,7 @@ import os import random import ssl +import sys import time import traceback from collections.abc import Iterable @@ -33,15 +34,245 @@ from vllm.tokenizers import TokenizerLike logger = init_logger(__name__) + +from vllm_omni.benchmarks.data_modules.daily_omni_dataset import DailyOmniDataset, DailyOmniSampleRequest from vllm_omni.benchmarks.data_modules.random_multi_modal_dataset import OmniRandomMultiModalDataset +from vllm_omni.benchmarks.data_modules.seed_tts_dataset import ( + SEED_TTS_DEFAULT_OMNI_SYSTEM_PROMPT, + SeedTTSDataset, + SeedTTSSampleRequest, +) get_samples_old = datasets.get_samples +_DEFAULT_DAILY_OMNI_REPO = "liarliar/Daily-Omni" + + +def _seed_tts_capture_pcm_for_wer() -> bool: + return os.environ.get("SEED_TTS_WER_EVAL", "").lower() in ( + "1", + "true", + "yes", + ) + + +def _merge_extra_body_mm_kwargs(base: dict | None, overlay: dict | None) -> dict | None: + """Shallow-merge ``extra_body`` dicts; deep-merge ``mm_processor_kwargs`` if both set.""" + if not base and not overlay: + return None + out = dict(base or {}) + if not overlay: + return out + for k, v in overlay.items(): + if k == "mm_processor_kwargs" and isinstance(v, dict): + prev = out.get("mm_processor_kwargs") + merged_kw = {**(prev if isinstance(prev, dict) else {}), **v} + out["mm_processor_kwargs"] = merged_kw + else: + out[k] = v + return out + + +def _attach_daily_omni_to_request_func_input(sample: SampleRequest, rfi: RequestFuncInput) -> None: + """Apply per-request OpenAI fields (``mm_processor_kwargs``, messages) for Daily-Omni.""" + if not isinstance(sample, DailyOmniSampleRequest): + return + rfi.extra_body = _merge_extra_body_mm_kwargs(rfi.extra_body, sample.omni_extra_body) + if sample.omni_chat_messages is 
not None: + setattr(rfi, "omni_chat_messages", sample.omni_chat_messages) + else: + setattr(rfi, "mm_position", sample.omni_chat_mm_position) + + +def _attach_seed_tts_to_request_func_input(sample: SampleRequest, rfi: RequestFuncInput) -> None: + """Merge Seed-TTS per-row TTS fields (ref_audio, ref_text, task_type, …) into ``extra_body``. + + Used by both ``/v1/audio/speech`` and ``/v1/chat/completions`` (flattened into JSON body). + For ``openai-chat-omni``, also sets ``omni_chat_messages`` (system + user) so Qwen3-Omni + follows the same role layout as official TTS / multimodal demos. ``/v1/audio/speech`` ignores + ``messages`` and only uses ``input`` + body fields. + Flags ``openai-chat-omni`` to request audio output and optionally export PCM for WER. + """ + if not isinstance(sample, SeedTTSSampleRequest): + return + ex = sample.seed_tts_speech_extra + if not ex: + return + base = dict(rfi.extra_body) if rfi.extra_body else {} + base.update(ex) + rfi.extra_body = base + # Used by request funcs to force streaming TTS behavior and to export PCM when WER is on. + setattr(rfi, "seed_tts_row", True) + sys_prompt = (sample.seed_tts_system_prompt or "").strip() or SEED_TTS_DEFAULT_OMNI_SYSTEM_PROMPT + setattr( + rfi, + "omni_chat_messages", + [ + {"role": "system", "content": [{"type": "text", "text": sys_prompt}]}, + {"role": "user", "content": [{"type": "text", "text": sample.prompt}]}, + ], + ) + + +def _daily_omni_repo_from_args(args) -> str | None: + """Resolve HuggingFace repo id for Daily-Omni from CLI args. + + vLLM allows ``--dataset-path`` to be a local path while the real HF id is + passed via ``--hf-name``. Upstream ``get_samples`` for ``hf`` only matches + a fixed elif-chain and never discovers Omni's loader, so we must detect + Daily-Omni here using either field. 
+ """ + dp = getattr(args, "dataset_path", None) + hn = getattr(args, "hf_name", None) + if dp in DailyOmniDataset.SUPPORTED_DATASET_PATHS: + return dp + if hn in DailyOmniDataset.SUPPORTED_DATASET_PATHS: + return hn + return None + def get_samples(args, tokenizer): - if args.backend not in ["openai-chat-omni", "openai-audio-speech"]: + # Daily-Omni: explicit dataset name, or hf + matching path/hf-name + is_daily_omni = args.dataset_name == "daily-omni" or ( + args.dataset_name == "hf" and _daily_omni_repo_from_args(args) is not None + ) + is_seed_tts = args.dataset_name == "seed-tts" + + # Check if we need to handle omni-related backends/datasets + is_omni_backend = args.backend in ["openai-chat-omni", "openai-audio-speech", "daily-omni"] + is_omni_dataset = is_daily_omni or is_seed_tts or args.dataset_name == "random-mm" + + if not is_omni_backend and not is_omni_dataset: + # Not an omni-related request, delegate to original implementation return get_samples_old(args, tokenizer) - elif args.dataset_name == "random-mm": + + # Handle Daily-Omni dataset + if is_daily_omni: + # Support: + # --dataset-name daily-omni [--dataset-path liarliar/Daily-Omni] + # --dataset-name daily-omni --daily-omni-qa-json /path/to/qa.json (offline QA) + # --dataset-name hf --dataset-path liarliar/Daily-Omni + # --dataset-name hf --hf-name liarliar/Daily-Omni (dataset-path may be local) + + # Validate backend supports multimodal (video) + if args.backend not in ["openai-chat-omni", "daily-omni"]: + raise ValueError( + f"Daily-Omni dataset requires a multimodal backend that supports video. " + f"Got backend='{args.backend}'. 
Please use '--backend openai-chat-omni'" + ) + + # Determine video directory if specified (for local video files) + video_dir = getattr(args, "daily_omni_video_dir", None) + + # Get HF split (default to "train"; unused when loading from local qa.json) + dataset_split = getattr(args, "hf_split", None) or "train" + + qa_json = getattr(args, "daily_omni_qa_json", None) + if isinstance(qa_json, str): + qa_json = qa_json.strip() or None + + if qa_json is not None: + logger.info( + "Loading Daily-Omni dataset: qa_json=%s, video_dir=%s (Hub not used for QA)", + qa_json, + video_dir, + ) + dataset = DailyOmniDataset( + qa_json_path=qa_json, + dataset_path=None, + dataset_split=dataset_split, + random_seed=args.seed, + video_dir=video_dir, + input_mode=getattr(args, "daily_omni_input_mode", "all"), + inline_local_video=getattr(args, "daily_omni_inline_local_video", False), + trust_remote_code=getattr(args, "trust_remote_code", False), + disable_shuffle=getattr(args, "disable_shuffle", False), + ) + else: + repo_id = _daily_omni_repo_from_args(args) + if args.dataset_name == "daily-omni": + if repo_id is None: + repo_id = _DEFAULT_DAILY_OMNI_REPO + elif repo_id is None: + raise ValueError( + "Daily-Omni with --dataset-name hf requires " + f"--dataset-path {_DEFAULT_DAILY_OMNI_REPO} or " + f"--hf-name {_DEFAULT_DAILY_OMNI_REPO}." 
+ ) + + logger.info( + "Loading Daily-Omni dataset: hf_repo=%s, split=%s, video_dir=%s", + repo_id, + dataset_split, + video_dir, + ) + + dataset = DailyOmniDataset( + dataset_path=repo_id, + dataset_split=dataset_split, + dataset_subset=getattr(args, "hf_subset", None), + random_seed=args.seed, + video_dir=video_dir, + input_mode=getattr(args, "daily_omni_input_mode", "all"), + inline_local_video=getattr(args, "daily_omni_inline_local_video", False), + trust_remote_code=getattr(args, "trust_remote_code", False), + no_stream=getattr(args, "no_stream", False), + disable_shuffle=getattr(args, "disable_shuffle", False), + ) + + out_len = getattr(args, "output_len", None) + if out_len is None: + out_len = getattr(args, "hf_output_len", None) + if out_len is None: + out_len = DailyOmniDataset.DEFAULT_OUTPUT_LEN + + input_requests = dataset.sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=out_len, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ) + return input_requests + + if is_seed_tts: + if args.backend not in ("openai-audio-speech", "openai-chat-omni"): + raise ValueError( + "Seed-TTS requires --backend openai-audio-speech (POST /v1/audio/speech) or " + "--backend openai-chat-omni (POST /v1/chat/completions with ref_audio/ref_text). " + f"Got backend={args.backend!r}." + ) + repo_id = getattr(args, "dataset_path", None) or getattr(args, "hf_name", None) + if not repo_id: + raise ValueError( + "Seed-TTS requires --dataset-path (HF dataset repo id or local directory) or " + "--hf-name for the Hub dataset id." 
+ ) + + dataset = SeedTTSDataset( + dataset_path=repo_id, + random_seed=args.seed, + locale=getattr(args, "seed_tts_locale", "en"), + inline_ref_audio=not getattr(args, "seed_tts_file_ref_audio", False), + seed_tts_root=getattr(args, "seed_tts_root", None), + system_prompt=getattr(args, "seed_tts_system_prompt", None), + disable_shuffle=getattr(args, "disable_shuffle", False), + ) + out_len = getattr(args, "output_len", None) + if out_len is None: + out_len = getattr(args, "hf_output_len", None) + if out_len is None: + out_len = SeedTTSDataset.DEFAULT_OUTPUT_LEN + return dataset.sample( + tokenizer=tokenizer, + num_requests=args.num_prompts, + output_len=out_len, + request_id_prefix=args.request_id_prefix, + no_oversample=args.no_oversample, + ) + + # Handle random-mm dataset (Omni's synthetic multimodal dataset) + if args.dataset_name == "random-mm": dataset = OmniRandomMultiModalDataset(random_seed=args.seed, dataset_path=args.dataset_path) input_requests = dataset.sample( tokenizer=tokenizer, @@ -64,6 +295,10 @@ def get_samples(args, tokenizer): datasets.get_samples = get_samples +_serve_mod = sys.modules.get("vllm.benchmarks.serve") +if _serve_mod is not None: + _serve_mod.get_samples = get_samples + @dataclass class MixRequestFuncOutput(RequestFuncOutput): @@ -72,6 +307,9 @@ class MixRequestFuncOutput(RequestFuncOutput): audio_frames: int = 0 audio_rtf: float = 0.0 text_latency: float = 0.0 + #: Raw PCM s16le mono at 24 kHz for Seed-TTS WER: from ``/v1/audio/speech`` stream or + #: resampled export after ``openai-chat-omni`` audio deltas. 
+ tts_output_pcm_bytes: bytes | None = None async def async_request_openai_chat_omni_completions( @@ -83,13 +321,17 @@ async def async_request_openai_chat_omni_completions( api_url = request_func_input.api_url _validate_api_url(api_url, "OpenAI Chat Completions API", "chat/completions") - content = _get_chat_content(request_func_input, mm_position=mm_position) + omni_messages = getattr(request_func_input, "omni_chat_messages", None) + if omni_messages is not None: + messages_payload = omni_messages + else: + effective_mm_position = getattr(request_func_input, "mm_position", mm_position) + content = _get_chat_content(request_func_input, mm_position=effective_mm_position) + messages_payload = [{"role": "user", "content": content}] payload = { "model": request_func_input.model_name if request_func_input.model_name else request_func_input.model, - "messages": [ - {"role": "user", "content": content}, - ], + "messages": messages_payload, "temperature": 0.0, "max_tokens": request_func_input.output_len, "stream": True, @@ -98,6 +340,10 @@ async def async_request_openai_chat_omni_completions( }, } _update_payload_common(payload, request_func_input) + # Seed-TTS via chat: voice-clone fields live on the body; ensure audio is streamed. 
+ if getattr(request_func_input, "seed_tts_row", False): + if payload.get("modalities") is None: + payload["modalities"] = ["text", "audio"] response_format = payload.get("response_format", "wav") if response_format == "pcm": @@ -167,7 +413,10 @@ async def async_request_openai_chat_omni_completions( data = json.loads(chunk) if choices := data.get("choices"): modality = data.get("modality") - content = choices[0]["delta"].get("content") + delta = choices[0].get("delta") or {} + content = delta.get("content") + if not content and isinstance(delta.get("audio"), dict): + content = delta["audio"].get("data") if modality == "text": # First token if ttft == 0.0: @@ -182,7 +431,7 @@ async def async_request_openai_chat_omni_completions( if output.audio_ttfp == 0.0: output.audio_ttfp = timestamp - st audio_generate_time = timestamp - st - if content != "": + if content: audio_bytes = base64.b64decode(content) seg = AudioSegment.from_file(io.BytesIO(audio_bytes)) if seg is not None: @@ -214,6 +463,12 @@ async def async_request_openai_chat_omni_completions( else: output.audio_rtf = 0 logger.warning("Audio duration is zero") + if _seed_tts_capture_pcm_for_wer() and getattr(request_func_input, "seed_tts_row", False): + try: + seg = generated_audio.set_frame_rate(24000).set_channels(1).set_sample_width(2) + output.tts_output_pcm_bytes = bytes(seg.raw_data) + except Exception as ex: + logger.warning("seed_tts WER PCM export failed: %s", ex) output.success = True else: output.error = response.reason or "" @@ -268,6 +523,10 @@ async def async_request_openai_audio_speech( "response_format": "pcm", } _update_payload_common(payload, request_func_input) + # Seed-TTS + WER: ``--extra-body`` may set stream=false / other formats; speech must stream PCM. 
+ if getattr(request_func_input, "seed_tts_row", False) and _seed_tts_capture_pcm_for_wer(): + payload["stream"] = True + payload["response_format"] = "pcm" headers = { "Content-Type": "application/json", @@ -286,6 +545,8 @@ async def async_request_openai_audio_speech( st = time.perf_counter() output.start_time = st total_pcm_bytes = 0 + capture_wer_pcm = _seed_tts_capture_pcm_for_wer() and getattr(request_func_input, "seed_tts_row", False) + pcm_capture = bytearray() if capture_wer_pcm else None try: async with session.post(url=api_url, json=payload, headers=headers) as response: if response.status == 200: @@ -297,6 +558,8 @@ async def async_request_openai_audio_speech( output.audio_ttfp = timestamp - st output.ttft = output.audio_ttfp total_pcm_bytes += len(chunk) + if pcm_capture is not None: + pcm_capture.extend(chunk) end_time = time.perf_counter() output.latency = end_time - st @@ -309,6 +572,16 @@ async def async_request_openai_audio_speech( else: output.audio_rtf = 0 logger.warning("Audio duration is zero") + if pcm_capture is not None and pcm_capture: + output.tts_output_pcm_bytes = bytes(pcm_capture) + elif capture_wer_pcm: + ct = response.headers.get("Content-Type", "") + logger.warning( + "Seed-TTS WER: HTTP 200 but no PCM bytes (Content-Type=%r, url=%s). 
" + "Check stream=true and response_format=pcm on the server.", + ct, + api_url, + ) output.success = True else: output.error = response.reason or "" @@ -331,6 +604,12 @@ async def async_request_openai_audio_speech( if "openai-audio-speech" not in OPENAI_COMPATIBLE_BACKENDS: OPENAI_COMPATIBLE_BACKENDS.append("openai-audio-speech") +# Daily-Omni backend for audio-visual reasoning benchmark +# Reuses openai-chat-omni completions for video+text understanding +ASYNC_REQUEST_FUNCS["daily-omni"] = async_request_openai_chat_omni_completions +if "daily-omni" not in OPENAI_COMPATIBLE_BACKENDS: + OPENAI_COMPATIBLE_BACKENDS.append("daily-omni") + # ruff: noqa: E402 # Prevent import order from causing patch failures from vllm.benchmarks import serve @@ -422,6 +701,8 @@ async def benchmark( extra_headers=extra_headers, extra_body=extra_body, ) + _attach_daily_omni_to_request_func_input(input_requests[0], test_input) + _attach_seed_tts_to_request_func_input(input_requests[0], test_input) if ready_check_timeout_sec > 0: test_output = await wait_for_endpoint( @@ -484,6 +765,8 @@ async def warmup_limited_request_func(): extra_headers=extra_headers, extra_body=extra_body, ) + _attach_daily_omni_to_request_func_input(input_requests[0], profile_input) + _attach_seed_tts_to_request_func_input(input_requests[0], profile_input) profile_output = await request_func(request_func_input=profile_input, session=session) if profile_output.success: print("Profiler started") @@ -564,6 +847,8 @@ async def limited_request_func(request_func_input, session, pbar): extra_body=extra_body, request_id=request_id, ) + _attach_daily_omni_to_request_func_input(request, request_func_input) + _attach_seed_tts_to_request_func_input(request, request_func_input) tasks.append( asyncio.create_task(limited_request_func(request_func_input=request_func_input, session=session, pbar=pbar)) ) @@ -631,6 +916,37 @@ async def limited_request_func(request_func_input, session, pbar): "errors": [output.error for output in 
outputs], } + from vllm_omni.benchmarks.data_modules.daily_omni_eval import ( + compute_daily_omni_accuracy_metrics, + print_daily_omni_accuracy_summary, + ) + + _save_items = os.environ.get("DAILY_OMNI_SAVE_EVAL_ITEMS", "").lower() in ( + "1", + "true", + "yes", + ) + _daily_acc = compute_daily_omni_accuracy_metrics(input_requests, outputs, include_per_item=_save_items) + if _daily_acc is not None: + result.update(_daily_acc) + print_daily_omni_accuracy_summary(_daily_acc) + + if _seed_tts_capture_pcm_for_wer(): + from vllm_omni.benchmarks.data_modules.seed_tts_eval import ( + compute_seed_tts_wer_metrics, + print_seed_tts_wer_summary, + ) + + _save_wer = os.environ.get("SEED_TTS_WER_SAVE_ITEMS", "").lower() in ( + "1", + "true", + "yes", + ) + _wer_m = compute_seed_tts_wer_metrics(input_requests, outputs, include_per_item=_save_wer) + if _wer_m is not None: + result.update(_wer_m) + print_seed_tts_wer_summary(_wer_m) + if rps_change_events: result["rps_change_events"] = rps_change_events diff --git a/vllm_omni/benchmarks/serve.py b/vllm_omni/benchmarks/serve.py index fe946036931..d3f3510c567 100644 --- a/vllm_omni/benchmarks/serve.py +++ b/vllm_omni/benchmarks/serve.py @@ -1,9 +1,21 @@ import argparse import asyncio +import os from typing import Any from vllm.benchmarks.serve import main_async +# Import patch to register daily-omni dataset and omni backends +# This monkey-patches vllm.benchmarks.datasets.get_samples before it's used +# Must be imported before any vllm.benchmarks module usage +import vllm_omni.benchmarks.patch.patch # noqa: F401 + def main(args: argparse.Namespace) -> dict[str, Any]: + if getattr(args, "seed_tts_wer_eval", False): + os.environ["SEED_TTS_WER_EVAL"] = "1" + if getattr(args, "seed_tts_wer_save_items", False): + os.environ["SEED_TTS_WER_SAVE_ITEMS"] = "1" + if getattr(args, "daily_omni_save_eval_items", False): + os.environ["DAILY_OMNI_SAVE_EVAL_ITEMS"] = "1" return asyncio.run(main_async(args)) diff --git 
a/vllm_omni/entrypoints/cli/benchmark/serve.py b/vllm_omni/entrypoints/cli/benchmark/serve.py index 906e8851a4a..d281432e59b 100644 --- a/vllm_omni/entrypoints/cli/benchmark/serve.py +++ b/vllm_omni/entrypoints/cli/benchmark/serve.py @@ -1,4 +1,5 @@ import argparse +import os from vllm.benchmarks.serve import add_cli_args @@ -6,15 +7,149 @@ from vllm_omni.entrypoints.cli.benchmark.base import OmniBenchmarkSubcommandBase +def add_daily_omni_cli_args(parser: argparse.ArgumentParser) -> None: + """Add CLI arguments specific to Daily-Omni dataset. + + This function should be called by the CLI entrypoint to add additional + arguments for daily-omni benchmark support. + + Args: + parser: The ArgumentParser instance to extend + """ + # Daily-Omni specific arguments + daily_omni_group = parser.add_argument_group("Daily-Omni Dataset Options") + + daily_omni_group.add_argument( + "--daily-omni-qa-json", + type=str, + default=None, + help="Path to local upstream qa.json. When set, QA rows are read from this file and " + "the HuggingFace dataset is not loaded (no network). Use with --daily-omni-video-dir " + "for fully offline runs. --dataset-path / Hub split flags are then ignored for QA loading.", + ) + daily_omni_group.add_argument( + "--daily-omni-video-dir", + type=str, + default=None, + help="Root directory of extracted Daily-Omni videos (contents of Videos.tar: " + "each video_id in its own subdir with {video_id}_video.mp4). " + "When using file URLs, you MUST start the vLLM server with " + "--allowed-local-media-path set to this same directory (or a parent), " + "otherwise requests fail with 'Cannot load local files without " + "--allowed-local-media-path'.", + ) + daily_omni_group.add_argument( + "--daily-omni-inline-local-video", + action="store_true", + default=False, + help="For local videos only: embed MP4 as base64 data URLs in benchmark " + "requests so the server does not need --allowed-local-media-path. 
" + "Increases request size and client memory; use for small --num-prompts. " + "When using --daily-omni-input-mode audio or all, local WAV files are " + "embedded the same way.", + ) + daily_omni_group.add_argument( + "--daily-omni-input-mode", + type=str, + choices=["all", "visual", "audio"], + default="all", + help="Daily-Omni input protocol (mirrors upstream Lliar-liar/Daily-Omni " + "--input_mode). 'visual': video only (default). 'audio': WAV only, " + "requires {video_id}/{video_id}_audio.wav under --daily-omni-video-dir. " + "'all': video + WAV together. Sets mm_processor_kwargs.use_audio_in_video=false " + "and matches official separate video/audio streams.", + ) + daily_omni_group.add_argument( + "--daily-omni-save-eval-items", + action="store_true", + default=False, + help="Include per-request Daily-Omni accuracy rows (gold/predicted/correct) " + "in the saved JSON under key daily_omni_eval_items. " + "Alternatively set env DAILY_OMNI_SAVE_EVAL_ITEMS=1.", + ) + + # Note: --dataset-name daily-omni via get_samples patch; use either Hub (--dataset-path + # liarliar/Daily-Omni) or local --daily-omni-qa-json (offline). + + +def add_seed_tts_cli_args(parser: argparse.ArgumentParser) -> None: + """CLI for Seed-TTS zero-shot TTS benchmark (``--dataset-name seed-tts``).""" + g = parser.add_argument_group("Seed-TTS Dataset Options") + g.add_argument( + "--seed-tts-locale", + type=str, + choices=["en", "zh"], + default="en", + help="Which Seed-TTS split to load: en/meta.lst or zh/meta.lst under the dataset root.", + ) + g.add_argument( + "--seed-tts-root", + type=str, + default=None, + help="Override root directory that contains en/ and zh/ (meta.lst + prompt-wavs). " + "If set, --dataset-path can still name the HF repo for logging; this path is used for files.", + ) + g.add_argument( + "--seed-tts-file-ref-audio", + action="store_true", + default=False, + help="Send ref_audio as file:// URIs (smaller HTTP bodies). 
Requires the API server " + "to be started with --allowed-local-media-path covering the Seed-TTS dataset root. " + "Default is inline data:audio/wav;base64 so Qwen3-TTS works without that flag.", + ) + g.add_argument( + "--seed-tts-inline-ref-audio", + action="store_true", + default=False, + help=argparse.SUPPRESS, + ) + g.add_argument( + "--seed-tts-system-prompt", + type=str, + default=None, + help="Override chat system message for --backend openai-chat-omni (Qwen3-Omni TTS). " + "Default follows official Qwen3-Omni identity + zero-shot voice-clone instructions.", + ) + g.add_argument( + "--seed-tts-wer-eval", + action="store_true", + default=False, + help="Keep synthesized audio as 24 kHz mono PCM for WER (works with " + "--backend openai-audio-speech or openai-chat-omni). Scoring follows " + "BytedanceSpeech/seed-tts-eval (Whisper-large-v3 / Paraformer-zh + jiwer). " + "Sets SEED_TTS_WER_EVAL=1. Install: pip install 'vllm-omni[seed-tts-eval]'. " + "Optional: SEED_TTS_EVAL_DEVICE, SEED_TTS_HF_WHISPER_MODEL.", + ) + g.add_argument( + "--seed-tts-wer-save-items", + action="store_true", + default=False, + help="Include per-utterance ASR rows in the saved JSON under key seed_tts_wer_eval_items. " + "Or set SEED_TTS_WER_SAVE_ITEMS=1.", + ) + + class OmniBenchmarkServingSubcommand(OmniBenchmarkSubcommandBase): """The `serve` subcommand for vllm bench.""" name = "serve" - help = "Benchmark the online serving throughput." + help = "Benchmark the online serving throughput. Supports Daily-Omni and Seed-TTS datasets." 
@classmethod def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: add_cli_args(parser) + + # Add Daily-Omni specific arguments + add_daily_omni_cli_args(parser) + add_seed_tts_cli_args(parser) + + for action in parser._actions: + if action.dest == "dataset_name" and action.choices is not None: + extra = [c for c in ("daily-omni", "seed-tts") if c not in action.choices] + if extra: + action.choices = list(action.choices) + extra + + # Update help messages for omni-specific features for action in parser._actions: if action.dest == "percentile_metrics": action.help = ( @@ -48,4 +183,10 @@ def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: @staticmethod def cmd(args: argparse.Namespace) -> None: + if getattr(args, "daily_omni_save_eval_items", False): + os.environ["DAILY_OMNI_SAVE_EVAL_ITEMS"] = "1" + if getattr(args, "seed_tts_wer_eval", False): + os.environ["SEED_TTS_WER_EVAL"] = "1" + if getattr(args, "seed_tts_wer_save_items", False): + os.environ["SEED_TTS_WER_SAVE_ITEMS"] = "1" main(args) From 0d020739a7d85e2b2ec2d30f26d0d741b4f4fb98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zeyu=20Huang=20=7C=20=E9=BB=83=E6=BE=A4=E5=AE=87?= <11222265+fhfuih@users.noreply.github.com> Date: Wed, 15 Apr 2026 11:13:11 +0800 Subject: [PATCH 44/76] [CI] qwen image edit L4 accuracy test (#2761) --- .buildkite/test-nightly-diffusion.yml | 40 ++++ pyproject.toml | 1 + tests/conftest.py | 10 +- tests/e2e/accuracy/conftest.py | 25 +++ tests/e2e/accuracy/test_qwen_image_edit.py | 232 +++++++++++++++++++++ tests/e2e/accuracy/utils.py | 74 +++++++ 6 files changed, 377 insertions(+), 5 deletions(-) create mode 100644 tests/e2e/accuracy/test_qwen_image_edit.py create mode 100644 tests/e2e/accuracy/utils.py diff --git a/.buildkite/test-nightly-diffusion.yml b/.buildkite/test-nightly-diffusion.yml index a520ca4356d..b5ba8a117c6 100644 --- a/.buildkite/test-nightly-diffusion.yml +++ b/.buildkite/test-nightly-diffusion.yml @@ -375,3 +375,43 @@ steps: hostPath: path: 
/mnt/hf-cache type: DirectoryOrCreate + + - label: ":full_moon: Diffusion · Qwen-Image · Accuracy Test" + key: nightly-qwen-image-accuracy + timeout_in_minutes: 180 + if: *nightly_or_pr_label + commands: + - pytest -s -v tests/e2e/accuracy/test_qwen_image*.py --run-level advanced_model + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate diff --git a/pyproject.toml b/pyproject.toml index 753e0e39817..9b034a7c8e9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ dev = [ "pyttsx3>=2.99", "opencc>=1.2.0", "mistune>=3.2.0", # for example tests + "torchmetrics>=1.4.0", # for accuracy similarity metrics ] demo = [ diff --git a/tests/conftest.py b/tests/conftest.py index adb87cbd728..4ad4706fc1f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2397,7 +2397,7 @@ def _process_diffusion_response(self, chat_completion) -> DiffusionResponse: image_url = item.get("image_url", {}).get("url") else: image_url_obj = getattr(item, "image_url", None) - image_url = hasattr(image_url_obj, "url", None) if image_url_obj else None + image_url = getattr(image_url_obj, "url", None) if image_url_obj else None if image_url and image_url.startswith("data:image"): b64_data = image_url.split(",", 1)[1] img = decode_b64_image(b64_data) @@ -2703,7 +2703,7 @@ def _stream_task(): return responses - def 
send_diffusion_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[OmniResponse]: + def send_diffusion_request(self, request_config: dict[str, Any], request_num: int = 1) -> list[DiffusionResponse]: """ Send OpenAI requests for diffusion models. @@ -2711,9 +2711,9 @@ def send_diffusion_request(self, request_config: dict[str, Any], request_num: in request_config: Request configuration dictionary containing parameters like model, messages request_num: Number of requests to send concurrently, defaults to 1 (single request) Returns: - List[OmniResponse]: List of response objects + List[DiffusionResponse]: List of response objects """ - responses = [] + responses: list[DiffusionResponse] = [] stream = request_config.get("stream", False) modalities = request_config.get("modalities", omit) # Most diffusion models don't require modalities param extra_body = request_config.get("extra_body", None) @@ -2876,7 +2876,7 @@ def _build_url(self, path: str) -> str: return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}" -@pytest.fixture +@pytest.fixture(scope="module") def openai_client(request: pytest.FixtureRequest, run_level: str): """Create OpenAIClientHandler fixture to facilitate communication with OmniServer with encapsulated request sending, concurrent requests, response handling, and validation.""" diff --git a/tests/e2e/accuracy/conftest.py b/tests/e2e/accuracy/conftest.py index 062750b3cd2..3d614b8cdc1 100644 --- a/tests/e2e/accuracy/conftest.py +++ b/tests/e2e/accuracy/conftest.py @@ -5,10 +5,13 @@ import subprocess from contextlib import contextmanager from dataclasses import dataclass +from io import BytesIO from pathlib import Path import pytest +import requests import torch +from PIL import Image from tests.conftest import OmniServer, OmniServerParams @@ -183,6 +186,28 @@ def accuracy_artifact_root() -> Path: return root +@pytest.fixture(scope="session") +def qwen_bear_image(accuracy_artifact_root: Path) -> Image.Image: + """Download the 
Qwen bear image from the URL and save it to the accuracy artifact root.""" + QWEN_BEAR_IMAGE_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/omni-assets/qwen-bear.png" + response = requests.get(QWEN_BEAR_IMAGE_URL, timeout=60) + response.raise_for_status() + image = Image.open(BytesIO(response.content)).convert("RGB") + image.save(accuracy_artifact_root / "qwen_bear.png") + return image + + +@pytest.fixture(scope="session") +def rabbit_image(accuracy_artifact_root: Path) -> Image.Image: + """Download the rabbit image from the URL and save it to the accuracy artifact root.""" + RABBIT_IMAGE_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/omni-assets/rabbit.png" + response = requests.get(RABBIT_IMAGE_URL, timeout=60) + response.raise_for_status() + image = Image.open(BytesIO(response.content)).convert("RGB") + image.save(accuracy_artifact_root / "rabbit.png") + return image + + def reset_artifact_dir(path: Path) -> Path: if path.exists(): shutil.rmtree(path) diff --git a/tests/e2e/accuracy/test_qwen_image_edit.py b/tests/e2e/accuracy/test_qwen_image_edit.py new file mode 100644 index 00000000000..9a970103438 --- /dev/null +++ b/tests/e2e/accuracy/test_qwen_image_edit.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +import gc +from pathlib import Path + +import pytest +import requests +import torch +from diffusers import QwenImageEditPipeline, QwenImageEditPlusPipeline +from PIL import Image + +from benchmarks.accuracy.common import decode_base64_image, pil_to_png_bytes +from tests.conftest import ( + OmniServer, + _run_post_test_cleanup, + _run_pre_test_cleanup, +) +from tests.e2e.accuracy.utils import assert_similarity, model_output_dir +from tests.utils import hardware_test + +SINGLE_MODEL = "Qwen/Qwen-Image-Edit" +MULTIPLE_MODEL = "Qwen/Qwen-Image-Edit-2509" +WIDTH = 512 +HEIGHT = 512 +NUM_INFERENCE_STEPS = 20 +TRUE_CFG_SCALE = 4.0 +SEED = 42 +SSIM_THRESHOLD = 0.94 +PSNR_THRESHOLD = 28.0 + +PROMPT_SINGLE_IMAGE = "The input is a 
2D cartoon bear mascot. Restyle it into a painterly oil artwork with warm colors while preserving the main structure." +PROMPT_MULTIPLE_IMAGE = "Put the cartoon bear mascot and the furry rabbit into one coherent scene with a painterly oil artwork style and consistent lighting." +NEGATIVE_PROMPT = "low quality, blurry, artifacts, distortion" +SERVER_ARGS = ["--num-gpus", "1", "--stage-init-timeout", "300", "--init-timeout", "900"] + + +def _run_vllm_omni_image_edit( + *, + omni_server: OmniServer, + prompt: str, + input_images: list[Image.Image], + output_path: Path, +) -> Image.Image: + response = requests.post( + f"http://{omni_server.host}:{omni_server.port}/v1/images/edits", + data={ + "model": omni_server.model, + "prompt": prompt, + "size": f"{WIDTH}x{HEIGHT}", + "n": 1, + "response_format": "b64_json", + "negative_prompt": NEGATIVE_PROMPT, + "num_inference_steps": NUM_INFERENCE_STEPS, + "true_cfg_scale": TRUE_CFG_SCALE, + "seed": SEED, + }, + files=[ + ("image", (f"image_{index}.png", pil_to_png_bytes(image), "image/png")) + for index, image in enumerate(input_images) + ], + timeout=600, + ) + response.raise_for_status() + payload = response.json() + assert len(payload["data"]) == 1 + image = decode_base64_image(payload["data"][0]["b64_json"]) + image.load() + image.save(output_path) + return image + + +def _run_diffusers_image_edit( + *, + model: str, + pipeline_class: type[QwenImageEditPipeline] | type[QwenImageEditPlusPipeline], + prompt: str, + input_images: list[Image.Image], + output_path: Path, +) -> Image.Image: + _run_pre_test_cleanup(enable_force=True) + pipe: QwenImageEditPipeline | QwenImageEditPlusPipeline | None = None + device = torch.device("cuda:0") + torch.cuda.set_device(device) + try: + images = input_images[0] if len(input_images) == 1 else input_images + pipe = pipeline_class.from_pretrained( + model, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + ).to(device) + pipe.set_progress_bar_config(disable=False) + generator = 
torch.Generator(device=device).manual_seed(SEED) + result = pipe( # pyright: ignore[reportCallIssue] + prompt=prompt, + image=images, + negative_prompt=NEGATIVE_PROMPT, + num_inference_steps=NUM_INFERENCE_STEPS, + true_cfg_scale=TRUE_CFG_SCALE, + width=WIDTH, + height=HEIGHT, + generator=generator, + ) + output_image = result.images[0].convert("RGB") # pyright: ignore[reportAttributeAccessIssue] + output_image.save(output_path) + return output_image + finally: + if pipe is not None and hasattr(pipe, "maybe_free_model_hooks"): + pipe.maybe_free_model_hooks() + del pipe + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + _run_post_test_cleanup(enable_force=True) + + +def _vllm_omni_output_single_image( + accuracy_artifact_root: Path, + qwen_bear_image: Image.Image, +) -> Image.Image: + output_dir = model_output_dir(accuracy_artifact_root, SINGLE_MODEL) + output_path = output_dir / "vllm_omni_single.png" + with OmniServer(model=SINGLE_MODEL, serve_args=SERVER_ARGS) as server: + output = _run_vllm_omni_image_edit( + omni_server=server, + prompt=PROMPT_SINGLE_IMAGE, + input_images=[qwen_bear_image], + output_path=output_path, + ) + return output + + +def _diffusers_output_single_image(accuracy_artifact_root: Path, qwen_bear_image: Image.Image) -> Image.Image: + output_dir = model_output_dir(accuracy_artifact_root, SINGLE_MODEL) + output_path = output_dir / "diffusers_single.png" + return _run_diffusers_image_edit( + model=SINGLE_MODEL, + pipeline_class=QwenImageEditPipeline, + prompt=PROMPT_SINGLE_IMAGE, + input_images=[qwen_bear_image], + output_path=output_path, + ) + + +def _vllm_omni_output_multiple_image( + accuracy_artifact_root: Path, + qwen_bear_image: Image.Image, + rabbit_image: Image.Image, +) -> Image.Image: + output_dir = model_output_dir(accuracy_artifact_root, MULTIPLE_MODEL) + output_path = output_dir / "vllm_omni_multiple.png" + with OmniServer(model=MULTIPLE_MODEL, serve_args=SERVER_ARGS) as server: + output = 
_run_vllm_omni_image_edit( + omni_server=server, + prompt=PROMPT_MULTIPLE_IMAGE, + input_images=[qwen_bear_image, rabbit_image], + output_path=output_path, + ) + return output + + +def _diffusers_output_multiple_image( + accuracy_artifact_root: Path, qwen_bear_image: Image.Image, rabbit_image: Image.Image +) -> Image.Image: + output_dir = model_output_dir(accuracy_artifact_root, MULTIPLE_MODEL) + output_path = output_dir / "diffusers_multiple.png" + return _run_diffusers_image_edit( + model=MULTIPLE_MODEL, + pipeline_class=QwenImageEditPlusPipeline, + prompt=PROMPT_MULTIPLE_IMAGE, + input_images=[qwen_bear_image, rabbit_image], + output_path=output_path, + ) + + +@pytest.mark.advanced_model +@pytest.mark.benchmark +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}, num_cards=1) +def test_qwen_image_edit_single_matches_diffusers( + accuracy_artifact_root: Path, + qwen_bear_image: Image.Image, +) -> None: + vllm_image = _vllm_omni_output_single_image( + accuracy_artifact_root=accuracy_artifact_root, + qwen_bear_image=qwen_bear_image, + ) + diffusers_image = _diffusers_output_single_image( + accuracy_artifact_root=accuracy_artifact_root, + qwen_bear_image=qwen_bear_image, + ) + assert_similarity( + model_name=SINGLE_MODEL, + vllm_image=vllm_image, + diffusers_image=diffusers_image, + width=WIDTH, + height=HEIGHT, + ssim_threshold=SSIM_THRESHOLD, + psnr_threshold=PSNR_THRESHOLD, + ) + + +@pytest.mark.advanced_model +@pytest.mark.benchmark +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}, num_cards=1) +@pytest.mark.skip( + reason="Skipping as the second image seems to be ignored by the API. Will come back to this later after #2772 is merged." 
+) +def test_qwen_image_edit_multiple_matches_diffusers( + accuracy_artifact_root: Path, + qwen_bear_image: Image.Image, + rabbit_image: Image.Image, +) -> None: + vllm_image = _vllm_omni_output_multiple_image( + accuracy_artifact_root=accuracy_artifact_root, + qwen_bear_image=qwen_bear_image, + rabbit_image=rabbit_image, + ) + diffusers_image = _diffusers_output_multiple_image( + accuracy_artifact_root=accuracy_artifact_root, + qwen_bear_image=qwen_bear_image, + rabbit_image=rabbit_image, + ) + assert_similarity( + model_name=MULTIPLE_MODEL, + vllm_image=vllm_image, + diffusers_image=diffusers_image, + width=WIDTH, + height=HEIGHT, + ssim_threshold=SSIM_THRESHOLD, + psnr_threshold=PSNR_THRESHOLD, + ) diff --git a/tests/e2e/accuracy/utils.py b/tests/e2e/accuracy/utils.py new file mode 100644 index 00000000000..eb0eea757ee --- /dev/null +++ b/tests/e2e/accuracy/utils.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pytest +import torch +from PIL import Image +from torchmetrics.image import PeakSignalNoiseRatio, StructuralSimilarityIndexMeasure + + +def model_output_dir(parent_dir: Path, model: str) -> Path: + safe_model_name = model.split("/")[-1].replace(".", "_") + path = parent_dir / safe_model_name + path.mkdir(parents=True, exist_ok=True) + return path + + +def assert_similarity( + *, + model_name: str, + vllm_image: Image.Image, + diffusers_image: Image.Image, + width: int, + height: int, + ssim_threshold: float, + psnr_threshold: float, +) -> None: + requested_size = (width, height) + if diffusers_image.size != requested_size: + pytest.skip( + "Skipping as diffusers baseline output is corrupt and not comparable: " + f"dimensions do not match requested size; requested={requested_size}, got={diffusers_image.size}." 
+ ) + + assert vllm_image.size == diffusers_image.size, ( + f"Online and diffusers output sizes mismatch: online={vllm_image.size}, diffusers={diffusers_image.size}" + ) + + ssim_score, psnr_score = compute_image_ssim_psnr(prediction=vllm_image, reference=diffusers_image) + print(f"{model_name} similarity metrics:") + print(f" SSIM: value={ssim_score:.6f}, threshold>={ssim_threshold:.6f}, range=[-1, 1], higher_is_better=True") + print( + f" PSNR: value={psnr_score:.6f} dB, threshold>={psnr_threshold:.6f} dB, range=[0, +inf), higher_is_better=True" + ) + + assert ssim_score >= ssim_threshold, ( + f"SSIM below threshold for {model_name}: got {ssim_score:.6f}, expected >= {ssim_threshold:.6f}." + ) + assert psnr_score >= psnr_threshold, ( + f"PSNR below threshold for {model_name}: got {psnr_score:.6f}, expected >= {psnr_threshold:.6f}." + ) + + +def compute_image_ssim_psnr( + *, + prediction: Image.Image, + reference: Image.Image, +) -> tuple[float, float]: + pred_tensor = _pil_to_batched_tensor(prediction) + ref_tensor = _pil_to_batched_tensor(reference) + + ssim_metric = StructuralSimilarityIndexMeasure(data_range=1.0) + psnr_metric = PeakSignalNoiseRatio(data_range=1.0) + + ssim_value = float(ssim_metric(pred_tensor, ref_tensor).item()) + psnr_value = float(psnr_metric(pred_tensor, ref_tensor).item()) + return ssim_value, psnr_value + + +def _pil_to_batched_tensor(image: Image.Image) -> torch.Tensor: + array = np.asarray(image.convert("RGB"), dtype=np.float32) / 255.0 + tensor = torch.from_numpy(array).permute(2, 0, 1).unsqueeze(0) + return tensor From 61a3cbdff5785290501d711717e2b2e526ffe34f Mon Sep 17 00:00:00 2001 From: Samit <285365963@qq.com> Date: Wed, 15 Apr 2026 11:46:06 +0800 Subject: [PATCH 45/76] [Perf] Eliminate Hop 3 IPC overhead for single-stage diffusion via inline execution (#2736) Signed-off-by: samithuang <285365963@qq.com> Signed-off-by: Samit <285365963@qq.com> --- .../test_inline_stage_diffusion_client.py | 96 +++++ 
.../test_async_omni_engine_stage_init.py | 3 +- vllm_omni/diffusion/data.py | 43 +++ .../inline_stage_diffusion_client.py | 348 ++++++++++++++++++ vllm_omni/diffusion/stage_diffusion_client.py | 25 ++ vllm_omni/diffusion/stage_diffusion_proc.py | 46 +-- vllm_omni/engine/async_omni_engine.py | 2 + vllm_omni/engine/orchestrator.py | 17 + vllm_omni/engine/stage_init_utils.py | 8 +- .../entrypoints/openai/video_api_utils.py | 3 + vllm_omni/outputs.py | 3 + 11 files changed, 546 insertions(+), 48 deletions(-) create mode 100644 tests/diffusion/test_inline_stage_diffusion_client.py create mode 100644 vllm_omni/diffusion/inline_stage_diffusion_client.py diff --git a/tests/diffusion/test_inline_stage_diffusion_client.py b/tests/diffusion/test_inline_stage_diffusion_client.py new file mode 100644 index 00000000000..385f39b1240 --- /dev/null +++ b/tests/diffusion/test_inline_stage_diffusion_client.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import asyncio +from unittest.mock import MagicMock, patch + +import pytest + +from vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.inline_stage_diffusion_client import InlineStageDiffusionClient +from vllm_omni.engine.stage_init_utils import StageMetadata +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +@pytest.fixture +def mock_engine(): + with patch("vllm_omni.diffusion.inline_stage_diffusion_client.DiffusionEngine") as mock: + engine_instance = MagicMock() + mock.make_engine.return_value = engine_instance + yield engine_instance + + +@pytest.fixture +def client(mock_engine): + metadata = StageMetadata( + stage_id=0, + stage_type="diffusion", + engine_output_type="image", + is_comprehension=False, + requires_multimodal_data=False, + engine_input_source="prompt", + final_output=True, + final_output_type="image", + default_sampling_params={}, + 
custom_process_input_func=None, + model_stage=None, + runtime_cfg=None, + ) + with patch.object(InlineStageDiffusionClient, "_enrich_config"): + od_config = MagicMock(spec=OmniDiffusionConfig) + c = InlineStageDiffusionClient(model="test_model", od_config=od_config, metadata=metadata, batch_size=1) + yield c + c.shutdown() + + +@pytest.mark.asyncio +async def test_inline_dispatch_request_success(client, mock_engine): + # Setup mock engine step to return a successful result + mock_result = OmniRequestOutput.from_diffusion(request_id="req-1", images=[MagicMock()]) + mock_engine.step.return_value = [mock_result] + + sampling_params = OmniDiffusionSamplingParams() + await client.add_request_async("req-1", "A test prompt", sampling_params) + + # Wait for the task to be processed + for _ in range(10): + output = client.get_diffusion_output_nowait() + if output is not None: + break + await asyncio.sleep(0.01) + + assert output is not None + assert output.request_id == "req-1" + mock_engine.step.assert_called_once() + + +@pytest.mark.asyncio +async def test_inline_dispatch_request_error(client, mock_engine): + # Setup mock engine step to raise an exception + mock_engine.step.side_effect = RuntimeError("Engine failure") + + sampling_params = OmniDiffusionSamplingParams() + await client.add_request_async("req-err", "A test prompt", sampling_params) + + for _ in range(10): + output = client.get_diffusion_output_nowait() + if output is not None: + break + await asyncio.sleep(0.01) + + assert output is not None + assert output.request_id == "req-err" + assert output.error == "Engine failure" + assert not output.images + + +def test_inline_shutdown(client, mock_engine): + assert not client._shutting_down + + # Shutting down should cleanly cancel anything queued and close engine + client.shutdown() + + assert client._shutting_down + mock_engine.close.assert_called_once() diff --git a/tests/engine/test_async_omni_engine_stage_init.py 
b/tests/engine/test_async_omni_engine_stage_init.py index 7b995fe70db..84b0cb0bed0 100644 --- a/tests/engine/test_async_omni_engine_stage_init.py +++ b/tests/engine/test_async_omni_engine_stage_init.py @@ -100,10 +100,11 @@ def test_initialize_stages_passes_stage_init_timeout_to_diffusion_handshake(monk engine.log_stats = False engine.model = "dummy-model" engine.config_path = "dummy-config" - engine.num_stages = 1 + engine.num_stages = 2 engine.async_chunk = False engine.diffusion_batch_size = 1 engine.single_stage_mode = False + engine._omni_master_server = None engine.stage_configs = [types.SimpleNamespace(stage_id=0, stage_type="diffusion", engine_args={})] metadata = types.SimpleNamespace( diff --git a/vllm_omni/diffusion/data.py b/vllm_omni/diffusion/data.py index 56a891aa5cf..fca0a5bad05 100644 --- a/vllm_omni/diffusion/data.py +++ b/vllm_omni/diffusion/data.py @@ -666,6 +666,49 @@ def set_tf_model_config(self, tf_config: "TransformerConfig") -> None: def update_multimodal_support(self) -> None: self.supports_multimodal_inputs = self.model_class_name in {"QwenImageEditPlusPipeline"} + def enrich_config(self) -> None: + """Load model metadata from HuggingFace and populate config fields. + + Diffusers-style models expose ``model_index.json`` with ``_class_name``. + Non-diffusers models (e.g. Bagel, NextStep) only have ``config.json``, + so we fall back to reading that and mapping model_type manually. 
+ """ + from vllm.transformers_utils.config import get_hf_file_to_dict + + try: + config_dict = get_hf_file_to_dict("model_index.json", self.model) + if config_dict is not None: + if self.model_class_name is None: + self.model_class_name = config_dict.get("_class_name", None) + self.update_multimodal_support() + + tf_config_dict = get_hf_file_to_dict("transformer/config.json", self.model) + self.tf_model_config = TransformerConfig.from_dict(tf_config_dict) + else: + raise FileNotFoundError("model_index.json not found") + except (AttributeError, OSError, ValueError, FileNotFoundError): + cfg = get_hf_file_to_dict("config.json", self.model) + if cfg is None: + raise ValueError(f"Could not find config.json or model_index.json for model {self.model}") + + self.tf_model_config = TransformerConfig.from_dict(cfg) + model_type = cfg.get("model_type") + architectures = cfg.get("architectures") or [] + + if model_type == "bagel" or "BagelForConditionalGeneration" in architectures: + self.model_class_name = "BagelPipeline" + self.tf_model_config = TransformerConfig() + self.update_multimodal_support() + elif model_type == "nextstep": + if self.model_class_name is None: + self.model_class_name = "NextStep11Pipeline" + self.tf_model_config = TransformerConfig() + self.update_multimodal_support() + elif architectures and len(architectures) == 1: + self.model_class_name = architectures[0] + else: + raise + @classmethod def from_kwargs(cls, **kwargs: Any) -> "OmniDiffusionConfig": # Backwards-compatibility: older callers may use a diffusion-specific diff --git a/vllm_omni/diffusion/inline_stage_diffusion_client.py b/vllm_omni/diffusion/inline_stage_diffusion_client.py new file mode 100644 index 00000000000..a33a3e95619 --- /dev/null +++ b/vllm_omni/diffusion/inline_stage_diffusion_client.py @@ -0,0 +1,348 @@ +"""Inline Stage Diffusion Client for vLLM-Omni multi-stage runtime. 
+ +Runs DiffusionEngine in a ThreadPoolExecutor inside the Orchestrator process +instead of spawning a separate StageDiffusionProc subprocess, eliminating ZMQ +IPC overhead. Used when there is only a single diffusion stage. +""" + +from __future__ import annotations + +import asyncio +import time +from concurrent.futures import ThreadPoolExecutor +from typing import TYPE_CHECKING, Any + +import torch +from PIL import Image +from vllm.logger import init_logger + +from vllm_omni.diffusion.data import DiffusionRequestAbortedError +from vllm_omni.diffusion.diffusion_engine import DiffusionEngine +from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.engine.stage_init_utils import StageMetadata +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput + +if TYPE_CHECKING: + from vllm_omni.diffusion.data import OmniDiffusionConfig + from vllm_omni.inputs.data import OmniPromptType + +logger = init_logger(__name__) + + +class InlineStageDiffusionClient: + """Runs DiffusionEngine in a thread executor inside the Orchestrator.""" + + stage_type: str = "diffusion" + + def __init__( + self, + model: str, + od_config: OmniDiffusionConfig, + metadata: StageMetadata, + batch_size: int = 1, + ) -> None: + self.model = model + self.od_config = od_config + self.stage_id = metadata.stage_id + self.final_output = metadata.final_output + self.final_output_type = metadata.final_output_type + self.default_sampling_params = metadata.default_sampling_params + self.custom_process_input_func = metadata.custom_process_input_func + self.engine_input_source = metadata.engine_input_source + self.batch_size = batch_size + + self._enrich_config() + self._engine = DiffusionEngine.make_engine(self.od_config) + self._executor = ThreadPoolExecutor(max_workers=1, thread_name_prefix="inline-diffusion") + + self._output_queue: asyncio.Queue[OmniRequestOutput] = asyncio.Queue() + self._tasks: dict[str, asyncio.Task] = {} + 
self._shutting_down = False + + logger.info( + "[InlineStageDiffusionClient] Stage-%s initialized inline (batch_size=%d)", + self.stage_id, + self.batch_size, + ) + + def _enrich_config(self) -> None: + """Load model metadata from HuggingFace and populate od_config fields.""" + self.od_config.enrich_config() + + # ------------------------------------------------------------------ + # Request processing + # ------------------------------------------------------------------ + + async def add_request_async( + self, + request_id: str, + prompt: OmniPromptType, + sampling_params: OmniDiffusionSamplingParams, + kv_sender_info: dict[int, dict[str, Any]] | None = None, + ) -> None: + task = asyncio.create_task( + self._dispatch_request( + request_id, + prompt, + sampling_params, + kv_sender_info, + ) + ) + self._tasks[request_id] = task + + async def _dispatch_request( + self, + request_id: str, + prompt: Any, + sampling_params: OmniDiffusionSamplingParams, + kv_sender_info: dict[str, Any] | None = None, + ) -> None: + try: + request = OmniDiffusionRequest( + prompts=[prompt], + sampling_params=sampling_params, + request_ids=[request_id], + request_id=request_id, + kv_sender_info=kv_sender_info, + ) + + loop = asyncio.get_running_loop() + results = await loop.run_in_executor(self._executor, self._engine.step, request) + result = results[0] + if not result.request_id: + result.request_id = request_id + + self._output_queue.put_nowait(result) + except DiffusionRequestAbortedError as e: + logger.info("request_id: %s aborted: %s", request_id, str(e)) + except Exception as e: + logger.exception("Diffusion request %s failed: %s", request_id, e) + error_output = OmniRequestOutput.from_diffusion( + request_id=request_id, + images=[], + ) + error_output.error = str(e) + self._output_queue.put_nowait(error_output) + finally: + self._tasks.pop(request_id, None) + + async def add_batch_request_async( + self, + request_id: str, + prompts: list[OmniPromptType], + sampling_params: 
OmniDiffusionSamplingParams, + kv_sender_info: dict[int, dict[str, Any]] | None = None, + ) -> None: + task = asyncio.create_task( + self._dispatch_batch( + request_id, + prompts, + sampling_params, + kv_sender_info, + ) + ) + self._tasks[request_id] = task + + async def _dispatch_batch( + self, + request_id: str, + prompts: list[Any], + sampling_params: OmniDiffusionSamplingParams, + kv_sender_info: dict[str, Any] | None = None, + ) -> None: + try: + request = OmniDiffusionRequest( + prompts=prompts, + sampling_params=sampling_params, + request_ids=[f"{request_id}-{i}" for i in range(len(prompts))], + request_id=request_id, + kv_sender_info=kv_sender_info, + ) + + loop = asyncio.get_running_loop() + results = await loop.run_in_executor(self._executor, self._engine.step, request) + + all_images: list = [] + merged_mm: dict[str, Any] = {} + merged_metrics: dict[str, Any] = {} + merged_durations: dict[str, float] = {} + merged_custom: dict[str, Any] = {} + peak_mem = 0.0 + latents = None + trajectory_latents: list[torch.Tensor] | None = None + trajectory_timesteps: list[torch.Tensor] | None = None + trajectory_log_probs: torch.Tensor | None = None + trajectory_decoded: list[Image.Image] | None = None + final_output_type = "image" + + for r in results: + all_images.extend(r.images) + merged_mm.update(r._multimodal_output) + merged_metrics.update(r.metrics) + merged_durations.update(r.stage_durations) + merged_custom.update(r._custom_output) + peak_mem = max(peak_mem, r.peak_memory_mb) + if latents is None and r.latents is not None: + latents = r.latents + if trajectory_latents is None: + trajectory_latents = r.trajectory_latents + if trajectory_timesteps is None: + trajectory_timesteps = r.trajectory_timesteps + if trajectory_log_probs is None: + trajectory_log_probs = r.trajectory_log_probs + if trajectory_decoded is None: + trajectory_decoded = r.trajectory_decoded + if r.final_output_type != "image": + final_output_type = r.final_output_type + + result = 
OmniRequestOutput.from_diffusion( + request_id=request_id, + images=all_images, + prompt=prompts[0] if len(prompts) == 1 else None, + metrics=merged_metrics, + latents=latents, + trajectory_latents=trajectory_latents, + trajectory_timesteps=trajectory_timesteps, + trajectory_log_probs=trajectory_log_probs, + trajectory_decoded=trajectory_decoded, + custom_output=merged_custom or None, + multimodal_output=merged_mm or None, + final_output_type=final_output_type, + stage_durations=merged_durations, + peak_memory_mb=peak_mem, + ) + + self._output_queue.put_nowait(result) + except DiffusionRequestAbortedError as e: + logger.info("request_id: %s aborted: %s", request_id, str(e)) + except Exception as e: + logger.exception("Batch diffusion request %s failed: %s", request_id, e) + error_output = OmniRequestOutput.from_diffusion( + request_id=request_id, + images=[], + ) + error_output.error = str(e) + self._output_queue.put_nowait(error_output) + finally: + self._tasks.pop(request_id, None) + + def get_diffusion_output_nowait(self) -> OmniRequestOutput | None: + try: + return self._output_queue.get_nowait() + except asyncio.QueueEmpty: + return None + + async def abort_requests_async(self, request_ids: list[str]) -> None: + for rid in request_ids: + task = self._tasks.pop(rid, None) + if task: + task.cancel() + self._engine.abort(rid) + + async def collective_rpc_async( + self, + method: str, + timeout: float | None = None, + args: tuple[Any, ...] 
= (), + kwargs: dict[str, Any] | None = None, + ) -> Any: + loop = asyncio.get_running_loop() + + if method == "profile": + is_start = args[0] if args else True + profile_prefix = args[1] if len(args) > 1 else None + if is_start and profile_prefix is None: + profile_prefix = f"stage_{self.stage_id}_diffusion_{int(time.time())}" + return await loop.run_in_executor( + self._executor, + self._engine.profile, + is_start, + profile_prefix, + ) + + kwargs = kwargs or {} + + # LoRA methods + if method == "add_lora": + lora_request = args[0] if args else kwargs.get("lora_request") + results = await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + "add_lora", + timeout, + (), + {"lora_request": lora_request}, + None, + ) + return all(results) if isinstance(results, list) else results + + if method == "remove_lora": + results = await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + "remove_lora", + timeout, + args, + kwargs, + None, + ) + return all(results) if isinstance(results, list) else results + + if method == "list_loras": + results = await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + "list_loras", + timeout, + (), + {}, + None, + ) + if not isinstance(results, list): + return results or [] + merged: set[int] = set() + for part in results: + merged.update(part or []) + return sorted(merged) + + if method == "pin_lora": + lora_id = args[0] if args else kwargs.get("adapter_id") + results = await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + "pin_lora", + timeout, + (), + {"adapter_id": lora_id}, + None, + ) + return all(results) if isinstance(results, list) else results + + return await loop.run_in_executor( + self._executor, + self._engine.collective_rpc, + method, + timeout, + args, + kwargs, + None, + ) + + def shutdown(self) -> None: + self._shutting_down = True + + # Cancel all pending tasks + for task in self._tasks.values(): + task.cancel() + + try: + # Cancel 
queued futures and wait for the running one to complete deterministically + self._executor.shutdown(wait=True, cancel_futures=True) + except Exception: + pass + + try: + self._engine.close() + except Exception: + pass diff --git a/vllm_omni/diffusion/stage_diffusion_client.py b/vllm_omni/diffusion/stage_diffusion_client.py index 7e740dc893d..480d113d192 100644 --- a/vllm_omni/diffusion/stage_diffusion_client.py +++ b/vllm_omni/diffusion/stage_diffusion_client.py @@ -34,6 +34,24 @@ logger = init_logger(__name__) +def create_diffusion_client( + model: str, + od_config: OmniDiffusionConfig, + metadata: StageMetadata, + stage_init_timeout: int, + batch_size: int = 1, + use_inline: bool = False, +) -> Any: + """Factory to create either an inline or out-of-process diffusion client.""" + if use_inline: + from vllm_omni.diffusion.inline_stage_diffusion_client import InlineStageDiffusionClient + + return InlineStageDiffusionClient(model, od_config, metadata, batch_size=batch_size) + return StageDiffusionClient( + model, od_config, metadata, stage_init_timeout=stage_init_timeout, batch_size=batch_size + ) + + class StageDiffusionClient: """Communicates with StageDiffusionProc via ZMQ for use inside the Orchestrator. @@ -154,6 +172,13 @@ def _drain_responses(self) -> None: "error": True, "reason": error_msg, } + elif req_id is not None: + error_output = OmniRequestOutput.from_diffusion( + request_id=req_id, + images=[], + ) + error_output.error = error_msg + self._output_queue.put_nowait(error_output) # Fields that are subprocess-local and cannot be serialized across # process boundaries. 
They are recreated in the subprocess with diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py index cef697630f1..eced444fd32 100644 --- a/vllm_omni/diffusion/stage_diffusion_proc.py +++ b/vllm_omni/diffusion/stage_diffusion_proc.py @@ -19,12 +19,11 @@ import zmq.asyncio from PIL import Image from vllm.logger import init_logger -from vllm.transformers_utils.config import get_hf_file_to_dict from vllm.utils.network_utils import get_open_zmq_ipc_path, zmq_socket_ctx from vllm.utils.system_utils import get_mp_context from vllm.v1.utils import shutdown -from vllm_omni.diffusion.data import DiffusionRequestAbortedError, TransformerConfig +from vllm_omni.diffusion.data import DiffusionRequestAbortedError from vllm_omni.diffusion.diffusion_engine import DiffusionEngine from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.distributed.omni_connectors.utils.serialization import ( @@ -66,47 +65,8 @@ def initialize(self) -> None: logger.info("StageDiffusionProc initialized with model: %s", self._model) def _enrich_config(self) -> None: - """Load model metadata from HuggingFace and populate od_config fields. - - Diffusers-style models expose ``model_index.json`` with ``_class_name``. - Non-diffusers models (e.g. Bagel, NextStep) only have ``config.json``, - so we fall back to reading that and mapping model_type manually. 
- """ - od_config = self._od_config - - try: - config_dict = get_hf_file_to_dict("model_index.json", od_config.model) - if config_dict is not None: - if od_config.model_class_name is None: - od_config.model_class_name = config_dict.get("_class_name", None) - od_config.update_multimodal_support() - - tf_config_dict = get_hf_file_to_dict("transformer/config.json", od_config.model) - od_config.tf_model_config = TransformerConfig.from_dict(tf_config_dict) - else: - raise FileNotFoundError("model_index.json not found") - except (AttributeError, OSError, ValueError, FileNotFoundError): - cfg = get_hf_file_to_dict("config.json", od_config.model) - if cfg is None: - raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}") - - od_config.tf_model_config = TransformerConfig.from_dict(cfg) - model_type = cfg.get("model_type") - architectures = cfg.get("architectures") or [] - - if model_type == "bagel" or "BagelForConditionalGeneration" in architectures: - od_config.model_class_name = "BagelPipeline" - od_config.tf_model_config = TransformerConfig() - od_config.update_multimodal_support() - elif model_type == "nextstep": - if od_config.model_class_name is None: - od_config.model_class_name = "NextStep11Pipeline" - od_config.tf_model_config = TransformerConfig() - od_config.update_multimodal_support() - elif architectures and len(architectures) == 1: - od_config.model_class_name = architectures[0] - else: - raise + """Load model metadata from HuggingFace and populate od_config fields.""" + self._od_config.enrich_config() # ------------------------------------------------------------------ # Request processing diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 9609cf6e26b..054d5342d9f 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -759,12 +759,14 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: self._omni_master_server, ) else: 
+ use_inline = True if self.num_stages == 1 else False stage_clients[stage_idx] = initialize_diffusion_stage( self.model, stage_cfg, metadata, stage_init_timeout=stage_init_timeout, batch_size=self.diffusion_batch_size, + use_inline=use_inline, ) logger.info( "[AsyncOmniEngine] Stage %s initialized (diffusion, batch_size=%d)", diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index 386b545eb75..0fdab9c0d2b 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -246,6 +246,23 @@ async def _orchestration_loop(self) -> None: idle = False req_state = self.request_states.get(output.request_id) if req_state is not None: + if getattr(output, "error", None) is not None: + parent_id = self._companion_to_parent.get(output.request_id, output.request_id) + await self.output_async_queue.put( + { + "type": "error", + "request_id": parent_id, + "stage_id": stage_id, + "error": output.error, + } + ) + role_map = self._companion_map.get(parent_id, {}) + for cid in role_map.values(): + self.request_states.pop(cid, None) + self._cleanup_companion_state(parent_id) + self.request_states.pop(parent_id, None) + continue + stage_metrics = self._build_stage_metrics(stage_id, output.request_id, [output], req_state) await self._route_output(stage_id, output, req_state, stage_metrics) continue diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 158b4c54777..bf40aa77cd5 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -530,6 +530,7 @@ def initialize_diffusion_stage( metadata: StageMetadata, stage_init_timeout: int, batch_size: int = 1, + use_inline: bool = False, ) -> Any: """Build a diffusion stage client. @@ -541,13 +542,12 @@ def initialize_diffusion_stage( batch_size: Maximum number of requests to batch together in the diffusion engine. Passed through to ``StageDiffusionClient`` and ultimately to ``AsyncOmni``. 
+ use_inline: If True, uses the inline diffusion client instead of subprocess. """ - from vllm_omni.diffusion.stage_diffusion_client import StageDiffusionClient + from vllm_omni.diffusion.stage_diffusion_client import create_diffusion_client od_config = build_diffusion_config(model, stage_cfg, metadata) - return StageDiffusionClient( - model, od_config, metadata, stage_init_timeout=stage_init_timeout, batch_size=batch_size - ) + return create_diffusion_client(model, od_config, metadata, stage_init_timeout, batch_size, use_inline) def _shutdown_or_close_resource(resource: Any, resource_name: str, stage_id: int) -> None: diff --git a/vllm_omni/entrypoints/openai/video_api_utils.py b/vllm_omni/entrypoints/openai/video_api_utils.py index 19354697928..3fb991225c0 100644 --- a/vllm_omni/entrypoints/openai/video_api_utils.py +++ b/vllm_omni/entrypoints/openai/video_api_utils.py @@ -227,6 +227,9 @@ def _encode_video_bytes( frames_np *= 255.0 frames_u8 = np.round(frames_np).astype(np.uint8) + # Ensure contiguous memory layout for faster PyAV muxing + frames_u8 = np.ascontiguousarray(frames_u8) + audio_np = _coerce_audio_to_numpy(audio) if audio is not None else None return mux_video_audio_bytes( diff --git a/vllm_omni/outputs.py b/vllm_omni/outputs.py index 2c2c1d21c11..c02c0c1427c 100644 --- a/vllm_omni/outputs.py +++ b/vllm_omni/outputs.py @@ -100,6 +100,9 @@ class OmniRequestOutput: # memory usage info peak_memory_mb: float = 0.0 + # error handling + error: str | None = None + @classmethod def from_pipeline( cls, From 6c6551dff8856e8e936cf29b5886174e4b149e4a Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Wed, 15 Apr 2026 11:52:20 +0800 Subject: [PATCH 46/76] [Feature] feat: add video frame interpolation postprocess (#2555) Signed-off-by: David Chen <530634352@qq.com> --- docs/.nav.yml | 1 + .../diffusion/frame_interpolation.md | 92 ++++ docs/user_guide/diffusion_features.md | 9 +- 
.../examples/online_serving/image_to_video.md | 32 ++ .../examples/online_serving/text_to_video.md | 32 ++ .../openai_api/test_video_api_utils.py | 92 ++++ .../openai_api/test_video_server.py | 134 +++++- vllm_omni/diffusion/diffusion_engine.py | 23 +- .../models/wan2_2/pipeline_wan2_2.py | 16 +- .../models/wan2_2/pipeline_wan2_2_i2v.py | 16 +- .../models/wan2_2/pipeline_wan2_2_ti2v.py | 16 +- vllm_omni/diffusion/postprocess/__init__.py | 10 + .../postprocess/rife_interpolator.py | 440 ++++++++++++++++++ vllm_omni/entrypoints/openai/api_server.py | 8 + .../entrypoints/openai/protocol/videos.py | 23 + vllm_omni/entrypoints/openai/serving_video.py | 22 +- vllm_omni/inputs/data.py | 4 + 17 files changed, 961 insertions(+), 9 deletions(-) create mode 100644 docs/user_guide/diffusion/frame_interpolation.md create mode 100644 tests/entrypoints/openai_api/test_video_api_utils.py create mode 100644 vllm_omni/diffusion/postprocess/__init__.py create mode 100644 vllm_omni/diffusion/postprocess/rife_interpolator.py diff --git a/docs/.nav.yml b/docs/.nav.yml index 86ce4a3b0c4..441ef9f521e 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -64,6 +64,7 @@ nav: - FP8: user_guide/diffusion/quantization/fp8.md - Int8: user_guide/diffusion/quantization/int8.md - GGUF: user_guide/diffusion/quantization/gguf.md + - Frame Interpolation: user_guide/diffusion/frame_interpolation.md - Parallelism: - Overview: user_guide/diffusion/parallelism/overview.md - CFG Parallel: user_guide/diffusion/parallelism/cfg_parallel.md diff --git a/docs/user_guide/diffusion/frame_interpolation.md b/docs/user_guide/diffusion/frame_interpolation.md new file mode 100644 index 00000000000..349af50c51c --- /dev/null +++ b/docs/user_guide/diffusion/frame_interpolation.md @@ -0,0 +1,92 @@ +# Frame Interpolation + +## Overview + +vLLM-Omni supports post-generation frame interpolation for supported video +diffusion pipelines. 
This feature inserts synthesized intermediate frames +between adjacent generated frames to improve temporal smoothness without +rerunning the diffusion denoising loop. + +Frame interpolation runs in the diffusion worker post-processing path instead +of the API server encoding path. This allows the interpolation step to reuse +the worker's current accelerator device and keeps the FastAPI event loop free +from heavy synchronous PyTorch work. + +For an input video with `N` generated frames and interpolation exponent `exp`, +the output frame count is: + +```text +(N - 1) * 2**exp + 1 +``` + +The output FPS is multiplied by `2**exp` so the clip duration remains close to +the original generated video. + +## Supported Pipelines + +Frame interpolation is currently supported for: + +- `WanPipeline` (Wan2.2 text-to-video) +- `WanImageToVideoPipeline` +- `Wan22TI2VPipeline` + +## Request Parameters + +The video APIs `/v1/videos` and `/v1/videos/sync` accept: + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `enable_frame_interpolation` | bool | `false` | Enable post-generation frame interpolation | +| `frame_interpolation_exp` | int | `1` | Interpolation exponent. `1=2x`, `2=4x`, etc. | +| `frame_interpolation_scale` | float | `1.0` | RIFE inference scale | +| `frame_interpolation_model_path` | str | `None` | Local directory or Hugging Face repo ID containing `flownet.pkl`; when unset, `elfgum/RIFE-4.22.lite` is used | + +## Execution Flow + +For supported Wan2.2 pipelines, the execution order is: + +1. Diffusion worker finishes denoising and decodes the raw video tensor. +2. Worker-side model-specific post-processing runs. +3. If frame interpolation is enabled, RIFE interpolates the decoded video + tensor on the worker side and records an FPS multiplier in `custom_output`. +4. The API server receives the already-interpolated video and only performs + MP4 export.
+ +This design keeps interpolation close to the generated tensor and avoids +introducing another heavyweight GPU context in the API server process. + +## Example + +Start the server: + +```bash +vllm serve Wan-AI/Wan2.2-T2V-A14B-Diffusers --omni --port 8091 +``` + +Run a sync request with interpolation enabled: + +```bash +curl -X POST http://localhost:8091/v1/videos/sync \ + -F "prompt=A dog running through a park" \ + -F "num_frames=81" \ + -F "width=832" \ + -F "height=480" \ + -F "fps=16" \ + -F "num_inference_steps=40" \ + -F "guidance_scale=1.0" \ + -F "guidance_scale_2=1.0" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ + -F "seed=42" \ + -o sync_t2v_interpolated.mp4 +``` + +## Notes + +- This is a post-processing feature. It does not modify the diffusion denoising + schedule. +- Higher interpolation exponents increase post-processing time and memory usage. +- If the interpolation model weights are not available locally, + `frame_interpolation_model_path` may point to a Hugging Face repo containing + `flownet.pkl`. 
diff --git a/docs/user_guide/diffusion_features.md b/docs/user_guide/diffusion_features.md index 31cd1500fa0..45953b85299 100644 --- a/docs/user_guide/diffusion_features.md +++ b/docs/user_guide/diffusion_features.md @@ -14,7 +14,7 @@ vLLM-Omni supports various advanced features for diffusion models: - Acceleration: **cache methods**, **parallelism methods**, **startup optimizations** - Memory optimization: **cpu offloading**, **quantization** -- Extensions: **LoRA inference** +- Extensions: **LoRA inference**, **frame interpolation** - Execution modes: **step execution** ## Supported Features @@ -69,6 +69,7 @@ Extension methods add specialized capabilities to diffusion models beyond standa | Method | Description | Best For | |--------|-------------|----------| | **[LoRA Inference](diffusion/lora.md)** | Enables inference with Low-Rank Adaptation (LoRA) adapters weights | Reinforcement learning extensions | +| **[Frame Interpolation](diffusion/frame_interpolation.md)** | Inserts intermediate video frames after generation for smoother motion | Video generation pipelines that need higher temporal smoothness | ### Execution Modes @@ -143,6 +144,11 @@ The following tables show which models support each feature: | **HunyuanVideo-1.5 T2V I2V** | ❌ | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ (decode) | ✅ | ❌ | | **DreamID-Omni** | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +**Frame Interpolation Support** + +- **Supported**: Wan2.2 text-to-video, image-to-video, and TI2V pipelines +- **Not supported**: Wan2.1-VACE, LTX-2, Helios, HunyuanVideo-1.5, DreamID-Omni + ### AudioGen | Model | ⚡TeaCache | ⚡Cache-DiT | 🔀SP (Ulysses & Ring) | 🔀CFG-Parallel | 🔀Tensor-Parallel | 🔀HSDP | 💾CPU Offload (Layerwise) | 💾VAE-Patch-Parallel | 💾Quantization | 🔄Step Execution | @@ -258,6 +264,7 @@ Measured on NVIDIA H800: **Extensions:** - **[LoRA Inference Guide](diffusion/lora.md)** - Low-Rank Adaptation for style customization and fine-tuning +- **[Frame Interpolation Guide](diffusion/frame_interpolation.md)** 
- Worker-side post-generation video frame interpolation for smoother motion **Execution Modes:** diff --git a/docs/user_guide/examples/online_serving/image_to_video.md b/docs/user_guide/examples/online_serving/image_to_video.md index 00b67d74e26..781f0c2a5ed 100644 --- a/docs/user_guide/examples/online_serving/image_to_video.md +++ b/docs/user_guide/examples/online_serving/image_to_video.md @@ -72,6 +72,9 @@ curl -X POST http://localhost:8091/v1/videos/sync \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ -F "seed=42" \ -o sync_i2v_output.mp4 ``` @@ -114,6 +117,9 @@ create_response=$(curl -s http://localhost:8091/v1/videos \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ -F "seed=42") video_id=$(echo "$create_response" | jq -r '.id') @@ -172,9 +178,35 @@ curl -X POST http://localhost:8091/v1/videos \ -F "guidance_scale_2=1.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=12.0" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ -F "seed=42" ``` +Frame interpolation is also available for supported Wan2.2 I2V requests. See +[Frame Interpolation](../../diffusion/frame_interpolation.md) for worker-side +execution details and feature constraints. 
+ +### Frame Interpolation Example + +```bash +curl -X POST http://localhost:8091/v1/videos/sync \ + -F "prompt=A bear playing with yarn, smooth motion" \ + -F "input_reference=@/path/to/qwen-bear.png" \ + -F "width=832" \ + -F "height=480" \ + -F "num_frames=33" \ + -F "fps=16" \ + -F "num_inference_steps=40" \ + -F "guidance_scale=1.0" \ + -F "guidance_scale_2=1.0" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ + -o sync_i2v_interpolated.mp4 +``` + ## Create Response Format `POST /v1/videos` returns a job record, not inline base64 video data. diff --git a/docs/user_guide/examples/online_serving/text_to_video.md b/docs/user_guide/examples/online_serving/text_to_video.md index 01e6d9d464b..00a9c167239 100644 --- a/docs/user_guide/examples/online_serving/text_to_video.md +++ b/docs/user_guide/examples/online_serving/text_to_video.md @@ -165,6 +165,9 @@ curl -X POST http://localhost:8091/v1/videos \ -F "guidance_scale_2=4.0" \ -F "boundary_ratio=0.875" \ -F "flow_shift=5.0" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ -F "seed=42" ``` @@ -187,6 +190,35 @@ curl -X POST http://localhost:8091/v1/videos \ | `flow_shift` | float | None | Scheduler flow shift (Wan2.2) | | `seed` | int | None | Random seed (reproducible) | | `lora` | object | None | LoRA configuration | +| `enable_frame_interpolation` | bool | false | Enable RIFE frame interpolation before MP4 encoding | +| `frame_interpolation_exp` | int | 1 | Interpolation exponent; 1=2x temporal resolution, 2=4x | +| `frame_interpolation_scale` | float | 1.0 | RIFE inference scale; use 0.5 for high-resolution inputs | +| `frame_interpolation_model_path` | str | None | Local directory or Hugging Face repo ID with `flownet.pkl`; defaults to `elfgum/RIFE-4.22.lite` | + +## Frame Interpolation + +Frame interpolation is an optional post-processing step for `/v1/videos` and 
+`/v1/videos/sync`. It synthesizes intermediate frames between generated frames +without rerunning the diffusion model. If the generated video has `N` frames, +the interpolated output frame count is `(N - 1) * 2**exp + 1`. The encoder FPS +is multiplied by `2**exp` so the output duration remains close to the original. + +Frame interpolation runs in the diffusion worker post-processing path instead of +the API server encoding path, so it can reuse the worker's current accelerator +device without blocking the FastAPI event loop. + +Example: generate 5 frames and interpolate to 9 frames: + +```bash +curl -X POST http://localhost:8091/v1/videos/sync \ + -F "prompt=A dog running through a park" \ + -F "num_frames=5" \ + -F "fps=8" \ + -F "enable_frame_interpolation=true" \ + -F "frame_interpolation_exp=1" \ + -F "frame_interpolation_scale=1.0" \ + -o sync_t2v_interpolated.mp4 +``` ## Create Response Format diff --git a/tests/entrypoints/openai_api/test_video_api_utils.py b/tests/entrypoints/openai_api/test_video_api_utils.py new file mode 100644 index 00000000000..5012c9b9826 --- /dev/null +++ b/tests/entrypoints/openai_api/test_video_api_utils.py @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for OpenAI-compatible video API encoding helpers.""" + +import numpy as np +import pytest +import torch + +from vllm_omni.diffusion.postprocess import rife_interpolator +from vllm_omni.entrypoints.openai import video_api_utils + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def _install_fake_video_mux(monkeypatch, mux_calls): + def _fake_mux_video_audio_bytes(frames, audio, fps, audio_sample_rate, video_codec_options=None): + mux_calls.append( + { + "frames": frames, + "audio": audio, + "fps": fps, + "audio_sample_rate": audio_sample_rate, + "video_codec_options": video_codec_options, + } + ) + return b"fake-video" + + monkeypatch.setattr( + 
"vllm_omni.diffusion.utils.media_utils.mux_video_audio_bytes", + _fake_mux_video_audio_bytes, + ) + + +def test_encode_video_bytes_exports_frames_without_interpolation(monkeypatch): + mux_calls = [] + _install_fake_video_mux(monkeypatch, mux_calls) + + frames = [np.full((2, 2, 3), fill_value=i / 5, dtype=np.float32) for i in range(5)] + video_bytes = video_api_utils._encode_video_bytes( + frames, + fps=8, + ) + + assert video_bytes == b"fake-video" + assert mux_calls[0]["frames"].shape == (5, 2, 2, 3) + assert mux_calls[0]["frames"].dtype == np.uint8 + assert mux_calls[0]["fps"] == 8.0 + assert mux_calls[0]["audio"] is None + + +def test_rife_model_inference_runs_on_dummy_tensors(): + model = rife_interpolator.Model().eval() + img0 = torch.rand(1, 3, 32, 32) + img1 = torch.rand(1, 3, 32, 32) + + output = model.inference(img0, img1, scale=1.0) + + assert output.shape == (1, 3, 32, 32) + assert torch.isfinite(output).all() + + +def test_frame_interpolator_runs_actual_torch_tensor_path(monkeypatch): + model = rife_interpolator.Model().eval() + interpolator = rife_interpolator.FrameInterpolator() + monkeypatch.setattr(interpolator, "_ensure_model_loaded", lambda preferred_device=None: model) + + video = torch.zeros(1, 3, 2, 32, 32) + output_video, multiplier = interpolator.interpolate_tensor(video, exp=1, scale=1.0) + + assert multiplier == 2 + assert output_video.shape == (1, 3, 3, 32, 32) + assert torch.isfinite(output_video).all() + + +def test_frame_interpolator_prefers_input_tensor_device(monkeypatch): + chosen_devices = [] + model = rife_interpolator.Model().eval() + + def _fake_ensure_model_loaded(*, preferred_device=None): + chosen_devices.append(preferred_device) + return model + + interpolator = rife_interpolator.FrameInterpolator() + monkeypatch.setattr(interpolator, "_ensure_model_loaded", _fake_ensure_model_loaded) + monkeypatch.setattr(model.flownet, "to", lambda device: model.flownet) + + video = torch.zeros(1, 3, 2, 32, 32) + output_video, multiplier = 
interpolator.interpolate_tensor(video, exp=1, scale=1.0) + + assert chosen_devices == [video.device] + assert multiplier == 2 + assert output_video.shape == (1, 3, 3, 32, 32) diff --git a/tests/entrypoints/openai_api/test_video_server.py b/tests/entrypoints/openai_api/test_video_server.py index 82c34f87e8f..7a395bab5b0 100644 --- a/tests/entrypoints/openai_api/test_video_server.py +++ b/tests/entrypoints/openai_api/test_video_server.py @@ -34,15 +34,28 @@ class MockVideoResult: - def __init__(self, videos, audios=None, sample_rate=None, stage_durations=None, peak_memory_mb=0.0): + def __init__( + self, + videos, + audios=None, + sample_rate=None, + custom_output=None, + stage_durations=None, + peak_memory_mb=0.0, + ): self.multimodal_output = {"video": videos} if audios is not None: self.multimodal_output["audio"] = audios if sample_rate is not None: self.multimodal_output["audio_sample_rate"] = sample_rate + self._custom_output = custom_output or {} self.stage_durations = stage_durations or {} self.peak_memory_mb = peak_memory_mb + @property + def custom_output(self): + return self._custom_output + class FakeAsyncOmni: def __init__(self): @@ -400,6 +413,67 @@ def test_sampling_params_pass_through(test_client, mocker: MockerFixture): assert captured.extra_args["flow_shift"] == 0.25 +def test_frame_interpolation_params_pass_to_diffusion_sampling_params(test_client, mocker: MockerFixture): + """Frame interpolation parameters should be forwarded to diffusion worker sampling params.""" + mocker.patch( + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + return_value=b"fake-video", + ) + response = test_client.post( + "/v1/videos", + data={ + "prompt": "smooth motion", + "fps": "8", + "enable_frame_interpolation": "true", + "frame_interpolation_exp": "2", + "frame_interpolation_scale": "0.5", + "frame_interpolation_model_path": "local-rife", + }, + ) + + assert response.status_code == 200 + video_id = response.json()["id"] + 
_wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value) + + engine = test_client.app.state.openai_serving_video._engine_client + captured = engine.captured_sampling_params_list[0] + assert captured.enable_frame_interpolation is True + assert captured.frame_interpolation_exp == 2 + assert captured.frame_interpolation_scale == 0.5 + assert captured.frame_interpolation_model_path == "local-rife" + + +def test_worker_fps_multiplier_is_applied_to_async_encoding(test_client, mocker: MockerFixture): + fps_values = [] + engine = test_client.app.state.openai_serving_video._engine_client + + async def _generate(prompt, request_id, sampling_params_list): + engine.captured_prompt = prompt + engine.captured_sampling_params_list = sampling_params_list + import numpy as np + + yield MockVideoResult([np.zeros((1, 64, 64, 3), dtype=np.uint8)], custom_output={"video_fps_multiplier": 2}) + + engine.generate = _generate + + def _fake_encode(video, fps, **kwargs): + del video, kwargs + fps_values.append(fps) + return b"fake-video" + + mocker.patch( + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + side_effect=_fake_encode, + ) + + response = test_client.post("/v1/videos", data={"prompt": "fps multiplier", "fps": "8"}) + + assert response.status_code == 200 + video_id = response.json()["id"] + _wait_for_status(test_client, video_id, VideoGenerationStatus.COMPLETED.value) + assert fps_values == [16] + + def test_audio_sample_rate_comes_from_model_config(test_client, mocker: MockerFixture): audio_sample_rates = [] @@ -595,6 +669,10 @@ def test_video_request_validation(): with pytest.raises(ValueError): VideoGenerationRequest(prompt="test", image_reference={"file_id": "file-1", "image_url": "https://example.com"}) + with pytest.raises(ValueError): + VideoGenerationRequest(prompt="test", frame_interpolation_exp=0) + with pytest.raises(ValueError): + VideoGenerationRequest(prompt="test", frame_interpolation_scale=0) def 
test_list_videos_supports_order_after_and_limit(test_client, mocker: MockerFixture): @@ -1032,3 +1110,57 @@ def test_sync_sampling_params_pass_through(test_client, mocker: MockerFixture): assert captured.num_inference_steps == 30 assert captured.guidance_scale == 6.5 assert captured.seed == 42 + + +def test_sync_frame_interpolation_params_pass_to_sampling_params(test_client, mocker: MockerFixture): + """Frame interpolation parameters should be forwarded on the sync path.""" + encode_mock = _mock_encode_video_bytes(mocker) + response = test_client.post( + "/v1/videos/sync", + data={ + "prompt": "smooth sync", + "fps": "8", + "enable_frame_interpolation": "true", + "frame_interpolation_exp": "2", + "frame_interpolation_scale": "0.5", + "frame_interpolation_model_path": "local-rife", + }, + ) + + assert response.status_code == 200 + engine = test_client.app.state.openai_serving_video._engine_client + captured = engine.captured_sampling_params_list[0] + assert captured.enable_frame_interpolation is True + assert captured.frame_interpolation_exp == 2 + assert captured.frame_interpolation_scale == 0.5 + assert captured.frame_interpolation_model_path == "local-rife" + _, kwargs = encode_mock.call_args + assert kwargs["fps"] == 8 + + +def test_worker_fps_multiplier_is_applied_to_sync_encoding(test_client, mocker: MockerFixture): + engine = test_client.app.state.openai_serving_video._engine_client + fps_values = [] + + async def _generate(prompt, request_id, sampling_params_list): + engine.captured_prompt = prompt + engine.captured_sampling_params_list = sampling_params_list + yield MockVideoResult([object()], custom_output={"video_fps_multiplier": 2}) + + engine.generate = _generate + + def _fake_encode(video, fps, **kwargs): + del video, kwargs + fps_values.append(fps) + return b"fps-multiplied" + + mocker.patch( + "vllm_omni.entrypoints.openai.serving_video._encode_video_bytes", + side_effect=_fake_encode, + ) + + response = test_client.post("/v1/videos/sync", 
data={"prompt": "fps multiplier", "fps": "8"}) + + assert response.status_code == 200 + assert response.content == b"fps-multiplied" + assert fps_values == [16] diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index 52a8f385479..fe940d623e5 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -3,6 +3,7 @@ from __future__ import annotations +import inspect import queue import threading import time @@ -78,6 +79,12 @@ def __init__( self.post_process_func = get_diffusion_post_process_func(od_config) self.pre_process_func = get_diffusion_pre_process_func(od_config) + # Cache whether the model-specific postprocess accepts request-level + # sampling params so step() can support both legacy and extended hooks. + self._post_process_accepts_sampling_params = bool( + self.post_process_func is not None + and "sampling_params" in inspect.signature(self.post_process_func).parameters + ) executor_class = DiffusionExecutor.get_class(od_config) self.executor = executor_class(od_config) @@ -143,12 +150,22 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: output_data = output_data.cpu() postprocess_start_time = time.perf_counter() - outputs = self.post_process_func(output_data) if self.post_process_func is not None else output_data + if self.post_process_func is not None: + # Some video pipelines need request-level controls during + # postprocess (for example worker-side frame interpolation). 
+ if self._post_process_accepts_sampling_params: + outputs = self.post_process_func(output_data, sampling_params=request.sampling_params) + else: + outputs = self.post_process_func(output_data) + else: + outputs = output_data audio_payload = None + custom_output = output.custom_output or {} model_audio_sample_rate = None model_fps = None if isinstance(outputs, dict): audio_payload = outputs.get("audio") + custom_output.update(outputs.get("custom_output") or {}) model_audio_sample_rate = outputs.get("audio_sample_rate") model_fps = outputs.get("fps") outputs = outputs.get("video", outputs) @@ -225,7 +242,7 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: trajectory_timesteps=output.trajectory_timesteps, trajectory_log_probs=output.trajectory_log_probs, trajectory_decoded=output.trajectory_decoded, - custom_output=output.custom_output or {}, + custom_output=custom_output, multimodal_output=mm_output, stage_durations=output.stage_durations, peak_memory_mb=output.peak_memory_mb, @@ -295,7 +312,7 @@ def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: trajectory_timesteps=output.trajectory_timesteps, trajectory_log_probs=output.trajectory_log_probs, trajectory_decoded=output.trajectory_decoded, - custom_output=output.custom_output or {}, + custom_output=custom_output, multimodal_output=mm_output, stage_durations=output.stage_durations, peak_memory_mb=output.peak_memory_mb, diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py index 84d89619e86..a1b10439c85 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py @@ -26,6 +26,7 @@ from vllm_omni.diffusion.models.schedulers import FlowUniPCMultistepScheduler from vllm_omni.diffusion.models.wan2_2.scheduling_wan_euler import WanEulerScheduler from vllm_omni.diffusion.models.wan2_2.wan2_2_transformer import WanTransformer3DModel +from 
vllm_omni.diffusion.postprocess import interpolate_video_tensor from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.inputs.data import OmniTextPrompt @@ -162,10 +163,23 @@ def get_wan22_post_process_func( def post_process_func( video: torch.Tensor, output_type: str = "np", + sampling_params=None, ): if output_type == "latent": return video - return video_processor.postprocess_video(video, output_type=output_type) + custom_output = {} + if sampling_params is not None and getattr(sampling_params, "enable_frame_interpolation", False): + video, multiplier = interpolate_video_tensor( + video, + exp=sampling_params.frame_interpolation_exp, + scale=sampling_params.frame_interpolation_scale, + model_path=sampling_params.frame_interpolation_model_path, + ) + custom_output["video_fps_multiplier"] = multiplier + return { + "video": video_processor.postprocess_video(video, output_type=output_type), + "custom_output": custom_output, + } return post_process_func diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py index 46484cd789d..ddc6e0bc2b9 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py @@ -32,6 +32,7 @@ resolve_wan_sample_solver, retrieve_latents, ) +from vllm_omni.diffusion.postprocess import interpolate_video_tensor from vllm_omni.diffusion.profiler.diffusion_pipeline_profiler import DiffusionPipelineProfilerMixin from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.inputs.data import OmniTextPrompt @@ -74,10 +75,23 @@ def get_wan22_i2v_post_process_func( def post_process_func( video: torch.Tensor, output_type: str = "np", + sampling_params=None, ): if output_type == "latent": return video - return video_processor.postprocess_video(video, output_type=output_type) + 
custom_output = {} + if sampling_params is not None and getattr(sampling_params, "enable_frame_interpolation", False): + video, multiplier = interpolate_video_tensor( + video, + exp=sampling_params.frame_interpolation_exp, + scale=sampling_params.frame_interpolation_scale, + model_path=sampling_params.frame_interpolation_model_path, + ) + custom_output["video_fps_multiplier"] = multiplier + return { + "video": video_processor.postprocess_video(video, output_type=output_type), + "custom_output": custom_output, + } return post_process_func diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py index 939fe294a33..62df13cbdea 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py @@ -44,6 +44,7 @@ resolve_wan_sample_solver, retrieve_latents, ) +from vllm_omni.diffusion.postprocess import interpolate_video_tensor from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.inputs.data import OmniTextPrompt from vllm_omni.platforms import current_omni_platform @@ -61,10 +62,23 @@ def get_wan22_ti2v_post_process_func( def post_process_func( video: torch.Tensor, output_type: str = "np", + sampling_params=None, ): if output_type == "latent": return video - return video_processor.postprocess_video(video, output_type=output_type) + custom_output = {} + if sampling_params is not None and getattr(sampling_params, "enable_frame_interpolation", False): + video, multiplier = interpolate_video_tensor( + video, + exp=sampling_params.frame_interpolation_exp, + scale=sampling_params.frame_interpolation_scale, + model_path=sampling_params.frame_interpolation_model_path, + ) + custom_output["video_fps_multiplier"] = multiplier + return { + "video": video_processor.postprocess_video(video, output_type=output_type), + "custom_output": custom_output, + } return post_process_func diff --git 
a/vllm_omni/diffusion/postprocess/__init__.py b/vllm_omni/diffusion/postprocess/__init__.py new file mode 100644 index 00000000000..e6fe5b2d220 --- /dev/null +++ b/vllm_omni/diffusion/postprocess/__init__.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Diffusion post-processing helpers.""" + +from vllm_omni.diffusion.postprocess.rife_interpolator import ( + FrameInterpolator, + interpolate_video_tensor, +) + +__all__ = ["FrameInterpolator", "interpolate_video_tensor"] diff --git a/vllm_omni/diffusion/postprocess/rife_interpolator.py b/vllm_omni/diffusion/postprocess/rife_interpolator.py new file mode 100644 index 00000000000..b2b4a931914 --- /dev/null +++ b/vllm_omni/diffusion/postprocess/rife_interpolator.py @@ -0,0 +1,440 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +RIFE 4.22.lite frame interpolation for vLLM-Omni video generation. + +RIFE model code is vendored and adapted from: + - https://github.com/hzwer/ECCV2022-RIFE (MIT License) + - https://github.com/hzwer/Practical-RIFE (MIT License) + Copyright (c) 2021 Zhewei Huang + +The FrameInterpolator wrapper and vLLM-Omni integration code are original work. 
+""" + +from __future__ import annotations + +import os +import threading +from typing import Any + +import torch +import torch.nn as nn +import torch.nn.functional as F +from vllm.logger import init_logger + +logger = init_logger(__name__) + +_DEFAULT_RIFE_HF_REPO = "elfgum/RIFE-4.22.lite" +_MODEL_CACHE: dict[tuple[str, str], Model] = {} +_MODEL_CACHE_LOCK = threading.Lock() + + +def warp(ten_input: torch.Tensor, ten_flow: torch.Tensor) -> torch.Tensor: + """Warp input tensor by optical flow using grid_sample.""" + ten_horizontal = ( + torch.linspace(-1.0, 1.0, ten_flow.shape[3], device=ten_flow.device) + .view(1, 1, 1, ten_flow.shape[3]) + .expand(ten_flow.shape[0], -1, ten_flow.shape[2], -1) + ) + ten_vertical = ( + torch.linspace(-1.0, 1.0, ten_flow.shape[2], device=ten_flow.device) + .view(1, 1, ten_flow.shape[2], 1) + .expand(ten_flow.shape[0], -1, -1, ten_flow.shape[3]) + ) + ten_grid = torch.cat([ten_horizontal, ten_vertical], dim=1) + + ten_flow = torch.cat( + [ + ten_flow[:, 0:1, :, :] / ((ten_input.shape[3] - 1.0) / 2.0), + ten_flow[:, 1:2, :, :] / ((ten_input.shape[2] - 1.0) / 2.0), + ], + dim=1, + ) + grid = (ten_grid + ten_flow).permute(0, 2, 3, 1) + return F.grid_sample( + input=ten_input, + grid=grid, + mode="bilinear", + padding_mode="border", + align_corners=True, + ) + + +def _conv( + in_planes: int, + out_planes: int, + kernel_size: int = 3, + stride: int = 1, + padding: int = 1, + dilation: int = 1, +) -> nn.Sequential: + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=True, + ), + nn.LeakyReLU(0.2, True), + ) + + +class ResConv(nn.Module): + """Residual convolution block with learnable beta scaling.""" + + def __init__(self, c: int, dilation: int = 1): + super().__init__() + self.conv = nn.Conv2d(c, c, 3, 1, dilation, dilation=dilation, groups=1) + self.beta = nn.Parameter(torch.ones((1, c, 1, 1)), requires_grad=True) + self.relu = 
nn.LeakyReLU(0.2, True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.relu(self.conv(x) * self.beta + x) + + +class IFBlock(nn.Module): + """Single-scale optical flow, mask, and feature block.""" + + def __init__(self, in_planes: int, c: int = 64): + super().__init__() + self.conv0 = nn.Sequential( + _conv(in_planes, c // 2, 3, 2, 1), + _conv(c // 2, c, 3, 2, 1), + ) + self.convblock = nn.Sequential( + ResConv(c), + ResConv(c), + ResConv(c), + ResConv(c), + ResConv(c), + ResConv(c), + ResConv(c), + ResConv(c), + ) + self.lastconv = nn.Sequential( + nn.ConvTranspose2d(c, 4 * 13, 4, 2, 1), + nn.PixelShuffle(2), + ) + + def forward( + self, + x: torch.Tensor, + flow: torch.Tensor | None = None, + scale: float = 1.0, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + x = F.interpolate(x, scale_factor=1.0 / scale, mode="bilinear", align_corners=False) + if flow is not None: + flow = ( + F.interpolate( + flow, + scale_factor=1.0 / scale, + mode="bilinear", + align_corners=False, + ) + * 1.0 + / scale + ) + x = torch.cat((x, flow), 1) + feat = self.conv0(x) + feat = self.convblock(feat) + tmp = self.lastconv(feat) + tmp = F.interpolate(tmp, scale_factor=scale, mode="bilinear", align_corners=False) + flow = tmp[:, :4] * scale + mask = tmp[:, 4:5] + feat = tmp[:, 5:] + return flow, mask, feat + + +class Head(nn.Module): + """Feature encoder producing four-channel features at full resolution.""" + + def __init__(self): + super().__init__() + self.cnn0 = nn.Conv2d(3, 16, 3, 2, 1) + self.cnn1 = nn.Conv2d(16, 16, 3, 1, 1) + self.cnn2 = nn.Conv2d(16, 16, 3, 1, 1) + self.cnn3 = nn.ConvTranspose2d(16, 4, 4, 2, 1) + self.relu = nn.LeakyReLU(0.2, True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x0 = self.cnn0(x) + x = self.relu(x0) + x1 = self.cnn1(x) + x = self.relu(x1) + x2 = self.cnn2(x) + x = self.relu(x2) + x3 = self.cnn3(x) + return x3 + + +class IFNet(nn.Module): + """Four-scale IFNet optical flow network.""" + + def 
__init__(self): + super().__init__() + self.block0 = IFBlock(7 + 8, c=192) + self.block1 = IFBlock(8 + 4 + 8 + 8, c=128) + self.block2 = IFBlock(8 + 4 + 8 + 8, c=64) + self.block3 = IFBlock(8 + 4 + 8 + 8, c=32) + self.encode = Head() + + def forward( + self, + x: torch.Tensor, + timestep: float = 0.5, + scale_list: list[float] | None = None, + ) -> tuple[list[torch.Tensor], torch.Tensor, list[tuple[torch.Tensor, torch.Tensor] | torch.Tensor]]: + if scale_list is None: + scale_list = [8, 4, 2, 1] + + channel = x.shape[1] // 2 + img0 = x[:, :channel] + img1 = x[:, channel:] + + if not torch.is_tensor(timestep): + timestep = (x[:, :1].clone() * 0 + 1) * timestep + else: + timestep = timestep.repeat(1, 1, img0.shape[2], img0.shape[3]) + + f0 = self.encode(img0[:, :3]) + f1 = self.encode(img1[:, :3]) + + flow_list: list[torch.Tensor] = [] + merged: list[tuple[torch.Tensor, torch.Tensor] | torch.Tensor] = [] + mask_list: list[torch.Tensor] = [] + warped_img0 = img0 + warped_img1 = img1 + flow = None + mask = None + + for i, block in enumerate([self.block0, self.block1, self.block2, self.block3]): + if flow is None: + flow, mask, feat = block( + torch.cat((img0[:, :3], img1[:, :3], f0, f1, timestep), 1), + None, + scale=scale_list[i], + ) + else: + wf0 = warp(f0, flow[:, :2]) + wf1 = warp(f1, flow[:, 2:4]) + fd, m0, feat = block( + torch.cat( + ( + warped_img0[:, :3], + warped_img1[:, :3], + wf0, + wf1, + timestep, + mask, + feat, + ), + 1, + ), + flow, + scale=scale_list[i], + ) + mask = m0 + flow = flow + fd + + mask_list.append(mask) + flow_list.append(flow) + warped_img0 = warp(img0, flow[:, :2]) + warped_img1 = warp(img1, flow[:, 2:4]) + merged.append((warped_img0, warped_img1)) + + mask = torch.sigmoid(mask) + merged[3] = warped_img0 * mask + warped_img1 * (1 - mask) + return flow_list, mask_list[3], merged + + +class Model: + """Wraps IFNet and exposes RIFE-compatible load/inference helpers.""" + + def __init__(self): + self.flownet = IFNet() + + def eval(self) -> 
Model: + self.flownet.eval() + return self + + def device(self) -> torch.device: + return next(self.flownet.parameters()).device + + def load_model(self, path: str) -> None: + flownet_path = os.path.join(path, "flownet.pkl") + if not os.path.isfile(flownet_path): + raise FileNotFoundError( + f"RIFE weight file not found: {flownet_path}. Expected layout: /flownet.pkl" + ) + + state = torch.load(flownet_path, map_location="cpu", weights_only=False) + state = {k.removeprefix("module."): v for k, v in state.items()} + self.flownet.load_state_dict(state, strict=False) + logger.info("Loaded RIFE weights from %s", flownet_path) + + def inference( + self, + img0: torch.Tensor, + img1: torch.Tensor, + scale: float = 1.0, + timestep: float = 0.5, + ) -> torch.Tensor: + _n, _c, h, w = img0.shape + ph = ((h - 1) // 32 + 1) * 32 + pw = ((w - 1) // 32 + 1) * 32 + pad = (0, pw - w, 0, ph - h) + img0 = F.pad(img0, pad) + img1 = F.pad(img1, pad) + + imgs = torch.cat((img0, img1), 1) + scale_list = [8 / scale, 4 / scale, 2 / scale, 1 / scale] + with torch.no_grad(): + _flow_list, _mask, merged = self.flownet( + imgs, + timestep=timestep, + scale_list=scale_list, + ) + return merged[3][:, :, :h, :w] + + +def _resolve_rife_model_path(model_path: str | None) -> str: + model_path = model_path or _DEFAULT_RIFE_HF_REPO + if os.path.isdir(model_path): + return model_path + from vllm_omni.model_executor.model_loader.weight_utils import ( + download_weights_from_hf_specific, + ) + + return download_weights_from_hf_specific( + model_path, + cache_dir=None, + allow_patterns=["flownet.pkl"], + require_all=True, + ) + + +def _select_torch_device() -> torch.device: + try: + from vllm_omni.platforms import current_omni_platform + + return current_omni_platform.get_torch_device() + except Exception as exc: + logger.warning("Failed to resolve current vLLM-Omni torch device: %s", exc) + + if torch.cuda.is_available(): + return torch.device("cuda") + return torch.device("cpu") + + +def 
_normalize_video_tensor_layout(video: torch.Tensor) -> tuple[torch.Tensor, Any]: + if video.ndim == 5: + if video.shape[1] in (3, 4): + return video, lambda out: out + if video.shape[2] in (3, 4): + return video.permute(0, 2, 1, 3, 4), lambda out: out.permute(0, 2, 1, 3, 4) + elif video.ndim == 4: + if video.shape[0] in (3, 4): + return video.unsqueeze(0), lambda out: out.squeeze(0) + if video.shape[1] in (3, 4): + return video.permute(1, 0, 2, 3).unsqueeze(0), lambda out: out.squeeze(0).permute(1, 0, 2, 3) + raise ValueError(f"Unsupported video tensor shape for interpolation: {tuple(video.shape)}") + + +def _normalize_video_tensor_range(video: torch.Tensor) -> tuple[torch.Tensor, Any]: + original_dtype = video.dtype + video = video.detach() + if video.is_floating_point(): + video = video.to(torch.float32) + if torch.amin(video) < 0.0 or torch.amax(video) > 1.0: + return video.clamp(-1.0, 1.0) * 0.5 + 0.5, lambda out: (out * 2.0 - 1.0).to(original_dtype) + return video.clamp(0.0, 1.0), lambda out: out.to(original_dtype) + return video.to(torch.float32) / 255.0, lambda out: (out * 255.0).round().clamp(0, 255).to(original_dtype) + + +class FrameInterpolator: + """Lazy-loaded RIFE 4.22.lite frame interpolator.""" + + def __init__(self, model_path: str | None = None): + self._model_path = model_path + self._resolved_path: str | None = None + + def _ensure_model_loaded(self, preferred_device: torch.device | None = None) -> Model: + resolved_path = _resolve_rife_model_path(self._model_path) + self._resolved_path = resolved_path + device = preferred_device or _select_torch_device() + cache_key = (resolved_path, str(device)) + + with _MODEL_CACHE_LOCK: + if cache_key in _MODEL_CACHE: + return _MODEL_CACHE[cache_key] + + model = Model() + model.load_model(resolved_path) + model.eval() + model.flownet = model.flownet.to(device) + _MODEL_CACHE[cache_key] = model + logger.info("RIFE model loaded on device: %s", device) + return model + + def _make_inference( + self, + model: 
Model, + img0: torch.Tensor, + img1: torch.Tensor, + n: int, + scale: float, + ) -> list[torch.Tensor]: + if n == 1: + return [model.inference(img0, img1, scale=scale)] + mid = model.inference(img0, img1, scale=scale) + return ( + self._make_inference(model, img0, mid, n // 2, scale) + + [mid] + + self._make_inference(model, mid, img1, n // 2, scale) + ) + + def interpolate_tensor( + self, + video: torch.Tensor, + exp: int = 1, + scale: float = 1.0, + ) -> tuple[torch.Tensor, int]: + if exp < 1: + raise ValueError(f"frame interpolation exp must be >= 1, got {exp}") + if scale <= 0: + raise ValueError(f"frame interpolation scale must be > 0, got {scale}") + + video, restore_layout = _normalize_video_tensor_layout(video) + if video.shape[2] < 2: + return restore_layout(video), 1 + + video, restore_range = _normalize_video_tensor_range(video) + # Prefer the decoded video's current device so CPU-offloaded requests do + # not move the tensor back to GPU just for interpolation. + model = self._ensure_model_loaded(preferred_device=video.device) + video = video.to(model.device()) + intermediates_per_pair = 2**exp // 2 + + result_frames: list[torch.Tensor] = [] + for idx in range(video.shape[2] - 1): + img0 = video[:, :, idx, :, :] + img1 = video[:, :, idx + 1, :, :] + result_frames.append(img0) + result_frames.extend(self._make_inference(model, img0, img1, intermediates_per_pair, scale)) + result_frames.append(video[:, :, -1, :, :]) + result = torch.stack(result_frames, dim=2) + return restore_layout(restore_range(result)), 2**exp + + +def interpolate_video_tensor( + video: torch.Tensor, + exp: int = 1, + scale: float = 1.0, + model_path: str | None = None, +) -> tuple[torch.Tensor, int]: + """Interpolate a video tensor and return the FPS multiplier.""" + interpolator = FrameInterpolator(model_path=model_path) + return interpolator.interpolate_tensor(video, exp=exp, scale=scale) diff --git a/vllm_omni/entrypoints/openai/api_server.py 
b/vllm_omni/entrypoints/openai/api_server.py index d847a96db66..11ba59e43a1 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -2075,6 +2075,10 @@ async def _parse_video_form( true_cfg_scale: float | None = Form(default=None), seed: int | None = Form(default=None), negative_prompt: str | None = Form(default=None), + enable_frame_interpolation: bool = Form(default=False), + frame_interpolation_exp: int = Form(default=1, ge=1), + frame_interpolation_scale: float = Form(default=1.0, gt=0.0), + frame_interpolation_model_path: str | None = Form(default=None), lora: str | None = Form(default=None), extra_params: str | None = Form(default=None), ) -> tuple[VideoGenerationRequest, "OmniOpenAIServingVideo", str, ReferenceImage | None]: @@ -2111,6 +2115,10 @@ async def _parse_video_form( "true_cfg_scale": true_cfg_scale, "seed": seed, "negative_prompt": negative_prompt, + "enable_frame_interpolation": enable_frame_interpolation, + "frame_interpolation_exp": frame_interpolation_exp, + "frame_interpolation_scale": frame_interpolation_scale, + "frame_interpolation_model_path": frame_interpolation_model_path, "lora": _parse_form_json(lora, expected_type=dict), "extra_params": _parse_form_json(extra_params, expected_type=dict), } diff --git a/vllm_omni/entrypoints/openai/protocol/videos.py b/vllm_omni/entrypoints/openai/protocol/videos.py index de5362dd97c..7c2c3164d92 100644 --- a/vllm_omni/entrypoints/openai/protocol/videos.py +++ b/vllm_omni/entrypoints/openai/protocol/videos.py @@ -150,6 +150,29 @@ class VideoGenerationRequest(BaseModel): ) seed: int | None = Field(default=None, description="Random seed for reproducibility") + # vllm-omni extensions for post-generation frame interpolation. 
+ enable_frame_interpolation: bool = Field( + default=False, + description="Enable post-generation RIFE frame interpolation before MP4 encoding.", + ) + frame_interpolation_exp: int = Field( + default=1, + ge=1, + description="Interpolation exponent: 1=2x temporal resolution, 2=4x, etc.", + ) + frame_interpolation_scale: float = Field( + default=1.0, + gt=0.0, + description="RIFE inference scale. Use 0.5 for high-resolution inputs to save memory.", + ) + frame_interpolation_model_path: str | None = Field( + default=None, + description=( + "Local directory or Hugging Face repo ID containing RIFE flownet.pkl weights. " + "Defaults to elfgum/RIFE-4.22.lite." + ), + ) + # vllm-omni extension for per-request LoRA. lora: dict[str, Any] | None = Field( default=None, diff --git a/vllm_omni/entrypoints/openai/serving_video.py b/vllm_omni/entrypoints/openai/serving_video.py index 0001fa65f89..741295c7c25 100644 --- a/vllm_omni/entrypoints/openai/serving_video.py +++ b/vllm_omni/entrypoints/openai/serving_video.py @@ -113,6 +113,10 @@ async def _run_and_extract( if vp.fps is not None: gen_params.fps = vp.fps gen_params.frame_rate = float(vp.fps) + gen_params.enable_frame_interpolation = request.enable_frame_interpolation + gen_params.frame_interpolation_exp = request.frame_interpolation_exp + gen_params.frame_interpolation_scale = request.frame_interpolation_scale + gen_params.frame_interpolation_model_path = request.frame_interpolation_model_path if request.num_inference_steps is not None: gen_params.num_inference_steps = request.num_inference_steps @@ -160,7 +164,7 @@ async def _run_and_extract( videos = self._extract_video_outputs(result) audios = self._extract_audio_outputs(result, expected_count=len(videos)) audio_sample_rate = self._resolve_audio_sample_rate(result) - output_fps = vp.fps or self._resolve_fps(result) or 24 + output_fps = (vp.fps or self._resolve_fps(result) or 24) * self._resolve_video_fps_multiplier(result) return VideoGenerationArtifacts( 
videos=videos, audios=audios, @@ -243,6 +247,22 @@ async def generate_video_bytes( logger.info("Video response encoding (MP4 bytes): %.2f ms", _t_encode_ms) return video_bytes, artifacts.stage_durations, artifacts.peak_memory_mb + @staticmethod + def _resolve_video_fps_multiplier(result: Any) -> int: + custom_output = getattr(result, "custom_output", None) + if isinstance(custom_output, dict): + multiplier = custom_output.get("video_fps_multiplier") + if multiplier is not None: + return int(multiplier) + request_output = getattr(result, "request_output", None) + if request_output is not None: + custom_output = getattr(request_output, "custom_output", None) + if isinstance(custom_output, dict): + multiplier = custom_output.get("video_fps_multiplier") + if multiplier is not None: + return int(multiplier) + return 1 + @staticmethod def _apply_lora(lora_body: Any, gen_params: OmniDiffusionSamplingParams) -> None: try: diff --git a/vllm_omni/inputs/data.py b/vllm_omni/inputs/data.py index 9cb6c44335c..85faf6b9499 100644 --- a/vllm_omni/inputs/data.py +++ b/vllm_omni/inputs/data.py @@ -227,6 +227,10 @@ class OmniDiffusionSamplingParams: frame_rate: float | None = None # Floating-point rate used by the diffusion model when it differs from `fps`. 
height_not_provided: bool = False width_not_provided: bool = False + enable_frame_interpolation: bool = False + frame_interpolation_exp: int = 1 + frame_interpolation_scale: float = 1.0 + frame_interpolation_model_path: str | None = None # Timesteps timesteps: torch.Tensor | None = None From 1ad726f49524be5a4fb96f777ed90722f1276692 Mon Sep 17 00:00:00 2001 From: TaffyOfficial <2587297563@qq.com> Date: Wed, 15 Apr 2026 12:53:47 +0800 Subject: [PATCH 47/76] =?UTF-8?q?[Fix]=20HunyuanImage-3.0:=20unify=20namin?= =?UTF-8?q?g=20hunyuan=5Fimage=5F3=20=E2=86=92=20hunyuan=5Fimage3=20(#2712?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/design/feature/expert_parallel.md | 4 ++-- .../test_hunyuan_fused_moe.py | 22 +++++++++---------- .../test_hunyuanimage3_text2img.py | 2 +- .../__init__.py | 6 ++--- .../autoencoder.py | 0 .../hunyuan_fused_moe.py | 0 .../hunyuan_image3_tokenizer.py} | 2 +- .../hunyuan_image3_transformer.py} | 2 +- .../pipeline_hunyuan_image3.py} | 4 ++-- .../system_prompt.py | 0 vllm_omni/diffusion/registry.py | 4 ++-- ...age_3_moe.yaml => hunyuan_image3_moe.yaml} | 0 ...3_moe_dit.yaml => hunyuan_image3_t2i.yaml} | 0 ...2gpu.yaml => hunyuan_image3_t2i_2gpu.yaml} | 0 vllm_omni/platforms/interface.py | 2 +- vllm_omni/platforms/musa/platform.py | 2 +- ...3_moe_dit.yaml => hunyuan_image3_t2i.yaml} | 0 ...age_3_moe.yaml => hunyuan_image3_t2i.yaml} | 0 18 files changed, 25 insertions(+), 25 deletions(-) rename tests/diffusion/models/{hunyuan_image_3 => hunyuan_image3}/test_hunyuan_fused_moe.py (85%) rename vllm_omni/diffusion/models/{hunyuan_image_3 => hunyuan_image3}/__init__.py (58%) rename vllm_omni/diffusion/models/{hunyuan_image_3 => hunyuan_image3}/autoencoder.py (100%) rename vllm_omni/diffusion/models/{hunyuan_image_3 => hunyuan_image3}/hunyuan_fused_moe.py (100%) rename vllm_omni/diffusion/models/{hunyuan_image_3/hunyuan_image_3_tokenizer.py => hunyuan_image3/hunyuan_image3_tokenizer.py} 
(99%) rename vllm_omni/diffusion/models/{hunyuan_image_3/hunyuan_image_3_transformer.py => hunyuan_image3/hunyuan_image3_transformer.py} (99%) rename vllm_omni/diffusion/models/{hunyuan_image_3/pipeline_hunyuan_image_3.py => hunyuan_image3/pipeline_hunyuan_image3.py} (99%) rename vllm_omni/diffusion/models/{hunyuan_image_3 => hunyuan_image3}/system_prompt.py (100%) rename vllm_omni/model_executor/stage_configs/{hunyuan_image_3_moe.yaml => hunyuan_image3_moe.yaml} (100%) rename vllm_omni/model_executor/stage_configs/{hunyuan_image3_moe_dit.yaml => hunyuan_image3_t2i.yaml} (100%) rename vllm_omni/model_executor/stage_configs/{hunyuan_image_3_moe_2gpu.yaml => hunyuan_image3_t2i_2gpu.yaml} (100%) rename vllm_omni/platforms/npu/stage_configs/{hunyuan_image3_moe_dit.yaml => hunyuan_image3_t2i.yaml} (100%) rename vllm_omni/platforms/xpu/stage_configs/{hunyuan_image_3_moe.yaml => hunyuan_image3_t2i.yaml} (100%) diff --git a/docs/design/feature/expert_parallel.md b/docs/design/feature/expert_parallel.md index 9a7c4cdbac7..e05eec33613 100644 --- a/docs/design/feature/expert_parallel.md +++ b/docs/design/feature/expert_parallel.md @@ -207,9 +207,9 @@ Complete examples in the codebase: | Model | Path | Pattern | Notes | |-------|------|---------|-------| -| **HunyuanImage3.0** | `vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py` | Standard EP | Full implementation with validation | +| **HunyuanImage3.0** | `vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py` | Standard EP | Full implementation with validation | | **EP Tests** | `vllm-omni/tests/e2e/offline_inference/test_expert_parallel.py` | E2E testing | EP correctness and performance | -| **Constraint Tests** | `vllm-omni/tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py` | Unit testing | Validation logic | +| **Constraint Tests** | `vllm-omni/tests/diffusion/models/hunyuan_image3/test_hunyuan_fused_moe.py` | Unit testing | Validation logic | --- ## Summary diff 
--git a/tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py b/tests/diffusion/models/hunyuan_image3/test_hunyuan_fused_moe.py similarity index 85% rename from tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py rename to tests/diffusion/models/hunyuan_image3/test_hunyuan_fused_moe.py index 2cda9116c7d..626f78eed9c 100644 --- a/tests/diffusion/models/hunyuan_image_3/test_hunyuan_fused_moe.py +++ b/tests/diffusion/models/hunyuan_image3/test_hunyuan_fused_moe.py @@ -12,7 +12,7 @@ class TestSetForwardContextNumTokens: def test_sets_num_tokens_when_context_available(self, mocker): """num_tokens should be set on ForwardContext when available.""" - import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe mock_ctx = mocker.MagicMock() del mock_ctx.in_profile_run # simulate missing attr @@ -26,7 +26,7 @@ def test_sets_num_tokens_when_context_available(self, mocker): def test_sets_in_profile_run_only_if_missing(self, mocker): """in_profile_run should not be overwritten if already set.""" - import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe mock_ctx = mocker.MagicMock() mock_ctx.in_profile_run = True # already set @@ -40,7 +40,7 @@ def test_sets_in_profile_run_only_if_missing(self, mocker): def test_noop_when_context_unavailable(self, mocker): """Should do nothing when ForwardContext is not available.""" - import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe mocker.patch.object(hunyuan_moe._vllm_fc, "is_forward_context_available", return_value=False) mock_get = mocker.patch.object(hunyuan_moe._vllm_fc, "get_forward_context") @@ -55,11 +55,11 @@ class TestHunyuanFusedMoEPlatformDispatch: def 
test_default_platform_uses_default_impl_qualname(self, mocker): """HunyuanFusedMoE should resolve the impl class from the platform hook.""" - import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe mock_platform = mocker.MagicMock() mock_platform.get_diffusion_model_impl_qualname.return_value = ( - "vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe.HunyuanFusedMoEDefault" + "vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe.HunyuanFusedMoEDefault" ) mocker.patch.object( @@ -71,7 +71,7 @@ def test_default_platform_uses_default_impl_qualname(self, mocker): mock_impl = mocker.MagicMock() mock_resolve.return_value = mock_impl - from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import ( + from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe import ( HunyuanFusedMoE, ) @@ -80,7 +80,7 @@ def test_default_platform_uses_default_impl_qualname(self, mocker): mock_platform.prepare_diffusion_op_runtime.assert_called_once_with("hunyuan_fused_moe") mock_platform.get_diffusion_model_impl_qualname.assert_called_once_with("hunyuan_fused_moe") mock_resolve.assert_called_once_with( - "vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe.HunyuanFusedMoEDefault" + "vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe.HunyuanFusedMoEDefault" ) mock_impl.assert_called_once_with(prefix="") @@ -90,7 +90,7 @@ class TestHunyuanFusedMoEFactory: def test_new_delegates_to_impl_class(self, mocker): """HunyuanFusedMoE(prefix=..., **kwargs) should instantiate and return impl instance.""" - import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe class MockImpl: def __init__(self, *, prefix: str = "", **kwargs): @@ -104,7 +104,7 @@ def __init__(self, *, prefix: str = "", **kwargs): mock_impl_class = 
mocker.MagicMock(return_value=MockImpl(prefix="test", a=1)) mocker.patch.object(hunyuan_moe, "resolve_obj_by_qualname", return_value=mock_impl_class) - from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import ( + from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe import ( HunyuanFusedMoE, ) @@ -119,7 +119,7 @@ def __init__(self, *, prefix: str = "", **kwargs): def test_make_expert_params_mapping_delegates_to_impl(self, mocker): """make_expert_params_mapping should delegate to impl class method.""" - import vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe as hunyuan_moe + import vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe as hunyuan_moe expected_mapping = [("a", "b", 0, "c")] mock_platform = mocker.MagicMock() @@ -130,7 +130,7 @@ def test_make_expert_params_mapping_delegates_to_impl(self, mocker): mock_impl_class.make_expert_params_mapping = mocker.MagicMock(return_value=expected_mapping) mocker.patch.object(hunyuan_moe, "resolve_obj_by_qualname", return_value=mock_impl_class) - from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import ( + from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe import ( HunyuanFusedMoE, ) diff --git a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py index 79bb64dca1b..6898763e406 100644 --- a/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py +++ b/tests/e2e/offline_inference/test_hunyuanimage3_text2img.py @@ -17,7 +17,7 @@ MODEL_NAME = "tencent/HunyuanImage-3.0" LOCAL_CLIP_PATH = "openai/clip-vit-base-patch32" REPO_ROOT = Path(__file__).resolve().parents[3] -STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image_3_moe.yaml" +STAGE_CONFIG_PATH = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "hunyuan_image3_moe.yaml" pytestmark = [pytest.mark.advanced_model, pytest.mark.diffusion] diff --git 
a/vllm_omni/diffusion/models/hunyuan_image_3/__init__.py b/vllm_omni/diffusion/models/hunyuan_image3/__init__.py similarity index 58% rename from vllm_omni/diffusion/models/hunyuan_image_3/__init__.py rename to vllm_omni/diffusion/models/hunyuan_image3/__init__.py index cbc6a8ad1f4..6612bd855ba 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/__init__.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/__init__.py @@ -2,12 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Hunyuan Image 3 diffusion model components.""" -from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import HunyuanFusedMoE -from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_image_3_transformer import ( +from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe import HunyuanFusedMoE +from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_image3_transformer import ( HunyuanImage3Model, HunyuanImage3Text2ImagePipeline, ) -from vllm_omni.diffusion.models.hunyuan_image_3.pipeline_hunyuan_image_3 import ( +from vllm_omni.diffusion.models.hunyuan_image3.pipeline_hunyuan_image3 import ( HunyuanImage3Pipeline, ) diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/autoencoder.py b/vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py similarity index 100% rename from vllm_omni/diffusion/models/hunyuan_image_3/autoencoder.py rename to vllm_omni/diffusion/models/hunyuan_image3/autoencoder.py diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_fused_moe.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_fused_moe.py similarity index 100% rename from vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_fused_moe.py rename to vllm_omni/diffusion/models/hunyuan_image3/hunyuan_fused_moe.py diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_tokenizer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py similarity index 99% rename from 
vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_tokenizer.py rename to vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py index ce563f71159..4a29e9df93e 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_tokenizer.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_tokenizer.py @@ -13,7 +13,7 @@ from transformers import AutoTokenizer from vllm.logger import init_logger -from .hunyuan_image_3_transformer import ImageInfo, JointImageInfo, default +from .hunyuan_image3_transformer import ImageInfo, JointImageInfo, default logger = init_logger(__name__) diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py similarity index 99% rename from vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py rename to vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py index bc81ca9c3ed..327260ee0bb 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/hunyuan_image_3_transformer.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py @@ -74,7 +74,7 @@ ) from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.layers.rope import RotaryEmbedding -from vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe import HunyuanFusedMoE +from vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe import HunyuanFusedMoE logger = logging.getLogger(__name__) diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py similarity index 99% rename from vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py rename to vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index 7e9e2d27877..2f140b48fc4 100644 --- a/vllm_omni/diffusion/models/hunyuan_image_3/pipeline_hunyuan_image_3.py +++ 
b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -25,8 +25,8 @@ from vllm_omni.diffusion.request import OmniDiffusionRequest from .autoencoder import AutoencoderKLConv3D -from .hunyuan_image_3_tokenizer import TokenizerWrapper -from .hunyuan_image_3_transformer import ( +from .hunyuan_image3_tokenizer import TokenizerWrapper +from .hunyuan_image3_transformer import ( CausalMMOutputWithPast, HunyuanImage3ImageProcessor, HunyuanImage3Model, diff --git a/vllm_omni/diffusion/models/hunyuan_image_3/system_prompt.py b/vllm_omni/diffusion/models/hunyuan_image3/system_prompt.py similarity index 100% rename from vllm_omni/diffusion/models/hunyuan_image_3/system_prompt.py rename to vllm_omni/diffusion/models/hunyuan_image3/system_prompt.py diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index 97bc7fa2925..517b061ecec 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -119,8 +119,8 @@ "FluxKontextPipeline", ), "HunyuanImage3ForCausalMM": ( - "hunyuan_image_3", - "pipeline_hunyuan_image_3", + "hunyuan_image3", + "pipeline_hunyuan_image3", "HunyuanImage3Pipeline", ), "Flux2KleinPipeline": ( diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image_3_moe.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml similarity index 100% rename from vllm_omni/model_executor/stage_configs/hunyuan_image_3_moe.yaml rename to vllm_omni/model_executor/stage_configs/hunyuan_image3_moe.yaml diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i.yaml similarity index 100% rename from vllm_omni/model_executor/stage_configs/hunyuan_image3_moe_dit.yaml rename to vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i.yaml diff --git a/vllm_omni/model_executor/stage_configs/hunyuan_image_3_moe_2gpu.yaml b/vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i_2gpu.yaml similarity index 100% rename 
from vllm_omni/model_executor/stage_configs/hunyuan_image_3_moe_2gpu.yaml rename to vllm_omni/model_executor/stage_configs/hunyuan_image3_t2i_2gpu.yaml diff --git a/vllm_omni/platforms/interface.py b/vllm_omni/platforms/interface.py index 8f1e66747d1..b69731a67d5 100644 --- a/vllm_omni/platforms/interface.py +++ b/vllm_omni/platforms/interface.py @@ -64,7 +64,7 @@ def get_default_stage_config_path(cls) -> str: @classmethod def get_diffusion_model_impl_qualname(cls, op_name: str) -> str: if op_name == "hunyuan_fused_moe": - return "vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe.HunyuanFusedMoEDefault" + return "vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe.HunyuanFusedMoEDefault" raise NotImplementedError(f"Unsupported diffusion model op: {op_name}") @classmethod diff --git a/vllm_omni/platforms/musa/platform.py b/vllm_omni/platforms/musa/platform.py index fe1ccc6d0bf..64a70a9bebe 100644 --- a/vllm_omni/platforms/musa/platform.py +++ b/vllm_omni/platforms/musa/platform.py @@ -39,7 +39,7 @@ def get_default_stage_config_path(cls) -> str: def get_diffusion_model_impl_qualname(cls, op_name: str) -> str: # MUSA uses default implementations for diffusion ops if op_name == "hunyuan_fused_moe": - return "vllm_omni.diffusion.models.hunyuan_image_3.hunyuan_fused_moe.HunyuanFusedMoEDefault" + return "vllm_omni.diffusion.models.hunyuan_image3.hunyuan_fused_moe.HunyuanFusedMoEDefault" return super().get_diffusion_model_impl_qualname(op_name) @classmethod diff --git a/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_moe_dit.yaml b/vllm_omni/platforms/npu/stage_configs/hunyuan_image3_t2i.yaml similarity index 100% rename from vllm_omni/platforms/npu/stage_configs/hunyuan_image3_moe_dit.yaml rename to vllm_omni/platforms/npu/stage_configs/hunyuan_image3_t2i.yaml diff --git a/vllm_omni/platforms/xpu/stage_configs/hunyuan_image_3_moe.yaml b/vllm_omni/platforms/xpu/stage_configs/hunyuan_image3_t2i.yaml similarity index 100% rename from 
vllm_omni/platforms/xpu/stage_configs/hunyuan_image_3_moe.yaml rename to vllm_omni/platforms/xpu/stage_configs/hunyuan_image3_t2i.yaml From 2dff2d7c747864378764195e0e4a6b137c3cf5df Mon Sep 17 00:00:00 2001 From: fan2956 Date: Wed, 15 Apr 2026 14:02:41 +0800 Subject: [PATCH 48/76] [PERF] Wan2.2 support adalayernorm fused op (#2585) Signed-off-by: fan2956 Co-authored-by: Canlin Guo --- .../diffusion/cache/teacache/extractors.py | 3 +- vllm_omni/diffusion/layers/adalayernorm.py | 80 +++++-------------- .../qwen_image/qwen_image_transformer.py | 19 +++-- .../models/wan2_2/wan2_2_transformer.py | 25 +++--- .../models/wan2_2/wan2_2_vace_transformer.py | 2 +- 5 files changed, 45 insertions(+), 84 deletions(-) diff --git a/vllm_omni/diffusion/cache/teacache/extractors.py b/vllm_omni/diffusion/cache/teacache/extractors.py index 3d247e31878..84c237b60d5 100644 --- a/vllm_omni/diffusion/cache/teacache/extractors.py +++ b/vllm_omni/diffusion/cache/teacache/extractors.py @@ -222,7 +222,8 @@ def extract_qwen_context( block = module.transformer_blocks[0] img_mod_params = block.img_mod(temb) img_mod1, _ = img_mod_params.chunk(2, dim=-1) - img_modulated, _ = block.img_norm1(hidden_states, img_mod1) + img_scale1, img_shift1, _ = block._modulate(img_mod1) + img_modulated = block.img_norm1(hidden_states, img_scale1, img_shift1) # ============================================================================ # DEFINE TRANSFORMER EXECUTION (Qwen-specific) diff --git a/vllm_omni/diffusion/layers/adalayernorm.py b/vllm_omni/diffusion/layers/adalayernorm.py index 35f63e2fc91..4d70ed52f71 100644 --- a/vllm_omni/diffusion/layers/adalayernorm.py +++ b/vllm_omni/diffusion/layers/adalayernorm.py @@ -29,105 +29,61 @@ def __init__(self, hidden_size: int, elementwise_affine: bool = False, eps: floa self.hidden_size = hidden_size self.layernorm = nn.LayerNorm(self.hidden_size, elementwise_affine=self.elementwise_affine, eps=self.eps) - def preprocess( - self, - mod_params: torch.Tensor, - index: 
torch.Tensor = None, - ) -> torch.Tensor: - # shift: b d, scale: b d, gate: b d - shift, scale, gate = mod_params.chunk(3, dim=-1) - - if index is not None: - # Assuming mod_params batch dim is 2*actual_batch (chunked into 2 parts) - # So shift, scale, gate have shape [2*actual_batch, d] - actual_batch = shift.size(0) // 2 - shift_0, shift_1 = shift[:actual_batch], shift[actual_batch:] # each: [actual_batch, d] - scale_0, scale_1 = scale[:actual_batch], scale[actual_batch:] - gate_0, gate_1 = gate[:actual_batch], gate[actual_batch:] - - # index: [b, l] where b is actual batch size - # Expand to [b, l, 1] to match feature dimension - index_expanded = index.unsqueeze(-1) # [b, l, 1] - - # Expand chunks to [b, 1, d] then broadcast to [b, l, d] - shift_0_exp = shift_0.unsqueeze(1) # [b, 1, d] - shift_1_exp = shift_1.unsqueeze(1) # [b, 1, d] - scale_0_exp = scale_0.unsqueeze(1) - scale_1_exp = scale_1.unsqueeze(1) - gate_0_exp = gate_0.unsqueeze(1) - gate_1_exp = gate_1.unsqueeze(1) - - # Use torch.where to select based on index - shift_result = torch.where(index_expanded == 0, shift_0_exp, shift_1_exp) - scale_result = torch.where(index_expanded == 0, scale_0_exp, scale_1_exp) - gate_result = torch.where(index_expanded == 0, gate_0_exp, gate_1_exp) - else: - shift_result = shift.unsqueeze(1) - scale_result = scale.unsqueeze(1) - gate_result = gate.unsqueeze(1) - - return shift_result, scale_result, gate_result - def forward_cuda( self, x: torch.Tensor, - mod_params: torch.Tensor, - index: torch.Tensor = None, + scale: torch.Tensor, + shift: torch.Tensor, ) -> torch.Tensor: - return self.forward_native(x, mod_params, index) + return self.forward_native(x, scale, shift) def forward_hip( self, x: torch.Tensor, - mod_params: torch.Tensor, - index: torch.Tensor = None, + scale: torch.Tensor, + shift: torch.Tensor, ) -> torch.Tensor: - return self.forward_native(x, mod_params, index) + return self.forward_native(x, scale, shift) def forward_npu( self, x: torch.Tensor, - 
mod_params: torch.Tensor, - index: torch.Tensor = None, + scale: torch.Tensor, + shift: torch.Tensor, ) -> torch.Tensor: - shift_result, scale_result, gate_result = self.preprocess(mod_params, index) - if _HAS_MINDIESD: try: from mindiesd import layernorm_scale_shift - output = layernorm_scale_shift(self.layernorm, x, scale_result, shift_result, fused=True) + output = layernorm_scale_shift(self.layernorm, x, scale, shift, fused=True) - return output, gate_result + return output except ImportError as e: logger.warning_once(f"mindiesd import failed, falling back to torch_npu: {e}") import torch_npu output = ( - torch_npu.npu_layer_norm_eval(x, normalized_shape=[self.hidden_size], eps=self.eps) * (1 + scale_result) - + shift_result + torch_npu.npu_layer_norm_eval(x, normalized_shape=[self.hidden_size], eps=self.eps) * (1 + scale) + shift ) - return output, gate_result + return output def forward_xpu( self, x: torch.Tensor, - mod_params: torch.Tensor, - index: torch.Tensor = None, + scale: torch.Tensor, + shift: torch.Tensor, ) -> torch.Tensor: - return self.forward_native(x, mod_params, index) + return self.forward_native(x, scale, shift) def forward_native( self, x: torch.Tensor, - mod_params: torch.Tensor, - index: torch.Tensor = None, + scale: torch.Tensor, + shift: torch.Tensor, ) -> torch.Tensor: - shift_result, scale_result, gate_result = self.preprocess(mod_params, index) - - return self.layernorm(x) * (1 + scale_result) + shift_result, gate_result + return self.layernorm(x) * (1 + scale) + shift class AdaLayerNormZero(nn.Module): diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py index b34f19e954a..9f16d8808c8 100644 --- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py +++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py @@ -744,9 +744,9 @@ def __init__( self.zero_cond_t = zero_cond_t - def _modulate(self, x, mod_params, index=None): + 
def _modulate(self, mod_params, index=None): """Apply modulation to input tensor""" - # x: b l d, shift: b d, scale: b d, gate: b d + # shift: b d, scale: b d, gate: b d shift, scale, gate = mod_params.chunk(3, dim=-1) if index is not None: @@ -778,7 +778,7 @@ def _modulate(self, x, mod_params, index=None): scale_result = scale.unsqueeze(1) gate_result = gate.unsqueeze(1) - return x * (1 + scale_result) + shift_result, gate_result + return scale_result, shift_result, gate_result def forward( self, @@ -804,10 +804,12 @@ def forward( txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1) # Each [B, 3*dim] # Process image stream - norm1 + modulation - img_modulated, img_gate1 = self.img_norm1(hidden_states, img_mod1, modulate_index) + img_scale1, img_shift1, img_gate1 = self._modulate(img_mod1, modulate_index) + img_modulated = self.img_norm1(hidden_states, img_scale1, img_shift1) # Process text stream - norm1 + modulation - txt_modulated, txt_gate1 = self.txt_norm1(encoder_hidden_states, txt_mod1) + txt_scale1, txt_shift1, txt_gate1 = self._modulate(txt_mod1) + txt_modulated = self.txt_norm1(encoder_hidden_states, txt_scale1, txt_shift1) # Use QwenAttnProcessor2_0 for joint attention computation # This directly implements the DoubleStreamLayerMegatron logic: @@ -832,13 +834,16 @@ def forward( encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output # Process image stream - norm2 + MLP - img_modulated2, img_gate2 = self.img_norm2(hidden_states, img_mod2, modulate_index) + img_scale2, img_shift2, img_gate2 = self._modulate(img_mod2, modulate_index) + img_modulated2 = self.img_norm2(hidden_states, img_scale2, img_shift2) img_mlp_output = self.img_mlp(img_modulated2) hidden_states = hidden_states + img_gate2 * img_mlp_output # Process text stream - norm2 + MLP - txt_modulated2, txt_gate2 = self.txt_norm2(encoder_hidden_states, txt_mod2) + txt_scale2, txt_shift2, txt_gate2 = self._modulate(txt_mod2) + txt_modulated2 = self.txt_norm2(encoder_hidden_states, 
txt_scale2, txt_shift2) + txt_mlp_output = self.txt_mlp(txt_modulated2) encoder_hidden_states = encoder_hidden_states + txt_gate2 * txt_mlp_output diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py index 3b43f3eaf51..b870193a140 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py @@ -29,6 +29,7 @@ SequenceParallelOutput, ) from vllm_omni.diffusion.forward_context import get_forward_context +from vllm_omni.diffusion.layers.adalayernorm import AdaLayerNorm from vllm_omni.platforms import current_omni_platform logger = init_logger(__name__) @@ -620,7 +621,7 @@ def __init__( head_dim = dim // num_heads # 1. Self-attention - self.norm1 = FP32LayerNorm(dim, eps, elementwise_affine=False) + self.norm1 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) self.attn1 = WanSelfAttention( dim=dim, num_heads=num_heads, @@ -640,7 +641,7 @@ def __init__( # 3. Feed-forward self.ffn = WanFeedForward(dim=dim, inner_dim=ffn_dim, dim_out=dim) - self.norm3 = FP32LayerNorm(dim, eps, elementwise_affine=False) + self.norm3 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) # Scale-shift table for modulation self.scale_shift_table = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5) @@ -656,7 +657,7 @@ def forward( if temb.ndim == 4: # temb: batch_size, seq_len, 6, inner_dim (wan2.2 ti2v) shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = ( - self.scale_shift_table.unsqueeze(0) + temb.float() + self.scale_shift_table.unsqueeze(0) + temb ).chunk(6, dim=2) shift_msa = shift_msa.squeeze(2) scale_msa = scale_msa.squeeze(2) @@ -667,25 +668,23 @@ def forward( else: # temb: batch_size, 6, inner_dim (wan2.1/wan2.2 14B) shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = ( - self.scale_shift_table + temb.float() + self.scale_shift_table + temb ).chunk(6, dim=1) # 1. 
Self-attention - norm_hidden_states = (self.norm1(hidden_states.float()) * (1 + scale_msa) + shift_msa).type_as(hidden_states) + norm_hidden_states = self.norm1(hidden_states, scale_msa, shift_msa).type_as(hidden_states) attn_output = self.attn1(norm_hidden_states, rotary_emb, hidden_states_mask) - hidden_states = (hidden_states.float() + attn_output * gate_msa).type_as(hidden_states) + hidden_states = (hidden_states + attn_output * gate_msa).type_as(hidden_states) # 2. Cross-attention - norm_hidden_states = self.norm2(hidden_states.float()).type_as(hidden_states) + norm_hidden_states = self.norm2(hidden_states).type_as(hidden_states) attn_output = self.attn2(norm_hidden_states, encoder_hidden_states) hidden_states = hidden_states + attn_output # 3. Feed-forward - norm_hidden_states = (self.norm3(hidden_states.float()) * (1 + c_scale_msa) + c_shift_msa).type_as( - hidden_states - ) + norm_hidden_states = self.norm3(hidden_states, c_scale_msa, c_shift_msa).type_as(hidden_states) ff_output = self.ffn(norm_hidden_states) - hidden_states = (hidden_states.float() + ff_output.float() * c_gate_msa).type_as(hidden_states) + hidden_states = (hidden_states + ff_output * c_gate_msa).type_as(hidden_states) return hidden_states @@ -854,7 +853,7 @@ def __init__( ) # 4. 
Output norm & projection - self.norm_out = FP32LayerNorm(inner_dim, eps, elementwise_affine=False) + self.norm_out = AdaLayerNorm(inner_dim, elementwise_affine=False, eps=eps) self.proj_out = nn.Linear(inner_dim, out_channels * math.prod(patch_size)) # SP helper modules @@ -942,7 +941,7 @@ def forward( shift = shift.unsqueeze(1) scale = scale.unsqueeze(1) - hidden_states = (self.norm_out(hidden_states.float()) * (1 + scale) + shift).type_as(hidden_states) + hidden_states = self.norm_out(hidden_states, scale, shift).type_as(hidden_states) hidden_states = self.proj_out(hidden_states) hidden_states = hidden_states.reshape( diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_vace_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_vace_transformer.py index 4f4217dabfa..c48938e1baa 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_vace_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_vace_transformer.py @@ -239,7 +239,7 @@ def forward( shift = shift.unsqueeze(1) scale = scale.unsqueeze(1) - hidden_states = (self.norm_out(hidden_states.float()) * (1 + scale) + shift).type_as(hidden_states) + hidden_states = self.norm_out(hidden_states, scale, shift).type_as(hidden_states) hidden_states = self.proj_out(hidden_states) hidden_states = hidden_states.reshape( From 133e2f97068f4ae57fc91d7afd1e405386a0e12e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zeyu=20Huang=20=7C=20=E9=BB=83=E6=BE=A4=E5=AE=87?= <11222265+fhfuih@users.noreply.github.com> Date: Wed, 15 Apr 2026 14:08:00 +0800 Subject: [PATCH 49/76] [hotfix] API connection error in CI (#2810) --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 4ad4706fc1f..098fd8d970c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2876,7 +2876,7 @@ def _build_url(self, path: str) -> str: return f"{self.base_url.rstrip('/')}/{path.lstrip('/')}" -@pytest.fixture(scope="module") +@pytest.fixture def openai_client(request: 
pytest.FixtureRequest, run_level: str): """Create OpenAIClientHandler fixture to facilitate communication with OmniServer with encapsulated request sending, concurrent requests, response handling, and validation.""" From 38d5f2d530c84cdb5462116103944b2b84e44182 Mon Sep 17 00:00:00 2001 From: Sy03 <1370724210@qq.com> Date: Wed, 15 Apr 2026 14:09:22 +0800 Subject: [PATCH 50/76] [Perf] VoxCPM2: Speedup by manual CUDA Graph capture for scaffold/residual forward (#2803) Signed-off-by: Sy03 <1370724210@qq.com> --- .../models/voxcpm2/minicpm4_paged.py | 20 ++ .../models/voxcpm2/voxcpm2_talker.py | 188 ++++++++++++++++-- 2 files changed, 189 insertions(+), 19 deletions(-) diff --git a/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py b/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py index 40bacfff6c7..b87ec5aafef 100644 --- a/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py +++ b/vllm_omni/model_executor/models/voxcpm2/minicpm4_paged.py @@ -307,6 +307,16 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states + def precompute_fused_qkv(self) -> None: + """Materialize fused QKV weights before CUDA Graph capture.""" + for layer in self.layers: + attn = layer.self_attn + if attn._fused_qkv_weight is None: + attn._fused_qkv_weight = torch.cat( + [attn.q_proj.weight, attn.k_proj.weight, attn.v_proj.weight], + dim=0, + ).detach() + def compile_selective(self) -> list[str]: """Compile the full model forward as one graph. 
@@ -411,6 +421,16 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states + def precompute_fused_qkv(self) -> None: + """Materialize fused QKV weights before CUDA Graph capture.""" + for layer in self.layers: + attn = layer.self_attn + if attn._fused_qkv_weight is None: + attn._fused_qkv_weight = torch.cat( + [attn.q_proj.weight, attn.k_proj.weight, attn.v_proj.weight], + dim=0, + ).detach() + def compile_selective(self) -> list[str]: """Compile the full residual model forward as one graph (same strategy as base_lm).""" if self._compiled_layers: diff --git a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py index 94f06589046..02bcae821e1 100644 --- a/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py +++ b/vllm_omni/model_executor/models/voxcpm2/voxcpm2_talker.py @@ -10,6 +10,7 @@ from __future__ import annotations +import copy import dataclasses import logging import os @@ -21,6 +22,7 @@ import torch import torch.nn as nn from vllm.config import VllmConfig +from vllm.forward_context import get_forward_context, override_forward_context from vllm.logger import init_logger from vllm.model_executor.models.utils import ( AutoWeightsLoader, @@ -101,6 +103,14 @@ class _RequestState: last_decoded_audio: torch.Tensor | None = None +@dataclasses.dataclass +class _CapturedGraph: + graph: torch.cuda.CUDAGraph + input_embeds: torch.Tensor + positions: torch.Tensor + output: torch.Tensor + + # =================================================================== # Profiling timer # =================================================================== @@ -336,6 +346,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._perf = _PerfTimer(enabled=_ENABLE_PROFILING) self._cfm_buffers: _CFMBufferManager | None = None + self._enable_cuda_graph = True + self._scaffold_graphs: dict[int, _CapturedGraph] = {} + self._residual_graphs: dict[int, _CapturedGraph] = {} + 
self._max_cached_graphs = self._max_batch_size + self._cuda_graph_pool: tuple | None = None + self._cuda_graph_warmup_steps = 0 + self._cuda_graph_warmup_threshold = 3 self._active_states: dict[str, _RequestState] = {} self._current_request_id: str | None = None @@ -483,19 +500,24 @@ def _setup_torch_compile(self) -> None: except Exception as e: logger.warning("torch.compile AudioVAE failed: %s", e) - if not getattr(self.model, "_selective_compiled", False): - try: - targets.extend(f"scaffold.{t}" for t in self.model.compile_selective()) - self.model._selective_compiled = True - except Exception as e: - logger.warning("scaffold compile failed: %s", e) + if not self._enable_cuda_graph: + if not getattr(self.model, "_selective_compiled", False): + try: + targets.extend(f"scaffold.{t}" for t in self.model.compile_selective()) + self.model._selective_compiled = True + except Exception as e: + logger.warning("scaffold compile failed: %s", e) - if not getattr(self.residual_model, "_selective_compiled", False): - try: - targets.extend(f"residual.{t}" for t in self.residual_model.compile_selective()) - self.residual_model._selective_compiled = True - except Exception as e: - logger.warning("residual compile failed: %s", e) + if not getattr(self.residual_model, "_selective_compiled", False): + try: + targets.extend(f"residual.{t}" for t in self.residual_model.compile_selective()) + self.residual_model._selective_compiled = True + except Exception as e: + logger.warning("residual compile failed: %s", e) + else: + self.model.precompute_fused_qkv() + self.residual_model.precompute_fused_qkv() + targets.append("scaffold+residual (CUDA Graph, skipping compile)") if not getattr(self, "_projections_compiled", False): try: @@ -518,6 +540,90 @@ def _stop_fn(self, lm_h: torch.Tensor) -> torch.Tensor: tts = self.tts return tts.stop_head(tts.stop_actn(tts.stop_proj(lm_h))) + def _get_cuda_graph_pool(self) -> tuple: + if self._cuda_graph_pool is None: + self._cuda_graph_pool = 
torch.cuda.graph_pool_handle() + return self._cuda_graph_pool + + @staticmethod + def _nullify_volatile_metadata(ctx: Any) -> Any: + """Set ``scheduler_metadata`` to None on all attention layers. + + This is the only tensor FA3 reallocates each step (variable shape). + All other metadata tensors are persistent model-runner buffers. + Setting it to None makes FA3 use default scheduling (~0.1ms cost). + """ + if not isinstance(ctx.attn_metadata, dict): + return ctx + + ctx = copy.copy(ctx) + new_meta: dict[str, Any] = {} + for layer_name, meta in ctx.attn_metadata.items(): + if getattr(meta, "scheduler_metadata", None) is not None: + meta = copy.copy(meta) + meta.scheduler_metadata = None + new_meta[layer_name] = meta + ctx.attn_metadata = new_meta + return ctx + + def _capture_graph( + self, + model: nn.Module, + batch_size: int, + label: str, + is_residual: bool = False, + ) -> _CapturedGraph: + """Capture a CUDA Graph for *model* at *batch_size*.""" + hidden_size = self.config.hidden_size + dtype = self._side_dtype + dev = torch.device(self._device) + pool = self._get_cuda_graph_pool() + + model.precompute_fused_qkv() + + g = _CapturedGraph( + graph=torch.cuda.CUDAGraph(), + input_embeds=torch.zeros(batch_size, hidden_size, device=dev, dtype=dtype), + positions=torch.zeros(batch_size, device=dev, dtype=torch.long), + output=torch.zeros(batch_size, hidden_size, device=dev, dtype=dtype), + ) + + if is_residual: + call_kwargs = dict(positions=g.positions, inputs_embeds=g.input_embeds) + else: + call_kwargs = dict(input_ids=None, positions=g.positions, inputs_embeds=g.input_embeds) + + ctx = get_forward_context() + patched_ctx = self._nullify_volatile_metadata(ctx) + + with override_forward_context(patched_ctx): + for _ in range(3): + _ = model(**call_kwargs) + + with torch.cuda.graph(g.graph, pool=pool): + g.output = model(**call_kwargs) + + logger.info("CUDA Graph captured for %s (batch_size=%d)", label, batch_size) + return g + + def _replay_graph( + self, + g: 
_CapturedGraph, + inputs_embeds: torch.Tensor, + positions: torch.Tensor, + batch_size: int, + ) -> torch.Tensor: + """Copy fresh inputs into static buffers, then replay. + + No metadata copy needed: persistent buffers (seq_lens, slot_mapping, + etc.) are updated in-place by the model runner. scheduler_metadata + was nullified at capture time so no kernel references it. + """ + g.input_embeds[:batch_size].copy_(inputs_embeds[:batch_size]) + g.positions[:batch_size].copy_(positions[:batch_size]) + g.graph.replay() + return g.output[:batch_size].clone() + # -------------------- vllm hooks -------------------- def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor: @@ -534,12 +640,35 @@ def forward( self._perf.start("forward_total") dev = input_ids.device - model_output = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) - if isinstance(model_output, IntermediateTensors): - return model_output - scaffold_hidden = model_output - if isinstance(scaffold_hidden, tuple): - scaffold_hidden = scaffold_hidden[0] + num_reqs = len(self._pending_requests) + num_decode = sum(1 for _, is_p, _, n in self._pending_requests if not is_p and n == 1) + is_all_decode = num_decode == num_reqs and num_reqs > 0 + + tts_compiled = getattr(self.tts.feat_decoder.estimator, "_compiled", False) if self._tts is not None else False + graph_ready = tts_compiled and self._cuda_graph_warmup_steps >= self._cuda_graph_warmup_threshold + if num_decode > 0: + self._cuda_graph_warmup_steps += 1 + + can_use_graph = ( + self._enable_cuda_graph and graph_ready and intermediate_tensors is None and inputs_embeds is not None + ) + + if can_use_graph and is_all_decode and num_reqs <= self._max_cached_graphs: + self._perf.start("scaffold_fwd") + if num_reqs not in self._scaffold_graphs: + self._scaffold_graphs[num_reqs] = self._capture_graph(self.model, num_reqs, "scaffold") + scaffold_hidden = self._replay_graph(self._scaffold_graphs[num_reqs], inputs_embeds, positions, 
num_reqs) + self._perf.stop("scaffold_fwd") + + else: + self._perf.start("scaffold_fwd") + model_output = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) + self._perf.stop("scaffold_fwd") + if isinstance(model_output, IntermediateTensors): + return model_output + scaffold_hidden = model_output + if isinstance(scaffold_hidden, tuple): + scaffold_hidden = scaffold_hidden[0] # Phase 1: per-request FSQ + residual input token_offset = 0 @@ -571,7 +700,28 @@ def forward( if residual_inputs: batch_in = torch.cat(residual_inputs, dim=0) batch_pos = torch.cat(residual_positions, dim=0) - batch_out = self.residual_model(batch_pos, batch_in) + + residual_batch_size = batch_in.shape[0] + use_residual_graph = ( + self._enable_cuda_graph + and is_all_decode + and graph_ready + and residual_batch_size == num_reqs # 1 token per request + and residual_batch_size <= self._max_cached_graphs + ) + + self._perf.start("residual_fwd") + if use_residual_graph: + if residual_batch_size not in self._residual_graphs: + self._residual_graphs[residual_batch_size] = self._capture_graph( + self.residual_model, residual_batch_size, "residual", is_residual=True + ) + batch_out = self._replay_graph( + self._residual_graphs[residual_batch_size], batch_in, batch_pos, residual_batch_size + ) + else: + batch_out = self.residual_model(batch_pos, batch_in) + self._perf.stop("residual_fwd") # Phase 3: per-request LocDiT + update offset = 0 From 4bf4c6314741da606ff2b99efde5a83713cd8a22 Mon Sep 17 00:00:00 2001 From: IsleOfDawnlight Date: Wed, 15 Apr 2026 15:04:58 +0800 Subject: [PATCH 51/76] Add voxcpm model support. 
(#2467) Signed-off-by: Celeste-jq <591998922@qq.com> Signed-off-by: lyj-jjj Signed-off-by: IsleOfDawnlight Signed-off-by: Yueqian Lin Co-authored-by: Celeste-jq <591998922@qq.com> Co-authored-by: lyj-jjj Co-authored-by: Yueqian Lin --- .buildkite/test-ready.yml | 25 + benchmarks/voxcpm/README.md | 119 +++ .../voxcpm/vllm_omni/bench_tts_offline.py | 890 ++++++++++++++++++ .../voxcpm/vllm_omni/bench_tts_serve.py | 283 ++++++ .../voxcpm/vllm_omni/run_offline_matrix.py | 303 ++++++ examples/offline_inference/voxcpm/README.md | 123 +++ examples/offline_inference/voxcpm/end2end.py | 206 ++++ examples/online_serving/voxcpm/README.md | 166 ++++ .../voxcpm/openai_speech_client.py | 155 +++ examples/online_serving/voxcpm/run_server.sh | 38 + tests/e2e/offline_inference/test_voxcpm.py | 156 +++ tests/engine/test_arg_utils.py | 19 + .../openai_api/test_serving_speech_voxcpm.py | 143 +++ tests/entrypoints/test_utils.py | 33 + .../test_voxcpm_async_chunk.py | 87 ++ vllm_omni/engine/arg_utils.py | 3 + .../entrypoints/openai/serving_speech.py | 72 +- vllm_omni/model_executor/models/registry.py | 6 + .../model_executor/models/voxcpm/__init__.py | 7 + .../models/voxcpm/configuration_voxcpm.py | 3 + .../model_executor/models/voxcpm/voxcpm.py | 886 +++++++++++++++++ .../models/voxcpm/voxcpm_loader.py | 247 +++++ .../models/voxcpm/voxcpm_runtime_utils.py | 44 + .../models/voxcpm/voxcpm_stage_wrappers.py | 185 ++++ .../model_executor/stage_configs/voxcpm.yaml | 69 ++ .../stage_configs/voxcpm_async_chunk.yaml | 102 ++ .../stage_input_processors/voxcpm.py | 128 +++ .../platforms/npu/stage_configs/voxcpm.yaml | 67 ++ .../npu/stage_configs/voxcpm_async_chunk.yaml | 93 ++ .../transformers_utils/configs/__init__.py | 3 + .../transformers_utils/configs/voxcpm.py | 68 ++ 31 files changed, 4727 insertions(+), 2 deletions(-) create mode 100644 benchmarks/voxcpm/README.md create mode 100644 benchmarks/voxcpm/vllm_omni/bench_tts_offline.py create mode 100644 
benchmarks/voxcpm/vllm_omni/bench_tts_serve.py create mode 100644 benchmarks/voxcpm/vllm_omni/run_offline_matrix.py create mode 100644 examples/offline_inference/voxcpm/README.md create mode 100644 examples/offline_inference/voxcpm/end2end.py create mode 100644 examples/online_serving/voxcpm/README.md create mode 100644 examples/online_serving/voxcpm/openai_speech_client.py create mode 100755 examples/online_serving/voxcpm/run_server.sh create mode 100644 tests/e2e/offline_inference/test_voxcpm.py create mode 100644 tests/entrypoints/openai_api/test_serving_speech_voxcpm.py create mode 100644 tests/model_executor/stage_input_processors/test_voxcpm_async_chunk.py create mode 100644 vllm_omni/model_executor/models/voxcpm/__init__.py create mode 100644 vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py create mode 100644 vllm_omni/model_executor/models/voxcpm/voxcpm.py create mode 100644 vllm_omni/model_executor/models/voxcpm/voxcpm_loader.py create mode 100644 vllm_omni/model_executor/models/voxcpm/voxcpm_runtime_utils.py create mode 100644 vllm_omni/model_executor/models/voxcpm/voxcpm_stage_wrappers.py create mode 100644 vllm_omni/model_executor/stage_configs/voxcpm.yaml create mode 100644 vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml create mode 100644 vllm_omni/model_executor/stage_input_processors/voxcpm.py create mode 100644 vllm_omni/platforms/npu/stage_configs/voxcpm.yaml create mode 100644 vllm_omni/platforms/npu/stage_configs/voxcpm_async_chunk.yaml create mode 100644 vllm_omni/transformers_utils/configs/voxcpm.py diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 2f749f0ee9f..68f8e615286 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -295,6 +295,31 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" + - label: "VoxCPM E2E Test" + timeout_in_minutes: 20 + depends_on: upload-ready-pipeline + commands: + - | + timeout 20m bash -c ' + pip install voxcpm + export 
VLLM_LOGGING_LEVEL=DEBUG + export VLLM_WORKER_MULTIPROC_METHOD=spawn + pytest -s -v tests/e2e/offline_inference/test_voxcpm.py -m "core_model" --run-level "core_model" + ' + agents: + queue: "gpu_1_queue" + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + - label: "VoxCPM2 Native AR E2E Test" timeout_in_minutes: 20 depends_on: upload-ready-pipeline diff --git a/benchmarks/voxcpm/README.md b/benchmarks/voxcpm/README.md new file mode 100644 index 00000000000..17f904101bb --- /dev/null +++ b/benchmarks/voxcpm/README.md @@ -0,0 +1,119 @@ +# VoxCPM Benchmark + +This directory contains: + +- online serving benchmark through the OpenAI-compatible `/v1/audio/speech` API +- offline benchmark for `Omni` / `AsyncOmni` +- full offline smoke-matrix orchestration + +Both benchmark paths report: + +- TTFP: time to first PCM packet +- E2E latency +- RTF: real-time factor (`e2e / audio_duration`) + +## Offline Benchmark + +Single offline benchmark run: + +```bash +python benchmarks/voxcpm/vllm_omni/bench_tts_offline.py \ + --model /path/to/voxcpm-model \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm.yaml \ + --text "This is a split-stage VoxCPM synthesis example running on vLLM Omni." \ + --warmup-runs 1 \ + --output-dir benchmarks/voxcpm/results/offline_single +``` + +Streaming offline benchmark: + +```bash +python benchmarks/voxcpm/vllm_omni/bench_tts_offline.py \ + --model /path/to/voxcpm-model \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \ + --text "This is a split-stage VoxCPM streaming example running on vLLM Omni."
\ + --warmup-runs 1 \ + --output-dir benchmarks/voxcpm/results/offline_streaming +``` + +Full fixed offline matrix, equivalent to the old `examples/offline_inference/voxcpm/test.py`: + +```bash +python benchmarks/voxcpm/vllm_omni/run_offline_matrix.py \ + --model /path/to/voxcpm-model \ + --ref-audio /path/to/reference.wav \ + --ref-text "The exact transcript spoken in reference.wav." \ + --output-root benchmarks/voxcpm/results/offline_matrix +``` + +The full matrix covers both routes: + +- streaming: `voxcpm_async_chunk.yaml` +- sync: `voxcpm.yaml` + +And these six scenarios under each route: + +- warmup + single TTS +- warmup + single voice cloning +- warmup + batch TTS +- warmup + batch voice cloning +- cold single TTS +- cold single voice cloning + +`bench_tts_offline.py` itself no longer writes `summary.json` / `results.json`; it prints TTFP / RTF inline and saves generated WAV files only. The matrix runner keeps only per-case `run.log`. + +## Start the Server + +Async-chunk: + +```bash +vllm serve /path/to/voxcpm-model \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \ + --trust-remote-code \ + --enforce-eager \ + --omni \ + --port 8091 +``` + +Non-streaming: + +```bash +vllm serve /path/to/voxcpm-model \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm.yaml \ + --trust-remote-code \ + --enforce-eager \ + --omni \ + --port 8091 +``` + +## Run the Benchmark + +```bash +python benchmarks/voxcpm/vllm_omni/bench_tts_serve.py \ + --host 127.0.0.1 \ + --port 8091 \ + --num-prompts 20 \ + --max-concurrency 1 \ + --result-dir /tmp/voxcpm_bench +``` + +Voice cloning benchmark: + +```bash +python benchmarks/voxcpm/vllm_omni/bench_tts_serve.py \ + --host 127.0.0.1 \ + --port 8091 \ + --num-prompts 10 \ + --max-concurrency 1 \ + --ref-audio https://example.com/reference.wav \ + --ref-text "The exact transcript spoken in the reference audio." 
\ + --result-dir /tmp/voxcpm_clone_bench +``` + +## Notes + +- The benchmark uses `stream=true` and `response_format=pcm` so TTFP is measured from the first audio packet. +- `RTF < 1.0` means the server generates audio faster than real time. +- For `voxcpm_async_chunk.yaml`, keep concurrency at `1`. This matches native VoxCPM streaming more closely. +- Do not benchmark concurrent online streaming on `voxcpm_async_chunk.yaml`; use `voxcpm.yaml` for multi-request throughput runs. +- For the offline matrix mode, `--ref-audio` and `--ref-text` are required because clone cases are part of the fixed coverage set. diff --git a/benchmarks/voxcpm/vllm_omni/bench_tts_offline.py b/benchmarks/voxcpm/vllm_omni/bench_tts_offline.py new file mode 100644 index 00000000000..a3bad3e6928 --- /dev/null +++ b/benchmarks/voxcpm/vllm_omni/bench_tts_offline.py @@ -0,0 +1,890 @@ +"""Offline VoxCPM benchmark for vLLM Omni. + +Supports both: +- sync one-shot (Omni.generate) +- streaming (AsyncOmni.generate with async_chunk config) +- text-only synthesis +- voice cloning +- text/clone batch inputs from txt or jsonl +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +import tempfile +import time +import uuid +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import torch +from vllm.utils.argparse_utils import FlexibleArgumentParser + +from vllm_omni import AsyncOmni, Omni + +REPO_ROOT = Path(__file__).resolve().parents[3] +DEFAULT_STAGE_ASYNC = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm_async_chunk.yaml" +DEFAULT_STAGE_SYNC = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm.yaml" + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True, slots=True) +class PromptSpec: + text: str + label: str + ref_audio: str | None = None + ref_text: str | None = None + + +def _require_soundfile(): + try: + import soundfile as sf # type: ignore + except 
ModuleNotFoundError as exc: + raise RuntimeError( + "soundfile is required to write VoxCPM benchmark WAV outputs. Install it with: pip install soundfile" + ) from exc + return sf + + +def _build_prompt( + args, + *, + text: str, + ref_audio: str | None = None, + ref_text: str | None = None, + global_request_id: str | None = None, +) -> dict[str, Any]: + additional_information: dict[str, list[Any]] = { + "text": [text], + "cfg_value": [args.cfg_value], + "inference_timesteps": [args.inference_timesteps], + "min_len": [args.min_len], + "max_new_tokens": [args.max_new_tokens], + } + if args.streaming_prefix_len is not None: + additional_information["streaming_prefix_len"] = [args.streaming_prefix_len] + + if ref_audio: + additional_information["ref_audio"] = [ref_audio] + if ref_text: + additional_information["ref_text"] = [ref_text] + if global_request_id is not None: + additional_information["global_request_id"] = [global_request_id] + + return { + "prompt_token_ids": [1], + "additional_information": additional_information, + } + + +def _extract_audio_tensor(mm: dict[str, Any]) -> torch.Tensor: + audio = mm.get("audio", mm.get("model_outputs")) + if audio is None: + raise ValueError("No audio output found in multimodal output.") + if isinstance(audio, list): + parts = [torch.as_tensor(a).float().cpu().reshape(-1) for a in audio] + audio = torch.cat(parts, dim=-1) if parts else torch.zeros(0) + if not isinstance(audio, torch.Tensor): + audio = torch.as_tensor(audio) + return audio.float().cpu().reshape(-1) + + +def _extract_sample_rate(mm: dict[str, Any]) -> int: + sr_raw = mm.get("sr", 24000) + if isinstance(sr_raw, list) and sr_raw: + sr_raw = sr_raw[-1] + if hasattr(sr_raw, "item"): + return int(sr_raw.item()) + return int(sr_raw) + + +def _emit_offline_metrics( + *, + request_id: str, + elapsed_s: float, + first_audio_elapsed: float | None, + audio_duration_s: float, +) -> None: + metrics = { + "request_id": request_id, + "ttfp_ms": round(first_audio_elapsed * 
1000.0, 3) if first_audio_elapsed is not None else None, + "audio_duration_s": round(audio_duration_s, 6), + "rtf": round(elapsed_s / audio_duration_s, 6) if audio_duration_s > 0 else None, + } + print(f"[OfflineMetrics] {metrics}") + + +def _write_audio_tensor(output_path: Path, audio_tensor: Any, sample_rate: int) -> None: + sf = _require_soundfile() + if isinstance(audio_tensor, torch.Tensor): + audio_np = audio_tensor.float().cpu().clamp(-1.0, 1.0).numpy() + else: + audio_np = torch.as_tensor(audio_tensor).float().cpu().clamp(-1.0, 1.0).numpy() + sf.write( + output_path, + audio_np, + sample_rate, + format="WAV", + subtype="PCM_16", + ) + + +def _save_wav(mm: dict[str, Any], output_dir: Path, request_id: str) -> Path: + output_dir.mkdir(parents=True, exist_ok=True) + output_path = output_dir / f"output_{request_id}.wav" + _write_audio_tensor(output_path, _extract_audio_tensor(mm), _extract_sample_rate(mm)) + return output_path + + +def _iter_request_multimodal_outputs(request_output: Any): + outputs = getattr(request_output, "outputs", None) + if outputs: + for output in outputs: + mm = getattr(output, "multimodal_output", None) + if isinstance(mm, dict): + yield mm + + mm = getattr(request_output, "multimodal_output", None) + if isinstance(mm, dict): + yield mm + + +def _read_non_empty_lines(path: str) -> list[str]: + with open(path, encoding="utf-8") as f: + return [line.strip() for line in f if line.strip()] + + +def _load_prompt_specs(args) -> list[PromptSpec]: + specs: list[PromptSpec] = [] + + if args.txt_prompts is not None: + texts = _read_non_empty_lines(args.txt_prompts) + if not texts: + raise ValueError(f"No prompts found in {args.txt_prompts}") + for idx, text in enumerate(texts, start=1): + specs.append( + PromptSpec( + text=text, + label=f"item{idx:03d}", + ref_audio=args.ref_audio, + ref_text=args.ref_text, + ) + ) + return specs + + if args.jsonl_prompts is not None: + with open(args.jsonl_prompts, encoding="utf-8") as f: + for line_no, 
raw_line in enumerate(f, start=1): + line = raw_line.strip() + if not line: + continue + try: + item = json.loads(line) + except json.JSONDecodeError as exc: + raise ValueError(f"{args.jsonl_prompts}:{line_no} is not valid JSON: {exc}") from exc + if not isinstance(item, dict): + raise ValueError(f"{args.jsonl_prompts}:{line_no} must be a JSON object") + + text = item.get("text") + if not isinstance(text, str) or not text.strip(): + raise ValueError(f"{args.jsonl_prompts}:{line_no} requires non-empty string field 'text'") + + ref_audio = item.get("ref_audio", args.ref_audio) + ref_text = item.get("ref_text", args.ref_text) + if (ref_audio is None) != (ref_text is None): + raise ValueError( + f"{args.jsonl_prompts}:{line_no} must provide both 'ref_audio' and 'ref_text' together" + ) + + specs.append( + PromptSpec( + text=text.strip(), + label=f"item{len(specs) + 1:03d}", + ref_audio=ref_audio, + ref_text=ref_text, + ) + ) + + if not specs: + raise ValueError(f"No prompts found in {args.jsonl_prompts}") + return specs + + specs.append( + PromptSpec( + text=args.text, + label="item001", + ref_audio=args.ref_audio, + ref_text=args.ref_text, + ) + ) + return specs + + +def _build_prompt_for_spec(args, spec: PromptSpec, *, global_request_id: str | None = None) -> dict[str, Any]: + return _build_prompt( + args, + text=spec.text, + ref_audio=spec.ref_audio, + ref_text=spec.ref_text, + global_request_id=global_request_id, + ) + + +def _count_voice_clone_prompts(prompt_specs: list[PromptSpec]) -> int: + return sum(1 for spec in prompt_specs if spec.ref_audio is not None) + + +def _get_warmup_specs(prompt_specs: list[PromptSpec]) -> list[PromptSpec]: + return prompt_specs[:1] + + +def _extract_stream_finished(stage_output: Any) -> bool: + request_output = getattr(stage_output, "request_output", None) + request_finished = getattr(request_output, "finished", None) + if request_finished is not None: + return bool(request_finished) + return bool(getattr(stage_output, "finished", 
False)) + + +def _build_profiled_stage_config( + stage_configs_path: str, + profiler_dir: str, +) -> str: + stage_config_path = Path(stage_configs_path) + yaml_text = stage_config_path.read_text(encoding="utf-8") + injected_lines: list[str] = [] + injected_count = 0 + + for line in yaml_text.splitlines(): + injected_lines.append(line) + if line.strip() != "engine_args:": + continue + indent = line[: len(line) - len(line.lstrip())] + child_indent = indent + " " + grandchild_indent = child_indent + " " + injected_lines.extend( + [ + f"{child_indent}profiler_config:", + f'{grandchild_indent}profiler: "torch"', + f'{grandchild_indent}torch_profiler_dir: "{profiler_dir}"', + f"{grandchild_indent}torch_profiler_with_stack: true", + ] + ) + injected_count += 1 + + if injected_count == 0: + raise ValueError(f"No engine_args block found in stage config: {stage_configs_path}") + + tmp = tempfile.NamedTemporaryFile( + mode="w", + encoding="utf-8", + delete=False, + suffix=".yaml", + prefix=f"{stage_config_path.stem}_profile_", + ) + tmp.write("\n".join(injected_lines) + "\n") + tmp.close() + return tmp.name + + +def parse_args(): + parser = FlexibleArgumentParser( + description="Offline split-stage VoxCPM inference with vLLM Omni (auto sync/streaming by stage config)" + ) + parser.add_argument( + "--model", + type=str, + default=os.environ.get("VOXCPM_MODEL"), + help="Local VoxCPM model directory. Defaults to $VOXCPM_MODEL.", + ) + parser.add_argument( + "--text", + type=str, + default="This is a split-stage VoxCPM synthesis example running on vLLM Omni.", + help="Text to synthesize. Ignored when --txt-prompts or --jsonl-prompts is used.", + ) + parser.add_argument( + "--txt-prompts", + type=str, + default=None, + help="Path to a .txt file with one synthesis text per line.", + ) + parser.add_argument( + "--jsonl-prompts", + type=str, + default=None, + help=( + "Path to a .jsonl file. 
Each line must contain at least {'text': ...}; " + "clone rows can also set ref_audio/ref_text, and ref_text must be the " + "real transcript of ref_audio." + ), + ) + parser.add_argument( + "--ref-audio", + type=str, + default=None, + help=( + "Optional reference audio path for voice cloning. With --txt-prompts, " + "the same reference is applied to every line." + ), + ) + parser.add_argument( + "--ref-text", + type=str, + default=None, + help=( + "Real transcript of the reference audio. Placeholder text or mismatched " + "text will usually produce noisy/electronic clone audio." + ), + ) + parser.add_argument( + "--stage-configs-path", + type=str, + default=str(DEFAULT_STAGE_SYNC), + help="Stage config YAML path. Routing is selected only from this path.", + ) + parser.add_argument( + "--cfg-value", + type=float, + default=2.0, + help="Classifier-free guidance value for VoxCPM.", + ) + parser.add_argument( + "--inference-timesteps", + type=int, + default=10, + help="Number of inference timesteps.", + ) + parser.add_argument( + "--min-len", + type=int, + default=2, + help="Minimum generated token length.", + ) + parser.add_argument( + "--max-new-tokens", + type=int, + default=4096, + help="Maximum generated token length.", + ) + parser.add_argument( + "--streaming-prefix-len", + type=int, + default=None, + help="VoxCPM streaming window (optional, streaming mode only).", + ) + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Directory for output WAV files.", + ) + parser.add_argument( + "--stage-init-timeout", + type=int, + default=600, + help="Stage initialization timeout in seconds.", + ) + parser.add_argument( + "--log-stats", + dest="log_stats", + action="store_true", + help="Enable vLLM Omni stats logging.", + ) + parser.add_argument( + "--no-log-stats", + dest="log_stats", + action="store_false", + help="Disable vLLM Omni stats logging.", + ) + parser.set_defaults(log_stats=True) + parser.add_argument( + "--num-runs", + type=int, + 
default=1, + help="Number of full inference runs (same prompt each time). Default 1.", + ) + parser.add_argument( + "--warmup-runs", + type=int, + default=0, + help=( + "Optional number of warmup passes before measured runs. Warmup uses only " + "the first prompt and does not save outputs." + ), + ) + parser.add_argument( + "--enable-profiler", + action="store_true", + help=( + "Enable torch profiler for the configured stages. A temporary profiled " + "stage config is generated automatically." + ), + ) + parser.add_argument( + "--profiler-dir", + type=str, + default=None, + help="Directory for profiler traces. Defaults to /profiler when profiling is enabled.", + ) + parser.add_argument( + "--profiler-stages", + type=int, + nargs="*", + default=None, + help="Optional stage ids to profile. Defaults to all stages that have profiler_config.", + ) + parser.add_argument( + "--profiler-wait-seconds", + type=float, + default=30.0, + help="Seconds to wait after stop_profile for trace files to flush.", + ) + args = parser.parse_args() + + if not args.model: + parser.error("--model is required unless $VOXCPM_MODEL is set") + if args.txt_prompts is not None and args.jsonl_prompts is not None: + parser.error("--txt-prompts and --jsonl-prompts are mutually exclusive") + if (args.ref_audio is None) != (args.ref_text is None): + parser.error("--ref-audio and --ref-text must be provided together") + if args.num_runs < 1: + parser.error("--num-runs must be >= 1") + if args.warmup_runs < 0: + parser.error("--warmup-runs must be >= 0") + if args.output_dir is None: + args.output_dir = ( + "output_audio_streaming" if _is_streaming_stage_config(args.stage_configs_path) else "output_audio" + ) + if args.enable_profiler and args.profiler_dir is None: + args.profiler_dir = str(Path(args.output_dir) / "profiler") + try: + args.prompt_specs = _load_prompt_specs(args) + except ValueError as exc: + parser.error(str(exc)) + + return args + + +def _is_streaming_stage_config(stage_configs_path: 
str) -> bool: + cfg_name = Path(stage_configs_path).name.lower() + # Keep routing purely config-path based: + # - voxcpm.yaml => sync + # - voxcpm_async_chunk.yaml => streaming + return "async_chunk" in cfg_name + + +async def _collect_streaming_audio( + omni: AsyncOmni, + args: Any, + spec: PromptSpec, + request_id: str, + *, + phase_label: str, + prompt_index: int, + prompt_count: int, + print_prompt: bool = False, +) -> tuple[torch.Tensor, int, float, float | None]: + prompt = _build_prompt_for_spec(args, spec, global_request_id=request_id) + delta_chunks: list[torch.Tensor] = [] + sample_rate = 24000 + chunk_i = 0 + prev_total_samples = 0 + t_start = time.perf_counter() + first_audio_elapsed: float | None = None + + if print_prompt: + print(f"---prompt---:{prompt}") + + async for stage_output in omni.generate(prompt, request_id=request_id): + mm = getattr(stage_output, "multimodal_output", None) + if not isinstance(mm, dict): + ro = getattr(stage_output, "request_output", None) + if ro is None: + continue + mm = getattr(ro, "multimodal_output", None) + if not isinstance(mm, dict) and getattr(ro, "outputs", None): + seq = ro.outputs[0] + mm = getattr(seq, "multimodal_output", None) + if not isinstance(mm, dict): + continue + sample_rate = _extract_sample_rate(mm) + try: + w = _extract_audio_tensor(mm) + n = int(w.numel()) + if n == 0: + continue + finished = _extract_stream_finished(stage_output) + if n > prev_total_samples: + delta = w.reshape(-1)[prev_total_samples:] + prev_total_samples = n + elif finished and n == prev_total_samples: + delta = w.reshape(-1)[:0] + else: + delta = w.reshape(-1) + prev_total_samples += int(delta.numel()) + if int(delta.numel()) > 0: + delta_chunks.append(delta) + if first_audio_elapsed is None and int(delta.numel()) > 0: + first_audio_elapsed = time.perf_counter() - t_start + logger.info( + "%s prompt=%d/%d chunk=%d delta_samples=%d buf_len=%d finished=%s", + phase_label, + prompt_index + 1, + prompt_count, + chunk_i, + 
int(delta.numel()), + n, + finished, + ) + chunk_i += 1 + except ValueError: + if not _extract_stream_finished(stage_output): + logger.debug("skip non-audio partial output chunk=%d", chunk_i) + + if not delta_chunks: + raise RuntimeError("No audio chunks received; check stage config and logs.") + + audio_cat = torch.cat([c.reshape(-1) for c in delta_chunks], dim=0) + elapsed = time.perf_counter() - t_start + return audio_cat, sample_rate, elapsed, first_audio_elapsed + + +async def _abort_streaming_residual_work( + omni: AsyncOmni, + request_id: str, + *, + settle_seconds: float = 0.1, +) -> None: + """Stop any late stage-0 work once the final audio has been collected.""" + await omni.engine.abort_async([request_id]) + if settle_seconds > 0: + await asyncio.sleep(settle_seconds) + + +async def _run_streaming_single( + omni: AsyncOmni, + args: Any, + spec: PromptSpec, + output_dir: Path, + request_id: str, + *, + run_index: int, + num_runs: int, + prompt_index: int, + prompt_count: int, +) -> Path: + audio_cat, sample_rate, elapsed, first_audio_elapsed = await _collect_streaming_audio( + omni, + args, + spec, + request_id, + phase_label=f"run={run_index + 1}/{num_runs}", + prompt_index=prompt_index, + prompt_count=prompt_count, + print_prompt=(run_index == 0 and prompt_index == 0), + ) + await _abort_streaming_residual_work(omni, request_id) + output_path = output_dir / f"output_run{run_index + 1}_{spec.label}.wav" + _write_audio_tensor(output_path, audio_cat, sample_rate) + audio_duration_s = float(audio_cat.numel()) / float(sample_rate) if sample_rate > 0 else 0.0 + ttfp_text = f", ttfp={first_audio_elapsed:.2f}s" if first_audio_elapsed is not None else "" + rtf_text = f", rtf={elapsed / audio_duration_s:.3f}" if audio_duration_s > 0 else "" + print( + f"Saved (streaming) run {run_index + 1}/{num_runs}, " + f"prompt {prompt_index + 1}/{prompt_count}: {output_path} ({elapsed:.2f}s{ttfp_text}{rtf_text})" + ) + _emit_offline_metrics( + request_id=request_id, + 
elapsed_s=elapsed, + first_audio_elapsed=first_audio_elapsed, + audio_duration_s=audio_duration_s, + ) + return output_path + + +async def _run_streaming_warmup(args, omni: AsyncOmni) -> None: + if args.warmup_runs == 0: + return + + warmup_specs = _get_warmup_specs(args.prompt_specs) + print( + f"Warmup: {args.warmup_runs} run(s) using the first prompt " + f"({len(warmup_specs)} prompt(s)); outputs will be discarded." + ) + for warmup_index in range(args.warmup_runs): + t_warmup = time.perf_counter() + tasks = [] + request_ids: list[str] = [] + for prompt_index, spec in enumerate(warmup_specs): + request_id = f"warmup_stream_{warmup_index + 1}_{spec.label}_{uuid.uuid4().hex[:8]}" + request_ids.append(request_id) + tasks.append( + _collect_streaming_audio( + omni, + args, + spec, + request_id, + phase_label=f"warmup={warmup_index + 1}/{args.warmup_runs}", + prompt_index=prompt_index, + prompt_count=len(warmup_specs), + ) + ) + results = await asyncio.gather(*tasks) + for request_id in request_ids: + await _abort_streaming_residual_work(omni, request_id) + total_samples = sum(int(audio.numel()) for audio, _, _, _ in results) + warmup_ttfps = [ttfp for _, _, _, ttfp in results if ttfp is not None] + ttfp_text = f", ttfp={min(warmup_ttfps):.2f}s" if warmup_ttfps else "" + print( + f"Warmup (streaming) {warmup_index + 1}/{args.warmup_runs} finished: " + f"{len(results)} prompt(s), {total_samples} sample(s) " + f"({time.perf_counter() - t_warmup:.2f}s{ttfp_text})" + ) + + +async def _run_streaming(args) -> list[Path]: + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + omni = AsyncOmni( + model=args.model, + stage_configs_path=args.stage_configs_path, + log_stats=args.log_stats, + stage_init_timeout=args.stage_init_timeout, + ) + + await _run_streaming_warmup(args, omni) + profiler_started = False + if args.enable_profiler: + profile_prefix = f"voxcpm_streaming_{int(time.time())}" + stages_text = args.profiler_stages if 
args.profiler_stages is not None else "all-configured" + print(f"Starting profiler (streaming): stages={stages_text}, dir={args.profiler_dir}") + await omni.start_profile(profile_prefix=profile_prefix, stages=args.profiler_stages) + profiler_started = True + t_total = time.perf_counter() + total_elapsed = 0.0 + paths: list[Path] = [] + prompt_specs: list[PromptSpec] = args.prompt_specs + try: + for run in range(args.num_runs): + for prompt_index, spec in enumerate(prompt_specs): + request_id = f"stream_{run + 1}_{spec.label}_{uuid.uuid4().hex[:8]}" + paths.append( + await _run_streaming_single( + omni, + args, + spec, + output_dir, + request_id, + run_index=run, + num_runs=args.num_runs, + prompt_index=prompt_index, + prompt_count=len(prompt_specs), + ) + ) + total_elapsed = time.perf_counter() - t_total + finally: + if profiler_started: + print("Stopping profiler (streaming)...") + await omni.stop_profile(stages=args.profiler_stages) + if args.profiler_wait_seconds > 0: + print(f"Waiting {args.profiler_wait_seconds:.1f}s for profiler traces to flush...") + await asyncio.sleep(args.profiler_wait_seconds) + + print( + f"All streaming runs finished: {args.num_runs} run(s), " + f"{len(prompt_specs)} prompt(s), {len(paths)} file(s) in {total_elapsed:.2f}s total" + ) + return paths + + +def _run_sync(args) -> list[Path]: + output_dir = Path(args.output_dir) + + omni = Omni( + model=args.model, + stage_configs_path=args.stage_configs_path, + log_stats=args.log_stats, + stage_init_timeout=args.stage_init_timeout, + ) + + def _run_sync_single( + spec: PromptSpec, + *, + request_prefix: str, + save_outputs: bool, + run_index: int | None = None, + ) -> tuple[list[Path], int, float | None, float, float, str]: + global_request_id = f"{request_prefix}_{spec.label}" + prompt = _build_prompt_for_spec(args, spec, global_request_id=global_request_id) + if save_outputs and run_index == 0 and spec.label == "item001": + print(f"---prompt---:{prompt}") + + saved_paths: list[Path] = [] 
+ output_count = 0 + first_audio_elapsed: float | None = None + total_audio_duration_s = 0.0 + metrics_request_id = global_request_id + t_start = time.perf_counter() + for stage_outputs in omni.generate(prompt): + request_output = stage_outputs.request_output + if request_output is None: + continue + request_output_id = getattr(request_output, "request_id", None) + if isinstance(request_output_id, str) and request_output_id: + metrics_request_id = request_output_id + for j, mm in enumerate(_iter_request_multimodal_outputs(request_output)): + output_count += 1 + if first_audio_elapsed is None: + try: + audio_tensor = _extract_audio_tensor(mm) + if int(audio_tensor.numel()) > 0: + first_audio_elapsed = time.perf_counter() - t_start + total_audio_duration_s += float(audio_tensor.numel()) / float(_extract_sample_rate(mm)) + except ValueError: + pass + else: + try: + audio_tensor = _extract_audio_tensor(mm) + total_audio_duration_s += float(audio_tensor.numel()) / float(_extract_sample_rate(mm)) + except ValueError: + pass + if not save_outputs: + continue + save_stem = f"run{run_index + 1}_{spec.label}" if j == 0 else f"run{run_index + 1}_{spec.label}_{j}" + saved_paths.append(_save_wav(mm, output_dir, save_stem)) + + if output_count == 0: + raise RuntimeError("No output from Omni.generate") + elapsed_s = time.perf_counter() - t_start + return saved_paths, output_count, first_audio_elapsed, elapsed_s, total_audio_duration_s, metrics_request_id + + if args.warmup_runs: + warmup_specs = _get_warmup_specs(args.prompt_specs) + print( + f"Warmup: {args.warmup_runs} run(s) using the first prompt " + f"({len(warmup_specs)} prompt(s)); outputs will be discarded." 
+ ) + for warmup_index in range(args.warmup_runs): + t_warmup = time.perf_counter() + _, output_count, first_audio_elapsed, elapsed_s, audio_duration_s, _ = _run_sync_single( + warmup_specs[0], + request_prefix=f"warmup_sync{warmup_index + 1}", + save_outputs=False, + ) + ttfp_text = f", ttfp={first_audio_elapsed:.2f}s" if first_audio_elapsed is not None else "" + rtf_text = f", rtf={elapsed_s / audio_duration_s:.3f}" if audio_duration_s > 0 else "" + print( + f"Warmup (sync) {warmup_index + 1}/{args.warmup_runs} finished: " + f"{output_count} output(s) ({time.perf_counter() - t_warmup:.2f}s{ttfp_text}{rtf_text})" + ) + + profiler_started = False + if args.enable_profiler: + profile_prefix = f"voxcpm_sync_{int(time.time())}" + stages_text = args.profiler_stages if args.profiler_stages is not None else "all-configured" + print(f"Starting profiler (sync): stages={stages_text}, dir={args.profiler_dir}") + omni.start_profile(profile_prefix=profile_prefix, stages=args.profiler_stages) + profiler_started = True + + t_total = time.perf_counter() + total_elapsed = 0.0 + saved_paths: list[Path] = [] + prompt_specs: list[PromptSpec] = args.prompt_specs + try: + for run in range(args.num_runs): + t_run = time.perf_counter() + run_paths: list[Path] = [] + for prompt_index, spec in enumerate(prompt_specs): + prompt_paths, _, first_audio_elapsed, elapsed_s, audio_duration_s, metrics_request_id = ( + _run_sync_single( + spec, + request_prefix=f"sync_run{run + 1}_{prompt_index + 1:03d}", + save_outputs=True, + run_index=run, + ) + ) + run_paths.extend(prompt_paths) + ttfp_text = f", ttfp={first_audio_elapsed:.2f}s" if first_audio_elapsed is not None else "" + rtf_text = f", rtf={elapsed_s / audio_duration_s:.3f}" if audio_duration_s > 0 else "" + print( + f"Saved (sync) run {run + 1}/{args.num_runs}, " + f"prompt {prompt_index + 1}/{len(prompt_specs)}: {len(prompt_paths)} file(s){ttfp_text}{rtf_text}" + ) + _emit_offline_metrics( + request_id=metrics_request_id, + 
elapsed_s=elapsed_s, + first_audio_elapsed=first_audio_elapsed, + audio_duration_s=audio_duration_s, + ) + + saved_paths.extend(run_paths) + print( + f"Run {run + 1}/{args.num_runs} finished: {len(run_paths)} file(s) ({time.perf_counter() - t_run:.2f}s)" + ) + for path in run_paths: + print(f" {path}") + + total_elapsed = time.perf_counter() - t_total + finally: + if profiler_started: + print("Stopping profiler (sync)...") + omni.stop_profile(stages=args.profiler_stages) + if args.profiler_wait_seconds > 0: + print(f"Waiting {args.profiler_wait_seconds:.1f}s for profiler traces to flush...") + time.sleep(args.profiler_wait_seconds) + + print( + f"All sync runs finished: {args.num_runs} run(s), " + f"{len(prompt_specs)} prompt(s), {len(saved_paths)} file(s) in {total_elapsed:.2f}s total" + ) + return saved_paths + + +def main(args) -> int: + logging.basicConfig(level=logging.INFO) + profiled_stage_config_path: str | None = None + original_stage_config_path = args.stage_configs_path + if args.enable_profiler: + Path(args.profiler_dir).mkdir(parents=True, exist_ok=True) + profiled_stage_config_path = _build_profiled_stage_config( + args.stage_configs_path, + str(Path(args.profiler_dir).resolve()), + ) + args.stage_configs_path = profiled_stage_config_path + + is_streaming = _is_streaming_stage_config(args.stage_configs_path) + voice_clone_count = _count_voice_clone_prompts(args.prompt_specs) + print(f"Model: {args.model}") + print(f"Stage config: {original_stage_config_path}") + print(f"Route: {'streaming' if is_streaming else 'sync'} (from stage-configs-path)") + print(f"Prompt count: {len(args.prompt_specs)}") + print("Batch mode: sequential (aligned with native VoxCPM)") + print(f"Warmup runs: {args.warmup_runs}") + print(f"Voice cloning prompts: {voice_clone_count}/{len(args.prompt_specs)}") + if args.enable_profiler: + print(f"Profiler: enabled (dir={args.profiler_dir}, stages={args.profiler_stages or 'all-configured'})") + print(f"Profiled stage config: 
{args.stage_configs_path}") + if voice_clone_count: + print("Voice cloning note: --ref-text/ref_text must match the spoken content of the reference audio.") + print(f"Num runs: {args.num_runs}") + try: + if is_streaming: + asyncio.run(_run_streaming(args)) + else: + _run_sync(args) + finally: + if profiled_stage_config_path is not None and os.path.exists(profiled_stage_config_path): + os.unlink(profiled_stage_config_path) + return 0 + + +if __name__ == "__main__": + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + raise SystemExit(main(parse_args())) diff --git a/benchmarks/voxcpm/vllm_omni/bench_tts_serve.py b/benchmarks/voxcpm/vllm_omni/bench_tts_serve.py new file mode 100644 index 00000000000..816df32796d --- /dev/null +++ b/benchmarks/voxcpm/vllm_omni/bench_tts_serve.py @@ -0,0 +1,283 @@ +"""Benchmark VoxCPM via /v1/audio/speech. + +Reports TTFP (time to first packet), E2E latency, and RTF (real-time factor). +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import time +from dataclasses import asdict, dataclass, field +from datetime import datetime +from pathlib import Path + +import aiohttp +import numpy as np +from tqdm.asyncio import tqdm + +DEFAULT_MODEL = "OpenBMB/VoxCPM1.5" +DEFAULT_SAMPLE_RATE = 24000 +PROMPTS = [ + "Hello, welcome to the VoxCPM speech benchmark.", + "This is a short benchmark prompt for online text-to-speech generation.", + "The quick brown fox jumps over the lazy dog near the riverbank.", + "Please remember to bring your identification documents tomorrow morning.", + "Learning a new language takes patience, practice, and curiosity.", + "This benchmark reports TTFP and RTF for the VoxCPM online serving path.", +] + + +@dataclass +class RequestResult: + success: bool = False + ttfp: float = 0.0 + e2e: float = 0.0 + audio_bytes: int = 0 + audio_duration: float = 0.0 + rtf: float = 0.0 + prompt: str = "" + error: str = "" + + +@dataclass +class BenchmarkResult: + concurrency: int = 0 + 
async def send_tts_request(
    session: aiohttp.ClientSession,
    api_url: str,
    *,
    model: str,
    prompt: str,
    ref_audio: str | None,
    ref_text: str | None,
    pbar: tqdm | None = None,
) -> RequestResult:
    """Send one streaming PCM speech request and measure its latency.

    Args:
        session: Open aiohttp client session used to POST the request.
        api_url: Full URL of the speech endpoint.
        model: Model name placed in the request payload.
        prompt: Text to synthesize.
        ref_audio: Optional reference audio (URL or data URL) for voice cloning.
        ref_text: Optional transcript of the reference audio.
        pbar: Optional progress bar; advanced by exactly one step per request,
            whether the request succeeds, fails with an HTTP error, or raises.

    Returns:
        A ``RequestResult``. ``success`` is True only when the server answered
        with HTTP 200 and the body was streamed to the end. ``e2e`` is always
        populated; ``error`` describes the failure otherwise.
    """
    payload: dict[str, object] = {
        "model": model,
        "input": prompt,
        "stream": True,
        "response_format": "pcm",
    }
    if ref_audio is not None:
        payload["ref_audio"] = ref_audio
    if ref_text is not None:
        payload["ref_text"] = ref_text

    result = RequestResult(prompt=prompt)
    total_bytes = 0
    streamed = False
    started_at = time.perf_counter()

    try:
        async with session.post(api_url, json=payload) as response:
            http_ok = response.status == 200
            if http_ok:
                async for chunk in response.content.iter_any():
                    if not chunk:
                        continue
                    if total_bytes == 0:
                        # First non-empty chunk defines time-to-first-packet.
                        result.ttfp = time.perf_counter() - started_at
                    total_bytes += len(chunk)
            else:
                result.error = f"HTTP {response.status}: {await response.text()}"
        # Only reached when the response context exited without raising.
        streamed = http_ok
    except Exception as e:
        # Benchmark boundary: a failed request is recorded, not propagated.
        # Some exceptions (e.g. timeouts) have an empty str(); fall back to repr.
        result.error = str(e) or repr(e)
    finally:
        # Runs on every path (including HTTP errors) so that the elapsed time is
        # recorded and the progress bar always reaches its total.
        result.e2e = time.perf_counter() - started_at
        if pbar is not None:
            pbar.update(1)

    if streamed:
        result.audio_bytes = total_bytes
        result.audio_duration = pcm_bytes_to_duration(total_bytes)
        if result.audio_duration > 0:
            result.rtf = result.e2e / result.audio_duration
        result.success = True
    return result
async def main_async(args) -> None:
    """Run the benchmark once per requested concurrency level and save the results.

    Each level in ``args.max_concurrency`` is benchmarked in order, its summary
    is printed, and all results are written as JSON to
    ``<args.result_dir>/bench_tts_serve.json`` (the directory is created if needed).

    Args:
        args: Parsed command-line namespace produced by ``parse_args``.
    """
    # Local import: the module only imports ``datetime`` from the datetime package.
    from datetime import timezone

    result_dir = Path(args.result_dir)
    result_dir.mkdir(parents=True, exist_ok=True)

    all_results: list[BenchmarkResult] = []
    for concurrency in args.max_concurrency:
        result = await run_benchmark(
            host=args.host,
            port=args.port,
            model=args.model,
            num_prompts=args.num_prompts,
            max_concurrency=concurrency,
            num_warmups=args.num_warmups,
            ref_audio=args.ref_audio,
            ref_text=args.ref_text,
        )
        print_summary(result)
        all_results.append(result)

    # ``datetime.utcnow()`` is deprecated since Python 3.12. Use an aware UTC
    # timestamp and keep the previous "...Z" suffix in the stored value.
    created_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    payload = {
        "model": args.model,
        "created_at": created_at,
        "results": [asdict(result) for result in all_results],
    }
    result_path = result_dir / "bench_tts_serve.json"
    result_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
    print(f"Saved results to: {result_path}")
@dataclass(frozen=True, slots=True)
class CaseResult:
    """Immutable record describing how one matrix case ended."""

    # Name of the mode the case ran under.
    mode: str
    # Name of the executed case.
    case: str
    # Exit code of the launched benchmark process.
    returncode: int
    # Wall-clock duration of the case in seconds.
    elapsed_s: float
    # Directory that received the case outputs.
    output_dir: Path
    # File holding the captured output of the case.
    log_path: Path

    @property
    def ok(self) -> bool:
        """Return True when the case exited with return code 0."""
        return self.returncode == 0
def _write_lines(path: Path, lines: list[str]) -> None:
    """Write *lines* to *path* as UTF-8 text, one entry per line.

    Missing parent directories are created first. The file always ends with a
    trailing newline, so an empty *lines* list yields a single newline.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    content = "\n".join(lines)
    path.write_text(f"{content}\n", encoding="utf-8")
def _build_case_command(
    args,
    mode: ModeSpec,
    case: CaseSpec,
    *,
    batch_tts_path: Path,
    batch_clone_path: Path,
    output_dir: Path,
) -> list[str]:
    """Assemble the full command line that runs one matrix case.

    Starts from the shared base command and appends the warmup count, the
    prompt source (inline text for ``"single"`` cases, a prompt file otherwise)
    and, for voice-clone cases, the reference audio and transcript.
    """
    command = _base_command(args, mode, output_dir)
    command += ["--warmup-runs", str(case.warmup_runs)]

    cloning = case.voice_clone
    if case.prompt_kind == "single":
        text = SINGLE_CLONE_TEXT if cloning else SINGLE_TTS_TEXT
        command += ["--text", text]
    else:
        prompts_path = batch_clone_path if cloning else batch_tts_path
        command += ["--txt-prompts", str(prompts_path)]

    if cloning:
        command += ["--ref-audio", args.ref_audio, "--ref-text", args.ref_text]
    return command
def parse_args():
    """Parse command-line options for the offline smoke matrix runner.

    Returns:
        The parsed namespace. ``--model``, ``--ref-audio`` and ``--ref-text``
        are required.

    Raises:
        SystemExit: Via ``parser.error`` when ``--num-runs`` is smaller than 1
            (and for the usual argparse usage errors).
    """
    parser = FlexibleArgumentParser(description="Run the full offline VoxCPM smoke matrix.")
    parser.add_argument("--model", type=str, required=True, help="Local VoxCPM model directory.")
    parser.add_argument("--ref-audio", type=str, required=True, help="Reference audio path for clone cases.")
    parser.add_argument("--ref-text", type=str, required=True, help="Exact transcript spoken in --ref-audio.")
    parser.add_argument("--output-root", type=str, default=str(DEFAULT_OUTPUT_ROOT), help="Root directory for outputs.")
    parser.add_argument("--python", type=str, default=sys.executable, help="Python executable used to launch cases.")
    parser.add_argument("--stage-init-timeout", type=int, default=600, help="Stage initialization timeout in seconds.")
    # --log-stats / --no-log-stats share one destination; stats logging is on by default.
    parser.add_argument("--log-stats", dest="log_stats", action="store_true", help="Enable vLLM Omni stats logging.")
    parser.add_argument(
        "--no-log-stats",
        dest="log_stats",
        action="store_false",
        help="Disable vLLM Omni stats logging.",
    )
    parser.set_defaults(log_stats=True)
    parser.add_argument("--num-runs", type=int, default=1, help="Number of measured runs per case.")
    parser.add_argument("--cfg-value", type=float, default=2.0, help="Classifier-free guidance value for VoxCPM.")
    parser.add_argument("--inference-timesteps", type=int, default=10, help="Number of inference timesteps.")
    parser.add_argument("--min-len", type=int, default=2, help="Minimum generated token length.")
    parser.add_argument("--max-new-tokens", type=int, default=4096, help="Maximum generated token length.")
    parser.add_argument(
        "--streaming-prefix-len",
        type=int,
        default=None,
        help="Optional VoxCPM streaming window passed to streaming cases.",
    )
    parser.add_argument("--enable-profiler", action="store_true", help="Enable torch profiler for each case.")
    parser.add_argument(
        "--profiler-dir",
        type=str,
        default=None,
        # When omitted, each case falls back to a "profiler" folder inside its own
        # output directory; the previous help text had lost that placeholder.
        help="Profiler output root. Defaults to <case-output-dir>/profiler.",
    )
    parser.add_argument(
        "--profiler-stages",
        type=int,
        nargs="*",
        default=None,
        help="Optional stage ids to profile. Defaults to all configured stages.",
    )
    parser.add_argument(
        "--profiler-wait-seconds",
        type=float,
        default=30.0,
        help="Seconds to wait after stopping profiler for traces to flush.",
    )
    args = parser.parse_args()
    if args.num_runs < 1:
        parser.error("--num-runs must be >= 1")
    return args
[{result.mode}] {result.case}: see {result.log_path}") + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(parse_args())) diff --git a/examples/offline_inference/voxcpm/README.md b/examples/offline_inference/voxcpm/README.md new file mode 100644 index 00000000000..1eaea9b0dba --- /dev/null +++ b/examples/offline_inference/voxcpm/README.md @@ -0,0 +1,123 @@ +# VoxCPM Offline Example + +This directory contains the minimal offline VoxCPM example for vLLM Omni. + +`end2end.py` is intentionally small and only covers: + +- single text-to-speech +- single voice cloning with `ref_audio` + `ref_text` +- non-streaming with `vllm_omni/model_executor/stage_configs/voxcpm.yaml` +- streaming with `vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml` + +Advanced workflows were moved out of the getting-started example: + +- `benchmarks/voxcpm/vllm_omni/bench_tts_offline.py`: warmup, batch prompts, profiler, offline TTFP / RTF +- `benchmarks/voxcpm/vllm_omni/run_offline_matrix.py`: fixed offline smoke matrix +- `benchmarks/voxcpm/`: benchmark scripts and benchmark docs + +## Prerequisites + +Install VoxCPM in one of these ways: + +```bash +pip install voxcpm +``` + +or point vLLM Omni to the local VoxCPM source tree: + +```bash +export VLLM_OMNI_VOXCPM_CODE_PATH=/path/to/VoxCPM/src +``` + +The example writes WAV files with `soundfile`: + +```bash +pip install soundfile +``` + +## Model Path + +Pass the native VoxCPM model directory directly: + +```bash +export VOXCPM_MODEL=/path/to/voxcpm-model +``` + +If the native VoxCPM `config.json` does not contain HuggingFace metadata such as +`model_type`, prepare a persistent HF-compatible config directory and point the +stage configs to it with `VLLM_OMNI_VOXCPM_HF_CONFIG_PATH`: + +```bash +export VLLM_OMNI_VOXCPM_HF_CONFIG_PATH=/tmp/voxcpm_hf_config +mkdir -p "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH" +cp "$VOXCPM_MODEL/config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/config.json" +cp 
"$VOXCPM_MODEL/generation_config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/generation_config.json" 2>/dev/null || true +python3 -c 'import json, os; p=os.path.join(os.environ["VLLM_OMNI_VOXCPM_HF_CONFIG_PATH"], "config.json"); cfg=json.load(open(p, "r", encoding="utf-8")); cfg["model_type"]="voxcpm"; cfg.setdefault("architectures", ["VoxCPMForConditionalGeneration"]); json.dump(cfg, open(p, "w", encoding="utf-8"), indent=2, ensure_ascii=False)' +``` + +If the model directory itself already has `model_type`, this extra directory is +not required. + +## Quick Start + +Single text-to-speech, non-streaming: + +```bash +python examples/offline_inference/voxcpm/end2end.py \ + --model "$VOXCPM_MODEL" \ + --text "This is a split-stage VoxCPM synthesis example running on vLLM Omni." +``` + +Single voice cloning, non-streaming: + +```bash +python examples/offline_inference/voxcpm/end2end.py \ + --model "$VOXCPM_MODEL" \ + --text "This sentence is synthesized with a cloned voice." \ + --ref-audio /path/to/reference.wav \ + --ref-text "The exact transcript spoken in reference.wav." +``` + +Streaming: + +```bash +python examples/offline_inference/voxcpm/end2end.py \ + --model "$VOXCPM_MODEL" \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \ + --text "This is a split-stage VoxCPM streaming example running on vLLM Omni." +``` + +By default, `end2end.py` writes to `output_audio/` for non-streaming and +`output_audio_streaming/` for streaming. + +## Advanced Workflows + +Use `benchmarks/voxcpm/vllm_omni/bench_tts_offline.py` when you need: + +- warmup runs +- prompt files +- batch JSONL inputs +- profiler injection +- offline TTFP / RTF emission + +Use `benchmarks/voxcpm/vllm_omni/run_offline_matrix.py` when you need the fixed offline smoke matrix that previously lived in `test.py`. 
+ +Full matrix benchmark example: + +```bash +python benchmarks/voxcpm/vllm_omni/run_offline_matrix.py \ + --model "$VOXCPM_MODEL" \ + --ref-audio /path/to/reference.wav \ + --ref-text "The exact transcript spoken in reference.wav." +``` + +For online serving examples, see [examples/online_serving/voxcpm](../../online_serving/voxcpm/README.md). + +For benchmark reporting, see [benchmarks/voxcpm](../../../benchmarks/voxcpm/README.md). + +## Notes + +- `voxcpm.yaml` is the default non-streaming stage config. +- `voxcpm_async_chunk.yaml` is the streaming stage config. +- Streaming is currently single-request oriented; the fixed smoke matrix now lives in `benchmarks/voxcpm/vllm_omni/run_offline_matrix.py`. +- `ref_text` must be the real transcript of the reference audio. Mismatched text usually causes obvious quality degradation. diff --git a/examples/offline_inference/voxcpm/end2end.py b/examples/offline_inference/voxcpm/end2end.py new file mode 100644 index 00000000000..980410feaeb --- /dev/null +++ b/examples/offline_inference/voxcpm/end2end.py @@ -0,0 +1,206 @@ +"""Minimal offline VoxCPM example for vLLM Omni.""" + +from __future__ import annotations + +import asyncio +import time +from pathlib import Path +from typing import Any + +import soundfile as sf +import torch +from vllm.utils.argparse_utils import FlexibleArgumentParser + +from vllm_omni import AsyncOmni, Omni + +REPO_ROOT = Path(__file__).resolve().parents[3] +DEFAULT_SYNC_STAGE_CONFIG = REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm.yaml" + + +def _build_prompt(args) -> dict[str, Any]: + additional_information: dict[str, list[Any]] = { + "text": [args.text], + "cfg_value": [args.cfg_value], + "inference_timesteps": [args.inference_timesteps], + "min_len": [args.min_len], + "max_new_tokens": [args.max_new_tokens], + } + if args.streaming_prefix_len is not None: + additional_information["streaming_prefix_len"] = [args.streaming_prefix_len] + if args.ref_audio is not None: + 
def _extract_sample_rate(mm: dict[str, Any]) -> int:
    """Return the sample rate stored under ``"sr"`` in a multimodal output dict.

    The value may be a plain number, an object exposing ``.item()`` (such as a
    scalar tensor), or a list/tuple of those, in which case the last entry is
    used. A missing key, an explicit ``None`` or an empty sequence falls back
    to 24000 instead of raising ``TypeError``.

    Args:
        mm: Multimodal output dictionary.

    Returns:
        The sample rate in Hz as an ``int``.
    """
    default_rate = 24000
    sr_raw = mm.get("sr", default_rate)
    if isinstance(sr_raw, (list, tuple)):
        # Use the most recent entry; an empty sequence carries no information.
        sr_raw = sr_raw[-1] if sr_raw else default_rate
    if sr_raw is None:
        return default_rate
    if hasattr(sr_raw, "item"):
        return int(sr_raw.item())
    return int(sr_raw)
def _run_sync(args) -> Path:
    """Run one non-streaming synthesis and save the result as a WAV file.

    The last dict-valued ``multimodal_output`` seen while iterating the engine
    outputs is treated as the final audio.

    Args:
        args: Parsed command-line namespace produced by ``parse_args``.

    Returns:
        Path of the written WAV file.

    Raises:
        RuntimeError: If no multimodal output was produced.
    """
    prompt = _build_prompt(args)
    output_dir = Path(args.output_dir) if args.output_dir is not None else Path("output_audio")
    request_id = "sync_example"
    started = time.perf_counter()
    last_mm: dict[str, Any] | None = None
    omni = Omni(
        model=args.model,
        stage_configs_path=args.stage_configs_path,
        log_stats=args.log_stats,
        stage_init_timeout=args.stage_init_timeout,
    )
    try:
        for stage_outputs in omni.generate(prompt):
            request_output = getattr(stage_outputs, "request_output", None)
            if request_output is None:
                continue
            outputs = getattr(request_output, "outputs", None)
            if outputs:
                for output in outputs:
                    mm = getattr(output, "multimodal_output", None)
                    if isinstance(mm, dict):
                        last_mm = mm
            # A request-level multimodal output takes precedence over per-output ones.
            mm = getattr(request_output, "multimodal_output", None)
            if isinstance(mm, dict):
                last_mm = mm
        if last_mm is None:
            raise RuntimeError("No audio output received from VoxCPM.")
        output_path = _save_audio(
            _extract_audio_tensor(last_mm),
            _extract_sample_rate(last_mm),
            output_dir,
            request_id,
        )
        print(f"Saved audio to: {output_path} ({time.perf_counter() - started:.2f}s)")
        return output_path
    finally:
        # Release the engine on every exit path, mirroring the streaming route.
        # NOTE(review): the streaming path calls AsyncOmni.shutdown(); confirm the
        # sync Omni exposes the same method. The guard makes this a no-op otherwise.
        shutdown = getattr(omni, "shutdown", None)
        if callable(shutdown):
            shutdown()
def main(args) -> None:
    """Print the run configuration and dispatch to the streaming or sync route.

    The route is chosen from the stage config path: streaming stage configs are
    driven through the asyncio-based runner, everything else through the
    synchronous one.
    """
    streaming = _is_streaming_stage_config(args.stage_configs_path)
    print(f"Model: {args.model}")
    print(f"Stage config: {args.stage_configs_path}")
    print(f"Route: {'streaming' if streaming else 'sync'}")
    if not streaming:
        _run_sync(args)
        return
    asyncio.run(_run_streaming(args))
+ +## Launch the Server + +Use the async-chunk stage config by default: + +```bash +export VOXCPM_MODEL=/path/to/voxcpm-model +cd examples/online_serving/voxcpm +./run_server.sh +``` + +Use the non-streaming stage config: + +```bash +./run_server.sh sync +``` + +You can also launch the server directly: + +```bash +vllm serve "$VOXCPM_MODEL" \ + --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \ + --trust-remote-code \ + --enforce-eager \ + --omni \ + --port 8091 +``` + +## Send Requests + +### Basic text-to-speech + +```bash +python openai_speech_client.py \ + --model "$VOXCPM_MODEL" \ + --text "This is a VoxCPM online text-to-speech example." +``` + +### Voice cloning + +```bash +python openai_speech_client.py \ + --model "$VOXCPM_MODEL" \ + --text "This sentence is synthesized with a cloned voice." \ + --ref-audio /path/to/reference.wav \ + --ref-text "The exact transcript spoken in reference.wav." +``` + +`ref_text` must be the real transcript of the reference audio. Placeholder text or mismatched text will usually degrade quality badly. + +### Streaming PCM output + +```bash +python openai_speech_client.py \ + --model "$VOXCPM_MODEL" \ + --text "This is a streaming VoxCPM request." 
\ + --stream \ + --output voxcpm_stream.pcm +``` + +### Using curl + +```bash +curl -X POST http://localhost:8091/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{ + "model": "OpenBMB/VoxCPM1.5", + "input": "Hello from VoxCPM online serving.", + "response_format": "wav" + }' --output output.wav +``` + +Voice cloning: + +```bash +curl -X POST http://localhost:8091/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{ + "model": "OpenBMB/VoxCPM1.5", + "input": "This sentence uses a cloned voice.", + "ref_audio": "https://example.com/reference.wav", + "ref_text": "The exact transcript spoken in the reference audio.", + "response_format": "wav" + }' --output cloned.wav +``` + +Streaming PCM: + +```bash +curl -X POST http://localhost:8091/v1/audio/speech \ + -H "Content-Type: application/json" \ + -d '{ + "model": "OpenBMB/VoxCPM1.5", + "input": "This is a streaming VoxCPM request.", + "stream": true, + "response_format": "pcm" + }' --output output.pcm +``` + +## Supported Request Shape + +VoxCPM online serving currently supports: + +- plain text-to-speech +- voice cloning with `ref_audio` + `ref_text` +- `stream=true` with `response_format=pcm` or `wav` + +VoxCPM online serving does not use these generic TTS fields: + +- `voice` +- `instructions` +- `language` +- `speaker_embedding` +- `x_vector_only_mode` + +## Streaming vs Non-Streaming + +- `voxcpm_async_chunk.yaml` enables async-chunk streaming and is best for single-request streaming latency. +- `voxcpm.yaml` performs one-shot latent generation then VAE decode. + +Like native VoxCPM, the async streaming path should be treated as single-request. If you need stable throughput benchmarking, prefer `voxcpm.yaml`. + +Do not use `voxcpm_async_chunk.yaml` for concurrent online streaming or `/v1/audio/speech/batch`. For multiple requests, prefer `voxcpm.yaml`. 
+ +## Benchmark + +The serving benchmark reports TTFP and RTF: + +```bash +python benchmarks/voxcpm/vllm_omni/bench_tts_serve.py \ + --host 127.0.0.1 \ + --port 8091 \ + --num-prompts 10 \ + --max-concurrency 1 \ + --result-dir /tmp/voxcpm_bench +``` + +For the async-chunk server, keep `--max-concurrency 1`. diff --git a/examples/online_serving/voxcpm/openai_speech_client.py b/examples/online_serving/voxcpm/openai_speech_client.py new file mode 100644 index 00000000000..c400114e8be --- /dev/null +++ b/examples/online_serving/voxcpm/openai_speech_client.py @@ -0,0 +1,155 @@ +"""OpenAI-compatible client for VoxCPM via /v1/audio/speech. + +Examples: + # Basic text-to-speech + python openai_speech_client.py --text "Hello from VoxCPM" + + # Voice cloning + python openai_speech_client.py \ + --text "This sentence uses the cloned voice." \ + --ref-audio /path/to/reference.wav \ + --ref-text "The exact transcript spoken in the reference audio." + + # Streaming PCM output + python openai_speech_client.py \ + --text "This is a streaming VoxCPM request." 
\ + --stream \ + --output output.pcm +""" + +import argparse +import base64 +import os + +import httpx + +DEFAULT_API_BASE = "http://localhost:8091" +DEFAULT_API_KEY = "EMPTY" +DEFAULT_MODEL = "OpenBMB/VoxCPM1.5" + + +def encode_audio_to_base64(audio_path: str) -> str: + """Encode a local audio file to base64 data URL.""" + if not os.path.exists(audio_path): + raise FileNotFoundError(f"Audio file not found: {audio_path}") + + ext = audio_path.lower().rsplit(".", 1)[-1] + mime_map = { + "wav": "audio/wav", + "mp3": "audio/mpeg", + "flac": "audio/flac", + "ogg": "audio/ogg", + } + mime_type = mime_map.get(ext, "audio/wav") + + with open(audio_path, "rb") as f: + audio_b64 = base64.b64encode(f.read()).decode("utf-8") + return f"data:{mime_type};base64,{audio_b64}" + + +def build_payload(args) -> dict[str, object]: + payload: dict[str, object] = { + "model": args.model, + "input": args.text, + "response_format": "pcm" if args.stream else args.response_format, + } + + if args.ref_audio: + if args.ref_audio.startswith(("http://", "https://", "data:")): + payload["ref_audio"] = args.ref_audio + else: + payload["ref_audio"] = encode_audio_to_base64(args.ref_audio) + if args.ref_text: + payload["ref_text"] = args.ref_text + if args.max_new_tokens is not None: + payload["max_new_tokens"] = args.max_new_tokens + if args.stream: + payload["stream"] = True + + return payload + + +def run_tts(args) -> None: + payload = build_payload(args) + api_url = f"{args.api_base}/v1/audio/speech" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {args.api_key}", + } + + print(f"Model: {args.model}") + print(f"Text: {args.text}") + if args.ref_audio: + print("Mode: voice cloning") + print(f"Reference audio: {args.ref_audio}") + else: + print("Mode: text-to-speech") + + if args.stream: + output_path = args.output or "voxcpm_output.pcm" + with httpx.Client(timeout=300.0) as client: + with client.stream("POST", api_url, json=payload, headers=headers) as response: + 
if response.status_code != 200: + print(f"Error: {response.status_code}") + print(response.read().decode("utf-8", errors="ignore")) + return + + total_bytes = 0 + with open(output_path, "wb") as f: + for chunk in response.iter_bytes(): + if not chunk: + continue + f.write(chunk) + total_bytes += len(chunk) + print(f"Streamed {total_bytes} bytes to: {output_path}") + return + + with httpx.Client(timeout=300.0) as client: + response = client.post(api_url, json=payload, headers=headers) + + if response.status_code != 200: + print(f"Error: {response.status_code}") + print(response.text) + return + + try: + text = response.content.decode("utf-8") + if text.startswith('{"error"'): + print(f"Error: {text}") + return + except UnicodeDecodeError: + pass + + output_path = args.output or "voxcpm_output.wav" + with open(output_path, "wb") as f: + f.write(response.content) + print(f"Audio saved to: {output_path}") + + +def main(): + parser = argparse.ArgumentParser(description="VoxCPM OpenAI-compatible speech client") + parser.add_argument("--api-base", default=DEFAULT_API_BASE, help="API base URL") + parser.add_argument("--api-key", default=DEFAULT_API_KEY, help="API key") + parser.add_argument("--model", "-m", default=DEFAULT_MODEL, help="Model name or path") + parser.add_argument("--text", required=True, help="Text to synthesize") + parser.add_argument("--ref-audio", default=None, help="Reference audio path, URL, or data URL") + parser.add_argument( + "--ref-text", + default=None, + help="The exact transcript spoken in the reference audio", + ) + parser.add_argument("--stream", action="store_true", help="Enable streaming PCM output") + parser.add_argument( + "--response-format", + default="wav", + choices=["wav", "pcm", "flac", "mp3", "aac", "opus"], + help="Audio format for non-streaming mode (default: wav)", + ) + parser.add_argument("--max-new-tokens", type=int, default=None, help="Maximum tokens to generate") + parser.add_argument("--output", "-o", default=None, 
help="Output file path") + args = parser.parse_args() + run_tts(args) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/voxcpm/run_server.sh b/examples/online_serving/voxcpm/run_server.sh new file mode 100755 index 00000000000..ab4b6fe854e --- /dev/null +++ b/examples/online_serving/voxcpm/run_server.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Launch vLLM-Omni server for VoxCPM online speech serving. +# +# Usage: +# ./run_server.sh # default: async_chunk stage config +# ./run_server.sh async # async_chunk stage config +# ./run_server.sh sync # no-async-chunk stage config +# VOXCPM_MODEL=/path/to/model ./run_server.sh + +set -e + +MODE="${1:-async}" +MODEL="${VOXCPM_MODEL:-OpenBMB/VoxCPM1.5}" + +case "$MODE" in + async) + STAGE_CONFIG="vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml" + ;; + sync) + STAGE_CONFIG="vllm_omni/model_executor/stage_configs/voxcpm.yaml" + ;; + *) + echo "Unknown mode: $MODE" + echo "Supported: async, sync" + exit 1 + ;; +esac + +echo "Starting VoxCPM server with model: $MODEL" +echo "Stage config: $STAGE_CONFIG" + +vllm serve "$MODEL" \ + --stage-configs-path "$STAGE_CONFIG" \ + --host 0.0.0.0 \ + --port 8091 \ + --trust-remote-code \ + --enforce-eager \ + --omni diff --git a/tests/e2e/offline_inference/test_voxcpm.py b/tests/e2e/offline_inference/test_voxcpm.py new file mode 100644 index 00000000000..d7f65525e93 --- /dev/null +++ b/tests/e2e/offline_inference/test_voxcpm.py @@ -0,0 +1,156 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""E2E test for VoxCPM offline inference.""" + +import json +import os +from pathlib import Path +from typing import Any + +import numpy as np +import pytest +import torch + +import tests.conftest as omni_test_conftest +from tests.conftest import OmniRunner +from tests.utils import hardware_test +from vllm_omni.model_executor.models.voxcpm.voxcpm_runtime_utils import ( + prepare_voxcpm_hf_config_dir, + 
resolve_voxcpm_model_dir, +) + +VOXCPM_MODEL = os.environ.get("VOXCPM_MODEL", "OpenBMB/VoxCPM1.5") +STAGE_CONFIG = str( + Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm.yaml" +) +SAMPLE_RATE = 24000 + + +@pytest.fixture(autouse=True) +def _patch_npu_cleanup_for_voxcpm(monkeypatch: pytest.MonkeyPatch): + """Limit the NPU cleanup workaround to this VoxCPM test module only.""" + original_cleanup = omni_test_conftest.cleanup_dist_env_and_memory + + def _safe_cleanup() -> None: + try: + original_cleanup() + except RuntimeError as exc: + if "Allocator for npu is not a DeviceAllocator" in str(exc): + return + raise + + monkeypatch.setattr(omni_test_conftest, "cleanup_dist_env_and_memory", _safe_cleanup) + + +def _build_prompt(text: str) -> dict[str, Any]: + return { + "prompt_token_ids": [1], + "additional_information": { + "text": [text], + "cfg_value": [2.0], + "inference_timesteps": [10], + "min_len": [2], + "max_new_tokens": [1024], + }, + } + + +def _extract_audio_tensor(multimodal_output: dict[str, Any]) -> torch.Tensor: + audio = multimodal_output.get("audio", multimodal_output.get("model_outputs")) + assert audio is not None, f"No audio output found, keys={list(multimodal_output.keys())}" + + if isinstance(audio, list): + parts: list[torch.Tensor] = [] + for item in audio: + if item is None: + continue + tensor = torch.as_tensor(item) + if tensor.numel() == 0: + continue + parts.append(tensor.float().cpu().reshape(-1)) + return torch.cat(parts, dim=-1) if parts else torch.zeros((0,), dtype=torch.float32) + + return torch.as_tensor(audio).float().cpu().reshape(-1) + + +def _extract_final_multimodal_output(outputs) -> dict[str, Any]: + for item in reversed(outputs): + request_output = getattr(item, "request_output", None) + if request_output is not None: + multimodal_output = getattr(request_output, "multimodal_output", None) + if isinstance(multimodal_output, dict): + return multimodal_output + completions = 
getattr(request_output, "outputs", None) or [] + for completion in completions: + multimodal_output = getattr(completion, "multimodal_output", None) + if isinstance(multimodal_output, dict): + return multimodal_output + + multimodal_output = getattr(item, "multimodal_output", None) + if isinstance(multimodal_output, dict): + return multimodal_output + + raise AssertionError("No multimodal audio output found in VoxCPM generate results") + + +@pytest.fixture +def voxcpm_model_path(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> str: + model_dir = resolve_voxcpm_model_dir(VOXCPM_MODEL) + + hf_config_env = os.environ.get("VLLM_OMNI_VOXCPM_HF_CONFIG_PATH") + if hf_config_env: + hf_config_dir = Path(hf_config_env).expanduser() + else: + hf_config_dir = tmp_path / "voxcpm_hf_config" + + if not (hf_config_dir / "config.json").exists(): + prepare_voxcpm_hf_config_dir(model_dir, hf_config_dir) + + monkeypatch.setenv("VLLM_OMNI_VOXCPM_HF_CONFIG_PATH", str(hf_config_dir)) + return str(model_dir) + + +def test_prepare_voxcpm_hf_config_dir(tmp_path: Path): + model_dir = tmp_path / "model" + model_dir.mkdir() + (model_dir / "config.json").write_text(json.dumps({"hidden_size": 1024}), encoding="utf-8") + (model_dir / "generation_config.json").write_text(json.dumps({"do_sample": False}), encoding="utf-8") + + hf_config_dir = prepare_voxcpm_hf_config_dir(model_dir, tmp_path / "voxcpm_hf_config") + + prepared_config = json.loads((hf_config_dir / "config.json").read_text(encoding="utf-8")) + assert prepared_config["model_type"] == "voxcpm" + assert prepared_config["architectures"] == ["VoxCPMForConditionalGeneration"] + assert (hf_config_dir / "generation_config.json").exists() + + +def test_resolve_voxcpm_model_dir_local_path(tmp_path: Path): + model_dir = tmp_path / "OpenBMB" / "VoxCPM1.5" + model_dir.mkdir(parents=True) + + assert resolve_voxcpm_model_dir(str(model_dir)) == model_dir + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "L4"}, 
num_cards=1) +def test_voxcpm_zero_shot_001(voxcpm_model_path: str): + with OmniRunner(voxcpm_model_path, stage_configs_path=STAGE_CONFIG) as runner: + outputs = list(runner.omni.generate(_build_prompt("Hello, this is a VoxCPM offline inference test."))) + + assert outputs, "No outputs returned" + + multimodal_output = _extract_final_multimodal_output(outputs) + audio = _extract_audio_tensor(multimodal_output) + assert audio.numel() > SAMPLE_RATE // 2, f"Audio too short: {audio.numel()} samples" + + duration_s = audio.shape[0] / SAMPLE_RATE + assert 0.5 < duration_s < 30.0, f"Audio duration out of range: {duration_s:.2f}s" + + peak = float(torch.max(torch.abs(audio)).item()) if audio.numel() > 0 else 0.0 + assert peak > 0.01, "Generated audio appears to be silence" + + audio_np = audio.numpy() + rms = float(np.sqrt(np.mean(np.square(audio_np)))) if audio_np.size else 0.0 + assert rms > 1e-4, "Generated audio RMS too low" diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 35d55f1cc4e..565c83c1ad4 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -7,6 +7,7 @@ import argparse import inspect from types import SimpleNamespace +from unittest.mock import Mock import pytest from pydantic import ValidationError @@ -166,6 +167,24 @@ def test_stage_configs_path_field(): assert args.stage_configs_path == "/some/path.yaml" +def test_voxcpm_model_arch_injects_model_type_override(mocker): + """Ensure VoxCPM model_arch injects hf_overrides for config resolution.""" + mocker.patch.object(OmniEngineArgs, "_ensure_omni_models_registered", return_value=True) + mocker.patch.object(OmniEngineArgs, "_patch_empty_hf_config") + mocker.patch.object(EngineArgs, "create_model_config", return_value=Mock()) + mocker.patch.object(OmniModelConfig, "from_vllm_model_config", return_value=Mock()) + + args = OmniEngineArgs( + model="OpenBMB/VoxCPM1.5", + model_arch="VoxCPMForConditionalGeneration", + ) + args.create_model_config() + + 
assert args.hf_overrides["architectures"] == ["VoxCPMForConditionalGeneration"] + assert args.hf_overrides["model_type"] == "voxcpm" + args._patch_empty_hf_config.assert_called_once_with("voxcpm") + + def test_strip_single_engine_args(): """_strip_single_engine_args should remove EngineArgs fields but keep omni fields.""" kwargs = { diff --git a/tests/entrypoints/openai_api/test_serving_speech_voxcpm.py b/tests/entrypoints/openai_api/test_serving_speech_voxcpm.py new file mode 100644 index 00000000000..48660b6d1cd --- /dev/null +++ b/tests/entrypoints/openai_api/test_serving_speech_voxcpm.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""UTs for VoxCPM OpenAI speech serving behavior.""" + +import asyncio +from types import SimpleNamespace +from unittest.mock import AsyncMock + +import pytest +from pytest_mock import MockerFixture + +from vllm_omni.entrypoints.openai.protocol.audio import OpenAICreateSpeechRequest +from vllm_omni.entrypoints.openai.serving_speech import OmniOpenAIServingSpeech + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +@pytest.fixture +def voxcpm_server(mocker: MockerFixture): + mocker.patch.object(OmniOpenAIServingSpeech, "_load_supported_speakers", return_value=set()) + mocker.patch.object(OmniOpenAIServingSpeech, "_load_codec_frame_rate", return_value=None) + + mock_engine_client = mocker.MagicMock() + mock_engine_client.errored = False + mock_engine_client.model_config = mocker.MagicMock(model="OpenBMB/VoxCPM1.5") + mock_engine_client.default_sampling_params_list = [SimpleNamespace(max_tokens=2048)] + mock_engine_client.tts_batch_max_items = 32 + mock_engine_client.generate = mocker.MagicMock(return_value="generator") + mock_engine_client.stage_configs = [ + SimpleNamespace( + engine_args=SimpleNamespace( + model_stage="latent_generator", + model_arch="VoxCPMForConditionalGeneration", + ), + tts_args={}, + ), + SimpleNamespace( + 
engine_args=SimpleNamespace(model_stage="vae"), + tts_args={}, + ), + ] + + mock_models = mocker.MagicMock() + mock_models.is_base_model.return_value = True + + return OmniOpenAIServingSpeech( + engine_client=mock_engine_client, + models=mock_models, + request_logger=mocker.MagicMock(), + ) + + +class TestVoxCPMServing: + def test_voxcpm_model_type_detection(self, voxcpm_server): + assert voxcpm_server._tts_model_type == "voxcpm" + assert voxcpm_server._is_tts is True + assert voxcpm_server.supported_speakers == set() + + @pytest.mark.parametrize( + ("request_kwargs", "expected_substring"), + [ + ({"voice": "alice"}, "voice"), + ({"instructions": "whisper"}, "instructions"), + ({"language": "en"}, "language"), + ({"task_type": "CustomVoice"}, "plain tts"), + ({"x_vector_only_mode": True}, "x_vector_only_mode"), + ({"speaker_embedding": [0.1, 0.2]}, "speaker_embedding"), + ({"initial_codec_chunk_frames": 4}, "initial_codec_chunk_frames"), + ({"ref_text": "reference"}, "ref_audio"), + ], + ) + def test_validate_voxcpm_rejects_unsupported_fields(self, voxcpm_server, request_kwargs, expected_substring): + request = OpenAICreateSpeechRequest(input="hello voxcpm", **request_kwargs) + error = voxcpm_server._validate_voxcpm_request(request) + assert error is not None + assert expected_substring in error.lower() + + def test_validate_voxcpm_accepts_plain_tts_request(self, voxcpm_server): + request = OpenAICreateSpeechRequest(input="hello voxcpm", max_new_tokens=256) + assert voxcpm_server._validate_voxcpm_request(request) is None + + def test_validate_voxcpm_accepts_voice_clone_request(self, voxcpm_server): + request = OpenAICreateSpeechRequest( + input="clone this voice", + ref_audio="data:audio/wav;base64,QUJD", + ref_text="reference transcript", + max_new_tokens=256, + ) + assert voxcpm_server._validate_voxcpm_request(request) is None + + def test_prepare_speech_generation_voxcpm_text_only(self, voxcpm_server): + request = OpenAICreateSpeechRequest(input="hello voxcpm", 
max_new_tokens=321) + + request_id, generator, tts_params = asyncio.run(voxcpm_server._prepare_speech_generation(request)) + + assert request_id.startswith("speech-") + assert generator == "generator" + assert tts_params == { + "text": ["hello voxcpm"], + "cfg_value": [2.0], + "inference_timesteps": [10], + "min_len": [2], + "max_new_tokens": [321], + } + + voxcpm_server.engine_client.generate.assert_called_once() + call = voxcpm_server.engine_client.generate.call_args + assert call.kwargs["prompt"] == { + "prompt_token_ids": [1], + "additional_information": tts_params, + } + assert call.kwargs["output_modalities"] == ["audio"] + + def test_prepare_speech_generation_voxcpm_voice_clone_resolves_ref_audio(self, voxcpm_server): + voxcpm_server._resolve_ref_audio = AsyncMock(return_value=([0.1, -0.1, 0.2], 16000)) + request = OpenAICreateSpeechRequest( + input="clone this voice", + ref_audio="data:audio/wav;base64,QUJD", + ref_text="reference transcript", + max_new_tokens=512, + ) + + request_id, generator, tts_params = asyncio.run(voxcpm_server._prepare_speech_generation(request)) + + assert request_id.startswith("speech-") + assert generator == "generator" + assert tts_params == { + "text": ["clone this voice"], + "cfg_value": [2.0], + "inference_timesteps": [10], + "min_len": [2], + "max_new_tokens": [512], + "ref_text": ["reference transcript"], + "ref_audio": [[[0.1, -0.1, 0.2], 16000]], + } + + voxcpm_server._resolve_ref_audio.assert_awaited_once_with("data:audio/wav;base64,QUJD") + call = voxcpm_server.engine_client.generate.call_args + assert call.kwargs["prompt"] == { + "prompt_token_ids": [1], + "additional_information": tts_params, + } diff --git a/tests/entrypoints/test_utils.py b/tests/entrypoints/test_utils.py index 94e254c250b..248629d51df 100644 --- a/tests/entrypoints/test_utils.py +++ b/tests/entrypoints/test_utils.py @@ -310,6 +310,39 @@ def mock_exists(path): assert result is not None assert "glm_image.yaml" in result + def 
test_voxcpm_transformers_format_resolution(self, mocker: MockerFixture): + """Test VoxCPM transformers config resolves to the voxcpm stage config.""" + mocker.patch( + "vllm_omni.entrypoints.utils.get_config", + side_effect=ValueError("missing transformers config"), + ) + mocker.patch( + "vllm_omni.entrypoints.utils.file_or_path_exists", + side_effect=lambda _model, filename, revision=None: filename == "config.json", + ) + mocker.patch( + "vllm_omni.entrypoints.utils.get_hf_file_to_dict", + return_value={"model_type": "voxcpm"}, + ) + mocker.patch( + "vllm_omni.entrypoints.utils.current_omni_platform.get_default_stage_config_path", + return_value="vllm_omni/model_executor/stage_configs", + ) + + original_exists = os.path.exists + + def mock_exists(path): + if "voxcpm.yaml" in str(path): + return True + return original_exists(path) + + mocker.patch("os.path.exists", side_effect=mock_exists) + + result = resolve_model_config_path("OpenBMB/VoxCPM1.5") + + assert result is not None + assert "voxcpm.yaml" in result + class TestLoadAndResolveStageConfigs: def test_load_and_resolve_with_kwargs(self): diff --git a/tests/model_executor/stage_input_processors/test_voxcpm_async_chunk.py b/tests/model_executor/stage_input_processors/test_voxcpm_async_chunk.py new file mode 100644 index 00000000000..7d6fc6e74c9 --- /dev/null +++ b/tests/model_executor/stage_input_processors/test_voxcpm_async_chunk.py @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""UTs for VoxCPM async-chunk stage input processing.""" + +from types import SimpleNamespace + +import pytest +import torch + +from vllm_omni.model_executor.stage_input_processors.voxcpm import ( + _VOXCPM_LATENT_MAGIC, + _coerce_finished_flag, + latent2vae_async_chunk, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def _request(*, finished): + return SimpleNamespace(is_finished=lambda: finished) + + +def 
_decode_serialized_latent(codes: list[int]) -> torch.Tensor: + assert codes[0] == _VOXCPM_LATENT_MAGIC + latent_dim = codes[1] + time_dim = codes[2] + payload = torch.tensor(codes[3:], dtype=torch.int32).to(torch.uint16) + return payload.view(torch.bfloat16).to(torch.float32).reshape(1, latent_dim, time_dim) + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + (None, False), + (False, False), + (True, True), + (torch.tensor(False), False), + (torch.tensor(True), True), + ([torch.tensor(True)], True), + (([True],), True), + ([], False), + ], +) +def test_coerce_finished_flag(value, expected): + assert _coerce_finished_flag(value) is expected + + +def test_latent2vae_async_chunk_serializes_latent_payload(): + latent = torch.arange(6, dtype=torch.float32).reshape(2, 3) + + payload = latent2vae_async_chunk( + transfer_manager=None, + pooling_output={"latent_audio_feat": latent}, + request=_request(finished=False), + is_finished=torch.tensor(False), + ) + + assert payload is not None + assert torch.equal(payload["finished"], torch.tensor(False, dtype=torch.bool)) + recovered = _decode_serialized_latent(payload["code_predictor_codes"]) + torch.testing.assert_close(recovered, latent.to(torch.bfloat16).to(torch.float32).unsqueeze(0)) + + +def test_latent2vae_async_chunk_returns_terminal_marker_without_latent(): + payload = latent2vae_async_chunk( + transfer_manager=None, + pooling_output=None, + request=_request(finished=[torch.tensor(True)]), + is_finished=False, + ) + + assert payload == { + "code_predictor_codes": [], + "finished": torch.tensor(True, dtype=torch.bool), + } + + +def test_latent2vae_async_chunk_returns_none_for_nonterminal_empty_chunk(): + payload = latent2vae_async_chunk( + transfer_manager=None, + pooling_output={"latent_audio_feat": torch.zeros((0,), dtype=torch.float32)}, + request=_request(finished=False), + is_finished=False, + ) + + assert payload is None diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index 
d61102c7e13..5b69d6b1f0c 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -21,6 +21,7 @@ "CosyVoice3Model": "cosyvoice3", "OmniVoiceModel": "omnivoice", "VoxCPM2TalkerForConditionalGeneration": "voxcpm2", + "VoxCPMForConditionalGeneration": "voxcpm", } # Maps model architecture names to tokenizer subfolder paths within HF repos. @@ -41,6 +42,7 @@ def _register_omni_hf_configs() -> None: from vllm_omni.model_executor.models.voxtral_tts.configuration_voxtral_tts import ( VoxtralTTSConfig, ) + from vllm_omni.transformers_utils.configs.voxcpm import VoxCPMConfig from vllm_omni.transformers_utils.configs.voxcpm2 import VoxCPM2Config except Exception as exc: # pragma: no cover - best-effort optional registration logger.warning("Skipping omni HF config registration due to import error: %s", exc) @@ -59,6 +61,7 @@ def _register_omni_hf_configs() -> None: ("cosyvoice3", CosyVoice3Config), ("omnivoice", OmniVoiceConfig), ("voxtral_tts", VoxtralTTSConfig), + ("voxcpm", VoxCPMConfig), ("voxcpm2", VoxCPM2Config), ]: try: diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 1d9754853f3..1f78f5691b9 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -49,6 +49,7 @@ _FISH_TTS_MODEL_STAGES = {"fish_speech_slow_ar"} _COSYVOICE3_TTS_MODEL_STAGES = {"cosyvoice3_talker"} _OMNIVOICE_TTS_MODEL_STAGES = {"omnivoice_generator"} +_VOXCPM_TTS_MODEL_STAGES = {"latent_generator", "vae"} _VOXCPM2_TTS_MODEL_STAGES = {"latent_generator"} _TTS_MODEL_STAGES: set[str] = ( _VOXTRAL_TTS_MODEL_STAGES @@ -56,6 +57,7 @@ | _FISH_TTS_MODEL_STAGES | _COSYVOICE3_TTS_MODEL_STAGES | _OMNIVOICE_TTS_MODEL_STAGES + | _VOXCPM_TTS_MODEL_STAGES | _VOXCPM2_TTS_MODEL_STAGES ) _TTS_LANGUAGES: set[str] = { @@ -282,6 +284,11 @@ def _detect_tts_model_type(self) -> str | None: if self._tts_stage is None: return None model_stage = getattr(self._tts_stage.engine_args, 
"model_stage", None) + model_arch = getattr(self._tts_stage.engine_args, "model_arch", None) + if model_arch == "VoxCPM2TalkerForConditionalGeneration": + return "voxcpm2" + if model_arch == "VoxCPMForConditionalGeneration": + return "voxcpm" if model_stage in _QWEN3_TTS_MODEL_STAGES: return "qwen3_tts" if model_stage in _VOXTRAL_TTS_MODEL_STAGES: @@ -292,8 +299,12 @@ def _detect_tts_model_type(self) -> str | None: return "cosyvoice3" if model_stage in _OMNIVOICE_TTS_MODEL_STAGES: return "omnivoice" - if model_stage in _VOXCPM2_TTS_MODEL_STAGES: - return "voxcpm2" + if model_stage in (_VOXCPM_TTS_MODEL_STAGES | _VOXCPM2_TTS_MODEL_STAGES): + has_vae_stage = any( + getattr(getattr(stage, "engine_args", None), "model_stage", None) == "vae" + for stage in self.engine_client.stage_configs + ) + return "voxcpm" if has_vae_stage or model_stage == "vae" else "voxcpm2" return None def _compute_max_instructions_length(self) -> int: @@ -318,6 +329,8 @@ def _compute_max_instructions_length(self) -> int: def _load_supported_speakers(self) -> set[str]: """Load supported speakers (case-insensitive) from the model configuration.""" try: + if self._tts_model_type == "voxcpm": + return set() if self._tts_model_type == "voxtral_tts": config = self.engine_client.model_config.hf_config.audio_config else: @@ -377,6 +390,8 @@ def _estimate_ref_code_len(self, ref_audio: object) -> int | None: def _estimate_prompt_len(self, tts_params: dict[str, Any]) -> int: """Estimate prompt length so the placeholder matches model-side embeddings.""" try: + if self._tts_model_type == "voxcpm": + return 1 from vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_talker import ( Qwen3TTSTalkerForConditionalGeneration, ) @@ -791,6 +806,8 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non return self._validate_fish_tts_request(request) if self._tts_model_type == "cosyvoice3": return self._validate_cosyvoice3_request(request) + if self._tts_model_type == "voxcpm": + return 
self._validate_voxcpm_request(request) if self._tts_model_type == "voxcpm2": return None # VoxCPM2 accepts any text input return self._validate_qwen_tts_request(request) @@ -832,6 +849,43 @@ def _validate_voxtral_tts_request(self, request: OpenAICreateSpeechRequest) -> s return None + def _validate_voxcpm_request(self, request: OpenAICreateSpeechRequest) -> str | None: + """Validate VoxCPM request parameters. Returns error message or None.""" + if not request.input or not request.input.strip(): + return "Input text cannot be empty" + + if request.voice is not None: + return "'voice' is not supported for VoxCPM" + if request.instructions is not None: + return "'instructions' is not supported for VoxCPM" + if request.language is not None: + return "'language' is not supported for VoxCPM" + if request.task_type not in (None, "Base"): + return "VoxCPM only supports plain TTS or voice cloning with ref_audio/ref_text" + if request.x_vector_only_mode is not None: + return "'x_vector_only_mode' is not supported for VoxCPM" + if request.speaker_embedding is not None: + return "'speaker_embedding' is not supported for VoxCPM" + if request.initial_codec_chunk_frames is not None: + return "'initial_codec_chunk_frames' is not supported for VoxCPM" + + if request.ref_audio is not None: + fmt_err = self._validate_ref_audio_format(request.ref_audio) + if fmt_err: + return fmt_err + if not request.ref_text or not request.ref_text.strip(): + return "Voice cloning requires 'ref_text' (transcript of the reference audio)" + elif request.ref_text is not None: + return "'ref_text' requires 'ref_audio' for VoxCPM voice cloning" + + if request.max_new_tokens is not None: + if request.max_new_tokens < _TTS_MAX_NEW_TOKENS_MIN: + return f"max_new_tokens must be at least {_TTS_MAX_NEW_TOKENS_MIN}" + if request.max_new_tokens > _TTS_MAX_NEW_TOKENS_MAX: + return f"max_new_tokens cannot exceed {_TTS_MAX_NEW_TOKENS_MAX}" + + return None + def _validate_qwen_tts_request(self, request: 
OpenAICreateSpeechRequest) -> str | None: """Validate Qwen TTS request parameters. Returns error message or None.""" # Infer Base task when ref_audio or ref_text is provided without explicit task_type. @@ -1169,6 +1223,18 @@ def _build_tts_params(self, request: OpenAICreateSpeechRequest) -> dict[str, Any Processes each parameter if present, skips if not. Values are wrapped in lists as required by the model. """ + if self._tts_model_type == "voxcpm": + params: dict[str, Any] = { + "text": [request.input], + "cfg_value": [2.0], + "inference_timesteps": [10], + "min_len": [2], + "max_new_tokens": [request.max_new_tokens or 4096], + } + if request.ref_text is not None: + params["ref_text"] = [request.ref_text] + return params + params: dict[str, Any] = {} # Text content (always required) @@ -1499,6 +1565,8 @@ async def _prepare_speech_generation( model_type = "voxtral_tts" elif self._tts_model_type == "cosyvoice3": model_type = "cosyvoice3" + elif self._tts_model_type == "voxcpm": + model_type = "voxcpm" elif self._tts_model_type == "voxcpm2": model_type = "voxcpm2" elif self._is_tts: diff --git a/vllm_omni/model_executor/models/registry.py b/vllm_omni/model_executor/models/registry.py index 08940880056..3407b428695 100644 --- a/vllm_omni/model_executor/models/registry.py +++ b/vllm_omni/model_executor/models/registry.py @@ -145,6 +145,12 @@ "fish_speech_dac_decoder", "FishSpeechDACDecoder", ), + ## VoxCPM + "VoxCPMForConditionalGeneration": ( + "voxcpm", + "voxcpm", + "VoxCPMForConditionalGeneration", + ), ## VoxCPM2 "VoxCPM2TalkerForConditionalGeneration": ( "voxcpm2", diff --git a/vllm_omni/model_executor/models/voxcpm/__init__.py b/vllm_omni/model_executor/models/voxcpm/__init__.py new file mode 100644 index 00000000000..3b064c0f683 --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm/__init__.py @@ -0,0 +1,7 @@ +from .configuration_voxcpm import VoxCPMConfig +from .voxcpm import VoxCPMForConditionalGeneration + +__all__ = [ + "VoxCPMConfig", + 
"VoxCPMForConditionalGeneration", +] diff --git a/vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py b/vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py new file mode 100644 index 00000000000..ce1d809bd38 --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py @@ -0,0 +1,3 @@ +from vllm_omni.transformers_utils.configs.voxcpm import VoxCPMConfig + +__all__ = ["VoxCPMConfig"] diff --git a/vllm_omni/model_executor/models/voxcpm/voxcpm.py b/vllm_omni/model_executor/models/voxcpm/voxcpm.py new file mode 100644 index 00000000000..6fa36fc4200 --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm/voxcpm.py @@ -0,0 +1,886 @@ +from __future__ import annotations + +import json +import os +import sys +import tempfile +import warnings +import wave +from collections.abc import Callable, Generator, Iterable +from pathlib import Path +from typing import Any + +import numpy as np +import torch +import torch.nn as nn +from einops import rearrange +from tqdm import tqdm +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.sequence import IntermediateTensors + +from vllm_omni.model_executor.models.output_templates import OmniOutput + +from .voxcpm_loader import ( + _build_prompt_cache_with_soundfile, + _device_to_string, + _force_cuda_available_for_npu, + _import_voxcpm_audio_vae_classes, + _import_voxcpm_base_model_class, + _is_torchcodec_load_error, + _normalize_dtype_name, + _prepare_runtime_model_dir, + _resolve_runtime_device, +) +from .voxcpm_runtime_utils import resolve_voxcpm_model_dir +from .voxcpm_stage_wrappers import _DirectVoxCPMAudioVAE, _DirectVoxCPMLatentGenerator + +logger = init_logger(__name__) +_VOXCPM_LATENT_MAGIC = 131071 + + +def _make_voxcpm_model_for_omni(base: type[Any]) -> type[Any]: + """Subclass upstream VoxCPMModel: local ``_inference`` + ``latents_only`` prompt-cache generation.""" + + from voxcpm.model.utils import get_dtype + + class VoxCPMModelForOmni(base): + 
@torch.inference_mode()
def _inference(
    self,
    text: torch.Tensor,
    text_mask: torch.Tensor,
    feat: torch.Tensor,
    feat_mask: torch.Tensor,
    min_len: int = 2,
    max_len: int = 2000,
    inference_timesteps: int = 10,
    cfg_value: float = 2.0,
    streaming: bool = False,
    streaming_prefix_len: int = 3,
) -> Generator[tuple[torch.Tensor, torch.Tensor | list[torch.Tensor]], None, None]:
    """Autoregressively predict audio feature patches for a prepared prompt.

    The prompt (text tokens plus optional prompt-audio patches) is run once
    through ``base_lm`` and ``residual_lm`` to fill their KV caches; each loop
    step then decodes one patch with ``feat_decoder`` and advances both LMs.

    Args:
        text: Token ids fed to ``base_lm.embed_tokens``.
        text_mask: Mask selecting text positions in the combined embedding.
        feat: 4-D audio feature tensor (unpacked as ``B, _, _, _``).
        feat_mask: Mask selecting audio positions; its sum is used as the
            number of prompt audio patches.
        min_len: The stop head is only honoured once ``step_idx > min_len``.
        max_len: Upper bound on generated patches.
        inference_timesteps: Forwarded to ``feat_decoder`` as ``n_timesteps``.
        cfg_value: Forwarded to ``feat_decoder``.
        streaming: Yield after every step instead of once at the end.
        streaming_prefix_len: Window size (in patches) of each streamed chunk;
            ``streaming_prefix_len - 1`` prompt patches are kept as context.
            Values below 1 are clamped to 1.

    Yields:
        Streaming: ``(feat_pred, pred_feat_seq)`` per step, where ``feat_pred``
        is the last ``streaming_prefix_len`` patches rearranged to
        ``b d (t p)`` and ``pred_feat_seq`` is the running list of patches.
        Non-streaming: a single ``(feat_pred, patches)`` pair where ``patches``
        is the concatenated sequence, squeezed on dim 0 and moved to CPU.
    """
    B, _, _, _ = feat.shape

    # A window of 0 (or a negative one) would turn the ``[-n:]`` slices below
    # into "take everything" / "drop from the front", so clamp once here.
    prefix_len = max(int(streaming_prefix_len), 1)

    feat_embed = self.feat_encoder(feat)
    feat_embed = self.enc_to_lm_proj(feat_embed)

    scale_emb = self.config.lm_config.scale_emb if self.config.lm_config.use_mup else 1.0
    text_embed = self.base_lm.embed_tokens(text) * scale_emb
    combined_embed = text_mask.unsqueeze(-1) * text_embed + feat_mask.unsqueeze(-1) * feat_embed

    prefix_feat_cond = feat[:, -1, ...]
    pred_feat_seq: list[torch.Tensor] = []

    audio_patch_count = int(feat_mask.sum().item())
    context_len = min(prefix_len - 1, audio_patch_count)
    if context_len > 0:
        # Seed the output with the trailing prompt patches used as decode
        # context. The explicit ``> 0`` guard matters: ``feat[:, -0:]`` would
        # select the whole (text-padded) tensor rather than nothing.
        pred_feat_seq = list(feat[:, -context_len:, :, :].split(1, dim=1))

    enc_outputs, kv_cache_tuple = self.base_lm(
        inputs_embeds=combined_embed,
        is_causal=True,
    )
    self.base_lm.kv_cache.fill_caches(kv_cache_tuple)

    enc_outputs = self.fsq_layer(enc_outputs) * feat_mask.unsqueeze(-1) + enc_outputs * text_mask.unsqueeze(-1)
    lm_hidden = enc_outputs[:, -1, :]

    residual_enc_outputs, residual_kv_cache_tuple = self.residual_lm(
        inputs_embeds=enc_outputs + feat_mask.unsqueeze(-1) * feat_embed,
        is_causal=True,
    )
    self.residual_lm.kv_cache.fill_caches(residual_kv_cache_tuple)
    residual_hidden = residual_enc_outputs[:, -1, :]

    for step_idx in tqdm(range(max_len)):
        dit_hidden = self.lm_to_dit_proj(lm_hidden) + self.res_to_dit_proj(residual_hidden)
        pred_feat = self.feat_decoder(
            mu=dit_hidden,
            patch_size=self.patch_size,
            cond=prefix_feat_cond.transpose(1, 2).contiguous(),
            n_timesteps=inference_timesteps,
            cfg_value=cfg_value,
        ).transpose(1, 2)

        curr_embed = self.enc_to_lm_proj(self.feat_encoder(pred_feat.unsqueeze(1)))
        pred_feat_seq.append(pred_feat.unsqueeze(1))
        # The patch just produced conditions the next decode step.
        prefix_feat_cond = pred_feat

        if streaming:
            pred_feat_chunk = torch.cat(pred_feat_seq[-prefix_len:], dim=1)
            feat_pred = rearrange(pred_feat_chunk, "b t p d -> b d (t p)", b=B, p=self.patch_size)
            yield feat_pred, pred_feat_seq

        # Only the first batch element's stop prediction is consulted.
        stop_flag = self.stop_head(self.stop_actn(self.stop_proj(lm_hidden))).argmax(dim=-1)[0].cpu().item()
        if step_idx > min_len and stop_flag == 1:
            break

        lm_hidden = self.base_lm.forward_step(
            curr_embed[:, 0, :],
            torch.tensor([self.base_lm.kv_cache.step()], device=curr_embed.device),
        ).clone()
        lm_hidden = self.fsq_layer(lm_hidden)
        residual_hidden = self.residual_lm.forward_step(
            lm_hidden + curr_embed[:, 0, :],
            torch.tensor([self.residual_lm.kv_cache.step()], device=curr_embed.device),
        ).clone()

    if not streaming:
        pred_feat_seq_cat = torch.cat(pred_feat_seq, dim=1)
        feat_pred = rearrange(pred_feat_seq_cat, "b t p d -> b d (t p)", b=B, p=self.patch_size)
        yield feat_pred, pred_feat_seq_cat.squeeze(0).cpu()
@torch.inference_mode()
def _generate_with_prompt_cache(
    self,
    target_text: str,
    prompt_cache: dict,
    min_len: int = 2,
    max_len: int = 2000,
    inference_timesteps: int = 10,
    cfg_value: float = 2.0,
    retry_badcase: bool = False,
    retry_badcase_max_times: int = 3,
    retry_badcase_ratio_threshold: float = 6.0,
    streaming: bool = False,
    streaming_prefix_len: int = 3,
    latents_only: bool = False,
) -> Generator[tuple[torch.Tensor | None, torch.Tensor, torch.Tensor | list[torch.Tensor]], None, None]:
    """Generate audio (or only latents) for ``target_text`` using a prompt cache.

    Args:
        target_text: Text to synthesize.
        prompt_cache: Mapping with ``"audio_feat"`` and ``"prompt_text"``, or
            ``None`` for generation without a voice prompt.
        min_len, max_len, inference_timesteps, cfg_value, streaming_prefix_len:
            Forwarded to ``_inference``; ``max_len`` is additionally capped at
            ``target_text_length * retry_badcase_ratio_threshold + 10``.
        retry_badcase: Re-run inference when the output is suspiciously long
            relative to the text. Ignored (with a warning) when streaming.
        retry_badcase_max_times: Maximum inference attempts; values below 1
            are treated as 1 so that inference always runs at least once.
        retry_badcase_ratio_threshold: Patches-per-token ratio at or above
            which a result counts as a bad case.
        streaming: Yield one item per generated chunk instead of one in total.
        latents_only: Skip ``audio_vae.decode`` and yield ``None`` as audio.

    Yields:
        ``(decode_audio, target_text_token, payload)``. ``decode_audio`` is
        ``None`` when ``latents_only`` is set. ``payload`` is the streamed
        ``latent_pred`` chunk for streaming + ``latents_only``, and the
        predicted patch features returned by ``_inference`` otherwise.
    """
    if retry_badcase and streaming:
        warnings.warn("Retry on bad cases is not supported in streaming mode, setting retry_badcase=False.")
        retry_badcase = False
    if prompt_cache is None:
        prompt_audio_feat = torch.empty((0, self.patch_size, self.audio_vae.latent_dim), dtype=torch.float32)
        text = target_text
    else:
        prompt_audio_feat = prompt_cache["audio_feat"]
        prompt_text = prompt_cache["prompt_text"]
        text = prompt_text + target_text

    text_token = torch.LongTensor(self.text_tokenizer(text))
    text_token = torch.cat(
        [
            text_token,
            torch.tensor([self.audio_start_token], dtype=torch.int32, device=text_token.device),
        ],
        dim=-1,
    )
    target_text_token = torch.LongTensor(self.text_tokenizer(target_text))

    # Layout along the sequence axis: [text tokens | prompt audio patches].
    # Each modality is zero-padded over the other's span and selected by mask.
    audio_length = prompt_audio_feat.size(0)
    text_length = text_token.shape[0]
    text_pad_token = torch.zeros(audio_length, dtype=torch.int32, device=text_token.device)
    audio_pad_feat = torch.zeros(
        (text_token.shape[0], self.patch_size, self.audio_vae.latent_dim),
        dtype=torch.float32,
        device=text_token.device,
    )
    text_token = torch.cat([text_token, text_pad_token])
    audio_feat = torch.cat([audio_pad_feat, prompt_audio_feat], dim=0)
    text_mask = (
        torch.cat([torch.ones(text_length), torch.zeros(audio_length)]).type(torch.int32).to(text_token.device)
    )
    audio_mask = (
        torch.cat([torch.zeros(text_length), torch.ones(audio_length)]).type(torch.int32).to(text_token.device)
    )

    text_token = text_token.unsqueeze(0).to(self.device)
    text_mask = text_mask.unsqueeze(0).to(self.device)
    audio_feat = audio_feat.unsqueeze(0).to(self.device).to(get_dtype(self.config.dtype))
    audio_mask = audio_mask.unsqueeze(0).to(self.device)

    # Reuse the tokenization done above instead of tokenizing a second time.
    target_text_length = int(target_text_token.shape[0])
    # Always run inference at least once: with a non-positive limit the loop
    # would be skipped, leaving ``latent_pred`` unbound (non-streaming) or
    # producing a silently empty stream.
    max_attempts = max(int(retry_badcase_max_times), 1)
    retry_badcase_times = 0
    while retry_badcase_times < max_attempts:
        inference_result = self._inference(
            text_token,
            text_mask,
            audio_feat,
            audio_mask,
            min_len=min_len,
            max_len=min(int(target_text_length * retry_badcase_ratio_threshold + 10), max_len),
            inference_timesteps=inference_timesteps,
            cfg_value=cfg_value,
            streaming=streaming,
            streaming_prefix_len=streaming_prefix_len,
        )
        if streaming:
            patch_len = self.patch_size * self.chunk_size
            for latent_pred, pred_audio_feat in inference_result:
                if latents_only:
                    decode_audio = None
                    yield (decode_audio, target_text_token, latent_pred)
                else:
                    decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
                    # Keep only the newest patch worth of samples per chunk.
                    decode_audio = decode_audio[..., -patch_len:].squeeze(1).cpu()
                    yield (decode_audio, target_text_token, pred_audio_feat)
            break

        latent_pred, pred_audio_feat = next(inference_result)
        # The ratio is undefined for an empty target, so never retry then
        # (this also avoids a ZeroDivisionError below).
        if (
            retry_badcase
            and target_text_length > 0
            and pred_audio_feat.shape[0] >= target_text_length * retry_badcase_ratio_threshold
        ):
            ratio = pred_audio_feat.shape[0] / target_text_length
            logger.warning("Badcase detected, audio_text_ratio=%s, retrying...", ratio)
            retry_badcase_times += 1
            continue
        break

    if not streaming:
        if latents_only:
            decode_audio = None
        else:
            decode_audio = self.audio_vae.decode(latent_pred.to(torch.float32))
            patch_len = self.patch_size * self.chunk_size
            if audio_mask.sum().item() > 0:
                # Drop the samples that correspond to the prompt context
                # patches prepended by ``_inference``.
                decode_audio = decode_audio[..., patch_len * (streaming_prefix_len - 1) :].squeeze(1).cpu()
            else:
                decode_audio = decode_audio[..., :].squeeze(1).cpu()
        yield (decode_audio, target_text_token, pred_audio_feat)
def _runner_hidden_device_dtype(self) -> tuple[torch.device, torch.dtype]:
    """Return the device and dtype to use for placeholder hidden states.

    The device comes from ``_resolve_runtime_device``; the dtype is read from
    ``vllm_config.model_config.dtype`` and falls back to ``torch.float32``
    when either the model config or its ``dtype`` attribute is absent.
    """
    runtime_device = _resolve_runtime_device(self.vllm_config)
    cfg = getattr(self.vllm_config, "model_config", None)
    if cfg is None:
        return runtime_device, torch.float32
    return runtime_device, getattr(cfg, "dtype", torch.float32)
self.model_path, + device=target_device, + ) + else: + raise ValueError( + f"Unsupported VoxCPM model_stage: {self.model_stage}. " + "pure_voxcpm only supports split-stage latent_generator/vae inference." + ) + + logger.info("Loaded VoxCPM stage '%s' on %s", self.model_stage, _device_to_string(target_device)) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + del weights + self._ensure_model_loaded() + return set() + + @staticmethod + def _extract_val(info: dict[str, Any], key: str, default: Any) -> Any: + value = info.get(key, default) + if isinstance(value, list): + return value[0] if value else default + return value + + def _resolve_stream_request_key(self, info: dict[str, Any]) -> str: + request_key = info.get("__voxcpm_stream_key") + if request_key is not None: + return str(request_key) + + request_key = info.get("_omni_req_id") + if request_key is not None: + request_key = str(request_key) + info["__voxcpm_stream_key"] = request_key + return request_key + + request_key = f"voxcpm-local-{self._next_local_stream_key}" + self._next_local_stream_key += 1 + info["__voxcpm_stream_key"] = request_key + return str(request_key) + + def _recover_latent_from_input_ids(self, input_ids: torch.Tensor | None) -> torch.Tensor | None: + if input_ids is None or input_ids.numel() == 0: + return None + flat_ids = input_ids.detach().reshape(-1).to("cpu") + if flat_ids.numel() < 4 or int(flat_ids[0].item()) != _VOXCPM_LATENT_MAGIC: + return None + latent_dim = int(flat_ids[1].item()) + time_dim = int(flat_ids[2].item()) + payload = flat_ids[3:] + expected = latent_dim * time_dim + if latent_dim <= 0 or time_dim <= 0: + raise ValueError(f"Invalid VoxCPM latent header: latent_dim={latent_dim}, time_dim={time_dim}") + if int(payload.numel()) != expected: + raise ValueError( + "Invalid VoxCPM latent payload size: " + f"expected={expected}, actual={int(payload.numel())}, " + f"latent_dim={latent_dim}, time_dim={time_dim}" + ) + packed = 
payload.to(dtype=torch.int32).to(torch.uint16) + return packed.view(torch.bfloat16).to(torch.float32).reshape(1, latent_dim, time_dim) + + def _maybe_recover_vae_infos( + self, + infos: list[dict[str, Any]], + input_ids: torch.Tensor | None, + *, + async_chunk: bool, + ) -> list[dict[str, Any]]: + if not async_chunk: + return infos + if any(self._extract_val(info, "latent_audio_feat", None) is not None for info in infos): + return infos + recovered = self._recover_latent_from_input_ids(input_ids) + if recovered is None: + return infos + return [{"latent_audio_feat": recovered}] + + @staticmethod + def _normalize_audio_samples(samples: Any) -> np.ndarray: + if isinstance(samples, torch.Tensor): + return samples.detach().cpu().float().reshape(-1).numpy() + return np.asarray(samples, dtype=np.float32).reshape(-1) + + @classmethod + def _normalize_ref_audio(cls, ref_audio: Any) -> tuple[np.ndarray, int]: + if isinstance(ref_audio, str): + raise TypeError("String ref_audio should be handled as a path before waveform normalization.") + + if isinstance(ref_audio, dict): + sample_rate = ref_audio.get("sample_rate") or ref_audio.get("sampling_rate") or ref_audio.get("sr") + samples = None + for key in ("audio", "wav", "samples", "array", "waveform"): + if key in ref_audio and ref_audio[key] is not None: + samples = ref_audio[key] + break + if sample_rate is None or samples is None: + raise ValueError("ref_audio dict must contain waveform data and sample rate.") + return cls._normalize_audio_samples(samples), int(sample_rate) + + if isinstance(ref_audio, (list, tuple)): + if len(ref_audio) == 1: + return cls._normalize_ref_audio(ref_audio[0]) + if len(ref_audio) == 2 and np.isscalar(ref_audio[1]): + return cls._normalize_audio_samples(ref_audio[0]), int(ref_audio[1]) + + raise TypeError(f"Unsupported ref_audio format: {type(ref_audio)!r}") + + @staticmethod + def _write_temp_prompt_wav(waveform: np.ndarray, sample_rate: int) -> str: + prompt_file = 
tempfile.NamedTemporaryFile(delete=False, suffix=".wav") + prompt_file.close() + + wav = np.asarray(waveform, dtype=np.float32).reshape(-1) + wav = np.clip(wav, -1.0, 1.0) + pcm16 = (wav * 32767.0).astype(np.int16) + with wave.open(prompt_file.name, "wb") as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(int(sample_rate)) + wav_file.writeframes(pcm16.tobytes()) + + return prompt_file.name + + @classmethod + def _resolve_prompt_inputs(cls, info: dict[str, Any]) -> tuple[str | None, str | None, str | None]: + prompt_text = cls._extract_val(info, "prompt_text", None) + prompt_wav_path = cls._extract_val(info, "prompt_wav_path", None) + if prompt_wav_path: + if prompt_text is None: + prompt_text = cls._extract_val(info, "ref_text", None) + return prompt_wav_path, prompt_text, None + + ref_audio = cls._extract_val(info, "ref_audio", None) + ref_text = cls._extract_val(info, "ref_text", None) + if ref_audio is None or ref_text is None: + return None, None, None + if isinstance(ref_audio, str): + return ref_audio, ref_text, None + + waveform, sample_rate = cls._normalize_ref_audio(ref_audio) + temp_prompt_wav = cls._write_temp_prompt_wav(waveform, sample_rate) + return temp_prompt_wav, ref_text, temp_prompt_wav + + def embed_input_ids(self, input_ids: torch.Tensor, **_: Any) -> torch.Tensor: + if input_ids.numel() == 0: + return torch.empty((0, 1), device=input_ids.device, dtype=torch.float32) + return torch.zeros((input_ids.shape[0], 1), device=input_ids.device, dtype=torch.float32) + + def _get_vocab_size(self) -> int: + model_config = getattr(self.vllm_config, "model_config", None) + if model_config is not None: + getter = getattr(model_config, "get_vocab_size", None) + if callable(getter): + try: + return int(getter()) + except Exception: + pass + hf_config = getattr(model_config, "hf_text_config", None) + if hf_config is not None and hasattr(hf_config, "vocab_size"): + return int(hf_config.vocab_size) + return 32000 + + def 
def _finalize_stage_output(
    self,
    *,
    output_key: str,
    outputs: list[torch.Tensor],
    sample_rates: list[torch.Tensor],
    out_device: torch.device,
    out_dtype: torch.dtype,
    hidden_rows: int | None = None,
) -> OmniOutput:
    """Wrap per-request payloads into an ``OmniOutput``.

    ``multimodal_outputs`` carries ``outputs`` under ``output_key`` and
    ``sample_rates`` under ``"sr"``. ``text_hidden_states`` is chosen as:

    * zeros of shape ``(hidden_rows, 1)`` when ``hidden_rows`` is given;
    * zeros of shape ``(0, 1)`` when ``outputs`` is empty;
    * otherwise the stacked ``outputs``, made 2-D (a 1-D stack gets a trailing
      axis, anything else is flattened to ``(-1, last_dim)``).

    The result is always moved to ``out_device`` / ``out_dtype``.
    """
    # NOTE(review): torch.stack requires equally shaped payloads - confirm
    # callers never reach the stacking branch with ragged outputs.
    if hidden_rows is not None:
        hidden = torch.zeros((hidden_rows, 1), device=out_device, dtype=out_dtype)
    elif not outputs:
        hidden = torch.zeros((0, 1), device=out_device, dtype=out_dtype)
    else:
        stacked = torch.stack(outputs)
        if stacked.ndim == 1:
            hidden = stacked.unsqueeze(-1)
        else:
            hidden = stacked.reshape(-1, stacked.shape[-1])

    return OmniOutput(
        text_hidden_states=hidden.to(device=out_device, dtype=out_dtype),
        multimodal_outputs={output_key: outputs, "sr": sample_rates},
    )
def _forward_latent_stage(
    self,
    infos: list[dict[str, Any]],
    *,
    sample_rate: int,
    async_chunk: bool,
    out_device: torch.device,
    out_dtype: torch.dtype,
    hidden_rows: int,
) -> OmniOutput:
    """Run the latent-generator stage for one batch of request infos.

    Args:
        infos: One dict per request; generation options are read from it via
            ``_extract_val`` (``text``, ``cfg_value``, ``inference_timesteps``,
            ``min_len``, ``max_len``/``max_new_tokens``, ``retry_badcase*``,
            ``streaming_prefix_len``) plus the prompt keys consumed by
            ``_resolve_prompt_inputs``.
        sample_rate: Value reported under ``"sr"`` for every request.
        async_chunk: When true, each call pulls at most one chunk per request
            from a per-request generator kept in ``_latent_stream_gens``;
            otherwise the full latent is generated in one call.
        out_device, out_dtype, hidden_rows: Forwarded to the output builders.

    Returns:
        An ``OmniOutput`` whose ``multimodal_outputs`` holds
        ``"latent_audio_feat"`` and ``"sr"`` lists, plus a ``"finished"`` list
        of bool tensors in async-chunk mode.

    Side effects:
        Updates ``_ar_emit_stop_token`` (read by ``compute_logits``) and the
        ``_latent_stream_*`` bookkeeping; deletes temporary prompt WAV files
        created by ``_resolve_prompt_inputs``.
    """
    texts = [self._extract_val(info, "text", "") for info in infos]
    if all(not text for text in texts):
        # Nothing to synthesize for any request: emit empty payloads and stop.
        self._ar_emit_stop_token = True
        return self._make_empty_output(
            output_key="latent_audio_feat",
            payload_factory=lambda: torch.zeros((0,), dtype=torch.float32),
            infos=infos,
            sample_rate=sample_rate,
            out_device=out_device,
            out_dtype=out_dtype,
            hidden_rows=hidden_rows,
        )

    outputs: list[torch.Tensor] = []
    sample_rates: list[torch.Tensor] = []
    # Both flag lists exist only in async-chunk mode (one entry per request).
    last_chunk_flags: list[bool] | None = [] if async_chunk else None
    payload_finished_flags: list[bool] | None = [] if async_chunk else None
    for info in infos:
        text = self._extract_val(info, "text", "")
        cfg_value = float(self._extract_val(info, "cfg_value", 2.0))
        inference_timesteps = int(self._extract_val(info, "inference_timesteps", 10))
        min_len = int(self._extract_val(info, "min_len", 2))
        # ``max_len`` wins; ``max_new_tokens`` is the fallback, then 4096.
        max_len = int(self._extract_val(info, "max_len", self._extract_val(info, "max_new_tokens", 4096)))
        retry_badcase = bool(self._extract_val(info, "retry_badcase", True))
        retry_badcase_max_times = int(self._extract_val(info, "retry_badcase_max_times", 3))
        retry_badcase_ratio_threshold = float(self._extract_val(info, "retry_badcase_ratio_threshold", 6.0))
        streaming_prefix_len = int(self._extract_val(info, "streaming_prefix_len", 3))

        request_key = self._resolve_stream_request_key(info)
        created_temp: str | None = None

        if async_chunk:
            # Case 1: the stream already ended and still owes terminal
            # notifications. Emit an empty payload; "finished" is reported on
            # the last owed notification, after which the counter is dropped.
            terminal_pending = self._latent_stream_terminal_pending.get(request_key, 0)
            if terminal_pending > 0:
                outputs.append(torch.zeros((0,), dtype=torch.float32))
                assert last_chunk_flags is not None
                last_chunk_flags.append(True)
                assert payload_finished_flags is not None
                payload_finished_flags.append(terminal_pending == 1)
                if terminal_pending == 1:
                    self._latent_stream_terminal_pending.pop(request_key, None)
                else:
                    self._latent_stream_terminal_pending[request_key] = terminal_pending - 1
                sample_rates.append(torch.tensor(sample_rate, dtype=torch.int32))
                continue

            # Case 2: fully completed request seen again: empty payload,
            # flagged as last chunk but not as newly finished.
            if request_key in self._latent_stream_completed:
                outputs.append(torch.zeros((0,), dtype=torch.float32))
                assert last_chunk_flags is not None
                last_chunk_flags.append(True)
                assert payload_finished_flags is not None
                payload_finished_flags.append(False)
                sample_rates.append(torch.tensor(sample_rate, dtype=torch.int32))
                continue

            # Case 3: active (or brand-new) stream. Create the generator on
            # first sight; retries are always disabled for streaming.
            if request_key not in self._latent_stream_gens:
                prompt_wav_path, prompt_text, temp_prompt_wav = self._resolve_prompt_inputs(info)
                created_temp = temp_prompt_wav
                self._latent_stream_gens[request_key] = self._pipeline.iter_latent_chunks_streaming(
                    text=text,
                    prompt_wav_path=prompt_wav_path,
                    prompt_text=prompt_text,
                    cfg_value=cfg_value,
                    inference_timesteps=inference_timesteps,
                    min_len=min_len,
                    max_len=max_len,
                    streaming_prefix_len=streaming_prefix_len,
                    retry_badcase=False,
                    retry_badcase_max_times=retry_badcase_max_times,
                    retry_badcase_ratio_threshold=retry_badcase_ratio_threshold,
                )
            generator = self._latent_stream_gens[request_key]
            try:
                chunk_latent, is_last = next(generator)
            except StopIteration:
                # Generator exhausted without a chunk: retire the stream and
                # report it as finished with an empty payload.
                self._latent_stream_gens.pop(request_key, None)
                self._latent_stream_terminal_pending[request_key] = 1
                self._latent_stream_completed.add(request_key)
                outputs.append(torch.zeros((0,), dtype=torch.float32))
                assert last_chunk_flags is not None
                last_chunk_flags.append(True)
                assert payload_finished_flags is not None
                payload_finished_flags.append(True)
            else:
                if is_last:
                    # Final chunk delivered: retire the stream; "finished" is
                    # reported by the terminal-pending branch on a later call.
                    self._latent_stream_gens.pop(request_key, None)
                    self._latent_stream_terminal_pending[request_key] = 1
                    self._latent_stream_completed.add(request_key)
                outputs.append(chunk_latent.detach().float().cpu())
                assert last_chunk_flags is not None
                last_chunk_flags.append(bool(is_last))
                assert payload_finished_flags is not None
                payload_finished_flags.append(False)
            finally:
                # Remove the temp prompt WAV created for this call, if any.
                # NOTE(review): assumes the generator has consumed the prompt
                # file by the time its first chunk is produced - confirm.
                if created_temp is not None and os.path.exists(created_temp):
                    os.unlink(created_temp)
            sample_rates.append(torch.tensor(sample_rate, dtype=torch.int32))
            continue

        # Synchronous path: generate the whole latent in one call and always
        # clean up a temporary prompt WAV afterwards.
        prompt_wav_path, prompt_text, temp_prompt_wav = self._resolve_prompt_inputs(info)
        try:
            latent_audio_feat = self._pipeline.generate_latents(
                text=text,
                prompt_wav_path=prompt_wav_path,
                prompt_text=prompt_text,
                cfg_value=cfg_value,
                inference_timesteps=inference_timesteps,
                min_len=min_len,
                max_len=max_len,
                retry_badcase=retry_badcase,
                retry_badcase_max_times=retry_badcase_max_times,
                retry_badcase_ratio_threshold=retry_badcase_ratio_threshold,
            )
            outputs.append(latent_audio_feat.float().cpu())
        finally:
            if temp_prompt_wav is not None and os.path.exists(temp_prompt_wav):
                os.unlink(temp_prompt_wav)

        sample_rates.append(torch.tensor(sample_rate, dtype=torch.int32))

    # In async-chunk mode the stop token is emitted only once every request in
    # the batch has reached its last chunk; otherwise it is always emitted.
    self._ar_emit_stop_token = all(last_chunk_flags) if async_chunk and last_chunk_flags else True
    output = self._finalize_stage_output(
        output_key="latent_audio_feat",
        outputs=outputs,
        sample_rates=sample_rates,
        out_device=out_device,
        out_dtype=out_dtype,
        hidden_rows=hidden_rows,
    )
    if async_chunk and payload_finished_flags is not None:
        output.multimodal_outputs["finished"] = [
            torch.tensor(flag, dtype=torch.bool) for flag in payload_finished_flags
        ]
    return output
def make_empty_intermediate_tensors(
    self,
    batch_size: int,
    dtype: torch.dtype,
    device: torch.device,
) -> IntermediateTensors:
    """Return an empty mapping; this model carries no intermediate tensors.

    All three arguments are accepted for interface compatibility and ignored.
    """
    # NOTE(review): a plain dict is returned although the annotation says
    # ``IntermediateTensors`` - confirm callers do not rely on that type.
    del device, dtype, batch_size
    empty: dict = {}
    return empty
def _import_voxcpm_attrs(module_name: str, *attr_names: str) -> tuple[Any, ...]:
    """Import ``module_name`` and return the requested attributes as a tuple.

    Each existing directory from ``_iter_voxcpm_src_candidates`` is prepended
    to ``sys.path`` and tried in order; if none works, a final attempt is made
    with the path as it then stands (e.g. an installed ``voxcpm`` package).

    Args:
        module_name: Dotted module path to import.
        *attr_names: Attribute names to fetch from the imported module.

    Returns:
        The attributes, in the order requested.

    Raises:
        ImportError: If the module cannot be imported or lacks one of the
            attributes. The last underlying error is chained as the cause.
    """

    def _load_attrs() -> tuple[Any, ...]:
        # Single place for "import + fetch" so both attempts behave the same.
        module = importlib.import_module(module_name)
        try:
            return tuple(getattr(module, attr_name) for attr_name in attr_names)
        except AttributeError as exc:
            # Mirror ``from module import name``: a missing name is an
            # ImportError, which is what callers of this helper catch.
            raise ImportError(
                f"cannot import {attr_names!r} from {module_name!r}: {exc}",
                name=module_name,
            ) from exc

    last_exc: ImportError | None = None
    for candidate in _iter_voxcpm_src_candidates():
        if not candidate.exists():
            continue
        _prepend_voxcpm_src(candidate)
        try:
            return _load_attrs()
        except ImportError as exc:
            last_exc = exc

    try:
        return _load_attrs()
    except ImportError as exc:
        last_exc = exc

    raise ImportError(f"Failed to import {module_name}.") from last_exc
+ ) from exc + + +def _device_to_string(device: torch.device) -> str: + if device.index is None: + return device.type + return f"{device.type}:{device.index}" + + +def _normalize_dtype_name(dtype: Any) -> str | None: + if dtype is None: + return None + if isinstance(dtype, torch.dtype): + mapping = { + torch.bfloat16: "bfloat16", + torch.float16: "float16", + torch.float32: "float32", + } + return mapping.get(dtype, str(dtype).removeprefix("torch.")) + dtype_str = str(dtype) + return dtype_str.removeprefix("torch.") + + +def _resolve_runtime_device(vllm_config: VllmConfig) -> torch.device: + try: + from vllm_omni.platforms import current_omni_platform + + return current_omni_platform.get_torch_device() + except Exception: + pass + + device = getattr(getattr(vllm_config, "device_config", None), "device", None) + if isinstance(device, torch.device): + return device + if device: + return torch.device(device) + return torch.device("cpu") + + +def _prepare_runtime_model_dir( + model_path: str | Path, + *, + target_device: torch.device, + target_dtype: str | None, +) -> str: + source_dir = Path(model_path) + config_path = source_dir / "config.json" + if not config_path.exists(): + return str(source_dir) + + config_text = config_path.read_text() + config_dict = json.loads(config_text) + desired_device = target_device.type + desired_dtype = target_dtype or config_dict.get("dtype") + + if config_dict.get("device") == desired_device and config_dict.get("dtype") == desired_dtype: + return str(source_dir) + + digest = sha256(f"{source_dir.resolve()}:{config_text}:{desired_device}:{desired_dtype}".encode()).hexdigest()[:16] + runtime_dir = Path(tempfile.gettempdir()) / "vllm_omni_voxcpm_runtime" / digest + runtime_dir.mkdir(parents=True, exist_ok=True) + + for entry in source_dir.iterdir(): + target = runtime_dir / entry.name + if entry.name == "config.json" or target.exists(): + continue + try: + target.symlink_to(entry, target_is_directory=entry.is_dir()) + except OSError as 
exc: + logger.warning( + "Falling back to copying VoxCPM runtime artifact %s into %s because symlink creation failed: %s", + entry, + runtime_dir, + exc, + ) + if entry.is_dir(): + shutil.copytree(entry, target, dirs_exist_ok=True) + else: + shutil.copy2(entry, target) + + patched_config = dict(config_dict) + patched_config["device"] = desired_device + if desired_dtype is not None: + patched_config["dtype"] = desired_dtype + (runtime_dir / "config.json").write_text(json.dumps(patched_config, indent=2, sort_keys=True)) + return str(runtime_dir) + + +@contextmanager +def _force_cuda_available_for_npu(device: torch.device): + if device.type != "npu": + yield + return + + with patch("torch.cuda.is_available", return_value=True): + yield + + +def _is_torchcodec_load_error(exc: BaseException) -> bool: + message = str(exc).lower() + return "torchcodec" in message or "load_with_torchcodec" in message + + +def _load_audio_with_soundfile( + prompt_wav_path: str, + *, + sample_rate: int, +) -> torch.Tensor: + try: + import soundfile as sf + except ImportError: + raise + + audio_np, source_sr = sf.read(prompt_wav_path, dtype="float32", always_2d=True) + audio = torch.from_numpy(np.ascontiguousarray(audio_np.T)) + + if audio.size(0) > 1: + audio = audio.mean(dim=0, keepdim=True) + + if int(source_sr) != int(sample_rate): + try: + import torchaudio + except ImportError as exc: + raise ImportError("torchaudio is required for resampling prompt audio.") from exc + audio = torchaudio.functional.resample(audio, int(source_sr), int(sample_rate)) + + return audio + + +def _build_prompt_cache_with_soundfile(model: Any, *args: Any, **kwargs: Any) -> dict[str, Any]: + if args: + prompt_text = args[0] + prompt_wav_path = args[1] if len(args) > 1 else kwargs.get("prompt_wav_path") + else: + prompt_text = kwargs.get("prompt_text") + prompt_wav_path = kwargs.get("prompt_wav_path") + + if not prompt_text or not prompt_wav_path: + raise ValueError("prompt_text and prompt_wav_path are required") 
+ + audio = _load_audio_with_soundfile(prompt_wav_path, sample_rate=int(model.sample_rate)) + + patch_len = model.patch_size * model.chunk_size + if audio.size(1) % patch_len != 0: + padding_size = patch_len - audio.size(1) % patch_len + audio = torch.nn.functional.pad(audio, (padding_size, 0)) + + audio_feat = model.audio_vae.encode(audio.to(model.device), model.sample_rate).cpu() + audio_feat = audio_feat.view( + model.audio_vae.latent_dim, + -1, + model.patch_size, + ).permute(1, 2, 0) + + return { + "prompt_text": prompt_text, + "audio_feat": audio_feat, + } diff --git a/vllm_omni/model_executor/models/voxcpm/voxcpm_runtime_utils.py b/vllm_omni/model_executor/models/voxcpm/voxcpm_runtime_utils.py new file mode 100644 index 00000000000..36b4282c2d7 --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm/voxcpm_runtime_utils.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +import json +import shutil +from pathlib import Path + + +def resolve_voxcpm_model_dir(model: str) -> Path: + model_path = Path(model).expanduser() + if model_path.exists(): + return model_path + + from huggingface_hub import snapshot_download + + return Path(snapshot_download(repo_id=model)) + + +def prepare_voxcpm_hf_config_dir(model_dir: str | Path, hf_config_dir: str | Path) -> Path: + model_dir = Path(model_dir).expanduser() + hf_config_dir = Path(hf_config_dir).expanduser() + hf_config_dir.mkdir(parents=True, exist_ok=True) + + source_config_path = model_dir / "config.json" + if not source_config_path.exists(): + raise FileNotFoundError(f"VoxCPM config.json not found under {model_dir}") + + config_path = hf_config_dir / "config.json" + shutil.copy2(source_config_path, config_path) + + source_generation_config_path = model_dir / "generation_config.json" + if source_generation_config_path.exists(): + shutil.copy2(source_generation_config_path, hf_config_dir / "generation_config.json") + + config_dict = json.loads(config_path.read_text(encoding="utf-8")) + 
config_dict["model_type"] = "voxcpm" + config_dict.setdefault("architectures", ["VoxCPMForConditionalGeneration"]) + config_path.write_text(json.dumps(config_dict, indent=2, ensure_ascii=False), encoding="utf-8") + return hf_config_dir + + +__all__ = [ + "prepare_voxcpm_hf_config_dir", + "resolve_voxcpm_model_dir", +] diff --git a/vllm_omni/model_executor/models/voxcpm/voxcpm_stage_wrappers.py b/vllm_omni/model_executor/models/voxcpm/voxcpm_stage_wrappers.py new file mode 100644 index 00000000000..f4446c796e4 --- /dev/null +++ b/vllm_omni/model_executor/models/voxcpm/voxcpm_stage_wrappers.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +import os +from collections.abc import Generator +from typing import Any + +import torch +import torch.nn as nn +from einops import rearrange + + +class _DirectVoxCPMLatentGenerator: + def __init__(self, tts_model: Any): + self.tts_model = tts_model + self.sample_rate = int(getattr(tts_model, "sample_rate", 24000)) + + def generate_latents( + self, + *, + text: str, + prompt_wav_path: str | None = None, + prompt_text: str | None = None, + cfg_value: float = 2.0, + inference_timesteps: int = 10, + min_len: int = 2, + max_len: int = 4096, + retry_badcase: bool = True, + retry_badcase_max_times: int = 3, + retry_badcase_ratio_threshold: float = 6.0, + ) -> torch.Tensor: + if not isinstance(text, str) or not text.strip(): + raise ValueError("target text must be a non-empty string") + if (prompt_wav_path is None) != (prompt_text is None): + raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None") + if prompt_wav_path is not None and not os.path.exists(prompt_wav_path): + raise FileNotFoundError(f"prompt_wav_path does not exist: {prompt_wav_path}") + + prompt_cache = None + if prompt_wav_path is not None and prompt_text is not None: + prompt_cache = self.tts_model.build_prompt_cache( + prompt_text=prompt_text, + prompt_wav_path=prompt_wav_path, + ) + + gen_kw = dict( + target_text=" 
".join(text.split()), + prompt_cache=prompt_cache, + min_len=min_len, + max_len=max_len, + inference_timesteps=inference_timesteps, + cfg_value=cfg_value, + retry_badcase=retry_badcase, + retry_badcase_max_times=retry_badcase_max_times, + retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, + ) + latent_entry = getattr(self.tts_model, "generate_latents_with_prompt_cache", None) + if latent_entry is not None: + _, _, pred_audio_feat = latent_entry(**gen_kw) + else: + try: + _, _, pred_audio_feat = self.tts_model.generate_with_prompt_cache( + **gen_kw, + latents_only=True, + ) + except TypeError: + _, _, pred_audio_feat = self.tts_model.generate_with_prompt_cache(**gen_kw) + return pred_audio_feat.detach().cpu().to(torch.float32) + + def iter_latent_chunks_streaming( + self, + *, + text: str, + prompt_wav_path: str | None = None, + prompt_text: str | None = None, + cfg_value: float = 2.0, + inference_timesteps: int = 10, + min_len: int = 2, + max_len: int = 4096, + streaming_prefix_len: int = 3, + retry_badcase: bool = False, + retry_badcase_max_times: int = 3, + retry_badcase_ratio_threshold: float = 6.0, + ) -> Generator[tuple[torch.Tensor, bool], None, None]: + """Yield ``(latent_window, is_last_chunk)`` for Omni async_chunk latent to VAE.""" + if not isinstance(text, str) or not text.strip(): + raise ValueError("target text must be a non-empty string") + if (prompt_wav_path is None) != (prompt_text is None): + raise ValueError("prompt_wav_path and prompt_text must both be provided or both be None") + if prompt_wav_path is not None and not os.path.exists(prompt_wav_path): + raise FileNotFoundError(f"prompt_wav_path does not exist: {prompt_wav_path}") + + prompt_cache = None + if prompt_wav_path is not None and prompt_text is not None: + prompt_cache = self.tts_model.build_prompt_cache( + prompt_text=prompt_text, + prompt_wav_path=prompt_wav_path, + ) + + gen_kw = dict( + target_text=" ".join(text.split()), + prompt_cache=prompt_cache, + min_len=min_len, + 
max_len=max_len, + inference_timesteps=inference_timesteps, + cfg_value=cfg_value, + retry_badcase=retry_badcase, + retry_badcase_max_times=retry_badcase_max_times, + retry_badcase_ratio_threshold=retry_badcase_ratio_threshold, + streaming_prefix_len=streaming_prefix_len, + ) + stream_entry = getattr(self.tts_model, "generate_latents_with_prompt_cache_streaming", None) + if stream_entry is not None: + gen = stream_entry(**gen_kw) + else: + fallback_stream_entry = getattr(self.tts_model, "generate_with_prompt_cache_streaming", None) + if fallback_stream_entry is not None: + gen = fallback_stream_entry(**gen_kw, latents_only=True) + else: + gen = self.tts_model._generate_with_prompt_cache(streaming=True, latents_only=True, **gen_kw) + + iterator = iter(gen) + previous = next(iterator, None) + while previous is not None: + current = next(iterator, None) + _, _target_tok, chunk_latent = previous + if not isinstance(chunk_latent, torch.Tensor): + chunk_latent = torch.as_tensor(chunk_latent) + yield chunk_latent, current is None + previous = current + + +class _DirectVoxCPMAudioVAE: + def __init__(self, audio_vae: nn.Module, *, patch_size: int = 2): + self.audio_vae = audio_vae + self.sample_rate = int(getattr(audio_vae, "sample_rate", 24000)) + self.latent_dim = int(getattr(audio_vae, "latent_dim", 64)) + self.patch_size = int(patch_size) + self._chunk_size = int(getattr(audio_vae, "chunk_size", 1)) + self._stream_audio_patch_samples = max(1, self.patch_size * self._chunk_size) + + def _prepare_latents_for_decode(self, latent_audio_feat: Any) -> torch.Tensor: + latents = latent_audio_feat + if not isinstance(latents, torch.Tensor): + latents = torch.tensor(latents, dtype=torch.float32) + latents = latents.detach().to(torch.float32) + + if latents.ndim == 3: + if latents.shape[-1] == self.latent_dim: + latents = rearrange(latents, "t p d -> 1 d (t p)") + elif latents.shape[1] == self.latent_dim: + latents = latents.contiguous() + else: + raise ValueError(f"Unsupported 
latent_audio_feat shape: {tuple(latents.shape)}") + elif latents.ndim == 2: + if latents.shape[0] == self.latent_dim: + latents = latents.unsqueeze(0) + elif latents.shape[1] == self.latent_dim: + latents = rearrange(latents, "t d -> 1 d t") + else: + raise ValueError(f"Unsupported latent_audio_feat shape: {tuple(latents.shape)}") + else: + raise ValueError(f"Unsupported latent_audio_feat ndim: {latents.ndim}") + + return latents + + @torch.no_grad() + def decode(self, latent_audio_feat: Any, *, trim_streaming_patch: bool = False) -> torch.Tensor: + latents = self._prepare_latents_for_decode(latent_audio_feat) + device = next(self.audio_vae.parameters()).device + raw = self.audio_vae.decode(latents.to(device=device, dtype=torch.float32)) + if isinstance(raw, dict): + audio = raw.get("audio") + if audio is None: + audio = next(v for v in raw.values() if isinstance(v, torch.Tensor)) + else: + audio = raw + if audio.dim() == 3: + stream = audio.squeeze(1) + elif audio.dim() == 2: + stream = audio + else: + stream = audio.reshape(audio.shape[0], -1) + if trim_streaming_patch: + stream = stream[..., -self._stream_audio_patch_samples :] + return stream.reshape(-1).detach().cpu().to(torch.float32) diff --git a/vllm_omni/model_executor/stage_configs/voxcpm.yaml b/vllm_omni/model_executor/stage_configs/voxcpm.yaml new file mode 100644 index 00000000000..a5f324f6602 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/voxcpm.yaml @@ -0,0 +1,69 @@ +# VoxCPM two-stage (latent → VAE) without async_chunk: one-shot latent then decode. +stage_args: + - stage_id: 0 + stage_type: llm + is_comprehension: true + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: bfloat16 + model_stage: latent_generator + model_arch: VoxCPMForConditionalGeneration + # Optional persistent HF-compatible config dir for native VoxCPM models. 
+ hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.7 + distributed_executor_backend: "mp" + max_num_batched_tokens: 4096 + max_model_len: 4096 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 4096 + stop_token_ids: [2] + seed: 42 + detokenize: false + repetition_penalty: 1.0 + final_output: false + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: float32 + model_stage: vae + model_arch: VoxCPMForConditionalGeneration + hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.15 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 4096 + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.voxcpm.latent2vae + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 1 + seed: 42 + detokenize: true + repetition_penalty: 1.0 diff --git a/vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml b/vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml new file mode 100644 index 00000000000..cf78d4e4381 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml @@ -0,0 +1,102 @@ +# VoxCPM two-stage streaming (align with qwen3_tts.yaml async_chunk pattern). +# Stage0 (latent_generator) emits latent in time chunks; Stage1 (VAE) decodes as chunks arrive. 
+async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + is_comprehension: true + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: bfloat16 + model_stage: latent_generator + model_arch: VoxCPMForConditionalGeneration + # Optional persistent HF-compatible config dir for native VoxCPM models. + hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: true + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.7 + distributed_executor_backend: "mp" + max_num_batched_tokens: 4096 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.voxcpm.latent2vae_async_chunk + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 4096 + stop_token_ids: [2] + seed: 42 + detokenize: false + repetition_penalty: 1.0 + final_output: false + output_connectors: + to_stage_1: voxcpm_shm + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: float32 + model_stage: vae + model_arch: VoxCPMForConditionalGeneration + hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.15 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 4096 + engine_input_source: [0] + final_output: true + final_output_type: audio + input_connectors: + from_stage_0: voxcpm_shm + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 128 + seed: 42 + detokenize: true + repetition_penalty: 1.0 + + +runtime: + enabled: true + defaults: + 
window_size: -1 + max_inflight: 1 + + connectors: + voxcpm_shm: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + # Frame-aligned codec streaming transport. + codec_streaming: true + # Connector polling / timeout (unit: loop count, sleep interval in seconds). + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + # Align with Omni: small chunks with sufficient context overlap. + codec_chunk_frames: 1 + codec_left_context_frames: 1 + + edges: + - from: 0 + to: 1 + window_size: -1 diff --git a/vllm_omni/model_executor/stage_input_processors/voxcpm.py b/vllm_omni/model_executor/stage_input_processors/voxcpm.py new file mode 100644 index 00000000000..c2fcf521bf4 --- /dev/null +++ b/vllm_omni/model_executor/stage_input_processors/voxcpm.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +from typing import Any + +import torch +from vllm.inputs import TextPrompt + +from vllm_omni.inputs.data import OmniTokensPrompt + +_VOXCPM_LATENT_MAGIC = 131071 + + +def _serialize_latent_to_codes(latent: Any) -> list[int]: + latent_tensor = latent if isinstance(latent, torch.Tensor) else torch.as_tensor(latent) + latent_tensor = latent_tensor.detach().cpu().contiguous() + if latent_tensor.ndim == 3: + if latent_tensor.shape[0] != 1: + raise ValueError(f"Expected batch=1 latent tensor, got shape={tuple(latent_tensor.shape)}") + latent_tensor = latent_tensor.squeeze(0) + if latent_tensor.ndim != 2: + raise ValueError(f"Unsupported latent_audio_feat shape for async chunk: {tuple(latent_tensor.shape)}") + latent_dim, time_dim = int(latent_tensor.shape[0]), int(latent_tensor.shape[1]) + packed = latent_tensor.to(torch.bfloat16).contiguous().view(torch.uint16).reshape(-1).to(torch.int32) + return [_VOXCPM_LATENT_MAGIC, latent_dim, time_dim, *packed.tolist()] + + +def _coerce_finished_flag(value: Any) -> bool: + """Normalize VoxCPM async-chunk finished markers to a Python bool.""" + if value is None: + return False 
+ if isinstance(value, torch.Tensor): + if value.numel() != 1: + raise ValueError(f"finished tensor must be scalar, got shape={tuple(value.shape)}") + return bool(value.detach().cpu().item()) + if isinstance(value, (list, tuple)): + if not value: + return False + if len(value) != 1: + raise ValueError(f"finished container must have one element, got len={len(value)}") + return _coerce_finished_flag(value[0]) + return bool(value) + + +def latent2vae( + stage_list: list[Any], + engine_input_source: list[int], + prompt: OmniTokensPrompt | TextPrompt | None = None, + requires_multimodal_data: bool = False, +) -> list[OmniTokensPrompt]: + del prompt, requires_multimodal_data + + if not engine_input_source: + raise ValueError("engine_input_source cannot be empty") + + source_stage_id = engine_input_source[0] + if source_stage_id >= len(stage_list): + raise IndexError(f"Invalid stage_id: {source_stage_id}") + + source_outputs = stage_list[source_stage_id].engine_outputs + if source_outputs is None: + raise RuntimeError(f"Stage {source_stage_id} has no outputs yet") + + vae_inputs: list[OmniTokensPrompt] = [] + for source_output in source_outputs: + output = source_output.outputs[0] + multimodal_output = getattr(output, "multimodal_output", None) + if not isinstance(multimodal_output, dict) or "latent_audio_feat" not in multimodal_output: + raise ValueError( + "VoxCPM latent stage output missing 'latent_audio_feat'. 
" + f"request_id={getattr(source_output, 'request_id', None)}" + ) + + additional_information = { + "latent_audio_feat": multimodal_output["latent_audio_feat"], + } + if "sr" in multimodal_output: + additional_information["sample_rate"] = [int(multimodal_output["sr"])] + + vae_inputs.append( + OmniTokensPrompt( + prompt_token_ids=[0], + additional_information=additional_information, + multi_modal_data=None, + mm_processor_kwargs=None, + ) + ) + + return vae_inputs + + +def latent2vae_async_chunk( + transfer_manager: Any, + pooling_output: dict[str, Any] | None, + request: Any, + is_finished: bool = False, +) -> dict[str, Any] | None: + """Stage-0 latent → stage-1 VAE under ``async_chunk`` (connector payload).""" + # Kept for callback signature compatibility with OmniChunkTransferAdapter. + _ = transfer_manager + finished_request = _coerce_finished_flag(is_finished) + if callable(getattr(request, "is_finished", None)): + finished_request = finished_request or _coerce_finished_flag(request.is_finished()) + if not isinstance(pooling_output, dict): + if finished_request: + return { + "code_predictor_codes": [], + "finished": torch.tensor(True, dtype=torch.bool), + } + return None + + latent = pooling_output.get("latent_audio_feat") + if isinstance(latent, torch.Tensor) and latent.numel() == 0: + latent = None + + if latent is None: + if finished_request: + return { + "code_predictor_codes": [], + "finished": torch.tensor(True, dtype=torch.bool), + } + return None + + serialized_codes = _serialize_latent_to_codes(latent) + out: dict[str, Any] = { + "code_predictor_codes": serialized_codes, + "finished": torch.tensor(finished_request, dtype=torch.bool), + } + return out diff --git a/vllm_omni/platforms/npu/stage_configs/voxcpm.yaml b/vllm_omni/platforms/npu/stage_configs/voxcpm.yaml new file mode 100644 index 00000000000..dcd1f40517b --- /dev/null +++ b/vllm_omni/platforms/npu/stage_configs/voxcpm.yaml @@ -0,0 +1,67 @@ +stage_args: + - stage_id: 0 + stage_type: llm + 
is_comprehension: true + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: bfloat16 + model_stage: latent_generator + model_arch: VoxCPMForConditionalGeneration + # Optional persistent HF-compatible config dir for native VoxCPM models. + hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.75 + distributed_executor_backend: "mp" + max_num_batched_tokens: 4096 + max_model_len: 4096 + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.0 + final_output: false + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: float32 + model_stage: vae + model_arch: VoxCPMForConditionalGeneration + hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.1 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 4096 + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.voxcpm.latent2vae + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 1 + seed: 42 + detokenize: true + repetition_penalty: 1.0 diff --git a/vllm_omni/platforms/npu/stage_configs/voxcpm_async_chunk.yaml b/vllm_omni/platforms/npu/stage_configs/voxcpm_async_chunk.yaml new file mode 100644 index 00000000000..0a4ed7497d5 --- /dev/null +++ 
b/vllm_omni/platforms/npu/stage_configs/voxcpm_async_chunk.yaml @@ -0,0 +1,93 @@ +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm + is_comprehension: true + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: bfloat16 + model_stage: latent_generator + model_arch: VoxCPMForConditionalGeneration + # Optional persistent HF-compatible config dir for native VoxCPM models. + hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: latent + gpu_memory_utilization: 0.75 + distributed_executor_backend: "mp" + max_num_batched_tokens: 4096 + max_model_len: 4096 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.voxcpm.latent2vae_async_chunk + output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 4096 + seed: 42 + detokenize: false + repetition_penalty: 1.0 + final_output: false + + - stage_id: 1 + stage_type: llm + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + dtype: float32 + model_stage: vae + model_arch: VoxCPMForConditionalGeneration + hf_config_path: ${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,} + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio + gpu_memory_utilization: 0.1 + distributed_executor_backend: "mp" + max_num_batched_tokens: 8192 + max_model_len: 4096 + engine_input_source: [0] + input_connectors: + from_stage_0: connector_of_shared_memory + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 1 + seed: 42 
+ detokenize: true + repetition_penalty: 1.0 + +runtime: + enabled: true + defaults: + window_size: -1 + max_inflight: 1 + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + shm_threshold_bytes: 65536 + codec_streaming: false + connector_get_sleep_s: 0.01 + connector_get_max_wait_first_chunk: 3000 + connector_get_max_wait: 300 + + edges: + - from: 0 + to: 1 + window_size: -1 diff --git a/vllm_omni/transformers_utils/configs/__init__.py b/vllm_omni/transformers_utils/configs/__init__.py index 5f957c2f6de..0aa3624f802 100644 --- a/vllm_omni/transformers_utils/configs/__init__.py +++ b/vllm_omni/transformers_utils/configs/__init__.py @@ -17,6 +17,7 @@ "FishSpeechConfig": "vllm_omni.transformers_utils.configs.fish_speech", "FishSpeechSlowARConfig": "vllm_omni.transformers_utils.configs.fish_speech", "FishSpeechFastARConfig": "vllm_omni.transformers_utils.configs.fish_speech", + "VoxCPMConfig": "vllm_omni.transformers_utils.configs.voxcpm", "VoxCPM2Config": "vllm_omni.transformers_utils.configs.voxcpm2", } @@ -28,6 +29,7 @@ "FishSpeechConfig", "FishSpeechSlowARConfig", "FishSpeechFastARConfig", + "VoxCPMConfig", "VoxCPM2Config", ] @@ -49,4 +51,5 @@ def __dir__(): # run as soon as `vllm_omni.transformers_utils.configs` is imported. 
from vllm_omni.transformers_utils.configs import fish_speech as _fish_speech # noqa: F401, E402 from vllm_omni.transformers_utils.configs import mammoth_moda2 as _mammoth_moda2 # noqa: F401, E402 +from vllm_omni.transformers_utils.configs import voxcpm as _voxcpm # noqa: F401, E402 from vllm_omni.transformers_utils.configs import voxcpm2 as _voxcpm2 # noqa: F401, E402 diff --git a/vllm_omni/transformers_utils/configs/voxcpm.py b/vllm_omni/transformers_utils/configs/voxcpm.py new file mode 100644 index 00000000000..02678389150 --- /dev/null +++ b/vllm_omni/transformers_utils/configs/voxcpm.py @@ -0,0 +1,68 @@ +from transformers import AutoConfig +from transformers.configuration_utils import PretrainedConfig + + +class VoxCPMConfig(PretrainedConfig): + model_type = "voxcpm" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + bos_token_id: int = 1, + eos_token_id: int = 2, + vocab_size: int = 32000, + hidden_size: int = 1024, + intermediate_size: int = 4096, + max_position_embeddings: int = 4096, + num_attention_heads: int = 16, + num_hidden_layers: int = 24, + num_key_value_heads: int = 16, + rms_norm_eps: float = 1e-6, + rope_theta: float = 10000.0, + rope_scaling: dict | None = None, + lm_config: dict | None = None, + encoder_config: dict | None = None, + dit_config: dict | None = None, + audio_vae_config: dict | None = None, + patch_size: int = 2, + feat_dim: int = 64, + residual_lm_num_layers: int = 6, + scalar_quantization_latent_dim: int = 256, + scalar_quantization_scale: int = 9, + max_length: int = 4096, + device: str = "cuda", + dtype: str = "bfloat16", + dit_mean_mode: bool = False, + **kwargs, + ): + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.max_position_embeddings = max_position_embeddings + self.num_attention_heads = num_attention_heads + self.num_hidden_layers = 
num_hidden_layers + self.num_key_value_heads = num_key_value_heads + self.rms_norm_eps = rms_norm_eps + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + + self.lm_config = lm_config or {} + self.encoder_config = encoder_config or {} + self.dit_config = dit_config or {} + self.audio_vae_config = audio_vae_config + + self.patch_size = patch_size + self.feat_dim = feat_dim + self.residual_lm_num_layers = residual_lm_num_layers + self.scalar_quantization_latent_dim = scalar_quantization_latent_dim + self.scalar_quantization_scale = scalar_quantization_scale + self.max_length = max_length + self.device = device + self.dtype = dtype + self.dit_mean_mode = dit_mean_mode + + +AutoConfig.register("voxcpm", VoxCPMConfig) + +__all__ = ["VoxCPMConfig"] From 82f8c93343552d81e0e4730d90ce08e072fc3bcb Mon Sep 17 00:00:00 2001 From: Juan Pablo Zuluaga <46724788+JuanPZuluaga@users.noreply.github.com> Date: Wed, 15 Apr 2026 09:14:57 +0200 Subject: [PATCH 52/76] [Feat][Qwen3-Omni] Shared code predictor module for Qwen3-TTS and Qwen3-Omni (#2375) Signed-off-by: JuanPZuluaga Co-authored-by: Hongsheng Liu --- .../qwen3_tts/test_code_predictor_dtype.py | 92 ++- vllm_omni/engine/stage_init_utils.py | 5 +- .../model_executor/models/common/__init__.py | 0 .../models/common/qwen3_code_predictor.py | 654 ++++++++++++++++++ .../qwen3_omni_moe_code_predictor_mtp.py | 520 +------------- .../qwen3_tts_code_predictor_vllm.py | 571 +-------------- 6 files changed, 778 insertions(+), 1064 deletions(-) create mode 100644 vllm_omni/model_executor/models/common/__init__.py create mode 100644 vllm_omni/model_executor/models/common/qwen3_code_predictor.py diff --git a/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py b/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py index b0ce10a8d5e..8798cb3ca9a 100644 --- a/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py +++ b/tests/model_executor/models/qwen3_tts/test_code_predictor_dtype.py @@ 
-21,7 +21,7 @@ from pytest_mock import MockerFixture # Direct file import to avoid vllm_omni.__init__ patch dependencies. -_BASE = os.path.join( +_MODELS = os.path.join( os.path.dirname(__file__), os.pardir, os.pardir, @@ -30,14 +30,16 @@ "vllm_omni", "model_executor", "models", - "qwen3_tts", ) +_BASE = os.path.join(_MODELS, "qwen3_tts") +_COMMON = os.path.join(_MODELS, "common") def _load_module(name: str, filename: str): path = os.path.abspath(os.path.join(_BASE, filename)) spec = importlib.util.spec_from_file_location(name, path) mod = importlib.util.module_from_spec(spec) + sys.modules[name] = mod # register before exec (needed for dataclasses etc.) spec.loader.exec_module(mod) return mod @@ -59,8 +61,17 @@ def _build_mock_modules(mocker: MockerFixture) -> dict[str, object]: weight_utils_mock = mocker.MagicMock() weight_utils_mock.default_weight_loader = lambda p, w: None - pkg = types.ModuleType("vllm_omni.model_executor.models.qwen3_tts") - pkg.__path__ = [os.path.abspath(_BASE)] + tts_pkg = types.ModuleType("vllm_omni.model_executor.models.qwen3_tts") + tts_pkg.__path__ = [os.path.abspath(_BASE)] + + common_pkg = types.ModuleType("vllm_omni.model_executor.models.common") + common_pkg.__path__ = [os.path.abspath(_COMMON)] + + models_pkg = types.ModuleType("vllm_omni.model_executor.models") + models_pkg.__path__ = [os.path.abspath(_MODELS)] + + vllm_parallel_mock = mocker.MagicMock() + vllm_parallel_mock.VocabParallelEmbedding = torch.nn.Embedding return { "vllm_omni": mocker.MagicMock(), @@ -69,9 +80,11 @@ def _build_mock_modules(mocker: MockerFixture) -> dict[str, object]: "vllm.config": mocker.MagicMock(), "vllm.config.vllm": vllm_config_mod, "vllm.model_executor.model_loader.weight_utils": weight_utils_mock, + "vllm.model_executor.layers.vocab_parallel_embedding": vllm_parallel_mock, "vllm_omni.model_executor": types.ModuleType("vllm_omni.model_executor"), - "vllm_omni.model_executor.models": types.ModuleType("vllm_omni.model_executor.models"), - 
"vllm_omni.model_executor.models.qwen3_tts": pkg, + "vllm_omni.model_executor.models": models_pkg, + "vllm_omni.model_executor.models.common": common_pkg, + "vllm_omni.model_executor.models.qwen3_tts": tts_pkg, } @@ -88,6 +101,15 @@ def _load_target_classes(mocker: MockerFixture): ) sys.modules["vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts"] = config_mod + # Load the shared common module (thin wrappers import from it) + common_cp_path = os.path.abspath(os.path.join(_COMMON, "qwen3_code_predictor.py")) + common_spec = importlib.util.spec_from_file_location( + "vllm_omni.model_executor.models.common.qwen3_code_predictor", common_cp_path + ) + common_cp_mod = importlib.util.module_from_spec(common_spec) + sys.modules["vllm_omni.model_executor.models.common.qwen3_code_predictor"] = common_cp_mod + common_spec.loader.exec_module(common_cp_mod) + cp_mod = _load_module( "vllm_omni.model_executor.models.qwen3_tts.qwen3_tts_code_predictor_vllm", "qwen3_tts_code_predictor_vllm.py", @@ -104,6 +126,7 @@ def loaded_target_classes(mocker: MockerFixture): config_mod.Qwen3TTSTalkerConfig, cp_mod.Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM, cp_mod.Qwen3TTSTalkerCodePredictorModelVLLM, + cp_mod.CodePredictorWrapperConfig, ) @@ -114,6 +137,7 @@ def _make_tiny_config(loaded_target_classes) -> tuple: qwen3_tts_talker_config, _, _, + _, ) = loaded_target_classes cp_config = qwen3_tts_talker_code_predictor_config( vocab_size=64, @@ -145,7 +169,7 @@ class TestCodePredictorDtypeAlignment: def test_ensure_buffers_uses_given_dtype(self, mocker: MockerFixture, loaded_target_classes) -> None: """_ensure_buffers should create proj_buf with the given dtype.""" - _, _, code_predictor_wrapper, _ = loaded_target_classes + _, _, code_predictor_wrapper, _, _ = loaded_target_classes cp_config, talker_config = _make_tiny_config(loaded_target_classes) vllm_config = _make_vllm_config(mocker) @@ -156,17 +180,17 @@ def test_ensure_buffers_uses_given_dtype(self, mocker: 
MockerFixture, loaded_tar ) # Create buffer in float16 - predictor._ensure_buffers(torch.device("cpu"), torch.float16) + predictor._ensure_buffers(torch.device("cpu"), torch.float16, 4) assert predictor._proj_buf is not None assert predictor._proj_buf.dtype == torch.float16 # Re-create buffer in float32 (different dtype triggers re-allocation) - predictor._ensure_buffers(torch.device("cpu"), torch.float32) + predictor._ensure_buffers(torch.device("cpu"), torch.float32, 4) assert predictor._proj_buf.dtype == torch.float32 def test_warmup_aligns_buffer_to_model_params(self, mocker: MockerFixture, loaded_target_classes) -> None: """_warmup_buckets should align proj_buf dtype to model parameters.""" - _, _, code_predictor_wrapper, _ = loaded_target_classes + _, _, code_predictor_wrapper, _, _ = loaded_target_classes cp_config, talker_config = _make_tiny_config(loaded_target_classes) vllm_config = _make_vllm_config(mocker, max_num_seqs=2) @@ -180,7 +204,7 @@ def test_warmup_aligns_buffer_to_model_params(self, mocker: MockerFixture, loade predictor = predictor.to(torch.float16) # Pre-create proj_buf with WRONG dtype (float32) — simulating the bug - predictor._ensure_buffers(torch.device("cpu"), torch.float32) + predictor._ensure_buffers(torch.device("cpu"), torch.float32, 2) assert predictor._proj_buf.dtype == torch.float32 # Simulate _setup_compile having cached model dtype and compiled forward @@ -194,7 +218,7 @@ def test_warmup_aligns_buffer_to_model_params(self, mocker: MockerFixture, loade def test_setup_compile_caches_model_dtype(self, mocker: MockerFixture, loaded_target_classes) -> None: """_setup_compile should cache model parameter dtype.""" - _, _, code_predictor_wrapper, _ = loaded_target_classes + _, _, code_predictor_wrapper, _, _ = loaded_target_classes cp_config, talker_config = _make_tiny_config(loaded_target_classes) vllm_config = _make_vllm_config(mocker, max_num_seqs=2) @@ -211,7 +235,7 @@ def test_setup_compile_caches_model_dtype(self, mocker: 
MockerFixture, loaded_ta def test_forward_with_mismatched_input_dtype(self, mocker: MockerFixture, loaded_target_classes) -> None: """forward() should not crash when inputs are float32 but model is float16.""" - _, _, code_predictor_wrapper, _ = loaded_target_classes + _, _, code_predictor_wrapper, _, _ = loaded_target_classes cp_config, talker_config = _make_tiny_config(loaded_target_classes) vllm_config = _make_vllm_config(mocker, max_num_seqs=2) @@ -250,9 +274,9 @@ class TestCodePredictorModelDtype: def test_model_forward_float16(self, loaded_target_classes) -> None: """Inner model forward should work in float16.""" - _, _, _, code_predictor_model = loaded_target_classes + _, _, _, code_predictor_model, _ = loaded_target_classes cp_config, _ = _make_tiny_config(loaded_target_classes) - model = code_predictor_model(cp_config, talker_hidden_size=32).to(torch.float16) + model = code_predictor_model(cp_config, embedding_dim=32).to(torch.float16) bsz, seq_len = 1, 4 inputs = torch.randn(bsz, seq_len, 32, dtype=torch.float16) @@ -264,9 +288,9 @@ def test_model_forward_float16(self, loaded_target_classes) -> None: def test_model_forward_float32(self, loaded_target_classes) -> None: """Inner model forward should work in float32.""" - _, _, _, code_predictor_model = loaded_target_classes + _, _, _, code_predictor_model, _ = loaded_target_classes cp_config, _ = _make_tiny_config(loaded_target_classes) - model = code_predictor_model(cp_config, talker_hidden_size=32).to(torch.float32) + model = code_predictor_model(cp_config, embedding_dim=32).to(torch.float32) bsz, seq_len = 1, 4 inputs = torch.randn(bsz, seq_len, 32, dtype=torch.float32) @@ -275,3 +299,37 @@ def test_model_forward_float32(self, loaded_target_classes) -> None: output = model(inputs, pos_ids) assert output.dtype == torch.float32 assert output.shape == (bsz, seq_len, 32) + + +class TestCodePredictorWrapperConfig: + """Test wrapper configuration for different models.""" + + def test_omni_config(self, 
loaded_target_classes) -> None: + """Qwen3-Omni uses correct wrapper config.""" + _, _, _, _, code_predictor_wrapper_config = loaded_target_classes + config = code_predictor_wrapper_config( + use_cuda_graphs=False, + use_parallel_embedding=True, + use_projection=False, + return_proj_buf=True, + sampling_mode="stored", + ) + assert config.use_cuda_graphs is False + assert config.use_parallel_embedding is True + assert config.return_proj_buf is True + assert config.sampling_mode == "stored" + + def test_tts_config(self, loaded_target_classes) -> None: + """Qwen3-TTS uses correct wrapper config.""" + _, _, _, _, code_predictor_wrapper_config = loaded_target_classes + config = code_predictor_wrapper_config( + use_cuda_graphs=True, + use_parallel_embedding=False, + use_projection=True, + return_proj_buf=False, + sampling_mode="per_call", + ) + assert config.use_cuda_graphs is True + assert config.use_parallel_embedding is False + assert config.return_proj_buf is False + assert config.sampling_mode == "per_call" diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index bf40aa77cd5..3a7fe4bad77 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -192,8 +192,9 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata: default_sampling_params: OmniSamplingParams = SPClass(**default_sp) custom_process_input_func: Callable | None = None - if hasattr(stage_config, "custom_process_input_func"): - mod_path, fn_name = stage_config.custom_process_input_func.rsplit(".", 1) + _cpif_path = getattr(stage_config, "custom_process_input_func", None) + if _cpif_path: + mod_path, fn_name = _cpif_path.rsplit(".", 1) custom_process_input_func = getattr(importlib.import_module(mod_path), fn_name) prompt_expand_func: Callable | None = None diff --git a/vllm_omni/model_executor/models/common/__init__.py b/vllm_omni/model_executor/models/common/__init__.py new file mode 100644 index 00000000000..e69de29bb2d 
diff --git a/vllm_omni/model_executor/models/common/qwen3_code_predictor.py b/vllm_omni/model_executor/models/common/qwen3_code_predictor.py new file mode 100644 index 00000000000..3a904442fa8 --- /dev/null +++ b/vllm_omni/model_executor/models/common/qwen3_code_predictor.py @@ -0,0 +1,654 @@ +"""Qwen3 Code Predictor -- optimized re-prefill, no KV cache. + +Shared by Qwen3-Omni and Qwen3-TTS talker models. + +* SDPA attention (F.scaled_dot_product_attention) with native GQA support +* HF-compatible numerics (float32 RMSNorm, float32 RoPE, separate linear layers) +* Per-call embedding buffer to avoid cross-request aliasing +* Pre-allocated position_ids (read-only, safe to persist) +* torch.compile (epilogue_fusion=False) on inner transformer by default +* Optional manual CUDA graph capture per batch-size bucket +* Inline sampling (top-k + top-p) -- no custom op overhead +""" + +from __future__ import annotations + +import dataclasses +from collections.abc import Iterable + +import torch +import torch.nn as nn +import torch.nn.functional as F +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.model_loader.weight_utils import default_weight_loader + +from vllm_omni.platforms import current_omni_platform + +logger = init_logger(__name__) + + +# =================================================================== +# HF-numerics-compatible layers for code predictor +# =================================================================== +# +# These use plain PyTorch ops (nn.Linear, manual RMSNorm in float32, +# rotate_half RoPE) to produce outputs numerically identical to the +# HuggingFace reference. vLLM's fused kernels (RMSNorm, QKVParallel, +# get_rope) introduce small precision differences that compound across +# the autoregressive steps of the code predictor, causing severe +# audio quality degradation. 
+# +# See: https://github.com/vllm-project/vllm-omni/issues/2274 + + +class _RMSNorm(nn.Module): + """RMSNorm matching HuggingFace's implementation exactly. + + Computes variance in float32 to avoid bfloat16 precision loss. + """ + + def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +def _rotate_half(x: torch.Tensor) -> torch.Tensor: + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +class _RotaryEmbedding(nn.Module): + """RoPE matching HuggingFace's implementation exactly. + + Forces float32 computation for cos/sin, matching HF's torch.autocast(enabled=False). 
+ """ + + def __init__(self, config) -> None: + super().__init__() + head_dim = getattr( + config, + "head_dim", + config.hidden_size // config.num_attention_heads, + ) + rope_theta = getattr(config, "rope_theta", 10000.0) + inv_freq = 1.0 / (rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, x: torch.Tensor, position_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + # position_ids: [batch, seq_len] + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + # Force float32 (matching HF) + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +# =================================================================== +# Attention +# =================================================================== + + +class CodePredictorAttention(nn.Module): + """Multi-head self-attention for code predictor. + + Uses ``F.scaled_dot_product_attention`` with HF-compatible RoPE and RMSNorm. + No KV cache -- the code predictor always re-prefills the full (short) + sequence each AR step. 
+ + Input : [B, seq_len, hidden_size] + Output: [B, seq_len, hidden_size] + """ + + def __init__(self, config, *, prefix: str = "") -> None: + super().__init__() + self.num_heads = config.num_attention_heads + self.num_kv_heads = config.num_key_value_heads + self.head_dim = getattr( + config, + "head_dim", + config.hidden_size // config.num_attention_heads, + ) + self.hidden_size = config.hidden_size + self.scaling = self.head_dim**-0.5 + self._use_gqa = self.num_kv_heads != self.num_heads + + # Separate q/k/v projections matching HF (no fused packing) + bias = getattr(config, "attention_bias", False) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_kv_heads * self.head_dim, bias=bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + self.q_norm = _RMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.k_norm = _RMSNorm(self.head_dim, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + ) -> torch.Tensor: + bsz, seq_len, _ = hidden_states.shape + hidden_shape_q = (bsz, seq_len, self.num_heads, self.head_dim) + hidden_shape_kv = (bsz, seq_len, self.num_kv_heads, self.head_dim) + + q = self.q_norm(self.q_proj(hidden_states).view(hidden_shape_q)).transpose(1, 2) + k = self.k_norm(self.k_proj(hidden_states).view(hidden_shape_kv)).transpose(1, 2) + v = self.v_proj(hidden_states).view(hidden_shape_kv).transpose(1, 2) + + cos, sin = position_embeddings + # cos/sin are [batch, seq_len, head_dim], need unsqueeze at dim=1 for heads + cos = cos.unsqueeze(1) # [batch, 1, seq_len, head_dim] + sin = sin.unsqueeze(1) + q = (q * cos) + (_rotate_half(q) * sin) + k = (k * cos) + (_rotate_half(k) * sin) + + attn_out = F.scaled_dot_product_attention( + q, + k, + v, + 
scale=self.scaling, + is_causal=True, + enable_gqa=self._use_gqa, + ) + + attn_out = attn_out.transpose(1, 2).reshape(bsz, seq_len, -1) + return self.o_proj(attn_out) + + +# =================================================================== +# MLP +# =================================================================== + + +class CodePredictorMLP(nn.Module): + """SiLU-gated MLP for code predictor, matching HF's implementation.""" + + def __init__(self, config, *, prefix: str = "") -> None: + super().__init__() + self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) + self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) + self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return self.down_proj(F.silu(self.gate_proj(hidden_states)) * self.up_proj(hidden_states)) + + +# =================================================================== +# Decoder Layer +# =================================================================== + + +class CodePredictorDecoderLayer(nn.Module): + """Transformer decoder layer (SDPA, no KV cache).""" + + def __init__(self, config, *, prefix: str = "") -> None: + super().__init__() + self.self_attn = CodePredictorAttention(config, prefix=f"{prefix}.self_attn") + self.mlp = CodePredictorMLP(config, prefix=f"{prefix}.mlp") + self.input_layernorm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + ) -> torch.Tensor: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn(hidden_states, position_embeddings) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = 
self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +# =================================================================== +# Base Transformer Model (re-prefill, no KV cache) +# =================================================================== + + +class CodePredictorBaseModel(nn.Module): + """Inner transformer for code predictor. + + Signature: ``forward(inputs_embeds, position_ids) -> hidden_states`` + """ + + def __init__( + self, + config, + *, + embedding_dim: int | None = None, + use_parallel_embedding: bool = False, + prefix: str = "", + ) -> None: + super().__init__() + self.config = config + + emb_dim = int(embedding_dim) if embedding_dim is not None else int(config.hidden_size) + if use_parallel_embedding: + self.codec_embedding = nn.ModuleList( + [VocabParallelEmbedding(config.vocab_size, emb_dim) for _ in range(config.num_code_groups - 1)] + ) + else: + self.codec_embedding = nn.ModuleList( + [nn.Embedding(config.vocab_size, emb_dim) for _ in range(config.num_code_groups - 1)] + ) + + self.layers = nn.ModuleList( + [ + CodePredictorDecoderLayer(config, prefix=f"{prefix}.layers.{idx}") + for idx in range(config.num_hidden_layers) + ] + ) + self.norm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = _RotaryEmbedding(config) + + def get_input_embeddings(self) -> nn.ModuleList: + return self.codec_embedding + + def forward( + self, + inputs_embeds: torch.Tensor, + position_ids: torch.Tensor, + ) -> torch.Tensor: + hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + for layer in self.layers: + hidden_states = layer(hidden_states, position_embeddings) + hidden_states = self.norm(hidden_states) + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + params_dict = dict(self.named_parameters(remove_duplicate=False)) + 
loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + param = params_dict.get(name) + if param is None: + continue + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +# =================================================================== +# Wrapper Configuration +# =================================================================== + + +@dataclasses.dataclass +class CodePredictorWrapperConfig: + """Controls behavioral differences between model-specific code predictors.""" + + use_cuda_graphs: bool = False + use_parallel_embedding: bool = False + use_projection: bool = False + return_proj_buf: bool = False + sampling_mode: str = "stored" + + +# =================================================================== +# Code Predictor Wrapper (optimized re-prefill, persistent buffers) +# =================================================================== + + +class CodePredictorWrapper(nn.Module): + """Optimized code predictor -- re-prefill approach, no KV cache. + + Each AR step forwards the full growing sequence (len 2 -> num_code_groups+1) + through the transformer. The extra O(T^2) FLOPs are negligible for + short sequences, and this avoids all KV-cache management overhead. + + Optimizations: + 1. Per-call embedding buffer -- avoids cross-request aliasing. + 2. Pre-allocated position_ids -- no torch.arange per step. + 3. Cached module references -- bypass ModuleList indexing. + 4. torch.compile on inner transformer. + 5. Inline sampling (top-k + top-p) -- no custom op overhead. + 6. Optional manual CUDA graph capture per batch-size bucket. 
+ """ + + def __init__( + self, + *, + vllm_config: VllmConfig, + cp_config, + wrapper_config: CodePredictorWrapperConfig, + talker_hidden_size: int | None = None, + prefix: str = "", + ) -> None: + super().__init__() + self._vllm_config = vllm_config + self.config = cp_config + self._wrapper_config = wrapper_config + self.prefix = prefix + + self._num_groups = int(cp_config.num_code_groups) + self._cp_hidden = int(cp_config.hidden_size) + + # For Omni backward compat (accessed by the talker) + self.num_code_groups = self._num_groups + + # Determine embedding dimension + _talker_hidden = int(talker_hidden_size) if talker_hidden_size is not None else self._cp_hidden + + self.model = CodePredictorBaseModel( + cp_config, + embedding_dim=_talker_hidden, + use_parallel_embedding=wrapper_config.use_parallel_embedding, + prefix=f"{prefix}.model" if prefix else "model", + ) + + self.lm_head = nn.ModuleList( + [nn.Linear(cp_config.hidden_size, cp_config.vocab_size, bias=False) for _ in range(self._num_groups - 1)] + ) + + # Projection: Identity when hidden sizes match or not needed + if wrapper_config.use_projection and _talker_hidden != self._cp_hidden: + self.small_to_mtp_projection = nn.Linear(_talker_hidden, self._cp_hidden, bias=True) + else: + self.small_to_mtp_projection = nn.Identity() + + # Sampling defaults for "stored" mode + self._top_k: int = 50 + self._top_p: float = 0.8 + + # Lazily initialised state + self._proj_buf: torch.Tensor | None = None + self._model_dtype: torch.dtype | None = None + self._compiled_model_fwd = None + self._bucket_sizes: list[int] = [] + self._bucket_pos_ids: dict[int, torch.Tensor] = {} + self._lm_heads_list: list[nn.Module] | None = None + self._codec_embeds_list: list[nn.Module] | None = None + self._cuda_graphs: dict[int, tuple[torch.cuda.CUDAGraph, torch.Tensor]] = {} + + def get_input_embeddings(self) -> nn.ModuleList: + return self.model.get_input_embeddings() + + def set_sampling_params(self, top_k: int = 50, top_p: float = 
0.8) -> None: + """Configure sampling parameters to maintain consistency with previous implementation.""" + self._top_k = top_k + self._top_p = top_p + logger.debug("Sampling parameters updated: top_k=%d, top_p=%.2f", top_k, top_p) + + # ------------------------------------------------------------------ + # Lazy-init helpers + # ------------------------------------------------------------------ + + def _ensure_buffers(self, device: torch.device, dtype: torch.dtype, bsz: int) -> None: + """Ensure the projection buffer can hold at least *bsz* rows.""" + max_seq = self._num_groups + 1 + if ( + self._proj_buf is not None + and self._proj_buf.device == device + and self._proj_buf.dtype == dtype + and self._proj_buf.shape[0] >= bsz + ): + return + self._proj_buf = torch.zeros(bsz, max_seq, self._cp_hidden, dtype=dtype, device=device) + + def _setup_compile(self) -> None: + """Lazily set up torch.compile with optional CUDA graph capture.""" + if self._compiled_model_fwd is not None: + return + + # Cache model parameter dtype so forward() doesn't need to query it + # on every call. Also ensures warmup buffers match model precision + # even when upstream modules produce a different dtype (#2385). + self._model_dtype = next(self.model.parameters()).dtype + self._lm_heads_list = list(self.lm_head) + self._codec_embeds_list = list(self.model.codec_embedding) + + if not current_omni_platform.supports_torch_inductor(): + logger.warning_once("code_predictor: torch.compile disabled") + self._compiled_model_fwd = self.model.forward + return + + # torch.compile fuses RMSNorm/RoPE in ways that lose float32 + # precision, compounding across AR steps. Use epilogue_fusion=False + # to disable the problematic fusions while still getting kernel + # fusion benefits for the linear layers and SDPA. 
+ self._compiled_model_fwd = torch.compile( + self.model.forward, + dynamic=False, + options={"epilogue_fusion": False}, + ) + self._warmup_buckets() + + if self._wrapper_config.use_cuda_graphs: + self._capture_cuda_graphs() + logger.info("code_predictor: torch.compile (no epilogue fusion) + CUDA graphs") + else: + logger.info("code_predictor: torch.compile (dynamic=False, no epilogue fusion)") + + def _padded_bsz(self, bsz: int) -> int: + """Round batch size up to nearest power-of-2 bucket.""" + for bucket in self._bucket_sizes: + if bsz <= bucket: + return bucket + return bsz + + def _warmup_buckets(self) -> None: + """Warmup power-of-2 batch-size buckets to front-load Inductor compilation.""" + max_bsz = self._vllm_config.scheduler_config.max_num_seqs + bucket_sizes = [1 << i for i in range(max_bsz.bit_length()) if (1 << i) <= max_bsz] + if max_bsz not in bucket_sizes: + bucket_sizes.append(max_bsz) + self._bucket_sizes = sorted(bucket_sizes) + + max_seq = self._num_groups + 1 + device = next(self.model.parameters()).device + + # Ensure proj_buf matches model parameter dtype to avoid dtype + # mismatch during warmup compilation (see #2385). 
+ self._ensure_buffers(device, self._model_dtype, max(self._bucket_sizes)) + proj_buf = self._proj_buf + + for bsz in self._bucket_sizes: + pos_ids = torch.arange(max_seq, device=device, dtype=torch.long).unsqueeze(0).expand(bsz, -1).contiguous() + self._bucket_pos_ids[bsz] = pos_ids + for _ in range(3): + self._compiled_model_fwd(proj_buf[:bsz, :max_seq, :], pos_ids) + logger.info("code_predictor: warmup done for buckets %s", self._bucket_sizes) + + def _capture_cuda_graphs(self) -> None: + """Capture a CUDA graph per bucket using vLLM's global graph pool.""" + from vllm.platforms import current_platform + + pool = current_platform.get_global_graph_pool() + max_seq = self._num_groups + 1 + proj_buf = self._proj_buf + + for bsz in self._bucket_sizes: + static_input = proj_buf[:bsz, :max_seq, :] + pos_ids = self._bucket_pos_ids[bsz] + + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g, pool=pool): + static_output = self._compiled_model_fwd(static_input, pos_ids) + + self._cuda_graphs[bsz] = (g, static_output) + + logger.info("code_predictor: captured CUDA graphs for buckets %s", self._bucket_sizes) + + # ------------------------------------------------------------------ + # Forward -- re-prefill + inline sampling + # ------------------------------------------------------------------ + + @torch.inference_mode() + def forward( + self, + layer0_code: torch.Tensor, + layer0_embed: torch.Tensor, + last_talker_hidden: torch.Tensor, + do_sample: bool = True, + temperature: float = 0.9, + top_k: int = 50, + top_p: float = 1.0, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + """Predict residual codebooks 1..G-1 autoregressively via re-prefill.""" + bsz = int(layer0_code.shape[0]) + num_groups = self._num_groups + device = layer0_code.device + + # _setup_compile caches _model_dtype on first call; use it for buffers + # so they always match model weight precision (#2385). 
+ self._setup_compile() + dtype = self._model_dtype + + padded_bsz = self._padded_bsz(bsz) + self._ensure_buffers(device, dtype, padded_bsz) + + proj_buf = self._proj_buf + max_seq = num_groups + 1 + projection = self.small_to_mtp_projection + model_fwd = self._compiled_model_fwd + lm_heads = self._lm_heads_list + codec_embeds = self._codec_embeds_list + + # Zero the padded region of the buffer + proj_buf[:padded_bsz].zero_() + + # Fill buffer positions 0 (talker hidden) & 1 (layer0 embed) + proj_buf[:bsz, 0, :] = projection(last_talker_hidden.reshape(bsz, 1, -1).to(dtype)).reshape(bsz, -1) + proj_buf[:bsz, 1, :] = projection(layer0_embed.reshape(bsz, 1, -1).to(dtype)).reshape(bsz, -1) + + # Get pre-computed pos_ids for this bucket + full_pos_ids = self._bucket_pos_ids.get(padded_bsz) + if full_pos_ids is None: + full_pos_ids = ( + torch.arange(max_seq, device=device, dtype=torch.long).unsqueeze(0).expand(padded_bsz, -1).contiguous() + ) + + # Use captured CUDA graph if available, otherwise call compiled fn. + cuda_graph_entry = self._cuda_graphs.get(padded_bsz) + + # Prepare sampling parameters + stored_mode = self._wrapper_config.sampling_mode == "stored" + if stored_mode: + s_top_k = self._top_k + s_top_p = self._top_p + else: + use_sampling = do_sample and temperature > 0 + inv_temperature = 1.0 / max(temperature, 1e-6) if use_sampling else 0.0 + if use_sampling and top_p != 1.0: + raise NotImplementedError( + "top_p sampling is not implemented for the vLLM-native code predictor; please set top_p=1.0." 
+ ) + + # Output codes -- shape depends on return mode + if self._wrapper_config.return_proj_buf: + all_codes = torch.empty(bsz, num_groups, 1, dtype=torch.int64, device=device) + all_codes[:, 0] = layer0_code.reshape(bsz, -1)[:, :1] + else: + all_codes = torch.empty(bsz, num_groups, dtype=torch.long, device=device) + all_codes[:, 0] = layer0_code.reshape(bsz) + + # Autoregressive loop: predict layers 1..G-1 + for step in range(1, num_groups): + # Run transformer (CUDA graph replay or compiled forward) + if cuda_graph_entry is not None: + cuda_graph_entry[0].replay() + hidden_out = cuda_graph_entry[1] + else: + hidden_out = model_fwd(proj_buf[:padded_bsz, :max_seq, :], full_pos_ids) + + logits = lm_heads[step - 1](hidden_out[:bsz, step, :]) + + # Sample next code + if stored_mode: + # "stored" mode: top-k -> top-p -> softmax -> multinomial + if s_top_k > 0: + topk_vals, _ = logits.topk(s_top_k, dim=-1) + logits = logits.masked_fill(logits < topk_vals[:, -1:], float("-inf")) + if s_top_p < 1.0: + sorted_logits, sorted_idx = logits.sort(dim=-1, descending=True) + sorted_probs = F.softmax(sorted_logits, dim=-1) + cumulative_probs = sorted_probs.cumsum(dim=-1) + remove_mask = (cumulative_probs - sorted_probs) >= s_top_p + sorted_logits[remove_mask] = float("-inf") + logits = sorted_logits.scatter(1, sorted_idx, sorted_logits) + probs = F.softmax(logits, dim=-1) + code = torch.multinomial(probs, num_samples=1) + else: + # "per_call" mode: temperature-scaled + top-k + if use_sampling: + scaled = logits * inv_temperature + if top_k > 0: + topk_vals, _ = scaled.topk(top_k, dim=-1) + scaled = scaled.masked_fill(scaled < topk_vals[:, -1:], float("-inf")) + probs = F.softmax(scaled, dim=-1) + code = torch.multinomial(probs, num_samples=1) + else: + code = logits.argmax(dim=-1, keepdim=True) + + # Store code + if self._wrapper_config.return_proj_buf: + all_codes[:, step] = code + else: + all_codes[:, step] = code.reshape(bsz) + + # Embed predicted code -> project -> next 
buffer position + if step < num_groups - 1 or self._wrapper_config.return_proj_buf: + new_embed = codec_embeds[step - 1](code) + proj_buf[:bsz, step + 1, :] = projection(new_embed.reshape(bsz, 1, -1)).reshape(bsz, -1) + + if self._wrapper_config.return_proj_buf: + return all_codes, proj_buf[:bsz].clone() + return all_codes + + # ------------------------------------------------------------------ + # Weight loading + # ------------------------------------------------------------------ + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load weights directly (no fused projection remapping needed).""" + loaded: set[str] = set() + model_weights: list[tuple[str, torch.Tensor]] = [] + other_weights: list[tuple[str, torch.Tensor]] = [] + + for name, w in weights: + if "rotary_emb.inv_freq" in name: + continue + if name.startswith("model."): + model_weights.append((name[len("model.") :], w)) + else: + other_weights.append((name, w)) + + loaded_model = self.model.load_weights(model_weights) + loaded |= {f"model.{n}" for n in loaded_model} + + params = dict(self.named_parameters(remove_duplicate=False)) + for name, w in other_weights: + param = params.get(name) + if param is None: + continue + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, w) + loaded.add(name) + + return loaded diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py index 2ceaafdb670..819e22e181e 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_code_predictor_mtp.py @@ -1,510 +1,28 @@ -"""Qwen3-Omni Code Predictor -- optimized re-prefill, no KV cache. 
+"""Qwen3-Omni Code Predictor -- thin wrapper over CodePredictorWrapper.""" -* SDPA attention (F.scaled_dot_product_attention) with native GQA support -* HF-compatible numerics (float32 RMSNorm, float32 RoPE, separate linear layers) -* Per-call embedding buffer to avoid cross-request aliasing -* Pre-allocated position_ids (read-only, safe to persist) -* torch.compile (epilogue_fusion=False) on inner transformer by default -* Inline sampling (top-k + top-p) -- no custom op overhead -""" - -import torch -import torch.nn as nn -import torch.nn.functional as F from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding -from vllm.model_executor.model_loader.weight_utils import default_weight_loader - -from vllm_omni.platforms import current_omni_platform - -logger = init_logger(__name__) - - -# =================================================================== -# HF-numerics-compatible layers for code predictor -# =================================================================== -# -# These use plain PyTorch ops (nn.Linear, manual RMSNorm in float32, -# rotate_half RoPE) to produce outputs numerically identical to the -# HuggingFace reference. vLLM's fused kernels (RMSNorm, QKVParallel, -# get_rope) introduce small precision differences that compound across -# the autoregressive steps of the code predictor, causing severe -# audio quality degradation. -# -# See: https://github.com/vllm-project/vllm-omni/issues/2274 - - -class _RMSNorm(nn.Module): - """RMSNorm matching HuggingFace's implementation exactly. - - Computes variance in float32 to avoid bfloat16 precision loss. 
- """ - - def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -def _rotate_half(x: torch.Tensor) -> torch.Tensor: - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -class _RotaryEmbedding(nn.Module): - """RoPE matching HuggingFace's implementation exactly. - - Forces float32 computation for cos/sin, matching HF's torch.autocast(enabled=False). - """ - - def __init__(self, config) -> None: - super().__init__() - head_dim = getattr( - config, - "head_dim", - config.hidden_size // config.num_attention_heads, - ) - rope_theta = getattr(config, "rope_theta", 10000.0) - inv_freq = 1.0 / (rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - def forward(self, x: torch.Tensor, position_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - # position_ids: [batch, seq_len] - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - - # Force float32 (matching HF) - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - - return cos.to(dtype=x.dtype), 
sin.to(dtype=x.dtype) - - -class Qwen3OmniCodePredictorAttention(nn.Module): - """Multi-head self-attention for code predictor. - - Uses ``F.scaled_dot_product_attention`` with HF-compatible RoPE and RMSNorm. - No KV cache -- the code predictor always re-prefills the full (short) - sequence each AR step. - - Input : [B, seq_len, hidden_size] - Output: [B, seq_len, hidden_size] - """ - - def __init__( - self, - config, - prefix: str = "", - ): - super().__init__() - cp_cfg = config.code_predictor_config - self.num_heads = cp_cfg.num_attention_heads - self.num_kv_heads = cp_cfg.num_key_value_heads - self.head_dim = getattr( - cp_cfg, - "head_dim", - cp_cfg.hidden_size // cp_cfg.num_attention_heads, - ) - self.hidden_size = cp_cfg.hidden_size - self.scaling = self.head_dim**-0.5 - self._use_gqa = self.num_kv_heads != self.num_heads - - # Separate q/k/v projections matching HF (no fused packing) - self.q_proj = nn.Linear( - self.hidden_size, - self.num_heads * self.head_dim, - bias=False, - ) - self.k_proj = nn.Linear( - self.hidden_size, - self.num_kv_heads * self.head_dim, - bias=False, - ) - self.v_proj = nn.Linear( - self.hidden_size, - self.num_kv_heads * self.head_dim, - bias=False, - ) - self.o_proj = nn.Linear( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - ) - self.q_norm = _RMSNorm(self.head_dim, eps=cp_cfg.rms_norm_eps) - self.k_norm = _RMSNorm(self.head_dim, eps=cp_cfg.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - ) -> torch.Tensor: - bsz, seq_len, _ = hidden_states.shape - hidden_shape_q = (bsz, seq_len, self.num_heads, self.head_dim) - hidden_shape_kv = (bsz, seq_len, self.num_kv_heads, self.head_dim) - - q = self.q_norm(self.q_proj(hidden_states).view(hidden_shape_q)).transpose(1, 2) - k = self.k_norm(self.k_proj(hidden_states).view(hidden_shape_kv)).transpose(1, 2) - v = self.v_proj(hidden_states).view(hidden_shape_kv).transpose(1, 2) - - cos, sin 
= position_embeddings - # cos/sin are [batch, seq_len, head_dim], need unsqueeze at dim=1 for heads - cos = cos.unsqueeze(1) # [batch, 1, seq_len, head_dim] - sin = sin.unsqueeze(1) - q = (q * cos) + (_rotate_half(q) * sin) - k = (k * cos) + (_rotate_half(k) * sin) - - attn_out = F.scaled_dot_product_attention( - q, - k, - v, - scale=self.scaling, - is_causal=True, - enable_gqa=self._use_gqa, - ) - - attn_out = attn_out.transpose(1, 2).reshape(bsz, seq_len, -1) - output = self.o_proj(attn_out) - return output - - -# =================================================================== -# MLP -# =================================================================== - - -class Qwen3OmniCodePredictorMLP(nn.Module): - """SiLU-gated MLP for code predictor, matching HF's implementation.""" - - def __init__( - self, - config, - prefix: str = "", - ): - super().__init__() - hidden_size = config.code_predictor_config.hidden_size - intermediate_size = config.code_predictor_config.intermediate_size - - self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) - self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - return self.down_proj(F.silu(self.gate_proj(hidden_states)) * self.up_proj(hidden_states)) - - -# =================================================================== -# Decoder Layer -# =================================================================== - - -class Qwen3OmniCodePredictorDecoderLayer(nn.Module): - """Transformer decoder layer (SDPA, no KV cache).""" - - def __init__( - self, - config, - prefix: str = "", - ) -> None: - super().__init__() - self.self_attn = Qwen3OmniCodePredictorAttention( - config, - prefix=f"{prefix}.self_attn", - ) - self.mlp = Qwen3OmniCodePredictorMLP( - config, - prefix=f"{prefix}.mlp", - ) - cp_cfg = config.code_predictor_config - self.input_layernorm = 
_RMSNorm(cp_cfg.hidden_size, eps=cp_cfg.rms_norm_eps) - self.post_attention_layernorm = _RMSNorm(cp_cfg.hidden_size, eps=cp_cfg.rms_norm_eps) - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - ) -> torch.Tensor: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - hidden_states = self.self_attn(hidden_states, position_embeddings) - hidden_states = residual + hidden_states +from vllm_omni.model_executor.models.common.qwen3_code_predictor import ( + CodePredictorWrapper, + CodePredictorWrapperConfig, +) - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - return hidden_states +class Qwen3OmniMoeTalkerCodePredictor(CodePredictorWrapper): + """Qwen3-Omni code predictor (no CUDA graphs, VocabParallelEmbedding).""" -# =================================================================== -# Base Transformer Model (re-prefill, no KV cache) -# =================================================================== - - -class Qwen3OmniCodePredictorBaseModel(nn.Module): - """Inner transformer for code predictor. - - Signature: ``forward(inputs_embeds, position_ids) -> hidden_states`` - -- plain Tensor in, plain Tensor out (no namedtuple). 
- """ - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config.code_predictor_config - self.config = config - - self.codec_embedding = nn.ModuleList( - [VocabParallelEmbedding(config.vocab_size, config.hidden_size) for _ in range(config.num_code_groups - 1)] - ) - - self.layers = nn.ModuleList( - [ - Qwen3OmniCodePredictorDecoderLayer( - vllm_config.model_config.hf_config, - prefix=f"{prefix}.layers.{idx}", - ) - for idx in range(config.num_hidden_layers) - ] - ) - self.norm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = _RotaryEmbedding(config) - - def forward( - self, - inputs_embeds: torch.Tensor, - position_ids: torch.Tensor, - ) -> torch.Tensor: - hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) - for layer in self.layers: - hidden_states = layer(hidden_states, position_embeddings) - hidden_states = self.norm(hidden_states) - return hidden_states - - -# =================================================================== -# Code Predictor Wrapper (optimized re-prefill, persistent buffers) -# =================================================================== - - -class Qwen3OmniMoeTalkerCodePredictor(nn.Module): - """Optimized code predictor -- re-prefill approach, no KV cache. - - Each AR step forwards the full growing sequence (len 2 -> num_code_groups+1) - through the transformer. The extra O(T^2) FLOPs are negligible for - short sequences, and this avoids all KV-cache management overhead. - - Optimizations: - 1. Per-call embedding buffer -- avoids cross-request aliasing. - 2. Pre-allocated position_ids -- no torch.arange per step. - 3. Cached module references -- bypass ModuleList indexing. - 4. torch.compile on inner transformer. - 5. Inline sampling (top-k + top-p) -- no custom op overhead. 
- """ - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - self.config = config - self.quant_config = vllm_config.quant_config - self.prefix = prefix - - self.num_code_groups = config.code_predictor_config.num_code_groups - self._hidden_size = config.code_predictor_config.hidden_size - - self.model = Qwen3OmniCodePredictorBaseModel( + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + cp_config = vllm_config.model_config.hf_config.code_predictor_config + super().__init__( vllm_config=vllm_config, + cp_config=cp_config, + wrapper_config=CodePredictorWrapperConfig( + use_cuda_graphs=False, + use_parallel_embedding=True, + use_projection=False, + return_proj_buf=True, + sampling_mode="stored", + ), + talker_hidden_size=cp_config.hidden_size, prefix=prefix, ) - - # One lm_head per residual layer (layers 1 .. G-1) - self.lm_head = nn.ModuleList( - [ - nn.Linear( - config.code_predictor_config.hidden_size, - config.code_predictor_config.vocab_size, - bias=False, - ) - for _ in range(self.num_code_groups - 1) - ] - ) - - self.set_sampling_params() - - # Lazily initialised position ids (read-only, safe to persist) - self._pos_ids: torch.Tensor | None = None - - # Cached plain-list refs (set once) - self._lm_heads: list | None = None - self._codec_embeds: list | None = None - - # Model forward (optionally compiled) - self._model_fwd: object | None = None - - def set_sampling_params(self, top_k: int = 50, top_p: float = 0.8): - """Configure sampling parameters to maintain consistency with previous implementation.""" - self._top_k = top_k - self._top_p = top_p - logger.debug(f"Sampling parameters updated: top_k={top_k}, top_p={top_p}s") - - # ------------------------------------------------------------------ - # Lazy-init helpers - # ------------------------------------------------------------------ - - def _ensure_pos_ids(self, device: torch.device) -> None: - if 
self._pos_ids is not None and self._pos_ids.device == device: - return - max_seq = self.num_code_groups + 1 - # [1, max_seq] for HF-style RoPE (will be expanded to [bsz, seq_len] at use) - self._pos_ids = torch.arange(max_seq, dtype=torch.long, device=device).unsqueeze(0) - - def _ensure_cached_refs(self) -> None: - if self._lm_heads is not None: - return - self._lm_heads = list(self.lm_head) - self._codec_embeds = list(self.model.codec_embedding) - - def _ensure_model_fwd(self) -> None: - if self._model_fwd is not None: - return - if current_omni_platform.supports_torch_inductor(): - # torch.compile fuses RMSNorm/RoPE in ways that lose float32 - # precision, compounding across AR steps. Use epilogue_fusion=False - # to disable the problematic fusions while still getting kernel - # fusion benefits for the linear layers and SDPA. - self._model_fwd = torch.compile( - self.model.forward, - dynamic=True, - options={ - "epilogue_fusion": False, - }, - ) - logger.info("code_predictor: torch.compile enabled (no epilogue fusion)") - else: - self._model_fwd = self.model.forward - logger.info("code_predictor: using eager mode (no torch.compile)") - - # ------------------------------------------------------------------ - # Forward -- re-prefill + inline sampling - # ------------------------------------------------------------------ - - @torch.inference_mode() - def forward( - self, - layer0_code: torch.Tensor, - layer0_embed: torch.Tensor, - last_talker_hidden: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - """Predict residual codebooks 1..G-1 autoregressively via re-prefill. - - Args: - layer0_code: [bsz, 1] int64 - layer0_embed: [bsz, 1, hidden_size] - last_talker_hidden: [bsz, 1, hidden_size] - - Returns: - all_codes: [bsz, num_code_groups, 1] - proj_buf: [bsz, num_code_groups + 1, hidden_size] - pos 0 = last_talker_hidden (NOT a codec embed) - pos 1 = layer0_embed - pos 2.. 
= `codec_embedding[i](predicted_code_i)` - """ - bsz = int(layer0_code.shape[0]) - device = layer0_code.device - dtype = last_talker_hidden.dtype - num_groups = self.num_code_groups - - # Lazy init (read-only caches only) - self._ensure_pos_ids(device) - self._ensure_model_fwd() - self._ensure_cached_refs() - - # Allocate proj_buf locally each call to avoid cross-call aliasing - max_seq = num_groups + 1 - proj_buf = torch.zeros(bsz, max_seq, self._hidden_size, dtype=dtype, device=device) - pos_ids = self._pos_ids - model_fwd = self._model_fwd - lm_heads = self._lm_heads - codec_embeds = self._codec_embeds - - # Output codes - all_codes = torch.empty(bsz, num_groups, 1, dtype=torch.int64, device=device) - all_codes[:, 0] = layer0_code - - # Fill buffer positions 0 & 1 - proj_buf[:bsz, 0:1, :] = last_talker_hidden - proj_buf[:bsz, 1:2, :] = layer0_embed - - # Autoregressive loop: predict layers 1..G-1 - for step in range(1, num_groups): - seq_len = step + 1 - projected = proj_buf[:bsz, :seq_len, :] - # position_ids: [batch, seq_len] for HF-style RoPE - step_pos_ids = pos_ids[:, :seq_len].expand(bsz, -1) - - hidden_out = model_fwd(projected, step_pos_ids) - - # Inline sampling: top-k -> top-p -> softmax -> multinomial - logits = lm_heads[step - 1](hidden_out[:, -1, :]) # [bsz, vocab] - if self._top_k > 0: - topk_vals, _ = logits.topk(self._top_k, dim=-1) - logits = logits.masked_fill(logits < topk_vals[:, -1:], float("-inf")) - if self._top_p < 1.0: - sorted_logits, sorted_idx = logits.sort(dim=-1, descending=True) - cumulative_probs = F.softmax(sorted_logits, dim=-1).cumsum(dim=-1) - # Remove tokens with cumulative probability above top_p - remove_mask = cumulative_probs - F.softmax(sorted_logits, dim=-1) >= self._top_p - sorted_logits[remove_mask] = float("-inf") - logits = sorted_logits.scatter(1, sorted_idx, sorted_logits) - probs = F.softmax(logits, dim=-1) - code = torch.multinomial(probs, num_samples=1) # [bsz, 1] - - all_codes[:, step] = code - - # Embed 
predicted code -> next buffer position - new_embed = codec_embeds[step - 1](code) # [batch, 1, hidden_size] - proj_buf[:bsz, step + 1 : step + 2, :] = new_embed - - return all_codes, proj_buf[:bsz] - - # ------------------------------------------------------------------ - # Weight loading - # ------------------------------------------------------------------ - - def load_weights(self, weights: list[tuple[str, torch.Tensor]]) -> set[str]: - """Load weights directly (no fused projection remapping needed). - - Since we use separate nn.Linear for q/k/v/o and gate/up/down, - weight names match the HF checkpoint directly. - """ - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - - for name, loaded_weight in weights: - # Skip rotary embeddings - if "rotary_emb.inv_freq" in name: - continue - - param = params_dict.get(name) - if param is None: - continue - - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - - return loaded_params diff --git a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py index 1e84eaebaa5..8d2f0686ae0 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py +++ b/vllm_omni/model_executor/models/qwen3_tts/qwen3_tts_code_predictor_vllm.py @@ -1,318 +1,27 @@ +"""Qwen3-TTS Code Predictor -- thin wrapper over CodePredictorWrapper.""" + from __future__ import annotations from collections.abc import Iterable import torch -import torch.nn as nn -import torch.nn.functional as F from vllm.config import VllmConfig from vllm.config.vllm import set_current_vllm_config -from vllm.logger import init_logger -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, -) -from vllm_omni.platforms import current_omni_platform +from vllm_omni.model_executor.models.common.qwen3_code_predictor import ( 
+ CodePredictorBaseModel, + CodePredictorWrapper, + CodePredictorWrapperConfig, +) from .configuration_qwen3_tts import Qwen3TTSTalkerCodePredictorConfig, Qwen3TTSTalkerConfig -logger = init_logger(__name__) - - -# =================================================================== -# HF-numerics-compatible layers for code predictor -# =================================================================== -# -# These use plain PyTorch ops (nn.Linear, manual RMSNorm in float32, -# rotate_half RoPE) to produce outputs numerically identical to the -# HuggingFace reference. vLLM's fused kernels (RMSNorm, QKVParallel, -# get_rope) introduce small precision differences that compound across -# the 15 autoregressive steps of the code predictor, causing severe -# audio quality degradation (UTMOS ~4.26 → ~2.66). -# -# See: https://github.com/vllm-project/vllm-omni/issues/2274 - - -class _RMSNorm(nn.Module): - """RMSNorm matching HuggingFace's Qwen3TTSRMSNorm exactly. - - Computes variance in float32 to avoid bfloat16 precision loss. - """ - - def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -def _rotate_half(x: torch.Tensor) -> torch.Tensor: - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -class _RotaryEmbedding(nn.Module): - """RoPE matching HuggingFace's Qwen3TTSRotaryEmbedding exactly. - - Forces float32 computation for cos/sin, matching HF's torch.autocast(enabled=False). 
- """ - - def __init__(self, config: Qwen3TTSTalkerCodePredictorConfig) -> None: - super().__init__() - head_dim = getattr( - config, - "head_dim", - config.hidden_size // config.num_attention_heads, - ) - # Standard default RoPE - rope_theta = getattr(config, "rope_theta", 10000.0) - inv_freq = 1.0 / (rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - def forward(self, x: torch.Tensor, position_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - # position_ids: [batch, seq_len] - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - - # Force float32 (matching HF) - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -class _CodePredictorAttention(nn.Module): - """Standalone multi-head attention for code predictor. - - Uses F.scaled_dot_product_attention with HF-compatible RoPE and RMSNorm. - Input: [B, seq_len, hidden_size], output: [B, seq_len, hidden_size]. 
- """ - - def __init__( - self, - config: Qwen3TTSTalkerCodePredictorConfig, - *, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.num_kv_heads = config.num_key_value_heads - self.head_dim = getattr( - config, - "head_dim", - config.hidden_size // config.num_attention_heads, - ) - self.scaling = self.head_dim**-0.5 - self._use_gqa = self.num_kv_heads != self.num_heads - - # Separate q/k/v projections matching HF (no fused packing) - self.q_proj = nn.Linear( - self.hidden_size, - self.num_heads * self.head_dim, - bias=getattr(config, "attention_bias", False), - ) - self.k_proj = nn.Linear( - self.hidden_size, - self.num_kv_heads * self.head_dim, - bias=getattr(config, "attention_bias", False), - ) - self.v_proj = nn.Linear( - self.hidden_size, - self.num_kv_heads * self.head_dim, - bias=getattr(config, "attention_bias", False), - ) - self.o_proj = nn.Linear( - self.num_heads * self.head_dim, - self.hidden_size, - bias=False, - ) - self.q_norm = _RMSNorm(self.head_dim, eps=config.rms_norm_eps) - self.k_norm = _RMSNorm(self.head_dim, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - ) -> torch.Tensor: - bsz, seq_len, _ = hidden_states.shape - hidden_shape_q = (bsz, seq_len, self.num_heads, self.head_dim) - hidden_shape_kv = (bsz, seq_len, self.num_kv_heads, self.head_dim) - - q = self.q_norm(self.q_proj(hidden_states).view(hidden_shape_q)).transpose(1, 2) - k = self.k_norm(self.k_proj(hidden_states).view(hidden_shape_kv)).transpose(1, 2) - v = self.v_proj(hidden_states).view(hidden_shape_kv).transpose(1, 2) - - cos, sin = position_embeddings - # cos/sin are [batch, seq_len, head_dim], need unsqueeze at dim=1 for heads - cos = cos.unsqueeze(1) # [batch, 1, seq_len, head_dim] - sin = sin.unsqueeze(1) - q = (q * cos) + (_rotate_half(q) * sin) - k = (k * cos) + (_rotate_half(k) * sin) - 
- attn_out = F.scaled_dot_product_attention( - q, - k, - v, - scale=self.scaling, - is_causal=True, - enable_gqa=self._use_gqa, - ) - - attn_out = attn_out.transpose(1, 2).reshape(bsz, seq_len, -1) - output = self.o_proj(attn_out) - return output - - -class _CodePredictorMLP(nn.Module): - """SiLU-gated MLP for code predictor, matching HF's Qwen3TTSTalkerTextMLP.""" - - def __init__( - self, - config: Qwen3TTSTalkerCodePredictorConfig, - *, - prefix: str = "", - ) -> None: - super().__init__() - self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) - self.up_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) - self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x)) - - -class _CodePredictorDecoderLayer(nn.Module): - """Transformer decoder layer for code predictor (SDPA, no KV cache).""" - - def __init__( - self, - config: Qwen3TTSTalkerCodePredictorConfig, - *, - prefix: str = "", - ) -> None: - super().__init__() - self.self_attn = _CodePredictorAttention(config, prefix=f"{prefix}.self_attn") - self.mlp = _CodePredictorMLP(config, prefix=f"{prefix}.mlp") - self.input_layernorm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - ) -> torch.Tensor: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - hidden_states = self.self_attn(hidden_states, position_embeddings) - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - return hidden_states - - -# 
=================================================================== -# Code Predictor Transformer Model -# =================================================================== - - -class Qwen3TTSTalkerCodePredictorModelVLLM(nn.Module): - """Transformer model for the code predictor (re-prefill, no KV cache).""" - - def __init__( - self, - config: Qwen3TTSTalkerCodePredictorConfig, - *, - talker_hidden_size: int | None = None, - prefix: str = "", - ) -> None: - super().__init__() - self.config = config - - self.layers = nn.ModuleList( - [_CodePredictorDecoderLayer(config, prefix=f"{prefix}.layers.{i}") for i in range(config.num_hidden_layers)] - ) - self.norm = _RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = _RotaryEmbedding(config) - - # Codec embeddings: one per residual group. Stored in talker hidden dim - # (some checkpoints use talker_hidden_size != code_predictor hidden_size). - emb_dim = int(talker_hidden_size) if talker_hidden_size is not None else int(config.hidden_size) - self.codec_embedding = nn.ModuleList( - [nn.Embedding(config.vocab_size, emb_dim) for _ in range(config.num_code_groups - 1)] - ) - - def get_input_embeddings(self) -> nn.ModuleList: - return self.codec_embedding - - def forward( - self, - inputs_embeds: torch.Tensor, - position_ids: torch.Tensor, - ) -> torch.Tensor: - hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) - for layer in self.layers: - hidden_states = layer(hidden_states, position_embeddings) - hidden_states = self.norm(hidden_states) - return hidden_states - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - param = params_dict.get(name) - if param is None: - continue - weight_loader = getattr(param, "weight_loader", 
default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -# =================================================================== -# Code Predictor Wrapper (optimized re-prefill + torch.compile) -# =================================================================== - - -class Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM(nn.Module): - """vLLM-native code_predictor for the AR talker (residual codebooks). +# Backward-compat alias used by tests +Qwen3TTSTalkerCodePredictorModelVLLM = CodePredictorBaseModel - Re-prefill approach: each AR step forwards the full growing sequence - through the 5-layer transformer. No KV cache needed. This trades - ~O(T^2) extra attention FLOPs (negligible for T=16, 5 layers) for - zero KV cache management overhead and a simpler execution model. - Uses HF-compatible layers (plain nn.Linear, float32 RMSNorm, rotate_half - RoPE) to ensure numerical fidelity with the reference implementation. - Precision matters here because small errors compound across 15 AR steps. - - Optimizations preserved: - 1. torch.compile on model forward -- fuses small kernel launches. - 2. Pre-allocated embedding buffer [B, max_seq, H] -- no torch.cat per step. - 3. Projection caching -- each token projected once and cached. - 4. Pre-allocated position_ids -- no torch.arange per step. - 5. Inline sampling -- no custom op / forward_context overhead. - 6. Cached module references -- bypass nn.Module.__call__ overhead. - 7. CUDA graphs per batch-size bucket. 
- """ +class Qwen3TTSTalkerCodePredictorForConditionalGenerationVLLM(CodePredictorWrapper): + """Qwen3-TTS code predictor (CUDA graphs, per-call sampling, projection).""" def __init__( self, @@ -322,250 +31,24 @@ def __init__( talker_config: Qwen3TTSTalkerConfig, prefix: str = "code_predictor", ) -> None: - super().__init__() - self._vllm_config = vllm_config - self.config = config - self.talker_config = talker_config - - self.model = Qwen3TTSTalkerCodePredictorModelVLLM( - config, + super().__init__( + vllm_config=vllm_config, + cp_config=config, + wrapper_config=CodePredictorWrapperConfig( + use_cuda_graphs=True, + use_parallel_embedding=False, + use_projection=(config.hidden_size != talker_config.hidden_size), + return_proj_buf=False, + sampling_mode="per_call", + ), talker_hidden_size=int(talker_config.hidden_size), - prefix=f"{prefix}.model", + prefix=prefix, ) - - self.lm_head = nn.ModuleList( - [nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in range(config.num_code_groups - 1)] - ) - - if config.hidden_size != talker_config.hidden_size: - self.small_to_mtp_projection = nn.Linear(talker_config.hidden_size, config.hidden_size, bias=True) - else: - self.small_to_mtp_projection = nn.Identity() - - self._num_groups = int(config.num_code_groups) - self._talker_hidden = int(talker_config.hidden_size) - self._cp_hidden = int(config.hidden_size) - - # Pre-allocated buffers (lazily initialized on first forward). - self._proj_buf: torch.Tensor | None = None - self._model_dtype: torch.dtype | None = None - - # torch.compile + warmup state (lazily initialized in _setup_compile). 
- self._compiled_model_fwd = None - self._bucket_sizes: list[int] = [] - self._bucket_pos_ids: dict[int, torch.Tensor] = {} - self._lm_heads_list: list[nn.Module] | None = None - self._codec_embeds_list: list[nn.Module] | None = None - self._cuda_graphs: dict[int, tuple[torch.cuda.CUDAGraph, torch.Tensor]] = {} - - def get_input_embeddings(self) -> nn.ModuleList: - return self.model.get_input_embeddings() + # Store talker_config for backward compat (accessed by some callers) + self.talker_config = talker_config + self._vllm_config = vllm_config def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load weights with vllm config context (required for VocabParallelEmbedding).""" with set_current_vllm_config(self._vllm_config): - loaded: set[str] = set() - model_weights: list[tuple[str, torch.Tensor]] = [] - other_weights: list[tuple[str, torch.Tensor]] = [] - for name, w in weights: - if name.startswith("model."): - model_weights.append((name[len("model.") :], w)) - else: - other_weights.append((name, w)) - - loaded_model = self.model.load_weights(model_weights) - loaded |= {f"model.{n}" for n in loaded_model} - - params = dict(self.named_parameters(remove_duplicate=False)) - for name, w in other_weights: - if name not in params: - continue - default_weight_loader(params[name], w) - loaded.add(name) - - return loaded - - # ------------------------------------------------------------------ - # Pre-allocated buffer management - # ------------------------------------------------------------------ - - def _ensure_buffers(self, device: torch.device, dtype: torch.dtype) -> None: - max_seq = self._num_groups + 1 - if self._proj_buf is not None and self._proj_buf.device == device and self._proj_buf.dtype == dtype: - return - max_bsz = self._vllm_config.scheduler_config.max_num_seqs - self._proj_buf = torch.zeros( - max_bsz, - max_seq, - self._cp_hidden, - dtype=dtype, - device=device, - ) - - def _setup_compile(self) -> None: - """Lazily set up 
torch.compile with manual CUDA graph capture.""" - if self._compiled_model_fwd is not None: - return - # Cache model parameter dtype so forward() doesn't need to query it - # on every call. Also ensures warmup buffers match model precision - # even when upstream modules produce a different dtype (#2385). - self._model_dtype = next(self.model.parameters()).dtype - self._lm_heads_list = list(self.lm_head) - self._codec_embeds_list = list(self.model.codec_embedding) - if not current_omni_platform.supports_torch_inductor(): - logger.warning_once("code_predictor: torch.compile disabled") - self._compiled_model_fwd = self.model.forward - return - - # torch.compile fuses RMSNorm/RoPE in ways that lose float32 - # precision, compounding across 15 AR steps. Use torch.compile - # with options that disable the problematic fusions while still - # getting kernel fusion benefits for the linear layers and SDPA. - self._compiled_model_fwd = torch.compile( - self.model.forward, - dynamic=False, - options={ - "epilogue_fusion": False, - }, - ) - self._warmup_buckets() - self._capture_cuda_graphs() - logger.info("code_predictor: torch.compile (no epilogue fusion) + CUDA graphs") - - def _padded_bsz(self, bsz: int) -> int: - for bucket in self._bucket_sizes: - if bsz <= bucket: - return bucket - return bsz - - def _warmup_buckets(self) -> None: - """Warmup power-of-2 batch-size buckets to front-load Inductor compilation.""" - max_bsz = self._vllm_config.scheduler_config.max_num_seqs - bucket_sizes = [1 << i for i in range(max_bsz.bit_length()) if (1 << i) <= max_bsz] - if max_bsz not in bucket_sizes: - bucket_sizes.append(max_bsz) - self._bucket_sizes = sorted(bucket_sizes) - - max_seq = self._num_groups + 1 - device = next(self.model.parameters()).device - - # Ensure proj_buf matches model parameter dtype to avoid dtype - # mismatch during warmup compilation (see #2385). 
- self._ensure_buffers(device, self._model_dtype) - proj_buf = self._proj_buf - for bsz in self._bucket_sizes: - # position_ids: [batch, seq_len] for HF-style RoPE - pos_ids = torch.arange(max_seq, device=device, dtype=torch.long).unsqueeze(0).expand(bsz, -1) - self._bucket_pos_ids[bsz] = pos_ids - for _ in range(3): - self._compiled_model_fwd(proj_buf[:bsz, :max_seq, :], pos_ids) - logger.info("code_predictor: warmup done for buckets %s", self._bucket_sizes) - - def _capture_cuda_graphs(self) -> None: - """Capture a CUDA graph per bucket using vLLM's global graph pool.""" - from vllm.platforms import current_platform - - pool = current_platform.get_global_graph_pool() - - max_seq = self._num_groups + 1 - proj_buf = self._proj_buf - - for bsz in self._bucket_sizes: - static_input = proj_buf[:bsz, :max_seq, :] - pos_ids = self._bucket_pos_ids[bsz] - - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g, pool=pool): - static_output = self._compiled_model_fwd(static_input, pos_ids) - - self._cuda_graphs[bsz] = (g, static_output) - - logger.info("code_predictor: captured CUDA graphs for buckets %s", self._bucket_sizes) - - # ------------------------------------------------------------------ - # Optimized forward: re-prefill + torch.compile + projection cache - # ------------------------------------------------------------------ - - @torch.inference_mode() - def forward( - self, - layer0_code: torch.Tensor, - layer0_embed: torch.Tensor, - last_talker_hidden: torch.Tensor, - do_sample: bool = True, - temperature: float = 0.9, - top_k: int = 50, - top_p: float = 1.0, - ) -> torch.Tensor: - """Predict residual codebooks 1..Q-1 autoregressively via re-prefill. - - torch.compile fuses the ~60 small kernel launches per step into fewer - fused kernels, reducing kernel launch overhead by ~75%. - - Projection caching: each token is projected once via small_to_mtp_projection - and cached in _proj_buf, avoiding redundant re-projection of past tokens. 
- """ - bsz = int(layer0_code.shape[0]) - num_groups = self._num_groups - device = layer0_code.device - - all_codes = torch.empty(bsz, num_groups, dtype=torch.long, device=device) - all_codes[:, 0] = layer0_code.reshape(bsz) - - # _setup_compile caches _model_dtype on first call; use it for buffers - # so they always match model weight precision (#2385). - self._setup_compile() - dtype = self._model_dtype - self._ensure_buffers(device, dtype) - - proj_buf = self._proj_buf - max_seq = self._num_groups + 1 - - projection = self.small_to_mtp_projection - model_fwd = self._compiled_model_fwd - lm_heads = self._lm_heads_list - codec_embeds = self._codec_embeds_list - - use_sampling = do_sample and temperature > 0 - inv_temperature = 1.0 / max(temperature, 1e-6) if use_sampling else 0.0 - if use_sampling and top_p != 1.0: - raise NotImplementedError( - "top_p sampling is not implemented for the vLLM-native code predictor; please set top_p=1.0." - ) - - padded_bsz = self._padded_bsz(bsz) - proj_buf[:padded_bsz].zero_() - - proj_buf[:bsz, 0, :] = projection(last_talker_hidden.reshape(bsz, 1, -1).to(dtype)).reshape(bsz, -1) - proj_buf[:bsz, 1, :] = projection(layer0_embed.reshape(bsz, 1, -1).to(dtype)).reshape(bsz, -1) - full_pos_ids = self._bucket_pos_ids.get(padded_bsz) - if full_pos_ids is None: - full_pos_ids = torch.arange(max_seq, device=device, dtype=torch.long).unsqueeze(0).expand(padded_bsz, -1) - - # Use captured CUDA graph if available, otherwise call compiled fn. 
- cuda_graph_entry = self._cuda_graphs.get(padded_bsz) - - for step in range(1, num_groups): - if cuda_graph_entry is not None: - cuda_graph_entry[0].replay() - hidden_out = cuda_graph_entry[1] - else: - hidden_out = model_fwd(proj_buf[:padded_bsz, :max_seq, :], full_pos_ids) - logits = lm_heads[step - 1](hidden_out[:bsz, step, :]) - - if use_sampling: - scaled = logits * inv_temperature - if top_k > 0: - topk_vals, _ = scaled.topk(top_k, dim=-1) - scaled = scaled.masked_fill(scaled < topk_vals[:, -1:], float("-inf")) - probs = F.softmax(scaled, dim=-1) - next_ids = torch.multinomial(probs, num_samples=1) - else: - next_ids = logits.argmax(dim=-1, keepdim=True) - - all_codes[:, step] = next_ids.reshape(bsz) - - if step < num_groups - 1: - new_embed = codec_embeds[step - 1](next_ids) - proj_buf[:bsz, step + 1, :] = projection(new_embed.reshape(bsz, 1, -1)).reshape(bsz, -1) - - return all_codes + return super().load_weights(weights) From 50ae1de7da006324942715fd5c03d298290065de Mon Sep 17 00:00:00 2001 From: "Y. Fisher" Date: Wed, 15 Apr 2026 15:54:38 +0800 Subject: [PATCH 53/76] [Feature] HunyuanImage3 allow guidance_scale<=1 in DiT stage (#2762) Signed-off-by: KexiongYu --- .../models/hunyuan_image3/hunyuan_image3_transformer.py | 3 ++- .../models/hunyuan_image3/pipeline_hunyuan_image3.py | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py index 327260ee0bb..fbdacddaf34 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/hunyuan_image3_transformer.py @@ -1684,7 +1684,8 @@ def forward( else: attn_output = self.attn(q, k, v) # For o_proj - attn_output = attn_output.view(q.shape[0], -1) + # image_attn may return a non-contiguous tensor; reshape is safe here. 
+ attn_output = attn_output.reshape(q.shape[0], -1) output, _ = self.o_proj(attn_output) output = output.reshape(bsz, q_len, -1) return output, None, past_key_value diff --git a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py index 2f140b48fc4..3de0ab31016 100644 --- a/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py +++ b/vllm_omni/diffusion/models/hunyuan_image3/pipeline_hunyuan_image3.py @@ -6,7 +6,6 @@ from collections.abc import Iterable from typing import Any -import numpy as np import torch import torch.nn as nn from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler @@ -544,7 +543,7 @@ def prepare_model_inputs( generator = [torch.Generator(self.device).manual_seed(seed) for seed in seeds] # 3. apply chat template - cfg_factor = {"gen_text": 1, "gen_image": 2} + cfg_factor = {"gen_text": 1, "gen_image": 1 + int(guidance_scale > 1.0)} bot_task = kwargs.pop("bot_task", "auto") # If `drop_think` enabled, always drop parts in the context. 
drop_think = kwargs.get("drop_think", self.generation_config.drop_think) @@ -1009,8 +1008,7 @@ def forward( if req.sampling_params.guidance_scale_provided: guidance_scale = req.sampling_params.guidance_scale if guidance_scale <= 1.0: - logger.warning("HunyuanImage3.0 does not support guidance_scale <= 1.0, will set it to 1.0 + epsilon.") - guidance_scale = 1.0 + np.finfo(float).eps + logger.info("HunyuanImage3.0 runs without classifier-free guidance when guidance_scale <= 1.0.") image_size = (height, width) model_inputs = self.prepare_model_inputs( prompt=prompt, From c6d76d081b3e926ea44bece356889f846445440a Mon Sep 17 00:00:00 2001 From: Zhang Jian Date: Wed, 15 Apr 2026 22:25:41 +0800 Subject: [PATCH 54/76] [Bugfix] Fix broken fp8 quantisation on Z-Image-Turbo, Qwen-Image, FLUX.1-dev (#2795) Signed-off-by: Zhang Co-authored-by: pjh4993 --- .../diffusion/models/flux/flux_transformer.py | 12 ++++-- .../qwen_image/qwen_image_transformer.py | 31 ++++++++----- .../models/z_image/z_image_transformer.py | 43 ++++++++++++++++--- 3 files changed, 66 insertions(+), 20 deletions(-) diff --git a/vllm_omni/diffusion/models/flux/flux_transformer.py b/vllm_omni/diffusion/models/flux/flux_transformer.py index 680b8bfbbed..297c6267515 100644 --- a/vllm_omni/diffusion/models/flux/flux_transformer.py +++ b/vllm_omni/diffusion/models/flux/flux_transformer.py @@ -381,7 +381,9 @@ def __init__( super().__init__() self.mlp_hidden_dim = int(dim * mlp_ratio) - self.norm = AdaLayerNormZeroSingle(dim, quant_config=quant_config, prefix=f"{prefix}.norm") + # Modulation linear kept full precision; shift/scale/gate outputs + # are multiplied into the residual stream every block (see #2728). 
+ self.norm = AdaLayerNormZeroSingle(dim, quant_config=None, prefix=f"{prefix}.norm") self.proj_mlp = ReplicatedLinear( dim, self.mlp_hidden_dim, @@ -563,13 +565,16 @@ def __init__( self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim) self.x_embedder = nn.Linear(in_channels, self.inner_dim) + # Dual-stream blocks kept full precision — FP8 on their joint + # attention path causes noise on FLUX (#2728). Single-stream + # blocks (38 vs 19) still get FP8 for memory savings. self.transformer_blocks = nn.ModuleList( [ FluxTransformerBlock( dim=self.inner_dim, num_attention_heads=num_attention_heads, attention_head_dim=attention_head_dim, - quant_config=quant_config, + quant_config=None, prefix=f"transformer_blocks.{i}", ) for i in range(num_layers) @@ -589,12 +594,13 @@ def __init__( ] ) + # Final modulation feeds proj_out; keep full precision (see #2728). self.norm_out = AdaLayerNormContinuous( self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6, - quant_config=quant_config, + quant_config=None, prefix="norm_out", ) self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True) diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py index 9f16d8808c8..88a66d7f6b0 100644 --- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py +++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py @@ -169,12 +169,15 @@ def __init__( self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000) self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) + # Time embedding MLP is kept full precision (quant_config=None) — + # small layers that feed per-block modulation; precision-sensitive + # (see #2728). 
self.timestep_embedder.linear_1 = ReplicatedLinear( 256, embedding_dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="timestep_embedder.linear_1", ) self.timestep_embedder.linear_2 = ReplicatedLinear( @@ -182,7 +185,7 @@ def __init__( embedding_dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="timestep_embedder.linear_2", ) self.use_additional_t_cond = use_additional_t_cond @@ -701,7 +704,10 @@ def __init__( self.num_attention_heads = num_attention_heads self.attention_head_dim = attention_head_dim - # Image processing modules + # Image processing modules. + # Modulation linear is kept full precision (quant_config=None) — it + # produces shift/scale/gate values that are precision-sensitive + # (see #2728). self.img_mod = nn.Sequential( nn.SiLU(), ReplicatedLinear( @@ -709,7 +715,7 @@ def __init__( 6 * dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="img_mod.1", ), ) @@ -725,7 +731,7 @@ def __init__( self.img_norm2 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) self.img_mlp = FeedForward(dim=dim, dim_out=dim, quant_config=quant_config, prefix="img_mlp") - # Text processing modules + # Text processing modules. self.txt_mod = nn.Sequential( nn.SiLU(), ReplicatedLinear( @@ -733,7 +739,7 @@ def __init__( 6 * dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="txt_mod.1", ), ) @@ -963,12 +969,14 @@ def __init__( self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6) + # Entry projections (image/text) are kept full precision — + # small sensitive layers at the network boundary (see #2728). 
self.img_in = ReplicatedLinear( in_channels, self.inner_dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="img_in", ) self.txt_in = ReplicatedLinear( @@ -976,7 +984,7 @@ def __init__( self.inner_dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="txt_in", ) @@ -993,13 +1001,16 @@ def __init__( ] ) + # Final modulation and output projection are kept full precision — + # they produce the output latent and are precision-sensitive + # (see #2728). self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6) self.norm_out.linear = ReplicatedLinear( self.inner_dim, 2 * self.inner_dim, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="norm_out.linear", ) self.proj_out = ReplicatedLinear( @@ -1007,7 +1018,7 @@ def __init__( patch_size * patch_size * self.out_channels, bias=True, return_bias=False, - quant_config=quant_config, + quant_config=None, prefix="proj_out", ) diff --git a/vllm_omni/diffusion/models/z_image/z_image_transformer.py b/vllm_omni/diffusion/models/z_image/z_image_transformer.py index 3ffad221ba9..c36ea746654 100644 --- a/vllm_omni/diffusion/models/z_image/z_image_transformer.py +++ b/vllm_omni/diffusion/models/z_image/z_image_transformer.py @@ -214,12 +214,14 @@ def __init__( super().__init__() if mid_size is None: mid_size = out_size + # Time embedding MLP is kept full precision (quant_config=None) — + # small layers that feed adaLN; precision-sensitive (see #2728). 
self.mlp = nn.Sequential( ReplicatedLinear( frequency_embedding_size, mid_size, bias=True, - quant_config=quant_config, + quant_config=None, return_bias=False, ), nn.SiLU(), @@ -227,7 +229,7 @@ def __init__( mid_size, out_size, bias=True, - quant_config=quant_config, + quant_config=None, return_bias=False, ), ) @@ -426,9 +428,16 @@ def __init__( self.modulation = modulation if modulation: + # Modulation linear is kept at full precision (quant_config=None) + # — it produces scale/gate values that are precision-sensitive + # (see #2728, mirrors OmniGen2 fix). self.adaLN_modulation = nn.Sequential( ReplicatedLinear( - min(dim, ADALN_EMBED_DIM), 4 * dim, bias=True, return_bias=False, quant_config=quant_config + min(dim, ADALN_EMBED_DIM), + 4 * dim, + bias=True, + quant_config=None, + return_bias=False, ), ) @@ -485,14 +494,24 @@ class FinalLayer(nn.Module): def __init__(self, hidden_size, out_channels, quant_config: "QuantizationConfig | None" = None): super().__init__() self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6) + # Final output projection and its modulation are precision-sensitive + # (produce the output latent); keep at full precision (see #2728). self.linear = ReplicatedLinear( - hidden_size, out_channels, bias=True, quant_config=quant_config, return_bias=False + hidden_size, + out_channels, + bias=True, + quant_config=None, + return_bias=False, ) self.adaLN_modulation = nn.Sequential( nn.SiLU(), ReplicatedLinear( - min(hidden_size, ADALN_EMBED_DIM), hidden_size, bias=True, quant_config=quant_config, return_bias=False + min(hidden_size, ADALN_EMBED_DIM), + hidden_size, + bias=True, + quant_config=None, + return_bias=False, ), ) @@ -673,11 +692,13 @@ def __init__( all_x_embedder = {} all_final_layer = {} for patch_idx, (patch_size, f_patch_size) in enumerate(zip(all_patch_size, all_f_patch_size)): + # x_embedder (patch embed) is a small precision-sensitive entry + # layer; keep full precision (see #2728). 
x_embedder = ReplicatedLinear( f_patch_size * patch_size * patch_size * in_channels, dim, bias=True, - quant_config=quant_config, + quant_config=None, return_bias=False, ) all_x_embedder[f"{patch_size}-{f_patch_size}"] = x_embedder @@ -720,9 +741,17 @@ def __init__( ] ) self.t_embedder = TimestepEmbedder(min(dim, ADALN_EMBED_DIM), mid_size=1024, quant_config=quant_config) + # Caption embedder maps text features -> hidden; keep full precision + # (see #2728). self.cap_embedder = nn.Sequential( RMSNorm(cap_feat_dim, eps=norm_eps), - ReplicatedLinear(cap_feat_dim, dim, bias=True, return_bias=False, quant_config=quant_config), + ReplicatedLinear( + cap_feat_dim, + dim, + bias=True, + quant_config=None, + return_bias=False, + ), ) self.x_pad_token = nn.Parameter(torch.empty((1, dim))) From cad8956842f9fbb62e49be7e8688f809aac36826 Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Wed, 15 Apr 2026 22:26:47 +0800 Subject: [PATCH 55/76] fix: align processor/statge-replica Signed-off-by: ZhengWG --- vllm_omni/engine/async_omni_engine.py | 33 ++++++------ vllm_omni/engine/orchestrator.py | 77 ++++++++++++++++++++------- vllm_omni/engine/stage_pool.py | 27 ++++++++++ 3 files changed, 101 insertions(+), 36 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 56913f01c2d..114f75ce025 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -1146,6 +1146,11 @@ def _build_add_request_message( original_prompt = prompt stage_type = self.stage_metadata[0].get("stage_type") + # Text forwarded to the stage-0 output processor at registration time. + # Populated only on the LLM path below; for diffusion / pre-built + # EngineCoreRequest paths it stays None (the orchestrator's admit() + # still works — prompt text is optional on add_request). + output_prompt_text: Any = None if stage_type != "diffusion" and not isinstance(prompt, EngineCoreRequest): # Inject global_request_id into the raw prompt. 
if isinstance(prompt, dict): @@ -1184,17 +1189,15 @@ def _build_add_request_message( request.external_req_id = request_id request = _apply_omni_final_stage_metadata(request, final_stage_id) - # Register with stage 0's output processor. + # Registration with stage 0's output processor is deferred to the + # orchestrator thread (see Orchestrator._handle_add_request). The + # orchestrator must know which replica it picked via select_replica + # before it can register on the correct per-replica processor; a + # hardcoded ``output_processors[0]`` here would misalign for any + # stage-0 request routed to replica > 0. output_prompt_text = prompt_text if output_prompt_text is None and isinstance(original_prompt, dict): output_prompt_text = original_prompt.get("prompt") - self.output_processors[0].add_request( - request=request, - prompt=output_prompt_text, - parent_req=None, - request_index=0, - queue=None, - ) prompt = request return { @@ -1202,6 +1205,7 @@ def _build_add_request_message( "request_id": request_id, "prompt": prompt, "original_prompt": original_prompt, + "output_prompt_text": output_prompt_text, "sampling_params_list": effective_sampling_params_list, "final_stage_id": final_stage_id, } @@ -1241,14 +1245,10 @@ def _enqueue_cfg_companions( request = _upgrade_to_omni_request(request, companion_prompt) request.external_req_id = cid - self.output_processors[0].add_request( - request=request, - prompt=companion_prompt, - parent_req=None, - request_index=0, - queue=None, - ) - + # Registration of this companion on stage-0's output processor + # is deferred to Orchestrator._handle_add_companion, which calls + # stage_pools[0].admit(..., affinity_from=parent_replica) so that + # select + register + submit all land on the same replica. 
self.request_queue.sync_q.put_nowait( { "type": "add_companion_request", @@ -1256,6 +1256,7 @@ def _enqueue_cfg_companions( "parent_id": parent_id, "role": ep.role, "prompt": request, + "companion_prompt_text": companion_prompt, "sampling_params_list": companion_spl, } ) diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index f6143f05ab9..afef31e734b 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -851,26 +851,44 @@ async def _handle_add_request(self, msg: dict[str, Any]) -> None: ) self.request_states[request_id] = req_state - # Choose a replica for logical stage 0 - client_index = self._choose_client_index(logical_stage_id, req_state) req_state.stage_submit_ts[logical_stage_id] = _time.time() - # Stage-0 prompt is already a fully-formed OmniEngineCoreRequest - # (pre-processed by AsyncOmniEngine.add_request, output processor - # already registered there) - submit directly. request = prompt - stage_client = self.stage_clients[client_index] - if stage_client.stage_type == "diffusion": + stage0_pool = self.stage_pools[logical_stage_id] + + # Diffusion: no output_processor on stage 0, just select + submit. + if stage0_pool.replicas[0].client.stage_type == "diffusion": + replica = stage0_pool.select_replica(req_state) if isinstance(prompt, list): - await stage_client.add_batch_request_async( + await replica.client.add_batch_request_async( request_id, prompt, params, ) else: - await stage_client.add_request_async(request_id, prompt, params) + await replica.client.add_request_async(request_id, prompt, params) else: - await stage_client.add_request_async(request) + # LLM: atomically pick a replica and register on its output + # processor. Registration must target the same replica that + # will serve the request or the raw outputs come back to a + # processor that has never seen the req_id. 
+ output_prompt_text = msg.get("output_prompt_text") + replica = stage0_pool.admit(req_state, request, output_prompt_text) + try: + await replica.client.add_request_async(request) + except Exception: + # Roll back the processor registration so we don't leak a + # half-admitted request on a failed submit. + try: + replica.output_processor.abort_requests([request_id], internal=False) + except Exception: + logger.exception( + "[Orchestrator] Failed to roll back output_processor registration " + "for req=%s after submit failure", + request_id, + ) + self.request_states.pop(request_id, None) + raise if self.async_chunk and logical_stage_id == 0 and final_stage_id > 0: await self._prewarm_async_chunk_stages(request_id, request, req_state) @@ -1003,10 +1021,11 @@ async def _handle_add_companion(self, msg: dict[str, Any]) -> None: self.request_states[companion_id] = companion_state # CFG companions must land on the same stage-0 replica as their - # parent so the diffusion stage can fetch both KV caches from a - # single device. Pass affinity_from explicitly; if the parent is - # already gone (aborted between add_request and add_companion) fall - # back to round-robin rather than failing the companion. + # parent so (a) the diffusion stage can fetch both KV caches from + # one device and (b) the output_processor that gets the companion's + # raw outputs is the same one that admit() registered it against. + # Pass affinity_from explicitly; if the parent is already gone + # (aborted between add_request and add_companion) fall back to RR. 
stage0_pool = self.stage_pools[0] parent_state = self.request_states.get(parent_id) parent_replica: StageReplica | None = None @@ -1015,15 +1034,33 @@ async def _handle_add_companion(self, msg: dict[str, Any]) -> None: if parent_flat is not None: parent_replica = stage0_pool.get_replica_by_flat_index(parent_flat) - companion_replica = stage0_pool.select_replica( - companion_state, - affinity_from=parent_replica, - ) - companion_state.stage_submit_ts[0] = _time.time() request = companion_prompt # Already a processed OmniEngineCoreRequest - await companion_replica.client.add_request_async(request) + companion_prompt_text = msg.get("companion_prompt_text") + companion_replica = stage0_pool.admit( + companion_state, + request, + companion_prompt_text, + affinity_from=parent_replica, + ) + try: + await companion_replica.client.add_request_async(request) + except Exception: + try: + companion_replica.output_processor.abort_requests([companion_id], internal=False) + except Exception: + logger.exception( + "[Orchestrator] Failed to roll back companion registration for %s", + companion_id, + ) + # Undo companion tracking so parent can proceed (parent may still + # succeed without this companion — expected in CFG fallback mode). 
+ self.request_states.pop(companion_id, None) + self._companion_ids.discard(companion_id) + self._companion_to_parent.pop(companion_id, None) + self._companion_map.get(parent_id, {}).pop(role, None) + raise logger.info( "[Orchestrator] CFG companion submitted: %s (role=%s, parent=%s, stage-0 replica-%s)", diff --git a/vllm_omni/engine/stage_pool.py b/vllm_omni/engine/stage_pool.py index 86a2fdcd77f..c55e5714948 100644 --- a/vllm_omni/engine/stage_pool.py +++ b/vllm_omni/engine/stage_pool.py @@ -90,6 +90,33 @@ def select_replica( req_state.chosen_client_index[self.logical_stage_id] = chosen.flat_index return chosen + def admit( + self, + req_state: OrchestratorRequestState, + request: Any, + prompt_text: Any, + *, + affinity_from: StageReplica | None = None, + ) -> StageReplica: + """Select a replica and register *request* on its output_processor. + + Atomically couples replica selection with output_processor registration + so that "which replica will serve this request" and "which processor + knows about this request" are the same by construction. Call sites + must follow up with ``replica.client.add_request_async(request)`` and + on submission failure call ``replica.output_processor.abort_requests + ([request.request_id], internal=False)`` to roll back the registration. 
+ """ + replica = self.select_replica(req_state, affinity_from=affinity_from) + replica.output_processor.add_request( + request=request, + prompt=prompt_text, + parent_req=None, + request_index=0, + queue=None, + ) + return replica + def build_stage_pools( stage_clients: list[Any], From f1e3f037265852b952cef654c489182bf7c26686 Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Wed, 15 Apr 2026 11:01:45 -0600 Subject: [PATCH 56/76] [feature] Hidden State Prefix Caching (#2164) Signed-off-by: Alex Brooks --- docs/.nav.yml | 1 + docs/design/feature/prefix_caching.md | 164 +++++++++ tests/conftest.py | 6 + tests/core/test_prefix_cache.py | 347 ++++++++++++++++++++ tests/e2e/online_serving/test_qwen3_omni.py | 75 ++++- vllm_omni/core/prefix_cache.py | 264 +++++++++++++++ vllm_omni/utils/mm_outputs.py | 93 ++++++ vllm_omni/worker/gpu_ar_model_runner.py | 202 +++++++++--- vllm_omni/worker/gpu_model_runner.py | 59 +++- 9 files changed, 1144 insertions(+), 67 deletions(-) create mode 100644 docs/design/feature/prefix_caching.md create mode 100644 tests/core/test_prefix_cache.py create mode 100644 vllm_omni/core/prefix_cache.py create mode 100644 vllm_omni/utils/mm_outputs.py diff --git a/docs/.nav.yml b/docs/.nav.yml index 441ef9f521e..79d7c38e274 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -98,6 +98,7 @@ nav: - design/feature/disaggregated_inference.md - design/feature/ray_based_execution.md - design/feature/omni_connectors/ + - design/feature/prefix_caching.md - design/feature/cfg_parallel.md - design/feature/expert_parallel.md - design/feature/sequence_parallel.md diff --git a/docs/design/feature/prefix_caching.md b/docs/design/feature/prefix_caching.md new file mode 100644 index 00000000000..ebad8b69106 --- /dev/null +++ b/docs/design/feature/prefix_caching.md @@ -0,0 +1,164 @@ +# Automatic Prefix Caching in Omni Models + + +--- + +## Table of Contents + +- [Overview](#overview) +- [High-Level Approach](#high-level-approach) +- [Example](#example) +- [What About 
Multimodal Inputs?](#what-about-multimodal-inputs) + +--- + +### Overview + +Prefix caching in the context of kv-cache management is a useful optimization for avoiding redundant computations. The main idea is that we store portions of the kv-cache from processed requests, so that we can reuse them if incoming requests have the same prefix as previous requests. + +vLLM manages the kv-cache as blocks, which represent a span of tokens of a fixed length. Blocks are hashable by the content that they contain, which typically means the tokens within the span, but also could be influenced by other factors, e.g., LoRA and multimodal data. + +vLLM implements automatic prefix caching for managing its kv-cache, which is best understood by reading the design document [here](https://docs.vllm.ai/en/latest/design/prefix_caching/). vLLM-Omni builds on top of the prefix caching mechanism in a noninvasive way to allow caching between stages in Omni pipelines. This typically means for a given stage we aim to support caching for the following: + +- The last hidden states produced by the stage +- Model / stage specific multimodal data + +!!! note "Note 1" + This document describes vLLM-Omni's mechanism for caching tensor outputs that are meant to be passed between stages, when requests have common prefixes, similar to the way in which vLLM has prefix caching for the kv-cache. This works in conjunction with vLLM's multimodal encoder caching, but is distinct. See the final section for a concrete example for how they tie together in practice. + +### High-Level Approach +!!! note "Note 2" + Prior to reading this section, it's recommended to take a look at the design documents in vLLM for [Automatic Prefix Caching](https://docs.vllm.ai/en/latest/features/automatic_prefix_caching/), which will make some of the concepts more clear. 
+ +The main focus of vLLM-Omni's approach to prefix caching stage outputs is to build on vLLM's prefix caching in the least invasive way possible while minimizing impact for cache misses, and consuming a minimal amount of GPU memory. To understand the implementation, there are a few important things to note: + +- Between stages, device tensors are generally moved to CPU; this is important since we're just caching the outputs of stages, so it is okay to keep the entire cache on the CPU. + +- For a tensor to be considered cacheable, the first dimension (currently) needs to be the same as the token count, as it allows us to reuse block/slot mappings for our externally maintained tensor caches. This allows us to dynamically discover the tensors to be marked as cacheable outputs in each Omni model without having to explicitly specify cacheable output field names in every model. + +With this in mind, consider the set of blocks in a 2D layout, where the row represents the index of blocks being considered, and the columns represent the slots corresponding to tokens within each block. Since we know the `num_blocks` and `block_size` from our kv cache config, if we want to cache a tensor with feature size `D`, we can preallocate a CPU tensor of size `(num_blocks, block_size, D)`, and use the same block index and slot mapping to retrieve the corresponding feature vector. + + +### Example +!!! note "Note 3" + Prefix caching in vLLM-Omni currently is only supported on AutoRegressive stages with one kv-cache group. It can be enabled/disabled per-stage via the `enable_prefix_caching` parameter in the model's stage config. + +The way in which vLLM-Omni ties into vLLM's prefix caching is best understood by example. Say that we have the following: + +- `num_blocks=8` +- `block_size=4` +- `hidden_size=2` +- A stage specific multimodal output tensor named `mm_feature` with feature dimension `16` + +The prefix cache flow is then outlined below. + +1. 
When the model is initialized, we can determine the `hidden_size` from the `ModelConfig`, and allocate a cache of size `(num_blocks, block_size, hidden_size)`. + +2. Say we process the request `The quick brown fox was tired and slept beneath the shady tree`, which is 12 tokens and evenly divides into 3 blocks as shown below. + +``` + [ The quick brown fox ] [ was tired and slept ] [beneath the shady tree ] +Block 1: |<--- block tokens ---->| +Block 2: |<------- prefix ------>| |<--- block tokens --->| +Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->| +``` + +When the request is processed, we inspect the multimodal outputs and identify the `mm_feature` tensor, which will be of shape `(seq_len, feature_dim)`, i.e., `(12, 16)` in this example. We note that the first axis is dependent on the `seq_len` and add a new cache_tensor of shape `(num_blocks, block_size, feature_dim)` to our multimodal cache for tensors. + + +3. If we lay out the cache as a 2D tensor of shape (`num_blocks`, `block_size`), we'll have something like the following: + +``` +0: [ The quick brown fox ] +1: [ was tired and slept ] +2: [beneath the shady tree ] +3: [EMPTY] +... +7: [EMPTY] +``` + +Or, if we flatten it down to 1D, +``` +0: The +1: quick +2: brown +3: fox +... +11: tree +12: [EMPTY] +... +``` + +which we can think of as row indices into the hidden states tensor if we view it as the 2D shape `(num_blocks x block_size, feature_dim)`. That is, the analogous flattened (from 3D -> 2D) mapping of the cache for hidden states becomes the following. +``` +0: <hidden state vector of "The"> +1: <hidden state vector of "quick"> +2: <hidden state vector of "brown"> +3: <hidden state vector of "fox"> +... +11: <hidden state vector of "tree"> +12: [EMPTY] +... +``` + +Similarly, for the multimodal outputs cache, the flattened coordinates are the same, but the `mm_feature` maps to vectors of length `16` instead of the hidden size of `2`. Note that in practice, we may have multiple multimodal output tensors per forward pass, which may have different names and different feature dimensions. + + +4.
Now, say that we receive a new request `The quick brown fox jumped over the dog`. + +``` + [ The quick brown fox ] [ jumped over the dog ] +Block 1: |<--- block tokens ---->| +Block 2: |<------- prefix ------>| |<--- block tokens --->| +``` + +Here, we will have a cache hit for `Block 1` which will be detected by vLLM based on the hash of the first block when it's handling the prefix caching on the kv-cache. As a result, when we get the output from the scheduler, we will see that `num_computed_tokens=4` (corresponding to the cached first block), and we only need to process the remaining 4 new tokens in the new prefill. + +Since we have the block indices / slot mappings from the kv cache manager, we can simply mirror the mappings and leverage the same indices for the cached hidden states and multimodal outputs. This allows us to look up the correct tensors from our externally maintained 3D caches. + +``` +0: [ The quick brown fox ] < already in the cache +1: [ was tired and slept ] +2: [beneath the shady tree ] +3: [ jumped over the dog ] < added on the second request +4: [EMPTY] +... +7: [EMPTY] +... +``` + +Finally, to pass the full hidden states and multimodal outputs to the next stage, we simply concatenate the cached contents with the corresponding new tensors computed from the current forward call. + + +### What About Multimodal Inputs? +It's also useful to consider the case about how Omni prefix caching is handled when we have multimodal inputs that don't cleanly end on block boundaries, as well as how this works with multimodal encoder caching in vLLM. For example: + +``` + [ Im0 Im1 Im2 Im3 ] [ Im4 Im5 foo ] +Block 1: |<--- block tokens ---->| +Block 2: |<------- prefix ------>| |<--- block tokens --->| +``` + +In this case, only `Block 1` will have outputs stored in the prefix tensor cache, because vLLM does not store partial blocks. 
This may appear to be a problem at first glance, because the multimodal input is fragmented across a new block that wasn't cached. + +In reality, this isn't a big problem for correctness, because vLLM also maintains an encoder cache for multimodal inputs. In other words, after the first pass, we'll have the following: + +- The Block 1 hash, which is used for prefix caching +- The hash describing the image data starting at position 0 and with length 6 +- In vLLM's encoder cache, a mapping from the image hash above to the encoder output + + +To understand what happens, say we get the following input as a second request: +``` + [ Im0 Im1 Im2 Im3 ] [ Im4 Im5 bar baz ] +Block 1: |<--- block tokens ---->| +Block 2: |<------- prefix ------>| |<--- block tokens --->| +``` + +First, the scheduler will check for a prefix cache hit, which we will see on `Block 1`. As a result, we will have 4 tokens marked as precomputed, and only see the remaining 4 tokens in the following prefill. + +Because we have multimodal data in a scheduled span that isn't fully precomputed, we still need to call the visual encoder. However, since we have the image hash and encoder cache, we will retrieve the encoder outputs for `Im4` and `Im5` as we create the multimodal embeddings. + +When we pass our multimodal tensors to the language model component in the same stage, we'll then expect the same outputs, because the prefix caching behaviors in vLLM-Omni / vLLM match, so the LLM will use vLLM's KV cache manager's prefix caching to correctly handle the attention information for `Block 1` while calculating the outputs for `Block 2`, giving us the correct results for processing `Block 2` with the context of `Block 1`. + +Finally, we look up the output hidden states/multimodal tensors corresponding to the prefix cache hit `Block 1` and concatenate them with the forward pass result to get the final result, which is expected to be identical to the full hidden states when prefix caching is disabled.
diff --git a/tests/conftest.py b/tests/conftest.py index 098fd8d970c..ad1008b7263 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1850,6 +1850,7 @@ class OmniResponse: e2e_latency: float | None = None success: bool = False error_message: str | None = None + cached_tokens: int | None = None @dataclass @@ -2345,6 +2346,11 @@ def _process_non_stream_omni_response(self, chat_completion) -> OmniResponse: if hasattr(choice.message, "content") and choice.message.content is not None: text_content = choice.message.content + # Extract cached_tokens for prefix caching tests + usage = getattr(chat_completion, "usage", None) + if usage and (details := getattr(usage, "prompt_tokens_details", None)): + result.cached_tokens = details.cached_tokens + # Calculate end-to-end latency result.e2e_latency = time.perf_counter() - start_time diff --git a/tests/core/test_prefix_cache.py b/tests/core/test_prefix_cache.py new file mode 100644 index 00000000000..c3d8c1ff928 --- /dev/null +++ b/tests/core/test_prefix_cache.py @@ -0,0 +1,347 @@ +from unittest.mock import Mock, patch + +import pytest +import torch + +from vllm_omni.core.prefix_cache import OmniTensorPrefixCache + +DEFAULT_SEQ_LEN = 15 +NUM_BLOCKS = 10 +BLOCK_SIZE = 4 +HIDDEN_SIZE = 2 +DTYPE = torch.float32 +OTHER_DTYPE = torch.float16 +DEFAULT_SHAPE = torch.Size([NUM_BLOCKS, BLOCK_SIZE, HIDDEN_SIZE]) + + +class MockInputBatch: + def __init__(self, num_computed_tokens_cpu): + self.req_ids = ["req1", "req2"] + self.req_id_to_index = {req_id: i for i, req_id in enumerate(self.req_ids)} + self.num_computed_tokens_cpu = num_computed_tokens_cpu + # Block table is only mocked for validation of length; + # we don't actually need to add valid values here since + # we patch the table when testing. 
+ self.block_table = Mock() + self.block_table.block_tables = [None] + + +def get_omni_pcache_with_mm_tensors(feat_dims, seq_len) -> OmniTensorPrefixCache: + """Build an OmniTensorPrefixCache and init mm tensors.""" + cache = get_omni_pcache() + mm_outputs = get_multimodal_outputs(feat_dims, seq_len) + cache.maybe_init_missing_mm_cache_keys(mm_outputs, seq_len) + return cache + + +def get_omni_pcache() -> OmniTensorPrefixCache: + """Build an OmniTensorPrefixCache, but don't init mm tensors.""" + cache = OmniTensorPrefixCache( + num_blocks=NUM_BLOCKS, + block_size=BLOCK_SIZE, + hidden_size=HIDDEN_SIZE, + hs_dtype=DTYPE, + ) + return cache + + +def get_multimodal_outputs(feat_dims: dict[str, int], seq_len: int) -> dict[str, torch.Tensor]: + fake_mm_inputs = {} + for mm_key, feat_dim in feat_dims.items(): + fake_mm_inputs[mm_key] = torch.rand((seq_len, feat_dim), dtype=DTYPE) + return fake_mm_inputs + + +### Tests for initialization +def test_initialization_simple(): + """Check default initialization only creates the hidden states.""" + cache = get_omni_pcache() + assert isinstance(cache.hidden_states_cache, torch.Tensor) + assert cache.hidden_states_cache.shape == DEFAULT_SHAPE + assert len(cache.mm_outputs_cache) == 0 + assert len(cache.mm_cache_keys) == 0 + + +def test_initialization_with_multimodal(): + """Check initialization + registration of multimodal outputs.""" + cache = get_omni_pcache() + feat_dims = {"foo": 100, "bar": 50, "baz": 10} + mm_outputs = get_multimodal_outputs( + feat_dims, + seq_len=DEFAULT_SEQ_LEN, + ) + # Cast one of the keys to a different dtype; the dtype of the tensor + # that is used to initialize the cache dictates the cache dtype. 
+ mm_outputs["foo"] = mm_outputs["foo"].to(OTHER_DTYPE) + + cache.maybe_init_missing_mm_cache_keys(mm_outputs, DEFAULT_SEQ_LEN) + assert len(cache.mm_cache_keys) == 3 + assert set(cache.mm_cache_keys) == set(feat_dims.keys()) + for mm_key in cache.mm_cache_keys: + cache_tensor = cache.mm_outputs_cache[mm_key] + assert isinstance(cache_tensor, torch.Tensor) + assert cache_tensor.shape[-1] == feat_dims[mm_key] + assert mm_outputs[mm_key].dtype == cache_tensor.dtype + + +def test_init_missing_mm_cache_keys_is_idempotent(): + """Ensure that the cache doesn't reinitialize old keys.""" + cache = get_omni_pcache() + mm_key = "foo" + feat_dims = {mm_key: 100} + mm_outputs = get_multimodal_outputs( + feat_dims, + seq_len=DEFAULT_SEQ_LEN, + ) + cache.maybe_init_missing_mm_cache_keys(mm_outputs, DEFAULT_SEQ_LEN) + assert len(cache.mm_cache_keys) == 1 + assert mm_key in cache.mm_cache_keys + + # Cache is initialized to 0 - fill it with 1s + cache.mm_outputs_cache[mm_key].fill_(1) + + # Ensure that running another initialization + # doesn't zero out our cache values + cache.maybe_init_missing_mm_cache_keys(mm_outputs, DEFAULT_SEQ_LEN) + assert len(cache.mm_cache_keys) == 1 + assert mm_key in cache.mm_cache_keys + assert torch.all(cache.mm_outputs_cache[mm_key] == 1) + + +### Tests for Update +def test_update_no_multimodal(): + """Test that slot mappings act as row indices hidden states.""" + cache = get_omni_pcache() + + num_tokens_unpadded = 8 + slot_offset = 8 + slot_mapping = torch.arange(slot_offset, slot_offset + num_tokens_unpadded) + new_hidden_states = torch.rand((num_tokens_unpadded, HIDDEN_SIZE), dtype=DTYPE) + + cache.update_omni_tensor_prefix_cache( + hidden_states=new_hidden_states, + multimodal_outputs=None, + num_tokens_unpadded=num_tokens_unpadded, + slot_mapping=slot_mapping, + ) + + # Ensure that if we reshape our 3D cache back to 2D, we can use the + # indices in our slot mappings to access the hidden states as expected + hs_rows = 
cache.hidden_states_cache.view(NUM_BLOCKS * BLOCK_SIZE, HIDDEN_SIZE) + for slot_idx, new_states in zip(slot_mapping, new_hidden_states): + slot_states = hs_rows[slot_idx] + assert torch.all(slot_states == new_states) + + +@pytest.mark.parametrize( + "feat_dims", + [ + {"foo": 100, "bar": 100}, + {"foo": 100, "bar": 50, "baz": 10}, + ], +) +def test_update_with_multimodal_outputs(feat_dims): + """Test that slot mappings are correct for multimodal tensors.""" + cache = get_omni_pcache_with_mm_tensors(feat_dims, seq_len=DEFAULT_SEQ_LEN) + + num_tokens_unpadded = 8 + slot_offset = 8 + slot_mapping = torch.arange(slot_offset, slot_offset + num_tokens_unpadded) + feature_dims = {key: val.shape[-1] for key, val in cache.mm_outputs_cache.items()} + mm_outputs = {key: torch.rand((num_tokens_unpadded, feature_dims[key]), dtype=DTYPE) for key in cache.mm_cache_keys} + cache.update_omni_tensor_prefix_cache( + hidden_states=None, + multimodal_outputs=mm_outputs, + num_tokens_unpadded=num_tokens_unpadded, + slot_mapping=slot_mapping, + ) + + for mm_key in feat_dims.keys(): + assert mm_key in cache.mm_outputs_cache + key_feat_dim = feature_dims[mm_key] + mm_state_rows = cache.mm_outputs_cache[mm_key].view(NUM_BLOCKS * BLOCK_SIZE, key_feat_dim) + + # Similar to hidden states, but for each key in the dict; + # Different tensors may have different feature dims + new_mm_outputs = mm_outputs[mm_key] + for slot_idx, new_output in zip(slot_mapping, new_mm_outputs): + slot_states = mm_state_rows[slot_idx] + assert torch.all(slot_states == new_output) + + +### Tests for Merging +def fake_get_cached_block_ids(self, req_idx, *args, **kwargs): + """Fake block table lookup. + + Assumption: + req_idx 0 is a cache hit with slots 8, 9, ..., 15 + req_idx 1 is a cache miss + """ + assert req_idx < 2 + if req_idx == 0: + # With the slot offset we provided (8), the corresponding + # blocks IDs are 2 & 3 because the block size is 4. 
+ return torch.tensor([2, 3], dtype=torch.long) + return torch.tensor([], dtype=torch.long) + + +@pytest.mark.parametrize("num_tokens_padded", [None, 16]) +def test_get_merged_hidden_states(num_tokens_padded): + """Ensure that hidden states are merged correctly.""" + cache = get_omni_pcache() + + orig_num_tokens_unpadded = 8 + slot_offset = 8 # We'll put our states in slots 8, 9, 10, ..., 15 + orig_slot_mapping = torch.arange(slot_offset, slot_offset + orig_num_tokens_unpadded) + orig_hidden_states = torch.rand((orig_num_tokens_unpadded, HIDDEN_SIZE), dtype=DTYPE) + + cache.update_omni_tensor_prefix_cache( + hidden_states=orig_hidden_states, + multimodal_outputs=None, + num_tokens_unpadded=orig_num_tokens_unpadded, + slot_mapping=orig_slot_mapping, + num_tokens_padded=num_tokens_padded, + ) + + # Say that we have two requests, but only one of them is a cache hit + num_new_toks_req1 = 3 + num_new_toks_req2 = 2 + cache.add_prefix_cached_new_req_id("req1") + + num_scheduled_tokens = { + "req1": num_new_toks_req1, + "req2": num_new_toks_req2, + } + new_hidden_states = torch.rand( + (num_new_toks_req1 + num_new_toks_req2, HIDDEN_SIZE), + dtype=DTYPE, + ) + req1_new_states = new_hidden_states[:num_new_toks_req1] + req2_new_states = new_hidden_states[-num_new_toks_req2:] + + input_batch = MockInputBatch(num_computed_tokens_cpu=torch.Tensor([orig_num_tokens_unpadded, 0])) + + with patch( + "vllm_omni.core.prefix_cache.OmniTensorPrefixCache._get_cached_block_ids", + new=fake_get_cached_block_ids, + ): + merged_states = cache.get_merged_hidden_states( + query_start_loc=[0, num_new_toks_req1], + input_batch=input_batch, + hidden_states=new_hidden_states, + num_scheduled_tokens=num_scheduled_tokens, + ) + + assert "req1" in merged_states and "req2" in merged_states + req1_merged_states = merged_states["req1"] + req2_merged_states = merged_states["req2"] + + # First, check the cache hit case + assert req1_merged_states.shape == torch.Size([orig_num_tokens_unpadded + 
num_new_toks_req1, HIDDEN_SIZE]) + # Ensure that the req1 merged states are the cached states + the new req1 states + assert torch.all(req1_merged_states[:orig_num_tokens_unpadded] == orig_hidden_states) + assert torch.all(req1_merged_states[-num_new_toks_req1:] == req1_new_states) + + # Next, ensure that the cache miss case only has the new states + assert req2_merged_states.shape == torch.Size([num_new_toks_req2, HIDDEN_SIZE]) + assert torch.all(req2_merged_states == req2_new_states) + + +@pytest.mark.parametrize("num_tokens_padded", [None, 16]) +@pytest.mark.parametrize( + "feat_dims", + [ + {"foo": 100, "bar": 100}, + {"foo": 100, "bar": 50, "baz": 10}, + ], +) +def test_get_merged_multimodal_outputs(feat_dims, num_tokens_padded): + cache = get_omni_pcache_with_mm_tensors(feat_dims, seq_len=DEFAULT_SEQ_LEN) + + orig_num_tokens_unpadded = 8 + slot_offset = 8 # We'll put our states in slots 8, 9, 10, ..., 15 + orig_slot_mapping = torch.arange(slot_offset, slot_offset + orig_num_tokens_unpadded) + feature_dims = {key: val.shape[-1] for key, val in cache.mm_outputs_cache.items()} + orig_mm_outputs = { + key: torch.rand((orig_num_tokens_unpadded, feature_dims[key]), dtype=DTYPE) for key in cache.mm_cache_keys + } + + cache.update_omni_tensor_prefix_cache( + hidden_states=None, + multimodal_outputs=orig_mm_outputs, + num_tokens_unpadded=orig_num_tokens_unpadded, + slot_mapping=orig_slot_mapping, + num_tokens_padded=num_tokens_padded, + ) + + # Similar to hs test- say that we have two requests, but only one of them is a cache hit + num_new_toks_req1 = 3 + num_new_toks_req2 = 2 + cache.add_prefix_cached_new_req_id("req1") + + num_scheduled_tokens = { + "req1": num_new_toks_req1, + "req2": num_new_toks_req2, + } + + new_mm_outputs = {} + for mm_key in cache.mm_cache_keys: + new_mm_outputs[mm_key] = torch.rand( + (num_new_toks_req1 + num_new_toks_req2, feature_dims[mm_key]), + dtype=DTYPE, + ) + # We also want to make sure passthrough data (outside of our keys) isn't 
dropped + new_mm_outputs["passthrough_data"] = "Something else" + # Lists are a special case because we can't split them yet if we want to match + # the nonprefix cache behavior, because this runs before post process. + new_mm_outputs["passthrough_list"] = ["should", "not", "split"] + + input_batch = MockInputBatch(num_computed_tokens_cpu=torch.Tensor([orig_num_tokens_unpadded, 0])) + + with patch( + "vllm_omni.core.prefix_cache.OmniTensorPrefixCache._get_cached_block_ids", + new=fake_get_cached_block_ids, + ): + merged_mm_outputs = cache.get_merged_multimodal_states( + query_start_loc=[0, num_new_toks_req1], + input_batch=input_batch, + multimodal_outputs=new_mm_outputs, + num_scheduled_tokens=num_scheduled_tokens, + ) + + # Ensure the passthrough data wasn't dropped + assert "passthrough_data" in merged_mm_outputs + assert "passthrough_list" in merged_mm_outputs + + for mm_key, mm_output in merged_mm_outputs.items(): + # Ensure passthrough data is just forwarded normally and not duplicated + assert isinstance(mm_output, dict) + assert "req1" in mm_output and "req2" in mm_output + if mm_key == "passthrough_data": + assert mm_key not in cache.mm_cache_keys + assert new_mm_outputs[mm_key] == mm_output["req1"] + assert new_mm_outputs[mm_key] == mm_output["req2"] + elif mm_key == "passthrough_list": + assert mm_key not in cache.mm_cache_keys + assert new_mm_outputs[mm_key] == mm_output["req1"] + assert new_mm_outputs[mm_key] == mm_output["req2"] + else: + assert mm_key in cache.mm_cache_keys + curr_feat_dim = feature_dims[mm_key] + # Ensure that req1 (cache hit) merged the mm data + req1_merged_mm_outputs = mm_output["req1"] + req1_new_mm_outputs = new_mm_outputs[mm_key][:num_new_toks_req1] + + assert req1_merged_mm_outputs.shape == torch.Size( + [orig_num_tokens_unpadded + num_new_toks_req1, curr_feat_dim] + ) + # Ensure that the req1 merged mm data are the cached data + the new data + assert torch.all(req1_merged_mm_outputs[:orig_num_tokens_unpadded] == 
orig_mm_outputs[mm_key]) + assert torch.all(req1_merged_mm_outputs[-num_new_toks_req1:] == req1_new_mm_outputs) + + # Ensure that req2 (cache miss) only has the new mm data + req2_merged_mm_outputs = mm_output["req2"] + req2_new_mm_outputs = new_mm_outputs[mm_key][-num_new_toks_req2:] + + assert req2_merged_mm_outputs.shape == torch.Size([num_new_toks_req2, curr_feat_dim]) + assert torch.all(req2_merged_mm_outputs == req2_new_mm_outputs) diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py index f4aabb8b957..c05f8f50674 100644 --- a/tests/e2e/online_serving/test_qwen3_omni.py +++ b/tests/e2e/online_serving/test_qwen3_omni.py @@ -23,11 +23,13 @@ models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] +QWEN3_OMNI_CONFIG_PATH = str(Path(__file__).parent.parent / "stage_configs" / "qwen3_omni_ci.yaml") +QWEN3_OMNI_XPU_CONFIG_PATH = str(Path(__file__).parent.parent / "stage_configs" / "xpu" / "qwen3_omni_ci.yaml") -def get_chunk_config(): +def get_chunk_config(config_path: str): path = modify_stage_config( - str(Path(__file__).parent.parent / "stage_configs" / "qwen3_omni_ci.yaml"), + config_path, updates={ "async_chunk": True, "stage_args": { @@ -44,15 +46,41 @@ def get_chunk_config(): return path +def get_prefix_caching_config(config_path: str): + """Create a stage config with prefix caching enabled on the thinker (stage 0).""" + path = modify_stage_config( + config_path, + updates={ + "stage_args": { + 0: {"engine_args.enable_prefix_caching": True}, + }, + }, + ) + return path + + if current_omni_platform.is_xpu(): - stage_configs = [str(Path(__file__).parent.parent / "stage_configs" / "xpu" / "qwen3_omni_ci.yaml")] + stage_configs = [QWEN3_OMNI_XPU_CONFIG_PATH] + prefix_caching_stage_configs = [get_prefix_caching_config(QWEN3_OMNI_XPU_CONFIG_PATH)] else: # MI325 GPU should share the same config as H100 - stage_configs = [get_chunk_config()] + stage_configs = [get_chunk_config(QWEN3_OMNI_CONFIG_PATH)] + 
prefix_caching_stage_configs = [get_prefix_caching_config(QWEN3_OMNI_CONFIG_PATH)] # Create parameter combinations for model and stage config test_params = [ OmniServerParams(model=model, stage_config_path=stage_config) for model in models for stage_config in stage_configs ] +# For prefix caching, we need to enable prompt token details so that we +# can determine if any tokens were cached. +prefix_test_params = [ + OmniServerParams( + model=model, + stage_config_path=stage_config, + server_args=["--enable-prompt-tokens-details"], # Enable prompt tokens details to get cached_tokens + ) + for model in models + for stage_config in prefix_caching_stage_configs +] def get_system_prompt(): @@ -75,6 +103,7 @@ def get_prompt(prompt_type="text_only"): prompts = { "text_only": "What is the capital of China? Answer in 20 words.", "mix": "What is recited in the audio? What is in this image? Describe the video briefly.", + "text_image": "What color are the squares in this image?", } return prompts.get(prompt_type, prompts["text_only"]) @@ -147,3 +176,41 @@ def test_text_to_text_001(omni_server, openai_client) -> None: } openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) + + +@pytest.mark.advanced_model +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) +@pytest.mark.parametrize("omni_server", prefix_test_params, indirect=True) +def test_thinker_prefix_caching(omni_server, openai_client) -> None: + """ + Test thinker prefix caching by sending identical requests with an image (i.e., + a large shared prefix) and verifying that the second request uses cached tokens + & produces the same output. 
+ """ + image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + image_data_url=image_data_url, + content_text=get_prompt("text_image"), + ) + + request_config = { + "model": omni_server.model, + "messages": messages, + "stream": False, + "modalities": ["text"], + } + + response_1 = openai_client.send_omni_request(request_config, request_num=1)[0] + response_2 = openai_client.send_omni_request(request_config, request_num=1)[0] + + assert response_1.success + assert response_2.success + assert response_2.cached_tokens is not None + # We should cache the vast majority of the prompt (image + up to last full block), + # and set seed in the CI config, so the second request should give an identical + # response for the generated input image, even if we use dummy weights + assert response_2.cached_tokens > 0 + assert response_1.text_content == response_2.text_content diff --git a/vllm_omni/core/prefix_cache.py b/vllm_omni/core/prefix_cache.py new file mode 100644 index 00000000000..69e7346c4c1 --- /dev/null +++ b/vllm_omni/core/prefix_cache.py @@ -0,0 +1,264 @@ +""" +Utilities for Prefix Caching in Omni models. +""" + +import torch +from vllm.logger import init_logger +from vllm.v1.worker.gpu_input_batch import InputBatch + +from vllm_omni.utils.mm_outputs import build_mm_cpu, to_payload_element + +logger = init_logger(__name__) + + +class OmniTensorPrefixCache: + """Prefix cache for hidden states (model outputs) and model specific + multimodal outputs. + + This class implements prefix caching in a non-invasive way on top of + vLLM by leveraging the same slot mappings that the vLLM scheduler uses + for the KV Cache. + + Conceptually, this means we are mapping vLLM's cache mapping: + (num_blocks, block_size) + + to 3D tensors of shape: + (num_blocks, block_size, feature_size) + + Note that feature_size may vary across multimodal_outputs. 
+ """ + + def __init__( + self, + num_blocks: int, + block_size: int, + hidden_size: int, + hs_dtype: torch.dtype, + ): + self.num_blocks = num_blocks + self.block_size = block_size + self.default_hidden_size = hidden_size + + # Initialize the hidden states cache immediately + self.hidden_states_cache = self._get_cache_tensor(dtype=hs_dtype) + + # Defer initialization of the mm_outputs_cache until we + # actually see mm output tensors dependent on num tokens. + self.mm_outputs_cache = {} + self.mm_cache_keys = set() + self._new_req_cache_hit_ids: set[str] = set() + + def maybe_init_missing_mm_cache_keys(self, multimodal_outputs: dict, seq_len: int): + """Given multimodal outputs from executing the model, dynamically + determine which multimodal outputs are tensors depending on sequence + length and should be cached, and initialize the cache tensors + accordingly. + + NOTE: This is done to avoid the need for explicit specification of + cache keys for every model/stage and aligns with the current way + that we slice the multimodal outputs based on the first dimension. + + This will usually be called by the first forward pass, i.e., + determined by the warmup. 
+ """ + for key, val in multimodal_outputs.items(): + if isinstance(val, torch.Tensor) and val.shape[0] == seq_len and key not in self.mm_cache_keys: + feat_dim = val.shape[-1] + self.mm_outputs_cache[key] = self._get_cache_tensor( + dtype=val.dtype, + hidden_size=feat_dim, + ) + self.mm_cache_keys.add(key) + new_tensor_shape = self.mm_outputs_cache[key].shape + logger.info("Initializing multimodal output cache of size %s for key: %s", list(new_tensor_shape), key) + + def _get_cache_tensor(self, dtype: torch.dtype, hidden_size: int | None = None) -> torch.Tensor: + """Allocate a CPU cache tensor for a specific key.""" + actual_hidden_size = hidden_size if hidden_size is not None else self.default_hidden_size + return torch.zeros( + (self.num_blocks, self.block_size, actual_hidden_size), + dtype=dtype, + device="cpu", + ) + + def add_prefix_cached_new_req_id(self, req_id: str): + """Adds a new request ID to the set of prefix cache hits on the batch.""" + self._new_req_cache_hit_ids.add(req_id) + + def reset_prefix_cached_new_req_ids(self): + """Clears the cache hit IDs to prepare for a new engine step.""" + self._new_req_cache_hit_ids.clear() + + @staticmethod + def _coerce_to_cpu_tensor(maybe_gpu_tensor: torch.Tensor) -> torch.Tensor: + """Convert GPU tensors -> contiguous CPU tensors if needed.""" + return maybe_gpu_tensor.detach().cpu().contiguous() + + def update_omni_tensor_prefix_cache( + self, + hidden_states: torch.Tensor | None, + multimodal_outputs: dict[str, torch.Tensor] | None, + num_tokens_unpadded: int, + slot_mapping: torch.Tensor, + num_tokens_padded: int | None = None, + ): + """Updates the hidden cache state for the provided hidden states and multimodal outputs. 
+ + Args: + hidden_states: Hidden states tensor to cache (if any) + multimodal_outputs: Multimodal dict whose tensors may be cached + num_tokens_unpadded: Number of tokens without padding + slot_mapping: Slot mapping for the input sequence + num_tokens_padded: Total number of tokens including padding + """ + unpadded_slot_mapping = slot_mapping[:num_tokens_unpadded] + if num_tokens_padded is None: + num_tokens_padded = num_tokens_unpadded + + if hidden_states is not None: + # Slice to unpadded portion before caching + hidden_states = hidden_states[:num_tokens_unpadded] + # Ensure that hidden states are on the CPU + hidden_states = OmniTensorPrefixCache._coerce_to_cpu_tensor(hidden_states) + # View the cache as 2D so that we can treat our slots as row indices + flat_cache = self.hidden_states_cache.view(-1, self.hidden_states_cache.shape[-1]) + flat_cache[unpadded_slot_mapping] = hidden_states + logger.debug("Writing to hidden states for %s tokens", num_tokens_unpadded) + + # Do the same for the stage's cached multimodal outputs + if multimodal_outputs is not None: + # If we haven't initialized the keys already, do it now + # We check against the padded token count since we haven't sliced yet + self.maybe_init_missing_mm_cache_keys( + multimodal_outputs, + seq_len=num_tokens_padded, + ) + + for mm_out_key, mm_cache in self.mm_outputs_cache.items(): + if mm_out_key in multimodal_outputs: + # Slice to unpadded portion before caching + mm_state = multimodal_outputs[mm_out_key][:num_tokens_unpadded] + mm_state = OmniTensorPrefixCache._coerce_to_cpu_tensor(mm_state) + flat_cache = mm_cache.view(-1, mm_cache.shape[-1]) + flat_cache[unpadded_slot_mapping] = mm_state + logger.debug("Writing to mm output cache for %s tokens", num_tokens_unpadded) + + def _coerce_to_payload_dict( + self, + element: object, + query_start_loc: torch.Tensor, + input_batch: InputBatch, + num_scheduled_tokens: dict[str, int], + ) -> dict[str, object]: + """Build the multimodal passthrough data per 
request for + the object under consideration. This is identical to the case + for no prefix cache when we tensor does have a first dimension + matching the seq len. + """ + elem_dict = {} + for req_id in input_batch.req_ids: + req_idx = input_batch.req_id_to_index[req_id] + start = query_start_loc[req_idx] + end = start + num_scheduled_tokens[req_id] + elem_dict[req_id] = to_payload_element( + element, req_idx, start=start, end=end, pass_lists_through=True, seq_len=None + ) + return elem_dict + + def get_merged_multimodal_states( + self, + query_start_loc: torch.Tensor, + input_batch: InputBatch, + multimodal_outputs: dict, + num_scheduled_tokens: dict[str, int], + ): + """Get the merged multimodal states if hidden state prefix caching is enabled.""" + combined_multimodal_outputs = {} + # First get the prefix cached tensors that are present in the mm data + for mm_key in self.mm_cache_keys: + if mm_key in multimodal_outputs: + combined_multimodal_outputs[mm_key] = self._get_merged_tensors( + query_start_loc=query_start_loc, + input_batch=input_batch, + cache=self.mm_outputs_cache[mm_key], + hidden_states=multimodal_outputs[mm_key], + num_scheduled_tokens=num_scheduled_tokens, + ) + + # Then, get everything else (passthrough data); first, convert to CPU + # tensors similarly to the non prefix cached path, and then populate + # the subdicts mapping request IDs -> payload objects + passthrough_keys = set(multimodal_outputs.keys()) - self.mm_cache_keys + passthrough_mm_data = {k: v for k, v in multimodal_outputs.items() if k in passthrough_keys} + mm_cpu = build_mm_cpu(multimodal_outputs=passthrough_mm_data) + + for mm_key, mm_val in mm_cpu.items(): + combined_multimodal_outputs[mm_key] = self._coerce_to_payload_dict( + element=mm_val, + query_start_loc=query_start_loc, + input_batch=input_batch, + num_scheduled_tokens=num_scheduled_tokens, + ) + return combined_multimodal_outputs + + def get_merged_hidden_states(self, *args, **kwargs) -> dict[str, torch.Tensor]: + 
"""Get the merged hidden states.""" + return self._get_merged_tensors( + *args, + **kwargs, + cache=self.hidden_states_cache, + ) + + def _get_merged_tensors( + self, + query_start_loc: torch.Tensor, + input_batch: InputBatch, + cache: torch.Tensor, + hidden_states: torch.Tensor, + num_scheduled_tokens: dict[str, int], + ) -> dict[str, torch.Tensor]: + """When hidden state caching is enabled, takes the input hidden_states, + which only correspond to the scheduled tokens, and returns a mapping + from request IDs to their full hidden states. This is accomplished by + looking up the block IDs & scheduled token counts to split the + hidden_states. + """ + # We do not support hybrid caches at the moment. + if len(input_batch.block_table.block_tables) > 1: + logger.warning_once( + "Omni prefix caching is enabled, but the batch block table appears to" + " have multiple kv groups; only the first group will be used!" + ) + + combined_hidden_states = {} + hidden_states = OmniTensorPrefixCache._coerce_to_cpu_tensor(hidden_states) + for req_id in input_batch.req_ids: + req_idx = input_batch.req_id_to_index[req_id] + + if req_id in self._new_req_cache_hit_ids: + block_ids = self._get_cached_block_ids(req_idx, input_batch) + cached_hs = cache[block_ids].reshape(-1, cache.shape[-1]) + + # Slice the hidden states corresponding to this request; + # we do this by using the query start + start = query_start_loc[req_idx] + new_hs = hidden_states[start : start + num_scheduled_tokens[req_id]] + combined_hidden_states[req_id] = torch.cat([cached_hs, new_hs], dim=0) + else: + # cache miss for this request, pass through normally + start = query_start_loc[req_idx] + new_hs = hidden_states[start : start + num_scheduled_tokens[req_id]] + combined_hidden_states[req_id] = new_hs + + return combined_hidden_states + + def _get_cached_block_ids(self, req_idx: int, input_batch: InputBatch) -> torch.Tensor: + """Given an input batch and request index in the batch (not ID), get the + block IDs 
corresponding to the cache hit. + """ + num_computed = input_batch.num_computed_tokens_cpu[req_idx] + # NOTE: vLLM only caches full blocks + num_cached_blocks = num_computed // self.block_size + # Get the block IDs attached to this cache hit and reindex into + # the flattened cached hidden states (i.e., 1 row per token). + return input_batch.block_table[0].block_table.cpu[req_idx, :num_cached_blocks] diff --git a/vllm_omni/utils/mm_outputs.py b/vllm_omni/utils/mm_outputs.py new file mode 100644 index 00000000000..66d4e6ffe04 --- /dev/null +++ b/vllm_omni/utils/mm_outputs.py @@ -0,0 +1,93 @@ +"""Utilities for handling multimodal outputs / building multimodal output +payloads, most of which are shared by the prefix cache / no prefix cache path. +""" + +import torch +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def build_mm_cpu(multimodal_outputs: dict) -> dict[str, object]: + """Pre-copies multimodal tensor to CPU once (not per-request) to avoid + redundant D2H transfers when gpu_resident_buffer_keys keeps them on GPU. + + In the case of prefix caching, the multimodal outputs provided will + only contain the passthrough data. + + Args: + multimodal_outputs: Multimodal dict mapping strings to objects. + """ + # Pre-copy multimodal tensors to CPU once (not per-request) to avoid + # redundant D2H transfers when gpu_resident_buffer_keys keeps them on GPU. + mm_cpu: dict[str, object] = {} + # Currently there are some cases where this is true at the + # moment, which should be fixed. 
+ if not isinstance(multimodal_outputs, dict): + logger.warning("Multimodal outputs are not a dict and will not be passed") + + if multimodal_outputs: + for k, v in multimodal_outputs.items(): + if isinstance(v, torch.Tensor): + mm_cpu[k] = v.detach().to("cpu").contiguous() + elif isinstance(v, dict): + sub_dict: dict[str, torch.Tensor] = {} + for sk, sv in v.items(): + if isinstance(sv, torch.Tensor): + sub_dict[str(sk)] = sv.detach().to("cpu").contiguous() + if sub_dict: + mm_cpu[k] = sub_dict + elif isinstance(v, list) and len(v) > 0: + cpu_list = [] + for elem in v: + if isinstance(elem, torch.Tensor): + cpu_list.append(elem.detach().to("cpu").contiguous()) + else: + cpu_list.append(elem) + mm_cpu[k] = cpu_list + elif v is not None: + mm_cpu[k] = v + return mm_cpu + + +def to_payload_element( + element: object, idx: int, start: int, end: int, pass_lists_through: bool = False, seq_len: int | None = None +): + """Build an mm payload element corresponding to one request index + from an element containing 0 or more CPU tensors. + + Args: + element: The object to be added to the payload. + idx: The index of the request. + start: The start index corresponding to the request idx. + end: The end index corresponding to the request idx. + pass_lists_through: bool Whether or not lists should be treated as + passthrough data; this should be False in normal cases, but True + if we need to avoid splitting nonempty lists prior to calling + postprocess, which is the case for prefix cache. + seq_len: Optional sequence length (i.e., dim 0 of hidden states). + This should be set to None in the prefix caching case, because + the condition that would be executed here is the same as the + criteria for being added to the multimodal outputs cache. + """ + # Prefix cache won't hit this case because this is the condition + # for being a mm_cache_key in the multimodal outputs tensor. 
+ if seq_len is not None and isinstance(element, torch.Tensor) and element.shape[0] == seq_len: + return element[start:end].contiguous() + # Every other case is shared between prefix cache (passthrough data) + # and running a model without prefix caching. + elif isinstance(element, dict): + return {sk: sv[start:end].contiguous() for sk, sv in element.items()} + elif isinstance(element, list): + # For lists, clone tensors to avoid cross-request aliasing + if pass_lists_through: + return [elem.clone() if isinstance(elem, torch.Tensor) else elem for elem in element] + element = element[idx] if idx < len(element) else element[0] + if isinstance(element, torch.Tensor): + element = element.clone() + return element + elif isinstance(element, torch.Tensor): + # List-derived tensor payloads are request-invariant; clone to + # avoid accidental cross-request aliasing on downstream mutation. + return element.clone() + return element diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 62a0c857164..f37b2224efb 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -39,6 +39,7 @@ from vllm_omni.distributed.omni_connectors.kv_transfer_manager import OmniKVTransferManager from vllm_omni.outputs import OmniModelRunnerOutput +from vllm_omni.utils.mm_outputs import build_mm_cpu, to_payload_element from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner from vllm_omni.worker.omni_connector_model_runner_mixin import OmniConnectorModelRunnerMixin @@ -201,6 +202,63 @@ def _capture_talker_mtp_graphs(self) -> None: finally: set_cudagraph_capturing_enabled(False) + def _maybe_update_prefix_cache( + self, + hidden_states: torch.Tensor, + multimodal_outputs: dict, + num_tokens_unpadded: int, + num_tokens_padded: int, + ): + """If prefix caching is enabled and it's the last pipeline parallelism rank, + retrieve the hidden states & multimodal outputs from the prefix cache based + on our batch 
slot mappings. + """ + # Cache hidden states if we've enabled hidden state prefix caching + # unless this isn't the last pipeline parallelism rank. + if self.omni_prefix_cache is not None and get_pp_group().is_last_rank: + # If this happens, it generally means the model is not following the correct + # interface yet and is therefore currently not compatible with prefix cache. + if multimodal_outputs is not None and not isinstance(multimodal_outputs, dict): + logger.warning_once( + "prefix caching expects mm outputs to be a dict, but got %s", + type(multimodal_outputs), + ) + + self.omni_prefix_cache.update_omni_tensor_prefix_cache( + hidden_states=hidden_states, + multimodal_outputs=multimodal_outputs, + num_tokens_unpadded=num_tokens_unpadded, + slot_mapping=self.input_batch.block_table[0].slot_mapping.cpu, + num_tokens_padded=num_tokens_padded, + ) + + def _maybe_get_combined_prefix_cache_tensors( + self, + hidden_states: torch.Tensor, + multimodal_outputs: dict, + num_scheduled_tokens: dict[str, int], + ) -> tuple[dict[str, torch.Tensor] | None, dict | None]: + """If prefix caching is enabled, extract the merged hidden states and multimodal outputs for + all requests in the batch (including those that aren't a hit on Prefix cache). + """ + # Prior to applying the post-processing func, extract + # the prefix cached hidden states and multimodal states. 
+ combined_hidden_states, combined_multimodal_outputs = None, None + if self.omni_prefix_cache is not None: + combined_hidden_states = self.omni_prefix_cache.get_merged_hidden_states( + query_start_loc=self.query_start_loc.cpu, + input_batch=self.input_batch, + hidden_states=hidden_states, + num_scheduled_tokens=num_scheduled_tokens, + ) + combined_multimodal_outputs = self.omni_prefix_cache.get_merged_multimodal_states( + query_start_loc=self.query_start_loc.cpu, + input_batch=self.input_batch, + multimodal_outputs=multimodal_outputs, + num_scheduled_tokens=num_scheduled_tokens, + ) + return combined_hidden_states, combined_multimodal_outputs + @torch.inference_mode() def execute_model( self, @@ -476,6 +534,15 @@ def execute_model( hidden_states, multimodal_outputs = self.extract_multimodal_outputs(model_output) + # Cache hidden states & multimodal outputs if we've enabled hidden state + # prefix caching unless this isn't the last pipeline parallelism rank. + self._maybe_update_prefix_cache( + hidden_states=hidden_states, + multimodal_outputs=multimodal_outputs, + num_tokens_unpadded=num_tokens_unpadded, + num_tokens_padded=num_tokens_padded, + ) + if not self.broadcast_pp_output: # Common case. if not get_pp_group().is_last_rank: @@ -589,6 +656,23 @@ def _sample( return super()._sample(logits, spec_decode_metadata) + @staticmethod + def _resolve_req_hidden_states( + hidden_states_cpu: torch.Tensor, + combined_hidden_states: dict[str, torch.Tensor] | None, + rid: str, + start: int, + end: int, + ): + if combined_hidden_states is not None: + # We always have all request IDs for prefix cache, even for + # partial cache misses, so this should never happen. 
+ if rid not in combined_hidden_states: + raise RuntimeError("Request IDs in the batch are missing from the merged states!") + return combined_hidden_states[rid] + # Prefix caching is disabled + return hidden_states_cpu[start:end] + @torch.inference_mode() def sample_tokens( self, @@ -597,6 +681,13 @@ def sample_tokens( kv_extracted_req_ids = getattr(self, "kv_extracted_req_ids", None) self.kv_extracted_req_ids = None + # Used for prefix cache + combined_hidden_states = None + combined_multimodal_outputs = None + # Used when we don't use prefix cache; prefix cache builds the payloads + # internally since it already needs to do this for the cached tensors + mm_cpu = {} + if self.execute_model_state is None: kv_connector_output = self.kv_connector_output self.kv_connector_output = None @@ -628,6 +719,7 @@ def sample_tokens( slot_mappings, # OMNI: unpack slot_mappings for drafter ) = self.execute_model_state self.execute_model_state = None + seq_len = hidden_states.shape[0] # Apply structured output bitmasks if present. if grammar_output is not None: @@ -749,67 +841,73 @@ def propose_draft_token_ids(sampled_token_ids): dtype=np.int32, ) + # Prior to applying the post-processing func, extract + # the prefix cached hidden states and multimodal states. 
+ if self.omni_prefix_cache is not None: + ( + combined_hidden_states, + combined_multimodal_outputs, + ) = self._maybe_get_combined_prefix_cache_tensors( + hidden_states, + multimodal_outputs, + scheduler_output.num_scheduled_tokens, + ) + # Otherwise we don't have the mm CPU data yet, so we still need to build it + if self.omni_prefix_cache is None: + mm_cpu = build_mm_cpu(multimodal_outputs) + self._process_additional_information_updates( - hidden_states, multimodal_outputs, num_scheduled_tokens_np, scheduler_output + hidden_states, + multimodal_outputs, + num_scheduled_tokens_np, + scheduler_output, + combined_hidden_states, + combined_multimodal_outputs, ) - # Pre-copy multimodal tensors to CPU once (not per-request) to avoid - # redundant D2H transfers when gpu_resident_buffer_keys keeps them on GPU. - mm_cpu: dict[str, object] = {} - if isinstance(multimodal_outputs, dict) and multimodal_outputs: - for k, v in multimodal_outputs.items(): - try: - if isinstance(v, torch.Tensor) and v.shape[0] == hidden_states_cpu.shape[0]: - mm_cpu[k] = v.detach().to("cpu").contiguous() - elif isinstance(v, dict): - sub_dict: dict[str, torch.Tensor] = {} - for sk, sv in v.items(): - if isinstance(sv, torch.Tensor) and sv.shape[0] == hidden_states_cpu.shape[0]: - sub_dict[str(sk)] = sv.detach().to("cpu").contiguous() - if sub_dict: - mm_cpu[k] = sub_dict - elif isinstance(v, list): - if len(v) == 0: - continue - cpu_list = [] - for elem in v: - if isinstance(elem, torch.Tensor): - cpu_list.append(elem.detach().to("cpu").contiguous()) - else: - cpu_list.append(elem) - mm_cpu[k] = cpu_list - except Exception as e: - logger.error(f"Error in merge multimodal outputs: {e}") - pooler_output: list[dict[str, object]] = [] for rid in req_ids_output_copy: idx = req_id_to_index_output_copy[rid] start = int(self.query_start_loc.cpu[idx]) sched = int(num_scheduled_tokens_np[idx]) end = start + sched - hidden_slice = hidden_states_cpu[start:end] - payload: dict[str, object] = {"hidden": 
hidden_slice} - if mm_cpu: - mm_payload: dict[str, object] = {} - for k, v in mm_cpu.items(): - if isinstance(v, torch.Tensor) and v.shape[0] == hidden_states_cpu.shape[0]: - mm_payload[k] = v[start:end].contiguous() - elif isinstance(v, dict): - mm_payload[k] = {sk: sv[start:end].contiguous() for sk, sv in v.items()} - elif isinstance(v, list): - element = v[idx] if idx < len(v) else v[0] - if element is not None: - if isinstance(element, torch.Tensor): - element = element.clone() - mm_payload[k] = element - # Skip None elements: msgspec cannot serialize None - # in dict[str, torch.Tensor] typed fields. - elif isinstance(v, torch.Tensor): - # List-derived tensor payloads are request-invariant; clone to - # avoid accidental cross-request aliasing on downstream mutation. - mm_payload[k] = v.clone() - else: - mm_payload[k] = v + # If prefix cache is enabled, we have already split everything + # by request and converted the states to CPU tensors + req_hidden_states = self._resolve_req_hidden_states( + hidden_states_cpu, + combined_hidden_states, + rid, + start, + end, + ) + payload: dict[str, object] = {"hidden": req_hidden_states} + + mm_payload: dict[str, object] = {} + if combined_multimodal_outputs or mm_cpu: + if combined_multimodal_outputs: + # Prefix cache enabled; all items have already been processed + # and split apart for each request as needed, and all tensors + # have already been detached to the CPU. The only exception is + # lists, which we keep as passthrough data for consistent behavior + # in postprocess. 
+ for mm_key in combined_multimodal_outputs.keys(): + value = combined_multimodal_outputs[mm_key][rid] + if isinstance(value, list): + mm_payload[mm_key] = value[idx] if idx < len(value) else value[0] + else: + mm_payload[mm_key] = value + + else: + # Prefix cache disabled; we still need to process the data + for mm_key, mm_val in mm_cpu.items(): + mm_payload[mm_key] = to_payload_element( + element=mm_val, + idx=idx, + start=start, + end=end, + pass_lists_through=False, + seq_len=seq_len, + ) payload.update(mm_payload) pooler_output.append(payload) with record_function_or_nullcontext("gpu_model_runner: ModelRunnerOutput"): diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 5ff62c11b40..de78011c75a 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -20,6 +20,7 @@ from vllm.v1.worker.gpu_model_runner import GPUModelRunner, IntermediateTensors, PerLayerAttnMetadata from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices +from vllm_omni.core.prefix_cache import OmniTensorPrefixCache from vllm_omni.engine.serialization import deserialize_additional_information from vllm_omni.model_executor.layers.rotary_embedding.mrope import OmniMRotaryEmbedding as MRotaryEmbedding from vllm_omni.model_executor.models.output_templates import OmniOutput @@ -43,6 +44,9 @@ def __init__(self, *args, **kwargs): self.model_intermediate_buffer: dict[str, dict[str, Any]] = {} self._omni_num_scheduled_tokens_np: np.ndarray | None = None self._omni_last_model_output: object | None = None + # The Omni tensor prefix cache will be allocated + # when we initialize the metadata builders if enabled + self.omni_prefix_cache = None def initialize_metadata_builders(self, kv_cache_config, kernel_block_sizes): """Override to fix scheduler_metadata buffer size for FA3 + CUDA graph. 
@@ -70,6 +74,16 @@ def initialize_metadata_builders(self, kv_cache_config, kernel_block_sizes): device=sm.device, ) + # Initialize the wrapper for both multimodal output tensors + # and for hidden states to be passed between stages + if self.cache_config.enable_prefix_caching: + self.omni_prefix_cache = OmniTensorPrefixCache( + num_blocks=kv_cache_config.num_blocks, + block_size=self.cache_config.block_size, + hidden_size=self.model_config.get_hidden_size(), + hs_dtype=self.dtype, + ) + @instrument(span_name="Loading (GPU)") def load_model(self, *args, **kwargs) -> None: super().load_model(*args, **kwargs) @@ -234,6 +248,10 @@ def _update_states(self, scheduler_output: "SchedulerOutput"): The SamplingMetadata is updated and copied to the GPU if there is a new/resumed/paused/finished request in the batch. """ + # Used for prefix cache + if self.omni_prefix_cache is not None: + self.omni_prefix_cache.reset_prefix_cached_new_req_ids() + # Remove finished requests from the cached states. for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) @@ -294,6 +312,13 @@ def _update_states(self, scheduler_output: "SchedulerOutput"): reqs_to_add.append(req_state) continue + # Since this is the first time the request has been scheduled, + # num_computed_tokens > 0 means that we have a hit in prefix + # caching; mark it so that we can manage the hidden states + # later on as needed. 
+ if self.omni_prefix_cache is not None and new_req_data.num_computed_tokens > 0: + self.omni_prefix_cache.add_prefix_cached_new_req_id(req_id) + sampling_params = new_req_data.sampling_params pooling_params = new_req_data.pooling_params @@ -1010,6 +1035,8 @@ def _process_additional_information_updates( multimodal_outputs: object, num_scheduled_tokens_np: np.ndarray, scheduler_output: "SchedulerOutput", + combined_hidden_states: dict[str, torch.Tensor] | None = None, + combined_multimodal_outputs: dict[str, object] | None = None, ) -> None: """Process model-provided per-request updates and merge into model_intermediate_buffer.""" try: @@ -1018,21 +1045,31 @@ def _process_additional_information_updates( if hasattr(self.model, "has_postprocess") and self.model.has_postprocess: for req_index, req_id in enumerate(self.input_batch.req_ids): req_infos = self.model_intermediate_buffer.get(req_id, {}) - start_offset = int(self.query_start_loc.cpu[req_index]) - sched_tokens = int(num_scheduled_tokens_np[req_index]) - s, e = start_offset, start_offset + sched_tokens - # only consider to store data into update dict. - hidden_states_slice = hidden_states[s:e] + if combined_hidden_states: + # Combined hidden states contains all hidden states for every request + hidden_states_slice = combined_hidden_states[req_id] + else: + start_offset = int(self.query_start_loc.cpu[req_index]) + sched_tokens = int(num_scheduled_tokens_np[req_index]) + s, e = start_offset, start_offset + sched_tokens + # only consider to store data into update dict. + hidden_states_slice = hidden_states[s:e] + + if combined_multimodal_outputs: + # NOTE this is a bit ugly, but the mm data is structured as a list of + # keys mapping to request IDs, and if enabled, we will always have all + # request IDs in every subdict, including for cache misses. 
+ mm_out = {k: v[req_id] for k, v in combined_multimodal_outputs.items()} + else: + mm_out = multimodal_outputs update_dict = self.model.postprocess( - hidden_states_slice, multimodal_outputs=multimodal_outputs, **req_infos + hidden_states_slice, + multimodal_outputs=mm_out, + **req_infos, ) self._update_intermediate_buffer(req_id, update_dict) except Exception as e: - logger.error( - f"Error merging for requests:{self.input_batch.req_ids} " - f"additional information update: {e}, with the multimodal_outputs " - f"as {multimodal_outputs}" - ) + logger.error(f"Error merging for requests:{self.input_batch.req_ids} additional information update: {e}") import traceback traceback.print_exc() From e9581137e9d887c0876885d1c4a74ea7d63ba2eb Mon Sep 17 00:00:00 2001 From: Didan Deng <33117903+wtomin@users.noreply.github.com> Date: Thu, 16 Apr 2026 01:45:16 +0800 Subject: [PATCH 57/76] [Perf] Add Performance Test for Qwen-Image Step-Level Execution (#2707) Signed-off-by: Didan Deng <33117903+wtomin@users.noreply.github.com> --- .../perf/tests/test_qwen_image_vllm_omni.json | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json index 1f3a2bbf77e..5ec7f1cc2b6 100644 --- a/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +++ b/tests/dfx/perf/tests/test_qwen_image_vllm_omni.json @@ -44,6 +44,52 @@ } ] }, + { + "test_name": "test_qwen_image_single_device_step_execution", + "description": "Single-device baseline (no parallelism) with step execution", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image", + "serve_args": { + "enable-diffusion-pipeline-profiler": true, + "step-execution": true + } + }, + "benchmark_params": [ + { + "name": "512x512_steps20", + "dataset": "random", + "task": "t2i", + "width": 512, + "height": 512, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, 
+ "baseline": { + "throughput_qps": 0.30, + "latency_mean": 3.50, + "peak_memory_mb_mean": 67000 + } + }, + { + "name": "1536x1536_steps35", + "dataset": "random", + "task": "t2i", + "width": 1536, + "height": 1536, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.037, + "latency_mean": 27.0, + "peak_memory_mb_mean": 74000 + } + } + ] + }, { "test_name": "test_qwen_image_ulysses2_cfg2_vae_patch4", "description": "Ulysses SP=2 + CFG-parallel=2 + VAE Patch Parallel=4", From 880a758b1f4b8be49618affbe4a735352f070993 Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Thu, 16 Apr 2026 10:15:39 +0800 Subject: [PATCH 58/76] [CI] Skip test_thinker_prefix_caching in tests/e2e/online_serving/test_qwen3_omni.py (#2836) Signed-off-by: wangyu <410167048@qq.com> --- tests/e2e/online_serving/test_qwen3_omni.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py index c05f8f50674..13af2ad1109 100644 --- a/tests/e2e/online_serving/test_qwen3_omni.py +++ b/tests/e2e/online_serving/test_qwen3_omni.py @@ -183,6 +183,7 @@ def test_text_to_text_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", prefix_test_params, indirect=True) +@pytest.mark.skip(reason="issue: #2833") def test_thinker_prefix_caching(omni_server, openai_client) -> None: """ Test thinker prefix caching by sending identical requests with an image (i.e., From cde22f8ebe9235a2cf59fd70d7a1e3efac2c14ee Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Thu, 16 Apr 2026 11:11:49 +0800 Subject: [PATCH 59/76] refactor: make stagepoll more clean Signed-off-by: ZhengWG --- vllm_omni/engine/async_omni_engine.py | 11 +- vllm_omni/engine/orchestrator.py | 380 +++++++++++--------------- 
vllm_omni/engine/stage_pool.py | 20 +- 3 files changed, 166 insertions(+), 245 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 114f75ce025..221baf7e238 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -81,6 +81,7 @@ split_devices_for_replicas, terminate_alive_proc, ) +from vllm_omni.engine.stage_pool import build_stage_pools from vllm_omni.entrypoints.utils import load_and_resolve_stage_configs from vllm_omni.inputs.preprocess import OmniInputPreprocessor from vllm_omni.platforms import current_omni_platform @@ -1071,11 +1072,13 @@ async def _run_orchestrator() -> None: request_async_queue=self.request_queue.async_q, output_async_queue=self.output_queue.async_q, rpc_async_queue=self.rpc_output_queue.async_q, + stage_pools=build_stage_pools( + self.stage_clients, + self.output_processors, + self.stage_vllm_configs, + self.logical_stage_to_clients, + ), async_chunk=self.async_chunk, - stage_clients=self.stage_clients, - output_processors=self.output_processors, - stage_vllm_configs=self.stage_vllm_configs, - logical_stage_to_clients=self.logical_stage_to_clients, ) if not startup_future.done(): startup_future.set_result(asyncio.get_running_loop()) diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index afef31e734b..88e8572f852 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -28,7 +28,7 @@ OmniEngineCoreRequest, ) from vllm_omni.engine.serialization import serialize_additional_information -from vllm_omni.engine.stage_pool import StagePool, StageReplica, build_stage_pools +from vllm_omni.engine.stage_pool import StagePool, StageReplica from vllm_omni.metrics.stats import StageRequestStats as StageRequestMetrics from vllm_omni.metrics.stats import StageStats from vllm_omni.metrics.utils import count_tokens_from_outputs @@ -93,6 +93,15 @@ def build_engine_core_request_from_tokens( # 
============================================================ +@dataclass +class _ReplicaMetrics: + """Per-replica metrics accumulators owned by Orchestrator (not StagePool).""" + + batch_seq: int = 0 + agg_total_tokens: int = 0 + agg_total_gen_time_ms: float = 0.0 + + @dataclass class OrchestratorRequestState: """Per-request bookkeeping inside the Orchestrator.""" @@ -105,10 +114,12 @@ class OrchestratorRequestState: # Metrics: timestamp when request was submitted to each stage stage_submit_ts: dict[int, float] = field(default_factory=dict) - # Multi-replica: maps logical_stage_id -> client_index chosen for this + # Multi-replica: maps logical_stage_id -> StageReplica chosen for this # request. Ensures the same request always hits the same replica within - # a given logical stage (KV / intermediate-state affinity). - chosen_client_index: dict[int, int] = field(default_factory=dict) + # a given logical stage (KV / intermediate-state affinity + processor + # alignment). Stored as an object handle, not an int, so pool internals + # (flat index / replica index) stay encapsulated. + chosen_replica: dict[int, StageReplica] = field(default_factory=dict) class Orchestrator: @@ -123,54 +134,19 @@ def __init__( request_async_queue: janus.AsyncQueue[dict[str, Any]], output_async_queue: janus.AsyncQueue[dict[str, Any]], rpc_async_queue: janus.AsyncQueue[dict[str, Any]], - stage_clients: list[Any], - output_processors: list[Any], - stage_vllm_configs: list[Any], + stage_pools: list[StagePool], *, async_chunk: bool = False, - logical_stage_to_clients: list[list[int]] | None = None, ) -> None: self.request_async_queue = request_async_queue self.output_async_queue = output_async_queue self.rpc_async_queue = rpc_async_queue - self.num_clients = len(stage_clients) self.async_chunk = bool(async_chunk) - # Flat-list view: retained as a compatibility layer so existing call - # sites that index by flat client_index (metrics, shutdown, collective - # RPC fan-out, etc.) keep working. 
StagePool below is the canonical - # path for replica selection and should be preferred in new code. - # TODO(stage-pool): migrate remaining flat-list readers onto - # self.stage_pools and drop these attributes. - self.stage_clients: list[Any] = stage_clients - self.output_processors: list[Any] = output_processors - self.stage_vllm_configs: list[Any] = stage_vllm_configs - - # Multi-replica mapping: logical_stage_id -> list of client indices. - # When not provided (single-replica), default to identity mapping. - if logical_stage_to_clients is not None: - self.logical_stage_to_clients = logical_stage_to_clients - else: - self.logical_stage_to_clients = [[i] for i in range(self.num_clients)] - self.num_logical_stages = len(self.logical_stage_to_clients) - - # Canonical per-logical-stage replica container. - self.stage_pools: list[StagePool] = build_stage_pools( - stage_clients, - output_processors, - stage_vllm_configs, - self.logical_stage_to_clients, - ) + self.num_logical_stages = len(stage_pools) - # Reverse mappings: client_index -> (logical_stage_id, replica_index). - # Kept for metrics/shutdown log lines that index by flat client_index. - self._client_to_logical: list[int] = [0] * self.num_clients - self._client_to_replica: list[int] = [0] * self.num_clients - for logical_id, client_indices in enumerate(self.logical_stage_to_clients): - for ri, ci in enumerate(client_indices): - self._client_to_logical[ci] = logical_id - self._client_to_replica[ci] = ri + self.stage_pools: list[StagePool] = stage_pools # Backward compat: num_stages now means num_logical_stages self.num_stages = self.num_logical_stages @@ -185,33 +161,15 @@ def __init__( self._companion_done: dict[str, set[str]] = {} self._deferred_parents: dict[str, dict[str, Any]] = {} - # Per-client metrics accumulators. 
- self._batch_seq: list[int] = [0] * self.num_clients - self._agg_total_tokens: list[int] = [0] * self.num_clients - self._agg_total_gen_time_ms: list[float] = [0.0] * self.num_clients + # Per-replica metrics accumulators (keyed by StageReplica identity). + self._replica_metrics: dict[StageReplica, _ReplicaMetrics] = { + replica: _ReplicaMetrics() for pool in self.stage_pools for replica in pool.replicas + } # Shutdown coordination self._shutdown_event = asyncio.Event() self._stages_shutdown = False - def _choose_client_index( - self, - logical_stage_id: int, - req_state: OrchestratorRequestState, - ) -> int: - """Pick a flat client_index for *logical_stage_id* via the stage pool. - - Thin wrapper that delegates to ``StagePool.select_replica`` so the - flat-index-based call sites keep working. New code should call the - pool directly when the StageReplica object itself is useful. - """ - replica = self.stage_pools[logical_stage_id].select_replica(req_state) - return replica.flat_index - - def _resolve_client_index(self, stage_id: int, replica_index: int = 0) -> int: - """Resolve (stage_id, replica_index) to a flat client index.""" - return self.logical_stage_to_clients[stage_id][replica_index] - async def run(self) -> None: """Main entry point for the Orchestrator event loop.""" logger.info("[Orchestrator] Starting event loop") @@ -295,25 +253,20 @@ async def _orchestration_loop(self) -> None: Control flow: poll raw → process through output processor → route. - Multi-replica: iterates over every (stage_id, replica_index) pair, - resolves to a flat client_index internally for resource access. + Iterates every replica across all logical stages via stage_pools. 
""" while not self._shutdown_event.is_set(): idle = True - for stage_id in range(self.num_logical_stages): - for replica_index in range(len(self.logical_stage_to_clients[stage_id])): + for pool in self.stage_pools: + for replica in pool.replicas: if self._shutdown_event.is_set(): return - client_index = self._resolve_client_index(stage_id, replica_index) + stage_id = replica.logical_stage_id # 1) Diffusion stage: poll non-blocking queue - # TODO (Peiqi): the output of diffusion stage is OmniRequestOutput, - # which is different from EngineCoreOutputs (LLM stages). We may want to unify - # the output format in the future to simplify the processing logic in Orchestrator. - stage_client = self.stage_clients[client_index] - if stage_client.stage_type == "diffusion": - output = stage_client.get_diffusion_output_nowait() + if replica.client.stage_type == "diffusion": + output = replica.client.get_diffusion_output_nowait() if output is not None: idle = False req_state = self.request_states.get(output.request_id) @@ -336,25 +289,23 @@ async def _orchestration_loop(self) -> None: continue stage_metrics = self._build_stage_metrics( - stage_id, + replica, output.request_id, [output], req_state, - replica_index=replica_index, ) await self._route_output( - stage_id, + replica, output, req_state, stage_metrics, - replica_index=replica_index, ) continue # 1) Poll raw outputs from the stage replica try: raw_outputs = await asyncio.wait_for( - self._poll_stage_raw(stage_id, replica_index=replica_index), + self._poll_stage_raw(replica), timeout=0.001, ) except asyncio.TimeoutError: @@ -367,7 +318,7 @@ async def _orchestration_loop(self) -> None: logger.exception( "[Orchestrator] _poll_stage_raw failed for stage-%s replica-%s", stage_id, - replica_index, + replica.replica_index, ) raise @@ -377,16 +328,14 @@ async def _orchestration_loop(self) -> None: # Handle prefill-finished KV-ready signals before finished outputs. 
await self._handle_kv_ready_raw_outputs( - stage_id, + replica, raw_outputs, - replica_index=replica_index, ) # 2) Process raw outputs through the output processor request_outputs = await self._process_stage_outputs( - stage_id, + replica, raw_outputs, - replica_index=replica_index, ) # 3) Route each processed output @@ -398,25 +347,23 @@ async def _orchestration_loop(self) -> None: "at stage-%s replica-%s (known reqs: %s)", output.request_id, stage_id, - replica_index, + replica.replica_index, list(self.request_states.keys()), ) continue stage_metrics = None if output.finished: stage_metrics = self._build_stage_metrics( - stage_id, + replica, output.request_id, [output], req_state, - replica_index=replica_index, ) await self._route_output( - stage_id, + replica, output, req_state, stage_metrics, - replica_index=replica_index, ) if idle: @@ -426,24 +373,16 @@ async def _orchestration_loop(self) -> None: async def _route_output( self, - stage_id: int, + replica: StageReplica, output: Any, req_state: OrchestratorRequestState, stage_metrics: Any, - *, - replica_index: int = 0, ) -> None: - """Route a processed output: send to main thread and/or forward to next stage. - - Args: - stage_id: Logical stage id. - replica_index: Replica index within the logical stage. - """ - client_index = self._resolve_client_index(stage_id, replica_index) + """Route a processed output: send to main thread and/or forward to next stage.""" + stage_id = replica.logical_stage_id req_id = output.request_id finished = output.finished submit_ts = req_state.stage_submit_ts.get(stage_id) - stage_client = self.stage_clients[client_index] # CFG companion handling: companions don't produce user-visible output # and don't forward to the next stage directly. 
@@ -452,7 +391,7 @@ async def _route_output( self.request_states.pop(req_id, None) return - if stage_client.final_output: + if replica.client.final_output: await self.output_async_queue.put( { "type": "output", @@ -485,15 +424,14 @@ async def _route_output( self._deferred_parents[req_id] = { "stage_id": stage_id, "output": output, - "replica_index": replica_index, + "replica": replica, } else: await self._forward_to_next_stage( req_id, - stage_id, + replica, output, req_state, - replica_index=replica_index, ) if finished and stage_id == req_state.final_stage_id: @@ -535,22 +473,20 @@ async def _handle_cfg_companion_ready(self, req_id: str) -> None: if parent_state is not None and not self._next_stage_already_submitted(deferred["stage_id"], parent_state): await self._forward_to_next_stage( parent_id, - deferred["stage_id"], + deferred["replica"], deferred["output"], parent_state, - replica_index=deferred.get("replica_index", 0), ) async def _handle_kv_ready_raw_outputs( self, - stage_id: int, + replica: StageReplica, raw_outputs: EngineCoreOutputs, - *, - replica_index: int = 0, ) -> None: """Forward split requests once stage-0 KV is ready, not only when decode fully finishes.""" if self.async_chunk: return + stage_id = replica.logical_stage_id for raw_output in raw_outputs.outputs: kv_params = getattr(raw_output, "kv_transfer_params", None) if not (isinstance(kv_params, dict) and kv_params.get("kv_ready")): @@ -570,32 +506,29 @@ async def _handle_kv_ready_raw_outputs( self._deferred_parents[req_id] = { "stage_id": stage_id, "output": raw_output, - "replica_index": replica_index, + "replica": replica, } else: await self._forward_to_next_stage( req_id, - stage_id, + replica, raw_output, req_state, - replica_index=replica_index, ) def _build_stage_metrics( self, - stage_id: int, + replica: StageReplica, req_id: str, request_outputs: list[RequestOutput], req_state: OrchestratorRequestState, - *, - replica_index: int = 0, ) -> StageRequestMetrics: """Build 
StageRequestMetrics for a finished request at a stage replica. Reuses StageRequestMetrics so OrchestratorMetrics and downstream metric handlers can consume a stable schema. """ - client_index = self._resolve_client_index(stage_id, replica_index) + stage_id = replica.logical_stage_id now = _time.time() submit_ts = req_state.stage_submit_ts.get(stage_id, now) stage_gen_time_ms = (now - submit_ts) * 1000.0 @@ -608,13 +541,14 @@ def _build_stage_metrics( if ptids is not None: num_tokens_in += len(ptids) - # Monotonic batch counter per client. - self._batch_seq[client_index] += 1 - batch_id = self._batch_seq[client_index] + # Monotonic batch counter per replica. + metrics = self._replica_metrics[replica] + metrics.batch_seq += 1 + batch_id = metrics.batch_seq # Accumulate for running-average stage_stats - self._agg_total_tokens[client_index] += num_tokens_out - self._agg_total_gen_time_ms[client_index] += stage_gen_time_ms + metrics.agg_total_tokens += num_tokens_out + metrics.agg_total_gen_time_ms += stage_gen_time_ms return StageRequestMetrics( num_tokens_in=num_tokens_in, @@ -626,8 +560,8 @@ def _build_stage_metrics( rx_transfer_bytes=0, rx_in_flight_time_ms=0.0, stage_stats=StageStats( - total_token=self._agg_total_tokens[client_index], - total_gen_time_ms=self._agg_total_gen_time_ms[client_index], + total_token=metrics.agg_total_tokens, + total_gen_time_ms=metrics.agg_total_gen_time_ms, ), ) @@ -638,7 +572,7 @@ def _build_kv_sender_info(self, sender_stage_ids: list[int]) -> dict[int, dict[s if sender_stage_id < 0 or sender_stage_id >= self.num_stages: continue - sender_stage = self.stage_clients[sender_stage_id] + sender_stage = self.stage_pools[sender_stage_id].replicas[0].client get_sender_info = getattr(sender_stage, "get_kv_sender_info", None) if not callable(get_sender_info): continue @@ -655,37 +589,41 @@ def _build_kv_sender_info(self, sender_stage_ids: list[int]) -> dict[int, dict[s return sender_infos or None + def _stage_client_list_for_legacy(self) -> 
list[Any]: + """First-replica client per logical stage. + + Legacy helper for ``process_engine_inputs`` and + ``custom_process_input_func`` which expect a flat list indexed by + logical stage id. Will be removed once those interfaces are + refactored to accept StagePool directly. + """ + return [pool.replicas[0].client for pool in self.stage_pools] + async def _forward_to_next_stage( self, req_id: str, - stage_id: int, + replica: StageReplica, output: Any, req_state: OrchestratorRequestState, - *, - replica_index: int = 0, ) -> None: """Forward output from current stage to the next stage. Handles the full pipeline: set outputs on current stage, compute next-stage inputs, build lightweight requests, and submit them. - - Args: - stage_id: Logical stage id that produced the output. - replica_index: Replica index of the stage that produced the output. """ - client_index = self._resolve_client_index(stage_id, replica_index) - + stage_id = replica.logical_stage_id next_logical = stage_id + 1 - next_ci = self._choose_client_index(next_logical, req_state) - next_client = self.stage_clients[next_ci] + next_pool = self.stage_pools[next_logical] + next_replica = next_pool.select_replica(req_state) params = req_state.sampling_params_list[next_logical] - if next_client.stage_type == "diffusion": - self.stage_clients[client_index].set_engine_outputs([output]) - if next_client.custom_process_input_func is not None: - diffusion_prompt = next_client.custom_process_input_func( - self.stage_clients, - next_client.engine_input_source, + if next_replica.client.stage_type == "diffusion": + replica.client.set_engine_outputs([output]) + if next_replica.client.custom_process_input_func is not None: + stage_list = self._stage_client_list_for_legacy() + diffusion_prompt = next_replica.client.custom_process_input_func( + stage_list, + next_replica.client.engine_input_source, req_state.prompt, False, ) @@ -709,17 +647,17 @@ async def _forward_to_next_stage( req_id, ) - source_stage_ids = 
list(getattr(next_client, "engine_input_source", None) or [stage_id]) + source_stage_ids = list(getattr(next_replica.client, "engine_input_source", None) or [stage_id]) kv_sender_info = self._build_kv_sender_info(sender_stage_ids=source_stage_ids) if isinstance(diffusion_prompt, list): - await next_client.add_batch_request_async( + await next_replica.client.add_batch_request_async( req_id, diffusion_prompt, params, kv_sender_info=kv_sender_info, ) else: - await next_client.add_request_async( + await next_replica.client.add_request_async( req_id, diffusion_prompt, params, @@ -729,21 +667,22 @@ async def _forward_to_next_stage( return # Set outputs on the client that actually produced them - self.stage_clients[client_index].set_engine_outputs([output]) + replica.client.set_engine_outputs([output]) # Process inputs for next stage + stage_list = self._stage_client_list_for_legacy() try: - next_inputs = next_client.process_engine_inputs( - stage_list=self.stage_clients, + next_inputs = next_replica.client.process_engine_inputs( + stage_list=stage_list, prompt=req_state.prompt, - source_client=self.stage_clients[client_index], + source_client=replica.client, ) except Exception: logger.exception( "[Orchestrator] req=%s process_engine_inputs FAILED for stage-%s replica-%s", req_id, next_logical, - self._client_to_replica[next_ci], + next_replica.replica_index, ) raise @@ -753,13 +692,13 @@ async def _forward_to_next_stage( request_id=req_id, prompt=next_input, params=params, - model_config=self.stage_vllm_configs[next_ci].model_config, + model_config=next_replica.vllm_config.model_config, ) # TODO: Here we directly use the req id to assign. 
request.external_req_id = request.request_id - self.output_processors[next_ci].add_request( + next_replica.output_processor.add_request( request=request, prompt=None, parent_req=None, @@ -767,41 +706,31 @@ async def _forward_to_next_stage( queue=None, ) - await next_client.add_request_async(request) + await next_replica.client.add_request_async(request) # Record submit timestamp for the next logical stage req_state.stage_submit_ts[next_logical] = _time.time() async def _poll_stage_raw( self, - stage_id: int, - *, - replica_index: int = 0, + replica: StageReplica, ) -> EngineCoreOutputs | None: - """Pull raw EngineCoreOutputs from a stage replica without processing. - - Returns the raw outputs object, or None when there is nothing - to consume. - """ - client_index = self._resolve_client_index(stage_id, replica_index) - outputs = await self.stage_clients[client_index].get_output_async() + """Pull raw EngineCoreOutputs from a stage replica without processing.""" + outputs = await replica.client.get_output_async() if not outputs.outputs: return None return outputs async def _process_stage_outputs( self, - stage_id: int, + replica: StageReplica, raw_outputs: EngineCoreOutputs, - *, - replica_index: int = 0, ) -> list[RequestOutput]: """Run the output processor on raw outputs, returning RequestOutputs. Also handles abort forwarding and scheduler stats updates. 
""" - client_index = self._resolve_client_index(stage_id, replica_index) - processor = self.output_processors[client_index] + processor = replica.output_processor processed = processor.process_outputs( raw_outputs.outputs, @@ -810,7 +739,7 @@ async def _process_stage_outputs( ) if processed.reqs_to_abort: - await self.stage_clients[client_index].abort_requests_async(processed.reqs_to_abort) + await replica.client.abort_requests_async(processed.reqs_to_abort) if raw_outputs.scheduler_stats is not None: processor.update_scheduler_stats(raw_outputs.scheduler_stats) @@ -914,12 +843,15 @@ async def _handle_streaming_update(self, msg: dict[str, Any]) -> None: req_state.sampling_params_list = msg["sampling_params_list"] req_state.stage_submit_ts[stage_id] = _time.time() - stage_client = self.stage_clients[stage_id] - if stage_client.stage_type == "diffusion": + # Streaming updates re-use the already-chosen replica from initial submit. + replica = req_state.chosen_replica.get(stage_id) + if replica is None: + replica = self.stage_pools[stage_id].select_replica(req_state) + if replica.client.stage_type == "diffusion": params = req_state.sampling_params_list[stage_id] - await stage_client.add_request_async(request_id, request, params) + await replica.client.add_request_async(request_id, request, params) else: - await stage_client.add_request_async(request) + await replica.client.add_request_async(request) async def _prewarm_async_chunk_stages( self, @@ -933,8 +865,8 @@ async def _prewarm_async_chunk_stages( so downstream stages should be armed once at request start instead of waiting for stage-finished forwarding. - Multi-replica: uses _choose_client_index so the prewarm targets align - with the orchestration-face chosen replicas. + Multi-replica: uses stage_pools[].select_replica so the prewarm + targets align with the orchestration-face chosen replicas. 
""" if req_state.final_stage_id <= 0: return @@ -962,14 +894,13 @@ async def _prewarm_async_chunk_stages( base_input["mm_processor_kwargs"] = None for next_logical in range(1, req_state.final_stage_id + 1): - next_ci = self._choose_client_index(next_logical, req_state) - next_client = self.stage_clients[next_ci] + next_replica = self.stage_pools[next_logical].select_replica(req_state) params = req_state.sampling_params_list[next_logical] - if next_client.stage_type == "diffusion": - source_stage_ids = list(getattr(next_client, "engine_input_source", None) or [next_logical - 1]) + if next_replica.client.stage_type == "diffusion": + source_stage_ids = list(getattr(next_replica.client, "engine_input_source", None) or [next_logical - 1]) kv_sender_info = self._build_kv_sender_info(sender_stage_ids=source_stage_ids) - await next_client.add_request_async( + await next_replica.client.add_request_async( request_id, req_state.prompt, params, @@ -982,18 +913,18 @@ async def _prewarm_async_chunk_stages( request_id=request_id, prompt=base_input, params=params, - model_config=self.stage_vllm_configs[next_ci].model_config, + model_config=next_replica.vllm_config.model_config, ) request.external_req_id = request.request_id - self.output_processors[next_ci].add_request( + next_replica.output_processor.add_request( request=request, prompt=None, parent_req=None, request_index=0, queue=None, ) - await next_client.add_request_async(request) + await next_replica.client.add_request_async(request) req_state.stage_submit_ts[next_logical] = _time.time() async def _handle_add_companion(self, msg: dict[str, Any]) -> None: @@ -1030,9 +961,7 @@ async def _handle_add_companion(self, msg: dict[str, Any]) -> None: parent_state = self.request_states.get(parent_id) parent_replica: StageReplica | None = None if parent_state is not None: - parent_flat = parent_state.chosen_client_index.get(0) - if parent_flat is not None: - parent_replica = stage0_pool.get_replica_by_flat_index(parent_flat) + 
parent_replica = parent_state.chosen_replica.get(0) companion_state.stage_submit_ts[0] = _time.time() @@ -1086,8 +1015,9 @@ async def _handle_abort(self, msg: dict[str, Any]) -> None: self._deferred_parents.pop(req_id, None) all_ids_to_abort = list(request_ids) + companion_ids_to_abort - for ci in range(self.num_clients): - await self.stage_clients[ci].abort_requests_async(all_ids_to_abort) + for pool in self.stage_pools: + for replica in pool.replicas: + await replica.client.abort_requests_async(all_ids_to_abort) for req_id in request_ids: self.request_states.pop(req_id, None) logger.info("[Orchestrator] Aborted request(s) %s", request_ids) @@ -1105,34 +1035,25 @@ async def _handle_collective_rpc(self, msg: dict[str, Any]) -> None: args = tuple(msg.get("args", ())) kwargs = dict(msg.get("kwargs") or {}) requested_stage_ids = msg.get("stage_ids") - # When stage_ids are provided they refer to logical stages; expand - # to all client indices belonging to those logical stages. + + # Collect the replicas to target. 
+ target_replicas: list[StageReplica] = [] if requested_stage_ids is None: - stage_ids = list(range(self.num_clients)) + for pool in self.stage_pools: + target_replicas.extend(pool.replicas) else: - stage_ids = [] for lid in requested_stage_ids: if 0 <= lid < self.num_logical_stages: - stage_ids.extend(self.logical_stage_to_clients[lid]) - else: - stage_ids.append(lid) # keep invalid id for error reporting + target_replicas.extend(self.stage_pools[lid].replicas) + # else: silently skip invalid stage ids results: list[Any] = [] - for stage_id in stage_ids: - if stage_id < 0 or stage_id >= self.num_clients: - results.append( - { - "supported": False, - "todo": True, - "error": f"Invalid client index {stage_id}", - } - ) - continue - - stage_client = self.stage_clients[stage_id] + stage_ids: list[int] = [] + for replica in target_replicas: + stage_ids.append(replica.logical_stage_id) try: - if hasattr(stage_client, "collective_rpc_async"): - stage_result = await stage_client.collective_rpc_async( + if hasattr(replica.client, "collective_rpc_async"): + stage_result = await replica.client.collective_rpc_async( method=method, timeout=timeout, args=args, @@ -1142,12 +1063,13 @@ async def _handle_collective_rpc(self, msg: dict[str, Any]) -> None: stage_result = { "supported": False, "todo": True, - "reason": (f"{stage_client.__class__.__name__}.collective_rpc_async is not implemented yet"), + "reason": (f"{replica.client.__class__.__name__}.collective_rpc_async is not implemented yet"), } except Exception as exc: logger.exception( - "[Orchestrator] collective_rpc failed: stage=%s method=%s", - stage_id, + "[Orchestrator] collective_rpc failed: stage=%s replica=%s method=%s", + replica.logical_stage_id, + replica.replica_index, method, ) stage_result = { @@ -1173,19 +1095,21 @@ def _shutdown_stages(self) -> None: return self._stages_shutdown = True - logger.info("[Orchestrator] Shutting down all %d client(s)", self.num_clients) - for ci, stage_client in 
enumerate(self.stage_clients): - try: - stage_client.shutdown() - logger.info( - "[Orchestrator] Stage %d replica %d shut down", - self._client_to_logical[ci], - self._client_to_replica[ci], - ) - except Exception as e: - logger.warning( - "[Orchestrator] Failed to shutdown stage %d replica %d: %s", - self._client_to_logical[ci], - self._client_to_replica[ci], - e, - ) + total = sum(pool.num_replicas for pool in self.stage_pools) + logger.info("[Orchestrator] Shutting down all %d client(s)", total) + for pool in self.stage_pools: + for replica in pool.replicas: + try: + replica.client.shutdown() + logger.info( + "[Orchestrator] Stage %d replica %d shut down", + replica.logical_stage_id, + replica.replica_index, + ) + except Exception as e: + logger.warning( + "[Orchestrator] Failed to shutdown stage %d replica %d: %s", + replica.logical_stage_id, + replica.replica_index, + e, + ) diff --git a/vllm_omni/engine/stage_pool.py b/vllm_omni/engine/stage_pool.py index c55e5714948..a52cd29683b 100644 --- a/vllm_omni/engine/stage_pool.py +++ b/vllm_omni/engine/stage_pool.py @@ -15,18 +15,17 @@ from vllm_omni.engine.orchestrator import OrchestratorRequestState -@dataclass +@dataclass(eq=False) class StageReplica: """One replica of a logical stage. - flat_index is the index into Orchestrator's flat stage_clients list; it - is the value cached in OrchestratorRequestState.chosen_client_index so - existing call sites that resolve a flat client keep working unchanged. + ``eq=False`` keeps identity-based equality/hash so StageReplica instances + can be used as dict keys (Orchestrator caches them on req_state and in + per-replica metrics accumulators). 
""" logical_stage_id: int replica_index: int - flat_index: int client: Any output_processor: Any vllm_config: Any @@ -47,15 +46,11 @@ def __init__( self.stage_type = stage_type self.replicas: list[StageReplica] = replicas self._rr_cursor = 0 - self._by_flat_index: dict[int, StageReplica] = {r.flat_index: r for r in replicas} @property def num_replicas(self) -> int: return len(self.replicas) - def get_replica_by_flat_index(self, flat_index: int) -> StageReplica: - return self._by_flat_index[flat_index] - def select_replica( self, req_state: OrchestratorRequestState, @@ -70,9 +65,9 @@ def select_replica( inheriting its parent's replica at stage 0). 3. Round-robin across replicas. """ - cached = req_state.chosen_client_index.get(self.logical_stage_id) + cached = req_state.chosen_replica.get(self.logical_stage_id) if cached is not None: - return self._by_flat_index[cached] + return cached if affinity_from is not None: if affinity_from.logical_stage_id != self.logical_stage_id: @@ -87,7 +82,7 @@ def select_replica( chosen = self.replicas[self._rr_cursor % self.num_replicas] self._rr_cursor += 1 - req_state.chosen_client_index[self.logical_stage_id] = chosen.flat_index + req_state.chosen_replica[self.logical_stage_id] = chosen return chosen def admit( @@ -131,7 +126,6 @@ def build_stage_pools( StageReplica( logical_stage_id=logical_id, replica_index=ri, - flat_index=ci, client=stage_clients[ci], output_processor=output_processors[ci], vllm_config=stage_vllm_configs[ci], From c83f664fe17a372e0cfcf31b81b423ffee940e6b Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Thu, 16 Apr 2026 11:13:41 +0800 Subject: [PATCH 60/76] [CI][Perf] Add nightly PR labels, consolidate pipeline, and switch benchmark flag to --test-config-file (#2816) Signed-off-by: wangyu <410167048@qq.com> Co-authored-by: Y. 
Fisher Co-authored-by: inaniloquentee --- .buildkite/pipeline.yml | 12 +- .buildkite/test-nightly-diffusion.yml | 417 ----------------- .buildkite/test-nightly.yml | 432 ++++++++++++++++-- docs/contributing/ci/CI_5levels.md | 7 +- .../test_examples/l4_performance_tests.inc.md | 2 +- docs/contributing/ci/test_guide.md | 5 +- tests/dfx/conftest.py | 12 + tests/dfx/perf/scripts/run_benchmark.py | 49 +- .../perf/scripts/run_diffusion_benchmark.py | 25 +- .../tests/{test.json => test_qwen_omni.json} | 32 -- tests/dfx/perf/tests/test_tts.json | 34 ++ 11 files changed, 493 insertions(+), 534 deletions(-) delete mode 100644 .buildkite/test-nightly-diffusion.yml rename tests/dfx/perf/tests/{test.json => test_qwen_omni.json} (92%) create mode 100644 tests/dfx/perf/tests/test_tts.json diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index d9a2315953a..00823951dcc 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -44,11 +44,19 @@ steps: agents: queue: "cpu_queue_premerge" - # L4 Test — main+NIGHTLY=1 (scheduled), or PR with label nightly-test (e.g. add label then Rebuild) + # L4 Test — main+NIGHTLY=1 (scheduled), or PR with specific label (e.g. 
add label then Rebuild) - label: "Upload Nightly Pipeline" depends_on: image-build key: upload-nightly-pipeline - if: '(build.branch == "main" && build.env("NIGHTLY") == "1") || (build.branch != "main" && build.pull_request.labels includes "nightly-test")' + if: >- + (build.branch == "main" && build.env("NIGHTLY") == "1") || + (build.branch != "main" && ( + build.pull_request.labels includes "nightly-test" || + build.pull_request.labels includes "omni-test" || + build.pull_request.labels includes "tts-test" || + build.pull_request.labels includes "diffusion-x2iat-test" || + build.pull_request.labels includes "diffusion-x2v-test" + )) commands: - buildkite-agent pipeline upload .buildkite/test-nightly.yml agents: diff --git a/.buildkite/test-nightly-diffusion.yml b/.buildkite/test-nightly-diffusion.yml deleted file mode 100644 index b5ba8a117c6..00000000000 --- a/.buildkite/test-nightly-diffusion.yml +++ /dev/null @@ -1,417 +0,0 @@ -# Nightly diffusion GPU tests — appended to the main nightly build via -# buildkite-agent pipeline upload .buildkite/test-nightly-diffusion.yml -# from test-nightly.yml (step key: nightly-diffusion-model-test). Top-level groups are -# foldable in the Buildkite UI (Other / Wan / Qwen-Image). 
-env: - VLLM_WORKER_MULTIPROC_METHOD: spawn - HF_HUB_DOWNLOAD_TIMEOUT: 300 - HF_HUB_ETAG_TIMEOUT: 60 - -steps: - - group: ":card_index_dividers: Other Model Test" - key: nightly-other-model-test-group - steps: - - label: ":full_moon: Diffusion · Other · Function Test with H100" - timeout_in_minutes: 120 - # Shared nightly vs PR label conditional; referenced below as *nightly_or_pr_label - if: &nightly_or_pr_label 'build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"' - commands: - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not test_qwen_image" -m "advanced_model and diffusion and H100" --run-level "advanced_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion · Other · Function Test with L4" - timeout_in_minutes: 60 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model" - agents: - queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU - plugins: - - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - always-pull: true - propagate-environment: true - shm-size: "8gb" - environment: - - 
"HF_HOME=/fsx/hf_cache" - - "HF_TOKEN" - volumes: - - "/fsx/hf_cache:/fsx/hf_cache" - - - label: ":full_moon: Diffusion · Other · Doc Test" - timeout_in_minutes: 60 - if: *nightly_or_pr_label - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY="1" - - pytest -s -v tests/examples/online_serving/test_text_to_image.py tests/examples/offline_inference/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - group: ":card_index_dividers: Wan Series Model Test" - key: nightly-wan-model-test-group - steps: - - label: ":full_moon: Diffusion · Wan · Function Test" - timeout_in_minutes: 90 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py -m "advanced_model" --run-level "advanced_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: 
HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion · Wan · Accuracy Test" - key: nightly-wan22-i2v-accuracy - timeout_in_minutes: 180 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level advanced_model - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - group: ":card_index_dividers: Qwen-Image Series Model Test" - key: nightly-qwen-image-edit-group - steps: - - label: ":full_moon: Diffusion · Qwen-Image · Function Test with H100" - timeout_in_minutes: 120 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/online_serving/test_qwen_image*_expansion.py -m "advanced_model and diffusion and H100" --run-level "advanced_model" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 2 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - 
mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion · Qwen-Image · GEBench Accuracy Test" - key: nightly-gebench-accuracy - timeout_in_minutes: 60 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1 - - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 1 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion · Qwen-Image · GEdit-Bench Accuracy Test" - key: nightly-gedit-bench-accuracy - timeout_in_minutes: 60 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ 
--accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1 - - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv" - - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json" - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 1 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_HTTP_TIMEOUT_KEEP_ALIVE - value: "120" - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion · Qwen-Image · Perf Test" - key: nightly-qwen-image-performance - timeout_in_minutes: 180 - if: *nightly_or_pr_label - commands: - - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results - - export DIFFUSION_ATTENTION_BACKEND=FLASH_ATTN - - export CACHE_DIT_VERSION=1.3.0 - # [HACK]: run upload in the same command block as pytest. - # Because `exit` aborts the entire commands list. - - | - set +e - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json - EXIT1=$$? - pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json - EXIT2=$$? 
- pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json - EXIT3=$$? - if [ $$EXIT1 -eq 0 ] || [ $$EXIT2 -eq 0 ] || [ $$EXIT3 -eq 0 ]; then - buildkite-agent artifact upload "tests/dfx/perf/results/diffusion_result_*.json" - buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" - fi - exit $$((EXIT1 | EXIT2 | EXIT3)) - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 4 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate - - - label: ":full_moon: Diffusion · Qwen-Image · Accuracy Test" - key: nightly-qwen-image-accuracy - timeout_in_minutes: 180 - if: *nightly_or_pr_label - commands: - - pytest -s -v tests/e2e/accuracy/test_qwen_image*.py --run-level advanced_model - agents: - queue: "mithril-h100-pool" - plugins: - - kubernetes: - podSpec: - containers: - - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - resources: - limits: - nvidia.com/gpu: 1 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: HF_HOME - value: /root/.cache/huggingface - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - node.kubernetes.io/instance-type: gpu-h100-sxm - volumes: - - name: devshm - 
emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /mnt/hf-cache - type: DirectoryOrCreate diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 31b3e17976c..58e1e55af7f 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -7,12 +7,11 @@ steps: # Group: collapses under one heading in the Buildkite UI; child steps still run in parallel. - group: ":card_index_dividers: Omni Model Test" key: nightly-omni-test-group + depends_on: upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" || build.pull_request.labels includes "omni-test" steps: - - label: ":full_moon: Omni · Function Test with H100" + - label: ":full_moon: Omni · Function Test" timeout_in_minutes: 90 - depends_on: upload-nightly-pipeline - # Shared nightly vs PR label conditional; referenced below as *nightly_or_pr_label - if: &nightly_or_pr_label 'build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"' commands: - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and H100 and omni" --run-level "advanced_model" agents: @@ -49,13 +48,11 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - - label: ":full_moon: Omni · Function Test with L4" + - label: ":full_moon: Omni · Doc Test with L4" timeout_in_minutes: 90 - depends_on: upload-nightly-pipeline - if: *nightly_or_pr_label commands: - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model" + - pytest -s -v tests/examples/ -m "advanced_model and omni and L4" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: @@ -70,13 +67,203 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: ":full_moon: Omni · Doc Test with L4" + - label: ":full_moon: Omni · Doc Test with H100" + timeout_in_minutes: 90 + commands: + - 
pytest -s -v tests/examples/ -m "advanced_model and omni and H100" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Omni · Perf Test" + key: nightly-omni-performance + timeout_in_minutes: 180 + commands: + - export BENCHMARK_DIR=tests/dfx/perf/results + - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_omni.json + - buildkite-agent artifact upload "tests/dfx/perf/results/*.json" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + + - group: ":card_index_dividers: TTS Model Test" + key: nightly-tts-test-group + depends_on: 
upload-nightly-pipeline + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" || build.pull_request.labels includes "tts-test" + steps: + - label: ":full_moon: TTS · Function Test" timeout_in_minutes: 90 - depends_on: upload-nightly-pipeline - if: *nightly_or_pr_label commands: - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - pytest -s -v tests/examples/ -m "advanced_model and omni and L4" --run-level "advanced_model" + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model" + agents: + queue: "gpu_1_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU + plugins: + - docker#v5.2.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + always-pull: true + propagate-environment: true + shm-size: "8gb" + environment: + - "HF_HOME=/fsx/hf_cache" + - "HF_TOKEN" + volumes: + - "/fsx/hf_cache:/fsx/hf_cache" + + - label: ":full_moon: TTS · Perf Test" + key: nightly-tts-performance + timeout_in_minutes: 180 + commands: + - export BENCHMARK_DIR=tests/dfx/perf/results + - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" + - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py --test-config-file tests/dfx/perf/tests/test_tts.json + - buildkite-agent artifact upload "tests/dfx/perf/results/*.json" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: 
/mnt/hf-cache + type: DirectoryOrCreate + + # Diffusion X2I suite: x2i / x2a / x2t and related non-video paths; x2v is only in "Diffusion X2V Model Test" below. + - group: ":card_index_dividers: Diffusion X2I(&A&T) Model Test" + key: nightly-diffusion-x2iat-group + depends_on: upload-nightly-pipeline + if: >- + build.env("NIGHTLY") == "1" || + build.pull_request.labels includes "nightly-test" || + build.pull_request.labels includes "diffusion-x2iat-test" + steps: + - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100" + timeout_in_minutes: 120 + commands: + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "advanced_model and diffusion and H100" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4" + timeout_in_minutes: 60 + commands: + - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "advanced_model and diffusion and L4" --run-level "advanced_model" agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: @@ -91,12 +278,11 @@ steps: volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: 
":full_moon: Omni · Doc Test with H100" - timeout_in_minutes: 90 - depends_on: upload-nightly-pipeline - if: *nightly_or_pr_label + - label: ":full_moon: Diffusion X2I(&A&T) · Doc Test" + timeout_in_minutes: 60 commands: - - pytest -s -v tests/examples/ -m "advanced_model and omni and H100" --run-level "advanced_model" + - export VLLM_TEST_CLEAN_GPU_MEMORY="1" + - pytest -s -v tests/examples/*/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model" agents: queue: "mithril-h100-pool" plugins: @@ -131,16 +317,109 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - - label: ":full_moon: Omni · Perf Test" - key: nightly-omni-performance + - label: ":full_moon: Diffusion X2I(&A&T) · GEBench Accuracy Test" + timeout_in_minutes: 60 + commands: + - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1 + - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Diffusion X2I(&A&T) · GEdit-Bench Accuracy Test" + timeout_in_minutes: 60 + commands: + - pytest -s -v 
tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1 + - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv" + - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_HTTP_TIMEOUT_KEEP_ALIVE + value: "120" + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Diffusion X2I(&A&T) · Perf Test" + key: nightly-diffusion-x2iat-performance timeout_in_minutes: 180 - depends_on: upload-nightly-pipeline - if: *nightly_or_pr_label commands: - - export BENCHMARK_DIR=tests/dfx/perf/results - - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1" - - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py - - buildkite-agent artifact upload "tests/dfx/perf/results/*.json" + - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results + - export DIFFUSION_ATTENTION_BACKEND=FLASH_ATTN + - export CACHE_DIT_VERSION=1.3.0 + # [HACK]: run upload in the same command block as pytest. + # Because `exit` aborts the entire commands list. 
+ - | + set +e + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json + EXIT1=$$? + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json + EXIT2=$$? + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json + EXIT3=$$? + if [ $$EXIT1 -eq 0 ] || [ $$EXIT2 -eq 0 ] || [ $$EXIT3 -eq 0 ]; then + buildkite-agent artifact upload "tests/dfx/perf/results/diffusion_result_*.json" + buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" + fi + exit $$((EXIT1 | EXIT2 | EXIT3)) agents: queue: "mithril-h100-pool" plugins: @@ -150,7 +429,7 @@ steps: - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT resources: limits: - nvidia.com/gpu: 2 + nvidia.com/gpu: 4 volumeMounts: - name: devshm mountPath: /dev/shm @@ -175,23 +454,96 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - # Dynamically appends steps from test-nightly-diffusion.yml into this build (same mechanism as - # pipeline.yml → test-ready.yml / test-merge.yml / test-nightly.yml). Foldable groups stay in the - # uploaded YAML (Other / Wan / Qwen-Image). - - label: ":card_index_dividers: Diffusion Model Test" - key: nightly-diffusion-model-test + # Diffusion x2v only (Wan, HunyuanVideo, …). x2i/x2a/x2t live in the X2I group above, not here. 
+ - group: ":card_index_dividers: Diffusion X2V Model Test" + key: nightly-diffusion-x2v-group depends_on: upload-nightly-pipeline - if: *nightly_or_pr_label - commands: - - buildkite-agent pipeline upload .buildkite/test-nightly-diffusion.yml - agents: - queue: "cpu_queue_premerge" + if: >- + build.env("NIGHTLY") == "1" || + build.pull_request.labels includes "nightly-test" || + build.pull_request.labels includes "diffusion-x2v-test" + steps: + - label: ":full_moon: Diffusion X2V · Function Test" + timeout_in_minutes: 90 + commands: + - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py -m "advanced_model" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + + - label: ":full_moon: Diffusion X2V · Accuracy Test" + timeout_in_minutes: 180 + commands: + - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level advanced_model + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + 
mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate - label: ":bar_chart: Testcase Statistics" key: nightly-testcase-statistics timeout_in_minutes: 120 depends_on: upload-nightly-pipeline - if: *nightly_or_pr_label + if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" commands: - python tools/nightly/buildkite_testcase_statistics.py -o tests/dfx/perf/results/buildkite_testcase_statistics.html - buildkite-agent artifact upload "tests/dfx/perf/results/*.html" @@ -234,15 +586,17 @@ steps: key: nightly-perf-distribution depends_on: - nightly-omni-performance - - nightly-qwen-image-performance + - nightly-tts-performance + - nightly-diffusion-x2iat-performance - nightly-testcase-statistics if: build.env("NIGHTLY") == "1" commands: - pip install openpyxl - export DEFAULT_INPUT_DIR=tests/dfx/perf/results - export DEFAULT_OUTPUT_DIR=tests/dfx/perf/results + - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-tts-performance - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-omni-performance - - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-qwen-image-performance + - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-diffusion-x2iat-performance - buildkite-agent artifact download "tests/dfx/perf/results/*.html" . 
--step nightly-testcase-statistics - python tools/nightly/generate_nightly_perf_excel.py - python tools/nightly/generate_nightly_perf_html.py diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md index 93060357385..b0428ddd7de 100644 --- a/docs/contributing/ci/CI_5levels.md +++ b/docs/contributing/ci/CI_5levels.md @@ -86,7 +86,8 @@ Through five levels (L1-L5) and common (Common) specifications, the system clari /tests/e2e/online_serving/test_{model_name}_expansion.py
/tests/e2e/offline_inference/test_{model_name}_expansion.py
Performance:
- /tests/dfx/perf/tests/test.json
+ /tests/dfx/perf/tests/test_qwen_omni.json (Omni), test_tts.json (TTS),
+ and /tests/dfx/perf/tests/test_{diffusion_model}_vllm_omni.json (Diffusion)
Doc Test:
tests/example/online_serving/test_{model_name}.py
tests/example/offline_inference/test_{model_name}.py @@ -530,13 +531,13 @@ L4 level testing is a comprehensive quality audit before a version release. It e ### 3.2 Testing Content and Scope - ***Full Functionality Testing***: Executes all test cases defined in `test_{model_name}_expansion.py`, covering all implemented features, positive flows, boundary conditions, and exception handling. -- ***Performance Testing***: Uses the `tests/dfx/perf/tests/test.json` configuration file to drive performance testing tools for stress, load, and endurance tests, collecting metrics like throughput, response time, and resource utilization. +- ***Performance Testing***: Uses `tests/dfx/perf/tests/test_qwen_omni.json`, `tests/dfx/perf/tests/test_tts.json`, and diffusion configs in the form `tests/dfx/perf/tests/test_*_vllm_omni.json` (passed to `run_benchmark.py` via `--test-config-file`) to drive performance testing tools for stress, load, and endurance tests, collecting metrics like throughput, response time, and resource utilization. - ***Documentation Testing***: Verifies whether the example code provided to users is runnable and its results match the description. ### 3.3 Test Directory and Execution Files - ***Functional Testing***: Same directories as L3. -- ***Performance Test Configuration***: `tests/dfx/perf/tests/test.json` +- ***Performance Test Configuration***: `tests/dfx/perf/tests/test_qwen_omni.json`, `tests/dfx/perf/tests/test_tts.json`, and diffusion configs `tests/dfx/perf/tests/test_*_vllm_omni.json` (e.g. 
`test_qwen_image_vllm_omni.json`) - ***Documentation Example Tests***: - - `tests/example/online_serving/test_{model_name}.py` - `tests/example/offline_inference/test_{model_name}.py` diff --git a/docs/contributing/ci/test_examples/l4_performance_tests.inc.md b/docs/contributing/ci/test_examples/l4_performance_tests.inc.md index 8093e1459f5..f1f3073dc52 100644 --- a/docs/contributing/ci/test_examples/l4_performance_tests.inc.md +++ b/docs/contributing/ci/test_examples/l4_performance_tests.inc.md @@ -1,4 +1,4 @@ -When you want to add L4-level ***performance test*** cases, you can refer to the following format for case addition in tests/dfx/perf/tests/test.json: +When you want to add L4-level ***performance test*** cases, you can refer to the following format for case addition in `tests/dfx/perf/tests/test_qwen_omni.json`, `tests/dfx/perf/tests/test_tts.json`, or diffusion configs such as `tests/dfx/perf/tests/test_*_vllm_omni.json` (selected via `pytest ... run_benchmark.py --test-config-file `): ```JSON { diff --git a/docs/contributing/ci/test_guide.md b/docs/contributing/ci/test_guide.md index 425f24332c2..08b2e3b4ea9 100644 --- a/docs/contributing/ci/test_guide.md +++ b/docs/contributing/ci/test_guide.md @@ -45,7 +45,6 @@ Our test scripts use the pytest framework. First, please use `git clone https:// === "L3 level & L4 level" ```bash - cd tests pytest -s -v -m "advanced_model" --run-level=advanced_model ``` If you only want to run L3 test case, you can use: @@ -60,9 +59,9 @@ Our test scripts use the pytest framework. 
First, please use `git clone https:// ```bash pytest -s -v -m "core_model and distributed_cuda and L4" --run-level=core_model ``` - Note: To run performance tests, use: + Note: To run performance tests (defaults to ``test_qwen_omni.json``; use ``--test-config-file tests/dfx/perf/tests/test_tts.json`` for TTS): ```bash - pytest -s -v perf/scripts/run_benchmark.py + pytest -s -v tests/dfx/perf/scripts/run_benchmark.py ``` The latest L3 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-merge.yml). diff --git a/tests/dfx/conftest.py b/tests/dfx/conftest.py index e54141b3442..997f25e6e54 100644 --- a/tests/dfx/conftest.py +++ b/tests/dfx/conftest.py @@ -2,6 +2,8 @@ from pathlib import Path from typing import Any +import pytest + from tests.conftest import modify_stage_config @@ -95,3 +97,13 @@ def create_benchmark_indices( indices.append((test_name, idx)) return indices + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Register shared CLI options for DFX benchmark suites.""" + parser.addoption( + "--test-config-file", + action="store", + default=None, + help=("Path to benchmark config JSON. 
Example: --test-config-file tests/dfx/perf/tests/test_tts.json"), + ) diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py index 67dedcd0480..d5ef1b49e7e 100644 --- a/tests/dfx/perf/scripts/run_benchmark.py +++ b/tests/dfx/perf/scripts/run_benchmark.py @@ -21,10 +21,30 @@ os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" -CONFIG_FILE_PATH = str(Path(__file__).parent.parent / "tests" / "test.json") -BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) -STAGE_INIT_TIMEOUT = 600 +def _get_config_file_from_argv() -> str | None: + """Read ``--test-config-file`` from ``sys.argv`` at import time so parametrization can use it.""" + import sys + + for i, arg in enumerate(sys.argv): + if arg == "--test-config-file" and i + 1 < len(sys.argv): + return sys.argv[i + 1] + if arg.startswith("--test-config-file="): + return arg.split("=", 1)[1] + return None + + +_PERF_TESTS_DIR = Path(__file__).resolve().parent.parent / "tests" +_DEFAULT_CONFIG_FILE = str(_PERF_TESTS_DIR / "test_qwen_omni.json") + +CONFIG_FILE_PATH = _get_config_file_from_argv() +if CONFIG_FILE_PATH is None: + print( + "No --test-config-file in argv, using default: tests/dfx/perf/tests/test_qwen_omni.json " + "(override with e.g. 
--test-config-file tests/dfx/perf/tests/test_tts.json)" + ) + CONFIG_FILE_PATH = _DEFAULT_CONFIG_FILE +BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH) STAGE_CONFIGS_DIR = Path(__file__).parent.parent / "stage_configs" test_params = create_unique_server_params(BENCHMARK_CONFIGS, STAGE_CONFIGS_DIR) @@ -44,7 +64,7 @@ def omni_server(request): print(f"Starting OmniServer with test: {test_name}, model: {model}") - server_args = ["--stage-init-timeout", str(STAGE_INIT_TIMEOUT), "--init-timeout", "900"] + server_args = ["--stage-init-timeout", "300", "--init-timeout", "900"] if stage_config_path: server_args = ["--stage-configs-path", stage_config_path] + server_args with OmniServer(model, server_args) as server: @@ -97,8 +117,6 @@ def run_benchmark( ["vllm", "bench", "serve", "--omni"] + args + [ - "--num-warmups", - "2", "--save-result", "--result-dir", os.environ.get("BENCHMARK_DIR", "tests"), @@ -141,7 +159,6 @@ def run_benchmark( result["random_output_len"] = random_output_len with open(result_path, "w", encoding="utf-8") as f: json.dump(result, f, ensure_ascii=False, indent=2) - return result @@ -207,10 +224,6 @@ def _resolve_baseline_value( f"or request_rate={request_rate!r}; keys={list(baseline_raw.keys())!r}" ) if isinstance(baseline_raw, (list, tuple)): - if sweep_index is None: - raise ValueError("list baseline requires sweep_index") - if not (0 <= sweep_index < len(baseline_raw)): - raise IndexError(f"baseline list len={len(baseline_raw)} has no index {sweep_index}") return baseline_raw[sweep_index] return baseline_raw @@ -245,14 +258,14 @@ def assert_result( ) -> None: assert result["completed"] == num_prompt, "Request failures exist" baseline_data = params.get("baseline", {}) - thresholds = _baseline_thresholds_for_step( - baseline_data, - sweep_index=sweep_index, - max_concurrency=max_concurrency, - request_rate=request_rate, - ) - for metric_name, baseline_value in thresholds.items(): + for metric_name, baseline_raw in baseline_data.items(): 
current_value = result[metric_name] + baseline_value = _resolve_baseline_value( + baseline_raw, + sweep_index=sweep_index, + max_concurrency=max_concurrency, + request_rate=request_rate, + ) if "throughput" in metric_name: if current_value <= baseline_value: print( diff --git a/tests/dfx/perf/scripts/run_diffusion_benchmark.py b/tests/dfx/perf/scripts/run_diffusion_benchmark.py index 123f21405e8..8eeeec8df25 100644 --- a/tests/dfx/perf/scripts/run_diffusion_benchmark.py +++ b/tests/dfx/perf/scripts/run_diffusion_benchmark.py @@ -5,8 +5,8 @@ - vllm-omni (default): starts DiffusionServer via vllm_omni.entrypoints.cli.main, benchmarks with diffusion_benchmark_serving.py --backend vllm-omni -A config JSON file is REQUIRED via --config-file: - pytest run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json +A config JSON file is REQUIRED via --test-config-file: + pytest run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json JSON config entries use a "server_type" field, and this runner executes the vllm-omni path. @@ -55,16 +55,16 @@ def _get_config_file_from_argv() -> str | None: - """Read --config-file from sys.argv at import time so pytest parametrize can use it. + """Read --test-config-file from sys.argv at import time so pytest parametrize can use it. pytest_addoption (below) registers the same flag so pytest does not reject it. - Supports both ``--config-file path`` and ``--config-file=path`` forms. + Supports both ``--test-config-file path`` and ``--test-config-file=path`` forms. Returns None if the flag is not present; callers must handle the missing case. 
""" for i, arg in enumerate(sys.argv): - if arg == "--config-file" and i + 1 < len(sys.argv): + if arg == "--test-config-file" and i + 1 < len(sys.argv): return sys.argv[i + 1] - if arg.startswith("--config-file="): + if arg.startswith("--test-config-file="): return arg.split("=", 1)[1] return None @@ -133,19 +133,6 @@ def _append_to_aggregated_file(record: dict[str, Any]) -> None: json.dump(records, f, indent=2, ensure_ascii=False) -# Register --config-file with pytest so it does not reject the argument. -def pytest_addoption(parser: pytest.Parser) -> None: - parser.addoption( - "--config-file", - action="store", - default=None, - help=( - "Path to the benchmark config JSON file (required). " - "Example: --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json" - ), - ) - - _server_lock = threading.Lock() # --------------------------------------------------------------------------- diff --git a/tests/dfx/perf/tests/test.json b/tests/dfx/perf/tests/test_qwen_omni.json similarity index 92% rename from tests/dfx/perf/tests/test.json rename to tests/dfx/perf/tests/test_qwen_omni.json index 159e27a064b..4662f8c0c71 100644 --- a/tests/dfx/perf/tests/test.json +++ b/tests/dfx/perf/tests/test_qwen_omni.json @@ -329,37 +329,5 @@ } } ] - }, - { - "test_name": "test_qwen3_tts", - "server_params": { - "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" - }, - "benchmark_params": [ - { - "dataset_name": "random", - "backend": "openai-audio-speech", - "endpoint": "/v1/audio/speech", - "num_prompts": [ - 10, - 40 - ], - "max_concurrency": [ - 1, - 4 - ], - "random_input_len": 100, - "random_output_len": 100, - "extra_body": { - "voice": "Vivian", - "language": "English" - }, - "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", - "baseline": { - "mean_audio_ttfp_ms": [6000, 6000], - "mean_audio_rtf": [0.3, 0.3] - } - } - ] } ] diff --git a/tests/dfx/perf/tests/test_tts.json b/tests/dfx/perf/tests/test_tts.json new file mode 100644 index 
00000000000..3583b45b4f2 --- /dev/null +++ b/tests/dfx/perf/tests/test_tts.json @@ -0,0 +1,34 @@ +[ + { + "test_name": "test_qwen3_tts", + "server_params": { + "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" + }, + "benchmark_params": [ + { + "dataset_name": "random", + "backend": "openai-audio-speech", + "endpoint": "/v1/audio/speech", + "num_prompts": [ + 10, + 40 + ], + "max_concurrency": [ + 1, + 4 + ], + "random_input_len": 100, + "random_output_len": 100, + "extra_body": { + "voice": "Vivian", + "language": "English" + }, + "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration", + "baseline": { + "mean_audio_ttfp_ms": [6000, 6000], + "mean_audio_rtf": [0.3, 0.3] + } + } + ] + } +] From de5f8a23b2cc4c51bdfe9d59f9887965c146d5d8 Mon Sep 17 00:00:00 2001 From: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com> Date: Thu, 16 Apr 2026 11:26:48 +0800 Subject: [PATCH 61/76] [Doc][Misc] Update DreamID-Omni Example; Add DreamID-Omni post process function (#2809) Signed-off-by: yuanheng --- .../offline_inference/x_to_video_audio.md | 28 ++++++++++++-- .../x_to_video_audio/x_to_video_audio.md | 28 ++++++++++++-- .../x_to_video_audio/x_to_video_audio.py | 38 +++++++++++++++---- .../dreamid_omni/pipeline_dreamid_omni.py | 15 ++++++++ vllm_omni/diffusion/registry.py | 1 + 5 files changed, 96 insertions(+), 14 deletions(-) diff --git a/docs/user_guide/examples/offline_inference/x_to_video_audio.md b/docs/user_guide/examples/offline_inference/x_to_video_audio.md index 8ea39d81156..cec8d47c591 100644 --- a/docs/user_guide/examples/offline_inference/x_to_video_audio.md +++ b/docs/user_guide/examples/offline_inference/x_to_video_audio.md @@ -31,9 +31,9 @@ dreamid_omni/ ``` ### Run the Inference -``` +```python python x_to_video_audio.py \ - --model /xx/dreamid_omni \ + --model /path/to/dreamid_omni \ --prompt "Two people walking together and singing happily" \ --image-path ./example0.png ./example1.png \ --audio-path ./example0.wav ./example1.wav \ @@ 
-43,11 +43,33 @@ python x_to_video_audio.py \ --num-inference-steps 45 \ --height 704 \ --width 1280 \ - --output dreamid_omni.mp4 + --output out_dreamid_omni_twoip.mp4 ``` In the current test scenario (2 images + 2 audio inputs), the VRAM requirement is 72GB, regardless of whether cfg-parallel is enabled or disabled. The VRAM usage can be reduced by enabling CPU offload via --enable-cpu-offload. + +You can take reference images and audio clips from the test cases in the official repo: https://github.com/Guoxu1233/DreamID-Omni + +For example, single-IP reference resources can be found under https://github.com/Guoxu1233/DreamID-Omni/tree/main/test_case/oneip; download them to your local machine and use them for testing. + +```python +# Example usage for oneip, ref media from the official repo DreamID-Omni +python x_to_video_audio.py \ + --model /path/to/dreamid_omni \ + --prompt ": In the frame, a woman with black long hair is identified as .\n**Overall Environment/Scene**: A lively open-kitchen café at night; stove flames flare, steam rises, and warm pendant lights swing slightly as staff move behind her. The shot is an upper-body close-up.\n**Main Characters/Subjects Appearance**: is a young woman with thick dark wavy hair and a side part. She wears a fitted black top under a light apron, a thin gold chain necklace, and small stud earrings.\n**Main Characters/Subjects Actions**: tastes the sauce with a spoon, then turns her face toward the camera while still holding the spoon, her expression shifting from focused to conflicted.\n maintains eye contact, swallows as if choosing her words, and says, I keep telling myself I'm fine, but some nights it feels like I'm just performing calm."
\ + --image-path 9.png \ + --audio-path 9.wav \ + --video-negative-prompt "jitter, bad hands, blur, distortion" \ + --audio-negative-prompt "robotic, muffled, echo, distorted" \ + --cfg-parallel-size 2 \ + --num-inference-steps 45 \ + --height 704 \ + --width 1280 \ + --output out_dreamid_omni_oneip.mp4 +``` + + Key arguments: - `--prompt`: text description (string). - `--model`: path to the model local directory. diff --git a/examples/offline_inference/x_to_video_audio/x_to_video_audio.md b/examples/offline_inference/x_to_video_audio/x_to_video_audio.md index 4b5188f41b2..13f2cfe7c0a 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.md +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.md @@ -30,9 +30,9 @@ dreamid_omni/ ``` ### Run the Inference -``` +```python python x_to_video_audio.py \ - --model /xx/dreamid_omni \ + --model /path/to/dreamid_omni \ --prompt "Two people walking together and singing happily" \ --image-path ./example0.png ./example1.png \ --audio-path ./example0.wav ./example1.wav \ @@ -42,11 +42,33 @@ python x_to_video_audio.py \ --num-inference-steps 45 \ --height 704 \ --width 1280 \ - --output dreamid_omni.mp4 + --output out_dreamid_omni_twoip.mp4 ``` In the current test scenario (2 images + 2 audio inputs), the VRAM requirement is 72GB, regardless of whether cfg-parallel is enabled or disabled. The VRAM usage can be reduced by enabling CPU offload via --enable-cpu-offload. + +You can take reference images and audio clips from the test cases in the official repo: https://github.com/Guoxu1233/DreamID-Omni + +For example, single-IP reference resources can be found under https://github.com/Guoxu1233/DreamID-Omni/tree/main/test_case/oneip; download them to your local machine and use them for testing.
+ +```python +# Example usage for oneip, ref media from the official repo DreamID-Omni +python x_to_video_audio.py \ + --model /path/to/dreamid_omni \ + --prompt ": In the frame, a woman with black long hair is identified as .\n**Overall Environment/Scene**: A lively open-kitchen café at night; stove flames flare, steam rises, and warm pendant lights swing slightly as staff move behind her. The shot is an upper-body close-up.\n**Main Characters/Subjects Appearance**: is a young woman with thick dark wavy hair and a side part. She wears a fitted black top under a light apron, a thin gold chain necklace, and small stud earrings.\n**Main Characters/Subjects Actions**: tastes the sauce with a spoon, then turns her face toward the camera while still holding the spoon, her expression shifting from focused to conflicted.\n maintains eye contact, swallows as if choosing her words, and says, I keep telling myself I'm fine, but some nights it feels like I'm just performing calm." \ + --image-path 9.png \ + --audio-path 9.wav \ + --video-negative-prompt "jitter, bad hands, blur, distortion" \ + --audio-negative-prompt "robotic, muffled, echo, distorted" \ + --cfg-parallel-size 2 \ + --num-inference-steps 45 \ + --height 704 \ + --width 1280 \ + --output out_dreamid_omni_oneip.mp4 +``` + + Key arguments: - `--prompt`: text description (string). - `--model`: path to the model local directory.
diff --git a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py index 49a0f496f81..322b184e520 100644 --- a/examples/offline_inference/x_to_video_audio/x_to_video_audio.py +++ b/examples/offline_inference/x_to_video_audio/x_to_video_audio.py @@ -5,10 +5,12 @@ import re import time +import numpy as np from PIL import Image from vllm.multimodal.media.audio import load_audio from vllm_omni.diffusion.data import DiffusionParallelConfig +from vllm_omni.diffusion.utils.media_utils import mux_video_audio_bytes from vllm_omni.entrypoints.omni import Omni from vllm_omni.inputs.data import OmniDiffusionSamplingParams @@ -131,15 +133,35 @@ def main() -> None: if not outputs: raise RuntimeError("No output returned from DreamID-Omni.") - output = outputs[0].request_output - generated_video = output.images[0][0] - generated_audio = output.images[0][1] - try: - from dreamid_omni.utils.io_utils import save_video - except Exception as e: - raise RuntimeError(f"Failed to extract video and audio from DreamID-Omni output. Error: {e}") + result = outputs[0] + if not result.images: + raise RuntimeError("No video frames found in DreamID-Omni output.") + generated_video = result.images[0] + mm = result.multimodal_output or {} + generated_audio = mm.get("audio") + fps = int(mm.get("fps", 24)) + sample_rate = int(mm.get("audio_sample_rate", 16000)) + + # DreamID-Omni returns video as (C, F, H, W) float32 in [-1, 1]. + # mux_video_audio_bytes expects (F, H, W, C) uint8. 
+ if not isinstance(generated_video, np.ndarray) or generated_video.ndim != 4: + raise RuntimeError(f"Unexpected video shape: {getattr(generated_video, 'shape', None)}") + frames = generated_video.transpose(1, 2, 3, 0) + frames = (np.clip((frames + 1.0) / 2.0, 0.0, 1.0) * 255.0).round().astype(np.uint8) + + audio_np = None + if generated_audio is not None: + audio_np = np.squeeze(np.asarray(generated_audio)).astype(np.float32) + output_path = args.output - save_video(output_path, generated_video, generated_audio, fps=24, sample_rate=16000) + video_bytes = mux_video_audio_bytes( + frames, + audio_np, + fps=float(fps), + audio_sample_rate=sample_rate, + ) + with open(output_path, "wb") as f: + f.write(video_bytes) print(f"Saved generated video to {output_path}") print(f"Total time: {elapsed:.2f}s") diff --git a/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py b/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py index 974cc582f1d..c7ab4662d14 100644 --- a/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py +++ b/vllm_omni/diffusion/models/dreamid_omni/pipeline_dreamid_omni.py @@ -38,6 +38,21 @@ logger = logging.getLogger(__name__) +def get_dreamid_omni_post_process_func(*args, **kwargs): + def post_process(output): + if isinstance(output, tuple) and len(output) == 2: + video, audio = output + return { + "video": video, + "audio": audio, + "audio_sample_rate": 16000, + "fps": 24, + } + return output + + return post_process + + AUDIO_CONFIG = { "patch_size": [1], "model_type": "t2a", diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index 517b061ecec..0bf8c04517b 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -375,6 +375,7 @@ def _apply_sequence_parallel_if_enabled(model, od_config: OmniDiffusionConfig) - "HunyuanVideo15ImageToVideoPipeline": "get_hunyuan_video_15_i2v_post_process_func", "MagiHumanPipeline": "get_magi_human_post_process_func", 
"OmniVoicePipeline": "get_omnivoice_post_process_func", + "DreamIDOmniPipeline": "get_dreamid_omni_post_process_func", } _DIFFUSION_PRE_PROCESS_FUNCS = { From b43c6c6663311090e5a276826f2e2005d13ac05f Mon Sep 17 00:00:00 2001 From: Lancer Date: Thu, 16 Apr 2026 12:20:46 +0800 Subject: [PATCH 62/76] [Feat] add GLM-Image SP support (#1983) Signed-off-by: Lancer Signed-off-by: Didan Deng <33117903+wtomin@users.noreply.github.com> Co-authored-by: Didan Deng <33117903+wtomin@users.noreply.github.com> --- .../models/glm_image/test_glm_image_sp.py | 134 ++++++++ .../diffusion/attention/parallel/ulysses.py | 4 - .../models/glm_image/glm_image_transformer.py | 288 ++++++++++++++---- .../models/glm_image/pipeline_glm_image.py | 35 ++- 4 files changed, 397 insertions(+), 64 deletions(-) create mode 100644 tests/diffusion/models/glm_image/test_glm_image_sp.py diff --git a/tests/diffusion/models/glm_image/test_glm_image_sp.py b/tests/diffusion/models/glm_image/test_glm_image_sp.py new file mode 100644 index 00000000000..1b1c8d7a75b --- /dev/null +++ b/tests/diffusion/models/glm_image/test_glm_image_sp.py @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for GLM-Image Sequence Parallelism support.""" + +from unittest.mock import MagicMock, patch + +import pytest + +from vllm_omni.diffusion.data import DiffusionParallelConfig + + +@pytest.fixture(scope="function", autouse=True) +def setup_sp_groups(): + """Set up SP and TP groups for each test function.""" + with patch("vllm_omni.diffusion.distributed.parallel_state.get_sp_group") as mock_get_sp_group: + with patch("vllm.model_executor.layers.linear.get_tensor_model_parallel_world_size", return_value=1): + with patch("vllm.distributed.parallel_state.get_tp_group") as mock_get_tp_group: + mock_sp_group = MagicMock() + mock_sp_group.world_size = 4 + mock_get_sp_group.return_value = mock_sp_group + + mock_tp_group = MagicMock() + 
mock_tp_group.world_size = 1 + mock_get_tp_group.return_value = mock_tp_group + yield + + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def test_glm_image_sp_plan_defined(): + """Test that _sp_plan is properly defined on GlmImageTransformer2DModel.""" + from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageTransformer2DModel, + ) + + assert hasattr(GlmImageTransformer2DModel, "_sp_plan") + plan = GlmImageTransformer2DModel._sp_plan + assert plan is not None + + # Verify plan structure + assert "prepare" in plan + assert "proj_out" in plan + + +def test_glm_image_sp_plan_valid(): + """Validate _sp_plan structure.""" + from vllm_omni.diffusion.distributed.sp_plan import validate_sp_plan + from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageTransformer2DModel, + ) + + plan = GlmImageTransformer2DModel._sp_plan + validate_sp_plan(plan) + + +def test_glm_image_prepare_module_exists(): + """Test that GlmImagePrepare module exists.""" + from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImagePrepare, + ) + + assert GlmImagePrepare is not None + + +def test_glm_image_attention_accepts_parallel_config(): + """Test that GlmImageAttention accepts parallel_config parameter.""" + from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageAttention, + ) + + parallel_config = DiffusionParallelConfig( + ulysses_degree=2, + ring_degree=2, + tensor_parallel_size=1, + sequence_parallel_size=4, + ) + + attn = GlmImageAttention( + dim=2560, + num_heads=64, + head_dim=40, + parallel_config=parallel_config, + ) + + assert attn.parallel_config is not None + assert attn.parallel_config.sequence_parallel_size == 4 + + +def test_glm_image_transformer_block_accepts_parallel_config(): + """Test that GlmImageTransformerBlock accepts parallel_config parameter.""" + from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageTransformerBlock, + ) + + 
parallel_config = DiffusionParallelConfig( + ulysses_degree=2, + ring_degree=2, + tensor_parallel_size=1, + sequence_parallel_size=4, + ) + + block = GlmImageTransformerBlock( + dim=2560, + num_attention_heads=64, + attention_head_dim=40, + time_embed_dim=512, + parallel_config=parallel_config, + ) + + assert block.attn1.parallel_config is not None + assert block.attn1.parallel_config.sequence_parallel_size == 4 + + +def test_glm_image_has_sp_support(): + """Test that GLM-Image has SP support implemented.""" + from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageTransformer2DModel, + ) + + # Check that the model has parallel_config support + assert hasattr(GlmImageTransformer2DModel, "__init__") + + # Verify the model can be instantiated with SP config + + # This test just verifies the structure exists + # Actual SP testing requires multi-GPU setup + + +@pytest.mark.cuda +@pytest.mark.sp +def test_glm_image_sp_inference(): + """Test SP inference (requires multi-GPU setup).""" + pytest.skip("Requires multi-GPU SP setup") diff --git a/vllm_omni/diffusion/attention/parallel/ulysses.py b/vllm_omni/diffusion/attention/parallel/ulysses.py index 5d860b3350e..326b5d45671 100644 --- a/vllm_omni/diffusion/attention/parallel/ulysses.py +++ b/vllm_omni/diffusion/attention/parallel/ulysses.py @@ -414,10 +414,6 @@ def pre_attention( def post_attention(self, attn_output: torch.Tensor, ctx: ParallelAttentionContext | None) -> torch.Tensor: assert isinstance(ctx, _UlyssesCtx), f"Unexpected ctx type: {type(ctx)!r}" - # If we have joint tensors (Text), they were Head-Sliced. - # The main sequence (Image) was Sequence-Sliced. - # attn_output contains [Joint_Sliced | Image_Sliced] (if strategy='front'). 
- if ctx.joint_len > 0: joint_len = ctx.joint_len diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py index 490e0198b93..7ff42a5f008 100644 --- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py +++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py @@ -19,10 +19,16 @@ ) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata from vllm_omni.diffusion.attention.layer import Attention from vllm_omni.diffusion.cache.base import CachedTransformer -from vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.data import DiffusionParallelConfig, OmniDiffusionConfig from vllm_omni.diffusion.distributed.hsdp_utils import is_transformer_block_module +from vllm_omni.diffusion.distributed.sp_plan import ( + SequenceParallelInput, + SequenceParallelOutput, +) +from vllm_omni.diffusion.forward_context import get_forward_context logger = init_logger(__name__) @@ -108,8 +114,8 @@ def __init__( def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: batch_size, channel, height, width = hidden_states.shape - post_patch_height = height // self.patch_size - post_patch_width = width // self.patch_size + post_patch_height = torch.tensor(height // self.patch_size, device=hidden_states.device, dtype=torch.int64) + post_patch_width = torch.tensor(width // self.patch_size, device=hidden_states.device, dtype=torch.int64) # Reshape: [B, C, H, W] -> [B, H', W', C*p*p] -> [B, H'*W', C*p*p] hidden_states = hidden_states.reshape( @@ -159,6 +165,65 @@ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tens return (freqs.cos(), freqs.sin()) +class GlmImagePrepare(nn.Module): + """Prepare module for GLM-Image that handles patch embedding and RoPE computation. 
+ + This module encapsulates the input processing pipeline to create a module boundary + where _sp_plan can shard outputs via split_output=True. + + Similar to Qwen-Image's ImageRopePrepare, this ensures hidden_states and RoPE + embeddings are sharded together to maintain dimension alignment. + """ + + def __init__( + self, + image_projector: nn.Module, + rope: GlmImageRotaryPosEmbed, + patch_size: int, + ): + super().__init__() + self.image_projector = image_projector + self.rope = rope + self.patch_size = patch_size + + def forward( + self, + hidden_states: torch.Tensor, + prior_hidden_states: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Process hidden_states and compute RoPE embeddings. + + Args: + hidden_states: Input latent tensor [B, C, H, W] + prior_hidden_states: Optional prior embedding to add + + Returns: + hidden_states: Patched hidden states [B, seq_len, D] + rope_cos: RoPE cos embeddings [seq_len, dim] + rope_sin: RoPE sin embeddings [seq_len, dim] + post_patch_height: Scalar tensor for height after patching + post_patch_width: Scalar tensor for width after patching + """ + batch_size, num_channels, height, width = hidden_states.shape + + post_patch_height = torch.tensor(height // self.patch_size, device=hidden_states.device, dtype=torch.int64) + post_patch_width = torch.tensor(width // self.patch_size, device=hidden_states.device, dtype=torch.int64) + + # Compute RoPE (uses original 4D hidden_states shape) + image_rotary_emb = self.rope(hidden_states) + rope_cos = image_rotary_emb[0].to(hidden_states.device) + rope_sin = image_rotary_emb[1].to(hidden_states.device) + + # Patch embedding: [B, C, H, W] -> [B, seq_len, D] + hidden_states = self.image_projector(hidden_states) + + # Add prior embedding if provided + if prior_hidden_states is not None: + hidden_states = hidden_states + prior_hidden_states + + return hidden_states, rope_cos, rope_sin, post_patch_height, post_patch_width 
+ + class GlmImageAdaLayerNormZero(nn.Module): """Adaptive LayerNorm with zero initialization for both image and text streams.""" @@ -397,6 +462,7 @@ def __init__( dim: int, num_heads: int, head_dim: int, + parallel_config: DiffusionParallelConfig | None = None, out_bias: bool = True, eps: float = 1e-5, ): @@ -404,6 +470,7 @@ def __init__( self.dim = dim self.total_num_heads = num_heads self.head_dim = head_dim + self.parallel_config = parallel_config # QKV projection (fused for efficiency) self.to_qkv = QKVParallelLinear( @@ -450,16 +517,19 @@ def forward( attention_mask: torch.Tensor | None = None, kv_cache: GlmImageLayerKVCache | None = None, kv_cache_mode: KVCacheMode | None = None, + hidden_states_mask: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: """ Forward pass for joint attention. Args: - hidden_states: Image hidden states [B, img_seq_len, D] - encoder_hidden_states: Text hidden states [B, text_seq_len, D] - image_rotary_emb: Tuple of (cos, sin) for RoPE + hidden_states: Image hidden states [B, img_seq_len, D] (sharded in SP mode) + encoder_hidden_states: Text hidden states [B, text_seq_len, D] (full in SP mode) + image_rotary_emb: Tuple of (cos, sin) for RoPE (sharded in SP mode) + attention_mask: Optional attention mask kv_cache: Optional layer KV cache for image editing kv_cache_mode: Cache mode (WRITE, READ, SKIP) + hidden_states_mask: Mask for SP padding (True=valid, False=padding) Returns: Tuple of (image_hidden_states, text_hidden_states) @@ -467,6 +537,13 @@ def forward( dtype = encoder_hidden_states.dtype batch_size, text_seq_length, _ = encoder_hidden_states.shape + # Check if SP is enabled + sp_size = self.parallel_config.sequence_parallel_size if self.parallel_config else None + use_sp = sp_size is not None and sp_size > 1 + if use_sp: + forward_ctx = get_forward_context() + use_sp = not forward_ctx.split_text_embed_in_sp + # Concatenate text and image: [text, image] hidden_states_combined = 
torch.cat([encoder_hidden_states, hidden_states], dim=1) @@ -485,41 +562,88 @@ def forward( query = self.norm_q(query).to(dtype=dtype) key = self.norm_k(key).to(dtype=dtype) - # Apply RoPE only to image tokens (not text tokens) - if image_rotary_emb is not None: - # Only apply RoPE to image part (after text_seq_length) - query_img = query[:, text_seq_length:, :, :] - key_img = key[:, text_seq_length:, :, :] - from diffusers.models.embeddings import apply_rotary_emb - - query_img = apply_rotary_emb(query_img, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) - key_img = apply_rotary_emb(key_img, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) - query = torch.cat([query[:, :text_seq_length, :, :], query_img], dim=1) - key = torch.cat([key[:, :text_seq_length, :, :], key_img], dim=1) - - # Handle KV cache for image editing - if kv_cache is not None and kv_cache_mode is not None: - if kv_cache_mode == KVCacheMode.WRITE: - kv_cache.store(key, value) - elif kv_cache_mode == KVCacheMode.READ: - k_cached, v_cached = kv_cache.get() - if k_cached is not None: - key = torch.cat([k_cached, key], dim=1) - value = torch.cat([v_cached, value], dim=1) - # KVCacheMode.SKIP: do nothing - - # Attention computation - hidden_states_out = self.attn(query, key, value) - hidden_states_out = hidden_states_out.flatten(2, 3) - hidden_states_out = hidden_states_out.to(dtype) + if use_sp: + # SP mode: use joint attention mechanism + # Split Q/K/V into text and image parts + text_query = query[:, :text_seq_length, :, :] + text_key = key[:, :text_seq_length, :, :] + text_value = value[:, :text_seq_length, :, :] + img_query = query[:, text_seq_length:, :, :] + img_key = key[:, text_seq_length:, :, :] + img_value = value[:, text_seq_length:, :, :] + + # Apply RoPE only to image part + if image_rotary_emb is not None: + from diffusers.models.embeddings import apply_rotary_emb + + img_query = apply_rotary_emb(img_query, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) + 
img_key = apply_rotary_emb(img_key, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) + + # Create attention metadata for joint attention + attn_metadata = AttentionMetadata( + joint_query=text_query, + joint_key=text_key, + joint_value=text_value, + joint_strategy="front", + ) - # Output projection - for module in self.to_out: - hidden_states_out = module(hidden_states_out) + # Add padding mask for SP if available + if hidden_states_mask is not None: + attn_metadata.attn_mask = hidden_states_mask + + # Attention computation with joint text/image + # Note: Ulysses post_attention returns [text, image] concatenated + joint_hidden_states_out = self.attn(img_query, img_key, img_value, attn_metadata) + + # Project combined [text, image] outputs, then split. + # This keeps SP numerically aligned with the non-SP path. + joint_hidden_states_out = joint_hidden_states_out.flatten(2, 3).to(dtype) + for module in self.to_out: + joint_hidden_states_out = module(joint_hidden_states_out) - # Split back to text and image - encoder_hidden_states_out = hidden_states_out[:, :text_seq_length, :] - hidden_states_out = hidden_states_out[:, text_seq_length:, :] + encoder_hidden_states_out = joint_hidden_states_out[:, :text_seq_length, :] + hidden_states_out = joint_hidden_states_out[:, text_seq_length:, :] + else: + # Non-SP mode: original logic + # Apply RoPE only to image tokens (not text tokens) + if image_rotary_emb is not None: + query_img = query[:, text_seq_length:, :, :] + key_img = key[:, text_seq_length:, :, :] + from diffusers.models.embeddings import apply_rotary_emb + + query_img = apply_rotary_emb(query_img, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) + key_img = apply_rotary_emb(key_img, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) + query = torch.cat([query[:, :text_seq_length, :, :], query_img], dim=1) + key = torch.cat([key[:, :text_seq_length, :, :], key_img], dim=1) + + # Handle KV cache for image editing + if kv_cache is not None 
and kv_cache_mode is not None: + if kv_cache_mode == KVCacheMode.WRITE: + kv_cache.store(key, value) + elif kv_cache_mode == KVCacheMode.READ: + k_cached, v_cached = kv_cache.get() + if k_cached is not None: + key = torch.cat([k_cached, key], dim=1) + value = torch.cat([v_cached, value], dim=1) + + # Attention computation + attn_metadata = None + if attention_mask is not None: + if attention_mask.dim() == 3: + attention_mask = attention_mask.unsqueeze(1) + attn_metadata = AttentionMetadata(attn_mask=attention_mask) + + hidden_states_out = self.attn(query, key, value, attn_metadata) + hidden_states_out = hidden_states_out.flatten(2, 3) + hidden_states_out = hidden_states_out.to(dtype) + + # Output projection + for module in self.to_out: + hidden_states_out = module(hidden_states_out) + + # Split back to text and image + encoder_hidden_states_out = hidden_states_out[:, :text_seq_length, :] + hidden_states_out = hidden_states_out[:, text_seq_length:, :] return hidden_states_out, encoder_hidden_states_out @@ -628,6 +752,7 @@ def __init__( attention_head_dim: int = 40, time_embed_dim: int = 512, ffn_hidden_dim: int | None = None, + parallel_config: DiffusionParallelConfig | None = None, ) -> None: super().__init__() @@ -637,6 +762,7 @@ def __init__( dim=dim, num_heads=num_attention_heads, head_dim=attention_head_dim, + parallel_config=parallel_config, ) # 2. Feedforward @@ -654,6 +780,7 @@ def forward( attention_kwargs: dict[str, Any] | None = None, kv_cache: GlmImageLayerKVCache | None = None, kv_cache_mode: KVCacheMode | None = None, + hidden_states_mask: torch.Tensor | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: """ Forward pass for transformer block. 
@@ -667,6 +794,7 @@ def forward( attention_kwargs: Additional attention arguments kv_cache: Layer-specific KV cache for image editing kv_cache_mode: Cache mode (WRITE, READ, SKIP) + hidden_states_mask: Mask for SP padding (True=valid, False=padding) Returns: Tuple of (image_hidden_states, text_hidden_states) @@ -693,6 +821,7 @@ def forward( attention_mask=attention_mask, kv_cache=kv_cache, kv_cache_mode=kv_cache_mode, + hidden_states_mask=hidden_states_mask, ) hidden_states = hidden_states + attn_hidden_states * gate_msa.unsqueeze(1) encoder_hidden_states = encoder_hidden_states + attn_encoder_hidden_states * c_gate_msa.unsqueeze(1) @@ -724,6 +853,26 @@ class GlmImageTransformer2DModel(CachedTransformer): """ _repeated_blocks = ["GlmImageTransformerBlock"] + # SP plan using GlmImagePrepare module for sharding hidden_states and RoPE together. + # Similar to Qwen-Image's ImageRopePrepare, this creates a module boundary where + # _sp_plan can shard outputs via split_output=True. + # + # Key insight: hidden_states and RoPE embeddings MUST be sharded together + # to maintain dimension alignment for RoPE computation in attention layers. 
+ _sp_plan = { + # Shard GlmImagePrepare outputs (hidden_states and RoPE must be sharded together) + "prepare": { + # hidden_states: [B, seq_len, D] - shard along sequence dimension + 0: SequenceParallelInput(split_dim=1, expected_dims=3, split_output=True, auto_pad=True), + # RoPE cos: [seq_len, dim] - shard along sequence dimension + 1: SequenceParallelInput(split_dim=0, expected_dims=2, split_output=True, auto_pad=True), + # RoPE sin: [seq_len, dim] - shard along sequence dimension + 2: SequenceParallelInput(split_dim=0, expected_dims=2, split_output=True, auto_pad=True), + # post_patch_height and post_patch_width are scalars, not sharded + }, + # Gather output at proj_out + "proj_out": SequenceParallelOutput(gather_dim=1, expected_dims=3), + } _hsdp_shard_conditions = [is_transformer_block_module] @@ -790,6 +939,9 @@ def __init__( dim=inner_dim, dim_out=inner_dim, inner_dim=inner_dim, activation_fn="linear-silu" ) + # Prepare module for SP (encapsulates patch embedding and RoPE for _sp_plan) + self.prepare = GlmImagePrepare(self.image_projector, self.rope, patch_size) + self.time_condition_embed = GlmImageCombinedTimestepSizeEmbeddings( embedding_dim=time_embed_dim, condition_dim=condition_dim, @@ -806,6 +958,7 @@ def __init__( attention_head_dim, time_embed_dim, ffn_hidden_dim=ffn_hidden_dim, + parallel_config=self.parallel_config, ) for _ in range(num_layers) ] @@ -859,33 +1012,51 @@ def forward( # Get KV cache mode kv_cache_mode = kv_cache.mode if kv_cache is not None else None - # 1. RoPE - if image_rotary_emb is None: - image_rotary_emb = self.rope(hidden_states) - # Move to correct device - image_rotary_emb = ( - image_rotary_emb[0].to(hidden_states.device), - image_rotary_emb[1].to(hidden_states.device), - ) - - # 2. 
Patch & Timestep embeddings - p = self.patch_size - post_patch_height = height // p - post_patch_width = width // p + # Set SP context if enabled + sp_size = self.parallel_config.sequence_parallel_size + if sp_size is not None and sp_size > 1: + get_forward_context().split_text_embed_in_sp = False - hidden_states = self.image_projector(hidden_states) + # Text embedding projection encoder_hidden_states = self.glyph_projector(encoder_hidden_states) # Prior embedding with dropout prior_embedding = self.prior_token_embedding(prior_token_id) prior_embedding[prior_token_drop] *= 0.0 prior_hidden_states = self.prior_projector(prior_embedding) - hidden_states = hidden_states + prior_hidden_states + + # 1. Prepare hidden_states and RoPE via GlmImagePrepare module + # _sp_plan will shard hidden_states and RoPE together via split_output=True + hidden_states, rope_cos, rope_sin, post_patch_height_t, post_patch_width_t = self.prepare( + hidden_states, prior_hidden_states + ) + image_rotary_emb = (rope_cos, rope_sin) + post_patch_height = int(post_patch_height_t.item()) + post_patch_width = int(post_patch_width_t.item()) # Timestep conditioning temb = self.time_condition_embed(timestep, target_size, crop_coords, hidden_states.dtype) - # 3. Transformer blocks + # Create padding mask for SP if needed (after _sp_plan hooks have run) + hidden_states_mask = None + if sp_size is not None and sp_size > 1: + from vllm_omni.diffusion.forward_context import is_forward_context_available + + if is_forward_context_available(): + ctx = get_forward_context() + if ctx.sp_original_seq_len is not None and ctx.sp_padding_size > 0: + img_padded_seq_len = ctx.sp_original_seq_len + ctx.sp_padding_size + hidden_states_mask = torch.ones( + batch_size, + img_padded_seq_len, + dtype=torch.bool, + device=hidden_states.device, + ) + hidden_states_mask[:, ctx.sp_original_seq_len :] = False + if hidden_states_mask.all(): + hidden_states_mask = None + + # 2. 
Transformer blocks for layer_idx, block in enumerate(self.transformer_blocks): # Get layer-specific KV cache if available layer_kv_cache = kv_cache[layer_idx] if kv_cache is not None else None @@ -899,13 +1070,16 @@ def forward( attention_kwargs, kv_cache=layer_kv_cache, kv_cache_mode=kv_cache_mode, + hidden_states_mask=hidden_states_mask, ) - # 4. Output norm & projection + # 3. Output norm & projection + # _sp_plan will gather hidden_states via proj_out hook hidden_states = self.norm_out(hidden_states, temb) hidden_states = self.proj_out(hidden_states) - # 5. Unpatchify: [B, H'*W', C*p*p] -> [B, C, H, W] + # 4. Unpatchify: [B, H'*W', C*p*p] -> [B, C, H, W] + p = self.patch_size hidden_states = hidden_states.reshape(batch_size, post_patch_height, post_patch_width, -1, p, p) output = hidden_states.permute(0, 3, 1, 4, 2, 5).flatten(4, 5).flatten(2, 3) diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index 375f7e7b80d..03863649984 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -712,6 +712,14 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: if img is not None: preprocessed_images = [img] + # Priority: prompt dict (from ar2diffusion) > sampling_params + # ar2diffusion returns adjusted height/width that matches prior_token_ids + if not isinstance(first_prompt, str): + ar_height = first_prompt.get("height") + ar_width = first_prompt.get("width") + else: + ar_height = ar_width = None + img_height = req.sampling_params.height img_width = req.sampling_params.width @@ -719,12 +727,19 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: # Treat that as t2i warmup to avoid requiring i2i-only KV-cache inputs. 
is_image_edit = (preprocessed_images is not None) and (not is_dummy_warmup) - # Use image dimensions as default if available - height = req.sampling_params.height or img_height or self.default_sample_size * self.vae_scale_factor - width = req.sampling_params.width or img_width or self.default_sample_size * self.vae_scale_factor + # Use prompt dict dimensions (from ar2diffusion) as priority, then sampling_params + height = ( + ar_height or req.sampling_params.height or img_height or self.default_sample_size * self.vae_scale_factor + ) + width = ar_width or req.sampling_params.width or img_width or self.default_sample_size * self.vae_scale_factor num_inference_steps = req.sampling_params.num_inference_steps or 50 guidance_scale = req.sampling_params.guidance_scale or 1.5 + # Ensure dimensions are multiples of vae_scale_factor * patch_size + multiple_of = self.vae_scale_factor * self._patch_size + height = height // multiple_of * multiple_of + width = width // multiple_of * multiple_of + self.check_inputs(prompt=prompt, height=height, width=width, prompt_embeds=prompt_embeds) batch_size = 1 @@ -753,6 +768,20 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: prior_token_id = prior_token_id.to(device=self.device, dtype=torch.long) if prior_token_id.dim() == 1: prior_token_id = prior_token_id.unsqueeze(0) + + # Validate that prior_token_id seq_len matches dimensions + prior_seq_len = prior_token_id.shape[1] + expected_seq_len = (height // self.vae_scale_factor // self._patch_size) * ( + width // self.vae_scale_factor // self._patch_size + ) + if prior_seq_len != expected_seq_len: + raise ValueError( + f"prior_token_ids seq_len ({prior_seq_len}) doesn't match dimensions " + f"({height}x{width}, expected seq_len={expected_seq_len}). " + f"This indicates a mismatch between AR output and Diffusion input. " + f"Please ensure ar2diffusion returns correct height/width." 
+ ) + prior_token_image_ids = None if external_prior_image_ids is not None: if isinstance(external_prior_image_ids, torch.Tensor): From 30ee64edfc39aeba346db7c5a68f1582070862aa Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Thu, 16 Apr 2026 14:52:40 +0800 Subject: [PATCH 63/76] refactor: init replica in stage_pool Signed-off-by: ZhengWG --- vllm_omni/engine/async_omni_engine.py | 230 ++++++++++++++++---------- vllm_omni/engine/orchestrator.py | 182 +++++++++----------- 2 files changed, 220 insertions(+), 192 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 221baf7e238..e44b2d75580 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -653,28 +653,17 @@ def _attach_llm_stage( logger.info("[AsyncOmniEngine] Stage %s initialized", started.stage_id) return stage_client, output_processor, started.vllm_config, input_processor - def _initialize_stages(self, stage_init_timeout: int) -> None: - """Initialize stage clients/processors in orchestrator thread and assign to self. - - Multi-replica support: when a stage config contains - ``runtime.num_replicas > 1``, each replica launches its own EngineCore - process with a dedicated slice of devices. The flat ``stage_clients`` - list contains all replica clients; ``logical_stage_to_clients`` maps - each logical stage id to the list of client indices that belong to it. + def _compute_replica_layout( + self, + ) -> tuple[list[int], dict[int, list[str]], int]: + """Compute per-stage replica counts and device assignments. + + Returns: + replicas_per_stage: num_replicas per logical stage. + replica_devices_map: stage_idx → per-replica device strings + (only for stages with num_replicas > 1). + total_llm_replicas: total LLM replica count across all stages. 
""" - device_control_env = current_omni_platform.device_control_env_var - - num_stages = self.num_stages - input_processor: InputProcessor | None = None - # Per-stage launch futures and results: stage_id → [replicas] - llm_stage_ids: list[int] = [] - llm_launch_futures: dict[int, list[concurrent.futures.Future[StartedLlmStage]]] = {} - started_llm_stages: dict[int, list[StartedLlmStage]] = {} - llm_stage_launch_lock = threading.Lock() - # Diffusion stages (no multi-replica support yet) - diffusion_clients: dict[int, Any] = {} - - # Track per-logical-stage replica count from config replicas_per_stage: list[int] = [] for stage_cfg in self.stage_configs: runtime_cfg = getattr(stage_cfg, "runtime", {}) @@ -685,8 +674,6 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: ) replicas_per_stage.append(max(1, num_replicas)) - # Pre-compute per-replica device assignments for multi-replica stages - # stage_id → [devices_str_per_replica] replica_devices_map: dict[int, list[str]] = {} for logical_id, stage_cfg in enumerate(self.stage_configs): num_replicas = replicas_per_stage[logical_id] @@ -711,21 +698,127 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: replica_devices_map[logical_id], ) - async_chunk = self.async_chunk - prompt_expand_func = None total_llm_replicas = sum( replicas_per_stage[i] for i, cfg in enumerate(self.stage_configs) if getattr(cfg, "stage_type", "llm") != "diffusion" ) + return replicas_per_stage, replica_devices_map, total_llm_replicas + + def _build_flat_client_lists( + self, + *, + num_stages: int, + replicas_per_stage: list[int], + llm_stage_ids: list[int], + stage_attach_results: dict[int, list[Any]], + stage_output_proc_results: dict[int, list[Any]], + stage_vllm_cfg_results: dict[int, list[Any]], + diffusion_clients: dict[int, Any], + input_processor: InputProcessor | None, + ) -> tuple[list[Any], list[Any], list[Any], list[list[int]], list[Any], list[dict[str, Any]]]: + """Build flat client/processor/config lists 
and finalize stage metadata. + + Returns: + (flat_clients, flat_output_processors, flat_vllm_configs, + logical_stage_to_clients, default_sampling_params_list, + stage_metadata_list) + + TODO(stage-pool): move this assembly into StagePool.from_stage_config + so _initialize_stages only collects pools, not flat lists. + """ + flat_clients: list[Any] = [] + flat_output_processors: list[Any] = [] + flat_vllm_configs: list[Any] = [] + logical_stage_to_clients: list[list[int]] = [] + + logical_stage_clients_for_finalize: list[Any | None] = [None] * num_stages + all_clients: dict[int, list[Any]] = {} + all_output_processors: dict[int, list[Any]] = {} + all_vllm_configs: dict[int, list[Any]] = {} + + for stage_idx in llm_stage_ids: + all_clients[stage_idx] = stage_attach_results[stage_idx] + all_output_processors[stage_idx] = stage_output_proc_results[stage_idx] + all_vllm_configs[stage_idx] = stage_vllm_cfg_results[stage_idx] + logical_stage_clients_for_finalize[stage_idx] = stage_attach_results[stage_idx][0] + + for stage_idx, diff_client in diffusion_clients.items(): + logical_stage_clients_for_finalize[stage_idx] = diff_client + + _, default_sampling_params_list, stage_metadata = finalize_initialized_stages( + logical_stage_clients_for_finalize, + input_processor, + ) + + logical_default_sampling_params: list[Any] = [] + logical_stage_metadata: list[dict[str, Any]] = [] + + for logical_id in range(num_stages): + num_replicas = replicas_per_stage[logical_id] + client_indices: list[int] = [] + + if logical_id in diffusion_clients: + ci = len(flat_clients) + client_indices.append(ci) + flat_clients.append(diffusion_clients[logical_id]) + flat_output_processors.append(None) + flat_vllm_configs.append(None) + else: + for replica_idx in range(num_replicas): + ci = len(flat_clients) + client_indices.append(ci) + flat_clients.append(all_clients[logical_id][replica_idx]) + flat_output_processors.append(all_output_processors[logical_id][replica_idx]) + 
flat_vllm_configs.append(all_vllm_configs[logical_id][replica_idx]) + if num_replicas > 1: + logger.info( + "[AsyncOmniEngine] Stage %s replica %s → client %s (isolated)", + logical_id, + replica_idx, + ci, + ) + + logical_stage_to_clients.append(client_indices) + logical_default_sampling_params.append(default_sampling_params_list[logical_id]) + logical_stage_metadata.append(stage_metadata[logical_id]) + + return ( + flat_clients, + flat_output_processors, + flat_vllm_configs, + logical_stage_to_clients, + logical_default_sampling_params, + logical_stage_metadata, + ) + + def _initialize_stages(self, stage_init_timeout: int) -> None: + """Initialize stage clients/processors in orchestrator thread and assign to self. + + Phases: + 1. Compute replica layout (counts + device splits). + 2. Launch all stage engine processes (parallel via ThreadPoolExecutor). + 3. Attach launched engines (parallel) and collect clients/processors. + 4. Assemble flat client lists and finalize stage metadata. + """ + device_control_env = current_omni_platform.device_control_env_var + num_stages = self.num_stages + + replicas_per_stage, replica_devices_map, total_llm_replicas = self._compute_replica_layout() + + input_processor: InputProcessor | None = None + llm_stage_ids: list[int] = [] + llm_launch_futures: dict[int, list[concurrent.futures.Future[StartedLlmStage]]] = {} + started_llm_stages: dict[int, list[StartedLlmStage]] = {} + llm_stage_launch_lock = threading.Lock() + diffusion_clients: dict[int, Any] = {} + prompt_expand_func = None + async_chunk = self.async_chunk prepare_engine_environment() omni_transfer_config = load_omni_transfer_config_for_model(self.model, self.config_path) - # Initialized outside try so error handler can always access them - flat_clients: list[Any] = [] - # stage_id → [client_per_replica] - all_clients: dict[int, list[Any]] = {} + flat_clients: list[Any] = [] # populated by _build_flat_client_lists # 
------------------------------------------------------------------ # # Single-stage mode: start OmniMasterServer before launching stages. # @@ -947,65 +1040,25 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: if stage0_input_processor is not None: input_processor = stage0_input_processor - # ---- Build flat client lists directly ---- - flat_output_processors: list[Any] = [] - flat_vllm_configs: list[Any] = [] - logical_stage_to_clients: list[list[int]] = [] - - # Per-logical-stage lists (not per-client) - logical_stage_clients_for_finalize: list[Any | None] = [None] * num_stages - all_output_processors: dict[int, list[Any]] = {} - all_vllm_configs: dict[int, list[Any]] = {} - - for stage_idx in llm_stage_ids: - all_clients[stage_idx] = stage_attach_results[stage_idx] - all_output_processors[stage_idx] = stage_output_proc_results[stage_idx] - all_vllm_configs[stage_idx] = stage_vllm_cfg_results[stage_idx] - logical_stage_clients_for_finalize[stage_idx] = stage_attach_results[stage_idx][0] - - # Place diffusion clients into the logical list - for stage_idx, diff_client in diffusion_clients.items(): - logical_stage_clients_for_finalize[stage_idx] = diff_client - - initialized_stage_clients, default_sampling_params_list, stage_metadata = finalize_initialized_stages( - logical_stage_clients_for_finalize, - input_processor, + # ---- Assemble flat lists + finalize metadata ---- + ( + flat_clients, + flat_output_processors, + flat_vllm_configs, + logical_stage_to_clients, + logical_default_sampling_params, + logical_stage_metadata, + ) = self._build_flat_client_lists( + num_stages=num_stages, + replicas_per_stage=replicas_per_stage, + llm_stage_ids=llm_stage_ids, + stage_attach_results=stage_attach_results, + stage_output_proc_results=stage_output_proc_results, + stage_vllm_cfg_results=stage_vllm_cfg_results, + diffusion_clients=diffusion_clients, + input_processor=input_processor, ) - # Now build flat lists in logical-stage order, replicas within - 
logical_default_sampling_params: list[Any] = [] - logical_stage_metadata: list[dict[str, Any]] = [] - - for logical_id in range(num_stages): - num_replicas = replicas_per_stage[logical_id] - client_indices: list[int] = [] - - if logical_id in diffusion_clients: - # Diffusion: single client, no multi-replica - ci = len(flat_clients) - client_indices.append(ci) - flat_clients.append(diffusion_clients[logical_id]) - flat_output_processors.append(None) - flat_vllm_configs.append(None) - else: - for replica_idx in range(num_replicas): - ci = len(flat_clients) - client_indices.append(ci) - flat_clients.append(all_clients[logical_id][replica_idx]) - flat_output_processors.append(all_output_processors[logical_id][replica_idx]) - flat_vllm_configs.append(all_vllm_configs[logical_id][replica_idx]) - if num_replicas > 1: - logger.info( - "[AsyncOmniEngine] Stage %s replica %s → client %s (isolated)", - logical_id, - replica_idx, - ci, - ) - - logical_stage_to_clients.append(client_indices) - logical_default_sampling_params.append(default_sampling_params_list[logical_id]) - logical_stage_metadata.append(stage_metadata[logical_id]) - except Exception: for stage_id, futures in llm_launch_futures.items(): for f in futures: @@ -1013,10 +1066,7 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: continue started_llm_stages.setdefault(stage_id, []).append(f.result()) # Collect all initialized clients for cleanup - cleanup_clients: list[Any] = list(diffusion_clients.values()) - for clients in all_clients.values(): - cleanup_clients.extend(clients) - cleanup_clients = [c for c in cleanup_clients if c is not None] + cleanup_clients: list[Any] = list(diffusion_clients.values()) + [c for c in flat_clients if c is not None] all_started = [s for stages in started_llm_stages.values() for s in stages] logger.exception( "[AsyncOmniEngine] Stage initialization failed; shutting down %s initialized client(s)", diff --git a/vllm_omni/engine/orchestrator.py 
b/vllm_omni/engine/orchestrator.py index 88e8572f852..b3b0fefe924 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -163,7 +163,7 @@ def __init__( # Per-replica metrics accumulators (keyed by StageReplica identity). self._replica_metrics: dict[StageReplica, _ReplicaMetrics] = { - replica: _ReplicaMetrics() for pool in self.stage_pools for replica in pool.replicas + sr: _ReplicaMetrics() for pool in self.stage_pools for sr in pool.replicas } # Shutdown coordination @@ -258,15 +258,15 @@ async def _orchestration_loop(self) -> None: while not self._shutdown_event.is_set(): idle = True for pool in self.stage_pools: - for replica in pool.replicas: + for stage_replica in pool.replicas: if self._shutdown_event.is_set(): return - stage_id = replica.logical_stage_id + stage_id = stage_replica.logical_stage_id # 1) Diffusion stage: poll non-blocking queue - if replica.client.stage_type == "diffusion": - output = replica.client.get_diffusion_output_nowait() + if stage_replica.client.stage_type == "diffusion": + output = stage_replica.client.get_diffusion_output_nowait() if output is not None: idle = False req_state = self.request_states.get(output.request_id) @@ -289,13 +289,13 @@ async def _orchestration_loop(self) -> None: continue stage_metrics = self._build_stage_metrics( - replica, + stage_replica, output.request_id, [output], req_state, ) await self._route_output( - replica, + stage_replica, output, req_state, stage_metrics, @@ -305,7 +305,7 @@ async def _orchestration_loop(self) -> None: # 1) Poll raw outputs from the stage replica try: raw_outputs = await asyncio.wait_for( - self._poll_stage_raw(replica), + self._poll_stage_raw(stage_replica), timeout=0.001, ) except asyncio.TimeoutError: @@ -318,7 +318,7 @@ async def _orchestration_loop(self) -> None: logger.exception( "[Orchestrator] _poll_stage_raw failed for stage-%s replica-%s", stage_id, - replica.replica_index, + stage_replica.replica_index, ) raise @@ -328,13 +328,13 @@ 
async def _orchestration_loop(self) -> None: # Handle prefill-finished KV-ready signals before finished outputs. await self._handle_kv_ready_raw_outputs( - replica, + stage_replica, raw_outputs, ) # 2) Process raw outputs through the output processor request_outputs = await self._process_stage_outputs( - replica, + stage_replica, raw_outputs, ) @@ -347,20 +347,20 @@ async def _orchestration_loop(self) -> None: "at stage-%s replica-%s (known reqs: %s)", output.request_id, stage_id, - replica.replica_index, + stage_replica.replica_index, list(self.request_states.keys()), ) continue stage_metrics = None if output.finished: stage_metrics = self._build_stage_metrics( - replica, + stage_replica, output.request_id, [output], req_state, ) await self._route_output( - replica, + stage_replica, output, req_state, stage_metrics, @@ -373,13 +373,13 @@ async def _orchestration_loop(self) -> None: async def _route_output( self, - replica: StageReplica, + stage_replica: StageReplica, output: Any, req_state: OrchestratorRequestState, stage_metrics: Any, ) -> None: """Route a processed output: send to main thread and/or forward to next stage.""" - stage_id = replica.logical_stage_id + stage_id = stage_replica.logical_stage_id req_id = output.request_id finished = output.finished submit_ts = req_state.stage_submit_ts.get(stage_id) @@ -391,7 +391,7 @@ async def _route_output( self.request_states.pop(req_id, None) return - if replica.client.final_output: + if stage_replica.client.final_output: await self.output_async_queue.put( { "type": "output", @@ -424,12 +424,12 @@ async def _route_output( self._deferred_parents[req_id] = { "stage_id": stage_id, "output": output, - "replica": replica, + "stage_replica": stage_replica, } else: await self._forward_to_next_stage( req_id, - replica, + stage_replica, output, req_state, ) @@ -473,20 +473,20 @@ async def _handle_cfg_companion_ready(self, req_id: str) -> None: if parent_state is not None and not 
self._next_stage_already_submitted(deferred["stage_id"], parent_state): await self._forward_to_next_stage( parent_id, - deferred["replica"], + deferred["stage_replica"], deferred["output"], parent_state, ) async def _handle_kv_ready_raw_outputs( self, - replica: StageReplica, + stage_replica: StageReplica, raw_outputs: EngineCoreOutputs, ) -> None: """Forward split requests once stage-0 KV is ready, not only when decode fully finishes.""" if self.async_chunk: return - stage_id = replica.logical_stage_id + stage_id = stage_replica.logical_stage_id for raw_output in raw_outputs.outputs: kv_params = getattr(raw_output, "kv_transfer_params", None) if not (isinstance(kv_params, dict) and kv_params.get("kv_ready")): @@ -506,19 +506,19 @@ async def _handle_kv_ready_raw_outputs( self._deferred_parents[req_id] = { "stage_id": stage_id, "output": raw_output, - "replica": replica, + "stage_replica": stage_replica, } else: await self._forward_to_next_stage( req_id, - replica, + stage_replica, raw_output, req_state, ) def _build_stage_metrics( self, - replica: StageReplica, + stage_replica: StageReplica, req_id: str, request_outputs: list[RequestOutput], req_state: OrchestratorRequestState, @@ -528,7 +528,7 @@ def _build_stage_metrics( Reuses StageRequestMetrics so OrchestratorMetrics and downstream metric handlers can consume a stable schema. """ - stage_id = replica.logical_stage_id + stage_id = stage_replica.logical_stage_id now = _time.time() submit_ts = req_state.stage_submit_ts.get(stage_id, now) stage_gen_time_ms = (now - submit_ts) * 1000.0 @@ -542,7 +542,7 @@ def _build_stage_metrics( num_tokens_in += len(ptids) # Monotonic batch counter per replica. 
- metrics = self._replica_metrics[replica] + metrics = self._replica_metrics[stage_replica] metrics.batch_seq += 1 batch_id = metrics.batch_seq @@ -589,7 +589,7 @@ def _build_kv_sender_info(self, sender_stage_ids: list[int]) -> dict[int, dict[s return sender_infos or None - def _stage_client_list_for_legacy(self) -> list[Any]: + def _first_replica_clients_per_stage(self) -> list[Any]: """First-replica client per logical stage. Legacy helper for ``process_engine_inputs`` and @@ -602,7 +602,7 @@ def _stage_client_list_for_legacy(self) -> list[Any]: async def _forward_to_next_stage( self, req_id: str, - replica: StageReplica, + stage_replica: StageReplica, output: Any, req_state: OrchestratorRequestState, ) -> None: @@ -611,16 +611,16 @@ async def _forward_to_next_stage( Handles the full pipeline: set outputs on current stage, compute next-stage inputs, build lightweight requests, and submit them. """ - stage_id = replica.logical_stage_id + stage_id = stage_replica.logical_stage_id next_logical = stage_id + 1 next_pool = self.stage_pools[next_logical] next_replica = next_pool.select_replica(req_state) params = req_state.sampling_params_list[next_logical] if next_replica.client.stage_type == "diffusion": - replica.client.set_engine_outputs([output]) + stage_replica.client.set_engine_outputs([output]) if next_replica.client.custom_process_input_func is not None: - stage_list = self._stage_client_list_for_legacy() + stage_list = self._first_replica_clients_per_stage() diffusion_prompt = next_replica.client.custom_process_input_func( stage_list, next_replica.client.engine_input_source, @@ -667,15 +667,15 @@ async def _forward_to_next_stage( return # Set outputs on the client that actually produced them - replica.client.set_engine_outputs([output]) + stage_replica.client.set_engine_outputs([output]) # Process inputs for next stage - stage_list = self._stage_client_list_for_legacy() + stage_list = self._first_replica_clients_per_stage() try: next_inputs = 
next_replica.client.process_engine_inputs( stage_list=stage_list, prompt=req_state.prompt, - source_client=replica.client, + source_client=stage_replica.client, ) except Exception: logger.exception( @@ -713,24 +713,24 @@ async def _forward_to_next_stage( async def _poll_stage_raw( self, - replica: StageReplica, + stage_replica: StageReplica, ) -> EngineCoreOutputs | None: """Pull raw EngineCoreOutputs from a stage replica without processing.""" - outputs = await replica.client.get_output_async() + outputs = await stage_replica.client.get_output_async() if not outputs.outputs: return None return outputs async def _process_stage_outputs( self, - replica: StageReplica, + stage_replica: StageReplica, raw_outputs: EngineCoreOutputs, ) -> list[RequestOutput]: """Run the output processor on raw outputs, returning RequestOutputs. Also handles abort forwarding and scheduler stats updates. """ - processor = replica.output_processor + processor = stage_replica.output_processor processed = processor.process_outputs( raw_outputs.outputs, @@ -739,7 +739,7 @@ async def _process_stage_outputs( ) if processed.reqs_to_abort: - await replica.client.abort_requests_async(processed.reqs_to_abort) + await stage_replica.client.abort_requests_async(processed.reqs_to_abort) if raw_outputs.scheduler_stats is not None: processor.update_scheduler_stats(raw_outputs.scheduler_stats) @@ -787,37 +787,25 @@ async def _handle_add_request(self, msg: dict[str, Any]) -> None: # Diffusion: no output_processor on stage 0, just select + submit. 
if stage0_pool.replicas[0].client.stage_type == "diffusion": - replica = stage0_pool.select_replica(req_state) + stage_replica = stage0_pool.select_replica(req_state) if isinstance(prompt, list): - await replica.client.add_batch_request_async( + await stage_replica.client.add_batch_request_async( request_id, prompt, params, ) else: - await replica.client.add_request_async(request_id, prompt, params) + await stage_replica.client.add_request_async(request_id, prompt, params) else: - # LLM: atomically pick a replica and register on its output - # processor. Registration must target the same replica that - # will serve the request or the raw outputs come back to a - # processor that has never seen the req_id. + # LLM: atomically pick a stage replica and register on its + # output processor so select + register + submit all target + # the same replica. + # TODO(stage-pool): when _request_handler gets per-message error + # handling, add rollback here (abort_requests + state cleanup) + # so a failed submit releases resources without killing the loop. output_prompt_text = msg.get("output_prompt_text") - replica = stage0_pool.admit(req_state, request, output_prompt_text) - try: - await replica.client.add_request_async(request) - except Exception: - # Roll back the processor registration so we don't leak a - # half-admitted request on a failed submit. 
- try: - replica.output_processor.abort_requests([request_id], internal=False) - except Exception: - logger.exception( - "[Orchestrator] Failed to roll back output_processor registration " - "for req=%s after submit failure", - request_id, - ) - self.request_states.pop(request_id, None) - raise + stage_replica = stage0_pool.admit(req_state, request, output_prompt_text) + await stage_replica.client.add_request_async(request) if self.async_chunk and logical_stage_id == 0 and final_stage_id > 0: await self._prewarm_async_chunk_stages(request_id, request, req_state) @@ -843,15 +831,15 @@ async def _handle_streaming_update(self, msg: dict[str, Any]) -> None: req_state.sampling_params_list = msg["sampling_params_list"] req_state.stage_submit_ts[stage_id] = _time.time() - # Streaming updates re-use the already-chosen replica from initial submit. - replica = req_state.chosen_replica.get(stage_id) - if replica is None: - replica = self.stage_pools[stage_id].select_replica(req_state) - if replica.client.stage_type == "diffusion": + # Streaming updates re-use the already-chosen stage replica from initial submit. 
+ stage_replica = req_state.chosen_replica.get(stage_id) + if stage_replica is None: + stage_replica = self.stage_pools[stage_id].select_replica(req_state) + if stage_replica.client.stage_type == "diffusion": params = req_state.sampling_params_list[stage_id] - await replica.client.add_request_async(request_id, request, params) + await stage_replica.client.add_request_async(request_id, request, params) else: - await replica.client.add_request_async(request) + await stage_replica.client.add_request_async(request) async def _prewarm_async_chunk_stages( self, @@ -973,23 +961,11 @@ async def _handle_add_companion(self, msg: dict[str, Any]) -> None: companion_prompt_text, affinity_from=parent_replica, ) - try: - await companion_replica.client.add_request_async(request) - except Exception: - try: - companion_replica.output_processor.abort_requests([companion_id], internal=False) - except Exception: - logger.exception( - "[Orchestrator] Failed to roll back companion registration for %s", - companion_id, - ) - # Undo companion tracking so parent can proceed (parent may still - # succeed without this companion — expected in CFG fallback mode). - self.request_states.pop(companion_id, None) - self._companion_ids.discard(companion_id) - self._companion_to_parent.pop(companion_id, None) - self._companion_map.get(parent_id, {}).pop(role, None) - raise + # TODO(stage-pool): when _request_handler gets per-message error + # handling, add rollback here (abort_requests + companion state + # cleanup) so a failed submit releases resources without killing + # the orchestrator loop. 
+ await companion_replica.client.add_request_async(request) logger.info( "[Orchestrator] CFG companion submitted: %s (role=%s, parent=%s, stage-0 replica-%s)", @@ -1016,8 +992,8 @@ async def _handle_abort(self, msg: dict[str, Any]) -> None: all_ids_to_abort = list(request_ids) + companion_ids_to_abort for pool in self.stage_pools: - for replica in pool.replicas: - await replica.client.abort_requests_async(all_ids_to_abort) + for stage_replica in pool.replicas: + await stage_replica.client.abort_requests_async(all_ids_to_abort) for req_id in request_ids: self.request_states.pop(req_id, None) logger.info("[Orchestrator] Aborted request(s) %s", request_ids) @@ -1049,11 +1025,11 @@ async def _handle_collective_rpc(self, msg: dict[str, Any]) -> None: results: list[Any] = [] stage_ids: list[int] = [] - for replica in target_replicas: - stage_ids.append(replica.logical_stage_id) + for stage_replica in target_replicas: + stage_ids.append(stage_replica.logical_stage_id) try: - if hasattr(replica.client, "collective_rpc_async"): - stage_result = await replica.client.collective_rpc_async( + if hasattr(stage_replica.client, "collective_rpc_async"): + stage_result = await stage_replica.client.collective_rpc_async( method=method, timeout=timeout, args=args, @@ -1063,13 +1039,15 @@ async def _handle_collective_rpc(self, msg: dict[str, Any]) -> None: stage_result = { "supported": False, "todo": True, - "reason": (f"{replica.client.__class__.__name__}.collective_rpc_async is not implemented yet"), + "reason": ( + f"{stage_replica.client.__class__.__name__}.collective_rpc_async is not implemented yet" + ), } except Exception as exc: logger.exception( "[Orchestrator] collective_rpc failed: stage=%s replica=%s method=%s", - replica.logical_stage_id, - replica.replica_index, + stage_replica.logical_stage_id, + stage_replica.replica_index, method, ) stage_result = { @@ -1098,18 +1076,18 @@ def _shutdown_stages(self) -> None: total = sum(pool.num_replicas for pool in self.stage_pools) 
logger.info("[Orchestrator] Shutting down all %d client(s)", total) for pool in self.stage_pools: - for replica in pool.replicas: + for stage_replica in pool.replicas: try: - replica.client.shutdown() + stage_replica.client.shutdown() logger.info( "[Orchestrator] Stage %d replica %d shut down", - replica.logical_stage_id, - replica.replica_index, + stage_replica.logical_stage_id, + stage_replica.replica_index, ) except Exception as e: logger.warning( "[Orchestrator] Failed to shutdown stage %d replica %d: %s", - replica.logical_stage_id, - replica.replica_index, + stage_replica.logical_stage_id, + stage_replica.replica_index, e, ) From 24e61f4d7bccb61d020f9020c22da51546a4c7c5 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Thu, 16 Apr 2026 14:52:53 +0800 Subject: [PATCH 64/76] [CI] add qwen image and layered accuracy test (#2772) Signed-off-by: david6666666 <530634352@qq.com> --- tests/e2e/accuracy/test_qwen_image.py | 124 ++++++++++++++ tests/e2e/accuracy/test_qwen_image_layered.py | 151 ++++++++++++++++++ tests/e2e/accuracy/utils.py | 47 ++++-- 3 files changed, 313 insertions(+), 9 deletions(-) create mode 100644 tests/e2e/accuracy/test_qwen_image.py create mode 100644 tests/e2e/accuracy/test_qwen_image_layered.py diff --git a/tests/e2e/accuracy/test_qwen_image.py b/tests/e2e/accuracy/test_qwen_image.py new file mode 100644 index 00000000000..e73195017aa --- /dev/null +++ b/tests/e2e/accuracy/test_qwen_image.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +import base64 +import gc +import io +import os +from pathlib import Path + +import pytest +import requests +import torch +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from PIL import Image + +from tests.conftest import ( + OmniServer, + _run_post_test_cleanup, + _run_pre_test_cleanup, +) +from tests.e2e.accuracy.utils import assert_similarity, model_output_dir +from tests.utils import hardware_test + +MODEL_ID = "Qwen/Qwen-Image" 
+MODEL_ENV_VAR = "QWEN_IMAGE_MODEL" +PROMPT = "A photo of a cat sitting on a laptop keyboard, digital art style." +NEGATIVE_PROMPT = "blurry, low quality" +WIDTH = 512 +HEIGHT = 512 +NUM_INFERENCE_STEPS = 20 +TRUE_CFG_SCALE = 4.0 +SEED = 42 +SSIM_THRESHOLD = 0.97 +PSNR_THRESHOLD = 30.0 + + +def _model_name() -> str: + return os.environ.get(MODEL_ENV_VAR, MODEL_ID) + + +def _local_files_only(model: str) -> bool: + return Path(model).exists() + + +def _run_vllm_omni_qwen_image(*, model: str, output_path: Path) -> Image.Image: + server_args = ["--num-gpus", "1", "--stage-init-timeout", "300", "--init-timeout", "900"] + with OmniServer(model, server_args, use_omni=True) as omni_server: + response = requests.post( + f"http://{omni_server.host}:{omni_server.port}/v1/images/generations", + json={ + "model": omni_server.model, + "prompt": PROMPT, + "size": f"{WIDTH}x{HEIGHT}", + "n": 1, + "response_format": "b64_json", + "negative_prompt": NEGATIVE_PROMPT, + "num_inference_steps": NUM_INFERENCE_STEPS, + "true_cfg_scale": TRUE_CFG_SCALE, + "seed": SEED, + }, + timeout=600, + ) + response.raise_for_status() + payload = response.json() + assert len(payload["data"]) == 1 + image_bytes = base64.b64decode(payload["data"][0]["b64_json"]) + image = Image.open(io.BytesIO(image_bytes)).convert("RGB") + image.load() + image.save(output_path) + return image + + +def _run_diffusers_qwen_image(*, model: str, output_path: Path) -> Image.Image: + _run_pre_test_cleanup(enable_force=True) + pipe: DiffusionPipeline | None = None + try: + pipe = DiffusionPipeline.from_pretrained( + model, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + local_files_only=_local_files_only(model), + ).to("cuda") + generator = torch.Generator(device="cuda").manual_seed(SEED) + result = pipe( # pyright: ignore[reportCallIssue] + prompt=PROMPT, + negative_prompt=NEGATIVE_PROMPT, + width=WIDTH, + height=HEIGHT, + num_inference_steps=NUM_INFERENCE_STEPS, + true_cfg_scale=TRUE_CFG_SCALE, + 
generator=generator, + ) + output_image = result.images[0].convert("RGB") + output_image.save(output_path) + return output_image + finally: + if pipe is not None and hasattr(pipe, "maybe_free_model_hooks"): + pipe.maybe_free_model_hooks() + del pipe + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + _run_post_test_cleanup(enable_force=True) + + +@pytest.mark.advanced_model +@pytest.mark.benchmark +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}, num_cards=1) +def test_qwen_image_matches_diffusers(accuracy_artifact_root: Path) -> None: + model = _model_name() + output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID) + + vllm_output = _run_vllm_omni_qwen_image(model=model, output_path=output_dir / "vllm_omni.png") + diffusers_output = _run_diffusers_qwen_image(model=model, output_path=output_dir / "diffusers.png") + + assert_similarity( + model_name=MODEL_ID, + vllm_image=vllm_output, + diffusers_image=diffusers_output, + width=WIDTH, + height=HEIGHT, + ssim_threshold=SSIM_THRESHOLD, + psnr_threshold=PSNR_THRESHOLD, + ) diff --git a/tests/e2e/accuracy/test_qwen_image_layered.py b/tests/e2e/accuracy/test_qwen_image_layered.py new file mode 100644 index 00000000000..04b13df3bb2 --- /dev/null +++ b/tests/e2e/accuracy/test_qwen_image_layered.py @@ -0,0 +1,151 @@ +from __future__ import annotations + +import base64 +import gc +import io +import os +from pathlib import Path + +import pytest +import requests +import torch +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from PIL import Image + +from tests.conftest import ( + OmniServer, + _run_post_test_cleanup, + _run_pre_test_cleanup, +) +from tests.e2e.accuracy.utils import assert_image_sequence_similarity, model_output_dir +from tests.utils import hardware_test + +MODEL_ID = "Qwen/Qwen-Image-Layered" +MODEL_ENV_VAR = "QWEN_IMAGE_LAYERED_MODEL" +PROMPT = "decompose into layers" +NEGATIVE_PROMPT = " " +NUM_INFERENCE_STEPS = 20 +TRUE_CFG_SCALE = 4.0 +SEED = 777 
+LAYERS = 3 +RESOLUTION = 640 +SSIM_THRESHOLD = 0.97 +PSNR_THRESHOLD = 30.0 + + +def _model_name() -> str: + return os.environ.get(MODEL_ENV_VAR, MODEL_ID) + + +def _local_files_only(model: str) -> bool: + return Path(model).exists() + + +def _normalize_layered_images(images: object) -> list[Image.Image]: + if not isinstance(images, list) or not images: + raise AssertionError(f"Unexpected layered output container: {type(images).__name__}") + + first_item = images[0] + if isinstance(first_item, Image.Image): + return [image.convert("RGBA") for image in images if isinstance(image, Image.Image)] + if isinstance(first_item, (list, tuple)): + return [image.convert("RGBA") for image in first_item if isinstance(image, Image.Image)] + raise AssertionError(f"Unexpected layered image element type: {type(first_item).__name__}") + + +def _run_vllm_omni_qwen_image_layered(*, model: str, input_image: Image.Image, output_dir: Path) -> list[Image.Image]: + input_image.save(output_dir / "input.png") + server_args = ["--num-gpus", "1", "--stage-init-timeout", "300", "--init-timeout", "900"] + with OmniServer(model, server_args, use_omni=True) as omni_server: + buffer = io.BytesIO() + input_image.save(buffer, format="PNG") + buffer.seek(0) + response = requests.post( + f"http://{omni_server.host}:{omni_server.port}/v1/images/edits", + data={ + "model": omni_server.model, + "prompt": PROMPT, + "size": "auto", + "n": 1, + "response_format": "b64_json", + "negative_prompt": NEGATIVE_PROMPT, + "num_inference_steps": NUM_INFERENCE_STEPS, + "true_cfg_scale": TRUE_CFG_SCALE, + "seed": SEED, + "layers": LAYERS, + "resolution": RESOLUTION, + }, + files=[("image", ("input.png", buffer, "image/png"))], + timeout=600, + ) + response.raise_for_status() + payload = response.json() + assert len(payload["data"]) == LAYERS + output_images = [] + for item in payload["data"]: + image_bytes = base64.b64decode(item["b64_json"]) + image = Image.open(io.BytesIO(image_bytes)).convert("RGBA") + image.load() 
+ output_images.append(image) + for index, image in enumerate(output_images, start=1): + image.save(output_dir / f"vllm_omni_layer_{index}.png") + return output_images + + +def _run_diffusers_qwen_image_layered(*, model: str, input_image: Image.Image, output_dir: Path) -> list[Image.Image]: + _run_pre_test_cleanup(enable_force=True) + pipe: DiffusionPipeline | None = None + try: + pipe = DiffusionPipeline.from_pretrained( + model, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + local_files_only=_local_files_only(model), + ).to("cuda") + generator = torch.Generator(device="cuda").manual_seed(SEED) + result = pipe( # pyright: ignore[reportCallIssue] + image=input_image, + prompt=PROMPT, + negative_prompt=NEGATIVE_PROMPT, + num_inference_steps=NUM_INFERENCE_STEPS, + true_cfg_scale=TRUE_CFG_SCALE, + generator=generator, + num_images_per_prompt=1, + layers=LAYERS, + resolution=RESOLUTION, + ) + output_images = _normalize_layered_images(result.images) + assert len(output_images) == LAYERS, f"Expected {LAYERS} diffusers layers, got {len(output_images)}" + for index, image in enumerate(output_images, start=1): + image.save(output_dir / f"diffusers_layer_{index}.png") + return output_images + finally: + if pipe is not None and hasattr(pipe, "maybe_free_model_hooks"): + pipe.maybe_free_model_hooks() + del pipe + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + _run_post_test_cleanup(enable_force=True) + + +@pytest.mark.advanced_model +@pytest.mark.benchmark +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}, num_cards=1) +def test_qwen_image_layered_matches_diffusers(accuracy_artifact_root: Path, qwen_bear_image: Image.Image) -> None: + model = _model_name() + output_dir = model_output_dir(accuracy_artifact_root, MODEL_ID) + input_image = qwen_bear_image.convert("RGBA") + + vllm_outputs = _run_vllm_omni_qwen_image_layered(model=model, input_image=input_image, output_dir=output_dir) + diffusers_outputs = 
_run_diffusers_qwen_image_layered(model=model, input_image=input_image, output_dir=output_dir) + + assert_image_sequence_similarity( + model_name=MODEL_ID, + vllm_images=vllm_outputs, + diffusers_images=diffusers_outputs, + ssim_threshold=SSIM_THRESHOLD, + psnr_threshold=PSNR_THRESHOLD, + compare_mode="RGBA", + ) diff --git a/tests/e2e/accuracy/utils.py b/tests/e2e/accuracy/utils.py index eb0eea757ee..d722b69b011 100644 --- a/tests/e2e/accuracy/utils.py +++ b/tests/e2e/accuracy/utils.py @@ -21,13 +21,14 @@ def assert_similarity( model_name: str, vllm_image: Image.Image, diffusers_image: Image.Image, - width: int, - height: int, ssim_threshold: float, psnr_threshold: float, + width: int | None = None, + height: int | None = None, + compare_mode: str = "RGB", ) -> None: - requested_size = (width, height) - if diffusers_image.size != requested_size: + requested_size = (width, height) if width is not None and height is not None else None + if requested_size is not None and diffusers_image.size != requested_size: pytest.skip( "Skipping as diffusers baseline output is corrupt and not comparable: " f"dimensions do not match requested size; requested={requested_size}, got={diffusers_image.size}." 
@@ -37,7 +38,11 @@ def assert_similarity( f"Online and diffusers output sizes mismatch: online={vllm_image.size}, diffusers={diffusers_image.size}" ) - ssim_score, psnr_score = compute_image_ssim_psnr(prediction=vllm_image, reference=diffusers_image) + ssim_score, psnr_score = compute_image_ssim_psnr( + prediction=vllm_image, + reference=diffusers_image, + compare_mode=compare_mode, + ) print(f"{model_name} similarity metrics:") print(f" SSIM: value={ssim_score:.6f}, threshold>={ssim_threshold:.6f}, range=[-1, 1], higher_is_better=True") print( @@ -52,13 +57,37 @@ def assert_similarity( ) +def assert_image_sequence_similarity( + *, + model_name: str, + vllm_images: list[Image.Image], + diffusers_images: list[Image.Image], + ssim_threshold: float, + psnr_threshold: float, + compare_mode: str = "RGB", +) -> None: + assert len(vllm_images) == len(diffusers_images), ( + f"Output image count mismatch for {model_name}: online={len(vllm_images)}, diffusers={len(diffusers_images)}" + ) + for index, (vllm_image, diffusers_image) in enumerate(zip(vllm_images, diffusers_images, strict=True), start=1): + assert_similarity( + model_name=f"{model_name}[layer={index}]", + vllm_image=vllm_image, + diffusers_image=diffusers_image, + ssim_threshold=ssim_threshold, + psnr_threshold=psnr_threshold, + compare_mode=compare_mode, + ) + + def compute_image_ssim_psnr( *, prediction: Image.Image, reference: Image.Image, + compare_mode: str = "RGB", ) -> tuple[float, float]: - pred_tensor = _pil_to_batched_tensor(prediction) - ref_tensor = _pil_to_batched_tensor(reference) + pred_tensor = _pil_to_batched_tensor(prediction, compare_mode=compare_mode) + ref_tensor = _pil_to_batched_tensor(reference, compare_mode=compare_mode) ssim_metric = StructuralSimilarityIndexMeasure(data_range=1.0) psnr_metric = PeakSignalNoiseRatio(data_range=1.0) @@ -68,7 +97,7 @@ def compute_image_ssim_psnr( return ssim_value, psnr_value -def _pil_to_batched_tensor(image: Image.Image) -> torch.Tensor: - array = 
np.asarray(image.convert("RGB"), dtype=np.float32) / 255.0 +def _pil_to_batched_tensor(image: Image.Image, *, compare_mode: str) -> torch.Tensor: + array = np.asarray(image.convert(compare_mode), dtype=np.float32) / 255.0 tensor = torch.from_numpy(array).permute(2, 0, 1).unsqueeze(0) return tensor From 8d1ce6340adb8fab94c2c022428eae31d74c5764 Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Thu, 16 Apr 2026 15:08:13 +0800 Subject: [PATCH 65/76] refactor: init replica in stage_pool part2 Signed-off-by: ZhengWG --- vllm_omni/engine/async_omni_engine.py | 225 ++++++-------------------- vllm_omni/engine/stage_pool.py | 147 +++++++++++++---- 2 files changed, 159 insertions(+), 213 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index e44b2d75580..9da0cd66633 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -71,17 +71,15 @@ extract_stage_metadata, finalize_initialized_stages, get_stage_connector_spec, - get_stage_tp_size, initialize_diffusion_stage, inject_kv_stage_info, load_omni_transfer_config_for_model, prepare_engine_environment, release_device_locks, setup_stage_devices, - split_devices_for_replicas, terminate_alive_proc, ) -from vllm_omni.engine.stage_pool import build_stage_pools +from vllm_omni.engine.stage_pool import StagePool, compute_replica_layout from vllm_omni.entrypoints.utils import load_and_resolve_stage_configs from vllm_omni.inputs.preprocess import OmniInputPreprocessor from vllm_omni.platforms import current_omni_platform @@ -288,9 +286,8 @@ def __init__( self.num_stages = len(self.stage_configs) stage0_args = getattr(self.stage_configs[0], "engine_args", None) if self.num_stages > 0 else None self.async_chunk = bool(getattr(stage0_args, "async_chunk", False)) - self.stage_clients: list[Any] = [] - self.stage_vllm_configs: list[Any] = [] - self.output_processors: list[MultimodalOutputProcessor | None] = [] + self.stage_pools: list[StagePool] = [] + 
self.stage_clients: list[Any] = [] # flat view for external readers self.input_processor: InputProcessor | None = None self.supported_tasks: tuple[str, ...] = ("generate",) self.default_sampling_params_list: list[Any] = [] @@ -653,145 +650,6 @@ def _attach_llm_stage( logger.info("[AsyncOmniEngine] Stage %s initialized", started.stage_id) return stage_client, output_processor, started.vllm_config, input_processor - def _compute_replica_layout( - self, - ) -> tuple[list[int], dict[int, list[str]], int]: - """Compute per-stage replica counts and device assignments. - - Returns: - replicas_per_stage: num_replicas per logical stage. - replica_devices_map: stage_idx → per-replica device strings - (only for stages with num_replicas > 1). - total_llm_replicas: total LLM replica count across all stages. - """ - replicas_per_stage: list[int] = [] - for stage_cfg in self.stage_configs: - runtime_cfg = getattr(stage_cfg, "runtime", {}) - num_replicas = int( - runtime_cfg.get("num_replicas", 1) - if hasattr(runtime_cfg, "get") - else getattr(runtime_cfg, "num_replicas", 1) - ) - replicas_per_stage.append(max(1, num_replicas)) - - replica_devices_map: dict[int, list[str]] = {} - for logical_id, stage_cfg in enumerate(self.stage_configs): - num_replicas = replicas_per_stage[logical_id] - if num_replicas <= 1: - continue - runtime_cfg = getattr(stage_cfg, "runtime", {}) - devices_str = ( - runtime_cfg.get("devices") if hasattr(runtime_cfg, "get") else getattr(runtime_cfg, "devices", None) - ) - tp_size = get_stage_tp_size(stage_cfg) - replica_devices_map[logical_id] = split_devices_for_replicas( - devices_str, - num_replicas, - tp_size, - logical_id, - ) - logger.info( - "[AsyncOmniEngine] Stage %s: %d replicas, tp=%d, devices split: %s", - logical_id, - num_replicas, - tp_size, - replica_devices_map[logical_id], - ) - - total_llm_replicas = sum( - replicas_per_stage[i] - for i, cfg in enumerate(self.stage_configs) - if getattr(cfg, "stage_type", "llm") != "diffusion" - ) - return 
replicas_per_stage, replica_devices_map, total_llm_replicas - - def _build_flat_client_lists( - self, - *, - num_stages: int, - replicas_per_stage: list[int], - llm_stage_ids: list[int], - stage_attach_results: dict[int, list[Any]], - stage_output_proc_results: dict[int, list[Any]], - stage_vllm_cfg_results: dict[int, list[Any]], - diffusion_clients: dict[int, Any], - input_processor: InputProcessor | None, - ) -> tuple[list[Any], list[Any], list[Any], list[list[int]], list[Any], list[dict[str, Any]]]: - """Build flat client/processor/config lists and finalize stage metadata. - - Returns: - (flat_clients, flat_output_processors, flat_vllm_configs, - logical_stage_to_clients, default_sampling_params_list, - stage_metadata_list) - - TODO(stage-pool): move this assembly into StagePool.from_stage_config - so _initialize_stages only collects pools, not flat lists. - """ - flat_clients: list[Any] = [] - flat_output_processors: list[Any] = [] - flat_vllm_configs: list[Any] = [] - logical_stage_to_clients: list[list[int]] = [] - - logical_stage_clients_for_finalize: list[Any | None] = [None] * num_stages - all_clients: dict[int, list[Any]] = {} - all_output_processors: dict[int, list[Any]] = {} - all_vllm_configs: dict[int, list[Any]] = {} - - for stage_idx in llm_stage_ids: - all_clients[stage_idx] = stage_attach_results[stage_idx] - all_output_processors[stage_idx] = stage_output_proc_results[stage_idx] - all_vllm_configs[stage_idx] = stage_vllm_cfg_results[stage_idx] - logical_stage_clients_for_finalize[stage_idx] = stage_attach_results[stage_idx][0] - - for stage_idx, diff_client in diffusion_clients.items(): - logical_stage_clients_for_finalize[stage_idx] = diff_client - - _, default_sampling_params_list, stage_metadata = finalize_initialized_stages( - logical_stage_clients_for_finalize, - input_processor, - ) - - logical_default_sampling_params: list[Any] = [] - logical_stage_metadata: list[dict[str, Any]] = [] - - for logical_id in range(num_stages): - num_replicas 
= replicas_per_stage[logical_id] - client_indices: list[int] = [] - - if logical_id in diffusion_clients: - ci = len(flat_clients) - client_indices.append(ci) - flat_clients.append(diffusion_clients[logical_id]) - flat_output_processors.append(None) - flat_vllm_configs.append(None) - else: - for replica_idx in range(num_replicas): - ci = len(flat_clients) - client_indices.append(ci) - flat_clients.append(all_clients[logical_id][replica_idx]) - flat_output_processors.append(all_output_processors[logical_id][replica_idx]) - flat_vllm_configs.append(all_vllm_configs[logical_id][replica_idx]) - if num_replicas > 1: - logger.info( - "[AsyncOmniEngine] Stage %s replica %s → client %s (isolated)", - logical_id, - replica_idx, - ci, - ) - - logical_stage_to_clients.append(client_indices) - logical_default_sampling_params.append(default_sampling_params_list[logical_id]) - logical_stage_metadata.append(stage_metadata[logical_id]) - - return ( - flat_clients, - flat_output_processors, - flat_vllm_configs, - logical_stage_to_clients, - logical_default_sampling_params, - logical_stage_metadata, - ) - def _initialize_stages(self, stage_init_timeout: int) -> None: """Initialize stage clients/processors in orchestrator thread and assign to self. @@ -799,12 +657,12 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: 1. Compute replica layout (counts + device splits). 2. Launch all stage engine processes (parallel via ThreadPoolExecutor). 3. Attach launched engines (parallel) and collect clients/processors. - 4. Assemble flat client lists and finalize stage metadata. + 4. Build StagePool list and finalize stage metadata. 
""" device_control_env = current_omni_platform.device_control_env_var num_stages = self.num_stages - replicas_per_stage, replica_devices_map, total_llm_replicas = self._compute_replica_layout() + replicas_per_stage, replica_devices_map, total_llm_replicas = compute_replica_layout(self.stage_configs) input_processor: InputProcessor | None = None llm_stage_ids: list[int] = [] @@ -818,7 +676,7 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: prepare_engine_environment() omni_transfer_config = load_omni_transfer_config_for_model(self.model, self.config_path) - flat_clients: list[Any] = [] # populated by _build_flat_client_lists + stage_pools: list[StagePool] = [] # ------------------------------------------------------------------ # # Single-stage mode: start OmniMasterServer before launching stages. # @@ -1040,25 +898,32 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: if stage0_input_processor is not None: input_processor = stage0_input_processor - # ---- Assemble flat lists + finalize metadata ---- - ( - flat_clients, - flat_output_processors, - flat_vllm_configs, - logical_stage_to_clients, - logical_default_sampling_params, - logical_stage_metadata, - ) = self._build_flat_client_lists( - num_stages=num_stages, - replicas_per_stage=replicas_per_stage, - llm_stage_ids=llm_stage_ids, - stage_attach_results=stage_attach_results, - stage_output_proc_results=stage_output_proc_results, - stage_vllm_cfg_results=stage_vllm_cfg_results, - diffusion_clients=diffusion_clients, - input_processor=input_processor, + # ---- Build StagePool list + finalize metadata ---- + # Use first replica's client per stage for finalize (default sampling params, metadata). 
+ logical_stage_clients_for_finalize: list[Any | None] = [None] * num_stages + for stage_idx in llm_stage_ids: + logical_stage_clients_for_finalize[stage_idx] = stage_attach_results[stage_idx][0] + for stage_idx, diff_client in diffusion_clients.items(): + logical_stage_clients_for_finalize[stage_idx] = diff_client + + _, default_sampling_params_list, stage_metadata_list = finalize_initialized_stages( + logical_stage_clients_for_finalize, + input_processor, ) + for logical_id in range(num_stages): + if logical_id in diffusion_clients: + stage_pools.append(StagePool.from_diffusion_client(logical_id, diffusion_clients[logical_id])) + else: + stage_pools.append( + StagePool.from_attach_results( + logical_id, + clients=stage_attach_results[logical_id], + output_processors=stage_output_proc_results[logical_id], + vllm_configs=stage_vllm_cfg_results[logical_id], + ) + ) + except Exception: for stage_id, futures in llm_launch_futures.items(): for f in futures: @@ -1066,7 +931,11 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: continue started_llm_stages.setdefault(stage_id, []).append(f.result()) # Collect all initialized clients for cleanup - cleanup_clients: list[Any] = list(diffusion_clients.values()) + [c for c in flat_clients if c is not None] + cleanup_clients: list[Any] = list(diffusion_clients.values()) + for pool in stage_pools: + for sr in pool.replicas: + if sr.client is not None: + cleanup_clients.append(sr.client) all_started = [s for stages in started_llm_stages.values() for s in stages] logger.exception( "[AsyncOmniEngine] Stage initialization failed; shutting down %s initialized client(s)", @@ -1080,22 +949,23 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: logger.exception("[AsyncOmniEngine] Failed to stop OmniMasterServer during stage-init cleanup") raise - self.stage_clients = flat_clients - self.output_processors = flat_output_processors - self.stage_vllm_configs = flat_vllm_configs - self.logical_stage_to_clients = 
logical_stage_to_clients + self.stage_pools = stage_pools self.input_processor = input_processor self.prompt_expand_func = prompt_expand_func + + # Derive flat views for external readers (entrypoints/async_omni.py). + self.stage_clients = [sr.client for pool in stage_pools for sr in pool.replicas] + # TODO(Peiqi): Hack here supported_tasks: set[str] = set() - if any(getattr(stage_client, "is_comprehension", False) for stage_client in flat_clients): + if any(getattr(sr.client, "is_comprehension", False) for pool in stage_pools for sr in pool.replicas): supported_tasks.add("generate") - if any(metadata.get("final_output_type") == "audio" for metadata in logical_stage_metadata): + if any(m.get("final_output_type") == "audio" for m in stage_metadata_list): supported_tasks.add("speech") self.supported_tasks = tuple(supported_tasks) if supported_tasks else ("generate",) - self.default_sampling_params_list = logical_default_sampling_params - self.stage_metadata = logical_stage_metadata + self.default_sampling_params_list = list(default_sampling_params_list) + self.stage_metadata = list(stage_metadata_list) def _initialize_janus_queues(self) -> None: """Initialize janus queues inside orchestrator thread loop context.""" @@ -1122,12 +992,7 @@ async def _run_orchestrator() -> None: request_async_queue=self.request_queue.async_q, output_async_queue=self.output_queue.async_q, rpc_async_queue=self.rpc_output_queue.async_q, - stage_pools=build_stage_pools( - self.stage_clients, - self.output_processors, - self.stage_vllm_configs, - self.logical_stage_to_clients, - ), + stage_pools=self.stage_pools, async_chunk=self.async_chunk, ) if not startup_future.done(): diff --git a/vllm_omni/engine/stage_pool.py b/vllm_omni/engine/stage_pool.py index a52cd29683b..d133b566bf8 100644 --- a/vllm_omni/engine/stage_pool.py +++ b/vllm_omni/engine/stage_pool.py @@ -2,18 +2,22 @@ Groups the {client, output_processor, vllm_config} triple of each replica under a single logical stage and centralizes 
replica selection (round-robin -+ per-request affinity). The Orchestrator still owns flat lists as a -compatibility view; StagePool is the canonical lookup going forward. ++ per-request affinity). """ from __future__ import annotations +from collections.abc import Sequence from dataclasses import dataclass from typing import TYPE_CHECKING, Any +from vllm.logger import init_logger + if TYPE_CHECKING: from vllm_omni.engine.orchestrator import OrchestratorRequestState +logger = init_logger(__name__) + @dataclass(eq=False) class StageReplica: @@ -47,6 +51,57 @@ def __init__( self.replicas: list[StageReplica] = replicas self._rr_cursor = 0 + # ---- Construction helpers ---- + + @classmethod + def from_attach_results( + cls, + logical_stage_id: int, + clients: Sequence[Any], + output_processors: Sequence[Any], + vllm_configs: Sequence[Any], + ) -> StagePool: + """Build a pool from parallel lists returned by _attach_llm_stage. + + Each positional index corresponds to one replica of the same logical + stage. The first replica's ``client.stage_type`` is used as the + pool-level stage_type. + """ + replicas = [ + StageReplica( + logical_stage_id=logical_stage_id, + replica_index=ri, + client=clients[ri], + output_processor=output_processors[ri], + vllm_config=vllm_configs[ri], + ) + for ri in range(len(clients)) + ] + stage_type = getattr(clients[0], "stage_type", None) if clients else None + return cls(logical_stage_id, stage_type, replicas) + + @classmethod + def from_diffusion_client( + cls, + logical_stage_id: int, + client: Any, + ) -> StagePool: + """Build a single-replica pool for a diffusion stage. + + Diffusion stages have no output_processor or vllm_config on the + orchestrator side. 
+ """ + replica = StageReplica( + logical_stage_id=logical_stage_id, + replica_index=0, + client=client, + output_processor=None, + vllm_config=None, + ) + return cls(logical_stage_id, "diffusion", [replica]) + + # ---- Selection / admission ---- + @property def num_replicas(self) -> int: return len(self.replicas) @@ -97,41 +152,67 @@ def admit( Atomically couples replica selection with output_processor registration so that "which replica will serve this request" and "which processor - knows about this request" are the same by construction. Call sites - must follow up with ``replica.client.add_request_async(request)`` and - on submission failure call ``replica.output_processor.abort_requests - ([request.request_id], internal=False)`` to roll back the registration. + knows about this request" are the same by construction. """ - replica = self.select_replica(req_state, affinity_from=affinity_from) - replica.output_processor.add_request( + stage_replica = self.select_replica(req_state, affinity_from=affinity_from) + stage_replica.output_processor.add_request( request=request, prompt=prompt_text, parent_req=None, request_index=0, queue=None, ) - return replica - - -def build_stage_pools( - stage_clients: list[Any], - output_processors: list[Any], - stage_vllm_configs: list[Any], - logical_stage_to_clients: list[list[int]], -) -> list[StagePool]: - """Assemble StagePool list from the flat-list view owned by the engine.""" - pools: list[StagePool] = [] - for logical_id, client_indices in enumerate(logical_stage_to_clients): - replicas = [ - StageReplica( - logical_stage_id=logical_id, - replica_index=ri, - client=stage_clients[ci], - output_processor=output_processors[ci], - vllm_config=stage_vllm_configs[ci], - ) - for ri, ci in enumerate(client_indices) - ] - stage_type = getattr(stage_clients[client_indices[0]], "stage_type", None) - pools.append(StagePool(logical_id, stage_type, replicas)) - return pools + return stage_replica + + +def compute_replica_layout( + 
stage_configs: Sequence[Any], +) -> tuple[list[int], dict[int, list[str]], int]: + """Compute per-stage replica counts and device assignments. + + Returns: + replicas_per_stage: num_replicas per logical stage. + replica_devices_map: stage_idx -> per-replica device strings + (only for stages with num_replicas > 1). + total_llm_replicas: total LLM replica count across all stages. + """ + from vllm_omni.engine.stage_init_utils import get_stage_tp_size, split_devices_for_replicas + + replicas_per_stage: list[int] = [] + for stage_cfg in stage_configs: + runtime_cfg = getattr(stage_cfg, "runtime", {}) + num_replicas = int( + runtime_cfg.get("num_replicas", 1) + if hasattr(runtime_cfg, "get") + else getattr(runtime_cfg, "num_replicas", 1) + ) + replicas_per_stage.append(max(1, num_replicas)) + + replica_devices_map: dict[int, list[str]] = {} + for logical_id, stage_cfg in enumerate(stage_configs): + num_replicas = replicas_per_stage[logical_id] + if num_replicas <= 1: + continue + runtime_cfg = getattr(stage_cfg, "runtime", {}) + devices_str = ( + runtime_cfg.get("devices") if hasattr(runtime_cfg, "get") else getattr(runtime_cfg, "devices", None) + ) + tp_size = get_stage_tp_size(stage_cfg) + replica_devices_map[logical_id] = split_devices_for_replicas( + devices_str, + num_replicas, + tp_size, + logical_id, + ) + logger.info( + "[StagePool] Stage %s: %d replicas, tp=%d, devices split: %s", + logical_id, + num_replicas, + tp_size, + replica_devices_map[logical_id], + ) + + total_llm_replicas = sum( + replicas_per_stage[i] for i, cfg in enumerate(stage_configs) if getattr(cfg, "stage_type", "llm") != "diffusion" + ) + return replicas_per_stage, replica_devices_map, total_llm_replicas From 4d816ff1ded1e35393d6175d8f0dbbe07d570add Mon Sep 17 00:00:00 2001 From: NATURE Date: Thu, 16 Apr 2026 16:25:13 +0800 Subject: [PATCH 66/76] [Feature] Bagel: Support tp+cfg parallel using mooncake transfer engine connector (#2705) Signed-off-by: natureofnature Co-authored-by: Hongsheng 
Liu --- .../omni_connectors/test_tp_rank_aware.py | 716 +++++++++++++++++ .../test_async_omni_engine_stage_init.py | 69 ++ tests/engine/test_single_stage_mode.py | 2 + .../distributed/group_coordinator.py | 5 +- .../diffusion/models/bagel/pipeline_bagel.py | 36 +- .../omni_connectors/kv_transfer_manager.py | 721 ++++++++++++------ .../omni_connectors/utils/kv_utils.py | 367 ++++++++- vllm_omni/engine/async_omni_engine.py | 16 +- vllm_omni/engine/stage_engine_core_client.py | 7 +- vllm_omni/engine/stage_init_utils.py | 116 ++- vllm_omni/entrypoints/openai/serving_chat.py | 53 +- vllm_omni/inputs/data.py | 4 + 12 files changed, 1846 insertions(+), 266 deletions(-) create mode 100644 tests/distributed/omni_connectors/test_tp_rank_aware.py diff --git a/tests/distributed/omni_connectors/test_tp_rank_aware.py b/tests/distributed/omni_connectors/test_tp_rank_aware.py new file mode 100644 index 00000000000..d4793479aaf --- /dev/null +++ b/tests/distributed/omni_connectors/test_tp_rank_aware.py @@ -0,0 +1,716 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for rank-aware KV transfer (TP > 1) and heterogeneous TP support. 
+ +Covers: +- _build_rank_aware_send_keys / _build_rank_aware_recv_keys +- _get_kv_source_ranks / _get_kv_target_ranks / get_kv_connector_key +- update_sender_info storing base host/port +- receive path constructing per-rank metadata for connector.get() +- Mooncake connector _query_metadata_at and partial-metadata get() path +""" + +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from vllm_omni.distributed.omni_connectors.kv_transfer_manager import ( + KVCacheTransferData, + OmniKVCacheConfig, + OmniKVTransferManager, +) +from vllm_omni.distributed.omni_connectors.utils.initialization import ( + KV_RANK_PORT_STRIDE, +) +from vllm_omni.distributed.omni_connectors.utils.kv_utils import ( + KVTPTopology, + build_rank_aware_recv_keys, + build_rank_aware_send_keys, + get_kv_connector_key, + get_kv_source_ranks, + get_kv_target_ranks, + merge_received_rank_shards, + slice_received_rank_shard, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] + + +def _make_manager( + from_tp: int = 1, + to_tp: int = 1, + local_rank: int = 0, + from_stage: str = "stage0", + to_stage: str = "stage1", + stage_id: str = "stage1", + need_recv: bool = True, + need_send: bool = False, + recv_timeout: float = 0.3, +) -> OmniKVTransferManager: + """Build a manager with TP params injected, bypassing torch.distributed.""" + config = OmniKVCacheConfig( + connector_config={"type": "mock"}, + from_stage=from_stage, + to_stage=to_stage, + stage_id=stage_id, + need_recv_cache=need_recv, + need_send_cache=need_send, + recv_timeout=recv_timeout, + from_tp=from_tp, + to_tp=to_tp, + ) + with ( + patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_local_tp_rank", return_value=local_rank), + patch( + "vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_tp_world_size", + return_value=max(from_tp, to_tp), + ), + ): + mgr = OmniKVTransferManager(config) + return mgr + + +def _make_payload(head_values: 
list[float], request_id: str = "req-1") -> dict: + head_tensor = torch.tensor(head_values, dtype=torch.float32).view(1, len(head_values), 1).repeat(2, 1, 1) + return { + "request_id": request_id, + "layer_blocks": { + "key_cache": [head_tensor.clone()], + "value_cache": [(head_tensor + 100).clone()], + }, + "block_ids": [0], + "metadata": {"seq_len": 2}, + } + + +def _make_transfer_data(head_values: list[float], request_id: str = "req-1") -> KVCacheTransferData: + payload = _make_payload(head_values, request_id=request_id) + return KVCacheTransferData( + request_id=request_id, + layer_blocks=payload["layer_blocks"], + block_ids=payload["block_ids"], + metadata=payload["metadata"], + ) + + +# ── Key format helper ──────────────────────────────────────────────── + + +class TestConnectorKeyFormat: + def test_key_format_matches_pr2677(self): + key = get_kv_connector_key("req-1", "stage0", 0, 1, 2) + assert key == "req-1_stage0_0_1_2" + + def test_key_fields_are_positional(self): + key = get_kv_connector_key("r", "s", 5, 3, 7) + parts = key.split("_") + assert parts == ["r", "s", "5", "3", "7"] + + +# ── Source / target rank mapping ───────────────────────────────────── + + +class TestRankMapping: + """Verify get_kv_target_ranks and get_kv_source_ranks for various TP configs.""" + + def test_homogeneous_tp2_rank0(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=0) + assert get_kv_target_ranks(topo) == [0] + assert get_kv_source_ranks(topo) == [0] + + def test_homogeneous_tp2_rank1(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=1) + assert get_kv_target_ranks(topo) == [1] + assert get_kv_source_ranks(topo) == [1] + + def test_homogeneous_tp4_rank3(self): + topo = KVTPTopology(source_tp_size=4, target_tp_size=4, local_rank=3) + assert get_kv_target_ranks(topo) == [3] + assert get_kv_source_ranks(topo) == [3] + + def test_sender_gt_receiver_tp4_to_tp2_rank0(self): + """Receiver rank 0 should receive from sender rank 0 
and 1.""" + topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=0) + assert get_kv_source_ranks(topo) == [0, 1] + + def test_sender_gt_receiver_tp4_to_tp2_rank1(self): + """Receiver rank 1 should receive from sender rank 2 and 3.""" + topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=1) + assert get_kv_source_ranks(topo) == [2, 3] + + def test_sender_lt_receiver_tp2_to_tp4_rank0(self): + """Sender rank 0 should send to receiver ranks 0 and 1.""" + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=0) + assert get_kv_target_ranks(topo) == [0, 1] + + def test_sender_lt_receiver_tp2_to_tp4_rank1(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=1) + assert get_kv_target_ranks(topo) == [2, 3] + + def test_receiver_lt_sender_source_ranks(self): + """Receiver rank 0 with tp2_to_tp4 should source from rank 0 only.""" + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=0) + assert get_kv_source_ranks(topo) == [0] + + def test_invalid_topology_raises(self): + topo = KVTPTopology(source_tp_size=3, target_tp_size=2, local_rank=0) + with pytest.raises(ValueError, match="divisible"): + get_kv_source_ranks(topo) + + +# ── _build_rank_aware_recv_keys ────────────────────────────────────── + + +class TestBuildRankAwareRecvKeys: + """Verify build_rank_aware_recv_keys returns (key, from_rank) tuples.""" + + def test_tp1_returns_legacy_key_with_none_rank(self): + topo = KVTPTopology(source_tp_size=1, target_tp_size=1, local_rank=0) + pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) + assert len(pairs) == 1 + key, rank = pairs[0] + assert key == "omni_stage0_to_stage1_kv_cache_req-1" + assert rank is None + + def test_homogeneous_tp2_rank0(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=0) + pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) + assert len(pairs) == 1 + key, rank = pairs[0] + assert key == "req-1_stage0_0_0_0" + 
assert rank == 0 + + def test_homogeneous_tp2_rank1(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=1) + pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) + assert len(pairs) == 1 + key, rank = pairs[0] + assert key == "req-1_stage0_0_1_1" + assert rank == 1 + + def test_heterogeneous_tp4_to_tp2_rank0_gets_two_keys(self): + """Receiver rank 0 with source_tp=4, target_tp=2 should get 2 keys.""" + topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=0) + pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) + assert len(pairs) == 2 + + keys = [k for k, _ in pairs] + ranks = [r for _, r in pairs] + assert keys == ["req-1_stage0_0_0_0", "req-1_stage0_0_1_0"] + assert ranks == [0, 1] + + def test_heterogeneous_tp4_to_tp2_rank1_gets_two_keys(self): + topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=1) + pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) + assert len(pairs) == 2 + + ranks = [r for _, r in pairs] + assert ranks == [2, 3] + + def test_heterogeneous_tp2_to_tp4_rank2_gets_one_key(self): + """Receiver rank 2 with source_tp=2, target_tp=4 should get 1 key from sender rank 1.""" + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=2) + pairs = build_rank_aware_recv_keys("req-1", "stage0", "stage1", topo) + assert len(pairs) == 1 + key, rank = pairs[0] + assert rank == 1 + assert key == "req-1_stage0_0_1_2" + + +# ── _build_rank_aware_send_keys ────────────────────────────────────── + + +class TestBuildRankAwareSendKeys: + def test_tp1_returns_legacy_key(self): + topo = KVTPTopology(source_tp_size=1, target_tp_size=1, local_rank=0) + keys = build_rank_aware_send_keys("req-1", "stage0", "stage1", topo) + assert keys == ["omni_stage0_to_stage1_kv_cache_req-1"] + + def test_homogeneous_tp2_rank0(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=2, local_rank=0) + keys = build_rank_aware_send_keys("req-1", "stage0", "stage1", 
topo) + assert keys == ["req-1_stage0_0_0_0"] + + def test_sender_lt_receiver_tp2_to_tp4_rank0_sends_two_keys(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=0) + keys = build_rank_aware_send_keys("req-1", "stage0", "stage1", topo) + assert len(keys) == 2 + assert keys == ["req-1_stage0_0_0_0", "req-1_stage0_0_0_1"] + + +# ── update_sender_info stores base host/port ───────────────────────── + + +class TestUpdateSenderInfoBase: + def test_stores_base_host_and_port(self): + mgr = _make_manager(from_tp=2, to_tp=2, local_rank=0) + mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) + + assert mgr._sender_base_host == "10.0.0.1" + assert mgr._sender_base_zmq_port == 50151 + + def test_rank1_adjusts_default_port_but_preserves_base(self): + mgr = _make_manager(from_tp=2, to_tp=2, local_rank=1) + mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) + + assert mgr._sender_base_host == "10.0.0.1" + assert mgr._sender_base_zmq_port == 50151 + expected_adjusted = 50151 + 1 * KV_RANK_PORT_STRIDE + assert mgr.config.connector_config["sender_zmq_port"] == expected_adjusted + + def test_nested_sender_info_resolves_correctly(self): + """Nested sender_info keyed by integer stage id should resolve + using recv_stages (engine_input_source → recv_from).""" + config = OmniKVCacheConfig( + connector_config={"type": "mock"}, + stage_id=2, + engine_input_source=[1], + need_recv_cache=True, + from_tp=2, + to_tp=2, + ) + with ( + patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_local_tp_rank", return_value=0), + patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_tp_world_size", return_value=2), + ): + mgr = OmniKVTransferManager(config) + mgr.update_sender_info( + { + 0: {"host": "10.0.0.1", "zmq_port": 50151}, + 1: {"host": "10.0.0.2", "zmq_port": 50152}, + } + ) + assert mgr._sender_base_host == "10.0.0.2" + assert mgr._sender_base_zmq_port == 50152 + + +# ── receive path constructs per-rank metadata 
──────────────────────── + + +class TestReceiveConstructsMetadata: + """Verify that receive_kv_cache_for_request passes metadata with + correct (host, port) to connector.get() for heterogeneous TP.""" + + def test_tp1_no_metadata_passed(self): + """TP=1: connector.get() should be called WITHOUT metadata.""" + mgr = _make_manager(from_tp=1, to_tp=1, local_rank=0, recv_timeout=0.05) + mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) + + calls = [] + + class _Connector: + def get(self, from_stage, to_stage, get_key, metadata=None): + calls.append({"key": get_key, "metadata": metadata}) + return None + + mgr._connector = _Connector() + mgr.receive_kv_cache_for_request("req-1") + + assert len(calls) > 0 + assert calls[0]["metadata"] is None + + def test_homogeneous_tp2_rank0_passes_metadata(self): + """TP=2 rank 0: metadata should point to sender rank 0's port.""" + mgr = _make_manager(from_tp=2, to_tp=2, local_rank=0, recv_timeout=0.05) + mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) + + calls = [] + + class _Connector: + def get(self, from_stage, to_stage, get_key, metadata=None): + calls.append({"key": get_key, "metadata": metadata}) + return None + + mgr._connector = _Connector() + mgr.receive_kv_cache_for_request("req-1") + + assert len(calls) > 0 + meta = calls[0]["metadata"] + assert meta is not None + assert meta["source_host"] == "10.0.0.1" + assert meta["source_port"] == 50151 + 0 * KV_RANK_PORT_STRIDE + + def test_homogeneous_tp2_rank1_passes_metadata_with_offset(self): + mgr = _make_manager(from_tp=2, to_tp=2, local_rank=1, recv_timeout=0.05) + mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) + + calls = [] + + class _Connector: + def get(self, from_stage, to_stage, get_key, metadata=None): + calls.append({"key": get_key, "metadata": metadata}) + return None + + mgr._connector = _Connector() + mgr.receive_kv_cache_for_request("req-1") + + meta = calls[0]["metadata"] + assert meta["source_port"] == 50151 + 1 * 
KV_RANK_PORT_STRIDE + + def test_heterogeneous_tp4_to_tp2_rank0_multiple_metadata(self): + """Receiver rank 0 with source_tp=4, target_tp=2 should call get() with + two different metadata entries for sender ranks 0 and 1.""" + mgr = _make_manager(from_tp=4, to_tp=2, local_rank=0, recv_timeout=0.05) + mgr.update_sender_info({"host": "10.0.0.1", "zmq_port": 50151}) + + calls = [] + + class _Connector: + def get(self, from_stage, to_stage, get_key, metadata=None): + calls.append({"key": get_key, "metadata": metadata}) + return None + + mgr._connector = _Connector() + mgr.receive_kv_cache_for_request("req-1") + + seen_ports = set() + for c in calls: + if c["metadata"]: + seen_ports.add(c["metadata"]["source_port"]) + expected_ports = { + 50151 + 0 * KV_RANK_PORT_STRIDE, + 50151 + 1 * KV_RANK_PORT_STRIDE, + } + assert expected_ports.issubset(seen_ports) + + +# ── Mooncake connector _query_metadata_at ──────────────────────────── + + +class TestMooncakeQueryMetadataAt: + """Test the connector's _query_metadata_at method and partial-metadata + path in get() without requiring real RDMA/Mooncake.""" + + def test_query_metadata_at_returns_full_metadata(self): + """Mock the ZMQ interaction to verify _query_metadata_at returns + complete metadata including data_size.""" + + try: + from vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector import ( + MooncakeTransferEngineConnector, + QueryResponse, + ) + except ImportError: + pytest.skip("Mooncake not available") + + import msgspec + + connector = MagicMock(spec=MooncakeTransferEngineConnector) + connector._get_req_socket = MagicMock() + + mock_socket = MagicMock() + resp = QueryResponse(request_id="test_key@s0_s1", data_size=4096, is_fast_path=True) + mock_socket.recv.return_value = msgspec.msgpack.encode(resp) + connector._get_req_socket.return_value = mock_socket + + result = MooncakeTransferEngineConnector._query_metadata_at( + connector, + "test_key@s0_s1", + "10.0.0.1", + 50151, + ) + + 
assert result is not None + assert result["source_host"] == "10.0.0.1" + assert result["source_port"] == 50151 + assert result["data_size"] == 4096 + assert result["is_fast_path"] is True + + def test_query_metadata_at_returns_none_on_not_found(self): + try: + from vllm_omni.distributed.omni_connectors.connectors.mooncake_transfer_engine_connector import ( + INFO_NOT_FOUND, + MooncakeTransferEngineConnector, + ) + except ImportError: + pytest.skip("Mooncake not available") + + connector = MagicMock(spec=MooncakeTransferEngineConnector) + mock_socket = MagicMock() + mock_socket.recv.return_value = INFO_NOT_FOUND + connector._get_req_socket.return_value = mock_socket + + result = MooncakeTransferEngineConnector._query_metadata_at( + connector, + "test_key@s0_s1", + "10.0.0.1", + 50151, + ) + assert result is None + + +# ── Merge / slice hooks ────────────────────────────────────────────── + + +class TestMergeSliceHooks: + def test_single_shard_passes_through(self): + payload = {"layer_blocks": {"key_cache": [1]}} + assert merge_received_rank_shards([payload]) == payload + + def test_default_merger_concats_head_dim(self): + p0 = _make_payload([0.0]) + p1 = _make_payload([1.0]) + result = merge_received_rank_shards([p0, p1]) + key_cache = result["layer_blocks"]["key_cache"][0] + value_cache = result["layer_blocks"]["value_cache"][0] + assert key_cache.shape == (2, 2, 1) + assert value_cache.shape == (2, 2, 1) + assert torch.equal(key_cache[:, :, 0], torch.tensor([[0.0, 1.0], [0.0, 1.0]])) + assert torch.equal(value_cache[:, :, 0], torch.tensor([[100.0, 101.0], [100.0, 101.0]])) + + def test_custom_merger_hook_called(self): + merged = {"merged": True} + assert merge_received_rank_shards([{}, {}], merger=lambda payloads: merged) == merged + + def test_slicer_hook_called(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=0) + sliced = {"sliced": True} + assert slice_received_rank_shard({"full": True}, topo, slicer=lambda payload: sliced) == sliced 
+ + def test_default_slicer_extracts_rank_local_heads(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=1) + payload = _make_payload([0.0, 1.0]) + result = slice_received_rank_shard(payload, topo) + key_cache = result["layer_blocks"]["key_cache"][0] + value_cache = result["layer_blocks"]["value_cache"][0] + assert key_cache.shape == (2, 1, 1) + assert value_cache.shape == (2, 1, 1) + assert torch.equal(key_cache[:, :, 0], torch.tensor([[1.0], [1.0]])) + assert torch.equal(value_cache[:, :, 0], torch.tensor([[101.0], [101.0]])) + + def test_presliced_payload_is_not_sliced_twice(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=1) + payload = _make_payload([1.0]) + payload["metadata"]["tp_head_slice"] = {"applied": True, "target_rank": 1} + result = slice_received_rank_shard(payload, topo) + assert result is payload + + def test_round_trip_merge_from_tp4_to_tp2(self): + topo = KVTPTopology(source_tp_size=4, target_tp_size=2, local_rank=1) + source_ranks = get_kv_source_ranks(topo) + payloads = [_make_payload([float(rank)]) for rank in source_ranks] + result = merge_received_rank_shards(payloads) + key_cache = result["layer_blocks"]["key_cache"][0] + assert torch.equal(key_cache[:, :, 0], torch.tensor([[2.0, 3.0], [2.0, 3.0]])) + + def test_round_trip_slice_from_tp2_to_tp4(self): + topo = KVTPTopology(source_tp_size=2, target_tp_size=4, local_rank=3) + payload = _make_payload([2.0, 3.0]) + result = slice_received_rank_shard(payload, topo) + key_cache = result["layer_blocks"]["key_cache"][0] + assert torch.equal(key_cache[:, :, 0], torch.tensor([[3.0], [3.0]])) + + +class TestSenderSideSlicing: + def test_transfer_slices_before_sending_to_multiple_targets(self): + mgr = _make_manager( + from_tp=2, + to_tp=4, + local_rank=0, + need_send=True, + need_recv=False, + ) + sent_payloads = [] + + class _Connector: + supports_raw_data = False + + def put(self, from_stage, to_stage, put_key, data): + 
sent_payloads.append((put_key, KVCacheTransferData.from_bytes(data))) + return True, len(data), {} + + mgr._connector = _Connector() + mgr._transfer_kv_cache(_make_transfer_data([0.0, 1.0]), "req-1") + + assert [key for key, _ in sent_payloads] == ["req-1_stage0_0_0_0", "req-1_stage0_0_0_1"] + assert sent_payloads[0][1]["layer_blocks"]["key_cache"][0].shape == (2, 1, 1) + assert sent_payloads[1][1]["layer_blocks"]["key_cache"][0].shape == (2, 1, 1) + assert torch.equal( + sent_payloads[0][1]["layer_blocks"]["key_cache"][0][:, :, 0], + torch.tensor([[0.0], [0.0]]), + ) + assert torch.equal( + sent_payloads[1][1]["layer_blocks"]["key_cache"][0][:, :, 0], + torch.tensor([[1.0], [1.0]]), + ) + assert sent_payloads[0][1]["metadata"]["tp_head_slice"]["target_rank"] == 0 + assert sent_payloads[1][1]["metadata"]["tp_head_slice"]["target_rank"] == 1 + + +class _MockBroadcastGroup: + def __init__(self, world_size: int, rank_in_group: int, broadcast_value=None, recv_value=None): + self.world_size = world_size + self.rank_in_group = rank_in_group + self.broadcast_value = broadcast_value + self.recv_value = recv_value + self.broadcast_calls = [] + self.send_calls = [] + self.recv_calls = [] + self.shm_broadcaster = None + + def broadcast_object(self, obj=None, src: int = 0): + self.broadcast_calls.append((obj, src)) + return self.broadcast_value if self.broadcast_value is not None else obj + + def send_object(self, obj, dst: int): + self.send_calls.append((dst, obj)) + + def recv_object(self, src: int): + self.recv_calls.append(src) + return self.recv_value + + +class TestDistributedReceive: + def test_tp_cfg_leader_receives_then_sends_branch_local_payloads(self): + mgr = _make_manager(from_tp=2, to_tp=4, local_rank=0) + req = SimpleNamespace(request_id="req-1", sampling_params=SimpleNamespace()) + world_group = _MockBroadcastGroup(world_size=4, rank_in_group=2) + cfg_group = _MockBroadcastGroup(world_size=3, rank_in_group=0) + + def _receive(req_obj, cfg_func, target_device): 
+ req_obj.past_key_values = SimpleNamespace(key_cache=[torch.tensor([1.0])]) + req_obj.kv_metadata = {"source": "leader"} + req_obj.sampling_params.past_key_values = req_obj.past_key_values + req_obj.sampling_params.kv_metadata = req_obj.kv_metadata + req_obj.sampling_params.cfg_text_past_key_values = SimpleNamespace(key_cache=[torch.tensor([2.0])]) + req_obj.sampling_params.cfg_text_kv_metadata = {"source": "cfg_text"} + req_obj.sampling_params.cfg_img_past_key_values = SimpleNamespace(key_cache=[torch.tensor([3.0])]) + req_obj.sampling_params.cfg_img_kv_metadata = {"source": "cfg_img"} + return True + + mgr.receive_multi_kv_cache = MagicMock(side_effect=_receive) + with ( + patch("vllm_omni.diffusion.distributed.parallel_state.get_world_group", return_value=world_group), + patch( + "vllm_omni.diffusion.distributed.parallel_state.get_classifier_free_guidance_world_size", + return_value=3, + ), + patch( + "vllm_omni.diffusion.distributed.parallel_state.get_classifier_free_guidance_rank", + return_value=0, + ), + patch("vllm_omni.diffusion.distributed.parallel_state.get_cfg_group", return_value=cfg_group), + ): + assert mgr.receive_multi_kv_cache_distributed(req) is True + + mgr.receive_multi_kv_cache.assert_called_once() + assert mgr.receive_multi_kv_cache.call_args.args[2] == torch.device("cpu") + assert req.kv_metadata == {"source": "leader"} + assert cfg_group.broadcast_calls == [] + assert [dst for dst, _ in cfg_group.send_calls] == [1, 2] + rank1_payload = cfg_group.send_calls[0][1] + rank2_payload = cfg_group.send_calls[1][1] + assert torch.equal(rank1_payload["past_key_values"].key_cache[0], torch.tensor([1.0])) + assert torch.equal(rank2_payload["past_key_values"].key_cache[0], torch.tensor([1.0])) + assert rank1_payload["sp.cfg_active_branch"] == "cfg_text" + assert rank2_payload["sp.cfg_active_branch"] == "cfg_img" + assert rank1_payload["sp.cfg_branch_roles"] == ["cfg_text", "cfg_img"] + assert rank2_payload["sp.cfg_branch_roles"] == ["cfg_text", 
"cfg_img"] + assert "sp.cfg_branch_past_key_values" in rank1_payload + assert "sp.cfg_branch_past_key_values" in rank2_payload + assert list(rank1_payload["sp.cfg_branch_past_key_values"].keys()) == ["cfg_text"] + assert list(rank2_payload["sp.cfg_branch_past_key_values"].keys()) == ["cfg_img"] + assert "sp.cfg_text_past_key_values" in rank1_payload + assert "sp.cfg_img_past_key_values" not in rank1_payload + assert "sp.cfg_img_past_key_values" in rank2_payload + assert "sp.cfg_text_past_key_values" not in rank2_payload + + def test_tp_cfg_follower_receives_local_payload_without_receiving(self): + mgr = _make_manager(from_tp=2, to_tp=4, local_rank=1) + req = SimpleNamespace(request_id="req-1", sampling_params=SimpleNamespace()) + world_group = _MockBroadcastGroup(world_size=4, rank_in_group=3) + cfg_payload = { + "past_key_values": SimpleNamespace(key_cache=[torch.tensor([1.0])]), + "kv_metadata": {"source": "main"}, + "sp.past_key_values": SimpleNamespace(key_cache=[torch.tensor([1.0])]), + "sp.kv_metadata": {"source": "main"}, + "sp.cfg_active_branch": "cfg_text", + "sp.cfg_branch_roles": ["cfg_text", "cfg_img"], + "sp.cfg_branch_past_key_values": { + "cfg_text": SimpleNamespace(key_cache=[torch.tensor([2.0])]), + }, + "sp.cfg_branch_kv_metadata": {"cfg_text": {"source": "cfg-text"}}, + "sp.cfg_text_past_key_values": SimpleNamespace(key_cache=[torch.tensor([2.0])]), + } + cfg_group = _MockBroadcastGroup(world_size=2, rank_in_group=1, recv_value=cfg_payload) + + mgr.receive_multi_kv_cache = MagicMock(return_value=True) + with ( + patch("vllm_omni.diffusion.distributed.parallel_state.get_world_group", return_value=world_group), + patch( + "vllm_omni.diffusion.distributed.parallel_state.get_classifier_free_guidance_world_size", + return_value=2, + ), + patch( + "vllm_omni.diffusion.distributed.parallel_state.get_classifier_free_guidance_rank", + return_value=1, + ), + patch("vllm_omni.diffusion.distributed.parallel_state.get_cfg_group", return_value=cfg_group), + ): 
+ assert mgr.receive_multi_kv_cache_distributed(req) is True + + mgr.receive_multi_kv_cache.assert_not_called() + assert req.kv_metadata == {"source": "main"} + assert torch.equal(req.past_key_values.key_cache[0], torch.tensor([1.0])) + assert torch.equal(req.sampling_params.past_key_values.key_cache[0], torch.tensor([1.0])) + assert req.sampling_params.cfg_active_branch == "cfg_text" + assert req.sampling_params.cfg_branch_roles == ["cfg_text", "cfg_img"] + assert torch.equal( + req.sampling_params.cfg_branch_past_key_values["cfg_text"].key_cache[0], + torch.tensor([2.0]), + ) + assert req.sampling_params.cfg_branch_kv_metadata == {"cfg_text": {"source": "cfg-text"}} + assert torch.equal(req.sampling_params.cfg_text_past_key_values.key_cache[0], torch.tensor([2.0])) + assert cfg_group.broadcast_calls == [] + assert cfg_group.recv_calls == [0] + + def test_tp_without_cfg_keeps_independent_receive_path(self): + mgr = _make_manager(from_tp=2, to_tp=2, local_rank=1) + req = SimpleNamespace(request_id="req-1", sampling_params=SimpleNamespace()) + world_group = _MockBroadcastGroup(world_size=2, rank_in_group=1) + mgr.receive_multi_kv_cache = MagicMock(return_value=True) + + with patch("vllm_omni.diffusion.distributed.parallel_state.get_world_group", return_value=world_group): + assert mgr.receive_multi_kv_cache_distributed(req, target_device=torch.device("cpu")) is True + + mgr.receive_multi_kv_cache.assert_called_once_with(req, None, torch.device("cpu")) + + +# ── TP auto-detect ─────────────────────────────────────────────────── + + +class TestAutoDetectTP: + def test_auto_detect_when_config_defaults(self): + """When config from_tp/to_tp == 1 (default), manager should auto-detect.""" + config = OmniKVCacheConfig( + connector_config={"type": "mock"}, + from_stage="s0", + stage_id="s1", + need_recv_cache=True, + ) + with ( + patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_local_tp_rank", return_value=0), + 
patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_tp_world_size", return_value=4), + ): + mgr = OmniKVTransferManager(config) + assert mgr._tp_topo.source_tp_size == 4 + assert mgr._tp_topo.target_tp_size == 4 + + def test_explicit_tp_overrides_auto_detect(self): + config = OmniKVCacheConfig( + connector_config={"type": "mock"}, + from_stage="s0", + stage_id="s1", + need_recv_cache=True, + from_tp=2, + to_tp=4, + ) + with ( + patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_local_tp_rank", return_value=0), + patch("vllm_omni.distributed.omni_connectors.kv_transfer_manager.get_tp_world_size", return_value=8), + ): + mgr = OmniKVTransferManager(config) + assert mgr._tp_topo.source_tp_size == 2 + assert mgr._tp_topo.target_tp_size == 4 diff --git a/tests/engine/test_async_omni_engine_stage_init.py b/tests/engine/test_async_omni_engine_stage_init.py index 84b0cb0bed0..5c2a9edb771 100644 --- a/tests/engine/test_async_omni_engine_stage_init.py +++ b/tests/engine/test_async_omni_engine_stage_init.py @@ -183,6 +183,7 @@ def test_launch_llm_stage_passes_stage_init_timeout_to_complete_stage_handshake( engine.model = "dummy-model" engine.single_stage_mode = False engine._omni_master_server = None + engine.stage_configs = [] metadata = types.SimpleNamespace(stage_id=0, runtime_cfg={"devices": "0"}) fake_vllm_config = types.SimpleNamespace() @@ -238,6 +239,7 @@ def test_launch_llm_stage_releases_launch_lock_before_complete_stage_handshake(m engine.model = "dummy-model" engine.single_stage_mode = False engine._omni_master_server = None + engine.stage_configs = [] fake_vllm_config = types.SimpleNamespace() fake_addresses = types.SimpleNamespace() @@ -378,3 +380,70 @@ def __init__(self, vllm_config, renderer=None): assert input_processor is not None assert isinstance(input_processor.input_preprocessor, DummyOmniInputPreprocessor) assert input_processor.input_preprocessor.renderer is input_processor.renderer + + +def 
test_inject_kv_stage_info_infers_sender_tp_topology(): + from vllm_omni.engine.stage_init_utils import inject_kv_stage_info + + stage0 = types.SimpleNamespace( + stage_id=0, + engine_args={ + "tensor_parallel_size": 4, + "omni_kv_config": { + "need_send_cache": True, + "omni_from_stage": "0", + "omni_to_stage": "1", + }, + }, + engine_input_source=[], + ) + stage1 = types.SimpleNamespace( + stage_id=1, + engine_args={ + "parallel_config": { + "tensor_parallel_size": 2, + "cfg_parallel_size": 1, + }, + "omni_kv_config": {"need_recv_cache": True}, + }, + engine_input_source=[0], + ) + + inject_kv_stage_info(stage0, 0, [stage0, stage1]) + + assert stage0.engine_args["omni_kv_config"]["stage_id"] == 0 + assert stage0.engine_args["omni_kv_config"]["rank_mapping"] == {"from_tp": 4, "to_tp": 2} + + +def test_inject_kv_stage_info_infers_receiver_tp_topology(): + from vllm_omni.engine.stage_init_utils import inject_kv_stage_info + + stage0 = types.SimpleNamespace( + stage_id=0, + engine_args={ + "tensor_parallel_size": 4, + "omni_kv_config": {"need_send_cache": True}, + }, + engine_input_source=[], + ) + stage1 = types.SimpleNamespace( + stage_id=1, + engine_args={ + "parallel_config": { + "tensor_parallel_size": 2, + "cfg_parallel_size": 1, + }, + "omni_kv_config": { + "need_recv_cache": True, + "omni_from_stage": "0", + "omni_to_stage": "1", + }, + }, + engine_input_source=[0], + ) + + inject_kv_stage_info(stage1, 1, [stage0, stage1]) + + assert stage1.engine_args["omni_kv_config"]["stage_id"] == 1 + assert stage1.engine_args["omni_kv_config"]["engine_input_source"] == [0] + assert stage1.engine_args["omni_kv_config"]["rank_mapping"] == {"from_tp": 4, "to_tp": 2} diff --git a/tests/engine/test_single_stage_mode.py b/tests/engine/test_single_stage_mode.py index 608e92ac49e..28ccccaa2b5 100644 --- a/tests/engine/test_single_stage_mode.py +++ b/tests/engine/test_single_stage_mode.py @@ -1555,6 +1555,7 @@ def _build_engine_with_oms(self, mocker: MockerFixture) -> 
AsyncOmniEngine: engine.single_stage_mode = True engine._single_stage_id_filter = 0 engine._llm_stage_launch_lock = threading.Lock() + engine.stage_configs = [] mock_oms = mocker.Mock(spec=OmniMasterServer) mock_oms.address = "127.0.0.1" mock_oms.port = 25000 @@ -1629,6 +1630,7 @@ def test_spawn_stage_core_used_in_normal_mode(self, mocker: MockerFixture): engine.single_stage_mode = False engine._omni_master_server = None engine._llm_stage_launch_lock = threading.Lock() + engine.stage_configs = [] fake_vllm_config = mocker.Mock() fake_executor_cls = mocker.Mock() diff --git a/vllm_omni/diffusion/distributed/group_coordinator.py b/vllm_omni/diffusion/distributed/group_coordinator.py index 8ab38f2a651..5294e6c9ed6 100644 --- a/vllm_omni/diffusion/distributed/group_coordinator.py +++ b/vllm_omni/diffusion/distributed/group_coordinator.py @@ -104,6 +104,7 @@ def __init__( self.local_rank = local_rank self.device_group = None self.cpu_group = None + self.shm_broadcaster = None for ranks in group_ranks: device_group = torch.distributed.new_group(ranks, backend=torch_distributed_backend) @@ -316,7 +317,7 @@ def send_object(self, obj: Any, dst: int) -> None: assert dst < self.world_size, f"Invalid dst rank ({dst})" - assert dst != self.rank, "Invalid destination rank. Destination rank is the same as the current rank." + assert dst != self.rank_in_group, "Invalid destination rank. Destination rank is the same as the current rank." # Serialize object to tensor and get the size as well object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8) @@ -338,7 +339,7 @@ def recv_object(self, src: int) -> Any: assert src < self.world_size, f"Invalid src rank ({src})" - assert src != self.rank, "Invalid source rank. Source rank is the same as the current rank." + assert src != self.rank_in_group, "Invalid source rank. Source rank is the same as the current rank." 
size_tensor = torch.empty(1, dtype=torch.long, device="cpu") diff --git a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py index 72e53e7f48f..a3d2259e643 100644 --- a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py +++ b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py @@ -365,28 +365,52 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: if req.sampling_params.kv_metadata and "image_shape" in req.sampling_params.kv_metadata: image_shape = tuple(req.sampling_params.kv_metadata["image_shape"]) - cfg_text_kv = getattr(req.sampling_params, "cfg_text_past_key_values", None) + branch_kvs = getattr(req.sampling_params, "cfg_branch_past_key_values", None) or {} + branch_metadata = getattr(req.sampling_params, "cfg_branch_kv_metadata", None) or {} + active_branch = getattr(req.sampling_params, "cfg_active_branch", None) + branch_roles = getattr(req.sampling_params, "cfg_branch_roles", None) or list(branch_kvs.keys()) + + cfg_text_kv = getattr(req.sampling_params, "cfg_text_past_key_values", None) or branch_kvs.get("cfg_text") + cfg_text_metadata = getattr(req.sampling_params, "cfg_text_kv_metadata", None) or branch_metadata.get( + "cfg_text" + ) + cfg_img_kv = getattr(req.sampling_params, "cfg_img_past_key_values", None) or branch_kvs.get("cfg_img") + cfg_img_metadata = getattr(req.sampling_params, "cfg_img_kv_metadata", None) or branch_metadata.get( + "cfg_img" + ) + + cfg_parallel_contract = ( + active_branch is not None or bool(branch_roles) or cfg_text_kv is not None or cfg_img_kv is not None + ) + if cfg_parallel_contract: + logger.info( + "CFG enabled with injected branch KV context roles=%s active=%s", + branch_roles, + active_branch, + ) + if cfg_text_kv is not None: - logger.info("CFG enabled with multi-KV: using injected cfg_text KV Cache") cfg_text_seq_len = cfg_text_kv.key_cache[0].shape[0] cfg_text_context["past_key_values"] = cfg_text_kv cfg_text_context["kv_lens"] = 
[cfg_text_seq_len] - cfg_text_metadata = getattr(req.sampling_params, "cfg_text_kv_metadata", None) if cfg_text_metadata and "ropes" in cfg_text_metadata: cfg_text_context["ropes"] = cfg_text_metadata["ropes"] else: cfg_text_context["ropes"] = [cfg_text_seq_len] - cfg_img_kv = getattr(req.sampling_params, "cfg_img_past_key_values", None) or injected_kv + if cfg_img_kv is None and cfg_text_kv is not None: + cfg_img_kv = injected_kv + + if cfg_img_kv is not None: cfg_img_seq_len = cfg_img_kv.key_cache[0].shape[0] cfg_img_context["past_key_values"] = cfg_img_kv cfg_img_context["kv_lens"] = [cfg_img_seq_len] - cfg_img_metadata = getattr(req.sampling_params, "cfg_img_kv_metadata", None) if cfg_img_metadata and "ropes" in cfg_img_metadata: cfg_img_context["ropes"] = cfg_img_metadata["ropes"] else: cfg_img_context["ropes"] = [cfg_img_seq_len] - else: + + if not cfg_parallel_contract: logger.warning("CFG is disabled: only single KV cache available") gen_params = BagelGenParams( num_timesteps=gen_params.num_timesteps, diff --git a/vllm_omni/distributed/omni_connectors/kv_transfer_manager.py b/vllm_omni/distributed/omni_connectors/kv_transfer_manager.py index 1958c9d40a5..ad008c3971f 100644 --- a/vllm_omni/distributed/omni_connectors/kv_transfer_manager.py +++ b/vllm_omni/distributed/omni_connectors/kv_transfer_manager.py @@ -14,8 +14,20 @@ from .factory import OmniConnectorFactory from .utils.config import ConnectorSpec -from .utils.initialization import KV_TRANSFER_PORT_OFFSET -from .utils.kv_utils import normalize_layer_kv +from .utils.initialization import KV_RANK_PORT_STRIDE +from .utils.kv_utils import ( + KVTPTopology, + build_rank_aware_recv_keys, + build_rank_aware_send_keys, + get_kv_target_ranks, + get_local_tp_rank, + get_tp_world_size, + kv_zmq_port, + merge_received_rank_shards, + normalize_layer_kv, + slice_layer_blocks, + slice_received_rank_shard, +) logger = init_logger(__name__) @@ -57,6 +69,8 @@ class OmniKVCacheConfig: need_recv_cache: bool = False 
need_send_cache: bool = False recv_timeout: float = 30.0 + from_tp: int = 1 + to_tp: int = 1 @dataclass @@ -72,82 +86,44 @@ def to_dict(self) -> dict[str, Any]: """Convert to dictionary for serialization.""" return asdict(self) - def to_bytes(self) -> bytes: - """Convert to compact binary format for fast transfer.""" - tensors_desc: list[dict[str, Any]] = [] - tensor_bufs: list[bytes] = [] - data_offset = 0 - - for cache_name in ("key_cache", "value_cache"): - cache_list = self.layer_blocks.get(cache_name, []) - for layer_idx, tensor in enumerate(cache_list): - if tensor is None: - tensors_desc.append({"n": f"{cache_name}_{layer_idx}", "x": True}) - continue - - t = tensor.detach().cpu().contiguous() - dtype_str = str(t.dtype).removeprefix("torch.") - raw = t.view(torch.uint8).numpy().tobytes() - tensors_desc.append( - { - "n": f"{cache_name}_{layer_idx}", - "i": layer_idx, - "d": dtype_str, - "s": list(t.shape), - "o": data_offset, - "b": len(raw), - } - ) - tensor_bufs.append(raw) - data_offset += len(raw) - - header = json.dumps( - { - "rid": self.request_id, - "bids": self.block_ids, - "meta": self.metadata, - "td": tensors_desc, - "nl": len(self.layer_blocks.get("key_cache", [])), - }, - separators=(",", ":"), - ).encode("utf-8") - return b"".join([struct.pack(">I", len(header)), header] + tensor_bufs) + def _build_tensors_desc(self, *, cpu: bool) -> tuple[list[dict[str, Any]], list, int, torch.device | None]: + """Iterate layer blocks and build tensor descriptors + data chunks. - def to_gpu_tensor(self) -> torch.Tensor: - """Convert to a packed GPU tensor for raw-data connectors.""" + Returns ``(tensors_desc, chunks, total_bytes, device)``. + *chunks* contains ``bytes`` when *cpu* is True, flat uint8 GPU tensors otherwise. 
+ """ tensors_desc: list[dict[str, Any]] = [] - gpu_tensors: list[torch.Tensor] = [] + chunks: list = [] data_offset = 0 device = None for cache_name in ("key_cache", "value_cache"): - cache_list = self.layer_blocks.get(cache_name, []) - for layer_idx, tensor in enumerate(cache_list): + for layer_idx, tensor in enumerate(self.layer_blocks.get(cache_name, [])): if tensor is None: tensors_desc.append({"n": f"{cache_name}_{layer_idx}", "x": True}) continue - t = tensor.detach().contiguous() - if device is None and t.is_cuda: + if cpu: + t = t.cpu() + elif device is None and t.is_cuda: device = t.device - dtype_str = str(t.dtype).removeprefix("torch.") nbytes = t.numel() * t.element_size() tensors_desc.append( { "n": f"{cache_name}_{layer_idx}", "i": layer_idx, - "d": dtype_str, + "d": str(t.dtype).removeprefix("torch."), "s": list(t.shape), "o": data_offset, "b": nbytes, } ) - gpu_tensors.append(t.view(torch.uint8).flatten()) + chunks.append(t.view(torch.uint8).numpy().tobytes() if cpu else t.view(torch.uint8).flatten()) data_offset += nbytes - if device is None: - raise RuntimeError("No CUDA tensors found, use to_bytes() instead") + return tensors_desc, chunks, data_offset, device + def _build_header_bytes(self, tensors_desc: list[dict[str, Any]]) -> bytes: header = json.dumps( { "rid": self.request_id, @@ -158,19 +134,26 @@ def to_gpu_tensor(self) -> torch.Tensor: }, separators=(",", ":"), ).encode("utf-8") + return struct.pack(">I", len(header)) + header - header_prefix = struct.pack(">I", len(header)) + header - total_size = len(header_prefix) + data_offset - output = torch.empty(total_size, dtype=torch.uint8, device=device) - header_tensor = torch.frombuffer(bytearray(header_prefix), dtype=torch.uint8) - output[: len(header_prefix)].copy_(header_tensor) + def to_bytes(self) -> bytes: + """Convert to compact binary format for fast transfer.""" + tensors_desc, chunks, _, _ = self._build_tensors_desc(cpu=True) + return 
b"".join([self._build_header_bytes(tensors_desc)] + chunks) + def to_gpu_tensor(self) -> torch.Tensor: + """Convert to a packed GPU tensor for raw-data connectors.""" + tensors_desc, chunks, data_offset, device = self._build_tensors_desc(cpu=False) + if device is None: + raise RuntimeError("No CUDA tensors found, use to_bytes() instead") + header_prefix = self._build_header_bytes(tensors_desc) + output = torch.empty(len(header_prefix) + data_offset, dtype=torch.uint8, device=device) + output[: len(header_prefix)].copy_(torch.frombuffer(bytearray(header_prefix), dtype=torch.uint8)) pos = len(header_prefix) - for t_flat in gpu_tensors: + for t_flat in chunks: n = t_flat.numel() output[pos : pos + n].copy_(t_flat) pos += n - return output @staticmethod @@ -237,11 +220,8 @@ def _resolve_layer_idx(info: dict[str, Any], num_layers: int) -> int: return layer_idx @staticmethod - def from_bytes(raw: "bytes | bytearray | memoryview") -> dict[str, Any]: - """Reconstruct KV cache data from the packed bytes format.""" - raw_mv = memoryview(raw) if not isinstance(raw, memoryview) else raw - header, tensor_data_mv = KVCacheTransferData._load_header_from_memoryview(raw_mv) - + def _populate_caches(header: dict[str, Any], get_tensor: callable) -> dict[str, Any]: + """Shared deserialization loop for both CPU and GPU paths.""" num_layers = header["nl"] key_cache: list[torch.Tensor | None] = [None] * num_layers value_cache: list[torch.Tensor | None] = [None] * num_layers @@ -249,20 +229,9 @@ def from_bytes(raw: "bytes | bytearray | memoryview") -> dict[str, Any]: for info in header["td"]: if info.get("x"): continue - name: str = info["n"] torch_dtype = KVCacheTransferData._resolve_torch_dtype(info["d"]) - offset, nbytes = KVCacheTransferData._validate_tensor_span(name, info, len(tensor_data_mv)) - t = ( - torch.frombuffer( - tensor_data_mv, - dtype=torch.uint8, - offset=offset, - count=nbytes, - ) - .view(torch_dtype) - .reshape(info["s"]) - ) + t = 
get_tensor(info).view(torch_dtype).reshape(info["s"]) layer_idx = KVCacheTransferData._resolve_layer_idx(info, num_layers) if name.startswith("key_cache_"): key_cache[layer_idx] = t @@ -276,37 +245,30 @@ def from_bytes(raw: "bytes | bytearray | memoryview") -> dict[str, Any]: "metadata": header["meta"], } + @staticmethod + def from_bytes(raw: "bytes | bytearray | memoryview") -> dict[str, Any]: + """Reconstruct KV cache data from the packed bytes format.""" + raw_mv = memoryview(raw) if not isinstance(raw, memoryview) else raw + header, tensor_data_mv = KVCacheTransferData._load_header_from_memoryview(raw_mv) + data_len = len(tensor_data_mv) + + def _get(info: dict) -> torch.Tensor: + offset, nbytes = KVCacheTransferData._validate_tensor_span(info["n"], info, data_len) + return torch.frombuffer(tensor_data_mv, dtype=torch.uint8, offset=offset, count=nbytes) + + return KVCacheTransferData._populate_caches(header, _get) + @staticmethod def from_bytes_gpu(gpu_tensor: torch.Tensor) -> dict[str, Any]: """Reconstruct KV cache data from a packed GPU tensor.""" header, data_start = KVCacheTransferData._load_header_from_tensor(gpu_tensor) + data_len = int(gpu_tensor.numel()) - data_start - num_layers = header["nl"] - key_cache: list[torch.Tensor | None] = [None] * num_layers - value_cache: list[torch.Tensor | None] = [None] * num_layers - tensor_data_bytes = int(gpu_tensor.numel()) - data_start + def _get(info: dict) -> torch.Tensor: + offset, nbytes = KVCacheTransferData._validate_tensor_span(info["n"], info, data_len) + return gpu_tensor[data_start + offset : data_start + offset + nbytes].clone() - for info in header["td"]: - if info.get("x"): - continue - - name: str = info["n"] - torch_dtype = KVCacheTransferData._resolve_torch_dtype(info["d"]) - offset, nbytes = KVCacheTransferData._validate_tensor_span(name, info, tensor_data_bytes) - t = gpu_tensor[data_start + offset : data_start + offset + nbytes].clone() - t = t.view(torch_dtype).reshape(info["s"]) - layer_idx = 
KVCacheTransferData._resolve_layer_idx(info, num_layers) - if name.startswith("key_cache_"): - key_cache[layer_idx] = t - elif name.startswith("value_cache_"): - value_cache[layer_idx] = t - - return { - "request_id": header["rid"], - "layer_blocks": {"key_cache": key_cache, "value_cache": value_cache}, - "block_ids": header["bids"], - "metadata": header["meta"], - } + return KVCacheTransferData._populate_caches(header, _get) class OmniKVTransferManager: @@ -341,6 +303,30 @@ def __init__(self, config: OmniKVCacheConfig): else (None, None) ) + local_rank = get_local_tp_rank() + + if config.from_tp <= 1 and config.to_tp <= 1: + detected_tp = get_tp_world_size() + from_tp = detected_tp + to_tp = detected_tp + else: + from_tp = config.from_tp + to_tp = config.to_tp + + self._tp_topo = KVTPTopology(source_tp_size=from_tp, target_tp_size=to_tp, local_rank=local_rank) + + # Injectable hooks (compatible with PR #2677 OmniConnectorModelRunnerMixin). + self.kv_send_key_builder: Callable | None = None + self.kv_recv_key_builder: Callable | None = None + self.kv_payload_merger: Callable | None = None + self.kv_payload_slicer: Callable | None = None + + # Base sender endpoint (rank-0 host/port) stored during + # update_sender_info(). Used by the receive path to construct + # per-rank metadata for heterogeneous TP without querying a registry. 
+ self._sender_base_host: str | None = None + self._sender_base_zmq_port: int | None = None + if config.need_send_cache and config.connector_config: try: _ = self.connector @@ -348,11 +334,20 @@ def __init__(self, config: OmniKVCacheConfig): except Exception as e: logger.warning("Failed to eagerly initialize sender connector: %s", e) + # ------------------------------------------------------------------ # + # Factory helpers + # ------------------------------------------------------------------ # + @classmethod def _create(cls, cfg: dict | None) -> "OmniKVTransferManager": """Create manager from raw config dict.""" if not cfg or not isinstance(cfg, dict): return cls(OmniKVCacheConfig()) + + rank_mapping = cfg.get("rank_mapping", {}) + if not isinstance(rank_mapping, dict): + rank_mapping = {} + return cls( OmniKVCacheConfig( connector_config=cfg.get("connector_config"), @@ -363,19 +358,18 @@ def _create(cls, cfg: dict | None) -> "OmniKVTransferManager": need_recv_cache=cfg.get("need_recv_cache", False), need_send_cache=cfg.get("need_send_cache", False), recv_timeout=cfg.get("recv_timeout", 30.0), + from_tp=int(rank_mapping.get("from_tp", 1)), + to_tp=int(rank_mapping.get("to_tp", 1)), ) ) - @classmethod - def from_model_config(cls, config: Any) -> "OmniKVTransferManager": - """Create from model config (for AR model runner).""" - return cls._create(getattr(config, "omni_kv_config", None)) - @classmethod def from_od_config(cls, config: Any) -> "OmniKVTransferManager": - """Create from OmniDiffusion config (for diffusion runner).""" + """Create from model or OmniDiffusion config.""" return cls._create(getattr(config, "omni_kv_config", None)) + from_model_config = from_od_config + @classmethod def from_vllm_config(cls, vllm_config: Any, model_config: Any) -> "OmniKVTransferManager": """Create from vllm config with fallback to kv_transfer_config.""" @@ -417,45 +411,33 @@ def connector(self): ) c_extra["to_stage"] = str(self.config.to_stage) if self.config.to_stage is 
not None else "1" + try: + stage_int = int(self.config.from_stage) if self.config.from_stage is not None else 0 + except (TypeError, ValueError): + stage_int = 0 + zmq_port = kv_zmq_port(base_port, stage_int, self._tp_topo.local_rank) + if self.config.need_send_cache: c_extra["role"] = "sender" - from_stage = self.config.from_stage - if from_stage is not None: - try: - c_extra["zmq_port"] = base_port + KV_TRANSFER_PORT_OFFSET + int(from_stage) - except (TypeError, ValueError): - c_extra["zmq_port"] = base_port + KV_TRANSFER_PORT_OFFSET + c_extra["zmq_port"] = zmq_port elif self.config.need_recv_cache: c_extra["role"] = "receiver" - from_stage = self.config.from_stage - sender_port = base_port + KV_TRANSFER_PORT_OFFSET - if from_stage is not None: - try: - sender_port = base_port + KV_TRANSFER_PORT_OFFSET + int(from_stage) - except (TypeError, ValueError): - pass c_extra.setdefault("sender_host", c_extra.get("host", "127.0.0.1")) - c_extra.setdefault("sender_zmq_port", sender_port) + c_extra.setdefault("sender_zmq_port", zmq_port) logger.info( - "Initializing OmniConnector (purpose=kv_transfer) with config: %s, role: %s", - cfg, + "Initializing OmniConnector type=%s role=%s", + c_type, c_extra.get("role", "N/A"), ) self._connector = OmniConnectorFactory.create_connector(ConnectorSpec(name=c_type, extra=c_extra)) - except Exception as e: - logger.error(f"Failed to initialize OmniConnector: {e}") - import traceback - - traceback.print_exc() - # Cache failure sentinel to avoid repeated initialization attempts in hot paths. 
+ except Exception: + logger.exception("Failed to initialize OmniConnector") self._connector = False return self._connector if self._connector else None - def get_connector(self): - """Get connector (compatibility wrapper for existing code).""" - return self.connector + get_connector = property(lambda self: self.connector) def _resolve_sender_info( self, sender_info: dict[str, Any], sender_stage_id: str | int | None = None @@ -513,8 +495,187 @@ def _clone_received_payload_tensors(data: dict[str, Any]) -> dict[str, Any]: cache_list[idx] = tensor.clone() return data + def _slice_transfer_data_for_target(self, kv_data: KVCacheTransferData, target_rank: int) -> KVCacheTransferData: + """Pre-slice sender payload for one target rank when sender TP < receiver TP.""" + topo = self._tp_topo + ratio = topo.target_tp_size // topo.source_tp_size + offset_in_sender = target_rank % ratio + metadata = dict(kv_data.metadata) if isinstance(kv_data.metadata, dict) else {} + metadata["tp_head_slice"] = { + "applied": True, + "side": "sender", + "target_rank": target_rank, + "source_rank": topo.local_rank, + "from_tp": topo.source_tp_size, + "to_tp": topo.target_tp_size, + "offset_in_shard": offset_in_sender, + "num_slices": ratio, + } + return KVCacheTransferData( + request_id=kv_data.request_id, + layer_blocks=slice_layer_blocks(kv_data.layer_blocks, offset_in_sender, ratio), + block_ids=list(kv_data.block_ids), + metadata=metadata, + ) + + def _serialize_transfer_payload(self, kv_data: KVCacheTransferData) -> torch.Tensor | bytes | dict[str, Any]: + """Serialize KV transfer data using the connector's fastest supported path.""" + if getattr(self.connector, "supports_raw_data", False): + try: + return kv_data.to_gpu_tensor() + except Exception: + pass + try: + return kv_data.to_bytes() + except Exception: + return kv_data.to_dict() + + @staticmethod + def _collect_request_kv_payload(req: Any) -> dict[str, object]: + """Collect request-side KV objects for object broadcast.""" + 
kv_payload: dict[str, object] = {} + for attr in ("past_key_values", "kv_metadata"): + val = getattr(req, attr, None) + if val is not None: + kv_payload[attr] = val + + if hasattr(req, "sampling_params") and req.sampling_params is not None: + for key in list(vars(req.sampling_params).keys()): + if key in ("past_key_values", "kv_metadata") or ( + key.startswith("cfg_") + and ( + key.endswith("_past_key_values") + or key.endswith("_kv_metadata") + or key + in ( + "cfg_kv_request_ids", + "cfg_active_branch", + "cfg_branch_roles", + "cfg_branch_past_key_values", + "cfg_branch_kv_metadata", + ) + ) + ): + val = getattr(req.sampling_params, key, None) + if val is not None: + kv_payload[f"sp.{key}"] = val + + return kv_payload + + @staticmethod + def _apply_request_kv_payload( + req: Any, + kv_payload: dict[str, object], + target_device: torch.device | None = None, + ) -> None: + """Apply a broadcast KV payload back onto a request object.""" + for attr in ("past_key_values", "kv_metadata"): + val = kv_payload.get(attr) + if val is not None: + if target_device is not None: + val = _move_to_device(val, target_device) + setattr(req, attr, val) + + if hasattr(req, "sampling_params") and req.sampling_params is not None: + for key, val in kv_payload.items(): + if key.startswith("sp."): + if target_device is not None: + val = _move_to_device(val, target_device) + setattr(req.sampling_params, key[3:], val) + + @staticmethod + def _discover_cfg_branch_roles(req: Any) -> list[str]: + """Discover CFG branch roles in a stable order.""" + sampling_params = getattr(req, "sampling_params", None) + if sampling_params is None: + return [] + + roles: list[str] = [] + branch_map = getattr(sampling_params, "cfg_branch_past_key_values", None) or {} + for preferred_role in ("cfg_text", "cfg_img"): + if ( + preferred_role in branch_map + or getattr(sampling_params, f"{preferred_role}_past_key_values", None) is not None + ): + roles.append(preferred_role) + + for role in branch_map.keys(): + if 
role not in roles and branch_map.get(role) is not None: + roles.append(role) + + for key in vars(sampling_params).keys(): + if not (key.startswith("cfg_") and key.endswith("_past_key_values")): + continue + role = key.removesuffix("_past_key_values") + if role in ("cfg_branch",) or role in roles: + continue + if getattr(sampling_params, key, None) is not None: + roles.append(role) + + return roles + + @classmethod + def _build_cfg_rank_local_payloads(cls, req: Any, cfg_size: int) -> list[dict[str, object] | None]: + """Build per-cfg-rank payloads so each rank receives only its branch KV.""" + full_payload = cls._collect_request_kv_payload(req) + payloads: list[dict[str, object] | None] = [] + + main_payload = { + key: value + for key, value in full_payload.items() + if key in ("past_key_values", "kv_metadata", "sp.past_key_values", "sp.kv_metadata") + } + branch_roles = cls._discover_cfg_branch_roles(req) + if branch_roles: + main_payload["sp.cfg_branch_roles"] = list(branch_roles) + main_payload["sp.cfg_active_branch"] = None + payloads.append(main_payload or None) + + sampling_params = getattr(req, "sampling_params", None) + branch_map = getattr(sampling_params, "cfg_branch_past_key_values", None) or {} + branch_metadata_map = getattr(sampling_params, "cfg_branch_kv_metadata", None) or {} + + for role in branch_roles: + if sampling_params is None: + payloads.append(None) + continue + + branch_kv = branch_map.get(role) + if branch_kv is None: + branch_kv = getattr(sampling_params, f"{role}_past_key_values", None) + branch_metadata = branch_metadata_map.get(role) + if branch_metadata is None: + branch_metadata = getattr(sampling_params, f"{role}_kv_metadata", None) + if branch_kv is None: + payloads.append(None) + continue + + local_payload = dict(main_payload) + local_payload["sp.cfg_active_branch"] = role + local_payload["sp.cfg_branch_roles"] = list(branch_roles) + local_payload["sp.cfg_branch_past_key_values"] = {role: branch_kv} + 
local_payload[f"sp.{role}_past_key_values"] = branch_kv + if branch_metadata is not None: + local_payload["sp.cfg_branch_kv_metadata"] = {role: branch_metadata} + local_payload[f"sp.{role}_kv_metadata"] = branch_metadata + + payloads.append(local_payload) + + while len(payloads) < cfg_size: + payloads.append(None) + + return payloads[:cfg_size] + def update_sender_info(self, sender_info: dict[str, Any], sender_stage_id: str | int | None = None) -> None: - """Update receiver-side sender info before loading remote KV cache.""" + """Update receiver-side sender info before loading remote KV cache. + + The orchestrator always reports rank-0's ZMQ port. When TP > 1 the + receiver must offset the port so that each TP rank connects to the + corresponding sender rank's port. + + The base host/port are also stored so that the receive path can + construct per-rank metadata for heterogeneous TP scenarios. + """ if not self.config.need_recv_cache: return @@ -523,18 +684,39 @@ def update_sender_info(self, sender_info: dict[str, Any], sender_stage_id: str | logger.warning("Invalid sender_info format: %s", sender_info) return + sender_host = actual_info.get("host") + base_zmq_port = actual_info.get("zmq_port") + + # Store base sender info for per-rank metadata construction. 
+ self._sender_base_host = sender_host + if base_zmq_port is not None: + self._sender_base_zmq_port = int(base_zmq_port) + + # --- Default sender: offset to match this receiver's corresponding sender rank --- + zmq_port = base_zmq_port + if zmq_port is not None and self._tp_topo.local_rank > 0: + zmq_port = int(zmq_port) + self._tp_topo.local_rank * KV_RANK_PORT_STRIDE + if self.config.connector_config: - self.config.connector_config["sender_host"] = actual_info.get("host") - self.config.connector_config["sender_zmq_port"] = actual_info.get("zmq_port") + self.config.connector_config["sender_host"] = sender_host + self.config.connector_config["sender_zmq_port"] = zmq_port if self._connector and hasattr(self._connector, "update_sender_info"): try: - self._connector.update_sender_info(actual_info.get("host"), actual_info.get("zmq_port")) + self._connector.update_sender_info(sender_host, zmq_port) except Exception: if hasattr(self._connector, "sender_host"): - self._connector.sender_host = actual_info.get("host") + self._connector.sender_host = sender_host if hasattr(self._connector, "sender_zmq_port"): - self._connector.sender_zmq_port = actual_info.get("zmq_port") + self._connector.sender_zmq_port = zmq_port + + logger.info( + "Sender info updated: host=%s, base_port=%s, adjusted_port=%s (local_rank=%s)", + sender_host, + base_zmq_port, + zmq_port, + self._tp_topo.local_rank, + ) def handle_finished_requests_kv_transfer( self, @@ -692,35 +874,54 @@ def _transfer_kv_cache(self, kv_data: KVCacheTransferData, transfer_req_id: str) kv_data.request_id = transfer_req_id serialization_start = time.perf_counter() - transfer_data: torch.Tensor | bytes | dict[str, Any] - supports_raw = getattr(self.connector, "supports_raw_data", False) + topo = self._tp_topo + send_keys = build_rank_aware_send_keys( + transfer_req_id, from_stage, to_stage, topo, hook=self.kv_send_key_builder + ) + sender_slice_active = ( + topo.source_tp_size < topo.target_tp_size and len(send_keys) > 1 and 
not callable(self.kv_send_key_builder) + ) + per_key_payloads: list[tuple[str, torch.Tensor | bytes | dict[str, Any]]] = [] - try: - if supports_raw: - transfer_data = kv_data.to_gpu_tensor() + if sender_slice_active: + target_ranks = get_kv_target_ranks(topo) + if len(target_ranks) != len(send_keys): + logger.warning( + "Skip sender-side KV slicing because target rank count does not match send key count: " + "target_ranks=%s send_keys=%s", + len(target_ranks), + len(send_keys), + ) + sender_slice_active = False else: - raise RuntimeError("Connector does not support raw tensor") - except Exception: - try: - transfer_data = kv_data.to_bytes() - except Exception: - data_dict = kv_data.to_dict() - data_dict["request_id"] = transfer_req_id - transfer_data = data_dict + for put_key, target_rank in zip(send_keys, target_ranks, strict=False): + sliced_kv_data = self._slice_transfer_data_for_target(kv_data, target_rank) + per_key_payloads.append((put_key, self._serialize_transfer_payload(sliced_kv_data))) + + if not per_key_payloads: + transfer_data = self._serialize_transfer_payload(kv_data) + per_key_payloads = [(put_key, transfer_data) for put_key in send_keys] serialization_ms = (time.perf_counter() - serialization_start) * 1000 logger.info("KV cache serialized for %s in %.1f ms", transfer_req_id, serialization_ms) transfer_start = time.perf_counter() - success, size, _ = self._transfer_with_retry(from_stage, to_stage, f"kv_cache_{transfer_req_id}", transfer_data) + total_size = 0 + all_succeeded = True + for put_key, transfer_data in per_key_payloads: + success, size, _ = self._transfer_with_retry(from_stage, to_stage, put_key, transfer_data) + total_size += size + all_succeeded = all_succeeded and success + elapsed = time.perf_counter() - transfer_start - if success: - mbps = (size / 1024 / 1024) / elapsed if elapsed > 0 else 0 + if all_succeeded: + mbps = (total_size / 1024 / 1024) / elapsed if elapsed > 0 else 0 logger.info( - "KV transfer OK: %s, %s bytes, %.3fs, 
%.1f MB/s", + "KV transfer OK: %s, %s bytes across %s key(s), %.3fs, %.1f MB/s", transfer_req_id, - size, + total_size, + len(send_keys), elapsed, mbps, ) @@ -731,7 +932,7 @@ def _transfer_with_retry( self, from_stage: str, to_stage: str, - request_id: str, + put_key: str, data: "dict[str, Any] | bytes | torch.Tensor", max_retries: int = 3, ) -> tuple[bool, int, dict[str, Any] | None]: @@ -740,7 +941,7 @@ def _transfer_with_retry( Args: from_stage: Source stage identifier to_stage: Target stage identifier - request_id: Request identifier for the key + put_key: Pre-built connector key (rank-aware when TP > 1) data: Data to transfer max_retries: Maximum number of retry attempts @@ -749,14 +950,12 @@ def _transfer_with_retry( """ for attempt in range(max_retries): try: - # Build the full key for connector - full_request_id = f"omni_{from_stage}_to_{to_stage}_{request_id}" success, size, metadata = self.connector.put( - from_stage=from_stage, to_stage=to_stage, put_key=full_request_id, data=data + from_stage=from_stage, to_stage=to_stage, put_key=put_key, data=data ) if success: return success, size, metadata - logger.warning(f"Transfer attempt {attempt + 1} failed for {request_id}") + logger.warning(f"Transfer attempt {attempt + 1} failed for {put_key}") except Exception as e: logger.warning(f"Transfer attempt {attempt + 1} exception: {e}") @@ -801,22 +1000,46 @@ def receive_kv_cache_for_request( poll_interval = 0.01 max_poll_interval = 0.5 - logger.info(f"Wait for KV cache for request {request_id} from stage {from_stage} to {to_stage}...") + topo = self._tp_topo + recv_key_pairs = build_rank_aware_recv_keys( + request_id, from_stage, to_stage, topo, hook=self.kv_recv_key_builder + ) + pending_pairs = list(recv_key_pairs) + received_payloads: dict[str, tuple[dict[str, Any], int]] = {} + + logger.info( + "Wait for KV cache for request %s from stage %s to %s via %s key(s)...", + request_id, + from_stage, + to_stage, + len(recv_key_pairs), + ) try: while True: - # Build 
the full key for connector - full_request_id = f"omni_{from_stage}_to_{to_stage}_kv_cache_{request_id}" link_start = time.perf_counter() - result = self.connector.get( - from_stage=from_stage, - to_stage=to_stage, - get_key=full_request_id, - ) - if result: + for get_key, from_rank in list(pending_pairs): + # Construct per-rank metadata so the connector queries + # the correct sender endpoint (heterogeneous TP path). + # When from_rank is None (TP<=1), metadata stays None + # and the connector falls back to its default sender. + rank_metadata: dict[str, Any] | None = None + if from_rank is not None and self._sender_base_host and self._sender_base_zmq_port is not None: + rank_metadata = { + "source_host": self._sender_base_host, + "source_port": self._sender_base_zmq_port + from_rank * KV_RANK_PORT_STRIDE, + } + + result = self.connector.get( + from_stage=from_stage, + to_stage=to_stage, + get_key=get_key, + metadata=rank_metadata, + ) + if not result: + continue + raw_data, size = result - elapsed = time.time() - start_time - link_ms = (time.perf_counter() - link_start) * 1000 managed_buffer = None if hasattr(raw_data, "tensor") and hasattr(raw_data, "release"): @@ -844,6 +1067,21 @@ def receive_kv_cache_for_request( else: data = raw_data + received_payloads[get_key] = (data, size) + pending_pairs.remove((get_key, from_rank)) + + if not pending_pairs and received_payloads: + elapsed = time.time() - start_time + link_ms = (time.perf_counter() - link_start) * 1000 + ordered_payloads = [received_payloads[key][0] for key, _ in recv_key_pairs] + total_size = sum(received_payloads[key][1] for key, _ in recv_key_pairs) + + if len(ordered_payloads) == 1: + data = ordered_payloads[0] + else: + data = merge_received_rank_shards(ordered_payloads, merger=self.kv_payload_merger) + data = slice_received_rank_shard(data, topo, slicer=self.kv_payload_slicer) + try: if isinstance(data, dict) and "layer_blocks" in data: layer_blocks = data["layer_blocks"] @@ -856,18 +1094,18 @@ def 
receive_kv_cache_for_request( continue if target_device is not None and tensor.device != target_device: cache_list[i] = tensor.to(target_device).contiguous() - finally: - if managed_buffer is not None: - managed_buffer.release() + except Exception: + logger.exception("Failed to move KV cache tensors to target device") logger.info( - "Successfully received KV cache for %s, %s bytes, wait=%.3fs, link=%.1fms", + "Successfully received KV cache for %s, %s bytes across %s key(s), wait=%.3fs, link=%.1fms", request_id, - size, + total_size, + len(recv_key_pairs), elapsed, link_ms, ) - return data, size + return data, total_size if time.time() - start_time > timeout: logger.error(f"Timeout waiting for KV cache for request {request_id} after {timeout}s") @@ -876,11 +1114,8 @@ def receive_kv_cache_for_request( time.sleep(poll_interval) poll_interval = min(poll_interval * 2, max_poll_interval) - except Exception as e: - logger.error(f"Error receiving KV cache for {request_id}: {e}") - import traceback - - traceback.print_exc() + except Exception: + logger.exception("Error receiving KV cache for %s", request_id) return None, 0 def apply_kv_cache_to_request(self, req: Any, data: dict[str, Any]) -> None: @@ -994,73 +1229,79 @@ def receive_multi_kv_cache_distributed( cfg_kv_collect_func: Callable | None = None, target_device: torch.device | None = None, ) -> bool: - """Broadcast-aware wrapper around :meth:`receive_multi_kv_cache`. - - SharedMemory connector is single-reader: once rank 0 consumes the - segment it is deleted. For multi-GPU stages (e.g. sequence-parallel) - only rank 0 receives; the result is then broadcast to every other - rank via the world process-group. - - For single-worker stages this is equivalent to calling - :meth:`receive_multi_kv_cache` directly. + """Distributed wrapper around :meth:`receive_multi_kv_cache`. 
+ + TP-aware path selection: + - world size 1: direct receive + - TP active, cfg size 1: each rank independently receives + - TP active, cfg size > 1: cfg-rank 0 receives, then broadcasts to + peers that share the same TP rank + - TP inactive: legacy rank-0 receive then world broadcast """ - from vllm_omni.diffusion.distributed.parallel_state import get_world_group + from vllm_omni.diffusion.distributed.parallel_state import ( + get_cfg_group, + get_classifier_free_guidance_rank, + get_classifier_free_guidance_world_size, + get_world_group, + ) world = get_world_group() if world.world_size <= 1: return self.receive_multi_kv_cache(req, cfg_kv_collect_func, target_device) - # --- rank 0: receive to CPU (needed for pickle-based broadcast) --- - if world.rank_in_group == 0: - self.receive_multi_kv_cache(req, cfg_kv_collect_func, torch.device("cpu")) + topo = self._tp_topo + tp_active = topo.source_tp_size > 1 or topo.target_tp_size > 1 + cfg_size = 1 + cfg_rank = 0 + cfg_group = None + try: + cfg_size = get_classifier_free_guidance_world_size() + cfg_rank = get_classifier_free_guidance_rank() + cfg_group = get_cfg_group() + except Exception: + cfg_size = 1 + cfg_rank = 0 + cfg_group = None - kv_payload: dict[str, object] = {} - for attr in ("past_key_values", "kv_metadata"): - val = getattr(req, attr, None) - if val is not None: - kv_payload[attr] = val + if tp_active and cfg_size <= 1: + logger.info( + "Rank-aware KV receive: rank %s independently receiving (from_tp=%s, to_tp=%s)", + topo.local_rank, + topo.source_tp_size, + topo.target_tp_size, + ) + return self.receive_multi_kv_cache(req, cfg_kv_collect_func, target_device) - if hasattr(req, "sampling_params") and req.sampling_params is not None: - for key in list(vars(req.sampling_params).keys()): - if (key.startswith("cfg_") and key.endswith("_past_key_values")) or key in ( - "past_key_values", - "kv_metadata", - ): - val = getattr(req.sampling_params, key, None) - if val is not None: - kv_payload[f"sp.{key}"] = 
val - - payload_list = [kv_payload] - # Use broadcast_object_list (pickle-based) instead of broadcast_tensor_dict - # because the KV cache is a heterogeneous nested structure (NaiveCache objects - # with metadata + tensors), not a flat tensor dict. This runs once before - # the denoising loop so the serialization cost is negligible. - torch.distributed.broadcast_object_list(payload_list, src=world.ranks[0], group=world.cpu_group) - kv_payload = payload_list[0] - else: - payload_list: list[dict[str, object] | None] = [None] - torch.distributed.broadcast_object_list(payload_list, src=world.ranks[0], group=world.cpu_group) - kv_payload = payload_list[0] + if tp_active and cfg_size > 1 and cfg_group is not None: + kv_payload: dict[str, object] | None = None + if cfg_rank == 0: + received = self.receive_multi_kv_cache(req, cfg_kv_collect_func, torch.device("cpu")) + rank_payloads = self._build_cfg_rank_local_payloads(req, cfg_size) if received else [None] * cfg_size + kv_payload = rank_payloads[0] + for dst_rank in range(1, cfg_size): + cfg_group.send_object(rank_payloads[dst_rank], dst_rank) + else: + kv_payload = cfg_group.recv_object(0) - # --- apply on ALL ranks (rank 0 also needs CPU→GPU move) --- - if not kv_payload: - return False + if not kv_payload: + return False - for attr in ("past_key_values", "kv_metadata"): - val = kv_payload.get(attr) - if val is not None: - if target_device is not None: - val = _move_to_device(val, target_device) - setattr(req, attr, val) + self._apply_request_kv_payload(req, kv_payload, target_device) + return True - if hasattr(req, "sampling_params") and req.sampling_params is not None: - for key, val in kv_payload.items(): - if key.startswith("sp."): - if target_device is not None: - val = _move_to_device(val, target_device) - setattr(req.sampling_params, key[3:], val) + kv_payload: dict[str, object] | None = None + if world.rank_in_group == 0: + received = self.receive_multi_kv_cache(req, cfg_kv_collect_func, torch.device("cpu")) + 
if received: + kv_payload = self._collect_request_kv_payload(req) + + kv_payload = world.broadcast_object(kv_payload, src=0) + + if not kv_payload: + return False + self._apply_request_kv_payload(req, kv_payload, target_device) return True diff --git a/vllm_omni/distributed/omni_connectors/utils/kv_utils.py b/vllm_omni/distributed/omni_connectors/utils/kv_utils.py index 2cb48a8b344..12b9b3d4f77 100644 --- a/vllm_omni/distributed/omni_connectors/utils/kv_utils.py +++ b/vllm_omni/distributed/omni_connectors/utils/kv_utils.py @@ -1,15 +1,380 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Utility helpers for KV cache manipulation.""" +"""Utility helpers for KV cache manipulation, TP routing, and merge/slice.""" + +from __future__ import annotations + +import os +from collections.abc import Callable +from dataclasses import dataclass +from typing import Any import torch +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, +) from vllm.logger import init_logger +from .initialization import KV_RANK_PORT_STRIDE, KV_TRANSFER_PORT_OFFSET + logger = init_logger(__name__) LayerKV = torch.Tensor | tuple[torch.Tensor, torch.Tensor] +# ------------------------------------------------------------------ # +# TP Topology +# ------------------------------------------------------------------ # + + +@dataclass(frozen=True) +class KVTPTopology: + """Immutable descriptor for a KV-transfer parallel mapping. + + Captures sender/receiver parallel sizes and the local rank within + that parallel dimension. Works for any divisible parallel dimension + (TP, SP, Ring Attention). 
+ """ + + source_tp_size: int + target_tp_size: int + local_rank: int + + def __post_init__(self) -> None: + if self.source_tp_size <= 0 or self.target_tp_size <= 0: + raise ValueError( + f"Parallel sizes must be positive: " + f"source_tp_size={self.source_tp_size}, target_tp_size={self.target_tp_size}" + ) + if self.local_rank < 0: + raise ValueError(f"local_rank must be non-negative, got {self.local_rank}") + + @property + def is_heterogeneous(self) -> bool: + return self.source_tp_size != self.target_tp_size + + @property + def ratio(self) -> int: + """Larger parallel size divided by smaller. Always >= 1.""" + return max(self.source_tp_size, self.target_tp_size) // min(self.source_tp_size, self.target_tp_size) + + +# ------------------------------------------------------------------ # +# Runtime TP detection +# ------------------------------------------------------------------ # + + +def get_local_tp_rank() -> int: + """Return the TP-local rank of this worker process. + + Uses ``get_tensor_model_parallel_rank()`` which returns the rank + within the TP group only, not the stage-global rank. + """ + try: + return get_tensor_model_parallel_rank() + except Exception: + logger.debug("TP parallel state not initialized, falling back to LOCAL_RANK env", exc_info=True) + try: + return int(os.environ.get("LOCAL_RANK", "0")) + except (ValueError, TypeError): + return 0 + + +def get_tp_world_size() -> int: + """Return the TP world size (tensor-parallel dimension only). + + Uses ``get_tensor_model_parallel_world_size()`` so that + cfg_parallel, SP, PP etc. are not included in the count. 
+ """ + try: + return get_tensor_model_parallel_world_size() + except Exception: + logger.debug("TP parallel state not initialized, defaulting world_size=1", exc_info=True) + return 1 + + +# ------------------------------------------------------------------ # +# ZMQ port computation +# ------------------------------------------------------------------ # + + +def kv_zmq_port(base_port: int, from_stage: int, local_rank: int = 0) -> int: + """Compute the ZMQ port for a KV-transfer connector. + + Each TP rank gets its own port so that TP > 1 deployments do not + cause ``EADDRINUSE`` when multiple sender workers bind on the same + host. The formula is backward-compatible: rank 0 produces the same + port as the previous ``base + OFFSET + stage`` formula. + """ + return base_port + KV_TRANSFER_PORT_OFFSET + local_rank * KV_RANK_PORT_STRIDE + from_stage + + +# ------------------------------------------------------------------ # +# TP topology validation and rank routing +# ------------------------------------------------------------------ # + + +def validate_kv_tp_topology(topo: KVTPTopology) -> None: + """Reject heterogeneous TP mappings that cannot be routed losslessly.""" + larger = max(topo.source_tp_size, topo.target_tp_size) + smaller = min(topo.source_tp_size, topo.target_tp_size) + if larger % smaller != 0: + raise ValueError( + f"KV TP mapping must be divisible: " + f"source_tp_size={topo.source_tp_size}, " + f"target_tp_size={topo.target_tp_size}" + ) + + +def get_kv_target_ranks(topo: KVTPTopology) -> list[int]: + """Which remote ranks this local rank sends KV shards to (send side).""" + validate_kv_tp_topology(topo) + if topo.source_tp_size == topo.target_tp_size: + return [topo.local_rank] + if topo.source_tp_size > topo.target_tp_size: + return [topo.local_rank // (topo.source_tp_size // topo.target_tp_size)] + ratio = topo.target_tp_size // topo.source_tp_size + return [topo.local_rank * ratio + i for i in range(ratio)] + + +def get_kv_source_ranks(topo: 
KVTPTopology) -> list[int]: + """Which remote ranks this local rank receives KV shards from (recv side).""" + validate_kv_tp_topology(topo) + if topo.source_tp_size == topo.target_tp_size: + return [topo.local_rank] + if topo.source_tp_size > topo.target_tp_size: + ratio = topo.source_tp_size // topo.target_tp_size + return [topo.local_rank * ratio + i for i in range(ratio)] + return [topo.local_rank // (topo.target_tp_size // topo.source_tp_size)] + + +# ------------------------------------------------------------------ # +# Rank-aware connector key building +# ------------------------------------------------------------------ # + + +def get_kv_connector_key( + req_id: str, + from_stage: int | str, + chunk_id: int, + from_rank: int, + to_rank: int, +) -> str: + """Build connector key that includes rank info for KV transfers. + + Format matches PR #2677: ``{req_id}_{from_stage}_{chunk_id}_{from_rank}_{to_rank}`` + """ + return f"{req_id}_{from_stage}_{chunk_id}_{from_rank}_{to_rank}" + + +def build_rank_aware_send_keys( + request_id: str, + from_stage: str, + to_stage: str, + topo: KVTPTopology, + hook: Callable | None = None, +) -> list[str]: + """Build send-side connector keys, checking injectable hook first.""" + if callable(hook): + keys = list(hook(request_id, from_stage, to_stage)) + if keys: + return keys + if topo.source_tp_size <= 1 and topo.target_tp_size <= 1: + return [f"omni_{from_stage}_to_{to_stage}_kv_cache_{request_id}"] + target_ranks = get_kv_target_ranks(topo) + return [get_kv_connector_key(request_id, from_stage, 0, topo.local_rank, r) for r in target_ranks] + + +def build_rank_aware_recv_keys( + request_id: str, + from_stage: str, + to_stage: str, + topo: KVTPTopology, + hook: Callable | None = None, +) -> list[tuple[str, int | None]]: + """Build recv-side connector keys with sender rank info. + + Returns a list of ``(key, from_rank)`` tuples. ``from_rank`` is + ``None`` when TP <= 1 (single sender, no per-rank routing needed). 
+ For TP > 1, ``from_rank`` identifies which sender rank owns the + key so that the connector can route metadata queries to the + correct endpoint. + """ + if callable(hook): + raw = list(hook(request_id, from_stage, to_stage)) + if raw: + if isinstance(raw[0], tuple): + return raw + # Hook returned plain strings (e.g. OmniConnectorModelRunnerMixin. + # get_rank_aware_kv_keys). Reconstruct from_rank from topology so + # Mooncake connector can route metadata queries to the correct + # sender endpoint in heterogeneous TP. + # TODO: have the mixin return (key, from_rank) tuples directly + # to avoid this indirect reconstruction. + source_ranks = get_kv_source_ranks(topo) + if len(raw) == len(source_ranks): + return list(zip(raw, source_ranks)) + return [(k, None) for k in raw] + if topo.source_tp_size <= 1 and topo.target_tp_size <= 1: + return [(f"omni_{from_stage}_to_{to_stage}_kv_cache_{request_id}", None)] + source_ranks = get_kv_source_ranks(topo) + return [(get_kv_connector_key(request_id, from_stage, 0, r, topo.local_rank), r) for r in source_ranks] + + +# ------------------------------------------------------------------ # +# KV tensor head slicing (heterogeneous TP) +# ------------------------------------------------------------------ # + + +def slice_kv_tensor_heads( + tensor: torch.Tensor | None, + offset_in_shard: int, + num_slices: int, +) -> torch.Tensor | None: + """Slice one KV tensor along its head dimension (dim 1).""" + if tensor is None: + return None + if not isinstance(tensor, torch.Tensor): + return tensor + if tensor.dim() < 2: + raise ValueError(f"Expected KV tensor with a head dimension, got shape={tuple(tensor.shape)}") + if num_slices <= 0: + raise ValueError(f"num_slices must be > 0, got {num_slices}") + if not (0 <= offset_in_shard < num_slices): + raise ValueError(f"offset_in_shard must be in [0, {num_slices}), got {offset_in_shard}") + + heads_in_shard = tensor.shape[1] + if heads_in_shard % num_slices != 0: + raise ValueError( + "KV 
head count must be divisible for heterogeneous TP slicing: " + f"heads_in_shard={heads_in_shard}, num_slices={num_slices}" + ) + + heads_per_slice = heads_in_shard // num_slices + start = offset_in_shard * heads_per_slice + end = start + heads_per_slice + return tensor[:, start:end, ...].contiguous() + + +def slice_layer_blocks( + layer_blocks: dict[str, Any], + offset_in_shard: int, + num_slices: int, +) -> dict[str, list[torch.Tensor | None]]: + """Slice all KV layers for one logical receiver rank.""" + sliced_blocks: dict[str, list[torch.Tensor | None]] = {} + for cache_name in ("key_cache", "value_cache"): + cache_list = layer_blocks.get(cache_name, []) + sliced_blocks[cache_name] = [ + slice_kv_tensor_heads(tensor, offset_in_shard, num_slices) for tensor in cache_list + ] + return sliced_blocks + + +# ------------------------------------------------------------------ # +# Multi-rank merge and receiver-side slice +# ------------------------------------------------------------------ # + + +def merge_received_rank_shards( + payloads: list[dict[str, Any]], + merger: Callable | None = None, +) -> dict[str, Any] | None: + """Merge multiple source-rank KV shards for one target rank. + + When *merger* is provided (injectable hook), it is called directly. + Otherwise the default merges along the head dimension (dim 1). 
+ """ + if callable(merger): + return merger(payloads) + if not payloads: + return None + if len(payloads) == 1: + return payloads[0] + + base_payload = payloads[0] + if not isinstance(base_payload, dict) or "layer_blocks" not in base_payload: + return base_payload + + merged: dict[str, Any] = { + "request_id": base_payload.get("request_id"), + "block_ids": list(base_payload.get("block_ids", [])), + "metadata": dict(base_payload.get("metadata", {})), + } + merged_layer_blocks: dict[str, list[torch.Tensor | None]] = {} + + for cache_name in ("key_cache", "value_cache"): + cache_lists = [payload.get("layer_blocks", {}).get(cache_name, []) for payload in payloads] + num_layers = max((len(cache_list) for cache_list in cache_lists), default=0) + merged_cache: list[torch.Tensor | None] = [] + + for layer_idx in range(num_layers): + layer_tensors = [ + cache_list[layer_idx] + for cache_list in cache_lists + if layer_idx < len(cache_list) and cache_list[layer_idx] is not None + ] + if not layer_tensors: + merged_cache.append(None) + elif len(layer_tensors) == 1 or not isinstance(layer_tensors[0], torch.Tensor): + merged_cache.append(layer_tensors[0]) + else: + merged_cache.append(torch.cat(layer_tensors, dim=1).contiguous()) + + merged_layer_blocks[cache_name] = merged_cache + + merged["layer_blocks"] = merged_layer_blocks + return merged + + +def slice_received_rank_shard( + payload: dict[str, Any] | None, + topo: KVTPTopology, + slicer: Callable | None = None, +) -> dict[str, Any] | None: + """Optionally slice a received payload to extract this rank's portion. + + Used when ``to_tp > from_tp``: the sender sent full heads and each + receiver rank slices out its own subset. 
+ """ + if callable(slicer): + return slicer(payload) + if not payload or topo.target_tp_size <= topo.source_tp_size or "layer_blocks" not in payload: + return payload + + metadata = payload.get("metadata", {}) + slice_metadata = metadata.get("tp_head_slice") if isinstance(metadata, dict) else None + if isinstance(slice_metadata, dict) and slice_metadata.get("applied"): + tagged_rank = slice_metadata.get("target_rank") + if tagged_rank is not None and tagged_rank != topo.local_rank: + logger.warning( + "Received pre-sliced KV payload for unexpected target rank: expected=%s got=%s", + topo.local_rank, + tagged_rank, + ) + return payload + + ratio = topo.target_tp_size // topo.source_tp_size + offset_in_sender = topo.local_rank % ratio + updated_metadata = dict(metadata) if isinstance(metadata, dict) else {} + updated_metadata["tp_head_slice"] = { + "applied": True, + "side": "receiver", + "target_rank": topo.local_rank, + "from_tp": topo.source_tp_size, + "to_tp": topo.target_tp_size, + "offset_in_shard": offset_in_sender, + "num_slices": ratio, + } + return { + "request_id": payload.get("request_id"), + "layer_blocks": slice_layer_blocks(payload["layer_blocks"], offset_in_sender, ratio), + "block_ids": list(payload.get("block_ids", [])), + "metadata": updated_metadata, + } + + def normalize_layer_kv( layer_kv: LayerKV, *, diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 054d5342d9f..23a85e9f5f2 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -61,6 +61,7 @@ ) from vllm_omni.engine.stage_init_utils import ( StartedLlmStage, + _inject_inferred_kv_tp_topology, acquire_device_locks, build_diffusion_config, build_engine_args_dict, @@ -78,7 +79,10 @@ setup_stage_devices, terminate_alive_proc, ) -from vllm_omni.entrypoints.utils import load_and_resolve_stage_configs +from vllm_omni.entrypoints.utils import ( + inject_omni_kv_config, + load_and_resolve_stage_configs, +) from 
vllm_omni.inputs.preprocess import OmniInputPreprocessor from vllm_omni.platforms import current_omni_platform @@ -378,6 +382,12 @@ def _launch_llm_stage( omni_kv["omni_to_stage"] = omni_to omni_kv.setdefault("stage_id", metadata.stage_id) engine_args_dict["omni_kv_config"] = omni_kv + if self.stage_configs: + _inject_inferred_kv_tp_topology( + engine_args_dict.get("omni_kv_config"), + metadata.stage_id, + self.stage_configs, + ) vllm_config, executor_class = build_vllm_config( stage_cfg, self.model, @@ -747,10 +757,8 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: setup_stage_devices(configured_stage_id, metadata.runtime_cfg) omni_conn_cfg, omni_from, omni_to = omni_kv_connector if omni_conn_cfg: - from vllm_omni.entrypoints.utils import inject_omni_kv_config - inject_omni_kv_config(stage_cfg, omni_conn_cfg, omni_from, omni_to) - inject_kv_stage_info(stage_cfg, configured_stage_id) + inject_kv_stage_info(stage_cfg, configured_stage_id, self.stage_configs) if self.single_stage_mode: assert self._omni_master_server is not None stage_clients[stage_idx] = self._launch_diffusion_stage( diff --git a/vllm_omni/engine/stage_engine_core_client.py b/vllm_omni/engine/stage_engine_core_client.py index 52e674f4763..ab2de757bac 100644 --- a/vllm_omni/engine/stage_engine_core_client.py +++ b/vllm_omni/engine/stage_engine_core_client.py @@ -14,7 +14,9 @@ from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.core_client import AsyncMPClient, DPLBAsyncMPClient -from vllm_omni.distributed.omni_connectors.utils.initialization import KV_TRANSFER_PORT_OFFSET +from vllm_omni.distributed.omni_connectors.utils.initialization import ( + KV_TRANSFER_PORT_OFFSET, +) from vllm_omni.engine.stage_init_utils import StageMetadata if TYPE_CHECKING: @@ -246,6 +248,8 @@ def _initialize_kv_sender_endpoint(self) -> None: from_stage = omni_kv_config.get("omni_from_stage", from_stage) try: + # Orchestrator always reports rank-0's port; receiver + # workers add their own 
local_rank * KV_RANK_PORT_STRIDE. sender_port = int(base_port) + KV_TRANSFER_PORT_OFFSET + int(from_stage) except (TypeError, ValueError): logger.warning( @@ -284,6 +288,7 @@ def get_kv_sender_info( self._kv_sender_host = self._resolve_contact_host() if self._kv_sender_host is None: return None + # rank-0 base port; receiver workers adjust per KV_RANK_PORT_STRIDE. return { "host": self._kv_sender_host, "zmq_port": base_port + kv_transfer_port_offset + int(self.stage_id), diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 3a7fe4bad77..c697e34bac9 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -13,7 +13,7 @@ import multiprocessing as mp import os import time -from collections.abc import Callable +from collections.abc import Callable, Sequence from dataclasses import dataclass from typing import Any, Literal @@ -101,8 +101,110 @@ def resolve_worker_cls(engine_args: dict[str, Any]) -> None: raise ValueError(f"Unknown worker_type: {worker_type}") -def inject_kv_stage_info(stage_cfg: Any, stage_id: int) -> None: - """Inject stage metadata into omni_kv_config when present.""" +def _get_attr_or_item(obj: Any, key: str, default: Any = None) -> Any: + """Read *key* from *obj* regardless of whether it's a dict or object.""" + if hasattr(obj, "get"): + return obj.get(key, default) + return getattr(obj, key, default) + + +def _tp_size_for_stage(stage_configs: Sequence[Any], stage_id: Any) -> int | None: + """Resolve tensor_parallel_size for *stage_id* from the loaded stage configs.""" + id_strs = {str(stage_id)} + try: + id_strs.add(str(int(stage_id))) + except (TypeError, ValueError): + pass + + for stage_cfg in stage_configs: + if str(getattr(stage_cfg, "stage_id", None)) not in id_strs: + continue + engine_args = getattr(stage_cfg, "engine_args", None) + if engine_args is None: + return 1 + parallel_config = _get_attr_or_item(engine_args, "parallel_config") + if parallel_config is 
not None: + tp = _get_attr_or_item(parallel_config, "tensor_parallel_size", 1) + else: + tp = _get_attr_or_item(engine_args, "tensor_parallel_size", 1) + try: + return max(1, int(tp)) + except (TypeError, ValueError): + return 1 + return None + + +def _inject_inferred_kv_tp_topology( + omni_kv: Any, + stage_id: int, + stage_configs: Sequence[Any], + engine_input_source: Sequence[int] | None = None, +) -> None: + """Infer adjacent-stage TP topology and inject it into omni_kv_config. + + This keeps heterogeneous TP working without requiring user-authored + rank_mapping blocks in config files. + """ + if omni_kv is None: + return + + if hasattr(omni_kv, "get"): + need_send = bool(omni_kv.get("need_send_cache", False)) + need_recv = bool(omni_kv.get("need_recv_cache", False)) + omni_from_stage = omni_kv.get("omni_from_stage") + omni_to_stage = omni_kv.get("omni_to_stage") + rank_mapping = omni_kv.get("rank_mapping") + else: + need_send = bool(getattr(omni_kv, "need_send_cache", False)) + need_recv = bool(getattr(omni_kv, "need_recv_cache", False)) + omni_from_stage = getattr(omni_kv, "omni_from_stage", None) + omni_to_stage = getattr(omni_kv, "omni_to_stage", None) + rank_mapping = getattr(omni_kv, "rank_mapping", None) + + if not need_send and not need_recv: + return + + current_tp = _tp_size_for_stage(stage_configs, stage_id) + if current_tp is None: + return + + peer_stage_id = None + from_tp = None + to_tp = None + if str(omni_from_stage) == str(stage_id): + peer_stage_id = omni_to_stage + from_tp = current_tp + to_tp = _tp_size_for_stage(stage_configs, peer_stage_id) + elif str(omni_to_stage) == str(stage_id): + peer_stage_id = omni_from_stage + from_tp = _tp_size_for_stage(stage_configs, peer_stage_id) + to_tp = current_tp + elif need_recv and engine_input_source: + peer_stage_id = engine_input_source[0] + from_tp = _tp_size_for_stage(stage_configs, peer_stage_id) + to_tp = current_tp + + if from_tp is None or to_tp is None: + return + + if not 
isinstance(rank_mapping, dict): + rank_mapping = {} + rank_mapping.setdefault("from_tp", int(from_tp)) + rank_mapping.setdefault("to_tp", int(to_tp)) + + if hasattr(omni_kv, "__setitem__"): + omni_kv["rank_mapping"] = rank_mapping + else: + setattr(omni_kv, "rank_mapping", rank_mapping) + + +def inject_kv_stage_info(stage_cfg: Any, stage_id: int, stage_configs: Sequence[Any] | None = None) -> None: + """Inject stage_id, engine_input_source, and inferred TP topology into omni_kv_config. + + When *stage_configs* is provided, also infers from_tp/to_tp for + heterogeneous TP topologies so the KV transfer manager can compute + rank mappings automatically. + """ try: engine_args = stage_cfg.engine_args if hasattr(engine_args, "get"): @@ -125,6 +227,14 @@ def inject_kv_stage_info(stage_cfg: Any, stage_id: int) -> None: omni_kv.setdefault("engine_input_source", list(engine_input_source)) elif hasattr(omni_kv, "__setitem__") and "engine_input_source" not in omni_kv: omni_kv["engine_input_source"] = list(engine_input_source) + + if stage_configs: + _inject_inferred_kv_tp_topology( + omni_kv, + stage_id=stage_id, + stage_configs=stage_configs, + engine_input_source=engine_input_source, + ) except Exception as e: logger.debug("Failed to inject stage info into omni_kv_config: %s", e) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 39fcbc9a0aa..4b3a7045caa 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -86,6 +86,7 @@ from vllm_omni.entrypoints.openai.protocol import OmniChatCompletionStreamResponse from vllm_omni.entrypoints.openai.protocol.audio import AudioResponse, CreateAudio from vllm_omni.entrypoints.openai.utils import ( + get_stage_type, get_supported_speakers_from_hf_config, parse_lora_request, validate_requested_speaker, @@ -294,6 +295,8 @@ async def create_chat_completion( ) num_inference_steps = None + cfg_text_scale = None + cfg_img_scale = 
None # Omni multistage image generation: Stage-0 (AR) should receive a clean # text prompt (and optional conditioning image/size) so the model's own # processor can construct the correct inputs. @@ -342,6 +345,8 @@ async def create_chat_completion( except Exception: pass negative_prompt = extra_body.get("negative_prompt") + cfg_text_scale = extra_body.get("cfg_text_scale") + cfg_img_scale = extra_body.get("cfg_img_scale") engine_prompt_image: dict[str, Any] | None = None is_img2img = False @@ -397,14 +402,18 @@ async def create_chat_completion( sampling_params_list = self._build_sampling_params_list_from_request(request) # Apply user-specified overrides to diffusion stage(s) for image generation - if _image_gen_height is not None or _image_gen_width is not None or num_inference_steps is not None: - for idx, sp in enumerate(sampling_params_list): - if hasattr(sp, "height") and _image_gen_height is not None: - sp.height = _image_gen_height - if hasattr(sp, "width") and _image_gen_width is not None: - sp.width = _image_gen_width - if hasattr(sp, "num_inference_steps") and num_inference_steps is not None: - sp.num_inference_steps = num_inference_steps + for idx, sp in enumerate(sampling_params_list): + if hasattr(sp, "height") and _image_gen_height is not None: + sp.height = _image_gen_height + if hasattr(sp, "width") and _image_gen_width is not None: + sp.width = _image_gen_width + if hasattr(sp, "num_inference_steps") and num_inference_steps is not None: + sp.num_inference_steps = num_inference_steps + if hasattr(sp, "extra_args") and sp.extra_args is not None: + if cfg_text_scale is not None: + sp.extra_args["cfg_text_scale"] = cfg_text_scale + if cfg_img_scale is not None: + sp.extra_args["cfg_img_scale"] = cfg_img_scale self._log_inputs( request_id, @@ -2108,6 +2117,8 @@ async def _create_diffusion_chat_completion( num_inference_steps = extra_body.get("num_inference_steps") guidance_scale = extra_body.get("guidance_scale") true_cfg_scale = 
extra_body.get("true_cfg_scale") or extra_body.get("cfg_scale") + cfg_text_scale = extra_body.get("cfg_text_scale") + cfg_img_scale = extra_body.get("cfg_img_scale") seed = extra_body.get("seed") negative_prompt = extra_body.get("negative_prompt") num_outputs_per_prompt = extra_body.get("num_outputs_per_prompt", 1) @@ -2162,6 +2173,10 @@ async def _create_diffusion_chat_completion( gen_params.guidance_scale = guidance_scale if true_cfg_scale is not None: gen_params.true_cfg_scale = true_cfg_scale + if cfg_text_scale is not None: + gen_params.extra_args["cfg_text_scale"] = cfg_text_scale + if cfg_img_scale is not None: + gen_params.extra_args["cfg_img_scale"] = cfg_img_scale if num_frames is not None: gen_params.num_frames = num_frames if guidance_scale_2 is not None: @@ -2206,10 +2221,30 @@ async def _create_diffusion_chat_completion( # Generate image diffusion_engine = cast(AsyncOmni, self._diffusion_engine) + stage_configs = list(getattr(diffusion_engine, "stage_configs", []) or []) + default_params_list = list(getattr(diffusion_engine, "default_sampling_params_list", []) or []) + + sampling_params_list: list[Any] = [] + for idx, stage_cfg in enumerate(stage_configs): + if get_stage_type(stage_cfg) == "diffusion": + sampling_params_list.append(gen_params) + continue + + default_stage_params = default_params_list[idx] if idx < len(default_params_list) else SamplingParams() + if hasattr(default_stage_params, "clone"): + try: + default_stage_params = default_stage_params.clone() + except Exception: + pass + sampling_params_list.append(default_stage_params) + + if not sampling_params_list: + sampling_params_list = [gen_params] + result = None async for output in diffusion_engine.generate( prompt=gen_prompt, - sampling_params_list=[gen_params], # Pass as single-stage params + sampling_params_list=sampling_params_list, request_id=request_id, ): result = output diff --git a/vllm_omni/inputs/data.py b/vllm_omni/inputs/data.py index 85faf6b9499..e4c33a58c20 100644 --- 
a/vllm_omni/inputs/data.py +++ b/vllm_omni/inputs/data.py @@ -267,6 +267,10 @@ class OmniDiffusionSamplingParams: cfg_text_kv_metadata: dict[str, Any] | None = None cfg_img_kv_metadata: dict[str, Any] | None = None cfg_kv_request_ids: dict[str, str] | None = None + cfg_active_branch: str | None = None + cfg_branch_roles: list[str] | None = None + cfg_branch_past_key_values: dict[str, Any] | None = None + cfg_branch_kv_metadata: dict[str, dict[str, Any]] | None = None # Component modules modules: dict[str, Any] = field(default_factory=dict) From bd6985ee4f3295c1e967bd7accd5967777465108 Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Thu, 16 Apr 2026 15:28:42 +0800 Subject: [PATCH 67/76] refactor: init replica in stage_pool part3 Signed-off-by: ZhengWG --- vllm_omni/engine/async_omni_engine.py | 15 +++++++++++++-- vllm_omni/engine/stage_pool.py | 4 ++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 9da0cd66633..45babab44f2 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -658,6 +658,10 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: 2. Launch all stage engine processes (parallel via ThreadPoolExecutor). 3. Attach launched engines (parallel) and collect clients/processors. 4. Build StagePool list and finalize stage metadata. + + TODO(stage-pool): move per-stage launch + attach logic into a + StagePool.build_from_config() classmethod so this method only + iterates stage_configs, collects pools, and finalizes metadata. """ device_control_env = current_omni_platform.device_control_env_var num_stages = self.num_stages @@ -794,6 +798,11 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: # a stage runs remotely (doesn't match the local filter) # replica fan-out is delegated to the remote process, so # we launch exactly one client future per such stage. 
+ # TODO(stage-pool): support remote multi-replica by looping + # num_replicas times here (like the local branch below), + # calling _create_remote_llm_stage(..., replica_index=ri) + # for each. Requires OmniMasterServer protocol to support + # per-replica addressing: (stage_id, replica_index). is_remote_llm_stage = ( self.single_stage_mode and self._single_stage_id_filter is not None @@ -913,10 +922,10 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: for logical_id in range(num_stages): if logical_id in diffusion_clients: - stage_pools.append(StagePool.from_diffusion_client(logical_id, diffusion_clients[logical_id])) + stage_pools.append(StagePool.build_from_diffusion_client(logical_id, diffusion_clients[logical_id])) else: stage_pools.append( - StagePool.from_attach_results( + StagePool.build_from_replicas( logical_id, clients=stage_attach_results[logical_id], output_processors=stage_output_proc_results[logical_id], @@ -955,6 +964,8 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: # Derive flat views for external readers (entrypoints/async_omni.py). 
self.stage_clients = [sr.client for pool in stage_pools for sr in pool.replicas] + self.stage_vllm_configs = [sr.vllm_config for pool in stage_pools for sr in pool.replicas] + self.output_processors = [sr.output_processor for pool in stage_pools for sr in pool.replicas] # TODO(Peiqi): Hack here supported_tasks: set[str] = set() diff --git a/vllm_omni/engine/stage_pool.py b/vllm_omni/engine/stage_pool.py index d133b566bf8..8adc9d17bf6 100644 --- a/vllm_omni/engine/stage_pool.py +++ b/vllm_omni/engine/stage_pool.py @@ -54,7 +54,7 @@ def __init__( # ---- Construction helpers ---- @classmethod - def from_attach_results( + def build_from_replicas( cls, logical_stage_id: int, clients: Sequence[Any], @@ -81,7 +81,7 @@ def from_attach_results( return cls(logical_stage_id, stage_type, replicas) @classmethod - def from_diffusion_client( + def build_from_diffusion_client( cls, logical_stage_id: int, client: Any, From 658415e2df3659952569b04ef9d57372e320bf27 Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Thu, 16 Apr 2026 17:19:27 +0800 Subject: [PATCH 68/76] add sample yaml for multi-replica Signed-off-by: ZhengWG --- ...3_omni_moe_async_chunk_multi_replicas.yaml | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk_multi_replicas.yaml diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk_multi_replicas.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk_multi_replicas.yaml new file mode 100644 index 00000000000..b80d19460d0 --- /dev/null +++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe_async_chunk_multi_replicas.yaml @@ -0,0 +1,123 @@ +# Stage config for running Qwen3-Omni-MoE with 3-stage architecture +# and multi-replica scale-out on stage 1 (talker) and stage 2 (code2wav). 
+# +# Stage 0: Thinker (multimodal understanding + text generation) — 1 replica, GPU 0 +# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes) — 2 replicas, GPU 1+2 +# Stage 2: Code2Wav (16-layer RVQ codes → audio waveform) — 2 replicas, GPU 1+2 (shared) +# +# Hardware: 3x H20-96G GPUs (GPU 0 for thinker, GPU 1+2 shared by talker + code2wav replicas). +# Note: stage 1 and stage 2 share GPU 1+2. Code2Wav uses gpu_memory_utilization=0.1 +# so the combined footprint (talker 0.6 + code2wav 0.1) fits within a single GPU. +async_chunk: true +stage_args: + - stage_id: 0 + stage_type: llm # Use llm stage type for AR stages + runtime: + devices: "0" + engine_args: + model_stage: thinker + max_num_seqs: 64 + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.9 + enforce_eager: false + trust_remote_code: true + engine_output_type: latent # Output hidden states for talker + distributed_executor_backend: "mp" + enable_prefix_caching: false + max_num_batched_tokens: 32768 + hf_config_name: thinker_config + tensor_parallel_size: 1 + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk + final_output: true + final_output_type: text + is_comprehension: true + # Use named connector to apply runtime.connectors.extra. 
+ output_connectors: + to_stage_1: connector_of_shared_memory + default_sampling_params: + temperature: 0.4 + top_p: 0.9 + top_k: 1 + max_tokens: 2048 + seed: 42 + detokenize: True + repetition_penalty: 1.05 + + - stage_id: 1 + stage_type: llm # Use llm stage type for AR stages + runtime: + devices: "1,2" + num_replicas: 2 + engine_args: + model_stage: talker + max_num_seqs: 64 + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.6 + enforce_eager: false + trust_remote_code: true + engine_output_type: latent # Output codec codes for code2wav + enable_prefix_caching: false + max_num_batched_tokens: 32768 + distributed_executor_backend: "mp" + hf_config_name: talker_config + custom_process_next_stage_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk + engine_input_source: [0] + # final_output: true + # final_output_type: text + # Distributed connector configuration + input_connectors: + from_stage_0: connector_of_shared_memory + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 4096 + seed: 42 + detokenize: False + repetition_penalty: 1.05 + stop_token_ids: [2150] + + - stage_id: 2 + stage_type: llm # Use llm stage type for AR stages + runtime: + devices: "1,2" + num_replicas: 2 + engine_args: + model_stage: code2wav + max_num_seqs: 64 + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_type: generation + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + async_scheduling: false + enable_prefix_caching: false + engine_output_type: audio # Final output: audio waveform + gpu_memory_utilization: 0.1 + distributed_executor_backend: "mp" + max_num_batched_tokens: 51200 # [TODO] if max_num_batch_tokens < max_num_seqs * 800, there will be precision problem. 
+ hf_config_name: thinker_config + engine_input_source: [1] + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 65536 + seed: 42 + detokenize: True + repetition_penalty: 1.1 + +runtime: + + connectors: + connector_of_shared_memory: + name: SharedMemoryConnector + extra: + # Align with Omni: small chunks with sufficient context overlap. + codec_chunk_frames: 25 # code2wav decode chunk size + codec_left_context_frames: 25 # code2wav left context size From f1cb4ebe4ce200ccddb8297c88203c8da9b4fd53 Mon Sep 17 00:00:00 2001 From: fan2956 Date: Thu, 16 Apr 2026 18:21:34 +0800 Subject: [PATCH 69/76] [PERF] Wan2.2 support rmsnorm fused op (#2583) Signed-off-by: fan2956 Signed-off-by: gcanlin Co-authored-by: gcanlin --- tests/diffusion/layers/test_norm.py | 453 ++++++++++++++++++ vllm_omni/diffusion/layers/adalayernorm.py | 3 +- vllm_omni/diffusion/layers/norm.py | 110 +++++ .../models/wan2_2/wan2_2_transformer.py | 29 +- 4 files changed, 585 insertions(+), 10 deletions(-) create mode 100644 tests/diffusion/layers/test_norm.py create mode 100644 vllm_omni/diffusion/layers/norm.py diff --git a/tests/diffusion/layers/test_norm.py b/tests/diffusion/layers/test_norm.py new file mode 100644 index 00000000000..e420415285d --- /dev/null +++ b/tests/diffusion/layers/test_norm.py @@ -0,0 +1,453 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for LayerNorm and RMSNorm custom ops in diffusion layers.""" + +import pytest +import torch + +pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] + + +# ── Import tests ── + + +def test_layernorm_import(): + """Verify LayerNorm can be imported from the norm module.""" + from vllm_omni.diffusion.layers.norm import LayerNorm # noqa: F401 + + +def test_rmsnorm_import(): + """Verify RMSNorm can be imported from the norm module.""" + from 
vllm_omni.diffusion.layers.norm import RMSNorm # noqa: F401 + + +# ── LayerNorm tests ── + + +def test_layernorm_forward_shape(): + """LayerNorm produces correct output shapes.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + batch = 2 + seq_len = 4 + norm = LayerNorm(dim) + + x = torch.randn(batch, seq_len, dim) + out = norm(x) + + assert out.shape == (batch, seq_len, dim) + + +def test_layernorm_forward_shape_2d(): + """LayerNorm works with 2D input tensors.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + batch = 2 + norm = LayerNorm(dim) + + x = torch.randn(batch, dim) + out = norm(x) + + assert out.shape == (batch, dim) + + +def test_layernorm_preserves_dtype_fp32(): + """LayerNorm preserves float32 dtype.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim) + + x = torch.randn(2, 4, dim, dtype=torch.float32) + out = norm(x) + + assert out.dtype == torch.float32 + + +def test_layernorm_preserves_dtype_fp16(): + """LayerNorm preserves float16 dtype.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim) + + x = torch.randn(2, 4, dim, dtype=torch.float16) + out = norm(x) + + assert out.dtype == torch.float16 + + +def test_layernorm_preserves_dtype_bf16(): + """LayerNorm preserves bfloat16 dtype.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim) + + x = torch.randn(2, 4, dim, dtype=torch.bfloat16) + out = norm(x) + + assert out.dtype == torch.bfloat16 + + +def test_layernorm_without_elementwise_affine(): + """LayerNorm works without elementwise_affine (no learned parameters).""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim, elementwise_affine=False) + + assert norm.weight is None + assert norm.bias is None + + x = torch.randn(2, 4, dim) + out = norm(x) + + assert out.shape == (2, 4, dim) + + +def test_layernorm_custom_eps(): + """LayerNorm 
accepts custom epsilon value.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + eps = 1e-5 + norm = LayerNorm(dim, eps=eps) + + assert norm.eps == eps + + +def test_layernorm_has_learnable_parameters(): + """LayerNorm has learnable weight and bias by default.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim) + + assert norm.weight is not None + assert norm.bias is not None + assert norm.weight.shape == (dim,) + assert norm.bias.shape == (dim,) + + +def test_layernorm_matches_fp32_reference(): + """Verify LayerNorm produces identical output to FP32 nn.LayerNorm.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + eps = 1e-6 + torch.manual_seed(42) + + ours = LayerNorm(dim, eps=eps) + ref = torch.nn.LayerNorm(dim, eps=eps) + + # Copy weights + ref.weight.data.copy_(ours.weight.data) + ref.bias.data.copy_(ours.bias.data) + + x = torch.randn(2, 4, dim) + + out_ours = ours(x) + out_ref = ref(x.float()).to(x.dtype) + + torch.testing.assert_close(out_ours, out_ref, atol=1e-5, rtol=1e-5) + + +def test_layernorm_matches_diffusers_fp32layernorm(): + """Verify LayerNorm produces identical output to diffusers FP32LayerNorm.""" + from diffusers.models.normalization import FP32LayerNorm + + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + eps = 1e-6 + torch.manual_seed(42) + + ours = LayerNorm(dim, eps=eps) + ref = FP32LayerNorm(dim, eps=eps) + + # Copy weights + ref.weight.data.copy_(ours.weight.data) + ref.bias.data.copy_(ours.bias.data) + + # Test with fp16 input to verify FP32 computation + x = torch.randn(2, 4, dim, dtype=torch.float16) + + out_ours = ours(x) + out_ref = ref(x) + + torch.testing.assert_close(out_ours, out_ref, atol=1e-3, rtol=1e-3) + + +# ── RMSNorm tests ── + + +def test_rmsnorm_forward_shape(): + """RMSNorm produces correct output shapes.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + batch = 2 + seq_len = 4 + 
norm = RMSNorm(hidden_size) + + x = torch.randn(batch, seq_len, hidden_size) + out = norm(x) + + assert out.shape == (batch, seq_len, hidden_size) + + +def test_rmsnorm_forward_shape_2d(): + """RMSNorm works with 2D input tensors.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + batch = 2 + norm = RMSNorm(hidden_size) + + x = torch.randn(batch, hidden_size) + out = norm(x) + + assert out.shape == (batch, hidden_size) + + +def test_rmsnorm_preserves_dtype_fp32(): + """RMSNorm preserves float32 dtype.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + norm = RMSNorm(hidden_size) + + x = torch.randn(2, 4, hidden_size, dtype=torch.float32) + out = norm(x) + + assert out.dtype == torch.float32 + + +def test_rmsnorm_preserves_dtype_fp16(): + """RMSNorm preserves float16 dtype.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + norm = RMSNorm(hidden_size) + + x = torch.randn(2, 4, hidden_size, dtype=torch.float16) + out = norm(x) + + assert out.dtype == torch.float16 + + +def test_rmsnorm_preserves_dtype_bf16(): + """RMSNorm preserves bfloat16 dtype.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + norm = RMSNorm(hidden_size) + + x = torch.randn(2, 4, hidden_size, dtype=torch.bfloat16) + out = norm(x) + + assert out.dtype == torch.bfloat16 + + +def test_rmsnorm_custom_eps(): + """RMSNorm accepts custom epsilon value.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + eps = 1e-5 + norm = RMSNorm(hidden_size, eps=eps) + + assert norm.variance_epsilon == eps + + +def test_rmsnorm_has_weight_parameter(): + """RMSNorm has learnable weight parameter initialized to ones.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + norm = RMSNorm(hidden_size) + + assert norm.weight is not None + assert norm.weight.shape == (hidden_size,) + torch.testing.assert_close(norm.weight, torch.ones(hidden_size)) + + +def 
test_rmsnorm_numerical_correctness(): + """Verify RMSNorm produces numerically correct output.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + eps = 1e-6 + torch.manual_seed(42) + + norm = RMSNorm(hidden_size, eps=eps) + x = torch.randn(2, 4, hidden_size) + + # Compute expected output manually + x_fp32 = x.to(torch.float32) + variance = x_fp32.pow(2).mean(-1, keepdim=True) + expected = x_fp32 * torch.rsqrt(variance + eps) + expected = norm.weight.to(torch.float32) * expected + expected = expected.to(x.dtype) + + out = norm(x) + + torch.testing.assert_close(out, expected, atol=1e-5, rtol=1e-5) + + +def test_rmsnorm_matches_reference_implementation(): + """Verify RMSNorm matches a reference implementation.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + def reference_rmsnorm(x, weight, eps): + """Reference RMSNorm implementation.""" + input_dtype = x.dtype + x = x.to(torch.float32) + variance = x.pow(2).mean(-1, keepdim=True) + out = x * torch.rsqrt(variance + eps) + out = weight.to(torch.float32) * out + return out.to(input_dtype) + + hidden_size = 128 + eps = 1e-6 + torch.manual_seed(123) + + norm = RMSNorm(hidden_size, eps=eps) + + # Test with various dtypes + for dtype in [torch.float32, torch.float16, torch.bfloat16]: + x = torch.randn(4, 8, hidden_size, dtype=dtype) + expected = reference_rmsnorm(x, norm.weight, eps) + out = norm(x) + torch.testing.assert_close(out, expected, atol=1e-3, rtol=1e-3) + + +# ── CustomOp dispatch tests ── + + +def test_layernorm_inherits_from_customop(): + """LayerNorm inherits from CustomOp for platform dispatch.""" + from vllm_omni.diffusion.layers.custom_op import CustomOp + from vllm_omni.diffusion.layers.norm import LayerNorm + + norm = LayerNorm(64) + assert isinstance(norm, CustomOp) + + +def test_rmsnorm_inherits_from_customop(): + """RMSNorm inherits from CustomOp for platform dispatch.""" + from vllm_omni.diffusion.layers.custom_op import CustomOp + from 
vllm_omni.diffusion.layers.norm import RMSNorm + + norm = RMSNorm(64) + assert isinstance(norm, CustomOp) + + +def test_layernorm_has_platform_methods(): + """LayerNorm has forward methods for each platform.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + norm = LayerNorm(64) + + assert hasattr(norm, "forward_cuda") + assert hasattr(norm, "forward_hip") + assert hasattr(norm, "forward_xpu") + assert hasattr(norm, "forward_npu") + assert hasattr(norm, "forward_native") + + +def test_rmsnorm_has_platform_methods(): + """RMSNorm has forward methods for each platform.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + norm = RMSNorm(64) + + assert hasattr(norm, "forward_cuda") + assert hasattr(norm, "forward_hip") + assert hasattr(norm, "forward_xpu") + assert hasattr(norm, "forward_npu") + assert hasattr(norm, "forward_native") + + +def test_layernorm_forward_native_directly(): + """LayerNorm.forward_native can be called directly.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim) + x = torch.randn(2, 4, dim) + + out = norm.forward_native(x) + + assert out.shape == (2, 4, dim) + + +def test_rmsnorm_forward_native_directly(): + """RMSNorm.forward_native can be called directly.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + norm = RMSNorm(hidden_size) + x = torch.randn(2, 4, hidden_size) + + out = norm.forward_native(x) + + assert out.shape == (2, 4, hidden_size) + + +# ── Edge case tests ── + + +def test_layernorm_with_large_dim(): + """LayerNorm works with large hidden dimensions.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 4096 + norm = LayerNorm(dim) + x = torch.randn(1, 16, dim) + + out = norm(x) + + assert out.shape == (1, 16, dim) + + +def test_rmsnorm_with_large_dim(): + """RMSNorm works with large hidden dimensions.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 4096 + norm = RMSNorm(hidden_size) + x = 
torch.randn(1, 16, hidden_size) + + out = norm(x) + + assert out.shape == (1, 16, hidden_size) + + +def test_layernorm_with_single_element_batch(): + """LayerNorm works with batch size of 1.""" + from vllm_omni.diffusion.layers.norm import LayerNorm + + dim = 64 + norm = LayerNorm(dim) + x = torch.randn(1, 1, dim) + + out = norm(x) + + assert out.shape == (1, 1, dim) + + +def test_rmsnorm_with_single_element_batch(): + """RMSNorm works with batch size of 1.""" + from vllm_omni.diffusion.layers.norm import RMSNorm + + hidden_size = 64 + norm = RMSNorm(hidden_size) + x = torch.randn(1, 1, hidden_size) + + out = norm(x) + + assert out.shape == (1, 1, hidden_size) diff --git a/vllm_omni/diffusion/layers/adalayernorm.py b/vllm_omni/diffusion/layers/adalayernorm.py index 4d70ed52f71..d147bdcfeb6 100644 --- a/vllm_omni/diffusion/layers/adalayernorm.py +++ b/vllm_omni/diffusion/layers/adalayernorm.py @@ -7,6 +7,7 @@ from vllm.model_executor.layers.linear import ReplicatedLinear from vllm_omni.diffusion.layers.custom_op import CustomOp +from vllm_omni.diffusion.layers.norm import LayerNorm if TYPE_CHECKING: from vllm.model_executor.layers.quantization.base_config import QuantizationConfig @@ -27,7 +28,7 @@ def __init__(self, hidden_size: int, elementwise_affine: bool = False, eps: floa self.eps = eps self.elementwise_affine = elementwise_affine self.hidden_size = hidden_size - self.layernorm = nn.LayerNorm(self.hidden_size, elementwise_affine=self.elementwise_affine, eps=self.eps) + self.layernorm = LayerNorm(self.hidden_size, elementwise_affine=self.elementwise_affine, eps=self.eps) def forward_cuda( self, diff --git a/vllm_omni/diffusion/layers/norm.py b/vllm_omni/diffusion/layers/norm.py new file mode 100644 index 00000000000..6096ad7c370 --- /dev/null +++ b/vllm_omni/diffusion/layers/norm.py @@ -0,0 +1,110 @@ +from importlib.util import find_spec + +import torch +import torch.nn as nn +import torch.nn.functional as F +from vllm.logger import init_logger + +from 
vllm_omni.diffusion.layers.custom_op import CustomOp + +logger = init_logger(__name__) + +_HAS_MINDIESD = find_spec("mindiesd") is not None + + +class LayerNorm(nn.LayerNorm, CustomOp): + """ + LayerNorm implementation that inherits from both ``nn.LayerNorm`` and ``CustomOp``. + NPU: + Uses ``mindiesd.fast_layernorm(self, x)`` when MindIE-SD is installed. + CUDA / HIP / XPU / native: + Falls back to FP32 nn.LayerNorm implementation. + """ + + def __init__(self, dim: int, eps: float = 1e-6, elementwise_affine: bool = True): + super().__init__(normalized_shape=dim, eps=eps, elementwise_affine=elementwise_affine) + # CustomOp.__init__ cannot be called here because it would re-run + # nn.Module initialization and clear LayerNorm parameters. + self._forward_method = CustomOp.dispatch_forward(self) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self._forward_method(x) + + def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + return self.forward_native(x) + + def forward_hip(self, x: torch.Tensor) -> torch.Tensor: + return self.forward_native(x) + + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: + return self.forward_native(x) + + def forward_npu(self, x: torch.Tensor) -> torch.Tensor: + if _HAS_MINDIESD: + try: + from mindiesd import fast_layernorm + + return fast_layernorm(self, x) + except ImportError as e: + logger.warning_once( + "mindiesd.fast_layernorm import failed, falling back to FP32 layer_norm: %s", + e, + ) + + return self.forward_native(x) + + def forward_native(self, x: torch.Tensor) -> torch.Tensor: + origin_dtype = x.dtype + return F.layer_norm( + x.float(), + self.normalized_shape, + self.weight.float() if self.weight is not None else None, + self.bias.float() if self.bias is not None else None, + self.eps, + ).to(origin_dtype) + + +class RMSNorm(CustomOp): + def __init__(self, hidden_size: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = 
eps + + def forward_cuda( + self, + x: torch.Tensor, + ) -> torch.Tensor: + return self.forward_native(x) + + def forward_hip( + self, + x: torch.Tensor, + ) -> torch.Tensor: + return self.forward_native(x) + + def forward_npu( + self, + x: torch.Tensor, + ) -> torch.Tensor: + import torch_npu + + output = torch_npu.npu_rms_norm(x, gamma=self.weight, epsilon=self.variance_epsilon)[0] + + return output + + def forward_xpu( + self, + x: torch.Tensor, + ) -> torch.Tensor: + return self.forward_native(x) + + def forward_native( + self, + x: torch.Tensor, + ) -> torch.Tensor: + input_dtype = x.dtype + x = x.to(torch.float32) + variance = x.pow(2).mean(-1, keepdim=True) + out = x * torch.rsqrt(variance + self.variance_epsilon) + out = self.weight.to(torch.float32) * out + return out.to(input_dtype) diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py index b870193a140..d4d81b78eb8 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py @@ -11,7 +11,6 @@ from diffusers.models.attention import FeedForward from diffusers.models.embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps from diffusers.models.modeling_outputs import Transformer2DModelOutput -from diffusers.models.normalization import FP32LayerNorm from vllm.distributed import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -30,6 +29,7 @@ ) from vllm_omni.diffusion.forward_context import get_forward_context from vllm_omni.diffusion.layers.adalayernorm import AdaLayerNorm +from vllm_omni.diffusion.layers.norm import LayerNorm, RMSNorm from vllm_omni.platforms import current_omni_platform logger = init_logger(__name__) @@ -236,9 +236,9 @@ class WanImageEmbedding(nn.Module): def __init__(self, in_features: int, out_features: int, pos_embed_seq_len: int | None = None): super().__init__() - self.norm1 = 
FP32LayerNorm(in_features) + self.norm1 = LayerNorm(in_features) self.ff = FeedForward(in_features, out_features, mult=1, activation_fn="gelu") - self.norm2 = FP32LayerNorm(out_features) + self.norm2 = LayerNorm(out_features) if pos_embed_seq_len is not None: self.pos_embed = nn.Parameter(torch.zeros(1, pos_embed_seq_len, in_features)) else: @@ -378,8 +378,12 @@ def __init__( self.tp_inner_dim = self.num_heads * head_dim # QK normalization using vLLM's RMSNorm - self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) - self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + if get_tensor_model_parallel_world_size() > 1: + self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + else: + self.norm_q = RMSNorm(self.tp_inner_dim, eps=eps) + self.norm_k = RMSNorm(self.tp_inner_dim, eps=eps) self.to_out = RowParallelLinear( self.inner_dim, @@ -498,8 +502,12 @@ def __init__( self.tp_inner_dim = self.num_heads * head_dim # QK normalization - self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) - self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + if get_tensor_model_parallel_world_size() > 1: + self.norm_q = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + self.norm_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + else: + self.norm_q = RMSNorm(self.tp_inner_dim, eps=eps) + self.norm_k = RMSNorm(self.tp_inner_dim, eps=eps) # Optional added KV projections for I2V (image embeddings) self.added_kv_proj_dim = added_kv_proj_dim @@ -518,7 +526,10 @@ def __init__( gather_output=False, return_bias=False, ) - self.norm_added_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + if get_tensor_model_parallel_world_size() > 1: + self.norm_added_k = DistributedRMSNorm(self.tp_inner_dim, eps=eps) + else: + self.norm_added_k = RMSNorm(self.tp_inner_dim, eps=eps) else: self.add_k_proj = None self.add_v_proj = None @@ -637,7 +648,7 @@ def __init__( eps=eps, 
added_kv_proj_dim=added_kv_proj_dim, ) - self.norm2 = FP32LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity() + self.norm2 = LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity() # 3. Feed-forward self.ffn = WanFeedForward(dim=dim, inner_dim=ffn_dim, dim_out=dim) From e8658b55d14482cdd30b5ee9cc2b6ca8e81d3f15 Mon Sep 17 00:00:00 2001 From: John Liu BUAA Date: Thu, 16 Apr 2026 18:49:59 +0800 Subject: [PATCH 70/76] [Test] Add performance tests for Qwen-Image-Layered model (#2807) Signed-off-by: John Liu BUAA --- .buildkite/test-nightly.yml | 4 +- .../test_qwen_image_layered_vllm_omni.json | 49 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 58e1e55af7f..ac43b597d15 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -415,7 +415,9 @@ steps: EXIT2=$$? pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json EXIT3=$$? - if [ $$EXIT1 -eq 0 ] || [ $$EXIT2 -eq 0 ] || [ $$EXIT3 -eq 0 ]; then + pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json + EXIT4=$$? 
+ if [ $$EXIT1 -eq 0 ] || [ $$EXIT2 -eq 0 ] || [ $$EXIT3 -eq 0 ] || [ $$EXIT4 -eq 0 ]; then buildkite-agent artifact upload "tests/dfx/perf/results/diffusion_result_*.json" buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log" fi diff --git a/tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json b/tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json new file mode 100644 index 00000000000..3cf13509c8d --- /dev/null +++ b/tests/dfx/perf/tests/test_qwen_image_layered_vllm_omni.json @@ -0,0 +1,49 @@ +[ + { + "test_name": "test_qwen_image_layered_single_device", + "description": "Single-device baseline", + "server_type": "vllm-omni", + "server_params": { + "model": "Qwen/Qwen-Image-Layered", + "serve_args": { + "enable-diffusion-pipeline-profiler": true + } + }, + "benchmark_params": [ + { + "name": "640x640_steps20_i2i", + "dataset": "random", + "task": "i2i", + "width": 640, + "height": 640, + "num-inference-steps": 20, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.02, + "latency_mean": 40.0, + "peak_memory_mb_max": 70000, + "peak_memory_mb_mean": 70000 + } + }, + { + "name": "1024x1024_steps35_i2i", + "dataset": "random", + "task": "i2i", + "width": 1024, + "height": 1024, + "num-inference-steps": 35, + "num-prompts": 10, + "max-concurrency": 1, + "enable-negative-prompt": true, + "baseline": { + "throughput_qps": 0.005, + "latency_mean": 80.0, + "peak_memory_mb_max": 70000, + "peak_memory_mb_mean": 70000 + } + } + ] + } +] From dab0720af2374d9594545d83de374063d355e648 Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Thu, 16 Apr 2026 18:31:31 +0800 Subject: [PATCH 71/76] clean code Signed-off-by: ZhengWG --- vllm_omni/engine/async_omni_engine.py | 19 ++------- vllm_omni/engine/orchestrator.py | 57 ++++++++++----------------- 2 files changed, 25 insertions(+), 51 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 
94200019abf..5fd362ecc79 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -806,17 +806,12 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: # a stage runs remotely (doesn't match the local filter) # replica fan-out is delegated to the remote process, so # we launch exactly one client future per such stage. - # TODO(stage-pool): support remote multi-replica by looping - # num_replicas times here (like the local branch below), - # calling _create_remote_llm_stage(..., replica_index=ri) - # for each. Requires OmniMasterServer protocol to support - # per-replica addressing: (stage_id, replica_index). - is_remote_llm_stage = ( + # TODO: support remote multi-replica with stage-pool + if ( self.single_stage_mode and self._single_stage_id_filter is not None and configured_stage_id != self._single_stage_id_filter - ) - if is_remote_llm_stage: + ): assert self._omni_master_server is not None if num_replicas > 1: logger.warning( @@ -1083,10 +1078,6 @@ def _build_add_request_message( original_prompt = prompt stage_type = self.stage_metadata[0].get("stage_type") - # Text forwarded to the stage-0 output processor at registration time. - # Populated only on the LLM path below; for diffusion / pre-built - # EngineCoreRequest paths it stays None (the orchestrator's admit() - # still works — prompt text is optional on add_request). output_prompt_text: Any = None if stage_type != "diffusion" and not isinstance(prompt, EngineCoreRequest): # Inject global_request_id into the raw prompt. @@ -1129,9 +1120,7 @@ def _build_add_request_message( # Registration with stage 0's output processor is deferred to the # orchestrator thread (see Orchestrator._handle_add_request). The # orchestrator must know which replica it picked via select_replica - # before it can register on the correct per-replica processor; a - # hardcoded ``output_processors[0]`` here would misalign for any - # stage-0 request routed to replica > 0. 
+ # before it can register on the correct per-replica processor. output_prompt_text = prompt_text if output_prompt_text is None and isinstance(original_prompt, dict): output_prompt_text = original_prompt.get("prompt") diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index b3b0fefe924..bfb59ebef53 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -117,8 +117,7 @@ class OrchestratorRequestState: # Multi-replica: maps logical_stage_id -> StageReplica chosen for this # request. Ensures the same request always hits the same replica within # a given logical stage (KV / intermediate-state affinity + processor - # alignment). Stored as an object handle, not an int, so pool internals - # (flat index / replica index) stay encapsulated. + # alignment). chosen_replica: dict[int, StageReplica] = field(default_factory=dict) @@ -144,13 +143,9 @@ def __init__( self.async_chunk = bool(async_chunk) - self.num_logical_stages = len(stage_pools) - + self.num_stages = len(stage_pools) self.stage_pools: list[StagePool] = stage_pools - # Backward compat: num_stages now means num_logical_stages - self.num_stages = self.num_logical_stages - # Per-request state self.request_states: dict[str, OrchestratorRequestState] = {} @@ -210,14 +205,7 @@ async def run(self) -> None: await asyncio.gather(*pending, return_exceptions=True) async def _request_handler(self) -> None: - """Read messages from the main thread via request_async_queue. - - TODO(stage-pool): the while loop below has no top-level try/except, so - any unhandled exception inside a _handle_* coroutine kills this task - and leaves the orchestrator unable to consume further messages. Wrap - each dispatch in a per-message try/except so one bad request can't - wedge the whole engine. 
- """ + """Read messages from the main thread via request_async_queue.""" while True: msg = await self.request_async_queue.get() msg_type = msg.get("type") @@ -265,6 +253,9 @@ async def _orchestration_loop(self) -> None: stage_id = stage_replica.logical_stage_id # 1) Diffusion stage: poll non-blocking queue + # TODO (Peiqi): the output of diffusion stage is OmniRequestOutput, + # which is different from EngineCoreOutputs (LLM stages). We may want to unify + # the output format in the future to simplify the processing logic in Orchestrator. if stage_replica.client.stage_type == "diffusion": output = stage_replica.client.get_diffusion_output_nowait() if output is not None: @@ -666,7 +657,6 @@ async def _forward_to_next_stage( req_state.stage_submit_ts[next_logical] = _time.time() return - # Set outputs on the client that actually produced them stage_replica.client.set_engine_outputs([output]) # Process inputs for next stage @@ -748,7 +738,7 @@ async def _process_stage_outputs( async def _handle_add_request(self, msg: dict[str, Any]) -> None: """Handle an add_request message from the main thread.""" - logical_stage_id = 0 + stage_id = 0 request_id = msg["request_id"] prompt = msg["prompt"] original_prompt = msg.get("original_prompt", prompt) @@ -762,7 +752,7 @@ async def _handle_add_request(self, msg: dict[str, Any]) -> None: "[Orchestrator] _handle_add_request: stage=%s req=%s " "prompt_type=%s original_prompt_type=%s final_stage=%s " "num_sampling_params=%d", - logical_stage_id, + stage_id, request_id, type(prompt).__name__, type(original_prompt).__name__, @@ -780,10 +770,10 @@ async def _handle_add_request(self, msg: dict[str, Any]) -> None: ) self.request_states[request_id] = req_state - req_state.stage_submit_ts[logical_stage_id] = _time.time() + req_state.stage_submit_ts[stage_id] = _time.time() request = prompt - stage0_pool = self.stage_pools[logical_stage_id] + stage0_pool = self.stage_pools[stage_id] # Diffusion: no output_processor on stage 0, just select + 
submit. if stage0_pool.replicas[0].client.stage_type == "diffusion": @@ -800,14 +790,11 @@ async def _handle_add_request(self, msg: dict[str, Any]) -> None: # LLM: atomically pick a stage replica and register on its # output processor so select + register + submit all target # the same replica. - # TODO(stage-pool): when _request_handler gets per-message error - # handling, add rollback here (abort_requests + state cleanup) - # so a failed submit releases resources without killing the loop. output_prompt_text = msg.get("output_prompt_text") stage_replica = stage0_pool.admit(req_state, request, output_prompt_text) await stage_replica.client.add_request_async(request) - if self.async_chunk and logical_stage_id == 0 and final_stage_id > 0: + if self.async_chunk and stage_id == 0 and final_stage_id > 0: await self._prewarm_async_chunk_stages(request_id, request, req_state) async def _handle_streaming_update(self, msg: dict[str, Any]) -> None: @@ -881,12 +868,14 @@ async def _prewarm_async_chunk_stages( base_input["multi_modal_data"] = None base_input["mm_processor_kwargs"] = None - for next_logical in range(1, req_state.final_stage_id + 1): - next_replica = self.stage_pools[next_logical].select_replica(req_state) - params = req_state.sampling_params_list[next_logical] + for next_stage_id in range(1, req_state.final_stage_id + 1): + next_replica = self.stage_pools[next_stage_id].select_replica(req_state) + params = req_state.sampling_params_list[next_stage_id] if next_replica.client.stage_type == "diffusion": - source_stage_ids = list(getattr(next_replica.client, "engine_input_source", None) or [next_logical - 1]) + source_stage_ids = list( + getattr(next_replica.client, "engine_input_source", None) or [next_stage_id - 1] + ) kv_sender_info = self._build_kv_sender_info(sender_stage_ids=source_stage_ids) await next_replica.client.add_request_async( request_id, @@ -894,7 +883,7 @@ async def _prewarm_async_chunk_stages( params, kv_sender_info=kv_sender_info, ) - 
req_state.stage_submit_ts[next_logical] = _time.time() + req_state.stage_submit_ts[next_stage_id] = _time.time() continue request = build_engine_core_request_from_tokens( @@ -913,7 +902,7 @@ async def _prewarm_async_chunk_stages( queue=None, ) await next_replica.client.add_request_async(request) - req_state.stage_submit_ts[next_logical] = _time.time() + req_state.stage_submit_ts[next_stage_id] = _time.time() async def _handle_add_companion(self, msg: dict[str, Any]) -> None: """Handle an add_companion_request message: submit companion to stage 0.""" @@ -961,10 +950,6 @@ async def _handle_add_companion(self, msg: dict[str, Any]) -> None: companion_prompt_text, affinity_from=parent_replica, ) - # TODO(stage-pool): when _request_handler gets per-message error - # handling, add rollback here (abort_requests + companion state - # cleanup) so a failed submit releases resources without killing - # the orchestrator loop. await companion_replica.client.add_request_async(request) logger.info( @@ -1019,9 +1004,9 @@ async def _handle_collective_rpc(self, msg: dict[str, Any]) -> None: target_replicas.extend(pool.replicas) else: for lid in requested_stage_ids: - if 0 <= lid < self.num_logical_stages: + if 0 <= lid < self.num_stages: target_replicas.extend(self.stage_pools[lid].replicas) - # else: silently skip invalid stage ids + # TODO: currently silently skip invalid stage ids, need to add error handling here. 
results: list[Any] = [] stage_ids: list[int] = [] From 05e7e7138c6c7f333c1b56f625505afe44797247 Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Thu, 16 Apr 2026 20:24:50 +0800 Subject: [PATCH 72/76] refactor: keep name style & keep stage_pool clean Signed-off-by: ZhengWG --- vllm_omni/engine/async_omni_engine.py | 3 +- vllm_omni/engine/stage_init_utils.py | 51 +++++++++++++++++++++ vllm_omni/engine/stage_pool.py | 65 +++------------------------ 3 files changed, 59 insertions(+), 60 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 5fd362ecc79..87e6f8cb5e7 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -69,6 +69,7 @@ build_vllm_config, cleanup_failed_stage_initialization, close_started_llm_stage, + compute_replica_layout, extract_stage_metadata, finalize_initialized_stages, get_stage_connector_spec, @@ -80,7 +81,7 @@ setup_stage_devices, terminate_alive_proc, ) -from vllm_omni.engine.stage_pool import StagePool, compute_replica_layout +from vllm_omni.engine.stage_pool import StagePool from vllm_omni.entrypoints.utils import ( inject_omni_kv_config, load_and_resolve_stage_configs, diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 1771f6621b9..6d1712d9d47 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -425,6 +425,57 @@ def get_stage_tp_size(stage_cfg: Any) -> int: return int(getattr(engine_args, "tensor_parallel_size", 1) or 1) +def compute_replica_layout( + stage_configs: Sequence[Any], +) -> tuple[list[int], dict[int, list[str]], int]: + """Compute per-stage replica counts and device assignments. + + Returns: + replicas_per_stage: num_replicas per logical stage. + replica_devices_map: stage_idx -> per-replica device strings + (only for stages with num_replicas > 1). + total_llm_replicas: total LLM replica count across all stages. 
+ """ + replicas_per_stage: list[int] = [] + for stage_cfg in stage_configs: + runtime_cfg = getattr(stage_cfg, "runtime", {}) + num_replicas = int( + runtime_cfg.get("num_replicas", 1) + if hasattr(runtime_cfg, "get") + else getattr(runtime_cfg, "num_replicas", 1) + ) + replicas_per_stage.append(max(1, num_replicas)) + + replica_devices_map: dict[int, list[str]] = {} + for logical_id, stage_cfg in enumerate(stage_configs): + num_replicas = replicas_per_stage[logical_id] + if num_replicas <= 1: + continue + runtime_cfg = getattr(stage_cfg, "runtime", {}) + devices_str = ( + runtime_cfg.get("devices") if hasattr(runtime_cfg, "get") else getattr(runtime_cfg, "devices", None) + ) + tp_size = get_stage_tp_size(stage_cfg) + replica_devices_map[logical_id] = split_devices_for_replicas( + devices_str, + num_replicas, + tp_size, + logical_id, + ) + logger.info( + "[stage_init] Stage %s: %d replicas, tp=%d, devices split: %s", + logical_id, + num_replicas, + tp_size, + replica_devices_map[logical_id], + ) + + total_llm_replicas = sum( + replicas_per_stage[i] for i, cfg in enumerate(stage_configs) if getattr(cfg, "stage_type", "llm") != "diffusion" + ) + return replicas_per_stage, replica_devices_map, total_llm_replicas + + def setup_stage_devices(stage_id: int, runtime_cfg: Any) -> None: """Device mapping via set_stage_devices for a single stage.""" physical_devices = set_stage_devices( diff --git a/vllm_omni/engine/stage_pool.py b/vllm_omni/engine/stage_pool.py index 8adc9d17bf6..ef41ed985ad 100644 --- a/vllm_omni/engine/stage_pool.py +++ b/vllm_omni/engine/stage_pool.py @@ -1,7 +1,7 @@ -"""StagePool: per-logical-stage replica container. +"""StagePool: per-stage replica container. Groups the {client, output_processor, vllm_config} triple of each replica -under a single logical stage and centralizes replica selection (round-robin +under a single stage and centralizes replica selection (round-robin + per-request affinity). 
""" @@ -49,7 +49,7 @@ def __init__( self.logical_stage_id = logical_stage_id self.stage_type = stage_type self.replicas: list[StageReplica] = replicas - self._rr_cursor = 0 + self._next_replica_idx = 0 # ---- Construction helpers ---- @@ -61,7 +61,7 @@ def build_from_replicas( output_processors: Sequence[Any], vllm_configs: Sequence[Any], ) -> StagePool: - """Build a pool from parallel lists returned by _attach_llm_stage. + """Build a pool from parallel replica lists. Each positional index corresponds to one replica of the same logical stage. The first replica's ``client.stage_type`` is used as the @@ -134,8 +134,8 @@ def select_replica( elif self.num_replicas == 1: chosen = self.replicas[0] else: - chosen = self.replicas[self._rr_cursor % self.num_replicas] - self._rr_cursor += 1 + chosen = self.replicas[self._next_replica_idx] + self._next_replica_idx = (self._next_replica_idx + 1) % self.num_replicas req_state.chosen_replica[self.logical_stage_id] = chosen return chosen @@ -163,56 +163,3 @@ def admit( queue=None, ) return stage_replica - - -def compute_replica_layout( - stage_configs: Sequence[Any], -) -> tuple[list[int], dict[int, list[str]], int]: - """Compute per-stage replica counts and device assignments. - - Returns: - replicas_per_stage: num_replicas per logical stage. - replica_devices_map: stage_idx -> per-replica device strings - (only for stages with num_replicas > 1). - total_llm_replicas: total LLM replica count across all stages. 
- """ - from vllm_omni.engine.stage_init_utils import get_stage_tp_size, split_devices_for_replicas - - replicas_per_stage: list[int] = [] - for stage_cfg in stage_configs: - runtime_cfg = getattr(stage_cfg, "runtime", {}) - num_replicas = int( - runtime_cfg.get("num_replicas", 1) - if hasattr(runtime_cfg, "get") - else getattr(runtime_cfg, "num_replicas", 1) - ) - replicas_per_stage.append(max(1, num_replicas)) - - replica_devices_map: dict[int, list[str]] = {} - for logical_id, stage_cfg in enumerate(stage_configs): - num_replicas = replicas_per_stage[logical_id] - if num_replicas <= 1: - continue - runtime_cfg = getattr(stage_cfg, "runtime", {}) - devices_str = ( - runtime_cfg.get("devices") if hasattr(runtime_cfg, "get") else getattr(runtime_cfg, "devices", None) - ) - tp_size = get_stage_tp_size(stage_cfg) - replica_devices_map[logical_id] = split_devices_for_replicas( - devices_str, - num_replicas, - tp_size, - logical_id, - ) - logger.info( - "[StagePool] Stage %s: %d replicas, tp=%d, devices split: %s", - logical_id, - num_replicas, - tp_size, - replica_devices_map[logical_id], - ) - - total_llm_replicas = sum( - replicas_per_stage[i] for i, cfg in enumerate(stage_configs) if getattr(cfg, "stage_type", "llm") != "diffusion" - ) - return replicas_per_stage, replica_devices_map, total_llm_replicas From 7d47ccf538cd5c8038b7ab020fa53ffc82513759 Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Thu, 16 Apr 2026 20:46:39 +0800 Subject: [PATCH 73/76] refactor: keep stage_id variable name Signed-off-by: ZhengWG --- vllm_omni/engine/async_omni_engine.py | 14 ++++++------ vllm_omni/engine/orchestrator.py | 25 ++++++++++----------- vllm_omni/engine/stage_init_utils.py | 20 +++++++---------- vllm_omni/engine/stage_pool.py | 32 +++++++++++++-------------- 4 files changed, 43 insertions(+), 48 deletions(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 87e6f8cb5e7..b11ff1123a7 100644 --- a/vllm_omni/engine/async_omni_engine.py 
+++ b/vllm_omni/engine/async_omni_engine.py @@ -924,16 +924,16 @@ def _initialize_stages(self, stage_init_timeout: int) -> None: input_processor, ) - for logical_id in range(num_stages): - if logical_id in diffusion_clients: - stage_pools.append(StagePool.build_from_diffusion_client(logical_id, diffusion_clients[logical_id])) + for stage_id in range(num_stages): + if stage_id in diffusion_clients: + stage_pools.append(StagePool.build_from_diffusion_client(stage_id, diffusion_clients[stage_id])) else: stage_pools.append( StagePool.build_from_replicas( - logical_id, - clients=stage_attach_results[logical_id], - output_processors=stage_output_proc_results[logical_id], - vllm_configs=stage_vllm_cfg_results[logical_id], + stage_id, + clients=stage_attach_results[stage_id], + output_processors=stage_output_proc_results[stage_id], + vllm_configs=stage_vllm_cfg_results[stage_id], ) ) diff --git a/vllm_omni/engine/orchestrator.py b/vllm_omni/engine/orchestrator.py index bfb59ebef53..3ef272931cf 100644 --- a/vllm_omni/engine/orchestrator.py +++ b/vllm_omni/engine/orchestrator.py @@ -114,10 +114,9 @@ class OrchestratorRequestState: # Metrics: timestamp when request was submitted to each stage stage_submit_ts: dict[int, float] = field(default_factory=dict) - # Multi-replica: maps logical_stage_id -> StageReplica chosen for this - # request. Ensures the same request always hits the same replica within - # a given logical stage (KV / intermediate-state affinity + processor - # alignment). + # Multi-replica: maps stage_id -> StageReplica chosen for this request. + # Ensures the same request always hits the same replica within a given + # stage (KV / intermediate-state affinity + processor alignment). 
chosen_replica: dict[int, StageReplica] = field(default_factory=dict) @@ -250,7 +249,7 @@ async def _orchestration_loop(self) -> None: if self._shutdown_event.is_set(): return - stage_id = stage_replica.logical_stage_id + stage_id = stage_replica.stage_id # 1) Diffusion stage: poll non-blocking queue # TODO (Peiqi): the output of diffusion stage is OmniRequestOutput, @@ -370,7 +369,7 @@ async def _route_output( stage_metrics: Any, ) -> None: """Route a processed output: send to main thread and/or forward to next stage.""" - stage_id = stage_replica.logical_stage_id + stage_id = stage_replica.stage_id req_id = output.request_id finished = output.finished submit_ts = req_state.stage_submit_ts.get(stage_id) @@ -477,7 +476,7 @@ async def _handle_kv_ready_raw_outputs( """Forward split requests once stage-0 KV is ready, not only when decode fully finishes.""" if self.async_chunk: return - stage_id = stage_replica.logical_stage_id + stage_id = stage_replica.stage_id for raw_output in raw_outputs.outputs: kv_params = getattr(raw_output, "kv_transfer_params", None) if not (isinstance(kv_params, dict) and kv_params.get("kv_ready")): @@ -519,7 +518,7 @@ def _build_stage_metrics( Reuses StageRequestMetrics so OrchestratorMetrics and downstream metric handlers can consume a stable schema. """ - stage_id = stage_replica.logical_stage_id + stage_id = stage_replica.stage_id now = _time.time() submit_ts = req_state.stage_submit_ts.get(stage_id, now) stage_gen_time_ms = (now - submit_ts) * 1000.0 @@ -602,7 +601,7 @@ async def _forward_to_next_stage( Handles the full pipeline: set outputs on current stage, compute next-stage inputs, build lightweight requests, and submit them. 
""" - stage_id = stage_replica.logical_stage_id + stage_id = stage_replica.stage_id next_logical = stage_id + 1 next_pool = self.stage_pools[next_logical] next_replica = next_pool.select_replica(req_state) @@ -1011,7 +1010,7 @@ async def _handle_collective_rpc(self, msg: dict[str, Any]) -> None: results: list[Any] = [] stage_ids: list[int] = [] for stage_replica in target_replicas: - stage_ids.append(stage_replica.logical_stage_id) + stage_ids.append(stage_replica.stage_id) try: if hasattr(stage_replica.client, "collective_rpc_async"): stage_result = await stage_replica.client.collective_rpc_async( @@ -1031,7 +1030,7 @@ async def _handle_collective_rpc(self, msg: dict[str, Any]) -> None: except Exception as exc: logger.exception( "[Orchestrator] collective_rpc failed: stage=%s replica=%s method=%s", - stage_replica.logical_stage_id, + stage_replica.stage_id, stage_replica.replica_index, method, ) @@ -1066,13 +1065,13 @@ def _shutdown_stages(self) -> None: stage_replica.client.shutdown() logger.info( "[Orchestrator] Stage %d replica %d shut down", - stage_replica.logical_stage_id, + stage_replica.stage_id, stage_replica.replica_index, ) except Exception as e: logger.warning( "[Orchestrator] Failed to shutdown stage %d replica %d: %s", - stage_replica.logical_stage_id, + stage_replica.stage_id, stage_replica.replica_index, e, ) diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py index 6d1712d9d47..cc9f33eec0d 100644 --- a/vllm_omni/engine/stage_init_utils.py +++ b/vllm_omni/engine/stage_init_utils.py @@ -257,10 +257,8 @@ class StageMetadata: runtime_cfg: Any prompt_expand_func: Callable | None = None cfg_kv_collect_func: Callable | None = None - # Multi-replica fields: logical_stage_id is the original stage_id from - # the YAML config; replica_index distinguishes replicas of the same - # logical stage. For single-replica stages these default to stage_id / 0. 
- logical_stage_id: int = -1 + # Multi-replica: replica_index distinguishes replicas of the same stage. + # For single-replica stages this defaults to 0. replica_index: int = 0 @@ -339,7 +337,6 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata: model_stage=None, runtime_cfg=runtime_cfg, cfg_kv_collect_func=cfg_kv_collect_func, - logical_stage_id=stage_id, ) model_stage = getattr(engine_args, "model_stage", None) @@ -361,7 +358,6 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata: model_stage=model_stage, runtime_cfg=runtime_cfg, prompt_expand_func=prompt_expand_func, - logical_stage_id=stage_id, ) @@ -447,8 +443,8 @@ def compute_replica_layout( replicas_per_stage.append(max(1, num_replicas)) replica_devices_map: dict[int, list[str]] = {} - for logical_id, stage_cfg in enumerate(stage_configs): - num_replicas = replicas_per_stage[logical_id] + for stage_id, stage_cfg in enumerate(stage_configs): + num_replicas = replicas_per_stage[stage_id] if num_replicas <= 1: continue runtime_cfg = getattr(stage_cfg, "runtime", {}) @@ -456,18 +452,18 @@ def compute_replica_layout( runtime_cfg.get("devices") if hasattr(runtime_cfg, "get") else getattr(runtime_cfg, "devices", None) ) tp_size = get_stage_tp_size(stage_cfg) - replica_devices_map[logical_id] = split_devices_for_replicas( + replica_devices_map[stage_id] = split_devices_for_replicas( devices_str, num_replicas, tp_size, - logical_id, + stage_id, ) logger.info( "[stage_init] Stage %s: %d replicas, tp=%d, devices split: %s", - logical_id, + stage_id, num_replicas, tp_size, - replica_devices_map[logical_id], + replica_devices_map[stage_id], ) total_llm_replicas = sum( diff --git a/vllm_omni/engine/stage_pool.py b/vllm_omni/engine/stage_pool.py index ef41ed985ad..1ecab6e5f94 100644 --- a/vllm_omni/engine/stage_pool.py +++ b/vllm_omni/engine/stage_pool.py @@ -28,7 +28,7 @@ class StageReplica: per-replica metrics accumulators). 
""" - logical_stage_id: int + stage_id: int replica_index: int client: Any output_processor: Any @@ -40,13 +40,13 @@ class StagePool: def __init__( self, - logical_stage_id: int, + stage_id: int, stage_type: str | None, replicas: list[StageReplica], ) -> None: if not replicas: - raise ValueError(f"StagePool for logical stage {logical_stage_id} has no replicas") - self.logical_stage_id = logical_stage_id + raise ValueError(f"StagePool for stage {stage_id} has no replicas") + self.stage_id = stage_id self.stage_type = stage_type self.replicas: list[StageReplica] = replicas self._next_replica_idx = 0 @@ -56,20 +56,20 @@ def __init__( @classmethod def build_from_replicas( cls, - logical_stage_id: int, + stage_id: int, clients: Sequence[Any], output_processors: Sequence[Any], vllm_configs: Sequence[Any], ) -> StagePool: """Build a pool from parallel replica lists. - Each positional index corresponds to one replica of the same logical + Each positional index corresponds to one replica of the same stage. The first replica's ``client.stage_type`` is used as the pool-level stage_type. """ replicas = [ StageReplica( - logical_stage_id=logical_stage_id, + stage_id=stage_id, replica_index=ri, client=clients[ri], output_processor=output_processors[ri], @@ -78,12 +78,12 @@ def build_from_replicas( for ri in range(len(clients)) ] stage_type = getattr(clients[0], "stage_type", None) if clients else None - return cls(logical_stage_id, stage_type, replicas) + return cls(stage_id, stage_type, replicas) @classmethod def build_from_diffusion_client( cls, - logical_stage_id: int, + stage_id: int, client: Any, ) -> StagePool: """Build a single-replica pool for a diffusion stage. @@ -92,13 +92,13 @@ def build_from_diffusion_client( orchestrator side. 
""" replica = StageReplica( - logical_stage_id=logical_stage_id, + stage_id=stage_id, replica_index=0, client=client, output_processor=None, vllm_config=None, ) - return cls(logical_stage_id, "diffusion", [replica]) + return cls(stage_id, "diffusion", [replica]) # ---- Selection / admission ---- @@ -120,15 +120,15 @@ def select_replica( inheriting its parent's replica at stage 0). 3. Round-robin across replicas. """ - cached = req_state.chosen_replica.get(self.logical_stage_id) + cached = req_state.chosen_replica.get(self.stage_id) if cached is not None: return cached if affinity_from is not None: - if affinity_from.logical_stage_id != self.logical_stage_id: + if affinity_from.stage_id != self.stage_id: raise ValueError( - f"affinity_from is for logical stage {affinity_from.logical_stage_id}, " - f"cannot be used to select in stage {self.logical_stage_id}" + f"affinity_from is for stage {affinity_from.stage_id}, " + f"cannot be used to select in stage {self.stage_id}" ) chosen = affinity_from elif self.num_replicas == 1: @@ -137,7 +137,7 @@ def select_replica( chosen = self.replicas[self._next_replica_idx] self._next_replica_idx = (self._next_replica_idx + 1) % self.num_replicas - req_state.chosen_replica[self.logical_stage_id] = chosen + req_state.chosen_replica[self.stage_id] = chosen return chosen def admit( From 322620fd5774ffaf938395f0c065d703f85eed90 Mon Sep 17 00:00:00 2001 From: Sy03 <1370724210@qq.com> Date: Thu, 16 Apr 2026 20:47:39 +0800 Subject: [PATCH 74/76] [Fix][Fish Speech] Remove redundant get_vocab() in control token encoding (#2842) Signed-off-by: Sy03 <1370724210@qq.com> --- vllm_omni/model_executor/models/fish_speech/prompt_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm_omni/model_executor/models/fish_speech/prompt_utils.py b/vllm_omni/model_executor/models/fish_speech/prompt_utils.py index 923e97b63af..8b8d8559ead 100644 --- a/vllm_omni/model_executor/models/fish_speech/prompt_utils.py +++ 
b/vllm_omni/model_executor/models/fish_speech/prompt_utils.py @@ -38,10 +38,7 @@ def _encode_plain_text(tokenizer: Any, text: str) -> list[int]: def _encode_control_token(tokenizer: Any, token: str) -> list[int]: - vocab = tokenizer.get_vocab() if hasattr(tokenizer, "get_vocab") else {} - token_id = vocab.get(token) - if token_id is None: - token_id = tokenizer.convert_tokens_to_ids(token) + token_id = tokenizer.convert_tokens_to_ids(token) if token_id is None or token_id == getattr(tokenizer, "unk_token_id", None): raise ValueError(f"Fish Speech tokenizer is missing required control token: {token}") return [int(token_id)] From 45760d61d231d433b01fb798f8180d146d3bc7ab Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Thu, 16 Apr 2026 21:27:43 +0800 Subject: [PATCH 75/76] [Test] Skip tests for known issues in audio and speaker recognition (#2851) --- tests/e2e/online_serving/test_qwen3_omni_expansion.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index 3065439084a..06847f3d51b 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -371,6 +371,7 @@ def test_mix_to_text_audio_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) +@pytest.mark.skip(reason="issue: #2827") def test_audio_in_video_001(omni_server, openai_client) -> None: """ Input Modal: text + video (synthetic MP4 with embedded audio; ``use_audio_in_video`` uses audio from the video). 
@@ -491,6 +492,7 @@ def test_speaker_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) +@pytest.mark.skip(reason="Known issue: occasional inaccuracy in voice recognition.") def test_speaker_002(omni_server, openai_client) -> None: """ Input Modal: text only (one-word answer constraint). From 1219c0f781856957700523ea92e5d0089c266445 Mon Sep 17 00:00:00 2001 From: ZhengWG Date: Thu, 16 Apr 2026 22:01:27 +0800 Subject: [PATCH 76/76] UT: add ut for multi-replica Signed-off-by: ZhengWG --- tests/engine/test_orchestrator.py | 209 ++++++++++++++++++++++++++++-- 1 file changed, 201 insertions(+), 8 deletions(-) diff --git a/tests/engine/test_orchestrator.py b/tests/engine/test_orchestrator.py index 7bf2eccf7f5..1406ebc4e8f 100644 --- a/tests/engine/test_orchestrator.py +++ b/tests/engine/test_orchestrator.py @@ -15,6 +15,7 @@ from vllm.sampling_params import SamplingParams from vllm_omni.engine.orchestrator import Orchestrator +from vllm_omni.engine.stage_pool import StagePool from vllm_omni.inputs.data import OmniDiffusionSamplingParams from vllm_omni.outputs import OmniRequestOutput @@ -70,7 +71,7 @@ def get_diffusion_output_nowait(self): def set_engine_outputs(self, outputs) -> None: return None - def process_engine_inputs(self, stage_list, prompt=None): + def process_engine_inputs(self, stage_list, prompt=None, source_client=None): return list(self.next_inputs) async def abort_requests_async(self, request_ids: list[str]) -> None: @@ -141,17 +142,67 @@ def _build_request_output( ) +def _build_stage_pools( + stage_clients: list[list[FakeStageClient]], + *, + output_processors: list[list[FakeOutputProcessor]] | None = None, + stage_vllm_configs: list[list[object]] | None = None, +) -> list[StagePool]: + """Build StagePool list from per-stage replica lists. 
+ + ``stage_clients[i]`` is the list of FakeStageClient replicas for stage i. + For backward compat, callers may pass a flat list of single-replica clients + via the ``_build_harness`` wrapper. + """ + num_stages = len(stage_clients) + if output_processors is None: + output_processors = [[FakeOutputProcessor() for _ in replicas] for replicas in stage_clients] + if stage_vllm_configs is None: + stage_vllm_configs = [ + [SimpleNamespace(model_config=SimpleNamespace(max_model_len=64)) for _ in replicas] + for replicas in stage_clients + ] + + pools: list[StagePool] = [] + for stage_id in range(num_stages): + clients = stage_clients[stage_id] + if clients[0].stage_type == "diffusion": + pools.append(StagePool.build_from_diffusion_client(stage_id, clients[0])) + else: + pools.append( + StagePool.build_from_replicas( + stage_id, + clients=clients, + output_processors=output_processors[stage_id], + vllm_configs=stage_vllm_configs[stage_id], + ) + ) + return pools + + def _build_harness( stage_clients: list[object], *, output_processors: list[object] | None = None, stage_vllm_configs: list[object] | None = None, async_chunk: bool = False, + stage_pools: list[StagePool] | None = None, ) -> OrchestratorFixture: - if output_processors is None: - output_processors = [FakeOutputProcessor() for _ in stage_clients] - if stage_vllm_configs is None: - stage_vllm_configs = [SimpleNamespace(model_config=SimpleNamespace(max_model_len=64)) for _ in stage_clients] + """Build an Orchestrator test harness. + + Accepts either pre-built ``stage_pools`` or flat lists of single-replica + clients/processors (legacy convenience interface). + """ + if stage_pools is None: + # Wrap flat lists into per-stage single-replica lists. 
+ nested_clients = [[c] for c in stage_clients] + nested_procs = [[p] for p in output_processors] if output_processors else None + nested_cfgs = [[c] for c in stage_vllm_configs] if stage_vllm_configs else None + stage_pools = _build_stage_pools( + nested_clients, + output_processors=nested_procs, + stage_vllm_configs=nested_cfgs, + ) ready_future: concurrent.futures.Future[tuple[Orchestrator, janus.Queue, janus.Queue, janus.Queue]] = ( concurrent.futures.Future() @@ -170,9 +221,7 @@ async def _run() -> None: request_async_queue=request_queue.async_q, output_async_queue=output_queue.async_q, rpc_async_queue=rpc_queue.async_q, - stage_clients=stage_clients, - output_processors=output_processors, - stage_vllm_configs=stage_vllm_configs, + stage_pools=stage_pools, async_chunk=async_chunk, ) ready_future.set_result((orchestrator, request_queue, output_queue, rpc_queue)) @@ -288,6 +337,11 @@ def _factory(*args, **kwargs) -> OrchestratorFixture: q.close() +# --------------------------------------------------------------------------- +# Existing single-replica tests (adapted to StagePool interface) +# --------------------------------------------------------------------------- + + @pytest.mark.asyncio async def test_run_two_stage_llm(orchestrator_factory) -> None: stage0 = FakeStageClient(stage_type="llm", final_output=False) @@ -508,3 +562,142 @@ async def test_run_abort(orchestrator_factory) -> None: assert "req-abort" not in orchestrator_fixture.orchestrator.request_states finally: await _shutdown_orchestrator(orchestrator_fixture) + + +# --------------------------------------------------------------------------- +# Multi-replica tests +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_multi_replica_round_robin_distribution(orchestrator_factory) -> None: + """Two replicas at stage-0, single replica at stage-1. 
+ + Send two requests — they should land on different stage-0 replicas + (round-robin), then both forward to the single stage-1 replica. + """ + stage0_r0 = FakeStageClient(stage_type="llm", final_output=False) + stage0_r1 = FakeStageClient(stage_type="llm", final_output=False) + stage1 = FakeStageClient( + stage_type="llm", + final_output=True, + next_inputs=[{"prompt_token_ids": [7, 8]}], + ) + + proc0_r0 = FakeOutputProcessor(request_outputs=[_build_request_output("req-0", token_ids=[3], finished=True)]) + proc0_r1 = FakeOutputProcessor(request_outputs=[_build_request_output("req-1", token_ids=[4], finished=True)]) + proc1 = FakeOutputProcessor(request_outputs=[_build_request_output("req-0", token_ids=[10], finished=True)]) + + default_vllm_cfg = SimpleNamespace(model_config=SimpleNamespace(max_model_len=64)) + stage_pools = _build_stage_pools( + [[stage0_r0, stage0_r1], [stage1]], + output_processors=[[proc0_r0, proc0_r1], [proc1]], + stage_vllm_configs=[[default_vllm_cfg, default_vllm_cfg], [default_vllm_cfg]], + ) + + orchestrator_fixture = orchestrator_factory([], stage_pools=stage_pools) + + try: + # Request 0 → should land on replica 0 (RR starts at 0) + await _enqueue_add_request( + orchestrator_fixture, + request_id="req-0", + prompt=SimpleNamespace(request_id="req-0", prompt_token_ids=[1, 2]), + original_prompt={"prompt": "hello 0"}, + sampling_params_list=[_sampling_params(), _sampling_params()], + final_stage_id=1, + ) + await _wait_for(lambda: len(stage0_r0.add_request_calls) == 1) + assert len(stage0_r1.add_request_calls) == 0 + + # Request 1 → should land on replica 1 (RR advances) + await _enqueue_add_request( + orchestrator_fixture, + request_id="req-1", + prompt=SimpleNamespace(request_id="req-1", prompt_token_ids=[5, 6]), + original_prompt={"prompt": "hello 1"}, + sampling_params_list=[_sampling_params(), _sampling_params()], + final_stage_id=1, + ) + await _wait_for(lambda: len(stage0_r1.add_request_calls) == 1) + assert 
len(stage0_r0.add_request_calls) == 1 # unchanged + + # Complete req-0 at stage-0 replica-0 → should forward to stage-1 + stage0_r0.push_engine_core_outputs(_engine_core_outputs("s0r0-raw", 1.0)) + await _wait_for(lambda: len(stage1.add_request_calls) == 1) + assert stage1.add_request_calls[0][0].request_id == "req-0" + + # Complete req-0 at stage-1 → final output + proc1.request_outputs = [_build_request_output("req-0", token_ids=[10], finished=True)] + stage1.push_engine_core_outputs(_engine_core_outputs("s1-raw", 2.0)) + output_msg = await _get_output_message(orchestrator_fixture) + + assert output_msg["request_id"] == "req-0" + assert output_msg["stage_id"] == 1 + assert output_msg["finished"] is True + assert "req-0" not in orchestrator_fixture.orchestrator.request_states + finally: + await _shutdown_orchestrator(orchestrator_fixture) + + +@pytest.mark.asyncio +async def test_multi_replica_abort_broadcasts_to_all_replicas(orchestrator_factory) -> None: + """Abort must be sent to every replica across all stages.""" + stage0_r0 = FakeStageClient(stage_type="llm", final_output=False) + stage0_r1 = FakeStageClient(stage_type="llm", final_output=False) + stage1 = FakeStageClient(stage_type="llm", final_output=True) + + proc0_r0 = FakeOutputProcessor() + proc0_r1 = FakeOutputProcessor() + proc1 = FakeOutputProcessor() + + default_vllm_cfg = SimpleNamespace(model_config=SimpleNamespace(max_model_len=64)) + stage_pools = _build_stage_pools( + [[stage0_r0, stage0_r1], [stage1]], + output_processors=[[proc0_r0, proc0_r1], [proc1]], + stage_vllm_configs=[[default_vllm_cfg, default_vllm_cfg], [default_vllm_cfg]], + ) + orchestrator_fixture = orchestrator_factory([], stage_pools=stage_pools) + + try: + await _enqueue_add_request( + orchestrator_fixture, + request_id="req-abort-mr", + prompt=SimpleNamespace(request_id="req-abort-mr", prompt_token_ids=[1]), + original_prompt={"prompt": "cancel"}, + sampling_params_list=[_sampling_params(), _sampling_params()], + 
final_stage_id=1, + ) + await _wait_for(lambda: len(stage0_r0.add_request_calls) == 1) + + await _enqueue_abort_request(orchestrator_fixture, ["req-abort-mr"]) + + all_clients = [stage0_r0, stage0_r1, stage1] + await _wait_for(lambda: all(c.abort_calls for c in all_clients)) + + for client in all_clients: + assert client.abort_calls == [["req-abort-mr"]] + assert "req-abort-mr" not in orchestrator_fixture.orchestrator.request_states + finally: + await _shutdown_orchestrator(orchestrator_fixture) + + +@pytest.mark.asyncio +async def test_multi_replica_shutdown_all_replicas(orchestrator_factory) -> None: + """Shutdown must shut down every replica across all stages.""" + stage0_r0 = FakeStageClient(stage_type="llm", final_output=False) + stage0_r1 = FakeStageClient(stage_type="llm", final_output=False) + stage1 = FakeStageClient(stage_type="llm", final_output=True) + + default_vllm_cfg = SimpleNamespace(model_config=SimpleNamespace(max_model_len=64)) + stage_pools = _build_stage_pools( + [[stage0_r0, stage0_r1], [stage1]], + stage_vllm_configs=[[default_vllm_cfg, default_vllm_cfg], [default_vllm_cfg]], + ) + orchestrator_fixture = orchestrator_factory([], stage_pools=stage_pools) + + await _shutdown_orchestrator(orchestrator_fixture) + + assert not orchestrator_fixture.thread.is_alive() + for client in [stage0_r0, stage0_r1, stage1]: + assert client.shutdown_calls == 1

s%`HI1UC3M zDXlN1FQw6kkTlL=zovddq8qF1FdcV@+J?4#heoaKMH^rQh|JU*|KI(y1A9cl({86a%_Bf< zs)6H}1LyKIT}4aeLV~PH#Y?miCywW|1+b+YXUO1BuOxf|Ba6oGm6iy+{KN+DjK0*i%8(fku(e>nd`a z1HYpX45-^k^*EWzl!hmiRtasIh;!ytC^N?liYM9WwF&*c8HP((DFKv4{K=>=tgU9| zuxBN*c1$e*>VFxwT2k;G*0kv)7?p2jxt*TD1|1;)NLwJCW^(L2&~S(kgCQkfE=zY* zlO=1G7>>z|RXT1p?(d3B6K_xIzv*1vP;wlP$7^*++YUk|-F%T#ZRpo>W|mJx+IeJ| za{R42jATD*MlURzg-vB@Fj+!{(|zpQ_6SydwRM_IjRxM)TUh_pVV(@%X)2xp~nrZIu-my8Ei9SjFz@Q}yIt zA#9#E>zw6ynE(&lRxsPBGjqK#9%d5Ub+PE!<8r=~B9mTxK91pu*9&F-yjmtkSh>CR zZ8MXqn*t{A)Au5+E`erIx1(In(l%eAT~5marv_@bKWZFnq5tE9vICyBqt5wusOQY8 zvV)uXRmdp6((Go{plbe+9M6I`9oxfM(#~l~HY1MW4<-4i%TV)~d38CV4`x66Nr#_I;D=_a?lmW|D@mB-_Fk~LM7DbpppoeJEmqN@%w z*{x_L&Q}t5Hb!@dgugH{8SY$+;k7LrJeNtCJQq7GwHm|WG_m408oHQ&|8dv6{ICkg zDbtZ+bo@#C$J4Uu@!Z>lIiXbuo#cuTU>933%dLSRxnlnLCn+Sn@_a z=5qeReXsNLtLKXpou;Pa*^Jw;X-t?&)v>!b3|KvyPMkOT+w~7t6g0N?id&5+4g825 z{Xy)GF<0;D=#1ZKC!b|Gvr(S!PgN4!P7LA!Tb}15Y4QTtOuv#=x>7iT#^z}pn;cid zIDHY{SAxin)>2ctWcQ}D=z_Xl;4;6Ls~q19NDb|TAs+U`v81g6--lV<@%w=2Fa4gMi`m90fyg8t^Y_U! z<6DnX@5TJ!4lJs5pMHYnzSgAa#(ZT|l9D)NvcD}G1v~xLiPlrbQR!dr01f>$NgTPK zH^#HR@3ORQMR`j=i+|*Y6t_72wX%`;^aKl}6C~4KYhC;9@YD{O*J(rpNSPCIPCFYU z*|g!am=2_?+jT!%_?i|H021}gAJB<;^p1XrT`RwT0$5PEGTSC{a=Q+X{S$byC-(xC zWMuHb?ySq+uOG$Wa?G}VaZ+NC4|c0vAl_bSqSN>t;rpFjBJ-+}2C=EFy)wsZP5C!~ zpYSlbmXqwf+d;}j`+eH$<_{0&Oh9rmD8GC>2M@C-@XyZR3pIE@2a5>fOTw;$WWNC- z^a3S3H7Me-+P0E`)<7&mkH)3vzD|g>qTbw5GCV?u|2Bp4caZHmlI8_AdJ9cY>Kfd? 
zf4%<v$dG z`{$Q&c)$)>x@}l^6TB`|u=Kw7w~@c;|CUE!;cE-8?4c`GRcnW=6xtVtZAhAkF!#AO}M+WHI`n2`zkHr0(6mJ8l`w^cdIg0&>83XwX1yCy#l6dek{eb~aAojPx z8U>$Bh;O3$e@O0s==J~az(7p*E5Zk+8*qTNKY&jfe{~7Dv6((>5L7*EQ7D$Z@yP~v zQFv^XfRXUu$VRx^1s3TypFR0+au0zM0}55>bw2VR!dgOxW4rn3nApG3D)C7i8@%1$ zY_M>XYpDTVpCNRL{1@P*;$T1gv>wzqP}LO*Kj3zSlI|OT3x&e-NNFFM+pe|TKe)1k z2fEQ`U2<;ho(utek{jbD7`J`F9=%y3j{cK7hOf240X8%BVfw#VOb2dA2YP8H-&nX0 z1{OZ7W2tEWH+1SR0kRpsQkvD@{Gy8(050Y8(er=59M+%>vIXq12wKn^OBc-o>}l?_ zkCObGJw8>yPk@2aLhuImysF?!7kn<$zi|SuNWdBi?F~>cUGpFO{Qc#kSs+d|ciM(X zB>yEDK2vp{F|^JtQf)3#P^r3?v2NBS77ViR|Y2kg={!4 zaby1*2mntPIbZ{Bq?Oeh_z(((OEZHHO&@L^?LHk`ZaMriueh<9z8FC7S`>A$zlGlK z9rZm1Z&Q1@RNj2q0L})ZpNn7L_+-O5TmpOG(@fmt^;%-ENCoAu;~Q_kLz@230FwOJ ze2jG`^KpDGwrGSL^dUTJUq|5?_uTw)*Tj+w%^P?L6Y7QU%C4;`=D&7D;|0Gix^bZi zm5IKof3K5?RGi4>#Rkx8#oZheU0Kj?V>iAoWc3&eqGWy%~@^ivf zSRA{lB7M!qHye=NZ1I3?;!Rl9Lt=G1JO29&WWhPwMHV>RTn*~okP)j(u$=1Y0oSa% z<7S1eVdNt7?{1;{AA_Qr;t9)|!+Uoe6>scPof#Z6!=nWgrW;=YNjvK)hmlrMJRKbP zW*4H>bU#wXs8bw`8iaD~G~6<{x;Wl$i*Yf%LA^qV-GENthQx~BgzbSdsu%b?gEjF5 zsLJ$@)pe!8)v@m%)emIo5D@nxab3|0IllE@OYuhoGLa|7pP;%aUO17!`=2X8vA}!_ zwtNttt~2@B;W}QkAI9H#a+~wr_Fm45qR%(Z;2n@jNw@y%(Nb6gD9T&__k@QUgz|H= zg%A^>GL*ig|Ms0ilMmlWsX>^QMG*SY!zSpqvk9uVLb+tIxni#8z2y;|yu zXK_7PfoG)W8<9za@Y#%_NJwp3utfPZ4Wm^$>cmmG?$kWFu`Bw&0D!pfgiBFxEGKmQ z;A;-;7U)qqIjX29hMP@%puRan*lnxImTII=#yf;9pgbE_oD^c82g(<$wEU27lKq1fOSt^t zN7U83NkL>Nd{^iDEN_Rtp1_NpfE6XTI@sWK>(=gypWyjxb5O8gxSe+UONrx`&bzak zxu|Y$%G7vlW!#oe?;@W@H3obqQ2I{c$_T#60YE2N_Hcm;3#(D66v_bjzP$$8p){@4PCcZ)wYj(WE^)*d&Ay zAU$3h1ZQn-9E3hd>~B#to`)Nn79F#nz9M<)GQeVzJW&a>XSh&)23m zC~$-ts>&E>7m2)29Gg53t0lN!_>XlAg$n{JVi(v=>%Z$EqCEeh^4(ewfbu{fETQSR zT7W_f0H*2Z`yD4N@Nx^P*R-q3qkB@3JV51C%hh@BRg@4v;zBGCYU&(ZX^?194wt%3pN!R-x8Fe0AHC6w9bXmFWIS?+ zS1Y*Vn>L0q2^5&>(ImMAe)j1HG<;SV_Iddr(o&gq%SA+#eR><^!v(`w9bAL9AR1UJ z1JHg{&xqHcI|bzfvhkiq(QMV0XzRW^rK4f>-#UFw%&~ZKHS> z)UcIoP_Z)|DJpO!To94T&H%bmYw-d9avn`N=^a5vZyw9)FvAdz;}}aN08f6QE$x)Z+TV 
zZ#fp=cAI?O$`htl~pSajipya4ahzUni-XcelB;-{7Tpv^ocFgU#dtSMm25VZH9ASQt{S!Yu*=ufuV3DOcBWzItDECvlhXVLrp{*xhKh z6PPYIK$=7G%Ir95_9^dOby9&K|9 zD%{4%3mo|Q2JfuVU39X%*5+m92zIbva?nE8E~#@AFO(<#+2zhGc4~5Tel)IAG8=!k z#4oL|jKWppEk zEU~42bb_wySaFy03Vv3TL+gM6&a@U&}f^2}~6j>AIJRB)b+NYd@@U=PmwV97}8 zXyhQ|`vl5MwEHn`IBxfo?FS@<393NT2%j9+-21(A$B{3(YYSep$P3k}CA_iwDZpG{ zJyvJq8<$?-ww$btC)Y~36%g1l0dbz`(G@TKqiFwu-_j?`aI_nNOBtCS2?4g2d`6?$NZp&Y4h&;9!a6f zk4A@QB6nM`{i$DiY(tN6)jIPlrZv4s#EBi0&8rq;4Cyg(F?l)0P14js+YPgx+VS+2siIBxr+%0k#)sPBcdQ3^{1X5A|hEti+b7e+yI z%r#9erMI$rnlS{m$HgI&sc~5%e&`H#Q46DXDCE)W^~u)n!)_Zd5nC)040ZAdapmB% zaRO7K3O&$p^U!d<6MG>$E0uPu1XNO1qF4(8W)5_VTxbs=)*5E3oo&2`igGMXvv@7X zclx3r@=5B`dGuqQVLpQa$KWO;1p9-PeqX{bxb5G{2=_eT6nBq=kt( zC*x*tqS!h`i{HV?1rAmX53qi7 z(f(4!Co5nLi;K@@b%`@9!1~_c5p7zSlNKsa-O*qhP4S{L?#*mj$2*JDpk5{RoF2ie zb6TIR_-+fMXs`LT5Za^)@g71Q-*wndmXf*NviFaKupo@`<;FReMG9j{dgVC7#w3@g17&{C8kL& zx=AHkcx??{w(bA|h|2g4mfSJi@m|Zc#|}Alhxt~~m;J-lQ}{2K_7B2D(6a|Nm6jg2 zSEzYF&Q^bdl3taJcM6aN!kMR#THxa3$dId6Hc{Q$RJr%^Clip3M>!^k_z;P?jI;gJ zEA7$DotG>md3eM33-Pf@R)2j}w9fD0S_o2g7&3^;rpU#X=PPzy=AF?nd<9JsAvMKz zF9$Mw4g9HlwD;@iFN*jRst=!?W|H{eKyz?7axgqknpFZ68MMvnv;_xMR8!A}vT_J5 zD?k5KQ9ov$)eWN=%aJkF$cXEI>tW+@k40rGQgw-HBr)8e<8Xq`J%vKdVZx#!dN)({ zAoZvrd>5ZZ|Ahl^$kc+CQ!h`lXm9)5PJzbusfR=pbX{pUGC9C{DLFQtPwu2(9&@HuGQ%XH(pP%307o`sw$?Y&4xXTqx6@TlI7`Y&^AOApfJzBx zF2dUItgnE(rj_BZ#FZ|gIX*f~kj!ps8&5g(<2>#1oJFV#cm#*ggowYxBZi-I}Uq$v3kpCT!0c5;FBQ zIp|j9XLTk7IMO*S<3>+M6L5@2{bHA|XWiRGZVe21@q4U`v6r=NzCvTbkPKjZSNwSv z^P~Y;023dHN$%l*ybk2>XL#fE{GA_vS=%hn4*Y0vvB7^cThC>Ag67l&TsYOEQvIZf zm;!5}IqOEq4s7O1fK@^-31cJe5Ofb#a7|e2yy>gaCK$PUvBaf~!^LimJIDN`x3+!P zr^FlLTRvu#IBHdYp`VPkUQ5+Iyc$2eJu!2pYNMYXhDFoTnrd7Bb=|7+GZ&L}5`Cm) z%;(hr?iCOZNQE3*OkAjLYhZJ6%?2uAj65iyP7 zRJJb4fNB#up@)zkEqT$&5(+&han^{3xwu=mc?Y5sZuSR~B1K?Q%Obkf4{?5zT2gVZ~4w!F?F2DeZuS#w2;H zbgtJXbQVUkn?=p&jYN$WgYh_C=_Lk*T$3-;bDAxp=U+;mN~XpS7G3=;^iWCCn3PsR zqg$AWx$D+zqaBcyaTU}LnwBnDsf#SlN#!zuD9%XVFxapT7`UStlxdSZ9QF0PSDw{h1Dw4^BST^-R9T!VsVS%}o}h#@yyrqd4$1ID>f%Nx 
z84|Z9VL@VXR7~Cv4kKuT9S%X?ePxmlSwu*h zt-qk;=?ZVQpD5~5WU>d4 z40Sq((e4AWDuU%?f=5)vxX{f1b*XWbvX6|&cH2~*>r3Imt#E1`1^BpwSeDHEASMS2 z9{!^h1Dla4r)U_l{vAXep4tu;f7>-A`0q3v3t)~HH?#0+LB#D9#zlO z!CE!V3RUW?&iw>-M39bL_3Ox`Ad7T*AMFcbq7s5FtoV9I{vkoHOq8y0y&sV`!qf10 zPBRkgtXjB~|Ilmv6}r`^y{m?@bcxPcBT3@uc@asSn0gAP)jkQyD41WAU3N3;GSY0E z*I5(0&*DRxsGW~$RKy?;~A=v8_nb4bL8>}A4&`p-Nd1T6jbI}S}Qj_p0a>IZQVj?ALP z5p@i$)oT$l;q5$ys20WrY^jno;uDIsbf)tX3=*zF1DDTYa1f-|M8OSlBw8x>I1FXh z3|^%ae2F;Q?U!L0uiCdIo)>EyF73H?{O=!jiO>=6jbrlo+vk z*~Udx>4Wnld+7mRcGjIeo`nqhAlK_Ik6fj{YpQA}9A{J4n#4YjsK0!6kyK&cnD51| zo4(|2IH4?;?$QZS4^Tovte3>QUw`7ao96*dMVOm!BY#HBh?|5GOy#p1uCFI)y^Bn|CVJG>KpezsvuxH@N2DT`0yV=TfV*(L#`6+s1W+#PheMK36Y zJzbsR+5%07jCQ$8-Y`UJE;0#`57Hq~PU-jy__6Xj)LEpJ|@9-EC;4R-7md>0< zB?4J#y6%)-reaQ`h3be&;YCtt=Sq5{+`R#^2qQFPX>d+N1ijgcbaI=;>7RV$ zyHSigfw!kY^o%P*(Vl)kLfNsar0r;je=B;@IJXkdCZzueTi``R~8cJ2o7e+hQ{i8WLFy5GUddu2X1nQC_2+vK1ovGnlvHJ9zf@seaFg4 zu#9E#n1Lw!mgLWmbRP^fkck=ch?8Dc7D-_Is3m(ghgY}b%W5=$^y#vE!iQYCbhem+4SF@3 zMPr^ThcRM`Yi%*%(!7l}$i~ci6`)QfO9CmkzmL%7Rn`iLg^|fgU6Pc0kzHFtY zpT-|TAq^~FDj=>CZvKAJ-O{73Dp^*bxQ_T8am5WaNtZk=7cR%m~R&_R8KaBUi>H71?BE@4Z*ZCfQraCacWn{7~Qf z?ymdy{XNg$&mZ;js?+&7$8jF}cpvlMVhSu1L-gblp*li3ZpuY8oj_qv_+wT4H66|1 zKD@UfdmGfgWap2zw2p|<6W59){LYfICdyw4p{|2(csujaTw*-Sn8^5f&!#e@wAs0j zTb%U9MME-XMHWtxHg1V?4Rn2+J4I!dT5+-at_ZIr6BrelRjj1ytcdDSnAf^Q)MsJ$ zH{Vg1i#b+HWywT+P)V-#7ziUr@J3bo{UZ)l44+Uryq_$~C`-%p#-Q`E#1+ka_FEdT z2Lv@4?#uUbUT71pI;xJvoI0(A)+{)Ccm)uSK?m+6!v*ZdjmwE5j9vri1B1Tal`61% z`S`ecs&Ah7&^FxkKIKg{!Uw~J(V~Z2U4x5W{W5N64)^F(vUsm1xprB>XB*yT!OTe@ zLH{LS7)=GI|4Oq=T(q{U28j>tb~`_)n)cVD|7iBf*JigAPz&O8E^HEm9^3Wki`0#W z4FV0HZjIS`sosZ?qOV=1B_j!U%S1XOxeKk6McnK<`Kl~)u2EQuzR+W_^tCOhA?ZFd z_W`fey>m`vuT<5((pEi+)of8`pFj_eHfREC!tb9!tknuX2Hpw~`ZOSXi`jH>(W5eI zs3WkbY{P$T1=(-&ju?cu5I<4_wVPNk`Y-D10SDhfroilSAY_`qI1jzNzZhm2nWCr&x$u(Xgz#*nGlVbRM*`7SZ zl9S^s>{JHzERiQWilQM3!g5MFK%QxD!{4nPQ`n5QYD}#Yd9-x%mJZvLfH+J1azUT`FSjNRo;!FFe(8RNvgAmqKoh)o#Ck{vZ@~{ew&<#Y`v)^IbtRVWleNO<-(H#{ 
zspFTuhMrUu@6un`5CQR%HVvTjTffOJ{D-9b0zd8Phkl!~HLwrM{G}&SE3O|OEeQQ7 zYc#C~v!{{?pDu|HzVTv!rG)5ZD<7?mAacPX^!dxQQ38c}OCjfOYq_>_)4`(lVj7Bp z&w@s}Kqj&;!&2hm70;H$RhkKzoTwTO#%4I;+l;d_hA;Mf+imh&3mMUz%NNBuicn`e zlH5zU;2hSgku*iYS5Z^~so@Jf3}nltb1`)Ca@RWsU2Z>!^waaM5m`xJ zJ*-|WI)8Df2+PhkhXNCOeg;kgzXv;OUqIzIp(}PzEba;&nJ%`e;6wNs_{9?97vgYx zZ1dE{*G~g&Ir`xd1T^rru;G#nNNb%!iavfTT;c^MwPXk1Z_2=)mN6FhruHt`_NJA< zo#m`MXYJ=GQBzUr*hErm=BtM+)7cFH(PZ|d*7S?j!zOoEKHc{wk-#77h)T*D`Ix%N zzGK5gqomG@X?->qogjz{bV4@MCaqOHKI(?%NxvF|JLJ8l6kHNDEGH+i{BkpKgfA7$v zMdnrzjakPwb^ej1H}xsn=5F>r;brqEdY7gs5Auy6&K;}?aox&SPv8Xj{Zx!trQ;hB z-@PvxLFql$sv#Dkb*shh*RVn5Gw}AeCrbdvjx-YkT3{jt4e~?w*qu~K?N}7}isKp4|HckudnQB2_z;CuQtp4%MkgK^JOuH7^*hFij$3dWgNxOOT_5 zqQ_N-np1f;S7dS{&^J4(tX7`mRZa+iq86B z1OW=xMq82>i?4gCjtE8DgKE#Ba(UqwU}J$KjqC8z@68r?u zSFZA?W2s|CP?0p|aPi~aU2gBQBN}bQz^N*3ph4{V~KPm@F`W$lu zoxW%9zTv=)CZh*Fz1DmK@VjBH%UDt{f|9aZ?*y@|2xZNDUaNiToP%*ib&i-Y@2?4) zoLdjsrgfUk(v!*c#g8X8qQ~keR?)6=y)Z|-aSHL=(I0tR$HkJ`ifroz399H-D8cFA zLQ4S2^=91w&uVpM7YcL@TKlaUzma!!ZJ=HmL>}}8MdI(kch1Zbk6YtU#I#2xyMtA1 zhP%dn;o38&uIaIBVCv(W%QrWo$72`>=2>l}kAk1saCYePDfHOD)=`*DN$%n944~Aj zO*m9<`&Put7g&P1#_M|>we+?zN_R={X{VWb?6FI=%>?j)jfj%|zp5vXO|Uf{beK4S z>6MZXT&E?`UZ5X--x4{kf%c*>ALH}~PG>YeBFp|4o*9`rJpt25ZE8>4(`qBE=*5>Y z73C)JsJRKUu}IDY$%`Lx9*3@zM z2qM1+;`mz=uo;9L`db54GH{|sNLvUFFSVa~S(oX%=KZb3K1V#oAdBi0*LLFO^d;0g za-n*B%#Xkw*OwNFv~MHtA*zd7*Tbbbe@@fMlBMXIZc;YPlh03kB*wDAab);1R|Cy1 z5_ZS$Zl9n@v>lBNXFn7`%$pq?hoJ&ZE>Zj2S+Tm=XwQ?NWqjOS<mm3jius*q7muqces2g*u9VV4Y&|7w6y|MGDke_QpuK! 
zM^CVeZ%ra=1jHQC3mjR=aw%?aA*YeT{$DPfw(WV@o^@@zaQaJfq&X9c)|(csqMZ+A znM5Tz3L^Q@Pswthxkoy$QeDdF)A`_b7l{wNcY!3_X)vi0d4bP-(mKx%+taA3s9(N# zX=#T_=yX5akPN7s@x(zM1R9!1VH-`rg@^pe3q77KV)+t!;vyXT*!pWq5x2R^#mQmU z+pM+~bTg@vslA2wyXx0>(vLKwZu7M)2oN|Ccv(?ejVh~X6fPb4-1L5J1+(CD05OCT z8WL*p`z3fgx((7jtsL%#l)9vQ!!>7u{R^HW#bLO}2;@ER-V36qZ%0Z=-)VsD?ibpe ze;PaKlt|DZBPk#v!LobHxS_~r3s=xvVaRY^04=+-`FhgF7EbH zlGSP%UnpDSHOO!0VRkkPe(;*mZbS|;74vu{JMblD-hXR>#}Wi0WwV$wB*p_^L#q1%r|cUhb4 zHodfB-JC)Y+e|)rbu^>Y%4gFBopw!w^v-dLY9^gh+70zo4MFVqSsAK-xp~$solw1{ zR#YzE z&SL-gcJ7|{OKmsG`BF$}!=psY147=` zkz|c<&+I9ZeC7bHmAU&#T-hx(cKr0C(wVX!gi&et4pe6q%39M1+tV;99O6E4L_5Bs zkqfEaR@5rvY`lG8H#jrkjSO4U|LTRVQo1W+`s5zIiNMOCwpuXPGdPpT@?F9;7-u&wiq# zLX63t5-j+c6Y`h2+GT=eqr@j3aA!^oOX3}A5<0r|tKF}c#a4?jsA=z`h!oy2`_OuB z&C4<(484OOY)rZuxiu22=R7mn6XP^4NnVL9uQikfu1ZsJ?a-$6=qM!BCz*W|+;`K& zmIAmtTP!}b5+a_(;dz