From 3059e272b3fc6d141f6be9e1eb6557bf7cc69879 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 8 Jan 2026 17:55:45 +0800 Subject: [PATCH 01/59] init and registry Signed-off-by: JaredforReal --- .../diffusion/models/glm_image/__init__.py | 17 +++++++ .../models/glm_image/glm_image_transformer.py | 15 +++++++ .../models/glm_image/pipeline_glm_image.py | 44 +++++++++++++++++++ vllm_omni/diffusion/registry.py | 7 +++ 4 files changed, 83 insertions(+) create mode 100644 vllm_omni/diffusion/models/glm_image/__init__.py create mode 100644 vllm_omni/diffusion/models/glm_image/glm_image_transformer.py create mode 100644 vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py diff --git a/vllm_omni/diffusion/models/glm_image/__init__.py b/vllm_omni/diffusion/models/glm_image/__init__.py new file mode 100644 index 00000000000..64a305571ee --- /dev/null +++ b/vllm_omni/diffusion/models/glm_image/__init__.py @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""GLM Image diffusion model components.""" + +from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageTransformer2DModel, +) +from vllm_omni.diffusion.models.glm_image.pipeline_glm_image import ( + GlmImagePipeline, + get_glm_image_post_process_func, +) + +__all__ = [ + "GlmImagePipeline", + "GlmImageTransformer2DModel", + "get_glm_image_post_process_func", +] diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py new file mode 100644 index 00000000000..ca8b28ba02f --- /dev/null +++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import functools +from collections.abc import Iterable +from math import prod +from typing import Any + +import torch +import torch.nn as nn +from 
diffusers.models.attention import FeedForward + +from vllm_omni.diffusion.cache.base import CachedTransformer + +class GlmImageTransformer2DModel(CachedTransformer): diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py new file mode 100644 index 00000000000..ad50b2c3f62 --- /dev/null +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import inspect +import json +import logging +import os +from collections.abc import Iterable +from typing import Any, Callable + +import numpy as np +import torch +import torch.distributed as dist +from diffusers.image_processor import VaeImageProcessor +from diffusers.models.autoencoders.autoencoder_kl import ( + AutoencoderKL, +) +from diffusers.schedulers.scheduling_flow_match_euler_discrete import ( + FlowMatchEulerDiscreteScheduler, +) +from diffusers.utils.torch_utils import randn_tensor +from torch import nn +from transformers import ByT5Tokenizer, T5EncoderModel, GlmImageProcessor, GlmImageForConditionalGeneration +from vllm.model_executor.models.utils import AutoWeightsLoader + +from vllm_omni.diffusion.data import OmniDiffusionConfig, DiffusionOutput +from vllm_omni.diffusion.distributed.parallel_state import ( + get_cfg_group, + get_classifier_free_guidance_rank, + get_classifier_free_guidance_world_size, +) +from vllm_omni.diffusion.distributed.utils import get_local_device +from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader +from vllm_omni.diffusion.models.glm_image.glm_image_transformer import GlmImageTransformer2DModel +from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.model_executor.model_loader.weight_utils import ( + download_weights_from_hf_specific, +) + +logger = logging.getLogger(__name__) + +def get_glm_image_post_process_func( + 
od_config: OmniDiffusionConfig, +): diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index 7c57ac3a876..743e9f4c455 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -29,6 +29,11 @@ "pipeline_qwen_image_layered", "QwenImageLayeredPipeline", ), + "GlmImagePipeline": ( + "glm_image", + "pipeline_glm_image", + "GlmImagePipeline", + ), "ZImagePipeline": ( "z_image", "pipeline_z_image", @@ -111,6 +116,7 @@ def initialize_model( "QwenImagePipeline": "get_qwen_image_post_process_func", "QwenImageEditPipeline": "get_qwen_image_edit_post_process_func", "QwenImageEditPlusPipeline": "get_qwen_image_edit_plus_post_process_func", + "GlmImagePipeline": "get_glm_image_post_process_func", "ZImagePipeline": "get_post_process_func", "OvisImagePipeline": "get_ovis_image_post_process_func", "WanPipeline": "get_wan22_post_process_func", @@ -128,6 +134,7 @@ def initialize_model( # where mod_folder and mod_relname are defined and mapped using `_DIFFUSION_MODELS` via the `arch` key "QwenImageEditPipeline": "get_qwen_image_edit_pre_process_func", "QwenImageEditPlusPipeline": "get_qwen_image_edit_plus_pre_process_func", + "GlmImagePipeline": "get_glm_image_pre_process_func", "LongCatImageEditPipeline": "get_longcat_image_edit_pre_process_func", "QwenImageLayeredPipeline": "get_qwen_image_layered_pre_process_func", "WanPipeline": "get_wan22_pre_process_func", From c0a76849b6d9799c88ad1cbe1a09215b90bb0f01 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Thu, 8 Jan 2026 19:12:09 +0800 Subject: [PATCH 02/59] implement glm_image_transformer.py Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_transformer.py | 610 +++++++++++++++++- 1 file changed, 608 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py index ca8b28ba02f..dde5f29f3c9 100644 --- 
a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py +++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py @@ -1,15 +1,621 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import functools from collections.abc import Iterable -from math import prod +from enum import Enum from typing import Any import torch import torch.nn as nn +import torch.nn.functional as F from diffusers.models.attention import FeedForward +from diffusers.models.embeddings import GlmImageCombinedTimestepSizeEmbeddings +from diffusers.models.modeling_outputs import Transformer2DModelOutput +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import QKVParallelLinear +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm_omni.diffusion.attention.layer import Attention from vllm_omni.diffusion.cache.base import CachedTransformer +from vllm_omni.diffusion.data import OmniDiffusionConfig +from vllm_omni.diffusion.layers.rope import RotaryEmbedding + +logger = init_logger(__name__) + + +class GlmImageImageProjector(nn.Module): + """Projects latent image patches to transformer hidden dimension.""" + + def __init__( + self, + in_channels: int = 16, + hidden_size: int = 2560, + patch_size: int = 2, + ): + super().__init__() + self.patch_size = patch_size + self.proj = nn.Linear(in_channels * patch_size**2, hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, channel, height, width = hidden_states.shape + post_patch_height = height // self.patch_size + post_patch_width = width // self.patch_size + + # Reshape: [B, C, H, W] -> [B, H', W', C*p*p] -> [B, H'*W', C*p*p] + hidden_states = hidden_states.reshape( + batch_size, channel, post_patch_height, self.patch_size, post_patch_width, self.patch_size + ) + hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5).flatten(3, 5).flatten(1, 2) + hidden_states = 
self.proj(hidden_states) + return hidden_states + + +class GlmImageRotaryPosEmbed(nn.Module): + """Rotary positional embedding for 2D image patches.""" + + def __init__(self, dim: int, patch_size: int, theta: float = 10000.0) -> None: + super().__init__() + self.dim = dim + self.patch_size = patch_size + self.theta = theta + + def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + batch_size, num_channels, height, width = hidden_states.shape + height, width = height // self.patch_size, width // self.patch_size + + dim_h, dim_w = self.dim // 2, self.dim // 2 + h_inv_freq = 1.0 / ( + self.theta ** (torch.arange(0, dim_h, 2, dtype=torch.float32)[: (dim_h // 2)].float() / dim_h) + ) + w_inv_freq = 1.0 / ( + self.theta ** (torch.arange(0, dim_w, 2, dtype=torch.float32)[: (dim_w // 2)].float() / dim_w) + ) + h_seq = torch.arange(height, device=hidden_states.device) + w_seq = torch.arange(width, device=hidden_states.device) + h_inv_freq = h_inv_freq.to(hidden_states.device) + w_inv_freq = w_inv_freq.to(hidden_states.device) + + freqs_h = torch.outer(h_seq, h_inv_freq) + freqs_w = torch.outer(w_seq, w_inv_freq) + + # Create position matrices: [height, 1, dim//4] and [1, width, dim//4] + freqs_h = freqs_h.unsqueeze(1).expand(height, width, -1) + freqs_w = freqs_w.unsqueeze(0).expand(height, width, -1) + + # Concatenate: [height, width, dim//2] -> [height, width, dim] + freqs = torch.cat([freqs_h, freqs_w], dim=-1) + freqs = torch.cat([freqs, freqs], dim=-1) + freqs = freqs.reshape(height * width, -1) + return (freqs.cos(), freqs.sin()) + + +class GlmImageAdaLayerNormZero(nn.Module): + """Adaptive LayerNorm with zero initialization for both image and text streams.""" + + def __init__(self, embedding_dim: int, dim: int) -> None: + super().__init__() + self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5) + self.norm_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5) + self.linear = nn.Linear(embedding_dim, 12 * dim, 
bias=True) + + def forward( + self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, temb: torch.Tensor + ) -> tuple[torch.Tensor, ...]: + dtype = hidden_states.dtype + norm_hidden_states = self.norm(hidden_states).to(dtype=dtype) + norm_encoder_hidden_states = self.norm_context(encoder_hidden_states).to(dtype=dtype) + + emb = self.linear(temb) + ( + shift_msa, + c_shift_msa, + scale_msa, + c_scale_msa, + gate_msa, + c_gate_msa, + shift_mlp, + c_shift_mlp, + scale_mlp, + c_scale_mlp, + gate_mlp, + c_gate_mlp, + ) = emb.chunk(12, dim=1) + + hidden_states = norm_hidden_states * (1 + scale_msa.unsqueeze(1)) + shift_msa.unsqueeze(1) + encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_msa.unsqueeze(1)) + c_shift_msa.unsqueeze(1) + + return ( + hidden_states, + gate_msa, + shift_mlp, + scale_mlp, + gate_mlp, + encoder_hidden_states, + c_gate_msa, + c_shift_mlp, + c_scale_mlp, + c_gate_mlp, + ) + + +class GlmImageAdaLayerNormContinuous(nn.Module): + """Final AdaLN for output projection (no activation before Linear).""" + + def __init__( + self, + embedding_dim: int, + conditioning_embedding_dim: int, + elementwise_affine: bool = True, + eps: float = 1e-5, + bias: bool = True, + ): + super().__init__() + self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias) + self.norm = nn.LayerNorm(embedding_dim, eps=eps, elementwise_affine=elementwise_affine) + + def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor: + # NO SiLU here + emb = self.linear(conditioning_embedding.to(x.dtype)) + scale, shift = torch.chunk(emb, 2, dim=1) + x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :] + return x + + +class GlmImageAttenProcessorState(Enum): + """State machine for attention processor to support image editing. + + - ImageGen: Normal text-to-image generation, no KV caching. + - ImageEditWriteKV: Write condition image's KV to cache. 
+ - ImageEditReadKV: Read cached KV and concatenate with current KV. + - ImageEditDontReadKV: Don't read cached KV (for some special cases). + """ + + ImageGen = "ImageGen" + ImageEditWriteKV = "ImageEditWriteKV" + ImageEditReadKV = "ImageEditReadKV" + ImageEditDontReadKV = "ImageEditDontReadKV" + + +class GlmImageAttention(nn.Module): + """ + Joint attention for GLM-Image model using vllm-omni's optimized attention. + + This combines text and image streams for joint attention computation. + Supports KV caching for image editing workflows. + """ + + def __init__( + self, + dim: int, + num_heads: int, + head_dim: int, + out_bias: bool = True, + eps: float = 1e-5, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.head_dim = head_dim + self.inner_dim = num_heads * head_dim + + # QKV projection (fused for efficiency) + self.to_qkv = QKVParallelLinear( + hidden_size=dim, + head_size=head_dim, + total_num_heads=num_heads, + disable_tp=True, + bias=True, + ) + + # QK normalization (LayerNorm, not RMSNorm for GLM-Image) + self.norm_q = nn.LayerNorm(head_dim, elementwise_affine=False, eps=eps) + self.norm_k = nn.LayerNorm(head_dim, elementwise_affine=False, eps=eps) + + # Output projection + self.to_out = nn.Sequential( + nn.Linear(self.inner_dim, dim, bias=out_bias), + nn.Dropout(0.0), + ) + + # RoPE and attention + self.rope = RotaryEmbedding(is_neox_style=False) + self.attn = Attention( + num_heads=num_heads, + head_size=head_dim, + softmax_scale=1.0 / (head_dim**0.5), + causal=False, + ) + + # KV cache for image editing + self.processor_state = GlmImageAttenProcessorState.ImageGen + self.k_cache: torch.Tensor | None = None + self.v_cache: torch.Tensor | None = None + + def clear_cache(self): + """Clear the KV cache.""" + self.k_cache = None + self.v_cache = None + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None, + attention_mask: 
torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + dtype = encoder_hidden_states.dtype + batch_size, text_seq_length, _ = encoder_hidden_states.shape + + # Concatenate text and image: [text, image] + hidden_states_combined = torch.cat([encoder_hidden_states, hidden_states], dim=1) + + # QKV projection + qkv, _ = self.to_qkv(hidden_states_combined) + query, key, value = qkv.chunk(3, dim=-1) + + # Reshape: [B, S, H*D] -> [B, S, H, D] + query = query.unflatten(-1, (self.num_heads, -1)) + key = key.unflatten(-1, (self.num_heads, -1)) + value = value.unflatten(-1, (self.num_heads, -1)) + + # QK normalization + query = self.norm_q(query).to(dtype=dtype) + key = self.norm_k(key).to(dtype=dtype) + + # Apply RoPE only to image tokens (not text tokens) + if image_rotary_emb is not None: + cos, sin = image_rotary_emb + cos = cos.to(query.dtype) + sin = sin.to(query.dtype) + # Only apply RoPE to image part (after text_seq_length) + query_img = query[:, text_seq_length:, :, :] + key_img = key[:, text_seq_length:, :, :] + query_img = self.rope(query_img, cos, sin) + key_img = self.rope(key_img, cos, sin) + query = torch.cat([query[:, :text_seq_length, :, :], query_img], dim=1) + key = torch.cat([key[:, :text_seq_length, :, :], key_img], dim=1) + + # Handle KV cache for image editing + if self.processor_state == GlmImageAttenProcessorState.ImageEditWriteKV: + # Write to cache: accumulate KV from condition images + if self.k_cache is None: + self.k_cache = key + self.v_cache = value + else: + self.k_cache = torch.cat([self.k_cache, key], dim=1) + self.v_cache = torch.cat([self.v_cache, value], dim=1) + elif self.processor_state == GlmImageAttenProcessorState.ImageEditReadKV: + # Read from cache: concatenate cached KV with current KV + if self.k_cache is not None: + key = torch.cat([self.k_cache, key], dim=1) + value = torch.cat([self.v_cache, value], dim=1) + + # Attention computation + hidden_states_out = self.attn(query, key, value) + hidden_states_out = 
hidden_states_out.flatten(2, 3) + hidden_states_out = hidden_states_out.to(dtype) + + # Output projection + hidden_states_out = self.to_out(hidden_states_out) + + # Split back to text and image + encoder_hidden_states_out = hidden_states_out[:, :text_seq_length, :] + hidden_states_out = hidden_states_out[:, text_seq_length:, :] + + return hidden_states_out, encoder_hidden_states_out + + +class GlmImageTransformerBlock(nn.Module): + """Single transformer block for GLM-Image.""" + + def __init__( + self, + dim: int = 2560, + num_attention_heads: int = 64, + attention_head_dim: int = 40, + time_embed_dim: int = 512, + ) -> None: + super().__init__() + + # 1. Attention with AdaLN + self.norm1 = GlmImageAdaLayerNormZero(time_embed_dim, dim) + self.attn = GlmImageAttention( + dim=dim, + num_heads=num_attention_heads, + head_dim=attention_head_dim, + ) + + # 2. Feedforward + self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5) + self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5) + self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + temb: torch.Tensor, + image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None, + attention_mask: torch.Tensor | None = None, + attention_kwargs: dict[str, Any] | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + # 1. Timestep conditioning via AdaLN + ( + norm_hidden_states, + gate_msa, + shift_mlp, + scale_mlp, + gate_mlp, + norm_encoder_hidden_states, + c_gate_msa, + c_shift_mlp, + c_scale_mlp, + c_gate_mlp, + ) = self.norm1(hidden_states, encoder_hidden_states, temb) + + # 2. 
Attention + attn_hidden_states, attn_encoder_hidden_states = self.attn( + hidden_states=norm_hidden_states, + encoder_hidden_states=norm_encoder_hidden_states, + image_rotary_emb=image_rotary_emb, + attention_mask=attention_mask, + ) + hidden_states = hidden_states + attn_hidden_states * gate_msa.unsqueeze(1) + encoder_hidden_states = encoder_hidden_states + attn_encoder_hidden_states * c_gate_msa.unsqueeze(1) + + # 3. Feedforward + norm_hidden_states = self.norm2(hidden_states) * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1) + norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states) * ( + 1 + c_scale_mlp.unsqueeze(1) + ) + c_shift_mlp.unsqueeze(1) + + ff_output = self.ff(norm_hidden_states) + ff_output_context = self.ff(norm_encoder_hidden_states) + hidden_states = hidden_states + ff_output * gate_mlp.unsqueeze(1) + encoder_hidden_states = encoder_hidden_states + ff_output_context * c_gate_mlp.unsqueeze(1) + + return hidden_states, encoder_hidden_states + class GlmImageTransformer2DModel(CachedTransformer): + """ + GLM-Image Transformer model for 2D image generation. + + This is the vllm-omni optimized version of the GLM-Image DiT model. + + Args: + od_config: OmniDiffusionConfig containing model configuration. + patch_size: Size of image patches. + in_channels: Number of input channels (latent channels). + num_layers: Number of transformer blocks. + attention_head_dim: Dimension of each attention head. + num_attention_heads: Number of attention heads. + out_channels: Number of output channels. + text_embed_dim: Dimension of text embeddings. + time_embed_dim: Dimension of timestep embeddings. + condition_dim: Dimension of conditioning embeddings. + prior_vq_quantizer_codebook_size: Size of prior VQ codebook. 
+ """ + + def __init__( + self, + od_config: OmniDiffusionConfig, + patch_size: int = 2, + in_channels: int = 16, + num_layers: int = 30, + attention_head_dim: int = 40, + num_attention_heads: int = 64, + out_channels: int = 16, + text_embed_dim: int = 1472, + time_embed_dim: int = 512, + condition_dim: int = 256, + prior_vq_quantizer_codebook_size: int = 16384, + ): + super().__init__() + + # Get num_layers from config if available + model_config = od_config.tf_model_config + if model_config is not None and hasattr(model_config, "num_layers"): + num_layers = model_config.num_layers + + self.od_config = od_config + self.patch_size = patch_size + self.in_channels = in_channels + self.out_channels = out_channels + + # GlmImage uses 2 additional SDXL-like conditions - target_size, crop_coords + pooled_projection_dim = 2 * 2 * condition_dim + inner_dim = num_attention_heads * attention_head_dim + + # 1. RoPE + self.rope = GlmImageRotaryPosEmbed(attention_head_dim, patch_size, theta=10000.0) + + # 2. Patch & Text-timestep embedding + self.image_projector = GlmImageImageProjector(in_channels, inner_dim, patch_size) + self.glyph_projector = FeedForward(text_embed_dim, inner_dim, inner_dim=inner_dim, activation_fn="gelu") + self.prior_token_embedding = nn.Embedding(prior_vq_quantizer_codebook_size, inner_dim) + self.prior_projector = FeedForward(inner_dim, inner_dim, inner_dim=inner_dim, activation_fn="linear-silu") + + self.time_condition_embed = GlmImageCombinedTimestepSizeEmbeddings( + embedding_dim=time_embed_dim, + condition_dim=condition_dim, + pooled_projection_dim=pooled_projection_dim, + timesteps_dim=time_embed_dim, + ) + + # 3. Transformer blocks + self.transformer_blocks = nn.ModuleList( + [ + GlmImageTransformerBlock(inner_dim, num_attention_heads, attention_head_dim, time_embed_dim) + for _ in range(num_layers) + ] + ) + + # 4. 
Output projection + self.norm_out = GlmImageAdaLayerNormContinuous(inner_dim, time_embed_dim, elementwise_affine=False) + self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels, bias=True) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + prior_token_id: torch.Tensor, + prior_token_drop: torch.Tensor, + timestep: torch.LongTensor, + target_size: torch.Tensor, + crop_coords: torch.Tensor, + attention_kwargs: dict[str, Any] | None = None, + return_dict: bool = True, + attention_mask: torch.Tensor | None = None, + image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None, + ) -> torch.Tensor | Transformer2DModelOutput: + """ + Forward pass of the GLM-Image Transformer. + + Args: + hidden_states: Input latent tensor of shape [B, C, H, W]. + encoder_hidden_states: Text embeddings of shape [B, S, D]. + prior_token_id: Prior VQ token IDs. + prior_token_drop: Mask for dropping prior tokens (CFG). + timestep: Diffusion timestep. + target_size: Target image size for conditioning. + crop_coords: Crop coordinates for conditioning. + attention_kwargs: Additional attention arguments. + return_dict: Whether to return a dataclass. + attention_mask: Optional attention mask for text tokens. + image_rotary_emb: Pre-computed rotary embeddings. + + Returns: + Output tensor or Transformer2DModelOutput. + """ + batch_size, num_channels, height, width = hidden_states.shape + + # 1. RoPE + if image_rotary_emb is None: + image_rotary_emb = self.rope(hidden_states) + # Move to correct device + image_rotary_emb = ( + image_rotary_emb[0].to(hidden_states.device), + image_rotary_emb[1].to(hidden_states.device), + ) + + # 2. 
Patch & Timestep embeddings + p = self.patch_size + post_patch_height = height // p + post_patch_width = width // p + + hidden_states = self.image_projector(hidden_states) + encoder_hidden_states = self.glyph_projector(encoder_hidden_states) + + # Prior embedding with dropout + prior_embedding = self.prior_token_embedding(prior_token_id) + prior_embedding[prior_token_drop] *= 0.0 + prior_hidden_states = self.prior_projector(prior_embedding) + hidden_states = hidden_states + prior_hidden_states + + # Timestep conditioning + temb = self.time_condition_embed(timestep, target_size, crop_coords, hidden_states.dtype) + temb = F.silu(temb) + + # 3. Transformer blocks + for block in self.transformer_blocks: + hidden_states, encoder_hidden_states = block( + hidden_states, + encoder_hidden_states, + temb, + image_rotary_emb, + attention_mask, + attention_kwargs, + ) + + # 4. Output norm & projection + hidden_states = self.norm_out(hidden_states, temb) + hidden_states = self.proj_out(hidden_states) + + # 5. Unpatchify: [B, H'*W', C*p*p] -> [B, C, H, W] + hidden_states = hidden_states.reshape(batch_size, post_patch_height, post_patch_width, -1, p, p) + output = hidden_states.permute(0, 3, 1, 4, 2, 5).flatten(4, 5).flatten(2, 3) + + if not return_dict: + return (output,) + return Transformer2DModelOutput(sample=output) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """ + Load weights from pretrained checkpoint. + + This method handles the mapping from diffusers weight names to vllm-omni weight names, + especially for fused QKV projections. 
+ """ + stacked_params_mapping = [ + # Fused QKV projection: to_q, to_k, to_v -> to_qkv + (".to_qkv", ".to_q", "q"), + (".to_qkv", ".to_k", "k"), + (".to_qkv", ".to_v", "v"), + ] + + params_dict = dict(self.named_parameters()) + + # Also include buffers (for any beta/eps parameters) + for name, buffer in self.named_buffers(): + params_dict[name] = buffer + + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + # Handle fused QKV projections + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + + # Map diffusers name to vllm-omni name + name = name.replace(weight_name, param_name) + + if name not in params_dict: + logger.warning(f"Skipping weight {name} - not found in model") + break + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight, shard_id) + break + else: + # Standard weight loading (not fused) + if name not in params_dict: + logger.warning(f"Skipping weight {name} - not found in model") + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + + loaded_params.add(name) + + return loaded_params + + # Image Editing Support: KV Cache State Management + def set_attention_processors_state(self, state: GlmImageAttenProcessorState): + """ + Set the attention processor state for all transformer blocks. + + This controls how KV cache is handled during image editing: + - ImageGen: Normal generation, no caching + - ImageEditWriteKV: Cache KV from condition images + - ImageEditReadKV: Use cached KV during generation + - ImageEditDontReadKV: Skip reading cache + + Args: + state: The attention processor state to set. + """ + for block in self.transformer_blocks: + block.attn.processor_state = state + + def clear_attention_processors_cache(self): + """ + Clear the KV cache in all attention layers. 
+ + Should be called before processing a new image editing request + to ensure no stale cache from previous requests. + """ + for block in self.transformer_blocks: + block.attn.clear_cache() From 800cea48561b2b850b3cd64e4536dfdbba589fa2 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 9 Jan 2026 12:52:49 +0800 Subject: [PATCH 03/59] update transformer Signed-off-by: JaredforReal --- .../models/glm_image/glm_image_transformer.py | 288 ++++++++++++++---- 1 file changed, 229 insertions(+), 59 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py index dde5f29f3c9..40341c23d9b 100644 --- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py +++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py @@ -2,12 +2,10 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable -from enum import Enum -from typing import Any +from typing import Any, Enum import torch import torch.nn as nn -import torch.nn.functional as F from diffusers.models.attention import FeedForward from diffusers.models.embeddings import GlmImageCombinedTimestepSizeEmbeddings from diffusers.models.modeling_outputs import Transformer2DModelOutput @@ -161,19 +159,157 @@ def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torc return x -class GlmImageAttenProcessorState(Enum): - """State machine for attention processor to support image editing. +class KVCacheMode(Enum): + """Mode for KV cache operations. - - ImageGen: Normal text-to-image generation, no KV caching. - - ImageEditWriteKV: Write condition image's KV to cache. - - ImageEditReadKV: Read cached KV and concatenate with current KV. - - ImageEditDontReadKV: Don't read cached KV (for some special cases). 
+ - WRITE: Store the K/V tensors from condition images + - READ: Concatenate cached K/V with current K/V + - SKIP: Do not use cache (pass-through) """ - ImageGen = "ImageGen" - ImageEditWriteKV = "ImageEditWriteKV" - ImageEditReadKV = "ImageEditReadKV" - ImageEditDontReadKV = "ImageEditDontReadKV" + WRITE = "write" + READ = "read" + SKIP = "skip" + + +class GlmImageLayerKVCache: + """KV cache for a single attention layer. + + Stores key and value tensors for image editing. The cache accumulates + KV pairs during write mode and provides them during read mode. + + Shape convention (vllm-omni): + key/value: [batch_size, seq_length, num_heads, head_dim] + """ + + def __init__(self): + self.k_cache: torch.Tensor | None = None + self.v_cache: torch.Tensor | None = None + + def store(self, key: torch.Tensor, value: torch.Tensor) -> None: + """Store or accumulate KV tensors. + + If cache is empty, stores the tensors directly. + If cache is not empty, concatenates new tensors along seq_length dim. + + Args: + key: Key tensor of shape [B, S, H, D] + value: Value tensor of shape [B, S, H, D] + """ + if self.k_cache is None: + self.k_cache = key + self.v_cache = value + else: + # Concatenate along sequence dimension (dim=1 for [B, S, H, D]) + self.k_cache = torch.cat([self.k_cache, key], dim=1) + self.v_cache = torch.cat([self.v_cache, value], dim=1) + + def get(self) -> tuple[torch.Tensor | None, torch.Tensor | None]: + """Get cached KV tensors. + + Returns: + Tuple of (k_cache, v_cache), both may be None if cache is empty. 
+ """ + return self.k_cache, self.v_cache + + def clear(self) -> None: + """Clear the cache.""" + self.k_cache = None + self.v_cache = None + + @property + def is_empty(self) -> bool: + """Check if cache is empty.""" + return self.k_cache is None + + def __repr__(self) -> str: + if self.is_empty: + return "GlmImageLayerKVCache(empty)" + return f"GlmImageLayerKVCache(k_shape={self.k_cache.shape}, v_shape={self.v_cache.shape})" + + +class GlmImageKVCache: + """Container for all layers' KV caches. + + Manages KV cache for all transformer layers in GLM-Image model. + Provides a unified interface for setting mode and clearing cache. + + Args: + num_layers: Number of transformer layers in the model. + + Example: + kv_cache = GlmImageKVCache(num_layers=28) + kv_cache.set_mode(KVCacheMode.WRITE) + # ... process condition image ... + kv_cache.set_mode(KVCacheMode.READ) + # ... process target image ... + kv_cache.clear() + """ + + def __init__(self, num_layers: int): + self.num_layers = num_layers + self.caches = [GlmImageLayerKVCache() for _ in range(num_layers)] + self._mode: KVCacheMode | None = None + + def __getitem__(self, layer_idx: int) -> GlmImageLayerKVCache: + """Get cache for a specific layer. + + Args: + layer_idx: Index of the layer (0-indexed). + + Returns: + GlmImageLayerKVCache for the specified layer. + + Raises: + IndexError: If layer_idx is out of range. + """ + if layer_idx < 0 or layer_idx >= self.num_layers: + raise IndexError(f"Layer index {layer_idx} out of range [0, {self.num_layers})") + return self.caches[layer_idx] + + def __len__(self) -> int: + """Return number of layers.""" + return self.num_layers + + @property + def mode(self) -> KVCacheMode | None: + """Get current cache mode.""" + return self._mode + + def set_mode(self, mode: KVCacheMode | str | None) -> None: + """Set cache mode for all layers. + + Args: + mode: Cache mode (WRITE, READ, SKIP) or string ("write", "read", "skip"). + Use None to disable cache operations. 
+ + Raises: + ValueError: If mode is an invalid string. + """ + if mode is None: + self._mode = None + elif isinstance(mode, str): + try: + self._mode = KVCacheMode(mode.lower()) + except ValueError: + raise ValueError(f"Invalid mode: '{mode}', must be one of 'write', 'read', 'skip'") + else: + self._mode = mode + + def clear(self) -> None: + """Clear cache for all layers and reset mode.""" + for cache in self.caches: + cache.clear() + self._mode = None + + @property + def is_empty(self) -> bool: + """Check if all layer caches are empty.""" + return all(cache.is_empty for cache in self.caches) + + def __repr__(self) -> str: + mode_str = self._mode.value if self._mode else "None" + return f"GlmImageKVCache(num_layers={self.num_layers}, mode={mode_str}, is_empty={self.is_empty})" class GlmImageAttention(nn.Module): @@ -181,7 +317,7 @@ class GlmImageAttention(nn.Module): Joint attention for GLM-Image model using vllm-omni's optimized attention. This combines text and image streams for joint attention computation. - Supports KV caching for image editing workflows. + Supports KV caching for image editing workflows via external cache. """ def __init__( @@ -226,23 +362,29 @@ def __init__( causal=False, ) - # KV cache for image editing - self.processor_state = GlmImageAttenProcessorState.ImageGen - self.k_cache: torch.Tensor | None = None - self.v_cache: torch.Tensor | None = None - - def clear_cache(self): - """Clear the KV cache.""" - self.k_cache = None - self.v_cache = None - def forward( self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None, attention_mask: torch.Tensor | None = None, + kv_cache: GlmImageLayerKVCache | None = None, + kv_cache_mode: KVCacheMode | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Forward pass for joint attention. 
+ + Args: + hidden_states: Image hidden states [B, img_seq_len, D] + encoder_hidden_states: Text hidden states [B, text_seq_len, D] + image_rotary_emb: Tuple of (cos, sin) for RoPE + attention_mask: Optional attention mask for text tokens + kv_cache: Optional layer KV cache for image editing + kv_cache_mode: Cache mode (WRITE, READ, SKIP) + + Returns: + Tuple of (image_hidden_states, text_hidden_states) + """ dtype = encoder_hidden_states.dtype batch_size, text_seq_length, _ = encoder_hidden_states.shape @@ -276,19 +418,15 @@ def forward( key = torch.cat([key[:, :text_seq_length, :, :], key_img], dim=1) # Handle KV cache for image editing - if self.processor_state == GlmImageAttenProcessorState.ImageEditWriteKV: - # Write to cache: accumulate KV from condition images - if self.k_cache is None: - self.k_cache = key - self.v_cache = value - else: - self.k_cache = torch.cat([self.k_cache, key], dim=1) - self.v_cache = torch.cat([self.v_cache, value], dim=1) - elif self.processor_state == GlmImageAttenProcessorState.ImageEditReadKV: - # Read from cache: concatenate cached KV with current KV - if self.k_cache is not None: - key = torch.cat([self.k_cache, key], dim=1) - value = torch.cat([self.v_cache, value], dim=1) + if kv_cache is not None and kv_cache_mode is not None: + if kv_cache_mode == KVCacheMode.WRITE: + kv_cache.store(key, value) + elif kv_cache_mode == KVCacheMode.READ: + k_cached, v_cached = kv_cache.get() + if k_cached is not None: + key = torch.cat([k_cached, key], dim=1) + value = torch.cat([v_cached, value], dim=1) + # KVCacheMode.SKIP: do nothing # Attention computation hidden_states_out = self.attn(query, key, value) @@ -338,7 +476,25 @@ def forward( image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None, attention_mask: torch.Tensor | None = None, attention_kwargs: dict[str, Any] | None = None, + kv_cache: GlmImageLayerKVCache | None = None, + kv_cache_mode: KVCacheMode | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: + """ + Forward 
pass for transformer block. + + Args: + hidden_states: Image hidden states + encoder_hidden_states: Text hidden states + temb: Timestep embedding + image_rotary_emb: RoPE embeddings + attention_mask: Text attention mask + attention_kwargs: Additional attention arguments + kv_cache: Layer-specific KV cache for image editing + kv_cache_mode: Cache mode (WRITE, READ, SKIP) + + Returns: + Tuple of (image_hidden_states, text_hidden_states) + """ # 1. Timestep conditioning via AdaLN ( norm_hidden_states, @@ -359,6 +515,8 @@ def forward( encoder_hidden_states=norm_encoder_hidden_states, image_rotary_emb=image_rotary_emb, attention_mask=attention_mask, + kv_cache=kv_cache, + kv_cache_mode=kv_cache_mode, ) hidden_states = hidden_states + attn_hidden_states * gate_msa.unsqueeze(1) encoder_hidden_states = encoder_hidden_states + attn_encoder_hidden_states * c_gate_msa.unsqueeze(1) @@ -468,6 +626,7 @@ def forward( return_dict: bool = True, attention_mask: torch.Tensor | None = None, image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None, + kv_cache: GlmImageKVCache | None = None, ) -> torch.Tensor | Transformer2DModelOutput: """ Forward pass of the GLM-Image Transformer. @@ -484,12 +643,20 @@ def forward( return_dict: Whether to return a dataclass. attention_mask: Optional attention mask for text tokens. image_rotary_emb: Pre-computed rotary embeddings. + kv_cache: Optional KV cache for image editing. When provided, + the cache's mode determines behavior: + - WRITE: Store KV from condition images + - READ: Use cached KV during generation + - SKIP: No caching (same as None) Returns: Output tensor or Transformer2DModelOutput. """ batch_size, num_channels, height, width = hidden_states.shape + # Get KV cache mode + kv_cache_mode = kv_cache.mode if kv_cache is not None else None + # 1. 
RoPE if image_rotary_emb is None: image_rotary_emb = self.rope(hidden_states) @@ -515,10 +682,12 @@ def forward( # Timestep conditioning temb = self.time_condition_embed(timestep, target_size, crop_coords, hidden_states.dtype) - temb = F.silu(temb) # 3. Transformer blocks - for block in self.transformer_blocks: + for layer_idx, block in enumerate(self.transformer_blocks): + # Get layer-specific KV cache if available + layer_kv_cache = kv_cache[layer_idx] if kv_cache is not None else None + hidden_states, encoder_hidden_states = block( hidden_states, encoder_hidden_states, @@ -526,6 +695,8 @@ def forward( image_rotary_emb, attention_mask, attention_kwargs, + kv_cache=layer_kv_cache, + kv_cache_mode=kv_cache_mode, ) # 4. Output norm & projection @@ -593,29 +764,28 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: return loaded_params - # Image Editing Support: KV Cache State Management - def set_attention_processors_state(self, state: GlmImageAttenProcessorState): + def create_kv_cache(self) -> GlmImageKVCache: """ - Set the attention processor state for all transformer blocks. + Create a KV cache for image editing. - This controls how KV cache is handled during image editing: - - ImageGen: Normal generation, no caching - - ImageEditWriteKV: Cache KV from condition images - - ImageEditReadKV: Use cached KV during generation - - ImageEditDontReadKV: Skip reading cache + Returns a new GlmImageKVCache instance sized for this model's + number of transformer layers. Use this for image editing workflows. - Args: - state: The attention processor state to set. 
- """ - for block in self.transformer_blocks: - block.attn.processor_state = state + Example: + kv_cache = transformer.create_kv_cache() + kv_cache.set_mode("write") + transformer(condition_image, kv_cache=kv_cache) + kv_cache.set_mode("read") + for t in timesteps: + transformer(noisy_target, kv_cache=kv_cache) + kv_cache.clear() - def clear_attention_processors_cache(self): + Returns: + GlmImageKVCache instance with correct number of layers. """ - Clear the KV cache in all attention layers. + return GlmImageKVCache(num_layers=len(self.transformer_blocks)) - Should be called before processing a new image editing request - to ensure no stale cache from previous requests. - """ - for block in self.transformer_blocks: - block.attn.clear_cache() + @property + def num_layers(self) -> int: + """Return number of transformer layers.""" + return len(self.transformer_blocks) From 86646950650b24b419d21fc4031c7dd6700f43ee Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 9 Jan 2026 13:54:12 +0800 Subject: [PATCH 04/59] init pipeline_glm_image.py Signed-off-by: JaredforReal --- .../diffusion/models/glm_image/__init__.py | 4 + .../models/glm_image/glm_image_transformer.py | 5 + .../models/glm_image/pipeline_glm_image.py | 684 +++++++++++++++++- 3 files changed, 676 insertions(+), 17 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/__init__.py b/vllm_omni/diffusion/models/glm_image/__init__.py index 64a305571ee..fc8256d8de6 100644 --- a/vllm_omni/diffusion/models/glm_image/__init__.py +++ b/vllm_omni/diffusion/models/glm_image/__init__.py @@ -3,15 +3,19 @@ """GLM Image diffusion model components.""" from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageKVCache, GlmImageTransformer2DModel, ) from vllm_omni.diffusion.models.glm_image.pipeline_glm_image import ( GlmImagePipeline, get_glm_image_post_process_func, + get_glm_image_pre_process_func, ) __all__ = [ + "GlmImageKVCache", "GlmImagePipeline", "GlmImageTransformer2DModel", 
"get_glm_image_post_process_func", + "get_glm_image_pre_process_func", ] diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py index 40341c23d9b..4aceb4ebfff 100644 --- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py +++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py @@ -789,3 +789,8 @@ def create_kv_cache(self) -> GlmImageKVCache: def num_layers(self) -> int: """Return number of transformer layers.""" return len(self.transformer_blocks) + + @property + def dtype(self) -> torch.dtype: + """Return dtype of model parameters.""" + return next(self.parameters()).dtype diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index ad50b2c3f62..ebed17f37b5 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -1,37 +1,47 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +GlmImagePipeline implementation for vLLM-Omni. 
+ +This pipeline implements GLM-Image text-to-image generation with: +- AR stage: GlmImageForConditionalGeneration generates prior tokens +- DiT stage: GlmImageTransformer2DModel performs diffusion denoising +- VAE: AutoencoderKL decodes latents to images +""" + +from __future__ import annotations import inspect import json import logging import os +import re from collections.abc import Iterable -from typing import Any, Callable +from math import sqrt import numpy as np +import PIL.Image import torch -import torch.distributed as dist from diffusers.image_processor import VaeImageProcessor -from diffusers.models.autoencoders.autoencoder_kl import ( - AutoencoderKL, -) +from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL from diffusers.schedulers.scheduling_flow_match_euler_discrete import ( FlowMatchEulerDiscreteScheduler, ) from diffusers.utils.torch_utils import randn_tensor from torch import nn -from transformers import ByT5Tokenizer, T5EncoderModel, GlmImageProcessor, GlmImageForConditionalGeneration -from vllm.model_executor.models.utils import AutoWeightsLoader - -from vllm_omni.diffusion.data import OmniDiffusionConfig, DiffusionOutput -from vllm_omni.diffusion.distributed.parallel_state import ( - get_cfg_group, - get_classifier_free_guidance_rank, - get_classifier_free_guidance_world_size, +from transformers import ( + ByT5Tokenizer, + GlmImageForConditionalGeneration, + GlmImageProcessor, + T5EncoderModel, ) + +from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader -from vllm_omni.diffusion.models.glm_image.glm_image_transformer import GlmImageTransformer2DModel +from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageTransformer2DModel, +) from vllm_omni.diffusion.request import OmniDiffusionRequest from 
vllm_omni.model_executor.model_loader.weight_utils import ( download_weights_from_hf_specific, @@ -39,6 +49,646 @@ logger = logging.getLogger(__name__) -def get_glm_image_post_process_func( - od_config: OmniDiffusionConfig, -): + +def calculate_shift( + image_seq_len: int, + base_seq_len: int = 256, + base_shift: float = 0.25, + max_shift: float = 0.75, +) -> float: + """Calculate timestep shift based on image sequence length.""" + m = (image_seq_len / base_seq_len) ** 0.5 + mu = m * max_shift + base_shift + return mu + + +def retrieve_timesteps( + scheduler, + num_inference_steps: int | None = None, + device: str | torch.device | None = None, + timesteps: list[int] | None = None, + sigmas: list[float] | None = None, + **kwargs, +) -> tuple[torch.Tensor, int]: + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps. + """ + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + accepts_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + + if timesteps is not None and sigmas is not None: + raise ValueError("Cannot pass both `timesteps` and `sigmas`.") + + if timesteps is not None: + if not accepts_timesteps: + raise ValueError(f"Scheduler {scheduler.__class__} doesn't support custom timesteps.") + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + if not accepts_sigmas: + raise ValueError(f"Scheduler {scheduler.__class__} doesn't support custom sigmas.") + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + + return timesteps, num_inference_steps + + +def retrieve_latents( + encoder_output: torch.Tensor, + generator: torch.Generator | None = 
None, + sample_mode: str = "sample", +) -> torch.Tensor: + """Extract latents from VAE encoder output.""" + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +def get_glm_image_post_process_func(od_config: OmniDiffusionConfig): + """Get post-processing function for GLM-Image pipeline.""" + model_name = od_config.model + if os.path.exists(model_name): + model_path = model_name + else: + model_path = download_weights_from_hf_specific(model_name, None, ["*"]) + + vae_config_path = os.path.join(model_path, "vae/config.json") + with open(vae_config_path) as f: + vae_config = json.load(f) + block_out_channels = vae_config.get("block_out_channels", [128, 256, 512, 512]) + vae_scale_factor = 2 ** (len(block_out_channels) - 1) + + image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor) + + def post_process_func(images: torch.Tensor): + return image_processor.postprocess(images) + + return post_process_func + + +def get_glm_image_pre_process_func(od_config: OmniDiffusionConfig): + """Get pre-processing function for GLM-Image pipeline. + + For text-to-image, no pre-processing is needed. + For image-to-image, could handle condition image processing. + """ + + def pre_process_func(requests: list[OmniDiffusionRequest]) -> list[OmniDiffusionRequest]: + # Currently just pass through, can add image preprocessing later + return requests + + return pre_process_func + + +class GlmImagePipeline(nn.Module): + """ + GLM-Image Pipeline for text-to-image and image-to-image generation. 
+ + This pipeline integrates: + - AR model (GlmImageForConditionalGeneration): Generates prior image tokens + - Text encoder (T5EncoderModel): Encodes glyph/text embeddings + - DiT model (GlmImageTransformer2DModel): Diffusion transformer + - VAE (AutoencoderKL): Encodes/decodes images to/from latent space + + The pipeline flow: + 1. AR generates prior_token_ids from text prompt + 2. T5 encodes glyph text for text rendering + 3. DiT performs iterative denoising conditioned on prior tokens + 4. VAE decodes final latents to image + """ + + def __init__( + self, + *, + od_config: OmniDiffusionConfig, + prefix: str = "", + ): + super().__init__() + self.od_config = od_config + self.parallel_config = od_config.parallel_config + self.device = get_local_device() + + model = od_config.model + local_files_only = os.path.exists(model) + + if local_files_only: + model_path = model + else: + model_path = download_weights_from_hf_specific(model, od_config.revision, ["*"]) + + # Load scheduler + self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained( + model_path, subfolder="scheduler", local_files_only=True + ) + + # Load AR model (vision_language_encoder) + logger.info("Loading GlmImageForConditionalGeneration (AR model)...") + self.vision_language_encoder = GlmImageForConditionalGeneration.from_pretrained( + model_path, + subfolder="vision_language_encoder", + local_files_only=True, + torch_dtype=torch.bfloat16, + ).to(self.device) + self.vision_language_encoder.eval() + + # Load processor for AR model + self.processor = GlmImageProcessor.from_pretrained(model_path, subfolder="processor", local_files_only=True) + + # Load text encoder (T5 for glyph embeddings) + logger.info("Loading T5EncoderModel (glyph encoder)...") + self.text_encoder = T5EncoderModel.from_pretrained( + model_path, + subfolder="text_encoder", + local_files_only=True, + torch_dtype=torch.bfloat16, + ).to(self.device) + self.text_encoder.eval() + + # Load tokenizer for glyph encoding + 
self.tokenizer = ByT5Tokenizer.from_pretrained(model_path, subfolder="tokenizer", local_files_only=True) + + # Load VAE + logger.info("Loading AutoencoderKL (VAE)...") + self.vae = AutoencoderKL.from_pretrained( + model_path, subfolder="vae", local_files_only=True, torch_dtype=torch.bfloat16 + ).to(self.device) + self.vae.eval() + + # Load transformer (DiT) + logger.info("Loading GlmImageTransformer2DModel (DiT)...") + self.transformer = GlmImageTransformer2DModel(od_config=od_config) + + # Weight sources for DiT loading + self.weights_sources = [ + DiffusersPipelineLoader.ComponentSource( + model_or_path=od_config.model, + subfolder="transformer", + revision=od_config.revision, + prefix="transformer.", + fall_back_to_pt=True, + ) + ] + + # Configure scale factors + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.default_sample_size = 128 + + # Get transformer config for patch size + self._patch_size = getattr(self.transformer, "patch_size", 2) + + # ==================== AR Stage Methods ==================== + + @staticmethod + def _build_image_grid_thw( + token_h: int, + token_w: int, + prev_token_h: int, + prev_token_w: int, + existing_grid: torch.Tensor | None = None, + device: torch.device | None = None, + ) -> torch.Tensor: + """Build image grid tensor for AR model.""" + if existing_grid is None or existing_grid.numel() == 0: + return torch.tensor( + [ + [1, token_h, token_w], + [1, prev_token_h, prev_token_w], + ], + device=device, + ) + else: + return torch.cat( + [existing_grid.to(device), torch.tensor([[1, token_h, token_w]], device=device)], + dim=0, + ) + + @staticmethod + def _calculate_ar_generation_params( + token_h: int, token_w: int, prev_token_h: int, prev_token_w: int, is_text_to_image: bool + ) -> tuple[int, int]: + """Calculate AR generation parameters.""" + large_image_tokens = token_h * token_w + small_image_tokens = 
prev_token_h * prev_token_w + + if is_text_to_image: + max_new_tokens = small_image_tokens + large_image_tokens + 1 + large_image_start_offset = small_image_tokens + else: + max_new_tokens = large_image_tokens + 1 + large_image_start_offset = 0 + + return max_new_tokens, large_image_start_offset + + @staticmethod + def _extract_large_image_tokens( + outputs: torch.Tensor, input_length: int, large_image_start_offset: int, large_image_tokens: int + ) -> torch.Tensor: + """Extract large image tokens from AR output.""" + generated_tokens = outputs[0][input_length:] + large_image_start = large_image_start_offset + large_image_end = large_image_start + large_image_tokens + return generated_tokens[large_image_start:large_image_end] + + @staticmethod + def _upsample_token_ids(token_ids: torch.Tensor, token_h: int, token_w: int) -> torch.Tensor: + """Upsample token IDs by 2x using nearest neighbor interpolation.""" + token_ids = token_ids.view(1, 1, token_h, token_w) + token_ids = torch.nn.functional.interpolate(token_ids.float(), scale_factor=2, mode="nearest").to( + dtype=torch.long + ) + token_ids = token_ids.view(1, -1) + return token_ids + + @staticmethod + def _build_prompt_with_shape( + prompt: str, + height: int, + width: int, + is_text_to_image: bool, + factor: int = 32, + ) -> tuple[str, int, int, int, int]: + """Build prompt with shape information for AR model.""" + token_h = height // factor + token_w = width // factor + ratio = token_h / token_w + prev_token_h = int(sqrt(ratio) * (factor // 2)) + prev_token_w = int(sqrt(1 / ratio) * (factor // 2)) + + if is_text_to_image: + expanded_prompt = f"{prompt}{token_h} {token_w}{prev_token_h} {prev_token_w}" + else: + expanded_prompt = f"{prompt}{token_h} {token_w}" + + return expanded_prompt, token_h, token_w, prev_token_h, prev_token_w + + @torch.inference_mode() + def generate_prior_tokens( + self, + prompt: str, + height: int, + width: int, + image: list[PIL.Image.Image] | None = None, + factor: int = 32, + ) -> 
tuple[torch.Tensor, torch.Tensor | None, int, int]: + """ + Generate prior tokens using the AR model. + + Args: + prompt: Text prompt for generation + height: Target image height + width: Target image width + image: Optional condition images for image-to-image + factor: Token factor (default 32) + + Returns: + Tuple of (prior_token_ids, prior_token_image_ids, pixel_height, pixel_width) + """ + device = self.vision_language_encoder.device + height = (height // factor) * factor + width = (width // factor) * factor + is_text_to_image = image is None or len(image) == 0 + + expanded_prompt, token_h, token_w, prev_h, prev_w = self._build_prompt_with_shape( + prompt, height, width, is_text_to_image + ) + + # Build message content + content = [] + if image is not None: + for img in image: + content.append({"type": "image", "image": img}) + content.append({"type": "text", "text": expanded_prompt}) + messages = [{"role": "user", "content": content}] + + # Apply chat template + inputs = self.processor.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=True, + return_dict=True, + return_tensors="pt", + ) + + # Build image grid + existing_grid = inputs.get("image_grid_thw") + inputs["image_grid_thw"] = self._build_image_grid_thw( + token_h, + token_w, + prev_h, + prev_w, + existing_grid=existing_grid if not is_text_to_image else None, + device=device, + ) + + max_new_tokens, large_image_offset = self._calculate_ar_generation_params( + token_h, token_w, prev_h, prev_w, is_text_to_image + ) + large_image_tokens = token_h * token_w + + inputs = inputs.to(device) + input_length = inputs["input_ids"].shape[-1] + + # Process condition images if provided + prior_token_image_ids = None + if image is not None and existing_grid is not None: + prior_token_image_embed = self.vision_language_encoder.get_image_features( + inputs["pixel_values"], existing_grid + ) + prior_token_image_embed = torch.cat(prior_token_image_embed, dim=0) + prior_token_image_ids = 
self.vision_language_encoder.get_image_tokens( + prior_token_image_embed, existing_grid + ) + + # Generate with AR model + outputs = self.vision_language_encoder.generate( + **inputs, + max_new_tokens=max_new_tokens, + do_sample=True, + ) + + # Extract and upsample tokens + prior_token_ids_d32 = self._extract_large_image_tokens( + outputs, input_length, large_image_offset, large_image_tokens + ) + prior_token_ids = self._upsample_token_ids(prior_token_ids_d32, token_h, token_w) + + pixel_height = token_h * factor + pixel_width = token_w * factor + + return prior_token_ids, prior_token_image_ids, pixel_height, pixel_width + + # ==================== Text Encoding Methods ==================== + + def get_glyph_texts(self, prompt: str | list[str]) -> list[str]: + """Extract text within quotes for glyph rendering.""" + prompt = prompt[0] if isinstance(prompt, list) else prompt + ocr_texts = ( + re.findall(r"'([^']*)'", prompt) + + re.findall(r"“([^“”]*)”", prompt) + + re.findall(r'"([^"]*)"', prompt) + + re.findall(r"「([^「」]*)」", prompt) + ) + return ocr_texts + + def _get_glyph_embeds( + self, + prompt: str | list[str], + max_sequence_length: int = 2048, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + ) -> torch.Tensor: + """Get glyph embeddings from T5 encoder for text rendering.""" + device = device or self.device + dtype = dtype or self.text_encoder.dtype + + glyph_texts = self.get_glyph_texts(prompt) + input_ids = self.tokenizer( + glyph_texts if len(glyph_texts) > 0 else [""], + max_length=max_sequence_length, + truncation=True, + ).input_ids + + # Pad to even length + input_ids = [[self.tokenizer.pad_token_id] * ((len(ids) + 1) % 2) + ids for ids in input_ids] + max_length = max(len(ids) for ids in input_ids) + + attention_mask = torch.tensor( + [[1] * len(ids) + [0] * (max_length - len(ids)) for ids in input_ids], + device=device, + ) + input_ids = torch.tensor( + [ids + [self.tokenizer.pad_token_id] * (max_length - len(ids)) for ids 
in input_ids], + device=device, + ) + + outputs = self.text_encoder(input_ids, attention_mask=attention_mask) + glyph_embeds = outputs.last_hidden_state[attention_mask.bool()].unsqueeze(0) + + return glyph_embeds.to(device=device, dtype=dtype) + + def encode_prompt( + self, + prompt: str | list[str], + do_classifier_free_guidance: bool = True, + num_images_per_prompt: int = 1, + prompt_embeds: torch.Tensor | None = None, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + max_sequence_length: int = 2048, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + """Encode prompt into glyph embeddings for text rendering.""" + device = device or self.device + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_embeds = self._get_glyph_embeds(prompt, max_sequence_length, device, dtype) + + seq_len = prompt_embeds.size(1) + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + negative_prompt_embeds = None + if do_classifier_free_guidance: + negative_prompt = [""] * batch_size + negative_prompt_embeds = self._get_glyph_embeds(negative_prompt, max_sequence_length, device, dtype) + seq_len = negative_prompt_embeds.size(1) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + return prompt_embeds, negative_prompt_embeds + + # ==================== Latent Preparation ==================== + + def prepare_latents( + self, + batch_size: int, + num_channels_latents: int, + height: int, + width: int, + dtype: torch.dtype, + device: torch.device, + generator: torch.Generator | None, + latents: torch.Tensor | None = None, + ) -> torch.Tensor: + """Prepare random noise latents.""" + if latents is not 
None: + return latents.to(device) + + shape = ( + batch_size, + num_channels_latents, + int(height) // self.vae_scale_factor, + int(width) // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError(f"Passed {len(generator)} generators but batch size is {batch_size}.") + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + return latents + + # ==================== Main Forward Pass ==================== + + @torch.inference_mode() + def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: + """ + Main generation forward pass. + + Args: + req: OmniDiffusionRequest with generation parameters + + Returns: + DiffusionOutput containing generated image + """ + prompt = req.prompt or "" + if isinstance(prompt, list): + prompt = prompt[0] if prompt else "" + + height = req.height or self.default_sample_size * self.vae_scale_factor + width = req.width or self.default_sample_size * self.vae_scale_factor + num_inference_steps = req.num_inference_steps or 50 + guidance_scale = req.guidance_scale or 1.5 + + batch_size = 1 + do_classifier_free_guidance = guidance_scale > 1.0 + + # Set seed if provided + generator = None + if req.seed is not None: + generator = torch.Generator(device=self.device).manual_seed(req.seed) + + # 1. Generate prior tokens with AR model + logger.info("Generating prior tokens with AR model...") + prior_token_id, prior_token_image_ids, ar_height, ar_width = self.generate_prior_tokens( + prompt=prompt, + image=None, # Text-to-image for now + height=height, + width=width, + ) + height = ar_height + width = ar_width + + # 2. Encode prompt for glyph embeddings + logger.info("Encoding prompt...") + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + num_images_per_prompt=1, + device=self.device, + dtype=self.transformer.dtype, + ) + + # 3. 
Prepare latents + latent_channels = self.transformer.in_channels + latents = self.prepare_latents( + batch_size=batch_size, + num_channels_latents=latent_channels, + height=height, + width=width, + dtype=prompt_embeds.dtype, + device=self.device, + generator=generator, + ) + + # 4. Prepare timesteps + image_seq_len = ((height // self.vae_scale_factor) * (width // self.vae_scale_factor)) // (self._patch_size**2) + timesteps_array = np.linspace(self.scheduler.config.num_train_timesteps, 1.0, num_inference_steps + 1)[:-1] + timesteps_array = timesteps_array.astype(np.int64).astype(np.float32) + sigmas = timesteps_array / self.scheduler.config.num_train_timesteps + + mu = calculate_shift( + image_seq_len, + self.scheduler.config.get("base_image_seq_len", 256), + self.scheduler.config.get("base_shift", 0.25), + self.scheduler.config.get("max_shift", 0.75), + ) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, self.device, timesteps_array.tolist(), sigmas.tolist(), mu=mu + ) + + # 5. Prepare conditioning tensors + target_size = torch.tensor([[height, width]], dtype=prompt_embeds.dtype, device=self.device) + crop_coords = torch.zeros((1, 2), dtype=prompt_embeds.dtype, device=self.device) + + prior_token_drop_cond = torch.full_like(prior_token_id, False, dtype=torch.bool) + prior_token_drop_uncond = torch.full_like(prior_token_id, True, dtype=torch.bool) + + # 6. 
Denoising loop + logger.info(f"Starting denoising loop with {num_inference_steps} steps...") + transformer_dtype = self.transformer.dtype + + for i, t in enumerate(timesteps): + latent_model_input = latents.to(transformer_dtype) + timestep = t.expand(latents.shape[0]) - 1 + + # Conditional forward pass + noise_pred_cond = self.transformer( + hidden_states=latent_model_input, + encoder_hidden_states=prompt_embeds, + prior_token_id=prior_token_id, + prior_token_drop=prior_token_drop_cond, + timestep=timestep, + target_size=target_size, + crop_coords=crop_coords, + return_dict=False, + )[0].float() + + # CFG: Unconditional forward pass + if do_classifier_free_guidance: + noise_pred_uncond = self.transformer( + hidden_states=latent_model_input, + encoder_hidden_states=negative_prompt_embeds, + prior_token_id=prior_token_id, + prior_token_drop=prior_token_drop_uncond, + timestep=timestep, + target_size=target_size, + crop_coords=crop_coords, + return_dict=False, + )[0].float() + + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + else: + noise_pred = noise_pred_cond + + # Scheduler step + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + # 7. VAE decode + logger.info("Decoding latents with VAE...") + latents = latents.to(self.vae.dtype) + latents_mean = ( + torch.tensor(self.vae.config.latents_mean) + .view(1, self.vae.config.latent_channels, 1, 1) + .to(latents.device, latents.dtype) + ) + latents_std = ( + torch.tensor(self.vae.config.latents_std) + .view(1, self.vae.config.latent_channels, 1, 1) + .to(latents.device, latents.dtype) + ) + latents = latents * latents_std + latents_mean + image = self.vae.decode(latents, return_dict=False, generator=generator)[0] + + # 8. 
Post-process + image = self.image_processor.postprocess(image, output_type="pil")[0] + + return DiffusionOutput(output=image) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load transformer weights.""" + # Filter weights for transformer only + transformer_weights = ( + (name.replace("transformer.", ""), weight) for name, weight in weights if name.startswith("transformer.") + ) + return self.transformer.load_weights(transformer_weights) From b88b4b20acb2a72dd0603fee0aa938be7d60b615 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 9 Jan 2026 14:20:20 +0800 Subject: [PATCH 05/59] init pipeline_glm_image.py Signed-off-by: JaredforReal --- .../models/glm_image/pipeline_glm_image.py | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index ebed17f37b5..657bbe898b4 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -72,22 +72,36 @@ def retrieve_timesteps( ) -> tuple[torch.Tensor, int]: """ Calls the scheduler's `set_timesteps` method and retrieves timesteps. + Handles custom timesteps and sigmas schedules. """ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) accepts_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) if timesteps is not None and sigmas is not None: - raise ValueError("Cannot pass both `timesteps` and `sigmas`.") - - if timesteps is not None: + # Both provided - check if scheduler supports both + if not accepts_timesteps and not accepts_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep or sigma schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(timesteps=timesteps, sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif timesteps is not None: if not accepts_timesteps: - raise ValueError(f"Scheduler {scheduler.__class__} doesn't support custom timesteps.") + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) elif sigmas is not None: if not accepts_sigmas: - raise ValueError(f"Scheduler {scheduler.__class__} doesn't support custom sigmas.") + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigma schedules. Please check whether you are using the correct scheduler." + ) scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) timesteps = scheduler.timesteps num_inference_steps = len(timesteps) From b9108f4509a93d0607b78c07446052e93550e307 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 9 Jan 2026 14:38:55 +0800 Subject: [PATCH 06/59] remove pre process Signed-off-by: JaredforReal --- .../models/glm_image/pipeline_glm_image.py | 58 +++++++------------ vllm_omni/diffusion/registry.py | 1 - 2 files changed, 22 insertions(+), 37 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index 657bbe898b4..72026406813 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -50,6 +50,28 @@ logger = logging.getLogger(__name__) +def get_glm_image_post_process_func(od_config: OmniDiffusionConfig): + """Get post-processing function for GLM-Image pipeline.""" + model_name = od_config.model 
+ if os.path.exists(model_name): + model_path = model_name + else: + model_path = download_weights_from_hf_specific(model_name, None, ["*"]) + + vae_config_path = os.path.join(model_path, "vae/config.json") + with open(vae_config_path) as f: + vae_config = json.load(f) + block_out_channels = vae_config.get("block_out_channels", [128, 256, 512, 512]) + vae_scale_factor = 2 ** (len(block_out_channels) - 1) + + image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor) + + def post_process_func(images: torch.Tensor): + return image_processor.postprocess(images) + + return post_process_func + + def calculate_shift( image_seq_len: int, base_seq_len: int = 256, @@ -128,42 +150,6 @@ def retrieve_latents( raise AttributeError("Could not access latents of provided encoder_output") -def get_glm_image_post_process_func(od_config: OmniDiffusionConfig): - """Get post-processing function for GLM-Image pipeline.""" - model_name = od_config.model - if os.path.exists(model_name): - model_path = model_name - else: - model_path = download_weights_from_hf_specific(model_name, None, ["*"]) - - vae_config_path = os.path.join(model_path, "vae/config.json") - with open(vae_config_path) as f: - vae_config = json.load(f) - block_out_channels = vae_config.get("block_out_channels", [128, 256, 512, 512]) - vae_scale_factor = 2 ** (len(block_out_channels) - 1) - - image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor) - - def post_process_func(images: torch.Tensor): - return image_processor.postprocess(images) - - return post_process_func - - -def get_glm_image_pre_process_func(od_config: OmniDiffusionConfig): - """Get pre-processing function for GLM-Image pipeline. - - For text-to-image, no pre-processing is needed. - For image-to-image, could handle condition image processing. 
- """ - - def pre_process_func(requests: list[OmniDiffusionRequest]) -> list[OmniDiffusionRequest]: - # Currently just pass through, can add image preprocessing later - return requests - - return pre_process_func - - class GlmImagePipeline(nn.Module): """ GLM-Image Pipeline for text-to-image and image-to-image generation. diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index 743e9f4c455..ff29be67345 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -134,7 +134,6 @@ def initialize_model( # where mod_folder and mod_relname are defined and mapped using `_DIFFUSION_MODELS` via the `arch` key "QwenImageEditPipeline": "get_qwen_image_edit_pre_process_func", "QwenImageEditPlusPipeline": "get_qwen_image_edit_plus_pre_process_func", - "GlmImagePipeline": "get_glm_image_pre_process_func", "LongCatImageEditPipeline": "get_longcat_image_edit_pre_process_func", "QwenImageLayeredPipeline": "get_qwen_image_layered_pre_process_func", "WanPipeline": "get_wan22_pre_process_func", From 371afd55f314658ca80c62d1044e153d170311bc Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 9 Jan 2026 17:13:11 +0800 Subject: [PATCH 07/59] add check_input(), implement CFG parallel in diffuse(), align generate_prior_tokens Signed-off-by: JaredforReal --- .../models/glm_image/pipeline_glm_image.py | 234 ++++++++++++++---- 1 file changed, 186 insertions(+), 48 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index 72026406813..96d944f7e31 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -37,9 +37,15 @@ ) from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig +from vllm_omni.diffusion.distributed.parallel_state import ( + get_cfg_group, + get_classifier_free_guidance_rank, + get_classifier_free_guidance_world_size, +) from 
vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.glm_image.glm_image_transformer import ( + GlmImageKVCache, GlmImageTransformer2DModel, ) from vllm_omni.diffusion.request import OmniDiffusionRequest @@ -247,6 +253,41 @@ def __init__( # Get transformer config for patch size self._patch_size = getattr(self.transformer, "patch_size", 2) + # ==================== Input Validation ==================== + + def check_inputs( + self, + prompt: str | list[str] | None, + height: int | None, + width: int | None, + prompt_embeds: torch.Tensor | None = None, + ) -> None: + """Validate input arguments before generation.""" + # Check dimension alignment + multiple_of = self.vae_scale_factor * self._patch_size + if height is not None and height % multiple_of != 0: + logger.warning( + f"`height` should be divisible by {multiple_of} but is {height}. " + "Dimensions will be adjusted accordingly." + ) + if width is not None and width % multiple_of != 0: + logger.warning( + f"`width` should be divisible by {multiple_of} but is {width}. Dimensions will be adjusted accordingly." + ) + + # Check prompt/prompt_embeds mutual exclusivity + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. " + "Please provide only one of the two." + ) + if prompt is None and prompt_embeds is None: + raise ValueError("Provide either `prompt` or `prompt_embeds`. 
Cannot leave both undefined.") + + # Check prompt type + if prompt is not None and not isinstance(prompt, (str, list)): + raise ValueError(f"`prompt` must be of type `str` or `list` but is {type(prompt)}") + # ==================== AR Stage Methods ==================== @staticmethod @@ -423,10 +464,7 @@ def generate_prior_tokens( ) prior_token_ids = self._upsample_token_ids(prior_token_ids_d32, token_h, token_w) - pixel_height = token_h * factor - pixel_width = token_w * factor - - return prior_token_ids, prior_token_image_ids, pixel_height, pixel_width + return prior_token_ids, prior_token_image_ids # ==================== Text Encoding Methods ==================== @@ -538,6 +576,133 @@ def prepare_latents( latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) return latents + def diffuse( + self, + latents: torch.Tensor, + prior_token_id: torch.Tensor, + prompt_embeds: torch.Tensor, + negative_prompt_embeds: torch.Tensor | None, + timesteps: torch.Tensor, + target_size: torch.Tensor, + crop_coords: torch.Tensor, + guidance_scale: float, + do_classifier_free_guidance: bool, + kv_caches: GlmImageKVCache | None = None, + ) -> torch.Tensor: + """ + Denoising loop for diffusion process with CFG-Parallel support. 
+ + Args: + latents: Initial noise latents + prior_token_id: Prior tokens generated by AR model + prompt_embeds: Encoded positive prompt embeddings (glyph embeddings) + negative_prompt_embeds: Encoded negative prompt embeddings + timesteps: Denoising timesteps + target_size: Target image size tensor [[height, width]] + crop_coords: Crop coordinates tensor + guidance_scale: CFG scale + do_classifier_free_guidance: Whether to apply CFG + kv_caches: Optional KV cache for Image Edit mode + + Returns: + Denoised latents ready for VAE decode + """ + # Prepare conditional/unconditional drop flags + prior_token_drop_cond = torch.full_like(prior_token_id, False, dtype=torch.bool) + prior_token_drop_uncond = torch.full_like(prior_token_id, True, dtype=torch.bool) + + transformer_dtype = self.transformer.dtype + + # Enable CFG-parallel: rank0 computes positive, rank1 computes negative + cfg_parallel_ready = do_classifier_free_guidance and get_classifier_free_guidance_world_size() > 1 + + for i, t in enumerate(timesteps): + latent_model_input = latents.to(transformer_dtype) + timestep = t.expand(latents.shape[0]) - 1 + + if cfg_parallel_ready: + cfg_group = get_cfg_group() + cfg_rank = get_classifier_free_guidance_rank() + + if cfg_rank == 0: + # Rank 0: Compute positive (conditional) prediction + local_pred = self.transformer( + hidden_states=latent_model_input, + encoder_hidden_states=prompt_embeds, + prior_token_id=prior_token_id, + prior_token_drop=prior_token_drop_cond, + timestep=timestep, + target_size=target_size, + crop_coords=crop_coords, + kv_caches=kv_caches, + return_dict=False, + )[0].float() + else: + # Rank 1: Compute negative (unconditional) prediction + local_pred = self.transformer( + hidden_states=latent_model_input, + encoder_hidden_states=negative_prompt_embeds, + prior_token_id=prior_token_id, + prior_token_drop=prior_token_drop_uncond, + timestep=timestep, + target_size=target_size, + crop_coords=crop_coords, + kv_caches=kv_caches, + return_dict=False, 
+ )[0].float() + + # All-gather predictions from all ranks + gathered = cfg_group.all_gather(local_pred, separate_tensors=True) + + if cfg_rank == 0: + # Rank 0: Combine predictions and apply CFG + noise_pred_cond = gathered[0] + noise_pred_uncond = gathered[1] + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + # Scheduler step + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + # Broadcast updated latents to all ranks + cfg_group.broadcast(latents, src=0) + + else: + # Sequential CFG (single GPU or no CFG) + # Conditional forward pass + noise_pred_cond = self.transformer( + hidden_states=latent_model_input, + encoder_hidden_states=prompt_embeds, + prior_token_id=prior_token_id, + prior_token_drop=prior_token_drop_cond, + timestep=timestep, + target_size=target_size, + crop_coords=crop_coords, + kv_caches=kv_caches, + return_dict=False, + )[0].float() + + if do_classifier_free_guidance: + # Unconditional forward pass + noise_pred_uncond = self.transformer( + hidden_states=latent_model_input, + encoder_hidden_states=negative_prompt_embeds, + prior_token_id=prior_token_id, + prior_token_drop=prior_token_drop_uncond, + timestep=timestep, + target_size=target_size, + crop_coords=crop_coords, + kv_caches=kv_caches, + return_dict=False, + )[0].float() + + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) + else: + noise_pred = noise_pred_cond + + # Scheduler step + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + return latents + # ==================== Main Forward Pass ==================== @torch.inference_mode() @@ -560,6 +725,9 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: num_inference_steps = req.num_inference_steps or 50 guidance_scale = req.guidance_scale or 1.5 + # 0. 
Validate inputs + self.check_inputs(prompt=prompt, height=height, width=width) + batch_size = 1 do_classifier_free_guidance = guidance_scale > 1.0 @@ -570,14 +738,12 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: # 1. Generate prior tokens with AR model logger.info("Generating prior tokens with AR model...") - prior_token_id, prior_token_image_ids, ar_height, ar_width = self.generate_prior_tokens( + prior_token_id, prior_token_image_ids = self.generate_prior_tokens( prompt=prompt, image=None, # Text-to-image for now height=height, width=width, ) - height = ar_height - width = ar_width # 2. Encode prompt for glyph embeddings logger.info("Encoding prompt...") @@ -621,48 +787,20 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: target_size = torch.tensor([[height, width]], dtype=prompt_embeds.dtype, device=self.device) crop_coords = torch.zeros((1, 2), dtype=prompt_embeds.dtype, device=self.device) - prior_token_drop_cond = torch.full_like(prior_token_id, False, dtype=torch.bool) - prior_token_drop_uncond = torch.full_like(prior_token_id, True, dtype=torch.bool) - - # 6. Denoising loop + # 6. 
Denoising loop with CFG-parallel support logger.info(f"Starting denoising loop with {num_inference_steps} steps...") - transformer_dtype = self.transformer.dtype - - for i, t in enumerate(timesteps): - latent_model_input = latents.to(transformer_dtype) - timestep = t.expand(latents.shape[0]) - 1 - - # Conditional forward pass - noise_pred_cond = self.transformer( - hidden_states=latent_model_input, - encoder_hidden_states=prompt_embeds, - prior_token_id=prior_token_id, - prior_token_drop=prior_token_drop_cond, - timestep=timestep, - target_size=target_size, - crop_coords=crop_coords, - return_dict=False, - )[0].float() - - # CFG: Unconditional forward pass - if do_classifier_free_guidance: - noise_pred_uncond = self.transformer( - hidden_states=latent_model_input, - encoder_hidden_states=negative_prompt_embeds, - prior_token_id=prior_token_id, - prior_token_drop=prior_token_drop_uncond, - timestep=timestep, - target_size=target_size, - crop_coords=crop_coords, - return_dict=False, - )[0].float() - - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond) - else: - noise_pred = noise_pred_cond - - # Scheduler step - latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + latents = self.diffuse( + latents=latents, + prior_token_id=prior_token_id, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + timesteps=timesteps, + target_size=target_size, + crop_coords=crop_coords, + guidance_scale=guidance_scale, + do_classifier_free_guidance=do_classifier_free_guidance, + kv_caches=None, # TODO: Add KV cache support for Image Edit + ) # 7. 
VAE decode logger.info("Decoding latents with VAE...") From 3d4f5f245825172355d783685f611da2d1dd79e2 Mon Sep 17 00:00:00 2001 From: JaredforReal Date: Fri, 9 Jan 2026 18:04:58 +0800 Subject: [PATCH 08/59] fix check_input(prompt_embed), add KVCache for Image Edit Signed-off-by: JaredforReal --- .../models/glm_image/pipeline_glm_image.py | 155 ++++++++++++++++-- 1 file changed, 144 insertions(+), 11 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index 96d944f7e31..599eb2cdd8e 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -705,6 +705,112 @@ def diffuse( # ==================== Main Forward Pass ==================== + def _prepare_condition_image_kv_cache( + self, + condition_images: list[torch.Tensor], + prior_token_image_ids: list[torch.Tensor], + prompt_embeds: torch.Tensor, + generator: torch.Generator | None = None, + ) -> GlmImageKVCache: + """ + Prepare KV cache by running condition images through transformer at timestep 0. + + This is used for Image Edit mode where we need to cache the condition image's + KV states for cross-attention during denoising. 
+ + Args: + condition_images: List of preprocessed condition images + prior_token_image_ids: Prior token IDs for each condition image from AR model + prompt_embeds: Prompt embeddings (used to get dtype) + generator: Optional random generator + + Returns: + GlmImageKVCache with cached KV states from condition images + """ + kv_caches = self.transformer.create_kv_cache() + kv_caches.set_mode("write") + + # Prepare VAE normalization parameters + latents_mean = ( + torch.tensor(self.vae.config.latents_mean) + .view(1, self.vae.config.latent_channels, 1, 1) + .to(device=self.device, dtype=prompt_embeds.dtype) + ) + latents_std = ( + torch.tensor(self.vae.config.latents_std) + .view(1, self.vae.config.latent_channels, 1, 1) + .to(device=self.device, dtype=prompt_embeds.dtype) + ) + + # Process each condition image through transformer to populate KV cache + for condition_image, condition_prior_token_id in zip(condition_images, prior_token_image_ids): + condition_image = condition_image.to(device=self.device, dtype=prompt_embeds.dtype) + + # Encode condition image to latent space + # Use argmax (mode) for deterministic encoding of condition images + condition_latent = retrieve_latents( + self.vae.encode(condition_image), generator=generator, sample_mode="argmax" + ) + condition_latent = (condition_latent - latents_mean) / latents_std + + # Run forward pass at timestep 0 to cache KV states + # Empty encoder_hidden_states since we only want to cache image features + _ = self.transformer( + hidden_states=condition_latent, + encoder_hidden_states=torch.zeros_like(prompt_embeds)[:1, :0, ...], + prior_token_id=condition_prior_token_id, + prior_token_drop=torch.full_like(condition_prior_token_id, False, dtype=torch.bool), + timestep=torch.zeros((1,), device=self.device), + target_size=torch.tensor([condition_image.shape[-2:]], device=self.device, dtype=prompt_embeds.dtype), + crop_coords=torch.zeros((1, 2), device=self.device, dtype=prompt_embeds.dtype), + kv_caches=kv_caches, + 
return_dict=False, + ) + + return kv_caches + + def _preprocess_condition_images( + self, + images: list[PIL.Image.Image] | PIL.Image.Image | None, + ) -> tuple[list[torch.Tensor] | None, int | None, int | None]: + """ + Preprocess condition images for Image Edit mode. + + Args: + images: Input images (PIL or list of PIL) + + Returns: + Tuple of (preprocessed_images, height, width) + """ + if images is None: + return None, None, None + + if not isinstance(images, list): + images = [images] + + preprocessed = [] + height, width = None, None + + for img in images: + if isinstance(img, PIL.Image.Image): + img_h, img_w = img.size[::-1] + else: + img_h, img_w = img.shape[:2] + + # Align to multiple of vae_scale_factor * patch_size + multiple_of = self.vae_scale_factor * self._patch_size + img_h = (img_h // multiple_of) * multiple_of + img_w = (img_w // multiple_of) * multiple_of + + processed = self.image_processor.preprocess(img, height=img_h, width=img_w) + preprocessed.append(processed) + + # Use first image dimensions as default + if height is None: + height, width = img_h, img_w + + return preprocessed, height, width + @torch.inference_mode() def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: """ @@ -720,13 +826,26 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: if isinstance(prompt, list): prompt = prompt[0] if prompt else "" - height = req.height or self.default_sample_size * self.vae_scale_factor - width = req.width or self.default_sample_size * self.vae_scale_factor + # Get pre-computed prompt embeddings if provided + prompt_embeds = req.prompt_embeds if isinstance(req.prompt_embeds, torch.Tensor) else None + + # Get condition images for Image Edit mode + condition_images = req.pil_image + if condition_images is not None and not isinstance(condition_images, list): + condition_images = [condition_images] + + # Preprocess condition images and get dimensions + preprocessed_images, img_height, img_width = 
self._preprocess_condition_images(condition_images) + is_image_edit = preprocessed_images is not None + + # Use image dimensions as default if available + height = req.height or img_height or self.default_sample_size * self.vae_scale_factor + width = req.width or img_width or self.default_sample_size * self.vae_scale_factor num_inference_steps = req.num_inference_steps or 50 guidance_scale = req.guidance_scale or 1.5 # 0. Validate inputs - self.check_inputs(prompt=prompt, height=height, width=width) + self.check_inputs(prompt=prompt, height=height, width=width, prompt_embeds=prompt_embeds) batch_size = 1 do_classifier_free_guidance = guidance_scale > 1.0 @@ -740,7 +859,7 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: logger.info("Generating prior tokens with AR model...") prior_token_id, prior_token_image_ids = self.generate_prior_tokens( prompt=prompt, - image=None, # Text-to-image for now + image=condition_images, height=height, width=width, ) @@ -751,11 +870,25 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: prompt, do_classifier_free_guidance=do_classifier_free_guidance, num_images_per_prompt=1, + prompt_embeds=prompt_embeds, device=self.device, dtype=self.transformer.dtype, ) - # 3. Prepare latents + # 3. Prepare KV cache for Image Edit mode + kv_caches = None + if is_image_edit and prior_token_image_ids is not None: + logger.info("Preparing KV cache for Image Edit mode...") + kv_caches = self._prepare_condition_image_kv_cache( + condition_images=preprocessed_images, + prior_token_image_ids=prior_token_image_ids, + prompt_embeds=prompt_embeds, + generator=generator, + ) + # Switch to read mode for denoising + kv_caches.set_mode("read") + + # 4. Prepare latents latent_channels = self.transformer.in_channels latents = self.prepare_latents( batch_size=batch_size, @@ -767,7 +900,7 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: generator=generator, ) - # 4. Prepare timesteps + # 5. 
Prepare timesteps image_seq_len = ((height // self.vae_scale_factor) * (width // self.vae_scale_factor)) // (self._patch_size**2) timesteps_array = np.linspace(self.scheduler.config.num_train_timesteps, 1.0, num_inference_steps + 1)[:-1] timesteps_array = timesteps_array.astype(np.int64).astype(np.float32) @@ -783,11 +916,11 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: self.scheduler, num_inference_steps, self.device, timesteps_array.tolist(), sigmas.tolist(), mu=mu ) - # 5. Prepare conditioning tensors + # 6. Prepare conditioning tensors target_size = torch.tensor([[height, width]], dtype=prompt_embeds.dtype, device=self.device) crop_coords = torch.zeros((1, 2), dtype=prompt_embeds.dtype, device=self.device) - # 6. Denoising loop with CFG-parallel support + # 7. Denoising loop with CFG-parallel support logger.info(f"Starting denoising loop with {num_inference_steps} steps...") latents = self.diffuse( latents=latents, @@ -799,10 +932,10 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: crop_coords=crop_coords, guidance_scale=guidance_scale, do_classifier_free_guidance=do_classifier_free_guidance, - kv_caches=None, # TODO: Add KV cache support for Image Edit + kv_caches=kv_caches, ) - # 7. VAE decode + # 8. VAE decode logger.info("Decoding latents with VAE...") latents = latents.to(self.vae.dtype) latents_mean = ( @@ -818,7 +951,7 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: latents = latents * latents_std + latents_mean image = self.vae.decode(latents, return_dict=False, generator=generator)[0] - # 8. Post-process + # 9. 
Post-process image = self.image_processor.postprocess(image, output_type="pil")[0] return DiffusionOutput(output=image) From 0810dae881dbc5b160f62f97dfc11584f2565e09 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 13 Jan 2026 06:28:23 +0000 Subject: [PATCH 09/59] print out vllm version Signed-off-by: root --- examples/offline_inference/qwen3_omni/end2end.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py index 7280d7d2e23..9a1324305cf 100644 --- a/examples/offline_inference/qwen3_omni/end2end.py +++ b/examples/offline_inference/qwen3_omni/end2end.py @@ -13,6 +13,7 @@ import numpy as np import soundfile as sf from PIL import Image +import vllm from vllm import SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset @@ -237,6 +238,7 @@ def get_multi_audios_query() -> QueryResult: def main(args): model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct" + print(f"="*20,"\n",f"vllm version: {vllm.__version__}","\n","="*20) # Get paths from args video_path = getattr(args, "video_path", None) @@ -302,8 +304,8 @@ def main(args): sampling_params_list = [ thinker_sampling_params, - talker_sampling_params, # code predictor is integrated into talker for Qwen3 Omni - code2wav_sampling_params, + # talker_sampling_params, # code predictor is integrated into talker for Qwen3 Omni + # code2wav_sampling_params, ] if args.txt_prompts is None: From 8e36c517381eb17e4cb69a72282510061586c1b1 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Tue, 13 Jan 2026 06:33:39 +0000 Subject: [PATCH 10/59] update model config Signed-off-by: tzhouam --- vllm_omni/config/model.py | 162 ++++++++------------------------------ 1 file changed, 35 insertions(+), 127 deletions(-) diff --git a/vllm_omni/config/model.py b/vllm_omni/config/model.py index e074689c9e2..2e53a7af2e1 100644 --- a/vllm_omni/config/model.py +++ b/vllm_omni/config/model.py @@ -6,15 +6,10 
@@ import vllm.envs as envs from pydantic import ConfigDict from pydantic.dataclasses import dataclass -from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.config import ModelConfig, config from vllm.config.model import ( _RUNNER_CONVERTS, - _RUNNER_TASKS, - ConvertOption, - ConvertType, - RunnerOption, - TaskOption, _get_and_verify_dtype, get_served_model_name, ) @@ -31,11 +26,8 @@ from vllm.transformers_utils.gguf_utils import ( maybe_patch_hf_config_from_gguf, ) -from vllm.transformers_utils.utils import ( - is_gguf, - maybe_model_redirect, -) - +from vllm.transformers_utils.utils import maybe_model_redirect +from vllm.transformers_utils.gguf_utils import is_gguf import vllm_omni.model_executor.models as me_models logger = init_logger(__name__) @@ -116,7 +108,9 @@ def __post_init__( video_pruning_rate: float | None, ) -> None: # Keep set served_model_name before maybe_model_redirect(self.model) - self.served_model_name = get_served_model_name(self.model, self.served_model_name) + self.served_model_name = get_served_model_name( + self.model, self.served_model_name + ) self.model = maybe_model_redirect(self.model) # The tokenizer is consistent with the model by default. if self.tokenizer is None: @@ -146,14 +140,6 @@ def __post_init__( self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) - if (backend := envs.VLLM_ATTENTION_BACKEND) and backend == "FLASHINFER" and find_spec("flashinfer") is None: - raise ValueError( - "VLLM_ATTENTION_BACKEND is set to FLASHINFER, but flashinfer " - "module was not found. See " - "https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile " # noqa: E501 - "for instructions on how to install it." 
- ) - if self.override_attention_dtype is not None and not current_platform.is_rocm(): warnings.warn( "override-attention-dtype is set but not using ROCm platform", @@ -181,115 +167,24 @@ def __post_init__( if dict_overrides: self._apply_dict_overrides(hf_config, dict_overrides) self.hf_text_config = self.draw_hf_text_config() - self.attention_chunk_size = getattr(self.hf_text_config, "attention_chunk_size", None) + self.attention_chunk_size = getattr( + self.hf_text_config, "attention_chunk_size", None + ) self.encoder_config = self._get_encoder_config() - # Try to load image processor config, but allow it to fail for stages that don't need it - try: - self.hf_image_processor_config = get_hf_image_processor_config( - self.model, hf_token=self.hf_token, revision=self.revision - ) - except (OSError, ValueError, IndexError) as e: - # Some stages (e.g., code2wav, talker) don't need image processor - # Log warning but allow initialization to continue - logger.warning( - f"Failed to load image processor config for model '{self.model}': {e}. " - "This is expected for stages that don't require image processing." 
- ) - self.hf_image_processor_config = None + self.hf_image_processor_config = get_hf_image_processor_config( + self.model, hf_token=self.hf_token, revision=self.revision + ) + self.model_arch_config = self.get_model_arch_config() architectures = self.architectures registry = self.registry is_generative_model = registry.is_text_generation_model(architectures, self) is_pooling_model = registry.is_pooling_model(architectures, self) - def _task_to_convert(task: TaskOption) -> ConvertType: - if task == "embedding" or task == "embed": - return "embed" - if task == "classify": - return "classify" - if task == "reward": - return "reward" - if task == "score": - new_task = self._get_default_pooling_task(architectures) - return "classify" if new_task == "classify" else "embed" - - return "none" - - if self.task is not None: - runner: RunnerOption = "auto" - convert: ConvertOption = "auto" - msg_prefix = ( - "The 'task' option has been deprecated and will be removed in v0.13.0 or v1.0, whichever comes first." - ) - msg_hint = "Please remove this option." - - is_generative_task = self.task in _RUNNER_TASKS["generate"] - is_pooling_task = self.task in _RUNNER_TASKS["pooling"] - - if is_generative_model and is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = ( - "Please replace this option with `--runner " - "generate` to continue using this model " - "as a generative model." - ) - elif is_pooling_task: - runner = "pooling" - convert = "auto" - msg_hint = ( - "Please replace this option with `--runner " - "pooling` to continue using this model " - "as a pooling model." 
- ) - else: # task == "auto" - pass - elif is_generative_model or is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = "Please remove this option" - elif is_pooling_task: - runner = "pooling" - convert = _task_to_convert(self.task) - msg_hint = ( - "Please replace this option with `--convert " - f"{convert}` to continue using this model " - "as a pooling model." - ) - else: # task == "auto" - pass - else: - # Neither generative nor pooling model - try to convert if possible - if is_pooling_task: - runner = "pooling" - convert = _task_to_convert(self.task) - msg_hint = ( - "Please replace this option with `--runner pooling " - f"--convert {convert}` to continue using this model " - "as a pooling model." - ) - else: - debug_info = { - "architectures": architectures, - "is_generative_model": is_generative_model, - "is_pooling_model": is_pooling_model, - } - raise AssertionError( - "The model should be a generative or " - "pooling model when task is set to " - f"{self.task!r}. 
Found: {debug_info}" - ) - - self.runner = runner - self.convert = convert - - msg = f"{msg_prefix} {msg_hint}" - warnings.warn(msg, DeprecationWarning, stacklevel=2) - self.runner_type = self._get_runner_type(architectures, self.runner) - self.convert_type = self._get_convert_type(architectures, self.runner_type, self.convert) + self.convert_type = self._get_convert_type( + architectures, self.runner_type, self.convert + ) if self.runner_type == "generate" and not is_generative_model: generate_converts = _RUNNER_CONVERTS["generate"] @@ -325,9 +220,12 @@ def _task_to_convert(task: TaskOption) -> ConvertType: if getattr(self.pooler_config, k) is None: setattr(self.pooler_config, k, v) - default_pooling_type = self._model_info.default_pooling_type - if self.pooler_config.pooling_type is None: - self.pooler_config.pooling_type = default_pooling_type + default_seq_pooling_type = self._model_info.default_seq_pooling_type + if self.pooler_config.seq_pooling_type is None: + self.pooler_config.seq_pooling_type = default_seq_pooling_type + default_tok_pooling_type = self._model_info.default_tok_pooling_type + if self.pooler_config.tok_pooling_type is None: + self.pooler_config.tok_pooling_type = default_tok_pooling_type self.dtype: torch.dtype = _get_and_verify_dtype( self.model, @@ -339,9 +237,17 @@ def _task_to_convert(task: TaskOption) -> ConvertType: self.original_max_model_len = self.max_model_len self.max_model_len = self.get_and_verify_max_len(self.max_model_len) + + if self.is_encoder_decoder: + self.mm_processor_cache_gb = 0 + logger.info("Encoder-decoder model detected, disabling mm processor cache.") + # Init multimodal config if needed if self._model_info.supports_multimodal: - if mm_encoder_tp_mode == "data" and not self._model_info.supports_multimodal_encoder_tp_data: + if ( + mm_encoder_tp_mode == "data" + and not self._model_info.supports_multimodal_encoder_tp_data + ): logger.warning_once( "This model does not support `--mm-encoder-tp-mode data`. 
" "Falling back to `--mm-encoder-tp-mode weights`." @@ -363,7 +269,9 @@ def _task_to_convert(task: TaskOption) -> ConvertType: video_pruning_rate=video_pruning_rate, ) - mm_config_kwargs = {k: v for k, v in mm_config_kwargs.items() if v is not None} + mm_config_kwargs = { + k: v for k, v in mm_config_kwargs.items() if v is not None + } self.multimodal_config = MultiModalConfig(**mm_config_kwargs) @@ -382,7 +290,7 @@ def _task_to_convert(task: TaskOption) -> ConvertType: # Avoid running try_verify_and_update_config multiple times self.config_updated = False - + self._try_verify_and_update_model_config() self._verify_quantization() self._verify_cuda_graph() self._verify_bnb_config() From 7f704d5d0bb72d47ccf7085338d9a365de90e3a9 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Tue, 13 Jan 2026 07:23:00 +0000 Subject: [PATCH 11/59] update worker Signed-off-by: tzhouam --- vllm_omni/worker/gpu_ar_worker.py | 60 +++++++++++----------- vllm_omni/worker/gpu_generation_worker.py | 61 +++++++++++------------ 2 files changed, 57 insertions(+), 64 deletions(-) diff --git a/vllm_omni/worker/gpu_ar_worker.py b/vllm_omni/worker/gpu_ar_worker.py index 9e058addb6e..599dea31f2f 100644 --- a/vllm_omni/worker/gpu_ar_worker.py +++ b/vllm_omni/worker/gpu_ar_worker.py @@ -3,19 +3,21 @@ import torch from vllm.logger import init_logger -from vllm.model_executor import set_random_seed +from vllm.utils.torch_utils import set_random_seed from vllm.platforms import current_platform -from vllm.utils.mem_constants import GiB_bytes -from vllm.utils.mem_utils import MemorySnapshot +from vllm.utils.mem_utils import MemorySnapshot, format_gib from vllm.v1.utils import report_usage_stats from vllm.v1.worker.gpu_worker import Worker as GPUWorker from vllm.v1.worker.gpu_worker import init_worker_distributed_environment from vllm_omni.worker.gpu_ar_model_runner import GPUARModelRunner - +from vllm.v1.worker.workspace import init_workspace_manager +from vllm.v1.worker.utils import request_memory +from 
vllm.logger import init_logger logger = init_logger(__name__) + class GPUARWorker(GPUWorker): """GPU worker for autoregressive omni model stages. @@ -24,24 +26,24 @@ class GPUARWorker(GPUWorker): """ def init_device(self): - device = self.device_config.device - if isinstance(device, torch.device) and device.type == "cuda": + if self.device_config.device_type == "cuda": # This env var set by Ray causes exceptions with graph building. os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) + parallel_config = self.parallel_config if ( - self.parallel_config.data_parallel_size > 1 - and self.parallel_config.data_parallel_size_local > 0 - and self.parallel_config.distributed_executor_backend not in ["ray", "external_launcher"] - and self.vllm_config.parallel_config.data_parallel_backend != "ray" - and self.vllm_config.parallel_config.nnodes_within_dp == 1 + parallel_config.distributed_executor_backend + not in ("ray", "external_launcher") + and parallel_config.data_parallel_backend != "ray" + and parallel_config.nnodes_within_dp == 1 ): # Use local DP rank if available, otherwise use global DP rank. dp_local_rank = self.parallel_config.data_parallel_rank_local if dp_local_rank is None: - dp_local_rank = self.parallel_config.data_parallel_rank + dp_local_rank = self.parallel_config.data_parallel_index tp_pp_world_size = ( - self.parallel_config.pipeline_parallel_size * self.parallel_config.tensor_parallel_size + self.parallel_config.pipeline_parallel_size + * self.parallel_config.tensor_parallel_size ) # DP_LOCAL_RANK * TP_PP_WORLD_SIZE + TP_LOCAL_RANK @@ -49,7 +51,9 @@ def init_device(self): assert self.local_rank < torch.cuda.device_count(), ( f"DP adjusted local rank {self.local_rank} is out of bounds. 
" ) - visible_device_count = torch.cuda.device_count() if torch.cuda.is_available() else 0 + visible_device_count = ( + torch.cuda.device_count() if torch.cuda.is_available() else 0 + ) assert self.parallel_config.local_world_size <= visible_device_count, ( f"local_world_size ({self.parallel_config.local_world_size}) must " f"be less than or equal to the number of visible devices " @@ -80,28 +84,22 @@ def init_device(self): torch.cuda.empty_cache() # take current memory snapshot - self.init_snapshot = MemorySnapshot() - self.requested_memory = self.init_snapshot.total_memory * self.cache_config.gpu_memory_utilization - if self.init_snapshot.free_memory < self.requested_memory: - - def gib(bytes_val: float) -> float: - return round(bytes_val / GiB_bytes, 2) - - raise ValueError( - f"Free memory on device " - f"({gib(self.init_snapshot.free_memory)}/" - f"{gib(self.init_snapshot.total_memory)} GiB) on startup " - f"is less than desired GPU memory utilization " - f"({self.cache_config.gpu_memory_utilization}, " - f"{gib(self.requested_memory)} GiB). Decrease GPU memory " - f"utilization or reduce GPU memory used by other processes." - ) + self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device) + self.requested_memory = request_memory(init_snapshot, self.cache_config) + logger.debug("worker init memory snapshot: %r", self.init_snapshot) + logger.debug( + "worker requested memory: %sGiB", format_gib(self.requested_memory) + ) else: raise RuntimeError(f"Not support device type: {self.device_config.device}") + # Initialize workspace manager + num_ubatches = 2 if self.vllm_config.parallel_config.enable_dbo else 1 + init_workspace_manager(self.device, num_ubatches) + # Construct the model runner self.model_runner = GPUARModelRunner(self.vllm_config, self.device) if self.rank == 0: # If usage stat is enabled, collect relevant info. 
- report_usage_stats(self.vllm_config) + report_usage_stats(self.vllm_config) \ No newline at end of file diff --git a/vllm_omni/worker/gpu_generation_worker.py b/vllm_omni/worker/gpu_generation_worker.py index 27111f39408..6a1a3039211 100644 --- a/vllm_omni/worker/gpu_generation_worker.py +++ b/vllm_omni/worker/gpu_generation_worker.py @@ -2,17 +2,17 @@ import os import torch -from vllm.model_executor import set_random_seed +from vllm.utils.torch_utils import set_random_seed from vllm.platforms import current_platform -from vllm.utils.mem_constants import GiB_bytes -from vllm.utils.mem_utils import MemorySnapshot +from vllm.utils.mem_utils import MemorySnapshot, format_gib from vllm.v1.utils import report_usage_stats from vllm.v1.worker.gpu_worker import Worker as GPUWorker from vllm.v1.worker.gpu_worker import init_worker_distributed_environment - +from vllm.v1.worker.workspace import init_workspace_manager +from vllm.v1.worker.utils import request_memory from vllm_omni.worker.gpu_generation_model_runner import GPUGenerationModelRunner - - +from vllm.logger import init_logger +logger = init_logger(__name__) class GPUGenerationWorker(GPUWorker): """GPU Worker for Generation model (non-autoregressive waveform generation). @@ -21,24 +21,24 @@ class GPUGenerationWorker(GPUWorker): """ def init_device(self): - device = self.device_config.device - if isinstance(device, torch.device) and device.type == "cuda": + if self.device_config.device_type == "cuda": # This env var set by Ray causes exceptions with graph building. 
os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) + parallel_config = self.parallel_config if ( - self.parallel_config.data_parallel_size > 1 - and self.parallel_config.data_parallel_size_local > 0 - and self.parallel_config.distributed_executor_backend not in ["ray", "external_launcher"] - and self.vllm_config.parallel_config.data_parallel_backend != "ray" - and self.vllm_config.parallel_config.nnodes_within_dp == 1 + parallel_config.distributed_executor_backend + not in ("ray", "external_launcher") + and parallel_config.data_parallel_backend != "ray" + and parallel_config.nnodes_within_dp == 1 ): # Use local DP rank if available, otherwise use global DP rank. dp_local_rank = self.parallel_config.data_parallel_rank_local if dp_local_rank is None: - dp_local_rank = self.parallel_config.data_parallel_rank + dp_local_rank = self.parallel_config.data_parallel_index tp_pp_world_size = ( - self.parallel_config.pipeline_parallel_size * self.parallel_config.tensor_parallel_size + self.parallel_config.pipeline_parallel_size + * self.parallel_config.tensor_parallel_size ) # DP_LOCAL_RANK * TP_PP_WORLD_SIZE + TP_LOCAL_RANK @@ -46,7 +46,9 @@ def init_device(self): assert self.local_rank < torch.cuda.device_count(), ( f"DP adjusted local rank {self.local_rank} is out of bounds. 
" ) - visible_device_count = torch.cuda.device_count() if torch.cuda.is_available() else 0 + visible_device_count = ( + torch.cuda.device_count() if torch.cuda.is_available() else 0 + ) assert self.parallel_config.local_world_size <= visible_device_count, ( f"local_world_size ({self.parallel_config.local_world_size}) must " f"be less than or equal to the number of visible devices " @@ -77,26 +79,19 @@ def init_device(self): torch.cuda.empty_cache() # take current memory snapshot - self.init_snapshot = MemorySnapshot() - self.requested_memory = self.init_snapshot.total_memory * self.cache_config.gpu_memory_utilization - if self.init_snapshot.free_memory < self.requested_memory: - - def gib(bytes_val: float) -> float: - return round(bytes_val / GiB_bytes, 2) - - raise ValueError( - f"Free memory on device " - f"({gib(self.init_snapshot.free_memory)}/" - f"{gib(self.init_snapshot.total_memory)} GiB) on startup " - f"is less than desired GPU memory utilization " - f"({self.cache_config.gpu_memory_utilization}, " - f"{gib(self.requested_memory)} GiB). Decrease GPU memory " - f"utilization or reduce GPU memory used by other processes." 
- ) + self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device) + self.requested_memory = request_memory(init_snapshot, self.cache_config) + logger.debug("worker init memory snapshot: %r", self.init_snapshot) + logger.debug( + "worker requested memory: %sGiB", format_gib(self.requested_memory) + ) else: raise RuntimeError(f"Not support device type: {self.device_config.device}") - # Construct the model runner + # Initialize workspace manager + num_ubatches = 2 if self.vllm_config.parallel_config.enable_dbo else 1 + init_workspace_manager(self.device, num_ubatches) + self.model_runner = GPUGenerationModelRunner(self.vllm_config, self.device) if self.rank == 0: From 4afb2ffa5878bb844455b54050b8cdb0f6c40604 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Tue, 13 Jan 2026 07:24:01 +0000 Subject: [PATCH 12/59] update one import in AsyncOmniLLM (not finish all, but can run) Signed-off-by: tzhouam --- vllm_omni/entrypoints/async_omni_llm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_omni/entrypoints/async_omni_llm.py b/vllm_omni/entrypoints/async_omni_llm.py index 567af03770f..287f12b9ed7 100644 --- a/vllm_omni/entrypoints/async_omni_llm.py +++ b/vllm_omni/entrypoints/async_omni_llm.py @@ -10,7 +10,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.tokenizers import init_tokenizer_from_config +from vllm.tokenizers import cached_tokenizer_from_config from vllm.tracing import init_tracer from vllm.transformers_utils.config import maybe_register_config_serialize_by_value from vllm.usage.usage_lib import UsageContext @@ -111,7 +111,7 @@ def __init__( tokenizer = None else: # Tokenizer (+ ensure liveness if running in another process). 
- tokenizer = init_tokenizer_from_config(model_config=vllm_config.model_config) + tokenizer = cached_tokenizer_from_config(model_config=vllm_config.model_config) # InputProcessor (converts Inputs --> EngineCoreRequests). self.input_processor = OmniInputProcessor( From cb2e053baccdf14d26c623b85ee0c03c59a78177 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Tue, 13 Jan 2026 07:25:31 +0000 Subject: [PATCH 13/59] update Qwen3 Omni ViT init based on updated interface (the update for Qwen3 Omni Thinker is not finished) Signed-off-by: tzhouam --- .../model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py index 361a9349b25..7f3320a82eb 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py @@ -684,13 +684,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.audio_tower = Qwen3OmniMoeAudioEncoder(thinker_config.audio_config) - attn_backend_override = multimodal_config.mm_encoder_attn_backend if multimodal_config is not None else None self.visual = Qwen3Omni_VisionTransformer( vision_config=thinker_config.vision_config, norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6), quant_config=quant_config, prefix=maybe_prefix(prefix, "visual"), - attn_backend_override=attn_backend_override, + multimodal_config=multimodal_config, ) self.quant_config = quant_config From e052c4a21ee67a1f3b834875b126718a8fb24eb0 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Tue, 13 Jan 2026 07:32:42 +0000 Subject: [PATCH 14/59] Remove unnecessary override for OmniRequestState (the update for OmniRequestState is not finished) Signed-off-by: tzhouam --- vllm_omni/engine/output_processor.py | 59 ---------------------------- 1 file changed, 59 deletions(-) diff 
--git a/vllm_omni/engine/output_processor.py b/vllm_omni/engine/output_processor.py index 6f22c6165b2..714b8dfcc53 100644 --- a/vllm_omni/engine/output_processor.py +++ b/vllm_omni/engine/output_processor.py @@ -37,65 +37,6 @@ def __init__( self.mm_type: str | None = None self.mm_accumulated: Dict[str, Any] | None = None - @classmethod - def from_new_request( - cls, - tokenizer: TokenizerLike, - request: EngineCoreRequest, - prompt: str | None, - parent_req: ParentRequest | None, - request_index: int, - queue: Any | None, - log_stats: bool, - stream_interval: int, - ) -> "OmniRequestState": - if sampling_params := request.sampling_params: - if not sampling_params.detokenize: - tokenizer = None - output_kind = sampling_params.output_kind - logprobs_processor = LogprobsProcessor.from_new_request( - tokenizer=tokenizer, - request=request, - ) - detokenizer = IncrementalDetokenizer.from_new_request( - tokenizer=tokenizer, - request=request, - ) - max_tokens_param = sampling_params.max_tokens - top_p = sampling_params.top_p - n = sampling_params.n - temperature = sampling_params.temperature - else: - logprobs_processor = None - detokenizer = None - max_tokens_param = None - top_p = None - n = None - temperature = None - assert request.pooling_params is not None - output_kind = request.pooling_params.output_kind - - return cls( - request_id=request.request_id, - parent_req=parent_req, - request_index=request_index, - lora_name=(request.lora_request.name if request.lora_request is not None else None), - output_kind=output_kind, - prompt=prompt, - prompt_token_ids=request.prompt_token_ids, - prompt_embeds=request.prompt_embeds, - logprobs_processor=logprobs_processor, - detokenizer=detokenizer, - max_tokens_param=max_tokens_param, - top_p=top_p, - n=n, - temperature=temperature, - arrival_time=request.arrival_time, - queue=queue, - log_stats=log_stats, - stream_interval=stream_interval, - ) - def add_multimodal_tensor(self, payload: Any | None, mm_type: str | None) -> 
None: if payload is None: return From c08dcdd930c58a765068d8b5c01c5568022f1595 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Tue, 13 Jan 2026 07:33:18 +0000 Subject: [PATCH 15/59] update model runner dummy run Signed-off-by: tzhouam --- .../worker/gpu_generation_model_runner.py | 126 ++++++++++----- vllm_omni/worker/gpu_model_runner.py | 144 +++++++++++------- 2 files changed, 182 insertions(+), 88 deletions(-) diff --git a/vllm_omni/worker/gpu_generation_model_runner.py b/vllm_omni/worker/gpu_generation_model_runner.py index 17740d85805..57f3985e790 100644 --- a/vllm_omni/worker/gpu_generation_model_runner.py +++ b/vllm_omni/worker/gpu_generation_model_runner.py @@ -25,8 +25,9 @@ get_pp_group, set_forward_context, ) +from vllm.model_executor.models.interfaces import supports_mm_encoder_only from vllm.v1.worker.utils import sanity_check_mm_encoder_outputs - +from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices from vllm_omni.outputs import OmniModelRunnerOutput from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner @@ -70,6 +71,7 @@ def execute_model( batch_desc, ubatch_slices, num_tokens_across_dp, + _ ) = self._determine_batch_execution_and_padding( num_tokens=num_tokens_unpadded, num_reqs=num_reqs, @@ -263,7 +265,15 @@ def _dummy_run( remove_lora: If False, dummy LoRAs are not destroyed after the run activate_lora: If False, dummy_run is performed without LoRAs. """ - assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes() + if supports_mm_encoder_only(self.model): + # The current dummy run only covers LM execution, so we can skip it. + # mm encoder dummy run may need to add in the future. + return torch.tensor([]), torch.tensor([]) + + assert ( + cudagraph_runtime_mode is None + or cudagraph_runtime_mode.valid_runtime_modes() + ) # If cudagraph_mode.decode_mode() == FULL and # cudagraph_mode.separate_routine(). 
This means that we are using @@ -316,23 +326,26 @@ def _dummy_run( num_sampled_tokens = np.ones(num_reqs, dtype=np.int32) - _cudagraph_mode, batch_desc, ubatch_slices, num_tokens_across_dp = self._determine_batch_execution_and_padding( - num_tokens=num_tokens_unpadded, - num_reqs=num_reqs, - num_scheduled_tokens_np=num_scheduled_tokens, - max_num_scheduled_tokens=max_query_len, - use_cascade_attn=False, - allow_microbatching=allow_microbatching, - force_eager=is_profile or (cudagraph_runtime_mode == CUDAGraphMode.NONE), - # `force_uniform_decode` is used for cudagraph capture; because for - # capturing mixed prefill-decode batches, we sometimes use - # num_tokens == num_reqs which looks like a uniform decode batch to the - # dispatcher; but we actually want to capture a piecewise cudagraph - force_uniform_decode=uniform_decode, - # `force_has_lora` is used for cudagraph capture; because LoRA is - # activated later in the context manager, but we need to know the - # LoRA state when determining the batch descriptor for capture - force_has_lora=activate_lora, + _cudagraph_mode, batch_desc, should_ubatch, num_tokens_across_dp, _ = ( + self._determine_batch_execution_and_padding( + num_tokens=num_tokens_unpadded, + num_reqs=num_reqs, + num_scheduled_tokens_np=num_scheduled_tokens, + max_num_scheduled_tokens=max_query_len, + use_cascade_attn=False, + allow_microbatching=allow_microbatching, + force_eager=is_profile + or (cudagraph_runtime_mode == CUDAGraphMode.NONE), + # `force_uniform_decode` is used for cudagraph capture; because for + # capturing mixed prefill-decode batches, we sometimes use + # num_tokens == num_reqs which looks like a uniform decode batch to the + # dispatcher; but we actually want to capture a piecewise cudagraph + force_uniform_decode=uniform_decode, + # `force_has_lora` is used for cudagraph capture; because LoRA is + # activated later in the context manager, but we need to know the + # LoRA state when determining the batch descriptor for 
capture + force_has_lora=activate_lora, + ) ) if cudagraph_runtime_mode is None: @@ -344,7 +357,21 @@ def _dummy_run( ) num_tokens_padded = batch_desc.num_tokens - num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs + num_reqs_padded = ( + batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs + ) + ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices( + should_ubatch, + num_scheduled_tokens, + num_tokens_padded, + num_reqs_padded, + self.vllm_config.parallel_config.num_ubatches, + ) + logger.debug( + "ubatch_slices: %s, ubatch_slices_padded: %s", + ubatch_slices, + ubatch_slices_padded, + ) attn_metadata: PerLayerAttnMetadata | None = None @@ -366,11 +393,12 @@ def _dummy_run( self.query_start_loc.np[1 : num_reqs + 1] = cum_num_tokens self.query_start_loc.copy_to_gpu() + pad_attn = cudagraph_runtime_mode == CUDAGraphMode.FULL attn_metadata, _ = self._build_attention_metadata( num_tokens=num_tokens_unpadded, num_reqs=num_reqs_padded, max_query_len=max_query_len, - ubatch_slices=ubatch_slices, + ubatch_slices=ubatch_slices_padded if pad_attn else ubatch_slices, for_cudagraph_capture=is_graph_capturing, ) @@ -383,10 +411,10 @@ def _dummy_run( ): # Make sure padding doesn't exceed max_num_tokens assert num_tokens_padded <= self.max_num_tokens - model_kwargs = self._init_model_kwargs(num_tokens_padded) + model_kwargs = self._init_model_kwargs() if self.supports_mm_inputs and not self.model_config.is_encoder_decoder: - input_ids = None - inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded] + input_ids, inputs_embeds = self._prepare_mm_inputs(num_tokens_padded) + model_kwargs = { **model_kwargs, **self._dummy_mm_kwargs(num_reqs), @@ -394,7 +422,7 @@ def _dummy_run( elif self.enable_prompt_embeds: input_ids = None inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded] - model_kwargs = self._init_model_kwargs(num_tokens_padded) + model_kwargs = self._init_model_kwargs() else: input_ids = 
self.input_ids.gpu[:num_tokens_padded] inputs_embeds = None @@ -410,24 +438,28 @@ def _dummy_run( intermediate_tensors = None else: if self.intermediate_tensors is None: - self.intermediate_tensors = self.model.make_empty_intermediate_tensors( - batch_size=self.max_num_tokens, - dtype=self.model_config.dtype, - device=self.device, + self.intermediate_tensors = ( + self.model.make_empty_intermediate_tensors( + batch_size=self.max_num_tokens, + dtype=self.model_config.dtype, + device=self.device, + ) ) - intermediate_tensors = self.sync_and_slice_intermediate_tensors(num_tokens_padded, None, False) + intermediate_tensors = self.sync_and_slice_intermediate_tensors( + num_tokens_padded, None, False + ) - if ubatch_slices is not None: + if ubatch_slices_padded is not None: # Adjust values to reflect a single ubatch. # TODO(sage,lucas): this is cruft that should be addressed in # the padding refactor. - num_tokens_padded = ubatch_slices[0].num_tokens + num_tokens_padded = ubatch_slices_padded[0].num_tokens if num_tokens_across_dp is not None: num_tokens_across_dp[:] = num_tokens_padded with ( - self.maybe_randomize_inputs(input_ids), + self.maybe_randomize_inputs(input_ids, inputs_embeds), set_forward_context( attn_metadata, self.vllm_config, @@ -435,7 +467,7 @@ def _dummy_run( num_tokens_across_dp=num_tokens_across_dp, cudagraph_runtime_mode=cudagraph_runtime_mode, batch_descriptor=batch_desc, - ubatch_slices=ubatch_slices, + ubatch_slices=ubatch_slices_padded, ), ): outputs = self.model( @@ -453,10 +485,19 @@ def _dummy_run( hidden_states, multimodal_outputs = self.extract_multimodal_outputs(hidden_states) if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) + # Eagle currently only supports PIECEWISE cudagraphs. + # Therefore only use cudagraphs if the main model uses PIECEWISE + # NOTE(lucas): this is a hack, need to clean up. 
use_cudagraphs = ( - cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE) - and not self.speculative_config.enforce_eager - ) + ( + is_graph_capturing + and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + ) + or ( + not is_graph_capturing + and cudagraph_runtime_mode != CUDAGraphMode.NONE + ) + ) and not self.speculative_config.enforce_eager # Note(gnovack) - We need to disable cudagraphs for one of the two # lora cases when cudagraph_specialize_lora is enabled. This is a @@ -471,6 +512,17 @@ def _dummy_run( is_graph_capturing=is_graph_capturing, ) + # We register layerwise NVTX hooks here after the first dynamo tracing is + # done to avoid nvtx operations in hook functions being traced by + # torch dynamo and causing graph breaks. + # Note that for DYNAMO_ONCE and VLLM_COMPILE mode, + # compiled model's dynamo tracing is only done once and the compiled model's + # __call__ function is replaced by calling the compiled function. + # So it's safe to register hooks here. Hooks will be registered to + # both compiled and uncompiled models but they will never + # be called on the compiled model execution path. + self._register_layerwise_nvtx_hooks() + # This is necessary to avoid blocking DP. # For dummy runs, we typically skip EPLB since we don't have any real # requests to process. 
diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 69729d95429..4a9ea2e930f 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -8,7 +8,7 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.models.interfaces import supports_mrope +from vllm.model_executor.models.interfaces import supports_mrope, supports_mm_encoder_only from vllm.model_executor.models.interfaces_base import VllmModelForPooling from vllm.sampling_params import SamplingType from vllm.utils.import_utils import LazyLoader @@ -16,7 +16,7 @@ from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.worker.gpu_input_batch import CachedRequestState from vllm.v1.worker.gpu_model_runner import GPUModelRunner, IntermediateTensors, PerLayerAttnMetadata - +from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices from vllm_omni.model_executor.models.output_templates import OmniOutput if TYPE_CHECKING: @@ -388,7 +388,15 @@ def _dummy_run( remove_lora: If False, dummy LoRAs are not destroyed after the run activate_lora: If False, dummy_run is performed without LoRAs. """ - assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes() + if supports_mm_encoder_only(self.model): + # The current dummy run only covers LM execution, so we can skip it. + # mm encoder dummy run may need to add in the future. + return torch.tensor([]), torch.tensor([]) + + assert ( + cudagraph_runtime_mode is None + or cudagraph_runtime_mode.valid_runtime_modes() + ) # If cudagraph_mode.decode_mode() == FULL and # cudagraph_mode.separate_routine(). 
This means that we are using @@ -441,23 +449,26 @@ def _dummy_run( num_sampled_tokens = np.ones(num_reqs, dtype=np.int32) - _cudagraph_mode, batch_desc, ubatch_slices, num_tokens_across_dp = self._determine_batch_execution_and_padding( - num_tokens=num_tokens_unpadded, - num_reqs=num_reqs, - num_scheduled_tokens_np=num_scheduled_tokens, - max_num_scheduled_tokens=max_query_len, - use_cascade_attn=False, - allow_microbatching=allow_microbatching, - force_eager=is_profile or (cudagraph_runtime_mode == CUDAGraphMode.NONE), - # `force_uniform_decode` is used for cudagraph capture; because for - # capturing mixed prefill-decode batches, we sometimes use - # num_tokens == num_reqs which looks like a uniform decode batch to the - # dispatcher; but we actually want to capture a piecewise cudagraph - force_uniform_decode=uniform_decode, - # `force_has_lora` is used for cudagraph capture; because LoRA is - # activated later in the context manager, but we need to know the - # LoRA state when determining the batch descriptor for capture - force_has_lora=activate_lora, + _cudagraph_mode, batch_desc, should_ubatch, num_tokens_across_dp, _ = ( + self._determine_batch_execution_and_padding( + num_tokens=num_tokens_unpadded, + num_reqs=num_reqs, + num_scheduled_tokens_np=num_scheduled_tokens, + max_num_scheduled_tokens=max_query_len, + use_cascade_attn=False, + allow_microbatching=allow_microbatching, + force_eager=is_profile + or (cudagraph_runtime_mode == CUDAGraphMode.NONE), + # `force_uniform_decode` is used for cudagraph capture; because for + # capturing mixed prefill-decode batches, we sometimes use + # num_tokens == num_reqs which looks like a uniform decode batch to the + # dispatcher; but we actually want to capture a piecewise cudagraph + force_uniform_decode=uniform_decode, + # `force_has_lora` is used for cudagraph capture; because LoRA is + # activated later in the context manager, but we need to know the + # LoRA state when determining the batch descriptor for 
capture + force_has_lora=activate_lora, + ) ) if cudagraph_runtime_mode is None: @@ -469,7 +480,21 @@ def _dummy_run( ) num_tokens_padded = batch_desc.num_tokens - num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs + num_reqs_padded = ( + batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs + ) + ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices( + should_ubatch, + num_scheduled_tokens, + num_tokens_padded, + num_reqs_padded, + self.vllm_config.parallel_config.num_ubatches, + ) + logger.debug( + "ubatch_slices: %s, ubatch_slices_padded: %s", + ubatch_slices, + ubatch_slices_padded, + ) attn_metadata: PerLayerAttnMetadata | None = None @@ -491,11 +516,12 @@ def _dummy_run( self.query_start_loc.np[1 : num_reqs + 1] = cum_num_tokens self.query_start_loc.copy_to_gpu() + pad_attn = cudagraph_runtime_mode == CUDAGraphMode.FULL attn_metadata, _ = self._build_attention_metadata( num_tokens=num_tokens_unpadded, num_reqs=num_reqs_padded, max_query_len=max_query_len, - ubatch_slices=ubatch_slices, + ubatch_slices=ubatch_slices_padded if pad_attn else ubatch_slices, for_cudagraph_capture=is_graph_capturing, ) @@ -508,10 +534,10 @@ def _dummy_run( ): # Make sure padding doesn't exceed max_num_tokens assert num_tokens_padded <= self.max_num_tokens - model_kwargs = self._init_model_kwargs(num_tokens_padded) + model_kwargs = self._init_model_kwargs() if self.supports_mm_inputs and not self.model_config.is_encoder_decoder: - input_ids = None - inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded] + input_ids, inputs_embeds = self._prepare_mm_inputs(num_tokens_padded) + model_kwargs = { **model_kwargs, **self._dummy_mm_kwargs(num_reqs), @@ -519,7 +545,7 @@ def _dummy_run( elif self.enable_prompt_embeds: input_ids = None inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded] - model_kwargs = self._init_model_kwargs(num_tokens_padded) + model_kwargs = self._init_model_kwargs() else: input_ids = 
self.input_ids.gpu[:num_tokens_padded] inputs_embeds = None @@ -535,24 +561,28 @@ def _dummy_run( intermediate_tensors = None else: if self.intermediate_tensors is None: - self.intermediate_tensors = self.model.make_empty_intermediate_tensors( - batch_size=self.max_num_tokens, - dtype=self.model_config.dtype, - device=self.device, + self.intermediate_tensors = ( + self.model.make_empty_intermediate_tensors( + batch_size=self.max_num_tokens, + dtype=self.model_config.dtype, + device=self.device, + ) ) - intermediate_tensors = self.sync_and_slice_intermediate_tensors(num_tokens_padded, None, False) + intermediate_tensors = self.sync_and_slice_intermediate_tensors( + num_tokens_padded, None, False + ) - if ubatch_slices is not None: + if ubatch_slices_padded is not None: # Adjust values to reflect a single ubatch. # TODO(sage,lucas): this is cruft that should be addressed in # the padding refactor. - num_tokens_padded = ubatch_slices[0].num_tokens + num_tokens_padded = ubatch_slices_padded[0].num_tokens if num_tokens_across_dp is not None: num_tokens_across_dp[:] = num_tokens_padded with ( - self.maybe_randomize_inputs(input_ids), + self.maybe_randomize_inputs(input_ids, inputs_embeds), set_forward_context( attn_metadata, self.vllm_config, @@ -560,20 +590,9 @@ def _dummy_run( num_tokens_across_dp=num_tokens_across_dp, cudagraph_runtime_mode=cudagraph_runtime_mode, batch_descriptor=batch_desc, - ubatch_slices=ubatch_slices, + ubatch_slices=ubatch_slices_padded, ), ): - if ( - getattr(self.model, "talker", None) is not None - and hasattr(self.model, "talker_mtp") - and num_tokens_padded == 1 - ): - outputs = self.talker_mtp( - self.talker_mtp_input_ids.gpu[:num_tokens_padded], - self.talker_mtp_inputs_embeds.gpu[:num_tokens_padded], - self.last_talker_hidden.gpu[:num_tokens_padded], - self.text_step.gpu[:num_tokens_padded], - ) outputs = self.model( input_ids=input_ids, positions=positions, @@ -589,10 +608,19 @@ def _dummy_run( hidden_states, multimodal_outputs = 
self.extract_multimodal_outputs(hidden_states) if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) + # Eagle currently only supports PIECEWISE cudagraphs. + # Therefore only use cudagraphs if the main model uses PIECEWISE + # NOTE(lucas): this is a hack, need to clean up. use_cudagraphs = ( - cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE) - and not self.speculative_config.enforce_eager - ) + ( + is_graph_capturing + and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + ) + or ( + not is_graph_capturing + and cudagraph_runtime_mode != CUDAGraphMode.NONE + ) + ) and not self.speculative_config.enforce_eager # Note(gnovack) - We need to disable cudagraphs for one of the two # lora cases when cudagraph_specialize_lora is enabled. This is a @@ -607,6 +635,17 @@ def _dummy_run( is_graph_capturing=is_graph_capturing, ) + # We register layerwise NVTX hooks here after the first dynamo tracing is + # done to avoid nvtx operations in hook functions being traced by + # torch dynamo and causing graph breaks. + # Note that for DYNAMO_ONCE and VLLM_COMPILE mode, + # compiled model's dynamo tracing is only done once and the compiled model's + # __call__ function is replaced by calling the compiled function. + # So it's safe to register hooks here. Hooks will be registered to + # both compiled and uncompiled models but they will never + # be called on the compiled model execution path. + self._register_layerwise_nvtx_hooks() + # This is necessary to avoid blocking DP. # For dummy runs, we typically skip EPLB since we don't have any real # requests to process. 
@@ -618,7 +657,9 @@ def _dummy_run( self.eplb_step(is_dummy=True, is_profile=is_profile) logit_indices = np.cumsum(num_scheduled_tokens) - 1 - logit_indices_device = torch.from_numpy(logit_indices).to(self.device, non_blocking=True) + logit_indices_device = torch.from_numpy(logit_indices).to( + self.device, non_blocking=True + ) return hidden_states, hidden_states[logit_indices_device] def _decode_and_store_request_payloads(self, scheduler_output: "SchedulerOutput") -> None: @@ -897,9 +938,10 @@ def _preprocess( req_input_ids, req_embeds, update_dict = self.model.preprocess( input_ids=input_ids[s:e], input_embeds=inputs_embeds[s:e], **req_infos ) + #TODO: This is Model Specific Code, need to be generalized in the future ZTC # run talker mtp decode if hasattr(self.model, "talker_mtp"): - _cudagraph_mode, batch_desc, _, _ = self._determine_batch_execution_and_padding( + _cudagraph_mode, batch_desc, _, _, _ = self._determine_batch_execution_and_padding( num_tokens=span_len, num_reqs=1, num_scheduled_tokens_np=num_scheduled_tokens_np[req_index], From 9cdf592782950d7119bc68355a9aa92b54d76b6b Mon Sep 17 00:00:00 2001 From: JustQJ <37905360+JustQJ@users.noreply.github.com> Date: Tue, 13 Jan 2026 15:55:25 +0800 Subject: [PATCH 16/59] [Misc] Enable tensor_parallel_size argument with online serving cmd (#761) Signed-off-by: TangPeng <85704592@qq.com> --- vllm_omni/entrypoints/async_omni.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py index 1cd7f506319..99f872db7f2 100644 --- a/vllm_omni/entrypoints/async_omni.py +++ b/vllm_omni/entrypoints/async_omni.py @@ -131,15 +131,16 @@ def _create_default_diffusion_stage_cfg(self, kwargs: dict[str, Any]) -> dict[st ulysses_degree = kwargs.get("ulysses_degree") or 1 ring_degree = kwargs.get("ring_degree") or 1 sequence_parallel_size = kwargs.get("sequence_parallel_size") + tensor_parallel_size = kwargs.get("tensor_parallel_size") or 1 if 
sequence_parallel_size is None: sequence_parallel_size = ulysses_degree * ring_degree - num_devices = sequence_parallel_size + num_devices = sequence_parallel_size * tensor_parallel_size for i in range(1, num_devices): devices += f",{i}" parallel_config = DiffusionParallelConfig( pipeline_parallel_size=1, data_parallel_size=1, - tensor_parallel_size=1, + tensor_parallel_size=tensor_parallel_size, sequence_parallel_size=sequence_parallel_size, ulysses_degree=ulysses_degree, ring_degree=ring_degree, @@ -161,6 +162,7 @@ def _create_default_diffusion_stage_cfg(self, kwargs: dict[str, Any]) -> dict[st "cache_backend": cache_backend, "cache_config": cache_config, "enable_cpu_offload": kwargs.get("enable_cpu_offload", False), + "enforce_eager": kwargs.get("enforce_eager", False), }, "final_output": True, "final_output_type": "image", From 166fc788750e4bfd0116b952ef103b438a7e1c50 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Tue, 13 Jan 2026 08:43:04 +0000 Subject: [PATCH 17/59] update ar scheduler Signed-off-by: tzhouam --- vllm_omni/core/sched/omni_ar_scheduler.py | 61 ++++++++++++++++++++--- 1 file changed, 55 insertions(+), 6 deletions(-) diff --git a/vllm_omni/core/sched/omni_ar_scheduler.py b/vllm_omni/core/sched/omni_ar_scheduler.py index 7918e16878e..00bdb32aa4c 100644 --- a/vllm_omni/core/sched/omni_ar_scheduler.py +++ b/vllm_omni/core/sched/omni_ar_scheduler.py @@ -73,6 +73,11 @@ def update_from_output( pooler_outputs = model_runner_output.pooler_output num_nans_in_logits = model_runner_output.num_nans_in_logits kv_connector_output = model_runner_output.kv_connector_output + cudagraph_stats = model_runner_output.cudagraph_stats + + perf_stats: PerfStats | None = None + if self.perf_metrics and self.perf_metrics.is_enabled(): + perf_stats = self.perf_metrics.get_step_perf_stats_per_gpu(scheduler_output) outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list) spec_decoding_stats: SpecDecodingStats | None = None @@ -131,11 +136,14 @@ def update_from_output( 
spec_decoding_stats, num_draft_tokens=num_draft_tokens, num_accepted_tokens=num_accepted, + num_invalid_spec_tokens=scheduler_output.num_invalid_spec_tokens, + request_id=req_id, ) stopped = False new_logprobs = None new_token_ids = generated_token_ids + pooler_output = pooler_outputs[req_index] if pooler_outputs else None kv_transfer_params = None status_before_stop = request.status @@ -144,13 +152,34 @@ def update_from_output( new_token_ids, stopped = self._update_request_with_output(request, new_token_ids) # Stop checking for pooler models. - pooler_output = None - if pooler_outputs: - pooler_output = pooler_outputs[req_index] + if pooler_output: + # Note: As we occupied the pooler output, for multimodal outputs, we do not intermediate stop checking for pooler output if request.output_token_ids: stopped = check_stop(request, self.max_model_len, pooler_output) - + routed_experts = None if stopped: + if self.vllm_config.model_config.enable_return_routed_experts: + kv_blocks = self.kv_cache_manager.get_blocks(request.request_id) + block_ids = kv_blocks.get_block_ids()[0] + num_tokens = request.num_tokens - 1 + + # compute slot mapping + block_ids_array = np.array(block_ids, dtype=np.int32) + num_blocks = len(block_ids) + block_size = self.block_size + + # generate block offsets + block_offsets = np.arange(0, block_size) + + # compute slot mapping: slot = block_id * block_size + offset + slot_mapping = ( + block_offsets.reshape((1, block_size)) + + block_ids_array.reshape((num_blocks, 1)) * block_size + ).flatten()[:num_tokens] + + routed_experts = self.routed_experts_reader.get_routed_experts( + indices=slot_mapping + ) kv_transfer_params = self._free_request(request) if status_before_stop == RequestStatus.RUNNING: stopped_running_reqs.add(request) @@ -165,7 +194,13 @@ def update_from_output( struct_output_request = request.structured_output_request assert struct_output_request is not None assert struct_output_request.grammar is not None - 
struct_output_request.grammar.accept_tokens(req_id, new_token_ids) + ok = struct_output_request.grammar.accept_tokens(req_id, new_token_ids) + if not ok: + logger.warning( + "Unexpected: grammar rejected tokens %s for request %s.", + new_token_ids, + req_id, + ) if num_nans_in_logits is not None and req_id in num_nans_in_logits: request.num_nans_in_logits = num_nans_in_logits[req_id] @@ -200,7 +235,21 @@ def update_from_output( if stopped_preempted_reqs: # This is a rare case and unlikely to impact performance. self.waiting.remove_requests(stopped_preempted_reqs) - + + if failed_kv_load_req_ids and not self.recompute_kv_load_failures: + requests = [self.requests[req_id] for req_id in failed_kv_load_req_ids] + self.finish_requests(failed_kv_load_req_ids, RequestStatus.FINISHED_ERROR) + for request in requests: + outputs[request.client_index].append( + EngineCoreOutput( + request_id=request.request_id, + new_token_ids=[], + finish_reason=request.get_finished_reason(), + events=request.take_events(), + trace_headers=request.trace_headers, + num_cached_tokens=request.num_cached_tokens, + ) + ) # KV Connector: update state for finished KV Transfers. 
if kv_connector_output: self._update_from_kv_xfer_finished(kv_connector_output) From 8e20b33b389f0cfb1121974a079b1e25de29412a Mon Sep 17 00:00:00 2001 From: XU Mingshi <91017482+mxuax@users.noreply.github.com> Date: Tue, 13 Jan 2026 17:13:31 +0800 Subject: [PATCH 18/59] [Bugfix] Raise ValueError when joint_strategy='rear' and causal=True in Ring Attention (#767) Signed-off-by: XU Mingshi <91017482+mxuax@users.noreply.github.com> Signed-off-by: mxuax --- .../attention/backends/ring_flash_attn.py | 14 ++++++++++++++ .../attention/backends/ring_pytorch_attn.py | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/vllm_omni/diffusion/attention/backends/ring_flash_attn.py b/vllm_omni/diffusion/attention/backends/ring_flash_attn.py index e163ef1ace1..dd27f88a8c1 100644 --- a/vllm_omni/diffusion/attention/backends/ring_flash_attn.py +++ b/vllm_omni/diffusion/attention/backends/ring_flash_attn.py @@ -28,6 +28,20 @@ def ring_flash_attn_forward( joint_tensor_value=None, joint_strategy="front", ): + # Validate causal + joint_strategy combination + # When causal=True and joint_strategy="rear", the causal mask would incorrectly + # prevent local query tokens from attending to joint key tokens (which are + # concatenated at the end). This breaks the semantics where joint tokens + # (e.g., text conditioning) should be visible to all local tokens. + if causal and joint_tensor_key is not None and joint_strategy == "rear": + raise ValueError( + "joint_strategy='rear' is not compatible with causal=True in Ring Attention. " + "When using causal attention with joint tokens, use joint_strategy='front' " + "to ensure joint tokens act as a visible prefix for all local tokens. " + "With 'rear' strategy, the causal mask would incorrectly block local tokens " + "from seeing the joint tokens." 
+ ) + comm = RingComm(process_group) out = None diff --git a/vllm_omni/diffusion/attention/backends/ring_pytorch_attn.py b/vllm_omni/diffusion/attention/backends/ring_pytorch_attn.py index 43ee35f7098..9ed2c2076c4 100644 --- a/vllm_omni/diffusion/attention/backends/ring_pytorch_attn.py +++ b/vllm_omni/diffusion/attention/backends/ring_pytorch_attn.py @@ -62,6 +62,20 @@ def forward( joint_tensor_value=None, joint_strategy="front", ): + # Validate causal + joint_strategy combination + # When causal=True and joint_strategy="rear", the causal mask would incorrectly + # prevent local query tokens from attending to joint key tokens (which are + # concatenated at the end). This breaks the semantics where joint tokens + # (e.g., text conditioning) should be visible to all local tokens. + if is_causal and joint_tensor_key is not None and joint_strategy == "rear": + raise ValueError( + "joint_strategy='rear' is not compatible with causal=True in Ring Attention. " + "When using causal attention with joint tokens, use joint_strategy='front' " + "to ensure joint tokens act as a visible prefix for all local tokens. " + "With 'rear' strategy, the causal mask would incorrectly block local tokens " + "from seeing the joint tokens." 
+ ) + comm = RingComm(group) # Ensure tensors are contiguous for P2P communication q = q.contiguous() From 4db8f0b96d70a1e7c2517ba14e300f9b5d8e10ee Mon Sep 17 00:00:00 2001 From: tzhouam Date: Tue, 13 Jan 2026 09:29:48 +0000 Subject: [PATCH 19/59] update _preprocess, execute model and sample_tokens for AR Model Runner Signed-off-by: tzhouam --- vllm_omni/worker/gpu_ar_model_runner.py | 304 +++++++++++++++++------- vllm_omni/worker/gpu_model_runner.py | 9 +- 2 files changed, 219 insertions(+), 94 deletions(-) diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index 2d2b7ef8e2d..d4e7e195fe8 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -26,11 +26,18 @@ get_pp_group, get_tp_group, has_kv_transfer_group, + ) from vllm.v1.worker.utils import is_residual_scattered_for_sp - +from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( + RoutedExpertsCapturer, +) +from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer +from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group from vllm_omni.outputs import OmniModelRunnerOutput from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner +from vllm.v1.outputs import make_empty_encoder_model_runner_output +from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices logger = init_logger(__name__) @@ -44,6 +51,7 @@ class ExecuteModelState(NamedTuple): sample_hidden_states: torch.Tensor aux_hidden_states: list[torch.Tensor] | None ec_connector_output: Any + cudagraph_stats: Any multimodal_outputs: Any @@ -82,67 +90,148 @@ def execute_model( scheduler_output: SchedulerOutput, intermediate_tensors: IntermediateTensors | None = None, ) -> OmniModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors | None: - with record_function_or_nullcontext("Preprocess"): - with self.synchronize_input_prep(): - self._update_states(scheduler_output) - 
self._decode_and_store_request_payloads(scheduler_output) - - if not scheduler_output.total_num_scheduled_tokens: - if not has_kv_transfer_group(): - return EMPTY_MODEL_RUNNER_OUTPUT - return self.kv_connector_no_forward(scheduler_output, self.vllm_config) - if self.cache_config.kv_sharing_fast_prefill: - assert not self.input_batch.num_prompt_logprobs, ( - "--kv-sharing-fast-prefill produces incorrect " - "logprobs for prompt tokens, tokens, please disable " - "it when the requests need prompt logprobs" - ) + if self.execute_model_state is not None: + raise RuntimeError( + "State error: sample_tokens() must be called " + "after execute_model() returns None." + ) - num_reqs = self.input_batch.num_reqs - req_ids = self.input_batch.req_ids - tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids] - num_scheduled_tokens_np = np.array(tokens, dtype=np.int32) - max_num_scheduled_tokens = int(num_scheduled_tokens_np.max()) - num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens + if self.vllm_config.model_config.enable_return_routed_experts: + capturer = RoutedExpertsCapturer.get_instance() + if capturer is not None: + capturer.clear_buffer() # noqa + else: + logger.error("RoutedExpertsCapturer not initialized.") + + if scheduler_output.preempted_req_ids and has_kv_transfer_group(): + get_kv_transfer_group().handle_preemptions( + scheduler_output.preempted_req_ids + ) - logits_indices, spec_decode_metadata = self._prepare_inputs( + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + with ( + record_function_or_nullcontext("gpu_model_runner: preprocess"), + self.synchronize_input_prep(), + ): + # Update persistent batch states. 
+ self._update_states(scheduler_output) + + if has_ec_transfer() and get_ec_transfer().is_producer: + with self.maybe_get_ec_connector_output( scheduler_output, - num_scheduled_tokens_np, + encoder_cache=self.encoder_cache, + ) as ec_connector_output: + self._execute_mm_encoder(scheduler_output) + return make_empty_encoder_model_runner_output(scheduler_output) + + if not num_scheduled_tokens: + if ( + self.parallel_config.distributed_executor_backend + == "external_launcher" + and self.parallel_config.data_parallel_size > 1 + ): + # this is a corner case when both external launcher + # and DP are enabled, num_scheduled_tokens could be + # 0, and has_unfinished_requests in the outer loop + # returns True. before returning early here we call + # dummy run to ensure coordinate_batch_across_dp + # is called into to avoid out of sync issues. + self._dummy_run(1) + if not has_kv_transfer_group(): + # Return empty ModelRunnerOutput if no work to do. + return EMPTY_MODEL_RUNNER_OUTPUT + return self.kv_connector_no_forward(scheduler_output, self.vllm_config) + + if self.cache_config.kv_sharing_fast_prefill: + assert not self.num_prompt_logprobs, ( + "--kv-sharing-fast-prefill produces incorrect " + "logprobs for prompt tokens, tokens, please disable " + "it when the requests need prompt logprobs" ) - ( - cudagraph_mode, - batch_desc, - ubatch_slices, - num_tokens_across_dp, - ) = self._determine_batch_execution_and_padding( - num_tokens=num_tokens_unpadded, - num_reqs=num_reqs, - num_scheduled_tokens_np=num_scheduled_tokens_np, - max_num_scheduled_tokens=max_num_scheduled_tokens, - use_cascade_attn=False, + num_reqs = self.input_batch.num_reqs + req_ids = self.input_batch.req_ids + tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids] + num_scheduled_tokens_np = np.array(tokens, dtype=np.int32) + max_num_scheduled_tokens = int(num_scheduled_tokens_np.max()) + num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens + + logits_indices, 
spec_decode_metadata = self._prepare_inputs( + scheduler_output, + num_scheduled_tokens_np, + ) + + cascade_attn_prefix_lens = None + # Disable cascade attention when using microbatching (DBO) + if self.cascade_attn_enabled and not self.parallel_config.use_ubatching: + # Pre-compute cascade attention prefix lengths + cascade_attn_prefix_lens = self._compute_cascade_attn_prefix_lens( + num_scheduled_tokens_np, + self.input_batch.num_computed_tokens_cpu[:num_reqs], + scheduler_output.num_common_prefix_blocks, ) - num_tokens_padded = batch_desc.num_tokens - num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs - use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0 - pad_attn = cudagraph_mode == CUDAGraphMode.FULL + ( + cudagraph_mode, + batch_desc, + should_ubatch, + num_tokens_across_dp, + cudagraph_stats, + ) = self._determine_batch_execution_and_padding( + num_tokens=num_tokens_unpadded, + num_reqs=num_reqs, + num_scheduled_tokens_np=num_scheduled_tokens_np, + max_num_scheduled_tokens=max_num_scheduled_tokens, + use_cascade_attn=cascade_attn_prefix_lens is not None, + num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs), + ) - ( - attn_metadata, - spec_decode_common_attn_metadata, - ) = self._build_attention_metadata( + logger.debug( + "Running batch with cudagraph_mode: %s, batch_descriptor: %s, " + "should_ubatch: %s, num_tokens_across_dp: %s", + cudagraph_mode, + batch_desc, + should_ubatch, + num_tokens_across_dp, + ) + + num_tokens_padded = batch_desc.num_tokens + num_reqs_padded = ( + batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs + ) + ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices( + should_ubatch, + num_scheduled_tokens_np, + num_tokens_padded, + num_reqs_padded, + self.parallel_config.num_ubatches, + ) + + logger.debug( + "ubatch_slices: %s, ubatch_slices_padded: %s", + ubatch_slices, + ubatch_slices_padded, + ) + + pad_attn = cudagraph_mode == 
CUDAGraphMode.FULL + + use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0 + ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices + + attn_metadata, spec_decode_common_attn_metadata = ( + self._build_attention_metadata( num_tokens=num_tokens_unpadded, num_tokens_padded=num_tokens_padded if pad_attn else None, num_reqs=num_reqs, num_reqs_padded=num_reqs_padded if pad_attn else None, max_query_len=max_num_scheduled_tokens, - ubatch_slices=ubatch_slices, + ubatch_slices=ubatch_slices_attn, logits_indices=logits_indices, use_spec_decode=use_spec_decode, num_scheduled_tokens=scheduler_output.num_scheduled_tokens, - cascade_attn_prefix_lens=None, + cascade_attn_prefix_lens=cascade_attn_prefix_lens, ) + ) ( input_ids, @@ -152,15 +241,19 @@ def execute_model( model_kwargs, ec_connector_output, ) = self._preprocess( - scheduler_output, - num_tokens_padded, - intermediate_tensors, + scheduler_output, num_tokens_padded, intermediate_tensors ) + # Set cudagraph mode to none if calc_kv_scales is true. + # KV scales calculation involves dynamic operations that are incompatible + # with CUDA graph capture. if self.calculate_kv_scales: cudagraph_mode = CUDAGraphMode.NONE + # Mark KV scales as calculated after the first forward pass self.calculate_kv_scales = False + # Run the model. + # Use persistent buffers for CUDA graphs. 
with ( set_forward_context( attn_metadata, @@ -169,9 +262,9 @@ def execute_model( num_tokens_across_dp=num_tokens_across_dp, cudagraph_runtime_mode=cudagraph_mode, batch_descriptor=batch_desc, - ubatch_slices=ubatch_slices, + ubatch_slices=ubatch_slices_padded, ), - record_function_or_nullcontext("Forward"), + record_function_or_nullcontext("gpu_model_runner: forward"), self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output, ): model_output = self._model_forward( @@ -208,30 +301,37 @@ def execute_model( logger.debug("[AR] execute_model: multimodal_outputs is None") if not self.broadcast_pp_output: + # Common case. if not get_pp_group().is_last_rank: + # Return the intermediate tensors. assert isinstance(hidden_states, IntermediateTensors) hidden_states.kv_connector_output = kv_connector_output + self.kv_connector_output = kv_connector_output return hidden_states if self.is_pooling_model: - output = self._pool( + # Return the pooling output. + return self._pool( hidden_states, - num_tokens_padded, + num_scheduled_tokens, num_scheduled_tokens_np, + kv_connector_output, ) - output.kv_connector_output = kv_connector_output - return output sample_hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits( sample_hidden_states, sampling_metadata=self.input_batch.sampling_metadata ) else: + # Rare case. 
assert not self.is_pooling_model + sample_hidden_states = hidden_states[logits_indices] if not get_pp_group().is_last_rank: all_gather_tensors = { - "residual": not is_residual_scattered_for_sp(self.vllm_config, num_tokens_padded) + "residual": not is_residual_scattered_for_sp( + self.vllm_config, num_tokens_padded + ) } get_pp_group().send_tensor_dict( hidden_states.tensors, @@ -240,7 +340,6 @@ def execute_model( ) logits = None else: - sample_hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits( sample_hidden_states, sampling_metadata=self.input_batch.sampling_metadata ) @@ -264,6 +363,7 @@ def execute_model( sample_hidden_states, aux_hidden_states, ec_connector_output, + cudagraph_stats, multimodal_outputs, ) self.kv_connector_output = kv_connector_output @@ -278,14 +378,20 @@ def sample_tokens( self.kv_connector_output = None if self.execute_model_state is None: + # Nothing to do (PP non-final rank case), output isn't used. if not kv_connector_output: return None # type: ignore[return-value] + + # In case of PP with kv transfer, we need to pass through the + # kv_connector_output if kv_connector_output.is_empty(): return EMPTY_MODEL_RUNNER_OUTPUT + output = copy(EMPTY_MODEL_RUNNER_OUTPUT) output.kv_connector_output = kv_connector_output return output + # Unpack ephemeral state. ( scheduler_output, logits, @@ -295,16 +401,22 @@ def sample_tokens( sample_hidden_states, aux_hidden_states, ec_connector_output, + cudagraph_stats, multimodal_outputs, ) = self.execute_model_state self.execute_model_state = None + # Apply structured output bitmasks if present. 
if grammar_output is not None: - apply_grammar_bitmask(scheduler_output, grammar_output, self.input_batch, logits) + apply_grammar_bitmask( + scheduler_output, grammar_output, self.input_batch, logits + ) with record_function_or_nullcontext("gpu_model_runner: sample"): sampler_output = self._sample(logits, spec_decode_metadata) + self._draft_token_ids = None + self._draft_token_req_ids = None self.input_batch.prev_sampled_token_ids = None def propose_draft_token_ids(sampled_token_ids): @@ -320,39 +432,44 @@ def propose_draft_token_ids(sampled_token_ids): spec_decode_metadata, spec_decode_common_attn_metadata, ) + self._copy_draft_token_ids_to_cpu(scheduler_output) spec_config = self.speculative_config - use_padded_batch_for_eagle = ( - spec_config is not None and spec_config.use_eagle() and not spec_config.disable_padded_drafter_batch - ) - effective_drafter_max_model_len = self.max_model_len - if effective_drafter_max_model_len is None: - effective_drafter_max_model_len = self.model_config.max_model_len - if ( - spec_config is not None - and spec_config.draft_model_config is not None - and spec_config.draft_model_config.max_model_len is not None - ): - effective_drafter_max_model_len = spec_config.draft_model_config.max_model_len - input_fits_in_drafter = spec_decode_common_attn_metadata and ( - spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens <= effective_drafter_max_model_len - ) - if use_padded_batch_for_eagle: - assert self.speculative_config is not None - assert isinstance(self.drafter, EagleProposer) - sampled_token_ids = sampler_output.sampled_token_ids - if input_fits_in_drafter: - propose_draft_token_ids(sampled_token_ids) - elif self.valid_sampled_token_count_event is not None: - assert spec_decode_common_attn_metadata is not None - next_token_ids, valid_sampled_tokens_count = self.drafter.prepare_next_token_ids_padded( - spec_decode_common_attn_metadata, - sampled_token_ids, - self.requests, - self.input_batch, - 
self.discard_request_mask.gpu, - ) - self._copy_valid_sampled_token_count(next_token_ids, valid_sampled_tokens_count) + propose_drafts_after_bookkeeping = False + if spec_config is not None: + input_fits_in_drafter = spec_decode_common_attn_metadata is not None and ( + spec_decode_common_attn_metadata.max_seq_len + self.num_spec_tokens + <= self.effective_drafter_max_model_len + ) + if spec_config.use_eagle() and not spec_config.disable_padded_drafter_batch: + # EAGLE speculative decoding can use the GPU sampled tokens + # as inputs, and does not need to wait for bookkeeping to finish. + assert isinstance(self.drafter, EagleProposer) + sampled_token_ids = sampler_output.sampled_token_ids + if input_fits_in_drafter: + propose_draft_token_ids(sampled_token_ids) + elif self.valid_sampled_token_count_event is not None: + assert spec_decode_common_attn_metadata is not None + next_token_ids, valid_sampled_tokens_count = ( + self.drafter.prepare_next_token_ids_padded( + spec_decode_common_attn_metadata, + sampled_token_ids, + self.requests, + self.input_batch, + self.discard_request_mask.gpu, + ) + ) + self._copy_valid_sampled_token_count( + next_token_ids, valid_sampled_tokens_count + ) + # Since we couldn't run the drafter, + # just use zeros for the draft tokens. + self._draft_token_ids = torch.zeros( + 1, device=self.device, dtype=torch.int32 + ).expand(len(self.input_batch.req_ids), self.num_spec_tokens) + self._copy_draft_token_ids_to_cpu(scheduler_output, zeros_only=True) + else: + propose_drafts_after_bookkeeping = input_fits_in_drafter with record_function_or_nullcontext("gpu_model_runner: bookkeep"): ( @@ -372,7 +489,9 @@ def propose_draft_token_ids(sampled_token_ids): spec_decode_metadata, ) - if self.speculative_config and not use_padded_batch_for_eagle and input_fits_in_drafter: + if propose_drafts_after_bookkeeping: + # ngram and other speculative decoding methods use the sampled + # tokens on the CPU, so they are run after bookkeeping. 
propose_draft_token_ids(valid_sampled_token_ids) with record_function_or_nullcontext("gpu_model_runner: eplb"): @@ -421,6 +540,12 @@ def propose_draft_token_ids(sampled_token_ids): payload.update(mm_payload) pooler_output.append(payload) with record_function_or_nullcontext("gpu_model_runner: ModelRunnerOutput"): + if self.model_config.enable_return_routed_experts: + capturer = RoutedExpertsCapturer.get_instance() + if capturer is not None: + capturer.save_captured_experts(indices=self.slot_mapping) # noqa + else: + logger.error("RoutedExpertsCapturer not initialized.") output = OmniModelRunnerOutput( req_ids=req_ids_output_copy, req_id_to_index=req_id_to_index_output_copy, @@ -431,6 +556,7 @@ def propose_draft_token_ids(sampled_token_ids): kv_connector_output=kv_connector_output, ec_connector_output=ec_connector_output if self.supports_mm_inputs else None, num_nans_in_logits=num_nans_in_logits, + cudagraph_stats=cudagraph_stats, ) if not self.use_async_scheduling: diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 4a9ea2e930f..b90f2f1d346 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -847,10 +847,9 @@ def _preprocess( # TODO(woosuk): Avoid the copy. Optimize. 
self.inputs_embeds.gpu[:num_scheduled_tokens].copy_(inputs_embeds_scheduled) - input_ids = self.input_ids.gpu[:num_input_tokens] - inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens] + input_ids, inputs_embeds = self._prepare_mm_inputs(num_input_tokens) model_kwargs = { - **self._init_model_kwargs(num_scheduled_tokens), + **self._init_model_kwargs(), **self._extract_mm_kwargs(scheduler_output), } elif self.enable_prompt_embeds and is_first_rank: @@ -874,7 +873,7 @@ def _preprocess( self.inputs_embeds.gpu[token_ids_idx] = tokens_to_embeds inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens] - model_kwargs = self._init_model_kwargs(num_input_tokens) + model_kwargs = self._init_model_kwargs() input_ids = self.input_ids.gpu[:num_input_tokens] else: # For text-only models, we use token ids as input. @@ -883,7 +882,7 @@ def _preprocess( # then the embedding layer is not included in the CUDA graph. input_ids = self.input_ids.gpu[:num_input_tokens] inputs_embeds = None - model_kwargs = self._init_model_kwargs(num_input_tokens) + model_kwargs = self._init_model_kwargs() if self.uses_mrope: positions = self.mrope_positions.gpu[:, :num_input_tokens] From f7611198ae87ee82fc3275d3ed2a5c0dcd1edd64 Mon Sep 17 00:00:00 2001 From: Sihyeon Jang Date: Tue, 13 Jan 2026 18:43:55 +0900 Subject: [PATCH 20/59] [Feat] add vllm-omni version collection (#740) Signed-off-by: Sihyeon Jang --- collect_env.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/collect_env.py b/collect_env.py index 71cec0c4a87..8b09379e1a3 100644 --- a/collect_env.py +++ b/collect_env.py @@ -57,6 +57,7 @@ "cpu_info", "rocm_version", # vllm specific field "vllm_version", # vllm specific field + "vllm_omni_version", # vllm-omni specific field "vllm_build_flags", # vllm specific field "gpu_topo", # vllm specific field "env_vars", @@ -289,6 +290,31 @@ def get_vllm_version(): return __version__ +def get_vllm_omni_version(run_lambda): + try: + import vllm_omni + from vllm_omni 
import __version__, __version_tuple__ + + version_str = __version_tuple__[-1] + if isinstance(version_str, str) and version_str.startswith("g"): + if "." in version_str: + git_sha = version_str.split(".")[0][1:] + date = version_str.split(".")[-1][1:] + return f"{__version__} (git sha: {git_sha}, date: {date})" + else: + git_sha = version_str[1:] + return f"{__version__} (git sha: {git_sha})" + + package_dir = os.path.dirname(os.path.abspath(vllm_omni.__file__)) + git_sha = run_and_read_all(run_lambda, f"git -C {package_dir} rev-parse --short HEAD") + if git_sha: + return f"{__version__} (git sha: {git_sha})" + + return __version__ + except ImportError: + return "N/A (vllm_omni not installed)" + + def summarize_vllm_build_flags(): # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. return "CUDA Archs: {}; ROCm: {}".format( @@ -524,6 +550,7 @@ def get_version_or_na(cfg, prefix): rocm_version = get_rocm_version(run_lambda) vllm_version = get_vllm_version() + vllm_omni_version = get_vllm_omni_version(run_lambda) vllm_build_flags = summarize_vllm_build_flags() gpu_topo = get_gpu_topo(run_lambda) @@ -555,6 +582,7 @@ def get_version_or_na(cfg, prefix): cpu_info=get_cpu_info(run_lambda), rocm_version=rocm_version, vllm_version=vllm_version, + vllm_omni_version=vllm_omni_version, vllm_build_flags=vllm_build_flags, gpu_topo=gpu_topo, env_vars=get_env_vars(), @@ -621,6 +649,7 @@ def get_version_or_na(cfg, prefix): ============================== ROCM Version : {rocm_version} vLLM Version : {vllm_version} +vLLM-Omni Version : {vllm_omni_version} vLLM Build Flags: {vllm_build_flags} GPU Topology: From 63a69a5816a2ed77d16c83fdb7dba656c0ae11a6 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Tue, 13 Jan 2026 09:55:52 +0000 Subject: [PATCH 21/59] debug AR Scheduler Signed-off-by: tzhouam --- vllm_omni/core/sched/omni_ar_scheduler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/vllm_omni/core/sched/omni_ar_scheduler.py b/vllm_omni/core/sched/omni_ar_scheduler.py index 00bdb32aa4c..dc3f56ac2db 100644 --- a/vllm_omni/core/sched/omni_ar_scheduler.py +++ b/vllm_omni/core/sched/omni_ar_scheduler.py @@ -151,11 +151,10 @@ def update_from_output( if new_token_ids: new_token_ids, stopped = self._update_request_with_output(request, new_token_ids) - # Stop checking for pooler models. if pooler_output: # Note: As we occupied the pooler output, for multimodal outputs, we do not intermediate stop checking for pooler output if request.output_token_ids: - stopped = check_stop(request, self.max_model_len, pooler_output) + stopped = check_stop(request, self.max_model_len) routed_experts = None if stopped: if self.vllm_config.model_config.enable_return_routed_experts: From 5bcdb43fa043990f871bcff473e52433508ba57b Mon Sep 17 00:00:00 2001 From: tzhouam Date: Tue, 13 Jan 2026 09:56:24 +0000 Subject: [PATCH 22/59] update OmniGPUModelRunner._update_states Signed-off-by: tzhouam --- vllm_omni/worker/gpu_model_runner.py | 96 +++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 16 deletions(-) diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index b90f2f1d346..81103483820 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -129,6 +129,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove finished requests from the cached states. for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) + self.num_prompt_logprobs.pop(req_id, None) # Remove the finished requests from the persistent batch. # NOTE(woosuk): There could be an edge case where finished_req_ids and # scheduled_req_ids overlap. This happens when a request is aborted and @@ -149,7 +150,14 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # they will be scheduled again sometime in the future. 
scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys() cached_req_ids = self.input_batch.req_id_to_index.keys() - unscheduled_req_ids = cached_req_ids - scheduled_req_ids + resumed_req_ids = scheduler_output.scheduled_cached_reqs.resumed_req_ids + # NOTE(zhuohan): cached_req_ids and resumed_req_ids are usually disjoint, + # so `(scheduled_req_ids - resumed_req_ids) == scheduled_req_ids` holds + # apart from the forced-preemption case in reset_prefix_cache. And in + # that case we include the resumed_req_ids in the unscheduled set so + # that they get cleared from the persistent batch before being re-scheduled + # in the normal resumed request path. + unscheduled_req_ids = cached_req_ids - (scheduled_req_ids - resumed_req_ids) # NOTE(woosuk): The persistent batch optimization assumes that # consecutive batches contain mostly the same requests. If batches # have low request overlap (e.g., alternating between two distinct @@ -240,22 +248,64 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: except Exception as e: logger.error(f"Error decoding additional information: {e}") pass - + + if sampling_params and sampling_params.prompt_logprobs is not None: + self.num_prompt_logprobs[req_id] = ( + self.input_batch.vocab_size + if sampling_params.prompt_logprobs == -1 + else sampling_params.prompt_logprobs + ) # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: self._init_mrope_positions(req_state) - + + # Only relevant for models using XD-RoPE (e.g, HunYuan-VL) + if self.uses_xdrope_dim > 0: + self._init_xdrope_positions(req_state) + reqs_to_add.append(self.requests[req_id]) # Update the states of the running/resumed requests. is_last_rank = get_pp_group().is_last_rank req_data = scheduler_output.scheduled_cached_reqs + scheduled_spec_tokens = scheduler_output.scheduled_spec_decode_tokens + + # Wait until valid_sampled_tokens_count is copied to cpu, + # then use it to update actual num_computed_tokens of each request. 
+ valid_sampled_token_count = self._get_valid_sampled_token_count() + for i, req_id in enumerate(req_data.req_ids): req_state = self.requests[req_id] num_computed_tokens = req_data.num_computed_tokens[i] new_block_ids = req_data.new_block_ids[i] - resumed_from_preemption = req_data.resumed_from_preemption[i] - + resumed_from_preemption = req_id in req_data.resumed_req_ids + num_output_tokens = req_data.num_output_tokens[i] + req_index = self.input_batch.req_id_to_index.get(req_id) + + if req_state.prev_num_draft_len and self.use_async_scheduling: + # prev_num_draft_len is used in async scheduling mode with + # spec decode. it indicates if need to update num_computed_tokens + # of the request. for example: + # fist step: num_computed_tokens = 0, spec_tokens = [], + # prev_num_draft_len = 0. + # second step: num_computed_tokens = 100(prompt lenth), + # spec_tokens = [a,b], prev_num_draft_len = 0. + # third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d], + # prev_num_draft_len = 2. + # num_computed_tokens in first step and second step does't contain + # the spec tokens length, but in third step it contains the + # spec tokens length. we only need to update num_computed_tokens + # when prev_num_draft_len > 0. + if req_index is None: + req_state.prev_num_draft_len = 0 + else: + assert self.input_batch.prev_req_id_to_index is not None + prev_req_index = self.input_batch.prev_req_id_to_index[req_id] + num_accepted = valid_sampled_token_count[prev_req_index] - 1 + num_rejected = req_state.prev_num_draft_len - num_accepted + num_computed_tokens -= num_rejected + req_state.output_token_ids.extend([-1] * num_accepted) + # Update the cached states. 
req_state.num_computed_tokens = num_computed_tokens @@ -272,7 +322,17 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: req_state.output_token_ids.append(new_token_ids[-1]) elif num_new_tokens > 0: req_state.output_token_ids.extend(new_token_ids[-num_new_tokens:]) - + elif num_output_tokens < len(req_state.output_token_ids): + # Some output tokens were discarded due to a sync-KV-load + # failure. Align the cached state. + del req_state.output_token_ids[num_output_tokens:] + if req_index is not None: + end_idx = ( + self.input_batch.num_prompt_tokens[req_index] + + num_output_tokens + ) + self.input_batch.num_tokens_no_spec[req_index] = end_idx + # Update the block IDs. if not resumed_from_preemption: if new_block_ids is not None: @@ -280,6 +340,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: for block_ids, new_ids in zip(req_state.block_ids, new_block_ids): block_ids.extend(new_ids) else: + assert req_index is None assert new_block_ids is not None # The request is resumed from preemption. # Replace the existing block IDs with the new ones. @@ -290,6 +351,13 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # The request is not in the persistent batch. # The request was either preempted and resumed later, or was not # scheduled in the previous step and needs to be added again. + + if self.use_async_scheduling and num_output_tokens > 0: + # We must recover the output token ids for resumed requests in the + # async scheduling case, so that correct input_ids are obtained. + resumed_token_ids = req_data.all_token_ids[req_id] + req_state.output_token_ids = resumed_token_ids[-num_output_tokens:] + reqs_to_add.append(req_state) continue @@ -304,24 +372,20 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Add new_token_ids to token_ids_cpu. 
start_token_index = num_computed_tokens end_token_index = num_computed_tokens + len(new_token_ids) - self.input_batch.token_ids_cpu[req_index, start_token_index:end_token_index] = new_token_ids + self.input_batch.token_ids_cpu[ + req_index, start_token_index:end_token_index + ] = new_token_ids self.input_batch.num_tokens_no_spec[req_index] = end_token_index - self.input_batch.num_tokens[req_index] = end_token_index # Add spec_token_ids to token_ids_cpu. - spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()) - if spec_token_ids: - num_spec_tokens = len(spec_token_ids) - start_index = self.input_batch.num_tokens_no_spec[req_index] - end_token_index = start_index + num_spec_tokens - self.input_batch.token_ids_cpu[req_index, start_index:end_token_index] = spec_token_ids - # NOTE(woosuk): `num_tokens` here may include spec tokens. - self.input_batch.num_tokens[req_index] += num_spec_tokens + self.input_batch.update_req_spec_token_ids(req_state, scheduled_spec_tokens) + # Add the new or resumed requests to the persistent batch. # The smaller empty indices are filled first. 
for request in reqs_to_add: self.input_batch.add_request(request) + self.input_batch.update_req_spec_token_ids(request, scheduled_spec_tokens) # Condense the batched states if there are gaps left by removed requests self.input_batch.condense() From d7cd00e26cc46efcdda36b73d1170889d8ff6c59 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Tue, 13 Jan 2026 19:21:24 +0800 Subject: [PATCH 23/59] [Doc] refactor diffusion doc (#753) Signed-off-by: zjy0516 --- docs/.nav.yml | 13 ++++++------- docs/api/README.md | 1 + docs/configuration/README.md | 6 +++--- .../cache_dit_acceleration.md | 0 .../diffusion}/cpu_offload_diffusion.md | 2 +- .../parallelism_acceleration.md | 0 .../{acceleration => diffusion}/teacache.md | 0 docs/user_guide/diffusion_acceleration.md | 18 +++++++++--------- 8 files changed, 20 insertions(+), 20 deletions(-) rename docs/user_guide/{acceleration => diffusion}/cache_dit_acceleration.md (100%) rename docs/{features => user_guide/diffusion}/cpu_offload_diffusion.md (93%) rename docs/user_guide/{acceleration => diffusion}/parallelism_acceleration.md (100%) rename docs/user_guide/{acceleration => diffusion}/teacache.md (100%) diff --git a/docs/.nav.yml b/docs/.nav.yml index abe4865af33..7493e71e8af 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -26,17 +26,16 @@ nav: - Configuration: - configuration/README.md - configuration/* - - Diffusion Acceleration: - - Overview: user_guide/diffusion_acceleration.md - - Acceleration Methods: - - TeaCache: user_guide/acceleration/teacache.md - - Cache-DiT: user_guide/acceleration/cache_dit_acceleration.md - - Parallelism Acceleration: user_guide/acceleration/parallelism_acceleration.md - Models: - models/supported_models.md - Features: - Sleep Mode: features/sleep_mode.md - - CPU Offloading for Diffusion Model: features/cpu_offload_diffusion.md + - Diffusion Features: + - Overview: user_guide/diffusion_acceleration.md + - TeaCache: user_guide/diffusion/teacache.md + - Cache-DiT: 
user_guide/diffusion/cache_dit_acceleration.md + - Parallelism Acceleration: user_guide/diffusion/parallelism_acceleration.md + - CPU Offloading: user_guide/diffusion/cpu_offload_diffusion.md - Developer Guide: - General: - contributing/README.md diff --git a/docs/api/README.md b/docs/api/README.md index 4fa85cdc663..a9d751bce25 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -82,6 +82,7 @@ Model execution components. - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_code_predictor_mtp.Qwen3OmniMoeTalkerCodePredictor][] - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeModel][] - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeTalkerForConditionalGeneration][] +- [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_talker.Qwen3OmniMoeTalkerSharedExpertWrapper][] - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3MoeLLMForCausalLM][] - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3MoeLLMModel][] - [vllm_omni.model_executor.models.qwen3_omni.qwen3_omni_moe_thinker.Qwen3OmniMoeConditionalGenerationMixin][] diff --git a/docs/configuration/README.md b/docs/configuration/README.md index 40439d51121..37e28bd0c57 100644 --- a/docs/configuration/README.md +++ b/docs/configuration/README.md @@ -16,6 +16,6 @@ For introduction, please check [Introduction for stage config](./stage_configs.m ## Optimization Features -- **[TeaCache Configuration](../user_guide/acceleration/teacache.md)** - Enable TeaCache adaptive caching for DiT models to achieve 1.5x-2.0x speedup with minimal quality loss -- **[Cache-DiT Configuration](../user_guide/acceleration/cache_dit_acceleration.md)** - Enable Cache-DiT as cache acceleration backends for DiT models -- **[Parallelism Configuration](../user_guide/acceleration/parallelism_acceleration.md)** - Enable parallelism (e.g., sequence parallelism) for for DiT models +- **[TeaCache 
Configuration](../user_guide/diffusion/teacache.md)** - Enable TeaCache adaptive caching for DiT models to achieve 1.5x-2.0x speedup with minimal quality loss +- **[Cache-DiT Configuration](../user_guide/diffusion/cache_dit_acceleration.md)** - Enable Cache-DiT as cache acceleration backends for DiT models +- **[Parallelism Configuration](../user_guide/diffusion/parallelism_acceleration.md)** - Enable parallelism (e.g., sequence parallelism) for for DiT models diff --git a/docs/user_guide/acceleration/cache_dit_acceleration.md b/docs/user_guide/diffusion/cache_dit_acceleration.md similarity index 100% rename from docs/user_guide/acceleration/cache_dit_acceleration.md rename to docs/user_guide/diffusion/cache_dit_acceleration.md diff --git a/docs/features/cpu_offload_diffusion.md b/docs/user_guide/diffusion/cpu_offload_diffusion.md similarity index 93% rename from docs/features/cpu_offload_diffusion.md rename to docs/user_guide/diffusion/cpu_offload_diffusion.md index aaa4243a3a2..533b6b3b964 100644 --- a/docs/features/cpu_offload_diffusion.md +++ b/docs/user_guide/diffusion/cpu_offload_diffusion.md @@ -23,7 +23,7 @@ if __name__ == "__main__": m = Omni(model="Qwen/Qwen-Image",enable_cpu_offload=True) ``` -- **CLI**: pass `--dit-cpu-offload` to the diffusion service entrypoint. +- **CLI**: pass `--enable-cpu-offload` to the diffusion service entrypoint. 
## Known Limitations - Cold start latency increases for over one minute for some models(e.g., Qwen-Image) diff --git a/docs/user_guide/acceleration/parallelism_acceleration.md b/docs/user_guide/diffusion/parallelism_acceleration.md similarity index 100% rename from docs/user_guide/acceleration/parallelism_acceleration.md rename to docs/user_guide/diffusion/parallelism_acceleration.md diff --git a/docs/user_guide/acceleration/teacache.md b/docs/user_guide/diffusion/teacache.md similarity index 100% rename from docs/user_guide/acceleration/teacache.md rename to docs/user_guide/diffusion/teacache.md diff --git a/docs/user_guide/diffusion_acceleration.md b/docs/user_guide/diffusion_acceleration.md index 8f78ae32e50..cf04c6228a6 100644 --- a/docs/user_guide/diffusion_acceleration.md +++ b/docs/user_guide/diffusion_acceleration.md @@ -6,8 +6,8 @@ vLLM-Omni supports various cache acceleration methods to speed up diffusion mode vLLM-Omni currently supports two main cache acceleration backends: -1. **[TeaCache](acceleration/teacache.md)** - Hook-based adaptive caching that caches transformer computations when consecutive timesteps are similar -2. **[Cache-DiT](acceleration/cache_dit_acceleration.md)** - Library-based acceleration using multiple techniques: +1. **[TeaCache](diffusion/teacache.md)** - Hook-based adaptive caching that caches transformer computations when consecutive timesteps are similar +2. **[Cache-DiT](diffusion/cache_dit_acceleration.md)** - Library-based acceleration using multiple techniques: - **DBCache** (Dual Block Cache): Caches intermediate transformer block outputs based on residual differences - **TaylorSeer**: Uses Taylor expansion-based forecasting for faster inference - **SCM** (Step Computation Masking): Selectively computes steps based on adaptive masking @@ -16,11 +16,11 @@ Both methods can provide significant speedups (typically **1.5x-2.0x**) while ma vLLM-Omni also supports parallelism methods for diffusion models, including: -1. 
[Ulysses-SP](acceleration/parallelism_acceleration.md#ulysses-sp) - splits the input along the sequence dimension and uses all-to-all communication to allow each device to compute only a subset of attention heads. +1. [Ulysses-SP](diffusion/parallelism_acceleration.md#ulysses-sp) - splits the input along the sequence dimension and uses all-to-all communication to allow each device to compute only a subset of attention heads. -2. [Ring-Attention](acceleration/parallelism_acceleration.md#ring-attention) - splits the input along the sequence dimension and uses ring-based P2P communication to accumulate attention results, keeping the sequence dimension sharded. +2. [Ring-Attention](diffusion/parallelism_acceleration.md#ring-attention) - splits the input along the sequence dimension and uses ring-based P2P communication to accumulate attention results, keeping the sequence dimension sharded. -3. [CFG-Parallel](acceleration/parallelism_acceleration.md#cfg-parallel) - runs the positive/negative prompts of classifier-free guidance (CFG) on different devices, then merges on a single device to perform the scheduler step. +3. [CFG-Parallel](diffusion/parallelism_acceleration.md#cfg-parallel) - runs the positive/negative prompts of classifier-free guidance (CFG) on different devices, then merges on a single device to perform the scheduler step. ## Quick Comparison @@ -197,7 +197,7 @@ outputs = omni.generate(prompt="turn this cat to a dog", For detailed information on each acceleration method: -- **[TeaCache Guide](acceleration/teacache.md)** - Complete TeaCache documentation, configuration options, and best practices -- **[Cache-DiT Acceleration Guide](acceleration/cache_dit_acceleration.md)** - Comprehensive Cache-DiT guide covering DBCache, TaylorSeer, SCM, and configuration parameters -- **[Sequence Parallelism](acceleration/parallelism_acceleration.md#sequence-parallelism)** - Guidance on how to set sequence parallelism with configuration. 
-- **[CFG-Parallel](acceleration/parallelism_acceleration.md#cfg-parallel)** - Guidance on how to set CFG-Parallel to run positive/negative branches across ranks. +- **[TeaCache Guide](diffusion/teacache.md)** - Complete TeaCache documentation, configuration options, and best practices +- **[Cache-DiT Acceleration Guide](diffusion/cache_dit_acceleration.md)** - Comprehensive Cache-DiT guide covering DBCache, TaylorSeer, SCM, and configuration parameters +- **[Sequence Parallelism](diffusion/parallelism_acceleration.md#sequence-parallelism)** - Guidance on how to set sequence parallelism with configuration. +- **[CFG-Parallel](diffusion/parallelism_acceleration.md#cfg-parallel)** - Guidance on how to set CFG-Parallel to run positive/negative branches across ranks. From e9a1beec77fe8dcb980926e7f6972ba881dde433 Mon Sep 17 00:00:00 2001 From: catcat <108673086+iwzbi@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:12:32 +0800 Subject: [PATCH 24/59] [Bugfix] Fix stable diffusion3 compatibility error (#772) Signed-off-by: iwzbi --- vllm_omni/diffusion/models/sd3/sd3_transformer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm_omni/diffusion/models/sd3/sd3_transformer.py b/vllm_omni/diffusion/models/sd3/sd3_transformer.py index a2afa8c0946..22a11741a53 100644 --- a/vllm_omni/diffusion/models/sd3/sd3_transformer.py +++ b/vllm_omni/diffusion/models/sd3/sd3_transformer.py @@ -102,8 +102,8 @@ def __init__( else: self.to_out = None - self.norm_added_q = RMSNorm(head_dim, eps=eps) - self.norm_added_k = RMSNorm(head_dim, eps=eps) + self.norm_added_q = RMSNorm(head_dim, eps=eps) if qk_norm else nn.Identity() + self.norm_added_k = RMSNorm(head_dim, eps=eps) if qk_norm else nn.Identity() self.attn = Attention( num_heads=num_heads, @@ -341,8 +341,10 @@ def __init__( self.pooled_projection_dim = model_config.pooled_projection_dim self.joint_attention_dim = model_config.joint_attention_dim self.patch_size = model_config.patch_size - 
self.dual_attention_layers = model_config.dual_attention_layers - self.qk_norm = model_config.qk_norm + self.dual_attention_layers = ( + model_config.dual_attention_layers if hasattr(model_config, "dual_attention_layers") else () + ) + self.qk_norm = model_config.qk_norm if hasattr(model_config, "qk_norm") else "" self.pos_embed_max_size = model_config.pos_embed_max_size self.pos_embed = PatchEmbed( From 2a0f72f79e28ffb3af88c48efbc63e1441eeca6a Mon Sep 17 00:00:00 2001 From: tzhouam Date: Wed, 14 Jan 2026 02:50:24 +0000 Subject: [PATCH 25/59] update the offline LLM request sorting due to changed requested id format Signed-off-by: tzhouam --- vllm_omni/entrypoints/omni_llm.py | 55 ++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/vllm_omni/entrypoints/omni_llm.py b/vllm_omni/entrypoints/omni_llm.py index 05a48feee0e..74fe6a80376 100644 --- a/vllm_omni/entrypoints/omni_llm.py +++ b/vllm_omni/entrypoints/omni_llm.py @@ -2,7 +2,9 @@ import cloudpickle from pydantic import ValidationError - +from tqdm import tqdm +from vllm.outputs import RequestOutput, PoolingRequestOutput +from typing import Callable # External library imports (vLLM) from vllm.config import CompilationConfig, StructuredOutputsConfig, is_init_field from vllm.entrypoints.llm import LLM @@ -190,3 +192,54 @@ def __del__(self) -> None: # best-effort self.close() except Exception as e: logger.debug("[Orchestrator] __del__ close() raised: %s", e, exc_info=True) + + def _run_engine( + self, *, use_tqdm: bool | Callable[..., tqdm] = True + ) -> list[RequestOutput | PoolingRequestOutput]: + # Initialize tqdm. + if use_tqdm: + num_requests = self.llm_engine.get_num_unfinished_requests() + tqdm_func = use_tqdm if callable(use_tqdm) else tqdm + pbar = tqdm_func( + total=num_requests, + desc="Processed prompts", + dynamic_ncols=True, + postfix=(f"est. speed input: {0:.2f} toks/s, output: {0:.2f} toks/s"), + ) + + # Run the engine. 
+ outputs: list[RequestOutput | PoolingRequestOutput] = [] + total_in_toks = 0 + total_out_toks = 0 + while self.llm_engine.has_unfinished_requests(): + step_outputs = self.llm_engine.step() + for output in step_outputs: + if output.finished: + outputs.append(output) + if use_tqdm: + if isinstance(output, RequestOutput): + # Calculate tokens only for RequestOutput + n = len(output.outputs) + assert output.prompt_token_ids is not None + total_in_toks += len(output.prompt_token_ids) * n + in_spd = total_in_toks / pbar.format_dict["elapsed"] + total_out_toks += sum( + len(stp.token_ids) for stp in output.outputs + ) + out_spd = total_out_toks / pbar.format_dict["elapsed"] + pbar.postfix = ( + f"est. speed input: {in_spd:.2f} toks/s, " + f"output: {out_spd:.2f} toks/s" + ) + pbar.update(n) + else: + pbar.update(1) + if pbar.n == num_requests: + pbar.refresh() + + if use_tqdm: + pbar.close() + # Sort the outputs by the int part of request ID which is in format of 'int-uuid'. + # This is necessary because some requests may be finished earlier than + # its previous requests. 
+ return sorted(outputs, key=lambda x: int(x.request_id.split("-")[0])) \ No newline at end of file From f7c8af944e65223b5387fdfb08d041728ca50baf Mon Sep 17 00:00:00 2001 From: tzhouam Date: Wed, 14 Jan 2026 08:24:31 +0000 Subject: [PATCH 26/59] update Qwen3 Omni to fit with the engine core logic Signed-off-by: tzhouam --- vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py index 7675caa638e..c459289af34 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py @@ -154,6 +154,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # for CI: Initialize special tokens embeddings early to avoid AttributeError when loading dummy weights self._init_special_tokens_embeddings() + self.requires_raw_input_tokens = True elif self.model_stage == "code2wav": self.thinker = None @@ -168,6 +169,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): architectures=["Qwen3OmniMoeCode2Wav"], ) self.model = self.code2wav + self.requires_raw_input_tokens = True else: raise ValueError( f"Invalid model_stage: {self.model_stage}. 
Must be one of: 'thinker', 'talker', 'code2wav'" From 1444e1fc45e2262751fd15eb82fe209566cde163 Mon Sep 17 00:00:00 2001 From: Ziming Huang Date: Wed, 14 Jan 2026 19:07:31 +0800 Subject: [PATCH 27/59] [Feature] Support Qwen3 Omni talker mtp batch inference (#722) Signed-off-by: ZeldaHuang --- .buildkite/scripts/simple_test.sh | 1 + tests/worker/test_omni_gpu_model_runner.py | 123 ++++++++++++++++++ vllm_omni/entrypoints/omni_stage.py | 2 - vllm_omni/entrypoints/utils.py | 4 + .../models/qwen3_omni/qwen3_omni.py | 47 +++---- .../qwen3_omni/qwen3_omni_moe_talker.py | 10 +- .../stage_input_processors/qwen3_omni.py | 2 +- vllm_omni/worker/gpu_model_runner.py | 72 +++++----- 8 files changed, 189 insertions(+), 72 deletions(-) create mode 100644 tests/worker/test_omni_gpu_model_runner.py diff --git a/.buildkite/scripts/simple_test.sh b/.buildkite/scripts/simple_test.sh index 33248d99cde..55ac27cec9f 100755 --- a/.buildkite/scripts/simple_test.sh +++ b/.buildkite/scripts/simple_test.sh @@ -52,3 +52,4 @@ VENV_PYTHON="${VENV_DIR}/bin/python" "${VENV_PYTHON}" -m pytest -v -s tests/entrypoints/ "${VENV_PYTHON}" -m pytest -v -s tests/diffusion/cache/ "${VENV_PYTHON}" -m pytest -v -s tests/model_executor/models/qwen2_5_omni/test_audio_length.py +"${VENV_PYTHON}" -m pytest -v -s tests/worker/ diff --git a/tests/worker/test_omni_gpu_model_runner.py b/tests/worker/test_omni_gpu_model_runner.py new file mode 100644 index 00000000000..b0132306c81 --- /dev/null +++ b/tests/worker/test_omni_gpu_model_runner.py @@ -0,0 +1,123 @@ +from contextlib import contextmanager + +import torch + +from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner + + +class DummyBuffer: + """A minimal buffer wrapper that exposes the `.gpu` attribute.""" + + def __init__(self, t: torch.Tensor): + self.gpu = t + + +class DummyInputBatch: + """A minimal input batch that only provides `req_ids`.""" + + def __init__(self, req_ids): + self.req_ids = req_ids + + +class DummyReqState: + """A minimal request 
state container.""" + + pass + + +class DummyTalkerMTP(torch.nn.Module): + """A fake talker_mtp module for deterministic CPU testing.""" + + def forward(self, req_input_ids, req_embeds, last_talker_hidden, text_step): + # Deterministic behavior: + # - output embeds = input embeds + 1 + # - output codes = [[0], [1], ...] + bsz = req_embeds.shape[0] + new_embeds = req_embeds + 1.0 + codes = torch.arange(bsz, dtype=torch.int64).view(bsz, 1) + return new_embeds, codes + + +@contextmanager +def _noop_forward_context(*args, **kwargs): + """A no-op context manager to replace vLLM forward context in CPU tests.""" + yield + + +def _make_runner(req_ids=("r1", "r2"), hidden_size=4): + # Create an instance without calling OmniGPUModelRunner.__init__ + runner = object.__new__(OmniGPUModelRunner) + + # Minimal attributes used by OmniGPUModelRunner._talker_mtp_forward + runner.input_batch = DummyInputBatch(list(req_ids)) + runner.requests = {rid: DummyReqState() for rid in req_ids} + + # query_start_loc.cpu[req_index] is used to locate the token position + # in the flattened `inputs_embeds`. + runner.query_start_loc = type("QSL", (), {})() + # Map: r1 -> offset 0, r2 -> offset 3 + runner.query_start_loc.cpu = torch.tensor([0, 3], dtype=torch.int32) + + bsz = len(req_ids) + runner.talker_mtp_input_ids = DummyBuffer(torch.zeros((bsz,), dtype=torch.int64)) + runner.talker_mtp_inputs_embeds = DummyBuffer(torch.zeros((bsz, hidden_size), dtype=torch.float32)) + runner.last_talker_hidden = DummyBuffer(torch.zeros((bsz, hidden_size), dtype=torch.float32)) + runner.text_step = DummyBuffer(torch.zeros((bsz, hidden_size), dtype=torch.float32)) + + runner.talker_mtp = DummyTalkerMTP() + runner.vllm_config = object() + + # Provide a minimal implementation that returns the expected 4-tuple. 
+ def _determine_batch_execution_and_padding(**kwargs): + return None, object(), None, None + + runner._determine_batch_execution_and_padding = _determine_batch_execution_and_padding + + # Use the real merge method from OmniGPUModelRunner. + return runner + + +def test_talker_mtp_forward_cpu_updates_inputs_and_info(monkeypatch): + # Patch the module-level `set_forward_context` symbol used inside + # OmniGPUModelRunner._talker_mtp_forward. + import vllm_omni.worker.gpu_model_runner as mod # Must be the same module that defines OmniGPUModelRunner + + monkeypatch.setattr(mod, "set_forward_context", _noop_forward_context) + + runner = _make_runner(req_ids=("r1", "r2"), hidden_size=4) + + # Initialize per-request embeds (batch-major inside talker_mtp_inputs_embeds) + runner.talker_mtp_inputs_embeds.gpu[0] = torch.tensor([1.0, 2.0, 3.0, 4.0]) + runner.talker_mtp_inputs_embeds.gpu[1] = torch.tensor([10.0, 20.0, 30.0, 40.0]) + + # Flattened `inputs_embeds`: offsets 0 and 3 will be overwritten + inputs_embeds = torch.zeros((6, 4), dtype=torch.float32) + + # Call the original implementation from OmniGPUModelRunner (no re-implementation) + OmniGPUModelRunner._talker_mtp_forward(runner, ["r1", "r2"], inputs_embeds) + + # Validate embeds were written back (+1) + assert torch.allclose(inputs_embeds[0], torch.tensor([2.0, 3.0, 4.0, 5.0])) + assert torch.allclose(inputs_embeds[3], torch.tensor([11.0, 21.0, 31.0, 41.0])) + + # Validate per-request additional_information_cpu was updated + info_r1 = runner.requests["r1"].additional_information_cpu + info_r2 = runner.requests["r2"].additional_information_cpu + assert int(info_r1["code_predictor_codes"][0, 0]) == 0 + assert int(info_r2["code_predictor_codes"][0, 0]) == 1 + + +def test_talker_mtp_forward_cpu_empty_batch_noop(monkeypatch): + import vllm_omni.worker.gpu_model_runner as mod + + monkeypatch.setattr(mod, "set_forward_context", _noop_forward_context) + + runner = _make_runner(req_ids=("r1",), hidden_size=4) + + inputs_embeds 
= torch.randn((2, 4)) + before = inputs_embeds.clone() + + OmniGPUModelRunner._talker_mtp_forward(runner, [], inputs_embeds) + + # Ensure no changes were made + assert torch.allclose(inputs_embeds, before) diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py index af6f60f0420..804ab7b7fb8 100644 --- a/vllm_omni/entrypoints/omni_stage.py +++ b/vllm_omni/entrypoints/omni_stage.py @@ -951,8 +951,6 @@ async def _stage_worker_async( except Exception as e: logger.warning("Device setup failed: %s", e) - max_batch_size = int(runtime_cfg.get("max_batch_size", 1) or 1) - engine_args["max_num_seqs"] = max_batch_size # Initialize OmniConnectors if configured to match sync worker behavior connectors: dict[Any, Any] = {} if connectors_config: diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index 135b0e89ff2..eae3ea7afc4 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -195,6 +195,10 @@ def load_stage_configs_from_yaml(config_path: str, base_engine_args: dict | None # Update base_engine_args with stage-specific engine_args if they exist if hasattr(stage_arg, "engine_args") and stage_arg.engine_args is not None: base_engine_args_tmp = OmegaConf.merge(base_engine_args_tmp, stage_arg.engine_args) + if hasattr(stage_arg, "runtime") and stage_arg.runtime is not None: + runtime_cfg = stage_arg.runtime + max_batch_size = int(runtime_cfg.get("max_batch_size", 1) or 1) + base_engine_args_tmp["max_num_seqs"] = max_batch_size stage_arg.engine_args = base_engine_args_tmp return stage_args diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py index 7675caa638e..ba46d4a1483 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py @@ -573,30 +573,22 @@ def talker_preprocess(self, input_ids: torch.Tensor, input_embeds: torch.Tensor, if input_embeds is 
None and input_ids is not None: input_embeds = self.talker.embed_input_ids(input_ids) - text_step = torch.zeros( - 1, - self.talker_config.text_config.hidden_size, - device=self._module_device(self.talker), - dtype=torch.bfloat16, - ) - last_talker_hidden = torch.zeros( - 1, - 1, - self.talker_config.text_config.hidden_size, - device=self._module_device(self.talker), - dtype=torch.bfloat16, - ) - span_len = input_ids.shape[0] if span_len > 1: # prefill input_ids, input_embeds, update_dict = self.talker_preprocess_prefill(input_ids, input_embeds, **info_dict) + code_predictor_codes = torch.zeros( + (input_embeds.shape[0], self.talker.num_code_groups), + device=self._module_device(self.talker), + dtype=torch.long, + ) + update_dict["code_predictor_codes"] = code_predictor_codes else: last_talker_hidden, text_step, update_dict = self.talker_preprocess_decode( input_ids, input_embeds, **info_dict ) - update_dict["mtp_inputs"] = last_talker_hidden, text_step + update_dict["mtp_inputs"] = last_talker_hidden, text_step return input_ids, input_embeds, update_dict @@ -608,24 +600,19 @@ def talker_mtp( text_step: torch.Tensor, ): # TODO(Peiqi): not support intermediate_tensors now - input_ids = safe_tensor_reshape(input_ids, (1, -1)) + input_ids = safe_tensor_reshape(input_ids, (input_ids.shape[0], -1)) inputs_embeds = safe_tensor_reshape(input_embeds, (-1, self.talker_config.text_config.hidden_size)) - text_step = safe_tensor_reshape(text_step, (1, -1)) - last_talker_hidden = safe_tensor_reshape(last_talker_hidden, (1, 1, self.talker_config.text_config.hidden_size)) + text_step = safe_tensor_reshape(text_step, (-1, self.talker_config.text_config.hidden_size)) + last_talker_hidden = safe_tensor_reshape( + last_talker_hidden, (-1, 1, self.talker_config.text_config.hidden_size) + ) # for profiling if inputs_embeds.shape[-1] == 2048: inputs_embeds = self.text_projection(inputs_embeds) - if inputs_embeds.shape[0] == 1: - code_predictor_codes, summed_embeddings = 
self.talker.code_predictor_forward( - input_ids, inputs_embeds.clone(), last_talker_hidden=last_talker_hidden - ) - inputs_embeds = summed_embeddings.clone() - else: - code_predictor_codes = torch.zeros( - (inputs_embeds.shape[0], self.talker.num_code_groups), - device=self._module_device(self.talker), - dtype=torch.long, - ) + code_predictor_codes, summed_embeddings = self.talker.code_predictor_forward( + input_ids, inputs_embeds.clone(), last_talker_hidden=last_talker_hidden + ) + inputs_embeds = summed_embeddings.clone() inputs_embeds = (inputs_embeds + text_step).reshape(-1, self.talker_config.text_config.hidden_size) return inputs_embeds, code_predictor_codes.squeeze(-1) @@ -848,7 +835,7 @@ def talker_preprocess_decode(self, input_ids: torch.Tensor, input_embeds: torch. use_vec = q_tail[0:1, :] new_q_tail = ( q_tail[1:, :].detach().to("cpu").contiguous() - if q_tail.shape[1] > 1 + if q_tail.shape[0] > 1 else self.tts_pad_embed.to(input_embeds.device, dtype=input_embeds.dtype) ) text_step = use_vec.to(input_embeds.device, dtype=input_embeds.dtype) diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py index 4e8730eab52..2f1893e00ca 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py @@ -234,13 +234,7 @@ def code_predictor_forward( # Use the corresponding lm_head for this layer logits = self.code_predictor.lm_head[layer_idx](hidden_state[:, -1:, :]) # [batch, 1, vocab_size] - if len(pos_codes) > 1: - input_ids_for_logits_processors = torch.cat(pos_codes[1:], dim=1).to( - device=logits.device, dtype=torch.long - ) - else: - input_ids_for_logits_processors = self.empty_code - logits = logits_processors(input_ids_for_logits_processors, logits.squeeze(0)).unsqueeze(0) + logits = logits_processors(None, logits[:, -1]) # Sample from the filtered distribution probs = 
F.softmax(logits, dim=-1) @@ -288,7 +282,7 @@ def code_predictor_forward( all_summed_embeddings.append(pos_summed) # Concatenate across positions: [batch, seq_len, hidden_size] - summed_embeddings = torch.cat(all_summed_embeddings, dim=1) + summed_embeddings = torch.cat(all_summed_embeddings, dim=1).squeeze(1) return result_codes, summed_embeddings diff --git a/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py b/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py index 246ea2996e8..a1457a9750b 100644 --- a/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py +++ b/vllm_omni/model_executor/stage_input_processors/qwen3_omni.py @@ -160,7 +160,7 @@ def talker2code2wav( # Process each talker output for i, talker_output in enumerate(talker_outputs): output = talker_output.outputs[0] - seq_len = len(output.token_ids) + seq_len = len(output.token_ids) - 1 # Extract codec codes from talker output # Expected shape: [8, seq_len] (8-layer RVQ codes) codec_codes = ( diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 69729d95429..b0d0e165e08 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -563,11 +563,7 @@ def _dummy_run( ubatch_slices=ubatch_slices, ), ): - if ( - getattr(self.model, "talker", None) is not None - and hasattr(self.model, "talker_mtp") - and num_tokens_padded == 1 - ): + if getattr(self.model, "talker", None) is not None and hasattr(self.model, "talker_mtp"): outputs = self.talker_mtp( self.talker_mtp_input_ids.gpu[:num_tokens_padded], self.talker_mtp_inputs_embeds.gpu[:num_tokens_padded], @@ -884,6 +880,7 @@ def _preprocess( if hasattr(self.model, "has_preprocess") and self.model.has_preprocess: # Overlay custom prompt_embeds per request for the prompt portion; # collect additional_information (tensor/list) for prefill portion only + decode_req_ids = [] for req_index, req_id in enumerate(self.input_batch.req_ids): req_state = 
self.requests.get(req_id) req_infos = getattr(req_state, "additional_information_cpu", None) if req_state is not None else None @@ -897,33 +894,14 @@ def _preprocess( req_input_ids, req_embeds, update_dict = self.model.preprocess( input_ids=input_ids[s:e], input_embeds=inputs_embeds[s:e], **req_infos ) - # run talker mtp decode - if hasattr(self.model, "talker_mtp"): - _cudagraph_mode, batch_desc, _, _ = self._determine_batch_execution_and_padding( - num_tokens=span_len, - num_reqs=1, - num_scheduled_tokens_np=num_scheduled_tokens_np[req_index], - max_num_scheduled_tokens=1, - force_eager=span_len > 1, - use_cascade_attn=False, - ) + if hasattr(self.model, "talker_mtp") and span_len == 1: last_talker_hidden, text_step = update_dict.pop("mtp_inputs") - if _cudagraph_mode != CUDAGraphMode.NONE: - self.talker_mtp_input_ids.gpu[:span_len].copy_(req_input_ids) - self.talker_mtp_inputs_embeds.gpu[:span_len].copy_(req_embeds) - self.last_talker_hidden.gpu[:span_len].copy_(last_talker_hidden) - self.text_step.gpu[:span_len].copy_(text_step) - req_input_ids = self.talker_mtp_input_ids.gpu[:span_len] - req_embeds = self.talker_mtp_inputs_embeds.gpu[:span_len] - last_talker_hidden = self.last_talker_hidden.gpu[:span_len] - text_step = self.text_step.gpu[:span_len] - with set_forward_context( - None, self.vllm_config, cudagraph_runtime_mode=_cudagraph_mode, batch_descriptor=batch_desc - ): - req_embeds, code_predictor_codes = self.talker_mtp( - req_input_ids, req_embeds, last_talker_hidden, text_step - ) - update_dict["code_predictor_codes"] = code_predictor_codes + decode_slice = slice(len(decode_req_ids), len(decode_req_ids) + 1) + self.talker_mtp_input_ids.gpu[decode_slice].copy_(req_input_ids) + self.talker_mtp_inputs_embeds.gpu[decode_slice].copy_(req_embeds) + self.last_talker_hidden.gpu[decode_slice].copy_(last_talker_hidden) + self.text_step.gpu[decode_slice].copy_(text_step) + decode_req_ids.append(req_id) # TODO(Peiqi): the merge stage could move out from the 
critical path self._merge_additional_information_update(req_id, update_dict) @@ -934,6 +912,10 @@ def _preprocess( if isinstance(req_input_ids, torch.Tensor) and req_input_ids.numel() == seg_len: input_ids[s : s + seg_len] = req_input_ids + # run talker mtp decode + if hasattr(self.model, "talker_mtp"): + self._talker_mtp_forward(decode_req_ids, inputs_embeds) + return ( input_ids, inputs_embeds, @@ -943,6 +925,34 @@ def _preprocess( ec_connector_output, ) + def _talker_mtp_forward(self, decode_req_ids: list[str], inputs_embeds: torch.Tensor) -> None: + decode_batch_size = len(decode_req_ids) + if decode_batch_size == 0: + return + _cudagraph_mode, batch_desc, _, _ = self._determine_batch_execution_and_padding( + num_tokens=decode_batch_size, + num_reqs=decode_batch_size, + num_scheduled_tokens_np=np.ones(decode_batch_size, dtype=np.int32), + max_num_scheduled_tokens=1, + use_cascade_attn=False, + ) + req_input_ids = self.talker_mtp_input_ids.gpu[:decode_batch_size] + req_embeds = self.talker_mtp_inputs_embeds.gpu[:decode_batch_size] + last_talker_hidden = self.last_talker_hidden.gpu[:decode_batch_size] + text_step = self.text_step.gpu[:decode_batch_size] + with set_forward_context( + None, self.vllm_config, cudagraph_runtime_mode=_cudagraph_mode, batch_descriptor=batch_desc + ): + req_embeds, code_predictor_codes = self.talker_mtp(req_input_ids, req_embeds, last_talker_hidden, text_step) + # update the inputs_embeds and code_predictor_codes + code_predictor_codes_cpu = code_predictor_codes.detach().to("cpu").contiguous() + for idx, req_id in enumerate(decode_req_ids): + req_index = self.input_batch.req_ids.index(req_id) + start_offset = int(self.query_start_loc.cpu[req_index]) + inputs_embeds[start_offset : start_offset + 1] = req_embeds[idx : idx + 1] + update_dict = {"code_predictor_codes": code_predictor_codes_cpu[idx : idx + 1]} + self._merge_additional_information_update(req_id, update_dict) + def _model_forward( self, input_ids: torch.Tensor | None = None, 
From e2462d23b586dfb859fdadc41503d2f94a7c4894 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Wed, 14 Jan 2026 11:22:25 +0000 Subject: [PATCH 28/59] update generation model runner Signed-off-by: tzhouam --- .../worker/gpu_generation_model_runner.py | 247 +++++++++++++++--- vllm_omni/worker/gpu_model_runner.py | 14 +- 2 files changed, 215 insertions(+), 46 deletions(-) diff --git a/vllm_omni/worker/gpu_generation_model_runner.py b/vllm_omni/worker/gpu_generation_model_runner.py index 57f3985e790..40b26e2ba50 100644 --- a/vllm_omni/worker/gpu_generation_model_runner.py +++ b/vllm_omni/worker/gpu_generation_model_runner.py @@ -5,16 +5,16 @@ """ from __future__ import annotations +from copy import copy import gc import logging - +from typing import Any import numpy as np import torch from vllm.config import CUDAGraphMode -from vllm.multimodal.inputs import MultiModalKwargs from vllm.utils.math_utils import cdiv -from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.core.sched.output import SchedulerOutput, GrammarOutput from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.utils import record_function_or_nullcontext from vllm.v1.worker.gpu_model_runner import ( @@ -30,7 +30,15 @@ from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices from vllm_omni.outputs import OmniModelRunnerOutput from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner - +from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( + RoutedExpertsCapturer, +) +from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer +from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group +from vllm.v1.outputs import make_empty_encoder_model_runner_output +from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices +from vllm.v1.outputs import AsyncModelRunnerOutput +from vllm_omni.worker.gpu_ar_model_runner import ExecuteModelState logger = logging.getLogger(__name__) @@ -48,58 +56,149 @@ def 
execute_model( scheduler_output: SchedulerOutput, intermediate_tensors: IntermediateTensors | None = None, ) -> OmniModelRunnerOutput | IntermediateTensors: - with record_function_or_nullcontext("Preprocess"): - with self.synchronize_input_prep(): - self._update_states(scheduler_output) - if not scheduler_output.total_num_scheduled_tokens: - return EMPTY_MODEL_RUNNER_OUTPUT + if self.execute_model_state is not None: + raise RuntimeError( + "State error: sample_tokens() must be called " + "after execute_model() returns None." + ) - num_reqs = self.input_batch.num_reqs - req_ids = self.input_batch.req_ids - tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids] - num_scheduled_tokens_np = np.array(tokens, dtype=np.int32) - max_num_scheduled_tokens = int(num_scheduled_tokens_np.max()) - num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens + if self.vllm_config.model_config.enable_return_routed_experts: + capturer = RoutedExpertsCapturer.get_instance() + if capturer is not None: + capturer.clear_buffer() # noqa + else: + logger.error("RoutedExpertsCapturer not initialized.") - logits_indices, spec_decode_metadata = self._prepare_inputs( + if scheduler_output.preempted_req_ids and has_kv_transfer_group(): + get_kv_transfer_group().handle_preemptions( + scheduler_output.preempted_req_ids + ) + + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + with ( + record_function_or_nullcontext("gpu_model_runner: preprocess"), + self.synchronize_input_prep(), + ): + self._update_states(scheduler_output) + if not scheduler_output.total_num_scheduled_tokens: + return EMPTY_MODEL_RUNNER_OUTPUT + + if has_ec_transfer() and get_ec_transfer().is_producer: + with self.maybe_get_ec_connector_output( scheduler_output, + encoder_cache=self.encoder_cache, + ) as ec_connector_output: + self._execute_mm_encoder(scheduler_output) + return make_empty_encoder_model_runner_output(scheduler_output) + + if not num_scheduled_tokens: + if ( + 
self.parallel_config.distributed_executor_backend + == "external_launcher" + and self.parallel_config.data_parallel_size > 1 + ): + # this is a corner case when both external launcher + # and DP are enabled, num_scheduled_tokens could be + # 0, and has_unfinished_requests in the outer loop + # returns True. before returning early here we call + # dummy run to ensure coordinate_batch_across_dp + # is called into to avoid out of sync issues. + self._dummy_run(1) + if not has_kv_transfer_group(): + # Return empty ModelRunnerOutput if no work to do. + return EMPTY_MODEL_RUNNER_OUTPUT + + return self.kv_connector_no_forward(scheduler_output, self.vllm_config) + + if self.cache_config.kv_sharing_fast_prefill: + assert not self.num_prompt_logprobs, ( + "--kv-sharing-fast-prefill produces incorrect " + "logprobs for prompt tokens, tokens, please disable " + "it when the requests need prompt logprobs" + ) + num_reqs = self.input_batch.num_reqs + req_ids = self.input_batch.req_ids + tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids] + num_scheduled_tokens_np = np.array(tokens, dtype=np.int32) + max_num_scheduled_tokens = int(num_scheduled_tokens_np.max()) + num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens + + logits_indices, spec_decode_metadata = self._prepare_inputs( + scheduler_output, + num_scheduled_tokens_np, + ) + + cascade_attn_prefix_lens = None + # Disable cascade attention when using microbatching (DBO) + if self.cascade_attn_enabled and not self.parallel_config.use_ubatching: + # Pre-compute cascade attention prefix lengths + cascade_attn_prefix_lens = self._compute_cascade_attn_prefix_lens( num_scheduled_tokens_np, + self.input_batch.num_computed_tokens_cpu[:num_reqs], + scheduler_output.num_common_prefix_blocks, ) + + ( + cudagraph_mode, + batch_desc, + should_ubatch, + num_tokens_across_dp, + cudagraph_stats, + ) = self._determine_batch_execution_and_padding( + num_tokens=num_tokens_unpadded, + num_reqs=num_reqs, + 
num_scheduled_tokens_np=num_scheduled_tokens_np, + max_num_scheduled_tokens=max_num_scheduled_tokens, + use_cascade_attn=cascade_attn_prefix_lens is not None, + num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs), + ) - ( - cudagraph_mode, - batch_desc, - ubatch_slices, - num_tokens_across_dp, - _ - ) = self._determine_batch_execution_and_padding( - num_tokens=num_tokens_unpadded, - num_reqs=num_reqs, - num_scheduled_tokens_np=num_scheduled_tokens_np, - max_num_scheduled_tokens=max_num_scheduled_tokens, - use_cascade_attn=False, - ) + logger.debug( + "Running batch with cudagraph_mode: %s, batch_descriptor: %s, " + "should_ubatch: %s, num_tokens_across_dp: %s", + cudagraph_mode, + batch_desc, + should_ubatch, + num_tokens_across_dp, + ) + + num_tokens_padded = batch_desc.num_tokens + num_reqs_padded = ( + batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs + ) + ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices( + should_ubatch, + num_scheduled_tokens_np, + num_tokens_padded, + num_reqs_padded, + self.parallel_config.num_ubatches, + ) - num_tokens_padded = batch_desc.num_tokens - num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs - use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0 - pad_attn = cudagraph_mode == CUDAGraphMode.FULL + logger.debug( + "ubatch_slices: %s, ubatch_slices_padded: %s", + ubatch_slices, + ubatch_slices_padded, + ) - ( - attn_metadata, - spec_decode_common_attn_metadata, - ) = self._build_attention_metadata( + pad_attn = cudagraph_mode == CUDAGraphMode.FULL + + use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0 + ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices + + attn_metadata, spec_decode_common_attn_metadata = ( + self._build_attention_metadata( num_tokens=num_tokens_unpadded, num_tokens_padded=num_tokens_padded if pad_attn else None, num_reqs=num_reqs, num_reqs_padded=num_reqs_padded if 
pad_attn else None, max_query_len=max_num_scheduled_tokens, - ubatch_slices=ubatch_slices, + ubatch_slices=ubatch_slices_attn, logits_indices=logits_indices, use_spec_decode=use_spec_decode, num_scheduled_tokens=scheduler_output.num_scheduled_tokens, - cascade_attn_prefix_lens=None, + cascade_attn_prefix_lens=cascade_attn_prefix_lens, ) + ) ( input_ids, @@ -114,10 +213,16 @@ def execute_model( intermediate_tensors, ) + # Set cudagraph mode to none if calc_kv_scales is true. + # KV scales calculation involves dynamic operations that are incompatible + # with CUDA graph capture. if self.calculate_kv_scales: cudagraph_mode = CUDAGraphMode.NONE + # Mark KV scales as calculated after the first forward pass self.calculate_kv_scales = False + # Run the model. + # Use persistent buffers for CUDA graphs. with ( set_forward_context( attn_metadata, @@ -126,7 +231,7 @@ def execute_model( num_tokens_across_dp=num_tokens_across_dp, cudagraph_runtime_mode=cudagraph_mode, batch_descriptor=batch_desc, - ubatch_slices=ubatch_slices, + ubatch_slices=ubatch_slices_padded, ), record_function_or_nullcontext("Forward"), self.maybe_get_kv_connector_output(scheduler_output) as kv_connector_output, @@ -141,6 +246,58 @@ def execute_model( ) _, multimodal_outputs = self.extract_multimodal_outputs(outputs) + self.execute_model_state = ExecuteModelState( + scheduler_output, + None, + spec_decode_metadata, + spec_decode_common_attn_metadata, + None, + None, + None, + ec_connector_output, + cudagraph_stats, + multimodal_outputs, + ) + self.kv_connector_output = kv_connector_output + return None + + @torch.inference_mode() + def sample_tokens( + self, + grammar_output: GrammarOutput | None = None, + ) -> OmniModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors: + kv_connector_output = self.kv_connector_output + self.kv_connector_output = None + + if self.execute_model_state is None: + # Nothing to do (PP non-final rank case), output isn't used. 
+ if not kv_connector_output: + return None # type: ignore[return-value] + + # In case of PP with kv transfer, we need to pass through the + # kv_connector_output + if kv_connector_output.is_empty(): + return EMPTY_MODEL_RUNNER_OUTPUT + + output = copy(EMPTY_MODEL_RUNNER_OUTPUT) + output.kv_connector_output = kv_connector_output + return output + + # Unpack ephemeral state. + ( + scheduler_output, + logits, + spec_decode_metadata, + spec_decode_common_attn_metadata, + hidden_states, + sample_hidden_states, + aux_hidden_states, + ec_connector_output, + cudagraph_stats, + multimodal_outputs, + ) = self.execute_model_state + self.execute_model_state = None + pooler_output: list[object] = [] if isinstance(multimodal_outputs, torch.Tensor): assert multimodal_outputs.shape[0] == 1, ( @@ -171,6 +328,10 @@ def execute_model( pooler_output=pooler_output, kv_connector_output=kv_connector_output, num_nans_in_logits={}, + cudagraph_stats=cudagraph_stats, + ec_connector_output=ec_connector_output + if self.supports_mm_inputs + else None, ) if not self.use_async_scheduling: @@ -178,9 +339,11 @@ def execute_model( return AsyncGPUModelRunnerOutput( model_runner_output=output, - sampled_token_ids=[], + sampled_token_ids=torch.tensor([], device=self.device), invalid_req_indices=[], async_output_copy_stream=self.async_output_copy_stream, + vocab_size=self.input_batch.vocab_size, + logprobs_tensors=None, ) def _run_generation_model( @@ -208,7 +371,7 @@ def _run_generation_model( positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, - **MultiModalKwargs.as_kwargs(model_kwargs, device=self.device), + **model_kwargs, sampling_metadata=self.input_batch.sampling_metadata, logits_index=logits_indices, sampler=self.sampler, diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 81103483820..24d4ffd028e 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -881,7 +881,7 @@ 
def _preprocess( num_input_tokens: int, intermediate_tensors: IntermediateTensors | None = None, ): - """Align with v0.12 preprocess and omni's additional information handling.""" + """Align with v0.14.0 preprocess and omni's additional information handling.""" num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens is_first_rank = get_pp_group().is_first_rank is_encoder_decoder = self.model_config.is_encoder_decoder @@ -998,9 +998,15 @@ def _preprocess( span_len = int(e) - int(s) # call the custom process function - req_input_ids, req_embeds, update_dict = self.model.preprocess( - input_ids=input_ids[s:e], input_embeds=inputs_embeds[s:e], **req_infos - ) + try: + req_input_ids, req_embeds, update_dict = self.model.preprocess( + input_ids=input_ids[s:e], input_embeds=inputs_embeds[s:e], **req_infos + ) + except Exception as e: + logger.error(f"Error in preprocess for request {req_id}: {e}") + import traceback + traceback.print_exc() + raise e #TODO: This is Model Specific Code, need to be generalized in the future ZTC # run talker mtp decode if hasattr(self.model, "talker_mtp"): From d89e3c4f3de2ef853b5acfa1ff467bbebd72411a Mon Sep 17 00:00:00 2001 From: tzhouam Date: Wed, 14 Jan 2026 11:42:08 +0000 Subject: [PATCH 29/59] debug GLM-Image Model Signed-off-by: tzhouam --- vllm_omni/diffusion/forward_context.py | 11 +++++- .../diffusion/models/glm_image/__init__.py | 4 +- .../models/glm_image/glm_image_transformer.py | 37 +++++++++++-------- .../models/glm_image/pipeline_glm_image.py | 6 +-- 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/vllm_omni/diffusion/forward_context.py b/vllm_omni/diffusion/forward_context.py index 3c33f8105eb..d898bceaf62 100644 --- a/vllm_omni/diffusion/forward_context.py +++ b/vllm_omni/diffusion/forward_context.py @@ -86,5 +86,14 @@ def set_forward_context( attn_metadata=attn_metadata, split_text_embed_in_sp=split_text_embed_in_sp, ) + # vLLM CustomOp dispatch (e.g. 
QKVParallelLinear) requires a global + # vLLM config set via set_current_vllm_config(). with override_forward_context(forward_context): - yield + if vllm_config is None: + yield + else: + # Local import to avoid importing vllm.config.vllm at module import time. + from vllm.config.vllm import set_current_vllm_config + + with set_current_vllm_config(vllm_config): + yield diff --git a/vllm_omni/diffusion/models/glm_image/__init__.py b/vllm_omni/diffusion/models/glm_image/__init__.py index fc8256d8de6..ac7a98fa743 100644 --- a/vllm_omni/diffusion/models/glm_image/__init__.py +++ b/vllm_omni/diffusion/models/glm_image/__init__.py @@ -9,7 +9,7 @@ from vllm_omni.diffusion.models.glm_image.pipeline_glm_image import ( GlmImagePipeline, get_glm_image_post_process_func, - get_glm_image_pre_process_func, + # get_glm_image_pre_process_func, ) __all__ = [ @@ -17,5 +17,5 @@ "GlmImagePipeline", "GlmImageTransformer2DModel", "get_glm_image_post_process_func", - "get_glm_image_pre_process_func", + # "get_glm_image_pre_process_func", ] diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py index 4aceb4ebfff..d141c40054b 100644 --- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py +++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py @@ -2,12 +2,13 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Iterable -from typing import Any, Enum +from enum import Enum +from typing import Any import torch import torch.nn as nn from diffusers.models.attention import FeedForward -from diffusers.models.embeddings import GlmImageCombinedTimestepSizeEmbeddings +from diffusers.models.transformers.transformer_glm_image import GlmImageCombinedTimestepSizeEmbeddings from diffusers.models.modeling_outputs import Transformer2DModelOutput from vllm.logger import init_logger from vllm.model_executor.layers.linear import QKVParallelLinear @@ -412,8 
+413,10 @@ def forward( # Only apply RoPE to image part (after text_seq_length) query_img = query[:, text_seq_length:, :, :] key_img = key[:, text_seq_length:, :, :] - query_img = self.rope(query_img, cos, sin) - key_img = self.rope(key_img, cos, sin) + from diffusers.models.embeddings import apply_rotary_emb + query_img = apply_rotary_emb(query_img,image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) + # key_img = self.rope(key_img, cos, sin) + key_img = apply_rotary_emb(key_img,image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) query = torch.cat([query[:, :text_seq_length, :, :], query_img], dim=1) key = torch.cat([key[:, :text_seq_length, :, :], key_img], dim=1) @@ -457,7 +460,7 @@ def __init__( # 1. Attention with AdaLN self.norm1 = GlmImageAdaLayerNormZero(time_embed_dim, dim) - self.attn = GlmImageAttention( + self.attn1 = GlmImageAttention( dim=dim, num_heads=num_attention_heads, head_dim=attention_head_dim, @@ -510,7 +513,7 @@ def forward( ) = self.norm1(hidden_states, encoder_hidden_states, temb) # 2. 
Attention - attn_hidden_states, attn_encoder_hidden_states = self.attn( + attn_hidden_states, attn_encoder_hidden_states = self.attn1( hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states, image_rotary_emb=image_rotary_emb, @@ -558,18 +561,20 @@ class GlmImageTransformer2DModel(CachedTransformer): def __init__( self, od_config: OmniDiffusionConfig, - patch_size: int = 2, - in_channels: int = 16, - num_layers: int = 30, - attention_head_dim: int = 40, - num_attention_heads: int = 64, - out_channels: int = 16, - text_embed_dim: int = 1472, - time_embed_dim: int = 512, - condition_dim: int = 256, - prior_vq_quantizer_codebook_size: int = 16384, ): super().__init__() + + patch_size = od_config.tf_model_config.patch_size + in_channels = od_config.tf_model_config.in_channels + out_channels = od_config.tf_model_config.out_channels + num_attention_heads = od_config.tf_model_config.num_attention_heads + attention_head_dim = od_config.tf_model_config.attention_head_dim + time_embed_dim = od_config.tf_model_config.time_embed_dim + condition_dim = od_config.tf_model_config.condition_dim + prior_vq_quantizer_codebook_size = od_config.tf_model_config.prior_vq_quantizer_codebook_size + text_embed_dim = od_config.tf_model_config.text_embed_dim + + # Get num_layers from config if available model_config = od_config.tf_model_config diff --git a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py index 599eb2cdd8e..f582c3b9b69 100644 --- a/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py +++ b/vllm_omni/diffusion/models/glm_image/pipeline_glm_image.py @@ -72,8 +72,8 @@ def get_glm_image_post_process_func(od_config: OmniDiffusionConfig): image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor) - def post_process_func(images: torch.Tensor): - return image_processor.postprocess(images) + def post_process_func(images: PIL.Image.Image): + return images return 
post_process_func @@ -676,7 +676,7 @@ def diffuse( timestep=timestep, target_size=target_size, crop_coords=crop_coords, - kv_caches=kv_caches, + kv_cache=kv_caches, return_dict=False, )[0].float() From e213bdd0a47e3c85c24268df3423d118e05438a9 Mon Sep 17 00:00:00 2001 From: Yuhan Liu <30294295+liuyuhanalex@users.noreply.github.com> Date: Wed, 14 Jan 2026 20:08:23 +0800 Subject: [PATCH 30/59] [BugFix]Remove duplicate error handling for request results (#781) Signed-off-by: Yuhan Liu <30294295+liuyuhanalex@users.noreply.github.com> --- vllm_omni/entrypoints/async_omni.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py index 99f872db7f2..e41569281cf 100644 --- a/vllm_omni/entrypoints/async_omni.py +++ b/vllm_omni/entrypoints/async_omni.py @@ -349,12 +349,6 @@ async def generate(self, *args: Any, **kwargs: dict[str, Any]) -> AsyncGenerator result = await req_state.queue.get() assert stage_id == req_state.stage_id - req_id = result.get("request_id") - if "error" in result: - logger.error( - f"[{self._name}] Stage {stage_id} error on request {req_id}: {result['error']}", - ) - raise RuntimeError(result) # Request Finished due to error req_id = result.get("request_id") if "error" in result: logger.error( From f269e0e453b5a000209fc22b59ce99f6c6bb0e00 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Wed, 14 Jan 2026 12:10:55 +0000 Subject: [PATCH 31/59] remove deleted args from doc string Signed-off-by: tzhouam --- .../models/glm_image/glm_image_transformer.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py index d141c40054b..09f7b17e133 100644 --- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py +++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py @@ -545,17 +545,9 @@ class 
GlmImageTransformer2DModel(CachedTransformer): This is the vllm-omni optimized version of the GLM-Image DiT model. Args: - od_config: OmniDiffusionConfig containing model configuration. - patch_size: Size of image patches. - in_channels: Number of input channels (latent channels). - num_layers: Number of transformer blocks. - attention_head_dim: Dimension of each attention head. - num_attention_heads: Number of attention heads. - out_channels: Number of output channels. - text_embed_dim: Dimension of text embeddings. - time_embed_dim: Dimension of timestep embeddings. - condition_dim: Dimension of conditioning embeddings. - prior_vq_quantizer_codebook_size: Size of prior VQ codebook. + od_config: OmniDiffusionConfig containing model configuration. The + transformer hyper-parameters (e.g. patch size / channels / heads) + are read from `od_config.tf_model_config`. """ def __init__( From 1fe64e85a6653fe586aec57f32c6024b6ab92bdd Mon Sep 17 00:00:00 2001 From: Alicia <115451386+congw729@users.noreply.github.com> Date: Wed, 14 Jan 2026 20:35:26 +0800 Subject: [PATCH 32/59] [CI] Add pytest markers in config files. 
(#719) Signed-off-by: Alicia <115451386+congw729@users.noreply.github.com> --- docs/.nav.yml | 1 - docs/contributing/ci/tests_markers.md | 160 ++++++++ .../contributing/{tests => ci}/tests_style.md | 20 +- .../model/adding_diffusion_model.md | 2 +- docs/contributing/model/adding_omni_model.md | 2 +- pyproject.toml | 31 +- pytest.ini | 3 - tests/utils.py | 377 +++++++++++++++++- tools/pre_commit/check_pickle_imports.py | 1 + 9 files changed, 580 insertions(+), 17 deletions(-) create mode 100644 docs/contributing/ci/tests_markers.md rename docs/contributing/{tests => ci}/tests_style.md (94%) delete mode 100644 pytest.ini diff --git a/docs/.nav.yml b/docs/.nav.yml index 7493e71e8af..911f0fbc9b8 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -46,7 +46,6 @@ nav: - contributing/model/adding_omni_model.md - contributing/model/adding_diffusion_model.md - CI: contributing/ci - - Tests: contributing/tests - Design Documents: - design/index.md - design/architecture_overview.md diff --git a/docs/contributing/ci/tests_markers.md b/docs/contributing/ci/tests_markers.md new file mode 100644 index 00000000000..bf56914f8da --- /dev/null +++ b/docs/contributing/ci/tests_markers.md @@ -0,0 +1,160 @@ +# Markers for Tests + +By adding markers before test functions, tests can later be executed uniformly by simply declaring the corresponding marker type. 
+ +## Current Markers +Defined in `pyproject.toml`: + +| Marker | Description | +| ------------------ | ------------------------------------------------------- | +| `core_model` | Core model tests (run in each PR) | +| `diffusion` | Diffusion model tests | +| `omni` | Omni model tests | +| `cache` | Cache backend tests | +| `parallel` | Parallelism/distributed tests | +| `cpu` | Tests that run on CPU | +| `gpu` | Tests that run on GPU (auto-added) | +| `cuda` | Tests that run on CUDA (auto-added) | +| `rocm` | Tests that run on AMD/ROCm (auto-added) | +| `npu` | Tests that run on NPU/Ascend (auto-added) | +| `H100` | Tests that require H100 GPU | +| `L4` | Tests that require L4 GPU | +| `MI325` | Tests that require MI325 GPU (AMD/ROCm) | +| `A2` | Tests that require A2 NPU | +| `A3` | Tests that require A3 NPU | +| `distributed_cuda` | Tests that require multi cards on CUDA platform | +| `distributed_rocm` | Tests that require multi cards on ROCm platform | +| `distributed_npu` | Tests that require multi cards on NPU platform | +| `skipif_cuda` | Skip if the num of CUDA cards is less than the required | +| `skipif_rocm` | Skip if the num of ROCm cards is less than the required | +| `skipif_npu` | Skip if the num of NPU cards is less than the required | +| `slow` | Slow tests (may skip in quick CI) | +| `benchmark` | Benchmark tests | + +For those markers shown as auto-added, they will be added by the `@hardware_test` decorator. + +### Example usage for markers + +```python +from tests.utils import hardware_test + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test( + res={"cuda": "L4", "rocm": "MI325", "npu": "A2"}, + num_cards=2, +) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_video_to_audio(): + ... +``` +### Decorator: `@hardware_test` + +This decorator is intended to make hardware-aware, cross-platform test authoring easier and more robust for CI/CD environments.
The `hardware_test` decorator in `vllm-omni/tests/utils.py` performs the following actions: + +1. **Applies platform and resource markers** + Adds the appropriate pytest markers for each specified hardware platform (e.g., `cuda`, `rocm`, `npu`) and resource type (e.g., `L4`, `H100`, `MI325`, `A2`, `A3`). + ``` + @pytest.mark.cuda + @pytest.mark.L4 + ``` +2. **Handles multi-card (distributed) scenarios** + For tests requiring multiple cards, it automatically adds distributed markers such as `distributed_cuda`, `distributed_rocm`, or `distributed_npu`. + ``` + @pytest.mark.distributed_cuda(num_cards=num_cards) + ``` +3. **Supports flexible card requirements** + Accepts `num_cards` as either a single integer for all platforms or as a dictionary with per-platform values. If not specified, defaults to 1 card per platform. + +4. **Integrates resource validation** + On CUDA, adds a skip marker (`skipif_cuda`) if the system does not have the required number of devices. + Support for `skipif_rocm` and `skipif_npu` will be implemented later. + + +5. **Runs each test in a new process** + Automatically wraps the distributed test with a decorator (`@create_new_process_for_each_test`) to ensure isolation and compatibility with multi-process hardware backends. + +6. **Works with pytest filtering** + Allows tests to be filtered and selected at runtime using standard pytest marker expressions (e.g., `-m "distributed_cuda and L4"`). 
+ +#### Example usage for decorator +- Single call for multiple platforms: + ```python + @hardware_test( + res={"cuda": "L4", "rocm": "MI325", "npu": "A2"}, + num_cards={"cuda": 2, "rocm": 2, "npu": 2}, + ) + ``` + or + ```python + @hardware_test( + res={"cuda": "L4", "rocm": "MI325", "npu": "A2"}, + num_cards=2, + ) + ``` +- `res` must be a dict; supported resources: CUDA (L4/H100), ROCm (MI325), NPU (A2/A3) +- `num_cards` can be int (all platforms) or dict (per platform); defaults to 1 when missing +- `hardware_test` automatically applies `@create_new_process_for_each_test` for distributed tests. +- Distributed markers (`distributed_cuda`, `distributed_rocm`, `distributed_npu`) are auto-added for multi-card cases +- Filtering examples: + - CUDA only: `pytest -m "distributed_cuda and L4"` + - ROCm only: `pytest -m "distributed_rocm and MI325"` + - NPU only: `pytest -m "distributed_npu"` + +## Add Support for a New Platform + +If you want to add support for a new platform (e.g., "tpu" for a new accelerator), follow these steps: + +1. **Extend the marker list in your pytest config** so that platform/resource markers are defined: + ```toml + # In pyproject.toml or pytest.ini + [tool.pytest.ini_options] + markers = [ + # ... existing markers ... + "tpu: Tests that require TPU device", + "TPU_V3: Tests that require TPU v3 hardware", + "distributed_tpu: Tests that require multiple TPU devices", + ] + ``` +2. **Implement a marker construction function for your platform** in `vllm-omni/tests/utils.py`: + ```python + # In vllm-omni/tests/utils.py + + def tpu_marks(*, res: str, num_cards: int): + test_platform = pytest.mark.tpu + if res == "TPU_V3": + test_resource = pytest.mark.TPU_V3 + else: + raise ValueError( + f"Invalid TPU resource type: {res}. 
Supported: TPU_V3") + + if num_cards == 1: + return [test_platform, test_resource] + else: + test_distributed = pytest.mark.distributed_tpu(num_cards=num_cards) + # Optionally: add skipif_tpu when implemented + return [test_platform, test_resource, test_distributed] + ``` +3. **Update `hardware_test` to recognize your new platform**: + In the relevant place (see the `hardware_test` implementation), add: + ```python + if platform == "tpu": + marks = tpu_marks(res=resource, num_cards=cards) + ``` +4. **(Recommended) Add a test using your new markers**: + ```python + @hardware_test( + res={"tpu": "TPU_V3"}, + num_cards=2, + ) + def test_my_tpu_feature(): + ... + ``` + +**Summary**: +- Add pytest markers for your new platform/resources +- Implement a marker function (`xxx_marks`) +- Plug into `hardware_test` +- You're done: tests decorated with `@hardware_test` using your platform now automatically get the correct markers, distribution, and isolation! + +See code in `vllm-omni/tests/utils.py` for existing examples (`cuda_marks`, `rocm_marks`, `npu_marks`). diff --git a/docs/contributing/tests/tests_style.md b/docs/contributing/ci/tests_style.md similarity index 94% rename from docs/contributing/tests/tests_style.md rename to docs/contributing/ci/tests_style.md index c88e17dee34..65c2b044346 100644 --- a/docs/contributing/tests/tests_style.md +++ b/docs/contributing/ci/tests_style.md @@ -139,7 +139,7 @@ vllm_omni/ tests/ 4. **Documentation**: Add docstrings to all test functions 5. **Environment variables**: Set uniformly in `conftest.py` or at the top of files 6. **Type annotations**: Add type annotations to all test function parameters -7. **Resources**, Using pytest tag to specify the computation resources the test required. +7. **Pytest Markers**: Add necessary markers like `@pytest.mark.core_model` and use `@hardware_test` to declare hardware requirements (see details in [Markers for Tests](../ci/tests_markers.md)).
### Template #### E2E - Online serving @@ -155,6 +155,7 @@ from pathlib import Path import pytest import openai +from tests.utils import hardware_test # Optional: set process start method for workers os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -184,6 +185,12 @@ def base64_encoded_video() -> str: def dummy_messages_from_video_data(video_data_url: str, content_text: str) -> str: xxx +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test( + res={"cuda": "L4", "rocm": "MI325", "npu": "A2"}, + num_cards={"cuda": 2, "rocm": 2, "npu": 4}, +) @pytest.mark.parametrize("omni_server", test_params, indirect=True) def test_video_to_audio( client: openai.OpenAI, @@ -226,6 +233,7 @@ from pathlib import Path import pytest from vllm.assets.video import VideoAsset +from tests.utils import hardware_test from ..multi_stages.conftest import OmniRunner # Optional: set process start method for workers @@ -239,7 +247,12 @@ test_params = [(model, stage_config) for model in models for stage_config in sta # function name: test_{input_modality}_to_{output_modality} # modality candidate: text, image, audio, video, mixed_modalities -@pytest.mark.gpu_mem_high # requires high-memory GPU node +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test( + res={"cuda": "L4", "rocm": "MI325", "npu": "A2"}, + num_cards=2, +) @pytest.mark.parametrize("test_config", test_params) def test_video_to_audio(omni_runner: type[OmniRunner], model: str) -> None: """Offline inference: video input, audio output.""" @@ -263,4 +276,5 @@ def test_video_to_audio(omni_runner: type[OmniRunner], model: str) -> None: 1. The file is saved in an appropriate place and the file name is clear. 2. The coding style follows the requirements outlined above. -3. For e2e model test, please ensure the test is configured under the `./buildkite/` folder. +3. **All test functions have appropriate pytest markers** +4. For tests that need to run in CI, please ensure the test is configured under the `./buildkite/` folder.
diff --git a/docs/contributing/model/adding_diffusion_model.md b/docs/contributing/model/adding_diffusion_model.md index 70fdc6a0817..7eb56d5f5bc 100644 --- a/docs/contributing/model/adding_diffusion_model.md +++ b/docs/contributing/model/adding_diffusion_model.md @@ -140,7 +140,7 @@ Key point for writing the example: + Save or display the generated results so users can validate the integration. # Testing -For comprehensive testing guidelines, please refer to the [Test File Structure and Style Guide](../tests/tests_style.md). +For comprehensive testing guidelines, please refer to the [Test File Structure and Style Guide](../ci/tests_style.md). ## Adding a Model Recipe diff --git a/docs/contributing/model/adding_omni_model.md b/docs/contributing/model/adding_omni_model.md index 2a91a305091..81499118623 100644 --- a/docs/contributing/model/adding_omni_model.md +++ b/docs/contributing/model/adding_omni_model.md @@ -572,7 +572,7 @@ def talker2code2wav( ## Testing -For comprehensive testing guidelines, please refer to the [Test File Structure and Style Guide](../tests/tests_style.md). +For comprehensive testing guidelines, please refer to the [Test File Structure and Style Guide](../ci/tests_style.md). 
## Adding a Model Recipe diff --git a/pyproject.toml b/pyproject.toml index 209a085bf87..4833b117487 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -151,11 +151,34 @@ addopts = [ "--cov-report=xml", ] markers = [ - "unit: Unit tests", - "integration: Integration tests", + # ci/cd required + "core_model: Core model tests (run in each PR)", + # function module markers + "diffusion: Diffusion model tests", + "omni: Omni model tests", + "cache: Cache backend tests", + "parallel: Parallelism/distributed tests", + # platform markers + "cpu: Tests that run on CPU", + "gpu: Tests that run on GPU (auto-added)", + "cuda: Tests that run on CUDA (auto-added)", + "rocm: Tests that run on AMD/ROCm (auto-added)", + "npu: Tests that run on NPU/Ascend (auto-added)", + # specified computation resources marks (auto-added) + "H100: Tests that require H100 GPU", + "L4: Tests that require L4 GPU", + "MI325: Tests that require MI325 GPU (AMD/ROCm)", + "A2: Tests that require A2 NPU", + "A3: Tests that require A3 NPU", + "distributed_cuda: Tests that require multi cards on CUDA platform", + "distributed_rocm: Tests that require multi cards on ROCm platform", + "distributed_npu: Tests that require multi cards on NPU platform", + "skipif_cuda: Skip if the num of CUDA cards is less than the required", + "skipif_rocm: Skip if the num of ROCm cards is less than the required", + "skipif_npu: Skip if the num of NPU cards is less than the required", + # more detailed markers + "slow: Slow tests (may skip in quick CI)", "benchmark: Benchmark tests", - "slow: Slow tests", - "core_model: enable this model test in each PR instead of only nightly", ] [tool.typos.default] diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index 8fb4beb9755..00000000000 --- a/pytest.ini +++ /dev/null @@ -1,3 +0,0 @@ -[pytest] -markers = - gpu_mem_high: needs high VRAM diff --git a/tests/utils.py b/tests/utils.py index aba734501eb..2a2dca238a8 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,11 
+1,24 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +# Some functions are copied from vllm/tests/utils.py +import functools import os +import signal +import subprocess +import sys +import tempfile import time -from contextlib import contextmanager +from collections.abc import Callable +from contextlib import ExitStack, contextmanager, suppress +from typing import Any, Literal +import cloudpickle +import pytest +from typing_extensions import ParamSpec from vllm.platforms import current_platform +from vllm.utils.torch_utils import cuda_device_count_stateless + +_P = ParamSpec("_P") if current_platform.is_rocm(): from amdsmi import ( @@ -90,10 +103,16 @@ def wait_for_gpu_memory_to_clear( print("") if threshold_bytes is not None: - is_free = lambda used, total: used <= threshold_bytes / 2**30 # noqa E731 + + def is_free(used, total): + return used <= threshold_bytes / 2**30 # noqa E731 + threshold = f"{threshold_bytes / 2**30} GiB" else: - is_free = lambda used, total: used / total <= threshold_ratio # noqa E731 + + def is_free(used, total): + return used / total <= threshold_ratio # noqa E731 + threshold = f"{threshold_ratio:.2f}" dur_s = time.time() - start_time @@ -105,3 +124,353 @@ def wait_for_gpu_memory_to_clear( raise ValueError(f"Memory of devices {devices=} not free after {dur_s=:.02f} ({threshold=})") time.sleep(5) + + +def fork_new_process_for_each_test(func: Callable[_P, None]) -> Callable[_P, None]: + """Decorator to fork a new process for each test function. + See https://github.com/vllm-project/vllm/issues/7053 for more details. + """ + + @functools.wraps(func) + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: + # Make the process the leader of its own process group + # to avoid sending SIGTERM to the parent process + os.setpgrp() + from _pytest.outcomes import Skipped + + # Create a unique temporary file to store exception info from child + # process. 
Use test function name and process ID to avoid collisions. + with ( + tempfile.NamedTemporaryFile( + delete=False, mode="w+b", prefix=f"vllm_test_{func.__name__}_{os.getpid()}_", suffix=".exc" + ) as exc_file, + ExitStack() as delete_after, + ): + exc_file_path = exc_file.name + delete_after.callback(os.remove, exc_file_path) + + pid = os.fork() + print(f"Fork a new process to run a test {pid}") + if pid == 0: + # Parent process responsible for deleting, don't delete + # in child. + delete_after.pop_all() + try: + func(*args, **kwargs) + except Skipped as e: + # convert Skipped to exit code 0 + print(str(e)) + os._exit(0) + except Exception as e: + import traceback + + tb_string = traceback.format_exc() + + # Try to serialize the exception object first + exc_to_serialize: dict[str, Any] + try: + # First, try to pickle the actual exception with + # its traceback. + exc_to_serialize = {"pickled_exception": e} + # Test if it can be pickled + cloudpickle.dumps(exc_to_serialize) + except (Exception, KeyboardInterrupt): + # Fall back to string-based approach. + exc_to_serialize = { + "exception_type": type(e).__name__, + "exception_msg": str(e), + "traceback": tb_string, + } + try: + with open(exc_file_path, "wb") as f: + cloudpickle.dump(exc_to_serialize, f) + except Exception: + # Fallback: just print the traceback. 
+ print(tb_string) + os._exit(1) + else: + os._exit(0) + else: + pgid = os.getpgid(pid) + _pid, _exitcode = os.waitpid(pid, 0) + # ignore SIGTERM signal itself + old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN) + # kill all child processes + os.killpg(pgid, signal.SIGTERM) + # restore the signal handler + signal.signal(signal.SIGTERM, old_signal_handler) + if _exitcode != 0: + # Try to read the exception from the child process + exc_info = {} + if os.path.exists(exc_file_path): + with suppress(Exception), open(exc_file_path, "rb") as f: + exc_info = cloudpickle.load(f) + + if (original_exception := exc_info.get("pickled_exception")) is not None: + # Re-raise the actual exception object if it was + # successfully pickled. + assert isinstance(original_exception, Exception) + raise original_exception + + if (original_tb := exc_info.get("traceback")) is not None: + # Use string-based traceback for fallback case + raise AssertionError( + f"Test {func.__name__} failed when called with" + f" args {args} and kwargs {kwargs}" + f" (exit code: {_exitcode}):\n{original_tb}" + ) from None + + # Fallback to the original generic error + raise AssertionError( + f"function {func.__name__} failed when called with" + f" args {args} and kwargs {kwargs}" + f" (exit code: {_exitcode})" + ) from None + + return wrapper + + +def spawn_new_process_for_each_test(f: Callable[_P, None]) -> Callable[_P, None]: + """Decorator to spawn a new process for each test function.""" + + @functools.wraps(f) + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None: + # Check if we're already in a subprocess + if os.environ.get("RUNNING_IN_SUBPROCESS") == "1": + # If we are, just run the function directly + return f(*args, **kwargs) + + import torch.multiprocessing as mp + + with suppress(RuntimeError): + mp.set_start_method("spawn") + + # Get the module + module_name = f.__module__ + + # Create a process with environment variable set + env = os.environ.copy() + 
env["RUNNING_IN_SUBPROCESS"] = "1" + + with tempfile.TemporaryDirectory() as tempdir: + output_filepath = os.path.join(tempdir, "new_process.tmp") + + # `cloudpickle` allows pickling complex functions directly + input_bytes = cloudpickle.dumps((f, output_filepath)) + + cmd = [sys.executable, "-m", f"{module_name}"] + + returned = subprocess.run(cmd, input=input_bytes, capture_output=True, env=env) + + # check if the subprocess is successful + try: + returned.check_returncode() + except Exception as e: + # wrap raised exception to provide more information + raise RuntimeError(f"Error raised in subprocess:\n{returned.stderr.decode()}") from e + + return wrapper + + +def create_new_process_for_each_test( + method: Literal["spawn", "fork"] | None = None, +) -> Callable[[Callable[_P, None]], Callable[_P, None]]: + """Creates a decorator that runs each test function in a new process. + + Args: + method: The process creation method. Can be either "spawn" or "fork". + If not specified, it defaults to "spawn" on ROCm and XPU + platforms and "fork" otherwise. + + Returns: + A decorator to run test functions in separate processes. + """ + if method is None: + # TODO: Spawn is not working correctly on ROCm + # The test content will not run and tests passed immediately. + # For now, using `fork` for ROCm as it can run with `fork` + # and tests are running correctly. + use_spawn = current_platform.is_xpu() + method = "spawn" if use_spawn else "fork" + + assert method in ["spawn", "fork"], "Method must be either 'spawn' or 'fork'" + + if method == "fork": + return fork_new_process_for_each_test + + return spawn_new_process_for_each_test + + +def cuda_marks(*, res: str, num_cards: int): + """ + Get a collection of pytest marks to apply for `@cuda_test`. + + Args: + res: Resource type, e.g., "L4" or "H100". + num_cards: Number of GPU cards required. + + Returns: + List of pytest marks to apply. 
+ """ + test_platform_detail = pytest.mark.cuda + + if res == "L4": + test_resource = pytest.mark.L4 + elif res == "H100": + test_resource = pytest.mark.H100 + else: + raise ValueError(f"Invalid CUDA resource type: {res}. Supported: L4, H100") + + marks = [test_resource, test_platform_detail] + + if num_cards == 1: + return marks + else: + test_distributed = pytest.mark.distributed_cuda(num_cards=num_cards) + test_skipif = pytest.mark.skipif_cuda( + cuda_device_count_stateless() < num_cards, + reason=f"Need at least {num_cards} CUDA GPUs to run the test.", + ) + return marks + [test_distributed, test_skipif] + + +def rocm_marks(*, res: str, num_cards: int): + """ + Get a collection of pytest marks to apply for `@rocm_test`. + + Args: + res: Resource type, e.g., "MI325". + num_cards: Number of GPU cards required. + + Returns: + List of pytest marks to apply. + """ + test_platform_detail = pytest.mark.rocm + + if res == "MI325": + test_resource = pytest.mark.MI325 + else: + raise ValueError(f"Invalid ROCm resource type: {res}. Supported: MI325") + + marks = [test_resource, test_platform_detail] + + if num_cards == 1: + return marks + else: + test_distributed = pytest.mark.distributed_rocm(num_cards=num_cards) + # TODO: add ROCm support for `skipif_rocm` marker + return marks + [test_distributed] + + +def gpu_marks(*, res: str, num_cards: int): + """ + Get a collection of pytest marks to apply for `@gpu_test`. + Platform is automatically determined based on resource type. + + Args: + res: Resource type, e.g., "L4", "H100" for CUDA, or "MI325" for ROCm. + num_cards: Number of GPU cards required. + + Returns: + List of pytest marks to apply. + """ + test_platform = pytest.mark.gpu + if res in ("L4", "H100"): + return [test_platform] + cuda_marks(res=res, num_cards=num_cards) + if res == "MI325": + return [test_platform] + rocm_marks(res=res, num_cards=num_cards) + raise ValueError(f"Invalid resource type: {res}. 
Supported: L4, H100, MI325") + + +def npu_marks(*, res: str, num_cards: int): + """Get a collection of pytest marks to apply for `@npu_test`.""" + test_platform = pytest.mark.npu + if res == "A2": + test_resource = pytest.mark.A2 + elif res == "A3": + test_resource = pytest.mark.A3 + else: + # TODO: Currently we don't have various NPU card types defined + # Use None to skip resource-specific marking for unknown types + test_resource = None + + if num_cards == 1: + return [mark for mark in [test_platform, test_resource] if mark is not None] + else: + # Multiple cards scenario needs distributed_npu mark + test_distributed = pytest.mark.distributed_npu(num_cards=num_cards) + # TODO: add NPU support for `skipif_npu` marker + return [mark for mark in [test_platform, test_resource, test_distributed] if mark is not None] + + +def hardware_test(*, res: dict[str, str], num_cards: int | dict[str, int] = 1): + """ + Decorate a test for multiple hardware platforms with a single call. + Automatically wraps the test with @create_new_process_for_each_test() for distributed tests. + + Args: + res: Mapping from platform to resource type. Supported platforms/resources: + - cuda: L4, H100 + - rocm: MI325 + - npu: A2, A3 + num_cards: Number of cards required. Can be: + - int: same card count for all platforms (default: 1) + - dict: per-platform card count, e.g., {"cuda": 2, "rocm": 2} + + Example: + @hardware_test( + res={"cuda": "L4", "rocm": "MI325", "npu": "A2"}, + num_cards={"cuda": 2, "rocm": 2, "npu": 2}, + ) + def test_multi_platform(): + ... 
+ """ + # Validate platforms + # Don't validate platform details in this decorator + for platform, _ in res.items(): + if platform not in ("cuda", "rocm", "npu"): + raise ValueError(f"Unsupported platform: {platform}") + + # Normalize num_cards + if isinstance(num_cards, int): + num_cards_dict = {platform: num_cards for platform in res.keys()} + else: + num_cards_dict = num_cards + for platform in num_cards_dict.keys(): + if platform not in res: + raise ValueError( + f"Platform '{platform}' in num_cards but not in res. Available platforms: {list(res.keys())}" + ) + for platform in res.keys(): + if platform not in num_cards_dict: + num_cards_dict[platform] = 1 + + # Collect marks from all platforms + all_marks: list[Callable[[Callable[_P, None]], Callable[_P, None]]] = [] + for platform, resource in res.items(): + cards = num_cards_dict[platform] + if platform == "cuda" or platform == "rocm": + marks = gpu_marks(res=resource, num_cards=cards) + elif platform == "npu": + marks = npu_marks(res=resource, num_cards=cards) + else: + raise ValueError(f"Unsupported platform: {platform}") + all_marks.extend(marks) + + create_new_process_flag = False + for cards in num_cards_dict.values(): + if cards > 1: + create_new_process_flag = True + break + + def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: + if create_new_process_flag: + # only for distributed tests + func = create_new_process_for_each_test()(f) + else: + func = f + for mark in reversed(all_marks): + func = mark(func) + return func + + return wrapper diff --git a/tools/pre_commit/check_pickle_imports.py b/tools/pre_commit/check_pickle_imports.py index 562999d7e58..db45f29900d 100644 --- a/tools/pre_commit/check_pickle_imports.py +++ b/tools/pre_commit/check_pickle_imports.py @@ -18,6 +18,7 @@ ALLOWED_FILES = { "vllm_omni/entrypoints/omni_llm.py", "tests/e2e/offline_inference/utils.py", + "tests/utils.py", "vllm_omni/diffusion/distributed/group_coordinator.py", 
"tests/diffusion/attention/test_sequence_parallel.py", } From 314bb4e324c9b8a6f7442d5d8f7bb1e82e663160 Mon Sep 17 00:00:00 2001 From: Alicia <115451386+congw729@users.noreply.github.com> Date: Thu, 15 Jan 2026 10:38:27 +0800 Subject: [PATCH 33/59] [Doc] Fix mkdocs. (#785) Signed-off-by: Alicia <115451386+congw729@users.noreply.github.com> --- docs/.nav.yml | 2 +- docs/design/architecture_overview.md | 16 ++++++++-------- .../examples/online_serving/gradio_demo.md | 7 ------- ...ompletion_client_for_multimodal_generation.md | 7 ------- mkdocs.yml | 4 ++++ 5 files changed, 13 insertions(+), 23 deletions(-) delete mode 100644 docs/user_guide/examples/online_serving/gradio_demo.md delete mode 100644 docs/user_guide/examples/online_serving/openai_chat_completion_client_for_multimodal_generation.md diff --git a/docs/.nav.yml b/docs/.nav.yml index 911f0fbc9b8..be930637175 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -3,7 +3,7 @@ nav: - User Guide: - Getting Started: - getting_started/quickstart.md - - getting_started/installation + - getting_started/installation/* - Serving: - OpenAI-Compatible API: - Image Generation: serving/image_generation_api.md diff --git a/docs/design/architecture_overview.md b/docs/design/architecture_overview.md index ea7eff4397d..16f81ab1594 100644 --- a/docs/design/architecture_overview.md +++ b/docs/design/architecture_overview.md @@ -64,13 +64,13 @@ According to analysis for current popular open-source models, most of them have ## Key Components -| Component | Description | -|-----------|-------------| -| **OmniRouter** | provide an intelligent router for Omni-modality requests dispatch | -| **EntryPoints** | define the APIs for offline/online serving (APIServer, Omni/AsyncOmni) and provide the OmniStage abstraction for different AR/DiT stages | -| **AR** | adapted for omni-modality models while inheriting efficient features from vLLM, such as cache management | -| **Diffusion** | natively implemented and optimized using acceleration 
components | -| **OmniConnector** | supports fully disaggregation based on E/P/D/G (Encoding/Processing/Decoding/Generation) disaggregation across stages | +| Component | Description | +| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | +| **OmniRouter** | provide an intelligent router for Omni-modality requests dispatch | +| **EntryPoints** | define the APIs for offline/online serving (APIServer, Omni/AsyncOmni) and provide the OmniStage abstraction for different AR/DiT stages | +| **AR** | adapted for omni-modality models while inheriting efficient features from vLLM, such as cache management | +| **Diffusion** | natively implemented and optimized using acceleration components | +| **OmniConnector** | supports fully disaggregation based on E/P/D/G (Encoding/Processing/Decoding/Generation) disaggregation across stages | Disaggregated stages are managed through configuration, such as in the Qwen3-Omni example, where stages like Thinker, Talker, and Code2wav are defined as separate OmniStage instances with specific resources and input/output type. @@ -192,4 +192,4 @@ curl -sS -X POST http://localhost:8091/v1/chat/completions \ } ``` -For more usages, please refer to [examples](../user_guide/examples/). +For more usages, please refer to [examples](../examples/README.md). diff --git a/docs/user_guide/examples/online_serving/gradio_demo.md b/docs/user_guide/examples/online_serving/gradio_demo.md deleted file mode 100644 index 38278d9cf5a..00000000000 --- a/docs/user_guide/examples/online_serving/gradio_demo.md +++ /dev/null @@ -1,7 +0,0 @@ -# Gradio Demo - -Source . 
- -``````py ---8<-- "examples/online_serving/gradio_demo.py" -`````` diff --git a/docs/user_guide/examples/online_serving/openai_chat_completion_client_for_multimodal_generation.md b/docs/user_guide/examples/online_serving/openai_chat_completion_client_for_multimodal_generation.md deleted file mode 100644 index ca3fa8306b3..00000000000 --- a/docs/user_guide/examples/online_serving/openai_chat_completion_client_for_multimodal_generation.md +++ /dev/null @@ -1,7 +0,0 @@ -# OpenAI Chat Completion Client For Multimodal Generation - -Source . - -``````py ---8<-- "examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py" -`````` diff --git a/mkdocs.yml b/mkdocs.yml index 71cfe030569..1e8e38f5104 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -60,6 +60,10 @@ hooks: - docs/mkdocs/hooks/url_schemes.py - docs/mkdocs/hooks/generate_examples.py +# Exclude include files from navigation warnings +exclude_docs: | + **/*.inc.md + # Plugins plugins: - meta From 2d5faf3634d2dc9e9285b1d0e05f6b633a73ec5e Mon Sep 17 00:00:00 2001 From: Samit <285365963@qq.com> Date: Thu, 15 Jan 2026 11:18:28 +0800 Subject: [PATCH 34/59] [Bugfix] Fix generation artifacts of Qwen-Image-Edit-2511 and update pipeline DiT param parsing (#776) Signed-off-by: samithuang <285365963@qq.com> --- .../image_to_image/image_edit.py | 16 +++++- .../image_to_image/openai_chat_client.py | 2 +- .../text_to_image/openai_chat_client.py | 2 +- vllm_omni/diffusion/diffusion_engine.py | 1 + .../models/qwen_image/pipeline_qwen_image.py | 4 +- .../qwen_image/pipeline_qwen_image_edit.py | 4 +- .../pipeline_qwen_image_edit_plus.py | 5 +- .../qwen_image/pipeline_qwen_image_layered.py | 15 ++---- .../qwen_image/qwen_image_transformer.py | 9 ++-- vllm_omni/diffusion/utils/tf_utils.py | 54 +++++++++++++++++++ 10 files changed, 89 insertions(+), 23 deletions(-) create mode 100644 vllm_omni/diffusion/utils/tf_utils.py diff --git a/examples/offline_inference/image_to_image/image_edit.py 
b/examples/offline_inference/image_to_image/image_edit.py index 8f4dbeef98e..5d2b1052bec 100644 --- a/examples/offline_inference/image_to_image/image_edit.py +++ b/examples/offline_inference/image_to_image/image_edit.py @@ -55,7 +55,15 @@ --prompt "Edit description" \ --cfg_parallel_size 2 \ --num_inference_steps 50 \ - --cfg_scale 4.0 \ + --cfg_scale 4.0 + +Usage (disable torch.compile): + python image_edit.py \ + --image input.png \ + --prompt "Edit description" \ + --enforce_eager \ + --num_inference_steps 50 \ + --cfg_scale 4.0 For more options, run: python image_edit.py --help @@ -260,6 +268,11 @@ def parse_args() -> argparse.Namespace: choices=[1, 2], help="Number of GPUs used for classifier free guidance parallel size.", ) + parser.add_argument( + "--enforce_eager", + action="store_true", + help="Disable torch.compile and force eager execution.", + ) return parser.parse_args() @@ -321,6 +334,7 @@ def main(): cache_backend=args.cache_backend, cache_config=cache_config, parallel_config=parallel_config, + enforce_eager=args.enforce_eager, ) print("Pipeline loaded") diff --git a/examples/online_serving/image_to_image/openai_chat_client.py b/examples/online_serving/image_to_image/openai_chat_client.py index 14bec8a3be4..0fe4b0edece 100644 --- a/examples/online_serving/image_to_image/openai_chat_client.py +++ b/examples/online_serving/image_to_image/openai_chat_client.py @@ -127,7 +127,7 @@ def main(): parser.add_argument("--width", type=int, default=1024, help="Output image width") parser.add_argument("--steps", type=int, default=50, help="Inference steps") parser.add_argument("--guidance", type=float, default=7.5, help="Guidance scale") - parser.add_argument("--seed", type=int, help="Random seed") + parser.add_argument("--seed", type=int, default=0, help="Random seed") parser.add_argument("--negative", help="Negative prompt") args = parser.parse_args() diff --git a/examples/online_serving/text_to_image/openai_chat_client.py 
b/examples/online_serving/text_to_image/openai_chat_client.py index c529bf203fd..39fa7dc22b7 100644 --- a/examples/online_serving/text_to_image/openai_chat_client.py +++ b/examples/online_serving/text_to_image/openai_chat_client.py @@ -100,7 +100,7 @@ def main(): parser.add_argument("--width", type=int, default=1024, help="Image width") parser.add_argument("--steps", type=int, default=50, help="Inference steps") parser.add_argument("--cfg-scale", type=float, default=4.0, help="True CFG scale") - parser.add_argument("--seed", type=int, default=42, help="Random seed") + parser.add_argument("--seed", type=int, default=0, help="Random seed") parser.add_argument("--negative", help="Negative prompt") args = parser.parse_args() diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index 64184caa749..1948b40a3b4 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -293,6 +293,7 @@ def add_req_and_wait_for_response(self, requests: list[OmniDiffusionRequest]): def _dummy_run(self): """A dummy run to warm up the model.""" prompt = "dummy run" + # Note that num_inference_steps=1 will cause timestep and temb to be None in the pipeline num_inference_steps = 1 height = 1024 width = 1024 diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py index 6401b526a5f..87c7c2f73c2 100644 --- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py +++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image.py @@ -36,6 +36,7 @@ QwenImageTransformer2DModel, ) from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs from vllm_omni.model_executor.model_loader.weight_utils import ( download_weights_from_hf_specific, ) @@ -274,7 +275,8 @@ def __init__( self.vae = AutoencoderKLQwenImage.from_pretrained(model, subfolder="vae",
local_files_only=local_files_only).to( self.device ) - self.transformer = QwenImageTransformer2DModel(od_config=od_config) + transformer_kwargs = get_transformer_config_kwargs(od_config.tf_model_config, QwenImageTransformer2DModel) + self.transformer = QwenImageTransformer2DModel(od_config=od_config, **transformer_kwargs) self.tokenizer = Qwen2Tokenizer.from_pretrained(model, subfolder="tokenizer", local_files_only=local_files_only) diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py index 53609c87757..901625e34b6 100644 --- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py +++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit.py @@ -38,6 +38,7 @@ QwenImageTransformer2DModel, ) from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs from vllm_omni.model_executor.model_loader.weight_utils import ( download_weights_from_hf_specific, ) @@ -231,7 +232,8 @@ def __init__( self.vae = AutoencoderKLQwenImage.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to( self.device ) - self.transformer = QwenImageTransformer2DModel(od_config=od_config) + transformer_kwargs = get_transformer_config_kwargs(od_config.tf_model_config, QwenImageTransformer2DModel) + self.transformer = QwenImageTransformer2DModel(od_config=od_config, **transformer_kwargs) self.tokenizer = Qwen2Tokenizer.from_pretrained(model, subfolder="tokenizer", local_files_only=local_files_only) self.processor = Qwen2VLProcessor.from_pretrained( model, subfolder="processor", local_files_only=local_files_only diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py index 80e48112c71..6df502d48f0 100644 --- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py +++ 
b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_edit_plus.py @@ -41,6 +41,7 @@ QwenImageTransformer2DModel, ) from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs from vllm_omni.model_executor.model_loader.weight_utils import ( download_weights_from_hf_specific, ) @@ -191,7 +192,9 @@ def __init__( self.vae = AutoencoderKLQwenImage.from_pretrained(model, subfolder="vae", local_files_only=local_files_only).to( self.device ) - self.transformer = QwenImageTransformer2DModel(od_config=od_config) + + transformer_kwargs = get_transformer_config_kwargs(od_config.tf_model_config, QwenImageTransformer2DModel) + self.transformer = QwenImageTransformer2DModel(od_config=od_config, **transformer_kwargs) self.tokenizer = Qwen2Tokenizer.from_pretrained(model, subfolder="tokenizer", local_files_only=local_files_only) self.processor = Qwen2VLProcessor.from_pretrained( model, subfolder="processor", local_files_only=local_files_only diff --git a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py index eb5e2aa987a..4642b3eb418 100644 --- a/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py +++ b/vllm_omni/diffusion/models/qwen_image/pipeline_qwen_image_layered.py @@ -37,6 +37,7 @@ QwenImageTransformer2DModel, ) from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs from vllm_omni.model_executor.model_loader.weight_utils import ( download_weights_from_hf_specific, ) @@ -211,18 +212,8 @@ def __init__( ) ] - use_additional_t_cond = od_config.tf_model_config.use_additional_t_cond - zero_cond_t = od_config.tf_model_config.zero_cond_t - use_layer3d_rope = od_config.tf_model_config.use_layer3d_rope - guidance_embeds = od_config.tf_model_config.guidance_embeds - - self.transformer = 
QwenImageTransformer2DModel( - od_config=od_config, - use_additional_t_cond=use_additional_t_cond, - zero_cond_t=zero_cond_t, - use_layer3d_rope=use_layer3d_rope, - guidance_embeds=guidance_embeds, - ) + transformer_kwargs = get_transformer_config_kwargs(od_config.tf_model_config, QwenImageTransformer2DModel) + self.transformer = QwenImageTransformer2DModel(od_config=od_config, **transformer_kwargs) # Pipeline configuration & processing parameters self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8 diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py index 2aa9c29104d..472806aa778 100644 --- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py +++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py @@ -604,7 +604,7 @@ def forward( txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1) # Each [B, 3*dim] # Process image stream - norm1 + modulation - img_modulated, img_gate1 = self.img_norm1(hidden_states, img_mod1) + img_modulated, img_gate1 = self.img_norm1(hidden_states, img_mod1, modulate_index) # Process text stream - norm1 + modulation txt_modulated, txt_gate1 = self.txt_norm1(encoder_hidden_states, txt_mod1) @@ -632,7 +632,8 @@ def forward( encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output # Process image stream - norm2 + MLP - img_modulated2, img_gate2 = self.img_norm2(hidden_states, img_mod2) + img_modulated2, img_gate2 = self.img_norm2(hidden_states, img_mod2, modulate_index) + img_mlp_output = self.img_mlp(img_modulated2) hidden_states = hidden_states + img_gate2 * img_mlp_output @@ -692,15 +693,13 @@ def __init__( attention_head_dim: int = 128, num_attention_heads: int = 24, joint_attention_dim: int = 3584, - guidance_embeds: bool = False, # TODO: this should probably be removed + guidance_embeds: bool = False, axes_dims_rope: tuple[int, int, int] = (16, 56, 56), 
zero_cond_t: bool = False, use_additional_t_cond: bool = False, use_layer3d_rope: bool = False, ): super().__init__() - model_config = od_config.tf_model_config - num_layers = model_config.num_layers self.parallel_config = od_config.parallel_config self.in_channels = in_channels self.out_channels = out_channels or in_channels diff --git a/vllm_omni/diffusion/utils/tf_utils.py b/vllm_omni/diffusion/utils/tf_utils.py new file mode 100644 index 00000000000..44a78804452 --- /dev/null +++ b/vllm_omni/diffusion/utils/tf_utils.py @@ -0,0 +1,54 @@ +import inspect +from typing import Any + +from vllm_omni.diffusion.data import TransformerConfig + + +def get_transformer_config_kwargs( + tf_model_config: TransformerConfig, model_class: type[Any] | None = None +) -> dict[str, Any]: + """ + This function extracts parameters from a TransformerConfig instance and filters out internal + diffusers metadata keys (those starting with '_') that should not be passed to model initialization. + Also filters out parameters that are not accepted by the model's __init__ method (e.g., pooled_projection_dim + for QwenImageTransformer2DModel). + + This uses inspect.signature to dynamically detect accepted parameters, making it general for any model class. + Similar to how diffusers' @register_to_config decorator works. + + Args: + tf_model_config: TransformerConfig instance containing model parameters + model_class: Optional model class to inspect for accepted __init__ parameters. + If None, all non-internal parameters are returned (backward compatibility). 
+ + Returns: + dict: Filtered dictionary of parameters suitable for transformer model initialization + """ + # Extract transformer config parameters, filtering out internal diffusers metadata + # TransformerConfig stores params in a 'params' dict, and we need to exclude + # internal keys like '_class_name' and '_diffusers_version' + tf_config_params = tf_model_config.to_dict() + + # Filter out internal diffusers metadata keys that start with '_' + filtered_params = {k: v for k, v in tf_config_params.items() if not k.startswith("_")} + + # If model_class is provided, use inspect.signature to get accepted parameters + if model_class is not None: + try: + # Get the signature of the model's __init__ method + sig = inspect.signature(model_class.__init__) + # Get all parameter names (excluding 'self' and special parameters) + accepted_params = { + name + for name, param in sig.parameters.items() + if name != "self" and param.kind != inspect.Parameter.VAR_KEYWORD # Exclude **kwargs + } + + # Filter to only include parameters that are in the model's signature + filtered_params = {k: v for k, v in filtered_params.items() if k in accepted_params} + except (TypeError, AttributeError): + # If inspection fails, fall back to returning all non-internal params + # This maintains backward compatibility + pass + + return filtered_params From 2773996597789257f5c4062fd76c016a293e7a9b Mon Sep 17 00:00:00 2001 From: tzhouam Date: Thu, 15 Jan 2026 08:51:44 +0000 Subject: [PATCH 35/59] disable async scheduling for generation models, avoiding inconsistency from race condition Signed-off-by: tzhouam --- vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml | 7 ++++--- vllm_omni/worker/gpu_generation_model_runner.py | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml index c63dc563815..d4de078231a 100644 --- 
a/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen3_omni_moe.yaml @@ -16,7 +16,7 @@ stage_args: worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler gpu_memory_utilization: 0.6 - enforce_eager: false + enforce_eager: true trust_remote_code: true engine_output_type: latent # Output hidden states for talker distributed_executor_backend: "mp" @@ -46,8 +46,8 @@ stage_args: model_arch: Qwen3OmniMoeForConditionalGeneration worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler - gpu_memory_utilization: 0.3 - enforce_eager: false + gpu_memory_utilization: 0.35 + enforce_eager: true trust_remote_code: true engine_output_type: latent # Output codec codes for code2wav # tensor_parallel_size: 2 @@ -80,6 +80,7 @@ stage_args: scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler enforce_eager: true trust_remote_code: true + async_scheduling: false enable_prefix_caching: false engine_output_type: audio # Final output: audio waveform gpu_memory_utilization: 0.1 diff --git a/vllm_omni/worker/gpu_generation_model_runner.py b/vllm_omni/worker/gpu_generation_model_runner.py index 40b26e2ba50..03ecd98f96d 100644 --- a/vllm_omni/worker/gpu_generation_model_runner.py +++ b/vllm_omni/worker/gpu_generation_model_runner.py @@ -266,13 +266,15 @@ def sample_tokens( self, grammar_output: GrammarOutput | None = None, ) -> OmniModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors: + # NOTE: Even though the model is non-autoregressive, we still need this function + # to match the interface of the engine core. kv_connector_output = self.kv_connector_output self.kv_connector_output = None if self.execute_model_state is None: # Nothing to do (PP non-final rank case), output isn't used.
if not kv_connector_output: - return None # type: ignore[return-value] + return EMPTY_MODEL_RUNNER_OUTPUT # In case of PP with kv transfer, we need to pass through the # kv_connector_output From b9ea7541e5df6513ef003660fc3df5af0cc13e9c Mon Sep 17 00:00:00 2001 From: tzhouam Date: Thu, 15 Jan 2026 08:52:17 +0000 Subject: [PATCH 36/59] Update Qwen 3 Omni Signed-off-by: tzhouam --- .../qwen3_omni/qwen3_omni_moe_talker.py | 2 +- .../qwen3_omni/qwen3_omni_moe_thinker.py | 457 ++++++++++++------ 2 files changed, 322 insertions(+), 137 deletions(-) diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py index 4e8730eab52..14cae177e3b 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py @@ -111,7 +111,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() talker_config: Qwen3OmniMoeTalkerConfig = vllm_config.model_config.hf_config talker_config.text_config.rope_parameters = talker_config.text_config.rope_scaling - talker_config.text_config.rope_parameters["rope_theta"] = talker_config.text_config.rope_theta + talker_config.text_config.rope_parameters["rope_theta"] = talker_config.text_config.rope_parameters["rope_theta"] self.quant_config = vllm_config.quant_config self.prefix = prefix self.vllm_config = vllm_config diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py index 7f3320a82eb..86a00a95400 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py @@ -21,7 +21,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen3-Omni-Moe model (thinker part).""" - +from vllm.model_executor.models.module_mapping import MultiModelKeys from collections.abc import Iterable, Mapping, Sequence from functools import partial from typing import Any @@ -37,7 +37,7 @@ Qwen3OmniMoeConfig, Qwen3OmniMoeThinkerConfig, ) -from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( +from vllm.model_executor.models.qwen3_omni_moe_thinker import ( Qwen3OmniMoeAudioEncoder, ) from transformers.models.qwen3_omni_moe.processing_qwen3_omni_moe import ( @@ -83,7 +83,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataItems -from vllm.multimodal.processing import ( +from vllm.multimodal.processing.processor import ( MultiModalPromptUpdates, PlaceholderFeaturesInfo, PromptReplacement, @@ -170,15 +170,23 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = Qwen3MoeLLMModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) + self.model = Qwen3MoeLLMModel( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + self.lm_head = ParallelLMHead( + config.vocab_size, config.hidden_size, quant_config=quant_config + ) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) -class Qwen3OmniMoeThinkerProcessingInfo(Qwen2AudioProcessingInfo, Qwen2_5_VLProcessingInfo): +class Qwen3OmniMoeThinkerProcessingInfo( + Qwen2AudioProcessingInfo, 
Qwen2_5_VLProcessingInfo +): def get_hf_config(self): return self.ctx.get_hf_config(Qwen3OmniMoeConfig).thinker_config @@ -248,11 +256,42 @@ def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray: # https://github.com/huggingface/transformers/pull/41473 mm_kwargs = dict(mm_kwargs) tok_kwargs = dict(tok_kwargs) + mm_kwargs["audio_kwargs"] = dict(mm_kwargs.get("audio_kwargs") or {}) + mm_kwargs["text_kwargs"] = dict(mm_kwargs.get("text_kwargs") or {}) if Version(TRANSFORMERS_VERSION) < Version("4.58.0"): + # Extract audio_sample_rate before restructuring + audio_sample_rate = mm_kwargs.pop("audio_sample_rate", None) + # move truncation to audio_kwargs level to avoid conflict # with tok_kwargs - mm_kwargs["audio_kwargs"] = {"truncation": mm_kwargs.pop("truncation", False)} - mm_kwargs["text_kwargs"] = {"truncation": tok_kwargs.pop("truncation", False)} + mm_kwargs["audio_kwargs"].setdefault( + "truncation", mm_kwargs.pop("truncation", False) + ) + mm_kwargs["text_kwargs"].setdefault( + "truncation", tok_kwargs.pop("truncation", False) + ) + + # Validate and conditionally pass audio_sample_rate + # WhisperFeatureExtractor has a fixed sampling rate, and vLLM's + # audio loader already resamples audio to the target rate. + # Only pass the value if it matches to avoid unexpected behavior. + if audio_sample_rate is not None: + expected_sr = feature_extractor.sampling_rate + if audio_sample_rate != expected_sr: + logger.warning( + "[%s] audio_sample_rate mismatch: user provided %dHz " + "but model expects %dHz. Ignoring user value. 
" + "vLLM's audio loader already resampled to %dHz.", + self.__class__.__name__, + audio_sample_rate, + expected_sr, + expected_sr, + ) + else: + # Sample rate matches, safe to pass + mm_kwargs["audio_kwargs"]["audio_sample_rate"] = ( + audio_sample_rate + ) hf_inputs = super()._call_hf_processor( prompt=prompt, @@ -270,12 +309,18 @@ def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray: for _, audio in enumerate(audios): audio_length = len(audio[0]) if isinstance(audio, tuple) else len(audio) num_frame = ( - (audio_length // hop_length) if audio_length % hop_length == 0 else (audio_length // hop_length - 1) + (audio_length // hop_length) + if audio_length % hop_length == 0 + else (audio_length // hop_length - 1) ) if mm_kwargs.get("truncation", False): - num_frame = min(num_frame, feature_extractor.n_samples // hop_length) + num_frame = min( + num_frame, feature_extractor.n_samples // hop_length + ) audio_num_frames.append(num_frame) - hf_inputs["feature_attention_mask"] = [torch.ones(num_frame) for num_frame in audio_num_frames] + hf_inputs["feature_attention_mask"] = [ + torch.ones(num_frame) for num_frame in audio_num_frames + ] hf_inputs["audio_feature_lengths"] = torch.tensor(audio_num_frames) return hf_inputs @@ -313,13 +358,17 @@ def _maybe_apply_prompt_updates( ) else: if use_audio_in_video and "audio" in mm_prompt_updates: - filtered_updates = {k: v for k, v in mm_prompt_updates.items() if k != "audio"} + filtered_updates = { + k: v for k, v in mm_prompt_updates.items() if k != "audio" + } prompt_ids, mm_placeholders = self._apply_prompt_updates( prompt_ids, filtered_updates, ) # Derive audio placeholders from video placeholders - mm_placeholders = self._derive_audio_from_video_placeholders(mm_placeholders, mm_prompt_updates) + mm_placeholders = self._derive_audio_from_video_placeholders( + mm_placeholders, mm_prompt_updates + ) else: prompt_ids, mm_placeholders = self._apply_prompt_updates( prompt_ids, @@ -356,12 +405,19 @@ def 
get_updates_use_audio_in_video( video_token_indices, (video_token_indices.shape[0], height, width) ).reshape(-1) video_token_indices = ( - (video_token_indices + shift) * next(iter([video_second_per_grid_t])) * position_id_per_seconds + (video_token_indices + shift) + * next(iter([video_second_per_grid_t])) + * position_id_per_seconds ) video_data_index, audio_data_index = 0, 0 updates = [audio_start_token_id] - while video_data_index < len(video_token_indices) and audio_data_index < len(audio_token_indices): - if video_token_indices[video_data_index] <= audio_token_indices[audio_data_index]: + while video_data_index < len(video_token_indices) and audio_data_index < len( + audio_token_indices + ): + if ( + video_token_indices[video_data_index] + <= audio_token_indices[audio_data_index] + ): updates += [video_token_id] video_data_index += 1 else: @@ -398,11 +454,13 @@ def _get_prompt_updates( if audio_feature_lengths is None and feature_attention_mask is None: audio_output_lengths = [] elif audio_feature_lengths is not None: - _, audio_output_lens = _get_feat_extract_output_lengths(audio_feature_lengths) + audio_output_lens = _get_feat_extract_output_lengths(audio_feature_lengths) audio_output_lengths = audio_output_lens.tolist() elif feature_attention_mask is not None: assert isinstance(feature_attention_mask, torch.Tensor) - _, audio_output_lens = _get_feat_extract_output_lengths(feature_attention_mask.sum(-1)) + audio_output_lens = _get_feat_extract_output_lengths( + feature_attention_mask.sum(-1) + ) audio_output_lengths = audio_output_lens.tolist() # number of audios read from video. 
@@ -420,7 +478,8 @@ def get_replacement_qwen2_audio(item_idx: int): audios = mm_items.get_items("audio", AudioProcessorItems) audio = audios.get(item_idx) raise ValueError( - f"The audio {audio} (len={len(audio)}) is too short to be represented inside the model" + f"The audio {audio} (len={len(audio)}) is too short " + "to be represented inside the model" ) return [audio_token_id] * num_features @@ -438,7 +497,9 @@ def get_replacement_qwen2_vision(item_idx: int, modality: str): def get_replacement_qwen2_use_audio_in_video(item_idx: int): nonlocal audio_in_video_item_idx - audio_num_features = audio_output_lengths[audio_in_video_item_idx + item_idx] + audio_num_features = audio_output_lengths[ + audio_in_video_item_idx + item_idx + ] video_grid_thw = out_mm_data["video_grid_thw"][item_idx] audio_in_video_item_idx += 1 @@ -455,7 +516,9 @@ def get_replacement_qwen2_use_audio_in_video(item_idx: int): video_grid_thw=video_grid_thw, video_second_per_grid_t=video_second_per_grid_t, ) - return PromptUpdateDetails.select_token_id(placeholder, embed_token_id=video_token_id) + return PromptUpdateDetails.select_token_id( + placeholder, embed_token_id=video_token_id + ) video_replacement_fn = ( get_replacement_qwen2_use_audio_in_video @@ -498,7 +561,8 @@ def _derive_audio_from_video_placeholders( num_audios = len(mm_prompt_updates.get("audio", [])) if num_audios != num_videos: raise ValueError( - f"use_audio_in_video requires equal number of audio and video items, got {num_audios=}, {num_videos=}" + f"use_audio_in_video requires equal number of audio and video items, " + f"got {num_audios=}, {num_videos=}" ) tokenizer = self.info.get_tokenizer() @@ -554,7 +618,11 @@ def _get_raw_input_ids( end = i break if end is not None: - result = result[:start] + [vision_bos_token, video_token, vision_eos_token] + result[end + 2 :] + result = ( + result[:start] + + [vision_bos_token, video_token, vision_eos_token] + + result[end + 2 :] + ) else: break @@ -569,61 +637,22 @@ def 
_get_raw_input_ids( class Qwen3OmniMoeConditionalGenerationMixin(Qwen2_5OmniConditionalGenerationMixin): - def _parse_and_validate_audio_input(self, **kwargs: object) -> Qwen2_5OmniAudioFeatureInputs | None: - input_audio_features = kwargs.pop("input_audio_features", None) - audio_feature_lengths = kwargs.pop("audio_feature_lengths", None) - feature_attention_mask = kwargs.pop("feature_attention_mask", None) - if input_audio_features is None: - return None - if ( - input_audio_features is not None - and isinstance(input_audio_features, torch.Tensor) - and input_audio_features.ndim == 3 - ): - # (batch_size, feature_dim, chunk_size) -> (feature_dim, batch_size * chunk_size) - input_audio_features = input_audio_features.permute(1, 0, 2).flatten(1) - elif input_audio_features is not None and isinstance(input_audio_features, list): - input_audio_features = torch.cat(input_audio_features, dim=-1) - if ( - audio_feature_lengths is not None - and isinstance(audio_feature_lengths, torch.Tensor) - and audio_feature_lengths.ndim == 2 - ): - audio_feature_lengths = audio_feature_lengths.reshape(-1) - elif audio_feature_lengths is not None and isinstance(audio_feature_lengths, list): - audio_feature_lengths = torch.cat(audio_feature_lengths, dim=-1) - if ( - feature_attention_mask is not None - and isinstance(feature_attention_mask, torch.Tensor) - and feature_attention_mask.ndim == 3 - ): - feature_attention_mask = feature_attention_mask.reshape(-1, feature_attention_mask.shape[-1]) - elif feature_attention_mask is not None and isinstance(feature_attention_mask, list): - for i in range(len(feature_attention_mask)): - feature_attention_mask[i] = feature_attention_mask[i].reshape(-1) - return Qwen2_5OmniAudioFeatureInputs( - type="audio_features", - input_features=input_audio_features, - audio_feature_lengths=audio_feature_lengths, - feature_attention_mask=feature_attention_mask, - ) - def _process_audio_input( self, audio_input: Qwen2_5OmniAudioFeatureInputs, audio_hashes: 
list[str] | None = None, cached_audio_features: torch.Tensor | None = None, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, ...]: input_features = audio_input["input_features"] audio_feature_lengths = audio_input["audio_feature_lengths"] - audio_feat_lengths, audio_output_lengths = _get_feat_extract_output_lengths(audio_feature_lengths) + audio_output_lengths = _get_feat_extract_output_lengths(audio_feature_lengths) - audio_outputs = self.audio_tower( + audio_features = self.audio_tower( input_features.to(self.audio_tower.dtype), feature_lens=audio_feature_lengths, + aftercnn_lens=audio_output_lengths, ) - audio_features = audio_outputs.last_hidden_state return audio_features.split(audio_output_lengths.tolist()) @@ -639,8 +668,6 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( SupportsMRoPE, Qwen3OmniMoeConditionalGenerationMixin, ): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "thinker.lm_head.": "language_model.lm_head.", @@ -649,6 +676,18 @@ class Qwen3OmniMoeThinkerForConditionalGeneration( } ) + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: if modality.startswith("image"): @@ -663,26 +702,19 @@ def get_placeholder_str(cls, modality: str, i: int) -> str | None: def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.vllm_config = vllm_config # needed for torch compile forward context - thinker_config: Qwen3OmniMoeThinkerConfig = vllm_config.model_config.hf_config + thinker_config: Qwen3OmniMoeThinkerConfig = ( + vllm_config.model_config.hf_config + ) quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = thinker_config self.multimodal_config = multimodal_config - # force "use_flash_attention_2=True" to audio tower to align - # the results. 
- if flash_attn is not None: - audio_config = thinker_config.audio_config - audio_config._attn_implementation_autoset = True - audio_config._attn_implementation = "flash_attention_2" - else: - logger.warning( - "flash_attn is not available, the model may not yield the " - "exactly same result as the transformers implementation " - "in the audio tower part." - ) - - self.audio_tower = Qwen3OmniMoeAudioEncoder(thinker_config.audio_config) + self.audio_tower = Qwen3OmniMoeAudioEncoder( + thinker_config.audio_config, + multimodal_config=multimodal_config, + prefix=maybe_prefix(prefix, "audio_tower"), + ) self.visual = Qwen3Omni_VisionTransformer( vision_config=thinker_config.vision_config, @@ -694,15 +726,23 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.quant_config = quant_config self.language_model = Qwen3MoeLLMForCausalLM( - vllm_config=vllm_config.with_hf_config(thinker_config.text_config, architectures=["Qwen3MoeForCausalLM"]), + vllm_config=vllm_config.with_hf_config( + thinker_config.text_config, architectures=["Qwen3MoeForCausalLM"] + ), prefix=maybe_prefix(prefix, "language_model"), ) - self.make_empty_intermediate_tensors = self.language_model.make_empty_intermediate_tensors + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors + ) - self.use_deepstack = hasattr(thinker_config.vision_config, "deepstack_visual_indexes") + self.use_deepstack = hasattr( + thinker_config.vision_config, "deepstack_visual_indexes" + ) self.deepstack_num_level = ( - len(thinker_config.vision_config.deepstack_visual_indexes) if self.use_deepstack else 0 + len(thinker_config.vision_config.deepstack_visual_indexes) + if self.use_deepstack + else 0 ) # register buffer for deepstack self.deepstack_input_embeds = ( @@ -723,7 +763,9 @@ def _get_deepstack_input_embeds(self, num_tokens: int) -> IntermediateTensors: # get deepstack_input_embeds from buffer, and clear the buffer return IntermediateTensors( { - 
f"deepstack_input_embeds_{idx}": self.deepstack_input_embeds[idx][:num_tokens] + f"deepstack_input_embeds_{idx}": self.deepstack_input_embeds[idx][ + :num_tokens + ] for idx in range(self.deepstack_num_level) } ) @@ -742,7 +784,9 @@ def _set_deepstack_input_embeds(self, deepstack_input_embeds: torch.Tensor) -> N for _ in range(self.deepstack_num_level) ] for idx in range(self.deepstack_num_level): - self.deepstack_input_embeds[idx][:num_tokens].copy_(deepstack_input_embeds[idx]) + self.deepstack_input_embeds[idx][:num_tokens].copy_( + deepstack_input_embeds[idx] + ) def _clear_deepstack_input_embeds(self, num_tokens: int) -> None: # clear deepstack_input_embeds in buffer @@ -756,12 +800,27 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: # Preserve the order of modalities if there are multiple of them # from the order of kwargs. for input_key in kwargs: - if input_key in ("pixel_values", "image_embeds") and "image" not in mm_input_by_modality: - mm_input_by_modality["image"] = self._parse_and_validate_image_input(**kwargs) - if input_key in ("pixel_values_videos", "video_embeds") and "video" not in mm_input_by_modality: - mm_input_by_modality["video"] = self._parse_and_validate_video_input(**kwargs) - if input_key in ("input_audio_features") and "audio" not in mm_input_by_modality: - mm_input_by_modality["audio"] = self._parse_and_validate_audio_input(**kwargs) + if ( + input_key in ("pixel_values", "image_embeds") + and "image" not in mm_input_by_modality + ): + mm_input_by_modality["image"] = self._parse_and_validate_image_input( + **kwargs + ) + if ( + input_key in ("pixel_values_videos", "video_embeds") + and "video" not in mm_input_by_modality + ): + mm_input_by_modality["video"] = self._parse_and_validate_video_input( + **kwargs + ) + if ( + input_key in ("input_audio_features") + and "audio" not in mm_input_by_modality + ): + mm_input_by_modality["audio"] = self._parse_and_validate_audio_input( + **kwargs + ) return 
mm_input_by_modality def get_language_model(self) -> torch.nn.Module: @@ -810,13 +869,14 @@ def embed_input_ids( return inputs_embeds deepstack_input_embeds = None - # TODO (ywang96): support overlapping modalitiy embeddings so that - # `use_audio_in_video` will work on V1. # split the feat dim to obtain multi-scale visual feature has_vision_embeddings = [ - embeddings.shape[-1] != self.config.text_config.hidden_size for embeddings in multimodal_embeddings + embeddings.shape[-1] != self.config.text_config.hidden_size + for embeddings in multimodal_embeddings ] - if self.visual.deepstack_visual_indexes is not None and any(has_vision_embeddings): + if self.visual.deepstack_visual_indexes is not None and any( + has_vision_embeddings + ): multiscale_len = len(self.visual.deepstack_visual_indexes) multimodal_embeddings_multiscale = [] is_vision = torch.zeros_like(is_multimodal) @@ -824,13 +884,17 @@ def embed_input_ids( mm_position_idx = 0 for index, embeddings in enumerate(multimodal_embeddings): num_tokens = embeddings.shape[0] - current_positions = mm_positions[mm_position_idx : mm_position_idx + num_tokens] + current_positions = mm_positions[ + mm_position_idx : mm_position_idx + num_tokens + ] # Vision embeddings if embeddings.shape[-1] != self.config.text_config.hidden_size: visual_dim = embeddings.shape[-1] // (multiscale_len + 1) multi_dim = visual_dim * multiscale_len - embeddings_main, embeddings_multiscale = torch.split(embeddings, [visual_dim, multi_dim], dim=-1) + embeddings_main, embeddings_multiscale = torch.split( + embeddings, [visual_dim, multi_dim], dim=-1 + ) multimodal_embeddings[index] = embeddings_main multimodal_embeddings_multiscale.append(embeddings_multiscale) is_vision[current_positions] = True @@ -850,7 +914,9 @@ def embed_input_ids( is_multimodal=is_vision, ) deepstack_input_embeds = ( - deepstack_input_embeds.view(inputs_embeds.shape[0], multiscale_len, visual_dim) + deepstack_input_embeds.view( + inputs_embeds.shape[0], multiscale_len, 
visual_dim + ) .permute(1, 0, 2) .contiguous() ) @@ -934,8 +1000,12 @@ def get_mrope_input_positions( audio_feature_lengths = kwargs.get("audio_feature_lengths", []) use_audio_in_video = any(kwargs.get("use_audio_in_video", [])) - image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(image_grid_thw) - video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)(video_grid_thw) + image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)( + image_grid_thw + ) + video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)( + video_grid_thw + ) input_ids = torch.tensor(input_tokens) if input_ids is None or input_ids.ndim != 1: @@ -944,11 +1014,16 @@ def get_mrope_input_positions( seq_len = input_ids.shape[0] if isinstance(audio_feature_lengths, list): - audio_feature_lengths = torch.tensor(audio_feature_lengths, dtype=torch.long) + audio_feature_lengths = torch.tensor( + audio_feature_lengths, dtype=torch.long + ) if not len(second_per_grid_ts) and len(video_grid_thw): second_per_grid_ts = 2.0 - second_per_grids = torch.ones(len(video_grid_thw), dtype=torch.float32) * second_per_grid_ts + second_per_grids = ( + torch.ones(len(video_grid_thw), dtype=torch.float32) + * second_per_grid_ts + ) else: second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32) @@ -961,7 +1036,9 @@ def get_mrope_input_positions( audio_start_token_id = config.audio_start_token_id position_id_per_seconds = config.position_id_per_seconds - vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1) + vision_start_indices = torch.argwhere( + input_ids == vision_start_token_id + ).squeeze(1) if vision_start_indices.numel() > 0: vision_tokens = input_ids[vision_start_indices + 1] else: @@ -980,7 +1057,11 @@ def get_mrope_input_positions( video_idx = 0 audio_idx = 0 remain_images, remain_videos, remain_audios = image_nums, video_nums, audio_nums # noqa: E501 - multimodal_nums = image_nums + audio_nums if use_audio_in_video else 
image_nums + video_nums + audio_nums # noqa: E501 + multimodal_nums = ( + image_nums + audio_nums + if use_audio_in_video + else image_nums + video_nums + audio_nums + ) # noqa: E501 for _ in range(multimodal_nums): st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 @@ -1000,28 +1081,55 @@ def get_mrope_input_positions( text_len = min_ed - st if text_len != 0: st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long) + .view(1, -1) + .expand(3, -1) + + st_idx + ) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 bos_len = 1 - llm_pos_ids_list.append(torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) + llm_pos_ids_list.append( + torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - _, audio_len = _get_feat_extract_output_lengths(audio_feature_lengths[audio_idx]) - llm_pos_ids = torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx + audio_len = _get_feat_extract_output_lengths( + audio_feature_lengths[audio_idx] + ) + llm_pos_ids = ( + torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) llm_pos_ids_list.append(llm_pos_ids) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 eos_len = 1 - llm_pos_ids_list.append(torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) + llm_pos_ids_list.append( + torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) st += text_len + bos_len + audio_len + eos_len audio_idx += 1 remain_audios -= 1 - elif min_ed == ed_vision_start and input_ids[ed_vision_start + 1] == image_token_id: + elif ( + min_ed == ed_vision_start + and input_ids[ed_vision_start + 1] == image_token_id + ): text_len = 
min_ed - st if text_len != 0: st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long) + .view(1, -1) + .expand(3, -1) + + st_idx + ) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 bos_len = 1 - llm_pos_ids_list.append(torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) + llm_pos_ids_list.append( + torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 grid_t = image_grid_thw[image_idx][0] grid_hs = image_grid_thw[:, 1] @@ -1034,7 +1142,10 @@ def get_mrope_input_positions( llm_pos_ids_list.append(llm_pos_ids) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 eos_len = 1 - llm_pos_ids_list.append(torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) + llm_pos_ids_list.append( + torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) st += text_len + bos_len + image_len + eos_len image_idx += 1 remain_images -= 1 @@ -1046,15 +1157,27 @@ def get_mrope_input_positions( text_len = min_ed - st if text_len != 0: st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long) + .view(1, -1) + .expand(3, -1) + + st_idx + ) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 bos_len = 1 - llm_pos_ids_list.append(torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) + llm_pos_ids_list.append( + torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 grid_t = video_grid_thw[video_idx][0] grid_hs = 
video_grid_thw[:, 1] grid_ws = video_grid_thw[:, 2] - t_index = torch.arange(grid_t) * float(second_per_grids[video_idx].item()) * position_id_per_seconds + t_index = ( + torch.arange(grid_t) + * float(second_per_grids[video_idx].item()) + * position_id_per_seconds + ) llm_pos_ids = get_llm_pos_ids_for_vision( st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws ) @@ -1062,46 +1185,95 @@ def get_mrope_input_positions( llm_pos_ids_list.append(llm_pos_ids) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 eos_len = 1 - llm_pos_ids_list.append(torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) + llm_pos_ids_list.append( + torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) st += text_len + bos_len + video_len + eos_len video_idx += 1 remain_videos -= 1 - elif min_ed == ed_vision_start and ed_vision_start + 1 == ed_audio_start and use_audio_in_video: + elif ( + min_ed == ed_vision_start + and ed_vision_start + 1 == ed_audio_start + and use_audio_in_video + ): text_len = min_ed - st if text_len != 0: st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long) + .view(1, -1) + .expand(3, -1) + + st_idx + ) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 bos_len = 1 - bos_block = torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx + bos_block = ( + torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) llm_pos_ids_list.append(bos_block) llm_pos_ids_list.append(bos_block) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - _, audio_len = _get_feat_extract_output_lengths(audio_feature_lengths[audio_idx]) - audio_llm_pos_ids = torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx + audio_len = 
_get_feat_extract_output_lengths( + audio_feature_lengths[audio_idx] + ) + audio_llm_pos_ids = ( + torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) grid_t = video_grid_thw[video_idx][0] grid_hs = video_grid_thw[:, 1] grid_ws = video_grid_thw[:, 2] - t_index = torch.arange(grid_t) * float(second_per_grids[video_idx].item()) * position_id_per_seconds + t_index = ( + torch.arange(grid_t) + * float(second_per_grids[video_idx].item()) + * position_id_per_seconds + ) video_llm_pos_ids = get_llm_pos_ids_for_vision( st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws ) video_data_index, audio_data_index = 0, 0 - while video_data_index < video_llm_pos_ids.shape[-1] and audio_data_index < audio_llm_pos_ids.shape[-1]: - if video_llm_pos_ids[0][video_data_index] <= audio_llm_pos_ids[0][audio_data_index]: - llm_pos_ids_list.append(video_llm_pos_ids[:, video_data_index : video_data_index + 1]) + while ( + video_data_index < video_llm_pos_ids.shape[-1] + and audio_data_index < audio_llm_pos_ids.shape[-1] + ): + if ( + video_llm_pos_ids[0][video_data_index] + <= audio_llm_pos_ids[0][audio_data_index] + ): + llm_pos_ids_list.append( + video_llm_pos_ids[ + :, video_data_index : video_data_index + 1 + ] + ) video_data_index += 1 else: - llm_pos_ids_list.append(audio_llm_pos_ids[:, audio_data_index : audio_data_index + 1]) + llm_pos_ids_list.append( + audio_llm_pos_ids[ + :, audio_data_index : audio_data_index + 1 + ] + ) audio_data_index += 1 if video_data_index < video_llm_pos_ids.shape[-1]: - llm_pos_ids_list.append(video_llm_pos_ids[:, video_data_index : video_llm_pos_ids.shape[-1]]) + llm_pos_ids_list.append( + video_llm_pos_ids[ + :, video_data_index : video_llm_pos_ids.shape[-1] + ] + ) if audio_data_index < audio_llm_pos_ids.shape[-1]: - llm_pos_ids_list.append(audio_llm_pos_ids[:, audio_data_index : audio_llm_pos_ids.shape[-1]]) + llm_pos_ids_list.append( + audio_llm_pos_ids[ + :, audio_data_index : audio_llm_pos_ids.shape[-1] + 
] + ) video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 eos_len = 1 - eos_block = torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx + eos_block = ( + torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) llm_pos_ids_list.append(eos_block) llm_pos_ids_list.append(eos_block) st += text_len + bos_len * 2 + audio_len + video_len + eos_len * 2 # noqa: E501 @@ -1113,7 +1285,10 @@ def get_mrope_input_positions( if st < len(input_tokens): st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 text_len = len(input_tokens) - st - llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) + llm_pos_ids_list.append( + torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + + st_idx + ) llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) if llm_positions.shape[1] != seq_len: @@ -1121,3 +1296,13 @@ def get_mrope_input_positions( mrope_position_delta = llm_positions.max() + 1 - seq_len return llm_positions, mrope_position_delta + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="visual.merger", + tower_model=["visual.", "audio_tower."], + ) From aaea77a180fcb9775b285f4440ec2f910bb5d738 Mon Sep 17 00:00:00 2001 From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Date: Thu, 15 Jan 2026 06:14:34 -0700 Subject: [PATCH 37/59] [bugfix] Fix Wan2.2 I2V warmup failure by adding support_image_input attribute (#791) Signed-off-by: linyueqian --- vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py | 3 ++- vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py 
b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py index d72afc9ee84..f3172e3d90d 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py @@ -18,6 +18,7 @@ from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader +from vllm_omni.diffusion.models.interface import SupportImageInput from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import ( create_transformer_from_config, load_transformer_config, @@ -112,7 +113,7 @@ def pre_process_func(requests: list[OmniDiffusionRequest]) -> list[OmniDiffusion return pre_process_func -class Wan22I2VPipeline(nn.Module): +class Wan22I2VPipeline(nn.Module, SupportImageInput): """ Wan2.2 Image-to-Video Pipeline. diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py index bee70a7a96b..5351419ba68 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py @@ -31,6 +31,7 @@ from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader +from vllm_omni.diffusion.models.interface import SupportImageInput from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import ( create_transformer_from_config, load_transformer_config, @@ -102,7 +103,7 @@ def pre_process_func(requests: list[OmniDiffusionRequest]) -> list[OmniDiffusion return pre_process_func -class Wan22TI2VPipeline(nn.Module): +class Wan22TI2VPipeline(nn.Module, SupportImageInput): """ Wan2.2 Text-Image-to-Video (TI2V) Pipeline. 
From 4203bfa3505fea67730816787676f1a16cca0486 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Thu, 15 Jan 2026 22:22:16 +0800 Subject: [PATCH 38/59] [Misc] add wechat group and star history on README (#801) Signed-off-by: David Chen <530634352@qq.com> --- README.md | 6 +++++- docs/assets/WeChat.jpg | Bin 0 -> 231347 bytes 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 docs/assets/WeChat.jpg diff --git a/README.md b/README.md index 829d683c3c3..fd2fed53469 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Easy, fast, and cheap omni-modality model serving for everyone

-| Documentation | User Forum | Developer Slack | +| Documentation | User Forum | Developer Slack | WeChat |

--- @@ -70,6 +70,10 @@ Please check out [Contributing to vLLM-Omni](https://vllm-omni.readthedocs.io/en ## Join the Community Feel free to ask questions, provide feedbacks and discuss with fellow users of vLLM-Omni in `#sig-omni` slack channel at [slack.vllm.ai](https://slack.vllm.ai) or vLLM user forum at [discuss.vllm.ai](https://discuss.vllm.ai). +## Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=vllm-project/vllm-omni&type=date&legend=top-left)](https://www.star-history.com/#vllm-project/vllm-omni&type=date&legend=top-left) + ## License Apache License 2.0, as found in the [LICENSE](./LICENSE) file. diff --git a/docs/assets/WeChat.jpg b/docs/assets/WeChat.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5a63afde85c697f3c8d8b2f25d0d3e647e28ed4b GIT binary patch literal 231347 zcmeFYb#NU|l0Wza7Bj;WGc#F|#Y`46GfPj*%*+aV zE-v=sV*j}6=vNup)mizfv#auxmHlVw&lUhhT3kvT00ssCfPEZ*KRe)pQX(SyUzHTZ zrDVnab^)9j@KL}~003J%XONPFFtLWF7V%%azZXMeCx^e&|Dyho`>^nrbO2zM;a^<; zzhl9gm^vAMG`jjYok1U+e<*A6L1UQz8}0rVZS-$+^j|bcMM>nNP2LAhZT_EVqyI#^ zIXgIiv>E=Zy|Dx6FM92RR@;n;Jya{AmabQGfD>lnnM5poRxp@$nyXIU} zQ4;__{{;ZRY5!f(f86_zXYqgWU%5Z-0e<{X`;WsMU;{7(5CfzDb^v1l;|IkGUGh>;o%YC0r;OFAfdp(p|9|!W;Shj|eC5}-pGN{-oXfo$*cusw;1Hl1Y0AvV#T4lORjF08GI>xPhHup zt$_A)rmN{wHvxDBaV*}&j7yu|;<^eYWo0UuvhuW#$_jyQq-@%^dFXQlRp&QGM%col zTRl~3p|Sl`+7>U?c(g7<#O$j3rY0RYpZ;0Y(3H{1*m*MP+W*GS=T~DHcyrRIw=u0kK=MFtMN84r| zt5f3%*fkrUuGXkv&P`n6n#V!I2rWYc`P7261WoSdOR`%@uS2z>MEP9N!d^KEz0bC` zT0b2)#4@S#v6l*a7S|kyz=nVhKu|TI#M{onWMTj5JO3gK2J?aDWa((30B1T$RQ<1` zIcc~@?wKHlXastM0J6{=Li3ZVYTD&*50z_lE>&19>~>{LT}0Tuh$D%aq5aO%17!_WkCa-I4d`A-Zq!a<_{jRns&XR5&F3=iKQIxY|LbX zv$MuGXyE8?IKM7C85LdEF3gRPOzJlQ=Uf($ZORx<&IcaW{UD_Dc(TIu2z6c#Q@Ggl z9o{E)qtO}rpE4;$YEda4TX8_}%rMsEQlCT%5?#e)CQSM^*kClu%|D6K58!Vim1kzQ zOU=Oi*cE?=h~+`go_A$*DzAhc<`qH>X3Lp8CXO|>0OU{72D{q~I$Dy8O74{wdfE*` zd{R8^aHffzSVvb|II9HBQ>238Wr%>Izc=F^4&QH11h&j{pVi-U;#T*6dOLX4jPw88 
zT@Q)1p@$;_j%yt$K|WTr5pp==VX{Q-o0?-p=dZ9zrstE$OQZy(nP@fAK1E=SMR{-B z0cp<*9O{w(J|PEg5+f}X^UEC9Ahbjhf2T}0r#ESGPb2)9@?v#$l;E;kY#K*ir^rCS zLhM*>$TBpM9%1qnGa24uErE>R)Tl}$`3K-Du$ucaR`LfBknVP@|LiGXA5ex3^uHv0 zUu7k-_598~%o7U(PLx*DDJ(%CAJJE-nZ;^UD|jp=N)Q+9c@=^3Wr1VHMl!}(>@C?f zMBV0GbiT4lA?|P1<~X|ILFguz@*L5h@yG<|CY|w~uZO;&AOns2ulHDxqcMe64_=t5 zCq~DT1AqA6rQJ(IQiIc^5b{|&Po#Kmn=?Sg`C`fJrt7_2zpwoPZ0Jh7B8+uO{{hTS zzj*%?Px_s2&g&eL)+RKf;n>lFtTiZ4+CqtlMP>2$(VAEgvM)>JRa2t5%?%hMWuFP< zz9(}Z&JS5;y2RnCh=J4YjP-dp-MHhH?7YxCcHS5`*Cd+)YK>kl4>z5{v{<;eZ)r2t zeJ^8pvlK}P}5gudbny|KY06vGI}*)kdwv+E31v9b%{o&R_(x!cS)4hgJ} z#GO~KuQEZn5}we6HT-ux*2u{u|= z^kL)W(2LU_fQJjw6&vFl-5-F0#Q7D6@NSCcb=Rf-=ZD@ufG<08Y5qOIru3hIl!Ex& zgCZLeMrPk{NK=Pq05^;(J-;WA;K^eyN~F-r*?lJ_Y!us6Gnd6|<^Ih90O9ejo*gL8 zBZVa|W%)qj^N*t2BoOp0^)!TdokfV7CE>Jhd|k0*G+S5OLxhM85B#P#{CG&ehR@LVeDv0-ZJ#%JU1x6g`FkVVdBouKuO}z@ zSqo1yb9~AYi%`i(8R0!C6TNQ7nSL4m=wXriq5a+c8IVT!;vc|kG3)v$TqiM)zh-W% z>1pEY*b6&@z&EskuZU)0#^#COGQFfX6)u#iQo`a@j(#n&^1~~8#4$1{a+T=ihcUr* z!uuDUbBYgbww+G`$M~c8j&906*Rch*p}c+jyDjT%wT~)kXyP)$uzeH>kJHB~`UG}n z>ePZ)tilDCVbnoZHSPoF(N#^Jt;<7^yS!JXNian;NQDof(is|9gb}GK*zYpYTmHq%M4Z6KBCHpa$#_Jot z*FCo2juWYwGB>e;A??@ZEzPLoGWv3_Kf`4pFp3t5u^Es;hG}gVur5YWS=`Ud&5!FU z(iXT|@eI+auDf}l$AF!lsx_(W*TyGmvN?b1kJ&udo z*21{~tTYnJ1<~@C7oYGxG~nlVh<5z<=LA)v1Nxb*?Uq2Y`kwEqP@^*&u~M6R93R~u zs}SB-B^2Jp6cj;6mq|CBCupUUsvyKtde;`j%^8oNlC z#d^b!G7lgoNFm)LmiX4BEYZ8^@62 zUrq~*%N0DkGRms;!2~6B*g9L>yD1Sj0vDI;E>xI{!)C4B--m84{PfHmE5^v!oI$u4 z5xZfEp~blRxOgfg;6q3)sfePU^R(0P50#rd>I>M7ZKlGLYff~Rf15c+O5WoX&e#~A ziL{?j7oYv?XZ_?e+TiJy4}y2zN)v_g1t}>aMu_YEgR)hBNwC4t&Glz-5k8Er?U%4I zT1$-M@OMbqFep8g(RY+;)CJJAJ5CC1+71q_d@5<_vc0i(2^VL+lir01v}(lXOOYBC zFeeJ>TWam#P_%m1W~9gBo0yHVm~RV|s@NBuuDfSIiWp9(1Mc8hu0^GqUsdSG<&U3} zP-gE*eWh06HR=iT;1W4V^e zDY@Tc;Fs*npwvp2`!p^7XbDgeHf9`}ad5K0D3X9l7ndZ~<6;O0QsE^HjW)I|b5bH) znaf5-5N}+b#C!Ivx>2uR6;(DqdX%U~E4ZWE4dx|I$SsFirmPV71VpDI4(m<*y5ul& z9ncui8<}L2!XsZXg=bWwr`36nB;OEm{z2m!u5~PY)BtaK{ZBZ~i9~@W|HrUfZYv^D 
zyytxhCEgr0ow>Z4E*KfsJOZJu;%=haONGgZ2&D|BV!fBL_zmP{d+X>=*VeSt#qj{| zf=+&)gZBgb@mqa&6o?^U4OQ%+l7sJ}>w+P0d~ne)z=56H=9CHmW8kmpJ+Fz%1(ttZ zQ=QJDR3Tn0&nWBtL5bnK(8G^+x=ih zpF63zx^cVW58$V@z|8yM<(udAZNz8K7d^bGm(cn*rVHOcfW>MEtPP%qw0|MQdq+N} zb1}63{Vx$RFS%X!r`-e1v9=WdLI}}+@9T=ib~jV%UkDlc7k&MuaK~!YuHt;o&{nz{ z&mi7x97i+xMpED}zRCz_CR|Q5l_e>0Qe|E%cy3`KbX5?$O1saCIie2jdgk41!S*>u zJ^5>8to-&_L82r1_wFGj8|^ROcGJ=GVu5W4duODg;;g#eVG6i$?t4c4P~3_^VX`{1rq)4Q zOnQy)9Qe(R?A5+vr)?Ew7pa}ET`k4AAp!#)HGWLqDNpUE(BNC`{{}SJ+HZq_!e=du zPhJ}QB|okQoq*pzbLbV!61Qgn_OY-#@_j3f=QFu5N?DcrThFbkmMhr6CTci~@Q5sNh5?!Qz+bcY_Kl^KuH`an7RS7pil0l?O-}ONR5>ht za_4SC-W{nF^sW0e{{S2o{0iTuHlOde9EKulb|6}{dmiGAu0y6+ececB7Aw`QoxWd! zgiO;r?ZwXI%j;cJEXvKLl=iUY8#?GNn>uqexpLD(MM&LGsDJ_8y8%1Q!{M%|kNrkp zdVY52^+wwpt3im@SQ}YzS}jAK*NxFP<>ZRlcQ)(0EUMqFK{My9qxM5yAkD?m_G}Fu z+2~o8orQDgW;R$fi6>OWsM_rGFUUvcmB~aPe3x(l7fbMKnmx=Ng99P}Nb*>)w0?IJ6SNHS=dEoN*>lmp*E96;&dsCp!%x~&d z(PUjX%C5$R7Y4Oe6{R$HCi9LBiz#KM+J0-(SOxMYGHMQIkU#|UPn^3U(zsxD&0F|7 z`aGRbeRy>I5Pm-XIScq3vl%j4?%`GUngY3eHVMb>v?|rZu{B3<0^SiugMr;u2^(>A z()WWHH79G-qjD-S^C{X$E5&;D6o)|R&swDgMo|=|qe}bqO_ww>FywfzLoZ2e ze=nPk(S=`sQWjsWDr8iZCabY#&_*w`_PLXyP^oNMh$!~i7+fueHv64h?W*f8?fSyC zNAH!y?nG`Z%`3N$$C->U{(ZkP&4P(g+{mP4;<$7!I8Yw2jO6QI;8=eqwM^T@fY=J*Bb(ylHh)qSLvrHYV4PA1JJ z0E)KK$g)j%UjX)Y=~cpa?m+rg>uTx)s7%T{d(z#%<10HCwD9YkBQ;8A-eb$p*K?p<))sA0)8;N)|k5kkg4QoQCN5vsJ=9Rsrm-PHH?6u(D z*QzH8Hh@ zkO5Y&yw<`_?V*@IAC~SPfY^~h;%}{vkTV!vi$VEYaq5wo1El!?vfS@iB^tO-gDG;- zz4-?_PqeNvw^Y$5`&aP?=*WiU8b1k1R%fkhu&(Gahe+V-x`^#MTN_3lvq*Aa_kkg) zvHq=JL9hV@sG6nG4ycP|>je5_bX3_rP!gLmt5sXk&ge#vd)T3-mNUJP9h;j67oy#PSiY`A93=RUF+U@#thQ+NoPR`Y(zQeH%_GPw}W7BF{F&@qP-9vMH#Nz@tPpn?>{{Tw& z7oYaI6+UgB+Yes93ItsD%U~GeRK_aD^%@Fmhk~;$0;$i}j~eZ5`7fP(m zX|2l4RW@nL_-Om@#tL#7bV~Wy-H+ttoZ6}Te7i|i0nmJh9d3s}4V{2CF5Y8k?89jv zuP7iB!mCSV=cSrb>lO}6S?@e27ge5^e5QH;Ki4Ebd{yE3YB#-; zTF(P>bpC`*Yb@(b2JBa%Y^==umVq68AY~bUy{AK+5-igk7jH*{e?crWFV5w9-w}D6 z(|NkEX;jD(>oe$pa2{@A2PRb!BMZfnBo5PxS7u6fIJ9|IT^B`mc2tjMoTUk>SgVLB 
zryki2PDLQ%@!qI2u7f5deoNpKL=DNx%r@T<(*D765|sw@r$<>g-$HKd{am9TU@T z=0Gh-kfT(!@J!X#vx!fx*IMWOevSV5H++k(7zAXMwo@OmktBv#2b%{3h_8_}PGNUxWnJ3LRN zFNt_`cN;GP3vI`EK$T-vl?zJ;t2Cg{Dp^LYtKlHlrq}F-_S~b3;##j zO?suyQ=u!e*oy4RohK$T6RTGnhKshx3rC$= z1I76sUApyaG^2eRI`gc8CbcuEZHUA%l0AvyQE`kE&!0$dancuTWe4}jq9o#wjWnzW z!>_Ub*&BP~=h#x7?Zt6KUSw6KQ z1qv3wj#!PblCPLbMF%JS9PL+wF#?_zawcN2AUgZ)e6l3u} z55xx30q9F}ljsaMOZ}eW2q8H=zivd)UD@anD<^*d%of)1TGx?{D}(z=>Cz6;ncf|H&`9HGi*RpHl`0+e zeWPlV+dq%t-(2uZ1ge0>T?H0A_r*`USzZCm-nwG$0`#J%ltgiY{7AF1Y zDYLvY_8sC@FsxzjeE68*{Qx5jmG2rhq#}63Pq17=p$$UHlLhkXFpnigafDHly+*KF z-87~9%CM=f7xq802~aF?~5qlWMufljy1Q8VNXBROm@L zUg(kwQti;>JCBP!%QO#&RQYz;wX{6N} z56byB9$0>*qAp1?0?)K{6<5~-LWSo%Yr?S@f=xz{&|{>laU3xGi1Ct4fZqP#%AC># z3=jyC0ZBHydYH?NcpH9ul=p-%HxF`?{XZ7frru-3~QIjdOrlVD(ck3(& z3<_`?LWeKx(g}ak8%~tsQVCAeum~udbd?0s?yf8|62;$yg1b~J_|QBL8&$rW=f#5^ zzO_5`cl7xd)fk_Q@(3$8^r=N29W z*D@$EK@MlZ#<~xjJkW zIQb&)Fdl!Y?>Q&97F>%Ss$U9WlNcA7>Rx|?%p}!%l$U*>TUED687M_&y0|bmTQ?=v zfN=9FHiY7`R`l6>IckVLWSjovJJkosFe^xMLF_13CE}LjO|&#llh#;`oqRWmRV6Ja#1b3LbEO0MgfJN7Ii}^4Q{q2MVq)md*@ai68S>Q(O>q2OES5=|vRLSIOrt&1c}j~Tc&kZ-M$!-MJQdyX%ji0;+AM2N zEUk(|x4H`1g;u;Wm*e4u{yx;s()3EJ?SuGrwkOIBMCon}z6Tq_u53M_Ls@N_7F~o5(Ta6c6ao-0y;5ezU zkmXUQNx`0(nqz9y#NokG4=N@VtE2x$r>@zG`j{aPt*VInfEaJJE`h91ovoMPo}VrC zFGlEpO?2e{Q8>pl8|CgM->q1$y|HXm-*iFkZX0WZbtrrvFc4kg>;5!E{Dqi+ z23$=QdD;h?MY7)FJmHdTIIdVZo0#5QZ5pg;labE2q)6ucg0i-}%P|eUY)wsj{{t-T z)w?S@h0$umF1osRyUcO==H}zAwcHfFK3{dyS0t5#9AAt-0L+7)*Nz+YgWC$=I9rCt zqw=eICiKnk5&g``8V5%BzqZkxruIFIv4mjqNgWffU)k$Zr4sE})mdYQB&&E#*Epn} z>7x?O_SK|Y%+6cwDx~+AbjyP)mCA@KoBjYW{5ZBt4P6}{o3sa&l>2Era3JHRjEZDB z%gx649L0?#FhTbd+6{KwGk0S6CrKt)x(Y^#>gyJs_Q2QI3{A|CsP?@|f<{d)#B5H7?6SP=)?-~ynY>o@4_AxAXQU*1VbgR`+l5X) z_rowf+LKs~a4c1YWNxasB7s8WYC7zRPB@UN7&^sb+GQm>A$Vbn=ua-;bdk|3q+}WiEsn^jx5oQC_X!yyFa}Dzo3ob&a5e*jC(v9|K zJ{=P{aFplk+TYL49`;Mp8jH`)AKbLhN{<7pPWU{TiydaY71MzSE#j@_veuYIY<{0{ zQL7YVCmeZ=Zg?am(=~8mD`@fx6eGOLgbjc{QH$h(EQyj3(qbb?b;K8VxF|Ods|D|~ 
zOycj4OP42p`(E5{Q_sH03X9eA4<_w+J9Da^E0;|IL@f$aRM{KpL9`nwjD;$-9QI2C zXHqg1cdJHeq>W9wTDl-tVz(t7^BtE(0+`?4WS+WL4Xp^u|u)1%n=BC-v{P5%@L~!L{ctdz~)b;Q#*wKIG zVbq>tc@#HTc2&xjW?r}%zdB$PAobS5j-1$XeNwyw@SYy+3^%rXI}H?T!W%@4Mf;-LHoNzHT2YOzgP+H13cm6h&~US!#1(;|^3E7q5M|p37O+QWDf(%g z98zqbW*JuGYYdim0n7w7j#FY;y{joGNQ)8mQKfv0Xt!nwxudDWSA562a)=}N0)0>& z*0sD=*fpN8Z?v(14kd~?(o|K^UzvHZjukaj7P2S04oZQf2O}5=16!9DtwSAk1)@X@ z_}qZv6djwMJYeG1Gep1<5AMWpW@lkXyEZ(q*JIo|jKoCDL?m?rhl_(GE5kK5Bq1w6 zV4Yfk)Dd(G;>oUZ5`R<{|7r3)-)%)`g0IM&N zDDG+jZt&NcEuw{;@U`PK&?B}Wh?GUKp8~DUUW!*z5RB|wiIK43F;%0)LIjHQwP~92 zsh^H(hwl`Q`PFxrcfKtzF5q)sn6Q)>D=(0lRuCmoRmU+t7!MQU4Nu=wRWkt!o^g6c zMUex&ES$bDxpVBS(kG>{@XMnA0eh!N7Xg(4kuR8j(l?2<;=AX@XTD|Yl+Wi>19Z@? zG>tRs^+9K^&02-&{Pk2VMjXB@&8&eba?lMn6g2J}5qn2uOEFWzwS?n&820oe>au6P zEF^>*$93DO=3HgHehcPkK4%8{J&HPoW^`Afz1J0S@bUZE*BPdKHdw;kSyMpp#RrXF zS!?(lmAZ45xI3D4dI$6oR+`olg&IQ}iDHqlwm1dg*iGa!7uM||Mn_(>=!|ICu#t!; zs4cLl4N&1|JdKqQ&{jv&i>b@f>-x$Dfz>%1+p{5Pb^B_dM*eTUZL#iCr?HFG%ex5g z7$ihPS@+-jCB6}M*QA||!ck9@q6QCr7P`72+axhgRaoM;^{mWcb6r-+;&5J9MR5pB zrA_%=?9;3wP#^1l(uP`nM4JzPDEm2bxYSdIv`Q|7vP`xj$iRiF)~<;cA_^rB8Gz{SJ>P@YL(XN@*?f-QEr&xFPQmo zn?{Y&FDiwdgC}a=pF6pC12T~_!)8m1^(kikQX&*+WF`XJN{%Gp-Eei}n3&Rjth{Ah zleyI*~f-`{lt2Nx&EU*#M{g;wXKZ2g_>)T0Z#?~86l=81yOeGCK3z@6 z?SD$+Adkn3C!k|YA4BR;UgD~$=rr?wO~1NEgpv^bNt^d`$)Q^_7HhWL{Xg7}!)r8h zZsx=Nu`Uy8$s)V(Bx_vF5dg8Ue$ z5?_gH?t+MA<&DmtGK{@l^@L@kIRSopVSB>eSbPe4vY45 zUN=@J6PxcOq!7Fs8sh4S?xI0#jI)>yHHDkWZbQ}Wn`;wKIo6lN&>3~Ehdbv%Z9*&R zUQRemPiFNdPYuo}Hk0#wZn8so!vltnR}M>}H5xPNgfKfW7B)aYN&10NPJ9fJ3EXGZ z5)@V$ilPI@T7)b2fj)|B5Zh#5QQeXVEWj=CvBFJTq<2b+OfI)sV~KbcI=#tgf$?k2 zDX`X<*bzN3pJep7>npCz*K&%W?AP3Ps?ON=-zWkTUN|=qI?HU07g;XKHT=|r&+9DN z?F%(p;~UI55TnP5PQ>UD{XqzLh!&dM5y1^IQr+OSEW6#*6|pL#lIff~qm9X02ZeZ< z>1Cv9`!p6p zEV>`=?r0)`3CP{lNR5B7iXU4ol9EjRMxVfP^0`qvW9j*7N^fQ6H)meQ#JQu&h1Q&E znR9Ez=9!=2-}9M4xq8+_my2shF)${cg zpfbJ7?hkL;P!4IG%I;*2w`T2A^xEImjev-=?z^!Nv1&;z0u=P9^OVZC)u*4z z9^PHA+)-~+AD69iRj3z7-me`W@9)fCiu~9n0`~wne(gHG{64tv$GPLT20!uM{yF*F 
ziCCZdGEU#4t0nW6Z*jx={=X$6jE%Yrys_XP{!Z&B>Wh$|Ui>c%2yuA{eS1GlE|X0X z-SXFPxfar_*v^f2TDy@>;WqUrluQQ}{C@y3^vYeK>un3S2gPq)TnIxG;VrS3G0;OH z-#3@Vjb7K-h&V~NctTPi-uUp7ZpxSJN6)%4i*Df=&Y#`6sP@%sHdB`UAX0e#%N{u! z!heGq@PQxc83;OU%Of5Tln?t|j5i3~uRZ--rmRIA5qj=XVRtdz#7Pi<9aKO>rHc|C z0MI~h$LA!2COlT{aPVU9d^~z8UPLp>gAKbEuBO?a#Dl~)qVX>JSq>p%0G1n_I5rE| zf$_01b+D-IlcxOqLtXq2z{ch}zr4;pDl`^bNJiU@VYh2VWw(yMXx#G;pt`yVAHxPp zG0j}EeG--;I9!mGhg8yF>>FRS$&4x?`GO*ri>emCVg|ApszFI_#m1`BeOS%5H|8$l z+85=q4>y=-(#5+8E_7s%L1UElK`>>v7nK?Y<6LoBWN%Qk$ZSt6#crZ?BnpMzXAFz6 z9&Ge8FsM+n0e0@zb0i($T`X2h2OAxb!wPI&z(qai-k~gT!QtQr(ulrqL z-oHDbH5$a1kc0#_1~?h2vQmxQ)8;FTQ+8fmeJk5zD)Q?Xyy!cF8u@^MzZVhxlE|;v zNv|zN(xGXcm;Gb*Jc0vl3^2x+ghi7RQFTh86uG0%;=_kAvRhm4F9=bzsA~-p(>7JK zsf?Uk8=A6ep-DEszu}GXySo&JxU_M%!W4ZqlDFkgK?F2jEQ{;yY2ufP)r#dVzKI6l zJlop>rA`Iz6ED#|JPl9wrk5fz{{vi~z`gN+G%h!*gBDIh;;hlN!|@6V@R3%bF0gvP^gquj@b8xT zDyA#!c(8t=@sEKt5WNr-S6a^qGQGK?T*5>ZPQg>H8?P2#w5@KbFNW2(or8%uX{Izm zP+Yo57-4A`6oUOMVc}rH9l(8z4WTO#IRUcKW22Az(gI)ExD>P{#g8Wy6{RBNj9qDM ziYZ4yzj~Uk;*z}^unKQ`wg4MzzKF&RfoqgC7JG})Mlxk`B^)@58)YU~XGgZ9&679nrFQOxROG!O+lg2V5 z|EYxsE`f%m{m%3yv{|DrA3-%p2Bx9#q3E34B5-2@|5E;i0fg2bct^+P%Cs;j+PwsI zH3|AAmBq^-z8k8en380@h4oE$oZS$UN$%Ti=r0PNfHhP_lEn@&%GgDl%v?=TmiB`n zJ>+FB;*3vASkz1`@#urUI9z7b;X}TVw|#rn65-S5YD~v~5xC~(z#+NAIY>wH;?-9f+2z#5M$&< z9*Ubf6m0k#mv(;-ByoL1VSH3f?G(=+?iek}2MR*P@V+;xCNt*zZfEyz(ISkOTmF%B zDELKjr06~~G?-z8!ukRu9uy5FM(Dwb7R;qk8i{_j8*RQyKyX$Cyas2-@wKi-fRkgcT9Jq1a`ELIYN^ z?M-rKCX^Y^3%HtCRI_!P6s<9v1^m2X*29(%69rtCtnZS#fBJ=l$_}G5;4izIR2dQ{ zh}DMrArKLQGm7`UYDWo`r8^$9dHPnTu1LkaDNyfB)dn-QY9-;f8!kj2bAg)L}&XsLEs_|{w_X% zeiPP$6=8{3YYTBv}yK7&Lui|t5%KI=2W$7M+R7~xNGJ&x`+R(5g zoG3JO>R9(7?=^%rIV!8UespZf4~7F~IT0h+c%%AhDm8Nc`O{DBjwoX?87M9RwNHEY z5qdY18vCWQ9;xUIS&k1B$r>K}^1Dki#2Oy1sNuD|mn}=R5Y=f?)Es4SGtCEsHb*1P&ze86O}+#?!Na=m4mv)y}tP}xk<-D~rskjspv(5V$yu%Yphr zf}0uJpQA6j&D?`)7+{@P3ELL!c%o)}Abn;Ew#4pzd6SBARCagCrAKm-WWvI!O|93U z8+DB+Q50j)WZ>_A&qf3O``0fY)o7_l_}*DatmiiXCp2q(Kdvc%^&s~UDH##@mia^| 
zKS}hhNkugKPK>>ry5o+-4r=pW=(-Pne8~^I_F7jfZo9^E4=_E+4tI0kk=9+u2#yxw z{hLm@@mdCdSQz??o+?%uNfRpa{)nD}LYV{&_M|{Hjg!d}|TB?QR!r`j1_}@M(fm3^`G8qp*n?08c z7^Fl4FrQ^-98A<C@ zokp2W*xVg$ffvXeHkWdv*CD_+R_~QP7ppR!e`MYpHesK#BS2M!E2{!5R``N zP)n~e%3p?3;r}U_XXaj2NDe|a7V8T`ol6T8lU}*_NKpv7`1lh{0(vw$hvV504Bg%ny|U(w!r!{O(w-j2<>nkO1=*uB z6&DX2F}I;NFjH~MU=G!75!?5y2XJwQKObA_YbJGAs8u|t8}=v@IG0=uT`25d!v-wWz#z)9gzu@bh>E2MvtSu(rVYr}`=1m9WT2PZVr@&U`I#|cWN zP3#>e%H})pAsYFFSj_}u9%i8N1HVar5(J6kjzvyxpN()!^V8{3;i&Xm_9QsBR_eAw zNvF7SOdL=5Szjv6&F5Q{nm$)J2_WO<#CXBXb^O-1aqUCX^(Wz;0?O_L3QEU{BT0e4 zDxz_RMc>V*!wBQqOuq0Xs&_En)fY9dgDRHmZ8|0~G!Y>l{~i}mKAkY$X*RRd zaP>?3wEv}j;S1e__mkJI{=}jABW~z_fkylCE_x9@e0a7u<%_waXCeWu_8*ot!3 z2t)O2dDlNzJ_*5l64+oAAnc6(18|-?@Tk%8Csg0 zfC>CQ!S(4S*Kamo^Ng1X-3$WZv&Lk3o-))cbf(|An((_aYR}H)`)Z_Y@p~eUOI?&8@8=J*JAYWB4#3$XfTjvc!(~d3$sJ+*+2mG5h zrY%r0`|VZnV>Ku8{$*kpSGK`{Vb<(aZ|+X}l0aIg^y`L8P=JHpN3<&a(os+5t;A?! zpvn5v$$Zs z+mHs$rXe^vojhRX!Zo2|AcvRX+EW$JcQ7N( zhywl9DU6(8Eom_1WqHsTR!i<6zkxwPb>rAt1&u{|&ILgu;r*1-E|5or`)1}kx4MZOx4uG=0EJ7*W2V=M+c}7tdGj%L_2R5fpnJme^hfXQK*U)kIH+vy&#G zr!F5HUs4TEjVAb9Dr&iWS&BV$9Hi}Tgua1S?1b3RU|-sE_cdLzokzA01(G2fGKQzvDaO6Qxg z1sj5=0MSU{rYv#2%+9?caiYk#qnB^>Em1;lc`5#k$!lZlZp8T1=i$ZjST;Qqn$wVq zsqgA^1HuFhA$NTNc(P<@Fy-}5Q6-&XQUR*lAiO#lD=*e2%V_qed-G)d94! 
zoWVZ;Loxp9B`*O(zuLPOCS$jPY5khR>5*yGemmjyR%yo8tufL_8ck4SMOg*#1DzF{ z(*5OQEi9X+&NSb5|2$CjDE)}g3t#LkEH2(q6k2Ah4Z5u$A^}{UV9#f65HPCM^4R%@QEbNEB z4;fX4CN~eri>3@bI$>i5q_N4Y$K(u78X;IBx{EyyXv`P947UzK#r%quHldY~X2={s!=Xzi$l$)RJz^#ZQ%RIbIl$P^(3u z(ci9U)Jiufjvg=TD?pfiAesO@ee%hv`rqoU{OCEStN^I}wdK@w!=rc6FMKwkYdVn$ zQuO`7KSf2a@~2yPPBX+YP-uRnm*>&zpnewykD;6lT>Is>DdATA;N%y?nd8T`p&d5y zg@7U9ezRI)_c{Ws+rhR1%gVw)5(FvMACxfgg!HAmBH=~+lmmw6cqT}M!&6FaVhzK0 zB2rpsf0RZ)9U4Nr3y*Ntm8yY&fA|J5KM$hC8p32@X|G5Dl3vtQC#(AxhN(ylL6~@7 zXyi~Vg3#vAi&JcqZXVIhX7_lfnX=(FS0yP zV1;tM#@l@eOazq2t+ErO)N#+35)pv765c?8xEteJk6Wyd zAtL@$tU|)sXKnNgC%0_rk2zsi(^m+BHj;-M=>LPfHY}bxlQR*8w$m=qwD+@X&#l@% zj?Q-CBp#0#$A#i28Vj!r>YJYzImn5IN@pQ;-E1m2kO2X>X4FYrK91qRIsQ^wU$b2H ziU|yt8W#Ay7Mf|q%>?nSj+{=lKdXW>pgXYaw|S?$d|&Zxb7_`cwc@Bfb4+%lXkXC! z&4`6fkc&2ukoA=dDTfL-Qluy$>1&c1_LDbVPK5Uzeks2+AjoYNdp%8t+k9X~^6$)(wOr z8Rr62^3kl9n)QB}lr0f^{gY|wCJDcEE+I7kqI5Y+qU$zPCJ35$<$;3KU%^)5z#0c- zdE-HqWJM6ez*w72Q<;~3Ho7pxjaZsP@7VcNr__f?lq3q-T* z)Et{u7PV<+j}!ZwtIw|~m-fW$P?qeeg=xQu~Qkh2QL|s@*j}p83Kfbd;jMkeV^YB)-j#bsY*wwu-RyHrY(Ah4m+gg9@sjbYU zG<~E1%rsQeXsM+XS|0x&>fR|x(qP{gZDZP)wr!i!cF(kJ+qP|crfpQ)Mzw8o+S7K= zt#$T2d+)W@zArcKdAKiC5fvF(UqpVH`Nt1OUA`>804E>$W?l<8LqUgB+~w`af}|N- z$ReI&?DM{Hky|4J89XUIBghPX)vj-egInzqYgBu(l~a@{hG&N%m;7qCZkOKJt5^&B zf>x7r9}a`hDC~#1ka!^>L#txce0-&ZxnsLSx0uNq8LYN_w-LufkZp{PP6%-EQsTNK z7)moh6h_*lXZM^JWe*5)i~2m5a1ZbX&Ela?VeA}Z86x~Xp3~e^hKg6#qa_Q@Vb5rK z)&zVcW%VqmmkL-DMuz^92=IIqpE;(zx7JA84y1JhV+g7wFDU5>q`5UK8*spnLkJ-s ztQ>RN&=x`p%f%G;iFRnZ7+f^+_;ksM)hrmLAf<^m(@W^Z<2>bv35z%Gx;Z!oV-dD= zg2~LSpPG}JypSgoe8p{|&GsDMO0Nof^rNT>(_|Xmm!bW)}mcrkz4vz*#>DwL+Y6`>;-R-NA`lZB4|8dGL! 
zsz{gQF*>v&Uvjw8JIRKp5Xed5>Zm%@6f(aQXA~KyGmvI3y63?+xsBKjd=*z_3n$CR1PJ|ZuVDhAJt+}AJ;Jz{@iB%>YLZR zp;xYACy_&95VSxfYSEK91u=XmvXx5yYWLAju_?x+9a-$Lnie~yFMv<6e1Y3#8RS4)p3d9();RTX2Dyr zqx$Wg){m|ZR+5<7022$quJo8#B63A$acKa-dTa+cmKo%9FP8o~imkA8~1`cn3k>z|u%oM|VkjdqExsj$K0;Q__W>@n^U>CHR zC10QNxnjl$L?MD^=1ef4AQRPD=ww~P=_t^J#4qEna3<%RcH@NbSO<%@dn_qp!pF)} zxUox7+5ahE=J5Dk72q6UM(DgBmp%n%vImHcpj_Ryyk(VbMPXu3fTQ^8)j|4u3+84N zJz9O90*s}-ui^UIMZ}|(sK^}gCi=Kpi2BE=Eq7^`bnHLYql$FAJzW!=7S0(wl^dIz zrs3N>E7$*8kMY&$m3X@O=msBaS=}u+TC?pyK)}DW|0aTO|D{8SILN!?O?m&vs&%|h zKY+dg*}06bPP``{iR(q}eCEiVv~P#`Okj49dMC(`1|tAn1(7S6-3Q9=ALTj3wy43U3>X*R&Xea(U+uX!F>XL zKls~X+kp8m7%~}XtG@THmo3euWKF&F`LVp>K4nb&1sja)1<}ZuA1fY2<9rLQv56B5 zBm+7u?gUnuzM)XR4UZnI%oSW^z0YtV1l6tlZ}pJ>5O-pOzKwr=c*=Rp>SdquJMPsT z!ye~9SlM>CGJl`pdv%QOt?*@d*?1o%=<&I&iTC`Hgf4XBs$gObXE(mblcS5&?TT$ zy46%WHW*sYYfI>GBLA!H2fzHU`R{1Kh*8W*-j3yZ^;(B(;=x+f4YUromg_3(hXb!w zMy+5sJyUa)fv)#dSv9lj+(U+4;-mk1hq^)iDEPnN{kFI>wHzE4`rC!+y2IaVC!Pr^ zrUc$;Kk27JUVIMgrcN|@GcEolncmgK1}(CRO|t{(XGJ~J{paqxpM^@8 zOE|arA2aWEf(gEWN66b)pQN7&pI@0jC=1>ecWouMQEWP-R*)G+o#Fuf7cGfBMjRD5 z?!8Gd5)@*x2vH1SB_~je1+dI$rmhc^u9+R!+7X`Tv3;IIYms^sQInKh6}FyaHd2Lr{bx&x)D8q!dBwNa&Wj zlbDJ))nl^q{2n_VYu5gjyZ0Okh6$@1>ZFVkzN0J_>f_Vm27J2q9)3hTNf$Le|Gn(L13cmp5(rWFGh#FxtCZF~;*uec!b~z!)rLrR zIx9ni^$6EaIv;}6hYRx+yDGU0S!}xgjT25oxCdZgBw4eI zze|lOD2yev{7_gXNFAAiK>V6ejjF#pQXudEGllC@CZIcsGNo~#*jGS8p!G6z`5nRl z{myIYZAo#-)kA3xi`=9qdUW! 
z^{~o%Gv^-mo~@KqH$ULjS+nqBo4q2xw7?%sqH<*Y{YO;sB8%N#Vw~> zdJW%MslMas7~2sFjxcBR{Q;Hm`W`Djg3e(-ucjOY3(R~AI;_0%ZoN~_<|&)hi>Mt3 zwt|G?SP0?Gbf8=9Ie8^)_(GC0wugQ2v{yyTwp4=$v>QNB`8B`T*^BLB-wjchCu>W zzNH`>7dA@M8q9BVS~wZ9neF8@ogacAo#&F#0N&le!jm0g`d zQd?znJLg<93%t2dN5}PIX`z8)bHV*D- zQc6#Sqo@+}ceaa4Y*8d_J-H|CrlwZ=7vz96^LowK`^cp6>Y4ZliLMZ23xW4HrM>S|~?qV~4ef!OT)v z=h+Z-`5fittJM`c*ciHkt&xH>C*;r}oCR?(o63<6`om;GMM?b9890xiJj@Kpm~7@rVte_< zGLRA;`VY+oDa?|R1CwL$Go&YVI5Cy6`k44s7k@PC;@BVE0?3?-AsUoyHj^>f-+vP4 zp8N%KK7VToct?9Ca=FgqvwoA}Ir(E#`4_AxU&lw;M}J;^V%d!!ESRR0dO4R9#+{u+ zWFf{orJ6*F0P4$TI*k!=0_!)irh`lA(UD<#YzdyXzQZ7z?O^ow1+>iO+sAH;fM&3Z zjq8N`-jI%mzBh8B?}D66nuQOiY!KE}N#4!!0wdYoHh?9y@r=ckDm{~>-1Vae`-hTC zM?jU5nnPh_MF61Z^!)cs2XYTTSn82HY}I-Xv%K>+wBAaaTunSeNHAtHiOfwRA$`Y? zcXKcj$|$Lw)GwGJD$?tU%L(B$QyvPHl?rSMM~(07{#;hqW1TGiD)j~sUOrs6Y-bk8 zS6B)IUa)nR%NO4rtCntbcUZ<)uL!)98sWS`5GiqX<=zWWm?`}E5WR~f+gy{!V7ku2 zt1+XeQhrf?Y8B_p3kv41Bg)~TET?LGpO7oY78|+148Y&L-|jaU*J_8Fuf-Z=`IQ2* zugK${3CRS-+j9?{;#w7+kfYrwQ9-=!G2*d$GBN59z#|YbZN2gr?EEJ5^DT4d*%Ujd zV_JSlvw%}7De_p}D1mKJp z+xrqTt$~|fCm`7O+7I>(juZCAdXRXm$L)y#S#7hXZ6F)ET8rA^F64xy!JHCqSVKVs zQE3l?0$$jRFiN~H-m;)sTU69Hu>84$QGGCn=>dPM&3caWE1`5vStGq1lgFLrpb41zaKu_Yi35;6`R5 zCPCX~TG(!;76rwRZmAjo9J9IKbV3Q##LXMC`kux7IQe^T{YH0E&4kNl|AM8Q{r>24 z?)B>cB`Z(LS#(HYR6NH%xoq#Xe7UoWY{>7nPCUAtuhVNm9GVwll_A@;e$TfwtT
ybEQk^6#DzoG13Yu0+G7RJ@jPvR)|APH^kgK!2HW@|1&)$p8 z#A2yMMk7FfXv$%fJ0X?Li4YAhlmJ*t<#n6F9o$jIFM3{%u3w60>P_om>yxhApzo+>!V`ZGfqry^HH z*LG38RQNJ|*Rg5G&1Gn`?*pu`O9Xo5tc%`9Ib()~F_7mR$c_$@k%++jG9I_7BbWX&HL0%^~V% z(qqd^niYD4QbnR&$24iCap9bN;wZ+f0DIf$=Agx@0%KcQbcRd1~p zKkFVvKQzdrZiNEIZ`z(iyp&Q2%00`g&5u3Sy`spsq z@L9OdBQkhKdNx*_9j)U>2mJ4iyD{)xha&i_5jFuf^29yLL}A2yNig|+Y8{Frn2{p% zAO&PFhAr%JF3)JFFd@quio6#4BaY>k;aM4N{k-0?6Y}4%^Tfq}!6^J3-g-aENDDOb zociuA8dHC^31c71Po+$k;EdvwVo;Bz0P!xjI>j+X32^sb;LWt_MJ86TP%!%!suC@? zJig4)tww9?=-Ab8BzU!1+dzhE2G74g`jWF1Ylrg{U%?TIiRN@_p&g~7?-hj&Z%dR! zc6B*V6VQzxRjx@ab<YGhU^WdF6FduqV3D%fr5p|6^g^TvR z*yTD+)_x_*MgK}|kthZu;lbAHf-s6#x+nv8E~g{Zou#8vA3{m6y*5R_+0dAeav3wG zQ#s|m@#5ZlKS`1q2HiSj?@+tV_$8&dTCcIiKDir~F_)}_sX-N6LT+A-|5V2Ta1LUE zl-SIi9|Wn$@9j-2eQE=pwcaIN0GT-ANq4~p6{YlP2M>mU+XBve8y!+PGps|CtG}Kq z_ggp+g9@C6Ib~o(GXd6V6)r%(!zbMx)8q5cQx=2Y+!>SyK?-$Vf-Br>yr2lZLmvAT z>0tZ2+tj0?eeYEY^k;~dvpx^z`cul5i(g7k-CJ>gz|U>Lf13mTL+_rtNKk3)&57h_O2 z$1}$cQk|x{krJAth?!vTCDk@0ZX~{M*&KI2>+nxHStIQNAHNf9W~J~7j*zcUYgj#) zJ5w{DLNG)%4P=Cyz>Ucim-B4E?vT+F<4KNCMua<$h^~g(IS+2B%yfpFnEJK5(>;E( zY*>kXTww<@zW`!~ZZ<*VXQDw8ep&OVVycAr20m0Ub)^*7= zLxz(PDdMf|ysRc~Z}5i|wGnTqPy^ENJU^}|7wd@Z==0BFZ-5)dFE~bkZtiR<=Wn3( zWOn&aV=+D6?%l_$5#6nIOzpJLmw`{#zhDkUvOxzj{jlU_Vc_f&ew)Fsi9yInNpx5i}~RlmV@Gb6XUGW+P=@Q)oz`={IjFQ zKpYun?RlqW3m$g`!4EoK<-4RC&I789bR#^EdfWU33xLuS{Ce*#tREluPVBN5?CpAYS@BLNk5H7HWvR zn&w6XL;2tu*EDf37;H1k|K6~C!s!wG*fJjjcKTN@zZUklIa3o#PVlu`r^U!Fd0dR2 zE<$t!hzomIj5e@l!3oLIn$cu?9SN}P0%nbzN#aF^n^qou&R9O$zcb|DTsY&**S4;t zM{&R7#Hcec(;3YtHL%_NK-fxWh!;-QVT_dvN{dHI!lNWZaNE0}L(PK3#FD8J8`%)f zw&%gzmK%tuEaGIQ&*!wP8K{Ct#`K(lS zU2ZGc0l4V~p)igrG8E<1SGv-bf1Y!ZyA`lSh42A$>-ySp`liB{Gu+taQm0S?D8l{G z=&Z$0bNJLhaR@)Cq)HWQETlH+OW}y1d7+e*ToVkBj_BjMM5CJ0!AGYWvVvYRQUkr6 z)fW)D(O8d{N3X1TdUNS!)cjh3N*oP`X}_7>(9ocpg2TxXANLnbZ#MEzDO{c9h(n?| z*T{gf;S$m08FndA^q=%Kq^azv&fV4`9^*db!v-Fgn$VT^I(rT`4LjHcs^*xzpN2Bp z;&%E!R3OSKpMR`CGw5}?M!n{wHo|b(E@sYoQN*_MLWO@*dTAVu>vdHHHcIGKzW)Ms 
zF6cVv#eqVOouk*=OpRsn>1u;vs#8Ecj>3t_zPAuhU}z<+b!BGF-}*ez0o(1(Zo#Xhj%H(Ds^|I!JEMfqfJb# z$V!)C#C8L2?<}2E4zR&4p37av=OEg9SBNnSho_+}2OuCgXYR+tabOWrp$!xKsXtfc z2!(gSy>==ol)%LiFIMg9MQ6dFA{ahH)*l@giZ=!WV2Mq($SBR|1+!61xKFEU8c{LZ zip*;u9t7ccwl7KkbZ{>ixi6v-{TgK~@5n^O{iQDM${6eutpg66Cs$;eTkQZMID8$Q z-fXT|3KdgYEK#p&f*URV@bf0IRVoA(v&?Z|x&=vtnU{b3 ziI}LonDJE5>D#r#*L1ZdVgo}o+D;`GBD1UtzliDd&9$jgwZwRJJa9DAR?|DT}pL_9q zcwE1IL~lc7w1u;9Jj;psh`W%~2uS9P#EzU4;=yT+Ibuc_4g+&+N`ItsfAMQnOa9iE zN-@`V5rsBn>o`UJ6C$GlU;2yj9+Jy5vLv?Uhn#htoo6y-th+>UF>bCCi5uw%t_494 z7CptmQJg*{Z)BfCJX|8AD~ux#Nh!Kvcz-I<)r|qKU5>QP*VG-yZ`9ld zPcp%O!9=?1)w>@|O0^e$)pgv)nKUKXq=qwDDk|Hsc}ZCSetF?zNz0uyHZm;W<=qxp zBN^YG6y`(`!Sn}fN@+%-KuMR7rmv(p{4rV3VOxIPU)UsOX36YR<=;2U*ny=;N0%O;r0R`^vtL$_yQsO*|-s z8Kb<;Q%z0o57qucVq6>NlbfX+rq}(B)$|-4uo!G?x0B?g-_$o7KKN#$g87`E$z@LgY18l%oxMz&nv~S+?+u052)FRUj33UW5@bO zGG~rqVZ|h?`VSPt3b0-WwVdTztxo$Nq`O0?HC3^~_*}rCKJlcwmU_DWqAFVLlHWC2 z{}C{5pD%#a^5wBVb@3;XZ~7u@D_^x26^p)^Qgyr*U)k{huX1#l$6s_4$r@W5=2occ zBW=vc(VpwUZyM^#+wt)Ma4}^GxoZ8`_E#cxO>R`G7Xfv-xB`MP?23P^^(lc)HmhY+ zqc6lptny42#6NzHn1`2Ivy-Kvu(3Vu!_`yGuf)!^55m^ymB=MJraieQuxL!3OpiBm zX(t1xu#(2653|};B4*G$b?1^3ii>jvDO7(!=L(UD>_2UHmc_4VO@!5TtEG0!)pq=9 ztwrClBU;mne!kOtg4J>I>GXdjmFT$oQ z=Q^&lDJom^_vj3e^$s5N7VUR|4T^R?q^z;5)1 zA%BDHUrD@%oLek<9cK|K!*N-0(vmbKlW|}S^1KuT@&2cPMx$eC-}9@L!nu>u7y&R? 
zHCCPaZ(C@dyWcb{C~?tCbEEJoR~iX+yCvf*B(e&l6VCec^mJ=<$S)uy$n{`7Cj^rM z%@;aY0i%j>(YEW1<|kIx89-g852z$?vmhfoc>||3rY5(RDA(1rB^Oy1$C%3El9shC z*T1E#-Tm-w?sfH%sv_Qa+yRDyum{k5CWEa!kjTdnp z7!79BTyl%qH(aLSZCc-+Cx^z)qXQsX1XHQe+TQyOnI)b2 zo<~Lv8Q!u8RxE-2Q1>-np)n6!wv6`uw}*|0+4sEwoa@$5=9{^>xgvwb+BpxSO^b);t#Ir88KX`A~DH$g5!fL z=@xBS<8Km=Y@0m=5r-|R3-WK#Sj!3|wrup6Be&&!$H&rzBfQkiF_Bfqq}QjaRlOCKv!5ahj&sN?7Q3npmgejL2* zyA@Yt%>Z2#`xeZ$pXtQ@*!AVsk_QI&3%t@1B8%grP*erFL$d9h$5hyBWgU@ji*~4V zB9q9C@k~A@ozg)e1u)6Ea~?`sQ2&XUSjhAELATTM5=72%JsOB+>iFM`1g+7VN+DNb zxAkwEw1oqxNwT*%@DLDVU4}`0vU?YS3O`CQECK2m=oO?Ua5TJ^SQb8WlX|*w8kWIP zVf#LHXBZ`MIjfra1J_jBA#=GK!ZF)4QLZkvsDoE7ZO>vXXgAbyk(2w#KV6#;34`_* zh4E~Mmqh6hMILszF^Cjm)r_u!x~ve6Vl8Q~?TzoO0PJH$q&&_!G3B8)Y4E%}Kq}40 zSdtr7cJ5xzD<*lA%Hc04i98P=vM=gG7wY#wM?dS$tgoL&_DuRi&CYgwY1V>!|t@g3?x|dIR6jKS=z1xJ$1(JmgeBp;4G>!m`W@we)huC%*t@3J@3 zb=v3U-M?T*y|mXyxIEbyr+1~^$r&ORP$qbHoczEvHQ(?vCz&)HNZYL1t(V{#PI zN$BwC%o+&{-XG^x>V;P;O0sB<+^PDR+;)yel8nG_jkP@|p74&$MBpq`771*M{MqoX zBvtO=ko?{xzcC7lI&8cz@~XE6-RerB)zj{8Dh($!2hi_-)be5-xCb1^9JpCJZ`zlHoE@u(^w z(jz)6q66N=le4F@Eo*bZJomx3*f68hh!F?-83V_(2DQaV|MbUUr4;$N50539if2@C z7GWu7bGUzC>y!5OFmCx=KLX`ig5{W}SA~3%7$hFWvBQwRWl9{Ypx8S(!S9ha`UY_7 z*6mM zB+K+aGih=wrmNkw$XL1qFZ%Z1zdQ~mpkp7*UZ<(_lKj{Dk`i!wLhmRnN|=p)b`aJHtsyuJm} z^U_{euv;oC>;joTy@{0=tc}w7Gq>wJhh#C_^eyM?Sh`ld@f>|0exeURb-HfiQ~Stu zF57=g>y6}1o*_W8>VAxn?QM)#eK&A!y@4hAKQYB3f>T$&_icz_mJ*VXQ<6)(FI7xn zV&vYW9p&g;|ZDZ-`7D2rG`KorB_%Lh+Dpde`1mKsj}+ zMP#d3dh7+EMw%lRyOepe#7v!*;pzIqmw>($*~00s_SWiV&Kbjc>DRiB+ibr*G}n0B z7o{0QOL+kO5|gYj8JIMWZCVGfwgn``145DwT@3PYCO2UZAEE3!azt}Z z-!WQRI%s|6q@r4C1XH}1f7wlgm<_oO_WF$7u@*%A?!t!6P7D_5H|hh2xKwubk|F}} z2PO^h$f81;LlMNO@T7j;p-gc-%FV=NcpZaN(B7QmQSpqTL8DIbh@iUQrI1mEAzIhw zD27q4w6f@uKv+tq=~$^4Em|z8aV<~?2_8xU9bF|JU2JpLM9|jfYm9(}lXIc?>V1kq z+m%W|=qob;nDWalBe%3Kg>5mOD5n z0wZR+af)>@{QO=xrO8CwzMLw_R$u5%zYcmB?_i;hxwDvNFPBi7CE575dBxwo#tgjA z{-Xt_J0Km`_S0pPcxX%33<)n(?u6XY@>68Lwse65j&NP0N2Q=VS;At?G}z_#!(Hqe^vVb)qYS1)#NGZKLfxe9<+3Z3d_`c#Ov{k}k8jeKy^oUYzUyn!Cf+nz57$edfI5$O 
zm{`(gYLy&h{Ren8a)N*gr42XyC2cL8>XHY=XdgN)otj7e1Svr4!fDyl-8oW^s-f{= zQZxTD5#!rowR(mc{gUOJ+CsCemRrUWf1P4#Px_ACQcLqAw~m2?C9csiF)GsvnU;_P z!H~!emi7qQ_bxl=u`58XBI2VCUX7iqkti!jHmKT`EoAg`X9g3n(ts{KN!3LEe*a!1+p0!||BWAs4Ye`!blTmt~diQ8-&XkqeDI z-@y6EEHKffz9fN!QYvJDrrvdGQ-pOK)x>0DV}1Q3u138|E5o*GNUkk$K8nQi0{e@+ z$v{FT-WS~&t|7>#)No^oL8WA_LZr&c*_M$tiXaYHZ1dS7%-r?0H&epmO7)GwVU7CB z{GA#WkjVkV`b8dNpO;-MgkaL7CZI7(!1LX#t9HZus8;__F$wc>eKHfXO7hx)P(0DM zaUxMdjuE5pmq$Wx-0|HhwIM6x=te8f8{Vj+sj+=76)9$V_}DSPHi!D)C7;!E$-|IH zDOQ?5nj!1-WXPcW=!Go}ibS({otHzE^4{~(aIPBdd?1S9;(f<2!|Gou>k#oTT1cd6>S-A>O@Z8n1z6;F*={_g>~3k2*sXy-DromAU0BlR@}Ab z&$Xj5TIb94+NxAk*Geus62GvY)#n%b6 z*li(#l=PlXktfuuLwts2?dsDGEJ2Y4+1fS3sfz2!Cl>{fBESmPUfJaj=MKgK>U=}H z1@A3R`vz=xlB0cl)P`}_W*a3vek{d}kKh&^KY9m-fCHmh>Z8L>(Nd>iCi;k@GLJv)M zyF>hJUoYT;9#Xm(6G=)D%%Oo>xZ0)y*g=c$!$p-!Iwc8OSf# zW18md_ado9}V{a~#&E=@UZBvGrx1!PQuR!Z|%X=1UE@Ye>Jia^*bCv!1Y4LvjA2wP@ z<Fe-NPwDjO3>a zr2X)se+@v%1<#!(}D88Q;zh$Jz>5);!82m0tFH*Ry!;DblHZTlp+ zDBZHt3^eGjBo{i8DU(bsIxo#?RV#-r2!G}4UssckL2;2IQRn)xtE}Da$mWW_&$~GY zV8H7c;W3oMX4Y=?RIc8^@HV#>BBu3Snf$>&wb=HA|ayi`^jUoLUBr1oyrvRfg( z5H6DcRquy?eyx+4LN674EGQv{qhKWYJ*ErMfO-5T6weFQ57F^P>U`^qr1T6)hALFh zW!*ZNuBQ2@5*9-iS%3A~_?;Kmb^f5+({kPmh8Be#DK0&;1&CKbn@BVN#HLALYyvfo zegNP>U(z25HgDAevq#i=vXj<_{*_EAMm8=xOZrPh8s)#@tGj>1SM*PU+z>bCr{sce zGZp+^XlpO7rb6nQLpYLAqDGa?87#v}nVJk4s8Cv)B}MlV<0vwAW2WD$3G~p3NOVZz z)H&LXp)*5fCjN)_xCb`MzzpA_hX>!P;d9wkLn}1DRo*E$(Km3Bj z?WpDeMj$OoDi|70HU#Kqz)TmzOdLo~SkHuQDIz55PmP+3C?zg0St(yCqO#3h8aQj7 z6^n<4efzv=-mY1?U7oIeDdf;W98@WFSenG&#ogDzgPuyJXbjNnWQzwSOU z$~iA@{-^qRPUDjyX^ezPsyd1WF^@?9t&c*}hJE*Ho?Wu94w5CIJu;G^IF+3mSh~q8 z^S=V}Z@tB*@Ml)k3(tF6rUMDay`{;BU^s}3Q!FFj`xw*(V@)9JYq@rF8fNE#>=3C* zo!H}vhTYKugjdb+s8%ko?mqvXF)Ws8u{l+*)pWdM728V0CLVQ>M%_B?{laifR5~uJ zXF)HF6{mRFcml>)J_CNUC>)&BR$FGPx49o#>os8}p(?PYMWr5VP zU^r^j>aih?)ic@kybp>DWx`y>S{6?ULrtxnG$l+zm&D_?90>(>8V;^#0luXGcTZ>> z!d=pAkW=h>MnyECX*v2{ce2?oitp(+5tmy)jWs~zrWdjrMR((W<$hui$p=CtT|T$ zA6O-c;X=HSC{j%kp=bDMhd;_*edBROU9S*&!FH*+`Vo|AOT<2&IqpKGCf}lPSSp3U 
zAJ^Z`!+1e5UyG(IYv$K%bTV3J(_gH;8U4grA4A2A|GNyNwndwF)VX%NZ@DWdH|SlK zDzm40mzTw7#y#d(LVyS)8pP=h1|Z=fkTrO(lfYA&BkL136!F8McKKkX`DeauH@RYi zn1!|fLtL2c>Unq7R3FwXi+6Z_mkqjIrQ+#x*p96s$ zH5oa+Q&e_4oq#$fE0eTbx_X73e~01~`&+PNnO0TmTKn^XB?YdoxH`#Cd&(qZGc40% zk;ky7^BcPkq(0*%xN+x;9njQxlxAQh#@fmHP-E6WG70%tB5r6OIC=>l7`4#6cdargt#2rXQSkLYR` z$0zsaN}YGgKb2=(DD!#n7JM#|(*i6ADxQ0e9QKg~|EWB~rgach!`(A`s`2PFUJ!Go z{z-E1e`JceildVG?x-EVtWG+R*{Pmeluq56;znx6hK-I&%X#01S#wVl$cy2vrX{Ve z^t|G`f>%MSM_!vur~A5*mljUvyI(kKsFkH6Mj)R2mw8t3NWuEt$cx;cu9)3(Ypj@B zmB0PuGcgIlr+%PTs_DJP|H_DmO!?S9xB(Fep5#9*a-he+kD`55cIRIlJn^diIf*Gu zxtODEZBIy&jJJ=nhYF!M7pM;;l_BYrd%}x!XPgPh=u%?3xi*dEY3qvCcw3r4effW) z4)f{;kb;s%_uhy=Q(whqjg>??$h! zvhcwBDX3HiqHR8<{so)bH6$M8{_`MNaAgka{HlO*y98A$kVEUok?@@-4(!d;`+pmJ zu!I-To&z!r|Ng!W1ODz3fbw5s|N9g{fdK?Nga2*tf1Bg~{U7oYYPyPa3TB$}(Dy%2 z+eNy0Hny0SbW(H22|!~iQ%dN0&$o>XMm4BV;siK86-F?FpfSEiI#x39TpWOr>E@m# zJImn?w>q92t@^g;@R3z-A~sS25o_)W*kv|AYcG%0pea7prwA!k)u07%I}Mvk?`?G2 zUdiK&fk!XM6-h_Vwq=8{Y!*~6-mv=f>ia z7S>qn-7fO<7Aaqdkol^=H@vQ3GBl7uQd)@;y7`bj3FN36zC9F zt6*`Ut3-^HiiKnV3g5DO%_D=sK%N_)@obuK;ofDC0MG7NApiSuyT~z1yKWI7l|<+y zfh~fq7l+onDP*b+k{zixNJDX)OiSH_j*vAB{*DU%&MFqj_6{MO-(-|eKLk^Z6gGV; zfWay}28fjQ09LNAk>iNP>V2(_D{rmja5Z#cxBS}07^;f6SYf&b?~A>Ckchv{?d=$! 
zYoZzItm*dYGMQE3B2JcAQfwnqSiLtVvC5HBz|AI&nWla<>#1qNYP1&Wxp!O3tY)*$ zDIsyH%N#(M__;#Bev`-hioB~WHD&;R6c=$i>h2_j8YJ!PE(bCyy+uv5Op!%T$tR4z zo2Cb%Z-BwIyy@jK3=pCxVww@7=M0Xnm>63AnJ*(o!MnyY_;v3EI7H~Qvr2a(vuI^Y ziu#=^hZaj$+)31olLen+n;1H&$WkhZTyn;VCppi{@H_JNspvTX)&%>Jh^iC(GJi8n zOYoiG;LNQN>fJ2nGlahH?mK)gbDG-vN7ZVKVKRba2$H8)I2JI5mtr=-8e5$1N=d$<^SkIt#(01Z_=`>bkv(y=E@_7H_Luw=O0&dQ9EEQdFnr#W*G(yyoiU?WmSZ^@wzFwF zsy6@k8RYdr04tO&zUI+zSG``QrrU|A)aY8FV^>*K3v^9V%rJ0k(a~A#*|5?93AK3p zaf8(NY-ZXIf`a`G2VaE2Oka9%CjW#yAJCeFT$^BMs^;ZQ$_KNJocS2jQ(x4HOlW|l zv-Zqzn0_lIDQ01#KKyB|A*^BA=JPb5JDT3%SS{rWmZpNZq56TH3a}Dkr1cyV5(di& z{US?Ys@{zyCRKcSM}<@lxObLL83~bWb^jP=yk4C@p1SU6)!)LusvFd)mEo_g@$a_L zvC;kuW)Etfw2(xr)hJ6)oNIg2>tF!JmkXrkv^Qkco9^F*;$bI189E*1v050QR{1;z zZNh00U4$*cNHESQ@wNgCiRLfZF%@DDZ=Qs|^(2SvYj+dAN%hW-YI5a0ujnl8vO5!^16=!;; zFjhJM1<(yS9+nedy_Q0~Gq@7Q&E&vouohgp%X$%>EDS_ar-jWA`B964-*9cx#9317 z-NoTBab9!3v~4d^v~}`NC$1nT{e?4~`!q-VhaB>XDZ~L7vP)L!QDa#(&%9|Z+?ND8 zegMQss^O$wODA<}fW}RU-kiR1oM$*D`LEUq3QbR>-@`@p)q0v=)zw?di_+DE*lHQ> z{IP2nYi(3i7I#e2V_h7jrdDx@qB*F!AH+erFi%fCoCE}BKd@Y(!_7;G*3M*;B>td~xcXiiHfu?HFE_qgN3~Bl zQVO}9QeW%9csKc%RsB@;S0;t(SABinrc*;ZcQ9z{#m1wOPNCAcUVeT}EB&?Q_e{Sl zhi}1fHkHu5c@KXo`Y@78lj1Tg83|@Yuo6zZtZbVfdrs&GuP(s)IwTG(yjYE0-?@HR z(^yPxAgl|6lUm1*58n&8GY6VSwYUq1!Z9`Z4V-!C2?K7b+Q|=#+p1S`F7n#-Cgu!3 z)ii$9Y&uwMTED1ea8W^;WJ#BQK+EQOg=e_G?h(?X!ylDNVVsW|Spg8FZR5yNHQa62 z`+Oz$-r*lenZ}Q6+$+kWoCEoA$Ts8_+LWHg+uiB&ecTp4_dMInaaZb5=SPF5$d@v7 z9nxw8BFkFZz+mR%mmai7AcNQnUzVb;hUXQJoiz5sx|=gG4tJ7Sr=xiGVdPVoTH?~8aP*z2cGsbRpq>0DbgB_vk)SuDfS}f<~1K}YpzXd3Z{NRzS2|*h{{#-&8 z3-coBTVgs+3=o{zv`V#jFoHhHx41obRsk+$9UY^kQTu=mi_3ftYw>voE**iusd}VC z%|8j>&#}CigQsz=qLBB@2uPac1>%v$#+9kn%Ni-CXL}Wf)2PUdN{02MXe;o=e+Ws{ z+?Z0&+ekST-wD(Ky?dX(2>N(L-aMU?o{3q;wG149dQ{`Gb&!Qo1qtthc&7U@%8Ao! 
zcFsW1#_EI!S(<@M@!P%R_wlW!=4*GpPXfTT)sO(rLoggO4@vlG-_*Xk{+-*Vj%$it z4wh$(<J!$^Txu8QJd!|K2wEtYOq<%xAi>3@a9Y3t*xI5z3iiY86x{`VW&;m}qpf zHr?x?V_Gv{03mYm4|m*l#uB&*{_Hxf3F&`B_2X-IEdAAYhtYTXBEn~so@r<-#UStF zSle3bk6p^3qVz9_+A;e%&x|FYWJZEclKnpBGVgG_0W z^t;K7728pmIY1EJwE(p6C~KfP(J<#+nUXpN2}jk+C(48_&iG0_CnwZPnrLC>scS6d zk5enc3-)1JEv5cs5E6@Bfl9agRqjRxMGNeOd=b{!0~x-Cu&pP@LwmYt)tX}s9R)L)36@((A`3{7g3 zPthPtIJVxJ017f5I?igX$2#zEN`k9SN(IlCwTiAbZriR5^7K|U3kC&dut&~An{ye5 zKB(t=4OcmW=8z6W%Q82;g;Lz*tPJp>Ku;MfuH7(7+NY&F=a7uAzzf4@OK`-`ZsM^s z%b=}JbS%oyCHI&2)|M8+T_4LRwlLKF2`PJ<%NgG&nZuMqdPreu5j2dQf{_SH@_>zC zBXMkT7r+ZuD7{S8B;is^9p~z`{Kk)1`fReRSf41Ay<$+w9U@Ehv@Hg_PRm7^D&Q`i z>j|QTtH=LFpu`AY?Ym^>O<%Ldt}1Ztm}c*+XA8@G?J5pv9HgWh+^7(x}be~z4O z#`~L9I@ZTsNPBu-uIV6~wsaZ`X~1TYQ7zsBPm5!6NB%o)1B%PQb)^?g9+zi>uB0}B zG^8v{)v<469B~zYGo1aR^T>Lcol*Q(`(1a(M*S|K!stn4eh<+1(uK2m}mtX{+kyd>wR~+6`jb=3iOM#T@$gP$ymT&baQE zd--GWWeg{pM3oowLxVr>O=?+go+lVGjidx6@5JMgALhd3r7_-Db7Om5G}~e8t<`kN z&7Yk_uFlF(j!upO24aAKq>F&y7a)e>{-#)2MeXFWk{ zNW88wc69z3SHPK&T0Jn3UZ3LMr`=bL+J9*JT04Dz@W-89{{+_1AEZOMZ??>TxLpzb z>KKM~BI!^;SOXD^WcFvQ`$?|@1pCssgKKy|$HB*Tc^8+`$hJYa~>}|APU4NB%L>JyNM2Z{!d6mw| zKmXnq)Ld>IA+jxV>}IHfTf>&_{7q%W1lBM_mg8jA{tw+j10>*rUbuOMduQ(xr8*9r zvA+?!t(OH0r3fZ{i%&6*es^bUg@9vG`uXFvTVm_Kd*ISod@b}xJCo>{V3b3X0inM!|`_F5xFHXlDbAQ%jBz0j*dy+mIYpI zJ&JKJS$huiEvKMZx#F9Vh4a2hhj2T6LALj`U^8*1ili4%3JVccqP6T#ZDg$D?uK>B z-84I<^Y4WQLKA%~ypV4eG~W`R>>^w7PIUFq><@CVq=k(Z5J#_KR@M4z+UH~mkuk(8 zvUkK{NM@!Mv5LypendU!G_ zpC0-Tvj+|FA@DP@iW|&+^1p)$pJJ6quD~k>u!rdH7fW#3{g0bTD;B^?m#^c2N7w9V z`?*j5aWC*c9)S$~boB9m@4fktC;b2ZyA1Gf(`x)+esR1dHk$3`rACL(yb8MRj~cRob8T zL=ox~{RZO=Lq|C*nYIiIm?M)P67}3HN;0MHR>l=pmK3dZVpi7Tf)Kja307hlv|8n` zQ)_LGYweXMnDoHa9y>5O>;kyjbNm0XWB(r@+RP}xAQom=RnAidG~cgk;0pyIufdTe zCri$Z?I}vpdO4Ox`#iE#7V3FS=dqQjG+tF|FAnp|P;#M1%nC@pyiM;O{owQEFT|(S zzYupyAmT;C{l$yhAY9N(e6Cq`6E4I{quW98ST61x!eVaO=@RUIN-{ZMkk$Q{zCnWX zXCcdQqB+fox)W}#PWIEwfBe~b9GxT?$3KlTzszm0#6_BFucFS&4@u&ZfxQgxtdx2~ zFm>^@*InMaILeYPZ{K+m>8yIaof|bN;{^{!do0W|>0O?EMgQctWwN!WPFM+o4GFTN 
zopToV_i+dPUpH>}15wF}$4zaTAj-t-1hd3u_xlA;5MygFpMl%xAJM^G7Vp9N&Dbdc zUb@7FFjD@oCWRCwjJBus&njQ7gUXpR$?Sej3yNL`%_Rqa&P`#XXIItn`h#hK&|tYo z>wg<5TWD3nc5PVr7eaCaVSGRDqB_^APQN;8)XG6`f+G(U6O+LSiJ)k=iMv(iBYGOI z5Xz5y2FWSAno%%SeTtYAiTgfgCO*UM*dg&nRwUs(^nhtWi!D!qV{*plW`V(;{F#3^ zIV&o_FayQXo~so<#dcZ_188MVn__>0qjrC737?>M##;T&c3s7yKf6wVq$_Zp&anH6 zC)qq+ZjD~qxL_nmpMeA2lGF(+NuQ*80-xE$di73^@!$_G8?Jq@X3&cK>|-B^dSAH> zE+7B%H>B3fo)%nld|UO+6f^;>Rih5Rbi|>?em~~vR!@-L z@T~qq%<<7QL;c?1Yr#*!$_GQsDS)nJRAV8hW@LP^9hxo{IkbnKtaEC3vaOWN6-_Kg zh`jJa6h{MifIIQzb2%>eU*z}TeHt9S?DiLeNX2}cxi{#onDoG!>MumN!e0nEtB-qu zM&)C#)BA6|s~aD{qdX-Rv^0xJe7>xtk<>x{>#rTvNSi(uDG)_^6`=Eu&~NbE?4Ijt zu7(und19@u*zWhssXN=aTi=~o^@A<>KZpwyXZDLPN_O@;6=-k@J{M%0nzbVtNukkB z52V2ElNkhS4MnS&jwb=;>(_d0Y^QJ$Po9SYjkbP@-e+$==_hnbIoc<09ub{#QSaaj z6s%DvPkKYZ@jDI?0kFr$pbs)``7VCx-gy!qVd9GOP&3E!eza2K3k%&u@!AGsYFbF9 zx{bF5w2>kHg=qRs^>NY`d&k!Qt)VNw5;y$mNy7}e_eb@<(0 z6z4C5#k9=(f&<>MaqzS#)=&Jl8NZkku}oo#$3W0~m%kymH>4rfE@^t>IUe7*U~HDm zZT|FO$Y)ErIoFh80TUV=)X6wY+Lw)JvN**7^FbWzr+&g##4s}@Y75#T8f2X4UK_GP zBQRQRlqh+{Gt=V%5sz{MJgtR3NOj9#CT?;}Y(*{XsC>u8T}b%#t0g|F9JPV!SK@|- zfwkxs@^Xt1G;*otKOsaSs5ARXu9+!TvJ;FsyiO%lt5v1Zs~1@lqBK@beW&_P&6);k zW^$`1wIV4o#%7)x3m%X94orFqUtdK{4UEH;?d|{2e<@y}6hcs-N*zBdFuTOikthQT z+ENiZyVeutnYa|U@yUn`cf{Okj>TI-VyYl;i1+*a{(p8eb+<-4pY~%ezHzpP5T{hL zmcZv{`s)3;ZX^x^$v7p9-eZl%XZ+bFRTu~RlWf?jcsP^d8sE z0*++4_j^!l{{Fr)5r%`S-Bhaz4ED2~^>}(<(<(TG#v%wA4m4JHYBU&C+=M@3EF38Vj}u|KR_v0O*$MS?_dT zF3Z7{x8IuW!xnzy=C*5A2`X8ny}Wg`fr?$D6^4L^JfcpKP}XmzR1;6WKG@=dA>Rl; zWeJ96l}d8Y{D0mh&2L1@9<|1mB80pK`&+S87hXk(B_;SgX%d5D@c~vxHKJ+-O(nQc z5UNjTBb_*o?eMw#8PsRActUHt2b7^jFL3LDjHLX{%0K?C$Mf+cKV(d^{GIryHt^#o zzRFc6czilS+`IN;+iS0zp>K|a}x5P^FqHXr*ouvkN=4{?^IKJsWdz&#H($JVXl3yoSgt=>stCUKFbxo*P$_Gw}?av+lf;k?yhJ4RDX}4)TE8lv~ znHG$sM!ZsF*8Q5q;=)j~x>2Ibi$S2FL%bO$!jqG+2K^lfKrdS0fe$c(_ zSRi6JSnvn@(kUf`9=jI{S}Z`*2vb6piF@7}gjvr8`BK?Zw{hdiASeg@E-!?e6>dxg zqK>hg&eq^vr8w3XVF$RIKk>jkd|5Zuj1K{P!N!e89pBoF6_a?2^$J?WiPFJbQB0K2 
z1gL}tPfz=H#(7O5ejh|Q=H&F;#x;blFkJ0G2Pp2jGZ_P7jLb>@Z>~Z#b6Y<#!Ye!U zF;E9%0a6IMNcg5!TJm1>4;lI0u~;h1Cwv zT^KAiean7RjSEHyDu_hu?oh0F$YF;9{ixoA1L!li$cGf}rJnZV9*u zFXNEeC`rug9Fkeog{14FK;!>!BFj+kD?9rC5OZbwlhUK! z{UGvb!pa=|hIO33cOfHr?2l(PA2RDmTHn#otonhz{E;Es?ti&i2g_aXbT{TTSFEFoGixSw~};7e93 z$P>D0QS%of$<&JoP8HH9I+C@n*|Yb@x<5?U_D zA!`D(Yl~z1?*S8(Ysc^SvyVYj!;0ag^Q(GYLV@%~>gSje%4X}Xvnu`=rHz`AGmCX$ z@oWy72r>Hummo`>L{8#ZE9h}{>zhxi&8z&V;Mo4i+sNhTGFx5XIg!Or;FEq1;VoWw z+t8*Wpw}%r{USq(2>3J!a}ulgEvZWK+AmI2OG)U;jV!b~0#=NNd~pc&TRkm0y#OpW z;b=JG${a4P=x2_KqlP}ppID;8uP?s~cQpdU1J6va>t9)2#V&KLe|~xE$%y~Vqd<6? zfgOKBiQZ*LV)_NNH=iT}{}Y7{0w{0=+ z?vn7oM2WK8{I&PAUY+wG6~-QSzu&~$l0=$2T_QRDpfa`NPY8;=&x&^lI=|$1E~tlA zO`3dIXZgWN7F{x9%e6y|((On6*Phd6N6rH`7`vnW&sLq5#&)M8rq$B7UwJmdU8bCT z4uih^8B;XifV#I$scuDHi3u+(i@7%gO{r;Rz*OB#^K2vxa?jLv9I$G3WIs@Z!5nA+ z%&!5{t;xiE43q>RfcPy^HWUQvR3U7H=HQ0{5tBsc5Fs8?;;*S+Fxi!4fqxTNc|D%Zl zOeJY8{$o13Be4Uy`7cBcnK5aSf-LKMj>7Oa@H4T_@4`Qjz&{-41`8OtWqPoA@Ywk@ zR_FCrW*4yJH2#6UDjJ@W!D1JgwShmw)Eok)O!Cy?Ct?q8o3tqZ(~o1YjZwks#nbs_XKNviTLAUE(XM=bdsm z#lIm;`5+Y?VuSf8=ukGObB$grlG}yp+aq~jh=S! zEm-*V-^~C2rM&)k<=P&glU^&RyrLpyr}ELUcCQk8rp-0XpcnAV=34vM->Fld(pB(P zXO^Qk$9lVCLL?=TyLLaZ4h&Dj+U~u#WMz z^lXDh${NMVr)vJEOngSy2Pc`!)A2L1Qa|#SofBehybwuaAx4UEI!os#gO-h!PEHjz zn)CiBRGJtR-B!i!a82?o-9m7jXd!ZhqWAXnh5POed&!>SxP0j^gl4vDEqLJS=M|B` z_8RSkL|eCZvnOMea-KN!zF#){OqfMKr6)p^!LxNu2x1_qc}Rb!@9dEt_l2yq~HW*GyOF~h+sl4btGoD16en2#D? 
z(vstOBfI+6c@&W#GRqlJdBb2T%`c2qTTR7VZA}JzmcJK(7zuGdzoEY*hE9hc8Oi4rcq1{6jf z-_DazKQ)~@hB#4Exp)qVm~ut$Qq0qJRAJTA(z*H`ED*%0%Rtlji8H0R7zwvJcjSBHyVaqQ4hJ^O9-2)GFpD0v%B3OGmTR==aoL3 zr3Yvehx#VO?%V4potg;-(Ba}XnyPDsfP#}b0&ad(LaY+GxR6h6AN!*kt&i8WKCc*3 zz41J`3=8+#-wxKjEkb$Bn5+)zA{3a>Q4B*2#wMFoA!A5(ITt3P}U0 z%dBl>zh^a5o}1oWXMqP5|1kv;tauM;{j9b!PP=;}tF-)9UwrBlVCS+tG|uFvdquGo z!-?nA&u*{`b2w%xIE_!+#D6^NZQ~Zl@wEz#&x_$LsjC z#&>Q;A4OLZiAsy@fA0SJ;rsfYm=(5&pUWJC{$!Oee<7UTMaWu@pG3&?cMh?h>z4$M zV~B`OpyU=NV+BEVdcMS}!@0blAXvu5n_gDQVQuVXv8C>7oS5Rn0A4E*3Zr9qcy!5w z>Xf4(cbyaI(U7l1|JLiC7y7?U+FAd*SlET#pAk==g2Iq4ZHoHhg=-(1yz9gwaK~Zs zZh04sn8|qqTPi_o*Qx@c4^dt7N`?ognOCZ8mxDi?fT+Mz5$s4)I9Cb$Nm{P2i}JQw z3KS_S0>|#86%fHKZM8sN^ZJx*To>8B!EiPC8pyIFk+fF`hA!U+3t!5zSe)*HS%5UJR zPaK*$LpN6`tS1E-JVRZ4RD7_oznAcd8R41%oo-;8odr7(+h^LD!1DItwOGIR$TC5D z!C1+aSE$tP`i=WtyE$>dIP@%V;{NR3n}-ZBVD?gzNqdC_UJ0?%0#}VVyeO(v!fo6q6%UK_ptPq z`f`?c|8=j7>y5y??mV?ZF=rEV#>I^{jM|qAzd(M(1;{hI;yn(oB^M;#tNv<2VKx+A zWt>#mrQSYt>8vqq`r4$^?zkAimdQ-iP{*ZvTNn`j;y{1pp|RM6*HWFGy0cTf;onv* zzr7oAe_bT>n^@f_!fR~=BgC8yY=ko=O}JvA-8YD*WO~zKm=82Q*#Qi@Db;3k6NqT5 zwFYv?Nr6@QCU||| zXBc_Wmdax^7Sc^yddYE4ZaQ}MncQeZgg!zEsqVrq?daC~vSW>u$RPc8F*@g!u*&1C zBI)%Zs$k}x|J}{tGX3LR8)#IJud!e(b7shAEVFp`QH@y`Id}goZ)CfAAT}$idNI3% zGO2fyWul6_5G~#}IzsuJzZj;)mb71FfhbpD8Ww?um?uB1BuVy7YwP?=R+JDxBvX7vv9MrSYcM}J4ZdL*Z{9$aWRsV{#IY-)K5XyyIMub%etOBB(6 zT8A%>Xh}LA=IomcJ8O1GUOGt53?hXwGcSX5_Dc$b{iPi>2uph?wE4j^GU07mA)X2k z{cu;^oGKd0OlNh_T8Q~iPY0?=Xj%YqcCwvu@gEP?4|MdS$Bk)dt2b6Y$9oY)kOMnz0~{B$yOOZX{3vGc%?^K7U=Z8H#2cUl5Cpi{6vQ&=-Mue+ z*cmGY>Z6K01^$Ila#UM14yUuM#v)Fxjh&;|{pB~uIWscT2GPkY!RDM_znOq}r61f% z|JSGaBdPUo(jw!QY4)Udo)u7 znO{M#6KZPxeW0k`K-1#fdTZjsI{u`+bEp-wcYa_#@BcK`fKUm2PObH4c5`49;&GkJ zctMVTpd>t9@xuLcULr64)qZ&=nbR5wCU{@*I~IXu4IQs9->CMq^~a`;K%>5A5$pa`o`a#12vGBfRUXvIpH)YY|1HfUJMcLX&AMLKs)hjQtps3(o-ZIXoI_WY&I7PuAe<#zEvz3xYn z*_LbW(b+~hoO-;aN(qFJKvi8?iZ0~uq;%4aX&i(O z5%9dAvxiOs%O`Ui20TNXi%pYR2cDKQJWnaVMf!Q&{&Wz~M>;QPJ&N;&o=g( z^&UA$66NBF8I^ECC2IeZHz zmyB9evQ*c!oRJCrB{7?_pX;cO@>x{`ND7l 
z4k$I!ME|+r*7~x4*i7$p zQ+CQ*FK1x5+1-T}3m9^yRd7-M&$3GNv@;nON@P@?JTXLE;}sRQCuaM2^S{eKWjc@+ zUy0(aDK)Mz!(3toskDF^@uz!8$5J{Ke^SY8@v5_9O7BzeA$!q^AT%A4U<*r{l_w;% z_+adw!z>P_1Q}{<2gZGPoln?@J4{#qCYHfTlmB7zqtghjf%-^o7WyhA!J3lVjg$n) zzfOVSYhzll5f+7=q%W8dFhFUJ_S4bqx<8b2%VA=X#m)$$_Y#JH)i@c=>=Zve{-e<;pn8{Q)LYn$yjCm|% zID^C4ey2m12&saSP&jq;3Vzj82V#%HPty0khR)J$60t;LSp7Po#Je4HAeM9NWc*L@ zMPE$!Vn}*4D2g3f-;=@&(K~}46i-hyZC9%hd|@uo9RGVX^Z#5CG@?bUe292{Q4>`?jFcOTlafQC^=vC5$aR@26zI)`>1|)rf32NAnF%$G3L;?I0*Akpxf+ss6oD@(#0==$hJA#XRy>x_}q0)5Wk+tX-SO?P9?g zn}#F(y3n;CYM_VJ-kdPBw@#JdOe;H%pGsjWOO4GOia$J~veA^*pC;tJn*zyUIy@HA zy{6dYEHd(eM&CD(?xuFm!8ytdm@2$lR^R%Y(sI%p(>?o{NM}o=zcZ9%iV2H!aKu0n zQDzFjUbhK{oHJ|yZ*8#;L&`k3R6UD!AAi}t;@?sj`=I+F=gci|eY(+`x>{?smgi9D zT2jay*-wB*tQ$8u9SvqC2u~bmc7+7-yK?SLljI4q&fnaT!Amxm%SU&fz)oow;}G%K zx%6p-4izf%O6YkM+BO+^kEHmiV~GKTodP&<_z5os*jR_~8pp^FKW$6Ke4pJB4>%`t z>{ke>CEU_`0N`22PtHkA#3q7=%keo)?Z11j z@Rk*;lU=liz!ctb4y+^B*{qT#hb=@%`@$A<_?|0CMX@p{@hKxl+m|kY^{sP_fiT^Q zT|Oke4e|iERnm|B=o&45f{otn=N}P;I5bF?OAceV95~bjr);ZhijXH15Tp18O}KjJ zF1*UwLSNdy@8V$fo!PaPwM-`-qPX@GaB3B6mJ(ng>{Tu%Yq8W=f;4Vd=&_Elt)~Px z&9&N-$&ZdMYP!zQ<=be$@~SpQW|7cmg?_ifOAHWkHbH%Ms>?stqG@ z-pYLr-Ja@aa^#Y^y@0UIHpQq%Uh+dh^^y6u5&Ms_7iW?7JcU(Jn*zH;-|8@q>) zTT#-bo}bC&2Zfxnsl1p`HT0b(GsNshEmC_*+q_cxN!l;rB8iRNJGI{H27998&OMLj zKCc^ZNU<`lEGb;Dt(M5%u((g3yQ)&h?WtMmj$z=m+T9jKA>%-0l9 zr|#ZlDw5LXhG91W74tf*_tw-bCu-s!7(9yJkT<^srxCtUib{EmH8kGY-_nwvzh82@B=5H*%1PPsjhhkE4Ip5117RWVi)doJ* zR7aA6?CSeYmJYC_-MKQN2typ9_&ys~%A^mGJh82FEM)7Ob|+9#;}|*H5C`oh*y# zT2%x{dm=2xxX6Eh#~-&7izbR!DwQg^nGd19`3x8qmRm`2sMjSj&^ydc1miPc(v@2( z0{&F>P4O)gdy-TrG^|bTGm-GmJmq3f6cf*$vpAdA`2n3;{*7n7dhAT6~?p=7nFG{X=hRF}tk z0QSDR)lh3%V8!!g_<-Dk{87S&C^wz}*H0+Ydz%*5Hu&>Q5~YY0Wh}@zns$@UA-UJ| z5Wk1`UdYk@zBpo>G5$gNwtW_5B*`sj-D`alGqJFw;iQlM+ctpz4?tvx0YURo&Rz9_3E9OzISL z@VjH5{tAXkH{VJ(4pL}uG;e|OjKmGQvxPtEH?C;JcP*o(2|KTE4Bd{T_cFZh>1+?D zs!ad=^n8#!+%L4v_l5fG4mv9uy;1_+$->UK3~tbZncPbj2+eCvyU`RjhGpWUw3})7 z%1~=+7m-)3SYdQ3X)k^?&DhpL{)PB&2MFrDPsl&|qGr2v8rUQ)HBQ|G_$_ZnkhF!I 
zb|B`ML1I=D2r29O-BZ2k(12OqXdte!$d)=g(~iI5ERD=JH@UX8Et0cu?`Q`823a}D7r5^D*E%v{tlbfDI zg}`Jsd2FS1V;s|8MoMm4wmr-U(r6Lm0ECrT{Bjykuo5vty{XZRA=GA6+X{wdF-!!u zAzMGpO4yuiao5i)9GK3QeRnydsd=T!Uip{AF@?=IJ*pMSQjl8hO)%h~K;Ksyy!LOX z*Ng6}PKrM3{|+rEcc{}9bY7Lr=wX)Fn_G!E!M#Ytwp*MryDd{5_C>)O-)mx4U{V5P zQ?zQ1W;szG9J^F3WV?K++XKRmQGr?wcRbdq6&b#SMz?=EU#S@y?uip3+*~$nv_E3_ z)U*oj7xkb};v~3CD&WqPdTsDu+s-YlbO5gZwWS;ga(8SA^N`f;?#)X$>G6jX4 zj1qlGGwT+6ko#u#zM{O6Bu@4~@(U66jhN-H>s#a{DbLeN1^-$VliJM4+wEKB4{hn3f0Y#&YfartMuTB=wy5 zJVP#HuV+qh;AYk3wa$Ys5HftkWmjjyTfEds?I5XtO`Z(((JLh;%}1c(6~4ps;*(*0 zCfym)rc=YKjTqi}^IOVMF)6c8WqvDHRfAe>NO0=td`)GJmF^%PmY9Q{Ty{TgvVj*P z=cPQ3zP&FoqsEcx4UssO5@tMXCu#M=g#mwW6NNdrV-lQ*=3SrXruqbg$R}SB;8~~6_*mZShKc!qE76_N1-P-Un9ySzYczL+sNDKV7LC$* zbvZ#rMPLwf&VP1pTs5eG?=Q+qK!2IKmOUzFenh z>fK_w&M=L#ys9LM7N{sJ_N#c*pA$j4%485pNocNjoxGNFkd#!a2fgu+@f2EDDNXn2 zZEFX;FRzc0=i_TLqhzdqA(#dJfnc_pEd6^|Afs7?KnDzQW#W5E#FW8Q#f^0u zktzp@5<0yOttYLYW#oL5IM-{wW+Rf15Y?*r+D&GN{a-5^^KqvT?3IWzwZ9g3A-L|~ zH|1U6lyP%Fi-bL%EC>iav`jHkPff8im#Q^qO6no^3@$`<+`5OZxv`WJmR@G5`SO}x z)INEL+uDyJS!=%YKk2?t5i0 z~`}ryVWM;(HOO;sK-Y zE@Crs4TXFG`2_e>pSe(OJKso03~`~w7T(5R$kEgjYpu23mVj!W!)FPI-N|Ss9bbL? 
z20b|*pbLm$M5j^tEY))AOW3B-YHg;Wxx1Yi@20+esomAaHHr!_#XzcB@K^Mab&C`G zC_XgGK1kma^*mq%ftiU!hRR9}@XV;jSBpu{@hg6_Twdpp$X;(;ef?x6aK~lF^f|$) zo)D+=`HY~)QjF3x-f+6}3m9WS9nEgV5^AnXog#X&b6;)CmP@Ycrm1CHYh!k@?)ptD zW7Ei-)VHDbHkFu*ilg&YzSA30opzjcUHIT8K7&$oZKjZM!@84;B6Ln|{B$doVseec z&cZ>w7=W(CUAy|Tkywa-#Q}a386|F&no7BYw>HJ*m_EULw=e}V)= z#c=&J?PBwHb|mSti;$ULdtUr(7P* zo{brD4&rmMgP9e%ICv(o;zF)S3287g@)+XB_D5YTxig_Whxf_j&zNaXjz?K%Wn)WL z4KDA|zZ~F{h~k{SHV8kaf6sD7saU48lz|y!_YfjaYkmCZOoiE;4u)x#bM7)-z3BK%pEH7kE15x)4gTi}&3;TbFqf-~6vHQk} zaR>^tBzqheCM%D>iiZQ$fvCfRrSf z<~o;&N?Lll3g4KOVkyWM*weUXw+-t7i_z8=`#WY>wkI$OxD)CnciOA4rTs@SOpIZGWId(=qu%#Sphut(c7WZkDk8k z!b$?-g1IIM;25xxM{vJYCEIvq)ztVK17+yDyKo#O&FwZi>^`asf&VT3*)bLyGWL83 zH50$IM4#>qFwNM_a$%O?0Z!jy6mhOvbLuijNLw+O>bd}(68hkK=P(jhQT#3j&GV~X zjxx()V^x)}dy4BL3exQ_dqX!icJo}2e_{P>6D|6ym~8 zFayQKl5FfuOU1K`&@oZ{FS_0_IuhvX(@n>=ZQHilNjkP|+Z`tr+jdg1ZQJQM9jiM| z`qqEmcjm5{x%X3juC-2`+WYL^ejW-8YNn6q!<%Hs&5SO)+8sCSNz&OaEyDwiKeMM6 z`Py4B9cl?PyY`AGMh})sBb|QOW`&D%%&bwe>{x0X2_=f6Q&6(nQqZXZ#NNA0@jDzv zoM@6^IJ6zsz}Di)Qmou>8=9|U$`Q%XJD+8uJ>HE+p&4%Zu0Gro$>{euR~pWorCGm+ zV5!f$w5ms9dF+WQ6moj(yFD=F-yfbcHg0xC{QZ2N<^lTok0chen-I0bk8ykO@rx<- zc*g;DNhh7jtcMV5O2^zs=l~|P-UW|k-wpnGTn_h$Y1PqG^lya(%QXDh+&G}?d@>r& zWMHJ-2+DkLf?2V7M4MI06tYBdoJ~$k8NEp+oT$}&ZMwR<0+!4)W;-z9(f4dSS}W1W zFcTjwwm{`!EhTu(6mJd#M(;H9h+6wd*W5eXyhql1CC!VigR}Hf3d<1>Dz5`Ib_j`} z;Tlu=u2ALjc8{c)!A=C%zx$){G%k=-AhxMnru-cOZ3D4@d}_j2znw7dsN|7nE>RIL zIAfe-5-ygFp0YayX1h>#dKJG&i&-Bi<)NqDJK<*#eMa|x+yW&ckv9eQhK)9DgNhud zY)PhVooCE4lpV)}ld%A`^#QW`@%!Qk`O!)8jRERu?7lzJTdK_#zbHfJFr=eY1y&F| z|3A}`pzI{#o+rhuo-4uWN9MdPSE5eXe#lbBP*aNiDFodVaJzE9nwoo9y%>PFi5SVv zMee=26Z-qgU!a`zX=Hn(iZ+)XO|CYq!nfn$a*HT;PZ6$;z@Xk6t&eZRX2cI6B-w5u zE)pk4{J~HR0Gl{qKDvDz*CYf*6Q*skTU2&{w@Dx@Xp%h>hRb?ECyTfg0xWn^BL4@r zUiJ@c44LoOuCTvdp2J7SU$2K*lzk9`s-LoUadmB4qE$irE|4W_4iFMGiN|c0k0(Df za)ZIrd%-4(YOMWEDi-Hg_-m))mJ)hC?XR6}iC9S}({JrM(m0vyXYP_DHdx~^Fg!al| zg7THa4thV!t&xE+{2CTmO}sGro~hT2X!bKszSPff8+V))SsN*Z!k%e{Mg2MHk9b 
z#V*(9Oj6}nfy~N3uud8)IuO3zY5%iNl_jKJMV1JZKwejFli)gIlpH3pa1d>}x9|Cd zNqzX?#~TCq#+D-$Z5D$5s$)Yc2QEqu6-<&KR~jy3Xb|-Zc5VS@09NqV?$V#CZrTcu1`G&p+9FFcDb%*K-*Hp(w>0ABBp%njopd3 zi-#?8)Lb3t00hg8Pq9j6!so$_PZf)%Jt`^#C$b&v4xy3Baem|b%=_gdcZFj_FwVNH zq%q*_C|Qq+~C1xIDAoO$eftH*1w=~}*Kp@`^@Q2RrqikD=JkuNWgxLQ|+*SZIH zcFC9@g~BFCmDXhjZh+xds zM@>6G4)4UMH9>*=5%*$vR0&>Jnc2sTZJe44Vwf#s0*G{S=PUW*uOU!aCY(5Hc=Bbb z`#Eqj9BB8M#BAPmKWnpurg{?H{{ywzMexplVquD3^F9tC-=u&-Q3=Cxs*)#AC)XK5 zO*uv5$hkO4cZ^oqNi0y_r=j_}D8<%set$;~IV&v;ui)^t4cd$e4p%IfjdC?DBtLXQ z2ijdj2IQ-AC!(Lm0@)Ulr`&4%28a z;I|{Lvg?ENO7wWr)vl*cjWeji3m<2W<}d3DayN?cpVN1{kgL*?)>pS9lvj{(gG6;GN7Dm##i$E=Y=!R zpZ^}ko_gi9pQ^OuuG=(ga;$#ZuVcTs;m;)=Eg(NG5av)%TK*b_<4+_JHW?F|S*Yw7 zvxrOdKIY|?h+<^5m`(O|&Z$Z36+(RtPv2i^*>))Xv%Cn}>oRnB6Y=ll0YC5c_la}} zwBHd#3HQv<=Q`{t;XJ18qxw<*4JO%)51xx*!s@3~(Gum&L{^Lw0S5`Oca8#Ok)tJL zOei5QCF7_eq9R+g(F5KC>C~r!B_T&0|E~y_<9B9dmm0wnt}M$_pjzg!G#?Lbo@fbM zW(O?gAqEu{7a3LXL<|wD_QajbfIS4eF}wDc=IaUp?_PHwD-+JE=_SocJ;pS_YxRds z2YSEy-{8afIT~A$%L<6ArANK1tyQ(^x65LAGAYLD$+B-7kQ%D=2J%&=3PwO?MmdGr zKij75g{@V)x<;<@YH7f01a}%I3cbsU4iqO-RR{oT))&K2%Af+pxUIO&P z)^+xg07JtDiy4`j^3O=CE55j9eMQM4Xr$8O5y)QL(_sA-DqBP z$z7qB$QfYaod7%+WGKc&D?I;FMjcb;ABoY)(-5aGqFIu`);ZZ69OO6%QOKN9IFwmZ z1Kh5F@{Gr;?!&_?h+~D;AVAg`>%qQ_`l3@RvCoy6$MAXn>b{~B9!>fYdNQx0Uqc+V zC@xrvnRj>znf$Q%0v`#*JRv3&RisRm0ddl33`!6|a)dch0!bLD$^{4{WIEQ`A1{w1z$+1NoH)gg?OhX;>3GRnGs~{qe>y zh!NcI%4(B~V9NDKRG&RWFsG=SC~*PVm!*K>_fpu!7O4>g8CG!NCbbbIemCAi7)fU( zMzs+zeA*5P7pmz=_cjQ8_Lh7Mgej>olOeExCaLRa>U(U<$aczA8c)*Za7Nc`WcwyR zH0LhO8in84Q=9Teo{+hFHE{pHbaLgMn?i!1q@0BWayz>Tl<=1fROSVYSmFq?Wg0bw zSoi}l5_pYgMt(kNk*)@(H$+#Nkv$2LUl+5$*4X7l0z0Qiw9yS9BRemIC1HX{P?UPZ z)e2<(g0XxFK@5~`-*W9ur~2NDbsdx-!5E&8Ug7tJTP>mJv3Kg-*zO+v;ifc2$s>I78M7zdH4Pzd&J&3RA0^#eOp2*iNag5EML1JvGJrG=7M$Nb!e z9Qdg+)L>T2or6!nAM(=>XyE2_UgT4W+Bx9h;C#rdA7RYGdLqL-N0#{?vCS|i0#J8c zbfCKw%lm=g_K3;@(+T?EI$#)JADPX*!1GF4n!`@&|C2S*tSAC*%mH-(liC6)Eh9Sv zz}Y!#*ul++3wCtf7H^)Yyq458^ZvJf?rS_A`Gfch=P~`P;{RB_MH+mY+7k^(vOa~{ 
z89;UI*d**jmJq*lz1_mrg5wXVg>K=;RS~h9aweb!K6_^JfIX z!aB1gu8Cx{EnEfO*`2mu|LiiBWGf#zw*HBTM7|E_MCRiAdE8S*4}&tUW@>F7*UkHl zB8-fJenWpj;Ku+7F>3Y0o%u{(Yz0J?(XY%f4}tsgzZDPvz?OsN7ziJo@*zN*MSQO| zGac^W7;O2tVwXj%nJqu{RU20pe#C|>HZjliiiIlYa@v=y=CEsUS9N%@Mr4cZmLlbTqyd(&`O?0B)7D zosAS%?)$>*M(lx~JzjEZakJS%>?LQIw)%>D-$@h4?i=~*3*T2#0l2wxzGfs%^ zFK=0PKkfGtM;?uA-amqgp}Tsgyvn=`va#y*B*NLSQm!I8YRMxSqiU=3W{Us=Z4s#4L=8-l^MpRSo^RSGQN*3%7%(gF$?O9!4q8 z<-J0Ge2nT6NCOLM_ZuIV9==veGi zSss`r`8dr|^FPa0*$mk;av47$d5JX}NCiojTv8T+@CMnP%p)5c4Lqr_pLl%By2$ z5T&Gh&>>F=Pi6szvdGk=VUs*dpbKMk2(nIm#z1My`p(P0!X|BcA3+iSz%Cjfl0M(o zt%A(PA_Gyl5niVso_nP|iRW8{@W%)&?4jm303@Ei1cK2TTapkQ1>y$O)ZcH&-`BMK z4@u?z26!WtJBUtm*SzP8ugf<-%RNzltl^Ak9a@bS1nED1&^rw@7yQ4QgmH7b-Doze zd-3MiO|QWqCz|M9NQ7vyVO!>1{n@e?eQcaiM&IA`+nnLi9^lzoEt2hN0k?RV%wl#! zz>0~a(f?krH@9avjVueje;NlJQ2grr2c~uXDS!Mqu(m#4&{H+;>|A?IH0zx}6YrML z-y5AGtd6~5(ngm)%ufFxX&MWI!I(OAGEjP`KQhw(pd&Kg*`JO3f02$rd%XWE%0PI(^I6b@-2i&q@S1CO-gw3_E38MJqC6wq z2(z3`swl#aN3XN40x!+gPeBvk(ykd|Sr@A@Z_toe&mW!IeKq}%+lG4{TQ8Uvlz0nJ zdH5khaofdj`I+Qav5&9w4{WlNNhr`Wy1X}6yo|#@X_7&{HXNr#u#1ig`IL%3ai=%E zav%%0NZ@l(kxW-Z0XjoDhQk$zP;{NnWYt(tPS{fs2D+Ni`htA?#Xfjb15`i(XcdSS zuNuo930a3dKc6OTDa5jToP5D<7IQ{!6#id{Ct2`o_V%e6gwI&!+AXk)WicD4o3#K` zKRcQn8VW!frTZqk44k{n_jk0zyGQp!a^KgH;d|)y{8)L>BRpwMq>OCCSG99?OGlOj zL`%Gp9#s28N*KTUI4sEEXr0O--~?c)hAk--+>QNJ<()by=yK^F*bxlj?M;C&*Ie6| zzHj%dW#k=a&aFC+I}38MiFJj;_wSv06dc6%KY1ar2!v^p50f&Ui!KvX1V-7tO9AeZ z!z%%qO-ta+Q2dm;)29%1F4y%9Art8n-r7|3ah2m*7IDNCqzHaIOPOm+eR1p_HrVYv zUKHj|j46nB7#23BqgxsHqGlmad~)wJ4o(_S6eu$UN8uC?mO~_;+!j+U&O8perU4c^ zv({cQr)-QqIjwFS1H&yq?)dlu(b7AQC1W+_2masWv$iZdX?Rf_r~(AuoITndgVbgX zqd1tO+t%&REdeB-A&V+QPh)|ZfoHd$zK3`BPCh0 zd7X6cQ$-*Tyr(BIdpANH3pC1L)6@>x>3-B}&{BVYT_UdCg}%&BBO?Fp?=2b}C16Ja zP8Ei)V^2KoZQC=aW3wi@uC;VN9sQAK6JCKU-Nu$K1dzjIJUpU0L@LbU=0jE_c)&6k zX31(4A2H1P+nd9g+?T#~+3oeG!X9+$j<^DqN_#*M>FOXkzT0tpv8G4bel6`4OY%)P zNaUw#Isnwkw?{a8u9B%FP6Q(K>190EcKyAlc=@TYk#zjKSZs*`OvzZ$-+ROF0=@IX 
zeD}LiA)|hZV}}nnW8?c=Vi_~AJ=xNB$YK3KCZdH8bc_nM>m8DtvpQ4$d(`hjtyuN{`c0Tn6z<2k)SZ_8j`wjCTd>TdzMpTtm6tkP32 zR-CiERWlU&vQm~xXZKRYK>ZxNjpvs?h!Gr&H>ie|b51Gd-=EP2Z-0j+S51cCAh`WV zdlm~T|Jq1f*V`mAUQ@anVrG3BJF@ol8gl(xveY&N$M0GAj2yX_C3{@WKo^IOZ_jb7 zW%!OaVOjyx<&Bz!<|U+0m&`FbP?cDDxQSo6&gbG96mFqE zL2M(22P$(&;$&w35OkKjPNQNUyxw_#wcSMjxF>`MXTK0tynN6puVG88me4vrHmaiH zFIhr^9a}_+7V+|t(c<&LF@|A?h92WTvEA#?G>Je&->|q&A{5FIK(o(1ch|{zL(Z6S z<5aJTBDGgZC5eQ6CWT6mpMY>xY@-#@!YM4ZIn_#)KSnt$`ekH#CN(zEN|^E&CW%(a z)?a+1FUJs@ zf@^kkq<{ru@J3*o&T>2b`m%2ggR9~On}g}?6q@UT@ml?afc>WK`TI>mR+W*=MJ$ML z?(`QrU9$kB4hm3qM@hLIG3qt?glG`SOMpEU{y=K(y1#Cgc^AhM_NvO_I}Tps^k41i zLSX}czz{TLdiShmO>2)$?_ZCSH0X9Mj8y?XJ;l$o=)2u&=x7pm>)%YaDU4hzP0nCD zLgPs=T&1YhZk$y0Q6c5zN>9maH;L7Z$f1tB!ZV;SF_=i!T2NC*twwiJhPKd)J$AEb zHennH+{fBKPgG01x($(deH3h0tkw|ke{yfjJ;cuTr*`r{ND$X^3yi9L&W|DfiACWa z=@wR7I%Ja^XNa6dk}7r2;b-qrG|$&7m^wTxKh3J$x5jbUXyNUx>BXgu(E@06cXK*g z8Te{!i1PmMb7}pe$;mWJE{z+^%_5^sAdVB@T*Q`Mz@iH)h?84Db-*rJ;RG+cfz;U# zHiwOO{Xq3%bSb>-J_271N@oTJ>K+L0%!fdod#|{M##Gf={v4_CmDesyRcEpnpIke> zDO+2V)SRB8tjo`_xmwnoURL|1)V}G_ZXiE96DSN#2}!9*k1A=&f``g2LJ&Vb21JAw zhUB1>j)XSS0%=ZYxOu)dS@60FS`mCo18a)q2azj|r)HZ$bfi;WO)hQuF$x=!at#7c# z50^MNmb`u&$JQ>cLL$yvr(W&4(d@*Ype<|k`#&m&v3v&-S>{UrhV3tD_CyZJeMhg- z?bX`P@B_BaXUzu$pB1djXH(dA=dFA6%&Ind1KUBW#SE<b zL5LYv>a;Fi&W!oo5hAQ#oxQWqPBdKxW+Hf?1~=tATtXM-kgO=9!*N6>4_nkj80kN- zE%oBN&?9>D`gaad1ny(R?I5l zO6O(l*7^SAvS%i!b{!Nd_9M!D@BgYk&sg189`E~op7SI%R}gFiYQ(6Imk)bamU~N< zS3TF*Quf*7)sscz)3PV2o>8O);RQ3Nm$D^+-V~e~tS3<1&K=Q`vPX7nnkT>4*5%FCr9o5xx^A@Tz3D!{=N(a1!HS!hcy_jZuarnlRC&ZuGa9rOTKPR3)HDpOKSQQ<#|m zxo9m~+!r|qOJ9?Cin0vZf&KM7LpfLKlOpQ>z!q{`o<$z!|ADbQeI-^LJUH>7RH!TW zDS-A98PN09GD12z)oogGFs(oItcO=u|ng0a_}w8){yd(v)1~z;s8Iz=e3zgO*{5CpB7e zJeK>W=agY`i}I?VmEa$?70;QI^EIzoejTVZ3;lFcAl*u?`#6(0MzZV=``zZ{d9yph zFq}AT>(SXtaZ!y*&D&!uLNq|0v*f*+xxw<$5W#uXE+~3)Bm7Z92Ld%3f&Q`<-$QV8 zYnJA_yC|VB0 z-^5ftl9s$8l^j!XRqvzVGw$p9?R}8j4#OaEQ|D#JC${jKsrMWwFat^%*TXgy1wATU zy04!)V}g*%?Rz0Kbxph0R&!c_z8}xL?w<^Lw$s>qbyJcIW$KS;`$&l0fM*%SUqEK} 
z4+sU3R~~8(X#t`Y!_mep#pjw$vD|zte8%sTyWtVNi*laAq|9gXOPed^!05gNPHLvQ z@z4}6BR6=czB465{St9n_Tl!7=6lGu?|zl<+on#FKbM?^y3L=AhJBs_r*9rCdj+zO zc`#n_mwh~$66_`P{U}oRq;zxHu;jm`PynW4{bkm0nymNPh6!tY=TOD_c0DfSP~`a@ z$L?-sCMgU|j@UXPEhtg77y)j8Q44DHEJaluPa#eSqLk@nIYQD!bFU**U{6y~X-FVzi2c z)QVWXm=fiuv69sYcmjY~wKU+sZSGqspU$`LDAzw6<1%HLh+S;F@xKE_+9Xhf$rX{0 znrBv75Nw>Cwv&lmtZTce31cS4b+3Uuf^Zb@+GLWnCG27O96FX4N?wG>L7TH$>%G$> zGOWV$rX736qv$kYuu0LAv|_!{sfZT|qnW7Y3DbC~c_KtwUMS?R z%~7@gcO&-T{<80OLpI!XQpyXa$uW||D-E$;%Yn}**3p(gOU;A+5 zP+c~HEN6kvo+zK}Su#`}-Q`|=!xE~#ryIZ#%4(U1}M9;Y4E;4yGOXM$=GFz4MY z;vR-tHmUlOvA29Fk*`Wwd7fR_AbIsPr6(tG194b-m2u3~Qjp}6!A838K(oH?Y2g24 z!YdB@`%N*9ZeB3}B2&wS_E^ANT&SVB`QZ4`SG>IU(UY>{a7O`H-KdzLaf#24t zk^B(9`ha%{A?_CK;5nz`37XK-jza_e7|`O3BY+gNDeLPE*DjM}qSd~cn-ZSql=m+J z4ZRyrx4OCyQC+>76ALvXK}Y$c1wQt#A5eP#d6CfMdmnh&^TU4CI%zn-NPtel)tXYM zxHn+q=^HwO2~d3`r3*jeCz_NV+uFLE zHxX`~3q-vf(tXp$vyU*awj~3(1jk^k?1{}fu(T(pj=h!!tBlOT?lkYz!K@^5;E3-f zDM!OHiH8nUsu_U^IsL6MFDtGu#4aN1B=ss+C>A2)9eN=jM*sh}X;)i^y=TOTJ2OIo zGvXQ6Q7`tmcU`CKpK!gOO)t`d;WAs>c12X%MjmrxveE)wr&J^Tts}Xqd86kHCK48m zy*I==z2)seEtOlKOH}N3ZXWTDM&*~g`>gz%$PfLq_n=BnV9P%+ko5h+^}Fxg^}KYq zxRpq^x@%Y`WEftjm7F`eIn*(^?+hPg&T3q1sebQQk22}O#WQrrJBpL_%oGkX*+ncr z(d9*&c2mZX12rBBgDw-wC z9K%kO1P*TsLHGms`_34R#q-lgH`{x48@(K^HTt(x`lf@{>U`)u#M*WG@ESJe*=E4z z9}rrhB^xGRuI5DIv3aiO+(_XjW*BZt&t2_{Q z)Yy0Ri*HWwKmW#N*K`j!moNd3!F67PS)R({v=h?3L0Bj$g=I}ge0F0zMb}9in-cQA zS#BdP*d}1R;v)8nmnMrwyPQhj~8)~mXzccEGXT!t7gJ<>@F zi?xLc!q%4xnWB=+?&u1}!0&z-r?G4{l{oB-i7op0ADDN?hwmH5=dwuF>7X;$+kYnp z{`=z$`H81E^9)ENHno+QGD{dY{y?Fs0zG+Dy=}s~(nYURnn3Eib`eyGZp$nx6k9Q* z|3;8vV>+|iHLdaF`HVw|_z-ZEulR0yXEa&~`h6FqbN*g0rgc&Vp~Ig$({9{mI&*e{ zfp~7R7Yp8i4Mm%&W3VtZH2s};KiP$pvw65oPm8I9fNZAi;#J5Ri#7105b?U;yG%Ux zSv-8b5FVd;_7OE5hG9T%fZ%ZW17OTtS%C5^V6UaFZ@oL*aja3 zvRt~{xlA6?+H;hs5Z{g>UyDz>ZwLl7Oo|%t^#QQeYpeBpUEr?ZnQ!}5fIS^N{OW8XWyZJ(F9 z)|gSGo_A`=^XQ5u(TsPG>}L6L61xLa+Y|z~TJ9~i?Mwwr4xau%ciVwk?Y%d?ZChR~ zfAhRzz2c+gMB`(qIZ$_4==)g2VdqsS^fK?YJ_PKE?KqLH(9Sdl(eUeq6=_I6d3NCM 
z)^Fr|5OHdf8)oG^B(RHrw1*N+&s1P&(jp=JUq=LqB_k9`(akel2y#bDQP07V^l2BT z4k(8VA{b-6Y=WxW45U%?wNt|XfvuvcUH)i(tFf6vGiQMXr?gUAe;E%PmS#Je{3gTN zaoi4xo-!FSj^kMz8bO$ZG6jZ*Pd-{U+2$`pbmc-bp{A=&yc^|^gu4d0gTY&6aq`4MC;j5;#i4-7Wh}O+no21< z)Z_vS*)iRmPM(6RqheZK?lInRaA~y#YYM9KkBBNe=Oi0_t{g9xb<&4K9LM3+poR_+Z0%bMKkp!?d&_yPZ;C)|?RWbDaE-+L zdQ$#pnob^UFPbQlf`+zH(@ffpOn6uJ!qCDEg<5{c(=Qk_6 zNzJFbH{RR82Z=&c_9*dj{1*awFed=sjROL6AiN~mjIGqLxF{qzF=>h>^C#_Vd<~DK z+=8ifvyOsRZ}Ok!4b6A&Hx!aFR9tDU=;DJ~En-QE@7Bv+j;X%qJ%Q9%0I&&u-(K71 zb~t3>O|Co!-Rc89M-BP4_sjY9hcsc^*BS=Vzbrr41lc_UIT9Wn)hDb^DjThSn2!a{ zfnnpjlC#TJ({0_qIFC)#TCleBwWt?QAW?qiRD_JwPB7Q}*BL%buop%QyV`@De!ERx zCa)Ks&AiZyv($v~FI;Mup%ya;V^AGIFZ*Hr5Km0!=7qh~s-&yuRSXhB&EUUQ(?kZ^ zO}&OyXF2;92>_I}4~VMs4sx~ZqgP@1ejX0u#zHHG4n~ToiWupGQ8fE7?dzhtKJKuo z;EzM@&G8em79F5N_Yhgy;xU$bHNFhXejXPX!d2CI(u+yLgS5!{945F)FxTlH8C={N`~fHTuEamhUJSzrb=PF)R`W# zx?}@~?Lf+<_YY^ne_$Yxl5fzo@^^&UX|5CJ#Fm_Ik$DqP2b(KFbjNtPV1Ct{Q1ltY z3B#kKu?0#9jLw>cx&$;gW;pfO_^aCu?ig0$aHJ>n;CDPydwBw5dY@HUHQ!y|H(@)F zZ4yvWWoJ|`>?uhRWXy}G7vXTrGUSG(a>vV60nndwfx8n!la1eXxGxjEPY8a$-V#1o zty~!*ytEd4axAAa--plo7zzA&YC96 zY**|ou|i7;a(l4y^oJ63?#jxhdL)+jug^b)3@VFCVBqm5rQ-Q%FuLOpQfc_E%5`VF zdEB-Rd9jrA6Q_;R!cGt@-!WRy-6A<@sa2yw_nWX?s3058KO@_z`Movn21Awcq7oXeM9~gfrGn z61uIXi;zJpfZ`a2fCHav%)29$Iz5e_PY^se68_^D!|J3uw8|pkrlGtWA<-kO&2W%R zrmkL{;Cgys@%%zn=n}d}Vw3XRQhD=9)+MF-2Jg%A0r+oxQMYP74GjB%lbl^TQ8ccw z-(I1|4{99)?qJ-dmlqi0lPS>nYGt>%Y7L@|)xackTRjKDtZ21+>)^Os!Ob=V9n2EB zt~S$lZ>d`^vzI$zF7P3K6!X=3vdgU!j8>(UZTblh#w-6~cg)tp5z&sL1pB5OOaHd< z`%p38Y*GIpFH(46O^g-wRrfKV-O?x%=igS|7e_O9{|V^wVx)eYXeu`htulwQO-{iio(iQb@#JZ<4#!k4tNZk2-IELB$(o@`TfGG=~Q^|}quQXSPqUZ7996+0y zwJPnHX@&@c_FYI3s{K~{Dy`IZyW!sX@+gC==O)Vz2%Mas*Wg_gd_~|)BVZTxRGmrM zrq`6@UxJx&6uHS24OfHBIIQc*exJHV+{ldrkzkzCc*97Z)_~eMETy+m9c|#;$BPL& z8i<+^Ik7O*8R-wLVLm+kL9)Q+v6nS9Y1Wy(AZIg);W*7s>aSz#FIBeHzsjx%nXr<7p{t! 
zBzb?)AFFzfNv9!NQ+U`rcL1q>0`m5F@s8ie1+$(;_xod2L0JM3CY-8Wip#I|zZzT@ z$17uIe36#M$&q}+4tw2;vrFc(#teH*oLY?`gLz?zL{^ zdOf)OojI2L_Rj&#-Zuxk(Zfk3ZfO^zG+p~P!y{R%dKaraj;J$8-wkG?{1CKsuxFPv zbd7wiDJ)i$H%ETIk25o@Sn5x_!?4U>o?VSH{uCHfwHhRwb6Jt5`^G8>w@KhdN+f6D zs5F6yAOEv&DKi3+@$0}Q!E%1O)of_Sk#>Vn zCxxT_l5;9^4vK+@i7#<~BRF3GAWs;q89o%RZ1_w*Rf{U5u9T^7_;j%1-!W~#V|pH2 zrskP;-LbFb*RvN&PBbDoya~qa;1$xWOhx#BAD$U!;&UbV#7F-3b^KfYC_#Q)r@cpR z$SN^1@4}R)Moz}eFQAV+TW^_fFQ`X(>x~@@!Ukc}%YFZIZc%3>O0S)LRsS@wFDJp8s82Nn+Cp69M(k zjSQ&6@mTmG)tF{R;qeT9?9Y_r1=oB*5}Dt3T)%(M7kLLM*F8dZEOa#MD0(m!YX7`-7z=e(Nw3qpmpfhG)^4N|paGkGc($xeF<*~Y+Xd5%ih6^`~%p+@ev-n#q58f?yIqbW1 zvF`{qTPhao=-&O^SPM7^*VE>nxDxK@baq1a2!icapaamxu{>mLlOYkUR@a55v5zu zVWhGdiXx}M(jZSYIRiyq4#!KL>sRR%4F;-A?v}uCdW;VS4y(H{6?NJq;5bKVP1wcc zP(TWdMXHhB|DnKGHzB~cU^I9Kg|Y4JXSK8gc8<9(^{-_XvtDX z@haY{1X;#LxA`9T$sl`Exc}4}oUYTKJbygmd@I*-k0RlYkMOzznrnRPo$`wLtbYXj zu@-=H_aq^R7-gb$nDHQEJ5O4bOLey|QmdF2bh~64<3MB$6!=&iA5!5uiO%nLY{N`f zp=4*&0_9G=_fbS-`p675ZRghiG^YV>br^NIhyat_ch17E#P$~hx9u^MG*`ke1m)4 z5g*<~gPF1evG~Rb$CrWy1`YCU2$Z?XT=^zI#%b?)T`_ry3*&pf(rchK9|USlBFE%14g8dGTCs9Tf)w`;;5X zcX$8sx5Y!zd~VXhLmWU;&83XtY>GmT8@?0i%xYRQi{o}}!uPxh>i8yQtIR_}`-$Ni zClZ)3K(h+LIN4&+1a%miIPT8D(MgK!E?0CWsp7ds0UoW`JD@oq^Dd5vWNJYo>&ScK zI8K!GU5#S{5eEq~Sf2VDzN*x1B661Cq?-9rRjQkrXLvLz-Xx?{>n1kP0iH!Hdt!71eF^mh-JQ8{A zbs3A(Jj^)^$=H~n;#nWqgTcFhz7vLb5@$tfM-#4egj~7OC_ZcBtEHJ&Z|L^^9F zo)AjEddG*#Ga{N#D9MI1qwl0(VW-Db7Z~r**817Cl-2+t*zbLoBHZwwryvmh;XJ>m zrJnMfLceKU4MoFErX z7Ymt(f#s4&<|3%}dqz`NU*1Iv9Z#WnDGc~K(<)J=m+c2`3?i?yOYIMDk4XiJ`+<^# zOK)4a4Waz$&2bnC{`rJO(4$mQIEDdKsq7gL{6#GzcU{?B!iAvQKf$M45&yCK$HP?IQiDkW<)pz7_Dfm|RzRmF-lBUzM zy0@s#N&Hk`Q=m}mbXMgV&6y~klbjrwE;}RU5T5975y7+$K0L~=oFPa8(EO((TK%6L z(Rwz4%Rt1=;F5IbaemoRuVPHPa_|u^qxoaC9uGk}oW^>pzNL3DgEvuB6 zNw3Qp5huOigD{F%gQ*Mbg>U`e%NS1XqY&^0@yZ~huW)CxIJmy zBh(Cl00nr6vh$?epJHO9T}A`md%6_C418rN<93`5Yu@C+?0#)s`${FPkvh-M)t4Ou zk0_>q);|WOAm?)+{2O!IbdCHQ!+FJ^u3A5m5aFO3mt}5UlaH=fE=Tx{Sn@-k0ze>T zwtUIU-BW+l%Dsu-zyqTRI}V1`I@ZJl9Kn8q(wl;8KLmx4&<&euSVp-?P&VZE)1m2Z 
z%}kDMzMBF(hP*?f3F?gHKQKJKx6)YGUD<;yy2$3N(9&Sow))*lxEzyb0GSzrfpAYi z!X|%bnH{KqS}JItvHkD^Rq4Zd-8OpuCZH-E-@<88%1d*D-zDA>N{@;xa#Dsl z`$|6A$IsFU&@}D0j;rszY?^MWZ-+B`EHDGZWnmL(8}2-k>spel!SXlPAWt|KDQk+mLvVh=+n9#5S9&=vz=&K7wBF2oKS-C z9g;#OrzLQ&#ULdu0>d*Ic>%<#t6O8tvk6FlghaN9JaEVcwn;A4rH-X!?SyJT5Amfl z;$Xi!Wb!VU+mTZpZ) z=Kew%OI}?tdX%l+D1&r}vzCIdii zQ$sv*tPC$Uk?|mw1_nOXUr8%8vKLkUr||#h*A}1c;|$Pn#BQ`Ee*3xcLhh|>(m9kG zZ6=HOb_hY;G-Af;Uj)%7mjp)6wVc`E27PMHh}LTRAbQ8!{<8r-YW@D1+C~-7g?4>Q zqHNLV!O3Dm^*l143Ze4@9$HrmKIyw%8G)`P6D593oDu|ut!EdZSnJ;4umxjR+ol%~ zxQozZ_wGN}TX(tVSU)LK!fi3@nLQecJ;8=;4Tc5~H&3Pd8LZ*$_k(uin>A3i-DH=K z@U%9ru}3d9vz2lu1?Z``Z>e{bvGmLArN1SP1n9G_%MRUV8sI7J#*j)lvmZUQp31~J z=9cS>&jo3Go?UNF6R?{;`mw8xJ+laQ_N8oK-JLw9SK`Gl49=UGkn zsMZZ@z|=(IWJ$+eR9cEH>39={c$nQ+3RQO5J!f=8W<((~#F{Iwp0L+HMBJ%>J~rfP zJmSsg2MVKO=TnV8m!F~eRzqU$NptpnQaQB6%Nmyq3On6E7N)(S;6y32aEe`B(iUuR z3>~TQ?v{maf^U{|oMED=$-qUOm|*X_Cj@6qc8*+Q?hP@$DBVXfj=lh1g8xd0?O&>x zkfOUkiiLJD?5&pn4{vW36=&1EeL`@8yKCcajXRCIySoJ%cXxM(1b24}PUFFy;0{58 zhddw8JKuck|IVy6hjZ8mb>CgRx@zy*b?x7knQ9hLpR-j~Q(|IcB`lC&mLH#JHvC+| z(t%Tn_o6^F2 zvTpxfRbKmxcvy70HNQA|A>ql@aNUt00Ssh}hoQ2r3(c~5;1ParmQ0KgH~ve)&o3`{Icw*YQfty9L5Rz$q727!_x-Pq3y5Ftt*u5ZU)~E zsdZ|YzRv`0bGp|I)|0gAd2(C$)-RO)12rpg_4CU8Drq-S?r*|O_4BU8_oT6`05d_Te8m%Kwl%(+qL9V#_m6kq&6Jn6{<YBABG=E;F8(3;GNUH z)Nsadv9jDthr?ru58I$#JpxSv`$$|kWiJql(Uuf{%~liVos;PdvB%rwb7Y$4SSl{! 
zaUet@$M3IqB7L*=#0HNJ&6UCvBXxeTC*kXjL;z>R86{??BK`|rLancPI|Z#9m(Xn+ zec_hte-VEgmux&y$L~L)mv$jjvqT@pkEUGmd9tpTRWA@zpr_=q<9NG8owWiHv+p5M zz2Wxv9h*ly{#k;zn<6bh7{v3jlGyMu%hY%QCefL@_-VwKQQ7b`C9t+2+^5fYj1eWH zYGIdPbqUMY8X@n$4WUV<*-HfJosH^ntVuoITAB+LMt(+Cn(8BGzMZ*Jx!T%qRZgsG z3Y!Zfi|uW{e+DdJ#l#3M!4;Q0Z(Qrtz^;PyuW|w zh^Cq_^aH2UX4Khi9ENaf@Zf2)oqFQl-@!WhgC-Ks{q~~{j7J|qlyi`i24HP}RHqM5 zDdDc8A}9f2x<`<-Tgqij(hD`-9uf6ghUfnVE$~{aCC>3AaampIP4ImtFhv?>?%>Wo zt~}Ef>)0RIs4=ie`J!*x5_Y#RTl6(~PYLs}b-r!*PB|OZ{%Bu6{dT~0lJN!Qh0u8= z>K9jn_>#8nD-;|45S7gd7k{{>wj^D$;$iffp6$qb66ZG1)s;gn@scDJHNfsn0!>cD z|M9s0xXBFOb&d^fA0#V|BgiwjkQ}BM7ii9Tf&>}`rXX9tAizTN(bLvVCW!S&t&r8+ z)?-l2@Z!46SZmUO%*JD4uU1wT0Q(6W!zAl4Q}c zWKokTR{}f9jLG$wpc(6^r~CN0%HLGtpj8a?oBN;bTCTGKm-zN1c6b?=iECUj)PezF z94Rr4&vB_bF}5yo9{Nor%X_fN;AWIM-xmaQ+fa8S&&t|uxID(XL@cz?gB+p0yOV3Z z;c za^lT~tT(JYv*mNye;V&A8`98yHbKjm<0dX56;Y94c}IvTBn>o^*zD8gc9g_qF1>QK z%u=$cc!}aRQiPSJF(*u2ZkqF^qoF8QZLdkyZNEDG3V#3LS3rIh={QF~_y_9HWN}`l z=L{9*ZGSO;qF>n1F|)%IPki#ohsgWU01wtS0YpQsLueUYFBL#c@_KyHE_-`BUm;QQ z_`NTAIbOgz<`ebCPEBd^(hEemH=w?t=?~_F=zLeLm%8B4m!{c1k40C&%@_ufvL-5< z1w(vHU=_yI;qw8NLX&%VvQ<7+TISv%JY^Dk9MO5bVYI1IX9qf@cHwCj;1wb}hs9dY ztDtDLsn95>60Nk`m_tQQII4Q8ENTB6PmW$%l(Jn}9ptVztYSSzpNocRx|H|Cl1U|9 zy~bZser3!jaqaKM8c=xm>k9Ypz)=~yM!Zb4E3sUT8HSLz4$(L#lJ_%({SQvEOGU+Y z@!Lh%CG)D(`pWe@CPWOEEdxAiOxqcZC5xOpADsun{!N~q=YOE=+bUobJP}AIZdw&^ z`v_ykFftUs-RV;5ndvZ19@ORP_E41POe9yTU*Y=C$e{R$TVro%Q7)TPAL)2B4Z zO(3(;1_DF>3?B*~cxSo~nKJa!l{7JIl#oN*fR)b}C`Sf<&Tv9K7b7_ql?V^28IK)y zjm5&bur~-q+Dp|lcDC-PeYGTKS`M`n|K>*z3SR2I*^sVI@m-E}mNUN(lL4x2oOLvL z9U24Dq)}43-JHmg7>XH{Dp-p?f?wTWt!4&n(doi<&n{j z8+myZL_gb@K#pOZv(JT0DWp=V)UK9df_#&)f@E6>R~waQ&B~72b-&bzwc0+XDJf*A zPi6FSBD_G1dfxxfAJbn2E3y4?QOZSR@sOo<9G(3PH6?rh_ipEco(u}7Z4-g3P$6I> z(uzo}je(GyiUz1eU&1>lya|AC9VCJ{Vdc4Huex5Ir&Bmo$SfwV#k3-h)Wal>xNt$` z`zx_n%ABaTHEXojhlctqPnpd%eujpjk0s!A^~x#*2dlw15&jII+}+z%wTx&s={Q~E z7=K8Hh}@wBay@nY{IbSvhCrf3El52nPqW&o17i7Y 
zZ}Y~UDE4^8zp9XYV+$&j>zOHi51(t@RLeP-_8=Z`eMaWeGaLd?Q)PYd4vXS*71)YE_wa0W;w~Uysw>Z&J7wGbzNS>p^Jvs6Xb|jDJcf>>pb-_fqr3AtxhH1ELrr zEub{gL#^Kt;4D`2Ft2E&^vz-&lBzgr051vn51G}v-JV7R%MFP>$_SIPHYB8xZU(B8 z;|A6!8i{vl-mkwLX*&U&uwp_@7*kkEEdU533+H2#+Ex69?)7iRzPjW#c^aFV|DeRD z>)(b>|AA@=3&_9H))^(LNpsubn)6`=<$k@7XCP+bgAHcr(n(ZLgk;!G;6qtcmiZHi+Z_ zY=+^PRx~c7Gt)4!=5bN-RHHfAu**u6ViD=wFTp1U%$q6}PVMgwb}C_~{y4wg-v9Mb z_?ex0Q{sR>-y!CWh79k4d64djNw0^HBd3*=D-G2Mk;n2e~OE{+k z+~uZKV2a*PC49upFf&~ne3utOHc7nQoA&nvaC{z1e+bnWOl4Q6z2NYl+CWF8PthcU z?`1di`e=FLJL2ul^1Io%ZoaQaTP~16MNW`E@=kXS$~&_A3dBw_PJ&qTQx*9hK2yfs2T)g^G?aY!wUvKIFU~mTekmE>>@ej2&|?k?)6M9q^)kOTUfxMlMiS>9$o0UU znGYW}o>5yXo&Vw2R^W9{RnbRh<^)O+vO3<3fMkh2CN`~Q9I$p2m(?`P`kTt`bVjxYZJWIVsS5)S6+0HhtP3~E| z%*O)kx|kixabL617Z$$3&Amr&w{Dy@Xc&c$5n2$5Thx75yM?v7qc_`fBDW%E76D_{ zNNq*5maP^`i-%GRT95Pb>AJWhl8@7f zg_<|fMK{asl|(q%GqY!|*4dB9r+~MEk3E@hH_muLAeP>7_X63gWAjr#X&x?=@qr`% zqvGAR^<(i!Npeijf1oHTV%u&Wi8H^&%=1IioTOM`{A%jMT zePyqyYUCl-v-BCvsy064J2I-3znxIKScy+|A3M=5`F~fS_|SADCp(cQ5aj)+qA^C5 zYJnDE=$KMsC*c{UD71#L7@St;j`PNk#o_EAn^H)F)Ow?JYQ0fyD#OnJW|O&U0Yv6} z2HV4A*~Q!Y;X*_p#xU(+covR09MN|YS-;j5`-ix($;$019VGhL(<)#45C3HC3={-@ z-v&e|I)*&nF%{KOJEHG+`|Lnjd;TNnEmwZWW~*=gIsni^|Ac?8`5A=44h;7RKm~YvLrDQm(roW0W2Y0CEAKTTWoU!1pAwP0l zXUDg4NOiF5=ZV86$UY!?XFIn-B^=Xy6@ziK`^EF#zv~bgGyPktqep5nX4RihwuFDc zCdk?0q<}cN0UU|P($Jz$Mmlj!@gj)u-j;Ixlij$cYnr!8$}R(FddB7N0Bowl93_3a0}6w z`zHFtQ6C~+R@a#*^ex$$q&O86LF*6w!z;ziEiyA3S=VXG_w*#Xq+y&tO~lUltCJkT zZ9w+&B}^lnIrC5SIIL;!Ow^R0C zTLR5C`=k5J%N$;#gYmfH&zDUS%jc~Tz8%krF>qSqb%2!Q zo~o?ZZ{5vG++bB*TGp|UadPy=bw#c7S}}OwW<2pdHTk7dB;SJUKol z_6e4M&y6NpVBALCalmx&wA1(fB7(Jzo2X4(Yo))niTk?y!}jw1*Xrxm8&OBl6HUl> zm93_KpiBaS9V)E#SIO54oVcaZ*}L#>^%Tt&CmPQR1qx!V`Ohu`1TmKPXca9b)^f1^ zp0iUaii?eoLr#liapx1w$IR_{!dx|<4XKU-?L>Z ztj@?_&IcQM0kdg+Ad(B%Yc4K4>{|PQQ147l>^?pTL!jZFR2>>whLYnM#qbTbkJGxh zx*~homs>W5E9=Uh%U#1x=;2u9mkb_NO(2-}rpM&OjS`e4luYS;YqA7TM&b1l>k2c% zg;%pbLyl*0Jy}ZQv9l__#<)Ipwult?SbFqy3Zf@IOyqN!-MB|FWJ$20W*=uC^OC{^ 
zang399Hh|EQ0fWTOS#&KBOt}Yx9lm6^3j&KUSNuEuF!WiYWpm|J8!5$7w0jBxWWg- zk1P8BfJ!=M_QHV-4i!A`QdgU*o|zWc>0;F&B(7^6Sf;va*Xu}BYw@tPe+xSS8sDD6 zf_WG5LSs|l8KKHLvlrhYb4hMA-gI*&+|np_wr1|=`AQ_dLD(62W;7nN4vbp^S041c zKnAa)y%aEH#@nwN!L&mSX|cUIrggB%txt$m!9U7s}(TF3l)ojcXNjTMC_Lt%QJJnOD;612eKL%>^5YP@JIZMlA*%8Y7P z;wi-N7BZmwLL6n_I_c{7@V7%`tnTOggz1;XCpyWR>z{3O?aT%Okk`vQ145=a1qv=Y z^FPHx?1oYBj_L~v(kl2i@O)E z5pm$~15%mW{`q4wjCGg+RB$3&?0(tp;lkcS8$Qk6^-&j?-aDG9VBT|DUf|Pcm*(bM zG3!aF5r_Xt+3V|F@Wp0hUwonijsQo2OQBRg(4|nsHQ(ZfQtfOygE*~=URA=yKZd^# ziR-wq`O?9SC8CA$+`ifNL0!Jhx>G%DN)vN1;K#9f=`bz{t3qreCBYuua6zl7zaPnm z-g$M9j1L4n(re-5j?Plnt?Xu*X!IR4xO883Ppc#6wVT`Us@l}^H&e=6sl{)Um#AZ~ z6jz*zqtv5%9*trgI{-rW>l3sM)iSv>EmJx$mN;}bIcS{)sH&_zJGxb)HGaFp;9RRP z&NU%U2gS#wNFR#x)L_JO!#>MYVxd`z)4BqTl4zO3wC8k%?W>iEy2WYc)km4z&k?pf zyrC|2YOj0D?}E!pz_vSa_F_ml?$0M7vmpZHPzSf8L%A3ryhFH5Jn^1LfQ*Q9*ccmz zUQ4>V5z}Qyw|5zXyf1?ah~Y*({S~KT<$q^CM;?MWca9hL^(>K0wP~HlE;EEcM&X+H z;7BwWSY~Q+eE$zL^{8dI4Iwss-&WV!kn3n}m$+(xG!H|uBI+sWkp@0|Tj zoIvSFadh*m+L2~V>H9$k*cN?+!s8>T4W60_6m=7)L=a+BTCSZ$ z@V`F3B%s<`cxk|KdpJF^MzrFZr zu`!3X-scHCycW%&Jrr-mf|un?AEad_YUx)?vodC6no_AMo*6{r0Y6A2jI#HN_U z`=n_{XBHX6GPM^Ad(giK4v+b2UT?9-{fh>b)1&hwE)qlPQGR#|hD@>8b_?RS$NeOF3K+wgc!Q%2t{Y2{Ey1Woe(ltXi*+2OS`GCI@zjCb&N#KpUy(M?oT&yv{qA1L)L*DRD4 zTf#1Jg(I4fBMCDqN_aGIPxgWOM{~^ZlN1>b#{4ofdx-ajL{Gg_edY1<(!l`xNb0-O7lys3=wCT z7vyX6kR``{7+M%rK^7M?LENW!JeSc;kENnOxAt;JX{3!S_mTAvnB?K7!)#H?(d2+n zzLXJC?dAgpQBdU4TX0=?=g>@8?DX-5YjAmAJGnlQBN^dj+5}|&R)L#Z&Y{4LS46Ray&Vb29mqU-fxv?`goSJ$8RvcPH-ljK@}T z_Z{gHcd;@o4)ZzPrC3f%F!#c*@m?VF7d;j_BK3RPN)1|q*V2;)ctst zHDFuJ)>iOUN!7F$4rLSv(3B~D5VOg(t~`8kYRqESB6>p0`bKwC21|2LP*12o-r0QF zt|B7}>KUsxjIFA1iYu{N zr7es7fatzC*C|jN-8~Ptqy&2|mP_p?=H}rXwU*~vm3wM)?;*@AOTDvwP9e)_|F+b; zAOca~wY+-xq5wbD{aD)2NjvSCQ!Y?ceBk#f_s~-5Vx&-*EGWiMJXcrsuPGM`&hLEM z&xtU6x?ncI0$vHCaUR%t1R07?ElnLP4i(BS7cDy{4t|w z(4n=tI%hPb^mRE8SgBilZNzURHSfHQ^GnO>OB45H_gh_i8|dlc-8mq^;QL$Zz|V?n z6)#=Vdi(E=+0LNMhq}v3$!9=pN`6-C9s6F$owi`$yShAR3!)3dLG|fSXj-;InwmCP 
z76rvK&)nQlz=86Uzu3SiV(`;Gc|VR;TvG)62W~of0X5V5kKdz$@%jON9f@vUWc>!8 zoD@mB5%-F?hiy_r&uzld?N}FDwvw&=V#2GYFovT z1uSr}v6fv=<^k0iPh&XIK3XP>Hpf4+)&&y(>hAr$g<7ibT0^TW+g79Xa1TwMhVD!& z<$!KZKr6uLVgULDUUJpB6*DgowcMV}s*kf_tc-Oq0cWl&T-VtX8vCdO3y}BrjZ+yi zm{}Z|do=zry2YF$W7;zD8dC$u|3v zQjH%GO^g0$2oN@$r-@dLZaocq@J_OGkqf!Sutn9`>Kk$`OBYR4lhz&cFAy&y_ZQ!l zpI~k6LP5;)n%Yj3a(=$BhnMmu!gQgvE@pKtl9fNQMB*gL;l42#C!g@lu*GkNihs5{ zKZHFxlpd9oY8b&Wt4}$9_05cnW+7iN68z+?hY=-VKh}@PjnhL%t$@hPi#4K`f~V-V zH19nShY(#me?ezkdQ_91A?|*uC3sb#ar<91#Bfmnl=NW$?XSwU8}o|j3h42298&_f z<|GYFl3BifNlFGkZTkAYlHg!G#l5UmI_aU|#6YFHN>1B7_v8CQhNkrk)bqb66J1ZH zN9KEcHUy*F5V{3w)U3dn#<*oEdqr%#`JRr*7*eYDWQ>9AR?k0Yp^lP`wnqm`s;cbZ zBZv^)0IAzL{M00I1aq&YNXld|ehL-;{?`c7R2zR}X#t!RKg>=)tnu+t8uZXa_dU7@ z!m+t7^Vm}aYQMm54SxOXWFLL@JJRys(dXZ-i6n2{e@4;129nDxKL;jNoUUfFoDR1b zS3U_zn~nr{nW_Rr@#~U(Q#kF$)nRUUUuB@b6gV%BK4QUdxWP$ zZ|(0DF}I1PyqKTXk#MYphGez%7?BwWaBf23_sqzDG=m9Yx*^1{Bx1INf+<={75|xY z(8rXEP({dB%t}QOemn=FN!yb+7k#J6v?e%J&C?2HrbkFi^@80eqN8kXqXR5T7d5&Y28wj3=|1WU>q?i3b@ zegyO@s%b2$iw2lq{-jX(@)~RD1!83y3Bu6zVtSA-BzFVOv!AK6qOp;e z+92~`p(WY)STE0q`Np?-oMHK$AZ`<2pAq~EasZtt_R8_KsZVraZk^Y<>0gdaX%B93!{VICsx7&j1Rt7ZLcXPg*Y$G{HmIxFSmDGU~ptz z-z&jl?a8LaHd`@d%coy+l*3>5ghUCKTmj2ho#@phv)D!+y6J(W#nQ}$c`c`@p*tia z5O?>Eb!fd8-2Mu8M7Ww|DoBa5f>>%GWJdT`T*=z|CebBn0B-a+byG_s{%dH=;2rak zb#oZK@tI3yjxD<@O^zW;)90yPqQRro|68(-_8~v{pNIB;KLs(JOh+_(E9Q-rL?+tr zeRv|~Q_ZL0DPi^vcO;Q^_I(sS>_hEY_lC}OW9ipFGEt|GEeYne^91ns`!?nqPaEBB z(s0oftKW{mr0x{E3`@4TVI1rv2k}lfp=AZBLdm`D(Ma~2Nqti5mTd?}maxs$gzuuf zK=svoo2ZlWoXwX~c9Zm=W;^`9eQbZ(U1$HE-5(fUd^sAg`JQPT#AZ??#Lhj=XssBN za$7iMO+t*A3)z+pH5cdL+xj7dy$zT*OcqO(L$XvBK{9hF>OzUJ?+Ro$hzvDMp)Vnh z5qKi|{zu^fUDmUR$O#Z7WF;KV5hV}Iv53|z7DB@&!%PcJHkBslr@iJ>8l>il7PM~? 
zH^ot_Jdtt2c`o}UljuT{t^f4zHOR@L?4j$aH;j6h-R`_?tE0g|Dgz+C1S1$K0&iU= ze42<|$+ILPj${V6K!oCD5|s!j2ijuJ)i(0rk9+ugnzUY5B^n<{@>kDAU^6jL8C5v+ z@VT*SmM2&0js$Q;W{710pAwZh8owbF*2H#=O96EgoQh+v7Pi;GIKQ#vD4y5u;D^An zqCAI{c3qw~WJznzCEU7V8{f<)4v9fAS1IH$MM9*iU8+`rCq&lP{E^&@s?~m&-|WAR zK+hn*R;_I$zZkM8rkG+Vn`F%s8?~_ zO8V=q_BEDm%O?G&1J^%p5d+LZ1W@9IcuxgDwrx$2wUR^74ex|kgq=dG!O`a<#=MnC)y6+ z$$@+t`S#{io7$xrpT<6aI_)5A-qH*+hR9}N{YfGcpUGY78{I69QV)>Xfc0$hX)Tq( zxNK$9SnevT${%Jk5jn}6dD2+{HcbqWHR`5cplkR>>(gK1qi+He z7k=;_Ry83Wad50Ii=RuSD!Pz8X!Sc}Ii(c@*eNvJs2tVx)ItJ!h3theT*Mv$1GX+< z$=xRnr_Xu2$%+52wL;P~pqKtl)k=!vMB^GS*2D+j?G5o*af+RuaP@0}P#}L(dngUv zCD1AcC65QQE9- zzIREA8)RGgjKuM2gw>KIZbA8UzIfSjFs~xc5B_RJec-g7|JgAr?Y5jMp@wsc5Dhh+ z3bEH#*iKtAGdMzeA@J`-t@)@%syYlj?d{Q{iG}oPckwK1`-NKb>!!ZOz;~1D==a=> zwCC}?xuYzYy3UhZbO2xDu{7LwOIbNur5H4N;RSRA43gW=q+Ed$7gjiqXKCe=w0kY3 zxN2}e)JId*e_yws-*sHP&y%+ps0R`J;s4XT>(RlUxw-UL(~hb@Q#cl}tP9nvdF-Ric(SGOFn`1bJ$E zbmE;d4&K$Vs4BVkWh=Ye9f$>yyB8&Kx2GGdY(=)sN=al{AY{V4HVv|vd^fgV8cxaw z6mEH(ERR*bu#j;!m=}D;QTTtbCDV&8ipJ)u{+P5scQB9i;3Ql-wqRw*(16%$!5&f0 zT6RL>_gGEqA;9MiXM5EfL!n8cAKw_v3T3y5?%Jlz<;2h zE{vSPko@iLXP3!c;LoMnUlrd47^^b@nP=zbu~h+8<+*;uQ4MMezv@_;rH?%vnTgw>qSBvs5i6w6uEQxxAmg6r|EX`9lT3OMgpf?hfhisr(-29C#8Rs0E zau6tu__dEf*iB}KIrV47?ZwrVy^aLVch6*4|3E#HEsHh^oBji}y9+%x zQk&|GE5>hF=+mj*w^tK86L}b8lBqv`%#Gu&W=)EGm~srK9@RanjILIccWu(?M%k$; znSy+{qE2^_Q2v2}V^zT|Rf9iFi!&_U_h&UYXSwup2*D$_vBL@HH&2}<_xo(ZcR5?y z>+Ab{rJys>QJP`rhPWryW%sD8F`L=bA$FS@E_r@YZMvDmlVHf)z2|t}{_&AnmCizS ziBUW^@t22zqxmi?>i>Xtk7Yr;4oWX!eho0)xgox#|1Hw~E|bRsZV1SJZnY0G-xLIF zq;nZ7te&li+0Gc|b5g-_Lzrh(2fJaCGxhV+(>w;hYf1u{$(qdq4z{#%mDNCX59(9H z^!>&?(;qd5-`}5_*5`KXbKJgj*hGjFN`jd=b&`P{+*Z1V-p_XnLe6*iII*0$sFrQ` z+Cr}q6fL2}YsuYHn7+HdfgRdyS^!*e5Q`rF^i@#VyY*ERg87z8YMh*_V-d`EmwN0; z_6Bi@8`bjp1hiZbUQ)K7Z43sIi*3A9Pmi3Do%rFwKrB0LLBd2MJHg1kh6?JJSY?B| zvFV_+KOD2XPs#e5kDe#!gPNb8pE8Rs;u`X)SCnfdJ#64KC!oy8gwhquv3PlFgQzUS zYRB5rm63Uxv=E|t>q2F?O%XvcXlN9+$J|{Ptp(|Uq?;cQ`m~-aA@3s;Ds3TVo$};EE3fpD(>h{tMk$39U)no-q=fNVVT;4sXb>)P 
zYdNkJzUjWt@@8W(9=G7O-ULESndMgdg>-^&GJUjsKV?l@D{9qUj$k7DI}fZ zE8`U#JUrEqj>^XfN!HuLZ)~1V`zWOAeOB&j9oWCRb_At%e0OX!?(mGLHJ#cKcd%Js z&liPLWjmOK{r=qIFHL0h57hh{)61F-nl5ktOoc#3l)H(3lXCU08e7=@r8Me~uX4Cl zyH#MeX#yJBbjY-*n}>Z@jAHG47o~}gQVVQRhD-E{qAu-vu`t(%c!UIjv-6(lc{|cS zjs5#g5&gYppw4PX5*e!bRRgVfi(?Co3cIyw>^OD=NsdHhVmc*HB)Z-K7_UD$7#U>a@0*Z42lf`!`%mmJ{>n($Donzb3a7^^;9clwy-*7DBv-S&JIo=%V0t zpF;@|wr{zeIFMz6L*$fm|K&b8zCvL9sM8~{jM26oUFYVPUc6FT=2(`|SbyrlU%Cmi zQsaaO@i4%2C`5^h;f*_KpR^yHg{O zM*zFxe=G>*C?^dNLfF9}YBN6HG4>ux+cC8HGSw$krvtzJ9@&~*h=ps`wFQkj61!qP zCif-aw-wCg%#Ui+q%#EVQunl#bKdWnrwDhgFh~{4Y6a6@ItvzYsI{ETWVokMnhiw} zGl8Y5)KzpqCD4}!tXBtRsm|NDxaw1kq=Nt1a%T0}db?G~nN^XlhL*oZNnrX?{7`>` zi;2v{&~?brOCeBoTL-fdV4B|I=Jr`$;pDf0Q!80h?JGRe(aH`AHR4+O(Pt3}-6Gsr z8K$mEVw)u-4)85yBgMq94llu94;5>$+4VT=laFM%3`FQ}=U{C8 z1`b~J=c zha_8(q`(x;+#B0Y4AzA+rO{rl%Q(lIzxI9dImE#+{5(3My{O)KGkl_g(YHsZg#qjR z2I4W7E)Vs3J_$Y)u3zp20rZJ89yOp2ko-Io$Qy7VK1rlQrO_NgR$Ar{rs{G83o)&} z1&k#!ylrjxcYCRS}__JqSPwf?r zGPVhguER~F<9ti3M$0=($}|StJAWgK`;)an}x5s*?s?RPs0N zhA2&LhBpX{R!j z<{Jm&zgUJ5ZGXJW=}pk>KkxDzI1)wp{^+w#{)kX#a*%&8Nq%o#_{~>$Rr#waKLCv} zeTp+ta_TU$A%(Mc25(=E5RRlX!<@rYt%)@g8DJQ+o%VU7n@2YUp^S{r{XF`$MSFQ< z{U^S0VV=#_w=2@GqdZy&V1-G_1}v12FdgbxHTH=d%Mu`iP|@e?A4ZwNZ8tcFEQ}xw z73t!dA?*q?d^2_9#6aHu?tU%ZmOq|Fz|tdu*Mi&K^AjWBD@~*Ka6qv0we&EGR{jvj zG}tMYgbp>0lZqp|HrpzZJ-Yz-p6{jE zomZ52Sym<9Dy!x7Yy*oD-B6-De~(f5FKO#_=noU5OUEX|P$fXzjv0~tp$Azr%r!Z_ z9+o9U$1-xCo{HpJB15$jcX2-=VL?j5bSvFXA3(rG$VG=9#A3*ZpU4p0vGpSkcg6mUMos^EY{gSzLM zz%OjytK6$p?G1PqlD^e&k_F>>y`5VB+OMz4(%uI+3Ph5ctbd<}^&}@0qcys)U3B;= zmz4MkwxO?)$~RVHTR^m{b(*#9lM>{8uD?t!3IBmYDiHeXV%Uiu7o_MHvr^o{kn==9 z5=5ZTB9>h)ny@#9<#t*l@;qKY{xm!gEHAi`L=xhx(V z^5uhh^KZEam+_uEZ@O4m%|cvFPcst8MS0!N7_K4{#{NfxKOeZo+qfub-W++EPm!|r zK=b1?KW_`H-DRfH^wo-Srva`-kEDaYF)F13nK<{7C4- zXA6V)SYe-@(=4ZBr|SE6;J6B=HsZ%PzTI8A2qR%y=7^Qix*S{k8q1(NSaeg80M!nS zfu|uHc--Ds5X{f4&9P{9U`-ls+mYlhIoR-TFi}e1Jv9eY=SVQRfq#b}TrX)& zjn{9{7hfB8YJoDf-pSFe 
zw_&*o``c@FYqm?pVo*}IJ4SIcqmitCY+u}+Q1H_LmC&gzw%tlI(b7^-gd#C445IfFugir1<)o<;#*j;SPH4SM&i=G?2`Yw!?@kN%(L&I zm=yG?B$yR#T)Q`u8vKlpGey1cZ$v$+F65VB`rPLIva;?)DX}q;Evqj@EEiG~$M5Hn zCB%%g8~lWXr>uyhf~ETvD=ZX| zW~och_?!`QO_Xw`mPdLYh_$=RaG+PSr!zyTIHgLvDs+saq#0Tw=}=B&=t8{n4?ysw zvV)qAod_wr%#Et{Q(K?Twagf`b%Kx*atE34R0OkfcfaW+xSVXLGYGTVc=&vO+9>IE zGAhl=Wof1gF(xK{$B1K9Ai(7f$&j;w#mem@i>J+=Qb(DwK)Y91=HHUyK_WcU`@!_l zV&K*rQUGUKQHBsg>pGYKcPYB(9u0QHta)GUG~LvG1mD$<0yCMJAQG0cZK4r}qr_7& z8PpA=bF>>c!DhAt#vC$xF{OyxBv&Ek2#MHWa!Lr($3cerK+j=K_r>`<06^n-CDmCB zJgchv@{l%TY`cjQ(^U8q_JQO^rpBd#^?hN3*$-m!Xdpg7o-13=)UYmsBWx}2PXZOY zWj`=&4E$Fml2G-x+|}y3*$>2=cv+}Ec)d^5$^wE|@waG5w?fw{R7Zb;K2LeW3)eqG zen-^Ut=Oj>7x+P_ntodm<9cf@Cp(&CTi*M)#^v$$g@c9Y-L%UrRp43#DWk_Tn3#~U zZcm`OvpR}wwM8Y=D5Q~<@pEY`%_i^z^UeO`>pk-|H6sLH8iyI?ka5R7qD(MvJUQhG zyKmz<%6YJKN0KawPD+;7eZTBWY_-}J{7gRKrbx74hm>sAjAfgSay#I5Cx)zUM4oks z#BGf0K|O1+0;GqnQcEJ`4!S7L?+35!#Vz*yXLmyN745cB5uIo>MEW~ISi#P7Q#Z>6 zzpg;hk%<8Kg&&hf%`TF2o4wV3WMK$ahac!mI(^R<3x|4vXsq3PA;r4kvC3lAeIa@n zwxUOv(dY1zez~zpG~BFOhgTXO1kzC>st0IPw|~g?$;LQQ-3(PklE#jvQPeh{LIW0J zd4uzc;E5y#cTRiKW8!2t6P!uVfJZJq8@ zza1NdxT;0FQhGUFzH(G>D!VDgASE6VlzncTqE>A~0#`)q- zj|g8gx(ON#?HxoCPi2&q#U;kWArm>bgZ4YwU*7E7^eal-RU{-|t5xBhNWF|epSZmo z4QZML$SBW#PhS$}^VFWgWY7(BJ&L6#DJv{~mdx1jT}%D7(crWKOYjiJoi)%yFyHbc zMk^BLX(Y%^$#Tt^dx7xu^<4`Z@#A8&1G=_DFwwSV^XULGpV4-J-2REO|bUM zaSu=Shl7Jb(_)TP$OtT2+AwVhr2>9nsitDTK>-kvq*#>?^#u~POSSU z`gDNSc~z^k-LB*rt*zI@&mOurcQM(M8cF714s)NR2YNcSHr$WRv^T>o+||7kFi|T7 zQ24M9^G`WjxA?|@XnT7g{xuw^x2Fz!UIPq`oiRyQkv5oH^Bmu;t6xM+58`G;K!RVb zngwXvd>UKG-cYmKQHSi^F(KipW{*%|QNVwJ{GtYhb}){~!bX2>$WBU)7cv?V;{umC ze6*^z)v=b@>1$`yeaHm+bG0)nhUFK&qcW*PW*pBHin=klzm>nEq$^vV9AoKtr&m77 z93Bi=(TmX38_VY11eD&-*|OWj6{ zMU$3XG1JM1Lsl%D(jbP6iP%1VsuVKTGo4YCJa#!7wuv;VTPq`B|67lIej|ltEro{e zw%RW5*Xg61oqcza6b7kKE;(r@bd1y=NM8H4f%L8^u&C10^RcI@$()c)dnr)wV!({wa*$!4E8ic(blRu6&ObEV2O7GAB*F!*4H49H8T- ztT-MmZUq=_stke;{}hB&0Tx<1IoVY?V_?CzJ+v{8C-cV!tnr6AT!*Eqc1?HPcC}$2 
zvD(u{d*dZ{G$1<0VOQS#uZil{`a-glw^SV6G36DIMCmz1a9}~_KVhg4uBg31&qE!! zVbuHv9p%F8`Z}v0%aGa3tPYd%=M zGa47HY@5XWlyO&kbPEat?)?MBy94_Vlq&@KX?`0gAphs?OlQ!{Q&Irwyv+K{UzE+L zKR>;(-^VEs{(Fq@Z!ZwJ4ev9TGx>eMuF4ZhF!_HC`LCyih;D(Q0{-WSga3Zo|1aMF zI`2jDxAu{{K}GMGe(`yELKff3cCURzM@>Mx0eH5+6vqH4pJgOpxi>UHkx)1RN^cT$ z?oNFQ_H5Aetmiv6sw0Mu`d@6lRal!*x3&v~V#VFv-QC?O6o+8J-MthmuEDh}?v&u} z5WKj%yIYa&tZ(ggUH{tqKbm<@axn8|&LPjZACEb=(CX|orf7FoE#D-8$>ky`Sa;mQ z??^(4gq3tNu4O>m-^?3h$ET<>ZD~y*muqgru%kcxKx)=l>?PE2)H7M|uAcv3T(5bV z-3N#wcd}Kf{=t;zyJQ58Ddm6x@bSZ^Own?CdWbX*5d_2u-|4iO&m82mnlLDHAUW0J zi%?w~;L^3E9h9(|>6UI3YxCqNta{$!Eu^-_PL{Oe!d1!i<*}+?mX;x6YC|kGH+j<% zV26|(Y&l5%Grl`pK8zgQHg@ZgCnIH`GprX`nHFQ4i)f>rB4)sZAm~7VM2P28mXS(q zC<^#70asalgAQNzWr6LFD?sqbh=Q@RkZ78CEnt!S{=N6dRo3j(;m5H?-wrRThCI`) zyx3%?F~K_qM#qLOnC&hibZ-e>mIR=0=vp|f9h30}p|=zgy|rh<^0Kwr&UbYlSW*kL zUW1P}Jk|xjuNoqu>W-HDQK?@2D3GC55BLk*ee5l66Hfo zqGwyRWHb$Ty3)QzCaO}sjTR9s=(IFohdo2Lqd%$}^DS9Z0}xNox9YPCt)5mYjGOZ6 z4z6@qAAgz!)TfV?T0q$29~ePRMqe|Gyc()GqW2_rdzTTGAEZD67#dt1-^_oKpivBK zpoiKdyXmrlc9huJ2K8UCm{E5Od?@>tM!%0{_7f`?-XK*r$2J3&mtKQ04L>3|o1?ha z;yGUy%_22;%asRT|A7O;?4V5*z2H?91CxHqQv93?FFS&WTcoQ-MSqyrQR)Wb#VJ~{ zXE*7BE&?Lv?)+aaL%I1rjJhGfkv%}dxML#46qh2VsgX-H<+SRxNUnCDr)YM zwTb{jfl@Dx3!c|2r-H5((B9Pj@w*>F=gz#VGhFYJCeT|+w|^;&*kaz&4x>fOW`6ij zPaPdv3HxKX3yHZ=xXyjp1?ArZ?^^jsQ7`S2&iC$~t)RL6nM@3W&X7sU8y?r^Tp2Zf zWhK0{p%^h_XlX%uN5~Eqp*NGX-e+etE^r+Y3XTB~sF4pS%67lt;4!DC!7%JI9l!(B-_143o&8Y`bYgw@LSjv5H$;k?mWJ)>V zj;y#wFUDMwVuu6lHXO*SXX@0%nv{6>Ip(b6sYCpN?GI-AN+GdxJu0-+Iki9>{3zB? 
z+KB1DEYm4_DW0n<3*gmf()C-_{aN+R=6k>J0x2d#k-C&6*N>JTH#0x8s&guDYqmft zH)}fkdqN09;rgQ;v@@Y}C#P&mkgkyx+m~INKNRMMnjIGphwf%ruZYCMf;Rxn80m!{ zK+zuzBhbmgNGju2vqS9319R$ai}){D}f9U0QfAtSUcrHuJ$+_8hbnyX-56}l6 zWF$@|sS?k;>6Q$Zt46tHDVWXU%;X7`EtFP6GauB*#-8~J)kb`8Qyu&ffeVC*#He>h zCnALAT6I0Y)LM*ZOcUs)%ikqZ6{&bK&3i$%aea({CuCM@S)vt7Ca>tm!`D+E77fz6a=VMbufRbB1vz;hbeF&=N_trGN+*`Oau!tx}+>w z0wdpX@a|-`C1*;+iHPM~9XtthwhKYilU-jB!}_?TXf6EreL!I1>4QL$B5{4rft|SH zV%AaaSCXcAQd|*zUG8x$zlN%`YR%3vO4uRVWSJTT9(HGLEG;c5{^^^<<6-9jo;U}5 z_1S3{ZqZBAN{N`9eoM<_MXFX#)a)D5V1lfjKtN_VZuaPs zAke;KBCn#{!ShS;qgI2~V~yD9TbN1W1@t7maU|gNJ}Q*FNi-sJ--)-MP@vhriLtSI zCbu5t@~K9O%7n;jad*(s;I{x8qQ@;(3`Ik>9|~B*UP$KvHH7r*FFl#+P*`$AhJ}Yx z;6IqaW@06wR7HAhJZ~U>y!KvZ`>mBWDFzLs9|0I}!_wmAAtg-+CKT$_c_?u-6tMpu zU67V;bn7(&jm!lORrg%&WTB`r4I571cC;atZUo)wNU*9SRoz23ak~w#rLo9k-w6N1 zrk#K9YD5C7=Eqd<@yi*<;8-)kOI{uKdQk86TkJq91Ji_ioC@vmEdtWeL&Yrh2#ShM zB0-c@TrE~Anh=qzjwZ&OTo%mK8-QW=lVIIw{7PV+&r{pOmVjsD4Dh$UxI|562GPF0 zyiWRFG=gcb5cMiCFq$*VovRKDRU>VG#+||P;v+p(=vX~D?pIQF$t-Or`xJ#=C5t+rWFrhk;a$*`tSn z%lg@)yT4+2jm^+?_P;)VCC~N`>gG0__^;O#xK=CjeW%ku@$l7!B1yysc|19HwHA|0 z=!1Y4t1IIWDtOL-^jZ1Mts98ke<1;Lax;*|QGc(O6~XppY+I3Zow%;bZ;UccEd!>1T!_w4HTE4!Ge%^FJBa4Z82mfkM@8cv4FfZ~-We@#y%R`PH*Qf5PMQSL11B7{!!Kx?We%rkm`29qT;ct`|MfLlcytQz%hngws#=}q(*1#C zT&h0M`6E@3=m!J&Y1fekrdL|y-_u-fM|JzlYPzh1;mKpJ=$j~tLRym9$sJo;5Q&WY z>wwW`Q9ZTw9%CM=35E42YCm>`82d?8t46nm5P)kgL|fovg$8Stc15IIJWs$?e$l~> zBT-jHn)@;yYjC3f?nd_#&pVE=-|JSZr*mYR?lOS)RK|l7VLCGbXx!RiPQRrW8Eq^s zsfU8W6$cf28I4da6=kidn*j#V))|atwf}|@|5nTVJEBc}eqtfRY5`Ay7kKwep(YAM zecZnTPa*gp^>(HIMP9pS=L!4%SlV2t)uQ3cypGX@rbA=$=>qq7im zQb^*Ss}T+%1ZIFRNKkb}kiozMcTb;5{~@;M&r@=}V6KJkV2EsPlO0uJ1be8rIO)7+ z_rTPkX$R6=mnK=F{b$h#c--Kg;QFP>!BfhXy}8dj=)@RlA4mkoHHDI{5m$YGbuY9k(CCWP!cjE`vR6QsbqtH7PZfJf}npmppr4}uFtEATH zN)0f>m*c=)lnH05yw+*{!zbk7Se2&@sr$UivG+MvtGNC*921d4)01dgUD!>+A3 zLRQEL63dHE^yn6b3I`fkJ?5EnMTVa!D7-=kBg}MxND{Q{r}IR#XCv;iH67*6w2A_j zOgZU0Sq}ml`SN6?<07QFS(7NGE)bRylr{djH!(s2VG0xQM9P}rocTLTy(M`3L)NZ6JN 
zFJ+^SwNzL;jt=u$-m#!_HPn+M9^d)8pt8|5X4L0WQGb!r@_ohsg)EV3hA0J+zF>}; z#wabyHNrgX_AUyUZk1iJfJ#z~Ga+}jNRP58IDYJ(FTi}(SrBUyN1W4@^zqdth3mSQz>xg zlwPNDY3TN)n=o9f5TC0EF|0RH{<=iKsShR@WGg`o&IwX-etW8mP?a4iDJ`p6CIBE#}6 z?BJBwfACbtMQT&E9wz&E6c}m~G6Y0g z)V8xV=CVR+h^8hzmYJNj9C4Ge(WK_3DTzj1sk71DETt?)|C*||9`jyQfz410#jV1J zJk?LRQ7U~Ko#`ct@WF=1ZKQI=qs@9tRo`72`+bm#i8E^nEk$ROfB6>T$j};K&}!__ z068JrpMfdZ$XGM#Y;u{`g)7Vc73E*&p~;08t5l`ug7>ku;ZN`aLRyzjXWIn@;Joo0 zx;^V)K7&AxzPb%}T*bH|aOQxOl6JQ*M@~91C5DIGai9-O+{P@`bEsjTL%Y&Y_|3U< zFm>h;@^$4c{~7M;YZ$a%^;$=ev%vQwDeb66ABd48I#dVAH;bUiX(HbC-7?S)zGKYf zltV7HnpjF=?$IPb#d32Gs&|m9Rz6KFX01kiPyDlG9~tukUXI-0tM8hbklV6uG9``s zLdM^^>Ju%CKIV`|P53cFksGi02%v|gt7p0VYMzo_7?n(>i7#d`az55vxY#9gv3T|f z!G%H=9x&brpsun(oqpD&Le(NLFa0Du7DK*1l&vG&HY)p3cW8vuB>N?GkhjjEXPvbM zGh&*HlwP~tvgfR?8I&9Mi>P8gXw$k~ePg#yQk52!YY!lpc|0SZ3X_uo_6SS04L;X>E@C5ipeneo_V6 z{eAh-*c!qZ82W0+p$`Ds%wBpJui{Au`LI~DN|t?VCoqtYQ6w7%MyP$ZXJ*`o9j`bZ zj4Il$9y{jyk8Fub07CP%p22wC$x`((7=)Uj&Bq;SVLKkR>MyR0UG+Y5B$$;zTwxEU zbtHzX?GTanX5?QSqdwk22RZ2`(2u1R-i!~;o~6y6JtM(Oz7G02TMyhGtN30mekA?y zeK<9##9LJgp^34AYCz{jBuexFPp!gXrDDF3+0^ToG(}+KdhPOMk_6(C#GUC_;ZwuI zakE(XPG>ii44oCBoqkH!a$-ObX8G=!9Bl54&oQVpL@DwLA} zH*Pp`gHpr4IOYC#9~vh!);Y_Y!=5G5b$wUN>fy%B#Xt)kTSs+Y!)?;|Is~dB*7+%U&aMzB` zhSI%4VGL&F;|#0X2r{^j5g_`N-g|gXDNPW^4d2zOdb)`hqLgAGJ-X4ns=e^g&q_A= zCChrBgk%|E-$$pXmyTZ6SZpCry5q8m>Mzxf)54oy7nL)T7v?ioHny7XKvNSeIK4c@ z_s$5IQt;;e>@Iw7eOzM39pR;ViLA-k#y!Y5riVw}amjvms%rJrd*hz+~VYm{(0Vw*=Zicv76p3X1REhHJ z*`L!^Ma#Pdd?}gvZW<6@GL`D$&VDY<@im&s-M%XHh2V-bc$1cVz@ypl&AoN3ZPVA> z56E<&nBEQ#%R;N7G`YLIPTK+oCy#HTpe7dB5x_2Ca%MG;D=#sJnrdV) z|2klZ=SG)3u@;p_Mo${cvUsZ-BI2N`r6AlZrh0ly;F(BPxW!55t&<~CfWA7{zXp&u zZ7MlC+O5_%Vhvf_q!cv%_kVTJxB8*H5s+*apC0X06y+_S>tNhXx`e~7ZT81KD-*TX zT7k_r+V&^v$Ds=^bT4FsO!q$dgw7*r)g>(J!k))ernGKey{qt{`lomNP0?I$3M#Bb zchN(b@ZfQzIQMbXff8||ET&LhH<+}Q)UNgkWA7wDJ%BX_Dm&!h5YyHD0H4I?h~H}zrb%c|HWF3#>>@XvxO8iPXZ>k=#ASIRarm--7jQ@ADpt`jBGm5_)|6oqu z{BIWWSpYMa^iPhuLGxnW5VZOLPG9Ys-hOVo;JT-?1?~eOp~0oCTjtJU#!tE#!An|?`1x7 
z!#R>e7O*gW*Nf&7vIfcVr3j~JaF+-W!8bPaFl~LH-qsqfgpk$D*%wdXu?|WAh#_Q0 zgcaXKp>gNkoA~YDH;mAh7sS8f7;hO+>(+Zejtz&WVXDT>JlpX}DlqP~)VgHME|w5V zA?X6+EtTx8ykDB4c@tsNST+L{5lQyoLQJa&+2=&Px)AzfeNH}NoqNWG0 zt~^ish2_01Gn(a3?0u?jnP{X15qIs_joBxYExMRUft1ahrMx7l1Y zSYCOsf!wnl{$?ROGH3g&8!>UO{m-k#(z@5bjNW@+{d2nf0o;K{Hfm7F_py(a8y3MS zMq-!-dPRS+z|PzPvE?IDQ9YFDz+?O>sfpxnDc(!BhSpShOa=nWeq@#K{+JS;7+~0T zeTM%j&c1TguuGMMz&C<_b>yE}$vS99C#Ji=pwXc;_M^=4$d-X5w}(Rp0zwYY*? z_x}lz+A*X6!%qB5_mSMp38`0%V%!y>B->t56(RSAEZL8{HRQyKNMD84u+)<}5ot`cauba;qdo%vV=a18#9 zZ#0O!?*}?X!-66U9t_B$6fkyamfErQj63?I@O%*BhOsDwORgK2&|CySPRPDY2I0Q0r98ITF)wuNYH)?Q2`aX*s|r2d z0Hqs!Z7gP$+sTRpqmH$mo8`)xgK@6TBoQT)z=64Di|A6luu6^1T)?AaA|@8rd(CLl zY~^5ukNR!tHBZsFvyPy2>VrEOWAgu<1^)LFxe-loi@Jr?UIHMp&ay&UVgBup7|rlB zO@!&+99cr%`@Zl2;lR^*{YJ-$?&jKW4ZO}_o}Dp_rB>amSObI^6d>bq3<6-8QY5Pm z;a-(I4l(qC;8+k3Lo;h%ApHR+O7riuf&xSqB} zpbdAGJo#TY%mpvcjNew4K<|77|6p|BKFhb*bN$^ zJIajqtntn57k;)6;y3*@8V{nUAXxOK2*^C#(=dCM%}1!{{QRuCl`!W0lf7uvtIfRH z50Mgy{SXsZkIEopceWhWR!WICf?lU5*9XPpQ&8MzA)elmBxgxvk&igUS4kEbaJK$) zfx|vSVY_{QF3$?Nf}DvGbFf@V|JtFti?O3Wh_HOX*Px)Y=g0a;g+K7e=Rv?VrGd?# z6NGLc+=!_5$34FCXEBazjEaGZw)$B0OTHjE&EMt5?z5miiO@`nYYd4}_I*~(3VjkP zYoh)0ttTKHFbn=b)6-jYf7W)57#*)yi$5YxeyM8TfiTXZI8F}P9Lo2a`y)l-bmuRY zk({&kILNm22$?eLW#o~x`ZBw9YWuw?!y z<^-cKyG)yoSqe9V34+6r;i(}^|K)yun;}8#&mN`uhuMmu{INc%rfeT~@U9`UK^lIG zuUZ5l<#Fwp7RJF!a6IFHgp$EkE>)lf_(;`WQK@B4&;1zdNSvGNv?#Oa_;i@5LCxrM zosY9_qcCGzv*|0>8bSE40|Uh!eIIIc9{psR!sQ{+G?wF?|ytO7mE9DqPCA~l?d5+@*izItNKM6h1Z z@dEIvrmd`6+rB`jtr2_*Z7@wp81Xm1B`zskeSv$YI&609*`MfL8E5+0R+sjWBfzLF zPu5!}PaB&UF|5t+_BZ@fR7Br)vLx!sd`8<$tKHI`DBi$ESWL7ppz>MiX%3M5rO>wh zva8`A%sC6trLT+Q-60oTn{qi|+G+X*S1Xa1fOH`Kiss?Za>=6GuH=*Hy1}`yazbx7 z6kncVY!QMIog!1xa=cPkR(Te|Z-o~oIlvE!0(D`KfaEoTEtI!iEaYe+=$srFtPm5N zI&ax~i0|vNz98+WiRIWl<+G=P!fVqg_V3(Kw1il_Mho`&qrc+~fN#!2Gy;*l0&8au?y zk`F6ie+;x;rtuLw)JpT<7U_y7ST1DQvG$C6{?(THsjnE8%q3VtY5+6CGi4aA|MX~C zfwjA!9`)&wVx2mSR;5jY%9w0+ZWGVEdC6Mm_wy8;oYP3+&rmCg@=PRa22T?ud}KBj z5jyltE=CJAju=Im@Mt*f`&nE+wZpt(MhY 
zCt53Ic`Rq;R$s#|B-HC%bPHcZy(bNlToE)QIWeojOvsVQSN)v;u(#uupME9;wGz}-&U-SHaAJg-ZWloZ75p*!D#dz|GoeF z>s{<|;m2FCqtlb#;6IoNk96;2nwE3Ra~_m^2K_Kpjw;Yw5bMxcUf8r!mhYLr1b>&T zGzD+$7*LZjHAu2gPjN1Ph6eP3UXk=lIw~2vW!UW*K8(!S6t2${YA?T1`hZwP@zT!M zbFUVB^E6F+D=+T?e{zQ2FS9%XNKCYTZm9W$ki55DSfv!hy{Jj^gl$EM0LiFa_m#Jq zstQ;Wk*~0I#kLpzz*8@rf8#Vu*0FkI4K^yExmy8b{n4dMxPccOu<)PM~@xEaktUmMd!B6MbpIUzC6{0BqV{p&{pOD#ugpvpg(NV=XC2B?C? z0&SIGC}-Jb3`%#B5Uh%yggN~7FnC?^U~uyfW-W1&g~lWJ|4h#Qw{rZypF%d@)L-p7 z?Owa@Hg98wMOP0q!@NYUOYTdd-!c1bl7;lvB_A%ib7j-Qm!e8dD96z`C>9U$D<;_` z=4VTi_ax084ahl-Id8MRdsVx*-L;`3%BZq zNXlI%o>cI`At0y1U(gb@#cavW$G*kj}K?r%k+ z`hGy*c`}rP9VL~rAZt!zcUZihvT~Co(E=$TT;WHIEEq#gGS&8Q9CN>4>9u-M3!9@N zyNeqx7uK&+e2+U z-M2rOQaRn4%)Qa{`RnHxSxuhVWy^mr+{J^xQvKDqQCRo~)BX=;#hvKhZ;=#(cl_BO zu68Ms;n29uT>pWmKM5IX`ic0NP<3W|`?sq`5lC7w5@q_?JqC;|v>LvIYI%O!K3jlh zRMGfgU?`h^8$0BbK;%SpTH||hV0$-)Mx`f$`@p(pBb`8dbkj&X z8u2+`qmJd;vUXD4ORTmJ?6sD+$_?>~2Bc#wGs%@N>v(s>2En^MzexCx46KS*Gkf4z zXC^S@()w=&NV zE5Uq+nXDn~K@ig?$y?PH2EJcuYrDM~<^K4ixx2l-?(Ej;q$*=8E#eSl9a_cEhS;Mk z1Fn)P8o5%5){RToAL^eS4Ij+oEuY)j4nXouiEjFE0@C_zak7^LmXn6xsKnxx2EN;3 z$%GJbV+Vsu(Xg3U2J{o=IUv172CFgyfKp zYZrf>|4VnQY%f4m$ghecs&-&1QT)C2@JV|!#LRSIqWT{U@B!-;x91;Bl;2S@Ary|6 zU@VGGh8G^~c`vR0eE`@;1I$geOAa z8=@LNaZBAJya;YJ`mpm=D*FuA2S-_{7kQtc)r(YeZLN>@x+gk-^UqF8zM09NUiAiV zx~x!_OhKLixs3w4dXg3P%eUiineVX6@fm}J7%V5ai>AP3do6AR21f3&sQriD5_g;G z0hy(&$moHqoA+H3;q#vv*7BW{EmmaBzAYJW=f3;-fh1#hXU+tp8D`xq+!ui>baz*F z&7-v@-&pF<5eo8-AFC_!jPnKrS?C>XPP@hpkd*zbTguuB3NfF^c`HYZMd_vJc+w>g z&GVHkwSwJ!_nf&%5hSxe^o=Qx#tdXvW7>(-{^fv9sQ-4((#UX`Nb>i` z^Q`r^|6GMHUVj^uw;aFEGWfKpw6x%Vp~D!OaimFi{Y8ZfSt@&uJs39_qRU7xVmF4P z1jozEky^WNF%XGimxU-u<6s~24Y%TMDS=!8*@3HGe@W6x-Bj6%apD$Zfnb7ADnHvM zN0%7mJu20&p?GNm!$ntnKL^Z|^t^>!KPJyKDdHiz9)e+}3)z1te028-+k6o{OMHP_ zsJ)(a+sAk!j=8VfFAr%RcD!CF-$OIZa&QyvyF4%=QpXt?IpEv}e;G__5<^dl#)xsd@D(tf@Q&iy#Q zQIIv6b%HL#%9jBlwpjZbNeEVwb+0liT!W2u}+CR7!TOx~R+9 zY3aFDkU)|sPV{?!e;T_9)Ch66$bJk?z@fBVz!*S4#rHo-)*9q0Vh!Q8G^q~AsD(&) 
zwt4RB5V(RfDk?HIy%zQE{+yY|yUh=93@_cFee7tgc47$>rv1R>CSss7 zXkaM46}B{gUd?8_7S<4zBDW#Cb{Ixl{JXy4|Ko9(A7J)xL1ovu}`wF_ZhsvvrPmG^YO2D8RUw@6vU8O*%<@3S`H_lCO$6< z724`1_UxdQyp0LwO(*th&hPt29hAt)HQt+)_dyQ@Yy&M(c?Bou-#n^m0*6!=9v|u+ zrgRHeR}Fvx>(d-At)E!sD*^ybtCG`3&|Pq^8NVd|d7O>qL!w!yUkg{tr*p$4Y-y)L z$wJX#6>aifXvAie%bsJTs~%G=TNcmJ6+NxIsl2LggtdODIjkIqK=V5u=&wD3ARWQ2 zLY>&C5RTNUR!CpIR^7y)vL z6SzJkcWMNSv_~zZ9eu5t@#NXVU*nW$wFG~DYja*Ovt@_p4KW}g22s#pyLQk^;%j22 zL#Pjq{ca0hmAY2bRbOp532topRMo6Cuys-BN!dD}?%ijy?Q(E*Z@AZ-!>h<^; z=i@b)3mzE(oh33+Z1%M5u6f9@r0?l}XX!^vrQZ>z#3bxo46?UN5(_&0iDD0P@iO9- zYap$e(@#U1$pv#!Wh{;p2ACRzMHKo)6va-_Iay`R{2nO3Iq3+fO^hqNuOh4L! zuI0#bxGS0TfM7=wIwuhjeE^1xw;o1g^se+B{Z0{8-2%vwf>(T8kbzGp7D0~ACXd${ zmneSA!-Zevm&yQpPmJ&U>PngZFUyra1ZLZuFXy;Tl4JU}^lAOS5{=KKqKygR zB}jn{a@k4T0NZ7vFGHos=qTG;ZtQ*a$z+NX)qaVlapl}HFP+nOgqZ$=*a!8PePs=v z?UvE;Z!THJiLHA$aw@F8W9zVI6#kVljFC0mwudWktzd!W-=(lqRFhdRD5v!NJemBT z82a51I1|~0%A-5ZW0FjJv786Z#EdeWt`!R8 zJX5n>A$*UOdYO~Q9?SaG3p*kV;OAH*!wh@LZy!oGmA}vho+cbk=%jSY&v_n1ty>Z3 zEq_nw3d+#fbX3uoK#J>rCJ-s}JNI29`|7Igr|OY^UoR**%Hw-8I5>LUB{e)lkFJnq z|8RQp`>>CkOLLW5QK-0Cpt`c)V1PjW#j*D`W9{fcg~o2Dc@3ut4Cj<5XStdl4j6N< zopvLU&yEdKo5p0J@2o&nj!?$okR4vznDT^leLK1@b;kQFPE`6?obJq;dq_h>Ex&KQ z0hH%>Le64))t>C8fP~|jJzZC$zD0!9j0#(B)lXDoA?aCAy;AB2N4vgOIe$n~NZu(d zyJ)asG$!V7ILE)r(uB#)M+4zu%|RHsdHq7frAN?$h9gb!arg|#N_Wzv9;u?uvNJVn zwxrk27E#f1=*M|8{Z1(x)p&s|E>wqDJ)F8&wJ7cQ*|SX*GM%(FNi=(Bh@EQSW#LRC?iTBi4$732|F_VDTF#?3rxmd(?5N?Y_*&k@__xu zpLqP%Wsoz{kVPd%a_In-G^2W!i4t<@(X6*vZfll2d_#`l7L&_i1ZC8*lf7)qt^mA@ zx{2>s36v`4M@LK(HQ-ExqVH5n-vChuaght;Ccp*+D{Ow(@vOWe0|c5BFsUPu>sJEW zKow-5;Io@1p~d7|g~fIXVY@UAj)Q=SvcKXA5jq8CbpDWk=5;bx^RlyCT|mvHriXD+ zr60e1pYNEkg|Fym(ac>VBy7QO;V+Sgmq6L#Pt|VyAC|Knnfo+I*&&>5f|@9U1mV-v zOb?$LdS`>~xj1;&elb{_d3F}&VUgr;zGzOZfg6?i0jCIq$^w?h+O>cR_#XwK(IwY| z74wOn(IBoCBQ4oF#9}1DrK)MUO)Ss$#n@@oSQ_VeeD{4he7%(=L>I3I@ z238F}ZMhZ%w1ar-z77xZuoS^?#TSC0v!8JxI3xOo3I+Q7lN9IW|$F5;7SEu`}uyv!v&5p+iqGtZVIJi&rou@b5>1vHN({E|paeSIA 
z{{khij|EJ2rZZVRQq~yur7Af%wPhHC(qxP^4WrAIb;nL(Mp#rF3`>eL{<0VCooz&W zT`ANe%jX}A2fz#&iY0KBR#49@R`2xxhD4?4VSL=vA8*NO+SNapP4|%>aR(pXohW8s0vi>S4FU5Pl@8j)K@Ha1 z^#h+6R7_`SZDzQi7L@WVKU|+~a8Y2VEA37nwKQ#Qog;Nxkwsy|hhz5j4JMfMZjT7< zMsStxzjTl&fj(zg_NG!kT%#_XWbudAdUy6`gJ^fnR+1>!<41MC9Hx!9T=m0t#r7AU zNqg~9JNcuG%v7u`uij5=*sYir)X2Zsa<1Bqf4>8@jG`y(j$2q`(e*_G0m zl};7}tVL}Pl-SR251nz<5UPm_DlYK520-dB7fmY<Z_cDLTGw=O~ zF{mggD00264>q#sd#7NzwJUv;TWlW=2(Laoid%g|m{`S+;Vz^zCN+?Uw9#6v<~X); z{-g&z^*p4tN`Fw@*i8~%jW0sY0TFOb4VcISy2xLRq<4PQhsBv0I&X)_*J9VTkHR(5 z8f$6;raO;KmGu{qp5)Au?NPqf88xO_N0)KssmpUA2*&~c^73eR(&FIoXW&2tB#12L zD;yX7;E0d4&kDx)<@cFGee!lgG+q2W<|WR0JHLI;rFdQSn=(l3llh`v721O1UcUhi zER=QnS&Pv^4)9FaWNQj^OO$zxxoq!hOqb?WPFT|(pq0%gfR~s6pgjTMGfC`@%L;uR z5*^zxm06AvI{w(=M(uZ%&Z^$V7@R`dY%N4)3lwR!71oR2<#ZneXvJcMIlN3NwK?|f z_T8AY@g>!n_~P@TM*0O?fBD{}e%XqMXl+WZ)(qqXq2=>yH7#VU+C!Gls@hi1@0Pt9 zYyQ4QnM}25&P>jjS!G&fe$l75SNy6Kex?dFgo{k@o@mBdhW_d@4kVK7+H z$z+%{kNxLifl2{yJTMZ8TC0#D5&fTbEVO}r`Cb~tyV3A&`q$-o5SZ_D zJ1*6h_cZH2`~2;=`w%a|uXze~66I{nu%X|+VT^P4>r<7lcxxO&!;Tb4jTDH@^nF4; zN4U(m87W6xY9g7bNcXWulkno8NI_9Qh3R8TaJoz}NHBWieqSkYG8)T#?w!|uAZ)=Y z-uZx<*t}j(rCIX#ULAQ4p`stHrq%!y$kUQ+g=2p*|M_>+^kubrf&6|``I+R8hyww!X8LOdAAqP zTyE3i98`C#S!P&`ZK1Ru60U!IYoz75(<|d-V01KCjr}QP4}I_N??Dd(aUn>+ZuPX% z0Bb+wwS@Nz6*G!JW<&~)PZs#QlXJ=7ZeR~vQQtnYpI%a zapq}2wMSm3ST#!-N1r30()2rO^=eYy&-ZbGx;VYvDxgVq-FV)g@%tokvtjwX?3cDb z(_g?EWCeF{Sf_d*ED2dXN zd)y;b2NZQl?opA|h$Jp=M!8~l-wpeU33?PIo8E>JOW@Mi(q zreK_MkR_+5vX1`sKvIici~=3ktQYzIAOZ>DO<9NhQlP7NfdmB{HViro9uhqtVO$gm zMf@-fx-}_oAd)Yzzg zT0V+6B^-RsNgeKqaK=oBUUwJD*x19+v&drUzuQwsAPgPl+VCYVLtJ z45QbmX6I8--81nd)T?g1&{mDz#X;s!2UE2@q6!mb4YF++oxpaSv%gEWkYQ$oLAN~q zUG*^XMs>eZ?*W9n? 
z+2Uia9d^Z7^}Xv=gd*}L3q52T*%)r;V^2eDwIbn1r!PUvBIk0@iI9?Vz7kHP9r&TORiY_FF*^gcfa00{RdO6bggo-s_AhCXWqy5 zyR|?gF2;a5O16G&mrR9gmdQwXrHtvz(wYXA6e$4*VSt4d47GOs?+F#^rIZl%b+ARH z1FKt4RL~!P`RT5ew4p<9ZPptmkej`8_^<~$t`JG2 zN=5|Hj;|w%(f4t!k%Wm2SyNfcJ0cs#_ZlypE~eWyP={Rp4WrTocrQnU$oof=j-qT} zdR>jg&|kj9I#3uR`Qd_s+>WbTsxgio)Y@2u^)7RHi+fZktx}xzsVYH+$yc?id@cDb zc)!>#>pwLMikAxmp6lEm(}}{r-&|5U_j?qhHnQCnTBkOC-@h)0MReUWs5IVBPZrg5 zT7`Anc1`J99UASY-r|*p27vbRMjyjYTzT3bTz_EMpODhfFB-fBL z7v9rcPuCq?ZM#jI;B5U>O}nWXmRXZJ?<0}*8qaa_5iu!DqPYNtHp1;_ZscOSVW2@QArYFZ2C;q8Ep3@?S6qxqsEmH6%G zp6u*Gi11jRJ;N>_c|cjS9O2W(_v5vCh6!!W`k@NlJu(4JSpx7~ZQIH@M~_FW&!%K! zB^cl@QJo`$?M3-LHSo_}kTmS96wa9oH*rq5oVwdw^3qOuaLRAT>)TIhuDnU0?P$D2 zh-_wJIxX9#om)>?b4TmRj|a1j?!HNkcc+dY?Ct(b^ZB(`dMBSMlikNK;f{~(=0lXS@B z`K8q61K{U!p_a5-^6-saF%7n2T>P%EIDsV?j+Trw5sg2svEVqIT~)=c7{GhiZaXoW zK(gu{J+m^BNM!@zaJh|0@B;kKIt3J=?g#-Y#{=MC+1)I#Y}JYH^<{^SN2**>O_7lmoc*xh z5dl{4tl8Fqeb3POA(*NaZg)aSx>xNGX81eDqu<;tquRyA>y(NfLr%5!g`OEw6C<1M zI>900ByaV?%v^QR2<6e(TVb>)crNKhgQTdW^{FX=Qn0DRtQB(BYBf5esYg z=~41!Ij8ePR`MC2F_g;=69ILKSqJT&l?=-WM4_G3IGGFPY_by67I|?SJX2bAFDf5GK}~kZK3Im zX%^9zy`jL}EM`D-UC}Oy5k=NUFM~6XnvUuQc|VDhtA0O>(2-~?-DP;`t>EtA|Mqhi zels%o>(@y9sZ4|{)fVSoenJqAWKXDZx_1;Ap(tH}%=!u3kHCCgLPGo+V&oZi5n=A zno-T%U&0ksHzCL-dq%KlbC0~rjMV{pwWRYr(BUHQPA9_`-KAP1zYSPmPQt_v$nu7v^wtqd!0l6B0Tm2tXx*5)^s{`-;AI8oqDz0ct z*8~Xe?(Xg`!JXjlP`G>05Zv9}-3oV?!h*X5cY;fh+&ZWG-qZJVKlDrOG3sfLU3;yX zYtH}w{zwKkF@YBQ;HvgsSR!YldeTIhuWK@`(=Nlmt__a3@us{{99>n> zye$mvmMX!tw=a^gX}%(oa!EfSGkI@pEVNcG>FIm;ljjuHBVVl%;4#fo#cLvk+A6)e zzz`iqvW0(^DmvB3z;I@`aR0L#En;JSF4p*6 zEH|^=rIV1&=KmMMVqV)Ej6FL50#s}&hgI@|VH%V}xEE2iGl?Oxjl+oTg{)&i6;llEIPE4rbY=86_e9k(JJ_L(A}-K=8M%CrVF7ZWLWr+q z2RH?}YzwXUjU+qgFsBIO%g?|O=*-GuHH)i7P6hdHHKgMIaspLV-^`x=O2J5|4xW=`Rid> zI=I9n%@7}pM+`3m{xc;Gp)y#+jsk!27eemhLSLelR!PK>b@VqUm!rUU-C7tu({_F? z9A7L@qb7VzRR!HO<{A@|${M7Sw$%HlZgTI#q(D2n%bC_*qOGn!0gwW@Cx5`qAIbE4 z(|2L)*SuUczF!)?qfZ675ocYr$A(esx(A0xA)#0b zD6~0DV7C0y=#?aHiui*VIic}UOg71NpaW)Dxc1dT%|+|P*kgG`ZiWwk_+fXCzYnj? 
z&Et;MenxD-dFpEz(4LnS(=N6x3W2(~jp*n8iwru`Qs-c`DIwrh2$B9FPput zEGDG$h0o+&A<*mKZTaJ?ykP@q~*As<=`4`(G{lmso#c(FRo3W zeN*6H2%^}icS_g$ZA#&|E1Xp`hxHLb&WuCd)3P?o=(6@1k2Dn0oI}aFiO0i{64gu= z@q~200Bt;ySCV*t0!J4wz<_U!fx0=bOcDehn&HcHxj6&lm&G= zf)x7Axkke;n9gojzN6nhT*uH3;CO}s-HBQIt>1;E3}4%Dtb|k3JE_?*?tQHw4@@*I zjt<^UbTuZR{oeEomg4`{dbNNtl72vkniuM51KUgqr`$yXmpl`CYHAONvpm0m%T7V` zBjr+cGUUC*gVC0Gt|$(-cUZmmHIR5x&pY&tP5ApoL!4C_yV$d^+GxnFtY^KBpf2!k z;b+z`#f)B`<6);_6l8mKQ*5%Ed5K}c+@LmZt!Q;KYvlr|R>po+u2}3InnM`gl)8JE zq9-g>2|Em+%&ioEM+V;j>XO%;Z=lU}@nU8>6Ay_(hK1SA0HK z?kz^Upx@zm1YN&|M-#jx`sf_9pN3eSUV8fa@#@Jhl<6lM?0wiLy?qss`bIeTtfgmH z*crkbZ|l+Gu$z79^)cYv9Jkz(Y#@1gTaT)`+=R9na~Vi>5sQ+&Y=o*gPDkU+5c6rus|~-0w(VE} zU9sPSxU#vNe(S6ye>#6v9pQ+wDtIA+b`I{!q-Sm-$tpaab3tJQjO)1%X)Jz`cfxX9 z8tT;9lDg>8ICT!B3I7KDP`{dHy~ zpnQ(SYJ^pcWT8H*)pF88-mX4}=gk%YgwFLz_at+JTbRI6-4*^vsFC%x^ua@eAU@Dg zEetn+J<$|UAN)gkjG>d8DoIE;VpNtxPAihz`MxRjM96%jX7MSA*s$2+XXLBkV#jjZ zK}r@9&uM45i^a$8i7;&(kP_!?mMSQALSL%ZvgJv`Wwe4J)fnFOkgz9pQF+_!Fqds{ z-|&t{ZGfK%FaU%h&y#0A=m(tf=+zvci(ds}NJlj}Usk2fO)8u_OmFeYw=U-m*qy#> zdi_XJpqSl#U}i}g+bgQ{$E@ThPwXF?lV4bnJiE?LtF836kzY>pwF1`CpO*4xlxH|% z4ny+hBVWc5jpOZ^Oj_S4%<5wK(b$g~sivv+%gdN6TUc(y_`P|5N@B8O8QJ~R?BGLk zf)Oeo$+EGmZ8(`wk^V0a*#D+8JP!EnTVrEv*GZsm#J+iAfk`*Wq3FeLj$J~x>7Bfw$chJ@#~XCrG5;0%CPwDb5SElg+$f!bS)M>|W;c_AlY{zhYQ{FaR z;P6wRk~M49AC|2#y_l>C3;WRWPN{|2w;+PPnSyC_1!=YFtAJf%R z#GXrp6VfFwH3;h>L2H*k&|Qxn2{U+qn8MO}Z1Fy-^mWb3yy-7_iqDVzSoTeYJ(#o-=34cHDH3sP@)5Yka>7F4g%1ZT@{$Ng> zj_q{4h%Nxw0*O$h5lb$JC zqC%m_IwuSc6V&Z?++a&G%bF5DR@SSVa7vpA_P3i`r1k|B7L=Rj7=~nsIIQ# z(kf}_XjHp!V>#AeE)2`wfzy8>-duSY?_>Mt{z8nwmMAVPPVt&iZx9NGXH8}!)UY8| zI0}0Ix)7(v8bhJB3--6j`9Awofk+|$hcE0pEso4&yS1)_ zmW7I@Jn{tVNYND>%HzeQ$T=H>5yzTs4(ioOAk96rv^{w0fNzfCvmyr}q6<^Sa2XMR z-uyF1@Eiiv|-8Fn(?{>cOB6D|4K%#8G4@Nh$HkV>n zYCoAmC6S=n4;m}pmEpA=sAb3=l5!N2P)k;W=cj~~4K4NwztPybG~4k*ZxYlL9lVM0 zt)DMk(@;m!Wu_NOQnXYpk4^i8?Yvr|#WhQ(9aYr?9QE%x4_OEHWJ{;G-q6xnvriQY zP+UO)EncOUm~2Eg4TFB-?Fb)KJYtQjFh@+TG4e=h#wdbXlm;?RKo<=?6v--eSuFtw 
znxv{hSdMZoRA*{j8mkmOJ=aA4p`rQjmvMqy?S9>83!m?6MBjori^!KmbRiay<+62nqjBla1|$G#NcgXlJ4J+ zY)}c}DPF`_8%kNN__MN9?P$F6PpzdT{Sj`>w|5Z`#mfuRR}OF(tu*4;?5g`STMG&l zJ;M)1vQBkUz3T_RUvj3AxI{-WF>NGWM~)6YB4Z9rs&)*Xw8(&k5osvIXjgJ|8d(ku zDt$PE*o9lk@eZYI7^JKso`;++?^BEHRxw6LRR9=}$=rEfDgWDe4C=d2)i6KdCy$O2 z9etKjgCP!2&gCg>v|P#UPNIhcPk9P9PzO88W~BFb3gu_h$Q{>Ii?1V@E=@x+oNQkLjNmv8Qwu3Yc?lk0ex-WB$nhIBa~$AmfNhuO-DrB%y# z7%BXqY%p$ybD+RQ>nBXS8+oeiuTkkxl&w>*;89+K;KMolsQA1VCn>Df zC}sfT=^^3!>&(J?F!((GEI!`%7M@mpe%=jk&J*hbu0Yh!+j8+6=Y&KVL#AX9Jr!HE zhZ$L)NI)Ef>_f+n*$mF;5zRoMXO7Swv2&St!w6rg6hgOS(?t9l)(rD7M49-#ni#wj zim$;0X!6)pmXfNbaj6!Pc8}4qjh6K~-C~wA*|mVEf=+pw($*aZ8wIWB?{*}CZfluL z4+csc=HWMvIOC0#Q_uaSpMD@x4xdb}(|sIy>MhsxwANm> zFs#WdiArzFA9Sg#zOd9`H>TdI~$IYZAj!Z;H(V77V=A*`KgKt8Z#5Q=z zH`!Yztw;ujtl(+=LA55Tzk8XrSG(0Wc&?ZQhm!lJZhO7Q*P4hhWigW>X}sI0rs8qK z6(R6o#l?5xs6yj11?6Zx{LoGo*o$jn#>La{W_YiSitM+?z;K@6hE~+^<}1u1m9ULwV| z4E$PEJ9;XtJyTn6b1m5-xLeasYhT){jJ#m+m%Q^(?GpN9+kd%pG)pjBWu9i3T1XjP zCs1fhTB+WdrgDu0<1-q2Uc_}Wbn7Wc1WeBUWRGkebjZW(-+--jj$lINR5ow#DCxqU zI4CbEC{r(%*mquWl$7(=4em!Ep;)TyEZL>4CE%+*`q+@J;?;WUMnan(`6le4E=A-f zqzPXARLRQZWMHVL+LrG_TwgeMT7tV^g#bmQ{D1N62!SU`uL$5S31M1C3AYJ zErd0Tcr9N=*U3F8GUhtiFE5qWHr8xc3m2!<}B z-p=KsOJddYWbKj%_YikKf-%ZEsd^4koFPjq*M)?!UeN%b4Tjl-q&2&0qkV^4zN z_~exBixb^s`%qn~pVZi;GC{~@n>PMSD4%yo*9*Zy$4G9Bc3T>7lRlTkijAk4>QKA* z<$H;7BoqHS0@<5f!S9_gF6IS}BGMj$o9a?P^;o!;CoPrdVfj2=07))J{BCy|FXO@8 zj5r?>4lYFeYL}}kCbhDP8?;(++eHSuMRurM8pUP9E>m8wA@b(n`$`v0 zV4_Y3X^xikF@#&+tJ(8{FVjolmCP}AVn;s_EEmVMhsFGb%j!46Y;x^WrIxiMCx+Y> zH*={0KEx`SaTQHHU9`PAq-9+Q4aYUjT5u+vE(1;OI?* z#S07GO-6xFkaI+Gxnr!QQ|dEev!XqV1hr>*9n%jxtu(4w4Wm|CXB*D{H!-Wl_1G%6 z%+Qh>d)I%cly2%Bd?cw|&GaU>FrcoRHJh$+k<+M%J+0&5vsq1LU<`n5nGE<1 zU75(F7?sd!{es(uoYs&HV6vCv2(G1Az@Y+ZrESm(tKAm{`CK5x>E;+a4*u3S8nkK> zS^Qe@>#ZiFRokC zXdkxbaeql29JfOeLqStRHWrqd5eP)7Ub+wVgVn zbx%(l1*=dVDrBRi1|`=Qv9({0W5;5&C^$}|d#9%hzyHZ8^>!oB1hce;HBiq%Twy(z(!p=)^F{Oq0ce zq++9DU$0;gieJ*j4!h@%^a5~bx~7=BQxCU#?_{-u#HT!de^D@88-KNzH*Au(xGp1h 
z&)h|e)dK%cPio0ZX)IJsil`Ta7sSm_osEI9{hE^c#Yxq+O1MQ)YO(s>AcR(Y<{!ot zuozOkkHASUkoBGSZLjes&&v_n*1uU+!SYC{Ko|d7P&a64 zzL^jzjvB)R%I_JPLs}_*76*zA1NLW^x6_&2#pyE%GRq`~?BF+do<>91N`g;!kZZHbBr2M|caa(fl$7^}6p?3Tj!llpe3(6kZttmHw%y(; zy3}qRVX7VPwC)OXob8}h!Zief82t{OLAKjN(_P<}nPQ2DF~iXmv82s~COK9H(&1om z@#tgAu5X;=(0Q-iJaSVKNeK8%JaM}?#M?TZr+lVi(GBa`{xvFh8{?tm9rnnFw2Y0+ zFVdYN{Jgiet)IaQxx`zx5^H_}qL(X?Y|`}v2~bqrI5JuOh(l}C#-#;JtU1_uaJ~v3 zJv4goSvJO!45p1%X~dc@=Z!a2&*-79;5^YR@W$I`BTmePSI@>y**E6?6z;68X(3pX zhlzO`Q}27%S*cK}VDpOaytzy%WBo60j*GhzH9j5b?dUw8=)(Yu_cOGrZ*A}r*4Tivq3Z&p^8`@AX7wr5c8&?U>rv^lm8RX z^|MQGcq&YPG+vYDK3J_C;ldh4VYmPETkz{6Dp;zti%9Vo;+8~*w$PZotLxkNTKK1f z3p`Pzqi!#Feb%v*;0$t#?{e&JPz}$flcYlYt~8QUPzp5UEOIRs0(chmwqRsAF_fDP zS<^o=Dkp}YmJWoyR9%-ReVD-@4v1f&L>?AyVt9Pe~rimr;*g9EsPz((_NJ{-VpNI!qPS!d)FyAUOW zU=jeGew%9oX1}pyqmw6w0M2lZ-=CTQ8q>BrkdtdPGJi z9eaPv3>ix`*Dl?wpZ}r1b`w^dJK9RZ1L!UgL6x;%d=LQ0Jbx1BffndZB6nBM^>l*W z)Wp#L@S%~PYO3r0G0WojxU!Q$&gAnvokwifAR`?6t;NGg>$z4YRjA|XsUb~1d1Ek; zjMNTuz-Dd98a#&YiKac@~sg@F?}^VCGl)!(pI-q zusGtuSoCaBf*^dxGJF4-YYa#300VcQgFl^lA(oa3pXR}WC0eAmE+f%%z*dx~DJ=d{ zZozZ1+2|4smvW_eExA+{UGywmYjC`|tI6lfni<`TQAO>T9xARc2L#DdopxlTOM@dG z@CivEnOx3OVR>C&_DcxjoNRHsA4!- z%OX9R?^+uwk60z;?rn!8RzL?3RN2dd{{Cv5W-8@vdb^`)ObJX$;pVu@n}B*Epx~JK zAl31}TqdES(yAW`vAWuxiE}f3s;+s+ss?h`;D2nixMI4sB^oOe?4*~kvNB%DhT+rV znCe2iq4`ZA?28EAdWzC5i*wn7BLQa>dhOKP@44GkLdIg4Sd*J z%>g6eY2#$vch4}wFYIt7Kq{2}3SX9DNRR#we=wB*PB`L3O7e&$g+W;ch0E`|_ zTYCBq&9y69DW!Jf^y2{vJ+ynLi(Nz$PZ}Qc-jd&QO#VVl_2R5XZu-pcZqbeZFr#6M zQ6P&LUn@DJ#!#&BUK3OH4_)E*Zpn+T!fcuH^UM7Z+5&HA%NekDngjW+`3q+&CwGd| z3|DTo%0nFJn_nvhxk#xFBA!vR!V~6a4T@vF3PYJkal1Ju0H04o1B#McI!f z*$U(ti$>`qJEbJ=Xs}P*`ea&NQE+MkXK^w>bbBHGMuTlXiFu<{Yc-w#cKE&yn;k5h zCs&nDy(nc+YdM;vp!QsCF~ty);PQS_86&t3Y$>de3A3M{Nurt0JbiP0Gj@4M+R+yZ z6$~nlZ413kgC~F8M_xZyd5OvUlV2frXfC6_K>|tbNn9VH3bVSaxx8l7pCo& za(&>dRz)$SOxuab#K~+v70YLsxD=KwHoR8a6>ImAv#?TDO)^q&Q@YC*wxp@5vE&$f zl?R19{r?1{bVKgFk$L_F>e!AYDEk~dn&+a6!Q+@~Bu=Ra0nPQ=kWlPI0z@=+#N`ee 
ziqRJPfna?M+c{|ck}S_B%SaFSm1BMBGFJ!5{!8|^Dxc{D5K_%15Rh<)jWK$N)xcML zcQ?=ReV9*Io-cq)uuKV#Ppb1LhAx=>HJh~kFAUichJNtfP3`*>+ zLNsS>0)D{rE!R|}g3hbw{$}ID=$ibr_OHC02TW%x@m}gaE?H);P{G`B!5k^H%Zrgj znHDvLNo!;IUGfnc;|2VXh7>TfcN#sp!LcPG^X3)6kCH=#no`FsN2pYXJILoeB)?MU zz~7Srhk>BiK(*GSWAfk0%wqnP{5{0iH8|6ntyoG1z+SPcv@lz2uIft`8Kaqv=`}{B zLzS0=61~a8D%m=s7y!3o)imP}l8ABeK=pf)Czz8{@j&_)B7Th|AaK(Omh@R)2;4Mt zoDoeHY6+Jg>P!VxA1$2`Pkjl)bP;B^KYT^a{=Eb*H?PsOR+!{4Ih_SaF(l)8Y>A;{ zwU4$PO)=?qO%E{yPCyU;=k#Ehu$48I7x**%;~>CAmoVqk3>f|NZ|7uZqO)nz4fvUO z2S>DZ17D9bl412k4qnLjjzNb5)mN=z8!DH}2mvd*up`L2hti5pts~FQ4&arlAs0jH zX89PD7qjYGw{H8NyQo%eBtqFop-FVHTa_96`}N(Z$?{9Ur)2Rzde!2;BFir1r0rI- zF7ujosnvn6b`AoBnF#k^x36^0>=;U6b6$IroAasiZOb%Atn1IE3s|>|%>lPFB#q}G zdJSoD;;oF}kg%>V{CEn+u5wINx9n@~ZaVf*4H^12?26jFc8}7r07ZQ+bMuq?UWPyM zupODkAB1Xj#+xdGXxMV5(fn(-kk zE3pUK2Xzzj;yTV_nj5+O<-C$HO_wF|g6)>t@%1p*+xJuuwL|ODTJkUm zm#oJ|lUUU6aAL)vYGq3$L0PSQBE0?m==_Pnt`~CQA}kEsCcy>dm&!5Dw8wTTSujz1 zFtdMAETNGzb}%@MU}uLQ;(nnIQ${49b+jVV5{o5fSM3(o7F@fki7+inCV_lps+NfXbr;3gcm7n>w@G*^u0z&XuK}{LFa1y>&9wWv$uC&%D6A&WVOvz2Ag9n5?$-I^e5e6` zN`b~fxnsH}ZvG&Zf3l`4{mN;p@wnk^~)hGJJcZEW55YheRD_HbyS?Tmv`ZEJ^K z`qLZ2XM~Z4idMeS8x=NdEHbp!VR2Giyjd0|oDnKLpxHD~iwcZubZ9Q^ z&HCfmd0s_Y>^G9g^A(JmU;Ms@Y``8caOa*2^aHJH7o28~)-kEVG8+_>p0qmxy-I-E z+gWLbfTdEGvs=pZhOdT(u@-QC(2UMH3S7W8ZqqsY#KwkS1s3sJMLsOtEp<3ml=~QA z!y`ao8kc>t9fti#wx`mpayII1NS^ae#2;0Wo+Fz`fJNzBxTrJ6%FpQkZU5p>T4*kS!CvC_ba*F`IpMJ5<<~2F zO*^M^L&?68ryy}K=sB4@reclAV9$2rLPPlv3*W4G%%YRal4j^5lsR)dEBuv)*t?ncCiUZh*aUA+c=4TPmEfT0*sUs^x zUd&Np#QB0Ppr))#WsH*`doT|36KDqI*+$iZeNl{jB^C%dD?bB-orwonu* z!_^AicNbb&aaYDs{3nzWI=oUnS3g!zQ;F_l^u zxAe67WV&^IWD{~Y{Flo!Oip&ys5jZ6a^0H!2)RLmx_i@5pHLIl;Na*xqYWZMs3`=>Y zwvJEA7Erh1s=H3wS|byUaJ$j2)!Y`v_nQXOJ?d|czr0eH>cS5XuX%o6eu-ewHAA!i z-pq$0Fu5jm0z<>lBi)wNHNpnpFus=VkWh%VKV}p!EQT_Q+wD<|5bJ(IogF7mgLTVu zT5fXNjfr4RJc)Hjeo%Z#(J*k^EtS{Ut_I}b>YX!9&9tyL#14xNFkPyQK?ri2Vqvpc zy$%+(ZlpoleCj>F01zukmUc)!Uv6tSJ2NG6B|CuAKm8LGwSz z2csjUddFRVuI43(eLt-VJN`@nsQ-n~w4YIc%M(9{sdIeRHI-E9TwgC@s2XoDtJz4x 
zd7)lIvz`jpRFh>3I~??*@8#SIdx5d|I)kd)`};+-?Q6BYs88-Z>S`0qn>i0jeorz# zH39KbZ|p-Eg*&bCb;6rvt31VU$g-rk7;UcN3>9-rY-p$vfRCLlsT3dIpQx7<-%*Lo z0U?o2m4>H};?hH0jPHQ=;%5S^>Yw0@LmgP5%h`-=tq`Lnczav!=q5{LagmpGoQ`HW zITY5>JG@Et*ts6Th+vfwM8vAdGO27$EZ|1p74YeOG3=KmU+<^>Y*!kE)T0#vX=;L` zy|1HT2C)?OVF8|ty_40_{ecpr(#Th});P)qtW#{WcF4s zLzk%ZFVo#mW7Q6X7Y&M7Mti0-`0Vz!Qg_c95q>{ZKW9SA7Tay4BE*a$nTb;AaVY`ju2jj77p$RO1*RTy;7+)}wq1d!d^iz@Ck2yaUQXHMyyMPPpfMMV`pdLDB!n z_e##32~1P)n3Q?gI!KcL167~36-0Kbt5>pA;)o3$CWF7e710POLoOevl4Qe~aF6z%d7r0jTdSkOeI_m`4U)T4RIrb6c^pWnxhH`Y&Y22q5y=ufw zoz?=y89c?7FqS)t%4bcZgdhnmSEyp7}f_hthw%5%=~Tf1zmC?0-VM^g-)Dn0u|0(AEPC{3tfRssX)!QF|-kAld)yIm@Ia9Z7^Q^u7*f(d=jdCgGT^;<+vcyC{<|algk7 z%5*68@sLKR>joTMNcV_-uQ~m?bCCZ1HsIt;)}6^C?2^a&`Nni0Q$RH6(TD}U2^f!V zw&kW~Lqmj392Y4CxKN2)7=a1Jm|4nC37Cua*reSw%-mVYNtbn16&a1#f70{5{*nZw zOyd1aOzl&bCZ^L!otU{@MRM92IgsiyZd0JLR3)0d=gWvp^84QY&H2gCC&dq|?9Y>* zkG=$8ECZ(UEoQ#Dee-X6@dt*mhh{(T6H$^EKHYP~!29Wa-saLH$wbuw^3{pD@8K0h z>lqRlsytZv&u`3~#1m1u^(wn92JxN7hC4yn=+aKNbRk=Ul`M7&xk!AC_DU5VO#Z^r$<{fYld#Ycjt7QM)l`U_F7@E0P6$gQu_?|DUF zhczAsaJ`^^E9(rP%l;FR+7&ToAR{jV z_cAmfONcKmmuoUzm^uvQ{^C3(@U^B7ZL_q>w>Fr^&B8ODXo|e^of9g(=mv?6Mq^oq z8Z;|0__aSgmzEQH)_Gjr_2PXwX?p~BRGyb0#v?SC&c+5!rAPZeZXCsXqpwuZ>fX;F87RAZ_-iu;+g|BBy7rWbJ8xFmk`=8_MD8~o%HBRG$C8@GJ5k;KUkMm5)F2@ zh9{e$BoiUVyNhvC-~Yt;f~B-IgvNW;&s%`>34kBH@Z4_6@bMQyWCgUdKegT0{uSbn zD7HPJamT~E=>40hOxn-W{E>jbmAW$tUPMz*anwZ8-MM*KzB0=2(ec&RZ%k>UsDp^2 zIlURsR{jGr(o|ByKk;xT1bhq^tQ?J->^W-X{z4eL2Qb}jk9etpPkv90voX%btl=>W z%)=bdYnVsO05-LT@#;pbtg}Xmqt(pul9UUViFqs#> zn0QjMKs^YRH3$YcISZNGRMOCkONkHur~K=GH691Q&~66t*l$?o2={lvazq~H;ksE0 z!c$00a+56>){wH_v?vI+mqs3>l7}xadS))|%xvmCPU{1Ky=N<#gtz{WP~-az25pt? 
z$%;2N5xteWZ(dQ9t96_qS)%)+xD}qltG{i5lM%gTUUL!Na(1PU{Dlv_+v^W<;RZ?PlsfYj7Pl0{m|@p@VAFe` zh{HbnP!@pqA`?DYEY-so>#FL`Wgm(k^&1qn7)KDDkQ*Z|(# zy-i6f6*#fCurB=o>LS@j+Gq)1XQDG$+!HrJKgI_4bZhKzrGs|Gw4G<2f9knLq+q%p zTFElpuArnkc8#TokNk%F174fNe<2osk7h6DgIABzUkJV$x9Q$*5xoC4-~I;_=pU1B z@CD{euj&p-@9q1~<>`~RuiTG)edC{wvJ6+e+m*XWx9CX@&$mYgZyMj)NF!UORFQfG z#J`wv7<@wjYu_tl0{)QT?}vO&36&hC@Md33G0fbZL1bgNto#!NR&oA>yDaN8#BC$H z(&{UOuK6xSV<)kniGNW`*J50Fb(>gbDTa)3av3aaxyuYnFdNWb6d!ZkDD*g6#~qwS zEfJObko~js6!@wvAyO}}pXVRxQWJ=@S7Q~vXNT#6J%aZKKh~TDxkGF>b^luD*J?W; zy32MhSX*+5h`HtWW~&+K^OiwXMYDbGaP zdveK7gr1x@3wCKU`|TQTLZn`aM=ng`ENU0yOmY7ErH9D-A>;*yz;9@-rG#jxHdq@t z#M+L~0+on(R0a`NjQ1kptU-6XRFC zv$nFwi z;9gE(YfWD%6!wSbJ5)j2qkea8MtD~mw{_Te~MZ@!?*W!aoY<|T_tRA%tDsIsX z!&wC5usn~pO9zJzNyq*wp4e>DizoimuC&;Ty7ZC0&n&B9<6nXOyU?w>Gwoq*#roMXye`-lz6tX`1xk zo+i}V8wogF?${!5l>{syi*t*={|oVQ8~ztU=H$*WAXtFIYhjU=4|AOVdw||F{>!3? zedREF@!aYGvFyv+eeP$9Vz;Gv@_;Wil2vCe1Y5jjYuU@hefn>1H-f_)PiNuZ6Eto52)y(JvoBy z5;p!d(|z-(CLF(IYvQMq)-!$B>+Dms0F-Z_GsG^{jEQc>ivP-*}IP6x1QQ%_2(D z8|%Nb!g>1kl?S1(4-}a3EaM;yToZ0Uo>)2j)jE!(M$p~KpSU`a7OfH-^+`*9ijP8E zkEGkUYE!pikad;pqq=NvKkc!s<+gr4JUV2*4tXx_%4qPyfRkF*g$!QbfFErbssw6; znR?v0(ugM&Z=AL4lODf;PP;};CxrM9CGNaDs?Da`sdbqXnlHFi|9{DMfbl|YFKw}m zQp6)j^D7c$;3nd(j`yWD4e~K$x*^U&F$9v5QBvVhRnzqA4g|wX*5?d^&5YOd>M)e- z_SO-4L4#0KaznZ4jm9PE!C%wfvDBOsair!QtFqyBtK%V=dAzyMnOX3pRh6639rT~X ziOHaCCt4H}8MA>8%`8S6Rrv59?L-f+Jcm1jij&Xh>7Cc@>$RaEpzv5fKQ5z!l};TZ z=GY4jDREk|o_LL?A$>vyESVouhkh=e?PBpKd+QTDrwVQvS=B?mKTAtm&8siVhPnQA zhSzT@j|hD|@!!1iQRBM35VfWp>ipK2hz0s@6!Qs#_jM%QSfoqv{z6dUv!V&;>;fOY zPkcra5n;*ttk-N|k!|UT?Q8oOBekJ{rB4XTl4nDB5WQxiK2BQ}9nXu6B!`W!5t5Hl?WPGFHpar-SuS@_&Sh+JC3<>bx#w zJbf@WJ+tcFjs}cyZgRtqUFMp!V+}FM7OO=-hApzp+u(+NVlfk#esxnafMVP2`~f$| ztJL!i_hhMb$pd}y$=Y%f31O@*c2DWll_|7IrL6ksmpP{U`-BqVw&Ys-m+QjS^{Q5b zi%PExsdy(T;y&-SB8FNM!{ZKz*A3Hcjx5w12}{mj9Lu{2D@-2w$BhRvKN6>kFna=~ z7G4TkhozcryB$vzgGt9~`EGfg+e)Bb>)^d`BLsAdr=GzUBov?bY}enrG=MjcX+mG` zvp(1IUVZ|l%IAHOR*>MQ(zt6F$%)lvxR%6BlYT 
zI*C}eh3V%aU6DDy#*};b{iJsEx&4`8*kopMCAfLo1WTPsv8(Aa zb~IE7Y;LFCe^g`qN~|33xj*>?H9VXM%$PE%#}b7uQ)s6-ioJ_Bf-zI$#|rmgiw1LY z0g_5QSpSE*w~A`>ZP$Hip*R$GcXy|_dvSuhYbb6ll;ZAQv=BT%aM$2&L5e%Y3x)Du z`R4l8+;i?d57y}(BN@p^a`fhXhqB9{hTV-FYiVN##MpV{d_EKHNi5`=>+sBhvgUF2=MjxvEGy% zw~csw{r$>OwBch)Ex7~kZB3`xvuWqf zW3emo@$X#2N05`|$O$vxXfIDNX>7N>&TVxqF(`5GiOk(d)5cQ;n|JW)qw>sNJsHFg zHv=aiae_pL<9RI$ha&oG(sw2qFKOBEi4R_*h?I#BzvZ0ylw8T{B((}|4BJJL#B9~+ z6W_i~&xn5;v-iP`M~se@oPbPh-*P|AHRG}fC&=4nj8s5Ar)0VipYCG55gOiF?{UAt zZSw+)fzY!n-B5|72qurHbmfpIC$`M{R<2FpQ6V zr{OFRa%Eo&QU3S;RRl;Nt>i&i^|VKS~-mmN&7dcr1%Ah( zeG`w)6kOx5I=aQrq^IlGiu$QZZ4BbxZ?aXcBvoLywz7V({eBq35`K_@rP$^8z|434 zOFJ8%7Ji87njnHt56wP7Q~Bkn2dk`;%$7_THYN<;h80aJ<0^_C=i6-CBiU1Rjk85o06*45!BCr_1i#6(>aseRFtthy-=5zhK?)i@acaKNx(tzM~YjT{IDT~(WUhW3@W7KJ{#*uIj z{$pl$*P_PR0XNAkZyw*7sm{0jklZfP^GO{FC#BeeXb)b?18DWRYOCfPG>Bx_s(j$T zT~k|ZUE5T1ujRa3&Rp(ALB!-3?96j3Z3`Ju_6LvEE6j#Bu&`_tHZ(V9k z+|ne=IHAV429t-#uJ}#- z0sHr!@@L|v=1rq<-EP?#e>L)-PPdq?$xKhT`1>D(DK*8AhjX{MBwziA-@6tXXuuw( zMwG(WFE*E#{C=JG=_+~5VwwG`%>2Ja(Ehsu53}6m9>TFU(p`~@iT7Royo;Fyr+{(i z)Y^PYTo+Y5Bb1r2ThOp2QhnCbP~+^GsPbv9mr6=Rj`&LvU@BK1Hz+9FEcXimiUb-L zO!$oz$aKhMOIWj{?r-)@F^lO{%+=)Elp@b~DJ~b8^lY9!A0le(+#r6H+xgC|QK)Pp zdRE^7?o+zn8PV4W<2=NTgJXB1$%R>?tYdtxS zJWl;=gP^5#mxhKALuTv#*lpF$SWJ;(7Tr#F|Fk2aDjZ=emS-R8x`gJ?yC7s+Ge(oN z?xV)eFKsgvljPV>`419k_A(TWF@i85G0$#p>uXDg>+SYRopBR`I}bwUK=(7BJ5UB- zcxz_C!bUZSOt)|?EU#hm1m2=i?#P9nwObEiv37<9uai(_LvBcRwuyjVcz{z&Ts7xi}+s7K>Ts>!Q zby|S43313vU+3}_bPGwBo0(SBl?SHVH24QWt$9PR#U~^o zz2re>`&-|p@l*KjOKzDr?t?CcUG!m+DIIU!pp2rZ2}vq&$HF@HwldGTgB?>fqoaz= zh_M2hUdymL6AR~lfc2e|_{J|av4@;2jNPA8s@7Oi!#ZHh##b_pzXY5-BDe#4gwULT zTP+t?XM-obe3rLDTd_*K$C2|mT)RgeDU;=dr$-(S`qTs0Auq6>f6mxrrW&jP&kUWz zKxNUmeO|-a5}?x1mol3BDotA9)hD;@)5kUez=ROfu6MoQL60;nK~kl#z^; zF?eS0kYfIg6S+9bhKM*Nhl7#(s>u9E_~|kK0uA=uG|h@8`9h^v)q8wh#FO#Tpm8fS zDywtXz&dW=h3#q*(^3QjYshPVr;tXn;}QnGOb2UOL8gvpu!AtYBfVLdizJUj-VG^c zSfR2yO@JmdLn>Z+WZJ0wUQW#zTt$-H#*l`GXp4Z-Wo_zHtI`6lgE5>g&lE7!INA0T 
z1ZfPnzqLw?zyI4h&*uq!!*VBr_~Ft}v{0?-kgLdQ`8^rM=Sf;RnZeou z#fW;YRB=vP_ucE_c>wOf`iDTk9=fT7@J^K_~Fle8Y$>a%@oEWODRy9PeY_`*0x`8wHs5g0U6d)>v*uS zmdjQ{;N{W1cWcMWm#^{3b{h#O@ztnFL|ls)PG~HeYG3!6aUTx0pK)vAC`j>xS+EXF zd2otHZBjV&q?afO97$F}|5aNF+x@$?61^IRB_wCL6OEryjB-yg8EL$>jCprAvzNF}wA0Ytrr5wizvw@=0PiiUpI3ip zy}d>}Gx7B~a)$6bL#Qo)ioJV;f_H~K;Wl}6c)?aunKKM))bICQ3P!%`Tn&_I?);>` za$w2H>T>Mkk5G(6P85;6|Jowb=ZFP2j`3&TiognzW84)XXP8pl>N{=9zTgq>(MeP3 z&gcOKaDF|T14XdxN7rPv>`b9*k#q`?@9L_OpG4jzCe=|^3-j@-NmQ5#VMyro6fL!K zy->fuhyH8y+(=r4IJ-{|TzulrR@Uf zGz2w~`5#^agXeI-hi(?%q-SOO%hWyx{$a>3{4OOn(Ny0u(L)_e)vx>e`WK=xcI@q| z={i+KGn{4Tl!6(VQNe>Ir((L;;pWOq-$9zr>?u8@4q8HfNaha? zo3bLQKlpYv1GxeXm(JTO%*;-)Ar4>Y5$q5(vK%vgTBmPcK~g z8Wim622wZQmhh+`HaKhWJRnT~3`taH6*D@*6{v zhxb8#h!?xz_`NqIkNRP%;wSyn3Ii+kWB`ym3iuImK0Ng^uq%k4WexvxbWDmU(@0CV z;uJMcXe?R^fkqmioS9JI5@Ag;H|O8(HJ2dSkS|L1PWxeaRxjEKF@43_HZvu=!-QjA z-%!dA3W7dY-cO`+{bhQnlm^$E(G2JDPGzXX>ESdL2GXBuHMVrBE>0#0F2E6QkzR-B z(Cdwv=C$?3=x zvA^%&!tH=8XI2)p)UqGndAlUGgNJ^7xc>*CET@x&6rzv{e5iDJRV|`!jKj^$&CzJI zHCAF^ml-5)rWi?@Nbx;eA`x_->V-V(e$r8eaa4#)i)CFfn0>ko^?cI)q99mOKaOKR{BDD}ar%puQ(>#)NP)|t z;%}yQM3ggZjG+Y&PxDG;<|%W@k(A1sh>dV|LF?+BW;V_x(hu5sCL65}Ep_f0E6Wpd zb^iZ(fbn1V^W>DRzvjCg9Nq0GbLx*w8t1s|avn4CIa%^Lrt1dcwk zPh8DlCx44VNp0|bP~2X6CnF-ZE!iF$o`$N#9qIt%et9#$-H-h7uh`8ZUl=auoN#?QpN5+CFCe(1b)k+qqBM3&7Mj~Mart(9v` z9veBRT)kz&wYw1N4IaaRREs?L$dowEXLL4nsUhPT*1I2PD0?61FeAO~bB<~D3beJL zjpSb;lWi0xjV=$sGmdo(zO)_sP2X2>jghd>?3pwZuQIct;_;b+oP1yPc#y0D6 z-f1Z7U}j_gE*;j5`h?)Id%2R}D@&lxl zB*FP-x5bM+;Dg`hLjv1UX%_2ysqDgD?GE&sJfg7LBToMN)~p|P%MiCS4>G}r`)X@* z@JR>N49rj!*0TbAN*gKIcmFe;E9x3UY=m}vZz`-$u-(*Z#M6XP(A2HQ(X!xT!5cVz zgsLxJc~M$%-wVsDb}OdEk>Y=yRGP9*0!uP1wtsk2_E|S3Kaf4?hQUD9o-b{vPCx)D z`I|{9FRB>~oh$9F7DATm+!Pelgn|YG`pmV5r15TV5N~jmUiz z?A(zND1ax-Tc6O)d)Y0fY;$R9NgE?eL$$2@WEoZ+ZtPSEjwy-t1A$ssb5)q_Rw_%f zHq~4;#5v2|d!8?AIl&W7CUqUn<*dfb>4#_QrkjHrVvC1~vw-S~j=6C1cGn&8IkV2M zUatC#$!3JC&M_RZnH#w=Rc@_6LRw8iSbf+vQPi$SDgK>Tfj$^O=9iT%mh}GfcjN1X zV!oQ*0@rlte5M_;ZG!Y3&m;frUR*C*nyS;a*=ovifi160@r8URE~xAG 
zrI}fxY55N<+WQbRn2%*0|@0 zPaBDl=Gxf2Tp;8SKPVUa^P*?{n=P5B^#y0#l)irT5TWai7<w??zav(v(oM8spwTqAmNpgx^_+q80oy zbuE-Sr&oaSIv$+GULr%d6)$k~$CqR(Y@)<{TU+SuPJ?s}qPb263P)GzHVF-MAQc6_ zbw;UNFbjXIWA~(LmsX>Q&vv;(tDj-jF3j9Y=S)*|e@p%J!O%=V732KuI5ORIZgT~7&Pyz@5tYVJg3uI%-;40VC9Cn{ponxT*v;i%2&D4}4KzUEnpQ`MG&G zeS)ugN`!gbxpX&G%_UYM@sgjmR2V7B3~A+1-X^pe-x+!eQ zcO4o&kK0}+d4xo-!iTS4ee;^aDUq0r!EcVKoncb}-we>2ou2DSM<&-=5UXC6;xnP@ zb4^E<9y2p~K#4ChJ9|I%IT6`%P9mYA^!!E|ANmy1n77b6Afbx6LnDwi zmOfpdjjFCj7$lGtTgSD3I`H98==|i`t)I~|v;aQ4ENAbv4tr%R`g_pZt%>rym-sKP z&X@ecr-0ADBn4r?XK^iA@6Q&!ad*37SVm-YCg#w-E`KsgM}TqdD*)XyN`{ENSsPmFD{k$+|Xo@XtNnP>v1Vuvfs>Yb04K^!zAR zDKRmwx3VJnMNDcDo%R%sLer{fo}wK|QO{x3xb8a)^IqB)*S1%_zb@Hju_zjS?B|Y# zxm~Z{nTUW#Nj3F$FfJ8bICNmd?jW^NAGdN)3CMZPq^1W}ee1~2{8hL}gRH^v;I{gq z)MBx*Nq`X84Q(BtQ{p0MA#m+htC8tot7}<=K+0lO*+bmG6LsGmq5F3G)CGTjO!^xF zTEa+d2HS<);7nYfZDlT_ai*v1~I$oC?A;+(I!Fqq%W-1w5T;RbUPXUN)KqYb6(*_ML& zgvY@IlL4#i=uDmTblTEZ7&t+)u0D)QI zj!PkrZ_y0a;-)%rnrB}L2%IMF$t(*uFfD0?*|l`!Xlf91};x{Vj`!!qF(eL>rUwnOJ_jP6AY>*sMMHaO*J+ ze5&R1KdSxf1zaorfi>zL4}iX#bq7^8xYiL&#p`%p>z#MNnX>mV11C`TGIP)mP8iHb zA0WYps$>+6ftAXsL@z9__UwDDNddVn9o699XZs9Tq|yGz0Lp+T_cBdL;8 z{Jgc{>_cT_^|tKdzKp|vHoU5%yVEqRz4Yj-9CP&P)~^qM()66}8Ir?g_Y4_5n0R{) zQe>i=YxlA?3H7QmCL;gBPz&2Crr ze(}SHv=6E4j!TWad@@XUl|wrhUqs4(L$OsiMdfK&ZUr(!jjBg#jR6Yo~ufGhARKnn%lbEXe|y?Kmy^kVs`b?A39d#d!#9`j4or9}SCS-c;O7fDzOPn|@p z+O=(THBw4*_iUAwX%d;iM+Sg+LF7{j6<6b)`?q#RB9>5rW0~F1#(ke`>TzI+VM#5e$vxOU_Ky;cy|vQCaL7 zvi{&lucGwKloECZSLE)YQ2e?qk`$bVyqX4fKG{cI32nk>Zi;tn%Y7$QZBE}zb06L- zH-@+f@vZ7bQI37%Pr;oS#bVm|`N8R3bo-~N0QKYHT{A+=xrhqY&#sv1aBkpvD4V@1 zRhHC;&Ox{d*WNa)PlOtNx%0W<0Sjj@dz(5TPRq>+_Y_JvDRl^3S))|( z@H4s|d*vORRk7Lu=hLJId=>IbDE|iSr9VqsZL&UE0j1ej>Mgo!U7qYzM3i2N^F2t# zWhafkZUxN!)>#vBVj1%;!0L;t(3SgYo9gY1KnInrm=Nt=!$WT-0VLlMe1ZplY6~Th z7pwj1fH1nT8v~oDuzQ!Q8@YS1+rv=!eslZv?Od3i8Ok+RXwl}LLh7t*n7J}gWF-St z-ed0YYYosG)^iYFeqsJf(P9@=Z{3+r%Ca@;%n)g&?O`al{;E-a<{|&vT7^8@&AfiM 
zYm+S+75G-1E(h66j$T&W$L1i?Sl+w;GQXi^vsKi6?KBb39z0#|CG1xVlrpUcP{?eUT~O9jj?3@2R+CTh_e3oH+HMMNDG!4x zl9n6CuEKcVy_NYtRey>@FAfGVv>>Nwh@0~k%gCYr0YL+gA2dT6!9d~Vm7w)E>!!OF zaV^W*V)1sU$?5y7Hj`827^vwJdywTbRq_Z3FifEg z|2O{pX5whWQCi$5i6e_^W%GA1;V@`gjRy8Az@W)C2HOOhLx&Wh=Wu4(X6o)u7^!Li zSodgrT6^QPp-uh+cRvpTAE~NHFr@+<J~!4GiIT=G8%Vb%v+Y26;o(yMRIJknYbf6~qUb z{@9AIey8tiHuIrBZiNCCDA-fM@zqQ047udU_xQ+*)@wou`FLI0xEGt9EgdQq)x#g# zVY#sLjT&%9?R_e?2<}pSxAJ`>sGY~acBNZnP^6Uw(eZCI0Pdh1)`)S}CKC?3bRI@c z)<}!OI??0uUt`!@O*kz}eJ}z;w)-T`E*;d#Uis1X>K42lYh%-iLsebky)Xkw_`*_4 zq>0?6Nhy+_a&N|*8bo5^}%8Y+5SD zrl=izuF{+Yy?$R#9B*P)!URij-;XJ|Uevh5GJ1sww2`!|zN@T%5c(df{(d<_dA*$S zWP0Wa@VY0Dz6stYr8AMTGnSd@N~}(9Boc{%UgsCj{#j33xoT%s&!F<6duP+@;kbDE zHkHU?sh$QGN;6G@A28cNyPvZhX4!r&1li3Y(jNs#8Vt|R+UHDDQbZsZM0y7-Y^8D#{E_JZznt9#7$f1~ zwxUgKXY}JRv&XQ2U(FZ7NVR?88kd}U*d>FnNt*rhJBt3A$}20;@t2QatAsqWi(j_0 z!tyG@W=Qba#9ZKYh;27>RH3<7;C|K&bS4Cht=(wVm!}XmgEyw$&Gl4)Vxg>RdMbM9 zkTU|!XSbe-i`rJ5ES=_`!T&Q{=l|5EGZ{$4pE@{=ORBTD^fiz#W;O4#%zWxgoakaz zV8B=qD9gm!Y0YhAJ7f_8bMbM~M5?-TiDDF;X`Sx0N`B5U`be~3{W9|K`_wcpP@&~` zDd)z`4P@fBeYRA*3ja?QYFS;vW5FIKwIy4*ZZhNM{>67woJ%D6QDG07T(W|>H!mti z<`m)7lm?MBBxR+=2O26dLf(Egd}&&yi6omy1df|OqZcq>>wBwXgF&4#M?Z}h}`F*ml^kO z_-Kl+PYAwh839YF9D50cXrDgqdW>a`c}l7M8W&6b2O&H!?vdj2YtFy!Z0`jj$`Ie2 zACy@?*SG?gC%=||ra>A*A5X!IjUIEC9U5yI67M zWq>wt)G3Rn&K^H_#VGxR*eak@RHuIaIrDXj(M=7ih$Z8V@x?3XE%w*ez(-?wJPM^H z-k&)wz&oh`mx4ih(aC$4Rm`8a^qA%M0i{Y;i!&eAcNlR<3Q?>-1qcWJW3~^p*Mo4WyjVxBw zG2Ui#DIH-}sz_5C%@E6Cr%~W#KJm zzjFLaFltt@#{Fb{?`ReDsB9r&Qd$ znV>=a5c^1xaIV?b`#*20Kja-YUYDHx9wR4x^?X(s5A_tzT$%jw)!QC-_({v8?CRtj zTJ~T>@9+;x*6+&8bTkg~TO0 z?3rBC9y>gcXGn=qwEmU+#mxVC-`Z|E>rcb<6O%>O%~p~nv`)2%sOxUrSQ_J@%vNz( zfxwiAE*l(A$|7@6X;gjWE7<5TT?ZE+t@!T@oIZ#xkMSin(^JAp#6R_a*2>PLC{$JN|df+pen5_KK94Q ze6W>?_EtN)<>q0F*8<Ne+mY!Wkis z8PAKQ=_Ih#&Y0O>y&nEds)3;#t=ETBwrIJ7v5jtdctG|8_V?2h%KgF`#D@7Zk9sMc zEbxYgLWOR&W``INi9U@4L+0#+-PE2ARm^#TqF|5Pk_QNU{a_|3Z<=Nxv`{alBzzZT()A3d--Q8(tjJ`Kfw*FOb8_3L& znBp2p$w6)BQOc*PRHkl(|B+R11<#6rfJ%lu*~{m_*MkE6&GU%V<6Kx@-s=iQ!#@bB 
zl=!=vd8JQApKJPQRw?GPmq>(6(u#(DjaZV)(ofXfh~m9TfK4&G8C5RE4Ip7CTR`LR z*mee3AkAvXTV>6c*L}sI(~m&sp*Tr9G$NMsKE+m>6M_sCB$s#l6p9#Z(8fHQ1TsP& zvYt>aTi&V|9HxEBE_0g3#}c5BHZj78av8pyYta+~CGIkJtO3}$V$XUiE9ci9Wv5>_ zS`+l~8NZ*mEm*BlcHK7w5Gm znMukK68fEUGdcirQ>Mu@XBu^Hl!OR}cFZ0|Tov#QRNj{-Vu?T{cOKsc-gwcgh1~17 z7Bh3!n8}}az!n?XwtX@;-BxtcYuRU}XtNvf%jaBStb0+|rfj)buws$;nvS#D5PTH^ z9rf~YHW0|nE#`+^t3}9f9hj3zwW>Ft7Vl(LO5#H}3SL&u81JrWj`XI~TH~ z!mN6R4t2ZFoh~jxD%szcg~Wr)K_Vc|G}&Per8R(YE>GyA?xDiFZeM}y6}O^9y8iLW zZJyvM7YczuC@g5fa|vdMR?+Fe!je12Q4m`m(h751#Htr}I&(pL$orv_;5Oy`whfB# zWAz~3C3C3Sxn<}{mFcZI4#P$pik)m^_VDZ*j`X*g^3!-^t7V=if=HAxs_UZ_yr-Xn zCMQK7jv9-0hlySTx_;N4K6qQ2KQ`UB3VUT9wyHhoz19uzPiD}K3-hO!FyAkTOwEjz zd%)JyqDrQrJGj>N>pud8h<~#!%S+gLw@y(yRQIiUmtB_fc<8KOutn~kxBa$adP>zy z|H0-!)+8%Rm0d*y{3OpKAc@MSFYq1@4TnHE0;MoUJ$ntxKDa+lq3+X1ckN_Ko9Shg zH9bNI^?$Kq!l+!jy1{4JW1ie7^>Nkjge0mk>~x%1sIHf=Gh1hxslt4kYb7X3>SN45 zO``equa^OH@IHR$K%?~F8hmSt+YO^|p-r`hcSglz86rTOmLLKNQdFk}0jE>q45y$! zf-MW9GEYuh+8xP+;PK;A{c)v4+LB91tCMx1?-f`OPdX+6> z^M9D(=<$0qzX~QH>p?zK^?FVXn&pH(h~kZ1%=O5Za1 zBOb@QSOi?oV82kq9kU}yXTmo$Ua(d4>`lAn(}Tu(-(%%N@dH7PIO5mm{!@TgNeq&_|j!0 z^Fs%z{uG5$f{{g>gjouh?~@hFK3j;8IN~;HrUeRAg0ToxCWI#o=u$5>=*nB+9qQ$b z!!xlWW+|bi0_U9ZaZyp$EK5vb=G5FE6E1(trxzazHrvw9G|12;P&%%aYo#_5V5e{V z2L_kJvr3l{ciH@dkU;!`)_dVKA%5Je76>g5@tg_V>1uA`kZx*#?&dLyFx|z=PwQvd zp?dl=tUmI}#-}uq#nu>ncq>Je_=f9AI=sKZ(`chmbN?=XW8k{)d;1LX=R~Y*bHTML z8Q$S>93zC%+~9dxS4(}$YX8Q=_YG@fgZPo8jqS+$7~lk@Ts+_0dh!Uo*kvAB%MROX z&``I!1DgF<(JB;q%APpIFF8M0XVNW{D6Y>Y>bxBMr!5Ww&EZuESb(%zFICYg8YQCl z6B`I2x1*?}JK4L75cFT*59=BW!GL`BQ}TYtzQw)MTICX{{lOyPq&-#_*=OB*4`Ez~BE2xu_W0Xyfh zSsGD(nTaHr%4Gr#(4e!ygHiVL6y9mq8_~hJ#6q#jdRD7X6>NdpzEku1cjmcOTh&UF z`Kz68Z8pToRH_>EkdESQW$G( z-r6Eg>TFIDM8i+`St#@_KlkI!Z>B?hQSyn2NxLNSIepTo;=M#l30DMt4kE(Z+zdp}1O0%Cz1`jPjS~^;hw=KbvAJV11w@u!V(lP7ldCByZBTV#yP@k#Y%o9+0 zJa;S0&o3T^CF)lJ=d}-px(iwwfzOM_{1!o`nY$ck3~QY&0@X1aFj3DM%z^G-rM zIb$?OvFC?4UkMngXsX{qhnPkAxQVkA$L7SAnER8yFkA&7bEWCOb2V6U1CO!@1K7!* 
z;0bMsze4YuP(^Z;35F~M-%@8F?af#ij5V1iaZWzH($nND8vC@M;GtY+>)BG)Z)*Dh zuqnNL30-V%%-g8$qQ2P^Kiq|Mw�-CD)4}CEuD9aLx4o7W7Op81#B=tfzuZP8tQd z+`AzbChj1%`3&%xjYDXrj}rgAFA4~v<>0v?m}EnjXC2V;UL<*Hm-MaJetU8b$x$na z$8~tH3eHkJ;MGsTqGXwc((*oBBP}2uWAmI;1*#f>;1s>8ZReQEf~7|JU*P7BiSwTI z`7;m5sm_@@q#5pNU{{@QKAe@n+<%G83H&yfpOC?X;;=B-R+eT1T;pFtqvFc<4dabR zZp$I;=AwpM_;~tXA%_QMA{ynf9RNS#%9b6&F#l2bwu$=@KWu$3Q{p6|^hJDOnRUX; zGC)zT5C{pRVQRNwOHX1?93n`NucDJh!;81^1)t<850cyhOd|(xs#T_`lgWf+_QKte zgx^ph3cvgT&`0JfX?3`kkF~iHRDU~NJ1#%rh$CE7QFAdiC>)=-fSiR0)Jh2a=!fq^ zgRfSW=1J3NQ--_0zwI-6j8B*&sURt|YI{{n+q3dGSK0M4Z2AG?O+tf`Cg|JgqCa+N z#B?B5E*}hvfUSEAoqHVLC2G+<>PX+YKdfFA=;a!>)K@isY` z%;t=85TwRpZZ33^D!F-3Q~jLC#Ez3>=m&ovv+qm;x=T;4y|u^y)^y!H)6Ho1q)sy7 zS!3Xf1g|kVH~pwDsP}R2XsLBmLBcO_%W2U-cAb+cK`qJe@V`3vkP!d;W!~45G(Vha zhW?;Y-jzqPj3Q{`~NMPifiDGWBNd z={KaGU9`%c3?@u^7!zIz;Ht`+8-~aM~#D97J|GzKal0Wj%Iv;PGHNP`(bFSLh zA^HQbv0IYOg6XFz4CGm!2oQEl**UwIFHOZ*n0T~zEW`9J3%7bcIR2SFyEJ8Hdlx+= zL@yBTJ6RUQa4uuXmuD4e@_jk0BYc3~tF4BDEfdGhi`H%uDRok=5}dF$5n_C{(9>_~ zDK(X8Q~+7CgFiB1c9{S-PRImjIjb84rmB+(b7NICfj@Q9%UQLQq*mZ{dP3pKpPhm3 zGN;K2V{g;u5h3TkYSU7iQ|?i77_bX_)SYq=)U=j@nUJ z)i#l(!_#tmiE4=&dUI?WKFG%iUunm5Zw-K3B!GxW%v6?Fens3ThITV`d34-RW zMvHa_Ct#2cID5ytTAWz^P$zQko&Jp~v2yic+v#kTdvf1n5=+q1sGlczO$)v6xb zZ<<|wZY8c65K#+}?qq%ePONOocs{E72Z4}6W%+PrR0u|M zVH)AUl)d(a{TkuTn2A1{XnQ%DxSOGXhyHBIy<*lJTTKSR zgXEgSj}wxEE*V#EIPZ{;ZYFIL^10td*hU(zFepfU1C`+pVwH z0mFx!)s?fSD`oT;K=DIQ9xSNU?7@gvt{f5!ks25F{IebcV( zB600YG_#Yb?vc#+eK+-%-8&4}3LduI*rnH6+X_p(xIO+{B4SS;&0{Kj7}be86p7dFaB|V3fhn3Qrymw!Q`wd^(p`YHHk<>K!jZFPKmAci)35~RubNhekY;;OyPn5MI zYWc_p@bg^C@#MJ}pl7Mh^ma@XQKUk7gN>jI@7-007TVYNlFTH$flh072Iha@!)G~j zq1+!gZn4+UD6I;xY1Cu($U^OrBHzkijRo2zBu0F8#}+}N z5ox=LrJC#)s z`@fVeAXR`fTm$>BDci(YH%nY{3}0Pq1^H?J#hZ>Er)Y0eW;G|DO^;R$mz_L)X0&9?ea0xltc8jC8TLPcz&Y4QZ5-@q@;MO1fx$b{!hnDPF z+PbgY>svBiBo%yOXWHd|pl>Sb35w$=Keg~#!n(F;V)@!Lsw#h=-7f3ut~k0fs);FV zBfg=nmfW~BXeQdyJ+YqJqPSFTr&pqCFm9&Cbr;4FK_gwFIcLR|H)O+|FWj<=S%&wq 
zzo2gZUSDxV8A5jt9v5zhTSu49!TTT(+YhzsK)?AMx5qDbAL2+a;)S^1@Jq5&?w<>1 zQfTeCp|ach8k|y+m=2lkcx60jWCfxocz6n4K$uQ?A$1AWL2{AvI-;E~;{n7hzPDZf zAb<#_Ww@KDkDMU&b_H;=a8kRos;(H#4T;VJYJ>*ax&|h!X&J7FGxWXRTa> zSn!Nf4%^^3D+DkH&YVN5g$u8>-Q115*+!}=S9xs?KTT7{r3O-l@gIha`$mREol_mS z>Q~Bd61UM@Z){bHUJiIjmovtDs5bf|8(N&E+6 zqQlGreu!}3FcWdFdw4Kt0gta7FT-3cVDRMTpJ8?X+k8tME1K%HX!m!wEbUm&zr?8w z9HFWD1ue{)-Q+yQ;K5_N)yvr#_E*PnNzAys7b~Myk}7OAfSFQdd!0>KGE!O9Jhj?L zfTE1l;de=bMC$nZRK#y=E){kcwG5q@<;4>V4R{?K{SSiOlPvI#Pv`@H73(lddC9T`HK4}?zLR9DP zH4j!h9B9I!rmprBu^rr$G}S1Yd%BpB6p7IO46mntyGXb@e-rYtKOigc^(- zSOVgU$8zSKW+}TIJB({4_vAu*-(##Zz<})A4>4#3)phx6oD zSDebR2VeXTzY=P)Y%{9fQUyd)<*~4etwywCjCFAdq0zl(+0ouJV%u;tHd3yyn?E}a z!A3s&lTw`>B!3@4;QT;Z{G}}kS>6B{>u&OQCvyQq)mt4Ej>3;#w%OpW;wLm4pd2C9 zkbr+sDi_%sOjHc*T|l76=|Ys0UB1I}S6mp`O>&+_GmHp+z}aaI9Nd_Y^iBh!&jlL_ z7lA@H=T8)+n#A|pu;xmYx+PB}oDV6<(5ORq3<*OOBG`=o!ZL_~B~M45!aJzJnAw4L zrS?P0??5G049c`jc1o2X;w8Z}jlFmILf!Hn<*o)3dbQAPRagkCaiHiEcukEIfo@-B zBMu22kg;rZ1#tQl>_wDSnGyHHB~uHP?f@fQL2ntnqyug(D~~l%T?=Wc#d)E-fhcn= zgnFT=lzmR*4UZkb7jdOMd1v}L4tnTuQ_ZUF>6%gdkZAe`x1sQaX}PM^vk7U_G=Y+B z$F4FU{iaZ1>WRoM^JahYRTc_Bx;+sXcVMAuJH|Vz zYSdb5&Sxg9@reD*X*7@XL?>-EU+r=q^oITw37Igc9ry%{9{a}YXGoNX~35U7Ntz6(VhR zM2Zd^Q1wx1;5@HN0Wa%h7YichKxs@MOFyMt?qy9;pGy{U#F%1ly6YdoU9)vHAe8M_ zYhvVh#!iXH0WJY^rxuTQ8nQ;l;|03R6~n1 z&gwMa^DTQZvEkH+RmcKXiC5yWU!y0W=9(&1=SQ3}KXt~C(O}^Mt%c3LPtrvq#P`&| zsY*Gw=b0(Zs>fpYkF_`Z`FuV_8y>{=@N2&CymV@s$M`<>+xzJUkxHI|dO1eCS#g@o zekA^v3{tWM7V{$z%#U}8Uc8~e)XVR=l`}OQng%$T(od4;;;i!jJ3sNiIg$U*0#CIW zcU^VyURGMM6rn_&rkC(a{Fso{A(6UxcKgNBjV`SWRpBY0Sv2dL%ERNwYkNSQt z>TVRLdPGqr{mD~H>6c(Zximk`&y9wB9_V4uZZ~WW$9-V-w5zPl<~a)H(5=j$@)dKB z`1h_Wzyh+&`jvnQpS0C@tY;s+&&SLp>-*jWkbQ!-ovGY)97lmayXg5Lj&dS~4t7xF z^F1y!{=17yn>LF~n_G8Ue9|P{(MGF^2ZP8%t*jAPoY23gbb5u0^*>rJtfuHPH zL6q%Ztku^v?3G$UFtH|nu+lJ!8M;bb>=4vKN1LymB?m9VbTWv9&Sgc#%E7QB37RTM zoIXOBGU{mhK}=3iAS%lANzA>hugk6LTCsjnK;%|P+~P5pE>S_6Ivl<^LrkGxouzCB zvn2%=bIyf9g~wsAAjy4qf62DXi*eP%p|hnPcdu#_RkZ|hvuIU9qcob+#6@hx<;6tY 
z!TSJs7pOSEh0NSWF&(QhxR~nL83R<*V3E|TE6dVN8Z+w)xSf3)iMEx>8!I7Fx3?pb zg6PlCfFea*?K1=KeTyWbV87?5VWmp%e&0huRFTtXO$(zp>h17C%!N2+Pn?xAj5l?2 zlu2Rxv~r|f5PdYio26OXse{IHuhyh5>&x=hsYPHinHOI=r%gJq?zl>O$p*dS zL|U+gSZZz1MJz5@N#uTVAb-1}bPuw7S9<@0nf5+=a+tE?Rx>xcB*?R4o5zoB3QM2y zHbqTGl|atWUq?B6(gv0mqM2$;P+w9s+wW+w+C$Fg^wt6*VqYxoH|pMu`YVMKkyS{o zD%ME5T?`5k_&@HG&yh51G9)Ql+A2%!bwa z?(SdFuXegmK@E8g#(x}fu6Xyb6`I3>t6)WH89pw2jpEQT17a>ZuDk}gs;12RBv2$} z@tkiUYH^WBFRLzd_}Rd7II*zxvj*1!y-3vXDzNecnw9G*Fv-VS>T+gqz>gF~T*3MBWuJ?}pAPR)5^+apr89(Se zu88agZEAB~a-Kf2p(;gBy8K)qOGuE=k%}kjd`KY3P+yfUb+n!;!Tuw*R=ab})KQEWA^pfsk#Aj!5KS=r9T9xb61+Q~G8RV66x>o0`bmN{8HtMX zA%3%w)=wkAVCCSrt^5z_zQfA>f+ufm3xnekw*5dze(7>St`)5cj|WubplemEcU$c)y;>S&*ye(FJxnD5tUrlt?3`%hS8VvyW3PT1~E-XSj0s|u)8Kk z-joSfuD76CO@O$Z)vP6_EX?-?dNp`w%M|M-Rb>_1?LQ$Cs4B>n(RitCJ%J^fO8F|C z%zmmLn9d#QJ6EvC`-b>Fp_!V7lMpb(U9*8k&E=95&9iZXK13lU@KcR3aJLLNP_z@3 zWR5jB5PvxQuKSf=el zj=jBC&$iM|Unj#&C?7IoNu%eqkTH+RPd^qzr*rY*(MjP>b=(PSZNp+7-aP);g2$(f z_&Do@n92DEo_xYXNhO@-CI?OBhNUG{Zs_Dg2m)l$AU!!xP8h)Ea8XAL)Qv}sr)YXN zaGP_G=51Ui;;dl316|eQ#wj~CTh9Q2EvAAZUZlZFZh1cWEBEG~;dsBWd=Q4ALeVg} ze$-zQ@T8UmC1bGHOEPPve|m$@3;3(dnXdoGPujK{8>P`IWqpM9q}|O#0^?Xm zM5J7Gq9BhfODP^2-$H{l;Dv)MB@s(i!(yA9 zXJ+G;PW@EJD?HlVbf)=;W$v=}8rCj&`?M@_oODXHaY=W(=MuTl`>6L_Q{qB-vEnR^ z%N^l3?Z|}8OBUuST^(?!dlFcjOsCo0PX?@ky^9Q_#JBD2|AT z4X!JA8jr1N)>YPpViP*#=^gVpaBxyWapYDhrzDFyLLI*cepH#2w|&>Ct}-l1MB+a2 z{_Q1ET~%QKH!-Fn$pWdaR-@2Ob?-gZ)4=OoEn8OM7Ze(QpPen1HP_iV*!|X0@2$8K z^C{`^Jgu)jfD4W{XAw;zL(S@rukA|KB;{vHp7@1HaWPbt1iN`8|!>M($aN zA|Q(l-OmjxLE43HkWq;WyGJVP;PF*cq)w+;imd z1gSC183c~_-n^P)_|hhVBUSlRW;&7R75ZXojmU4>#X$tB%Ws_U#)9Ql#o)|C*mc@&4r60 z1ssc*(PL{=UPov84|ip?#yj3UV^OwQ*e@3>o9Pa=3Kh4P{~8c)V#wrpTFUYc|u#~ft*rBTzwPtag|-wQrc+HTk4(YmNUi8smDmqk)Y;VlKof<6i18Aju3!NG*y8P<-Z3a=K(ENusxwZ zv+N_-lBUSgS*FuMgn@fG$(}_Ah->MDLHBtd?Ua;b&PFKvb@FJ9i8fdQ*@kokpB8_g zuZbhFCzrZT7A2NS;qme^lF!A7MSqWeHD?YfnOeHA zBo+TJ59B)w#}&pu?fGFQAn~rZmw9PKEa^Z&C6|UDhVz5yW-aBuN#lx?d<;RPHz?Z;`FRPahu3lp`ewrq+9WPa#R+VOpm2FQ 
zCVB19cx$!C!4C0m*BGIG!+L9$l8e*K``A{D~L3*noB?DY0l4I{fQm62{ zTP{23NWkN@_2BST?MuhnuddHhQK4e2nJVM`EKY}nwEdCDh}EJmim z?b^%Co;<^L?0Bevdv=HoZh$K{%OXU5{bAwl;4OPMh+s)Z~-NnUon|tVt=mC%`e2K@18K(sbB--43-vKwqjo@)VfxdS<%pTbESuPw3!9 z>d1^7gNa=)D78@GX7Do7B-xTP8u&wkgY5Hpcn)JLXOeL~8Y31G+`-gCTmomR(sg{S zq3>LRU=?EAx6c<($OP4=r$9yZHENasG#^S&*(QVdoj0}hm<=uAy@bfpSb*Ss_AT-5 z!sECkMU2c5`hf|h=_88`6Ry?NUZN;b;uvYCtQ`e}hVi?_=og9hKt1R~apEYDuqnQ( z)ps~-%pS! z!(dlBh1ehqSzfpGvUeDTA8zR)Od0rc5%Res)IK3|h)<7T_a;)RypuLAC0H~>k|NDG zu3GnmUU?%%V?ljGHEt!eYORH_jIBbj;@!K{(ydK(KX+2NWM{734}HxNvKuiO?8nRn z4}`+!KpS!AI7(yb=hrt4f20jmwtLz6vFTPa1mL!|Ju^*a&3GR@h@kHzXIdA^gN$8E zCD*J`+u{2yE+{T6Gt<#8eB2~AKLkls9{XkF0yR8ii>njnI!`McTm4eL<=YmP*-uY3 z$CUs*8nP)r9a_#FAvVDLQr#mJ!1ZA$_$o)2j%$Z6Q_d1`er|YhZn=BLn6YX? zanpDO(=kXFb)lfd(qCi>&>n#~%j|4NCbPWToz{az-&KwWxM~bVN$Mdd>WWhfxM(`e zb%RrrE}fs8bFYXoQ@W5B1KzakTEAfkEk9Clj4Wu^z}&jR7RXwaL-@3)Ng0T3$QUgU zR$&$CD2e@L2ledBE_B-zEe!PZ_=379DW_+x3^MHRZTdUU-Hfj-gP>7O0;UFuF{3z+ z_}#O4ZJ)&V{cJNhvY*9K)FN))%dEC?sRRE>yBWF<3pf*46!c+Ra@!h8Z z;S`sBh!IvkElThnDjPRPq9h;pMUAKSG${!xD>93D+4;AoFXIEnTP-9qDnq&e1rEQ1 z-yDAidbte-4)p5~MpFxvGIp$g*1o7j-PB95Vz&|_VCWilFrg0hkI55zrqNs1X@*5M z8mx*Da+4*fbSx6e8`s#JJG5@qIWYZ$p%Wzg2h&~EkS^?+XR<_QrUQ;F>PYxu^#W7- ztZKzcf=Hg17B|Wf)*dIh3qDpD7jmVl>ChVs)1ZV-@i2MMh2}RBG2tN0@iDSyjB=9x z&vSpyN7!Av$8fz1EvS`uQ9$|o!{m3__e8S?9C|KFB~GHrINTvq2q;8st%U}_73zA( z#Y8WRpqPRB)r9dF)m{-EJ!W)Z*58e~FF^R0(P|?7w~f^r;e&zK=8Cv0U%!Tx5x*JT zijIgqV(WaXkI5@C1m>g!(Plo3NZ3z>yranAlK4Pl!&hUc-PYx35nI^htKlrxw;wp?ZwlCiU7xZ!{rs_^euZcUS3lglBbMf z-Mi@PdexlWl|&GJ?B_#y4Pf@6loFu(9XC$DXM+%)GbUanPwby&H-boNmyVQxS5SxV z{cvzLIBwQTvtCl4JQXoMCm1p0!^wG9$7)|Q@vb_85(dT|?VMu(V zT>l4ixSoZ46LO{J@qE03j#W)!X~+?S3jKXN6F?DOFh zRkW07v`dnlT&XGvmB!6V&_div4^P_p#M>^w=oGWh(Z>&nC%x zWp}!{M4||XHD}N)bl(&X`J;=>YxeKS9PaU9j(z8abQMW6CTg}?dmk;$R-CVm6HBt{ zxq?msjH_4PFC{NVPf>r}MvW#MZNZD4`?6hdXSX`Qzcv_%FxiLNveGxuW`id<>`Yhf z{1HW%FVFlww)!g3T*EYprEsFD-tW}r5UFhnnf+;`DryEQ{|~<2r5?gqZG@d~_NADwhpXsM@J^J^ 
z7xtt#fS<;jYW)a*j#*kI3YQfDN<#l9tb$vyOjS+!*Sk1L{~rwfWv3@4cnUUNj3-&<$xwcv zqnX}8p0fS~NYAi~EPCXe%CS<@W=jPe!||L_V;k$>Wz`c>Fr9R8O^ZU=lqYiOA4&Ul z{UCkccZ>CUl-QTK(R&o9rArE0Rw_NTyD#Khpqy@nr;o_N9FgmCd{#k2!$!I1ETD@^ zGFVBECe@MOYBK+E*4bua%gm>l)ef1k{|EE`+oYV<+Eg4Gc*d{GVX)!a@iKLp29j)g zd16>L-Hc*nct%(L)aO#KtHWal08f%b?hN;J%Xb!IK?Z=j&P!xKo)HLtydXu6nL=Ew0Yudzj^Wq$<1?ALxGNDnaEh}RAIhT+Z?u4! zD-&PK9Cc{sNHPKc;PZma)~GmqSN%j$d&zu!w)Wo@C!x{-eIuFBwis=XH>oLpgIA~Kc3 z3*Qg}by)V~zGE)Bn1rHyx~D;>C_5-cOFGD*?UTO||O03sAZf9$nG<#|Ve&oJ8TgAF}%4lH*WKkV5?HA4QJt^2clQZ=1)PRMEcR8=KkyWkbm}^W~GnHh7 zip>}^O18aL#T=M9)T32AtDbW+LHAvi1aBt;tPJ5ETFd83R%C+M8L*220s`ljN@T&P zFj^-P*&3R3Vfgm5(MK_=XSO9)2`RcW7IK>wg4}iH6)8b&1?{I4->n7Q8c!7R<0sJe z!74n?eke7FCp43)Hn0ArxrEq>RHA^7M6Z=1QRu?{uJ>kx$VetH8R{L?iuuc0Tb;JZ zD|!&4&JB}StNym~d1O}VC4)E^7#-XwCFA@~GDa2j{8Scwus{&CIiEIoa$aZe20+0U zC#8@}QhMUr*xLiyH4#% zqcUzzXra<~qA~c&tyh_|CEdJFwpXsTJ30Nbk;K zJddZ3tz9_g)W`iaNICqNH4!N)A)<-8g1Q6+84}yw8?)k^(*Y4t~e^uW|Jk8p=9*j zL%tPA0zTu$hg}jt3sM1+3!b#2 zjrL8WGlgB6LnKX>b3L(q?kD4?+Hs!848Ap=0^Z#U%bRkVFNIosXH;>gR}~e2?KJ8s zaJR9!G7Vn<5pWI#uQU>AIG#wVJht7uAjRy;NZK5IH4@~8!)5^Yd6VFwmg6GYz>s12 zGE&5Rn|{^+@KalVno%d_xVw{~uH6>IkWfIhJ8Gr~BSb+oS?M~s3CPO?Nb}oYmxQ0| zbdFV7w5b(yVuD6HY$@z0EVKb!^!_5Q0jA{1@ z`Wee|cjiti0}ITOq~owc46nQh<>Z_DZOhl3?aHdB&3p@$(~d1EC`U54mX^N6`SXiL1RtG)0={F-@Ll z-8p34;kzF%S(#`9uV2& zv5r)4_EV#{ekHZ1v!>NeW8EOAMZe&OaUe)DvA?_Cy-DIej>>kyUO21iJPo(wsQ^0P z4u9N_vJphX>HSojij+X-sME{*m(|U)ZX(?}s-y6f?c7dMU){DI|F;0~p-WMYIGc#e zp?Lr5DB@r_CsMz{!I%x7dnhXc_${^+x{+*dvfk7ds*v!W_xamTS}xYO^T5fzlO;py zcOp9-1jv1J*)JHrtm{(Q-?4Pgt*qePkD#6g;eMHo@~H42)0-9iiZ4Rv5wD9WQS&< zQ7QZa85r;WbrQ`>tUP8uCmM?@qF`h4m}L|kOkq2Yi;QsSY2J4Jzhl>#h>3LKR_BZH z%;MGR4;{{RzjD`7Q5amS-}(Dxj`u(Yhh?ofx#c`o1nNvCJm>5FVu>}~3zI9#GCM?l zU`ufvt?xaPwa!DLmS0N)fK}?W4TP`^vg9B0nPJ%?b7CwA9{m)GxlV?~y&CrzXJ&rz ziOW_ZKU=R%9spI;|LS~3%yMVTEH#83|9F%K%)m_!B?(gX6#I-4l47qY%MMDM)S{Xp(`Kj|Zm=jdLrP8AZd=J&toJRcVeYAVsDiLvT?g2Z&xA7ut$UpwE zvQ;8P4tcF79+2G82*l9_-)nx6?8?(6LQ;$&SZWu=zmS|t*TMT)wxsu>^q>AMD3KxM 
zr>Y9!WsvvMd)rAmCEc9ywVbo6e;=WOI=(MhVhx-E-}7R5pNVj{fIPKEV50SoO`4jQ zX;+ato!?A++V%-jZ*epEfWrHYrw!H$UgaL`1Dp~2&W2CwI0C`Wut7by_*1F-0=SmY z%fo=0y+92_cZD2Zg>dZL6Tzv^znl1tZD5(fM-8O_KL6N9NDK1g5X7u&a|Ja+5!{d5k zyAt}L$|0cVrTlw9L>Jvl-3aA~KiIPMey`$%Zn?t|Iw%(_(WpT2#W~WzhMM3&M~E#$ zVkNYQK$((wLApckr2YgI7XB*WGz|}-P&zj>`H$<_F;QP$bECp zTqH*&REdS?xze-a8=yBvKb!eLDcR*@kqpPbGjekqL;S9Uu=YTMo$+Yd^r5>Zl+ZxS z?s@Z}<9^Zc{^e3@)j;Rwc(mQIwXIb=qmk{K?Qg+PP?($rdYqvU@F6e^0tJt_V^cB- zIo31c2?@L%BZtF<(wW5S8X(OeYM?Dy*R|3R*Loh|rHlJ1oPQPQ1^P1UES=d8si1kD^%|(3V#rV#-|2;)vI`Qk39NN5Ba^KU9Bme(Ym1$lNZxQG zMrPy>7~F)ojd6GgF-2a3Cl2VTPxClL!`nP1!djRd7QtOo6RhvvHU~&^nNQJ!)i2Q= zp521MF+Mx?y91nGELjoDY6OZ+*xiDmfJH2(=&aB4((zHG6I2wk-?&04zNR5(@u>FO zP&nJbw$`ZUe{nm=1?{`dgAB@sz{g%Xjhj4a@|ZPPXkb|9^6m&Xq#$PFV>I`T%n|IR z1u3|FIpJ159cT-iBK5HT#vA_1^8hpA{^S24Ca1rxeB(60ALwWUn0hNAP7{xVU{b%t z^NfRWoR41(jn7SOs7!vtrdl~*>e7MZj1o<+Z#5tA9H%&~zo&V{Fj(uOC2AOncSHW9 zc>=?TPoG$Xv+iCmO*zL26U)^e=G^AobUt5G@0uhtsK@g8il{vm@iu=%e+R`9U+#uQ z_UP21YM+p*JCjxQ<-F^8Xig3BG=)ip-1SLauMw(kWj?zCR>XHNU0R(Y+fwBWIUT#a z9}y33j!RW=5!0aF3I6e!4T>W^Wk^xcAo`i!C zbNxRUrmK`MG%u3b>%UNAzcG?JummWSdiGUvAH-sAVUBD=t zJiH1bQI}Pj#`qj=yr9fA2 zFKAR1Vi+?zQOII6ucOELVtvVN@FF}hmSq?Lal{{4#rEs%g$C%PARWVGy|$^XJTtv= zJ%pK4Anp)~(j7mXC!dfcooUP8N?-w|^hMbz=({rd(t*s4)y?IbT}gO8rGc0{wR*bY zmEzAbfLeL)-Iv@oj3O<4o;K+{-A|b}-|p#(;kYv zpUO3SjouRdJy@jS4RlJe@`{Ia2JRgwP)FJEQUIB9{5H8#Rm{MZ zJilvmcZ<@<25A+DId;S1O@XaLuG6ElXm{)}_u2~qUwd;d5Pb4sbK zA~K5Vky{6xJq(ZHM#H!WsTHGAA1oh`s#Wx?>ILi|VK{2ckCg=0>({bNzs-9@bn3WK z+m)A}GGdUA9fq{#v_*xDv_o(vI5A?{=zOHLdWOW1liXEthEJ#U+EP_(zWQ)7qwB8C z4Sag@GAY#PG$rd5cfpvicrm!T?bWlIqn}k$f9{ zkuw21H$S(r3l!V$dbpP(QyjfHLi_GRw@R1JStiThg{>wY!;h zX9CJzn@h%sM4xJ=iVB9RotjyxGWqS2=D=glqwx1Y)VdAT);C1*Q^*Ps^lGA z4Fjq3{j^2nE_olv(Y6h*6qMQxR+cR2@|f#$xVgk$2yn$HPm`e;w-8?;l7OcajhZaJ zVu_WAvG}o7r9FbpsZ2~@$c2&eQe^8&A3e`>UVUyC`O}ox?Vb=XU94@;^<9C#dYm3o zKYc{#C@LG4j9}6wPD~?ZEMGQ)=eT&PIF1rH^!*@R?B}mU-?=lp4Er5?rzhX@*m3&2 zt8(mPcl+1ImxSUUPI_VSQ8eBI=Y!^zt0kCoL`+>3)h&2rem6t;+7V+ML}Tj@TBVDS 
z@%QoHkU-2Aod}0=)zJ>hy2r>fB@QsN*@=_~Z=^GNh^jP=EgNI4K}viTr59_U5RD|E zXo`*oE9C{Wz1i1YY|Fyy`K$>VS60=#wtMcC(77FTB5_8S`ld{?I!jO7&vB<+kXgI& zHM(Shs9I=AdpNv1bqc<^+E)KdP3_tMfd*-nM-fh-JvNh`(#Tz*=Oq`F9dh!CfGz;e ze47K$w1Uq0{HzdtqJ##(3~Ux|8>71EpY03?^RP7WD%V+38eq!L_c$2}j$gipOfdHQ~pvaoSv*e2m zH#yRjZUW|ak`@BzQs1PWtuySOXCB^mI#Z2}!W0>>&azQnj zNk~1`Cuojx!+4SC3x@$kk6X;s-?jh27~PiJM-7!?{sQwP&X&LX0t{}x|iX3HagxQ|8Qhg^rg*{b|M>7FRh^)$d|m2 z3{IOnkvP!ETcgL;D^8@~$CEhK;{QH+rSkpkSDR$)8MWKc5@b%`znILzTs%2a96y12 z$(AFe(nXEhV<+j&FYB5e*?dFBYE4%0{4Egq_KwCM3I;ufKe2UeOC$cs(y`aIstYTQ zGP!Zy8qSMwp78OFRNLz>@ll8v8xzN=X+P51eD!4C0Fu%;k`*!I;KgLx(1=pcoJhx? zyGUIIEvogU;xB$ny)3*B1M}LR`;nP%U}Y*vkCTtk-0&yUYnvaK(a4X{Qw^lV)?UL# z(awFHG5=$0wwTID9^ygVewUJAzt7_Hjx77XwecwI_~w)8tL!@J!|3w22?6obMa-Bo1c1_I2xOw+K+!QC(&EN-Mv(Q$>c!KLGOw#kNf-UhrMq_TgMwsNiu@g z9uvhxB~%LT47?$0Ly)4EaD`SrO@&b%&E04zHn64lQ+F8NrafI6-$Yh*kG>f)E>m9NIO*$(4B81s4dH(#Sy9vNiX07gpLTxV#w)33e|)}4DGRvQ^YW@dT|^M z8-Do+gLZQ{JM~Ydy(}mOJ4?cgiZR9Lf2PGoG%q7u zyBk;bg9H>tzV=N%l8Bc8l-ii3)O@cWT;`*)WpMRXkTW%lDIShS&Mq$Yu11B$`{vpy z31gy$?>3`;9fF>4SA)a^SV}Pys|N%k4jO94g^?Jmz;?zmC^kmMt_oTZs2YvqlwwR%5&S23=w;+hovU%V>%f0;)8>TQ{(s)>XW@a4vJ3Y9II*r9$IC$uxwEWW(5vMLVIrzF8jLKE+U z#1B8UTMF5~kFnkR-ifpdYYULFds-zgLxx^Z9*H!Y3^}q@K1OCu_(di!3+y?Vs$F`G zok&T6+7#RxQF_9b5%0({E8W6)^5)bs13OCgN})LJJ@`Qk(|lthW!pQ8bOhJ3Hj8S1 z)!!ExB?%xBCQADg#FBxHhal$^+h@vAaJALeGKO47OsluiF3CUmkRSi+sF%rcv{%Xx z@rh4xX7%-+t+sd7GB^-Ka`r5BqyUB6!D#J2ESaGYYjW&8+VFf8Zv!us&?U2^Br;HE zSa=hYovYww+?>n#d%Vny7=zlKT#D*G(IsuE_(^Xy13 zkKrOlqpv2kBqjJOzWmtMNgle=MKC>?uWA0cc%tw8r=)dI?^hl-Iga;zc$%An?M5dJsMen+)FQzZJd@eAwlVc+}_TF zcV?~I@ASgncGLW;iXJ;U^yS;*njgiJ#X0f;(F3h0G67reNTJp z#zK?dKN!r}jhXBL2-HpB{gyYqn;=wm)&}Rma}p0k4eNQ}uY~Dxzk9UaRb|2dU>LTd`+WPJkqv|R$*?B_p1KL9&J4*n z$k5U}YJ`X8@^)F;D4q(kM>SkeVryu^?xiH~NF-w`dk8#1R7CXB6fARkWU*fWE_iJB zOG~che{YOL|Bk=)R{Vo$8>BoKAw#kMUGO;E>bI{!{o1jE)%92TV&U|p1gd*#S9~FH zNwbGG9|1HAqEAU09wq)=6wsYuLcKk$qKH@ zTh)_{Hx7e{h@U8Ifk-#oi z;qYSCI?F89D6K(|%*GkILDui-L6N+DbcS?rq0@+T10!qx5#F@v69uM$$)y&7MHomy 
zpek5d%lMiBevH@gO;V>**0P>e*8B=G`$O;lL+w0(+$sOVJsJEg**5J*=!DGBx{xxKGAI(t~dTwFSllPNXrUziW z2_`P^Q1ja*-OB+p^5R6-vIGv;Evc7t`S~q{;fyVhG-lOgf?e8rzo+act+iV!$9^xi znN83&5P7#!X9|y+G*bFG_`W6#J?SOd-fmtg7l>UHT8!plbae#~&IvRyCMCzdF5yi* zDq{!Mn-wMe;4#z4u{^C6*kSSJ`F$PtF^(p!88H`g^5T>x+ExcM`#-lN;7~jy0)OKV zFLOFOS6n=K+_0lm_7{d!%0A$5jm*cxIfRrt8l1p+DHK9!9*y>~YYtM)S z@0k>}oeNJeHhqU@i9TDeH+WYAUuwc7IK6dYL*Lbn2z!sO#(q!S!Q8ZDW&$qA=mY$e z=J%pBfkTWa=l-8y_w?EUqDU9(-xIRxPX1_i_uQ^@LS8I#VITMH8cs+$X?!zQpLCSJJd9uoJw?Ey#55> zVX#Kqzwb9sMGIbY-Hx=T*lo5E^Z2L~U)`h4e_tQE4H6Ls&HO!Bet1_>>F>-({<;IpZCl4%jiuu$>LJsRg953E>}xSs8wW@vPCC~OF%eN#IOlCnsl_l{mJ6rpPwMZM#nsf5 z)*RWNXl0A36<_E-#2_T`*(*My1-o?Q z0&{l=Waq0Ysm;fzeMs?116nN0lCC-~+Ddf4bRHa+*$EYpcnB{#W@Jqek4gYNc{^{z ztSb_+4pnr*un*S3v&g^?0x#rEE|F|XoJhCRIVj2aG9|wGRCy*^hArAWHWo!WPBvS_ zTV~(b$$MV6Zzw@EM*mc9*xhnKmfuPK<~NDs!#L%Z6G?kA0?SjYSdG89vlWqLJ&`p=}Jjttfw@~@|D zKpCF6d@Z9mNb`}|3v|7sP|=xeV)-nj($JN7W&xW$4#_%8y^TNPG_#?#Y1X2==o$uy zaH|6UO~n0y{QlC^pNx4ckname0QEdDE_6`{{S_j-pf?Ffc>#sl_;2hdbQen3u>sRG zjDL=V(@{stI62PKW;%HAC&i_dD}tOu%9d&@cW0WL$zy^aIQZex3Y1=LdTd_%{%qaG zjtZWBX>n^DLOn6(Y6FeBi6UljaL#1N6rG3A+$8{B#6!|Lx47ZO+ z(stKIEB3LF5@xpM{z7{vK6r1<)wbO-nU)~%fG{~=f z_HmVZC(IL6G#pRhKWkhh4tz19Or@87310(K2*&A0)>JeUc52X1kpN}2{)78ttnkfR zcL-c)41inyHPMo)Rr5jsh=YcV#_!MB2>2YtTeR;{MVZUy+ffgGW6q1TDfkc)>cG4b zC%ub~%&1N5iG`T%xhXQ3meq8P@ch-U@@3uBX)JQyRrGDYH|#CrCE)EJ40>SPlVE_Q z`#+ef9=&bidz68509ULf^5y2Ce^ly5BQ?i3MnLIozeticadfak$(;Aet|)x;2Sz{L zuOY7WmD}KAwqIq}Aj2 zWe&^pJVJuILVlO5LJ4Lr3S7p;*?GuUbQ5s&R9t~C{#oNvC#%*yr|HE(?SiDV-IllZ zQKH+D>w1C868@fpr<8LDJCm}3iBQ{*AN*0EC(wd9sTwD#0N&s zWlgN^I5dNegc;H~WrpZ`tEGF9^iir=p&#y%4!~?uLgMJbVt#{={nQ+VJbCc_X+Q!FmI2KLvVwrKx;zt(;w-3o0&-2&&FhJ?l4gp5#@oY+UrfwTrL4xHEw5))WnnJRIXGur&N zsk~tu7yt=;?4)G2k704D54yWq8^jT=j}$J#%arxX=U5iPqwRVHs8xZ26aonMytvdD z7X+y7sh0D$1lV;b7fKZh=M#>iiT<|7FUN(fEw!f-bk!{QKx+}nx|ZK5%6b_sB5f&h zFASbd3z0SV2XG;hO=mgjoCa}nJpoCOR$fo0(j_3@;$#g&B7@0wWQY(VpxJmmnwyGg zc&vkL_>AYH`zjqHQR~fp?&jb561%@o$bI|*Tw_QNsn1yVS{DxwuN=?p2hlEV8YE^3 
zoo`-CX$lT%zC)fgm%N-v+Iiy?*#v;=oMJafluqU9f=%^J$T^+DL}HCDDp0_6FGUo{ zNM~&-I}!e<=5N)7af1&1mQT?XuyqsU_)XJ3;J?qP@;?}-Na2Zi5GT3fAK8jGiL|ja z0cI_#{a0|>3LOzs#!XkhqXn$Xnj?HhAH}GyF(lFXz&B0f z2wr<)RwwoaaJvz+LFjAkM8~al*20hSE#@Ad*ZLt)8O7#fqQ^yxA8N`yVxi(pHtXeA zzC$fQ$nJ6@9kr)C8cWS8@lK`u4umsaGX>YcHJrFqDxSr^^R&f2s*Op$Z>h*T$>XY; zA1x7x@k?PqFQ>`06T{fEMg`(2ZKuS zHuK!)bgIR0=@{4b#Vm{;ZfW%rY>y&2y;pS$JflC?>ulm7NlvPpHgjWKc(E>4oR;1) zFxwetNz05~%KCoNd7-7jUwv{#_lNSlVg2gr(d>^=|F#iw-@VT&f$&pM_Ksq8 zNhhsUmMO$~^hTQ%hl2jF?1>U%3 zXTkqs>n)?&jKXbQC=@B~?he7--Q67u1a~b?ad&r$y9C!34esvlQoI!D&EDtid(YnY z{K%K&N5;tbWUckSYtCoNhVx7RwF*VxXb2GOUK6NtDg`9s&n1+BDx)5!$Kg{gfJ8OcDQfj#rU- zyhDe|UvoJT19(HM*wEC*i;Jp;j+xB z*2TV50U}4t)zSdUZp*W03iQS&W^bM^)_H_Gqxl}>v65XO)ghYgChIkr3$W6rjw!}N z6DP<%Z9ruUlM7kyQ-#4I5PONaiB-9k61SUk*~4A*_}_XMv`T9)fhM|()5~nn$^y6` zyGRSCI)$d+Pbj=l9>ScS$B-<8s5n(TZ$t%Iqh(I!_@!`x1aS#u+zGkS3I{{;)q^S{ zhZaVW0{9_Zb*rSAtR z0k~*&#>C^F@sI~-rB!J{$M-;f;pGQ%`ze6wKJjA#e*u@Admfg(1S}1qy}UY0pkq3j zFRfNjcmhjptw-U4u8KitBd9lN{%S17Il~gH-ZB`&!YlA~vfBt@+gC)Mgx<5GmO@MI zi+K=f)oHUPW~q+OZrx$1m{EK~{OebP(EroOmjIK#cvqb~YGKT9nH#lt!2UQs;S%Oj zy!S-}yNUkxtrZ8~1MxFHOZW{85pmbSM69b@oIHoVP9nPgXI7O9oI9r1qS?+u;QP1Z zcXcqw?(En@^`EddVhM>kU8u3q+r?7gtP!vzjF__0%gtK+68Xo4Mu6DkH~S-aA$g|x zO7jK?4qQy`kb83D0Hf-FJ4Qdwl{#T`SV6cpaO?}HihNkr0B_@cx@A06UCK9{eq+T6 znBvG{QmJp}5gp{~z7@Ip8vhRp-+}Y3{-Ypq{b9pzM6mR#;{5M>{KTu(H64DwRkEuM zx{@kB2d!13_ZwHqy+A6vWOz z9G2)9Us8%j5JVPluS2cFhq%XEANpPEpwlo_;WK{MGYHS;8;T(xgF7~Q=ijSY?Imq> zCYH9Ux4EBVSm2Fse$bl;8SCet)M_)rLJ3B0Ygb4|Y1B78Ot5<}@9g3P78e^A*g4En zsN(_or!Ae?6fA4n-1-7`y@?Da>LN5-^FJBcqmij%gByLtb7jj5PA=8XvLX_QsbfYP&I0I#Aryh}IBw@t&ys~+Ec4%y>x5-ARo+YEBeVDcsWy&*AF5g zwHGz^wx*X!JdS58C(V&E&BxK3bmo;~XJH)W8%h&cxt^>~gO(>kj?-%KSkqRBZ}}xo zk*G!A?0net*TQe1;4%=r>HKHVtnla?B=YInSHcPsF?=sW;{6SCIIc@&ERB+lQ5(r> z_C@E)oFl|rmy2FO9v?D{VnBI^BxfDn#C>V{0sjp=VALJXf<02H*7W*3Y${Bvd;y}4 z>UMbxhQG%3etA8KHQC!(soyjyiB#F*QUW%FEQgd3|6S+-)-it#kxY&fG`Cl=BsD#_be`g5V75e2 z!_x&cIp;h6dhOKO4*4HF{0Ac}vJ@~|BN-q#?@=qaikBFEdK>zZxvA3Rn?)kp!e`h! 
z6g+9lz22=)$Sa*dW|xVruLDu9vUQ0zAVJuBk?C-Mhu6=G2}7<{n^%pewB%v-_x!fA z6p~ZsK5W-YDN4JbP0z1zZIX#y&UnNGaaC(~vPhL>=e3jGKZ*-|Usw7cuy*-nS$V67 z!y`IRTP&j&00(Y>!Au0Q#8%qJ4#~uWIfMWxVHO6OOLO^E1$<_4R1ayBno@c!bnusMClo|ViL>S@c#LPHa z5*UOQt{exH#GVxSbf9w9F`OigiXG9OCv{Tpc{dvpm-m=!($&s5`)Sm^CFxHKT&Nr5 zi^HZ_z6)8!>P_-Uj%=Y6g09nreicvU}p`LDs30VDsQ;&=!?bc0l- z15e6dO@xermeVyYILkOQnDj3W?dk4-Tv26<1+>tODTg$zCCQ{TPv@N!H4|}ogoTSIQ_enEH z@XMOed(E$=Ylop%q-5&O{^*JqIJ`JzDU?HbXbZZgxsJDKskMofWb2hJdj|aQ0{A2t zZ|eAfF^($lRTV&g0mXaM)ZOJL=Is5B1vvARJKV~dyQwu<(nHHV_Am$aHI7j7WCrCE z8fzT!FO5(bc%N4Lc&;XW!u-qYhuO%-O?m^PY$*5+)3n z8yLZ(f3*4SgIptg$u`EnXQpMcS28&|GFVJCoocg2SGFcy5--QH2NkO6Sb@h7bht7* zec+|sxGHG(@Nw$#FyxFJwX!xsmogeGh7eAKpL@6&heZ*Uu7K1G=b5alPaNm47Y@033UL3M0GOv+CO2Tn%e0Y zdos9X8svcPZ7ti44K&(?*Hz-~jwMd9a2~B;CtR_7x?zywrb5Sz zBfLoQlIhf|ULBatJZ~avCmbnF zB=m^H|H~Y=i8qp1R+oN(hU=)5Yd1bz^1^C;9Xyh||ECF~8R=_st)Y{<)<4D1|Om zOln5QXkw-kLN#NQYcLpaIbNzLgpJNIF5-V#dsZb?G-nIpS?NW;U`O5Jg-??TnC%ZK z=5XnSOp~Pn)Iu}SqLTghz9pKlG-om~g4%aHIz6DNl15(RXQins;)2zty^x4miZq<1 zw{?VUgK5j5QhH-fjsCD(wbqzh5HIPdZY)=JiECXL`jD@M4ym>ySgGa?-%8ODSQ**j zzm6ou6(#6{)fCH-zHwV zn{*@p@?&VF*u}RXxY*1`F0mW5ZoJHK^6(aBabQ8G*W!^lzr~^QyVw#*!qdm4(&KdX zMH&0qTX{l(-)D%)j~IB;jNk@VO2dHI2Cs8Vy$P1PK|D83{JoR;*!_p|_Ayl)f_JY9k8Z^@XaBhGZr)=@X^}s(DsXNQ4Y3 z0|P@rQ!sS$j~SDqwc9%Y)k!R=|B#@iHsw{3xYR_a)0&h_qW9ecQ8gCLA$JM~!B67A ztnr)zrHQoh*)1N1GGJmx|q+?G?f(e#+$(ssJO|L$z|9b zDPEmrFfTPTF2)qcj?L#-Y96sBAmTvv3@#Pc&=F}7JK6RKlxfc)KE+22|L}(V0$)K= zOul~4GOA=+vDp@qlCN|T7}5sWpzOehlWwx6XQB_bRqGdHNNC(^K~gpZn&l9k^y$HH zbK}=Sc2uVKI6)!Wfwd&{NERO$Y+J1%kAfnBYxhV)8D1GsW06J)cZ^1F)&=|Cy`rw{ zXT$HgV(PiF13?v^FJIJmwRS-ceK}j6_GfkW2LEA-f*cykdzNa-P?PQha~{8S7*v5Y zM;|m9Z-jzqKP98aaN6=%hv_S`%g0Fwdycb4+w&eFFi9(JFF$iS)eK zuGL*fi@C~okZq@VTm&ENyp4gJ*faCvZ8aju1NyW! 
z{9aSjY067~_;nlTQ1f^=Tm2R5IC>43@oCaq!#>oUt!UGJlHoZgQxWvN8LY0UV>QFB z!y-5#^s(XoosCG6PD^44jcF73HA_K$M3H9AI=4(JGpZyc*^NRy@{JuRg9ccYn`C&E zXD^?j#Ax4V>xZBF(_Lz^+6a|sIu5)s`GVsogK9iF!*7fPq5nUlciyP@HNBF#{yad5 z2uk1i&orve?U)DWEVdO%{lVI|vUJL(x2+Zj>-%U1vk;vmX?Eg4a`;ek5rm^x%8X@T z5^I)vT`9tYp8%Swo~$(E%lLr0mxQB91n3V0HBY(RJN=!M#{DI*(uUkP&}<| z(D@7*oRBdT>C!@nFAFTsIeLWE_}FagTQ;&A*1cza7z#KSh?mYJrp9fm$5mC_U) z$KoP2LJ-^kl`zg`Fzv>V{FB>qOsnxMTdF`Z<%AY(^pJWxN5G{Mk)4-yJc~vC8#Q8z z|DH6lkpJVrbzVcj1xww3AXm3Dv&}F6K}FNq|7b&E<{DrZvd?W2xR*Hknl1UL4qiPQ zDuo`dw5=Zc!Eq)bOJyai6}~qWuPuA23E;o#k0V0yzPky*{Tq-fPib85*`~up&Pzah z;lu64aT@1evpcgna@?6if|3PZ@#6QI#zh1~k@DlD#HcGm#MT6i=*6fJ7+y=ZjrTOk zqjFyDSaCmvEHB;ot2;U`{b4|fMq#jb;`F1sq+a+M6Wc=;sk%r6u9j=$A*D{lBMmGD zb?DUrC`-Dryq2qjL66_~98Q(|LYqjtj{&ZY;ibrK$aM6C>9~WMm7CY13D6@yK+>CS zKHLD34)8q3v_3){^p8}JmNhSPO7J0gOc4AJ%JUUz10Qwm{F6&hqV6A%aP;x&wj_O` z1*nN|KyEY!(pME`;+Palvi&P>)a*9aI~g72(N8$n{8WGAw#W&Q=^N9^9dcZkb-g8z z&k&@JD%i6K=w+WF%8tY(>$9)$D(fC0vK-wHJBVW^waOw@c~(*9cx_z*+R~=|oueZ? zZV(!6p5YFlk++2H*4A_lp`(qs@z*>T*dL)?ikWAiI8dN)rGh2yXfQ{rU-B^K*4LRfWX!qRBriLy4>{eD z#?;5r>{`OqreSV-SOi@-gI;Yt+aWyLo}d4ql6Vu~-mgu~^^x4TQPS{Oqvp``@)5cps7;=6YK2eAcN)*N-IachCSbIi)xLtX&Ofd^x0_ajarR2 zKuo;XVP4g5@*fDs#QI~!&W*&+N_jSxh4?*{AImbT{`0uW>bMkzwIm9nxA1(%7baJ9 z3yGAbG2#dk3K*8C0#4lY;kel35$f9kX9+or6m;h#TI}CcRJsmQ;>oZ%ZRCKgXXfoj z|1Ee&Jb8~)jjd;gI(y6XhPk@7<)3I3dk>oDZz6>o5BCh0%==^;N;^#Al${|IalbQI zI09;HkF|d?9!!SR(sEk1chKvHL6pT>ho!@YCGKLuMAgDI2dMg#TC)^*B3y$yOb|v# z*-wMCinp`}7tx#lukM)4oC;K=@?67h8O)bM>5K}V$InY6)^}^jNPtV8;2iPCV@Udf zQ2dNc>ZHJdhe|@uI9JUY*BCvLgJC@3BC%kvLTVd&xHdbz`=nm*+iJ=7@9(p{(>@Bk zhUPp;S`kbrDY2-_8JfedKE!}N6|*9k+NhpUsS>&pyIFOYn#9CZ9DOEEnKR5Mc?2}XvKMh@nCsy^&{8-@ zNJ`Ky>_~r67hUqUz(Kqu9rg)Z)>qDYr6bf8uA@_z@y;Z04h9ot+I+I$4{! 
zar2>c79Zm>Em@d@hcKjy=nlR(nshl!LjB?phsmex61E+q7w0Q>cPra3q8)Qef63nO zWQ4-T?w1Bs4h>)H75AE;&z~nbTBAzJxaRRi{&7GZGVD@-Zi|olmb(&i5dURxo=v$D z7Zp#vDB@62z(f5GX5J|k#9}1kYM!90ZCJ`#frjri+9+z~d*5WJfJe6HVBFv?t{Y;8KufwWlOOVXsbn$WJ8joR1!j4~7hTc;1uv zp(Rxb5S@w}wzyHbf@KkS)iExOyL+**7ZugmXx8hh1(s^t$LexS8?vad&HRpWV=9>} zc`f+Z_&n#4B$i$jqFZ|V&p$$|<=^$^e0giY+$z=EBzV(0+a(bPgt8s;dUGMkXT*Nm zZP>doyQ6}bHUv$4DJE~~-;eYFTfYZF$#}(2?~pUaH+XF*k3eI!`I#F_LU+R7hjpwS zP;y;DB3a_-ve`-Hhjqssh$QjAwWBTz>O={8lqUmMOfI=f+V3THa-L+`y@X5m3QQM> z5hzH2=JX7gJxe1#2u_}*Yuu!I0#Ns;uAQN+u;m+56|iDboVn%s{2xUGq+l@7(;RkF zr24$AQW%TiKWmoCmP*`vwTC21XxRgXIjxHku^yDQO6BW*_ z%NG1l1uA-e^j05vJQ@^HyLZUp47m1K)A?Wvv_a<@(Tkk!FK_R|?~PmW?@KKofY-u>1rwv!p}|(+iERxc<&dXtI?*Yc zeKoepM6^-OFY41`5--syU!!GAah=bvcEB5I3ak;Od->CAe8vN|{3V`@>F#Pr&;C>{ zPih40TMowxXyj&5W$3l8i58mAwe^mqqMV6iDg*+soMrW(JEzbD@-a7sW_hdH2Dezt zP1^Gtn_U~;|E`$+Z{J`5ET^xBSFm`NuV%4qPczdlsO-P8t*k@L)QseA*(i*{M^fA= zG}8)eTI2*-*-{#$NAu{rW9S@NZRq{H%;C_iBH_ONq$1sJd@=#Qu#M~iYez{bDD9L+ z028pOqKR}NEKM@uRBpj_hK&I_QhIzYDXb$ufm)I`+AItQ8cK$&*4f|YqBy)rufV>! 
zz*F%*8rei<98M;|CXV@rZCMt8NDe*|5}pJh&=Dc-C)+YSY2LUMow#u}M&$T96a;$V zQoB7N_H-UHfLLPzMu?o7iOv`Yt$L*rCChY!QXb&EoxuXFMYJIriINK zcU7;GYJay@a+H8$xE#W#ZB{lgyVCzbi57PJY$Iyg>1z8qTAvo zlJ~s4a7^CUP_J{OW*BWp%jzr!&zxMdUPW9wU)NRKT5o<7~<8+UEe6c@m^pOE5U>>aj({0^_%=7P4jobO%J)o)GS*dC24qX{R=qM z=I*~_vcbje(G89pgPqy;7OkQ79A`$iM9@)qGwS>;Pi=jp?4cI4-uI+^vFtp{@tE$Y zECZ@$e#1X)&cq`|mR2hOS8SGa?ZwAVA6R^T^vrFqx`nl&qYUBTi`+?(1M2GWL(2BU zdwx#rDT4kr41>cf8gdoJ&oBE9k$Ts=q3M{NHIegTaA6NoyZn+30F~`;B`RJeQ0E!d zR#^DB)c2;p9H@)U5^qWn58I?!Mmzn8dE;?7&R1-yFraveA=@}3D2DWJW9=-lEO&Fk z3zq&SR=LN$k#3>N*UOV1J*47Jc*0fc$U>adtK&z-8u~q3>zE$wCuXl!T)UkKZtDl~BZu-N_q%xu^`)Ac)@;6BP>!=aNiq~jD9@Pf zr+d_hyHzk#WMErfweevRS6SP~kgkt}0&Fwwu^j8oRm6|t1L zF3X(yULl%R(1fjFeOF5vgTZpzaTHN+r5~5zkg7i0{U(EuB%|5mSAAVNi}9$s1q7wf zd9U>2qoXRp%8H|{N+|Nr;+w;Rmu*|z(D-CV~Q zuR)5=N!Hf059;KBJ7CU^<0$LuF+73DKvNX~7h|kM@h^VxDPJP19}@z%TsH*cWSpu# z#xp6KgX1l+xA}u8+PT0!NxT%gNkz@IVUGA4KlS*r8ExckbKY{xS(>YMeUo|Ah&@{( z54AK1FRn3mDZKPnLQ;VXh&O$Zz(+9e!}m>g)vCvf$5gwuL3ph7hR`f8=o4-HpI&<@ zvV5Yk#y@XERq{=;ULrl=QYJ?j7*EU&R=@12jaV)Gc2X&ihJ6nT#Dqsi4#`UyZnix+ zim)grS1DzHu35K+!@c5PGn=hw>OSE zO;iy?hzO6BETYqg@4q=*-1BmEAtq9>+op;Q&3DS2W5(sl#7)K+Jjk;vuYxDIT71l$ zwsr6kus|ijB^WMcr78LZ>c1@JcBu;R`MmeO*(74(vn8K)1gcfZUK4?c`C~n|g}u6% zb1AFG*DU+HbTX{Q;3jNG${idU^^)B3{z29LA>)yk_+HZW35|Tb1H>f20bnNU@cJ7f zn=EuxbS~_5v_()HPu!vJY*n_!KFxiROQP%wZ?aAm?Is7Wt`r;FvZBYn_!sb^6WbfF zY{n;pMws^9yBQvJwf2<1KM~BQo!YL9t>60G<^tkwQtW05p4qU@RJMPraTxK9kHfW- z4)c8FIGTk{@ZFNg;Er#aOtGRji(zw_d+v1J`7BKmRO586Ah)InQG|U(q0zq;jlcPnT#M_}yaJ8chBW>>I8a&cuq>spe95L9i(%~X zSgW>Ub-c!tCNwGQuo|<_pa;giiI~7n8t7wl>G0CW$wdeuktfKB_ZgWL$x0?mH$l_F zV%i=0us7ru-Nr6ow0guSmIY*lFZb99f-VgOo}3dNRV|v|a?P4d2Bh5-HRO(@YuW5z zEpYccKQ!PegK=zfal*T4_qj^Fr24l=V_2xu>+x(QeuVT|my&S+Z#f5j6nr)#P{+~# z0HQtw`qcsReyqw_O2ceGO5QdsSep)he_?qK5 zLCqG&X{$PH#e6P@DTx}S(B}eD!Y>aO5(yG@4RYYM`042w6f1Ia^X_{$V&a$;L|0P) z(J2a*Ac*vlI7XuEgR?QZ1vBH(Ms-z6Il?G`1tp#z#~HrR!-(oR05?CiRC$-X2_8zr?*(s?U z;oKsY&7!u1(IKq0_)EE8c2jH*5jB0}qc-BTH_xm)M=A$r+GC2??*~E)c%uB&!@W*^{=Omx 
z7G$--_cGTGa10w&hz6Wc)SW!g;-ZB!o2DBmt2bMQru#FUum7M-db$(i5*-8s+Pf@! zR~!zb~wo>_K2d=i$$I7MY)adf+H3mus2Rc;u?wJUJ5 z0t7HRxRV}-6p(lQgU}-8Oy=mE*eBD|1kJt=L7rzR@v_i}!!u$+ltL&)gP{!n8sAhP z!rLeme0go#tcO4oR27C+7S@!^XCO#5X~iWQHHa;*ZR@Qc+L18{lV=_46XPr~2jkj( z?6T7rF1khC8Vnfqy7(L_-wP(Kg(tgeNJ^2X6%n|mM`aAuZrwp=#Q8L$_=JS(z;WlT z^_@q@-K?y*20C=S0tVQ)5|8<{qgm@4aie_~lY0bfq=*5S*;yN|nnq12GLeoi#~m;N zF>$-0iH77;5o8Vl<8blaL_9QO*MAa9r^Y&}rOZiso+4-|E$ z=z=SQl!be*5*$twy`JH@F%pl6PfEm}&^;vdsco)T2v{~%zgzeu0G?_63a%=;sIRH0 zI}ok+dnfghny&IX>J$KH`-@(}J2$Qs2?TG3zBdi0MbVUV0S zmquc&wR<5ZD$zy#W@Xy5e^6@q{Pulet%hc!f0n)M5uBqft0ug9R90|T)X;K@xDs@6 zSeI<9_+rxH$)zU@UUR9kp&7}E?r_W(%i;V6e4y;PnKRH+ZA`Bx5y{FJx-U ziqSaQe}4?Vi0H2Wy6hzl9sP2EtR4wUC5{$!8gGf0Mo^(*ff^cj$`fw8FI0PR|KE2*<=3* zQn(g{8~ORE2gfq88P9d6Sn5u{PutC}$hTdiSX#m#lHh zC41;9RG{lKR_b&NM701NxgvA1M4l>9ETPzJGa*eKqSQ zd9AU3(|;BXx>(7qSgAj3+ij0K*18P4YwU!Z-Fy0cTVyN$HKqgCB^S94aY*8hIE|+M z5~!dx71s5F6($@aHeP_gd>2Z>=_cJ=+a;1XdmPm@T=6O-J9r?p$HON{&Oh-&S#h_C zMo^0Tck0Odqw|6Ym~j#^@}B=is3(q;%Uq0@JHB_DYpIJK0r&9?dMZC$;bKSDHLx&_ z)57*y#BTks=y2h}tLIURkcyx8w(;m%KE?2e0{E1#jGMyHDK8V5G!kk%!5N`wh3zKA1XWvco0cssoN8d0OjVdul0s0AW^(a z4V(|9qWm0-;pjvt99O=iuoOm)avWF2){2a zM#)+S{W?LHZS#*EKX%bvHAASu{W;8?D+N zFXMgS_1Pxs+85uwyU$k+BEK)_CxhDvkofv5IupsvU!3?SSQmsb0Wj2xl2QoWk5-jz z-m{}=Ulxab94|6l@;G#wQ$^u3UsuLLADRG|qu)I5I^!G$l~dVkjww0$v2H>K(^x(>m-= zPj>o!b+Kiu%KX0!YO3(La0KJ=-y-#&aBKh(S#rl!(v7p1S-Ff2*xd}n98cc4IUzmd z;p1vXA){%N(6^IgtJ4*-^y)6Um@oOsYT8U6iL<+Q>`lF+SHFS?-qcqgd~ASzHtyuV zvlbUC<)qc{VUqzrc!ocNuz}3e4L}~P+c23i7@)oRuc3VU^IHo;E<65i1E#l+!2OR^ zCvf2T`adY#fSDbLcCXc|Q1!|CTjVkHK%z_7OP|R|di+t<6PFtIL$1{Adp4K`s7n_w z%MhTz)#JdZfd7(=$NTKTq>EH)WKU-?#%rIy0;61rc|G?G?{Cp<5ykV9s%tWKH*s;? 
z6lIJfH=vum!#T}VWgR=7m_D+Fo0eE&c98_OS4S&wJ}jTJcRn7&CWV*m2u_HS?OF{A z?Cb8@!S!8*Zq(s-93zq-`FI_)B5;;NU;1#!A;7Y}Ny8(Nh87{4u1;Oemz88wB;h)6 z>fUPQAiO$msA$M4;R-A%*zBs-aL|1os+a!?nx=m+NF)&w+PgKounq26>n;I zIArk#a-+mj+Zu#ttmF^MZA-;@>eWGbjEcwdk!}&{7fb|lO^}Zs?(iNWwWu+#|f`sWfV-a-@Nj_Y3|1?KJQC`};X>1f-lV z_}x7Xj>=d8vP*7XE+bI5Gbk^%&d#ZtxSAPIv62f=+{Vk)fqKj#DS-oX!z^F+kH-bO zo+cM1+3IanA*AR!pRE(+{IVq>GdhW_AJM=h!hcYHPp3$#jF|18RKE5_fx_~*ng)r= zhH{|fx^r_8^}=vqPE*{@Ix5`~bKEMT%%FK)eBu$b zwBeB~ZWb#Hb8O?W!o;t`Mh;~x^U9S&6GqFyDUABc(U6a34XfahxtyR`wB{f|#y*D=5zRqI8%UcQ1fN>$|AZ#6cqc8|5gX^59LQY(=uP=s=KwVTi()9s|$G#*~>I}9~kg6F{LZW zw?9|Lz5 zE5)pTT01^flNZK+3Yv<|5I)`Mnbh653}Oa{e?7^$$aI-6L6L7-&_`*m%v}Yh4@`=U z4NT<@?a6oe@5bWQDt`7MF~5z8Dgq&)+q4qvC!w)2?EZs-Xx$38P0@o;X^2q>Amz+w z3%|-cWpJ+iNUNsF!Z+q=6Om>AZOz>7j$EZ+(U{~+d|Bz+p zO!QlvI<)!GyVdYbQgZm3QhLW)ZUYvvZ8V!{HcO}KV)#9(Vb=Jxd8QWBJ#X5gX)Mo8 z9b1c0CGW#gX+4?o;(Q-u6x9UT zeX>b*t6yca5r{iiD(Xx)!u8(^stq#T32-=jLkDl7p*`P$SX&G5Zo2~B)?cirFs|E; zpJnTeQrD1HCu?veAKwBcWX*RI&^-ZkiAUZbT+AR=3n{(B$I-C^QDOnUWTwyn6 zq;?x6Im!4hk!0~`tfBLCgZoumpBWwk?VefhIaaRWuTXqOyQIc|CB2U15+cdGXCZ<1 zc3P;GzwJqwY{21R?7!d%kZ6z6(e?ivTTODCpFbM%GKyo@dbJEbbRIS&cAL*=#M@@r zM=QHcAV}>@-QK}~2-logIbldZ8%LE%g7=@HOaNaMob>sDbF<({Meg2VOxn;PEUP{> z{}#L`T`&d4i|~o*t(*y5Eo(3=ZCyPgJhr$WRxuCYkDsjFdV0HQK$SsBm{#B7Rj)K_ zSz1@d{KZXI+5<%Xb=#UCn^m7%E+)z8kqN3+F^Dza@~8>Oqzlm0#|CK30P~;qzcqCg zj8D|J_2uZaG;H4f3P!zs9UKl2lWJls;gh}|JnET0ZNg9BbtP{@!;zrDGLE(7%6nKF z(y>Y9MJ~W0oRsr&oo9{W_jHRff*>b_+S)$e^V`=eLjt)F|3UGv1c>JGK3-ki-{|N6 zg9`7sNYmMMqFKRx+TR{+MZ>#Fph9-yB+P50q%T#FSdU7HnxwqZLS&9J+T&WV(Jr!J z&f_w-49{qypB^$UF&9~LL-b;ujMMx;mi>9G{_}LRUNl;Fs5wI9)erhbiFE?-TQnYG z#K7$ASVji_@J%;(&}$Vh)@6bi`rX=Tw}&n}6>F6RnF1W^mLQ?)bpf~=3PIKoWK3Oy zPl0$xMdjKtb-$N=O?=K2KPqK{J`ENVpq*IqM^^7`Hc*H-+}1Bj>Tg+T#gQuk`rK{* zcl&em!8BTjjzYb}W>H|h75s6{ZTYUkyXrDbtSDc8B?R7_hdL%Zm7m#8bG4;CfF-*D zcv`|(Th!9$dQTV5Kcfq5rm~AMbS}RDbOP)8&NbGnESU_eeBPzkiA}rL!BJj`80A)% z*13Dj^1qtWzZ4C|rPP4?l|uP=j)-|n=HMdIQ171VN#p$d6!THM|F}m?XAC&!QhT5H 
zD;OX?H)VY2y0Lz;_yC$0{x%o>2gL!gd&rzTFVB64_hXO)RMcm$=mBsU3O; zV;7vZC;sEyY~0iVn%7LfRh({_1Z0YeGDBb$Am>Z}nLnK~pJIGsr_RX;$FZV5ux8u! zWw9fYgo)rGaF`I+c-5&-oL!5U$=x9|VdcCs>h!I#{KVXg;d?}ICQ}+w$kZi29J@V2ir$DDCIBSFQUI=t2GSE!26xMU1t z7h~2L=A(<4W8e2=eiT~_WrF!}hFX4nbz)%*e7clELX8Q|p_FQ(Q2-%gVg&JcPXXuF z6$E-dVRCAQ*iv%it@fV)2cRL;AkS#91jfprj(d1~w#P^b>Annmu|Qnb_DY zY13|OacZ(Sx8E2UsCc9=uqK?^iW~g$+^K!zzPfxcRXZ^gb)rddxgSm)Cy(|gxrLRD zNAEkoJZlD%T|Sw;%Yb}+0{q}Z^+sK|Ob}Q^@tSz$%~)i$;vbkT zPL0+sf|#i{B48?WY&o!n1#2H8XSVB?Vx1keamVQQm-0KDmbf!jcRJBY-u#vG!`^q1 z{y`;*{boFTo>r=ZghmL~sb9nEoekF#3vb{73hKv~oJ+LxnRQ<5L~4Sk?ef%maJmQ#E&*viT73;)Kv;Q z#J1SfwRCN06)UcmrD<$9_iZ?r^^K=>oon>P)|~xq$i}aD!%qyFED4(H{7cG5Y`uzl zWq#e&O2vFNZA|Pv#EBD&V@NgU<>6r`OPxwYgdH)GrE?>xgCj`KL9G#Px}7vf->DY* zsA;oiSfywh#6j-2uRLH$9F z<>;_WlRr%M@4u3zw7abt49N$dztV2$u_+}blM6f85Tj`QC7KmD|CH6B2Ybh5Iy|c@^f5tg_APKTH|I98Wk>y z#M>v>;(c&qlDB3pR~5njbn}+wb1bOr+aZp)}1g-+zQq&Qj`P*6-=cs z^ATthMzGC$Na>hyo|3ZtUMrs4a(vHK$YZc8QO7-LtCItPlnWF}a3Y0F(ff_7X6>+% zBg7OmFA)Vblzg^-+|qg!*-v3bcD(wJTFKRXg##b)uOcxgV~2C#jal+_&Ax zFSKSsdm`$)^1pZZDH$D{jKrb#LRbXE>NsTnX}O{%t}!R>;Er5Tu}0!FR; z*+!ig>Pc|K9OH+uQ8`_v{P2+jW*)Ca0Hc!cQ%aU}N%59e@QC|-F5f)Y zz3J^r8(#!!PMjbK3V%;apOl7(@2T3{9OmShQEbPWCK9%B(U8cFKqn?$ z>PI@n6;>2vY`Qr>C&j+fiA5n}0(y$126HBn*bSf8v^jmvk=J_+DnAX~yA8R%nfnJ7 zwS@-B&c2J>MA7Ziyi*=FE^z?F>L%K)=k%wUN}Eo)ZZ)k-Dz+Be>4SSpf_p|(+D8yht3c|tN;2%UDxpq zQRxv#*RyN=jc2*-uOmeGH{bJ7&qr49k{lD%ANBk;&ihUT**ERXHH&$W z)M-XtvBE~(iD!$`h8dBOlGf9ZP}dzE9zIZ?rMlKH5fBxo;lak@-svVK_Pn|BH~PM( zY}_MR>GOG8Q4|yICCFSa^*(xt_{-$$lh;qp>L|JcvBG=v2D+URR&syUjl4k?GscUB zUdySDZLV@n3lu!YW~zZ>y{1peddWVV`l&WrUy6^QAZjY}2^j@&%44z&bQ%W^FzYD%*agF4v{6z_lCgf zmGj5{J&0+YkiXRW+jPC(!HopviOk*==AmRaqj~!FCiwn9T)*eLJcqRSVfEu@iG44H zi_78Zj;AuiTfrJ%wyvWZwQRW&z9OD6<1j+gS{P-bcaAv0sfFwG{-2}dhj8vcC|lzy zQGf3pNSk^(mlHxT2t?axSSr?qF$+xe>l_yTX|dTVrf2-rd*pyG#<;L)l{~I&{xm{s zms2XDN}SssJDAL%u>w|ZWu%$(Fr%P`uTo6)79PD*CAa5#=^ikDG5AK@pF?>AFjq@~ zw;*3*F)hrufLZuHd1j8C!vDP@oK``G5ZlutlM=o!iJ$pe_>!2w3cq^hQ{wNP{{cSu 
z&9!&IIuQupYZwsMst7ozK=IY&OjaRDpi}SZIbi2`Qnf^gT;>Q;2l-?;01?r(qnr76CxKw{h*^1y^3 z+skf<5ix{>e^bpy(^Vv&&!Z1_*nKriRymrk*)yL6mLCSaS+j@-E ziSn-~TIuSHJ4ug^Oy`hN+i+f&W5_|9sm9UPFjZrGc^FU^5dXB?wxuxNS|;&xr?z2` z_wsmjC(NPsFY3|%L)KeHwY7F_qfo547BBAZPH|eaMS?rQ-JK%EJ;B{wgB5pzy9M_a zEv{|%m;Jo&`ObLG`IC(NTUqxSbIohcONzv#R%c3h9`MGPUy3t6-qflq@O`&dnZQZt z1p(706CFVZ7NCze2)tFKh4je=kA;eC_#vwq9lkOsZ#nJb$`ZM`dG`>i`|5E6lOIMT z%QIc@YUG3RwwDzggupNWOF_k2fR$rg`pQ2DmghB2tWN(%XZ6pA!zq*UNftc}w$!$K zzQAAB*S`}aLeZ&q*+nn;KrqyF3WPG-k}rH9J4{BoA>;;tF0N&ZB+%>9s5CrVNT*U$ zO#Ugt^OB`<&}X!bv^D$_-&fyaSe@>h7=!5vXD%u=)a%HaTFE2A8k;Jb6Uc;+h6rqNbFoJ`J=^pQwUg|}5zv9!D#3#~F4xuW@rvZQ3$pEtppWJ9?lQn9Nkx4t%bY{sU z9INZL1YJSVA~Fj2`r?bdXAZJ?p$pWw-l9Vs{CQ4A9De*xl3+%*u!CA~3Q^c_DKU|o z?EF(%f0I$J;2W%fi)p=bK51hRE3hsS;0n52Gt(wXkH9ytFY9ZAtldLNC#nr4o>sGhC zxcDh8o~&h(TKsJnhcmJ5cMlUu8QzxWiW@W32O8xdUQ~T;6Z4GK*$NWN2zvRo96Z6( z^IhED%xA*VQB5F4;m~Y*4K)B&2nJ-^{U!vk)K!<;GzSTzi>y?3nCZE3W6C( z{}QGRfAzjz8)=?3^uE@>Id$;xp)IfFI_LjUTvmi4^xoc96g*Z~_2Cfx{dD8zmC?|n zvXrNzqT2YEHc_x&Jv@hyCw82yC8Z7wNNs9#Y&=#Ad~W5+%1`B0BUab}E4M9q=)55< z92OY}hdq3y#ONcx#y(ja2rC1}oznm(2m%%R_dGP~!eg7x%8b?pw0w9o67k4Kmu;70 zx!!jg3so^%%;;)num?tZtuCOM{xMTICL`2ux=Ra;;Uf(Z{E_uR4S1!|+0y;;ol$gmFr1jrVD1iHf#!ZRd4=!gR^SZCw!Ogz}zb?na#$<<{<_B{qV598U+;GX)8Sl(KE&D%je zAMYlkE*2ozPO)&_Fmr~!Mu|aPLth6WlPGOC^xLBuB>E*prUtCAH-y=QN=s7yw$b7T zYwyoxfb;q*We0#r=gf!t1EbS~sn??cNEB=TQ-u@!p35&NPZFw6eATi_->_;aYg*2FT|zk+p(w5AwEk>6))E`%v6YXP4Fq;txmwHtk~H~HObtY_fs z2V15dI;8Dj&)dIu=k0rg1>;l(T8U_)=9KjH>+JO**Vdfq_=Dfk-VPP?8v7*aA-zFB zU}DcmRkH@kvI#ork`%)3!VK9(PA5zaP4I({7zpdM5`G1mFBHI@$Hq*mS<`#MIpfz)YbZHfPMXWo2hj-ig5yyQ0%OBA?nLe&Us}<>Yp| z>l|9ybyP|5gCt$G6<1QWIhvIAp(zVA9}ve!PtNt*-Ab|C2sz$N>l{XG<-Ur>Q#gUf z_5FY&vd{AlKmFYQAS=sPovltBe;t{nI#uxxmss7;h1bfo)3RLGvKV%E&#pD2VsDPR zm>ok0uk`9NxIG$CXNFx)zdOh4`;$I}SG7KB(0Y8Z(-f3!?IhlY?qKLawql3>GneRKYZfF5q#zxkQ~L3;{dTCu(U*VdYx z9*}YXW|dE)=DK`c-T2PW2jLBsx;m#iyMGPYIB1m}5)&Pgic>=!R$3f3ap4Y@ANRhn zXb0g&{e$oe;DcHBwH$8tX-Wmk|9aBUSup{JZk`OS5@cK)(TGnautu-A@!_y2)6hKVYA-w50$%O>sRO2 
zB%db#EEiqAJ8|Iy@%Yr(K1tG^BTb;I6=6#Lxvk2&ut{Q4{8|U5j5gy7IL9`C(;{}Q0P;XwSr%OapA|cCbuNM0 z&T3RR%aC>+Za;P_pgFEMSY-;isfwrJ<)xR)jgnReiCH0hoBUO+Ooc*4@or=x^QY%} za*ux>GR;Q1S?u<2F{e>L+92#9O5Gshz?5twa^bSO*^=WQgk}vrzi?g}Q8r`d6p6TI z1?!l!%>F?&8rRW8D~ok@f9*iNn-Mv5cli?U1Wv~yAwM6s>YD1}NtWVCjh{gz z>guxP9t9%JmzCG0-j@~lG6F70hgU~%|WWmHg!#UmFtw%-(G^=*J(s~#v z@XJoN3>`jOS>06>zp|gSod%LcpVJ3F*L?tsl7KF6ab8jH;LWfg%AoAuX=Ws(Br8;R?3m2 zD_8GqIR~oW6ZHtVmG|l;WSDxbFj_tNvz4usIXM_G!wZprhM^Oy@{ZT)N*Y{*-`cY4 ze4o~g({bXA;*orl3PB1L{DnU{fch!+XFDQR^eD7&3d=;8^6k5_?Z}_PY9YRd_9DDq z^$J2$$Vr{Ui^^-WkE8Ed{}g2QJ+i0&gYYr8W;R)+-QIWW&ZIceh&A9RPNvDj>x$;< z){b|qsGHXeT>8J+vntZIeA!{JWl*EcYVa!N?5NC#$xZPMc{iQRqT9tr>E2muZI73T z@Sq^TW$|K0`QJO|t;j5|)%jZA#KsyQWg0{XX{~G1gcBR9`_>lnCw`sQoHPtgHeb+ZOTXvj) z-TKzrZDm=(engsF+x9d(Q9Lk=VZ6~|OMF5~rg~V{!xHgCFs)Yvzo_~Bs8>dXpLRMP z?0xU(cPM)#DN9GC+zR9w(X4j5$fHRot5%Vp#+vbU<-?K;FN3iVyX~swILpK+L)3SV z=X2bxK7gTM+CHfi+GRRdu9ec+a-^Jv5G3cL@h4@5d7t!HN?Cm(x`{KUd6D{y6jU2$4| zfkn2CUS4V$Jbc$fy{Ecb^!6=VY%E<}mJfxoMU1Jty}r+Mvdd|0@5I8H!y`OxD71SP zKJfm*`F-@|Al<8(cc5+8kte-_C_LLO?XsYLJ{~Af@5FL}T$y+C#@Bk0pNjdX!cr}VvrIo!RU#D3OctCpm>0E|@^>l(AJ0_!`(FCpy= zZ9|p<>B^@Y20B&MX^%{0X$?)9ll72OosAkNcdlJW{rQ>hd!%Td2i25&8D7M6X{t;; zdKTa#^sSP+G2Fe|`KPM<**Mt}#P{^Co>!BJ2JZ0o{g52ZpV=0O*M@(Q;>_jfmr zwc??Ib(rUB4fD?uVrNKgp5{d1i9jw~-_BB2Rhzb4_AtFNaQ|4jAy}2@nZ&YJ33)k9 zb=CG~#Y$>&M*Zzr>Q$UT2U2yEDHb%LXOEp%Ai}dt-~t#@;!&B?6-|+oe{A;M1vLCu zYo)p!>6KUU$7PSumqg#Kny%R6&9`_UKNw|WvK+<pe9sGf4hF2 zw2JbyyDL`arWcvL-$RmCqkgy!1(r%1e&;c)-;l@;9V2eha8uU*n0g~9)FxSUhzOwI z1Obv|Dw_{s0-YP{g$HZ&^5-d+X|3GQ4;0Os=zRvy@xeiBPc4i?ZJakm^&c6M2TB?! zwf$m*r11H}hgeb=7u;;YiecqnLIx6lZmB>88R}+k27BS}(tD!bBO*A~fCG@a$DN@2 zx>eJlRk-bv*Q!qIzb~EtppH5};lIKvY#HAEWcJFHc{Q`{Z`d^_FCnpIMyz0>YGuv! 
zMOX!ZQ2hnwQ@*}s)$T7Tw9tah)P?r*W4-gF`&i(QCej-NZuTUET-&Kd!by&hcX^nW zmfDCq0q%6@^i1?*W-DRR8SWq}gKs@FHQk;ZW;)>_ZDjjWi^F2ZBzGCyFZTFl9=fJ8(+%yp09Qz8Q7f8&K2vN-21}rl=HOf+Ix0CP}%jT zq2#{A{Mz-$)4E8ZRiR+i+6+DIC`OnT=6n1 zoX#8F5p!krqwMkBV~Q@Zn8{^#nl!b!6v0PYby4gI@`@5oKm214UGUMT)1xK* z2Vr_e6AY68DIzY^7|9z6o(@LB-b??{RvfAkRQ%Df(IS&)%rNojl&A9eKM^*wG_zOm zOn}CG%}~c^5&1ZzPeshhlk|MDwFWX5-vlhG=pz__<095egm|3#o8BOUKFWQbU3)%M z`OzNot=F%nhh%)F)Q zBfYbCku985;ViY~qVJMNo33@SjH%6X@SMUXOD$uK4y&MZE#!(2w4Ij^eUId%50JSXI9ZQ{%{CkQY<85GThL)zL{53=agZn;8Hfg-;;6T6Z@Jl zHgV{EYc@YXL27k?r2qN-epP8dM^I{VjiHGs+b1G^N|k}g)ZA7>r7^v7rRM4$H(eJ+ z0oT)4c;jSjWA`-<(`jskp%u+6WtM`RZ1L8&naWDPw0{lFomb8NzQfI|TyLKSYpU{H zm65u{xGY!R+BHUx1@B^Hx=G^KoTciT%WZhos^%#321{)K-t`{Ala8$XEW9<_ci^p- zzEvB{iQQD8&CsdkPBXRagA+DDTR2HgTLFNtnFDC>o|kXv5I?ccvZu?2GjrHgs#F%? zqawH;fLDgGQ;^U~OQj>)+AsBW{&jo3=ar>{MVzv=iAS}47(EpwepvqJle z+Si)$$L<59HRnM#0-#x`kG3*Wt5|J(p;KKHxMg*^4j+VSsGKW_yJ)7rve&7JJHQWf zAyo5KC}A(8+8uS55TzCtH)f%!marSdGLr6#!Lpuz%-yjH3dGM-B;J!nV-ksdt! z<$B6w>q)`1CP&dfs*=;H zHqP_rM&gQ6;2F=S#?);1260ln(mk!oFtWFrn^5_l0FsMg@;cGG_z!|j-kaZSj#9UJ zfAeGi%&L|A_UWV@|0CNn^NcBVG)5&o_1ppH9F+Rtb|e#x@@C2_y+Q`JU^H1nMSLX$ z#ltb;RhV2!ad?~{>X$6Vt79lvPFC8z%V#OTP37 z!y8k#sV#}t0KzRSyES=3174;fD=gudT~Y&V336sUHHNjNtQh%cxpWFz7D+q~ma4ls zX@151%x({jgT?B{@|7PL%l?4E&%muHWHz3;Kc^e(Qg|{cih_Pk9iAo15>!XRPSron zl{^N%8EOD)tIt2Rc#f=_jCZw7wrj}OV~>6Rf#n6xy+NJ2|5d($l$PJb2y*}1eIR@UCw!~K&RJ}0oEC!_D4PS9O^YR5YGZM&x-FeR!!fJ`=${hr2^Q>WF5mN5);*ytCvge zH998@OA>>#vZ18(f`=ko1{<`Y`>HAYNFIlhc4_6sR|m`}%Lgvj1+}du$1H$BsxB#;1wq*dOslxCsLO`s<0Sju@Qesqt?R zv;tDARTW~*mU9#z^gv3Q=_{?6J$GsQ)6L}AJ$RfW>*v41RfPuS4wJ6qxlOTWtsKUW z7T=~OqADoCBobnqCd@5EEkT;dRZk7yc+I^dg#VYzb<~Ejr3WGBW=gjAK52* z4eU8hoA29OeNrFD&R1|COP4OvFmfeZLZ}xV$s&&|ffmneI4dQk2oZdueQ4NhWpVXe zIXL+8_247S`wn~oMlj2onX%?*jeWpp{Vhi;e@i8saYU@`Kse&@I}*9Y`Z)`7Hi`og zC$o=D8fJ-Ti0BR;3->WeSB5P82+i`u987A`_0TJEENFT$LVM3mLzKI_L|~J11lCA# z$fuWcp=+u_?UO^y{Sflf;9FF7lyW_jufedDW*gzGldZI$>&uS6f9@2EC_y7n#Tzoz 
zv}=J<@8qXDSmZW8-!zN9J>8|4k?c8V&)(BfqgYq(;h!ec$*Qw-@#m2Z?HIB1l^4!I zs^d}fo&XuiZkX{IZ8|G}2hgPw)Hl(f7RBxe7Kp0y;@0*;J($66Q~bfc1!iAu)DMFBvl^aVCZ9c_u;87kZuiFqYi1ayV?<6%TNf?%kRPfm zOo=G8-_fo?Z<0l}AwR|c^oNILQIfh+Qx&M@wN##;_7k!DpBJ1MNz6jGDG_XMz}mNv z0n6dX8W%1GNAUhImt~Wmqa1sFif0WJV9O%k8eNoN-#Gz5JRswzn&vj!%F(%4RAc8v z+P7P4J2^BNiE}W~UCS-W;uTUECL9kfgoQaUkAFk?bbTx296}JAlaaxpE)HN(ShPqW z{oWeNzy+ykz5k|ZGIvI`9Qdhi(XPEh!f~a_5w)u#u?@q_hWkBYqdP|mZnoTwz|uTl zv&dY*(huyzFYct8xB&TFvkhoSBF$2EQ*lv#9Tmio<D~0G?>+>x6kZZkN{_d0y=FqdOM;x7bVdjE?tB4pE0SOJkTua0sXHV7N$#wOU zgu&PHf6NipP~xenGMU@C#v6%Ep|Duyj#n}UNyHQZkr`322i3=Mkr&ytw#uatvOqdR z76uGyfX9IHc0`Bgb3-JfP zIjM^y$*8{$@BTsHo@g(g&xdC(8`X{?nBqxFDu zp|sZ%Pzx9o>#2}cAjs+iamr7!8K%O-iMRBACiChvo2{Eqonoi}r; zkEJyOZabz~k=lqg>u(Oq&+m?=!s82f>A(0X>3v5Wq;q|yY2vJo9!NO7IQq#U9p07h zdB1*Ibw4;h~Z4tb-GP3t51Sy@lCLrzu;3* z45K@i4|B>PYw(xp%pfU9WC486yR~jt83OCk#s-wmiP(V$>(8uTZLqnSCMU9P=UxzTl z1UPkszcDAD8pl0g#8%bj-(4~{E2+F|hMkYv>dac_E%okLgPIK&G}mQSt2CnUc)wOmU6Lt+woo^#yIp4lvfK~(g%Xo_~=hb;v^Z0^DGp??qrlTLm%Y~HQULZKz1zck?Hcxf{o z_Vmfk>Qmoi(9^Jn&&Yz}_L9%t9Jk(rG;Tppxr#c$$_CAgrKK)o(C|2$&+c#Hi4Qg_ zqGt8E3pD-qheESyHr8H-@EJ^K&o3Ty0z7maxH6jC7e8CMG(&c*T*~va;i&=@6`?uA z7FWgF$61Q5k$vasWd(x|V^pRIvDWCpX3>ZD7L&&ILmO%Xay*nPY@ydHRjxl8WIh?; z@Ovne$%KgEJ-&7E_QKuwX%+M7A@#9uYyWWe`aKbu-qRPgGre8#7v=J~rCv%%81 zLr1wpMTF>LaQo-+9^$HI?j`$>@&rYLYdMapuFL?d^CY1o^iloP%le6MNr_~MWIXgF z4;>ZgoNK#iU7IleyGR8q`p}PqShm?%dVJO^8Oz9=#EkvE3p)O>*vZ7ICt$CT3~Szt}CaGey>|;k~hW~kgoFG zzER(__bZuk)On6u?ZzMN8B%)@3qPF1QAjtnnaXtWfow_$5P&o79xgS zW=Dg~K*{Mk@-d)L1ePB4XFPsz(_;_}`Jsu&S++q@Vz!@%6(_4|FD^8znT6o{GW$h! 
zTv{W{;1Tyd9gl?9jBz~jUqce0@hMA^7ac};+_=yyE?i9QV^0CQi!!>-haSaC#<-@7 zLUD@mH$}zLEK#Ua$G_Na4r*E$D3hDXplhH+bHwp(pNZ&ay2U3u^X33H2|Z=oHD;!| z-o<{Bo{;@zvbkm<1T?=E2CX?(`+Ajz_eh6R6ogB%GGR@!8w)GfN03oUYY?#p$a3Tf z=mUwcrNp&=_HJ#T9vFHuV;o}^-CS@dTJY-80a95N=)qRJI4tE9W>C(aSjBA+1u%@M zB&q>ts3dliOF^5eh=N9F1gT2nm!IzX*Yn##(+D+q8KeG>{p!4&9OO6DT);#1_EN6f zz5_|bKC$cQc_K7R+cttplpAm8800X5Q$54h?{u6X*%8O~-K5T7nSl+`yLx_kdWlTz zNwy?kQ0b%t_c#KBWq{oSnKo62s;Mn>tB(h^^N535iKs|u!Jd@f>UP-{cTB+hIIVKSaJ7>Xw}X1bP0`^gdSfkC7i8B_Qyn@X+k zlZa!uzqpy|y6%C1@7!nN){BM2)ldLR!1E6FLHVa+@?QZ|=Pcn+m(5@5yUUZS>|Ji! zLZ5k?WotTgj?$_*$)>*8F0)kyZLe%qzin6Ck+m(&_TI4KEZQeovpzhU-k)3_%JN2H zv=01CyL#enRzlguopR{0Ykid!EEJF9MFv*ENaLZjye7CoC&d8gjEcL?1q>Uudgip6 zL=`<~?c=t0p?4W#H?G$;X72^_tYKyGPuCCGJTs|Af2Q-OEn*Yo%~hnRNW2=AOUjLD z{GdM>T~tOL#cdlEjl@MeIhQdGG?A$aCZlK|T058Hf(0ck&+F+>3t)3*vxYPJ8JMP0 z3&(e=ZXnEuYHhwlj-}}+j?aqo239R=ArDLzk41dgE0)>*c>=$uEK0|med2B*knQTT zbc`9oP42|Qr?EB*FlX#x1&tQc%>(4%>iUErbM$S3WMH6g`r~#8^7}$bo6B_nKfKFS1})o>zrw?KhNz*7Kw{@qES}<+&OpGhbP-YKA^RaZ=p<1nRFW8?YGsD1qn7Ak|WJ8Ii3Qf7* z=v0h$E2Zhl)8@&3GIn;F6V*A9Z6Bl1@Iy0A{p5>`=g6=mYBXTbCdu?)I zVpCTl0#xj2bX+&_@l&Tj3}tT>M_{uxn3eVoa|6P4xcBL6?8!!Cf=SsK7Ti-SSFcCz zp|4FH>Y+Gx9G^zB*%KFspE5i9Laztq1v)ytNlL=I+%+E@+0!5s>6cN6#}kZWsFkUk zr80^*#FU`Z-&nnrv%eHg{;0ZD>5Ywf{;DJ+zQOS$W#6v=*qGl3)KfQ~Agryz6;HfX z)QXa^Qd&c`p=Fx`MugL~aFT;COZWz+ac#b~cMvscFq9KcCN*JCkK|k=d1c@^QmGAz zdyZs?j+<`I5y=!}KzK;V)5j#I1V3 z=-1p}4u1$fiJq$2zGB0h@5OPAEFxSd8QdtA|WlGWG7rDuRi?L z<8VA0E-*%j`V;^ZIDFtxmR4Rv3*%NFW^_g75L_B~ zor6qHz`|B5fW{Ddg(RZ!>}9sA+BZYuQs>qv+;P+Wzqr=eoM~zB8S%2Bb$`m6UOU%GcGuEC{>+S~!F5S}BdscDC3V8#xb&Fh_I>-(Gsur5^)v(_K%VW6%x68SWReODyW3LKXvs4t7vps=>Ur z2bD0sZbI%6;g3KD=3PG2icIK@y7f32IkNwfoj>;*6?4sT85&Ilh0OZHfQQ5!gFOMG z&7YP{BY4&_b!)#z+neq0E4gya-+!F~a|jYm)j?WzXfj4CROFaZ-&$IbKIZ~02=!hc z&=e6NCw9!h$VAWe2+G3IRR@;ociAGCeI9Y*gp1Na;U^~jc1w}=x|EI6^Gt|2GjiD0 zcAIAB-d|8%NLaq>PPEC-r2?Bmi9V1C@0+|s@xzD9ylIY_BK771uRrwvoyi;hZNqPR zvy<_(@^}7G^z%ETk*)^5pfc~T1cLmrv@pH?$OxQL&T?Rgg*nK5MdD#4@_4X>pnZm% 
znqh0CCf8P^NKsx}X~KCBSMtczZl`%B6rOpn9MO8u;r=}do zc_}c*X*fu84oKWGf9uGM`Y%Y6b(`1cAomT{Vgi3QYh-0buOj21;Yr?&fM$j#BEfT$ z7Wvw5+TshJp}+s1Yqq{MmhyL+jPdn-!2h1r?&vM9r!4suZ$W_#l@y z3L&y&G@tUZPl3gkX7L{!j9Pj`rDrFYT=d6{qvL#Wa7$vdlB0dDyS)0bSzFiU2jw{Z zJA5WSrI2=moWm6BkPm!Tq)iDW%gg@C2>Lk22VGPAO3uKe0a8zSYu9AWRe)6!Kpkm& z(HYnE4}w3%-eQMdhEt=p@j7<&innhP!6E7B7p6rdrJH!Txux(Mp%0(QlXz{CUE|*f z2Y{eo zUso|eu3Nq^c-XyHe~e6wAeb?Qsjyg6h{58Pj8?`+v~nNRk8}EL6FMW6OydM<$Cr>=#Y5KVH@9&M!hCY`+K3Xkm#N*ld z3tx@wm<41dg+eEC`Ae=o%@pCJvr;~A9^ORUR9$^sd2@l5j25q^nUQaKG90MR4Y?6tW* zOPv0z))x=W`O2JkTh!!XM9y}51YzsA8am^xg;KMG=eXx5n=Ydsg8v|JK9RdVU3=Q3 z_=iRO7{~FR{L&qj{eVCGNwoiamd7_VKK*8XFiQ8iRf5~ z;=MBAjU_PbuVzyR{yvcwHpQd7vfWqD(>Bb&J-6+2Tu~?q=(=M0ywn#HxoWR?w&|iagtf#!2g+cY~k6ciw2t4t|NeTlSP4hP=x>F)2uOo^bPu5#BQrr1GV4x zC9PMU{Po|t_x~G{@ZY~^CdYmlczjs9XKeD#K^!sxV#Q{0*ho4MkQNJNaNxfG+&{%I z9_d)OviS)jJtCA2ajHGO?jQVkU+7e^*41PhDv|L!>*%DL?{3HH zYB=20bl{rSWh~TrRZ&<^Pt10v5@8r zYTt443xumd;SOBcrWw{r7mhVepV~%cU7h8|MB-@oSSYEan4$Av!mswJ{i%|H-(-re z%jFBc84IZLx=!41&MCZ*>;zbGKXg@1RL3mz42fx*$r#i%smEl}jD0y?9wNiX5DAu< zP$Me>Y(h00liEe#0`1npaT+-B&c4dRd&^7M&e1<1*JysEovt`~$15LZwN$oT$3Am?kcgJ-$RKen*RMZ_ zNGe03sXt0H?)r3VgQID^H83u~F{i=8brDb}Q_a@rza?l?=T${(waQku1X*9rOg`j9 z-ikLBO+ToLthRXoWvYgh0$ayt3AXb9)OWdpS9JGxeIas}PfR@gx8 zhFg0IX&Zs~pc$>pL##>uolcEd)nQuiGrlKKUyYD;}P zb9mGh7%j5tA`Fg}OBx4e>g zvtKcT{y_js%!!u;ceib#CG`G2`&~gq6m^U}(Pee4$A->n{Py`)nvbi~;@k$qJt&4f z*@UG#^5a{c5~hz@wzr!RiiG+^DKrV9-Z&zp8rcg8BT1wj2&2{KTC)A)eK;Bwr@YCV z{#E#iX~6rwTEQf)<*#<`VO6glR_QXPmgn;T`OCJKb*|k*# zEZGfDbi68ZFU>FB%6xE!PfoQpHwZLO{*?K-J5p7-;CQTXnyz#USztCS48kp7nP)#u z$4pvfagiEjR(13{@m|U&prN7O-+@TpQT>Ck<#9+BulYrP*Q-=$sQ7nYxCG1_wAqDF zK5n0n7!;PC-0A>6AH?RP$%QO%T`rS%aU$vXrD4nM6rfg*Ww}r#(})C?31N(;g5@qb zfsHJ-zo6_+hZNgW%@uRLeD9+g4}F-AyHE5PzL|5R`AIEx!dZA-ME!81oSHR}oPuGPag@qYO&m9s@K)8f*Z>>X7GO;>ZlT>=W+*?)IW0r+Y0vb{S2o{fd|JY8+&P6&B zGfaS*!6L}T_!Nj}qkb5I-07tBG^rS*oYXo5oDO6y_i=g2!Oc2q_!@wdb4U7PrP5cG zlt*!+2Osi1{cWY8hWl9Vuy@IpKd+T?=APMT)A$pmX(+Em&SX)B1%pTmQ2mLT+tP8E 
zn&njI-9FM2ssDqZHiyeAnLHyN6y_Q{?LdrY7k8^J8@KN;ZB~9e;IX#ZKubkI9F^b7 z`d2ySV;nS#nrQ~kGhE|I>cH%v;|Oa&W1wscrNm;?TspT>}u(vs_bk5dM05*p+U)Rl!F z-0}pr+Z**|>6=~Y4s>ug6rbV^I{WN7^4j|n_Fciy!D-dl3et7( zqc9+a`65cQ{BG5#fh=;5GFF`9Fh=!;kS;6?2^-PkB44bC_0zml0iN;3Lp(Ib(*M_X zhsh*?V#|ZBmJ!^D`0KNo?dUq;1(#aE0VxH;%(aly=+iVQQ9IQL_Dr_u*M!}a%J^=L zZx-iD9SC3Sv^8}`tw0U|1GAZZc7SA)R!FPHH1+1V<2NiFscLZ|>ATFa?@x-AM=BcdSlgY*lHhv2n6Pf*)E=iF+K3)AnemBj6`O0mTd-r?p1-SdI%Kn#aH@yGA$@QPFX59umVYs> zaX3d929GN_f-MtMwwWZsLhU;cnIKsTPg|KoB2#{X#h(sEx#P?qI78+)>8j%`oe$p( zT13chuy$Bp>K`dl?@H<8`$sHKl|hVp*$loemo$sAS4{>N~0Pc@w%Z+sT}(f zJ>q`J)y|6(Dt2tMF?;2Q27`2)8z$@IuL zX$~#-p$z{UE$$n%aa=d)9uFlE7|C+MSFeNmNqE|MUeM z_zBU`RnrYhLnIFL_UEKqlj566TwBobvcFNyi?~~W4EBNo8lta>XihN4t<5HbI|r{S ztlD7dBf*07hs|$`#BCF_?ws=g;$ka6WJ1$czefw;(Ql9H%KCbQz?KPOBNtA6$GkS0kladA<%$;7BcGiJijm~ zi&6*AsTp-8k4>rIDYJ=ji(-=8h=>OakO^DAOmCj!Mur3~TVbU|sE0?4{=8^@hn=ag zyPuBje6$}U+`3VURu2|P)m-oJa_0osX&X%F#8UlN-9BpZJm|3v9`rl_i>u}D`Oy$$ zU(o^4`1gN?&&MXE0LKu%H)l**5|qcW9hDMmmyU`(*I5C&f6GD@!)kyFO~-ubW>#E} zYC?A&P|I!mnGw2+xHMR&a;71DjeKsK&K=3;4g7>kQ2u}+S39gM=mahZ(J}RcOEjEa za3$&2Gr1m)vD3yxl!FqRkZp{QMN&4CXrLH_t6wFur`%FgrClu;9HB&ToBLvg_o;y+ z)$vgpis)QD@BA!0Z!>xw^TM~Uw+>nz)N+zKCLVd6?{u7euTg#2gAC~|*2yAbadJx| z9GiStAl9JM4gacE7po=3@#mjtW2=nbUHN?0jpt05;g;RGm{j#}z@NQQcmmixKHr1TjDrM1R zLYV{X)htRF+p)JhW{Y&{NP@+LFx+R_gu7gy?b?ZblBCHc`F+Wir343MxZ?4%!q|ZFXH=+4;}x77ccR@ayYeZ8jVG9Z<64 zT7HsI{Q_(peBqWEuJ5hiM->o~lUpc+GH}zCO8JWTuB|ldc~V38D<7H+0|S$&9jYbPgPY_kqMJ$nm3Y>0}dE&Ye6!4 zVuIhgM0_AiSjaMSD6)cKpx8AKka0lxZV-pse{$mOX#VH0ZZ^vIN$50CFr8x0CV^XoXTD+2Z^M(}aA!1gVC6by*Viz`yGe%mv`DIq$ zN9Ux=D=3|A`>g&l5PUq};V@fvx{s}~!kgK4g26dC2N3pZzD*k(?qG$=B9djj4ylh&HlBlh;cFTy$?WSv!){%p5of%1N4cQ^g_3R$AqA4QtABi*nO)Dj4RhT2i+W}~|_y-y5XcHR4(pU%f4 zqB7Jgp=|{gmm|T*H>Ni>*)={S+u#qKEpkR}NjUo?WH?^m9qG!hLX0x(EsHg1BKHdU z#2=qEat$=>q8KR@aau@cqzFW66h3q94KAcYW0mN=l>R^E@6G5o{d!gYy-%Ip$n&rF zIioJOkx(Um24v(s%*b)0gU7W+64}vAKE&xNFhUjG+z>m2O25lyBCdvjgx2CSK?Nu8 
znDfsctpTr|i-6nfO*?CjaXom*=TH_9A%jFbTq-9rQQ@dnDsk0UHRW!r}sV{rm3LX1x^BLU0kgEE)JU{8zGo98+0D_Mw|7pj&^0 z+xaQ=-S7`Kis-HnRMqABm2wyQi7E4G*Y`;y)#@;w9NK4;#{!m1B{__2^lET!?zG*oNEaOQ9YA-OHOQe}=3;-Z*H{en6q zrw=1rvA_Qhb#K`nH<+z$nqoU-k1Ll)TS}H9q>Cx7+x9)x2Uyu5Z=Gf3z>e_yg{slvyo~wj2I;iE4q$hS@5WZzS zjn$ltme@L}@kxt7(#k1IE_~6l3{_1YB4ct8%1M?^z`9oF^Nse6obw-9%UJ6x>$^vC z{@`%_lLvexWNqOm8r3mfgaCOF<0kU)Bjpb3^4P!Y;9OB`7?&ZDCM>z`Z;nU7EI-V1 zX2=z#U#nN6!O~Yt3wmR3e>Q*opL$O9+d{r>$&d-qJqOQjW#SrIW^k#-CIZhU=pAlF zI57|{T#z$`{Dl+#h;pDGEU}Hyv`e_rQVFX9f!)k=zy<|+LPO0}j;&?^FCLo7FUxJW zOEbsMy#*mh%Y_7f&Vny55iBFZhf17 zw>$}~O0zP^sIl+nw4nR7QUMib!)Qw6>cMOL9!0{!GPam0Yl|;RIMY8x@nurV3_{|o zbq)(8z8aK+%O$ejPKd+=b=AsZj^J(o?9z0O)MEj;JyN?JCfndrZd&+`jjjWDN219* zRJp_8(RjJyOb6nc%O2T`6b{OCs9iJqJs85WaYz?;>IV6$u;OQ2cnLcey$0CPzte7H`a{v34~jHNCGR=JNuak~x*~3`WHU zLb-)`nUr}=3T*Y{-Ja(2&Oh>y1hEJtKEQu-KpbX%zMaYc`J9a)iit1r4}AXr=MGsr zZaOE3nJVMVvMlTEp1S1_UYlI^aiw8ni{UaQ$OJVPk>dbYLcXwul3)zi^BiOHy?x?| zvC!s+sl!pVp2IFSs@RE=C9X4WSE;oQdVN`%ptV1NWGhD@*$7Ha4lwB0!o|^To+FoP z$&TxTaQfNb4eTDbL4Bf6xgdEddfn+KNB&r^76|Jq!8<-L`4<1=~gnCzTGfKg><;yJV+wju%2r?O__&0monxeU>9v!9Xb4;?KK_8~U=ej<0~O^< zE;a;Pk87O6Gtz zktJ`0=;|(TEjB$0mz2=$hn3ujPLse0kirJ43#%q@Ib5!$&BpF=(>Mu8T)Seh!B#fF| zz^F-HVX@7Xnt-bMP{}n=RouVgFId49%uT>?$fv>e2a&=c`u>61E&YB%?@qtJ3jEy^ z-w7g9s`XW$26IsyqcY;_+NY67a!ONo&w>1<^Zl32oY|}9CMXapm~E6fT0rs$YP$tC)hD<6vB%0XkxSZL>KHs2`HpWyS5wbpjh66IpmsM;*@FX`05AnDle0Fw>NNu@2@#Ft4H>r z*r&&>6L0fvCtvpV6g{nBE1>P!(l;vSix`pttQ~AKtAFKfCD|Q<9_6gNDEK64gk4-B zl}i&!;C)3X4y$ubO{reuYYx{@4c%dE>pmb{G^mCMD%)5OCs#{Jp@~;-l?B9UkUfOY z?9Bg~XFB8S_s;Rs{fvtyr36nYtM^@MmMGA!r>7Du=Es`DWGhBohA81w=~84qPM3@ zeS&OW7lx(mj^O#67wqjf+8ggYqULC4o`05wPvA{H`WUj@3Lip$qu(_>^g8{g z@2PQpZUe*+?lgM{Xf===df=Yt`JU;^hHdMQ=@a4jx*%J#Nj$bP2GDKK-|ig~I&tT1 zYFuS38{Geu%mMKpnXcC-<~P zn74FJn2B~|6hYrqg{O{`Eh-0-DK4uVI0vWAj2aK^q{HU$1_%$WZH3N{?T0xq*{paB z9YC(P()Fw152(qL0J;1u>uXV#Ns6QebPOg7wZH3a6BA)L`g>bI zxWniS{|lBm@}xWbsaMYAC+ZUplS?iPkJ`-e?$*9y+Aiimntt7KRTZo~6@4p4;MC7Y 
z4Q4SD^Pyxb)f&@axyo$Uh6=F`ykgO_Q|u>&TSI0XK%Ef!RNxWtm>1A7fQYvhNDA9E zi$=qD{tl~?A)hE-DB?aMLF6qwa)m;xG?Zav%=nL|DCx9syX;&;%PG?~#U4;Ut@i>} zHF&_{2sSBuGOA)3JS7MSl^2LM=f7Yj#!eX(7k&;KAAR3-W4*f8SiOIBWnXMHv=(~( z1=Ddy0^7%{Rigw%atH-aYF3R0o?ffBs^6gKeZy}cADD^iCVas{)TLa%*SV4={kVbw zO-&4&nMTQM|AHwHxx9MY^jBwuaxSTW#2HjuelHR#uYSlc0(A*nhqWx1pnZ5Ag2q|u z@(kBbzC}$QeM^-_#nslL<-V5Ay9U~sWmkwzbpq`n@yZI+3tAk zR@aE3QyiB8>Jr_ywyLmO-F0Iqvk~eSOz}DNyWOx9+=UK!L7jMSCY(A!MzGj~%*;2F zWxJ3`&OO2*U7CQVlyrnKNm1S$uXZ|6qRgxy8@rt}AxM4*`|c~m{Clds$8EFdDoy$A zD5mtPc8%pLd0CWEm!-jE= zc;QqDl&^EU%Z@4AklskCRyiF)-^y+vNGM_(u^R{`#HVrrRsdJiQJ#0A7|BHK8MK4G zHm_1+OV~9x+{H;gM<^4pUP8R49RbFtESoXl#9Nsc_~CN*P|coU3*Ii~5CuiculwOi|#=SId%^~{DrUv^(F?$G;P zzS;j4;LIQCReP~{7CX#Qy(`KaHFLmf`GHY*GvRcP5Yt+;uhgib!qv1LDtAwE&E#hZ z=0$PSCr4@+B`+RZMeW0_OhmhN;A|xGL(}4^7KZlMr+E@1>}|BHKZF{i+^HG(@Gl9)eo0&QN;|O_ zCa2Sz5N*fAKum-r8*)#MOG`=0G!PbAL?iOMlp|Zu7>?#-4?ah~%Rb-#y37vOA1lt1u7jHnNbFLnv8$X6EYVH4STu2%dzhE zt-ySu@&C*Hq)uULo5;&dI}dpoVN1i)K4iDLYG4X^imVBagBQvX@+YHwkPb`PN}msj z|4HN~ju^p;6exAT3S708tG2LIDtJ))9r;Fd=kWB|?|=HKDfn6-dHGKAM2IA~iq+V6 zMW-b=*3p-ZeTUr%%zGfBTlvn_TRbE|s+lSKJEbNJO^&7*hoWWT)oLv(Hbl2r^G24; z2)tHNgi%6F?+qS;Rm>7^6WPRujnV@&hZ0x+^pITrp{;xt3Tf?v-hkeb`9l(B$jC{w z16rD%7OObNxk*XMeNJ~w3sXsr0pYuh;&&c4_Yxa>Ag+O@bUl9UZ)Cci^dlINAO*f7 zVOl(THt{gzz4v%xl+6h~@x1yOOjp9VuqZRhAUV2LL3H^0xLR_z_TG3KG%IEf0eJ8w+n^~sHr|FU!Sp>?Z^IGgFQ;USJ6Es zgLwUt9e(p)Z7`rl0dT;tMIyrg#~DCvIUv?PHVadnAGO+eia`md2YcczTr>h zjf_WA402A=2r~sx0w8NLEC(>i;I%0a5cPRow`%bd#GG*be&a8n4%?WXc0vA3Y#)BG z?(5TR9u->U3d%q>5FnR_C8ddPiZz44;!{)I)WWn$a|nSl9sH@q7Aqb^VXfaN2}hI4 z$#(y{;<=s!`S@3IA7`H7o-Wou!oXoXv^$~W?&^>FxgK+mPt3zQC7tY-r|Jb=kM{8s z0%jr>y3l(@bk)7~;mkr#v^67a+t%@?WW^IQ*>PAOC%SwX>@y|x(uIzm@v7i+`ruWa z$qmpY+I;YV_{TMLI7jOK`gBYCobEWb>tHBI6ZT@3@2H`Tiwbe6Am8X-dbp1BJ@AY_ zY??4FuQr`GX@(;78#e>hyIW$~cL*7l~c2fnvZ4}HY|Bpgwc!7Pfwla*}r(k?XP5hMb8Vy~T};knA5 z(*Dgf78@2>B z4}qXBmu5fn;H_y`ZbB#6F|iyIp<78<0h?!uI#agx`ruqo8<|Nyau&w~EZ*_v89NC z`G9R{oowvKl@f&&9KQnCQx06(X@Hj&=<4)1VMU%@3jpl9;E`?=_OQAyyp5-=d=d{j 
z3}(^0cA@}(2enX;;qnUk+gg9`FA(6YxVN6#BZiOUCYUQo%n|4Kq?e@hmRjb-yT5qcuiZWgFFJ(b8DV70|AB^3MQxfqogYbCjs;Us1sjH*h*Z}eTS?ow9cO>(m4 zjQO2B+*-Kff zPydLF9NRyG2IdX4xh0p$_cj($ownp`0cn&GC7DgG88+1{;?uh6A3z;#tVZ{auWoFY zN1YCLjWowJUUiZ_1W*t+-#+oj8*SI!k!G+{O~&QIi0iYZ zw50T2r%1=q9>X*{+?Nm47=gv<8UdhCxLdu3`Hxr`Amz5Fg*=jTz`XeDGhy9ec8JS) zp!-n1*R`7PVZUNcL&3=_q=_q?2-c{ziJNwU)?Ab{eNuk*#3FYsEb518QdDT}Ii~TZ z&c&b>w>^8$YVpd{%~dbtzoOPZ7TgCqOn=&KVRmHb0>F`7HoacfLp4<>2doZ+`-Hhzj*>$oW^x9v)+x4m*Nc*SKx&ospy zVnZp}&HHjCkGj6P$garo6Qos;RN&k#s~7wHo5YJR-$UN9IUVzeoT35y}=x*h!PTqr9uo(28!pum5 zxXz`jbt1`TvJzHs@3RtyA&~5i4!(IK9!5|g>Mn!7# z)!T4zPICvhfwg1!YRpiqiIPN6 z2O*BNw_NcU(LgL_)K!ArB`4q8fo-XcL;}1Ow#MYMk+cp<&QjiV=<|Yo7SC2Vw@jIk zxUe!Z20K!q`4d~irK7$$(M9k_&7{M&bkm$oNgn9aKbV;Wn><2*wBuXkOig(koM0MEoyuEX~R8kVdQ&*HKp=Pajsbo8xZ=Jt0ss1)D`Bb%*)l5g(ech6I-09mvN|E;x{) z(}2Y^Oy&CC&X;BwUE1N*OH|dhey4I1+)47ZWvYLZjBp9}++yw!fsdjzyrX5d^@*Dv zn8i;MooMDq15QV_l)f^E{NNfWDX!!+l6251y7E^ER_bG;z;p!!TL;WOB$R*m9kQ*oxg}g10vvtxQf!k}@W13`LdH z0jZfnIb?KHVi%T^&;{|*zNukq_AQs+@@Ye>QaaPHZBBSv;u7(RVhz`_zgb26KxMGiAd(0l5mL;8uaQ5k|Cqv9 zUD?(%%iuED{8rmiyAuyn3(BSbPod?1r}r+z2GqY247~a}a_;$7cWfS}gM9x{t~mKT zahSe0@I--}ns^pcPFst+t1r&aHu4oct6#1tccT9DDsb$F@#k}yTVhsUW#BK^dE~BO zcQ5fLzBCsu;GZ212l&nF9{Qey){=N9d7H~MYKf3*(P2g@SNjR1cSb8*kN!(8^CNR0 zlaOdNb{dNxWM(!xOce`lEfqPwFN75W>qx2o0IrG`h5lrJZag9Z$?tP*qGd#}@id<* zxA#Zhxo@UCO3&j51;>IAz~>{||NPU~T+)Thb>g|?Az5@l>ibu!!+gA8cNaa5V2&H3 zI_@cqQ>kj0DkLA`5%l)0D^bbu)wQaHsk41FSmK@xZ@Uh5rnU<`>7|A9GOIb~J%NOU z6HMN9X%&wr($q2YKZ5s)!J?$f*yWeG51vke8s|^;V21#K+B$stO=p~PHQY-EKJAMNW&Mo z?~Ga>#QJEyB0G`5y7OhcN!BRC$v3Vh9hD=ls}Vr@mBP3dzp%@~iWk!orzpMvB4uQb zg$g%)G%7t7IGmo4PR_m@>C!l_rA+#@*Wgac-NhWu1}R> zMj3|;?HDMOzlonaruLA8R{7n1Mz7g@8Bm5ix3!8YoaT>?kD8=%5otV5L4#sGnJwTFb3sRAg(FRr%JmngCdWrbX`ah* zcb!^z(pIVOued{bw)TagHU;(yVqc7sP^7?WjR&?DSs_GxoscN|Xf{oRFJpKi{?jDxBsfX6 zEJoC=x8VtTom6XT>A#FC+}|LQY8bOQSmB}gGkyPeDaH4x)m?a$}Q;?E1 zv0E;8nJ+emCViZAgkk;ml`3#8Cf~?nGBLMcY$N2lzmob+vj5 zoZ9*jXY1v%Vz>)uP^nbTN(`($nxwG&Oy=ZG!ASe!%qWeI*rxq}`32S5P#CiYE|M}P 
zT|uit^-4+cAim_j*Qnnw*($Ud*^~w!>Q90R^9dlS8}=?$S~THKUNTm!{1$R(yZp#1 z5dbEZ%jw*oA}LjT%|Vg1vDQ9NpRKt$o$F@N2wz50aHo*}LycnI^ZeU_@`#ca6rU*X0ir<&UI_D}kK z#=2ix*~+%Go}HDpa`yr#qIlQ>J(DiO-F@;bWXx^%a*0rmbBkV|@}%ufLAnl_?M10Lt#T>EkhloZ-jF%5;yQ5sW9%-IGE%Bbg{&nJP|5r9Vdl|wt5mO<*l#&PieoNOW(#PgXfoU61N^*1 zC`ZlEj2om@)jDCAt!EjGYxNl!{~l6mkkceWu2!Bc1^5Z76V*T(cBoTmZhP_g;GOf~ zM-_}tbjEN#8KPkOi4h8z(e9LH>OI8xtVDN`>bH5-wDBC zeE9WelT*H~R99a9!|5o&L|LOXCa##M1Lm=<^3OGVZBZl}N*X-q2h}PdvqqllSquv! zA(tVIhP*9C2)QnO+7rgHR=1M(=LQ4D;@04)mD7E$0@+-LSLiEMe5rr?6shE))ANvL z+0>kgeI{oo?R)j3O&2D~P}$^9Gc-?N;YNpT@=Dgf6(jyFeG)cE)_0!DzFo>+InpKL z7mTBd*1#A<<(o92FVmOu9Rwga=rXSF}=Ju?{Y%9FI_%O4r22JWTc=iFsk@bjc7B zX*q$~mUZ5)|AL*mnAg3ry|Mi=Q0Jwo^M0fkzG@{_AecJ19s0sQ)+VQGl3eVCl4~n- zxyfZ?n|M~aw2)MOl#0)=K1Q#d#7L7p*(XYmoIA29xv{^sJ8}1y$DxJUiVsrHr(TeyC1B^S0R$f zT5OJK%?$;`|TvA-msKJK| z?Z4e^!6gm{N5P@EJcFjQNc@RV^&6iQ6e;nP8hpgVj@ohAlZqN+RS+VMZp(_j_$EBx zqu05uzK~Er7v6^QW7eSyS1RUR6jP*}Oz(EtsPt5w`Lx-pfNUaQ!#U$ zUkm{R&_?fN(_$!%raRmBTz6=iN)W=n)&&7Yza+*{Qe=7=NZ~vI z4DDB2nDa@-ley?}b6VQbp^n6cd75tTeMK{y-bt~N3;{_@$?nu^=mS!^O;U;Go&BXI zCHK@9gXXA_`wKpD_muMaox;}$5oXy|9C_B5h67ypUu-$>*J2}*ShomuVo7R>G!-a^ zEh-?QC{4dzj#DYSimeldjfgh8p9K|D3j!Lj9Xt4HpqqVRXb+E%^PE3XFRZ%ofr+M? 
z5A?fDiugFGt1+D|RygjYDTjPBl&Sp$m_Q~sP03?Zl0Yr#99U1D7dG?3`Jt-BhzZle zm<`^vwL8BByVza-W*f*A*k~MN(*3PVgf*(?>*iLaOu7ArGfb7pagoEIl@_h;bl26B zkEGjM@q7dyG7OCI_Q7y-o6+KAl<}qijaLg(g9bci=PNz-@yJ=GUoi?W_Buwf7$SX=e$HsbrlQ;zo`rfI_sS(%|y9i?#TtiWxW=h~Q zv@YV}sU*8-yK#@U-kqDIg^=`-KrXJ$9v|eXruj#Lnwr6l1pR!JJxd%wDUcymq|#0w z$<~R-9ELtoNk`UbcMg*29 zfe@z)T$O~L3LGU4$6465ckyyH#vQBItAu`3^%Zj~%E~9s6g4K*#qkgP0|Xv7k(l&L z2mzEv>SXfSUNRf$(e>_6soR9b8Q`5>M6S(;7?%1p&ni-n$dWfb%lQ+poQ3JJ!RhNh z@JS%XIf_VR$-p1U(awt`l_^R@?C|g%g;}23F{+HkDW~}6H2*F3$o7%OS3ThgJQx*n zZGsQro@|AcDxI-8&l=uUB^Y0m*njV?9ehuIsZbZ4LU>s5TMMH9u0;giAm5@LNMNI5tJt-5xH znT(17B5tC@$0li-MY_YW$2SPW&JP6RoW>BR0?R7mCd-t`nymOTJ7}QdC5ah@CeFbR znJ`Ip*PEeqxrtRfl#z4=$$@VX$V7a47V_z8ZVtWItgkv34o*&j!kf!gIKhyYAHR#^7{z~| z#%I+{e9byvn_Bnm+2*?o7BshIXgXD_LW39Orq6Zj@lom)Nb(F>wYx z#~B3^V7Vc%9N+|OB%F(aE;r%>IH|1x)YKDV$S6n9vJ09D`C@F7l_}-tUSVkYHG25u|yA+>0Cs@3BH0IxpLmAHQfC(v| z6{>9n=MED1KHPI7jmYn4CPVB@|32pQ46x{rGgx;OxwL^<7K;dRB$4s}eI_t(`F+ll zY++*aM0Df?E*jMuUnSV*dPA0wrm0>s!#}vqN(XTp9CGx_{{NAD%n13qyAJW5C1_~I ze7*7nzr`<>SL2(6Dw>hf7s{2=7o$x|TsYVBQ|fX@JE6C-=o-YS69;EZd-!6o^8#bH z33d3t_yM?IUQ(y@9YH)n(@prC{DSeO=rlWQBa38>5nmescVGAz&vXZQf~3iO_U(=! z8QP7oL^O=Wg{QhqY^HF#U87&HyoVCry?1vV{(|x5V0dAi`+Mx%)ZMLMu7V*@tc!^? 
z!K6lr<_h)%X*t$-0w+wAp7Z$ChMdt(r|WwQMZ0vJ#dFKlVFgMa)?3{&v1X00ORgPn z91dQR10uOF&wr=SzWB=!?AU%K8An6+48dd_YMyq1qCMt3AW@Eu6s2m*5s2G5rXY;5 zAuClSYa-d`YM?mA>txP6n)*ij@PORx?mmr2mR`}Z%6&vbDKn+TBXXg@nFAzLGB%Il zX=SS^tNhW}aBCb65H5^-_he<+BNM_#NRSLifMxc4++gsGbt`M%Fv8t@pJ-{1-9a?U zn;-dI358C|N`pTsV-TxviY0jzzHcf~cm!p3*UUk0audZZR^&E@;hvZ}^p>eu2Dq0L z@oP7-{mwHI?`k7>w7Czn{Y8dkKh}bM{JF_`hE}urD;usk`Iks+Fr(<1oz{}J5G16k z)u?2sFC@mYyeJ}Rp2quQYy4v<#y3JT=m$?xHWEl?Z zB~vla)nkEd59zC&NVb+G`?<&*^j^@L6!*0Lv9Fp}^Bxu(6=?wTGP5njIA%(jgp;hR zOFj`Z%}HL(EBY9-3`jgzTdE&G^9p=!+hGJp9k(>~2TV$}{>MI>B{o5|NwVe1n@X51(S+s>-8*y37K@1w zm@Gn$!ZLOpgd?E4pTD3XW2sI3u>YoK#-#jKR~yr<;Id*orZ>LFCyFqxOsqL%T(Vxw zlD!wn(>QEG95a^vlIa}#O_Vgh^tj2qnANlYAnBa?4M%E()q@Odx=Lw1v;o-un)%0T zGnZC}Dnj6L$y$y-x`2tm-IySf5*<59gQTDxzXN-?jfXk#D;G=^XUBNRK2Z;bUU1xQ zf@_!BqWbyXZQ=Q5qpi+HJmCi4X-x;2zT@YQz{zs6^5zgRRv z_y9nYK3^Y<>w7r1MK zBN4m);f4<9MTik(LTa(~AyfSfHp{M6@)#_7;AFj2Dz*4UHJ`uu+eo_m!)0rl^Z8^| zU1t?-+0*J}J4i40FZ76UA13~4vQvU{_C8ejgaKUSX*!M|kr@~!eb3<0!hCzaTVjaP zCN6`G%y?9an0N=axT(7HK;-E|4$_bpZ;4$?jD|18qw zl;6WMT)%p9QXOkF?53>;K&Ek56od(mcr^qt`KG;eU&62=oaFn1R0H2m2Htz_G*0eo z@Zp0&1yzKLH`}}7D;RKx#}S*}TIk{SQ!JaUN71mV*bS6blJVoaOz?xEqvi~8JwE)* z(Fpwtu@T7)jtV08RZNiC6eM#Lt+~<<4dvF08HGbNU2o)=Leg9G5LIwDiiNx$Ycww(K|?xU**a-v z(D`P3=6xfYpN#=PsCLyWjgCPCkld3N7f{*FNX2uQBbK+k8 zflNOzzuMN!kYZsL*40rmoTMKY1@Aqc0`pB_Bfr6Y{nBUI9(2jS=KX72|C_cN$>MUm zAZ6#_!og50LXBOFk|Ai&5q@C4`PjnoaUJUCiQ*M=2i|uGT5eZvVo_9UvxXDn`9`0c zYBOCfC%+4wvw6DAEibGvW};4pU)}A=B3CRa!r7=!n2eblio7*G$ij_AtK2|Z0o;A_ zIwj$I2RrMg+EqDr`<`z!mQwNb_bAr_{7FXPP0haFl33c$!Jb$xg<|z*3Dtpkt=5W# z3(4l8+jSVxwROEGOqJ_CXs+4%JIRr1I*hFjNmw51^)-I(KQlI#zQ(CA+T%K}Q(Kju zLzOnZ+n|q<-2x}5HxGW6upK#n<#(=Bg4|`iZrG=X$@6mIMBak$C|3!IKw%aR3_>7G zrB%9A8iGc$?mB>y63`!=&SW-SXV>lazMcHeX}#Pj&A9JI4#TSD*f|<5OYj`#&OA}0EPZ@c zZqYz_h=>|+5vKl?L zNWP?EuTM_Uf3TR^+ZeYsxu_16z46_3T`{3L*iY?U8zVkygX z8=NKxNKusE?z#qAMix!ALW*X8qLa*|sO-T}t{+I%dL9Nw#c>c^x2ZL7U$*>7llw+J0(-1Q2+W6-*X1tN^=ew+_qoW~ zNjH#Ah->8qTNO!nV7X|8d5Mb1_exsqA`OQcSWQ)v&t+u%#3K&U>0n6z%%$-Ald@-H 
ztI~aL5HbBJR8W`Y%g)f>1t+Q=W*z0;k9^eV^H9}ZyT0dGr4!FBGTU!d_}?$c|Hvk3 zw~uPK{R1wdnd%z&&0>ZfCUwASc0}Dpa*>*(k?62T>kewABkj<7AHOFtkhn0KSd-)T zw1Q8sch$!Ibgsf$w<5IYI>o>kOkwOg0!C5z`juK32X}ItU}?bE8V2b~y#!`Jouz4! z0xwBI^Vy7xS2~@4`mDm1dCO_^Fh|e%{Q?bN`qhGe4BuYe5*UvPmP}V7zUyp_aM8X= z>`WT2tZv`ojiH0&tz>{7=7=15RK1BA5lnPnq72MA9i;Z- z7XkPSBIL4UqXs5#`6S5qCy1k|-FwWc((z)Ez>rwqHcgOc4V32aH0{&2POD@P{4Rtw z^Pk{Qmk3O@kqonfY9|r>&g@!fRIYHE3-OGSaZE2=qm{xo%1-7<5)zi;3ao^*$0=BNF0W;fq&C< zDAVZ5{z~0DvFMDTrj|3DQ-bFf1PMHPoBj9CO*a&@GAejG%>f!<<`DL5tq3}gnLizdfq^4 zXbwkNo{@_c4x6D^D|PqHIYtP?p~;~*ggJh}A_NZtkESi5ywW4pAWBcnP3-hMHcQq- z*{{uBH!{;(aoF{FA_6rWvR^Koe*};CA=iR%4V|ESd;RLIFyo3-{*iX+o8ZG*0sVWJr*YBuHklHD)jfd!ocarop~5ezYYK%JG(5iFWYee4&4(p6{s) z5cY=*{5qK|T(n^l$CWuVlN$2g8swV_;P#rXz~)V^>bXw#HfQAfBO4p4$^J|HZkhlI zS1kB~^x6Ys^=W5&3?#DI_J02Y_sRR2pRjpUZTkmcm=QKx&8$vCF84gaC!!5Z4vf~k z@>TnsIChaWoy*Z(jA2XYLC{q)FV|>fnUz2%{J_#^@m^Q6l zjv1wu+IlV}8}T3%lF-D4q=hxXNKpxgazJHvlY*)`8%^{b0=}s7##khS3`!$J;tXEuBx0YEjO{pI)NwZxN%^XaTx8nBvfU zAlP)h>+*L@BupS%K4*_K5e|Yemtl*y2z4Ua9#8-Vr%W>iL)GSIq9%8g12Wcsp{v^}{ztt>&_Tq@Ijn=q%`wR_Y~@PYiZ)(%xyuUL%3`}k>uA^8vrLu{ zzkauiNF%YBnjv$R^Na}uR9L6<%9m$c*4K z!rA@V3RfqHAA+FemDQR@?dO3*Zc9oODUC9WZ9&8!FP#7NS@<8j1xNgGtV!O()g6Z$ z^Y=OSr|ggak~Tqs-QDGbtjT31UOtX*VSXPdO|t*@mi~p5q}9#c5xI5FaSiLgV5I2w zQWa$NKl-9`nj8}AmOsTPw-6+rLEPtmiA$j_J?DKrwFy2_&6#F~SniD(O}Y++Z}%-d z9+fitI1$EXBHNLS%zFA1ppV|G9`WwFm`={S2d8}e^ARUr)UG&>-3|JW<*iQl6>jxy ziKccb{2h1(ricE!`~b1n?KB3F|Iy;{<#0X516M8T2}W8_|D@me?733xo++SLH%d{V*FlX^zQsD{Go)SYpqw>9_AHp09%@D_GyXxKUW# z!xdu7$6Wq`;VGEx()R?umk=IVlKlk>llcn??&UNIuad0co9vJdp#RDqSRmBSH3q+dN5mq_pI!x?*|_3HSo@wY3wS|8-0w zz~u2=FCa0&{UofJp-p;AyEaRCv{*7_o`#I-~z5hwF))+~~8Y}tAb$#ZX+KdYuCW@?Y3tHSoyoy6ZBskTl5dH6| zI!z8=8y@m~I+MzODZBpz$9d#?V{7rGJN80HpC7>uzFZ36b9@gG#AcQN-39kf{hD$c z$H7^-!z-`%tHfh!)b_qteZbT>xT;=yWl#;-4<L zRGV(&vpWhqtFS|N`k5@4<8d?uQsY%oBr0=A9zF{b_N>xb{R8zHuMXL| z{7u}k<$zRx$UzN1D5l^}J6Pn{RUMo%-D?b)p_A-z=%8fHu|oJ)gceq7iFC;Ya}OMy 
znGIJx_?SLJ6C2*i+A`##5SWph+nzP3uM*!$pEhoUA!=lV`cmwpWa6^NG)nm1EGwvN0WHvBWYqI=5=KLPJZvPgD-}0 z@UQGqRZ7$YlmfV)s4&<=)Id*Jl-rR#|I2gW`{hGx znpG2AVpfAS{i3HC5`vXxh^BnV)e&b0XBR3 zjSke7Ea;2;)USjp*y;*lrii_vqn1PK>t4UGFx4}(ayk5B*?QI>LC(>u=aq)eB@QRY zB2NlkB(5jcVC07SXZ*>tPZLfWGEBZ=Ohf*$ZC-!Z#Xb<GnT?{^pJUR?TWQ+Hp zWMZ#9A-?y>o~bn1Sg^bVE>kqZf7IP?U(u(!pI^?Ng=&p3diXYI2Ff@{*$1({2a`EwV?F`MFX{o!j__ZJQAT2*aJwv{GGR}4Uc0dt zgWc3j53K%6L;(*Zn5i*FTy0x?5i-BOGP+Zh{;C9}I0e?BiI@PKM&S7ahhw$If>rYw zr(&l!UMm<`ydr0kzA54fZ~b0QI)3E@sG*gXI864C{EB?c zt=0wC)+~&SQ+)zKm9PTjRn>iIylZ4vCm7@TrphN$<_Du7;~G;pfnXJn4{|BR?@Sc7yY*L`PDM z9JKXR`&F6*7|G<&kX56Z#;SW_R@MLK{~dfZEwGJtxopz<&~4gC#7FN94MU(`=ejGw zDc`wGrK;HceUC5pX64HkhV;g`f<$dC060e(9EIQ8n5E?|f(7gQJUx6UD?A&>=qPo2 zGPs?{lCTqi*Q0lObsn%C-&flC50oIfYP^w1)5)IDza-z7v-NKZu{ zSF&%-8*?x`RswH2jUf%uUl2N(LnJDtnn&RB!#&PkwT|Rd3Xt~lAa7DCJNF1NrL}i& zg3f(PkvKNWTY}8uGPWXQjIj#nIk8SAm3eHfXfA@Y;CY`d0zhn!3^UOWzOJG6$RG%- zOuE{wZ^Na_SF@KMjgS{iFeVfH<&YOIan{F8<9dtBGlB76;&n8vH>O+uv*XH*gBc?( z80F=(J7|ta4S_~A)1qqu#fOcfdE_nI1b5BgO)&fQ0Y+oK8E@5=Sq5vJ@O4 zUsR)d9m6)cU(sgN&4$ke6 zhgDJV%w_wi9an|)8!QXg6`CnUsGXVH8T7>cu%Mu(if8*3Qqg%HD#5ZW?d1PJIlygh ze}sDe;COc!Cv)XxIl~(jl7v`z%;m6zCQjn_Fpu=T&m#sOk8Uus+K@vz*e~Z%!5E+= zaV@;c$o!=pb83 z%;8)4Vp46t8kTRLcf38*KYdkc2(9X4XJ?m;pToMZ(14efMlB(Wgy790K+)UP zyJzH<``7$~DvziCEvi9zGgsq4ui%2$4Y9brVX5qAw&8@YUk8_3hgf{Z2NNEIv#A6V zjQw)<%prmOokka4c(6ACBxjWT8~eq92~md z$i_zFj5GRbpSK|#3zuzsQ$otE(bR>2TPDL??eM2#)?luc=kABrKTy}+Kc3PctHICt zcj>RGZ$<`o&prFKdu(kG*@j8yX;oJuPmeNBo~h`#3x%GCzH6Y^BPWUrYO=4VHM$7= z&WarT&|tlr8-}Ng`!F{&ExzF#7Ic_g5Ejo17E2b}tZ>K`O@0*2_nvxYg_prSbU15= z&6()*(w?UbtET6`_%oh)wV#*oLI@85WV-K7hcSZe=RUtKdXO@1bPf3>(n?16WH3ju z-=QtPSbi5{OAT{fKy?%D<zJ$02y?MSC7Npso>DG3|<|T}OJ5%PVWjhF&(gQOHjQU34F{FI9 z7msSDw_fLdei9 z^2X{ySsIg+kk}H;_+fb&hgU;PvEV~3>nCnsI0Qu1(P9QcF7kW~s8|yE$Z|B)7o2N? 
z_!EeYUFY8I+?U5gO7m)OlbolJ7TFK|nyl|+QoGN*81oNeE`R>|-6ecq66H1ye1WkP znl#_rP@4mSxFs}hrF)sktLbCl~EaqjbEWoWB2R-VUx z?xy)lY!Y%iZq*Q16t&RCrnH~FG8sIz5TgtC27I@juAC$VkT@;Ge(!5Y`b0duHBleP z6e(4#DlY6kyI+r-hRgz5i+4W^3j1EuIUkirr19DhEs;85R*wdaVT3 zvHmYj66{c~|Im#IkzjZt(Lzc($!BxsxK8r~6T6!7*Wxc{8t2On9A0+}SqGf?Ix#g_ z*6Avi2BMpD*2vSIONxXsASn??ehMJ%E z3r%^er#7P*8TNInhzq?m^ee{ba~+%(j^H=elYf~;=U3+^)RccnQwSsB{D1c|A`nDfSa9PQCWOPQb_H#C30dFglNQ zOLpkKU}n3^2#^GHy=!eIe-HPry`|vYvEcjvbfM`CheP-Ke_x%0I0Bss;{fMbSUTCGJLWqLjS-R0*~Il?tuzrS`?8?QC*VFG_3F#fkR$C_YD_?_W{X)5%^ z|Dbm2SdSkM4syLX^A|$HGu3~A8;-$sN!9NtCCNrxxoxyw(TB42eM$gqsZz96>aqGW zcUoy687?kBrRY=uD;id^DXA`D>&qF(1O+L=WP8V6?qYz)2fNXeb>dVts-To6wbvG- z2M#n#YE~%ebAMtgavwPJ1Sp=a9$!0lMqrj$0#XLHn3;&SH>culq!dJoc-%KkH4C7xi~N?w?gBrjuA@ zuLL@N2x&d0H*HKA48Z3O)5;^^j>7drRw77<^vss0I0xx1z)@}c&OBNjQsrO0gPEzG z{0>V8)vmt_mL03AhTJYwJHE!*IjZq6tbbTx2K)W!ck%TRd(8A(w!BM zwl?iyj%>y6Bc*d{GhFmbOc3T`a%NO}qJ`cX7p?#Mig zz;QPD37P~a4wHHa+LnV_U~lq^$%q|y;MmZK0kD`&^T}w+q2J;gk4r~~ag2KEDAtHf zlm6s&3Tb~W1Fk<${9&gd84{B#I?z9_I7m-`J4lMN$wfCZj`bzCQXTFR+^%s={0?!V zZ0*59w)SBHMB;M7i-(8k5sYdg-ZN|NRq(pr_Qi~jA;#JsgkKPJcv({GbWKfVwVFgN zQh#aR-n!$$;8{Ii-L~-Cj3FLBK-o_aG(^jT5B;TmIBTA)Z|$~1f1jo^Nu%>5Lhz~g zYimY`jH6^caa=sIZBdg%k@r__-Ucu3O{H@ck;Ww}9E~$a6(O!+=I*zv?#AYI=H^vM z(e$jQn!%~D#KB-GrG&}gZrUI7cMQj>Y~>7(F8_vIn7y_?mD{Sc*WOfY-*JJHd$9&U zqzg|b5g6L;*-{VccfBZiuswY&);FJVTGsb!S@m*iUz;H>&GVBVpZP&2#Hkh}|bh zsbm7=3U=+3JmBq%5;o*xiJinn&|oY09(q`GM-Fvh%E)FrbtYcR@l@+i>ZPrQ>Vt4C5QZ=*zT@mbLwNr*KqZP;HWBQF? 
zCs}rxLr~+z@Tn+OEbNfE(2*O&hqYo`-TR)O)E`V@NEP^~Ri)mEhVZJOxolmJNrs1P zVzF1|KzXl5Pm2kY--@yriQmmqtCHlF5!?0W zKBf%Me3%%g_UkX#ddyA+@FtDOZ<3cBDpuF|oKl;tr?c8*&v2a~1EO99N=ZO;adq0S z+@))r#&rLp8>Fz~9)8qVFQx(w(C&0UiMbDurgZTkhb^}sA@kc>?>R7|iqq}>y zukzM&(|~ZCAE{dz^G!A-IWxK1GtFa*G~CW%g%3l{9%r(1bs-B~LkvH-ZgyN6a8rlM zIBwiWwNhaF8J)U};gW`8;u7R0nwqpR+eRBGjUXMKuF9ec9(_=%W>GnPuJfa}v0}hK zPz=GlD#BS~jX&)u-3g3>4y-@b_326UrZV1K|4PPA_8` zf&p=_Sx-=(04&N5m21^>r_VLtEeXWMClTK|a9vt@{hY9JN1{SxzQ>WoADz`LHW(_J zGE0<1+utE-*Om@Pl(|8eKDU@2%$^-+T6+A_rBw6F!5n|>!Z4c*4=2L0^i$e6x&Vm0 zzIE7g#3nxDn^$_dgM|5z{AMEg(HVC4EBP#Vd})HMJDIzZvy?}zF7|O)8YL8U?3}g#o)P$tb`o=%{ z=-f8~fII}cPu$`1s4_IO8k1_W2lcn;VcD>@mOzqtdgf%B&z@svdgHwGkbcF8G^ugNW zr(D*}SdpFDm4=JmM;EC+cfRXODH4P$sh0u97%uH=l@|>DZ03%mo>%*0H32dQ%0II# zS9Kin8ibd)ixHLAG`=)4gmIB|`%>4hT#wdCg|A?JBj-hx+CxAQaBW9KHXAimzkiZ( z*FoP8-q*Z{S?<(w(A_Qfn%3v_p$kN7j^t6_-|`9hKffDS>dD>OArPzE8b zrP3?aahH3c{a&l2X9Q(+(_xI0Rvp!+u2nFAnZ~k7PGC9z+MiNtNn+lwYSk;dT6&@6i-b0oM!J5$(gL4gK48)%T9Sr#TmbrW zEXW~2T#*y$z6zC}a>$tSaKJq3OQINYpCr0rgKs-(m76XT8jYp>>Kjpb{Uh&jX+9c`>>(;KH)I%nOO!DVs7VZnI zq=dEG6&=|>hl|9aN2GY$6iWxio0-3sMDnRl*n|chM<_Rk8gNX;gb7hN%A$lQg{hY) z4q59vf5Jq2s?Eygr#Poie)^_pX)sx$g= z0f|;}M|XCrikV-M6c1@kvPB)EN45<>{Z@#~9U~C$p)`FNT%Fgo+%!6uq6wDf;I{8D z7G7`6geE*du2SMLcUq7x`vEKron)zOcBK6-NRCGL3kk2nt(0L$;84X?iX_DR&2RS0 zmnJ#H^&j#EiJ0%Dk}(~D8M}3D8`4=ZX|l(& zC!Wf)w(mho`Hlc(!zUIJ$|8H6oOPz4gFtV)Wm?P&I&JE;mKaOC2ZFuFjMZoWP}Y40rHNZxO_y?0+qy^;{}c z(|8&xx_ri4C2_q~GBA#x9E@|pMm_7G%d%htp2YCzAvu|lR99YVWDcSfA}fnfQ+{Es zwuT5Y;z}=xpK+5m!ay@6>`%cBcE95{qdDQ7D3VA3ke zq>7XH);ZvS4Dr}WqKIbe*$h=5%Ka+jf2e*48~fR9EjFy4yL5!Kn9}YAUFM1h566n@ z#ShC;t00b;sG!!3Xs#Oep9vzrXK=7gX=eb~d>{PR6MP&Dz>`=iw28_Gw1?pRZ~!=p zLay`x3p+k1nccPx53fB?J`*9D@nnD58Asg)9Z%x{N8@$VYq9#9KsT4>Sd~~i3m@8- z)KRJ;ci4~s>>N~7?VU%?m8sS|Kfm9cxYcPY(EaD!`07doUG)IR?|qd(r=loBAL!G7 zIp24$za4M<1NOJgN2qf~4arnhLuwzklYO}elW{lC{F9;r^jJ)k(-!IQY`8z;Kj8?; z&(%m_nk{jkjP^xz&C^i=g-!Qj7k5#UKUqArw03LR-dYklo)HX1nbm_GLpI$nunq9Y z$k`Z2kvO?%RnJ1vyn8>V$8HJB==5*1IK`%f_AsvAZ|QChFAhiX@vf 
z0s2chhm35ye&VNMy;{pOwTw;|KWD-Y+g<8q=iM*N5OS;@65V)aX8=Fdm3BW?C6Z(> z%eBbBG=@$|7W_G$cX_Nz#;0s+?~xoDCF-kWAqIXEQsmFyH!70+buUCfy#^iWIWC8= zty0-`i2L6En!H@Q=#u5Fvuw(-3G9kA6pR}5uBo$BC^&B{;4$P?_gV|3WTSgL{Nl*4 z=oRMEXYyD}&ICN$E13cyp0A51)37RkT2%eYKx^1SzoDjP4mO;!WmFB$rK44)6G8=o zY80b*LzKrDS2z^)QSC2DdmUZ%MhAwUP>6YomV3TiAD3)t<67>;F2NvqmPeC}4Vp)C zCFa=2h)TXe3ya|>wDHN85%S+G86Y3dy6WiLf>n(Wu-dN-swzNwKjskix4G-4kqdvG7oayX@yQgVlB+r4v!m3R_zI#`aUr5`S-|}`6PdEqKxoe4Y^E=BTdQf({ z9!lLwxBxhV4nFBqHXp^2jht7`PARUutTf@r`)%-_Ry~KQ(H8i7$4EktA3zu0O)_MZ za|7!bZ+;QzP9B)tK{5IoD6UQB_C?c}e`*m@P|ECR22yyXzw<1n2P?)f?W&b02v@^q z9_H4Ke`_HbR%_y77_8W@YyDgNH;`tf_TZ!)KnT2DTFUy$h=}J&yBq_bjQTJK zx3W>>l0dH4FWOj3tx|;l$hhcTwV_~x^>vp2%|}hE2sNt)aJK}L(IPRE z0c~o_wyB($>Ez$eow8&dnnoBRaOxLDf4)y>8@i%$Oyh-t5oIT=2ZX5?eI9 zh3|xVEiq-$w*cHJ2#?q90{Qxm?O#S{5o4@o<)8#jo`p42Tc--=etu8~Z<|nwZV4uQBnD`E<#+IOJ zb(_2K`uEC9hf$@N3um-g%0B;OIe~ggF*(QC1#tLldcrK+xsTlbehnoSRecj&voUWI zs!>U0<($Z{g5(7y65-6O8Y(LbPnZ1qz`YvCmT z%bvs29UDeLFQ>DalNcI~F?qz+aWDvAuT9132pljen5G{UB1g41{-!D`+%^(+|KYq` zN0KvCT>{2&Yugk`Qk#^(W`0~LWa!l7FtWFMz+4duBzGVjS20@RXyKS1V*ftzs&*v1`9^AV^p|_QC*C zGiY;jurCg8(K!5%Wcgb(nWg4fjr_LpVDikNJ5o;c-;^neOFkOL>pn4e&?>GpD(0H- zckqP4ptv_`M7)F3y$pzVv>-N(&Wd3syL$Ctnds^$TcH4}4VF{t^@u&g)EgvY0#Bok zB^*x#``v1Kr7C)vogF`PHq2l%K;3(6Zj=SG%QtZ4zQ2+yL8Cd(h@hJGhpE44e4-cE zFNs<`k{kx%j@V!_rKc$k>~oUafPW48K-hDvxo^szt(<6Re!v$)D%6?!05uo?tyMpl1!aL+Me5=WSvW@TVPZJP;=?vMYLaU$bA_#4|5H{#97h=&-%(w)5Go ztk$&A*9qLSWTp5#oD`)v2V#_`=Kq`4d$8gl@0Ro=RJq5oKByE58v6wNkq3B&+E+TNhffFMGxm`>_eAE37{XI57TYCS}0F& zHa0+Bc3O+g6DI~9Q&t@+RbP=l^M_qox(YMueUZlEz)L$k44BC zpx;d{ox!Z12h?a+h7r>#CF%=Zf?UFwU!5OD%<&Hoz~ZMc1W3VFi7Or{pkIL@>JZzrqvVOg@3&t`SbG2DzRlK zMtAa2CCQ_-sqb>ZD`9H;#BX=GSD!&rrdqZGmi2%Uj&?^cd2Am z43cj?yKa|?5h^GgSlMZlDfHK#jLp_!Bzt8B*40S4cf9EZxsoA@_yRm3v-{)WT>&93@xhKPzaKf;Nx~mt84Zif?5fFX zn0E5wd&?_RmTN9v-%+f~Qx|9H**AHME?JqaYRrTC20c5ZJ> zz2rGu@x1<5ivjV|;h`|uM@H=3+r%Em?jR_|;lEADtLaCZ$==Z0O3ZkY=59+G-X^O|r4%+j*xer{XjzL%_Ht 
zaXQV<{eU0w=hWm5izme6b<7ZkbeUd#tr*!`cc$j_g<3IcV?^AA979Eps5QvwI$MpViD)f2Ba`#`S zM!KS}xXy8yp1&{tm`~@$*bkPNAyFeX9$GXW06IV3%7&Piu=lRI^MkNCvt>(@HL(@6 zz8AAhrpbOl*~_4k;F@C#d+}18C#2DTjp8O2w8q=;{QJ9y3<7Wb8%vy;|MqW{TWFf-U_swd2W#?fFme8@Jsv~GqqA~m==uWF)xx{o%B~k^;RV7MkrT5>1 zTSi5EUr?j;PU@yZah)X+`K&-@{Qhvhc z+a8`bMXvA#hDFu_c|c_X%dglTEbNtRPM{^P2}G;68Gs)rm6(eW6kN=d0gYvpuS;e> z2^NtaG5w$0u>22{OIH1d(f*Wxf5^tmvdLfP3*0SB`~yGjgd1UN(lB;q@Nqtu-6)t< zEp)FiVzR~Dj}56F*p{PDt3`=Evf;lG4cGdsl59hv#+>vew3Dr&Urp_A?Aeb$5J0Q) z;jFh=4k)M4(X(7*zha!T@yqo)`;jqFyALx*xww=>C@n|gy%`|^86Y=&vyrFy8HvQY z*tpYHyhN!p5uduP#pS2m@K3BGO|Lu+kD;IKvD@beyyv|2m&`lP_T3E8i__OtzitJ- z{G5bZ6E%b$8RB>=OyEDaXSd*e1xMUc#4q7&T!+L?@_C9+z@QS1)Qt4HCwyoalU<3K z`7?xgcc_?4av2x0yBhmv?=_aV{vrl*dFuG5a`KnISrQOxe}UAw2ukHVNXR+rWy|`K zuvq535#(|L_jr8b)bGY^I*gBh8>jUVg32`eo)y}1@UhKl+eBjQX*kjq(o)sH&HMzg z@}7@;G;wfoaB9SCKWceww9D)m)kD?i0GaH#C-dZCdzd9r^H9`I;`A43j>^J}sul z{OjEcAyRIWl!r!ym3D!z9jE@Ru-|x88+{?Un1ETI7B{)mb-d$Aq}g!n8^=>fQhB-e zqx^Cc4|Df{_JH5vg_5DwLVG#Gg-L{<0>bE=A~B}U8htR873>%=9anf^7Be3$P10Ucq=uvvV6;# z8ur&+3uNpjK^E8DnS6uPWt|D_HjtH(x=?KiYjtCH`P|y3l7pe0sS1Os94I)+#NtNp z(t-34X^@TNOo@0OE?%CLJ<6&z&!q3OndWcYrA1jX2g3YA`HfbXpJ-YtoD!4B;F}UG zzgf`U`Dc6M`?&kq{n#~J{Bs`@Zz5aQl_{?wgC}OERC&S#fU>684{mbYMZ&t!;Q@<{ zU{=3{OdeCefskV-d~(~PxaB4dj4mz9U%w|!G;${ZMuq&@;*Su~ZTRR+Ck5`{^|@t) znw(isP5Z)dkeRznY%x|cBaeNOj~KIFQFJ0TXe5${a25d1okW_@8T}?cNs@5Z-0isJ z=v0qiYj!wQuOO}wDasSk@w1c1=`v#qWK};%%KV_wzho)>h0tOTI+ES<1jF9?@_?)b z9egYMhpL{WZ~N)Zt?YSHr&o&e01Fpaf{1``#P>rf27O=t7kvOGk8&Y*BpSq1md8HR zHEciLxZSZ`COi|1cUTH3FKFc$SnG=bAeY3Pm)>JWCBSDnuaV31n$o+sDn$6j?LjV_ zVgu2yM#G~4y&$HQV3x>$F;gx3Cl57HuBTxCv!g%P^!Pz2#VHWtr)ME`h zdk{!4nU-WMQCt%xC$EP}+DaKZ4%p(E5oBq3VZdAw{e@Z!2q9`L28CfqgmXLEU?^bS zE>A>KxljT6x^CY{iFZvda zHD?1}Wnhj+PN|yRT+V9rpfIQJ1Ijv{_$`aNc8!;Q4vv1;?EvKhYypyPm6O-!qT=6u zS9#S_kCc%pKEWe`7oKD##ZmDX$7o8OES928HuS_VbH#kLh>SqoU$pO>>~>}+cCRx$ z736e8HCS-t<#7D6bjhvXSRJ&h6%V!A1WlC~OCDNQb*UY74~aJJ 
zF{q?yxk$=;*kq{6Mv@cv2?AOAgZGnzMw!$+sRk#c+=>HLS@Z8YPc`LZMy|)D7TQK_YXug)&Tzv+TOc5bA#uhS;?QcRBw>yOY5n+3LSDwIXqo zy>lL*{IX;a0%$WqPf-ftm3GP`k!KFX*DrqmCF_(GjX=pirc=JIHV^?PZQ}a zt%1l+CkGK>Slz9)#5_5GFZ*~Kn9gd@i0bv?g^xC}^C|B#mXrprg~C=d%G?F!N2x>j zl5d5T_plHh22c=P#Xa1`W8;Gxy;V5J zZ#yf)jzab+EC#DS@P>rc+X~>?r}!;zY81iG&f;#3tc@h4JGce* zJ$B$aTz-=>zTrijdJ+31Ltamh&XT^O&`pvzMqWdEJaeFjnlqn?GxXalM%Z?20IHv$ zs+WsasiJQ-WDBoMRIRH!X!V?WpXJ;t^NlEZlw)*Z0B5p(ni3eE9G;RWDSVld(95yl ztV*15hUEW?dZ!X?Qe;e+w(+OZ1Tv|IaBwwTI}(q zlI&nwxH&x)!9%Egx%kb9=S+=O%Jl{nh(MEm_+s1{$ibS{;RwmyBOONNm);Yj0>+^P zT;Onjjg@I=~eXkie1s5g+E8f4X(GvPCc?K}gmYjx(Fle(C z++`{As{&h1$z|y*>o@HAfMD*b@g#6_OHJ|}RZ zH!0ohN#4Ghf7;g|Sny}pPgLUj!W=gY2Omu-iF<^#<2yhui(FTLMWvsFr%{-jD~dBY z7RVg1(0>ctPlvH+ooT^DmViY-GNx`c8w4#^x|_}4-FM4jUixCd$BeU)?BZYoiib7( zo7FUY2&eTjMbDulh-m^#UUV{3e`|0t#tiM#J^(`d!V$xy9&SXsFKY450tu_YjyZY zHX7Loh9VNS;PA?;Ty(Ks2loD?^Z`ydDu_R_F@ZgqC%ic&NPy7u_Kfkql}l231K&=8 zo38)eu%jjUKfajlVcTTkBIKCFou_I!O}qE*#N5DpLeb=0tIy-N`bzas>x_kzXrRP> z>t*{@tqA16+XF*}S~X$}(&ZR~eFQU9f^d>dS?M#3-3=?C(I^qP)4`*LoCP|kzYi!etwNwk?-9)G#)$@0Rpt*t& zg-m3u9nGgauP$~C+$t01)wbU9QTG*?M$o0aS9%D1 z_&eR)z^W_Gd)>9ZH3#m?2>z)#R_tKpl-s3vGQfiXUnrr1nZ?MGGaV^D64;`69JZPF zee-Ua6`up2Wt_2#T=egEw_bNpx2Cb^#kQ6Dl-amJ5EQRc#tz{Defhii!Mi;PMMC)4 zjCsw6J=L6D{qwohKTw(JnD_IvIOP)4q#V?szM2g|bK91sz5U?$pZ{m`t@gLue&mHK zo^?OyRca;jI4)4(bV*w{Xr!5s4ag!|IrQh{osx)~CTaJ@(OHn+*7yq7FIv@tURqTd zx*>4VBS}>m1v8*v!7#P)iK?u*dLY}D&e&h3`FTxQK_W_MeQms-NSxEeAM(GCY>SPTq5)o5;+sJXsdB~BdZmfkEBsbj$qf6>F1|DC9W z8$c1NG+TMc4EnN24;> z9f@_c=v!W$>_fyfdEg-`7lPwqgiKOO+6H2d4K1LxD#5LFZ%7MEwwcqHWpgUoPdMqM zwG9`NLukCkA=1LNsp6Jq^Bq~p#7&210@JaaWeh8(Fv12c!7`2p+G=TJSk^61DP?pt zCy9}CUNb%U5ixiSva+ei+mNBy=Qcmxpj1oOkW?g)vA}c0U5Q*LJ~*+-}+V-H=)M{5tsq z{-NWZuNQ*r0Cm;;y_N>m>^QBrp;2OK{SlM{a6UyElEo0&%=nlJlB@)ExY_+`nBI~%j*K!{o!b7XBLW6KNW<+r1H!#jU<60=)exGrYa?l# z4-o^s)`n3TFs@eY#D4r2d$27&qLHyJdq2i%Jgu(*b6jF`84%AU+h^rn&8|LTOMVFO z;@mw?IziRu94szV42F6USlDd3`G$xe^i|!VDwaROsGtExmfig@QAj`0u_Jd}(@B3;;9lZXWMiF7D%tsa1Yxx1h#Ql^L;sx1Bf@&~rqZLy*wNjnw 
z^=)Ph>d=S0`|-n~@0<4h*JP>BT?`wJrgww0PNE(zy@@z*6Gy?kborg>!jV0Wb+aJ1kUnAW^jS?&>b>eYt^#U_<5QyD7 zw-h&191--}v57~srVGs447sdbSEQxi8*r`hhO3eK%dl;3cf^swfv5{1!`GEzDB^dz zlms=Jn{qLVdLtVn=H__3U4k_5;!k*`9sIO~-4*{&fU@F6@DH^{+3E|JPA)OZR{yFZtqkDeh_gIY%s*IP&0a zJZ{(Z#~4T5h1J$?X7y=;`WT)nnDyXtbo{NeoDQATCLxj73R7z$L#iYgYn7;(Z+`rf z8GjzNrv#L!Kao1o{>ro<^bioD!<}qrCP}bJHj+>y6Rd#;3H5VMrot{3 z&BZ7FY`3H^S>K*D-*vMQZlA_Xv#0Ak*r)$X+1}gA_8;Sz@x(`}!-nd1fo1|O?OnlH zv6ZrC+oazc@j`Z@Hb5BkaY`~VE4FYLOg}U#TCW%t0AcJm#_ru1MUut=vzSq6Gfu@H zxz{dPN7Xa#8_0N>UPRPpomOKSLaXDko?`v*4ut9$p^g|oIS4Y?*^^kR$NbJ*#u^M{pz zs@h?WJ{204)^gKqEHx~Y>~w}RE70f?6d~Z_?hSHkF)cw5BY+>lzr#mzg(2+S&A7(y zXTtZ!MZ-hD!urz?F-5?t-uoL>uE8k!e5B5{VNtzpJT{YC^1)dF%&Y%3w(X8;$=GKl z1{CHNR}N5qAs)uEz--Z%+(;V0=MC4 z7ggwN;4vOq=Q*_AdM?*(I*(sca>BY5UT{Y2|Gw)O>L}IG(+bK zd#`&vLE;cL1D-aO9KmZkXAschm8E&zPH_psw%aLcq*OZ?yfUb%@p}EC{KPj*BK#+X ziE;=-(rTYWK(5$Q!#plD+iXcHq10U*C`|akZOq%4fwhmqT{NceUX{;llJdy7Zm@`k z!TizXWK2|ZqDk7)yr@|{QTzkRq=I+u%|TlOoB_wUQRMkiGhtid)qZBD=GZWQUhb5$ zyHQ%WThkEYooKll)&Ms{C7HK3dS|;zfvu3UC%ID)v~qGM=VZhU#FCZA8MQ+Ve!T=q zX`;#}xMRtl!YZ#Q(TOH0hfG^0!B5S1*`Pyzk$}}~ZlSLjuQTROR$cqfXHxQH=+r#F zTfGPw2$xgLo^JHYE_?uN!>oL-RELp*u9qO#r!R4wfKU<%EB>E!yP*{NfO9pvfIF2- zb_ig~8axi8iXV=Q^E}3>UJm1>Z!r0-AYAgZt5Y-_$$kzOt48^kn4;{8j+3UKRB3Totl_8&C+t~tQj`o>qBJVNlP0am_EeV~B5u`r`z%9UUCpdI zVfXaah%DUA4KMsAk9=t5ydGYg@Vyw1Y;cd-6usg^uo59&9EtRBg0amxG`dOO@b}}r zhVI(mdLhSI@XeovaJ?HIjiy3D zIMKeHB=06|rBxBh1Ks9{nub*!g;0bcGA?|G-o&Ug>+))>Butzg0K4|MIapfGakDspBv<9rFvE@w}}v;88G|3QMd~CP$SF zTaDC6EG$BuA;&hvAWREYsRyOW5pv_QgnT}_1SYeBZruKag#BObomEg=;g*G)K(OHM z?(Q@i8gJYsIE~Y|2TNnYt#JZ`03oI3JnS0--=3$=B+j%&3 zYVUvTz5Z_nLH*VT90^07%AL9x;haDW4>m8oK!bOMOiK+$2|)NLdU*`98i6zlH$N9+ zy0-v%2g6(k85OB_M#vir;u=+hHfq1g_a-Nyw7dibeZED8#rh_>2QDd61_UI@ zlFku#>zzdX8UleL38FPEMzhpB@mcdv+*tOX*8ubz*1BUv0W*U%8hZcCRUfq^?y{#*>n3lGiQG6M(J#1UlvYFaT z(Y}3@0t>E{AfWXO09E~tJ&w`dNYi8idZPC5Ilv7$S1-am z&?>O(N@100Zzgi}Dr)(zIMW&$`MF2AZ^HTp%ZNZ%zp*F01L4HOYr}idH@=oUuRyd= 
z&;TQDMq)_{i_@rTCN;!y7RJmdo%)u$Qbt&b$HR4h;3j0$Y@C{I#)fv{A`Tp-2BM-) zWw$=id3&JLX+2{~>VqDiq%4Or83k8--?^O}GP}HPS!v+8WLMjf{q?Bk;&leHu*%N) z8=dQhM>t9%GD{og%NuZGSd*@OgIOiw&_&tbjgs?Wv(ofXc=D(USI8wyHJRhQ@f?4_ zJ%cGbFG*!OOWPGh@8nSAmOE{mdGu{_ZpK^a;GH}0@k*FMHLvVe3O z@Q_&m<6UuWzCWrz5q$=fmS)SbIll$n9xa{BHO4ZVWhty*JpCfyFIzs5h@{50I29%Oy6Oif5Rds&95-XY=~p4=!Lk(AW;S$u1fP{ z%YA+Kd5wWE$ng;Vr%u{wCOyom5Ln~mt8h$%hc?^uOnEFNuUNtPdSCx^Lk5&br|+#i zacyPiwAF&!%^YPFPh6|Qx5X|VtG%_z$+m{;uiV2fOaeAp7Ha}#3^a5CA4Zf&OuJR5 zv}ek#;GCUE3}G-C6hpuvo|jU{DPA+L70f~a7J)ZTV5}lRdN^vGF{AKm*L4r;{%bTf z5FT5Amzd;h(u`QM2KB7yJ)B;_?13_7sDHX;2evk)fXEkH&F+S(Wqu+b8JP-dT>VH|q-O;Hs`! zX18C__mXHu>kr}?PUlI9{w^k$$~KNtQIHSG3HdYt$>->pe!Hr1^2Pfg{A!(!=64S$5YW4yb5$pqAvd zc8iiKT`&7uYMw^y`5zfQlaVjA#eRiK)J`6c=#{F0{KY%bRIPi($r1v5E!W^7QGq9O z2~+EtrflwCX4i?|-;%ZrpJg`$^L&13|4Mh|=r^86n)qlcP7W>bP9;*|8h~cSq)Xrv zIA&_%8`laR^xxB`2!h`k$$~K4)08$p=?;J;JPYE@Y$wbD;D4HVHJ`|Dx34&p!r{`` zCV4AehmO%20hCs8-p@Z1wq);%69j+Qcji){t?N;~z7{40y>9CeL)E8{a+7S?pFZ}O zT0c(z5v;UWe47s1ELk*j%IV z<5PC1$V9j$nio*=*$^7sgB&=yUmuIsR93iUZF-a|UW6%=rhE%r6@}!S5EH%`tquH4 z4U;di`A}R5T8d-azg3E`>rb)-HlXoXOzYh3S1WNF^qjEU^ZpJ%O1auzFIlCZkBiBM zk=EW6-j4OAS?Uz>0j%+`w6YX5(2kOE$cUKkM(L2H;?#}=>UVo$cM2&{j{f(z@8Aq` z_NGm~WE}f?T_aw57eR|d)){=vjG2NXon8q-G{47Cw6WRoBU(p-TG>u47^*}vKds(? 
zX*z^x*NjOTN{nMxO?C{#R&U0$U1ORN;5IKZU>xWY~NKDRg4A zVSMzEd))jbr8N8CJFyPENuAFK7f!?KhOnam)Q_+Cm#y6g_Mu7k8^ndS-xO@B{h)^P zXid-*4rOg(l%yGfG;D7+4vseh-T}gT*9Vj6w6tjck!zDUz?2a?SR%iO&J}EQAvMq= z>{=ox~R7h>xaB-OD^> z(6-#rI}gjh0B0$hKS!R~H=+95GZ;`4bVzJYiP0{|Zd zJ`JGF46y`gBxXmeKA(th$Jr#1Tg^e2T|k@VW+0|aR`tkVe#Xm}kdSwjtowE_P8Cfs zQ~mit`$m0+gY@~6FAlv8d97s1yyBJI)5c48;>|CAPPn40yB8qmTa7uE#>on^4h3_k z;%1Tw39aql#_8;w>s3cBT#;*i7{_2@p7aBw_g{d2kt#mSltaLuV162x|3;2^bJ)Do zmK7il?27+znti$3puK)EW|e5}@k6Drdxa*cu&PsHAhgcqlsA14R4Q;g4Bkd;6eXA5 zaeS+l8rxOI5c?wzeg>t$%EB`siNfj^A{bxYT%A8p51Rsg+>5MFcSlW@(8n8=L`&A2fVG zYZpUHeKw|4OaYIpjTB6OmVe53cVn;(%w5w(A?#nk3|y?Yg4e*r-+wiH%>dSyu3 zZ?(Hi#Hq3qAsr2P&ZX{rylW-y!$ZPs;yK{07OG*Sz*VF%$AXt~A@%K>x!Mh*_0^<} zJUSo+>h)nE@#;z7Y1o|)uXwDg;Rr+XA8>NQER^J;L78~m(y71Ess={jummW z2F|km)G~8PMR?N*Z=mMw_>KENHSp_FYHCE6HRl+Q^Rs9~p9dzF#WyDr6B7Ju-)3xS?fb(uDzZx{|!gAiu;7_M|1D#XEXKvFzrwMpF*+s;#n<{zFaz zjss2FEnZ9ESoSsd0MEVjmY*>lT4PKZSSuva#>+L`u5B4AS4q0dkOhdDj{aP*60~q! zt4SXY#Ai9_QS3vx9(o6*5Yy0SuAe|1o+?c-i;GR>r+5wkSkAr0lX7D9J~IyX%(P%Q z9=kQwU5M%T>X86gOa-C29%?ssnZ~mK8*Pe&3)NHf8LN^ZG3T(v*j>Lk)e`xGkLAx1 zj>BEyOxYv2k*FltLyTn!blU&0>ZGNYRH|=L-H%mY=0pi})6p z=;E+v*34X9Iv4QSJSvtVBA(q<4^6%)4b>CeX;0E!7EL*41@Uff%a9oZ{`84wyRLSI zGdX_CeTL&1E?EKqJjLe6EVkBY5I0>2Th7XW@4IsX!*PXISy{b--Ha`Y&JD z*Nabg(Q(&08>msI)!rL)7V$S5Bwqrn`Y^CW%^Mrya)b@f zCj_WeZwQ(@>yiFkL+b99USpn&4DD**pJ^3J)g>x=S@m9h)rzT)WBSPW)=VqF%mo%(K5$6>kwvc1 z{Q1OrU3Hh1uC{F1(kp=8`yoWSX9SBELSvcE8f&L|3Z18axTk-R$aairC*jU@?bUjd zqbgT?>&oi3{o(_V0`ErcB{oi|MB?$q`ZS9!CqT7OVMb@^TO(7%W+Ne=yg1Hlk?ZgG zwL}d>u~-Z{WZGHI-W3e{>bgBAR>x($`9Ht3h@V-lg!g?E{iAWSju3R3;vhKc5(8qh zQWB)c#wCv^;y=TD#Z_BXrc1-1k8Hv$RhcWEAKdcG>`$vJSM(}5LznQo|CKA^*m6L) zBnAqFBaLE_-`qz?T5&44eDR04qm~zLh3`QpRYGWASlBV2g$UQ}3OtmrmJ}GIcR{ z%~?KhGlw2m*e`FsG7z9a93{0`%D?ru-SYhWl?{{>Y%D`ia6Q%{*vo;Bm^5t+SJ@1zS`M(t39z(-#Tg;-ZlCvUO#AJ9!X8P`M3mQffy zSel3rO)nTyZ8;MTu&M%EeUA2oQRR}x-w3iB*{>v4vANNuaY+y~p)_(NAkqfcw3tt@mqJky8-;?E(L`ATm z(Q&DARQqkpj6jSc2kl;B^eH*tc-)Txo&0R>L=sWaI%Iw-xy3dGS;DUdKlt?u)_&5j 
zr7)2*tZS6vqIX)0*9&<8yD7_f`DZk!kpA2?>2017G_}GBxYk${onggyY4rdiPlfAMOX3^w=}iM24*VGhgp^hZY;3zSd-lo-Ag!PFDDh zRr!Rz0=;Id;WXo~Yj~&DUpiV`^N3K0)xL;rh?n1Ku;-`I zrUBx+Mni5)G+?)7^o?VHNhu$OMuS%$Eek}hK3qT9xg{4m=?MJ!S6u2|($hp?b+6;} zU%)TU)4ChOy*j}-2kr?0Lnd+zKL8$Pzl(;=@# zP+NHFsAhr&evKyxebW(59AkfV&mJ+PwpU-nu|!oqAA{O;0N#;Hl6| z^s_jLScpZ@K9%D6dLk*$~NZ^bdw-RU+? zaLalqLUqBcdd>vzN;;zaN}_WDvSu z^Zlst>A)>RhFMJSP;v9QOqWTR^fc1^vt-8kSanc`T(BFT|5kgjO1&HfuVQS!G2S~M zyyyHaTN9Qd5_GCCs$b_`(YN8QS~0Y--so1ABaEbR2A z!xbEICXcH5SS*<)y}ErX_>>YY3FtV{f_6L*B5B)ph{fdU9pcX-yaK<+vM~(QYL;FvJg`vt5$&yPKwiMmS0n8_DQ}8G4R7%n}&S0O7KdSea_3Zu=?gf7vC~ol(C!)Pd8sw zI!tQH=MfuG{-Edt#gPtb?036xR}pq-m@^UhbXGqMlvbRMu82G8$he=zDe^(qNf6_o ziKH3-+~^DNKEKXesI9x=X#$)!Ypi`Y}Xl=!C~i#JlAkKFk4`g-kBy({dU(7CANKHYOtA3i0g?( zumFLRBhvp62pU&G85ijs0om;;T5Te?c{V!%Oe zj{d!nhIDb-*x&*Hyk3bkWE_>rkd*hU=^yD8nf?2*2DzS(cDVj6MIC)zw#q*-)mbDK z7b;EVc3rhnG;yYo=jGNLw_=>{Q`Cq)NGjg=Agcsdt6<6+xFSK2j5KuSnti=A|4|$j z<`dP(Nyut9o1s&@j7V<9{;Xs!>%f?PvANOvbJ2r%rp}M}CE|B1k%xU#6r7S&ONB7z z@OXqP9TAg0BL1pmrUrWthY|uc0(2?fttHj)dG;@(~!Eb|es#$7`&%KMqCm>*{$3$vI z6+2J=!wFypx`UcXaZba2d<`)WE2GsDX{_4%4`Fcn7tkVugxs@LUNIdy`~|EVyJ&CI ziQm4w`8f3#nT=Nzd3Sp2lsVlmq-~mK4FBOj(~+=aZF8#um8h0yM@z2U$v@(XH1PZ( zGQrIh?j>}zC+Jupk!j>);e(C`E=cF)ekE4{HzKTS`&QOhceK zq|^Be$Tz=DTGT4M6-{;!kg+8c6oN2QD7Gq0JX*J7i5ccGN!HdRzY%a3C5?u8P{e^`y3$n_eDf5S(NVO|QC(mJj6|b!NIY!)Jb5& Date: Thu, 15 Jan 2026 22:24:21 +0800 Subject: [PATCH 39/59] [BugFix] Fix incorrect mrope positions under cuda graph (#803) Signed-off-by: ZeldaHuang --- vllm_omni/worker/gpu_model_runner.py | 2 +- vllm_omni/worker/npu/npu_model_runner.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index b0d0e165e08..e10301a520f 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -94,7 +94,7 @@ def _init_mrope_positions(self, req_state: CachedRequestState): if use_audio_in_video_value is not None: 
use_audio_in_video = bool(use_audio_in_video_value.item()) - if supports_mrope(self.model): + if supports_mrope(self.get_model()): req_state.mrope_positions, req_state.mrope_position_delta = self.model.get_mrope_input_positions( req_state.prompt_token_ids, mm_features=req_state.mm_features, diff --git a/vllm_omni/worker/npu/npu_model_runner.py b/vllm_omni/worker/npu/npu_model_runner.py index 1083bbe40d4..264d5e3413b 100644 --- a/vllm_omni/worker/npu/npu_model_runner.py +++ b/vllm_omni/worker/npu/npu_model_runner.py @@ -64,7 +64,7 @@ def _init_mrope_positions(self, req_state: CachedRequestState): if use_audio_in_video_value is not None: use_audio_in_video = bool(use_audio_in_video_value.item()) - if supports_mrope(self.model): + if supports_mrope(self.get_model()): req_state.mrope_positions, req_state.mrope_position_delta = self.model.get_mrope_input_positions( req_state.prompt_token_ids, mm_features=req_state.mm_features, From 35f994e2558e724bc64f3adf8903b5f1db8ab167 Mon Sep 17 00:00:00 2001 From: Peiqi Yin <60515999+yinpeiqi@users.noreply.github.com> Date: Fri, 16 Jan 2026 00:41:59 +0800 Subject: [PATCH 40/59] [BugFix] Qwen2.5-omni supress end token and won't stop (#773) Signed-off-by: yinpeiqi --- .../models/qwen2_5_omni/qwen2_5_omni_talker.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_talker.py b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_talker.py index 46175568e07..7b0b4430917 100644 --- a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_talker.py +++ b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_talker.py @@ -155,7 +155,18 @@ def forward( def bad_word_processor(self, logits: torch.Tensor) -> torch.Tensor: # suppress token IDs unsupported by token2wav if self.suppress_start_id and self.suppress_start_id < logits.size(-1): - logits[..., self.suppress_start_id : logits.size(-1)] = -1e9 + # skip the end token id. 
+ if hasattr(self.config, "tts_codec_end_token_id"): + end_id = int(getattr(self.config, "tts_codec_end_token_id")) + if self.suppress_start_id == end_id: + logits[..., end_id + 1 : logits.size(-1)] = -1e9 + elif self.suppress_start_id < end_id: + logits[..., self.suppress_start_id : end_id] = -1e9 + logits[..., end_id + 1 : logits.size(-1)] = -1e9 + else: + logits[..., self.suppress_start_id : logits.size(-1)] = -1e9 + else: + raise ValueError("config must have tts_codec_end_token_id attribute") if hasattr(self.config, "tts_codec_start_token_id"): bos_id = int(getattr(self.config, "tts_codec_start_token_id")) From 43ec7f701b8a0f027a66d9b0595b25cd7a93a41c Mon Sep 17 00:00:00 2001 From: Didan Deng <33117903+wtomin@users.noreply.github.com> Date: Fri, 16 Jan 2026 11:01:11 +0800 Subject: [PATCH 41/59] [Feature] Flash Attention to Support Attention Mask (#760) Signed-off-by: Didan Deng <33117903+wtomin@users.noreply.github.com> --- tests/diffusion/attention/test_flash_attn.py | 290 ++++++++++++++++++ .../attention/backends/flash_attn.py | 58 +++- .../attention/backends/utils/__init__.py | 13 + .../diffusion/attention/backends/utils/fa.py | 209 +++++++++++++ vllm_omni/diffusion/attention/selector.py | 2 + .../qwen_image/qwen_image_transformer.py | 6 +- 6 files changed, 566 insertions(+), 12 deletions(-) create mode 100644 tests/diffusion/attention/test_flash_attn.py create mode 100644 vllm_omni/diffusion/attention/backends/utils/__init__.py create mode 100644 vllm_omni/diffusion/attention/backends/utils/fa.py diff --git a/tests/diffusion/attention/test_flash_attn.py b/tests/diffusion/attention/test_flash_attn.py new file mode 100644 index 00000000000..3f3862405ed --- /dev/null +++ b/tests/diffusion/attention/test_flash_attn.py @@ -0,0 +1,290 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Test script for FlashAttention backend with padding handling. + +This script tests two main scenarios: +1. 
Case 1: Comparing padded vs unpadded inputs for batch_size=1 +2. Case 2: Comparing FlashAttention and SDPA backends for batch_size=2 with padding +""" + +import pytest +import torch + +from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata +from vllm_omni.diffusion.attention.backends.flash_attn import FlashAttentionImpl +from vllm_omni.diffusion.attention.backends.sdpa import SDPAImpl + + +def create_attention_mask(batch_size: int, seq_len: int, valid_len: int, device: torch.device) -> torch.Tensor: + """ + Create attention mask where first valid_len tokens are valid (1) and rest are padding (0). + + Args: + batch_size: Batch size + seq_len: Total sequence length (including padding) + valid_len: Number of valid (non-padded) tokens + + Returns: + Attention mask of shape (batch_size, seq_len) + """ + mask = torch.zeros(batch_size, seq_len, dtype=torch.bool, device=device) + mask[:, :valid_len] = True + return mask + + +def pad_tensor(tensor: torch.Tensor, target_seq_len: int, pad_value: float = 0.0) -> torch.Tensor: + """ + Pad tensor along sequence dimension (dim=1). + + Args: + tensor: Input tensor of shape (batch_size, seq_len, num_heads, head_dim) + target_seq_len: Target sequence length after padding + pad_value: Value to use for padding + + Returns: + Padded tensor of shape (batch_size, target_seq_len, num_heads, head_dim) + """ + batch_size, seq_len, num_heads, head_dim = tensor.shape + if target_seq_len <= seq_len: + return tensor + + padding = torch.full( + (batch_size, target_seq_len - seq_len, num_heads, head_dim), pad_value, dtype=tensor.dtype, device=tensor.device + ) + return torch.cat([tensor, padding], dim=1) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="FlashAttention requires CUDA") +def test_padding_equivalence(): + """ + Case 1: Test that padded and unpadded inputs produce similar outputs. 
+ + - Input A: batch_size=1, hidden_states (1, 48), encoder_hidden_states (1, 16) + Concatenated length: 64, NO attention_mask + - Input B: Same data but padded: hidden_states (1, 58), encoder_hidden_states (1, 26) + Concatenated length: 84, WITH attention_mask + + Expected: Output A and Output B should be very close. + """ + device = torch.device("cuda") + dtype = torch.bfloat16 + + # Configuration + batch_size = 1 + hidden_seq_len = 48 + encoder_seq_len = 16 + pad_length = 10 + num_heads = 8 + head_dim = 64 + + # Initialize FlashAttention + fa_impl = FlashAttentionImpl( + num_heads=num_heads, head_size=head_dim, softmax_scale=1.0 / (head_dim**0.5), causal=False + ) + + # Create base tensors with random values (same for both A and B) + torch.manual_seed(42) + hidden_states_base = torch.randn(batch_size, hidden_seq_len, num_heads, head_dim, device=device, dtype=dtype) + encoder_hidden_states_base = torch.randn( + batch_size, encoder_seq_len, num_heads, head_dim, device=device, dtype=dtype + ) + + # ========== Input A: Unpadded, no attention mask ========== + query_a = torch.cat([hidden_states_base, encoder_hidden_states_base], dim=1) + key_a = query_a.clone() + value_a = query_a.clone() + + attn_metadata_a = AttentionMetadata(attn_mask=None) + + output_a = fa_impl.forward(query=query_a, key=key_a, value=value_a, attn_metadata=attn_metadata_a) + + # ========== Input B: Padded with attention mask ========== + hidden_states_padded = pad_tensor(hidden_states_base, hidden_seq_len + pad_length) + encoder_hidden_states_padded = pad_tensor(encoder_hidden_states_base, encoder_seq_len + pad_length) + + query_b = torch.cat([hidden_states_padded, encoder_hidden_states_padded], dim=1) + key_b = query_b.clone() + value_b = query_b.clone() + + # Create attention mask + attn_mask_b = torch.cat( + [ + create_attention_mask(batch_size, hidden_seq_len + pad_length, hidden_seq_len, device), + create_attention_mask(batch_size, encoder_seq_len + pad_length, encoder_seq_len, device), + 
], + dim=1, + ) + + attn_metadata_b = AttentionMetadata(attn_mask=attn_mask_b) + + output_b = fa_impl.forward(query=query_b, key=key_b, value=value_b, attn_metadata=attn_metadata_b) + + # Extract non-padded portion from output_b + output_b_unpadded = torch.cat( + [ + output_b[:, :hidden_seq_len, :, :], + output_b[:, hidden_seq_len + pad_length : hidden_seq_len + pad_length + encoder_seq_len, :, :], + ], + dim=1, + ) + + # Compare outputs + max_diff = torch.max(torch.abs(output_a - output_b_unpadded)).item() + mean_diff = torch.mean(torch.abs(output_a - output_b_unpadded)).item() + + print("\n=== Case 1: Padding Equivalence Test ===") + print(f"Output A shape: {output_a.shape}") + print(f"Output B shape: {output_b.shape}") + print(f"Output B unpadded shape: {output_b_unpadded.shape}") + print(f"Max absolute difference: {max_diff:.6f}") + print(f"Mean absolute difference: {mean_diff:.6f}") + + # Assert that outputs are close + # Using higher tolerance for bfloat16 + assert max_diff < 0.1, f"Max difference {max_diff} exceeds threshold 0.1" + assert mean_diff < 0.01, f"Mean difference {mean_diff} exceeds threshold 0.01" + + print("✓ Case 1 PASSED: Padded and unpadded outputs are very close!") + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="FlashAttention requires CUDA") +def test_fa_vs_sdpa(): + """ + Case 2: Compare FlashAttention and SDPA backends with padding. + + - batch_size=2 + - hidden_states: (2, 48) padded to (2, 58) + - encoder_hidden_states: (2, 16) padded to (2, 26) + - Concatenated length: 84 + - Compare FA and SDPA outputs + + Expected: FA and SDPA outputs should be very close. 
+ """ + device = torch.device("cuda") + dtype = torch.bfloat16 + + # Configuration + batch_size = 2 + hidden_seq_len = 48 + encoder_seq_len = 16 + pad_length = 10 + num_heads = 8 + head_dim = 64 + + # Initialize both backends + fa_impl = FlashAttentionImpl( + num_heads=num_heads, head_size=head_dim, softmax_scale=1.0 / (head_dim**0.5), causal=False + ) + + sdpa_impl = SDPAImpl(num_heads=num_heads, head_size=head_dim, softmax_scale=1.0 / (head_dim**0.5), causal=False) + + # Create base tensors + torch.manual_seed(123) + hidden_states_base = torch.randn(batch_size, hidden_seq_len, num_heads, head_dim, device=device, dtype=dtype) + encoder_hidden_states_base = torch.randn( + batch_size, encoder_seq_len, num_heads, head_dim, device=device, dtype=dtype + ) + + # Pad tensors + hidden_states_padded = pad_tensor(hidden_states_base, hidden_seq_len + pad_length) + encoder_hidden_states_padded = pad_tensor(encoder_hidden_states_base, encoder_seq_len + pad_length) + + # Concatenate + query = torch.cat([hidden_states_padded, encoder_hidden_states_padded], dim=1) + key = query.clone() + value = query.clone() + + # Create attention mask + attn_mask = torch.cat( + [ + create_attention_mask(batch_size, hidden_seq_len + pad_length, hidden_seq_len, device), + create_attention_mask(batch_size, encoder_seq_len + pad_length, encoder_seq_len, device), + ], + dim=1, + ) + + attn_metadata = AttentionMetadata(attn_mask=attn_mask) + + # Run FlashAttention + output_fa = fa_impl.forward(query=query.clone(), key=key.clone(), value=value.clone(), attn_metadata=attn_metadata) + + # Run SDPA + # SDPA expects 4D attention mask: (batch_size, 1, seq_len, seq_len) or (batch_size, seq_len) + # For causal=False, we need to convert 2D mask to 4D + if attn_mask is not None: + # Expand mask for SDPA: (batch_size, seq_len) -> (batch_size, 1, 1, seq_len) + attn_mask_4d = attn_mask.unsqueeze(1).unsqueeze(2) + # Convert bool to float: True -> 0.0, False -> -inf + attn_mask_float = 
torch.zeros_like(attn_mask_4d, dtype=dtype) + attn_mask_float.masked_fill_(~attn_mask_4d, float("-inf")) + attn_metadata_sdpa = AttentionMetadata(attn_mask=attn_mask_float) + else: + attn_metadata_sdpa = AttentionMetadata(attn_mask=None) + + output_sdpa = sdpa_impl.forward( + query=query.clone(), key=key.clone(), value=value.clone(), attn_metadata=attn_metadata_sdpa + ) + + # Compare outputs (only compare valid regions) + output_fa_valid = torch.cat( + [ + output_fa[:, :hidden_seq_len, :, :], + output_fa[:, hidden_seq_len + pad_length : hidden_seq_len + pad_length + encoder_seq_len, :, :], + ], + dim=1, + ) + output_sdpa_valid = torch.cat( + [ + output_sdpa[:, :hidden_seq_len, :, :], + output_sdpa[:, hidden_seq_len + pad_length : hidden_seq_len + pad_length + encoder_seq_len, :, :], + ], + dim=1, + ) + + max_diff = torch.max(torch.abs(output_fa_valid - output_sdpa_valid)).item() + mean_diff = torch.mean(torch.abs(output_fa_valid - output_sdpa_valid)).item() + + print("\n=== Case 2: FA vs SDPA Comparison ===") + print(f"Batch size: {batch_size}") + print(f"FA output shape: {output_fa.shape}") + print(f"SDPA output shape: {output_sdpa.shape}") + print(f"Max absolute difference (valid region): {max_diff:.6f}") + print(f"Mean absolute difference (valid region): {mean_diff:.6f}") + + # Assert that outputs are close + # Using higher tolerance for bfloat16 and different implementations + assert max_diff < 0.01, f"Max difference {max_diff} exceeds threshold 0.01" + assert mean_diff < 0.001, f"Mean difference {mean_diff} exceeds threshold 0.001" + + print("✓ Case 2 PASSED: FA and SDPA outputs are very close!") + + +if __name__ == "__main__": + print("Running FlashAttention Padding Tests...") + print("=" * 60) + + # Try to run CUDA tests + if torch.cuda.is_available(): + try: + print("\n[Running Case 1: Padding Equivalence for FA]") + test_padding_equivalence() + except Exception as e: + print(f"✗ Case 1 failed: {e}") + import traceback + + traceback.print_exc() + + try: + 
print("\n[Running Case 2: FA vs SDPA]") + test_fa_vs_sdpa() + except Exception as e: + print(f"✗ Case 2 failed: {e}") + import traceback + + traceback.print_exc() + else: + raise RuntimeError("CUDA is not available") + print("\n" + "=" * 60) + print("Test suite completed!") diff --git a/vllm_omni/diffusion/attention/backends/flash_attn.py b/vllm_omni/diffusion/attention/backends/flash_attn.py index 6921c79aee9..3623d49db8f 100644 --- a/vllm_omni/diffusion/attention/backends/flash_attn.py +++ b/vllm_omni/diffusion/attention/backends/flash_attn.py @@ -9,13 +9,14 @@ AttentionImpl, AttentionMetadata, ) +from vllm_omni.diffusion.attention.backends.utils.fa import _pad_input, _unpad_input, _upad_input logger = init_logger(__name__) try: # only tested with flash_attn v3 # from flash_attn_interface import flash_attn_func as flash_attn_3_func # not available in flash-attn 2.8.1 - from flash_attn import flash_attn_func # can be FA2 or FA3 + from flash_attn import flash_attn_func, flash_attn_varlen_func # can be FA2 or FA3 except ImportError: logger.warning( "FlashAttentionBackend is not available. You may install flash-attn " @@ -63,12 +64,51 @@ def forward( value: torch.Tensor, attn_metadata: AttentionMetadata = None, ) -> torch.Tensor: - # TODO: flash_attn_func does not support attn_mask. - out: torch.Tensor = flash_attn_func( - query, - key, - value, - causal=self.causal, - softmax_scale=self.softmax_scale, - ) + """ + Flash attention implementation. + + Args: + query: (batch_size, seq_len, num_heads, head_dim) + key: (batch_size, seq_len, num_heads, head_dim) + value: (batch_size, seq_len, num_heads, head_dim) + attn_metadata: AttentionMetadata. 
Attention mask is supported as attn_metadata.attn_mask + + Returns: + (batch_size, seq_len, num_heads, head_dim) + """ + query_length = query.size(1) + attention_mask = attn_metadata.attn_mask if attn_metadata is not None else None + # Contains at least one padding token in the sequence + if attention_mask is not None and torch.any(~attention_mask): + assert attention_mask.ndim == 2, "attention_mask must be 2D, (batch_size, seq_len)" + q, k, v, indices_q, (cu_seq_lens_q, cu_seq_lens_k), (max_length_q, max_length_k) = _upad_input( + query, key, value, attention_mask, query_length, _unpad_input + ) + + out_unpad = flash_attn_varlen_func( + q, + k, + v, + cu_seqlens_q=cu_seq_lens_q, + cu_seqlens_k=cu_seq_lens_k, + max_seqlen_q=max_length_q, + max_seqlen_k=max_length_k, + **{ + "causal": self.causal, + "softmax_scale": self.softmax_scale, + }, + ) + if isinstance(out_unpad, tuple): + out_unpad = out_unpad[0] + + out = _pad_input(out_unpad, indices_q, query.size(0), query_length) + + else: + out: torch.Tensor = flash_attn_func( + query, + key, + value, + causal=self.causal, + softmax_scale=self.softmax_scale, + ) return out diff --git a/vllm_omni/diffusion/attention/backends/utils/__init__.py b/vllm_omni/diffusion/attention/backends/utils/__init__.py new file mode 100644 index 00000000000..92c7c8027cb --- /dev/null +++ b/vllm_omni/diffusion/attention/backends/utils/__init__.py @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Utils for attention backends. 
+""" + +from vllm_omni.diffusion.attention.backends.utils.fa import _pad_input, _unpad_input, _upad_input + +__all__ = [ + "_pad_input", + "_unpad_input", + "_upad_input", +] diff --git a/vllm_omni/diffusion/attention/backends/utils/fa.py b/vllm_omni/diffusion/attention/backends/utils/fa.py new file mode 100644 index 00000000000..d89082e717a --- /dev/null +++ b/vllm_omni/diffusion/attention/backends/utils/fa.py @@ -0,0 +1,209 @@ +# Copyright 2025 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_flash_attention_utils.py +import torch +import torch.nn.functional as F + + +def _index_first_axis(tensor, indices): + """ + A local implementation of the PyTorch indexing operation `tensor[indices]` on the first axis, + after flattening the first two dimensions of the tensor. This is functionally equivalent to + FA2's `index_first_axis` and replaces the need to import it. + """ + # The input tensor is expected to be of shape (batch, seq_len, ...). We flatten the first + # two dimensions to get (total_tokens, ...) before indexing. + reshaped_tensor = tensor.reshape(-1, *tensor.shape[2:]) + return reshaped_tensor[indices] + + +def _unpad_input(hidden_states, attention_mask, unused_mask=None): + """ + unpad_input function for flash attention variants that do not have them within their pkg themselves, e.g. fa3. 
+ + Arguments: + hidden_states: (batch, seqlen, ...) + attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid. + unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused. + + Return: + hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask. + indices: (total_nnz), the indices of masked tokens from the flattened input sequence. + cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states. + max_seqlen_in_batch: int + seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask. + """ + all_masks = (attention_mask + unused_mask) if unused_mask is not None else attention_mask + seqlens_in_batch = all_masks.sum(dim=-1, dtype=torch.int32) + used_seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(all_masks.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + + return ( + _index_first_axis(hidden_states, indices), + indices, + cu_seqlens, + max_seqlen_in_batch, + used_seqlens_in_batch, + ) + + +def _pad_input(hidden_states, indices, batch, seqlen): + """ + pad_input function for flash attention variants that do not have them within their pkg themselves, e.g. fa3. + + Arguments: + hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask. + indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence. + batch: int, batch size for the padded sequence. + seqlen: int, maximum sequence length for the padded sequence. + + Return: + hidden_states: (batch, seqlen, ...) 
+ """ + dim = hidden_states.shape[1:] + output = torch.zeros((batch * seqlen), *dim, device=hidden_states.device, dtype=hidden_states.dtype) + output[indices] = hidden_states + return output.view(batch, seqlen, *dim) + + +def _get_unpad_data(attention_mask: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, int]: + """ + Retrieves indexing data required to repad unpadded (ragged) tensors. + + Arguments: + attention_mask (`torch.Tensor`): + Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid. + + Return: + indices (`torch.Tensor`): + The indices of non-masked tokens from the flattened input sequence. + cu_seqlens (`torch.Tensor`): + The cumulative sequence lengths, used to index into ragged (unpadded) tensors. + `cu_seqlens` shape is (batch_size + 1,). + max_seqlen_in_batch (`int`): + Maximum sequence length in batch. + """ + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + # NOTE: Similar to the `.item()` in prepare_fa2_from_position_ids, with torch compile, + # this might cause a graph break + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def _upad_input( + query_layer: torch.Tensor, + key_layer: torch.Tensor, + value_layer: torch.Tensor, + attention_mask: torch.Tensor, + query_length: int, + unpad_input_func, +): + """ + Unpads query, key, and values tensors, using a single dimension for all tokens even though they belong + to different batches. This function is used instead of `flash_attn.bert_padding.unpad_input` in + order to avoid the recomputation of the same intermediary tensors for query, key, value tensors. + + Arguments: + query_layer (`torch.Tensor`): + Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim). 
+ key_layer (`torch.Tensor`): + Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim). + value_layer (`torch.Tensor`): + Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim). + attention_mask (`torch.Tensor`): + Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid. + query_length (`int`): + Target length. + unpad_input_func: + The function to use for unpadding the input tensors. + + Return: + query_layer (`torch.Tensor`): + Query state without padding. Shape: (total_target_length, num_heads, head_dim). + key_layer (`torch.Tensor`): + Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim). + value_layer (`torch.Tensor`): + Value state with padding. Shape: (total_source_length, num_key_value_heads, head_dim). + indices_q (`torch.Tensor`): + The indices of non-masked tokens from the flattened input target sequence. + (cu_seqlens_q, cu_seqlens_k) (`tuple[int]`): + The cumulative sequence lengths for the target (query) and source (key, value), used to index into + ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,). + (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`tuple[int]`): + Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, + `max_seqlen_in_batch_k` for the source sequence i.e. key/value). 
+ """ + if torch.compiler.is_compiling(): + # allow PyTorch compiler to include operations that return scalar values (like .item() + torch._dynamo.config.capture_scalar_outputs = True + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + # With static caches, the k/v states may be larger than the mask -> + # we need to slice them to avoid generating garbage + # It's a bit of an anti-pattern, but otherwise we silently compute wrong attentions scores + if key_layer.shape[1] > (seq_len := attention_mask.shape[-1]): + key_layer, value_layer = key_layer[:, :seq_len, :, :], value_layer[:, :seq_len, :, :] + + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = _index_first_axis(key_layer, indices_k) + value_layer = _index_first_axis(value_layer, indices_k) + if query_length == kv_seq_len: + query_layer = _index_first_axis(query_layer, indices_k) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q, *_ = unpad_input_func(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +def _is_packed_sequence(position_ids, batch_size): + """ + Check the position ids whether packed sequences are indicated or not + 1. Position ids exist + 2. Flattened sequences only are supported + 3. Compile-friendly `not (torch.diff(position_ids, dim=-1) >= 0).all()`, i.e. 
+ we have multiple increasing sequences + """ + if position_ids is None: + return False + + increasing_position_sequences = torch.arange(position_ids.shape[1], device=position_ids.device) + position_ids.min() + return batch_size == 1 and (increasing_position_sequences - position_ids).abs().sum().bool() diff --git a/vllm_omni/diffusion/attention/selector.py b/vllm_omni/diffusion/attention/selector.py index f2f60243f04..b9f62e0681d 100644 --- a/vllm_omni/diffusion/attention/selector.py +++ b/vllm_omni/diffusion/attention/selector.py @@ -31,6 +31,8 @@ "ASCEND": {"module": "vllm_omni.diffusion.attention.backends.ascend_attn", "class": "AscendAttentionBackend"}, } +_BACKENDS_SUPPORT_ATTENTION_MASK = ["SDPA", "ASCEND", "FLASH_ATTN"] + def load_backend(backend_name: str) -> type[AttentionBackend]: config = _BACKEND_CONFIG[backend_name] diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py index 472806aa778..8ac5014ce89 100644 --- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py +++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py @@ -23,7 +23,7 @@ AttentionMetadata, ) from vllm_omni.diffusion.attention.layer import Attention -from vllm_omni.diffusion.attention.selector import get_attn_backend +from vllm_omni.diffusion.attention.selector import _BACKENDS_SUPPORT_ATTENTION_MASK, get_attn_backend from vllm_omni.diffusion.cache.base import CachedTransformer from vllm_omni.diffusion.data import OmniDiffusionConfig from vllm_omni.diffusion.distributed.parallel_state import ( @@ -790,8 +790,8 @@ def forward( sp_size = get_sequence_parallel_world_size() if seq_len % sp_size != 0: - # flash_attn, ring_attn, sage_attn do not support attention_mask - if get_attn_backend(-1).get_name() != "SDPA" and get_attn_backend(-1).get_name() != "ASCEND": + # ring_attn, sage_attn do not support attention_mask + if get_attn_backend(-1).get_name() not in 
_BACKENDS_SUPPORT_ATTENTION_MASK: raise ValueError( f"When generating image shape that the sequence length is NOT divisible by sp_size={sp_size}," f"cannot use {get_attn_backend(-1).get_name()} which does not support attention_mask." From d28b059c6331278b9ad1b0f6079f5e5cd465724d Mon Sep 17 00:00:00 2001 From: tzhouam Date: Fri, 16 Jan 2026 05:02:36 +0000 Subject: [PATCH 42/59] support online serving for Qwen3 Omni Signed-off-by: tzhouam --- vllm_omni/entrypoints/chat_utils.py | 6 +- vllm_omni/entrypoints/openai/api_server.py | 246 ++++++++++++++---- .../openai/protocol/chat_completion.py | 4 +- vllm_omni/entrypoints/openai/serving_chat.py | 176 ++++++++----- .../entrypoints/openai/serving_speech.py | 2 +- .../models/qwen3_omni/qwen3_omni.py | 16 +- 6 files changed, 317 insertions(+), 133 deletions(-) diff --git a/vllm_omni/entrypoints/chat_utils.py b/vllm_omni/entrypoints/chat_utils.py index 6517ae666fe..a5a36bb636c 100644 --- a/vllm_omni/entrypoints/chat_utils.py +++ b/vllm_omni/entrypoints/chat_utils.py @@ -22,9 +22,6 @@ _postprocess_messages, _ToolParser, ) -from vllm.transformers_utils.tokenizer import AnyTokenizer - - class OmniAsyncMultiModalItemTracker(AsyncMultiModalItemTracker): def create_parser(self) -> "BaseMultiModalContentParser": return OmniAsyncMultiModalContentParser(self) @@ -129,7 +126,6 @@ def _cleanup_file_sync(file_path: str) -> None: def parse_chat_messages_futures( messages: list[ChatCompletionMessageParam], model_config: ModelConfig, - tokenizer: AnyTokenizer, content_format: _ChatTemplateContentFormat, mm_processor_kwargs: dict[str, Any] | None = None, ) -> tuple[ @@ -138,7 +134,7 @@ def parse_chat_messages_futures( MultiModalUUIDDict | None, ]: conversation: list[ConversationMessage] = [] - mm_tracker = OmniAsyncMultiModalItemTracker(model_config, tokenizer) + mm_tracker = OmniAsyncMultiModalItemTracker(model_config) for msg in messages: sub_messages = _parse_chat_message_content( diff --git a/vllm_omni/entrypoints/openai/api_server.py 
b/vllm_omni/entrypoints/openai/api_server.py index 49958eb66f6..6f4ebc94c92 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -16,29 +16,49 @@ from starlette.datastructures import State from vllm.config import VllmConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import load_chat_template, resolve_hf_chat_template, resolve_mistral_chat_template +from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.api_server import ( base, build_app, load_log_config, - maybe_register_tokenizer_info_endpoint, router, setup_server, validate_json_request, ) -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ChatCompletionResponse, ErrorResponse -from vllm.entrypoints.openai.serving_models import BaseModelPath, LoRAModulePath, OpenAIServingModels -from vllm.entrypoints.openai.tool_parsers import ToolParserManager +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, +) +from vllm.entrypoints.openai.engine.protocol import ErrorResponse +from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses +from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels +from vllm.entrypoints.openai.translations.serving import ( + OpenAIServingTranscription, + OpenAIServingTranslation, +) +from vllm.entrypoints.pooling.classify.serving import ServingClassification +from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding +from vllm.entrypoints.pooling.pooling.serving import OpenAIServingPooling +from vllm.entrypoints.pooling.score.serving import ServingScores +from vllm.entrypoints.serve.disagg.serving import ServingTokens +from 
vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization +from vllm.tool_parsers import ToolParserManager # yapf conflicts with isort for this block # yapf: disable # yapf: enable from vllm.entrypoints.tool_server import DemoToolServer, MCPToolServer, ToolServer -from vllm.entrypoints.utils import load_aware_call, with_cancellation +from vllm.entrypoints.utils import ( + load_aware_call, + process_chat_template, + process_lora_modules, + with_cancellation, +) from vllm.logger import init_logger -from vllm.tokenizers import MistralTokenizer +from vllm.tasks import POOLING_TASKS from vllm.utils.system_utils import decorate_logs from vllm_omni.entrypoints.async_omni import AsyncOmni @@ -88,6 +108,10 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None, if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: ToolParserManager.import_tool_parser(args.tool_parser_plugin) + if args.reasoning_parser_plugin and len(args.reasoning_parser_plugin) > 3: + from vllm.reasoning import ReasoningParserManager + + ReasoningParserManager.import_reasoning_parser(args.reasoning_parser_plugin) # Load logging config for uvicorn if specified log_config = load_log_config(args.log_config_file) @@ -98,7 +122,6 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None, args, client_config=client_config, ) as engine_client: - maybe_register_tokenizer_info_endpoint(args) app = build_app(args) vllm_config = await engine_client.get_vllm_config() @@ -305,29 +328,11 @@ async def omni_init_app_state( if vllm_config is not None: _model_config = vllm_config.model_config - resolved_chat_template = load_chat_template(args.chat_template) - if resolved_chat_template is not None and vllm_config is not None: - # Get the tokenizer to check official template - tokenizer = await engine_client.get_tokenizer() - - if tokenizer is not None: - if isinstance(tokenizer, MistralTokenizer): - # The warning is logged in 
resolve_mistral_chat_template. - resolved_chat_template = resolve_mistral_chat_template(chat_template=resolved_chat_template) - else: - hf_chat_template = resolve_hf_chat_template( - tokenizer=tokenizer, - chat_template=None, - tools=None, - model_config=vllm_config.model_config, - ) - - if hf_chat_template != resolved_chat_template: - logger.warning( - "Using supplied chat template: %s\nIt is different from official chat template '%s'. This discrepancy may lead to performance degradation.", # noqa: E501 - resolved_chat_template, - args.model, - ) + resolved_chat_template = await process_chat_template( + args.chat_template, + engine_client, + vllm_config.model_config if vllm_config is not None else None, + ) if args.tool_server == "demo": tool_server: ToolServer | None = DemoToolServer() @@ -340,23 +345,12 @@ async def omni_init_app_state( tool_server = None # Merge default_mm_loras into the static lora_modules - default_mm_loras = {} - if vllm_config is not None and vllm_config.lora_config is not None: - default_mm_loras = vllm_config.lora_config.default_mm_loras - - lora_modules = args.lora_modules - if default_mm_loras: - default_mm_lora_paths = [ - LoRAModulePath( - name=modality, - path=lora_path, - ) - for modality, lora_path in default_mm_loras.items() - ] - if args.lora_modules is None: - lora_modules = default_mm_lora_paths - else: - lora_modules += default_mm_lora_paths + default_mm_loras = ( + vllm_config.lora_config.default_mm_loras + if vllm_config is not None and vllm_config.lora_config is not None + else {} + ) + lora_modules = process_lora_modules(args.lora_modules, default_mm_loras) # Ensure input_processor, io_processor, and model_config exist for OpenAIServingModels compatibility if ( @@ -415,6 +409,30 @@ async def omni_init_app_state( lora_modules=lora_modules, ) await state.openai_serving_models.init_static_loras() + supported_tasks: set[str] = {"generate"} + if hasattr(engine_client, "get_supported_tasks"): + supported_tasks = set(await 
engine_client.get_supported_tasks()) + + state.openai_serving_responses = ( + OpenAIServingResponses( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, + enable_auto_tools=args.enable_auto_tool_choice, + tool_parser=args.tool_call_parser, + tool_server=tool_server, + reasoning_parser=args.structured_outputs_config.reasoning_parser, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_force_include_usage=args.enable_force_include_usage, + enable_log_outputs=args.enable_log_outputs, + log_error_stack=args.log_error_stack, + ) + if "generate" in supported_tasks + else None + ) state.openai_serving_chat = OmniOpenAIServingChat( engine_client, state.openai_serving_models, @@ -422,6 +440,7 @@ async def omni_init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, + default_chat_template_kwargs=args.default_chat_template_kwargs, trust_request_chat_template=args.trust_request_chat_template, return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, @@ -431,8 +450,136 @@ async def omni_init_app_state( enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, + enable_log_deltas=args.enable_log_deltas, log_error_stack=args.log_error_stack, ) + state.openai_serving_completion = ( + OpenAIServingCompletion( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_force_include_usage=args.enable_force_include_usage, + log_error_stack=args.log_error_stack, + 
) + if "generate" in supported_tasks + else None + ) + state.openai_serving_pooling = ( + OpenAIServingPooling( + engine_client, + state.openai_serving_models, + supported_tasks=supported_tasks, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + log_error_stack=args.log_error_stack, + ) + if any(task in POOLING_TASKS for task in supported_tasks) + else None + ) + state.openai_serving_embedding = ( + OpenAIServingEmbedding( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + log_error_stack=args.log_error_stack, + ) + if "embed" in supported_tasks + else None + ) + state.openai_serving_classification = ( + ServingClassification( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + log_error_stack=args.log_error_stack, + ) + if "classify" in supported_tasks + else None + ) + state.openai_serving_scores = ( + ServingScores( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + score_template=resolved_chat_template, + log_error_stack=args.log_error_stack, + ) + if ("embed" in supported_tasks or "score" in supported_tasks) + else None + ) + state.openai_serving_tokenization = OpenAIServingTokenization( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + trust_request_chat_template=args.trust_request_chat_template, + log_error_stack=args.log_error_stack, + ) + 
state.openai_serving_transcription = ( + OpenAIServingTranscription( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + log_error_stack=args.log_error_stack, + enable_force_include_usage=args.enable_force_include_usage, + ) + if "transcription" in supported_tasks + else None + ) + state.openai_serving_translation = ( + OpenAIServingTranslation( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + log_error_stack=args.log_error_stack, + enable_force_include_usage=args.enable_force_include_usage, + ) + if "transcription" in supported_tasks + else None + ) + state.anthropic_serving_messages = ( + AnthropicServingMessages( + engine_client, + state.openai_serving_models, + args.response_role, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, + enable_auto_tools=args.enable_auto_tool_choice, + tool_parser=args.tool_call_parser, + reasoning_parser=args.structured_outputs_config.reasoning_parser, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_force_include_usage=args.enable_force_include_usage, + ) + if "generate" in supported_tasks + else None + ) + state.serving_tokens = ( + ServingTokens( + engine_client, + state.openai_serving_models, + request_logger=request_logger, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, + log_error_stack=args.log_error_stack, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_log_outputs=args.enable_log_outputs, + force_no_detokenize=args.tokens_only, + ) + if "generate" in supported_tasks + else None + ) state.openai_serving_speech = OmniOpenAIServingSpeech( engine_client, state.openai_serving_models, request_logger=request_logger @@ -474,7 +621,8 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re if isinstance(generator, 
ErrorResponse): return JSONResponse( - content=generator.model_dump(), status_code=generator.code if hasattr(generator, "code") else 400 + content=generator.model_dump(), + status_code=generator.error.code if generator.error else 400, ) elif isinstance(generator, ChatCompletionResponse): diff --git a/vllm_omni/entrypoints/openai/protocol/chat_completion.py b/vllm_omni/entrypoints/openai/protocol/chat_completion.py index 9f607624938..d106b7aa7ae 100644 --- a/vllm_omni/entrypoints/openai/protocol/chat_completion.py +++ b/vllm_omni/entrypoints/openai/protocol/chat_completion.py @@ -1,4 +1,6 @@ -from vllm.entrypoints.openai.protocol import ChatCompletionStreamResponse +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionStreamResponse, +) class OmniChatCompletionStreamResponse(ChatCompletionStreamResponse): diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index 6fb9750ccc1..c63d19c4f7d 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -29,8 +29,11 @@ make_tool_call_id, resolve_chat_template_content_format, ) -from vllm.entrypoints.harmony_utils import get_streamable_parser_for_assistant, parse_chat_output -from vllm.entrypoints.openai.protocol import ( +from vllm.entrypoints.openai.parser.harmony_utils import ( + get_streamable_parser_for_assistant, + parse_chat_output, +) +from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, ChatCompletionResponse, @@ -38,9 +41,13 @@ ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, +) +from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat +from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, + ErrorInfo, ErrorResponse, FunctionCall, FunctionDefinition, @@ -49,21 +56,14 @@ ToolCall, UsageInfo, ) -from 
vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_engine import ( +from vllm.entrypoints.openai.engine.serving import ( ChatLikeRequest, - EngineTokensPrompt, - RequestPrompt, ResponsesRequest, - TextTokensPrompt, clamp_prompt_logprobs, - is_list_of, ) -from vllm.entrypoints.openai.tool_parsers import ToolParser -from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls from vllm.entrypoints.utils import should_include_usage -from vllm.inputs.data import PromptType +from vllm.inputs.data import PromptType, TokensPrompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput @@ -76,7 +76,9 @@ validate_request_params, ) from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils.collection_utils import as_list +from vllm.tool_parsers import ToolParser +from vllm.tool_parsers.mistral_tool_parser import MistralToolCall +from vllm.utils.collection_utils import as_list, is_list_of from vllm_omni.entrypoints.chat_utils import parse_chat_messages_futures from vllm_omni.entrypoints.openai.audio_utils_mixin import AudioMixin @@ -176,53 +178,79 @@ async def create_chat_completion( maybe_serialize_tool_calls(request) truncate_tool_call_ids(request) validate_request_params(request) - - if ( - request.tool_choice == "auto" - and not (self.enable_auto_tools and tool_parser is not None) + + # Check if tool parsing is unavailable (common condition) + tool_parsing_unavailable = ( + tool_parser is None and not isinstance(tokenizer, MistralTokenizer) and not self.use_harmony + ) + + # Validate tool_choice when tool parsing is required but unavailable + if tool_parsing_unavailable and request.tool_choice not in ( + None, + "none", ): - # for hf tokenizers, "auto" tools requires - # --enable-auto-tool-choice and --tool-call-parser - return 
self.create_error_response( - '"auto" tool choice requires --enable-auto-tool-choice and --tool-call-parser to be set' - ) + if request.tool_choice == "auto" and not self.enable_auto_tools: + # for hf tokenizers, "auto" tools requires + # --enable-auto-tool-choice and --tool-call-parser + return self.create_error_response( + '"auto" tool choice requires --enable-auto-tool-choice and --tool-call-parser to be set' + ) + elif request.tool_choice != "auto": + # "required" or named tool requires tool parser + return self.create_error_response( + f'tool_choice="{request.tool_choice}" requires ' + "--tool-call-parser to be set" + ) if request.tools is None or (request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none): tool_dicts = None else: tool_dicts = [tool.model_dump() for tool in request.tools] - # Common case. - request_chat_template = request.chat_template - chat_template_kwargs = request.chat_template_kwargs - if not self.trust_request_chat_template and ( - request_chat_template is not None - or (chat_template_kwargs and chat_template_kwargs.get("chat_template") is not None) - ): - return self.create_error_response( - "Chat template is passed with request, but --trust-request-chat-template is not set. " - "Refused request with untrusted chat template." 
+ if not self.use_harmony: + error_check_ret = self._validate_chat_template( + request_chat_template=request.chat_template, + chat_template_kwargs=request.chat_template_kwargs, + trust_request_chat_template=self.trust_request_chat_template, ) - ( - conversation, - request_prompts, - engine_prompts, - ) = await self._preprocess_chat( - request, - tokenizer, - request.messages, - chat_template=request_chat_template or self.chat_template, - chat_template_content_format=self.chat_template_content_format, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, - tool_dicts=tool_dicts, - documents=request.documents, - chat_template_kwargs=request.chat_template_kwargs, - tool_parser=tool_parser, - add_special_tokens=request.add_special_tokens, - ) + if error_check_ret is not None: + return error_check_ret + + chat_template_kwargs = self._prepare_extra_chat_template_kwargs( + request.chat_template_kwargs, + self.default_chat_template_kwargs, + ) + chat_template_kwargs.update(reasoning_effort=request.reasoning_effort) + + ( + conversation, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( + request, + tokenizer, + request.messages, + chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self.chat_template_content_format, + add_generation_prompt=request.add_generation_prompt, + continue_final_message=request.continue_final_message, + tool_dicts=tool_dicts, + documents=getattr(request, "documents", None), + chat_template_kwargs=chat_template_kwargs, + tool_parser=tool_parser, + add_special_tokens=request.add_special_tokens, + ) + else: + should_include_tools = tool_dicts is not None + conversation, engine_prompts = self._make_request_with_harmony( + request, should_include_tools + ) + request_prompts = [ + engine_prompt.get("prompt_token_ids", []) + for engine_prompt in engine_prompts + ] except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: 
logger.exception("Error in preprocessing prompt inputs") @@ -318,8 +346,8 @@ async def _preprocess_chat( add_special_tokens: bool = False, ) -> tuple[ list[ConversationMessage], - Sequence[RequestPrompt], - list[EngineTokensPrompt], + Sequence[PromptType], + list[TokensPrompt], ]: model_config = self.model_config @@ -333,7 +361,6 @@ async def _preprocess_chat( conversation, mm_data_future, mm_uuids = parse_chat_messages_futures( messages, model_config, - tokenizer, content_format=resolved_content_format, mm_processor_kwargs=getattr(request, "mm_processor_kwargs", None), ) @@ -388,7 +415,7 @@ async def _preprocess_chat( "Prompt has to be a string", "when the tokenizer is not initialised", ) - prompt_inputs = TextTokensPrompt(prompt=request_prompt, prompt_token_ids=[1]) + prompt_inputs = TokensPrompt(prompt=request_prompt, prompt_token_ids=[1]) elif isinstance(request_prompt, str): prompt_inputs = await self._tokenize_prompt_input_async( request, @@ -399,20 +426,21 @@ async def _preprocess_chat( else: # For MistralTokenizer assert is_list_of(request_prompt, int), "Prompt has to be either a string or a list of token ids" - prompt_inputs = TextTokensPrompt( + prompt_inputs = TokensPrompt( prompt=tokenizer.decode(request_prompt), prompt_token_ids=request_prompt, ) - engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_inputs["prompt_token_ids"]) + engine_prompt = TokensPrompt(prompt_token_ids=prompt_inputs["prompt_token_ids"]) if mm_data is not None: engine_prompt["multi_modal_data"] = mm_data if mm_uuids is not None: engine_prompt["multi_modal_uuids"] = mm_uuids - if request.mm_processor_kwargs is not None: - engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs + mm_processor_kwargs = getattr(request, "mm_processor_kwargs", None) + if mm_processor_kwargs is not None: + engine_prompt["mm_processor_kwargs"] = mm_processor_kwargs if hasattr(request, "cache_salt") and request.cache_salt is not None: engine_prompt["cache_salt"] = request.cache_salt @@ 
-513,7 +541,7 @@ def _build_sampling_params_list_from_request( def _log_inputs( self, request_id: str, - inputs: RequestPrompt | PromptType, + inputs: PromptType, params_list: list[SamplingParams] | None, lora_request: LoRARequest | None, ) -> None: @@ -599,9 +627,13 @@ async def chat_completion_stream_generator( try: if self.reasoning_parser: + chat_template_kwargs = self._prepare_extra_chat_template_kwargs( + request.chat_template_kwargs, + self.default_chat_template_kwargs, + ) reasoning_parser = self.reasoning_parser( tokenizer, - chat_template_kwargs=request.chat_template_kwargs, # type: ignore + chat_template_kwargs=chat_template_kwargs, # type: ignore ) except RuntimeError as e: logger.exception("Error in reasoning parser creation.") @@ -1052,10 +1084,9 @@ async def chat_completion_stream_generator( # wasn't ready to send a token, then # get the next token without streaming a chunk if delta_message is None: - if output.finish_reason is None: + if output.finish_reason is None and not request.return_token_ids: continue - else: - delta_message = DeltaMessage() + delta_message = DeltaMessage() # Log streaming delta if output logging is enabled if self.enable_log_outputs and self.request_logger: @@ -1438,13 +1469,22 @@ def _create_text_choice( if self.reasoning_parser: try: - reasoning_parser = self.reasoning_parser(tokenizer) + chat_template_kwargs = self._prepare_extra_chat_template_kwargs( + request.chat_template_kwargs, + self.default_chat_template_kwargs, + ) + reasoning_parser = self.reasoning_parser( + tokenizer, + chat_template_kwargs=chat_template_kwargs, # type: ignore + ) except RuntimeError as e: logger.exception("Error in reasoning parser creation.") return self.create_error_response(str(e)) # If the reasoning parser is enabled, # tool calls are extracted exclusively from the content. 
- reasoning_content, content = reasoning_parser.extract_reasoning_content(output.text, request=request) + reasoning_content, content = reasoning_parser.extract_reasoning( + output.text, request=request + ) if not request.include_reasoning: reasoning_content = None else: @@ -2049,7 +2089,9 @@ def _create_error_response( ) -> ErrorResponse: """Create an error response following OpenAI error format.""" return ErrorResponse( - message=message, - type=err_type, - code=status_code, + error=ErrorInfo( + message=message, + type=err_type, + code=status_code, + ) ) diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index c6b87810e98..77be4cc8f35 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -2,7 +2,7 @@ from fastapi import Request from fastapi.responses import Response -from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.logger import init_logger from vllm.utils import random_uuid diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py index c459289af34..a363adf177c 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py @@ -356,18 +356,14 @@ def forward( elif self.model_stage == "code2wav": # Extract codec codes from input codes = [] - if input_ids is not None: + if input_ids.shape[0] % 16 == 0: codes.append(input_ids.reshape(1, 16, -1)) - else: - # for profile, we use max length from inputs_embeds - codes.append( - torch.zeros( - (1, 16, inputs_embeds.shape[1]), - dtype=torch.long, - device=inputs_embeds.device, - ) - ) + logger.warning(f"Input_ids length: {input_ids.shape[0]} is not divisible by 16, padding with zeros. 
This should only happen in warm up.") + input_ids_flatten = input_ids.reshape(-1) + input_ids_flatten = torch.cat([input_ids_flatten, torch.zeros(16 - input_ids.shape[0] % 16, dtype=torch.long, device=input_ids.device)]) + codes.append(input_ids_flatten.reshape(1, 16, -1)) + # Generate audio from codec codes audio_tensors = [] From 236f73e419bf75e993c392caf7186ca502b62deb Mon Sep 17 00:00:00 2001 From: tzhouam Date: Fri, 16 Jan 2026 06:29:06 +0000 Subject: [PATCH 43/59] fix pre-commit Signed-off-by: tzhouam --- vllm_omni/config/model.py | 28 +- vllm_omni/core/sched/omni_ar_scheduler.py | 21 +- .../models/glm_image/glm_image_transformer.py | 11 +- vllm_omni/engine/output_processor.py | 2 - vllm_omni/entrypoints/chat_utils.py | 2 + vllm_omni/entrypoints/omni_llm.py | 20 +- vllm_omni/entrypoints/openai/api_server.py | 2 +- vllm_omni/entrypoints/openai/serving_chat.py | 32 +- .../models/qwen3_omni/qwen3_omni.py | 16 +- .../qwen3_omni/qwen3_omni_moe_talker.py | 4 +- .../qwen3_omni/qwen3_omni_moe_thinker.py | 341 ++++-------------- vllm_omni/worker/gpu_ar_model_runner.py | 93 ++--- vllm_omni/worker/gpu_ar_worker.py | 25 +- .../worker/gpu_generation_model_runner.py | 125 +++---- vllm_omni/worker/gpu_generation_worker.py | 24 +- vllm_omni/worker/gpu_model_runner.py | 70 ++-- 16 files changed, 266 insertions(+), 550 deletions(-) diff --git a/vllm_omni/config/model.py b/vllm_omni/config/model.py index 2e53a7af2e1..592c501a631 100644 --- a/vllm_omni/config/model.py +++ b/vllm_omni/config/model.py @@ -1,12 +1,9 @@ import warnings -from importlib.util import find_spec from typing import Any import torch -import vllm.envs as envs from pydantic import ConfigDict from pydantic.dataclasses import dataclass -from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.config import ModelConfig, config from vllm.config.model import ( _RUNNER_CONVERTS, @@ -24,10 +21,12 @@ get_pooling_config, ) from vllm.transformers_utils.gguf_utils import ( + is_gguf, 
maybe_patch_hf_config_from_gguf, ) from vllm.transformers_utils.utils import maybe_model_redirect -from vllm.transformers_utils.gguf_utils import is_gguf +from vllm.v1.attention.backends.registry import AttentionBackendEnum + import vllm_omni.model_executor.models as me_models logger = init_logger(__name__) @@ -108,9 +107,7 @@ def __post_init__( video_pruning_rate: float | None, ) -> None: # Keep set served_model_name before maybe_model_redirect(self.model) - self.served_model_name = get_served_model_name( - self.model, self.served_model_name - ) + self.served_model_name = get_served_model_name(self.model, self.served_model_name) self.model = maybe_model_redirect(self.model) # The tokenizer is consistent with the model by default. if self.tokenizer is None: @@ -167,9 +164,7 @@ def __post_init__( if dict_overrides: self._apply_dict_overrides(hf_config, dict_overrides) self.hf_text_config = self.draw_hf_text_config() - self.attention_chunk_size = getattr( - self.hf_text_config, "attention_chunk_size", None - ) + self.attention_chunk_size = getattr(self.hf_text_config, "attention_chunk_size", None) self.encoder_config = self._get_encoder_config() self.hf_image_processor_config = get_hf_image_processor_config( self.model, hf_token=self.hf_token, revision=self.revision @@ -182,9 +177,7 @@ def __post_init__( is_pooling_model = registry.is_pooling_model(architectures, self) self.runner_type = self._get_runner_type(architectures, self.runner) - self.convert_type = self._get_convert_type( - architectures, self.runner_type, self.convert - ) + self.convert_type = self._get_convert_type(architectures, self.runner_type, self.convert) if self.runner_type == "generate" and not is_generative_model: generate_converts = _RUNNER_CONVERTS["generate"] @@ -244,10 +237,7 @@ def __post_init__( # Init multimodal config if needed if self._model_info.supports_multimodal: - if ( - mm_encoder_tp_mode == "data" - and not self._model_info.supports_multimodal_encoder_tp_data - ): + if 
mm_encoder_tp_mode == "data" and not self._model_info.supports_multimodal_encoder_tp_data: logger.warning_once( "This model does not support `--mm-encoder-tp-mode data`. " "Falling back to `--mm-encoder-tp-mode weights`." @@ -269,9 +259,7 @@ def __post_init__( video_pruning_rate=video_pruning_rate, ) - mm_config_kwargs = { - k: v for k, v in mm_config_kwargs.items() if v is not None - } + mm_config_kwargs = {k: v for k, v in mm_config_kwargs.items() if v is not None} self.multimodal_config = MultiModalConfig(**mm_config_kwargs) diff --git a/vllm_omni/core/sched/omni_ar_scheduler.py b/vllm_omni/core/sched/omni_ar_scheduler.py index dc3f56ac2db..3c262b7c927 100644 --- a/vllm_omni/core/sched/omni_ar_scheduler.py +++ b/vllm_omni/core/sched/omni_ar_scheduler.py @@ -3,6 +3,8 @@ from collections import defaultdict from time import time +import numpy as np +from vllm.compilation.cuda_graph import CUDAGraphStat from vllm.distributed.kv_events import KVEventBatch from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats from vllm.logger import init_logger @@ -10,10 +12,13 @@ from vllm.v1.core.sched.scheduler import Scheduler as VLLMScheduler from vllm.v1.core.sched.utils import check_stop, remove_all from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs +from vllm.v1.metrics.perf import PerfStats from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.spec_decode.metrics import SpecDecodingStats +logger = init_logger(__name__) + class OmniARScheduler(VLLMScheduler): """ @@ -73,7 +78,7 @@ def update_from_output( pooler_outputs = model_runner_output.pooler_output num_nans_in_logits = model_runner_output.num_nans_in_logits kv_connector_output = model_runner_output.kv_connector_output - cudagraph_stats = model_runner_output.cudagraph_stats + cudagraph_stats: CUDAGraphStat | None = model_runner_output.cudagraph_stats perf_stats: PerfStats | None = None if self.perf_metrics and 
self.perf_metrics.is_enabled(): @@ -152,7 +157,7 @@ def update_from_output( new_token_ids, stopped = self._update_request_with_output(request, new_token_ids) if pooler_output: - # Note: As we occupied the pooler output, for multimodal outputs, we do not intermediate stop checking for pooler output + # Note: For multimodal outputs, we skip intermediate stop checks. if request.output_token_ids: stopped = check_stop(request, self.max_model_len) routed_experts = None @@ -172,13 +177,10 @@ def update_from_output( # compute slot mapping: slot = block_id * block_size + offset slot_mapping = ( - block_offsets.reshape((1, block_size)) - + block_ids_array.reshape((num_blocks, 1)) * block_size + block_offsets.reshape((1, block_size)) + block_ids_array.reshape((num_blocks, 1)) * block_size ).flatten()[:num_tokens] - routed_experts = self.routed_experts_reader.get_routed_experts( - indices=slot_mapping - ) + routed_experts = self.routed_experts_reader.get_routed_experts(indices=slot_mapping) kv_transfer_params = self._free_request(request) if status_before_stop == RequestStatus.RUNNING: stopped_running_reqs.add(request) @@ -221,6 +223,7 @@ def update_from_output( kv_transfer_params=kv_transfer_params, trace_headers=request.trace_headers, num_cached_tokens=request.num_cached_tokens, + routed_experts=routed_experts, num_nans_in_logits=request.num_nans_in_logits, ) ) @@ -234,7 +237,7 @@ def update_from_output( if stopped_preempted_reqs: # This is a rare case and unlikely to impact performance. 
self.waiting.remove_requests(stopped_preempted_reqs) - + if failed_kv_load_req_ids and not self.recompute_kv_load_failures: requests = [self.requests[req_id] for req_id in failed_kv_load_req_ids] self.finish_requests(failed_kv_load_req_ids, RequestStatus.FINISHED_ERROR) @@ -286,7 +289,7 @@ def update_from_output( engine_core_outputs[client_index] = EngineCoreOutputs(finished_requests=finished_set) finished_req_ids.clear() - if (stats := self.make_stats(spec_decoding_stats, kv_connector_stats)) is not None: + if (stats := self.make_stats(spec_decoding_stats, kv_connector_stats, cudagraph_stats, perf_stats)) is not None: # Return stats to only one of the front-ends. if (eco := next(iter(engine_core_outputs.values()), None)) is None: # We must return the stats even if there are no request diff --git a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py index 09f7b17e133..615b9194af4 100644 --- a/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py +++ b/vllm_omni/diffusion/models/glm_image/glm_image_transformer.py @@ -8,8 +8,8 @@ import torch import torch.nn as nn from diffusers.models.attention import FeedForward -from diffusers.models.transformers.transformer_glm_image import GlmImageCombinedTimestepSizeEmbeddings from diffusers.models.modeling_outputs import Transformer2DModelOutput +from diffusers.models.transformers.transformer_glm_image import GlmImageCombinedTimestepSizeEmbeddings from vllm.logger import init_logger from vllm.model_executor.layers.linear import QKVParallelLinear from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -414,9 +414,10 @@ def forward( query_img = query[:, text_seq_length:, :, :] key_img = key[:, text_seq_length:, :, :] from diffusers.models.embeddings import apply_rotary_emb - query_img = apply_rotary_emb(query_img,image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) + + query_img = apply_rotary_emb(query_img, 
image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) # key_img = self.rope(key_img, cos, sin) - key_img = apply_rotary_emb(key_img,image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) + key_img = apply_rotary_emb(key_img, image_rotary_emb, sequence_dim=1, use_real_unbind_dim=-2) query = torch.cat([query[:, :text_seq_length, :, :], query_img], dim=1) key = torch.cat([key[:, :text_seq_length, :, :], key_img], dim=1) @@ -555,7 +556,7 @@ def __init__( od_config: OmniDiffusionConfig, ): super().__init__() - + patch_size = od_config.tf_model_config.patch_size in_channels = od_config.tf_model_config.in_channels out_channels = od_config.tf_model_config.out_channels @@ -565,8 +566,6 @@ def __init__( condition_dim = od_config.tf_model_config.condition_dim prior_vq_quantizer_codebook_size = od_config.tf_model_config.prior_vq_quantizer_codebook_size text_embed_dim = od_config.tf_model_config.text_embed_dim - - # Get num_layers from config if available model_config = od_config.tf_model_config diff --git a/vllm_omni/engine/output_processor.py b/vllm_omni/engine/output_processor.py index 714b8dfcc53..6aef7fb162c 100644 --- a/vllm_omni/engine/output_processor.py +++ b/vllm_omni/engine/output_processor.py @@ -8,8 +8,6 @@ from vllm.sampling_params import RequestOutputKind from vllm.tokenizers import TokenizerLike from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason -from vllm.v1.engine.detokenizer import IncrementalDetokenizer -from vllm.v1.engine.logprobs import LogprobsProcessor from vllm.v1.engine.output_processor import OutputProcessor as VLLMOutputProcessor from vllm.v1.engine.output_processor import OutputProcessorOutput, RequestOutputCollector, RequestState from vllm.v1.engine.parallel_sampling import ParentRequest diff --git a/vllm_omni/entrypoints/chat_utils.py b/vllm_omni/entrypoints/chat_utils.py index a5a36bb636c..0fdef5edbb7 100644 --- a/vllm_omni/entrypoints/chat_utils.py +++ b/vllm_omni/entrypoints/chat_utils.py @@ -22,6 +22,8 @@ 
_postprocess_messages, _ToolParser, ) + + class OmniAsyncMultiModalItemTracker(AsyncMultiModalItemTracker): def create_parser(self) -> "BaseMultiModalContentParser": return OmniAsyncMultiModalContentParser(self) diff --git a/vllm_omni/entrypoints/omni_llm.py b/vllm_omni/entrypoints/omni_llm.py index 74fe6a80376..8db4b11d94d 100644 --- a/vllm_omni/entrypoints/omni_llm.py +++ b/vllm_omni/entrypoints/omni_llm.py @@ -1,14 +1,15 @@ +from collections.abc import Callable from typing import Any import cloudpickle from pydantic import ValidationError from tqdm import tqdm -from vllm.outputs import RequestOutput, PoolingRequestOutput -from typing import Callable + # External library imports (vLLM) from vllm.config import CompilationConfig, StructuredOutputsConfig, is_init_field from vllm.entrypoints.llm import LLM from vllm.logger import init_logger +from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.plugins.io_processors import get_io_processor from vllm.usage.usage_lib import UsageContext from vllm.utils.counter import Counter @@ -193,9 +194,7 @@ def __del__(self) -> None: # best-effort except Exception as e: logger.debug("[Orchestrator] __del__ close() raised: %s", e, exc_info=True) - def _run_engine( - self, *, use_tqdm: bool | Callable[..., tqdm] = True - ) -> list[RequestOutput | PoolingRequestOutput]: + def _run_engine(self, *, use_tqdm: bool | Callable[..., tqdm] = True) -> list[RequestOutput | PoolingRequestOutput]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() @@ -223,14 +222,9 @@ def _run_engine( assert output.prompt_token_ids is not None total_in_toks += len(output.prompt_token_ids) * n in_spd = total_in_toks / pbar.format_dict["elapsed"] - total_out_toks += sum( - len(stp.token_ids) for stp in output.outputs - ) + total_out_toks += sum(len(stp.token_ids) for stp in output.outputs) out_spd = total_out_toks / pbar.format_dict["elapsed"] - pbar.postfix = ( - f"est. 
speed input: {in_spd:.2f} toks/s, " - f"output: {out_spd:.2f} toks/s" - ) + pbar.postfix = f"est. speed input: {in_spd:.2f} toks/s, output: {out_spd:.2f} toks/s" pbar.update(n) else: pbar.update(1) @@ -242,4 +236,4 @@ def _run_engine( # Sort the outputs by the int part of request ID which is in format of 'int-uuid'. # This is necessary because some requests may be finished earlier than # its previous requests. - return sorted(outputs, key=lambda x: int(x.request_id.split("-")[0])) \ No newline at end of file + return sorted(outputs, key=lambda x: int(x.request_id.split("-")[0])) diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 6f4ebc94c92..297935a0fed 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -45,7 +45,6 @@ from vllm.entrypoints.pooling.score.serving import ServingScores from vllm.entrypoints.serve.disagg.serving import ServingTokens from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization -from vllm.tool_parsers import ToolParserManager # yapf conflicts with isort for this block # yapf: disable @@ -59,6 +58,7 @@ ) from vllm.logger import init_logger from vllm.tasks import POOLING_TASKS +from vllm.tool_parsers import ToolParserManager from vllm.utils.system_utils import decorate_logs from vllm_omni.entrypoints.async_omni import AsyncOmni diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index c63d19c4f7d..bb3d242af3b 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -29,10 +29,6 @@ make_tool_call_id, resolve_chat_template_content_format, ) -from vllm.entrypoints.openai.parser.harmony_utils import ( - get_streamable_parser_for_assistant, - parse_chat_output, -) from vllm.entrypoints.openai.chat_completion.protocol import ( ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, @@ -61,6 +57,10 @@ 
ResponsesRequest, clamp_prompt_logprobs, ) +from vllm.entrypoints.openai.parser.harmony_utils import ( + get_streamable_parser_for_assistant, + parse_chat_output, +) from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls from vllm.entrypoints.utils import should_include_usage from vllm.inputs.data import PromptType, TokensPrompt @@ -75,9 +75,9 @@ truncate_tool_call_ids, validate_request_params, ) -from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.tool_parsers import ToolParser from vllm.tool_parsers.mistral_tool_parser import MistralToolCall +from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils.collection_utils import as_list, is_list_of from vllm_omni.entrypoints.chat_utils import parse_chat_messages_futures @@ -178,12 +178,10 @@ async def create_chat_completion( maybe_serialize_tool_calls(request) truncate_tool_call_ids(request) validate_request_params(request) - + # Check if tool parsing is unavailable (common condition) tool_parsing_unavailable = ( - tool_parser is None - and not isinstance(tokenizer, MistralTokenizer) - and not self.use_harmony + tool_parser is None and not isinstance(tokenizer, MistralTokenizer) and not self.use_harmony ) # Validate tool_choice when tool parsing is required but unavailable @@ -200,8 +198,7 @@ async def create_chat_completion( elif request.tool_choice != "auto": # "required" or named tool requires tool parser return self.create_error_response( - f'tool_choice="{request.tool_choice}" requires ' - "--tool-call-parser to be set" + f'tool_choice="{request.tool_choice}" requires --tool-call-parser to be set' ) if request.tools is None or (request.tool_choice == "none" and self.exclude_tools_when_tool_choice_none): @@ -244,13 +241,8 @@ async def create_chat_completion( ) else: should_include_tools = tool_dicts is not None - conversation, engine_prompts = self._make_request_with_harmony( - request, should_include_tools - ) - request_prompts = [ - 
engine_prompt.get("prompt_token_ids", []) - for engine_prompt in engine_prompts - ] + conversation, engine_prompts = self._make_request_with_harmony(request, should_include_tools) + request_prompts = [engine_prompt.get("prompt_token_ids", []) for engine_prompt in engine_prompts] except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") @@ -1482,9 +1474,7 @@ def _create_text_choice( return self.create_error_response(str(e)) # If the reasoning parser is enabled, # tool calls are extracted exclusively from the content. - reasoning_content, content = reasoning_parser.extract_reasoning( - output.text, request=request - ) + reasoning_content, content = reasoning_parser.extract_reasoning(output.text, request=request) if not request.include_reasoning: reasoning_content = None else: diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py index a363adf177c..b73b5d4ccf0 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py @@ -359,12 +359,22 @@ def forward( if input_ids.shape[0] % 16 == 0: codes.append(input_ids.reshape(1, 16, -1)) else: - logger.warning(f"Input_ids length: {input_ids.shape[0]} is not divisible by 16, padding with zeros. This should only happen in warm up.") + logger.warning( + ( + "Input_ids length: %s is not divisible by 16, padding " + "with zeros. This should only happen in warm up." 
+ ), + input_ids.shape[0], + ) input_ids_flatten = input_ids.reshape(-1) - input_ids_flatten = torch.cat([input_ids_flatten, torch.zeros(16 - input_ids.shape[0] % 16, dtype=torch.long, device=input_ids.device)]) + input_ids_flatten = torch.cat( + [ + input_ids_flatten, + torch.zeros(16 - input_ids.shape[0] % 16, dtype=torch.long, device=input_ids.device), + ] + ) codes.append(input_ids_flatten.reshape(1, 16, -1)) - # Generate audio from codec codes audio_tensors = [] for code in codes: diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py index 14cae177e3b..4d0350ff6cc 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_talker.py @@ -111,7 +111,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() talker_config: Qwen3OmniMoeTalkerConfig = vllm_config.model_config.hf_config talker_config.text_config.rope_parameters = talker_config.text_config.rope_scaling - talker_config.text_config.rope_parameters["rope_theta"] = talker_config.text_config.rope_parameters["rope_theta"] + talker_config.text_config.rope_parameters["rope_theta"] = talker_config.text_config.rope_parameters[ + "rope_theta" + ] self.quant_config = vllm_config.quant_config self.prefix = prefix self.vllm_config = vllm_config diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py index 86a00a95400..3271506ac2c 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py @@ -21,7 +21,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen3-Omni-Moe model (thinker part).""" -from vllm.model_executor.models.module_mapping import MultiModelKeys + from collections.abc import Iterable, Mapping, Sequence from functools import partial from typing import Any @@ -37,9 +37,6 @@ Qwen3OmniMoeConfig, Qwen3OmniMoeThinkerConfig, ) -from vllm.model_executor.models.qwen3_omni_moe_thinker import ( - Qwen3OmniMoeAudioEncoder, -) from transformers.models.qwen3_omni_moe.processing_qwen3_omni_moe import ( Qwen3OmniMoeProcessor, ) @@ -56,6 +53,7 @@ SupportsMultiModal, SupportsPP, ) +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2_5_omni_thinker import ( Qwen2_5OmniAudioFeatureInputs, Qwen2_5OmniThinkerDummyInputsBuilder, @@ -69,6 +67,7 @@ from vllm.model_executor.models.qwen3_moe import Qwen3MoeModel as _Qwen3MoeLLMModel from vllm.model_executor.models.qwen3_omni_moe_thinker import ( Qwen3Omni_VisionTransformer, + Qwen3OmniMoeAudioEncoder, _get_feat_extract_output_lengths, ) from vllm.model_executor.models.utils import ( @@ -170,23 +169,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = Qwen3MoeLLMModel( - vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") - ) - self.lm_head = ParallelLMHead( - config.vocab_size, config.hidden_size, quant_config=quant_config - ) + self.model = Qwen3MoeLLMModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors - ) + self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors -class 
Qwen3OmniMoeThinkerProcessingInfo( - Qwen2AudioProcessingInfo, Qwen2_5_VLProcessingInfo -): +class Qwen3OmniMoeThinkerProcessingInfo(Qwen2AudioProcessingInfo, Qwen2_5_VLProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(Qwen3OmniMoeConfig).thinker_config @@ -264,12 +255,8 @@ def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray: # move truncation to audio_kwargs level to avoid conflict # with tok_kwargs - mm_kwargs["audio_kwargs"].setdefault( - "truncation", mm_kwargs.pop("truncation", False) - ) - mm_kwargs["text_kwargs"].setdefault( - "truncation", tok_kwargs.pop("truncation", False) - ) + mm_kwargs["audio_kwargs"].setdefault("truncation", mm_kwargs.pop("truncation", False)) + mm_kwargs["text_kwargs"].setdefault("truncation", tok_kwargs.pop("truncation", False)) # Validate and conditionally pass audio_sample_rate # WhisperFeatureExtractor has a fixed sampling rate, and vLLM's @@ -289,9 +276,7 @@ def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray: ) else: # Sample rate matches, safe to pass - mm_kwargs["audio_kwargs"]["audio_sample_rate"] = ( - audio_sample_rate - ) + mm_kwargs["audio_kwargs"]["audio_sample_rate"] = audio_sample_rate hf_inputs = super()._call_hf_processor( prompt=prompt, @@ -309,18 +294,12 @@ def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray: for _, audio in enumerate(audios): audio_length = len(audio[0]) if isinstance(audio, tuple) else len(audio) num_frame = ( - (audio_length // hop_length) - if audio_length % hop_length == 0 - else (audio_length // hop_length - 1) + (audio_length // hop_length) if audio_length % hop_length == 0 else (audio_length // hop_length - 1) ) if mm_kwargs.get("truncation", False): - num_frame = min( - num_frame, feature_extractor.n_samples // hop_length - ) + num_frame = min(num_frame, feature_extractor.n_samples // hop_length) audio_num_frames.append(num_frame) - hf_inputs["feature_attention_mask"] = [ - torch.ones(num_frame) for num_frame in 
audio_num_frames - ] + hf_inputs["feature_attention_mask"] = [torch.ones(num_frame) for num_frame in audio_num_frames] hf_inputs["audio_feature_lengths"] = torch.tensor(audio_num_frames) return hf_inputs @@ -358,17 +337,13 @@ def _maybe_apply_prompt_updates( ) else: if use_audio_in_video and "audio" in mm_prompt_updates: - filtered_updates = { - k: v for k, v in mm_prompt_updates.items() if k != "audio" - } + filtered_updates = {k: v for k, v in mm_prompt_updates.items() if k != "audio"} prompt_ids, mm_placeholders = self._apply_prompt_updates( prompt_ids, filtered_updates, ) # Derive audio placeholders from video placeholders - mm_placeholders = self._derive_audio_from_video_placeholders( - mm_placeholders, mm_prompt_updates - ) + mm_placeholders = self._derive_audio_from_video_placeholders(mm_placeholders, mm_prompt_updates) else: prompt_ids, mm_placeholders = self._apply_prompt_updates( prompt_ids, @@ -405,19 +380,12 @@ def get_updates_use_audio_in_video( video_token_indices, (video_token_indices.shape[0], height, width) ).reshape(-1) video_token_indices = ( - (video_token_indices + shift) - * next(iter([video_second_per_grid_t])) - * position_id_per_seconds + (video_token_indices + shift) * next(iter([video_second_per_grid_t])) * position_id_per_seconds ) video_data_index, audio_data_index = 0, 0 updates = [audio_start_token_id] - while video_data_index < len(video_token_indices) and audio_data_index < len( - audio_token_indices - ): - if ( - video_token_indices[video_data_index] - <= audio_token_indices[audio_data_index] - ): + while video_data_index < len(video_token_indices) and audio_data_index < len(audio_token_indices): + if video_token_indices[video_data_index] <= audio_token_indices[audio_data_index]: updates += [video_token_id] video_data_index += 1 else: @@ -458,9 +426,7 @@ def _get_prompt_updates( audio_output_lengths = audio_output_lens.tolist() elif feature_attention_mask is not None: assert isinstance(feature_attention_mask, torch.Tensor) - 
audio_output_lens = _get_feat_extract_output_lengths( - feature_attention_mask.sum(-1) - ) + audio_output_lens = _get_feat_extract_output_lengths(feature_attention_mask.sum(-1)) audio_output_lengths = audio_output_lens.tolist() # number of audios read from video. @@ -478,8 +444,7 @@ def get_replacement_qwen2_audio(item_idx: int): audios = mm_items.get_items("audio", AudioProcessorItems) audio = audios.get(item_idx) raise ValueError( - f"The audio {audio} (len={len(audio)}) is too short " - "to be represented inside the model" + f"The audio {audio} (len={len(audio)}) is too short to be represented inside the model" ) return [audio_token_id] * num_features @@ -497,9 +462,7 @@ def get_replacement_qwen2_vision(item_idx: int, modality: str): def get_replacement_qwen2_use_audio_in_video(item_idx: int): nonlocal audio_in_video_item_idx - audio_num_features = audio_output_lengths[ - audio_in_video_item_idx + item_idx - ] + audio_num_features = audio_output_lengths[audio_in_video_item_idx + item_idx] video_grid_thw = out_mm_data["video_grid_thw"][item_idx] audio_in_video_item_idx += 1 @@ -516,9 +479,7 @@ def get_replacement_qwen2_use_audio_in_video(item_idx: int): video_grid_thw=video_grid_thw, video_second_per_grid_t=video_second_per_grid_t, ) - return PromptUpdateDetails.select_token_id( - placeholder, embed_token_id=video_token_id - ) + return PromptUpdateDetails.select_token_id(placeholder, embed_token_id=video_token_id) video_replacement_fn = ( get_replacement_qwen2_use_audio_in_video @@ -561,8 +522,7 @@ def _derive_audio_from_video_placeholders( num_audios = len(mm_prompt_updates.get("audio", [])) if num_audios != num_videos: raise ValueError( - f"use_audio_in_video requires equal number of audio and video items, " - f"got {num_audios=}, {num_videos=}" + f"use_audio_in_video requires equal number of audio and video items, got {num_audios=}, {num_videos=}" ) tokenizer = self.info.get_tokenizer() @@ -618,11 +578,7 @@ def _get_raw_input_ids( end = i break if end is not 
None: - result = ( - result[:start] - + [vision_bos_token, video_token, vision_eos_token] - + result[end + 2 :] - ) + result = result[:start] + [vision_bos_token, video_token, vision_eos_token] + result[end + 2 :] else: break @@ -702,9 +658,7 @@ def get_placeholder_str(cls, modality: str, i: int) -> str | None: def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() self.vllm_config = vllm_config # needed for torch compile forward context - thinker_config: Qwen3OmniMoeThinkerConfig = ( - vllm_config.model_config.hf_config - ) + thinker_config: Qwen3OmniMoeThinkerConfig = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = thinker_config @@ -726,23 +680,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.quant_config = quant_config self.language_model = Qwen3MoeLLMForCausalLM( - vllm_config=vllm_config.with_hf_config( - thinker_config.text_config, architectures=["Qwen3MoeForCausalLM"] - ), + vllm_config=vllm_config.with_hf_config(thinker_config.text_config, architectures=["Qwen3MoeForCausalLM"]), prefix=maybe_prefix(prefix, "language_model"), ) - self.make_empty_intermediate_tensors = ( - self.language_model.make_empty_intermediate_tensors - ) + self.make_empty_intermediate_tensors = self.language_model.make_empty_intermediate_tensors - self.use_deepstack = hasattr( - thinker_config.vision_config, "deepstack_visual_indexes" - ) + self.use_deepstack = hasattr(thinker_config.vision_config, "deepstack_visual_indexes") self.deepstack_num_level = ( - len(thinker_config.vision_config.deepstack_visual_indexes) - if self.use_deepstack - else 0 + len(thinker_config.vision_config.deepstack_visual_indexes) if self.use_deepstack else 0 ) # register buffer for deepstack self.deepstack_input_embeds = ( @@ -763,9 +709,7 @@ def _get_deepstack_input_embeds(self, num_tokens: int) -> IntermediateTensors: # get deepstack_input_embeds 
from buffer, and clear the buffer return IntermediateTensors( { - f"deepstack_input_embeds_{idx}": self.deepstack_input_embeds[idx][ - :num_tokens - ] + f"deepstack_input_embeds_{idx}": self.deepstack_input_embeds[idx][:num_tokens] for idx in range(self.deepstack_num_level) } ) @@ -784,9 +728,7 @@ def _set_deepstack_input_embeds(self, deepstack_input_embeds: torch.Tensor) -> N for _ in range(self.deepstack_num_level) ] for idx in range(self.deepstack_num_level): - self.deepstack_input_embeds[idx][:num_tokens].copy_( - deepstack_input_embeds[idx] - ) + self.deepstack_input_embeds[idx][:num_tokens].copy_(deepstack_input_embeds[idx]) def _clear_deepstack_input_embeds(self, num_tokens: int) -> None: # clear deepstack_input_embeds in buffer @@ -800,27 +742,12 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: # Preserve the order of modalities if there are multiple of them # from the order of kwargs. for input_key in kwargs: - if ( - input_key in ("pixel_values", "image_embeds") - and "image" not in mm_input_by_modality - ): - mm_input_by_modality["image"] = self._parse_and_validate_image_input( - **kwargs - ) - if ( - input_key in ("pixel_values_videos", "video_embeds") - and "video" not in mm_input_by_modality - ): - mm_input_by_modality["video"] = self._parse_and_validate_video_input( - **kwargs - ) - if ( - input_key in ("input_audio_features") - and "audio" not in mm_input_by_modality - ): - mm_input_by_modality["audio"] = self._parse_and_validate_audio_input( - **kwargs - ) + if input_key in ("pixel_values", "image_embeds") and "image" not in mm_input_by_modality: + mm_input_by_modality["image"] = self._parse_and_validate_image_input(**kwargs) + if input_key in ("pixel_values_videos", "video_embeds") and "video" not in mm_input_by_modality: + mm_input_by_modality["video"] = self._parse_and_validate_video_input(**kwargs) + if input_key in ("input_audio_features") and "audio" not in mm_input_by_modality: + mm_input_by_modality["audio"] = 
self._parse_and_validate_audio_input(**kwargs) return mm_input_by_modality def get_language_model(self) -> torch.nn.Module: @@ -871,12 +798,9 @@ def embed_input_ids( deepstack_input_embeds = None # split the feat dim to obtain multi-scale visual feature has_vision_embeddings = [ - embeddings.shape[-1] != self.config.text_config.hidden_size - for embeddings in multimodal_embeddings + embeddings.shape[-1] != self.config.text_config.hidden_size for embeddings in multimodal_embeddings ] - if self.visual.deepstack_visual_indexes is not None and any( - has_vision_embeddings - ): + if self.visual.deepstack_visual_indexes is not None and any(has_vision_embeddings): multiscale_len = len(self.visual.deepstack_visual_indexes) multimodal_embeddings_multiscale = [] is_vision = torch.zeros_like(is_multimodal) @@ -884,17 +808,13 @@ def embed_input_ids( mm_position_idx = 0 for index, embeddings in enumerate(multimodal_embeddings): num_tokens = embeddings.shape[0] - current_positions = mm_positions[ - mm_position_idx : mm_position_idx + num_tokens - ] + current_positions = mm_positions[mm_position_idx : mm_position_idx + num_tokens] # Vision embeddings if embeddings.shape[-1] != self.config.text_config.hidden_size: visual_dim = embeddings.shape[-1] // (multiscale_len + 1) multi_dim = visual_dim * multiscale_len - embeddings_main, embeddings_multiscale = torch.split( - embeddings, [visual_dim, multi_dim], dim=-1 - ) + embeddings_main, embeddings_multiscale = torch.split(embeddings, [visual_dim, multi_dim], dim=-1) multimodal_embeddings[index] = embeddings_main multimodal_embeddings_multiscale.append(embeddings_multiscale) is_vision[current_positions] = True @@ -914,9 +834,7 @@ def embed_input_ids( is_multimodal=is_vision, ) deepstack_input_embeds = ( - deepstack_input_embeds.view( - inputs_embeds.shape[0], multiscale_len, visual_dim - ) + deepstack_input_embeds.view(inputs_embeds.shape[0], multiscale_len, visual_dim) .permute(1, 0, 2) .contiguous() ) @@ -1000,12 +918,8 @@ def 
get_mrope_input_positions( audio_feature_lengths = kwargs.get("audio_feature_lengths", []) use_audio_in_video = any(kwargs.get("use_audio_in_video", [])) - image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)( - image_grid_thw - ) - video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)( - video_grid_thw - ) + image_grid_thw = (torch.stack if image_grid_thw else torch.tensor)(image_grid_thw) + video_grid_thw = (torch.stack if video_grid_thw else torch.tensor)(video_grid_thw) input_ids = torch.tensor(input_tokens) if input_ids is None or input_ids.ndim != 1: @@ -1014,16 +928,11 @@ def get_mrope_input_positions( seq_len = input_ids.shape[0] if isinstance(audio_feature_lengths, list): - audio_feature_lengths = torch.tensor( - audio_feature_lengths, dtype=torch.long - ) + audio_feature_lengths = torch.tensor(audio_feature_lengths, dtype=torch.long) if not len(second_per_grid_ts) and len(video_grid_thw): second_per_grid_ts = 2.0 - second_per_grids = ( - torch.ones(len(video_grid_thw), dtype=torch.float32) - * second_per_grid_ts - ) + second_per_grids = torch.ones(len(video_grid_thw), dtype=torch.float32) * second_per_grid_ts else: second_per_grids = torch.tensor(second_per_grid_ts, dtype=torch.float32) @@ -1036,9 +945,7 @@ def get_mrope_input_positions( audio_start_token_id = config.audio_start_token_id position_id_per_seconds = config.position_id_per_seconds - vision_start_indices = torch.argwhere( - input_ids == vision_start_token_id - ).squeeze(1) + vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1) if vision_start_indices.numel() > 0: vision_tokens = input_ids[vision_start_indices + 1] else: @@ -1057,11 +964,7 @@ def get_mrope_input_positions( video_idx = 0 audio_idx = 0 remain_images, remain_videos, remain_audios = image_nums, video_nums, audio_nums # noqa: E501 - multimodal_nums = ( - image_nums + audio_nums - if use_audio_in_video - else image_nums + video_nums + audio_nums - ) # noqa: E501 + 
multimodal_nums = image_nums + audio_nums if use_audio_in_video else image_nums + video_nums + audio_nums # noqa: E501 for _ in range(multimodal_nums): st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 @@ -1081,55 +984,28 @@ def get_mrope_input_positions( text_len = min_ed - st if text_len != 0: st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - llm_pos_ids_list.append( - torch.arange(text_len, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) + llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 bos_len = 1 - llm_pos_ids_list.append( - torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) - + st_idx - ) + llm_pos_ids_list.append(torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - audio_len = _get_feat_extract_output_lengths( - audio_feature_lengths[audio_idx] - ) - llm_pos_ids = ( - torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) - + st_idx - ) + audio_len = _get_feat_extract_output_lengths(audio_feature_lengths[audio_idx]) + llm_pos_ids = torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx llm_pos_ids_list.append(llm_pos_ids) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 eos_len = 1 - llm_pos_ids_list.append( - torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) - + st_idx - ) + llm_pos_ids_list.append(torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) st += text_len + bos_len + audio_len + eos_len audio_idx += 1 remain_audios -= 1 - elif ( - min_ed == ed_vision_start - and input_ids[ed_vision_start + 1] == image_token_id - ): + elif min_ed == ed_vision_start and input_ids[ed_vision_start + 1] == image_token_id: text_len = min_ed - st if text_len != 0: st_idx = llm_pos_ids_list[-1].max() + 1 if 
llm_pos_ids_list else 0 - llm_pos_ids_list.append( - torch.arange(text_len, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) + llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 bos_len = 1 - llm_pos_ids_list.append( - torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) - + st_idx - ) + llm_pos_ids_list.append(torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 grid_t = image_grid_thw[image_idx][0] grid_hs = image_grid_thw[:, 1] @@ -1142,10 +1018,7 @@ def get_mrope_input_positions( llm_pos_ids_list.append(llm_pos_ids) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 eos_len = 1 - llm_pos_ids_list.append( - torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) - + st_idx - ) + llm_pos_ids_list.append(torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) st += text_len + bos_len + image_len + eos_len image_idx += 1 remain_images -= 1 @@ -1157,27 +1030,15 @@ def get_mrope_input_positions( text_len = min_ed - st if text_len != 0: st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - llm_pos_ids_list.append( - torch.arange(text_len, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) + llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 bos_len = 1 - llm_pos_ids_list.append( - torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) - + st_idx - ) + llm_pos_ids_list.append(torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 grid_t = video_grid_thw[video_idx][0] grid_hs = video_grid_thw[:, 1] grid_ws = video_grid_thw[:, 2] - t_index = ( - 
torch.arange(grid_t) - * float(second_per_grids[video_idx].item()) - * position_id_per_seconds - ) + t_index = torch.arange(grid_t) * float(second_per_grids[video_idx].item()) * position_id_per_seconds llm_pos_ids = get_llm_pos_ids_for_vision( st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws ) @@ -1185,95 +1046,46 @@ def get_mrope_input_positions( llm_pos_ids_list.append(llm_pos_ids) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 eos_len = 1 - llm_pos_ids_list.append( - torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) - + st_idx - ) + llm_pos_ids_list.append(torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) st += text_len + bos_len + video_len + eos_len video_idx += 1 remain_videos -= 1 - elif ( - min_ed == ed_vision_start - and ed_vision_start + 1 == ed_audio_start - and use_audio_in_video - ): + elif min_ed == ed_vision_start and ed_vision_start + 1 == ed_audio_start and use_audio_in_video: text_len = min_ed - st if text_len != 0: st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - llm_pos_ids_list.append( - torch.arange(text_len, dtype=torch.long) - .view(1, -1) - .expand(3, -1) - + st_idx - ) + llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 bos_len = 1 - bos_block = ( - torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) - + st_idx - ) + bos_block = torch.arange(bos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx llm_pos_ids_list.append(bos_block) llm_pos_ids_list.append(bos_block) st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 - audio_len = _get_feat_extract_output_lengths( - audio_feature_lengths[audio_idx] - ) - audio_llm_pos_ids = ( - torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) - + st_idx - ) + audio_len = _get_feat_extract_output_lengths(audio_feature_lengths[audio_idx]) + 
audio_llm_pos_ids = torch.arange(audio_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx grid_t = video_grid_thw[video_idx][0] grid_hs = video_grid_thw[:, 1] grid_ws = video_grid_thw[:, 2] - t_index = ( - torch.arange(grid_t) - * float(second_per_grids[video_idx].item()) - * position_id_per_seconds - ) + t_index = torch.arange(grid_t) * float(second_per_grids[video_idx].item()) * position_id_per_seconds video_llm_pos_ids = get_llm_pos_ids_for_vision( st_idx, video_idx, spatial_merge_size, t_index, grid_hs, grid_ws ) video_data_index, audio_data_index = 0, 0 - while ( - video_data_index < video_llm_pos_ids.shape[-1] - and audio_data_index < audio_llm_pos_ids.shape[-1] - ): - if ( - video_llm_pos_ids[0][video_data_index] - <= audio_llm_pos_ids[0][audio_data_index] - ): - llm_pos_ids_list.append( - video_llm_pos_ids[ - :, video_data_index : video_data_index + 1 - ] - ) + while video_data_index < video_llm_pos_ids.shape[-1] and audio_data_index < audio_llm_pos_ids.shape[-1]: + if video_llm_pos_ids[0][video_data_index] <= audio_llm_pos_ids[0][audio_data_index]: + llm_pos_ids_list.append(video_llm_pos_ids[:, video_data_index : video_data_index + 1]) video_data_index += 1 else: - llm_pos_ids_list.append( - audio_llm_pos_ids[ - :, audio_data_index : audio_data_index + 1 - ] - ) + llm_pos_ids_list.append(audio_llm_pos_ids[:, audio_data_index : audio_data_index + 1]) audio_data_index += 1 if video_data_index < video_llm_pos_ids.shape[-1]: - llm_pos_ids_list.append( - video_llm_pos_ids[ - :, video_data_index : video_llm_pos_ids.shape[-1] - ] - ) + llm_pos_ids_list.append(video_llm_pos_ids[:, video_data_index : video_llm_pos_ids.shape[-1]]) if audio_data_index < audio_llm_pos_ids.shape[-1]: - llm_pos_ids_list.append( - audio_llm_pos_ids[ - :, audio_data_index : audio_llm_pos_ids.shape[-1] - ] - ) + llm_pos_ids_list.append(audio_llm_pos_ids[:, audio_data_index : audio_llm_pos_ids.shape[-1]]) video_len = video_grid_thw[video_idx].prod() // (spatial_merge_size**2) st_idx 
= llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 eos_len = 1 - eos_block = ( - torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) - + st_idx - ) + eos_block = torch.arange(eos_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx llm_pos_ids_list.append(eos_block) llm_pos_ids_list.append(eos_block) st += text_len + bos_len * 2 + audio_len + video_len + eos_len * 2 # noqa: E501 @@ -1285,10 +1097,7 @@ def get_mrope_input_positions( if st < len(input_tokens): st_idx = llm_pos_ids_list[-1].max() + 1 if llm_pos_ids_list else 0 text_len = len(input_tokens) - st - llm_pos_ids_list.append( - torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) - + st_idx - ) + llm_pos_ids_list.append(torch.arange(text_len, dtype=torch.long).view(1, -1).expand(3, -1) + st_idx) llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) if llm_positions.shape[1] != seq_len: diff --git a/vllm_omni/worker/gpu_ar_model_runner.py b/vllm_omni/worker/gpu_ar_model_runner.py index d4e7e195fe8..18c8bc761b1 100644 --- a/vllm_omni/worker/gpu_ar_model_runner.py +++ b/vllm_omni/worker/gpu_ar_model_runner.py @@ -12,10 +12,15 @@ import numpy as np import torch from vllm.config import CUDAGraphMode +from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer +from vllm.distributed.kv_transfer import get_kv_transfer_group from vllm.forward_context import set_forward_context from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( + RoutedExpertsCapturer, +) from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput -from vllm.v1.outputs import AsyncModelRunnerOutput +from vllm.v1.outputs import AsyncModelRunnerOutput, make_empty_encoder_model_runner_output from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.structured_output.utils import apply_grammar_bitmask from vllm.v1.utils import record_function_or_nullcontext @@ -26,18 +31,12 @@ get_pp_group, get_tp_group, 
has_kv_transfer_group, - ) +from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices from vllm.v1.worker.utils import is_residual_scattered_for_sp -from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( - RoutedExpertsCapturer, -) -from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer -from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group + from vllm_omni.outputs import OmniModelRunnerOutput from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner -from vllm.v1.outputs import make_empty_encoder_model_runner_output -from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices logger = init_logger(__name__) @@ -91,10 +90,7 @@ def execute_model( intermediate_tensors: IntermediateTensors | None = None, ) -> OmniModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors | None: if self.execute_model_state is not None: - raise RuntimeError( - "State error: sample_tokens() must be called " - "after execute_model() returns None." 
- ) + raise RuntimeError("State error: sample_tokens() must be called after execute_model() returns None.") if self.vllm_config.model_config.enable_return_routed_experts: capturer = RoutedExpertsCapturer.get_instance() @@ -104,9 +100,7 @@ def execute_model( logger.error("RoutedExpertsCapturer not initialized.") if scheduler_output.preempted_req_ids and has_kv_transfer_group(): - get_kv_transfer_group().handle_preemptions( - scheduler_output.preempted_req_ids - ) + get_kv_transfer_group().handle_preemptions(scheduler_output.preempted_req_ids) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens with ( @@ -126,8 +120,7 @@ def execute_model( if not num_scheduled_tokens: if ( - self.parallel_config.distributed_executor_backend - == "external_launcher" + self.parallel_config.distributed_executor_backend == "external_launcher" and self.parallel_config.data_parallel_size > 1 ): # this is a corner case when both external launcher @@ -196,9 +189,7 @@ def execute_model( ) num_tokens_padded = batch_desc.num_tokens - num_reqs_padded = ( - batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs - ) + num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices( should_ubatch, num_scheduled_tokens_np, @@ -218,19 +209,17 @@ def execute_model( use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0 ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices - attn_metadata, spec_decode_common_attn_metadata = ( - self._build_attention_metadata( - num_tokens=num_tokens_unpadded, - num_tokens_padded=num_tokens_padded if pad_attn else None, - num_reqs=num_reqs, - num_reqs_padded=num_reqs_padded if pad_attn else None, - max_query_len=max_num_scheduled_tokens, - ubatch_slices=ubatch_slices_attn, - logits_indices=logits_indices, - use_spec_decode=use_spec_decode, - num_scheduled_tokens=scheduler_output.num_scheduled_tokens, - 
cascade_attn_prefix_lens=cascade_attn_prefix_lens, - ) + attn_metadata, spec_decode_common_attn_metadata = self._build_attention_metadata( + num_tokens=num_tokens_unpadded, + num_tokens_padded=num_tokens_padded if pad_attn else None, + num_reqs=num_reqs, + num_reqs_padded=num_reqs_padded if pad_attn else None, + max_query_len=max_num_scheduled_tokens, + ubatch_slices=ubatch_slices_attn, + logits_indices=logits_indices, + use_spec_decode=use_spec_decode, + num_scheduled_tokens=scheduler_output.num_scheduled_tokens, + cascade_attn_prefix_lens=cascade_attn_prefix_lens, ) ( @@ -240,9 +229,7 @@ def execute_model( intermediate_tensors, model_kwargs, ec_connector_output, - ) = self._preprocess( - scheduler_output, num_tokens_padded, intermediate_tensors - ) + ) = self._preprocess(scheduler_output, num_tokens_padded, intermediate_tensors) # Set cudagraph mode to none if calc_kv_scales is true. # KV scales calculation involves dynamic operations that are incompatible @@ -329,9 +316,7 @@ def execute_model( sample_hidden_states = hidden_states[logits_indices] if not get_pp_group().is_last_rank: all_gather_tensors = { - "residual": not is_residual_scattered_for_sp( - self.vllm_config, num_tokens_padded - ) + "residual": not is_residual_scattered_for_sp(self.vllm_config, num_tokens_padded) } get_pp_group().send_tensor_dict( hidden_states.tensors, @@ -408,9 +393,7 @@ def sample_tokens( # Apply structured output bitmasks if present. 
if grammar_output is not None: - apply_grammar_bitmask( - scheduler_output, grammar_output, self.input_batch, logits - ) + apply_grammar_bitmask(scheduler_output, grammar_output, self.input_batch, logits) with record_function_or_nullcontext("gpu_model_runner: sample"): sampler_output = self._sample(logits, spec_decode_metadata) @@ -450,23 +433,19 @@ def propose_draft_token_ids(sampled_token_ids): propose_draft_token_ids(sampled_token_ids) elif self.valid_sampled_token_count_event is not None: assert spec_decode_common_attn_metadata is not None - next_token_ids, valid_sampled_tokens_count = ( - self.drafter.prepare_next_token_ids_padded( - spec_decode_common_attn_metadata, - sampled_token_ids, - self.requests, - self.input_batch, - self.discard_request_mask.gpu, - ) - ) - self._copy_valid_sampled_token_count( - next_token_ids, valid_sampled_tokens_count + next_token_ids, valid_sampled_tokens_count = self.drafter.prepare_next_token_ids_padded( + spec_decode_common_attn_metadata, + sampled_token_ids, + self.requests, + self.input_batch, + self.discard_request_mask.gpu, ) + self._copy_valid_sampled_token_count(next_token_ids, valid_sampled_tokens_count) # Since we couldn't run the drafter, # just use zeros for the draft tokens. 
- self._draft_token_ids = torch.zeros( - 1, device=self.device, dtype=torch.int32 - ).expand(len(self.input_batch.req_ids), self.num_spec_tokens) + self._draft_token_ids = torch.zeros(1, device=self.device, dtype=torch.int32).expand( + len(self.input_batch.req_ids), self.num_spec_tokens + ) self._copy_draft_token_ids_to_cpu(scheduler_output, zeros_only=True) else: propose_drafts_after_bookkeeping = input_fits_in_drafter diff --git a/vllm_omni/worker/gpu_ar_worker.py b/vllm_omni/worker/gpu_ar_worker.py index 599dea31f2f..d2dafce6877 100644 --- a/vllm_omni/worker/gpu_ar_worker.py +++ b/vllm_omni/worker/gpu_ar_worker.py @@ -3,19 +3,18 @@ import torch from vllm.logger import init_logger -from vllm.utils.torch_utils import set_random_seed from vllm.platforms import current_platform from vllm.utils.mem_utils import MemorySnapshot, format_gib +from vllm.utils.torch_utils import set_random_seed from vllm.v1.utils import report_usage_stats from vllm.v1.worker.gpu_worker import Worker as GPUWorker from vllm.v1.worker.gpu_worker import init_worker_distributed_environment +from vllm.v1.worker.utils import request_memory +from vllm.v1.worker.workspace import init_workspace_manager from vllm_omni.worker.gpu_ar_model_runner import GPUARModelRunner -from vllm.v1.worker.workspace import init_workspace_manager -from vllm.v1.worker.utils import request_memory -from vllm.logger import init_logger -logger = init_logger(__name__) +logger = init_logger(__name__) class GPUARWorker(GPUWorker): @@ -31,8 +30,7 @@ def init_device(self): os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) parallel_config = self.parallel_config if ( - parallel_config.distributed_executor_backend - not in ("ray", "external_launcher") + parallel_config.distributed_executor_backend not in ("ray", "external_launcher") and parallel_config.data_parallel_backend != "ray" and parallel_config.nnodes_within_dp == 1 ): @@ -42,8 +40,7 @@ def init_device(self): dp_local_rank = self.parallel_config.data_parallel_index 
tp_pp_world_size = ( - self.parallel_config.pipeline_parallel_size - * self.parallel_config.tensor_parallel_size + self.parallel_config.pipeline_parallel_size * self.parallel_config.tensor_parallel_size ) # DP_LOCAL_RANK * TP_PP_WORLD_SIZE + TP_LOCAL_RANK @@ -51,9 +48,7 @@ def init_device(self): assert self.local_rank < torch.cuda.device_count(), ( f"DP adjusted local rank {self.local_rank} is out of bounds. " ) - visible_device_count = ( - torch.cuda.device_count() if torch.cuda.is_available() else 0 - ) + visible_device_count = torch.cuda.device_count() if torch.cuda.is_available() else 0 assert self.parallel_config.local_world_size <= visible_device_count, ( f"local_world_size ({self.parallel_config.local_world_size}) must " f"be less than or equal to the number of visible devices " @@ -87,9 +82,7 @@ def init_device(self): self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device) self.requested_memory = request_memory(init_snapshot, self.cache_config) logger.debug("worker init memory snapshot: %r", self.init_snapshot) - logger.debug( - "worker requested memory: %sGiB", format_gib(self.requested_memory) - ) + logger.debug("worker requested memory: %sGiB", format_gib(self.requested_memory)) else: raise RuntimeError(f"Not support device type: {self.device_config.device}") @@ -102,4 +95,4 @@ def init_device(self): if self.rank == 0: # If usage stat is enabled, collect relevant info. 
- report_usage_stats(self.vllm_config) \ No newline at end of file + report_usage_stats(self.vllm_config) diff --git a/vllm_omni/worker/gpu_generation_model_runner.py b/vllm_omni/worker/gpu_generation_model_runner.py index 8410b6f7383..011c74d2e83 100644 --- a/vllm_omni/worker/gpu_generation_model_runner.py +++ b/vllm_omni/worker/gpu_generation_model_runner.py @@ -5,16 +5,23 @@ """ from __future__ import annotations -from copy import copy import gc import logging -from typing import Any +from copy import copy + import numpy as np import torch from vllm.config import CUDAGraphMode +from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer +from vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group +from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( + RoutedExpertsCapturer, +) +from vllm.model_executor.models.interfaces import supports_mm_encoder_only from vllm.utils.math_utils import cdiv -from vllm.v1.core.sched.output import SchedulerOutput, GrammarOutput +from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput +from vllm.v1.outputs import AsyncModelRunnerOutput, make_empty_encoder_model_runner_output from vllm.v1.spec_decode.eagle import EagleProposer from vllm.v1.utils import record_function_or_nullcontext from vllm.v1.worker.gpu_model_runner import ( @@ -25,20 +32,13 @@ get_pp_group, set_forward_context, ) -from vllm.model_executor.models.interfaces import supports_mm_encoder_only -from vllm.v1.worker.utils import sanity_check_mm_encoder_outputs from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices +from vllm.v1.worker.utils import sanity_check_mm_encoder_outputs + from vllm_omni.outputs import OmniModelRunnerOutput -from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner -from vllm.model_executor.layers.fused_moe.routed_experts_capturer import ( - RoutedExpertsCapturer, -) -from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer -from 
vllm.distributed.kv_transfer import get_kv_transfer_group, has_kv_transfer_group -from vllm.v1.outputs import make_empty_encoder_model_runner_output -from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices -from vllm.v1.outputs import AsyncModelRunnerOutput from vllm_omni.worker.gpu_ar_model_runner import ExecuteModelState +from vllm_omni.worker.gpu_model_runner import OmniGPUModelRunner + logger = logging.getLogger(__name__) @@ -57,10 +57,7 @@ def execute_model( intermediate_tensors: IntermediateTensors | None = None, ) -> OmniModelRunnerOutput | IntermediateTensors: if self.execute_model_state is not None: - raise RuntimeError( - "State error: sample_tokens() must be called " - "after execute_model() returns None." - ) + raise RuntimeError("State error: sample_tokens() must be called after execute_model() returns None.") if self.vllm_config.model_config.enable_return_routed_experts: capturer = RoutedExpertsCapturer.get_instance() @@ -70,10 +67,8 @@ def execute_model( logger.error("RoutedExpertsCapturer not initialized.") if scheduler_output.preempted_req_ids and has_kv_transfer_group(): - get_kv_transfer_group().handle_preemptions( - scheduler_output.preempted_req_ids - ) - + get_kv_transfer_group().handle_preemptions(scheduler_output.preempted_req_ids) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens with ( record_function_or_nullcontext("gpu_model_runner: preprocess"), @@ -82,7 +77,7 @@ def execute_model( self._update_states(scheduler_output) if not scheduler_output.total_num_scheduled_tokens: return EMPTY_MODEL_RUNNER_OUTPUT - + if has_ec_transfer() and get_ec_transfer().is_producer: with self.maybe_get_ec_connector_output( scheduler_output, @@ -93,8 +88,7 @@ def execute_model( if not num_scheduled_tokens: if ( - self.parallel_config.distributed_executor_backend - == "external_launcher" + self.parallel_config.distributed_executor_backend == "external_launcher" and self.parallel_config.data_parallel_size > 1 ): # this is a corner 
case when both external launcher @@ -107,7 +101,7 @@ def execute_model( if not has_kv_transfer_group(): # Return empty ModelRunnerOutput if no work to do. return EMPTY_MODEL_RUNNER_OUTPUT - + return self.kv_connector_no_forward(scheduler_output, self.vllm_config) if self.cache_config.kv_sharing_fast_prefill: @@ -127,7 +121,7 @@ def execute_model( scheduler_output, num_scheduled_tokens_np, ) - + cascade_attn_prefix_lens = None # Disable cascade attention when using microbatching (DBO) if self.cascade_attn_enabled and not self.parallel_config.use_ubatching: @@ -137,7 +131,7 @@ def execute_model( self.input_batch.num_computed_tokens_cpu[:num_reqs], scheduler_output.num_common_prefix_blocks, ) - + ( cudagraph_mode, batch_desc, @@ -163,9 +157,7 @@ def execute_model( ) num_tokens_padded = batch_desc.num_tokens - num_reqs_padded = ( - batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs - ) + num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices( should_ubatch, num_scheduled_tokens_np, @@ -185,19 +177,17 @@ def execute_model( use_spec_decode = len(scheduler_output.scheduled_spec_decode_tokens) > 0 ubatch_slices_attn = ubatch_slices_padded if pad_attn else ubatch_slices - attn_metadata, spec_decode_common_attn_metadata = ( - self._build_attention_metadata( - num_tokens=num_tokens_unpadded, - num_tokens_padded=num_tokens_padded if pad_attn else None, - num_reqs=num_reqs, - num_reqs_padded=num_reqs_padded if pad_attn else None, - max_query_len=max_num_scheduled_tokens, - ubatch_slices=ubatch_slices_attn, - logits_indices=logits_indices, - use_spec_decode=use_spec_decode, - num_scheduled_tokens=scheduler_output.num_scheduled_tokens, - cascade_attn_prefix_lens=cascade_attn_prefix_lens, - ) + attn_metadata, spec_decode_common_attn_metadata = self._build_attention_metadata( + num_tokens=num_tokens_unpadded, + num_tokens_padded=num_tokens_padded if pad_attn else None, 
+ num_reqs=num_reqs, + num_reqs_padded=num_reqs_padded if pad_attn else None, + max_query_len=max_num_scheduled_tokens, + ubatch_slices=ubatch_slices_attn, + logits_indices=logits_indices, + use_spec_decode=use_spec_decode, + num_scheduled_tokens=scheduler_output.num_scheduled_tokens, + cascade_attn_prefix_lens=cascade_attn_prefix_lens, ) ( @@ -260,14 +250,15 @@ def execute_model( ) self.kv_connector_output = kv_connector_output return None - + @torch.inference_mode() def sample_tokens( self, grammar_output: GrammarOutput | None = None, ) -> OmniModelRunnerOutput | AsyncModelRunnerOutput | IntermediateTensors: - # NOTE: Even though the model is non-autoregressive, we still need to have this function to match the interface of the engine core. - # In this case, this function + # NOTE: Even though the model is non-autoregressive, we still need + # this function to match the interface of the engine core. + # In this case, this function kv_connector_output = self.kv_connector_output self.kv_connector_output = None @@ -331,9 +322,7 @@ def sample_tokens( kv_connector_output=kv_connector_output, num_nans_in_logits={}, cudagraph_stats=cudagraph_stats, - ec_connector_output=ec_connector_output - if self.supports_mm_inputs - else None, + ec_connector_output=ec_connector_output if self.supports_mm_inputs else None, ) if not self.use_async_scheduling: @@ -435,10 +424,7 @@ def _dummy_run( # mm encoder dummy run may need to add in the future. return torch.tensor([]), torch.tensor([]) - assert ( - cudagraph_runtime_mode is None - or cudagraph_runtime_mode.valid_runtime_modes() - ) + assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes() # If cudagraph_mode.decode_mode() == FULL and # cudagraph_mode.separate_routine(). 
This means that we are using @@ -499,8 +485,7 @@ def _dummy_run( max_num_scheduled_tokens=max_query_len, use_cascade_attn=False, allow_microbatching=allow_microbatching, - force_eager=is_profile - or (cudagraph_runtime_mode == CUDAGraphMode.NONE), + force_eager=is_profile or (cudagraph_runtime_mode == CUDAGraphMode.NONE), # `force_uniform_decode` is used for cudagraph capture; because for # capturing mixed prefill-decode batches, we sometimes use # num_tokens == num_reqs which looks like a uniform decode batch to the @@ -522,9 +507,7 @@ def _dummy_run( ) num_tokens_padded = batch_desc.num_tokens - num_reqs_padded = ( - batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs - ) + num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices( should_ubatch, num_scheduled_tokens, @@ -603,17 +586,13 @@ def _dummy_run( intermediate_tensors = None else: if self.intermediate_tensors is None: - self.intermediate_tensors = ( - self.model.make_empty_intermediate_tensors( - batch_size=self.max_num_tokens, - dtype=self.model_config.dtype, - device=self.device, - ) + self.intermediate_tensors = self.model.make_empty_intermediate_tensors( + batch_size=self.max_num_tokens, + dtype=self.model_config.dtype, + device=self.device, ) - intermediate_tensors = self.sync_and_slice_intermediate_tensors( - num_tokens_padded, None, False - ) + intermediate_tensors = self.sync_and_slice_intermediate_tensors(num_tokens_padded, None, False) if ubatch_slices_padded is not None: # Adjust values to reflect a single ubatch. @@ -654,14 +633,8 @@ def _dummy_run( # Therefore only use cudagraphs if the main model uses PIECEWISE # NOTE(lucas): this is a hack, need to clean up. 
use_cudagraphs = ( - ( - is_graph_capturing - and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE - ) - or ( - not is_graph_capturing - and cudagraph_runtime_mode != CUDAGraphMode.NONE - ) + (is_graph_capturing and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE) + or (not is_graph_capturing and cudagraph_runtime_mode != CUDAGraphMode.NONE) ) and not self.speculative_config.enforce_eager # Note(gnovack) - We need to disable cudagraphs for one of the two diff --git a/vllm_omni/worker/gpu_generation_worker.py b/vllm_omni/worker/gpu_generation_worker.py index 6a1a3039211..19f8ab84b99 100644 --- a/vllm_omni/worker/gpu_generation_worker.py +++ b/vllm_omni/worker/gpu_generation_worker.py @@ -2,17 +2,21 @@ import os import torch -from vllm.utils.torch_utils import set_random_seed +from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils.mem_utils import MemorySnapshot, format_gib +from vllm.utils.torch_utils import set_random_seed from vllm.v1.utils import report_usage_stats from vllm.v1.worker.gpu_worker import Worker as GPUWorker from vllm.v1.worker.gpu_worker import init_worker_distributed_environment -from vllm.v1.worker.workspace import init_workspace_manager from vllm.v1.worker.utils import request_memory +from vllm.v1.worker.workspace import init_workspace_manager + from vllm_omni.worker.gpu_generation_model_runner import GPUGenerationModelRunner -from vllm.logger import init_logger + logger = init_logger(__name__) + + class GPUGenerationWorker(GPUWorker): """GPU Worker for Generation model (non-autoregressive waveform generation). 
@@ -26,8 +30,7 @@ def init_device(self): os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) parallel_config = self.parallel_config if ( - parallel_config.distributed_executor_backend - not in ("ray", "external_launcher") + parallel_config.distributed_executor_backend not in ("ray", "external_launcher") and parallel_config.data_parallel_backend != "ray" and parallel_config.nnodes_within_dp == 1 ): @@ -37,8 +40,7 @@ def init_device(self): dp_local_rank = self.parallel_config.data_parallel_index tp_pp_world_size = ( - self.parallel_config.pipeline_parallel_size - * self.parallel_config.tensor_parallel_size + self.parallel_config.pipeline_parallel_size * self.parallel_config.tensor_parallel_size ) # DP_LOCAL_RANK * TP_PP_WORLD_SIZE + TP_LOCAL_RANK @@ -46,9 +48,7 @@ def init_device(self): assert self.local_rank < torch.cuda.device_count(), ( f"DP adjusted local rank {self.local_rank} is out of bounds. " ) - visible_device_count = ( - torch.cuda.device_count() if torch.cuda.is_available() else 0 - ) + visible_device_count = torch.cuda.device_count() if torch.cuda.is_available() else 0 assert self.parallel_config.local_world_size <= visible_device_count, ( f"local_world_size ({self.parallel_config.local_world_size}) must " f"be less than or equal to the number of visible devices " @@ -82,9 +82,7 @@ def init_device(self): self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device) self.requested_memory = request_memory(init_snapshot, self.cache_config) logger.debug("worker init memory snapshot: %r", self.init_snapshot) - logger.debug( - "worker requested memory: %sGiB", format_gib(self.requested_memory) - ) + logger.debug("worker requested memory: %sGiB", format_gib(self.requested_memory)) else: raise RuntimeError(f"Not support device type: {self.device_config.device}") diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 24d4ffd028e..7b86554855e 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ 
b/vllm_omni/worker/gpu_model_runner.py @@ -8,7 +8,7 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding -from vllm.model_executor.models.interfaces import supports_mrope, supports_mm_encoder_only +from vllm.model_executor.models.interfaces import supports_mm_encoder_only, supports_mrope from vllm.model_executor.models.interfaces_base import VllmModelForPooling from vllm.sampling_params import SamplingType from vllm.utils.import_utils import LazyLoader @@ -17,6 +17,7 @@ from vllm.v1.worker.gpu_input_batch import CachedRequestState from vllm.v1.worker.gpu_model_runner import GPUModelRunner, IntermediateTensors, PerLayerAttnMetadata from vllm.v1.worker.ubatch_utils import maybe_create_ubatch_slices + from vllm_omni.model_executor.models.output_templates import OmniOutput if TYPE_CHECKING: @@ -248,7 +249,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: except Exception as e: logger.error(f"Error decoding additional information: {e}") pass - + if sampling_params and sampling_params.prompt_logprobs is not None: self.num_prompt_logprobs[req_id] = ( self.input_batch.vocab_size @@ -258,11 +259,11 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Only relevant for models using M-RoPE (e.g, Qwen2-VL) if self.uses_mrope: self._init_mrope_positions(req_state) - + # Only relevant for models using XD-RoPE (e.g, HunYuan-VL) if self.uses_xdrope_dim > 0: self._init_xdrope_positions(req_state) - + reqs_to_add.append(self.requests[req_id]) # Update the states of the running/resumed requests. 
@@ -281,14 +282,14 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: resumed_from_preemption = req_id in req_data.resumed_req_ids num_output_tokens = req_data.num_output_tokens[i] req_index = self.input_batch.req_id_to_index.get(req_id) - + if req_state.prev_num_draft_len and self.use_async_scheduling: # prev_num_draft_len is used in async scheduling mode with # spec decode. it indicates if need to update num_computed_tokens # of the request. for example: # fist step: num_computed_tokens = 0, spec_tokens = [], # prev_num_draft_len = 0. - # second step: num_computed_tokens = 100(prompt lenth), + # second step: num_computed_tokens = 100(prompt length), # spec_tokens = [a,b], prev_num_draft_len = 0. # third step: num_computed_tokens = 100 + 2, spec_tokens = [c,d], # prev_num_draft_len = 2. @@ -305,7 +306,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: num_rejected = req_state.prev_num_draft_len - num_accepted num_computed_tokens -= num_rejected req_state.output_token_ids.extend([-1] * num_accepted) - + # Update the cached states. req_state.num_computed_tokens = num_computed_tokens @@ -327,12 +328,9 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # failure. Align the cached state. del req_state.output_token_ids[num_output_tokens:] if req_index is not None: - end_idx = ( - self.input_batch.num_prompt_tokens[req_index] - + num_output_tokens - ) + end_idx = self.input_batch.num_prompt_tokens[req_index] + num_output_tokens self.input_batch.num_tokens_no_spec[req_index] = end_idx - + # Update the block IDs. if not resumed_from_preemption: if new_block_ids is not None: @@ -372,15 +370,12 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Add new_token_ids to token_ids_cpu. 
start_token_index = num_computed_tokens end_token_index = num_computed_tokens + len(new_token_ids) - self.input_batch.token_ids_cpu[ - req_index, start_token_index:end_token_index - ] = new_token_ids + self.input_batch.token_ids_cpu[req_index, start_token_index:end_token_index] = new_token_ids self.input_batch.num_tokens_no_spec[req_index] = end_token_index # Add spec_token_ids to token_ids_cpu. self.input_batch.update_req_spec_token_ids(req_state, scheduled_spec_tokens) - # Add the new or resumed requests to the persistent batch. # The smaller empty indices are filled first. for request in reqs_to_add: @@ -457,10 +452,7 @@ def _dummy_run( # mm encoder dummy run may need to add in the future. return torch.tensor([]), torch.tensor([]) - assert ( - cudagraph_runtime_mode is None - or cudagraph_runtime_mode.valid_runtime_modes() - ) + assert cudagraph_runtime_mode is None or cudagraph_runtime_mode.valid_runtime_modes() # If cudagraph_mode.decode_mode() == FULL and # cudagraph_mode.separate_routine(). 
This means that we are using @@ -521,8 +513,7 @@ def _dummy_run( max_num_scheduled_tokens=max_query_len, use_cascade_attn=False, allow_microbatching=allow_microbatching, - force_eager=is_profile - or (cudagraph_runtime_mode == CUDAGraphMode.NONE), + force_eager=is_profile or (cudagraph_runtime_mode == CUDAGraphMode.NONE), # `force_uniform_decode` is used for cudagraph capture; because for # capturing mixed prefill-decode batches, we sometimes use # num_tokens == num_reqs which looks like a uniform decode batch to the @@ -544,9 +535,7 @@ def _dummy_run( ) num_tokens_padded = batch_desc.num_tokens - num_reqs_padded = ( - batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs - ) + num_reqs_padded = batch_desc.num_reqs if batch_desc.num_reqs is not None else num_reqs ubatch_slices, ubatch_slices_padded = maybe_create_ubatch_slices( should_ubatch, num_scheduled_tokens, @@ -625,17 +614,13 @@ def _dummy_run( intermediate_tensors = None else: if self.intermediate_tensors is None: - self.intermediate_tensors = ( - self.model.make_empty_intermediate_tensors( - batch_size=self.max_num_tokens, - dtype=self.model_config.dtype, - device=self.device, - ) + self.intermediate_tensors = self.model.make_empty_intermediate_tensors( + batch_size=self.max_num_tokens, + dtype=self.model_config.dtype, + device=self.device, ) - intermediate_tensors = self.sync_and_slice_intermediate_tensors( - num_tokens_padded, None, False - ) + intermediate_tensors = self.sync_and_slice_intermediate_tensors(num_tokens_padded, None, False) if ubatch_slices_padded is not None: # Adjust values to reflect a single ubatch. @@ -676,14 +661,8 @@ def _dummy_run( # Therefore only use cudagraphs if the main model uses PIECEWISE # NOTE(lucas): this is a hack, need to clean up. 
use_cudagraphs = ( - ( - is_graph_capturing - and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE - ) - or ( - not is_graph_capturing - and cudagraph_runtime_mode != CUDAGraphMode.NONE - ) + (is_graph_capturing and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE) + or (not is_graph_capturing and cudagraph_runtime_mode != CUDAGraphMode.NONE) ) and not self.speculative_config.enforce_eager # Note(gnovack) - We need to disable cudagraphs for one of the two @@ -721,9 +700,7 @@ def _dummy_run( self.eplb_step(is_dummy=True, is_profile=is_profile) logit_indices = np.cumsum(num_scheduled_tokens) - 1 - logit_indices_device = torch.from_numpy(logit_indices).to( - self.device, non_blocking=True - ) + logit_indices_device = torch.from_numpy(logit_indices).to(self.device, non_blocking=True) return hidden_states, hidden_states[logit_indices_device] def _decode_and_store_request_payloads(self, scheduler_output: "SchedulerOutput") -> None: @@ -1005,9 +982,10 @@ def _preprocess( except Exception as e: logger.error(f"Error in preprocess for request {req_id}: {e}") import traceback + traceback.print_exc() raise e - #TODO: This is Model Specific Code, need to be generalized in the future ZTC + # TODO: This is Model Specific Code, need to be generalized in the future ZTC # run talker mtp decode if hasattr(self.model, "talker_mtp"): _cudagraph_mode, batch_desc, _, _, _ = self._determine_batch_execution_and_padding( From a7f99260136dc0afebbee52549914733f6705b79 Mon Sep 17 00:00:00 2001 From: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Date: Fri, 16 Jan 2026 16:32:28 +0800 Subject: [PATCH 44/59] [Model] add flux2 klein (#809) Signed-off-by: David Chen <530634352@qq.com> --- docs/models/supported_models.md | 1 + .../diffusion/models/flux2_klein/__init__.py | 17 + .../flux2_klein/flux2_klein_transformer.py | 723 +++++++++++++ .../flux2_klein/pipeline_flux2_klein.py | 963 ++++++++++++++++++ vllm_omni/diffusion/registry.py | 6 + 5 files changed, 1710 insertions(+) create 
mode 100644 vllm_omni/diffusion/models/flux2_klein/__init__.py create mode 100644 vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py create mode 100644 vllm_omni/diffusion/models/flux2_klein/pipeline_flux2_klein.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 1b993152837..cd70019de32 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -32,6 +32,7 @@ th { |`LongcatImagePipeline` | LongCat-Image | `meituan-longcat/LongCat-Image` | |`LongCatImageEditPipeline` | LongCat-Image-Edit | `meituan-longcat/LongCat-Image-Edit` | |`StableDiffusion3Pipeline` | Stable-Diffusion-3 | `stabilityai/stable-diffusion-3.5-medium` | +|`Flux2KleinPipeline` | FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B`, `black-forest-labs/FLUX.2-klein-9B` | |`StableAudioPipeline` | Stable-Audio-Open | `stabilityai/stable-audio-open-1.0` | diff --git a/vllm_omni/diffusion/models/flux2_klein/__init__.py b/vllm_omni/diffusion/models/flux2_klein/__init__.py new file mode 100644 index 00000000000..0d477ab0a48 --- /dev/null +++ b/vllm_omni/diffusion/models/flux2_klein/__init__.py @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Flux2 klein diffusion model components.""" + +from vllm_omni.diffusion.models.flux2_klein.flux2_klein_transformer import ( + Flux2Transformer2DModel, +) +from vllm_omni.diffusion.models.flux2_klein.pipeline_flux2_klein import ( + Flux2KleinPipeline, + get_flux2_klein_post_process_func, +) + +__all__ = [ + "Flux2KleinPipeline", + "Flux2Transformer2DModel", + "get_flux2_klein_post_process_func", +] diff --git a/vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py b/vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py new file mode 100644 index 00000000000..86658a01deb --- /dev/null +++ b/vllm_omni/diffusion/models/flux2_klein/flux2_klein_transformer.py @@ -0,0 +1,723 @@ +# 
SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Iterable +from types import SimpleNamespace +from typing import Any + +import torch +import torch.nn as nn +from diffusers.models.embeddings import ( + TimestepEmbedding, + Timesteps, + get_1d_rotary_pos_embed, +) +from diffusers.models.modeling_outputs import Transformer2DModelOutput +from diffusers.models.normalization import AdaLayerNormContinuous +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import QKVParallelLinear, ReplicatedLinear +from vllm.model_executor.model_loader.weight_utils import default_weight_loader + +from vllm_omni.diffusion.attention.backends.abstract import AttentionMetadata +from vllm_omni.diffusion.attention.layer import Attention +from vllm_omni.diffusion.layers.rope import RotaryEmbedding + + +class Flux2SwiGLU(nn.Module): + """SwiGLU activation used by Flux2.""" + + def __init__(self): + super().__init__() + self.gate_fn = nn.SiLU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x1, x2 = x.chunk(2, dim=-1) + return self.gate_fn(x1) * x2 + + +class Flux2FeedForward(nn.Module): + def __init__( + self, + dim: int, + dim_out: int | None = None, + mult: float = 3.0, + inner_dim: int | None = None, + 
class Flux2Attention(nn.Module):
    """Joint attention over image tokens and (optionally) text tokens.

    Image tokens are projected by ``to_qkv``. When the layer is built with
    ``added_kv_proj_dim``, text tokens get their own fused projection
    (``add_kv_proj``), are prepended to the image tokens along the sequence
    dimension (dim 1), attended jointly, and split back apart afterwards.

    Args:
        query_dim: Feature size of ``hidden_states``.
        heads: Number of attention heads (ignored when ``out_dim`` is given, in
            which case it is derived as ``out_dim // dim_head``).
        dim_head: Size of each attention head.
        dropout: Dropout probability applied after the output projection.
        bias: Whether the fused QKV projection has a bias.
        added_kv_proj_dim: Feature size of ``encoder_hidden_states``. ``None``
            builds a layer without the text-side projections.
        added_proj_bias: Whether the text-side fused QKV projection has a bias.
        out_bias: Whether the output projections have a bias.
        eps: Epsilon for the per-head RMS norms on queries and keys.
        out_dim: Optional override for the attention inner/output width.
        elementwise_affine: Accepted for signature compatibility; not used by
            this implementation.
    """

    def __init__(
        self,
        query_dim: int,
        heads: int = 8,
        dim_head: int = 64,
        dropout: float = 0.0,
        bias: bool = False,
        added_kv_proj_dim: int | None = None,
        added_proj_bias: bool | None = True,
        out_bias: bool = True,
        eps: float = 1e-5,
        out_dim: int | None = None,
        elementwise_affine: bool = True,
    ):
        super().__init__()
        self.head_dim = dim_head
        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
        self.query_dim = query_dim
        self.out_dim = out_dim if out_dim is not None else query_dim
        self.heads = out_dim // dim_head if out_dim is not None else heads
        self.dropout = dropout
        self.added_kv_proj_dim = added_kv_proj_dim

        # Fused Q/K/V projection for the image stream.
        self.to_qkv = QKVParallelLinear(
            hidden_size=query_dim,
            head_size=self.head_dim,
            total_num_heads=self.heads,
            disable_tp=True,
            bias=bias,
        )

        self.norm_q = RMSNorm(dim_head, eps=eps)
        self.norm_k = RMSNorm(dim_head, eps=eps)

        # Kept as a two-element ModuleList (projection, dropout) so parameter
        # names stay ``to_out.0.*``.
        self.to_out = nn.ModuleList(
            [ReplicatedLinear(self.inner_dim, self.out_dim, bias=out_bias), nn.Dropout(dropout)]
        )

        if added_kv_proj_dim is not None:
            # Text-stream counterparts of the norms / projections above.
            self.norm_added_q = RMSNorm(dim_head, eps=eps)
            self.norm_added_k = RMSNorm(dim_head, eps=eps)
            self.add_kv_proj = QKVParallelLinear(
                hidden_size=added_kv_proj_dim,
                head_size=self.head_dim,
                total_num_heads=self.heads,
                disable_tp=True,
                bias=added_proj_bias,
            )
            self.to_add_out = ReplicatedLinear(self.inner_dim, query_dim, bias=out_bias)

        self.rope = RotaryEmbedding(is_neox_style=False)
        self.attn = Attention(
            num_heads=self.heads,
            head_size=self.head_dim,
            softmax_scale=1.0 / (self.head_dim**0.5),
            causal=False,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
        **kwargs,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Run attention and the output projection(s).

        Args:
            hidden_states: Image tokens; the sequence axis is dim 1.
            encoder_hidden_states: Optional text tokens; requires the layer to
                have been built with ``added_kv_proj_dim``.
            attention_mask: Optional mask; a 3-D mask gets an extra axis
                inserted at dim 1 before being handed to the attention backend.
            image_rotary_emb: Optional ``(cos, sin)`` rotary tables applied to
                queries and keys of the (possibly concatenated) sequence.
            **kwargs: Ignored.

        Returns:
            ``hidden_states`` alone when no text tokens are given, otherwise
            ``(hidden_states, encoder_hidden_states)``.

        Raises:
            ValueError: If ``encoder_hidden_states`` is given but the layer has
                no text-side projections.
        """
        has_context = encoder_hidden_states is not None
        if has_context and self.added_kv_proj_dim is None:
            # Previously this path skipped the concatenation but still tried to
            # split the output and call the non-existent ``to_add_out``.
            raise ValueError(
                "encoder_hidden_states was provided, but this Flux2Attention layer was "
                "constructed without added_kv_proj_dim."
            )

        qkv, _ = self.to_qkv(hidden_states)
        query, key, value = (part.unflatten(-1, (self.heads, -1)) for part in qkv.chunk(3, dim=-1))
        query = self.norm_q(query)
        key = self.norm_k(key)

        if has_context:
            encoder_qkv, _ = self.add_kv_proj(encoder_hidden_states)
            encoder_query, encoder_key, encoder_value = (
                part.unflatten(-1, (self.heads, -1)) for part in encoder_qkv.chunk(3, dim=-1)
            )
            encoder_query = self.norm_added_q(encoder_query)
            encoder_key = self.norm_added_k(encoder_key)

            # Text tokens come first in the joint sequence.
            query = torch.cat([encoder_query, query], dim=1)
            key = torch.cat([encoder_key, key], dim=1)
            value = torch.cat([encoder_value, value], dim=1)

        if image_rotary_emb is not None:
            cos, sin = image_rotary_emb
            cos = cos.to(query.dtype)
            sin = sin.to(query.dtype)
            query = self.rope(query, cos, sin)
            key = self.rope(key, cos, sin)

        attn_metadata = None
        if attention_mask is not None:
            if attention_mask.dim() == 3:
                attention_mask = attention_mask.unsqueeze(1)
            attn_metadata = AttentionMetadata(attn_mask=attention_mask)

        hidden_states = self.attn(query, key, value, attn_metadata)
        # Merge the (heads, head_dim) axes back into one feature axis.
        hidden_states = hidden_states.flatten(2, 3).to(query.dtype)

        if has_context:
            context_len = encoder_hidden_states.shape[1]
            encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
                [context_len, hidden_states.shape[1] - context_len],
                dim=1,
            )
            encoder_hidden_states, _ = self.to_add_out(encoder_hidden_states)

        hidden_states, _ = self.to_out[0](hidden_states)
        hidden_states = self.to_out[1](hidden_states)

        if has_context:
            return hidden_states, encoder_hidden_states
        return hidden_states
+ image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None, + **kwargs, + ) -> torch.Tensor: + hidden_states = self.to_qkv_mlp_proj(hidden_states) + qkv, mlp_hidden_states = torch.split( + hidden_states, + [3 * self.inner_dim, self.mlp_hidden_dim * self.mlp_mult_factor], + dim=-1, + ) + + query, key, value = qkv.chunk(3, dim=-1) + query = query.unflatten(-1, (self.heads, -1)) + key = key.unflatten(-1, (self.heads, -1)) + value = value.unflatten(-1, (self.heads, -1)) + + query = self.norm_q(query) + key = self.norm_k(key) + + if image_rotary_emb is not None: + cos, sin = image_rotary_emb + cos = cos.to(query.dtype) + sin = sin.to(query.dtype) + query = self.rope(query, cos, sin) + key = self.rope(key, cos, sin) + + attn_metadata = None + if attention_mask is not None: + if attention_mask.dim() == 3: + attention_mask = attention_mask.unsqueeze(1) + attn_metadata = AttentionMetadata(attn_mask=attention_mask) + + attn_output = self.attn(query, key, value, attn_metadata) + attn_output = attn_output.flatten(2, 3).to(query.dtype) + + mlp_hidden_states = self.mlp_act_fn(mlp_hidden_states) + hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=-1) + return self.to_out(hidden_states) + + +class Flux2SingleTransformerBlock(nn.Module): + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + mlp_ratio: float = 3.0, + eps: float = 1e-6, + bias: bool = False, + ): + super().__init__() + self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=eps) + self.attn = Flux2ParallelSelfAttention( + query_dim=dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + bias=bias, + out_bias=bias, + eps=eps, + mlp_ratio=mlp_ratio, + mlp_mult_factor=2, + ) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor | None, + temb_mod_params: tuple[torch.Tensor, torch.Tensor, torch.Tensor], + image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None, + 
joint_attention_kwargs: dict[str, Any] | None = None, + split_hidden_states: bool = False, + text_seq_len: int | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: + if encoder_hidden_states is not None: + text_seq_len = encoder_hidden_states.shape[1] + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) + + mod_shift, mod_scale, mod_gate = temb_mod_params + + norm_hidden_states = self.norm(hidden_states) + norm_hidden_states = (1 + mod_scale) * norm_hidden_states + mod_shift + + joint_attention_kwargs = joint_attention_kwargs or {} + attn_output = self.attn( + hidden_states=norm_hidden_states, + image_rotary_emb=image_rotary_emb, + **joint_attention_kwargs, + ) + + hidden_states = hidden_states + mod_gate * attn_output + if hidden_states.dtype == torch.float16: + hidden_states = hidden_states.clip(-65504, 65504) + + if split_hidden_states: + encoder_hidden_states, hidden_states = hidden_states[:, :text_seq_len], hidden_states[:, text_seq_len:] + return encoder_hidden_states, hidden_states + return hidden_states + + +class Flux2TransformerBlock(nn.Module): + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + mlp_ratio: float = 3.0, + eps: float = 1e-6, + bias: bool = False, + ): + super().__init__() + self.norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps) + self.norm1_context = nn.LayerNorm(dim, elementwise_affine=False, eps=eps) + + self.attn = Flux2Attention( + query_dim=dim, + added_kv_proj_dim=dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + bias=bias, + added_proj_bias=bias, + out_bias=bias, + eps=eps, + ) + + self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps) + self.ff = Flux2FeedForward(dim=dim, dim_out=dim, mult=mlp_ratio, bias=bias) + + self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=eps) + self.ff_context = Flux2FeedForward(dim=dim, dim_out=dim, mult=mlp_ratio, bias=bias) + + def forward( + self, + 
hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + temb_mod_params_img: tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...], + temb_mod_params_txt: tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...], + image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None, + joint_attention_kwargs: dict[str, Any] | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + joint_attention_kwargs = joint_attention_kwargs or {} + + (shift_msa, scale_msa, gate_msa), (shift_mlp, scale_mlp, gate_mlp) = temb_mod_params_img + (c_shift_msa, c_scale_msa, c_gate_msa), (c_shift_mlp, c_scale_mlp, c_gate_mlp) = temb_mod_params_txt + + norm_hidden_states = self.norm1(hidden_states) + norm_hidden_states = (1 + scale_msa) * norm_hidden_states + shift_msa + + norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states) + norm_encoder_hidden_states = (1 + c_scale_msa) * norm_encoder_hidden_states + c_shift_msa + + attn_output, context_attn_output = self.attn( + hidden_states=norm_hidden_states, + encoder_hidden_states=norm_encoder_hidden_states, + image_rotary_emb=image_rotary_emb, + **joint_attention_kwargs, + ) + + attn_output = gate_msa * attn_output + hidden_states = hidden_states + attn_output + + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp + ff_output = self.ff(norm_hidden_states) + hidden_states = hidden_states + gate_mlp * ff_output + + context_attn_output = c_gate_msa * context_attn_output + encoder_hidden_states = encoder_hidden_states + context_attn_output + + norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states) + norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp) + c_shift_mlp + context_ff_output = self.ff_context(norm_encoder_hidden_states) + encoder_hidden_states = encoder_hidden_states + c_gate_mlp * context_ff_output + if encoder_hidden_states.dtype == torch.float16: + encoder_hidden_states = 
class Flux2PosEmbed(nn.Module):
    """Multi-axis rotary position tables for Flux2.

    Each entry of ``axes_dim`` is the rotary width assigned to one coordinate
    axis of the position ids; the per-axis tables are concatenated along the
    last dimension.
    """

    def __init__(self, theta: int, axes_dim: list[int]):
        super().__init__()
        self.theta = theta
        self.axes_dim = axes_dim

    def forward(self, ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Return ``(cos, sin)`` tables for the position ids in ``ids``.

        ``ids[..., i]`` holds the coordinate along axis ``i``.
        """
        positions = ids.float()
        # MPS and NPU devices use float32 frequencies — presumably because
        # float64 is unavailable there (TODO confirm); everything else uses float64.
        on_limited_device = ids.device.type in ("mps", "npu")
        freqs_dtype = torch.float32 if on_limited_device else torch.float64

        cos_parts: list[torch.Tensor] = []
        sin_parts: list[torch.Tensor] = []
        for axis, axis_dim in enumerate(self.axes_dim):
            # use_real=False yields complex frequencies; split them into
            # real (cos) and imaginary (sin) components.
            freqs_cis = get_1d_rotary_pos_embed(
                axis_dim,
                positions[..., axis],
                theta=self.theta,
                use_real=False,
                freqs_dtype=freqs_dtype,
            )
            cos_parts.append(freqs_cis.real)
            sin_parts.append(freqs_cis.imag)

        freqs_cos = torch.cat(cos_parts, dim=-1).to(ids.device)
        freqs_sin = torch.cat(sin_parts, dim=-1).to(ids.device)
        return freqs_cos, freqs_sin
class Flux2Modulation(nn.Module):
    """Project a conditioning embedding into modulation parameter triples.

    The embedding goes through SiLU and a linear layer producing
    ``3 * mod_param_sets`` equally sized chunks along the last dimension,
    grouped into ``mod_param_sets`` triples.
    """

    def __init__(self, dim: int, mod_param_sets: int = 2, bias: bool = False):
        super().__init__()
        self.mod_param_sets = mod_param_sets
        self.linear = nn.Linear(dim, dim * 3 * self.mod_param_sets, bias=bias)
        self.act_fn = nn.SiLU()

    def forward(self, temb: torch.Tensor) -> tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...]:
        """Return ``mod_param_sets`` triples of modulation tensors.

        A 2-D projection gets a singleton axis inserted at dim 1 so the
        result broadcasts over a token axis.
        """
        projected = self.linear(self.act_fn(temb))
        if projected.ndim == 2:
            projected = projected.unsqueeze(1)

        n_chunks = 3 * self.mod_param_sets
        chunks = projected.chunk(n_chunks, dim=-1)
        return tuple(chunks[start : start + 3] for start in range(0, n_chunks, 3))
= (32, 32, 32, 32), + rope_theta: int = 2000, + eps: float = 1e-6, + guidance_embeds: bool = True, + ): + super().__init__() + self.out_channels = out_channels or in_channels + self.inner_dim = num_attention_heads * attention_head_dim + self.config = SimpleNamespace( + patch_size=patch_size, + in_channels=in_channels, + out_channels=self.out_channels, + num_layers=num_layers, + num_single_layers=num_single_layers, + attention_head_dim=attention_head_dim, + num_attention_heads=num_attention_heads, + joint_attention_dim=joint_attention_dim, + timestep_guidance_channels=timestep_guidance_channels, + mlp_ratio=mlp_ratio, + axes_dims_rope=axes_dims_rope, + rope_theta=rope_theta, + eps=eps, + guidance_embeds=guidance_embeds, + ) + + self.pos_embed = Flux2PosEmbed(theta=rope_theta, axes_dim=list(axes_dims_rope)) + self.time_guidance_embed = Flux2TimestepGuidanceEmbeddings( + in_channels=timestep_guidance_channels, + embedding_dim=self.inner_dim, + bias=False, + guidance_embeds=guidance_embeds, + ) + + self.double_stream_modulation_img = Flux2Modulation(self.inner_dim, mod_param_sets=2, bias=False) + self.double_stream_modulation_txt = Flux2Modulation(self.inner_dim, mod_param_sets=2, bias=False) + self.single_stream_modulation = Flux2Modulation(self.inner_dim, mod_param_sets=1, bias=False) + + self.x_embedder = nn.Linear(in_channels, self.inner_dim, bias=False) + self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim, bias=False) + + self.transformer_blocks = nn.ModuleList( + [ + Flux2TransformerBlock( + dim=self.inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + mlp_ratio=mlp_ratio, + eps=eps, + bias=False, + ) + for _ in range(num_layers) + ] + ) + + self.single_transformer_blocks = nn.ModuleList( + [ + Flux2SingleTransformerBlock( + dim=self.inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + mlp_ratio=mlp_ratio, + eps=eps, + bias=False, + ) + for _ in 
range(num_single_layers) + ] + ) + + self.norm_out = AdaLayerNormContinuous( + self.inner_dim, + self.inner_dim, + elementwise_affine=False, + eps=eps, + bias=False, + ) + self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=False) + + @property + def dtype(self) -> torch.dtype: + return next(self.parameters()).dtype + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + timestep: torch.LongTensor, + img_ids: torch.Tensor, + txt_ids: torch.Tensor, + guidance: torch.Tensor | None = None, + joint_attention_kwargs: dict[str, Any] | None = None, + return_dict: bool = True, + ) -> torch.Tensor | Transformer2DModelOutput: + joint_attention_kwargs = joint_attention_kwargs or {} + + num_txt_tokens = encoder_hidden_states.shape[1] + + timestep = timestep.to(hidden_states.dtype) * 1000 + if guidance is not None: + guidance = guidance.to(hidden_states.dtype) * 1000 + + temb = self.time_guidance_embed(timestep, guidance) + + double_stream_mod_img = self.double_stream_modulation_img(temb) + double_stream_mod_txt = self.double_stream_modulation_txt(temb) + single_stream_mod = self.single_stream_modulation(temb)[0] + + hidden_states = self.x_embedder(hidden_states) + encoder_hidden_states = self.context_embedder(encoder_hidden_states) + + if img_ids.ndim == 3: + img_ids = img_ids[0] + if txt_ids.ndim == 3: + txt_ids = txt_ids[0] + + image_rotary_emb = self.pos_embed(img_ids) + text_rotary_emb = self.pos_embed(txt_ids) + concat_rotary_emb = ( + torch.cat([text_rotary_emb[0], image_rotary_emb[0]], dim=0), + torch.cat([text_rotary_emb[1], image_rotary_emb[1]], dim=0), + ) + + for block in self.transformer_blocks: + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + temb_mod_params_img=double_stream_mod_img, + temb_mod_params_txt=double_stream_mod_txt, + image_rotary_emb=concat_rotary_emb, + 
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
    """Load checkpoint tensors, fusing separate Q/K/V weights on the fly.

    Checkpoint entries named ``*.to_q/.to_k/.to_v`` (and the ``add_*_proj``
    text-side equivalents) are routed into the fused ``to_qkv`` /
    ``add_kv_proj`` parameters via each parameter's ``weight_loader`` with the
    matching shard id. Everything else is copied with the parameter's own
    ``weight_loader`` or ``default_weight_loader``.

    Returns:
        The set of (model-side) parameter names that received a weight.
    """
    # (fused model suffix, checkpoint suffix, shard id)
    stacked_params_mapping = [
        (".to_qkv", ".to_q", "q"),
        (".to_qkv", ".to_k", "k"),
        (".to_qkv", ".to_v", "v"),
        (".add_kv_proj", ".add_q_proj", "q"),
        (".add_kv_proj", ".add_k_proj", "k"),
        (".add_kv_proj", ".add_v_proj", "v"),
    ]

    params_dict = dict(self.named_parameters())
    # Buffers named ``*.beta`` / ``*.eps`` are loadable targets as well.
    for buffer_name, buffer in self.named_buffers():
        if buffer_name.endswith((".beta", ".eps")):
            params_dict[buffer_name] = buffer

    loaded_params: set[str] = set()
    for name, loaded_weight in weights:
        # Normalise the doubled "kv" spelling to the model's attribute name.
        if "to_qkvkv_mlp_proj" in name:
            name = name.replace("to_qkvkv_mlp_proj", "to_qkv_mlp_proj")

        # The fused attention+MLP projection contains ".to_q" as a substring
        # but is a single tensor, so it must bypass the stacked mapping.
        is_fused_mlp_proj = "to_qkv_mlp_proj" in name
        stacked = None
        if not is_fused_mlp_proj:
            stacked = next(
                (entry for entry in stacked_params_mapping if entry[1] in name),
                None,
            )

        if stacked is None:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)
        else:
            param_name, weight_name, shard_id = stacked
            name = name.replace(weight_name, param_name)
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)

        loaded_params.add(name)
    return loaded_params
a/vllm_omni/diffusion/models/flux2_klein/pipeline_flux2_klein.py b/vllm_omni/diffusion/models/flux2_klein/pipeline_flux2_klein.py new file mode 100644 index 00000000000..ba29e681c32 --- /dev/null +++ b/vllm_omni/diffusion/models/flux2_klein/pipeline_flux2_klein.py @@ -0,0 +1,963 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Copyright 2025 Black Forest Labs and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import math +import os +from collections.abc import Callable, Iterable +from typing import Any + +import numpy as np +import PIL.Image +import torch +import torch.nn as nn +from diffusers.image_processor import VaeImageProcessor +from diffusers.models.autoencoders.autoencoder_kl_flux2 import AutoencoderKLFlux2 +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import retrieve_latents +from diffusers.schedulers import FlowMatchEulerDiscreteScheduler +from diffusers.utils.torch_utils import randn_tensor +from transformers import Qwen2TokenizerFast, Qwen3ForCausalLM +from vllm.logger import init_logger +from vllm.model_executor.models.utils import AutoWeightsLoader + +from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig +from vllm_omni.diffusion.distributed.utils import get_local_device +from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader +from vllm_omni.diffusion.models.flux2_klein.flux2_klein_transformer import ( + Flux2Transformer2DModel, +) +from vllm_omni.diffusion.models.interface import SupportImageInput +from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs +from vllm_omni.model_executor.model_loader.weight_utils import download_weights_from_hf_specific + +logger = init_logger(__name__) + + +class Flux2ImageProcessor(VaeImageProcessor): + """Image processor to preprocess the reference image for Flux2 klein.""" + + def __init__( + self, + do_resize: bool = True, + vae_scale_factor: int = 16, + vae_latent_channels: int = 32, + do_normalize: bool = True, + do_convert_rgb: bool = True, + ): + super().__init__( + do_resize=do_resize, + vae_scale_factor=vae_scale_factor, + vae_latent_channels=vae_latent_channels, + do_normalize=do_normalize, + do_convert_rgb=do_convert_rgb, + ) + + 
@staticmethod + def check_image_input( + image: PIL.Image.Image, + max_aspect_ratio: int = 8, + min_side_length: int = 64, + max_area: int = 1024 * 1024, + ) -> PIL.Image.Image: + if not isinstance(image, PIL.Image.Image): + raise ValueError(f"Image must be a PIL.Image.Image, got {type(image)}") + + width, height = image.size + if width < min_side_length or height < min_side_length: + raise ValueError(f"Image too small: {width}x{height}. Both dimensions must be at least {min_side_length}px") + + aspect_ratio = max(width / height, height / width) + if aspect_ratio > max_aspect_ratio: + raise ValueError( + f"Aspect ratio too extreme: {width}x{height} (ratio: {aspect_ratio:.1f}:1). " + f"Maximum allowed ratio is {max_aspect_ratio}:1" + ) + + if width * height > max_area: + logger.warning("Image area exceeds recommended maximum; resizing will be applied.") + + return image + + @staticmethod + def _resize_to_target_area(image: PIL.Image.Image, target_area: int = 1024 * 1024) -> PIL.Image.Image: + image_width, image_height = image.size + scale = math.sqrt(target_area / (image_width * image_height)) + width = int(image_width * scale) + height = int(image_height * scale) + return image.resize((width, height), PIL.Image.Resampling.LANCZOS) + + @staticmethod + def _resize_if_exceeds_area(image: PIL.Image.Image, target_area: int = 1024 * 1024) -> PIL.Image.Image: + image_width, image_height = image.size + if image_width * image_height <= target_area: + return image + return Flux2ImageProcessor._resize_to_target_area(image, target_area) + + def _resize_and_crop(self, image: PIL.Image.Image, width: int, height: int) -> PIL.Image.Image: + image_width, image_height = image.size + left = (image_width - width) // 2 + top = (image_height - height) // 2 + right = left + width + bottom = top + height + return image.crop((left, top, right, bottom)) + + @staticmethod + def concatenate_images(images: list[PIL.Image.Image]) -> PIL.Image.Image: + if len(images) == 1: + return 
# Copied from diffusers.pipelines.flux2.pipeline_flux2.compute_empirical_mu
def compute_empirical_mu(image_seq_len: int, num_steps: int) -> float:
    """Return the empirical ``mu`` value for a given sequence length and step count.

    Two linear fits in ``image_seq_len`` give ``mu`` at 10 steps and at 200
    steps. For sequences longer than 4300 tokens the 200-step fit is returned
    directly; otherwise ``mu`` is taken from the straight line (in
    ``num_steps``) passing through those two points.
    """
    slope_10, intercept_10 = 8.73809524e-05, 1.89833333
    slope_200, intercept_200 = 0.00016927, 0.45666666

    mu_at_200 = slope_200 * image_seq_len + intercept_200
    if image_seq_len > 4300:
        return float(mu_at_200)

    mu_at_10 = slope_10 * image_seq_len + intercept_10

    # Line through (10, mu_at_10) and (200, mu_at_200).
    step_slope = (mu_at_200 - mu_at_10) / 190.0
    step_intercept = mu_at_200 - 200.0 * step_slope
    return float(step_slope * num_steps + step_intercept)
is_distilled + self.weights_sources = [ + DiffusersPipelineLoader.ComponentSource( + model_or_path=od_config.model, + subfolder="transformer", + revision=None, + prefix="transformer.", + fall_back_to_pt=True, + ) + ] + + self._execution_device = get_local_device() + model = od_config.model + local_files_only = os.path.exists(model) + + self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained( + model, + subfolder="scheduler", + local_files_only=local_files_only, + ) + self.text_encoder = Qwen3ForCausalLM.from_pretrained( + model, + subfolder="text_encoder", + local_files_only=local_files_only, + ) + self.tokenizer = Qwen2TokenizerFast.from_pretrained( + model, + subfolder="tokenizer", + local_files_only=local_files_only, + ) + self.vae = AutoencoderKLFlux2.from_pretrained( + model, + subfolder="vae", + local_files_only=local_files_only, + ).to(self._execution_device) + + transformer_kwargs = get_transformer_config_kwargs(od_config.tf_model_config, Flux2Transformer2DModel) + self.transformer = Flux2Transformer2DModel(**transformer_kwargs) + + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) if getattr(self, "vae", None) else 8 + self.image_processor = Flux2ImageProcessor(vae_scale_factor=self.vae_scale_factor * 2) + self.tokenizer_max_length = 512 + self.default_sample_size = 128 + + self._guidance_scale = None + self._attention_kwargs = None + self._num_timesteps = None + self._current_timestep = None + self._interrupt = False + + @staticmethod + def _get_qwen3_prompt_embeds( + text_encoder: Qwen3ForCausalLM, + tokenizer: Qwen2TokenizerFast, + prompt: str | list[str], + dtype: torch.dtype | None = None, + device: torch.device | None = None, + max_sequence_length: int = 512, + hidden_states_layers: list[int] = (9, 18, 27), + ): + dtype = text_encoder.dtype if dtype is None else dtype + device = text_encoder.device if device is None else device + + prompt = [prompt] if isinstance(prompt, str) else prompt + + all_input_ids = [] + 
all_attention_masks = [] + + for single_prompt in prompt: + messages = [{"role": "user", "content": single_prompt}] + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + enable_thinking=False, + ) + inputs = tokenizer( + text, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=max_sequence_length, + ) + + all_input_ids.append(inputs["input_ids"]) + all_attention_masks.append(inputs["attention_mask"]) + + input_ids = torch.cat(all_input_ids, dim=0).to(device) + attention_mask = torch.cat(all_attention_masks, dim=0).to(device) + + # Forward pass through the model + output = text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + output_hidden_states=True, + use_cache=False, + ) + + # Only use outputs from intermediate layers and stack them + out = torch.stack([output.hidden_states[k] for k in hidden_states_layers], dim=1) + out = out.to(dtype=dtype, device=device) + + batch_size, num_channels, seq_len, hidden_dim = out.shape + prompt_embeds = out.permute(0, 2, 1, 3).reshape(batch_size, seq_len, num_channels * hidden_dim) + + return prompt_embeds + + @staticmethod + # Copied from diffusers.pipelines.flux2.pipeline_flux2.Flux2Pipeline._prepare_text_ids + def _prepare_text_ids( + x: torch.Tensor, # (B, L, D) or (L, D) + t_coord: torch.Tensor | None = None, + ): + B, L, _ = x.shape + out_ids = [] + + for i in range(B): + t = torch.arange(1) if t_coord is None else t_coord[i] + h = torch.arange(1) + w = torch.arange(1) + seq_positions = torch.arange(L) + + coords = torch.cartesian_prod(t, h, w, seq_positions) + out_ids.append(coords) + + return torch.stack(out_ids) + + @staticmethod + # Copied from diffusers.pipelines.flux2.pipeline_flux2.Flux2Pipeline._prepare_latent_ids + def _prepare_latent_ids( + latents: torch.Tensor, # (B, C, H, W) + ): + r""" + Generates 4D position coordinates (T, H, W, L) for latent tensors. 
+ + Args: + latents (torch.Tensor): + Latent tensor of shape (B, C, H, W) + + Returns: + torch.Tensor: + Position IDs tensor of shape (B, H*W, 4) All batches share the same coordinate structure: T=0, + H=[0..H-1], W=[0..W-1], L=0 + """ + + batch_size, _, height, width = latents.shape + + t = torch.arange(1) # [0] - time dimension + h = torch.arange(height) + w = torch.arange(width) + layer_ids = torch.arange(1) # [0] - layer dimension + + # Create position IDs: (H*W, 4) + latent_ids = torch.cartesian_prod(t, h, w, layer_ids) + + # Expand to batch: (B, H*W, 4) + latent_ids = latent_ids.unsqueeze(0).expand(batch_size, -1, -1) + + return latent_ids + + @staticmethod + # Copied from diffusers.pipelines.flux2.pipeline_flux2.Flux2Pipeline._prepare_image_ids + def _prepare_image_ids( + image_latents: list[torch.Tensor], # [(1, C, H, W), (1, C, H, W), ...] + scale: int = 10, + ): + r""" + Generates 4D time-space coordinates (T, H, W, L) for a sequence of image latents. + + This function creates a unique coordinate for every pixel/patch across all input latent with different + dimensions. + + Args: + image_latents (List[torch.Tensor]): + A list of image latent feature tensors, typically of shape (C, H, W). + scale (int, optional): + A factor used to define the time separation (T-coordinate) between latents. T-coordinate for the i-th + latent is: 'scale + scale * i'. Defaults to 10. + + Returns: + torch.Tensor: + The combined coordinate tensor. Shape: (1, N_total, 4) Where N_total is the sum of (H * W) for all + input latents. + + Coordinate Components (Dimension 4): + - T (Time): The unique index indicating which latent image the coordinate belongs to. + - H (Height): The row index within that latent image. + - W (Width): The column index within that latent image. + - L (Seq. 
Length): A sequence length dimension, which is always fixed at 0 (size 1) + """ + + if not isinstance(image_latents, list): + raise ValueError(f"Expected `image_latents` to be a list, got {type(image_latents)}.") + + # create time offset for each reference image + t_coords = [scale + scale * t for t in torch.arange(0, len(image_latents))] + t_coords = [t.view(-1) for t in t_coords] + + image_latent_ids = [] + for x, t in zip(image_latents, t_coords): + x = x.squeeze(0) + _, height, width = x.shape + + x_ids = torch.cartesian_prod(t, torch.arange(height), torch.arange(width), torch.arange(1)) + image_latent_ids.append(x_ids) + + image_latent_ids = torch.cat(image_latent_ids, dim=0) + image_latent_ids = image_latent_ids.unsqueeze(0) + + return image_latent_ids + + @staticmethod + # Copied from diffusers.pipelines.flux2.pipeline_flux2.Flux2Pipeline._patchify_latents + def _patchify_latents(latents): + batch_size, num_channels_latents, height, width = latents.shape + latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) + latents = latents.permute(0, 1, 3, 5, 2, 4) + latents = latents.reshape(batch_size, num_channels_latents * 4, height // 2, width // 2) + return latents + + @staticmethod + # Copied from diffusers.pipelines.flux2.pipeline_flux2.Flux2Pipeline._unpatchify_latents + def _unpatchify_latents(latents): + batch_size, num_channels_latents, height, width = latents.shape + latents = latents.reshape(batch_size, num_channels_latents // (2 * 2), 2, 2, height, width) + latents = latents.permute(0, 1, 4, 2, 5, 3) + latents = latents.reshape(batch_size, num_channels_latents // (2 * 2), height * 2, width * 2) + return latents + + @staticmethod + # Copied from diffusers.pipelines.flux2.pipeline_flux2.Flux2Pipeline._pack_latents + def _pack_latents(latents): + """ + pack latents: (batch_size, num_channels, height, width) -> (batch_size, height * width, num_channels) + """ + + batch_size, num_channels, height, width = latents.shape + 
latents = latents.reshape(batch_size, num_channels, height * width).permute(0, 2, 1) + + return latents + + @staticmethod + # Copied from diffusers.pipelines.flux2.pipeline_flux2.Flux2Pipeline._unpack_latents_with_ids + def _unpack_latents_with_ids(x: torch.Tensor, x_ids: torch.Tensor) -> list[torch.Tensor]: + """ + using position ids to scatter tokens into place + """ + x_list = [] + for data, pos in zip(x, x_ids): + _, ch = data.shape # noqa: F841 + h_ids = pos[:, 1].to(torch.int64) + w_ids = pos[:, 2].to(torch.int64) + + h = torch.max(h_ids) + 1 + w = torch.max(w_ids) + 1 + + flat_ids = h_ids * w + w_ids + + out = torch.zeros((h * w, ch), device=data.device, dtype=data.dtype) + out.scatter_(0, flat_ids.unsqueeze(1).expand(-1, ch), data) + + # reshape from (H * W, C) to (H, W, C) and permute to (C, H, W) + + out = out.view(h, w, ch).permute(2, 0, 1) + x_list.append(out) + + return torch.stack(x_list, dim=0) + + def encode_prompt( + self, + prompt: str | list[str], + device: torch.device | None = None, + num_images_per_prompt: int = 1, + prompt_embeds: torch.Tensor | None = None, + max_sequence_length: int = 512, + text_encoder_out_layers: tuple[int, ...] 
= (9, 18, 27), + ): + device = device or self._execution_device + + if prompt is None: + prompt = "" + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt_embeds is None: + prompt_embeds = self._get_qwen3_prompt_embeds( + text_encoder=self.text_encoder, + tokenizer=self.tokenizer, + prompt=prompt, + device=device, + max_sequence_length=max_sequence_length, + hidden_states_layers=text_encoder_out_layers, + ) + + batch_size, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + text_ids = self._prepare_text_ids(prompt_embeds) + text_ids = text_ids.to(device) + return prompt_embeds, text_ids + + # Copied from diffusers.pipelines.flux2.pipeline_flux2.Flux2Pipeline._encode_vae_image + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + if image.ndim != 4: + raise ValueError(f"Expected image dims 4, got {image.ndim}.") + + image_latents = retrieve_latents(self.vae.encode(image), generator=generator, sample_mode="argmax") + image_latents = self._patchify_latents(image_latents) + + latents_bn_mean = self.vae.bn.running_mean.view(1, -1, 1, 1).to(image_latents.device, image_latents.dtype) + latents_bn_std = torch.sqrt(self.vae.bn.running_var.view(1, -1, 1, 1) + self.vae.config.batch_norm_eps) + image_latents = (image_latents - latents_bn_mean) / latents_bn_std + + return image_latents + + # Copied from diffusers.pipelines.flux2.pipeline_flux2.Flux2Pipeline.prepare_latents + def prepare_latents( + self, + batch_size, + num_latents_channels, + height, + width, + dtype, + device, + generator: torch.Generator, + latents: torch.Tensor | None = None, + ): + # VAE applies 8x compression on images but we must also account for packing which requires + # latent height and width to be divisible by 2. 
+ height = 2 * (int(height) // (self.vae_scale_factor * 2)) + width = 2 * (int(width) // (self.vae_scale_factor * 2)) + + shape = (batch_size, num_latents_channels * 4, height // 2, width // 2) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device=device, dtype=dtype) + + latent_ids = self._prepare_latent_ids(latents) + latent_ids = latent_ids.to(device) + + latents = self._pack_latents(latents) # [B, C, H, W] -> [B, H*W, C] + return latents, latent_ids + + # Copied from diffusers.pipelines.flux2.pipeline_flux2.Flux2Pipeline.prepare_image_latents + def prepare_image_latents( + self, + images: list[torch.Tensor], + batch_size, + generator: torch.Generator, + device, + dtype, + ): + image_latents = [] + for image in images: + image = image.to(device=device, dtype=dtype) + imagge_latent = self._encode_vae_image(image=image, generator=generator) + image_latents.append(imagge_latent) # (1, 128, 32, 32) + + image_latent_ids = self._prepare_image_ids(image_latents) + + # Pack each latent and concatenate + packed_latents = [] + for latent in image_latents: + # latent: (1, 128, 32, 32) + packed = self._pack_latents(latent) # (1, 1024, 128) + packed = packed.squeeze(0) # (1024, 128) - remove batch dim + packed_latents.append(packed) + + # Concatenate all reference tokens along sequence dimension + image_latents = torch.cat(packed_latents, dim=0) # (N*1024, 128) + image_latents = image_latents.unsqueeze(0) # (1, N*1024, 128) + + image_latents = image_latents.repeat(batch_size, 1, 1) + image_latent_ids = image_latent_ids.repeat(batch_size, 1, 1) + image_latent_ids = image_latent_ids.to(device) + + return 
image_latents, image_latent_ids + + def check_inputs( + self, + prompt, + height, + width, + prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + guidance_scale=None, + ): + if ( + height is not None + and height % (self.vae_scale_factor * 2) != 0 + or width is not None + and width % (self.vae_scale_factor * 2) != 0 + ): + logger.warning( + "`height` and `width` have to be divisible by %s but are %s and %s. " + "Dimensions will be resized accordingly", + self.vae_scale_factor * 2, + height, + width, + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in ["latents", "prompt_embeds"] for k in callback_on_step_end_tensor_inputs + ): + raise ValueError("`callback_on_step_end_tensor_inputs` must be a subset of ['latents', 'prompt_embeds'].") + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if guidance_scale > 1.0 and self.is_distilled: + logger.warning(f"Guidance scale {guidance_scale} is ignored for step-wise distilled models.") + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale is not None and self._guidance_scale > 1 and not self.is_distilled + + @property + def attention_kwargs(self): + return self._attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def current_timestep(self): + return self._current_timestep + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + def forward( + self, + req: OmniDiffusionRequest, + image: PIL.Image.Image | list[PIL.Image.Image] | None = None, + prompt: str | list[str] | None = None, + height: int | None = None, + width: int | None = None, + num_inference_steps: int = 50, + sigmas: list[float] | None = None, + guidance_scale: float | None = 4.0, + num_images_per_prompt: int = 1, + generator: torch.Generator | list[torch.Generator] | None = None, + latents: torch.Tensor | None = None, + prompt_embeds: torch.Tensor | None = None, + negative_prompt_embeds: torch.Tensor | None = None, + output_type: str | None = "pil", + return_dict: bool = True, + attention_kwargs: dict[str, Any] | None = None, + callback_on_step_end: Callable[[int, int, dict], None] | None = None, + callback_on_step_end_tensor_inputs: list[str] = ["latents"], + max_sequence_length: int = 512, + text_encoder_out_layers: tuple[int, ...] = (9, 18, 27), + ) -> DiffusionOutput: + r""" + Function invoked when calling the pipeline for generation. 
+ + Args: + image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, or list of these): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion + Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2. + of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting + `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to + the text `prompt`, usually at the expense of lower image quality. For step-wise distilled models, + `guidance_scale` is ignored. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. This is set to 1024 by default for the best results. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. This is set to 1024 by default for the best results. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. 
+ sigmas (`List[float]`, *optional*): + Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in + their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed + will be used. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.Tensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Note that "" is used as the negative prompt in this pipeline. + If not provided, will be generated from "". + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.qwenimage.QwenImagePipelineOutput`] instead of a plain tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 
+ callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`. + text_encoder_out_layers (`Tuple[int]`): + Layer indices to use in the `text_encoder` to derive the final prompt embeddings. + + Examples: + + Returns: + [`~pipelines.flux2.Flux2PipelineOutput`] or `tuple`: [`~pipelines.flux2.Flux2PipelineOutput`] if + `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the + generated images. 
+ """ + + prompt = req.prompt if req.prompt is not None else prompt + image = req.pil_image if req.pil_image is not None else image + height = req.height or height + width = req.width or width + num_inference_steps = req.num_inference_steps or num_inference_steps + guidance_scale = req.guidance_scale if req.guidance_scale is not None else guidance_scale + generator = req.generator or generator + req_num_outputs = getattr(req, "num_outputs_per_prompt", None) + if req_num_outputs and req_num_outputs > 0: + num_images_per_prompt = req_num_outputs + + if isinstance(req.prompt_embeds, torch.Tensor): + prompt_embeds = req.prompt_embeds + if isinstance(req.negative_prompt_embeds, torch.Tensor): + negative_prompt_embeds = req.negative_prompt_embeds + + if req.max_sequence_length is not None: + max_sequence_length = req.max_sequence_length + if getattr(req, "text_encoder_out_layers", None) is not None: + text_encoder_out_layers = req.text_encoder_out_layers + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt=prompt, + height=height, + width=width, + prompt_embeds=prompt_embeds, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + guidance_scale=guidance_scale, + ) + + self._guidance_scale = guidance_scale + self._attention_kwargs = attention_kwargs + self._current_timestep = None + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. 
prepare text embeddings + prompt_embeds, text_ids = self.encode_prompt( + prompt=prompt, + prompt_embeds=prompt_embeds, + device=device, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + text_encoder_out_layers=text_encoder_out_layers, + ) + + if self.do_classifier_free_guidance: + negative_prompt = "" + if prompt is not None and isinstance(prompt, list): + negative_prompt = [negative_prompt] * len(prompt) + negative_prompt_embeds, negative_text_ids = self.encode_prompt( + prompt=negative_prompt, + prompt_embeds=negative_prompt_embeds, + device=device, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + text_encoder_out_layers=text_encoder_out_layers, + ) + + # 4. process images + if image is not None and not isinstance(image, list): + image = [image] + + condition_images = None + if image is not None: + for img in image: + self.image_processor.check_image_input(img) + + condition_images = [] + for img in image: + image_width, image_height = img.size + if image_width * image_height > 1024 * 1024: + img = self.image_processor._resize_to_target_area(img, 1024 * 1024) + image_width, image_height = img.size + + multiple_of = self.vae_scale_factor * 2 + image_width = (image_width // multiple_of) * multiple_of + image_height = (image_height // multiple_of) * multiple_of + img = self.image_processor.preprocess(img, height=image_height, width=image_width, resize_mode="crop") + condition_images.append(img) + height = height or image_height + width = width or image_width + + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + # 5. 
prepare latent variables + num_channels_latents = self.transformer.config.in_channels // 4 + latents, latent_ids = self.prepare_latents( + batch_size=batch_size * num_images_per_prompt, + num_latents_channels=num_channels_latents, + height=height, + width=width, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + latents=latents, + ) + + image_latents = None + image_latent_ids = None + if condition_images is not None: + image_latents, image_latent_ids = self.prepare_image_latents( + images=condition_images, + batch_size=batch_size * num_images_per_prompt, + generator=generator, + device=device, + dtype=self.vae.dtype, + ) + + # 6. Prepare timesteps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas + if hasattr(self.scheduler.config, "use_flow_sigmas") and self.scheduler.config.use_flow_sigmas: + sigmas = None + image_seq_len = latents.shape[1] + mu = compute_empirical_mu(image_seq_len=image_seq_len, num_steps=num_inference_steps) + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + sigmas=sigmas, + mu=mu, + ) + self._num_timesteps = len(timesteps) + + # 7. Denoising loop + # We set the index here to remove DtoH sync, helpful especially during compilation. 
+ # Check out more details here: https://github.com/huggingface/diffusers/pull/11696 + self.scheduler.set_begin_index(0) + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + self._current_timestep = t + timestep = t.expand(latents.shape[0]).to(latents.dtype) + + latent_model_input = latents.to(self.transformer.dtype) + latent_image_ids = latent_ids + + if image_latents is not None: + latent_model_input = torch.cat([latents, image_latents], dim=1).to(self.transformer.dtype) + latent_image_ids = torch.cat([latent_ids, image_latent_ids], dim=1) + + noise_pred = self.transformer( + hidden_states=latent_model_input, + timestep=timestep / 1000, + guidance=None, + encoder_hidden_states=prompt_embeds, + txt_ids=text_ids, + img_ids=latent_image_ids, + joint_attention_kwargs=self.attention_kwargs, + return_dict=False, + )[0] + + noise_pred = noise_pred[:, : latents.size(1) :] + + if self.do_classifier_free_guidance: + neg_noise_pred = self.transformer( + hidden_states=latent_model_input, + timestep=timestep / 1000, + guidance=None, + encoder_hidden_states=negative_prompt_embeds, + txt_ids=negative_text_ids, + img_ids=latent_image_ids, + joint_attention_kwargs=self.attention_kwargs, + return_dict=False, + )[0] + neg_noise_pred = neg_noise_pred[:, : latents.size(1) :] + noise_pred = neg_noise_pred + guidance_scale * (noise_pred - neg_noise_pred) + + latents_dtype = latents.dtype + latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0] + + if latents.dtype != latents_dtype and torch.backends.mps.is_available(): + latents = latents.to(latents_dtype) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + + self._current_timestep = None + + latents = 
self._unpack_latents_with_ids(latents, latent_ids) + + latents_bn_mean = self.vae.bn.running_mean.view(1, -1, 1, 1).to(latents.device, latents.dtype) + latents_bn_std = torch.sqrt(self.vae.bn.running_var.view(1, -1, 1, 1) + self.vae.config.batch_norm_eps).to( + latents.device, latents.dtype + ) + latents = latents * latents_bn_std + latents_bn_mean + latents = self._unpatchify_latents(latents) + if output_type == "latent": + image = latents + else: + if latents.dtype != self.vae.dtype: + latents = latents.to(self.vae.dtype) + image = self.vae.decode(latents, return_dict=False)[0] + + return DiffusionOutput(output=image) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) diff --git a/vllm_omni/diffusion/registry.py b/vllm_omni/diffusion/registry.py index c49ba0a3cd9..5edd87a827d 100644 --- a/vllm_omni/diffusion/registry.py +++ b/vllm_omni/diffusion/registry.py @@ -74,6 +74,11 @@ "pipeline_sd3", "StableDiffusion3Pipeline", ), + "Flux2KleinPipeline": ( + "flux2_klein", + "pipeline_flux2_klein", + "Flux2KleinPipeline", + ), } @@ -121,6 +126,7 @@ def initialize_model( "BagelPipeline": "get_bagel_post_process_func", "LongCatImageEditPipeline": "get_longcat_image_post_process_func", "StableDiffusion3Pipeline": "get_sd3_image_post_process_func", + "Flux2KleinPipeline": "get_flux2_klein_post_process_func", } _DIFFUSION_PRE_PROCESS_FUNCS = { From 14e83e7a18edec635ed3e76a5435da825161f3d4 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Fri, 16 Jan 2026 09:59:35 +0000 Subject: [PATCH 45/59] inherit engine outputs Signed-off-by: tzhouam --- vllm_omni/engine/__init__.py | 34 ++-------------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) diff --git a/vllm_omni/engine/__init__.py b/vllm_omni/engine/__init__.py index 79dc25cf494..1eb479c33bf 100644 --- a/vllm_omni/engine/__init__.py +++ b/vllm_omni/engine/__init__.py @@ -9,11 +9,8 @@ import msgspec import 
torch from vllm.v1.engine import ( - EngineCoreEvent, EngineCoreRequest, - FinishReason, - LogprobsLists, - LogprobsTensors, + EngineCoreOutput, SchedulerStats, UtilityOutput, ) @@ -79,36 +76,9 @@ class OmniEngineCoreRequest(EngineCoreRequest): additional_information: AdditionalInformationPayload | None = None -class OmniEngineCoreOutput( - msgspec.Struct, - array_like=True, # type: ignore[call-arg] - omit_defaults=True, # type: ignore[call-arg] - gc=False, -): # type: ignore[call-arg] - request_id: str - new_token_ids: list[int] - - new_logprobs: LogprobsLists | None = None - new_prompt_logprobs_tensors: LogprobsTensors | None = None - +class OmniEngineCoreOutput(EngineCoreOutput): pooling_output: dict[str, torch.Tensor] | None = None - finish_reason: FinishReason | None = None - stop_reason: int | str | None = None - events: list[EngineCoreEvent] | None = None - kv_transfer_params: dict[str, Any] | None = None - - trace_headers: Mapping[str, str] | None = None - # The number of tokens with prefix cache hits. - num_cached_tokens: int = 0 - - # The number of NaNs in logits. - # A value greater than 0 indicates that the output is corrupted. 
- num_nans_in_logits: int = 0 - - @property - def finished(self) -> bool: - return self.finish_reason is not None class OmniEngineCoreOutputs( From b00685cf4c6d301e037e02256c65e4adcb454ae8 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Fri, 16 Jan 2026 10:01:04 +0000 Subject: [PATCH 46/59] supporting audio in video(not finished) Signed-off-by: tzhouam --- .../offline_inference/qwen3_omni/end2end.py | 84 ++++++++++++++++--- ...letion_client_for_multimodal_generation.py | 26 ++++++ vllm_omni/assets/video.py | 14 ++++ vllm_omni/engine/__init__.py | 32 +------ 4 files changed, 117 insertions(+), 39 deletions(-) create mode 100644 vllm_omni/assets/video.py diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py index 9a1324305cf..3b6e526b501 100644 --- a/examples/offline_inference/qwen3_omni/end2end.py +++ b/examples/offline_inference/qwen3_omni/end2end.py @@ -12,14 +12,15 @@ import librosa import numpy as np import soundfile as sf -from PIL import Image import vllm +from PIL import Image from vllm import SamplingParams from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset, video_to_ndarrays from vllm.multimodal.image import convert_image_mode from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm_omni.assets.video import extract_video_audio from vllm_omni.entrypoints.omni import Omni @@ -200,7 +201,6 @@ def get_mixed_modalities_query( limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1}, ) - def get_multi_audios_query() -> QueryResult: question = "Are these two audio clips the same?" prompt = ( @@ -224,7 +224,64 @@ def get_multi_audios_query() -> QueryResult: "audio": 2, }, ) - + +# def get_use_audio_in_video_query(video_path: str | None = None) -> QueryResult: + # question = ( + # "Describe the content of the video in details, then convert what the " + # "baby say into text." 
+ # ) + # prompt = ( + # f"<|im_start|>system\n{default_system}<|im_end|>\n" + # "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>" + # f"{question}<|im_end|>\n" + # f"<|im_start|>assistant\n" + # ) + # if video_path: + # if not os.path.exists(video_path): + # raise FileNotFoundError(f"Video file not found: {video_path}") + # video_frames = video_to_ndarrays(video_path, num_frames=16) + # else: + # video_frames = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays + # audio = extract_video_audio(video_path, sampling_rate=16000) + # return QueryResult( + # inputs={ + # "prompt": prompt, + # "multi_modal_data": { + # "video": video_frames, + # "audio": audio, + # }, + # "mm_processor_kwargs": { + # "use_audio_in_video": True, + # }, + # }, + # limit_mm_per_prompt={"audio": 1, "video": 1}, + # ) +def get_use_audio_in_video_query() -> QueryResult: + question = ( + "Describe the content of the video in details, then convert what the " + "baby say into text." + ) + prompt = ( + f"<|im_start|>system\n{default_system}<|im_end|>\n" + "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>" + f"{question}<|im_end|>\n" + f"<|im_start|>assistant\n" + ) + asset = VideoAsset(name="baby_reading", num_frames=16) + audio = asset.get_audio(sampling_rate=16000) + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": { + "video": asset.np_ndarrays, + "audio": audio, + }, + "mm_processor_kwargs": { + "use_audio_in_video": True, + }, + }, + limit_mm_per_prompt={"audio": 1, "video": 1}, + ) query_map = { "text": get_text_query, @@ -233,6 +290,7 @@ def get_multi_audios_query() -> QueryResult: "use_video": get_video_query, "multi_audios": get_multi_audios_query, "mixed_modalities": get_mixed_modalities_query, + "use_audio_in_video": get_use_audio_in_video_query, } @@ -261,6 +319,10 @@ def main(args): num_frames=getattr(args, "num_frames", 16), sampling_rate=getattr(args, "sampling_rate", 16000), ) + elif args.query_type == "multi_audios": + 
query_result = query_func() + elif args.query_type == "use_audio_in_video": + query_result = query_func() else: query_result = query_func() @@ -272,13 +334,13 @@ def main(args): ) thinker_sampling_params = SamplingParams( - temperature=0.4, - top_p=0.9, - top_k=-1, - max_tokens=1200, - repetition_penalty=1.05, - logit_bias={}, - seed=SEED, + temperature=0.2, + # top_p=0.9, + # top_k=-1, + # max_tokens=1200, + # repetition_penalty=1.05, + # logit_bias={}, + seed=0, ) talker_sampling_params = SamplingParams( @@ -333,6 +395,8 @@ def main(args): total_requests = len(prompts) processed_count = 0 + print(f"query type: {args.query_type}") + for stage_outputs in omni_generator: if stage_outputs.final_output_type == "text": for output in stage_outputs.request_output: diff --git a/examples/online_serving/qwen3_omni/openai_chat_completion_client_for_multimodal_generation.py b/examples/online_serving/qwen3_omni/openai_chat_completion_client_for_multimodal_generation.py index 0d502063576..b9a76858161 100644 --- a/examples/online_serving/qwen3_omni/openai_chat_completion_client_for_multimodal_generation.py +++ b/examples/online_serving/qwen3_omni/openai_chat_completion_client_for_multimodal_generation.py @@ -304,6 +304,25 @@ def get_multi_audios_query(custom_prompt: str | None = None): ], } +def get_use_audio_in_video_query( + video_path: str | None = None, + audio_path: str | None = None, + custom_prompt: str | None = None, +): + question = custom_prompt or ( + "Describe the content of the video in details, then convert what the " + "baby say into text." 
+ ) + video_url = get_video_url_from_path(video_path) + audio_url = get_audio_url_from_path(audio_path) + return { + "role": "user", + "content": [ + {"type": "video_url", "video_url": {"url": video_url}}, + {"type": "audio_url", "audio_url": {"url": audio_url}}, + {"type": "text", "text": question}, + ], + } query_map = { "text": get_text_query, @@ -312,6 +331,7 @@ def get_multi_audios_query(custom_prompt: str | None = None): "use_video": get_video_query, "use_mixed_modalities": get_mixed_modalities_query, "use_multi_audios": get_multi_audios_query, + "use_audio_in_video": get_use_audio_in_video_query, } @@ -372,6 +392,12 @@ def run_multimodal_generation(args) -> None: prompt = query_func(audio_path=audio_path, custom_prompt=custom_prompt) elif args.query_type == "text": prompt = query_func(custom_prompt=custom_prompt) + elif args.query_type == "use_audio_in_video": + prompt = query_func( + video_path=video_path, + audio_path=audio_path, + custom_prompt=custom_prompt, + ) else: prompt = query_func() diff --git a/vllm_omni/assets/video.py b/vllm_omni/assets/video.py new file mode 100644 index 00000000000..361e2ac785f --- /dev/null +++ b/vllm_omni/assets/video.py @@ -0,0 +1,14 @@ +import librosa +import numpy as np +from vllm.assets.video import VideoAsset +def extract_video_audio(path: str = None, sampling_rate: int = 16000) -> np.ndarray: + """ This function extracts the audio from a video file path and returns the audio as a numpy array. + Args: + path: The path to the video file. + Returns: + The audio as a numpy array. 
+ """ + if not path: + path = VideoAsset(name="baby_reading").video_path + audio_signal, sr = librosa.load(path, sr=sampling_rate) + return audio_signal \ No newline at end of file diff --git a/vllm_omni/engine/__init__.py b/vllm_omni/engine/__init__.py index 1eb479c33bf..47d094e4163 100644 --- a/vllm_omni/engine/__init__.py +++ b/vllm_omni/engine/__init__.py @@ -11,6 +11,7 @@ from vllm.v1.engine import ( EngineCoreRequest, EngineCoreOutput, + EngineCoreOutputs, SchedulerStats, UtilityOutput, ) @@ -81,32 +82,5 @@ class OmniEngineCoreOutput(EngineCoreOutput): -class OmniEngineCoreOutputs( - msgspec.Struct, - array_like=True, # type: ignore[call-arg] - omit_defaults=True, # type: ignore[call-arg] - gc=False, -): # type: ignore[call-arg] - # NOTE(Nick): We could consider ways to make this more compact, - # e.g. columnwise layout - - engine_index: int = 0 - - # [num_reqs] - outputs: list[OmniEngineCoreOutput] = [] - scheduler_stats: SchedulerStats | None = None - timestamp: float = 0.0 - - utility_output: UtilityOutput | None = None - finished_requests: set[str] | None = None - - # In DP case, used to signal that the current wave of requests - # has finished and the engines are paused. - wave_complete: int | None = None - # In DP case, used to signal that a request was received for an - # "old" wave, so the next wave needs to be started in other engines. 
- start_wave: int | None = None - - def __post_init__(self): - if self.timestamp == 0.0: - self.timestamp = time.monotonic() +class OmniEngineCoreOutputs(EngineCoreOutputs): + outputs: list[OmniEngineCoreOutput] = [] \ No newline at end of file From 3fb6adc40a345236bf2d01e02b67323c7495c63f Mon Sep 17 00:00:00 2001 From: Yueqian Lin <70319226+linyueqian@users.noreply.github.com> Date: Fri, 16 Jan 2026 05:31:25 -0700 Subject: [PATCH 47/59] [bugfix] use unipc scheduler for Wan 2.2 (#804) Signed-off-by: linyueqian Signed-off-by: Hongsheng Liu Co-authored-by: Hongsheng Liu --- .../diffusion/models/schedulers/__init__.py | 10 + vllm_omni/diffusion/models/schedulers/base.py | 48 ++ .../scheduling_flow_unipc_multistep.py | 741 ++++++++++++++++++ .../models/wan2_2/pipeline_wan2_2.py | 14 +- .../models/wan2_2/pipeline_wan2_2_i2v.py | 14 +- .../models/wan2_2/pipeline_wan2_2_ti2v.py | 14 +- 6 files changed, 823 insertions(+), 18 deletions(-) create mode 100644 vllm_omni/diffusion/models/schedulers/__init__.py create mode 100644 vllm_omni/diffusion/models/schedulers/base.py create mode 100644 vllm_omni/diffusion/models/schedulers/scheduling_flow_unipc_multistep.py diff --git a/vllm_omni/diffusion/models/schedulers/__init__.py b/vllm_omni/diffusion/models/schedulers/__init__.py new file mode 100644 index 00000000000..6f8df78ebf0 --- /dev/null +++ b/vllm_omni/diffusion/models/schedulers/__init__.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from vllm_omni.diffusion.models.schedulers.scheduling_flow_unipc_multistep import ( + FlowUniPCMultistepScheduler, +) + +__all__ = [ + "FlowUniPCMultistepScheduler", +] diff --git a/vllm_omni/diffusion/models/schedulers/base.py b/vllm_omni/diffusion/models/schedulers/base.py new file mode 100644 index 00000000000..bc9d87d7f55 --- /dev/null +++ b/vllm_omni/diffusion/models/schedulers/base.py @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: Apache-2.0 +# 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from https://github.com/hao-ai-lab/FastVideo +# Originally from https://github.com/huggingface/diffusers +# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. +"""Base scheduler class for diffusion models.""" + +from abc import ABC, abstractmethod + +import torch + + +class BaseScheduler(ABC): + """ + Abstract base class for schedulers. + + Subclasses must define: + - timesteps: torch.Tensor + - order: int + - num_train_timesteps: int + """ + + timesteps: torch.Tensor + order: int + num_train_timesteps: int + + def __init__(self): + required_attrs = ["timesteps", "order", "num_train_timesteps"] + for attr in required_attrs: + if not hasattr(self, attr): + raise AttributeError( + f"Subclass {self.__class__.__name__} must define `{attr}` before calling super().__init__()" + ) + + @abstractmethod + def set_shift(self, shift: float) -> None: + """Set the shift parameter for the scheduler.""" + raise NotImplementedError + + @abstractmethod + def set_timesteps(self, *args, **kwargs) -> None: + """Set the timesteps for the scheduler.""" + raise NotImplementedError + + @abstractmethod + def scale_model_input(self, sample: torch.Tensor, timestep: int | None = None) -> torch.Tensor: + """Scale the model input.""" + raise NotImplementedError diff --git a/vllm_omni/diffusion/models/schedulers/scheduling_flow_unipc_multistep.py b/vllm_omni/diffusion/models/schedulers/scheduling_flow_unipc_multistep.py new file mode 100644 index 00000000000..3efe564bc61 --- /dev/null +++ b/vllm_omni/diffusion/models/schedulers/scheduling_flow_unipc_multistep.py @@ -0,0 +1,741 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Adapted from https://github.com/hao-ai-lab/FastVideo +# Originally from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_unipc_multistep.py +# Convert unipc for flow 
matching +# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved. +""" +FlowUniPCMultistepScheduler - A training-free framework for fast sampling of flow-matching diffusion models. + +This scheduler implements the UniPC (Unified Predictor-Corrector) algorithm adapted for flow matching, +providing faster convergence than simple Euler methods while maintaining quality. +""" + +from __future__ import annotations + +import math +from typing import Any + +import numpy as np +import torch +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput +from diffusers.utils import deprecate + +from vllm_omni.diffusion.models.schedulers.base import BaseScheduler + + +class FlowUniPCMultistepScheduler(SchedulerMixin, ConfigMixin, BaseScheduler): + """ + `FlowUniPCMultistepScheduler` is a training-free framework designed for the fast sampling of + flow-matching diffusion models. + + This scheduler implements the UniPC (Unified Predictor-Corrector) algorithm adapted for flow matching, + which can achieve the same quality as Euler methods in fewer steps (typically 20-30 steps vs 40-50). + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + solver_order (`int`, default `2`): + The UniPC order which can be any positive integer. The effective order of accuracy is `solver_order + 1` + due to the UniC. It is recommended to use `solver_order=2` for guided sampling, and `solver_order=3` for + unconditional sampling. + prediction_type (`str`, defaults to "flow_prediction"): + Prediction type of the scheduler function; must be `flow_prediction` for this scheduler. + shift (`float`, defaults to 1.0): + The shift parameter for the noise schedule. For Wan2.2: use 5.0 for 720p, 12.0 for 480p. 
+ use_dynamic_shifting (`bool`, defaults to False): + Whether to use dynamic shifting based on image resolution. + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. + predict_x0 (`bool`, defaults to `True`): + Whether to use the updating algorithm on the predicted x0. + solver_type (`str`, default `bh2`): + Solver type for UniPC. Use `bh1` for unconditional sampling when steps < 10, `bh2` otherwise. + lower_order_final (`bool`, default `True`): + Whether to use lower-order solvers in the final steps. Stabilizes sampling for steps < 15. + disable_corrector (`list`, default `[]`): + Steps to disable the corrector to mitigate misalignment with large guidance scales. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. + final_sigmas_type (`str`, defaults to `"zero"`): + The final `sigma` value for the noise schedule. Either `"zero"` or `"sigma_min"`. 
+ """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + solver_order: int = 2, + prediction_type: str = "flow_prediction", + shift: float | None = 1.0, + use_dynamic_shifting: bool = False, + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + predict_x0: bool = True, + solver_type: str = "bh2", + lower_order_final: bool = True, + disable_corrector: tuple = (), + solver_p: SchedulerMixin | None = None, + timestep_spacing: str = "linspace", + steps_offset: int = 0, + final_sigmas_type: str | None = "zero", + **kwargs, + ): + if solver_type not in ["bh1", "bh2"]: + if solver_type in ["midpoint", "heun", "logrho"]: + self.register_to_config(solver_type="bh2") + else: + raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}") + + self.predict_x0 = predict_x0 + self.num_inference_steps: int | None = None + + # Initialize sigma schedule + alphas = np.linspace(1, 1 / num_train_timesteps, num_train_timesteps)[::-1].copy() + sigmas = 1.0 - alphas + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32) + + if not use_dynamic_shifting: + # Apply timestep shifting based on shift parameter + assert shift is not None + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + self.sigmas = sigmas + self.timesteps = sigmas * num_train_timesteps + self.num_train_timesteps = num_train_timesteps + + # State for multistep solver + self.model_outputs: list[torch.Tensor | None] = [None] * solver_order + self.timestep_list: list[Any | None] = [None] * solver_order + self.lower_order_nums = 0 + self.disable_corrector = list(disable_corrector) + self.solver_p = solver_p + self.last_sample: torch.Tensor | None = None + self._step_index: int | None = None + self._begin_index: int | None = None + self.this_order: int = 1 + + # Move sigmas to CPU to reduce GPU/CPU communication + self.sigmas = 
self.sigmas.to("cpu") + self.sigma_min = self.sigmas[-1].item() + self.sigma_max = self.sigmas[0].item() + + BaseScheduler.__init__(self) + + @property + def step_index(self) -> int | None: + """The index counter for current timestep. Increases by 1 after each scheduler step.""" + return self._step_index + + @property + def begin_index(self) -> int | None: + """The index for the first timestep. Should be set from pipeline with `set_begin_index` method.""" + return self._begin_index + + def set_shift(self, shift: float) -> None: + """Set the shift parameter for the scheduler.""" + self.config.shift = shift + + def set_begin_index(self, begin_index: int = 0) -> None: + """ + Sets the begin index for the scheduler. Run from pipeline before inference. + + Args: + begin_index (`int`): The begin index for the scheduler. + """ + self._begin_index = begin_index + + def set_timesteps( + self, + num_inference_steps: int | None = None, + device: str | torch.device | None = None, + sigmas: list[float] | None = None, + mu: float | None = None, + shift: float | None = None, + ) -> None: + """ + Sets the discrete timesteps used for the diffusion chain (run before inference). + + Args: + num_inference_steps (`int`): + Total number of timesteps. + device (`str` or `torch.device`, *optional*): + The device to move timesteps to. + sigmas (`list[float]`, *optional*): + Custom sigma schedule. + mu (`float`, *optional*): + Parameter for dynamic shifting. + shift (`float`, *optional*): + Override shift parameter. 
+ """ + if self.config.use_dynamic_shifting and mu is None: + raise ValueError("Must pass a value for `mu` when `use_dynamic_shifting` is True") + + if sigmas is None: + assert num_inference_steps is not None + sigmas = np.linspace(self.sigma_max, self.sigma_min, num_inference_steps + 1).copy()[:-1] + + if self.config.use_dynamic_shifting: + assert mu is not None + sigmas = self.time_shift(mu, 1.0, sigmas) + else: + if shift is None: + shift = self.config.shift + assert isinstance(sigmas, np.ndarray) + sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) + + if self.config.final_sigmas_type == "sigma_min": + sigma_last = self.sigma_min + elif self.config.final_sigmas_type == "zero": + sigma_last = 0 + else: + raise ValueError(f"`final_sigmas_type` must be 'zero' or 'sigma_min', got {self.config.final_sigmas_type}") + + timesteps = sigmas * self.config.num_train_timesteps + sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32) + + self.sigmas = torch.from_numpy(sigmas) + self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64) + + self.num_inference_steps = len(timesteps) + + # Reset state + self.model_outputs = [None] * self.config.solver_order + self.timestep_list = [None] * self.config.solver_order + self.lower_order_nums = 0 + self.last_sample = None + + if self.solver_p: + self.solver_p.set_timesteps(self.num_inference_steps, device=device) + + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") + + def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor: + """ + Dynamic thresholding to prevent pixel saturation. 
+ + From "Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding" + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() + + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + abs_sample = sample.abs() + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp(s, min=1, max=self.config.sample_max_value) + s = s.unsqueeze(1) + sample = torch.clamp(sample, -s, s) / s + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + def _sigma_to_t(self, sigma: torch.Tensor) -> torch.Tensor: + """Convert sigma to timestep.""" + return sigma * self.config.num_train_timesteps + + def _sigma_to_alpha_sigma_t(self, sigma: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert sigma to alpha and sigma_t for flow matching.""" + return 1 - sigma, sigma + + def time_shift(self, mu: float, sigma: float, t: np.ndarray) -> np.ndarray: + """Apply time shift transformation.""" + return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma) + + def convert_model_output( + self, + model_output: torch.Tensor, + *args, + sample: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor: + """ + Convert the model output to the format needed by the UniPC algorithm. + + Args: + model_output (`torch.Tensor`): Direct output from the diffusion model. + sample (`torch.Tensor`): Current sample in the diffusion process. + + Returns: + `torch.Tensor`: Converted model output. 
+ """ + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + if sample is None: + if len(args) > 1: + sample = args[1] + else: + raise ValueError("missing `sample` as a required keyword argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output conversion " + "is now handled via an internal counter `self.step_index`", + ) + + sigma = self.sigmas[self.step_index].to(sample.device) + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + + if self.predict_x0: + if self.config.prediction_type == "flow_prediction": + sigma_t = sigma.to(sample.device) + x0_pred = sample - sigma_t * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be `flow_prediction` " + "for the FlowUniPCMultistepScheduler." + ) + + if self.config.thresholding: + x0_pred = self._threshold_sample(x0_pred) + + return x0_pred + else: + if self.config.prediction_type == "flow_prediction": + sigma_t = sigma.to(sample.device) + epsilon = sample - (1 - sigma_t) * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be `flow_prediction` " + "for the FlowUniPCMultistepScheduler." + ) + + if self.config.thresholding: + sigma_t = sigma.to(sample.device) + x0_pred = sample - sigma_t * model_output + x0_pred = self._threshold_sample(x0_pred) + epsilon = model_output + x0_pred + + return epsilon + + def multistep_uni_p_bh_update( + self, + model_output: torch.Tensor, + *args, + sample: torch.Tensor | None = None, + order: int | None = None, + **kwargs, + ) -> torch.Tensor: + """ + One step for the UniP (B(h) version) predictor. + + Args: + model_output (`torch.Tensor`): Direct output from the diffusion model. + sample (`torch.Tensor`): Current sample. + order (`int`): The order of UniP at this timestep. + + Returns: + `torch.Tensor`: The sample tensor at the previous timestep. 
+ """ + prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 1: + sample = args[1] + else: + raise ValueError("missing `sample` as a required keyword argument") + if order is None: + if len(args) > 2: + order = args[2] + else: + raise ValueError("missing `order` as a required keyword argument") + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect.", + ) + + model_output_list = self.model_outputs + + s0 = self.timestep_list[-1] + m0 = model_output_list[-1] + x = sample + + if self.solver_p: + x_t = self.solver_p.step(model_output, s0, x).prev_sample + return x_t + + device = sample.device + sigma_t, sigma_s0 = ( + self.sigmas[self.step_index + 1].to(device), + self.sigmas[self.step_index].to(device), + ) + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + + h = lambda_t - lambda_s0 + + rks = [] + D1s: list[Any] | None = [] + for i in range(1, order): + si = self.step_index - i + mi = model_output_list[-(i + 1)] + alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si].to(device)) + lambda_si = torch.log(alpha_si) - torch.log(sigma_si) + rk = (lambda_si - lambda_s0) / h + rks.append(rk) + assert mi is not None + D1s.append((mi - m0) / rk) + + rks.append(1.0) + rks = torch.tensor(rks, device=device) + + R = [] + b = [] + + hh = -h if self.predict_x0 else h + h_phi_1 = torch.expm1(hh) + h_phi_k = h_phi_1 / hh - 1 + + factorial_i = 1 + + if self.config.solver_type == "bh1": + B_h = hh + elif self.config.solver_type == "bh2": + B_h = torch.expm1(hh) + else: + raise NotImplementedError() + + for i in range(1, order + 1): + R.append(torch.pow(rks, i - 1)) + b.append(h_phi_k * factorial_i / B_h) + factorial_i *= i + 1 + h_phi_k = h_phi_k / hh 
- 1 / factorial_i + + R = torch.stack(R) + b = torch.tensor(b, device=device) + + if D1s is not None and len(D1s) > 0: + D1s = torch.stack(D1s, dim=1) + if order == 2: + rhos_p = torch.tensor([0.5], dtype=x.dtype, device=device) + else: + assert isinstance(R, torch.Tensor) + rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1]).to(device).to(x.dtype) + else: + D1s = None + + if self.predict_x0: + x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0 + if D1s is not None: + pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s) + else: + pred_res = 0 + x_t = x_t_ - alpha_t * B_h * pred_res + else: + x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0 + if D1s is not None: + pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s) + else: + pred_res = 0 + x_t = x_t_ - sigma_t * B_h * pred_res + + x_t = x_t.to(x.dtype) + return x_t + + def multistep_uni_c_bh_update( + self, + this_model_output: torch.Tensor, + *args, + last_sample: torch.Tensor | None = None, + this_sample: torch.Tensor | None = None, + order: int | None = None, + **kwargs, + ) -> torch.Tensor: + """ + One step for the UniC (B(h) version) corrector. + + Args: + this_model_output (`torch.Tensor`): Model outputs at `x_t`. + last_sample (`torch.Tensor`): Sample before the last predictor `x_{t-1}`. + this_sample (`torch.Tensor`): Sample after the last predictor `x_{t}`. + order (`int`): The order of UniC-p. Effective accuracy is `order + 1`. + + Returns: + `torch.Tensor`: The corrected sample tensor. 
+ """ + this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None) + if last_sample is None: + if len(args) > 1: + last_sample = args[1] + else: + raise ValueError("missing `last_sample` as a required keyword argument") + if this_sample is None: + if len(args) > 2: + this_sample = args[2] + else: + raise ValueError("missing `this_sample` as a required keyword argument") + if order is None: + if len(args) > 3: + order = args[3] + else: + raise ValueError("missing `order` as a required keyword argument") + if this_timestep is not None: + deprecate( + "this_timestep", + "1.0.0", + "Passing `this_timestep` is deprecated and has no effect.", + ) + + model_output_list = self.model_outputs + + m0 = model_output_list[-1] + x = last_sample + x_t = this_sample + model_t = this_model_output + + device = this_sample.device + sigma_t, sigma_s0 = ( + self.sigmas[self.step_index].to(device), + self.sigmas[self.step_index - 1].to(device), + ) + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + + h = lambda_t - lambda_s0 + + rks = [] + D1s: list[Any] | None = [] + for i in range(1, order): + si = self.step_index - (i + 1) + mi = model_output_list[-(i + 1)] + alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si].to(device)) + lambda_si = torch.log(alpha_si) - torch.log(sigma_si) + rk = (lambda_si - lambda_s0) / h + rks.append(rk) + assert mi is not None + D1s.append((mi - m0) / rk) + + rks.append(1.0) + rks = torch.tensor(rks, device=device) + + R = [] + b = [] + + hh = -h if self.predict_x0 else h + h_phi_1 = torch.expm1(hh) + h_phi_k = h_phi_1 / hh - 1 + + factorial_i = 1 + + if self.config.solver_type == "bh1": + B_h = hh + elif self.config.solver_type == "bh2": + B_h = torch.expm1(hh) + else: + raise NotImplementedError() + + for i in range(1, order + 1): + 
R.append(torch.pow(rks, i - 1)) + b.append(h_phi_k * factorial_i / B_h) + factorial_i *= i + 1 + h_phi_k = h_phi_k / hh - 1 / factorial_i + + R = torch.stack(R) + b = torch.tensor(b, device=device) + + if D1s is not None and len(D1s) > 0: + D1s = torch.stack(D1s, dim=1) + else: + D1s = None + + if order == 1: + rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device) + else: + rhos_c = torch.linalg.solve(R, b).to(device).to(x.dtype) + + if self.predict_x0: + x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0 + if D1s is not None: + corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s) + else: + corr_res = 0 + D1_t = model_t - m0 + x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t) + else: + x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0 + if D1s is not None: + corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s) + else: + corr_res = 0 + D1_t = model_t - m0 + x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t) + + x_t = x_t.to(x.dtype) + return x_t + + def index_for_timestep(self, timestep: torch.Tensor, schedule_timesteps: torch.Tensor | None = None) -> int: + """Get the index for a given timestep.""" + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + pos = 1 if len(indices) > 1 else 0 + step_index: int = indices[pos].item() + + return step_index + + def _init_step_index(self, timestep: torch.Tensor) -> None: + """Initialize the step_index counter for the scheduler.""" + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.Tensor, + timestep: int | torch.Tensor, + sample: torch.Tensor, + return_dict: bool = True, + generator: torch.Generator | None = None, + ) -> SchedulerOutput | tuple: + """ + Predict the sample from the previous 
timestep by reversing the SDE using multistep UniPC. + + Args: + model_output (`torch.Tensor`): Direct output from the diffusion model. + timestep (`int`): Current discrete timestep in the diffusion chain. + sample (`torch.Tensor`): Current sample created by the diffusion process. + return_dict (`bool`): Whether to return a SchedulerOutput or tuple. + + Returns: + `SchedulerOutput` or `tuple`: The sample tensor at the previous timestep. + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if self.step_index is None: + self._init_step_index(timestep) + + use_corrector = ( + self.step_index > 0 and self.step_index - 1 not in self.disable_corrector and self.last_sample is not None + ) + + model_output_convert = self.convert_model_output(model_output, sample=sample) + + if use_corrector: + sample = self.multistep_uni_c_bh_update( + this_model_output=model_output_convert, + last_sample=self.last_sample, + this_sample=sample, + order=self.this_order, + ) + + # Update model output history + for i in range(self.config.solver_order - 1): + self.model_outputs[i] = self.model_outputs[i + 1] + self.timestep_list[i] = self.timestep_list[i + 1] + + self.model_outputs[-1] = model_output_convert + self.timestep_list[-1] = timestep + + # Determine order for this step + if self.config.lower_order_final: + this_order = min(self.config.solver_order, len(self.timesteps) - self.step_index) + else: + this_order = self.config.solver_order + + self.this_order = min(this_order, self.lower_order_nums + 1) # warmup for multistep + assert self.this_order > 0 + + self.last_sample = sample + prev_sample = self.multistep_uni_p_bh_update( + model_output=model_output, + sample=sample, + order=self.this_order, + ) + + if self.lower_order_nums < self.config.solver_order: + self.lower_order_nums += 1 + + assert self._step_index is not None + self._step_index += 1 + + if not 
return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input. + + Args: + sample (`torch.Tensor`): The input sample. + + Returns: + `torch.Tensor`: A scaled input sample (unchanged for this scheduler). + """ + return sample + + def add_noise( + self, + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.IntTensor, + ) -> torch.Tensor: + """ + Add noise to the original samples. + + Args: + original_samples (`torch.Tensor`): Original samples. + noise (`torch.Tensor`): Noise to add. + timesteps (`torch.IntTensor`): Timesteps for noise addition. + + Returns: + `torch.Tensor`: Noisy samples. + """ + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + elif self.step_index is not None: + step_indices = [self.step_index] * timesteps.shape[0] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + noisy_samples = alpha_t * original_samples + sigma_t * noise + return noisy_samples + + def __len__(self) -> int: + return self.config.num_train_timesteps diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py 
b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py index 61a858ca4d2..3b04e2a5a9b 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py @@ -9,7 +9,7 @@ import PIL.Image import torch -from diffusers import AutoencoderKLWan, FlowMatchEulerDiscreteScheduler +from diffusers import AutoencoderKLWan from diffusers.utils.torch_utils import randn_tensor from torch import nn from transformers import AutoTokenizer, UMT5EncoderModel @@ -18,6 +18,7 @@ from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader +from vllm_omni.diffusion.models.schedulers import FlowUniPCMultistepScheduler from vllm_omni.diffusion.models.wan2_2.wan2_2_transformer import WanTransformer3DModel from vllm_omni.diffusion.request import OmniDiffusionRequest @@ -244,12 +245,13 @@ def __init__( else: self.transformer_2 = None - self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained( - model, subfolder="scheduler", local_files_only=local_files_only + # Initialize UniPC scheduler + flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 # default for 720p + self.scheduler = FlowUniPCMultistepScheduler( + num_train_timesteps=1000, + shift=flow_shift, + prediction_type="flow_prediction", ) - # Apply flow_shift if specified (12.0 for 480p, 5.0 for 720p recommended for Wan2.2) - if od_config.flow_shift is not None: - self.scheduler.config.flow_shift = od_config.flow_shift self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4 self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8 diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py index f3172e3d90d..ed0d6e4c6b6 
100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py @@ -9,7 +9,7 @@ import numpy as np import PIL.Image import torch -from diffusers import AutoencoderKLWan, FlowMatchEulerDiscreteScheduler +from diffusers import AutoencoderKLWan from diffusers.utils.torch_utils import randn_tensor from torch import nn from transformers import AutoTokenizer, CLIPImageProcessor, CLIPVisionModel, UMT5EncoderModel @@ -19,6 +19,7 @@ from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.interface import SupportImageInput +from vllm_omni.diffusion.models.schedulers import FlowUniPCMultistepScheduler from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import ( create_transformer_from_config, load_transformer_config, @@ -199,12 +200,13 @@ def __init__( else: self.transformer_2 = None - # Scheduler - self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained( - model, subfolder="scheduler", local_files_only=local_files_only + # Initialize UniPC scheduler + flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 # default for 720p + self.scheduler = FlowUniPCMultistepScheduler( + num_train_timesteps=1000, + shift=flow_shift, + prediction_type="flow_prediction", ) - if od_config.flow_shift is not None: - self.scheduler.config.flow_shift = od_config.flow_shift # VAE scale factors self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if hasattr(self.vae, "config") else 4 diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py index 5351419ba68..6a9a6a6a0e9 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_ti2v.py @@ -22,7 +22,7 @@ import numpy as np import PIL.Image import torch -from 
diffusers import AutoencoderKLWan, FlowMatchEulerDiscreteScheduler +from diffusers import AutoencoderKLWan from diffusers.utils.torch_utils import randn_tensor from torch import nn from transformers import AutoTokenizer, UMT5EncoderModel @@ -32,6 +32,7 @@ from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader from vllm_omni.diffusion.models.interface import SupportImageInput +from vllm_omni.diffusion.models.schedulers import FlowUniPCMultistepScheduler from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import ( create_transformer_from_config, load_transformer_config, @@ -157,12 +158,13 @@ def __init__( transformer_config = load_transformer_config(model, "transformer", local_files_only) self.transformer = create_transformer_from_config(transformer_config) - # Scheduler - self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained( - model, subfolder="scheduler", local_files_only=local_files_only + # Initialize UniPC scheduler + flow_shift = od_config.flow_shift if od_config.flow_shift is not None else 5.0 # default for 720p + self.scheduler = FlowUniPCMultistepScheduler( + num_train_timesteps=1000, + shift=flow_shift, + prediction_type="flow_prediction", ) - if od_config.flow_shift is not None: - self.scheduler.config.flow_shift = od_config.flow_shift # VAE scale factors self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if hasattr(self.vae, "config") else 4 From 4e23bfff9eb4dfc3539399644c2caab9f4cca280 Mon Sep 17 00:00:00 2001 From: wangyu <53896905+yenuo26@users.noreply.github.com> Date: Fri, 16 Jan 2026 21:16:43 +0800 Subject: [PATCH 48/59] [Test] Add full test for Qwen3-Omni-30B-A3B-Instruct (#720) Signed-off-by: wangyu31577 Co-authored-by: wangyu31577 --- docker/Dockerfile.ci | 6 + pyproject.toml | 2 + tests/conftest.py | 294 ++++++++++++++++++ .../test_qwen3_omni_expansion.py | 158 ++++++++++ tests/e2e/stage_configs/qwen3_omni_ci.yaml 
| 95 ++++++ 5 files changed, 555 insertions(+) create mode 100644 tests/e2e/online_serving/test_qwen3_omni_expansion.py create mode 100644 tests/e2e/stage_configs/qwen3_omni_ci.yaml diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 5e1d00a5f88..aecc429454c 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -6,6 +6,12 @@ WORKDIR ${APP_DIR} COPY . . +# Install system dependencies +RUN apt-get update && \ + apt-get install -y ffmpeg && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + # Install vllm-omni into the same uv-managed Python environment used by the base image. RUN uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]" diff --git a/pyproject.toml b/pyproject.toml index 4833b117487..2e2cddc5b7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,8 @@ dev = [ "pytest-cov>=4.0.0", "mypy==1.11.1", "pre-commit==4.0.1", + "openai-whisper>=20250625", + "psutil>=7.2.0" ] docs = [ diff --git a/tests/conftest.py b/tests/conftest.py index 82c959f07ca..5b21f671bdb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,19 @@ +import base64 import os +import socket +import subprocess +import sys +import time +from pathlib import Path +from typing import Any +import psutil import pytest import torch +import whisper +import yaml from vllm.logger import init_logger +from vllm.utils import get_open_port logger = init_logger(__name__) @@ -34,3 +45,286 @@ def clean_gpu_memory_between_tests(): if torch.cuda.is_available(): torch.cuda.empty_cache() gc.collect() + + +def dummy_messages_from_mix_data( + system_prompt: dict[str, Any] = None, + video_data_url: Any = None, + audio_data_url: Any = None, + image_data_url: Any = None, + content_text: str = None, +): + """Create messages with video、image、audio data URL for OpenAI API.""" + + if content_text is not None: + content = [{"type": "text", "text": content_text}] + else: + content = [] + + media_items = [] + if 
isinstance(video_data_url, list): + for video_url in video_data_url: + media_items.append((video_url, "video")) + else: + media_items.append((video_data_url, "video")) + + if isinstance(image_data_url, list): + for url in image_data_url: + media_items.append((url, "image")) + else: + media_items.append((image_data_url, "image")) + + if isinstance(audio_data_url, list): + for url in audio_data_url: + media_items.append((url, "audio")) + else: + media_items.append((audio_data_url, "audio")) + + content.extend( + {"type": f"{media_type}_url", f"{media_type}_url": {"url": url}} + for url, media_type in media_items + if url is not None + ) + messages = [{"role": "user", "content": content}] + if system_prompt is not None: + messages = [system_prompt] + messages + return messages + + +def cosine_similarity_text(s1, s2): + """ + Calculate cosine similarity between two text strings. + Notes: + ------ + - Higher score means more similar texts + - Score of 1.0 means identical word composition (bag-of-words) + - Score of 0.0 means completely different vocabulary + """ + from sklearn.feature_extraction.text import CountVectorizer + from sklearn.metrics.pairwise import cosine_similarity + + vectorizer = CountVectorizer().fit_transform([s1, s2]) + vectors = vectorizer.toarray() + return cosine_similarity([vectors[0]], [vectors[1]])[0][0] + + +def convert_audio_to_text(audio_data): + """ + Convert base64 encoded audio data to text using speech recognition. + """ + + audio_data = base64.b64decode(audio_data) + output_path = f"./test_{int(time.time())}" + with open(output_path, "wb") as audio_file: + audio_file.write(audio_data) + + print(f"audio data is saved: {output_path}") + model = whisper.load_model("base") + text = model.transcribe(output_path)["text"] + if text: + return text + else: + return "" + + +def modify_stage_config( + yaml_path: str, + stage_updates: dict[int, dict[str, Any]], +) -> str: + """ + Batch modify configurations for multiple stages in a YAML file. 
+ + Args: + yaml_path: Path to the YAML configuration file. + stage_updates: Dictionary where keys are stage IDs and values are dictionaries of + modifications for that stage. Each modification dictionary uses + dot-separated paths as keys and new configuration values as values. + Example: { + 0: {'engine_args.max_model_len': 5800}, + 1: {'runtime.max_batch_size': 2} + } + + Returns: + str: Path to the newly created modified YAML file with timestamp suffix. + + Example: + >>> output_file = modify_stage_config( + ... 'config.yaml', + ... { + ... 0: {'engine_args.max_model_len': 5800}, + ... 1: {'runtime.max_batch_size': 2} + ... } + ... ) + >>> print(f"Modified configuration saved to: {output_file}") + Modified configuration saved to: config_1698765432.yaml + """ + path = Path(yaml_path) + if not path.exists(): + raise FileNotFoundError(f"yaml does not exist: {path}") + try: + with open(yaml_path, encoding="utf-8") as f: + config = yaml.safe_load(f) or {} + except Exception as e: + raise ValueError(f"Cannot parse YAML file: {e}") + + stage_args = config.get("stage_args", []) + if not stage_args: + raise ValueError("the stage_args does not exist") + + for stage_id, config_dict in stage_updates.items(): + target_stage = None + for stage in stage_args: + if stage.get("stage_id") == stage_id: + target_stage = stage + break + + if target_stage is None: + available_ids = [s.get("stage_id") for s in stage_args if "stage_id" in s] + raise KeyError(f"Stage ID {stage_id} is not exist, available IDs: {available_ids}") + + for key_path, value in config_dict.items(): + current = target_stage + keys = key_path.split(".") + for i in range(len(keys) - 1): + key = keys[i] + if key not in current: + raise KeyError(f"the {'.'.join(keys[: i + 1])} does not exist") + + elif not isinstance(current[key], dict) and i < len(keys) - 2: + raise ValueError(f"{'.'.join(keys[: i + 1])}' cannot continue deeper because it's not a dict") + current = current[key] + current[keys[-1]] = value + + 
output_path = f"{yaml_path.split('.')[0]}_{int(time.time())}.yaml" + with open(output_path, "w", encoding="utf-8") as f: + yaml.dump(config, f, default_flow_style=False, sort_keys=False, allow_unicode=True, indent=2) + + return output_path + + +class OmniServer: + """Omniserver for vLLM-Omni tests.""" + + def __init__( + self, + model: str, + serve_args: list[str], + *, + env_dict: dict[str, str] | None = None, + ) -> None: + self.model = model + self.serve_args = serve_args + self.env_dict = env_dict + self.proc: subprocess.Popen | None = None + self.host = "127.0.0.1" + self.port = get_open_port() + + def _start_server(self) -> None: + """Start the vLLM-Omni server subprocess.""" + env = os.environ.copy() + env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + if self.env_dict is not None: + env.update(self.env_dict) + + cmd = [ + sys.executable, + "-m", + "vllm_omni.entrypoints.cli.main", + "serve", + self.model, + "--omni", + "--host", + self.host, + "--port", + str(self.port), + ] + self.serve_args + + print(f"Launching OmniServer with: {' '.join(cmd)}") + self.proc = subprocess.Popen( + cmd, + env=env, + cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))), # Set working directory to vllm-omni root + ) + + # Wait for server to be ready + max_wait = 600 # 10 minutes + start_time = time.time() + while time.time() - start_time < max_wait: + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.settimeout(1) + result = sock.connect_ex((self.host, self.port)) + if result == 0: + print(f"Server ready on {self.host}:{self.port}") + return + except Exception: + pass + time.sleep(2) + + raise RuntimeError(f"Server failed to start within {max_wait} seconds") + + def _kill_process_tree(self, pid): + """kill process and its children""" + try: + parent = psutil.Process(pid) + children = parent.children(recursive=True) + for child in children: + try: + child.terminate() + except psutil.NoSuchProcess: + pass + + gone, still_alive = 
psutil.wait_procs(children, timeout=10) + + for child in still_alive: + try: + child.kill() + except psutil.NoSuchProcess: + pass + + try: + parent.terminate() + parent.wait(timeout=10) + except (psutil.NoSuchProcess, psutil.TimeoutExpired): + try: + parent.kill() + except psutil.NoSuchProcess: + pass + + except psutil.NoSuchProcess: + pass + + def __enter__(self): + self._start_server() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.proc: + try: + parent = psutil.Process(self.proc.pid) + children = parent.children(recursive=True) + for child in children: + try: + child.terminate() + except psutil.NoSuchProcess: + pass + + gone, still_alive = psutil.wait_procs(children, timeout=10) + + for child in still_alive: + try: + child.kill() + except psutil.NoSuchProcess: + pass + + try: + parent.terminate() + parent.wait(timeout=10) + except (psutil.NoSuchProcess, psutil.TimeoutExpired): + try: + parent.kill() + except psutil.NoSuchProcess: + pass + + except psutil.NoSuchProcess: + pass diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py new file mode 100644 index 00000000000..6a47e96f866 --- /dev/null +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +E2E Online tests for Qwen3-Omni model. 
+""" + +import concurrent.futures +import os +import time +from pathlib import Path + +import openai +import pytest + +from tests.conftest import ( + OmniServer, + convert_audio_to_text, + cosine_similarity_text, + dummy_messages_from_mix_data, + modify_stage_config, +) + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] + +# CI stage config for 2*H100-80G GPUs +stage_configs = [str(Path(__file__).parent.parent / "stage_configs" / "qwen3_omni_ci.yaml")] + +# Create parameter combinations for model and stage config +test_params = [(model, stage_config) for model in models for stage_config in stage_configs] + + +def client(omni_server): + """OpenAI client for the running vLLM-Omni server.""" + return openai.OpenAI( + base_url=f"http://{omni_server.host}:{omni_server.port}/v1", + api_key="EMPTY", + ) + + +def get_system_prompt(): + return { + "role": "system", + "content": [ + { + "type": "text", + "text": ( + "You are Qwen, a virtual human developed by the Qwen Team, " + "Alibaba Group, capable of perceiving auditory and visual inputs, " + "as well as generating text and speech." + ), + } + ], + } + + +def get_prompt(prompt_type="text_only"): + prompts = { + "text_only": "What is the capital of China?", + "mix": "What is recited in the audio? What is in this image? 
Describe the video briefly.", + } + return prompts.get(prompt_type, prompts["text_only"]) + + +def get_max_batch_size(size_type="few"): + batch_sizes = {"few": 5, "medium": 100, "large": 256} + return batch_sizes.get(size_type, 5) + + +@pytest.mark.parametrize("test_config", test_params) +def test_text_to_text_001(test_config: tuple[str, str]) -> None: + """Test processing text, generating text output via OpenAI API.""" + model, stage_config_path = test_config + with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server: + messages = dummy_messages_from_mix_data(system_prompt=get_system_prompt(), content_text=get_prompt()) + + # Test single completion + api_client = client(server) + start_time = time.perf_counter() + chat_completion = api_client.chat.completions.create( + model=server.model, messages=messages, max_tokens=20, modalities=["text"] + ) + # Verify E2E + print(f"the request e2e is: {time.perf_counter() - start_time}") + # TODO: Verify the E2E latency after confirmation baseline. + + # Verify only output text + assert len(chat_completion.choices) == 1, "The generated content includes more than just text." + + # Verify text output success + text_choice = chat_completion.choices[0] + assert text_choice.message.content is not None, "No text output is generated" + assert chat_completion.usage.completion_tokens <= 20, "The output length more than the requested max_tokens." + assert "beijing" in text_choice.message.content.lower(), "The output do not contain keywords." 
+ + +@pytest.mark.parametrize("test_config", test_params) +def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None: + """Test processing text, generating text and audio output via OpenAI API.""" + + model, stage_config_path = test_config + num_concurrent_requests = get_max_batch_size() + stage_config_path = modify_stage_config( + stage_config_path, + { + 0: {"runtime.max_batch_size": num_concurrent_requests}, + 1: {"runtime.max_batch_size": num_concurrent_requests}, + }, + ) + with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server: + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), content_text="What is the capital of China?" + ) + + # Test single completion + api_client = client(server) + e2e_list = list() + with concurrent.futures.ThreadPoolExecutor(max_workers=num_concurrent_requests) as executor: + # Submit multiple completion requests concurrently + futures = [ + executor.submit(api_client.chat.completions.create, model=server.model, messages=messages) + for _ in range(num_concurrent_requests) + ] + start_time = time.perf_counter() + # Wait for all requests to complete and collect results + chat_completions = list() + for future in concurrent.futures.as_completed(futures): + chat_completions.append(future.result()) + # Verify E2E + current_e2e = time.perf_counter() - start_time + print(f"the request e2e is: {current_e2e}") + # TODO: Verify the E2E latency after confirmation baseline. + e2e_list.append(current_e2e) + + print(f"the avg e2e is: {sum(e2e_list) / len(e2e_list)}") + # Verify all completions succeeded + assert len(chat_completions) == num_concurrent_requests, "Not all requests succeeded." 
+ for chat_completion in chat_completions: + # Verify audio output success + audio_message = chat_completion.choices[1].message + audio_data = audio_message.audio.data + assert audio_data is not None, "No audio output is generated" + assert audio_message.audio.expires_at > time.time(), "The generated audio has expired." + + # Verify text output success + text_choice = chat_completion.choices[0] + text_content = text_choice.message.content + assert text_choice.message.content is not None, "No text output is generated" + assert "beijing" in text_choice.message.content.lower(), "The output do not contain keywords." + + # Verify text output same as audio output + audio_content = convert_audio_to_text(audio_data) + print(f"text content is: {text_content}") + print(f"audio content is: {audio_content}") + assert cosine_similarity_text(audio_content.lower(), text_content.lower()) > 0.9, ( + "The audio content is not same as the text" + ) diff --git a/tests/e2e/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/stage_configs/qwen3_omni_ci.yaml new file mode 100644 index 00000000000..5106b185419 --- /dev/null +++ b/tests/e2e/stage_configs/qwen3_omni_ci.yaml @@ -0,0 +1,95 @@ +# Stage config for running Qwen3-Omni-MoE with 3-stage architecture +# Stage 0: Thinker (multimodal understanding + text generation) +# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes) +# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) + +# The following config has been verified on 2x H100-80G GPUs. 
+stage_args: +- stage_id: 0 + runtime: + devices: "0,1" + max_batch_size: 1 + engine_args: + model_stage: thinker + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.6 + enforce_eager: true + trust_remote_code: true + engine_output_type: latent # Output hidden states for talker + distributed_executor_backend: "mp" + max_num_batched_tokens: 32768 + enable_prefix_caching: false + hf_config_name: thinker_config + tensor_parallel_size: 2 + final_output: true + final_output_type: text + is_comprehension: true + default_sampling_params: + temperature: 0.4 + top_p: 0.9 + top_k: 1 + max_tokens: 100 + seed: 42 + ignore_eos: False + detokenize: True + repetition_penalty: 1.05 + +- stage_id: 1 + runtime: + devices: "1" + max_batch_size: 1 + engine_args: + model_stage: talker + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.3 + enforce_eager: true + trust_remote_code: true + engine_output_type: latent # Output codec codes for code2wav + enable_prefix_caching: false + max_num_batched_tokens: 32768 + distributed_executor_backend: "mp" + hf_config_name: talker_config + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 100 + seed: 42 + detokenize: False + repetition_penalty: 1.05 + stop_token_ids: [2150] + +- stage_id: 2 + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: code2wav + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + 
enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + engine_output_type: audio # Final output: audio waveform + gpu_memory_utilization: 0.1 + distributed_executor_backend: "mp" + max_num_batched_tokens: 1000000 + hf_config_name: thinker_config + engine_input_source: [1] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 200 + seed: 42 + detokenize: True + repetition_penalty: 1.1 From 088852087529ff7860b7273ded351e3fc1a6a374 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Sat, 17 Jan 2026 15:21:49 +0800 Subject: [PATCH 49/59] [Bagel] Support Cache-Dit (#736) Signed-off-by: princepride --- docs/user_guide/diffusion_acceleration.md | 1 + .../diffusion/cache/cache_dit_backend.py | 387 ++++++++++++++++++ .../models/bagel/bagel_transformer.py | 15 +- 3 files changed, 398 insertions(+), 5 deletions(-) diff --git a/docs/user_guide/diffusion_acceleration.md b/docs/user_guide/diffusion_acceleration.md index cf04c6228a6..0184a8fcd39 100644 --- a/docs/user_guide/diffusion_acceleration.md +++ b/docs/user_guide/diffusion_acceleration.md @@ -49,6 +49,7 @@ The following table shows which models are currently supported by each accelerat | **Qwen-Image-Layered** | `Qwen/Qwen-Image-Layered` | ❌ | ✅ | ✅ | ✅ | ✅ | | **Z-Image** | `Tongyi-MAI/Z-Image-Turbo` | ❌ | ✅ |❌ | ❌ | ❌ | | **Stable-Diffusion3.5** | `stabilityai/stable-diffusion-3.5` | ❌ | ✅ | ❌ | ❌ | ❌ | +| **Bagel** | `ByteDance-Seed/BAGEL-7B-MoT` | ❌ | ✅ | ❌ | ❌ | ❌ | ### VideoGen diff --git a/vllm_omni/diffusion/cache/cache_dit_backend.py b/vllm_omni/diffusion/cache/cache_dit_backend.py index 3485e2262c9..0c43659c9a6 100644 --- a/vllm_omni/diffusion/cache/cache_dit_backend.py +++ b/vllm_omni/diffusion/cache/cache_dit_backend.py @@ -7,11 +7,19 @@ pipelines in vllm-omni, supporting both 
single and dual-transformer architectures. """ +import functools from collections.abc import Callable +from contextlib import ExitStack from typing import Any, Optional import cache_dit +import torch from cache_dit import BlockAdapter, DBCacheConfig, ForwardPattern, ParamsModifier, TaylorSeerCalibratorConfig +from cache_dit.caching.block_adapters import FakeDiffusionPipeline +from cache_dit.caching.cache_adapters.cache_adapter import CachedAdapter +from cache_dit.caching.cache_blocks.pattern_0_1_2 import CachedBlocks_Pattern_0_1_2 +from cache_dit.caching.cache_contexts import BasicCacheConfig +from cache_dit.caching.cache_contexts.cache_manager import CachedContextManager from vllm.logger import init_logger from vllm_omni.diffusion.cache.base import CacheBackend @@ -401,6 +409,384 @@ def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool return refresh_cache_context +class BagelCachedContextManager(CachedContextManager): + """ + Custom CachedContextManager for Bagel that safely handles NaiveCache objects + (mapped to encoder_hidden_states) by skipping tensor operations on them. + """ + + @torch.compiler.disable + def apply_cache( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor = None, + prefix: str = "Bn", + encoder_prefix: str = "Bn_encoder", + ) -> tuple[torch.Tensor, torch.Tensor | None]: + # Allow Bn and Fn prefix to be used for residual cache. 
+ if "Bn" in prefix: + hidden_states_prev = self.get_Bn_buffer(prefix) + else: + hidden_states_prev = self.get_Fn_buffer(prefix) + + assert hidden_states_prev is not None, f"{prefix}_buffer must be set before" + + if self.is_cache_residual(): + hidden_states = hidden_states_prev + hidden_states + else: + # If cache is not residual, we use the hidden states directly + hidden_states = hidden_states_prev + + hidden_states = hidden_states.contiguous() + + if encoder_hidden_states is not None: + if "Bn" in encoder_prefix: + encoder_hidden_states_prev = self.get_Bn_encoder_buffer(encoder_prefix) + else: + encoder_hidden_states_prev = self.get_Fn_encoder_buffer(encoder_prefix) + + if encoder_hidden_states_prev is not None: + if self.is_encoder_cache_residual(): + # FIX: Check if encoder_hidden_states is a tensor before adding + if isinstance(encoder_hidden_states, torch.Tensor) and isinstance( + encoder_hidden_states_prev, torch.Tensor + ): + encoder_hidden_states = encoder_hidden_states_prev + encoder_hidden_states + else: + # If encoder cache is not residual, we use the encoder hidden states directly + encoder_hidden_states = encoder_hidden_states_prev + + # FIX: Check if encoder_hidden_states is a tensor before calling contiguous + if isinstance(encoder_hidden_states, torch.Tensor): + encoder_hidden_states = encoder_hidden_states.contiguous() + + return hidden_states, encoder_hidden_states + + +class BagelCachedBlocks(CachedBlocks_Pattern_0_1_2): + """ + Custom CachedBlocks for Bagel that safely handles NaiveCache objects + by adding isinstance checks in call_Mn_blocks and compute_or_prune. 
+ """ + + def call_Mn_blocks( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + *args, + **kwargs, + ): + original_hidden_states = hidden_states + original_encoder_hidden_states = encoder_hidden_states + for block in self._Mn_blocks(): + hidden_states = block( + hidden_states, + encoder_hidden_states, + *args, + **kwargs, + ) + hidden_states, encoder_hidden_states = self._process_block_outputs(hidden_states, encoder_hidden_states) + + # compute hidden_states residual + hidden_states = hidden_states.contiguous() + + hidden_states_residual = hidden_states - original_hidden_states + + if ( + encoder_hidden_states is not None + and original_encoder_hidden_states is not None + and isinstance(encoder_hidden_states, torch.Tensor) # FIX: Added Check + ): + encoder_hidden_states = encoder_hidden_states.contiguous() + encoder_hidden_states_residual = encoder_hidden_states - original_encoder_hidden_states + else: + encoder_hidden_states_residual = None + + return ( + hidden_states, + encoder_hidden_states, + hidden_states_residual, + encoder_hidden_states_residual, + ) + + def compute_or_prune( + self, + block_id: int, # Block index in the transformer blocks + # Below are the inputs to the block + block, # The transformer block to be executed + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + *args, + **kwargs, + ): + # NOTE: Although Bagel likely won't use pruning, implementing safe version just in case. + # Copy-pasted from original but adding checks. 
+ + original_hidden_states = hidden_states + original_encoder_hidden_states = encoder_hidden_states + + can_use_prune = self._maybe_prune( + block_id, + hidden_states, + prefix=f"{self.cache_prefix}_{block_id}_Fn_original", + ) + + torch._dynamo.graph_break() + if can_use_prune: + self.context_manager.add_pruned_step() + hidden_states, encoder_hidden_states = self.context_manager.apply_prune( + hidden_states, + encoder_hidden_states, + prefix=( + f"{self.cache_prefix}_{block_id}_Bn_residual" + if self.context_manager.is_cache_residual() + else f"{self.cache_prefix}_Bn_hidden_states" + ), + encoder_prefix=( + f"{self.cache_prefix}_{block_id}_Bn_encoder_residual" + if self.context_manager.is_encoder_cache_residual() + else f"{self.cache_prefix}_{block_id}_Bn_encoder_hidden_states" + ), + ) + torch._dynamo.graph_break() + else: + # Normal steps: Compute the block and cache the residuals. + hidden_states = block( + hidden_states, + encoder_hidden_states, + *args, + **kwargs, + ) + hidden_states, encoder_hidden_states = self._process_block_outputs(hidden_states, encoder_hidden_states) + if not self._skip_prune(block_id): + hidden_states = hidden_states.contiguous() + hidden_states_residual = hidden_states - original_hidden_states + + if ( + encoder_hidden_states is not None + and original_encoder_hidden_states is not None + and isinstance(encoder_hidden_states, torch.Tensor) # FIX: Added Check + ): + encoder_hidden_states = encoder_hidden_states.contiguous() + encoder_hidden_states_residual = encoder_hidden_states - original_encoder_hidden_states + else: + encoder_hidden_states_residual = None + + self.context_manager.set_Fn_buffer( + original_hidden_states, + prefix=f"{self.cache_prefix}_{block_id}_Fn_original", + ) + if self.context_manager.is_cache_residual(): + self.context_manager.set_Bn_buffer( + hidden_states_residual, + prefix=f"{self.cache_prefix}_{block_id}_Bn_residual", + ) + else: + self.context_manager.set_Bn_buffer( + hidden_states, + 
prefix=f"{self.cache_prefix}_{block_id}_Bn_hidden_states", + ) + if encoder_hidden_states_residual is not None: + if self.context_manager.is_encoder_cache_residual(): + self.context_manager.set_Bn_encoder_buffer( + encoder_hidden_states_residual, + prefix=f"{self.cache_prefix}_{block_id}_Bn_encoder_residual", + ) + else: + self.context_manager.set_Bn_encoder_buffer( + encoder_hidden_states_residual, + prefix=f"{self.cache_prefix}_{block_id}_Bn_encoder_hidden_states", + ) + torch._dynamo.graph_break() + + return hidden_states, encoder_hidden_states + + +class BagelCachedAdapter(CachedAdapter): + """ + Custom CachedAdapter for Bagel that uses BagelCachedContextManager and BagelCachedBlocks. + """ + + @classmethod + def create_context( + cls, + block_adapter: BlockAdapter, + **context_kwargs, + ) -> tuple[list[str], list[dict[str, Any]]]: + # Override to use BagelCachedContextManager + + BlockAdapter.assert_normalized(block_adapter) + + if BlockAdapter.is_cached(block_adapter.pipe): + return block_adapter.pipe + + # Check context_kwargs + context_kwargs = cls.check_context_kwargs(block_adapter, **context_kwargs) + + # Each Pipeline should have it's own context manager instance. + cache_config: BasicCacheConfig = context_kwargs.get("cache_config", None) + assert cache_config is not None, "cache_config can not be None." 
+ + # Apply cache on pipeline: wrap cache context + pipe_cls_name = block_adapter.pipe.__class__.__name__ + + # USE CUSTOM CONTEXT MANAGER + context_manager = BagelCachedContextManager( + name=f"{pipe_cls_name}_{hash(id(block_adapter.pipe))}", + persistent_context=isinstance(block_adapter.pipe, FakeDiffusionPipeline), + ) + + flatten_contexts, contexts_kwargs = cls.modify_context_params(block_adapter, **context_kwargs) + + block_adapter.pipe._context_manager = context_manager # instance level + + if not context_manager.persistent_context: + original_call = block_adapter.pipe.__class__.__call__ + + @functools.wraps(original_call) + def new_call(self, *args, **kwargs): + with ExitStack() as stack: + # cache context will be reset for each pipe inference + for context_name, context_kwargs in zip(flatten_contexts, contexts_kwargs): + stack.enter_context( + context_manager.enter_context( + context_manager.reset_context( + context_name, + **context_kwargs, + ), + ) + ) + outputs = original_call(self, *args, **kwargs) + cls.apply_stats_hooks(block_adapter) + return outputs + + block_adapter.pipe.__class__.__call__ = new_call + block_adapter.pipe.__class__._original_call = original_call + + else: + # Init persistent cache context for transformer + for context_name, context_kwargs in zip(flatten_contexts, contexts_kwargs): + context_manager.reset_context( + context_name, + **context_kwargs, + ) + + block_adapter.pipe.__class__._is_cached = True + + cls.apply_params_hooks(block_adapter, contexts_kwargs) + + return flatten_contexts, contexts_kwargs + + @classmethod + def collect_unified_blocks( + cls, + block_adapter: BlockAdapter, + contexts_kwargs: list[dict], + ) -> list[dict[str, torch.nn.ModuleList]]: + # Override to use BagelCachedBlocks + + BlockAdapter.assert_normalized(block_adapter) + + total_cached_blocks: list[dict[str, torch.nn.ModuleList]] = [] + assert hasattr(block_adapter.pipe, "_context_manager") + # Skipping isinstance check for 
ContextManager._supported_managers to avoid import issues + + for i in range(len(block_adapter.transformer)): + unified_blocks_bind_context = {} + for j in range(len(block_adapter.blocks[i])): + cache_config: BasicCacheConfig = contexts_kwargs[i * len(block_adapter.blocks[i]) + j]["cache_config"] + + # Directly instantiate BagelCachedBlocks + unified_blocks_bind_context[block_adapter.unique_blocks_name[i][j]] = torch.nn.ModuleList( + [ + BagelCachedBlocks( + # 0. Transformer blocks configuration + block_adapter.blocks[i][j], + transformer=block_adapter.transformer[i], + forward_pattern=block_adapter.forward_pattern[i][j], + check_forward_pattern=block_adapter.check_forward_pattern, + check_num_outputs=block_adapter.check_num_outputs, + # 1. Cache/Prune context configuration + cache_prefix=block_adapter.blocks_name[i][j], + cache_context=block_adapter.unique_blocks_name[i][j], + context_manager=block_adapter.pipe._context_manager, + cache_type=cache_config.cache_type, + ) + ] + ) + + total_cached_blocks.append(unified_blocks_bind_context) + + return total_cached_blocks + + +def enable_cache_for_bagel(pipeline: Any, cache_config: Any) -> Callable[[int], None]: + """Enable cache-dit for Bagel model (via OmniDiffusion pipeline). + + Args: + pipeline: The OmniDiffusion pipeline instance. + cache_config: DiffusionCacheConfig instance with cache configuration. + + Returns: + A refresh function that can be called to update cache context with new num_inference_steps. 
+ """ + # Build DBCacheConfig + db_cache_config = _build_db_cache_config(cache_config) + + # Build calibrator config if TaylorSeer is enabled + calibrator_config = None + if cache_config.enable_taylorseer: + taylorseer_order = cache_config.taylorseer_order + calibrator_config = TaylorSeerCalibratorConfig(taylorseer_order=taylorseer_order) + logger.info(f"TaylorSeer enabled with order={taylorseer_order}") + + # Access the transformer: BagelPipeline -> Qwen2MoTForCausalLM -> Qwen2MoTModel + # BagelPipeline has self.language_model which is Qwen2MoTForCausalLM + # Qwen2MoTForCausalLM has self.model which is Qwen2MoTModel + transformer = pipeline.language_model.model + + logger.info( + f"Enabling cache-dit on Bagel transformer: " + f"Fn={db_cache_config.Fn_compute_blocks}, " + f"Bn={db_cache_config.Bn_compute_blocks}, " + f"W={db_cache_config.max_warmup_steps}, " + ) + + # Enable cache-dit on the transformer + # Pattern_0 corresponds to (hidden_states, encoder_hidden_states) input, output + # Custom adapter for Bagel to handle NaiveCache correctly + # from vllm_omni.diffusion.cache.bagel_cache_adapter import BagelCachedAdapter # No longer needed + BagelCachedAdapter.apply( + BlockAdapter( + transformer=transformer, + blocks=transformer.layers, + forward_pattern=ForwardPattern.Pattern_0, + ), + cache_config=db_cache_config, + calibrator_config=calibrator_config, + ) + + def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool = True) -> None: + transformer = pipeline.language_model.model + if cache_config.scm_steps_mask_policy is None: + cache_dit.refresh_context(transformer, num_inference_steps=num_inference_steps, verbose=verbose) + else: + cache_dit.refresh_context( + transformer, + cache_config=DBCacheConfig().reset( + num_inference_steps=num_inference_steps, + steps_computation_mask=cache_dit.steps_mask( + mask_policy=cache_config.scm_steps_mask_policy, + total_steps=num_inference_steps, + ), + 
steps_computation_policy=cache_config.scm_steps_policy, + ), + verbose=verbose, + ) + + return refresh_cache_context + + # Register custom cache-dit enablers after function definitions CUSTOM_DIT_ENABLERS.update( { @@ -409,6 +795,7 @@ def refresh_cache_context(pipeline: Any, num_inference_steps: int, verbose: bool "LongCatImagePipeline": enable_cache_for_longcat_image, "LongCatImageEditPipeline": enable_cache_for_longcat_image, "StableDiffusion3Pipeline": enable_cache_for_sd3, + "BagelPipeline": enable_cache_for_bagel, } ) diff --git a/vllm_omni/diffusion/models/bagel/bagel_transformer.py b/vllm_omni/diffusion/models/bagel/bagel_transformer.py index 7950389ef1f..256f25e0839 100644 --- a/vllm_omni/diffusion/models/bagel/bagel_transformer.py +++ b/vllm_omni/diffusion/models/bagel/bagel_transformer.py @@ -314,10 +314,12 @@ def __init__( def forward( self, - packed_query_sequence: torch.Tensor, - query_lens: torch.Tensor, - packed_query_position_embeddings: torch.Tensor, - packed_query_indexes: torch.Tensor, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor | None = None, + packed_query_sequence: torch.Tensor | None = None, + query_lens: torch.Tensor = None, + packed_query_position_embeddings: torch.Tensor = None, + packed_query_indexes: torch.Tensor = None, past_key_values: NaiveCache | None = None, key_values_lens: torch.Tensor | None = None, packed_key_value_indexes: torch.Tensor | None = None, @@ -327,6 +329,8 @@ def forward( packed_vae_token_indexes=None, packed_text_indexes=None, ) -> BaseNavitOutputWithPast: + if packed_query_sequence is None: + packed_query_sequence = hidden_states residual = packed_query_sequence if mode == "und": packed_query_sequence = self.input_layernorm(packed_query_sequence) @@ -437,7 +441,8 @@ def forward( for layer_idx, decoder_layer in enumerate(self.layers): packed_query_sequence, past_key_values = decoder_layer( - packed_query_sequence=packed_query_sequence, + hidden_states=packed_query_sequence, + 
encoder_hidden_states=None, query_lens=query_lens, packed_query_position_embeddings=packed_query_position_embeddings, packed_query_indexes=packed_query_indexes, From 67b279a9375bb876b4df86acabe4f38c158c30e1 Mon Sep 17 00:00:00 2001 From: Taichang Zhou Date: Sat, 17 Jan 2026 19:45:29 +0800 Subject: [PATCH 50/59] Update Qwen2.5 Omni model to version 0.14, adding support for image and video input processing, and refining position handling for MRoPE. Adjustments made to the YAML configuration to disable async scheduling for consistency. Code cleanup and formatting improvements included. Signed-off-by: Taichang Zhou --- .../models/qwen2_5_omni/qwen2_5_omni.py | 30 ++++++-- .../qwen2_5_omni/qwen2_5_omni_thinker.py | 47 ++++++++++-- .../models/qwen2_5_omni/qwen2_old.py | 73 +------------------ .../stage_configs/qwen2_5_omni.yaml | 1 + 4 files changed, 70 insertions(+), 81 deletions(-) diff --git a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py index 37733e2909c..61b6b6d2316 100644 --- a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py +++ b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py @@ -107,6 +107,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) if t2w_token_end_id: self.model.set_suppress_start_id(t2w_token_end_id + 1) + self.requires_raw_input_tokens = True elif self.model_stage == "code2wav": self.thinker = None @@ -248,14 +249,27 @@ def forward( if is_npu(): # TODO: remove this hack when NPU supports batched inputs properly thinker_input_ids = input_ids[0] if input_ids is not None and added_batch_dim else input_ids - thinker_positions = positions[0] if positions.ndim > 1 else positions + # For MRoPE, positions shape is [3, num_tokens] (T/H/W), don't slice it + if positions.ndim == 2 and positions.shape[0] == 3: + thinker_positions = positions # MRoPE positions, keep as is + else: + thinker_positions = positions[0] if positions.ndim > 1 
else positions thinker_inputs_embeds = ( inputs_embeds[0] if inputs_embeds is not None and added_batch_dim else inputs_embeds ) else: - thinker_input_ids = input_ids - thinker_positions = positions[0] - thinker_inputs_embeds = inputs_embeds + # Squeeze back if we added batch dim earlier + thinker_input_ids = input_ids[0] if input_ids is not None and added_batch_dim else input_ids + # For MRoPE, positions shape is [3, num_tokens] (T/H/W), don't slice it + if positions.ndim == 2 and positions.shape[0] == 3: + thinker_positions = positions # MRoPE positions, keep as is + elif added_batch_dim: + thinker_positions = positions[0] + else: + thinker_positions = positions + thinker_inputs_embeds = ( + inputs_embeds[0] if inputs_embeds is not None and added_batch_dim else inputs_embeds + ) # Run thinker thinker_output = self.thinker( @@ -288,10 +302,16 @@ def forward( if not hasattr(self, "voice_type"): self.voice_type = voice_type + # For MRoPE, positions shape is [3, num_tokens] (T/H/W), don't slice it + if positions.ndim == 2 and positions.shape[0] == 3: + talker_positions = positions # MRoPE positions, keep as is + else: + talker_positions = positions[0] + with torch.inference_mode(): talker_hidden = self.talker( input_ids=input_ids, - positions=positions[0], + positions=talker_positions, inputs_embeds=inputs_embeds, ) diff --git a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py index 71c5e8377ac..a2250b82e73 100644 --- a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py +++ b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_thinker.py @@ -1,4 +1,4 @@ -"""Thin Omni wrapper: reuse upstream Qwen2.5-Omni thinker (v0.12) with minimal overrides.""" +"""Thin Omni wrapper: reuse upstream Qwen2.5-Omni thinker (v0.14) with minimal overrides.""" from collections.abc import Iterable from typing import Any @@ -12,6 +12,7 @@ Qwen2_5OmniAudioEncoder, ) from 
vllm.config import VllmConfig +from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.models.interfaces import ( MultiModalEmbeddings, @@ -26,8 +27,6 @@ Qwen2_5OmniThinkerDummyInputsBuilder, Qwen2_5OmniThinkerMultiModalProcessor, Qwen2_5OmniThinkerProcessingInfo, - get_llm_pos_ids_for_vision, - split_list_into_ranges, ) from vllm.model_executor.models.qwen2_5_omni_thinker import ( Qwen2_5OmniConditionalGenerationMixin as Qwen2_5OmniConditionalGenerationMixinBase, @@ -46,7 +45,9 @@ WeightsMapper, init_vllm_registered_model, maybe_prefix, + split_list_into_ranges, ) +from vllm.model_executor.models.vision import get_llm_pos_ids_for_vision from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import ( MultiModalFeatureSpec, @@ -166,6 +167,43 @@ def _parse_and_validate_video_input( video_grid_thw=video_grid_thw, ) + def _process_image_input(self, image_input: Qwen2_5_VLImageInputs) -> tuple[torch.Tensor, ...]: + if image_input["type"] == "image_embeds": + return image_input["image_embeds"].type(self.visual.dtype) + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + with set_forward_context(None, self.vllm_config): + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + # Split concatenated embeddings for each image item. 
+ merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return image_embeds.split(sizes.tolist()) + + def _process_video_input( + self, + video_input: Qwen2_5_VLVideoInputs, + video_hashes: list[str] = None, + cached_video_embeds: torch.Tensor = None, + ) -> torch.Tensor: + if video_input["type"] == "video_embeds": + return video_input["video_embeds"].type(self.visual.dtype) + + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 + + pixel_values_videos = video_input["pixel_values_videos"].type(self.visual.dtype) + with set_forward_context(None, self.vllm_config): + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + # Split concatenated embeddings for each video item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return video_embeds.split(sizes.tolist()) + @MULTIMODAL_REGISTRY.register_processor( Qwen2_5OmniThinkerMultiModalProcessor, @@ -180,8 +218,6 @@ class Qwen2_5OmniThinkerForConditionalGeneration( SupportsMRoPE, Qwen2_5OmniConditionalGenerationMixin, ): - merge_by_field_config = True - hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "thinker.lm_head.": "language_model.lm_head.", @@ -250,6 +286,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): norm_eps=getattr(thinker_config.text_config, "rms_norm_eps", 1e-6), quant_config=quant_config, prefix=maybe_prefix(prefix, "visual"), + multimodal_config=multimodal_config, ) else: self.visual = None diff --git a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_old.py b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_old.py index ce9819b3e68..e04010196a8 100644 --- a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_old.py +++ b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_old.py @@ -3,9 +3,6 @@ import torch from torch import nn from transformers import Qwen2Config -from vllm.attention.backends.abstract import ( - AttentionType, -) from 
vllm.attention.layer import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig @@ -15,7 +12,6 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead, VocabParallelEmbedding @@ -24,15 +20,14 @@ from vllm.model_executor.models.utils import ( AutoWeightsLoader, PPMissingLayer, - WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, ) from vllm.sequence import IntermediateTensors -from vllm.v1.outputs import PoolerOutput, SamplerOutput -from vllm.v1.pool.metadata import PoolingMetadata +from vllm.v1.attention.backend import AttentionType +from vllm.v1.outputs import SamplerOutput from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.sampler import Sampler @@ -130,7 +125,6 @@ def __init__( self.rotary_pos_emb = get_rope( head_size=self.head_dim, - rotary_dim=self.head_dim, max_position=max_position, is_neox_style=True, rope_parameters={ @@ -460,66 +454,3 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) return loader.load_weights(weights) - - -class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) - - def __init__(self, *, vllm_config: 
VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config - - self.config = config - self.lora_config = lora_config - - self.quant_config = quant_config - self.model = Qwen2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) - - # TODO: Replace this model class with as_embedding_model( - # Qwen2ForCausalLM) after changing the default pooling method - if pooler_config.pooling_type is None: - logger.warning( - "This embedding model will default to last-token pooling in " - "an upcoming version. To avoid breaking changes, you should " - 'pass `--override-pooler-config \'{"pooling_type": "MEAN"}\'`' - " explicitly." - ) - - self._pooler = Pooler.from_config_with_defaults( - pooler_config, pooling_type=PoolingType.MEAN, normalize=True, softmax=False - ) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: IntermediateTensors | None = None, - ) -> torch.Tensor: - return self.model(input_ids, positions, intermediate_tensors) - - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> PoolerOutput | None: - return self._pooler(hidden_states, pooling_metadata) - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - weights = self.hf_to_vllm_mapper.apply(weights) - weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) - self.model.load_weights(weights) diff --git a/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml b/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml index fc84f485ea4..e3d740ad580 100644 --- a/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml +++ b/vllm_omni/model_executor/stage_configs/qwen2_5_omni.yaml @@ -76,6 +76,7 @@ stage_args: trust_remote_code: true enable_prefix_caching: false max_num_batched_tokens: 
32768 + async_scheduling: false engine_output_type: audio engine_input_source: [1] final_output: true From bb24e07a7adb2c593494c3f452fa108c8cb26ef5 Mon Sep 17 00:00:00 2001 From: John Liu BUAA Date: Sat, 17 Jan 2026 19:47:11 +0800 Subject: [PATCH 51/59] [Perf] Optimize the Qwen2.5-Omni Model thinker-to-talker-proj with nn.Linear (#825) Signed-off-by: John Liu BUAA --- .buildkite/pipeline.yml | 2 +- .../models/qwen2_5_omni/qwen2_5_omni_talker.py | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index f18aef61771..d4cbbab2b88 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -94,7 +94,7 @@ steps: - "/fsx/hf_cache:/fsx/hf_cache" - label: "Diffusion Parallelism Test" - timeout_in_minutes: 20 + timeout_in_minutes: 25 depends_on: image-build commands: - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py diff --git a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_talker.py b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_talker.py index 7b0b4430917..927bc552573 100644 --- a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_talker.py +++ b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni_talker.py @@ -9,7 +9,6 @@ # from vllm.attention import AttentionMetadata # unused import from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.models.interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from vllm.model_executor.models.qwen2_5_omni_thinker import ( Qwen2_5OmniThinkerDummyInputsBuilder, @@ -68,13 +67,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.config = config - self.thinker_to_talker_proj = ColumnParallelLinear( + self.thinker_to_talker_proj = nn.Linear( self.config.embedding_size, self.config.hidden_size, - bias=True, - gather_output=True, - skip_bias_add=False, 
- quant_config=quant_config, ) self.language_model = init_vllm_registered_model( vllm_config=vllm_config, @@ -145,7 +140,7 @@ def forward( input_ids = None # projection - inputs_embeds, _ = self.thinker_to_talker_proj(inputs_embeds) + inputs_embeds = self.thinker_to_talker_proj(inputs_embeds) hidden_states = self.language_model.model( input_ids, positions, intermediate_tensors, inputs_embeds=inputs_embeds From 36c28767cd0a677ac92a8ffc01634e5c057f9408 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Sat, 17 Jan 2026 19:49:13 +0800 Subject: [PATCH 52/59] [Core]Add GPU Diffusion Runner (#822) Signed-off-by: princepride --- .buildkite/pipeline.yml | 2 +- .buildkite/test-amd.yaml | 2 +- docs/api/README.md | 5 +- ...worker.py => test_gpu_diffusion_worker.py} | 57 ++--- vllm_omni/diffusion/worker/__init__.py | 12 +- .../worker/gpu_diffusion_model_runner.py | 165 +++++++++++++++ ...{gpu_worker.py => gpu_diffusion_worker.py} | 194 ++++++------------ vllm_omni/diffusion/worker/npu/npu_worker.py | 132 +++++++++++- vllm_omni/utils/platform_utils.py | 2 +- 9 files changed, 405 insertions(+), 166 deletions(-) rename tests/diffusion/{test_gpu_worker.py => test_gpu_diffusion_worker.py} (81%) create mode 100644 vllm_omni/diffusion/worker/gpu_diffusion_model_runner.py rename vllm_omni/diffusion/worker/{gpu_worker.py => gpu_diffusion_worker.py} (65%) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index d4cbbab2b88..7375dd4a2c4 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -116,7 +116,7 @@ steps: timeout_in_minutes: 20 depends_on: image-build commands: - - pytest -s -v tests/diffusion/test_gpu_worker.py + - pytest -s -v tests/diffusion/test_gpu_diffusion_worker.py agents: queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU plugins: diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index bcece8c495d..86d65f15bcf 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -54,7 
+54,7 @@ steps: commands: - export MIOPEN_DEBUG_CONV_DIRECT=0 - export MIOPEN_DEBUG_CONV_GEMM=0 - - pytest -s -v tests/diffusion/test_gpu_worker.py + - pytest -s -v tests/diffusion/test_gpu_diffusion_worker.py - label: "Omni Model Test Qwen2-5-Omni" timeout_in_minutes: 15 diff --git a/docs/api/README.md b/docs/api/README.md index a9d751bce25..a1f07011118 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -103,8 +103,9 @@ Configuration classes. Worker classes and model runners for distributed inference. -- [vllm_omni.diffusion.worker.gpu_worker.GPUWorker][] -- [vllm_omni.diffusion.worker.gpu_worker.WorkerProc][] +- [vllm_omni.diffusion.worker.gpu_diffusion_model_runner.GPUDiffusionModelRunner][] +- [vllm_omni.diffusion.worker.gpu_diffusion_worker.GPUDiffusionWorker][] +- [vllm_omni.diffusion.worker.gpu_diffusion_worker.WorkerProc][] - [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorker][] - [vllm_omni.diffusion.worker.npu.npu_worker.NPUWorkerProc][] - [vllm_omni.worker.gpu_ar_model_runner.ExecuteModelState][] diff --git a/tests/diffusion/test_gpu_worker.py b/tests/diffusion/test_gpu_diffusion_worker.py similarity index 81% rename from tests/diffusion/test_gpu_worker.py rename to tests/diffusion/test_gpu_diffusion_worker.py index defeffe5b56..7a43710c878 100644 --- a/tests/diffusion/test_gpu_worker.py +++ b/tests/diffusion/test_gpu_diffusion_worker.py @@ -2,9 +2,9 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ -Unit tests for GPUWorker class. +Unit tests for GPUDiffusionWorker class. 
-This module tests the GPUWorker implementation: +This module tests the GPUDiffusionWorker implementation: - load_weights: Loading model weights - sleep: Putting worker into sleep mode (levels 1 and 2) - wake_up: Waking worker from sleep mode @@ -15,7 +15,7 @@ import pytest import torch -from vllm_omni.diffusion.worker.gpu_worker import GPUWorker +from vllm_omni.diffusion.worker.gpu_diffusion_worker import GPUDiffusionWorker @pytest.fixture @@ -33,20 +33,21 @@ def mock_od_config(): @pytest.fixture def mock_gpu_worker(mock_od_config): - """Create a GPUWorker with mocked initialization.""" - with patch.object(GPUWorker, "init_device_and_model"): - worker = GPUWorker(local_rank=0, rank=0, od_config=mock_od_config) - # Mock the pipeline - worker.pipeline = Mock() - worker.cache_backend = None + """Create a GPUDiffusionWorker with mocked initialization.""" + with patch.object(GPUDiffusionWorker, "init_device"): + worker = GPUDiffusionWorker(local_rank=0, rank=0, od_config=mock_od_config) + # Mock the model_runner with pipeline + worker.model_runner = Mock() + worker.model_runner.pipeline = Mock() + worker._sleep_saved_buffers = {} return worker -class TestGPUWorkerLoadWeights: - """Test GPUWorker.load_weights method.""" +class TestGPUDiffusionWorkerLoadWeights: + """Test GPUDiffusionWorker.load_weights method.""" def test_load_weights_calls_pipeline(self, mock_gpu_worker): - """Test that load_weights delegates to pipeline.load_weights.""" + """Test that load_weights delegates to model_runner.load_weights.""" # Setup mock weights mock_weights = [ ("layer1.weight", torch.randn(10, 10)), @@ -54,30 +55,30 @@ def test_load_weights_calls_pipeline(self, mock_gpu_worker): ] expected_loaded = {"layer1.weight", "layer2.weight"} - # Configure pipeline mock - mock_gpu_worker.pipeline.load_weights = Mock(return_value=expected_loaded) + # Configure model_runner mock + mock_gpu_worker.model_runner.load_weights = Mock(return_value=expected_loaded) # Call load_weights result = 
mock_gpu_worker.load_weights(mock_weights) - # Verify pipeline.load_weights was called with the weights - mock_gpu_worker.pipeline.load_weights.assert_called_once_with(mock_weights) + # Verify model_runner.load_weights was called with the weights + mock_gpu_worker.model_runner.load_weights.assert_called_once_with(mock_weights) assert result == expected_loaded def test_load_weights_empty_iterable(self, mock_gpu_worker): """Test load_weights with empty weights iterable.""" - mock_gpu_worker.pipeline.load_weights = Mock(return_value=set()) + mock_gpu_worker.model_runner.load_weights = Mock(return_value=set()) result = mock_gpu_worker.load_weights([]) - mock_gpu_worker.pipeline.load_weights.assert_called_once_with([]) + mock_gpu_worker.model_runner.load_weights.assert_called_once_with([]) assert result == set() -class TestGPUWorkerSleep: - """Test GPUWorker.sleep method.""" +class TestGPUDiffusionWorkerSleep: + """Test GPUDiffusionWorker.sleep method.""" - @patch("vllm_omni.diffusion.worker.gpu_worker.torch.cuda.mem_get_info") + @patch("vllm_omni.diffusion.worker.gpu_diffusion_worker.torch.cuda.mem_get_info") @patch("vllm.device_allocator.cumem.CuMemAllocator") def test_sleep_level_1(self, mock_allocator_class, mock_mem_info, mock_gpu_worker): """Test sleep mode level 1 (offload weights only).""" @@ -103,7 +104,7 @@ def test_sleep_level_1(self, mock_allocator_class, mock_mem_info, mock_gpu_worke # Verify buffers were NOT saved (level 1 doesn't save buffers) assert len(mock_gpu_worker._sleep_saved_buffers) == 0 - @patch("vllm_omni.diffusion.worker.gpu_worker.torch.cuda.mem_get_info") + @patch("vllm_omni.diffusion.worker.gpu_diffusion_worker.torch.cuda.mem_get_info") @patch("vllm.device_allocator.cumem.CuMemAllocator") def test_sleep_level_2(self, mock_allocator_class, mock_mem_info, mock_gpu_worker): """Test sleep mode level 2 (offload all, save buffers).""" @@ -121,7 +122,7 @@ def test_sleep_level_2(self, mock_allocator_class, mock_mem_info, mock_gpu_worke # Mock 
pipeline buffers mock_buffer1 = torch.randn(10, 10) mock_buffer2 = torch.randn(20, 20) - mock_gpu_worker.pipeline.named_buffers = Mock( + mock_gpu_worker.model_runner.pipeline.named_buffers = Mock( return_value=[ ("buffer1", mock_buffer1), ("buffer2", mock_buffer2), @@ -140,7 +141,7 @@ def test_sleep_level_2(self, mock_allocator_class, mock_mem_info, mock_gpu_worke assert "buffer1" in mock_gpu_worker._sleep_saved_buffers assert "buffer2" in mock_gpu_worker._sleep_saved_buffers - @patch("vllm_omni.diffusion.worker.gpu_worker.torch.cuda.mem_get_info") + @patch("vllm_omni.diffusion.worker.gpu_diffusion_worker.torch.cuda.mem_get_info") @patch("vllm.device_allocator.cumem.CuMemAllocator") def test_sleep_memory_freed_validation(self, mock_allocator_class, mock_mem_info, mock_gpu_worker): """Test that sleep validates memory was actually freed.""" @@ -159,8 +160,8 @@ def test_sleep_memory_freed_validation(self, mock_allocator_class, mock_mem_info mock_gpu_worker.sleep(level=1) -class TestGPUWorkerWakeUp: - """Test GPUWorker.wake_up method.""" +class TestGPUDiffusionWorkerWakeUp: + """Test GPUDiffusionWorker.wake_up method.""" @patch("vllm.device_allocator.cumem.CuMemAllocator") def test_wake_up_without_buffers(self, mock_allocator_class, mock_gpu_worker): @@ -202,7 +203,7 @@ def test_wake_up_with_buffers(self, mock_allocator_class, mock_gpu_worker): mock_buffer2 = Mock() mock_buffer2.data = Mock() - mock_gpu_worker.pipeline.named_buffers = Mock( + mock_gpu_worker.model_runner.pipeline.named_buffers = Mock( return_value=[ ("buffer1", mock_buffer1), ("buffer2", mock_buffer2), @@ -243,7 +244,7 @@ def test_wake_up_partial_buffer_restore(self, mock_allocator_class, mock_gpu_wor mock_buffer2 = Mock() mock_buffer2.data = Mock() - mock_gpu_worker.pipeline.named_buffers = Mock( + mock_gpu_worker.model_runner.pipeline.named_buffers = Mock( return_value=[ ("buffer1", mock_buffer1), ("buffer2", mock_buffer2), diff --git a/vllm_omni/diffusion/worker/__init__.py 
b/vllm_omni/diffusion/worker/__init__.py index dc3306dae3f..dfec4596bc2 100644 --- a/vllm_omni/diffusion/worker/__init__.py +++ b/vllm_omni/diffusion/worker/__init__.py @@ -2,6 +2,14 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Worker classes for diffusion models.""" -from vllm_omni.diffusion.worker.gpu_worker import GPUWorker, WorkerProc +from vllm_omni.diffusion.worker.gpu_diffusion_model_runner import GPUDiffusionModelRunner +from vllm_omni.diffusion.worker.gpu_diffusion_worker import ( + GPUDiffusionWorker, + WorkerProc, +) -__all__ = ["GPUWorker", "WorkerProc"] +__all__ = [ + "GPUDiffusionModelRunner", + "GPUDiffusionWorker", + "WorkerProc", +] diff --git a/vllm_omni/diffusion/worker/gpu_diffusion_model_runner.py b/vllm_omni/diffusion/worker/gpu_diffusion_model_runner.py new file mode 100644 index 00000000000..8ffa12ed2cc --- /dev/null +++ b/vllm_omni/diffusion/worker/gpu_diffusion_model_runner.py @@ -0,0 +1,165 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Diffusion Model Runner for vLLM-Omni. + +Handles model loading, compilation, caching, and execution of diffusion model +forward passes. This follows the AR pattern where the Runner handles all +model-related operations. 
+""" + +from __future__ import annotations + +import time +from collections.abc import Iterable +from contextlib import nullcontext + +import torch +from vllm.config import LoadConfig +from vllm.logger import init_logger +from vllm.utils.mem_utils import DeviceMemoryProfiler, GiB_bytes + +from vllm_omni.diffusion.cache.selector import get_cache_backend +from vllm_omni.diffusion.compile import regionally_compile +from vllm_omni.diffusion.data import DiffusionOutput, OmniDiffusionConfig +from vllm_omni.diffusion.forward_context import set_forward_context +from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader +from vllm_omni.diffusion.offload import apply_offload_hooks +from vllm_omni.diffusion.request import OmniDiffusionRequest + +logger = init_logger(__name__) + + +class GPUDiffusionModelRunner: + """ + Model runner that handles model loading and execution for diffusion models. + + This class follows the AR pattern where the Runner handles all model-related + operations including loading, compilation, offloading, caching, and execution. + The Worker only handles infrastructure (device, distributed env). + """ + + def __init__( + self, + vllm_config, + od_config: OmniDiffusionConfig, + device: torch.device, + ): + """ + Initialize the diffusion model runner. + + Args: + vllm_config: vLLM configuration. + od_config: OmniDiffusion configuration. + device: The device to run on. + """ + self.vllm_config = vllm_config + self.od_config = od_config + self.device = device + self.pipeline = None + self.cache_backend = None + + def load_model( + self, + memory_pool_context_fn: callable | None = None, + ) -> None: + """ + Load the diffusion model, apply compilation and offloading. + + Args: + memory_pool_context_fn: Optional function that returns a context manager + for memory pool allocation (used for sleep mode). 
+ """ + load_device = "cpu" if self.od_config.enable_cpu_offload else str(self.device) + + def get_memory_context(): + if memory_pool_context_fn is not None: + return memory_pool_context_fn(tag="weights") + return nullcontext() + + # Load model within forward context + with set_forward_context(vllm_config=self.vllm_config, omni_diffusion_config=self.od_config): + load_config = LoadConfig() + model_loader = DiffusersPipelineLoader(load_config) + time_before_load = time.perf_counter() + + with get_memory_context(): + with DeviceMemoryProfiler() as m: + self.pipeline = model_loader.load_model( + od_config=self.od_config, + load_device=load_device, + ) + time_after_load = time.perf_counter() + + logger.info( + "Model loading took %.4f GiB and %.6f seconds", + m.consumed_memory / GiB_bytes, + time_after_load - time_before_load, + ) + logger.info("Model runner: Model loaded successfully.") + + # Apply CPU offloading (DiT <-> encoders mutual exclusion) + if self.od_config.enable_cpu_offload: + for name in ["vae"]: + module = getattr(self.pipeline, name, None) + if module is None: + continue + try: + module.to(self.device, non_blocking=True) + except Exception as exc: + logger.debug("Failed to move %s to GPU: %s", name, exc) + + apply_offload_hooks(self.pipeline, self.od_config, device=self.device) + + # Apply torch.compile if not in eager mode + if not self.od_config.enforce_eager: + try: + self.pipeline.transformer = regionally_compile( + self.pipeline.transformer, + dynamic=True, + ) + logger.info("Model runner: Model compiled with torch.compile.") + except Exception as e: + logger.warning(f"Model runner: torch.compile failed with error: {e}. 
Using eager mode.") + + # Setup cache backend + self.cache_backend = get_cache_backend(self.od_config.cache_backend, self.od_config.cache_config) + + if self.cache_backend is not None: + self.cache_backend.enable(self.pipeline) + + logger.info("Model runner: Initialization complete.") + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + """Load weights into the pipeline.""" + return self.pipeline.load_weights(weights) + + @torch.inference_mode() + def execute_model(self, reqs: list[OmniDiffusionRequest]) -> DiffusionOutput: + """ + Execute a forward pass for the given requests. + + Args: + reqs: List of diffusion requests to process. + + Returns: + DiffusionOutput with generated results. + """ + assert self.pipeline is not None, "Model not loaded. Call load_model() first." + if not reqs or len(reqs) == 0: + raise ValueError("Cannot execute model with empty request list") + + # TODO: dealing with first req for now + req = reqs[0] + + if req.generator is None and req.seed is not None: + req.generator = torch.Generator(device=self.device).manual_seed(req.seed) + + # Refresh cache context if needed + if self.cache_backend is not None and self.cache_backend.is_enabled(): + self.cache_backend.refresh(self.pipeline, req.num_inference_steps) + + with set_forward_context(vllm_config=self.vllm_config, omni_diffusion_config=self.od_config): + output = self.pipeline.forward(req) + + return output diff --git a/vllm_omni/diffusion/worker/gpu_worker.py b/vllm_omni/diffusion/worker/gpu_diffusion_worker.py similarity index 65% rename from vllm_omni/diffusion/worker/gpu_worker.py rename to vllm_omni/diffusion/worker/gpu_diffusion_worker.py index 99a718e389f..7fa1e3da4d5 100644 --- a/vllm_omni/diffusion/worker/gpu_worker.py +++ b/vllm_omni/diffusion/worker/gpu_diffusion_worker.py @@ -1,20 +1,23 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Diffusion Worker for vLLM-Omni. 
+ +Handles GPU infrastructure initialization and delegates model operations +to GPUDiffusionModelRunner. +""" + import multiprocessing as mp import os -import time -from collections.abc import Iterable from contextlib import AbstractContextManager, nullcontext import torch import zmq -from vllm.config import LoadConfig, VllmConfig +from vllm.config import VllmConfig from vllm.distributed.device_communicators.shm_broadcast import MessageQueue from vllm.logger import init_logger -from vllm.utils.mem_utils import DeviceMemoryProfiler, GiB_bytes +from vllm.utils.mem_utils import GiB_bytes -from vllm_omni.diffusion.cache.selector import get_cache_backend -from vllm_omni.diffusion.compile import regionally_compile from vllm_omni.diffusion.data import ( DiffusionOutput, OmniDiffusionConfig, @@ -25,16 +28,23 @@ initialize_model_parallel, ) from vllm_omni.diffusion.forward_context import set_forward_context -from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader -from vllm_omni.diffusion.offload import apply_offload_hooks from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.diffusion.worker.gpu_diffusion_model_runner import GPUDiffusionModelRunner logger = init_logger(__name__) -class GPUWorker: +class GPUDiffusionWorker: """ - A worker that executes the model on a single GPU. + A worker that manages GPU infrastructure and delegates to the model runner. + + This class handles infrastructure initialization only: + - Device setup (CUDA device selection) + - Distributed environment (NCCL, model parallel) + - Memory management (sleep/wake) + + All model-related operations (loading, compilation, execution) are + delegated to GPUDiffusionModelRunner. 
""" def __init__( @@ -46,15 +56,17 @@ def __init__( self.local_rank = local_rank self.rank = rank self.od_config = od_config - self.pipeline = None - self.device = None + self.device: torch.device | None = None + self.vllm_config: VllmConfig | None = None + self.model_runner: GPUDiffusionModelRunner | None = None self._sleep_saved_buffers: dict[str, torch.Tensor] = {} - self.init_device_and_model() + self.init_device() - def init_device_and_model(self) -> None: - """Initialize the device and load the model.""" + def init_device(self) -> None: + """Initialize the device and distributed environment.""" world_size = self.od_config.num_gpus rank = self.rank + # Set environment variables for distributed initialization os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = str(self.od_config.master_port) @@ -62,19 +74,21 @@ def init_device_and_model(self) -> None: os.environ["RANK"] = str(rank) os.environ["WORLD_SIZE"] = str(world_size) + # Setup device self.device = torch.device(f"cuda:{rank}") torch.cuda.set_device(self.device) - # hack + # Create vllm_config for parallel configuration vllm_config = VllmConfig() vllm_config.parallel_config.tensor_parallel_size = self.od_config.parallel_config.tensor_parallel_size vllm_config.parallel_config.data_parallel_size = self.od_config.parallel_config.data_parallel_size self.vllm_config = vllm_config - load_device = "cpu" if self.od_config.enable_cpu_offload else str(self.device) + # Initialize distributed environment with set_forward_context(vllm_config=vllm_config, omni_diffusion_config=self.od_config): init_distributed_environment(world_size=world_size, rank=rank) logger.info(f"Worker {self.rank}: Initialized device and distributed environment.") + parallel_config = self.od_config.parallel_config initialize_model_parallel( data_parallel_size=parallel_config.data_parallel_size, @@ -86,107 +100,45 @@ def init_device_and_model(self) -> None: pipeline_parallel_size=parallel_config.pipeline_parallel_size, ) - 
load_config = LoadConfig() - model_loader = DiffusersPipelineLoader(load_config) - time_before_load = time.perf_counter() - with self._maybe_get_memory_pool_context(tag="weights"): - with DeviceMemoryProfiler() as m: - self.pipeline = model_loader.load_model( - od_config=self.od_config, - load_device=load_device, - ) - time_after_load = time.perf_counter() - - logger.info( - "Model loading took %.4f GiB and %.6f seconds", - m.consumed_memory / GiB_bytes, - time_after_load - time_before_load, + # Create model runner and load model + self.model_runner = GPUDiffusionModelRunner( + vllm_config=self.vllm_config, + od_config=self.od_config, + device=self.device, ) - logger.info(f"Worker {self.rank}: Model loaded successfully.") - - # Apply CPU offloading (DiT <-> encoders mutual exclusion) - if self.od_config.enable_cpu_offload: - for name in ["vae"]: - module = getattr(self.pipeline, name, None) - if module is None: - continue - try: - module.to(self.device, non_blocking=True) - except Exception as exc: - logger.debug("Failed to move %s to GPU: %s", name, exc) - - apply_offload_hooks(self.pipeline, self.od_config, device=self.device) - - if not self.od_config.enforce_eager: - try: - self.pipeline.transformer = regionally_compile( - self.pipeline.transformer, - dynamic=True, - ) - logger.info(f"Worker {self.rank}: Model compiled with torch.compile.") - except Exception as e: - logger.warning(f"Worker {self.rank}: torch.compile failed with error: {e}. 
Using eager mode.") - - # Setup cache backend based on type (both backends use enable()/reset() interface) - self.cache_backend = get_cache_backend(self.od_config.cache_backend, self.od_config.cache_config) - - if self.cache_backend is not None: - self.cache_backend.enable(self.pipeline) + self.model_runner.load_model( + memory_pool_context_fn=self._maybe_get_memory_pool_context, + ) + logger.info(f"Worker {self.rank}: Initialization complete.") def generate(self, requests: list[OmniDiffusionRequest]) -> DiffusionOutput: - """ - Generate output for the given requests. - - Args: - requests: List of diffusion requests - - Returns: - DiffusionOutput with generated results - """ + """Generate output for the given requests.""" return self.execute_model(requests, self.od_config) - @torch.inference_mode() def execute_model(self, reqs: list[OmniDiffusionRequest], od_config: OmniDiffusionConfig) -> DiffusionOutput: - """ - Execute a forward pass. - """ - assert self.pipeline is not None - if not reqs or len(reqs) == 0: - raise ValueError("Cannot execute model with empty request list") - # TODO: dealing with first req for now - req = reqs[0] + """Execute a forward pass by delegating to the model runner.""" + assert self.model_runner is not None, "Model runner not initialized" + return self.model_runner.execute_model(reqs) - if req.generator is None and req.seed is not None: - req.generator = torch.Generator(device=self.device).manual_seed(req.seed) - - # Refresh cache context if needed - if self.cache_backend is not None and self.cache_backend.is_enabled(): - self.cache_backend.refresh(self.pipeline, req.num_inference_steps) - with set_forward_context(vllm_config=self.vllm_config, omni_diffusion_config=self.od_config): - output = self.pipeline.forward(req) - return output - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - return self.pipeline.load_weights(weights) + def load_weights(self, weights) -> set[str]: + """Load weights by 
delegating to the model runner.""" + assert self.model_runner is not None, "Model runner not initialized" + return self.model_runner.load_weights(weights) def sleep(self, level: int = 1) -> bool: """ - Put the worker to sleep. The worker should not process any requests. - The caller should guarantee that no requests are being processed - during the sleep period, before `wake_up` is called. + Put the worker to sleep, offloading model weights. Args: - level: The sleep level. Level 1 sleep will offload the model - weights and discard the kv cache. - Currently only support level 1. + level: Sleep level. Level 1 offloads weights, level 2 also saves buffers. """ from vllm.device_allocator.cumem import CuMemAllocator free_bytes_before_sleep = torch.cuda.mem_get_info()[0] # Save the buffers before level 2 sleep - if level == 2: - model = self.pipeline + if level == 2 and self.model_runner is not None: + model = self.model_runner.pipeline self._sleep_saved_buffers = {name: buffer.cpu().clone() for name, buffer in model.named_buffers()} allocator = CuMemAllocator.get_instance() @@ -220,8 +172,8 @@ def wake_up(self, tags: list[str] | None = None) -> bool: allocator.wake_up(tags) # Restore the buffers after level 2 sleep - if len(self._sleep_saved_buffers): - model = self.pipeline + if len(self._sleep_saved_buffers) and self.model_runner is not None: + model = self.model_runner.pipeline for name, buffer in model.named_buffers(): if name in self._sleep_saved_buffers: buffer.data.copy_(self._sleep_saved_buffers[name].data) @@ -229,6 +181,7 @@ def wake_up(self, tags: list[str] | None = None) -> bool: return True def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager: + """Get memory pool context for sleep mode support.""" if self.od_config.enable_sleep_mode: from vllm.device_allocator.cumem import CuMemAllocator @@ -240,6 +193,7 @@ def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager: return nullcontext() def shutdown(self) -> None: + 
"""Shutdown the worker and cleanup distributed environment.""" destroy_distributed_env() @@ -257,17 +211,14 @@ def __init__( # Inter-process Communication self.context = zmq.Context(io_threads=2) - # Initialize MessageQueue reader from handle (unified for generation & RPC) + # Initialize MessageQueue reader from handle self.mq = MessageQueue.create_from_handle(broadcast_handle, gpu_id) self.result_mq = None self.result_mq_handle = None - # Setup result sender (only for rank 0 for now, or whoever needs to reply) - # Assuming only rank 0 replies to scheduler as per original logic + # Setup result sender (only for rank 0) if gpu_id == 0: - # Create MessageQueue for results (1 writer -> 1 reader) - # We assume the reader (SyncScheduler) will act as rank 0 self.result_mq = MessageQueue(n_reader=1, n_local_reader=1, local_reader_ranks=[0]) self.result_mq_handle = self.result_mq.export_handle() logger.info(f"Worker {gpu_id} created result MessageQueue") @@ -277,31 +228,25 @@ def __init__( self.gpu_id = gpu_id self._running = True - def _create_worker(self, gpu_id: int, od_config: OmniDiffusionConfig) -> GPUWorker: + def _create_worker(self, gpu_id: int, od_config: OmniDiffusionConfig) -> GPUDiffusionWorker: """Create a worker instance. Override in subclasses for different worker types.""" - return GPUWorker( + return GPUDiffusionWorker( local_rank=gpu_id, rank=gpu_id, od_config=od_config, ) def return_result(self, output: DiffusionOutput): - """ - replies to client, only on rank 0 - """ + """Reply to client, only on rank 0.""" if self.result_mq is not None: self.result_mq.enqueue(output) def recv_message(self): - """ - Receive unified messages (RPC requests, shutdown) from broadcast queue. - Uses indefinite=True to block until a message arrives. 
- """ + """Receive messages from broadcast queue.""" return self.mq.dequeue(indefinite=True) def execute_rpc(self, rpc_request: dict) -> tuple[object | None, bool]: """Execute an RPC request and indicate whether to reply.""" - method = rpc_request["method"] args = rpc_request.get("args", ()) kwargs = rpc_request.get("kwargs", {}) @@ -325,14 +270,11 @@ def execute_rpc(self, rpc_request: dict) -> tuple[object | None, bool]: logger.error(f"Error executing RPC: {e}", exc_info=True) return {"status": "error", "error": str(e)}, should_reply - # TODO: queueing, cancellation def worker_busy_loop(self) -> None: - """Main busy loop for Multiprocessing Workers""" - + """Main busy loop for Multiprocessing Workers.""" logger.info(f"Worker {self.gpu_id} ready to receive requests via shared memory") while self._running: - # Receive unified message (generation request, RPC request, or shutdown) msg = None try: msg = self.recv_message() @@ -349,7 +291,6 @@ def worker_busy_loop(self) -> None: # Route message based on type if isinstance(msg, dict) and msg.get("type") == "rpc": - # Handle RPC request try: result, should_reply = self.execute_rpc(msg) if should_reply: @@ -360,13 +301,12 @@ def worker_busy_loop(self) -> None: self.return_result({"status": "error", "error": str(e)}) elif isinstance(msg, dict) and msg.get("type") == "shutdown": - # Handle shutdown message logger.info("Worker %s: Received shutdown message", self.gpu_id) self._running = False continue else: - # Handle generation request (OmniDiffusionRequest list) + # Handle generation request try: output = self.worker.execute_model(msg, self.od_config) except Exception as e: @@ -379,17 +319,14 @@ def worker_busy_loop(self) -> None: try: self.return_result(output) except zmq.ZMQError as e: - # Reply failed; log and keep loop alive to accept future requests logger.error(f"ZMQ error sending reply: {e}") continue logger.info("event loop terminated.") try: self.worker.shutdown() - except Exception as exc: # pragma: no cover - 
best effort cleanup + except Exception as exc: logger.warning("Worker %s: Shutdown encountered an error: %s", self.gpu_id, exc) - # if self.result_sender is not None: - # self.result_sender.close() self.context.term() @staticmethod @@ -400,7 +337,6 @@ def worker_main( broadcast_handle, ) -> None: """Worker initialization and execution loops.""" - worker_proc = WorkerProc( od_config, gpu_id=rank, diff --git a/vllm_omni/diffusion/worker/npu/npu_worker.py b/vllm_omni/diffusion/worker/npu/npu_worker.py index bfeb0d914c9..446c29cae4d 100644 --- a/vllm_omni/diffusion/worker/npu/npu_worker.py +++ b/vllm_omni/diffusion/worker/npu/npu_worker.py @@ -3,6 +3,8 @@ import multiprocessing as mp import os import time +from collections.abc import Iterable +from contextlib import AbstractContextManager, nullcontext import torch from vllm.config import LoadConfig, VllmConfig @@ -11,25 +13,42 @@ from vllm_omni.diffusion.cache.selector import get_cache_backend from vllm_omni.diffusion.data import ( + DiffusionOutput, OmniDiffusionConfig, ) from vllm_omni.diffusion.distributed.parallel_state import ( + destroy_distributed_env, init_distributed_environment, initialize_model_parallel, ) from vllm_omni.diffusion.forward_context import set_forward_context from vllm_omni.diffusion.model_loader.diffusers_loader import DiffusersPipelineLoader -from vllm_omni.diffusion.worker.gpu_worker import GPUWorker, WorkerProc +from vllm_omni.diffusion.request import OmniDiffusionRequest +from vllm_omni.diffusion.worker.gpu_diffusion_worker import WorkerProc logger = init_logger(__name__) -class NPUWorker(GPUWorker): +class NPUWorker: """ A worker that executes the model on a single NPU. Inherits from GPUWorker and overrides device-specific initialization. 
""" + def __init__( + self, + local_rank: int, + rank: int, + od_config: OmniDiffusionConfig, + ): + self.local_rank = local_rank + self.rank = rank + self.od_config = od_config + self.pipeline = None + self.device = None + self._sleep_saved_buffers: dict[str, torch.Tensor] = {} + self.init_device_and_model() + def init_device_and_model(self) -> None: """Initialize the NPU device and load the model.""" world_size = self.od_config.num_gpus @@ -86,6 +105,115 @@ def init_device_and_model(self) -> None: if self.cache_backend is not None: self.cache_backend.enable(self.pipeline) + def generate(self, requests: list[OmniDiffusionRequest]) -> DiffusionOutput: + """ + Generate output for the given requests. + + Args: + requests: List of diffusion requests + + Returns: + DiffusionOutput with generated results + """ + return self.execute_model(requests, self.od_config) + + @torch.inference_mode() + def execute_model(self, reqs: list[OmniDiffusionRequest], od_config: OmniDiffusionConfig) -> DiffusionOutput: + """ + Execute a forward pass. + """ + assert self.pipeline is not None + if not reqs or len(reqs) == 0: + raise ValueError("Cannot execute model with empty request list") + # TODO: dealing with first req for now + req = reqs[0] + + if req.generator is None and req.seed is not None: + req.generator = torch.Generator(device=self.device).manual_seed(req.seed) + + # Refresh cache context if needed + if self.cache_backend is not None and self.cache_backend.is_enabled(): + self.cache_backend.refresh(self.pipeline, req.num_inference_steps) + with set_forward_context(vllm_config=self.vllm_config, omni_diffusion_config=self.od_config): + output = self.pipeline.forward(req) + return output + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + return self.pipeline.load_weights(weights) + + def sleep(self, level: int = 1) -> bool: + """ + Put the worker to sleep. The worker should not process any requests. 
+ The caller should guarantee that no requests are being processed + during the sleep period, before `wake_up` is called. + + Args: + level: The sleep level. Level 1 sleep will offload the model + weights and discard the kv cache. + Currently only support level 1. + """ + from vllm.device_allocator.cumem import CuMemAllocator + + free_bytes_before_sleep = torch.cuda.mem_get_info()[0] + + # Save the buffers before level 2 sleep + if level == 2: + model = self.pipeline + self._sleep_saved_buffers = {name: buffer.cpu().clone() for name, buffer in model.named_buffers()} + + allocator = CuMemAllocator.get_instance() + allocator.sleep(offload_tags=("weights",) if level == 1 else tuple()) + free_bytes_after_sleep, total = torch.cuda.mem_get_info() + freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep + used_bytes = total - free_bytes_after_sleep + assert freed_bytes >= 0, "Memory usage increased after sleeping." + logger.info( + "Sleep mode freed %.2f GiB memory, %.2f GiB memory is still in use.", + freed_bytes / GiB_bytes, + used_bytes / GiB_bytes, + ) + return True + + def wake_up(self, tags: list[str] | None = None) -> bool: + """ + Wake up the worker from sleep mode. See the sleep function + method for more details. + + Args: + tags: An optional list of tags to reallocate the worker memory + for specific memory allocations. Values must be in + `("weights")`. If None, all memory is reallocated. + wake_up should be called with all tags (or None) before the + worker is used again. 
+ """ + from vllm.device_allocator.cumem import CuMemAllocator + + allocator = CuMemAllocator.get_instance() + allocator.wake_up(tags) + + # Restore the buffers after level 2 sleep + if len(self._sleep_saved_buffers): + model = self.pipeline + for name, buffer in model.named_buffers(): + if name in self._sleep_saved_buffers: + buffer.data.copy_(self._sleep_saved_buffers[name].data) + self._sleep_saved_buffers = {} + return True + + def _maybe_get_memory_pool_context(self, tag: str) -> AbstractContextManager: + if self.od_config.enable_sleep_mode: + from vllm.device_allocator.cumem import CuMemAllocator + + allocator = CuMemAllocator.get_instance() + if tag == "weights": + assert allocator.get_current_usage() == 0, "Sleep mode can only be used for one instance per process." + return allocator.use_memory_pool(tag=tag) + else: + return nullcontext() + + def shutdown(self) -> None: + destroy_distributed_env() + class NPUWorkerProc(WorkerProc): """Wrapper that runs one NPUWorker in a separate process.""" diff --git a/vllm_omni/utils/platform_utils.py b/vllm_omni/utils/platform_utils.py index 5f8259ab83d..fb47018e789 100644 --- a/vllm_omni/utils/platform_utils.py +++ b/vllm_omni/utils/platform_utils.py @@ -53,6 +53,6 @@ def get_diffusion_worker_class() -> type: return NPUWorkerProc else: # Default to GPU worker for cuda and other devices - from vllm_omni.diffusion.worker.gpu_worker import WorkerProc + from vllm_omni.diffusion.worker.gpu_diffusion_worker import WorkerProc return WorkerProc From 5e7035e1834db1735920673e9630ae1c2ba13016 Mon Sep 17 00:00:00 2001 From: D!NE$H <67671800+gDINESH13@users.noreply.github.com> Date: Mon, 19 Jan 2026 09:53:15 +0530 Subject: [PATCH 53/59] [Feature]: Add CFG param to online serving (#824) Signed-off-by: Dinesh G Signed-off-by: gDINESH13 --- examples/online_serving/text_to_image/README.md | 1 + vllm_omni/entrypoints/async_omni.py | 5 +++-- vllm_omni/entrypoints/cli/serve.py | 3 +++ vllm_omni/entrypoints/omni_stage.py | 6 +++++- 4 files 
changed, 12 insertions(+), 3 deletions(-) diff --git a/examples/online_serving/text_to_image/README.md b/examples/online_serving/text_to_image/README.md index a4f1ad63321..744b7b2921d 100644 --- a/examples/online_serving/text_to_image/README.md +++ b/examples/online_serving/text_to_image/README.md @@ -116,6 +116,7 @@ Use `extra_body` to pass generation parameters: | `seed` | int | None | Random seed (reproducible) | | `negative_prompt` | str | None | Negative prompt | | `num_outputs_per_prompt` | int | 1 | Number of images to generate | +| `--cfg-parallel-size` | int | 1 | Number of GPUs for CFG parallelism | ## Response Format diff --git a/vllm_omni/entrypoints/async_omni.py b/vllm_omni/entrypoints/async_omni.py index e41569281cf..3c275147fa0 100644 --- a/vllm_omni/entrypoints/async_omni.py +++ b/vllm_omni/entrypoints/async_omni.py @@ -132,9 +132,10 @@ def _create_default_diffusion_stage_cfg(self, kwargs: dict[str, Any]) -> dict[st ring_degree = kwargs.get("ring_degree") or 1 sequence_parallel_size = kwargs.get("sequence_parallel_size") tensor_parallel_size = kwargs.get("tensor_parallel_size") or 1 + cfg_parallel_size = kwargs.get("cfg_parallel_size") or 1 if sequence_parallel_size is None: sequence_parallel_size = ulysses_degree * ring_degree - num_devices = sequence_parallel_size * tensor_parallel_size + num_devices = sequence_parallel_size * tensor_parallel_size * cfg_parallel_size for i in range(1, num_devices): devices += f",{i}" parallel_config = DiffusionParallelConfig( @@ -144,7 +145,7 @@ def _create_default_diffusion_stage_cfg(self, kwargs: dict[str, Any]) -> dict[st sequence_parallel_size=sequence_parallel_size, ulysses_degree=ulysses_degree, ring_degree=ring_degree, - cfg_parallel_size=1, + cfg_parallel_size=cfg_parallel_size, ) default_stage_cfg = [ { diff --git a/vllm_omni/entrypoints/cli/serve.py b/vllm_omni/entrypoints/cli/serve.py index 3b222c8179c..c3a37e3c82e 100644 --- a/vllm_omni/entrypoints/cli/serve.py +++
b/vllm_omni/entrypoints/cli/serve.py @@ -208,6 +208,9 @@ def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgu default=None, help="Scheduler flow_shift for video models (e.g., 5.0 for 720p, 12.0 for 480p).", ) + omni_config_group.add_argument( + "--cfg-parallel-size", type=int, default=1, help="Number of GPUs for CFG parallel computation" + ) return serve_parser diff --git a/vllm_omni/entrypoints/omni_stage.py b/vllm_omni/entrypoints/omni_stage.py index 804ab7b7fb8..a2790cd06e5 100644 --- a/vllm_omni/entrypoints/omni_stage.py +++ b/vllm_omni/entrypoints/omni_stage.py @@ -474,12 +474,14 @@ def _stage_worker( data_parallel_size = parallel_config.get("data_parallel_size", 1) prefill_context_parallel_size = 1 # not used for diffusion sequence_parallel_size = parallel_config.get("sequence_parallel_size", 1) + cfg_parallel_size = parallel_config.get("cfg_parallel_size", 1) else: tensor_parallel_size = engine_args.get("tensor_parallel_size", 1) pipeline_parallel_size = engine_args.get("pipeline_parallel_size", 1) data_parallel_size = engine_args.get("data_parallel_size", 1) prefill_context_parallel_size = engine_args.get("prefill_context_parallel_size", 1) sequence_parallel_size = 1 # not use in omni model + cfg_parallel_size = 1 # not used in omni model # Calculate total number of devices needed for this stage # For a single stage worker: @@ -488,7 +490,8 @@ def _stage_worker( # - DP: replicates model, but each replica uses TP devices # - PCP: context parallelism, typically uses TP devices # - SP: sequence parallelism, typically uses TP devices - # The number of devices per stage is determined by TP * PP * DP * PCP * SP size + # - CFG: Classifier-Free Guidance parallelism for diffusion models + # The number of devices per stage is determined by TP * PP * DP * PCP * SP * CFG size # (PP/DP/PCP are higher-level parallelism that don't add devices per stage) num_devices_per_stage = ( tensor_parallel_size @@ -496,6 +499,7 @@ def _stage_worker( * 
data_parallel_size * prefill_context_parallel_size * sequence_parallel_size + * cfg_parallel_size ) # Get physical device IDs from CUDA_VISIBLE_DEVICES From 156cac778c661cc529b7223efd0907b4a020b2b0 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Mon, 19 Jan 2026 05:48:02 +0000 Subject: [PATCH 54/59] debug qwen 2.5 Omni Signed-off-by: tzhouam --- vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py index 61b6b6d2316..45a1447b3a7 100644 --- a/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py +++ b/vllm_omni/model_executor/models/qwen2_5_omni/qwen2_5_omni.py @@ -126,6 +126,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._token2wav_conds: dict[str, torch.Tensor] = {} self._token2wav_ref_mels: dict[str, torch.Tensor] = {} self.model = self.token2wav + self.requires_raw_input_tokens = True else: raise ValueError("Invalid model stage") From 3fc4f988eabf562572821896847dc96b69a4cf74 Mon Sep 17 00:00:00 2001 From: Jiangyun Zhu Date: Mon, 19 Jan 2026 14:20:31 +0800 Subject: [PATCH 55/59] [diffusion] add tp support for qwen-image and refactor some tests (#830) Signed-off-by: zjy0516 --- .../diffusion/parallelism_acceleration.md | 12 +- .../image_to_image/image_edit.py | 13 +- .../test_diffusion_cpu_offload.py | 44 +--- .../test_zimage_tensor_parallel.py | 22 +- tests/utils.py | 43 +++ .../qwen_image/qwen_image_transformer.py | 244 ++++++++++-------- 6 files changed, 228 insertions(+), 150 deletions(-) diff --git a/docs/user_guide/diffusion/parallelism_acceleration.md b/docs/user_guide/diffusion/parallelism_acceleration.md index dfacd2183ff..324301158d8 100644 --- a/docs/user_guide/diffusion/parallelism_acceleration.md +++ b/docs/user_guide/diffusion/parallelism_acceleration.md @@ -23,13 +23,23 @@ The following table shows which models are currently supported by 
parallelism me | **LongCat-Image** | `meituan-longcat/LongCat-Image` | ✅ | ✅ | ❌ | ❌ | | **LongCat-Image-Edit** | `meituan-longcat/LongCat-Image-Edit` | ✅ | ✅ | ❌ | ❌ | | **Ovis-Image** | `OvisAI/Ovis-Image` | ❌ | ❌ | ❌ | ❌ | -| **Qwen-Image** | `Qwen/Qwen-Image` | ✅ | ✅ | ✅ | ❌ | +| **Qwen-Image** | `Qwen/Qwen-Image` | ✅ | ✅ | ✅ | ✅ | | **Qwen-Image-Edit** | `Qwen/Qwen-Image-Edit` | ✅ | ✅ | ✅ | ❌ | | **Qwen-Image-Edit-2509** | `Qwen/Qwen-Image-Edit-2509` | ✅ | ✅ | ✅ | ❌ | | **Qwen-Image-Layered** | `Qwen/Qwen-Image-Layered` | ✅ | ✅ | ✅ | ❌ | | **Z-Image** | `Tongyi-MAI/Z-Image-Turbo` | ❌ | ❌ | ❌ | ✅ (TP=2 only) | | **Stable-Diffusion3.5** | `stabilityai/stable-diffusion-3.5` | ❌ | ❌ | ❌ | ❌ | + +!!! note "TP Limitations for Diffusion Models" + We currently implement Tensor Parallelism (TP) only for the DiT (Diffusion Transformer) blocks. This is because the `text_encoder` component in vLLM-Omni uses the original Transformers implementation, which does not yet support TP. + + - Good news: The text_encoder typically has minimal impact on overall inference performance. + - Bad news: When TP is enabled, every TP process retains a full copy of the text_encoder weights, leading to significant GPU memory waste. + + We are actively refactoring this design to address this. For details and progress, please refer to [Issue #771](https://github.com/vllm-project/vllm-omni/issues/771). + + !!! note "Why Z-Image is TP=2 only" Z-Image Turbo is currently limited to `tensor_parallel_size` of **1 or 2** due to model shape divisibility constraints. For example, the model has `n_heads=30` and a final projection out dimension of `64`, so valid TP sizes must divide both 30 and 64; the only common divisors are **1 and 2**. 
diff --git a/examples/offline_inference/image_to_image/image_edit.py b/examples/offline_inference/image_to_image/image_edit.py index 5d2b1052bec..c31d098252b 100644 --- a/examples/offline_inference/image_to_image/image_edit.py +++ b/examples/offline_inference/image_to_image/image_edit.py @@ -181,6 +181,12 @@ def parse_args() -> argparse.Namespace: default=1, help="Number of GPUs used for ring sequence parallelism.", ) + parser.add_argument( + "--tensor_parallel_size", + type=int, + default=1, + help="Number of GPUs used for tensor parallelism (TP) inside the DiT.", + ) parser.add_argument("--layers", type=int, default=4, help="Number of layers to decompose the input image into.") parser.add_argument( "--resolution", @@ -301,7 +307,10 @@ def main(): vae_use_slicing = is_npu() vae_use_tiling = is_npu() parallel_config = DiffusionParallelConfig( - ulysses_degree=args.ulysses_degree, ring_degree=args.ring_degree, cfg_parallel_size=args.cfg_parallel_size + ulysses_degree=args.ulysses_degree, + ring_degree=args.ring_degree, + cfg_parallel_size=args.cfg_parallel_size, + tensor_parallel_size=args.tensor_parallel_size, ) # Configure cache based on backend type @@ -351,7 +360,7 @@ def main(): else: print(f" Input image size: {input_image.size}") print( - f" Parallel configuration: ulysses_degree={args.ulysses_degree}, ring_degree={args.ring_degree}, cfg_parallel_size={args.cfg_parallel_size}" + f" Parallel configuration: ulysses_degree={args.ulysses_degree}, ring_degree={args.ring_degree}, cfg_parallel_size={args.cfg_parallel_size}, tensor_parallel_size={args.tensor_parallel_size}" ) print(f"{'=' * 60}\n") diff --git a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py index 0066d49b161..cefda891571 100644 --- a/tests/e2e/offline_inference/test_diffusion_cpu_offload.py +++ b/tests/e2e/offline_inference/test_diffusion_cpu_offload.py @@ -1,11 +1,10 @@ import sys -import threading -import time from pathlib 
import Path import pytest import torch +from tests.utils import GPUMemoryMonitor from vllm_omni.utils.platform_utils import is_npu, is_rocm # ruff: noqa: E402 @@ -15,39 +14,6 @@ from vllm_omni import Omni - -class GPUMemoryMonitor: - """Poll global device memory usage via CUDA APIs.""" - - def __init__(self, device_index: int, interval: float = 0.05): - self.device_index = device_index - self.interval = interval - self.peak_used_mb = 0.0 - self._stop_event = threading.Event() - self._thread: threading.Thread | None = None - - def start(self) -> None: - def monitor_loop() -> None: - while not self._stop_event.is_set(): - try: - with torch.cuda.device(self.device_index): - free_bytes, total_bytes = torch.cuda.mem_get_info() - used_mb = (total_bytes - free_bytes) / (1024**2) - self.peak_used_mb = max(self.peak_used_mb, used_mb) - except Exception: - pass - time.sleep(self.interval) - - self._thread = threading.Thread(target=monitor_loop, daemon=True) - self._thread.start() - - def stop(self) -> None: - if self._thread is None: - return - self._stop_event.set() - self._thread.join(timeout=2.0) - - models = ["riverclouds/qwen_image_random"] @@ -73,13 +39,7 @@ def inference(offload: bool = True): generator=torch.Generator("cuda").manual_seed(42), ) - monitor.stop() - torch.cuda.synchronize(device_index) - fallback_alloc = torch.cuda.max_memory_allocated(device=device_index) / (1024**2) - fallback_reserved = torch.cuda.max_memory_reserved(device=device_index) / (1024**2) - peak_memory_mb = max(monitor.peak_used_mb, fallback_alloc, fallback_reserved) - - return peak_memory_mb + return monitor.peak_used_mb offload_peak_memory = inference(offload=True) no_offload_peak_memory = inference(offload=False) diff --git a/tests/e2e/offline_inference/test_zimage_tensor_parallel.py b/tests/e2e/offline_inference/test_zimage_tensor_parallel.py index d32bb2b8223..60686992278 100644 --- a/tests/e2e/offline_inference/test_zimage_tensor_parallel.py +++ 
b/tests/e2e/offline_inference/test_zimage_tensor_parallel.py @@ -17,6 +17,7 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) +from tests.utils import GPUMemoryMonitor from vllm_omni import Omni from vllm_omni.diffusion.data import DiffusionParallelConfig from vllm_omni.outputs import OmniRequestOutput @@ -66,7 +67,12 @@ def _extract_single_image(outputs) -> Image.Image: def _run_zimage_generate( *, tp_size: int, height: int, width: int, num_inference_steps: int, seed: int -) -> tuple[Image.Image, float]: +) -> tuple[Image.Image, float, float]: + torch.cuda.empty_cache() + device_index = torch.cuda.current_device() + monitor = GPUMemoryMonitor(device_index=device_index, interval=0.02) + monitor.start() + m = Omni( model=_get_zimage_model(), parallel_config=DiffusionParallelConfig(tensor_parallel_size=tp_size), @@ -107,7 +113,10 @@ def _run_zimage_generate( pass median_time_s = float(np.median(per_request_times_s)) - return _extract_single_image([last_output]), median_time_s + + peak_memory_mb = monitor.peak_used_mb + + return _extract_single_image([last_output]), median_time_s, peak_memory_mb finally: m.close() cleanup_dist_env_and_memory() @@ -125,14 +134,14 @@ def test_zimage_tensor_parallel_tp2(tmp_path: Path): num_inference_steps = 2 seed = 42 - tp1_img, tp1_time_s = _run_zimage_generate( + tp1_img, tp1_time_s, tp1_peak_mem = _run_zimage_generate( tp_size=1, height=height, width=width, num_inference_steps=num_inference_steps, seed=seed, ) - tp2_img, tp2_time_s = _run_zimage_generate( + tp2_img, tp2_time_s, tp2_peak_mem = _run_zimage_generate( tp_size=2, height=height, width=width, @@ -164,3 +173,8 @@ def test_zimage_tensor_parallel_tp2(tmp_path: Path): print(f"Z-Image TP perf (lower is better): tp1_time_s={tp1_time_s:.6f}, tp2_time_s={tp2_time_s:.6f}") assert tp2_time_s < tp1_time_s, f"Expected TP=2 to be faster than TP=1 (tp1={tp1_time_s}, tp2={tp2_time_s})" + + print(f"Z-Image TP peak memory (MB): tp1_peak_mem={tp1_peak_mem:.2f}, 
tp2_peak_mem={tp2_peak_mem:.2f}") + assert tp2_peak_mem < tp1_peak_mem, ( + f"Expected TP=2 to use less peak memory than TP=1 (tp1={tp1_peak_mem}, tp2={tp2_peak_mem})" + ) diff --git a/tests/utils.py b/tests/utils.py index 2a2dca238a8..8e5593d6501 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -7,6 +7,7 @@ import subprocess import sys import tempfile +import threading import time from collections.abc import Callable from contextlib import ExitStack, contextmanager, suppress @@ -14,6 +15,7 @@ import cloudpickle import pytest +import torch from typing_extensions import ParamSpec from vllm.platforms import current_platform from vllm.utils.torch_utils import cuda_device_count_stateless @@ -474,3 +476,44 @@ def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: return func return wrapper + + +class GPUMemoryMonitor: + """Poll global device memory usage via CUDA APIs.""" + + def __init__(self, device_index: int, interval: float = 0.05): + self.device_index = device_index + self.interval = interval + self._peak_used_mb = 0.0 + self._stop_event = threading.Event() + self._thread: threading.Thread | None = None + + def start(self) -> None: + def monitor_loop() -> None: + while not self._stop_event.is_set(): + try: + with torch.cuda.device(self.device_index): + free_bytes, total_bytes = torch.cuda.mem_get_info() + used_mb = (total_bytes - free_bytes) / (1024**2) + self._peak_used_mb = max(self._peak_used_mb, used_mb) + except Exception: + pass + time.sleep(self.interval) + + self._thread = threading.Thread(target=monitor_loop, daemon=True) + self._thread.start() + + def stop(self) -> None: + if self._thread is None: + return + self._stop_event.set() + self._thread.join(timeout=2.0) + + @property + def peak_used_mb(self) -> float: + fallback_alloc = torch.cuda.max_memory_allocated(device=self.device_index) / (1024**2) + fallback_reserved = torch.cuda.max_memory_reserved(device=self.device_index) / (1024**2) + return max(self._peak_used_mb, fallback_alloc, 
fallback_reserved) + + def __del__(self): + self.stop() diff --git a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py index 8ac5014ce89..cbf0b7e10ac 100644 --- a/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py +++ b/vllm_omni/diffusion/models/qwen_image/qwen_image_transformer.py @@ -8,7 +8,7 @@ import torch import torch.nn as nn -from diffusers.models.attention import FeedForward +import torch.nn.functional as F # TODO replace this with vLLM implementation from diffusers.models.embeddings import TimestepEmbedding, Timesteps @@ -16,7 +16,11 @@ from diffusers.models.normalization import AdaLayerNormContinuous from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import QKVParallelLinear, ReplicatedLinear +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm_omni.diffusion.attention.backends.abstract import ( @@ -287,79 +291,133 @@ def _compute_video_freqs(self, frame, height, width, idx=0): return freqs.clone().contiguous() +class ColumnParallelApproxGELU(nn.Module): + def __init__(self, dim_in: int, dim_out: int, *, approximate: str, bias: bool = True): + super().__init__() + self.proj = ColumnParallelLinear( + dim_in, + dim_out, + bias=bias, + gather_output=False, + return_bias=False, + ) + self.approximate = approximate + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + return F.gelu(x, approximate=self.approximate) + + +class FeedForward(nn.Module): + def __init__( + self, + dim: int, + dim_out: int | None = None, + mult: int = 4, + activation_fn: str = "gelu-approximate", + inner_dim: int | None = None, + bias: bool = True, + ) -> None: + super().__init__() + + assert activation_fn == "gelu-approximate", "Only 
gelu-approximate is supported." + + inner_dim = inner_dim or int(dim * mult) + dim_out = dim_out or dim + + layers: list[nn.Module] = [ + ColumnParallelApproxGELU(dim, inner_dim, approximate="tanh", bias=bias), + nn.Identity(), # placeholder for weight loading + RowParallelLinear( + inner_dim, + dim_out, + input_is_parallel=True, + return_bias=False, + ), + ] + + self.net = nn.ModuleList(layers) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + for module in self.net: + hidden_states = module(hidden_states) + return hidden_states + + class QwenImageCrossAttention(nn.Module): def __init__( self, dim: int, # query_dim num_heads: int, head_dim: int, - window_size=(-1, -1), - added_kv_proj_dim: int = None, + added_kv_proj_dim: int, + window_size: tuple[int, int] = (-1, -1), out_bias: bool = True, - qk_norm=True, # rmsnorm - eps=1e-6, - pre_only=False, + qk_norm: bool = True, + eps: float = 1e-6, + pre_only: bool = False, context_pre_only: bool = False, - parallel_attention=False, - out_dim: int = None, + out_dim: int | None = None, ) -> None: - assert dim % num_heads == 0 super().__init__() + assert dim % num_heads == 0 + self.dim = dim - self.num_heads = num_heads - self.head_dim = dim // num_heads + self.head_dim = head_dim + self.total_num_heads = num_heads self.window_size = window_size self.qk_norm = qk_norm self.eps = eps - self.parallel_attention = parallel_attention - # layers - # self.to_q = ReplicatedLinear(dim, dim) - # self.to_k = ReplicatedLinear(dim, dim) - # self.to_v = ReplicatedLinear(dim, dim) self.to_qkv = QKVParallelLinear( hidden_size=dim, head_size=self.head_dim, total_num_heads=num_heads, - disable_tp=True, ) + self.query_num_heads = self.to_qkv.num_heads + self.kv_num_heads = self.to_qkv.num_kv_heads + self.norm_q = RMSNorm(head_dim, eps=eps) if qk_norm else nn.Identity() self.norm_k = RMSNorm(head_dim, eps=eps) if qk_norm else nn.Identity() - self.inner_dim = out_dim if out_dim is not None else head_dim * num_heads - 
self.inner_kv_dim = self.inner_dim - if added_kv_proj_dim is not None: - assert context_pre_only is not None - # self.add_k_proj = ReplicatedLinear(added_kv_proj_dim, self.inner_kv_dim, bias=True) - # self.add_v_proj = ReplicatedLinear(added_kv_proj_dim, self.inner_kv_dim, bias=True) - # self.add_q_proj = ReplicatedLinear( - # added_kv_proj_dim, self.inner_dim, bias=True - # ) - self.add_kv_proj = QKVParallelLinear( - added_kv_proj_dim, - head_size=self.inner_kv_dim // self.num_heads, - total_num_heads=self.num_heads, - disable_tp=True, - ) - if context_pre_only is not None and not context_pre_only: - self.to_add_out = ReplicatedLinear(self.inner_dim, self.dim, bias=out_bias) - else: - self.to_add_out = None + self.inner_dim = out_dim if out_dim is not None else head_dim * self.total_num_heads - if not pre_only: - self.to_out = nn.ModuleList([]) - self.to_out.append(ReplicatedLinear(self.inner_dim, self.dim, bias=out_bias)) - else: - self.to_out = None + assert context_pre_only is not None + self.add_kv_proj = QKVParallelLinear( + hidden_size=added_kv_proj_dim, + head_size=head_dim, + total_num_heads=num_heads, + ) + self.add_query_num_heads = self.add_kv_proj.num_heads + self.add_kv_num_heads = self.add_kv_proj.num_kv_heads + + assert not context_pre_only + self.to_add_out = RowParallelLinear( + self.inner_dim, + self.dim, + bias=out_bias, + input_is_parallel=True, + return_bias=False, + ) + + assert not pre_only + self.to_out = RowParallelLinear( + self.inner_dim, + self.dim, + bias=out_bias, + input_is_parallel=True, + return_bias=False, + ) self.norm_added_q = RMSNorm(head_dim, eps=eps) self.norm_added_k = RMSNorm(head_dim, eps=eps) self.attn = Attention( - num_heads=num_heads, + num_heads=self.query_num_heads, head_size=self.head_dim, softmax_scale=1.0 / (self.head_dim**0.5), causal=False, + num_kv_heads=self.kv_num_heads, ) self.rope = RotaryEmbedding(is_neox_style=False) @@ -377,61 +435,55 @@ def forward( txt_freqs: torch.Tensor, hidden_states_mask: 
torch.Tensor | None = None, encoder_hidden_states_mask: torch.Tensor | None = None, - ): - # if mask is all true, set it to None + ) -> tuple[torch.Tensor, torch.Tensor]: if hidden_states_mask is not None and hidden_states_mask.all(): hidden_states_mask = None if encoder_hidden_states_mask is not None and encoder_hidden_states_mask.all(): encoder_hidden_states_mask = None - seq_len_txt = encoder_hidden_states.shape[1] - # Compute QKV for image stream (sample projections) - qkv, _ = self.to_qkv(hidden_states) - img_query, img_key, img_value = qkv.chunk(3, dim=-1) + img_qkv, _ = self.to_qkv(hidden_states) + q_size = self.query_num_heads * self.head_dim + kv_size = self.kv_num_heads * self.head_dim + img_query, img_key, img_value = img_qkv.split([q_size, kv_size, kv_size], dim=-1) - # Compute QKV for text stream (context projections) - qkv, _ = self.add_kv_proj(encoder_hidden_states) - txt_query, txt_key, txt_value = qkv.chunk(3, dim=-1) + txt_qkv, _ = self.add_kv_proj(encoder_hidden_states) + add_q_size = self.add_query_num_heads * self.head_dim + add_kv_size = self.add_kv_num_heads * self.head_dim + txt_query, txt_key, txt_value = txt_qkv.split([add_q_size, add_kv_size, add_kv_size], dim=-1) - # Reshape for multi-head attention - img_query = img_query.unflatten(-1, (self.num_heads, -1)) - img_key = img_key.unflatten(-1, (self.num_heads, -1)) - img_value = img_value.unflatten(-1, (self.num_heads, -1)) + img_query = img_query.unflatten(-1, (self.query_num_heads, self.head_dim)) + img_key = img_key.unflatten(-1, (self.kv_num_heads, self.head_dim)) + img_value = img_value.unflatten(-1, (self.kv_num_heads, self.head_dim)) - txt_query = txt_query.unflatten(-1, (self.num_heads, -1)) - txt_key = txt_key.unflatten(-1, (self.num_heads, -1)) - txt_value = txt_value.unflatten(-1, (self.num_heads, -1)) + txt_query = txt_query.unflatten(-1, (self.add_query_num_heads, self.head_dim)) + txt_key = txt_key.unflatten(-1, (self.add_kv_num_heads, self.head_dim)) + txt_value = 
txt_value.unflatten(-1, (self.add_kv_num_heads, self.head_dim)) - # Apply QK normalization img_query = self.norm_q(img_query) img_key = self.norm_k(img_key) txt_query = self.norm_added_q(txt_query) txt_key = self.norm_added_k(txt_key) - # Apply RoPE img_cos = vid_freqs.real.to(img_query.dtype) img_sin = vid_freqs.imag.to(img_query.dtype) txt_cos = txt_freqs.real.to(txt_query.dtype) txt_sin = txt_freqs.imag.to(txt_query.dtype) + img_query = self.rope(img_query, img_cos, img_sin) img_key = self.rope(img_key, img_cos, img_sin) txt_query = self.rope(txt_query, txt_cos, txt_sin) txt_key = self.rope(txt_key, txt_cos, txt_sin) - # Concatenate for joint attention - # Order: [text, image] + seq_len_txt = encoder_hidden_states.shape[1] joint_query = torch.cat([txt_query, img_query], dim=1) joint_key = torch.cat([txt_key, img_key], dim=1) joint_value = torch.cat([txt_value, img_value], dim=1) - # Compute joint attention if ( self.parallel_config is not None and self.parallel_config.sequence_parallel_size > 1 and not get_forward_context().split_text_embed_in_sp ): - # if using sequence parallel, but not splitting text embed, - # we need to pass text embedding to attention layer as joint qkv attn_metadata = AttentionMetadata( joint_query=txt_query, joint_key=txt_key, @@ -443,22 +495,17 @@ def forward( if encoder_hidden_states_mask is not None: attn_metadata.joint_attn_mask = encoder_hidden_states_mask - joint_hidden_states = self.attn( - img_query, - img_key, - img_value, - attn_metadata, - ) + joint_hidden_states = self.attn(img_query, img_key, img_value, attn_metadata) else: attn_metadata = None if hidden_states_mask is not None or encoder_hidden_states_mask is not None: - mask_list = [] + mask_list: list[torch.Tensor] = [] if encoder_hidden_states_mask is not None: mask_list.append(encoder_hidden_states_mask) else: mask_list.append( torch.ones( - [encoder_hidden_states.shape[0], encoder_hidden_states.shape[1]], + encoder_hidden_states.shape[:2], dtype=torch.bool, 
device=encoder_hidden_states.device, ) @@ -468,34 +515,22 @@ def forward( else: mask_list.append( torch.ones( - [hidden_states.shape[0], hidden_states.shape[1]], + hidden_states.shape[:2], dtype=torch.bool, device=hidden_states.device, ) ) - joint_mask = ( - None if len(mask_list) == 0 else torch.cat(mask_list, dim=1) if len(mask_list) > 1 else mask_list[0] - ) + joint_mask = torch.cat(mask_list, dim=1) if len(mask_list) > 1 else mask_list[0] attn_metadata = AttentionMetadata(attn_mask=joint_mask) - joint_hidden_states = self.attn( - joint_query, - joint_key, - joint_value, - attn_metadata, - ) - joint_hidden_states = joint_hidden_states.flatten(2, 3) - joint_hidden_states = joint_hidden_states.to(joint_query.dtype) - # Split attention outputs back - txt_attn_output = joint_hidden_states[:, :seq_len_txt, :] # Text part - img_attn_output = joint_hidden_states[:, seq_len_txt:, :] # Image part + joint_hidden_states = self.attn(joint_query, joint_key, joint_value, attn_metadata) - # Apply output projections - img_attn_output, _ = self.to_out[0](img_attn_output) - if len(self.to_out) > 1: - (img_attn_output,) = self.to_out[1](img_attn_output) # dropout + joint_hidden_states = joint_hidden_states.flatten(2, 3).to(joint_query.dtype) + txt_attn_output = joint_hidden_states[:, :seq_len_txt, :] + img_attn_output = joint_hidden_states[:, seq_len_txt:, :] - txt_attn_output, _ = self.to_add_out(txt_attn_output) + img_attn_output = self.to_out(img_attn_output) + txt_attn_output = self.to_add_out(txt_attn_output) return img_attn_output, txt_attn_output @@ -530,7 +565,7 @@ def __init__( head_dim=attention_head_dim, ) self.img_norm2 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) - self.img_mlp = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") + self.img_mlp = FeedForward(dim=dim, dim_out=dim) # Text processing modules self.txt_mod = nn.Sequential( @@ -540,7 +575,7 @@ def __init__( self.txt_norm1 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) # 
Text doesn't need separate attention - it's handled by img_attn joint computation self.txt_norm2 = AdaLayerNorm(dim, elementwise_affine=False, eps=eps) - self.txt_mlp = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") + self.txt_mlp = FeedForward(dim=dim, dim_out=dim) self.zero_cond_t = zero_cond_t @@ -892,6 +927,8 @@ def get_rotary_emb_chunk(freqs, padding=0): if original_seq_len is not None: output = output[:, :original_seq_len, :] + torch.cuda.empty_cache() + return Transformer2DModelOutput(sample=output) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -916,17 +953,22 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loaded_params: set[str] = set() for name, loaded_weight in weights: + original_name = name + lookup_name = name for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: + if weight_name not in original_name: continue - name = name.replace(weight_name, param_name) - param = params_dict[name] + lookup_name = original_name.replace(weight_name, param_name) + param = params_dict[lookup_name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break else: - param = params_dict[name] + if lookup_name not in params_dict and ".to_out.0." 
in lookup_name: + lookup_name = lookup_name.replace(".to_out.0.", ".to_out.") + param = params_dict[lookup_name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - loaded_params.add(name) + loaded_params.add(original_name) + loaded_params.add(lookup_name) return loaded_params From 30880aa8df2a1a3abb6cf43fae0bc9a6951d11ca Mon Sep 17 00:00:00 2001 From: tzhouam Date: Mon, 19 Jan 2026 08:04:55 +0000 Subject: [PATCH 56/59] update doc Signed-off-by: tzhouam --- README.md | 2 +- docker/Dockerfile.ci | 2 +- docker/Dockerfile.rocm | 2 +- docs/configuration/README.md | 2 +- docs/getting_started/installation/gpu/cuda.inc.md | 6 +++--- docs/getting_started/installation/gpu/rocm.inc.md | 4 ++-- docs/getting_started/installation/npu/npu.inc.md | 8 ++++---- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 829d683c3c3..12123fc9ddb 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Easy, fast, and cheap omni-modality model serving for everyone --- *Latest News* 🔥 - +- [2026/01] We released [0.14.0rc2](https://github.com/vllm-project/vllm-omni/releases/tag/v0.14.0rc2). - [2026/01] We released [0.12.0rc1](https://github.com/vllm-project/vllm-omni/releases/tag/v0.12.0rc1) - a major RC milestone focused on maturing the diffusion stack, strengthening OpenAI-compatible serving, expanding omni-model coverage, and improving stability across platforms (GPU/NPU/ROCm), please check our latest [design](https://docs.google.com/presentation/d/1qv4qMW1rKAqDREMXiUDLIgqqHQe7TDPj/edit?usp=sharing&ouid=110473603432222024453&rtpof=true&sd=true). - [2025/11] vLLM community officially released [vllm-project/vllm-omni](https://github.com/vllm-project/vllm-omni) in order to support omni-modality models serving. 
diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 5e1d00a5f88..047512d3621 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -1,5 +1,5 @@ ARG VLLM_BASE_IMAGE=vllm/vllm-openai -ARG VLLM_BASE_TAG=v0.12.0 +ARG VLLM_BASE_TAG=v0.14.0rc2 FROM ${VLLM_BASE_IMAGE}:${VLLM_BASE_TAG} ARG APP_DIR=/workspace/vllm-omni WORKDIR ${APP_DIR} diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 7fabb9c3c68..80f709deae0 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -2,7 +2,7 @@ ARG BASE_IMAGE=rocm/vllm-dev:nightly_main_20251205 FROM ${BASE_IMAGE} ARG COMMON_WORKDIR=/app -ARG VLLM_VERSION=v0.12.0 +ARG VLLM_VERSION=v0.14.0rc2 ARG PYTORCH_ROCM_ARCH="gfx942;gfx950" WORKDIR ${COMMON_WORKDIR} diff --git a/docs/configuration/README.md b/docs/configuration/README.md index 40439d51121..316a0b9eb71 100644 --- a/docs/configuration/README.md +++ b/docs/configuration/README.md @@ -2,7 +2,7 @@ This section lists the most common options for running vLLM-Omni. -For options within a vLLM Engine. Please refer to [vLLM Configuration](https://docs.vllm.ai/en/v0.12.0/configuration/index.html) +For options within a vLLM Engine. Please refer to [vLLM Configuration](https://docs.vllm.ai/en/v0.14.0/configuration/index.html) Currently, the main options are maintained by stage configs for each model. diff --git a/docs/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md index 540b1852947..c073c152ee9 100644 --- a/docs/getting_started/installation/gpu/cuda.inc.md +++ b/docs/getting_started/installation/gpu/cuda.inc.md @@ -58,11 +58,11 @@ If you want to check, modify or debug with source code of vLLM, install the libr ```bash git clone https://github.com/vllm-project/vllm.git cd vllm -git checkout v0.12.0 +git checkout v0.14.0rc2 ``` Set up environment variables to get pre-built wheels. If there are internet problems, just download the whl file manually. 
And set `VLLM_PRECOMPILED_WHEEL_LOCATION` as your local absolute path of whl file. ```bash -export VLLM_PRECOMPILED_WHEEL_LOCATION=https://github.com/vllm-project/vllm/releases/download/v0.12.0/vllm-0.12.0-cp38-abi3-manylinux_2_31_x86_64.whl +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://github.com/vllm-project/vllm/releases/download/v0.14.0/vllm-0.14.0rc2-cp38-abi3-manylinux_2_31_x86_64.whl ``` Install vllm with command below (If you have no existing PyTorch). ```bash @@ -93,7 +93,7 @@ docker run --runtime nvidia --gpus 2 \ --env "HF_TOKEN=$HF_TOKEN" \ -p 8091:8091 \ --ipc=host \ - vllm/vllm-omni:v0.12.0rc1 \ + vllm/vllm-omni:v0.14.0rc2 \ --model Qwen/Qwen3-Omni-30B-A3B-Instruct --port 8091 ``` diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md index 1fa751e2508..638c689d4be 100644 --- a/docs/getting_started/installation/gpu/rocm.inc.md +++ b/docs/getting_started/installation/gpu/rocm.inc.md @@ -68,7 +68,7 @@ docker run -it \ -v :/app/model \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=$HF_TOKEN" \ - vllm/vllm-omni-rocm:v0.12.0rc1 \ + vllm/vllm-omni-rocm:v0.14.0rc2 \ vllm serve --model Qwen/Qwen3-Omni-30B-A3B-Instruct --omni --port 8091 ``` @@ -86,7 +86,7 @@ docker run -it \ -v :/app/model \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=$HF_TOKEN" \ - vllm/vllm-omni-rocm:v0.12.0rc1 \ + vllm/vllm-omni-rocm:v0.14.0rc2 \ bash ``` diff --git a/docs/getting_started/installation/npu/npu.inc.md b/docs/getting_started/installation/npu/npu.inc.md index 9c36c2be626..5714449a70a 100644 --- a/docs/getting_started/installation/npu/npu.inc.md +++ b/docs/getting_started/installation/npu/npu.inc.md @@ -13,10 +13,10 @@ export DEVICE0=/dev/davinci0 export DEVICE1=/dev/davinci1 # Update the vllm-ascend image # Atlas A2: -# export IMAGE=quay.io/ascend/vllm-ascend:v0.12.0rc1 +# export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0rc2 # Atlas A3: -# export 
IMAGE=quay.io/ascend/vllm-ascend:v0.12.0rc1-a3 -export IMAGE=quay.io/ascend/vllm-ascend:v0.12.0rc1 +# export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0rc2-a3 +export IMAGE=quay.io/ascend/vllm-ascend:v0.14.0rc2 docker run --rm \ --name vllm-omni-npu \ --shm-size=1g \ @@ -42,7 +42,7 @@ source ~/.bashrc # Inside the container, install vLLM-Omni from source cd /vllm-workspace -git clone -b v0.12.0rc1 https://github.com/vllm-project/vllm-omni.git +git clone -b v0.14.0rc2 https://github.com/vllm-project/vllm-omni.git cd vllm-omni pip install -v -e . export VLLM_WORKER_MULTIPROC_METHOD=spawn From bd22edd76fb18a06172e2b8b9317794fd97660f6 Mon Sep 17 00:00:00 2001 From: tzhouam Date: Mon, 19 Jan 2026 08:36:28 +0000 Subject: [PATCH 57/59] rebase to vllm 0.14.0 Signed-off-by: tzhouam --- .../core/sched/omni_generation_scheduler.py | 8 +- vllm_omni/entrypoints/openai/api_server.py | 124 ++++++++++++------ .../openai/protocol/chat_completion.py | 2 +- vllm_omni/entrypoints/openai/serving_chat.py | 34 ++--- .../entrypoints/openai/serving_speech.py | 2 +- .../qwen3_omni/qwen3_omni_moe_thinker.py | 7 +- 6 files changed, 112 insertions(+), 65 deletions(-) diff --git a/vllm_omni/core/sched/omni_generation_scheduler.py b/vllm_omni/core/sched/omni_generation_scheduler.py index 7acef3bdcb5..2d04faeeea7 100644 --- a/vllm_omni/core/sched/omni_generation_scheduler.py +++ b/vllm_omni/core/sched/omni_generation_scheduler.py @@ -90,7 +90,7 @@ def schedule(self) -> SchedulerOutput: any_request = self.running[0] num_common_prefix_blocks = self.kv_cache_manager.get_num_common_prefix_blocks(any_request.request_id) - # Assemble SchedulerOutput (align with v0.12.0) + # Assemble SchedulerOutput (align with v0.14.0) if self.use_v2_model_runner: # No resumed reqs in fast path; pass prefill_token_ids for new reqs. new_reqs_data = [ @@ -129,7 +129,7 @@ def schedule(self) -> SchedulerOutput: preempted_req_ids=set(), ) - # Record the request ids scheduled in this step (v0.12.0 behavior). 
+ # Record the request ids scheduled in this step (v0.14.0 behavior). self.prev_step_scheduled_req_ids.clear() self.prev_step_scheduled_req_ids.update(num_scheduled_tokens.keys()) @@ -176,7 +176,7 @@ def update_from_output( outputs: dict[int, list[EngineCoreOutput]] = defaultdict(list) spec_decoding_stats: SpecDecodingStats | None = None kv_connector_stats = kv_connector_output.kv_connector_stats if kv_connector_output else None - # Merge connector-side stats (align with v0.12.0) + # Merge connector-side stats (align with v0.14.0) if kv_connector_stats and self.connector: kv_stats = self.connector.get_kv_connector_stats() if kv_stats: @@ -294,7 +294,7 @@ def update_from_output( if kv_connector_output: self._update_from_kv_xfer_finished(kv_connector_output) - # Collect and publish KV cache events (align with v0.12.0) + # Collect and publish KV cache events (align with v0.14.0) events = self.kv_cache_manager.take_events() if self.connector is not None: connector_events = self.connector.take_events() diff --git a/vllm_omni/entrypoints/openai/api_server.py b/vllm_omni/entrypoints/openai/api_server.py index 297935a0fed..888d39085ac 100644 --- a/vllm_omni/entrypoints/openai/api_server.py +++ b/vllm_omni/entrypoints/openai/api_server.py @@ -1,3 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project import multiprocessing import multiprocessing.forkserver as forkserver import os @@ -14,7 +16,7 @@ from fastapi import Depends, HTTPException, Request from fastapi.responses import JSONResponse, StreamingResponse from starlette.datastructures import State -from vllm.config import VllmConfig +from starlette.routing import Route from vllm.engine.protocol import EngineClient from vllm.entrypoints.anthropic.serving_messages import AnthropicServingMessages from vllm.entrypoints.launcher import serve_http @@ -25,30 +27,27 @@ load_log_config, router, setup_server, - validate_json_request, ) -from 
vllm.entrypoints.openai.chat_completion.protocol import ( +from vllm.entrypoints.openai.orca_metrics import metrics_header +from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, ChatCompletionResponse, + ErrorResponse, ) -from vllm.entrypoints.openai.engine.protocol import ErrorResponse -from vllm.entrypoints.openai.responses.serving import OpenAIServingResponses from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels -from vllm.entrypoints.openai.translations.serving import ( +from vllm.entrypoints.openai.serving_responses import OpenAIServingResponses +from vllm.entrypoints.openai.serving_transcription import ( OpenAIServingTranscription, OpenAIServingTranslation, ) +from vllm.entrypoints.openai.utils import validate_json_request from vllm.entrypoints.pooling.classify.serving import ServingClassification from vllm.entrypoints.pooling.embed.serving import OpenAIServingEmbedding from vllm.entrypoints.pooling.pooling.serving import OpenAIServingPooling from vllm.entrypoints.pooling.score.serving import ServingScores from vllm.entrypoints.serve.disagg.serving import ServingTokens from vllm.entrypoints.serve.tokenize.serving import OpenAIServingTokenization - -# yapf conflicts with isort for this block -# yapf: disable -# yapf: enable from vllm.entrypoints.tool_server import DemoToolServer, MCPToolServer, ToolServer from vllm.entrypoints.utils import ( load_aware_call, @@ -77,6 +76,29 @@ logger = init_logger(__name__) +ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL = "endpoint-load-metrics-format" + + +def _remove_route_from_router(router_obj, path: str, methods: set[str] | None = None): + """Remove a route from the router by path and optionally by methods. + + This is needed because vllm's api_server registers routes when imported, + and we need to override some routes (like /v1/chat/completions) with + omni-specific implementations. 
+ """ + routes_to_remove = [] + for route in router_obj.routes: + if isinstance(route, Route) and route.path == path: + if methods is None or (hasattr(route, "methods") and route.methods & methods): + routes_to_remove.append(route) + + for route in routes_to_remove: + router_obj.routes.remove(route) + + +# Remove vllm's /v1/chat/completions route so we can register our own omni version +_remove_route_from_router(router, "/v1/chat/completions", {"POST"}) + # Server entry points @@ -124,8 +146,9 @@ async def omni_run_server_worker(listen_address, sock, args, client_config=None, ) as engine_client: app = build_app(args) + await omni_init_app_state(engine_client, app.state, args) + vllm_config = await engine_client.get_vllm_config() - await omni_init_app_state(engine_client, vllm_config, app.state, args) # Check if pure diffusion mode (vllm_config will be None) is_pure_diffusion = vllm_config is None @@ -256,7 +279,6 @@ async def build_async_omni_from_stage_config( async def omni_init_app_state( engine_client: EngineClient, - vllm_config: VllmConfig | None, state: State, args: Namespace, ) -> None: @@ -269,10 +291,12 @@ async def omni_init_app_state( Args: engine_client: Engine client instance (AsyncOmni) - vllm_config: vLLM configuration object (may be None for pure diffusion) state: FastAPI application state object to initialize args: Parsed command-line arguments """ + # Get vllm_config from engine_client (following 0.14.0 pattern) + vllm_config = await engine_client.get_vllm_config() + # Detect if it's pure Diffusion mode (single stage and is Diffusion) is_pure_diffusion = False if hasattr(engine_client, "stage_configs") and engine_client.stage_configs: @@ -296,6 +320,7 @@ async def omni_init_app_state( base_model_paths = [BaseModelPath(name=name, model_path=args.model) for name in served_model_names] state.engine_client = engine_client state.log_stats = not args.disable_log_stats + state.args = args # For omni models state.stage_configs = 
engine_client.stage_configs if hasattr(engine_client, "stage_configs") else None @@ -325,8 +350,12 @@ async def omni_init_app_state( logger.warning("vllm_config is None, some features may not work correctly") state.vllm_config = vllm_config - if vllm_config is not None: - _model_config = vllm_config.model_config + + # Get supported tasks + supported_tasks: set[str] = {"generate"} + if hasattr(engine_client, "get_supported_tasks"): + supported_tasks = set(await engine_client.get_supported_tasks()) + logger.info("Supported tasks: %s", supported_tasks) resolved_chat_template = await process_chat_template( args.chat_template, @@ -409,9 +438,6 @@ async def omni_init_app_state( lora_modules=lora_modules, ) await state.openai_serving_models.init_static_loras() - supported_tasks: set[str] = {"generate"} - if hasattr(engine_client, "get_supported_tasks"): - supported_tasks = set(await engine_client.get_supported_tasks()) state.openai_serving_responses = ( OpenAIServingResponses( @@ -433,26 +459,34 @@ async def omni_init_app_state( if "generate" in supported_tasks else None ) - state.openai_serving_chat = OmniOpenAIServingChat( - engine_client, - state.openai_serving_models, - args.response_role, - request_logger=request_logger, - chat_template=resolved_chat_template, - chat_template_content_format=args.chat_template_content_format, - default_chat_template_kwargs=args.default_chat_template_kwargs, - trust_request_chat_template=args.trust_request_chat_template, - return_tokens_as_token_ids=args.return_tokens_as_token_ids, - enable_auto_tools=args.enable_auto_tool_choice, - exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, - tool_parser=args.tool_call_parser, - reasoning_parser=args.structured_outputs_config.reasoning_parser, - enable_prompt_tokens_details=args.enable_prompt_tokens_details, - enable_force_include_usage=args.enable_force_include_usage, - enable_log_outputs=args.enable_log_outputs, - enable_log_deltas=args.enable_log_deltas, - 
log_error_stack=args.log_error_stack, + state.openai_serving_chat = ( + OmniOpenAIServingChat( + engine_client, + state.openai_serving_models, + args.response_role, + request_logger=request_logger, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, + default_chat_template_kwargs=args.default_chat_template_kwargs, + trust_request_chat_template=args.trust_request_chat_template, + return_tokens_as_token_ids=args.return_tokens_as_token_ids, + enable_auto_tools=args.enable_auto_tool_choice, + exclude_tools_when_tool_choice_none=args.exclude_tools_when_tool_choice_none, + tool_parser=args.tool_call_parser, + reasoning_parser=args.structured_outputs_config.reasoning_parser, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + enable_force_include_usage=args.enable_force_include_usage, + enable_log_outputs=args.enable_log_outputs, + enable_log_deltas=args.enable_log_deltas, + log_error_stack=args.log_error_stack, + ) + if "generate" in supported_tasks + else None ) + # Warm up chat template processing to avoid first-request latency + if state.openai_serving_chat is not None: + await state.openai_serving_chat.warmup() + state.openai_serving_completion = ( OpenAIServingCompletion( engine_client, @@ -610,6 +644,7 @@ def Omnispeech(request: Request) -> OmniOpenAIServingSpeech | None: @with_cancellation @load_aware_call async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): + metrics_header_format = raw_request.headers.get(ENDPOINT_LOAD_METRICS_FORMAT_HEADER_LABEL, "") handler = Omnichat(raw_request) if handler is None: return base(raw_request).create_error_response(message="The model does not support Chat Completions API") @@ -638,18 +673,27 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re try: # Use serialize_as_any=True to bypass type checking response_dict = generator.model_dump(mode="json", serialize_as_any=True, warnings="none") - 
return JSONResponse(content=response_dict) + return JSONResponse( + content=response_dict, + headers=metrics_header(metrics_header_format), + ) except Exception: # Fallback: convert to JSON string and parse back to avoid any serialization issues try: response_json = generator.model_dump_json(warnings="none", serialize_as_any=True) response_dict = json_lib.loads(response_json) - return JSONResponse(content=response_dict) + return JSONResponse( + content=response_dict, + headers=metrics_header(metrics_header_format), + ) except Exception: # Last resort: regular dump with warnings suppressed with warnings_module.catch_warnings(): warnings_module.filterwarnings("ignore", category=UserWarning) - return JSONResponse(content=generator.model_dump(mode="json", warnings="none")) + return JSONResponse( + content=generator.model_dump(mode="json", warnings="none"), + headers=metrics_header(metrics_header_format), + ) return StreamingResponse(content=generator, media_type="text/event-stream") diff --git a/vllm_omni/entrypoints/openai/protocol/chat_completion.py b/vllm_omni/entrypoints/openai/protocol/chat_completion.py index d106b7aa7ae..d0c83f56f8b 100644 --- a/vllm_omni/entrypoints/openai/protocol/chat_completion.py +++ b/vllm_omni/entrypoints/openai/protocol/chat_completion.py @@ -1,4 +1,4 @@ -from vllm.entrypoints.openai.chat_completion.protocol import ( +from vllm.entrypoints.openai.protocol import ( ChatCompletionStreamResponse, ) diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py index bb3d242af3b..6320cb73a16 100644 --- a/vllm_omni/entrypoints/openai/serving_chat.py +++ b/vllm_omni/entrypoints/openai/serving_chat.py @@ -29,7 +29,11 @@ make_tool_call_id, resolve_chat_template_content_format, ) -from vllm.entrypoints.openai.chat_completion.protocol import ( +from vllm.entrypoints.openai.parser.harmony_utils import ( + get_streamable_parser_for_assistant, + parse_chat_output, +) +from vllm.entrypoints.openai.protocol 
import ( ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, ChatCompletionResponse, @@ -37,9 +41,6 @@ ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, -) -from vllm.entrypoints.openai.chat_completion.serving import OpenAIServingChat -from vllm.entrypoints.openai.engine.protocol import ( DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -49,18 +50,15 @@ FunctionDefinition, PromptTokenUsageInfo, RequestResponseMetadata, + ResponsesRequest, ToolCall, UsageInfo, ) -from vllm.entrypoints.openai.engine.serving import ( +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_engine import ( ChatLikeRequest, - ResponsesRequest, clamp_prompt_logprobs, ) -from vllm.entrypoints.openai.parser.harmony_utils import ( - get_streamable_parser_for_assistant, - parse_chat_output, -) from vllm.entrypoints.openai.utils import maybe_filter_parallel_tool_calls from vllm.entrypoints.utils import should_include_usage from vllm.inputs.data import PromptType, TokensPrompt @@ -215,10 +213,7 @@ async def create_chat_completion( if error_check_ret is not None: return error_check_ret - chat_template_kwargs = self._prepare_extra_chat_template_kwargs( - request.chat_template_kwargs, - self.default_chat_template_kwargs, - ) + chat_template_kwargs = request.chat_template_kwargs or {} chat_template_kwargs.update(reasoning_effort=request.reasoning_effort) ( @@ -236,6 +231,7 @@ async def create_chat_completion( tool_dicts=tool_dicts, documents=getattr(request, "documents", None), chat_template_kwargs=chat_template_kwargs, + default_chat_template_kwargs=self.default_chat_template_kwargs, tool_parser=tool_parser, add_special_tokens=request.add_special_tokens, ) @@ -334,6 +330,7 @@ async def _preprocess_chat( tool_dicts: list[dict[str, Any]] | None = None, documents: list[dict[str, str]] | None = None, chat_template_kwargs: dict[str, Any] | None = None, + default_chat_template_kwargs: dict[str, Any] | None = None, 
tool_parser: Callable[[TokenizerLike], ToolParser] | None = None, add_special_tokens: bool = False, ) -> tuple[ @@ -357,6 +354,13 @@ async def _preprocess_chat( mm_processor_kwargs=getattr(request, "mm_processor_kwargs", None), ) + # Merge default_chat_template_kwargs with request-provided kwargs + # Request kwargs take precedence over defaults + merged_kwargs = self._prepare_extra_chat_template_kwargs( + chat_template_kwargs, + default_chat_template_kwargs, + ) + _chat_template_kwargs: dict[str, Any] = dict( chat_template=chat_template, add_generation_prompt=add_generation_prompt, @@ -364,7 +368,7 @@ async def _preprocess_chat( tools=tool_dicts, documents=documents, ) - _chat_template_kwargs.update(chat_template_kwargs or {}) + _chat_template_kwargs.update(merged_kwargs) request_prompt: str | list[int] diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index 77be4cc8f35..c6b87810e98 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -2,7 +2,7 @@ from fastapi import Request from fastapi.responses import Response -from vllm.entrypoints.openai.engine.serving import OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.logger import init_logger from vllm.utils import random_uuid diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py index 3271506ac2c..2d479062eb2 100644 --- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py +++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni_moe_thinker.py @@ -82,7 +82,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalKwargsItems from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataItems -from vllm.multimodal.processing.processor import ( +from 
vllm.multimodal.processing import ( MultiModalPromptUpdates, PlaceholderFeaturesInfo, PromptReplacement, @@ -604,11 +604,12 @@ def _process_audio_input( audio_output_lengths = _get_feat_extract_output_lengths(audio_feature_lengths) - audio_features = self.audio_tower( + audio_outputs = self.audio_tower( input_features.to(self.audio_tower.dtype), feature_lens=audio_feature_lengths, aftercnn_lens=audio_output_lengths, ) + audio_features = audio_outputs.last_hidden_state return audio_features.split(audio_output_lengths.tolist()) @@ -666,8 +667,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.audio_tower = Qwen3OmniMoeAudioEncoder( thinker_config.audio_config, - multimodal_config=multimodal_config, - prefix=maybe_prefix(prefix, "audio_tower"), ) self.visual = Qwen3Omni_VisionTransformer( From 4a7d7322d134e784774fe605aadd93a80ef4ebba Mon Sep 17 00:00:00 2001 From: tzhouam Date: Mon, 19 Jan 2026 11:14:06 +0000 Subject: [PATCH 58/59] unify query type Signed-off-by: tzhouam --- .../offline_inference/qwen2_5_omni/end2end.py | 6 +- .../qwen2_5_omni/run_single_prompt.sh | 2 +- .../offline_inference/qwen3_omni/end2end.py | 97 +++++++++---------- 3 files changed, 52 insertions(+), 53 deletions(-) diff --git a/examples/offline_inference/qwen2_5_omni/end2end.py b/examples/offline_inference/qwen2_5_omni/end2end.py index d1e0264cd7a..22f2161c22b 100644 --- a/examples/offline_inference/qwen2_5_omni/end2end.py +++ b/examples/offline_inference/qwen2_5_omni/end2end.py @@ -278,9 +278,9 @@ def get_audio_query(question: str = None, audio_path: str | None = None, samplin query_map = { - "mixed_modalities": get_mixed_modalities_query, + "use_mixed_modalities": get_mixed_modalities_query, "use_audio_in_video": get_use_audio_in_video_query, - "multi_audios": get_multi_audios_query, + "use_multi_audios": get_multi_audios_query, "use_image": get_image_query, "use_video": get_video_query, "use_audio": get_audio_query, @@ -434,7 +434,7 @@ def parse_args(): 
"--query-type", "-q", type=str, - default="mixed_modalities", + default="use_mixed_modalities", choices=query_map.keys(), help="Query type.", ) diff --git a/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh b/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh index 26cbf0172c5..c8e4cd2cbf3 100644 --- a/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh +++ b/examples/offline_inference/qwen2_5_omni/run_single_prompt.sh @@ -1,2 +1,2 @@ python end2end.py --output-wav output_audio \ - --query-type mixed_modalities + --query-type use_mixed_modalities diff --git a/examples/offline_inference/qwen3_omni/end2end.py b/examples/offline_inference/qwen3_omni/end2end.py index 3b6e526b501..3cd8918208e 100644 --- a/examples/offline_inference/qwen3_omni/end2end.py +++ b/examples/offline_inference/qwen3_omni/end2end.py @@ -20,7 +20,6 @@ from vllm.assets.video import VideoAsset, video_to_ndarrays from vllm.multimodal.image import convert_image_mode from vllm.utils.argparse_utils import FlexibleArgumentParser -from vllm_omni.assets.video import extract_video_audio from vllm_omni.entrypoints.omni import Omni @@ -201,6 +200,7 @@ def get_mixed_modalities_query( limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1}, ) + def get_multi_audios_query() -> QueryResult: question = "Are these two audio clips the same?" prompt = ( @@ -224,43 +224,41 @@ def get_multi_audios_query() -> QueryResult: "audio": 2, }, ) - + + # def get_use_audio_in_video_query(video_path: str | None = None) -> QueryResult: - # question = ( - # "Describe the content of the video in details, then convert what the " - # "baby say into text." 
- # ) - # prompt = ( - # f"<|im_start|>system\n{default_system}<|im_end|>\n" - # "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>" - # f"{question}<|im_end|>\n" - # f"<|im_start|>assistant\n" - # ) - # if video_path: - # if not os.path.exists(video_path): - # raise FileNotFoundError(f"Video file not found: {video_path}") - # video_frames = video_to_ndarrays(video_path, num_frames=16) - # else: - # video_frames = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays - # audio = extract_video_audio(video_path, sampling_rate=16000) - # return QueryResult( - # inputs={ - # "prompt": prompt, - # "multi_modal_data": { - # "video": video_frames, - # "audio": audio, - # }, - # "mm_processor_kwargs": { - # "use_audio_in_video": True, - # }, - # }, - # limit_mm_per_prompt={"audio": 1, "video": 1}, - # ) +# question = ( +# "Describe the content of the video in details, then convert what the " +# "baby say into text." +# ) +# prompt = ( +# f"<|im_start|>system\n{default_system}<|im_end|>\n" +# "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>" +# f"{question}<|im_end|>\n" +# f"<|im_start|>assistant\n" +# ) +# if video_path: +# if not os.path.exists(video_path): +# raise FileNotFoundError(f"Video file not found: {video_path}") +# video_frames = video_to_ndarrays(video_path, num_frames=16) +# else: +# video_frames = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays +# audio = extract_video_audio(video_path, sampling_rate=16000) +# return QueryResult( +# inputs={ +# "prompt": prompt, +# "multi_modal_data": { +# "video": video_frames, +# "audio": audio, +# }, +# "mm_processor_kwargs": { +# "use_audio_in_video": True, +# }, +# }, +# limit_mm_per_prompt={"audio": 1, "video": 1}, +# ) def get_use_audio_in_video_query() -> QueryResult: - question = ( - "Describe the content of the video in details, then convert what the " - "baby say into text." 
- ) + question = "Describe the content of the video in details, then convert what the baby say into text." prompt = ( f"<|im_start|>system\n{default_system}<|im_end|>\n" "<|im_start|>user\n<|vision_start|><|video_pad|><|vision_end|>" @@ -283,20 +281,21 @@ def get_use_audio_in_video_query() -> QueryResult: limit_mm_per_prompt={"audio": 1, "video": 1}, ) + query_map = { "text": get_text_query, "use_audio": get_audio_query, "use_image": get_image_query, "use_video": get_video_query, - "multi_audios": get_multi_audios_query, - "mixed_modalities": get_mixed_modalities_query, + "use_multi_audios": get_multi_audios_query, + "use_mixed_modalities": get_mixed_modalities_query, "use_audio_in_video": get_use_audio_in_video_query, } def main(args): model_name = "Qwen/Qwen3-Omni-30B-A3B-Instruct" - print(f"="*20,"\n",f"vllm version: {vllm.__version__}","\n","="*20) + print("=" * 20, "\n", f"vllm version: {vllm.__version__}", "\n", "=" * 20) # Get paths from args video_path = getattr(args, "video_path", None) @@ -334,13 +333,13 @@ def main(args): ) thinker_sampling_params = SamplingParams( - temperature=0.2, - # top_p=0.9, - # top_k=-1, - # max_tokens=1200, - # repetition_penalty=1.05, - # logit_bias={}, - seed=0, + temperature=0.9, + top_p=0.9, + top_k=-1, + max_tokens=1200, + repetition_penalty=1.05, + logit_bias={}, + seed=SEED, ) talker_sampling_params = SamplingParams( @@ -366,8 +365,8 @@ def main(args): sampling_params_list = [ thinker_sampling_params, - # talker_sampling_params, # code predictor is integrated into talker for Qwen3 Omni - # code2wav_sampling_params, + talker_sampling_params, # code predictor is integrated into talker for Qwen3 Omni + code2wav_sampling_params, ] if args.txt_prompts is None: @@ -451,7 +450,7 @@ def parse_args(): "--query-type", "-q", type=str, - default="mixed_modalities", + default="use_mixed_modalities", choices=query_map.keys(), help="Query type.", ) From cfd5d3238f1ab129ca9193763b9bf155108db39f Mon Sep 17 00:00:00 2001 From: tzhouam 
Date: Mon, 19 Jan 2026 11:14:23 +0000 Subject: [PATCH 59/59] fix build doc Signed-off-by: tzhouam --- docs/design/architecture_overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/design/architecture_overview.md b/docs/design/architecture_overview.md index 16f81ab1594..6793895cd46 100644 --- a/docs/design/architecture_overview.md +++ b/docs/design/architecture_overview.md @@ -192,4 +192,4 @@ curl -sS -X POST http://localhost:8091/v1/chat/completions \ } ``` -For more usages, please refer to [examples](../examples/README.md). +For more usages, please refer to [examples](https://github.com/vllm-project/vllm-omni/tree/main/examples).